From 4a33af6bb63eaa69a4a2cc0d4f222d37d7531b9c Mon Sep 17 00:00:00 2001 From: Keith Randall Date: Mon, 18 Jul 2016 15:52:59 -0700 Subject: [PATCH] [dev.ssa] cmd/compile: more 386 port changes Fix up zero/move code, including duff calls and rep movs. Handle the new ops generated by dec64.rules. Fix constant shifts. Change-Id: I7d89194b29b04311bfafa0fd93b9f5644af04df9 Reviewed-on: https://go-review.googlesource.com/25033 Run-TryBot: Keith Randall TryBot-Result: Gobot Gobot Reviewed-by: David Chase --- src/cmd/compile/internal/ssa/gen/386.rules | 79 ++- src/cmd/compile/internal/ssa/gen/386Ops.go | 9 +- src/cmd/compile/internal/ssa/gen/AMD64.rules | 4 +- src/cmd/compile/internal/ssa/gen/dec64.rules | 2 +- src/cmd/compile/internal/ssa/opGen.go | 44 +- src/cmd/compile/internal/ssa/rewrite.go | 45 +- src/cmd/compile/internal/ssa/rewrite386.go | 606 ++++++++++++++----- src/cmd/compile/internal/ssa/rewriteAMD64.go | 6 +- src/cmd/compile/internal/x86/ssa.go | 29 +- 9 files changed, 550 insertions(+), 274 deletions(-) diff --git a/src/cmd/compile/internal/ssa/gen/386.rules b/src/cmd/compile/internal/ssa/gen/386.rules index 6569533b77..0587be4367 100644 --- a/src/cmd/compile/internal/ssa/gen/386.rules +++ b/src/cmd/compile/internal/ssa/gen/386.rules @@ -83,8 +83,7 @@ (Not x) -> (XORLconst [1] x) // Lowering pointer arithmetic -(OffPtr [off] ptr) && is32Bit(off) -> (ADDLconst [off] ptr) -(OffPtr [off] ptr) -> (ADDL (MOVLconst [off]) ptr) +(OffPtr [off] ptr) -> (ADDLconst [off] ptr) (Bswap32 x) -> (BSWAPL x) @@ -99,6 +98,9 @@ (ZeroExt8to32 x) -> (MOVBLZX x) (ZeroExt16to32 x) -> (MOVWLZX x) +(Signmask x) -> (SARLconst x [31]) +(Zeromask x) -> (SBBLcarrymask (CMPL (MOVLconst [0]) x)) + // Lowering truncation // Because we ignore high parts of registers, truncates are just copies. 
(Trunc16to8 x) -> x @@ -161,6 +163,26 @@ (Rsh8x16 x y) -> (SARB x (ORL y (NOTL (SBBLcarrymask (CMPWconst y [8]))))) (Rsh8x8 x y) -> (SARB x (ORL y (NOTL (SBBLcarrymask (CMPBconst y [8]))))) +// constant shifts +// generic opt rewrites all constant shifts to shift by Const64 +(Lsh32x64 x (Const64 [c])) && uint64(c) < 32 -> (SHLLconst x [c]) +(Rsh32x64 x (Const64 [c])) && uint64(c) < 32 -> (SARLconst x [c]) +(Rsh32Ux64 x (Const64 [c])) && uint64(c) < 32 -> (SHRLconst x [c]) +(Lsh16x64 x (Const64 [c])) && uint64(c) < 16 -> (SHLLconst x [c]) +(Rsh16x64 x (Const64 [c])) && uint64(c) < 16 -> (SARWconst x [c]) +(Rsh16Ux64 x (Const64 [c])) && uint64(c) < 16 -> (SHRWconst x [c]) +(Lsh8x64 x (Const64 [c])) && uint64(c) < 8 -> (SHLLconst x [c]) +(Rsh8x64 x (Const64 [c])) && uint64(c) < 8 -> (SARBconst x [c]) +(Rsh8Ux64 x (Const64 [c])) && uint64(c) < 8 -> (SHRBconst x [c]) + +// large constant shifts +(Lsh32x64 _ (Const64 [c])) && uint64(c) >= 32 -> (Const32 [0]) +(Rsh32Ux64 _ (Const64 [c])) && uint64(c) >= 32 -> (Const32 [0]) +(Lsh16x64 _ (Const64 [c])) && uint64(c) >= 16 -> (Const16 [0]) +(Rsh16Ux64 _ (Const64 [c])) && uint64(c) >= 16 -> (Const16 [0]) +(Lsh8x64 _ (Const64 [c])) && uint64(c) >= 8 -> (Const8 [0]) +(Rsh8Ux64 _ (Const64 [c])) && uint64(c) >= 8 -> (Const8 [0]) + // Lowering comparisons (Less32 x y) -> (SETL (CMPL x y)) (Less16 x y) -> (SETL (CMPW x y)) @@ -241,7 +263,6 @@ (Move [s] dst src mem) && SizeAndAlign(s).Size() == 1 -> (MOVBstore dst (MOVBload src mem) mem) (Move [s] dst src mem) && SizeAndAlign(s).Size() == 2 -> (MOVWstore dst (MOVWload src mem) mem) (Move [s] dst src mem) && SizeAndAlign(s).Size() == 4 -> (MOVLstore dst (MOVLload src mem) mem) -(Move [s] dst src mem) && SizeAndAlign(s).Size() == 16 -> (MOVOstore dst (MOVOload src mem) mem) (Move [s] dst src mem) && SizeAndAlign(s).Size() == 3 -> (MOVBstore [2] dst (MOVBload [2] src mem) (MOVWstore dst (MOVWload src mem) mem)) @@ -254,21 +275,32 @@ (Move [s] dst src mem) && SizeAndAlign(s).Size() == 7 -> (MOVLstore [3] dst (MOVLload [3] src mem) (MOVLstore dst (MOVLload src mem) mem)) +(Move [s] dst src mem) && SizeAndAlign(s).Size() == 8 -> + (MOVLstore [4] dst (MOVLload [4] src mem) + (MOVLstore dst (MOVLload src mem) mem)) + +// Adjust moves to be a multiple of 4 bytes. +(Move [s] dst src mem) + && SizeAndAlign(s).Size() > 8 && SizeAndAlign(s).Size()%4 != 0 -> + (Move [SizeAndAlign(s).Size()-SizeAndAlign(s).Size()%4] + (ADDLconst dst [SizeAndAlign(s).Size()%4]) + (ADDLconst src [SizeAndAlign(s).Size()%4]) + (MOVLstore dst (MOVLload src mem) mem)) // Medium copying uses a duff device. (Move [s] dst src mem) - && SizeAndAlign(s).Size() >= 32 && SizeAndAlign(s).Size() <= 16*64 && SizeAndAlign(s).Size()%16 == 0 + && SizeAndAlign(s).Size() > 8 && SizeAndAlign(s).Size() <= 4*128 && SizeAndAlign(s).Size()%4 == 0 && !config.noDuffDevice -> - (DUFFCOPY [14*(64-SizeAndAlign(s).Size()/16)] dst src mem) -// 14 and 64 are magic constants. 14 is the number of bytes to encode: -// MOVUPS (SI), X0 -// ADDL $16, SI -// MOVUPS X0, (DI) -// ADDL $16, DI -// and 64 is the number of such blocks. See src/runtime/duff_amd64.s:duffcopy. + (DUFFCOPY [10*(128-SizeAndAlign(s).Size()/4)] dst src mem) +// 10 and 128 are magic constants. 10 is the number of bytes to encode: +// MOVL (SI), CX +// ADDL $4, SI +// MOVL CX, (DI) +// ADDL $4, DI +// and 128 is the number of such blocks. See src/runtime/duff_386.s:duffcopy. // Large copying uses REP MOVSL. 
-(Move [s] dst src mem) && (SizeAndAlign(s).Size() > 16*64 || config.noDuffDevice) && SizeAndAlign(s).Size()%8 == 0 -> +(Move [s] dst src mem) && (SizeAndAlign(s).Size() > 4*128 || config.noDuffDevice) && SizeAndAlign(s).Size()%4 == 0 -> (REPMOVSL dst src (MOVLconst [SizeAndAlign(s).Size()/4]) mem) // Lowering Zero instructions @@ -309,11 +341,22 @@ (MOVLstoreconst [makeValAndOff(0,4)] destptr (MOVLstoreconst [0] destptr mem)))) +// Medium zeroing uses a duff device. +(Zero [s] destptr mem) + && SizeAndAlign(s).Size() > 16 + && SizeAndAlign(s).Size() <= 4*128 + && SizeAndAlign(s).Size()%4 == 0 + && !config.noDuffDevice -> + (DUFFZERO [1*(128-SizeAndAlign(s).Size()/4)] destptr (MOVLconst [0]) mem) +// 1 and 128 are magic constants. 1 is the number of bytes to encode STOSL. +// 128 is the number of STOSL instructions in duffzero. +// See src/runtime/duff_386.s:duffzero. + // Large zeroing uses REP STOSQ. (Zero [s] destptr mem) - && (SizeAndAlign(s).Size() > 1024 || (config.noDuffDevice && SizeAndAlign(s).Size() > 32)) - && SizeAndAlign(s).Size()%8 == 0 -> - (REPSTOSL destptr (MOVLconst [SizeAndAlign(s).Size()/8]) (MOVLconst [0]) mem) + && (SizeAndAlign(s).Size() > 4*128 || (config.noDuffDevice && SizeAndAlign(s).Size() > 16)) + && SizeAndAlign(s).Size()%4 == 0 -> + (REPSTOSL destptr (MOVLconst [SizeAndAlign(s).Size()/4]) (MOVLconst [0]) mem) // Lowering constants (Const8 [val]) -> (MOVLconst [val]) @@ -596,14 +639,12 @@ (MOVBload [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVBload [off1+off2] {sym} ptr mem) (MOVSSload [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVSSload [off1+off2] {sym} ptr mem) (MOVSDload [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVSDload [off1+off2] {sym} ptr mem) -(MOVOload [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVOload [off1+off2] {sym} ptr mem) (MOVLstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVLstore [off1+off2] {sym} ptr val mem) (MOVWstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVWstore [off1+off2] {sym} ptr val mem) (MOVBstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVBstore [off1+off2] {sym} ptr val mem) (MOVSSstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVSSstore [off1+off2] {sym} ptr val mem) (MOVSDstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVSDstore [off1+off2] {sym} ptr val mem) -(MOVOstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVOstore [off1+off2] {sym} ptr val mem) // Fold constants into stores. 
(MOVLstore [off] {sym} ptr (MOVLconst [c]) mem) && validOff(off) -> @@ -633,8 +674,6 @@ (MOVSSload [off1+off2] {mergeSym(sym1,sym2)} base mem) (MOVSDload [off1] {sym1} (LEAL [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) -> (MOVSDload [off1+off2] {mergeSym(sym1,sym2)} base mem) -(MOVOload [off1] {sym1} (LEAL [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) -> - (MOVOload [off1+off2] {mergeSym(sym1,sym2)} base mem) (MOVBLSXload [off1] {sym1} (LEAL [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) -> (MOVBLSXload [off1+off2] {mergeSym(sym1,sym2)} base mem) @@ -651,8 +690,6 @@ (MOVSSstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) (MOVSDstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) -> (MOVSDstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) -(MOVOstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) -> - (MOVOstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) (MOVLstoreconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && ValAndOff(sc).canAdd(off) -> (MOVLstoreconst [ValAndOff(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem) diff --git a/src/cmd/compile/internal/ssa/gen/386Ops.go b/src/cmd/compile/internal/ssa/gen/386Ops.go index 68bcfa9649..49c4cd49e4 100644 --- a/src/cmd/compile/internal/ssa/gen/386Ops.go +++ b/src/cmd/compile/internal/ssa/gen/386Ops.go @@ -330,8 +330,6 @@ func init() { {name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem"}, // store byte in arg1 to arg0+auxint+aux. arg2=mem {name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem"}, // store 2 bytes in arg1 to arg0+auxint+aux. arg2=mem {name: "MOVLstore", argLength: 3, reg: gpstore, asm: "MOVL", aux: "SymOff", typ: "Mem"}, // store 4 bytes in arg1 to arg0+auxint+aux. arg2=mem - {name: "MOVOload", argLength: 2, reg: fpload, asm: "MOVUPS", aux: "SymOff", typ: "Int128"}, // load 16 bytes from arg0+auxint+aux. arg1=mem - {name: "MOVOstore", argLength: 3, reg: fpstore, asm: "MOVUPS", aux: "SymOff", typ: "Mem"}, // store 16 bytes in arg1 to arg0+auxint+aux. arg2=mem // indexed loads/stores {name: "MOVBloadidx1", argLength: 3, reg: gploadidx, asm: "MOVBLZX", aux: "SymOff"}, // load a byte from arg0+arg1+auxint+aux. arg2=mem @@ -360,7 +358,7 @@ func init() { {name: "MOVLstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", aux: "SymValAndOff", typ: "Mem"}, // store low 4 bytes of ... arg1 ... {name: "MOVLstoreconstidx4", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", aux: "SymValAndOff", typ: "Mem"}, // store low 4 bytes of ... 4*arg1 ... 
- // arg0 = (duff-adjusted) pointer to start of memory to zero + // arg0 = pointer to start of memory to zero // arg1 = value to store (will always be zero) // arg2 = mem // auxint = offset into duffzero code to start executing @@ -370,11 +368,10 @@ func init() { aux: "Int64", argLength: 3, reg: regInfo{ - inputs: []regMask{buildReg("DI"), buildReg("X0")}, + inputs: []regMask{buildReg("DI"), buildReg("AX")}, clobbers: buildReg("DI FLAGS"), }, }, - {name: "MOVOconst", reg: regInfo{nil, 0, []regMask{fp}}, typ: "Int128", aux: "Int128", rematerializeable: true}, // arg0 = address of memory to zero // arg1 = # of 4-byte words to zero @@ -407,7 +404,7 @@ func init() { argLength: 3, reg: regInfo{ inputs: []regMask{buildReg("DI"), buildReg("SI")}, - clobbers: buildReg("DI SI X0 FLAGS"), // uses X0 as a temporary + clobbers: buildReg("DI SI CX FLAGS"), // uses CX as a temporary }, }, diff --git a/src/cmd/compile/internal/ssa/gen/AMD64.rules b/src/cmd/compile/internal/ssa/gen/AMD64.rules index b429b6f627..811e810f15 100644 --- a/src/cmd/compile/internal/ssa/gen/AMD64.rules +++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules @@ -400,8 +400,8 @@ (Zero [SizeAndAlign(s).Size()-8] (ADDQconst [8] destptr) (MOVQstore destptr (MOVQconst [0]) mem)) (Zero [s] destptr mem) && SizeAndAlign(s).Size() <= 1024 && SizeAndAlign(s).Size()%16 == 0 && !config.noDuffDevice -> - (DUFFZERO [duffStart(SizeAndAlign(s).Size())] - (ADDQconst [duffAdj(SizeAndAlign(s).Size())] destptr) (MOVOconst [0]) + (DUFFZERO [duffStartAMD64(SizeAndAlign(s).Size())] + (ADDQconst [duffAdjAMD64(SizeAndAlign(s).Size())] destptr) (MOVOconst [0]) mem) // Large zeroing uses REP STOSQ. diff --git a/src/cmd/compile/internal/ssa/gen/dec64.rules b/src/cmd/compile/internal/ssa/gen/dec64.rules index 47e2933872..8b2fd27669 100644 --- a/src/cmd/compile/internal/ssa/gen/dec64.rules +++ b/src/cmd/compile/internal/ssa/gen/dec64.rules @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// This file contains rules to decompose [u]int32 types on 32-bit +// This file contains rules to decompose [u]int64 types on 32-bit // architectures. These rules work together with the decomposeBuiltIn // pass which handles phis of these types. 
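Editor's note (not part of the patch): dec64.rules decomposes 64-bit integer ops into 32-bit halves and, in doing so, emits the generic Signmask and Zeromask ops that the 386 rules above now lower to SARLconst and SBBLcarrymask. A minimal Go sketch of the intended semantics, using hypothetical helper names:

func signmask32(x int32) int32 {
	// 386 lowering: SARLconst x [31]; the arithmetic shift copies the sign
	// bit into every bit, giving all ones for negative x and zero otherwise.
	return x >> 31
}

func zeromask32(x uint32) uint32 {
	// 386 lowering: CMPL (MOVLconst [0]) x computes 0-x, which sets the carry
	// (borrow) flag exactly when x != 0; SBBLcarrymask turns that carry into
	// an all-ones mask.
	if x != 0 {
		return 0xFFFFFFFF
	}
	return 0
}

dec64.rules uses Signmask, for instance, to produce the high word when sign-extending a 32-bit value to 64 bits, and Zeromask when lowering 64-bit shifts by 32-bit counts.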
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 47cfda86b5..a09e736b79 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -285,8 +285,6 @@ const ( Op386MOVBstore Op386MOVWstore Op386MOVLstore - Op386MOVOload - Op386MOVOstore Op386MOVBloadidx1 Op386MOVWloadidx1 Op386MOVWloadidx2 @@ -306,7 +304,6 @@ const ( Op386MOVLstoreconstidx1 Op386MOVLstoreconstidx4 Op386DUFFZERO - Op386MOVOconst Op386REPSTOSL Op386CALLstatic Op386CALLclosure @@ -3152,32 +3149,6 @@ var opcodeTable = [...]opInfo{ }, }, }, - { - name: "MOVOload", - auxType: auxSymOff, - argLen: 2, - asm: x86.AMOVUPS, - reg: regInfo{ - inputs: []inputInfo{ - {0, 65791}, // AX CX DX BX SP BP SI DI SB - }, - outputs: []outputInfo{ - {0, 65280}, // X0 X1 X2 X3 X4 X5 X6 X7 - }, - }, - }, - { - name: "MOVOstore", - auxType: auxSymOff, - argLen: 3, - asm: x86.AMOVUPS, - reg: regInfo{ - inputs: []inputInfo{ - {1, 65280}, // X0 X1 X2 X3 X4 X5 X6 X7 - {0, 65791}, // AX CX DX BX SP BP SI DI SB - }, - }, - }, { name: "MOVBloadidx1", auxType: auxSymOff, @@ -3418,22 +3389,11 @@ var opcodeTable = [...]opInfo{ reg: regInfo{ inputs: []inputInfo{ {0, 128}, // DI - {1, 256}, // X0 + {1, 1}, // AX }, clobbers: 131200, // DI FLAGS }, }, - { - name: "MOVOconst", - auxType: auxInt128, - argLen: 0, - rematerializeable: true, - reg: regInfo{ - outputs: []outputInfo{ - {0, 65280}, // X0 X1 X2 X3 X4 X5 X6 X7 - }, - }, - }, { name: "REPSTOSL", argLen: 4, @@ -3502,7 +3462,7 @@ var opcodeTable = [...]opInfo{ {0, 128}, // DI {1, 64}, // SI }, - clobbers: 131520, // SI DI X0 FLAGS + clobbers: 131266, // CX SI DI FLAGS }, }, { diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go index 03c38827cc..09798eb1bd 100644 --- a/src/cmd/compile/internal/ssa/rewrite.go +++ b/src/cmd/compile/internal/ssa/rewrite.go @@ -254,39 +254,38 @@ func isSamePtr(p1, p2 *Value) bool { return false } -// DUFFZERO consists of repeated blocks of 4 MOVUPSs + ADD, -// See runtime/mkduff.go. -const ( - dzBlocks = 16 // number of MOV/ADD blocks - dzBlockLen = 4 // number of clears per block - dzBlockSize = 19 // size of instructions in a single block - dzMovSize = 4 // size of single MOV instruction w/ offset - dzAddSize = 4 // size of single ADD instruction - dzClearStep = 16 // number of bytes cleared by each MOV instruction - - dzTailLen = 4 // number of final STOSQ instructions - dzTailSize = 2 // size of single STOSQ instruction - - dzClearLen = dzClearStep * dzBlockLen // bytes cleared by one block - dzSize = dzBlocks * dzBlockSize -) - -func duffStart(size int64) int64 { - x, _ := duff(size) +func duffStartAMD64(size int64) int64 { + x, _ := duffAMD64(size) return x } -func duffAdj(size int64) int64 { - _, x := duff(size) +func duffAdjAMD64(size int64) int64 { + _, x := duffAMD64(size) return x } // duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes) // required to use the duffzero mechanism for a block of the given size. -func duff(size int64) (int64, int64) { +func duffAMD64(size int64) (int64, int64) { + // DUFFZERO consists of repeated blocks of 4 MOVUPSs + ADD, + // See runtime/mkduff.go. 
+ const ( + dzBlocks = 16 // number of MOV/ADD blocks + dzBlockLen = 4 // number of clears per block + dzBlockSize = 19 // size of instructions in a single block + dzMovSize = 4 // size of single MOV instruction w/ offset + dzAddSize = 4 // size of single ADD instruction + dzClearStep = 16 // number of bytes cleared by each MOV instruction + + dzTailLen = 4 // number of final STOSQ instructions + dzTailSize = 2 // size of single STOSQ instruction + + dzClearLen = dzClearStep * dzBlockLen // bytes cleared by one block + dzSize = dzBlocks * dzBlockSize + ) + if size < 32 || size > 1024 || size%dzClearStep != 0 { panic("bad duffzero size") } - // TODO: arch-dependent steps := size / dzClearStep blocks := steps / dzBlockLen steps %= dzBlockLen diff --git a/src/cmd/compile/internal/ssa/rewrite386.go b/src/cmd/compile/internal/ssa/rewrite386.go index f3f021493d..5d571c588f 100644 --- a/src/cmd/compile/internal/ssa/rewrite386.go +++ b/src/cmd/compile/internal/ssa/rewrite386.go @@ -240,18 +240,24 @@ func rewriteValue386(v *Value, config *Config) bool { return rewriteValue386_OpLsh16x16(v, config) case OpLsh16x32: return rewriteValue386_OpLsh16x32(v, config) + case OpLsh16x64: + return rewriteValue386_OpLsh16x64(v, config) case OpLsh16x8: return rewriteValue386_OpLsh16x8(v, config) case OpLsh32x16: return rewriteValue386_OpLsh32x16(v, config) case OpLsh32x32: return rewriteValue386_OpLsh32x32(v, config) + case OpLsh32x64: + return rewriteValue386_OpLsh32x64(v, config) case OpLsh32x8: return rewriteValue386_OpLsh32x8(v, config) case OpLsh8x16: return rewriteValue386_OpLsh8x16(v, config) case OpLsh8x32: return rewriteValue386_OpLsh8x32(v, config) + case OpLsh8x64: + return rewriteValue386_OpLsh8x64(v, config) case OpLsh8x8: return rewriteValue386_OpLsh8x8(v, config) case Op386MOVBLSX: @@ -290,10 +296,6 @@ func rewriteValue386(v *Value, config *Config) bool { return rewriteValue386_Op386MOVLstoreidx1(v, config) case Op386MOVLstoreidx4: return rewriteValue386_Op386MOVLstoreidx4(v, config) - case Op386MOVOload: - return rewriteValue386_Op386MOVOload(v, config) - case Op386MOVOstore: - return rewriteValue386_Op386MOVOstore(v, config) case Op386MOVSDload: return rewriteValue386_Op386MOVSDload(v, config) case Op386MOVSDloadidx1: @@ -428,36 +430,48 @@ func rewriteValue386(v *Value, config *Config) bool { return rewriteValue386_OpRsh16Ux16(v, config) case OpRsh16Ux32: return rewriteValue386_OpRsh16Ux32(v, config) + case OpRsh16Ux64: + return rewriteValue386_OpRsh16Ux64(v, config) case OpRsh16Ux8: return rewriteValue386_OpRsh16Ux8(v, config) case OpRsh16x16: return rewriteValue386_OpRsh16x16(v, config) case OpRsh16x32: return rewriteValue386_OpRsh16x32(v, config) + case OpRsh16x64: + return rewriteValue386_OpRsh16x64(v, config) case OpRsh16x8: return rewriteValue386_OpRsh16x8(v, config) case OpRsh32Ux16: return rewriteValue386_OpRsh32Ux16(v, config) case OpRsh32Ux32: return rewriteValue386_OpRsh32Ux32(v, config) + case OpRsh32Ux64: + return rewriteValue386_OpRsh32Ux64(v, config) case OpRsh32Ux8: return rewriteValue386_OpRsh32Ux8(v, config) case OpRsh32x16: return rewriteValue386_OpRsh32x16(v, config) case OpRsh32x32: return rewriteValue386_OpRsh32x32(v, config) + case OpRsh32x64: + return rewriteValue386_OpRsh32x64(v, config) case OpRsh32x8: return rewriteValue386_OpRsh32x8(v, config) case OpRsh8Ux16: return rewriteValue386_OpRsh8Ux16(v, config) case OpRsh8Ux32: return rewriteValue386_OpRsh8Ux32(v, config) + case OpRsh8Ux64: + return rewriteValue386_OpRsh8Ux64(v, config) case OpRsh8Ux8: return 
rewriteValue386_OpRsh8Ux8(v, config) case OpRsh8x16: return rewriteValue386_OpRsh8x16(v, config) case OpRsh8x32: return rewriteValue386_OpRsh8x32(v, config) + case OpRsh8x64: + return rewriteValue386_OpRsh8x64(v, config) case OpRsh8x8: return rewriteValue386_OpRsh8x8(v, config) case Op386SARB: @@ -516,6 +530,8 @@ func rewriteValue386(v *Value, config *Config) bool { return rewriteValue386_OpSignExt8to16(v, config) case OpSignExt8to32: return rewriteValue386_OpSignExt8to32(v, config) + case OpSignmask: + return rewriteValue386_OpSignmask(v, config) case OpSqrt: return rewriteValue386_OpSqrt(v, config) case OpStaticCall: @@ -562,6 +578,8 @@ func rewriteValue386(v *Value, config *Config) bool { return rewriteValue386_OpZeroExt8to16(v, config) case OpZeroExt8to32: return rewriteValue386_OpZeroExt8to32(v, config) + case OpZeromask: + return rewriteValue386_OpZeromask(v, config) } return false } @@ -4062,6 +4080,45 @@ func rewriteValue386_OpLsh16x32(v *Value, config *Config) bool { return true } } +func rewriteValue386_OpLsh16x64(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Lsh16x64 x (Const64 [c])) + // cond: uint64(c) < 16 + // result: (SHLLconst x [c]) + for { + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) < 16) { + break + } + v.reset(Op386SHLLconst) + v.AddArg(x) + v.AuxInt = c + return true + } + // match: (Lsh16x64 _ (Const64 [c])) + // cond: uint64(c) >= 16 + // result: (Const16 [0]) + for { + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) >= 16) { + break + } + v.reset(OpConst16) + v.AuxInt = 0 + return true + } + return false +} func rewriteValue386_OpLsh16x8(v *Value, config *Config) bool { b := v.Block _ = b @@ -4134,6 +4191,45 @@ func rewriteValue386_OpLsh32x32(v *Value, config *Config) bool { return true } } +func rewriteValue386_OpLsh32x64(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Lsh32x64 x (Const64 [c])) + // cond: uint64(c) < 32 + // result: (SHLLconst x [c]) + for { + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) < 32) { + break + } + v.reset(Op386SHLLconst) + v.AddArg(x) + v.AuxInt = c + return true + } + // match: (Lsh32x64 _ (Const64 [c])) + // cond: uint64(c) >= 32 + // result: (Const32 [0]) + for { + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) >= 32) { + break + } + v.reset(OpConst32) + v.AuxInt = 0 + return true + } + return false +} func rewriteValue386_OpLsh32x8(v *Value, config *Config) bool { b := v.Block _ = b @@ -4206,6 +4302,45 @@ func rewriteValue386_OpLsh8x32(v *Value, config *Config) bool { return true } } +func rewriteValue386_OpLsh8x64(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Lsh8x64 x (Const64 [c])) + // cond: uint64(c) < 8 + // result: (SHLLconst x [c]) + for { + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) < 8) { + break + } + v.reset(Op386SHLLconst) + v.AddArg(x) + v.AuxInt = c + return true + } + // match: (Lsh8x64 _ (Const64 [c])) + // cond: uint64(c) >= 8 + // result: (Const8 [0]) + for { + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) >= 8) { + break + } + v.reset(OpConst8) + v.AuxInt = 0 + return true + } + return false +} func rewriteValue386_OpLsh8x8(v *Value, config *Config) bool { b := v.Block _ = b @@ -5997,114 +6132,6 @@ func 
rewriteValue386_Op386MOVLstoreidx4(v *Value, config *Config) bool { } return false } -func rewriteValue386_Op386MOVOload(v *Value, config *Config) bool { - b := v.Block - _ = b - // match: (MOVOload [off1] {sym} (ADDLconst [off2] ptr) mem) - // cond: is32Bit(off1+off2) - // result: (MOVOload [off1+off2] {sym} ptr mem) - for { - off1 := v.AuxInt - sym := v.Aux - v_0 := v.Args[0] - if v_0.Op != Op386ADDLconst { - break - } - off2 := v_0.AuxInt - ptr := v_0.Args[0] - mem := v.Args[1] - if !(is32Bit(off1 + off2)) { - break - } - v.reset(Op386MOVOload) - v.AuxInt = off1 + off2 - v.Aux = sym - v.AddArg(ptr) - v.AddArg(mem) - return true - } - // match: (MOVOload [off1] {sym1} (LEAL [off2] {sym2} base) mem) - // cond: is32Bit(off1+off2) && canMergeSym(sym1, sym2) - // result: (MOVOload [off1+off2] {mergeSym(sym1,sym2)} base mem) - for { - off1 := v.AuxInt - sym1 := v.Aux - v_0 := v.Args[0] - if v_0.Op != Op386LEAL { - break - } - off2 := v_0.AuxInt - sym2 := v_0.Aux - base := v_0.Args[0] - mem := v.Args[1] - if !(is32Bit(off1+off2) && canMergeSym(sym1, sym2)) { - break - } - v.reset(Op386MOVOload) - v.AuxInt = off1 + off2 - v.Aux = mergeSym(sym1, sym2) - v.AddArg(base) - v.AddArg(mem) - return true - } - return false -} -func rewriteValue386_Op386MOVOstore(v *Value, config *Config) bool { - b := v.Block - _ = b - // match: (MOVOstore [off1] {sym} (ADDLconst [off2] ptr) val mem) - // cond: is32Bit(off1+off2) - // result: (MOVOstore [off1+off2] {sym} ptr val mem) - for { - off1 := v.AuxInt - sym := v.Aux - v_0 := v.Args[0] - if v_0.Op != Op386ADDLconst { - break - } - off2 := v_0.AuxInt - ptr := v_0.Args[0] - val := v.Args[1] - mem := v.Args[2] - if !(is32Bit(off1 + off2)) { - break - } - v.reset(Op386MOVOstore) - v.AuxInt = off1 + off2 - v.Aux = sym - v.AddArg(ptr) - v.AddArg(val) - v.AddArg(mem) - return true - } - // match: (MOVOstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem) - // cond: is32Bit(off1+off2) && canMergeSym(sym1, sym2) - // result: (MOVOstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) - for { - off1 := v.AuxInt - sym1 := v.Aux - v_0 := v.Args[0] - if v_0.Op != Op386LEAL { - break - } - off2 := v_0.AuxInt - sym2 := v_0.Aux - base := v_0.Args[0] - val := v.Args[1] - mem := v.Args[2] - if !(is32Bit(off1+off2) && canMergeSym(sym1, sym2)) { - break - } - v.reset(Op386MOVOstore) - v.AuxInt = off1 + off2 - v.Aux = mergeSym(sym1, sym2) - v.AddArg(base) - v.AddArg(val) - v.AddArg(mem) - return true - } - return false -} func rewriteValue386_Op386MOVSDload(v *Value, config *Config) bool { b := v.Block _ = b @@ -9073,26 +9100,6 @@ func rewriteValue386_OpMove(v *Value, config *Config) bool { return true } // match: (Move [s] dst src mem) - // cond: SizeAndAlign(s).Size() == 16 - // result: (MOVOstore dst (MOVOload src mem) mem) - for { - s := v.AuxInt - dst := v.Args[0] - src := v.Args[1] - mem := v.Args[2] - if !(SizeAndAlign(s).Size() == 16) { - break - } - v.reset(Op386MOVOstore) - v.AddArg(dst) - v0 := b.NewValue0(v.Line, Op386MOVOload, TypeInt128) - v0.AddArg(src) - v0.AddArg(mem) - v.AddArg(v0) - v.AddArg(mem) - return true - } - // match: (Move [s] dst src mem) // cond: SizeAndAlign(s).Size() == 3 // result: (MOVBstore [2] dst (MOVBload [2] src mem) (MOVWstore dst (MOVWload src mem) mem)) for { @@ -9209,32 +9216,92 @@ func rewriteValue386_OpMove(v *Value, config *Config) bool { return true } // match: (Move [s] dst src mem) - // cond: SizeAndAlign(s).Size() >= 32 && SizeAndAlign(s).Size() <= 16*64 && SizeAndAlign(s).Size()%16 == 0 && !config.noDuffDevice - // result: 
(DUFFCOPY [14*(64-SizeAndAlign(s).Size()/16)] dst src mem) + // cond: SizeAndAlign(s).Size() == 8 + // result: (MOVLstore [4] dst (MOVLload [4] src mem) (MOVLstore dst (MOVLload src mem) mem)) + for { + s := v.AuxInt + dst := v.Args[0] + src := v.Args[1] + mem := v.Args[2] + if !(SizeAndAlign(s).Size() == 8) { + break + } + v.reset(Op386MOVLstore) + v.AuxInt = 4 + v.AddArg(dst) + v0 := b.NewValue0(v.Line, Op386MOVLload, config.fe.TypeUInt32()) + v0.AuxInt = 4 + v0.AddArg(src) + v0.AddArg(mem) + v.AddArg(v0) + v1 := b.NewValue0(v.Line, Op386MOVLstore, TypeMem) + v1.AddArg(dst) + v2 := b.NewValue0(v.Line, Op386MOVLload, config.fe.TypeUInt32()) + v2.AddArg(src) + v2.AddArg(mem) + v1.AddArg(v2) + v1.AddArg(mem) + v.AddArg(v1) + return true + } + // match: (Move [s] dst src mem) + // cond: SizeAndAlign(s).Size() > 8 && SizeAndAlign(s).Size()%4 != 0 + // result: (Move [SizeAndAlign(s).Size()-SizeAndAlign(s).Size()%4] (ADDLconst dst [SizeAndAlign(s).Size()%4]) (ADDLconst src [SizeAndAlign(s).Size()%4]) (MOVLstore dst (MOVLload src mem) mem)) for { s := v.AuxInt dst := v.Args[0] src := v.Args[1] mem := v.Args[2] - if !(SizeAndAlign(s).Size() >= 32 && SizeAndAlign(s).Size() <= 16*64 && SizeAndAlign(s).Size()%16 == 0 && !config.noDuffDevice) { + if !(SizeAndAlign(s).Size() > 8 && SizeAndAlign(s).Size()%4 != 0) { + break + } + v.reset(OpMove) + v.AuxInt = SizeAndAlign(s).Size() - SizeAndAlign(s).Size()%4 + v0 := b.NewValue0(v.Line, Op386ADDLconst, dst.Type) + v0.AddArg(dst) + v0.AuxInt = SizeAndAlign(s).Size() % 4 + v.AddArg(v0) + v1 := b.NewValue0(v.Line, Op386ADDLconst, src.Type) + v1.AddArg(src) + v1.AuxInt = SizeAndAlign(s).Size() % 4 + v.AddArg(v1) + v2 := b.NewValue0(v.Line, Op386MOVLstore, TypeMem) + v2.AddArg(dst) + v3 := b.NewValue0(v.Line, Op386MOVLload, config.fe.TypeUInt32()) + v3.AddArg(src) + v3.AddArg(mem) + v2.AddArg(v3) + v2.AddArg(mem) + v.AddArg(v2) + return true + } + // match: (Move [s] dst src mem) + // cond: SizeAndAlign(s).Size() > 8 && SizeAndAlign(s).Size() <= 4*128 && SizeAndAlign(s).Size()%4 == 0 && !config.noDuffDevice + // result: (DUFFCOPY [10*(128-SizeAndAlign(s).Size()/4)] dst src mem) + for { + s := v.AuxInt + dst := v.Args[0] + src := v.Args[1] + mem := v.Args[2] + if !(SizeAndAlign(s).Size() > 8 && SizeAndAlign(s).Size() <= 4*128 && SizeAndAlign(s).Size()%4 == 0 && !config.noDuffDevice) { break } v.reset(Op386DUFFCOPY) - v.AuxInt = 14 * (64 - SizeAndAlign(s).Size()/16) + v.AuxInt = 10 * (128 - SizeAndAlign(s).Size()/4) v.AddArg(dst) v.AddArg(src) v.AddArg(mem) return true } // match: (Move [s] dst src mem) - // cond: (SizeAndAlign(s).Size() > 16*64 || config.noDuffDevice) && SizeAndAlign(s).Size()%8 == 0 + // cond: (SizeAndAlign(s).Size() > 4*128 || config.noDuffDevice) && SizeAndAlign(s).Size()%4 == 0 // result: (REPMOVSL dst src (MOVLconst [SizeAndAlign(s).Size()/4]) mem) for { s := v.AuxInt dst := v.Args[0] src := v.Args[1] mem := v.Args[2] - if !((SizeAndAlign(s).Size() > 16*64 || config.noDuffDevice) && SizeAndAlign(s).Size()%8 == 0) { + if !((SizeAndAlign(s).Size() > 4*128 || config.noDuffDevice) && SizeAndAlign(s).Size()%4 == 0) { break } v.reset(Op386REPMOVSL) @@ -10006,32 +10073,16 @@ func rewriteValue386_OpOffPtr(v *Value, config *Config) bool { b := v.Block _ = b // match: (OffPtr [off] ptr) - // cond: is32Bit(off) + // cond: // result: (ADDLconst [off] ptr) for { off := v.AuxInt ptr := v.Args[0] - if !(is32Bit(off)) { - break - } v.reset(Op386ADDLconst) v.AuxInt = off v.AddArg(ptr) return true } - // match: (OffPtr [off] ptr) - // cond: - // result: 
(ADDL (MOVLconst [off]) ptr) - for { - off := v.AuxInt - ptr := v.Args[0] - v.reset(Op386ADDL) - v0 := b.NewValue0(v.Line, Op386MOVLconst, config.fe.TypeUInt32()) - v0.AuxInt = off - v.AddArg(v0) - v.AddArg(ptr) - return true - } } func rewriteValue386_OpOr16(v *Value, config *Config) bool { b := v.Block @@ -10243,6 +10294,45 @@ func rewriteValue386_OpRsh16Ux32(v *Value, config *Config) bool { return true } } +func rewriteValue386_OpRsh16Ux64(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Rsh16Ux64 x (Const64 [c])) + // cond: uint64(c) < 16 + // result: (SHRWconst x [c]) + for { + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) < 16) { + break + } + v.reset(Op386SHRWconst) + v.AddArg(x) + v.AuxInt = c + return true + } + // match: (Rsh16Ux64 _ (Const64 [c])) + // cond: uint64(c) >= 16 + // result: (Const16 [0]) + for { + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) >= 16) { + break + } + v.reset(OpConst16) + v.AuxInt = 0 + return true + } + return false +} func rewriteValue386_OpRsh16Ux8(v *Value, config *Config) bool { b := v.Block _ = b @@ -10321,6 +10411,29 @@ func rewriteValue386_OpRsh16x32(v *Value, config *Config) bool { return true } } +func rewriteValue386_OpRsh16x64(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Rsh16x64 x (Const64 [c])) + // cond: uint64(c) < 16 + // result: (SARWconst x [c]) + for { + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) < 16) { + break + } + v.reset(Op386SARWconst) + v.AddArg(x) + v.AuxInt = c + return true + } + return false +} func rewriteValue386_OpRsh16x8(v *Value, config *Config) bool { b := v.Block _ = b @@ -10396,6 +10509,45 @@ func rewriteValue386_OpRsh32Ux32(v *Value, config *Config) bool { return true } } +func rewriteValue386_OpRsh32Ux64(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Rsh32Ux64 x (Const64 [c])) + // cond: uint64(c) < 32 + // result: (SHRLconst x [c]) + for { + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) < 32) { + break + } + v.reset(Op386SHRLconst) + v.AddArg(x) + v.AuxInt = c + return true + } + // match: (Rsh32Ux64 _ (Const64 [c])) + // cond: uint64(c) >= 32 + // result: (Const32 [0]) + for { + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) >= 32) { + break + } + v.reset(OpConst32) + v.AuxInt = 0 + return true + } + return false +} func rewriteValue386_OpRsh32Ux8(v *Value, config *Config) bool { b := v.Block _ = b @@ -10474,6 +10626,29 @@ func rewriteValue386_OpRsh32x32(v *Value, config *Config) bool { return true } } +func rewriteValue386_OpRsh32x64(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Rsh32x64 x (Const64 [c])) + // cond: uint64(c) < 32 + // result: (SARLconst x [c]) + for { + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) < 32) { + break + } + v.reset(Op386SARLconst) + v.AddArg(x) + v.AuxInt = c + return true + } + return false +} func rewriteValue386_OpRsh32x8(v *Value, config *Config) bool { b := v.Block _ = b @@ -10549,6 +10724,45 @@ func rewriteValue386_OpRsh8Ux32(v *Value, config *Config) bool { return true } } +func rewriteValue386_OpRsh8Ux64(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Rsh8Ux64 x (Const64 [c])) + // cond: uint64(c) < 8 + // result: (SHRBconst x 
[c]) + for { + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) < 8) { + break + } + v.reset(Op386SHRBconst) + v.AddArg(x) + v.AuxInt = c + return true + } + // match: (Rsh8Ux64 _ (Const64 [c])) + // cond: uint64(c) >= 8 + // result: (Const8 [0]) + for { + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) >= 8) { + break + } + v.reset(OpConst8) + v.AuxInt = 0 + return true + } + return false +} func rewriteValue386_OpRsh8Ux8(v *Value, config *Config) bool { b := v.Block _ = b @@ -10627,6 +10841,29 @@ func rewriteValue386_OpRsh8x32(v *Value, config *Config) bool { return true } } +func rewriteValue386_OpRsh8x64(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Rsh8x64 x (Const64 [c])) + // cond: uint64(c) < 8 + // result: (SARBconst x [c]) + for { + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) < 8) { + break + } + v.reset(Op386SARBconst) + v.AddArg(x) + v.AuxInt = c + return true + } + return false +} func rewriteValue386_OpRsh8x8(v *Value, config *Config) bool { b := v.Block _ = b @@ -12014,6 +12251,20 @@ func rewriteValue386_OpSignExt8to32(v *Value, config *Config) bool { return true } } +func rewriteValue386_OpSignmask(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Signmask x) + // cond: + // result: (SARLconst x [31]) + for { + x := v.Args[0] + v.reset(Op386SARLconst) + v.AddArg(x) + v.AuxInt = 31 + return true + } +} func rewriteValue386_OpSqrt(v *Value, config *Config) bool { b := v.Block _ = b @@ -12681,19 +12932,38 @@ func rewriteValue386_OpZero(v *Value, config *Config) bool { return true } // match: (Zero [s] destptr mem) - // cond: (SizeAndAlign(s).Size() > 1024 || (config.noDuffDevice && SizeAndAlign(s).Size() > 32)) && SizeAndAlign(s).Size()%8 == 0 - // result: (REPSTOSL destptr (MOVLconst [SizeAndAlign(s).Size()/8]) (MOVLconst [0]) mem) + // cond: SizeAndAlign(s).Size() > 16 && SizeAndAlign(s).Size() <= 4*128 && SizeAndAlign(s).Size()%4 == 0 && !config.noDuffDevice + // result: (DUFFZERO [1*(128-SizeAndAlign(s).Size()/4)] destptr (MOVLconst [0]) mem) + for { + s := v.AuxInt + destptr := v.Args[0] + mem := v.Args[1] + if !(SizeAndAlign(s).Size() > 16 && SizeAndAlign(s).Size() <= 4*128 && SizeAndAlign(s).Size()%4 == 0 && !config.noDuffDevice) { + break + } + v.reset(Op386DUFFZERO) + v.AuxInt = 1 * (128 - SizeAndAlign(s).Size()/4) + v.AddArg(destptr) + v0 := b.NewValue0(v.Line, Op386MOVLconst, config.fe.TypeUInt32()) + v0.AuxInt = 0 + v.AddArg(v0) + v.AddArg(mem) + return true + } + // match: (Zero [s] destptr mem) + // cond: (SizeAndAlign(s).Size() > 4*128 || (config.noDuffDevice && SizeAndAlign(s).Size() > 16)) && SizeAndAlign(s).Size()%4 == 0 + // result: (REPSTOSL destptr (MOVLconst [SizeAndAlign(s).Size()/4]) (MOVLconst [0]) mem) for { s := v.AuxInt destptr := v.Args[0] mem := v.Args[1] - if !((SizeAndAlign(s).Size() > 1024 || (config.noDuffDevice && SizeAndAlign(s).Size() > 32)) && SizeAndAlign(s).Size()%8 == 0) { + if !((SizeAndAlign(s).Size() > 4*128 || (config.noDuffDevice && SizeAndAlign(s).Size() > 16)) && SizeAndAlign(s).Size()%4 == 0) { break } v.reset(Op386REPSTOSL) v.AddArg(destptr) v0 := b.NewValue0(v.Line, Op386MOVLconst, config.fe.TypeUInt32()) - v0.AuxInt = SizeAndAlign(s).Size() / 8 + v0.AuxInt = SizeAndAlign(s).Size() / 4 v.AddArg(v0) v1 := b.NewValue0(v.Line, Op386MOVLconst, config.fe.TypeUInt32()) v1.AuxInt = 0 @@ -12742,6 +13012,24 @@ func 
rewriteValue386_OpZeroExt8to32(v *Value, config *Config) bool { return true } } +func rewriteValue386_OpZeromask(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Zeromask x) + // cond: + // result: (SBBLcarrymask (CMPL (MOVLconst [0]) x)) + for { + x := v.Args[0] + v.reset(Op386SBBLcarrymask) + v0 := b.NewValue0(v.Line, Op386CMPL, TypeFlags) + v1 := b.NewValue0(v.Line, Op386MOVLconst, config.fe.TypeUInt32()) + v1.AuxInt = 0 + v0.AddArg(v1) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} func rewriteBlock386(b *Block) bool { switch b.Kind { case Block386EQ: diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 9888d065cd..01c268f70b 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -17415,7 +17415,7 @@ func rewriteValueAMD64_OpZero(v *Value, config *Config) bool { } // match: (Zero [s] destptr mem) // cond: SizeAndAlign(s).Size() <= 1024 && SizeAndAlign(s).Size()%16 == 0 && !config.noDuffDevice - // result: (DUFFZERO [duffStart(SizeAndAlign(s).Size())] (ADDQconst [duffAdj(SizeAndAlign(s).Size())] destptr) (MOVOconst [0]) mem) + // result: (DUFFZERO [duffStartAMD64(SizeAndAlign(s).Size())] (ADDQconst [duffAdjAMD64(SizeAndAlign(s).Size())] destptr) (MOVOconst [0]) mem) for { s := v.AuxInt destptr := v.Args[0] @@ -17424,9 +17424,9 @@ func rewriteValueAMD64_OpZero(v *Value, config *Config) bool { break } v.reset(OpAMD64DUFFZERO) - v.AuxInt = duffStart(SizeAndAlign(s).Size()) + v.AuxInt = duffStartAMD64(SizeAndAlign(s).Size()) v0 := b.NewValue0(v.Line, OpAMD64ADDQconst, config.fe.TypeUInt64()) - v0.AuxInt = duffAdj(SizeAndAlign(s).Size()) + v0.AuxInt = duffAdjAMD64(SizeAndAlign(s).Size()) v0.AddArg(destptr) v.AddArg(v0) v1 := b.NewValue0(v.Line, OpAMD64MOVOconst, TypeInt128) diff --git a/src/cmd/compile/internal/x86/ssa.go b/src/cmd/compile/internal/x86/ssa.go index ab6410b1c3..f83afa1a58 100644 --- a/src/cmd/compile/internal/x86/ssa.go +++ b/src/cmd/compile/internal/x86/ssa.go @@ -101,11 +101,14 @@ func storeByType(t ssa.Type) obj.As { // moveByType returns the reg->reg move instruction of the given type. func moveByType(t ssa.Type) obj.As { if t.IsFloat() { - // Moving the whole sse2 register is faster - // than moving just the correct low portion of it. - // There is no xmm->xmm move with 1 byte opcode, - // so use movups, which has 2 byte opcode. 
- return x86.AMOVUPS + switch t.Size() { + case 4: + return x86.AMOVSS + case 8: + return x86.AMOVSD + default: + panic(fmt.Sprintf("bad float register width %d:%s", t.Size(), t)) + } } else { switch t.Size() { case 1: @@ -115,8 +118,6 @@ func moveByType(t ssa.Type) obj.As { return x86.AMOVL case 4: return x86.AMOVL - case 16: - return x86.AMOVUPS // int128s are in SSE registers default: panic(fmt.Sprintf("bad int register width %d:%s", t.Size(), t)) } @@ -448,7 +449,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { p.From.Val = math.Float64frombits(uint64(v.AuxInt)) p.To.Type = obj.TYPE_REG p.To.Reg = x - case ssa.Op386MOVSSload, ssa.Op386MOVSDload, ssa.Op386MOVLload, ssa.Op386MOVWload, ssa.Op386MOVBload, ssa.Op386MOVBLSXload, ssa.Op386MOVWLSXload, ssa.Op386MOVOload: + case ssa.Op386MOVSSload, ssa.Op386MOVSDload, ssa.Op386MOVLload, ssa.Op386MOVWload, ssa.Op386MOVBload, ssa.Op386MOVBLSXload, ssa.Op386MOVWLSXload: p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_MEM p.From.Reg = gc.SSARegNum(v.Args[0]) @@ -496,7 +497,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { gc.AddAux(&p.From, v) p.To.Type = obj.TYPE_REG p.To.Reg = gc.SSARegNum(v) - case ssa.Op386MOVSSstore, ssa.Op386MOVSDstore, ssa.Op386MOVLstore, ssa.Op386MOVWstore, ssa.Op386MOVBstore, ssa.Op386MOVOstore: + case ssa.Op386MOVSSstore, ssa.Op386MOVSDstore, ssa.Op386MOVLstore, ssa.Op386MOVWstore, ssa.Op386MOVBstore: p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG p.From.Reg = gc.SSARegNum(v.Args[1]) @@ -584,12 +585,6 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { p.To.Type = obj.TYPE_ADDR p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg)) p.To.Offset = v.AuxInt - case ssa.Op386MOVOconst: - if v.AuxInt != 0 { - v.Unimplementedf("MOVOconst can only do constant=0") - } - r := gc.SSARegNum(v) - opregreg(x86.AXORPS, r, r) case ssa.Op386DUFFCOPY: p := gc.Prog(obj.ADUFFCOPY) p.To.Type = obj.TYPE_ADDR @@ -828,8 +823,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { case ssa.Op386MOVLload, ssa.Op386MOVWload, ssa.Op386MOVBload, ssa.Op386MOVLstore, ssa.Op386MOVWstore, ssa.Op386MOVBstore, ssa.Op386MOVBLSXload, ssa.Op386MOVWLSXload, - ssa.Op386MOVSSload, ssa.Op386MOVSDload, ssa.Op386MOVOload, - ssa.Op386MOVSSstore, ssa.Op386MOVSDstore, ssa.Op386MOVOstore: + ssa.Op386MOVSSload, ssa.Op386MOVSDload, + ssa.Op386MOVSSstore, ssa.Op386MOVSDstore: if w.Args[0] == v.Args[0] && w.Aux == nil && w.AuxInt >= 0 && w.AuxInt < minZeroPage { if gc.Debug_checknil != 0 && int(v.Line) > 1 { gc.Warnl(v.Line, "removed nil check") -- 2.48.1
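Editor's note (not part of the patch): the DUFFCOPY and DUFFZERO aux offsets chosen by the new 386 rules follow directly from the magic constants documented above: duffcopy is 128 copy blocks of 10 bytes each, and duffzero is 128 one-byte STOSL instructions. A minimal sketch of the entry-offset computation, using hypothetical helper names, assuming a size the rules accept (a multiple of 4 within the duff range):

func duffCopyEntry386(size int64) int64 {
	// Each duffcopy block is 10 bytes of code:
	//   MOVL (SI), CX; ADDL $4, SI; MOVL CX, (DI); ADDL $4, DI
	// so jumping to offset 10*(128 - size/4) skips the blocks that are not needed.
	return 10 * (128 - size/4)
}

func duffZeroEntry386(size int64) int64 {
	// duffzero is a run of 128 STOSL instructions, each 1 byte long.
	return 1 * (128 - size/4)
}

For example, a 64-byte Move selects DUFFCOPY [10*(128-16)] = [1120], entering duffcopy just in time to execute the final 16 copy blocks.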