cmd/compile/internal/ssagen: factor out intrinsics code
author Joel Sing <joel@sing.id.au>
Fri, 2 Aug 2024 15:22:58 +0000 (01:22 +1000)
committer Joel Sing <joel@sing.id.au>
Tue, 20 Aug 2024 14:20:34 +0000 (14:20 +0000)
The intrinsic handling code is a good thousand lines in the fairly
large ssa.go file. This code is already reasonably self-contained;
factor it out into a separate file so that future changes are easier
to manage (and it becomes easier to add/change intrinsics for an
architecture).
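
For illustration, registering an intrinsic with the helpers in the
new file looks roughly like this (the package and function name are
hypothetical; the builder body mirrors the real math.Sqrt
registration):

    addF("example/pkg", "Func",
            func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
                    return s.newValue1(ssa.OpSqrt, types.Types[types.TFLOAT64], args[0])
            },
            sys.AMD64, sys.ARM64)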

Change-Id: I3c18d3d1bb6332f1817d902150e736373bf1ac81
Reviewed-on: https://go-review.googlesource.com/c/go/+/605477
Reviewed-by: Carlos Amedee <carlos@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
src/cmd/compile/internal/ssagen/intrinsics.go [new file with mode: 0644]
src/cmd/compile/internal/ssagen/ssa.go

diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go
new file mode 100644
index 0000000..f44531b
--- /dev/null
+++ b/src/cmd/compile/internal/ssagen/intrinsics.go
@@ -0,0 +1,1047 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package ssagen
+
+import (
+       "fmt"
+       "internal/buildcfg"
+
+       "cmd/compile/internal/base"
+       "cmd/compile/internal/ir"
+       "cmd/compile/internal/ssa"
+       "cmd/compile/internal/types"
+       "cmd/internal/sys"
+)
+
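+// intrinsics stores the registered intrinsic builders, keyed by
+// architecture, package path and function name.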
+var intrinsics map[intrinsicKey]intrinsicBuilder
+
+// An intrinsicBuilder converts a call node n into an ssa value that
+// implements that call as an intrinsic. args is a list of arguments to the func.
+type intrinsicBuilder func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value
+
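+// An intrinsicKey identifies an intrinsified function on a particular
+// architecture, by package path and function name.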
+type intrinsicKey struct {
+       arch *sys.Arch
+       pkg  string
+       fn   string
+}
+
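+// initIntrinsics registers the intrinsic builders for every intrinsified
+// function on every architecture that supports it.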
+func initIntrinsics() {
+       intrinsics = map[intrinsicKey]intrinsicBuilder{}
+
+       var p4 []*sys.Arch        // architectures with 4-byte pointers
+       var p8 []*sys.Arch        // architectures with 8-byte pointers
+       var lwatomics []*sys.Arch // architectures whose plain atomics also provide acquire/release semantics (all but PPC64)
+       for _, a := range sys.Archs {
+               if a.PtrSize == 4 {
+                       p4 = append(p4, a)
+               } else {
+                       p8 = append(p8, a)
+               }
+               if a.Family != sys.PPC64 {
+                       lwatomics = append(lwatomics, a)
+               }
+       }
+       all := sys.Archs[:]
+
+       // add adds the intrinsic b for pkg.fn for the given list of architectures.
+       add := func(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) {
+               for _, a := range archs {
+                       intrinsics[intrinsicKey{a, pkg, fn}] = b
+               }
+       }
+       // addF does the same as add but operates on architecture families.
+       addF := func(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) {
+               for _, a := range sys.Archs {
+                       if a.InFamily(archFamilies...) {
+                               intrinsics[intrinsicKey{a, pkg, fn}] = b
+                       }
+               }
+       }
+       // alias defines pkg.fn = pkg2.fn2 for all architectures in archs for which pkg2.fn2 exists.
+       alias := func(pkg, fn, pkg2, fn2 string, archs ...*sys.Arch) {
+               aliased := false
+               for _, a := range archs {
+                       if b, ok := intrinsics[intrinsicKey{a, pkg2, fn2}]; ok {
+                               intrinsics[intrinsicKey{a, pkg, fn}] = b
+                               aliased = true
+                       }
+               }
+               if !aliased {
+                       panic(fmt.Sprintf("attempted to alias %s.%s to undefined intrinsic: %s.%s", pkg, fn, pkg2, fn2))
+               }
+       }
+
+       /******** runtime ********/
+       if !base.Flag.Cfg.Instrumenting {
+               add("runtime", "slicebytetostringtmp",
+                       func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                               // Compiler frontend optimizations emit OBYTES2STRTMP nodes
+                               // for the backend instead of slicebytetostringtmp calls
+                               // when not instrumenting.
+                               return s.newValue2(ssa.OpStringMake, n.Type(), args[0], args[1])
+                       },
+                       all...)
+       }
+       addF("internal/runtime/math", "MulUintptr",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       if s.config.PtrSize == 4 {
+                               return s.newValue2(ssa.OpMul32uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1])
+                       }
+                       return s.newValue2(ssa.OpMul64uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1])
+               },
+               sys.AMD64, sys.I386, sys.Loong64, sys.MIPS64, sys.RISCV64, sys.ARM64)
+       add("runtime", "KeepAlive",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       data := s.newValue1(ssa.OpIData, s.f.Config.Types.BytePtr, args[0])
+                       s.vars[memVar] = s.newValue2(ssa.OpKeepAlive, types.TypeMem, data, s.mem())
+                       return nil
+               },
+               all...)
+       add("runtime", "getclosureptr",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue0(ssa.OpGetClosurePtr, s.f.Config.Types.Uintptr)
+               },
+               all...)
+
+       add("runtime", "getcallerpc",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue0(ssa.OpGetCallerPC, s.f.Config.Types.Uintptr)
+               },
+               all...)
+
+       add("runtime", "getcallersp",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpGetCallerSP, s.f.Config.Types.Uintptr, s.mem())
+               },
+               all...)
+
+       addF("runtime", "publicationBarrier",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       s.vars[memVar] = s.newValue1(ssa.OpPubBarrier, types.TypeMem, s.mem())
+                       return nil
+               },
+               sys.ARM64, sys.PPC64, sys.RISCV64)
+
+       brev_arch := []sys.ArchFamily{sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X}
+       if buildcfg.GOPPC64 >= 10 {
+               // Use only on Power10, as the new byte reverse instructions
+               // available on Power10 make this worthwhile as an intrinsic.
+               brev_arch = append(brev_arch, sys.PPC64)
+       }
+       /******** internal/runtime/sys ********/
+       addF("internal/runtime/sys", "Bswap32",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
+               },
+               brev_arch...)
+       addF("internal/runtime/sys", "Bswap64",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
+               },
+               brev_arch...)
+
+       /****** Prefetch ******/
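+       // makePrefetchFunc returns an intrinsic builder that emits op on the
+       // address in args[0]. Prefetches thread the memory state but produce
+       // no result.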
+       makePrefetchFunc := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+               return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       s.vars[memVar] = s.newValue2(op, types.TypeMem, args[0], s.mem())
+                       return nil
+               }
+       }
+
+       // Make Prefetch intrinsics for supported platforms.
+       // On unsupported platforms the stub function will be eliminated.
+       addF("internal/runtime/sys", "Prefetch", makePrefetchFunc(ssa.OpPrefetchCache),
+               sys.AMD64, sys.ARM64, sys.PPC64)
+       addF("internal/runtime/sys", "PrefetchStreamed", makePrefetchFunc(ssa.OpPrefetchCacheStreamed),
+               sys.AMD64, sys.ARM64, sys.PPC64)
+
+       /******** internal/runtime/atomic ********/
+       addF("internal/runtime/atomic", "Load",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       v := s.newValue2(ssa.OpAtomicLoad32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem())
+                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+                       return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
+               },
+               sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+       addF("internal/runtime/atomic", "Load8",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       v := s.newValue2(ssa.OpAtomicLoad8, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], s.mem())
+                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+                       return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], v)
+               },
+               sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+       addF("internal/runtime/atomic", "Load64",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       v := s.newValue2(ssa.OpAtomicLoad64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem())
+                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+                       return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
+               },
+               sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+       addF("internal/runtime/atomic", "LoadAcq",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       v := s.newValue2(ssa.OpAtomicLoadAcq32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem())
+                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+                       return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
+               },
+               sys.PPC64, sys.S390X)
+       addF("internal/runtime/atomic", "LoadAcq64",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       v := s.newValue2(ssa.OpAtomicLoadAcq64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem())
+                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+                       return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
+               },
+               sys.PPC64)
+       addF("internal/runtime/atomic", "Loadp",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       v := s.newValue2(ssa.OpAtomicLoadPtr, types.NewTuple(s.f.Config.Types.BytePtr, types.TypeMem), args[0], s.mem())
+                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+                       return s.newValue1(ssa.OpSelect0, s.f.Config.Types.BytePtr, v)
+               },
+               sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+
+       addF("internal/runtime/atomic", "Store",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       s.vars[memVar] = s.newValue3(ssa.OpAtomicStore32, types.TypeMem, args[0], args[1], s.mem())
+                       return nil
+               },
+               sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+       addF("internal/runtime/atomic", "Store8",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       s.vars[memVar] = s.newValue3(ssa.OpAtomicStore8, types.TypeMem, args[0], args[1], s.mem())
+                       return nil
+               },
+               sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+       addF("internal/runtime/atomic", "Store64",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64, types.TypeMem, args[0], args[1], s.mem())
+                       return nil
+               },
+               sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+       addF("internal/runtime/atomic", "StorepNoWB",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       s.vars[memVar] = s.newValue3(ssa.OpAtomicStorePtrNoWB, types.TypeMem, args[0], args[1], s.mem())
+                       return nil
+               },
+               sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.RISCV64, sys.S390X)
+       addF("internal/runtime/atomic", "StoreRel",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel32, types.TypeMem, args[0], args[1], s.mem())
+                       return nil
+               },
+               sys.PPC64, sys.S390X)
+       addF("internal/runtime/atomic", "StoreRel64",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel64, types.TypeMem, args[0], args[1], s.mem())
+                       return nil
+               },
+               sys.PPC64)
+
+       addF("internal/runtime/atomic", "Xchg",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       v := s.newValue3(ssa.OpAtomicExchange32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
+                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+                       return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
+               },
+               sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+       addF("internal/runtime/atomic", "Xchg64",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       v := s.newValue3(ssa.OpAtomicExchange64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
+                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+                       return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
+               },
+               sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+
+       type atomicOpEmitter func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool)
+
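+       // makeAtomicGuardedIntrinsicARM64common returns an intrinsic builder for an
+       // ARM64 atomic operation, where op1 is the LSE variant and op0 the fallback.
+       // When GOARM64 guarantees LSE, op1 is emitted unconditionally; otherwise the
+       // builder branches at runtime on ARM64HasATOMICS and merges the two paths.
+       // needReturn reports whether the operation produces a value.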
+       makeAtomicGuardedIntrinsicARM64common := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter, needReturn bool) intrinsicBuilder {
+               return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       if buildcfg.GOARM64.LSE {
+                               emit(s, n, args, op1, typ, needReturn)
+                       } else {
+                               // Whether the target supports LSE atomics is determined by dynamic detection.
+                               addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARM64HasATOMICS, s.sb)
+                               v := s.load(types.Types[types.TBOOL], addr)
+                               b := s.endBlock()
+                               b.Kind = ssa.BlockIf
+                               b.SetControl(v)
+                               bTrue := s.f.NewBlock(ssa.BlockPlain)
+                               bFalse := s.f.NewBlock(ssa.BlockPlain)
+                               bEnd := s.f.NewBlock(ssa.BlockPlain)
+                               b.AddEdgeTo(bTrue)
+                               b.AddEdgeTo(bFalse)
+                               b.Likely = ssa.BranchLikely
+
+                               // We have the atomic instructions - use them directly.
+                               s.startBlock(bTrue)
+                               emit(s, n, args, op1, typ, needReturn)
+                               s.endBlock().AddEdgeTo(bEnd)
+
+                               // Use the original instruction sequence.
+                               s.startBlock(bFalse)
+                               emit(s, n, args, op0, typ, needReturn)
+                               s.endBlock().AddEdgeTo(bEnd)
+
+                               // Merge results.
+                               s.startBlock(bEnd)
+                       }
+                       if needReturn {
+                               return s.variable(n, types.Types[typ])
+                       }
+                       return nil
+               }
+       }
+       makeAtomicGuardedIntrinsicARM64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
+               return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, true)
+       }
+       makeAtomicGuardedIntrinsicARM64old := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
+               return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, false)
+       }
+
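+       // atomicEmitterARM64 emits op on args[0] and args[1], threads the
+       // memory state, and stores the result in s.vars[n] when needReturn
+       // is set.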
+       atomicEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
+               v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem())
+               s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+               if needReturn {
+                       s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
+               }
+       }
+       addF("internal/runtime/atomic", "Xchg",
+               makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange32, ssa.OpAtomicExchange32Variant, types.TUINT32, atomicEmitterARM64),
+               sys.ARM64)
+       addF("internal/runtime/atomic", "Xchg64",
+               makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange64, ssa.OpAtomicExchange64Variant, types.TUINT64, atomicEmitterARM64),
+               sys.ARM64)
+
+       addF("internal/runtime/atomic", "Xadd",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       v := s.newValue3(ssa.OpAtomicAdd32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
+                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+                       return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
+               },
+               sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+       addF("internal/runtime/atomic", "Xadd64",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       v := s.newValue3(ssa.OpAtomicAdd64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
+                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+                       return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
+               },
+               sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+
+       addF("internal/runtime/atomic", "Xadd",
+               makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd32, ssa.OpAtomicAdd32Variant, types.TUINT32, atomicEmitterARM64),
+               sys.ARM64)
+       addF("internal/runtime/atomic", "Xadd64",
+               makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd64, ssa.OpAtomicAdd64Variant, types.TUINT64, atomicEmitterARM64),
+               sys.ARM64)
+
+       addF("internal/runtime/atomic", "Cas",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
+                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+                       return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
+               },
+               sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+       addF("internal/runtime/atomic", "Cas64",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       v := s.newValue4(ssa.OpAtomicCompareAndSwap64, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
+                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+                       return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
+               },
+               sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+       addF("internal/runtime/atomic", "CasRel",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
+                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+                       return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
+               },
+               sys.PPC64)
+
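+       // atomicCasEmitterARM64 emits a compare-and-swap op and, when
+       // needReturn is set, stores its boolean result in s.vars[n].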
+       atomicCasEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
+               v := s.newValue4(op, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
+               s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+               if needReturn {
+                       s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
+               }
+       }
+
+       addF("internal/runtime/atomic", "Cas",
+               makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, types.TBOOL, atomicCasEmitterARM64),
+               sys.ARM64)
+       addF("internal/runtime/atomic", "Cas64",
+               makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, types.TBOOL, atomicCasEmitterARM64),
+               sys.ARM64)
+
+       // Old-style atomic logical operation API (all supported archs except arm64).
+       addF("internal/runtime/atomic", "And8",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd8, types.TypeMem, args[0], args[1], s.mem())
+                       return nil
+               },
+               sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+       addF("internal/runtime/atomic", "And",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd32, types.TypeMem, args[0], args[1], s.mem())
+                       return nil
+               },
+               sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+       addF("internal/runtime/atomic", "Or8",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       s.vars[memVar] = s.newValue3(ssa.OpAtomicOr8, types.TypeMem, args[0], args[1], s.mem())
+                       return nil
+               },
+               sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+       addF("internal/runtime/atomic", "Or",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       s.vars[memVar] = s.newValue3(ssa.OpAtomicOr32, types.TypeMem, args[0], args[1], s.mem())
+                       return nil
+               },
+               sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+
+       // arm64 always uses the new-style atomic logical operations, for both the
+       // old and new style API.
+       addF("internal/runtime/atomic", "And8",
+               makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd8value, ssa.OpAtomicAnd8valueVariant, types.TUINT8, atomicEmitterARM64),
+               sys.ARM64)
+       addF("internal/runtime/atomic", "Or8",
+               makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr8value, ssa.OpAtomicOr8valueVariant, types.TUINT8, atomicEmitterARM64),
+               sys.ARM64)
+       addF("internal/runtime/atomic", "And64",
+               makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd64value, ssa.OpAtomicAnd64valueVariant, types.TUINT64, atomicEmitterARM64),
+               sys.ARM64)
+       addF("internal/runtime/atomic", "And32",
+               makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64),
+               sys.ARM64)
+       addF("internal/runtime/atomic", "And",
+               makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64),
+               sys.ARM64)
+       addF("internal/runtime/atomic", "Or64",
+               makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr64value, ssa.OpAtomicOr64valueVariant, types.TUINT64, atomicEmitterARM64),
+               sys.ARM64)
+       addF("internal/runtime/atomic", "Or32",
+               makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64),
+               sys.ARM64)
+       addF("internal/runtime/atomic", "Or",
+               makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64),
+               sys.ARM64)
+
+       // New-style atomic logical operations, which return the old memory value.
+       addF("internal/runtime/atomic", "And64",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       v := s.newValue3(ssa.OpAtomicAnd64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
+                       p0, p1 := s.split(v)
+                       s.vars[memVar] = p1
+                       return p0
+               },
+               sys.AMD64)
+       addF("internal/runtime/atomic", "And32",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       v := s.newValue3(ssa.OpAtomicAnd32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
+                       p0, p1 := s.split(v)
+                       s.vars[memVar] = p1
+                       return p0
+               },
+               sys.AMD64)
+       addF("internal/runtime/atomic", "Or64",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       v := s.newValue3(ssa.OpAtomicOr64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
+                       p0, p1 := s.split(v)
+                       s.vars[memVar] = p1
+                       return p0
+               },
+               sys.AMD64)
+       addF("internal/runtime/atomic", "Or32",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       v := s.newValue3(ssa.OpAtomicOr32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
+                       p0, p1 := s.split(v)
+                       s.vars[memVar] = p1
+                       return p0
+               },
+               sys.AMD64)
+
+       // Aliases for atomic load operations
+       alias("internal/runtime/atomic", "Loadint32", "internal/runtime/atomic", "Load", all...)
+       alias("internal/runtime/atomic", "Loadint64", "internal/runtime/atomic", "Load64", all...)
+       alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load", p4...)
+       alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load64", p8...)
+       alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load", p4...)
+       alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load64", p8...)
+       alias("internal/runtime/atomic", "LoadAcq", "internal/runtime/atomic", "Load", lwatomics...)
+       alias("internal/runtime/atomic", "LoadAcq64", "internal/runtime/atomic", "Load64", lwatomics...)
+       alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...)
+       alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...) // linknamed
+       alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...)
+       alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...) // linknamed
+
+       // Aliases for atomic store operations
+       alias("internal/runtime/atomic", "Storeint32", "internal/runtime/atomic", "Store", all...)
+       alias("internal/runtime/atomic", "Storeint64", "internal/runtime/atomic", "Store64", all...)
+       alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store", p4...)
+       alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store64", p8...)
+       alias("internal/runtime/atomic", "StoreRel", "internal/runtime/atomic", "Store", lwatomics...)
+       alias("internal/runtime/atomic", "StoreRel64", "internal/runtime/atomic", "Store64", lwatomics...)
+       alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...)
+       alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...) // linknamed
+       alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...)
+       alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...) // linknamed
+
+       // Aliases for atomic swap operations
+       alias("internal/runtime/atomic", "Xchgint32", "internal/runtime/atomic", "Xchg", all...)
+       alias("internal/runtime/atomic", "Xchgint64", "internal/runtime/atomic", "Xchg64", all...)
+       alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg", p4...)
+       alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg64", p8...)
+
+       // Aliases for atomic add operations
+       alias("internal/runtime/atomic", "Xaddint32", "internal/runtime/atomic", "Xadd", all...)
+       alias("internal/runtime/atomic", "Xaddint64", "internal/runtime/atomic", "Xadd64", all...)
+       alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd", p4...)
+       alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd64", p8...)
+
+       // Aliases for atomic CAS operations
+       alias("internal/runtime/atomic", "Casint32", "internal/runtime/atomic", "Cas", all...)
+       alias("internal/runtime/atomic", "Casint64", "internal/runtime/atomic", "Cas64", all...)
+       alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas", p4...)
+       alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas64", p8...)
+       alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas", p4...)
+       alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas64", p8...)
+       alias("internal/runtime/atomic", "CasRel", "internal/runtime/atomic", "Cas", lwatomics...)
+
+       // Aliases for atomic And/Or operations
+       alias("internal/runtime/atomic", "Anduintptr", "internal/runtime/atomic", "And64", sys.ArchARM64)
+       alias("internal/runtime/atomic", "Oruintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64)
+
+       /******** math ********/
+       addF("math", "sqrt",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpSqrt, types.Types[types.TFLOAT64], args[0])
+               },
+               sys.I386, sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
+       addF("math", "Trunc",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpTrunc, types.Types[types.TFLOAT64], args[0])
+               },
+               sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
+       addF("math", "Ceil",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpCeil, types.Types[types.TFLOAT64], args[0])
+               },
+               sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
+       addF("math", "Floor",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpFloor, types.Types[types.TFLOAT64], args[0])
+               },
+               sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
+       addF("math", "Round",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpRound, types.Types[types.TFLOAT64], args[0])
+               },
+               sys.ARM64, sys.PPC64, sys.S390X)
+       addF("math", "RoundToEven",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpRoundToEven, types.Types[types.TFLOAT64], args[0])
+               },
+               sys.ARM64, sys.S390X, sys.Wasm)
+       addF("math", "Abs",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpAbs, types.Types[types.TFLOAT64], args[0])
+               },
+               sys.ARM64, sys.ARM, sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm, sys.MIPS, sys.MIPS64)
+       addF("math", "Copysign",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue2(ssa.OpCopysign, types.Types[types.TFLOAT64], args[0], args[1])
+               },
+               sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm)
+       addF("math", "FMA",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
+               },
+               sys.ARM64, sys.PPC64, sys.RISCV64, sys.S390X)
+       addF("math", "FMA",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       if !s.config.UseFMA {
+                               s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
+                               return s.variable(n, types.Types[types.TFLOAT64])
+                       }
+
+                       if buildcfg.GOAMD64 >= 3 {
+                               return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
+                       }
+
+                       v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasFMA)
+                       b := s.endBlock()
+                       b.Kind = ssa.BlockIf
+                       b.SetControl(v)
+                       bTrue := s.f.NewBlock(ssa.BlockPlain)
+                       bFalse := s.f.NewBlock(ssa.BlockPlain)
+                       bEnd := s.f.NewBlock(ssa.BlockPlain)
+                       b.AddEdgeTo(bTrue)
+                       b.AddEdgeTo(bFalse)
+                       b.Likely = ssa.BranchLikely // >= Haswell CPUs are common
+
+                       // We have the intrinsic - use it directly.
+                       s.startBlock(bTrue)
+                       s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
+                       s.endBlock().AddEdgeTo(bEnd)
+
+                       // Call the pure Go version.
+                       s.startBlock(bFalse)
+                       s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
+                       s.endBlock().AddEdgeTo(bEnd)
+
+                       // Merge results.
+                       s.startBlock(bEnd)
+                       return s.variable(n, types.Types[types.TFLOAT64])
+               },
+               sys.AMD64)
+       addF("math", "FMA",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       if !s.config.UseFMA {
+                               s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
+                               return s.variable(n, types.Types[types.TFLOAT64])
+                       }
+                       addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARMHasVFPv4, s.sb)
+                       v := s.load(types.Types[types.TBOOL], addr)
+                       b := s.endBlock()
+                       b.Kind = ssa.BlockIf
+                       b.SetControl(v)
+                       bTrue := s.f.NewBlock(ssa.BlockPlain)
+                       bFalse := s.f.NewBlock(ssa.BlockPlain)
+                       bEnd := s.f.NewBlock(ssa.BlockPlain)
+                       b.AddEdgeTo(bTrue)
+                       b.AddEdgeTo(bFalse)
+                       b.Likely = ssa.BranchLikely
+
+                       // We have the intrinsic - use it directly.
+                       s.startBlock(bTrue)
+                       s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
+                       s.endBlock().AddEdgeTo(bEnd)
+
+                       // Call the pure Go version.
+                       s.startBlock(bFalse)
+                       s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
+                       s.endBlock().AddEdgeTo(bEnd)
+
+                       // Merge results.
+                       s.startBlock(bEnd)
+                       return s.variable(n, types.Types[types.TFLOAT64])
+               },
+               sys.ARM)
+
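+       // makeRoundAMD64 returns an intrinsic builder for an SSE4.1 rounding
+       // operation. With GOAMD64 >= v2, SSE4.1 is guaranteed and op is emitted
+       // directly; otherwise the builder branches at runtime on X86HasSSE41,
+       // falling back to a call to the pure Go version.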
+       makeRoundAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+               return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       if buildcfg.GOAMD64 >= 2 {
+                               return s.newValue1(op, types.Types[types.TFLOAT64], args[0])
+                       }
+
+                       v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasSSE41)
+                       b := s.endBlock()
+                       b.Kind = ssa.BlockIf
+                       b.SetControl(v)
+                       bTrue := s.f.NewBlock(ssa.BlockPlain)
+                       bFalse := s.f.NewBlock(ssa.BlockPlain)
+                       bEnd := s.f.NewBlock(ssa.BlockPlain)
+                       b.AddEdgeTo(bTrue)
+                       b.AddEdgeTo(bFalse)
+                       b.Likely = ssa.BranchLikely // most machines have SSE4.1 nowadays
+
+                       // We have the intrinsic - use it directly.
+                       s.startBlock(bTrue)
+                       s.vars[n] = s.newValue1(op, types.Types[types.TFLOAT64], args[0])
+                       s.endBlock().AddEdgeTo(bEnd)
+
+                       // Call the pure Go version.
+                       s.startBlock(bFalse)
+                       s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
+                       s.endBlock().AddEdgeTo(bEnd)
+
+                       // Merge results.
+                       s.startBlock(bEnd)
+                       return s.variable(n, types.Types[types.TFLOAT64])
+               }
+       }
+       addF("math", "RoundToEven",
+               makeRoundAMD64(ssa.OpRoundToEven),
+               sys.AMD64)
+       addF("math", "Floor",
+               makeRoundAMD64(ssa.OpFloor),
+               sys.AMD64)
+       addF("math", "Ceil",
+               makeRoundAMD64(ssa.OpCeil),
+               sys.AMD64)
+       addF("math", "Trunc",
+               makeRoundAMD64(ssa.OpTrunc),
+               sys.AMD64)
+
+       /******** math/bits ********/
+       addF("math/bits", "TrailingZeros64",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0])
+               },
+               sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
+       addF("math/bits", "TrailingZeros32",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0])
+               },
+               sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
+       addF("math/bits", "TrailingZeros16",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0])
+                       c := s.constInt32(types.Types[types.TUINT32], 1<<16)
+                       y := s.newValue2(ssa.OpOr32, types.Types[types.TUINT32], x, c)
+                       return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], y)
+               },
+               sys.MIPS)
+       addF("math/bits", "TrailingZeros16",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpCtz16, types.Types[types.TINT], args[0])
+               },
+               sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm)
+       addF("math/bits", "TrailingZeros16",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0])
+                       c := s.constInt64(types.Types[types.TUINT64], 1<<16)
+                       y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c)
+                       return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y)
+               },
+               sys.S390X, sys.PPC64)
+       addF("math/bits", "TrailingZeros8",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       x := s.newValue1(ssa.OpZeroExt8to32, types.Types[types.TUINT32], args[0])
+                       c := s.constInt32(types.Types[types.TUINT32], 1<<8)
+                       y := s.newValue2(ssa.OpOr32, types.Types[types.TUINT32], x, c)
+                       return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], y)
+               },
+               sys.MIPS)
+       addF("math/bits", "TrailingZeros8",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0])
+               },
+               sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm)
+       addF("math/bits", "TrailingZeros8",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0])
+                       c := s.constInt64(types.Types[types.TUINT64], 1<<8)
+                       y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c)
+                       return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y)
+               },
+               sys.S390X)
+       alias("math/bits", "ReverseBytes64", "internal/runtime/sys", "Bswap64", all...)
+       alias("math/bits", "ReverseBytes32", "internal/runtime/sys", "Bswap32", all...)
+       // ReverseBytes inlines correctly, no need to intrinsify it.
+       // Nothing special is needed for targets where ReverseBytes16 lowers to a rotate.
+       // On Power10, a 16-bit rotate is not available, so use the BRH instruction.
+       if buildcfg.GOPPC64 >= 10 {
+               addF("math/bits", "ReverseBytes16",
+                       func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                               return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT], args[0])
+                       },
+                       sys.PPC64)
+       }
+
+       addF("math/bits", "Len64",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0])
+               },
+               sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
+       addF("math/bits", "Len32",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
+               },
+               sys.AMD64, sys.ARM64, sys.PPC64)
+       addF("math/bits", "Len32",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       if s.config.PtrSize == 4 {
+                               return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
+                       }
+                       x := s.newValue1(ssa.OpZeroExt32to64, types.Types[types.TUINT64], args[0])
+                       return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x)
+               },
+               sys.ARM, sys.S390X, sys.MIPS, sys.Wasm)
+       addF("math/bits", "Len16",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       if s.config.PtrSize == 4 {
+                               x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0])
+                               return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x)
+                       }
+                       x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0])
+                       return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x)
+               },
+               sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
+       addF("math/bits", "Len16",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpBitLen16, types.Types[types.TINT], args[0])
+               },
+               sys.AMD64)
+       addF("math/bits", "Len8",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       if s.config.PtrSize == 4 {
+                               x := s.newValue1(ssa.OpZeroExt8to32, types.Types[types.TUINT32], args[0])
+                               return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x)
+                       }
+                       x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0])
+                       return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x)
+               },
+               sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
+       addF("math/bits", "Len8",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0])
+               },
+               sys.AMD64)
+       addF("math/bits", "Len",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       if s.config.PtrSize == 4 {
+                               return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
+                       }
+                       return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0])
+               },
+               sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
+       // LeadingZeros needs no intrinsic: it trivially calls Len, which is intrinsified.
+       addF("math/bits", "Reverse64",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0])
+               },
+               sys.ARM64)
+       addF("math/bits", "Reverse32",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpBitRev32, types.Types[types.TINT], args[0])
+               },
+               sys.ARM64)
+       addF("math/bits", "Reverse16",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpBitRev16, types.Types[types.TINT], args[0])
+               },
+               sys.ARM64)
+       addF("math/bits", "Reverse8",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpBitRev8, types.Types[types.TINT], args[0])
+               },
+               sys.ARM64)
+       addF("math/bits", "Reverse",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0])
+               },
+               sys.ARM64)
+       addF("math/bits", "RotateLeft8",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue2(ssa.OpRotateLeft8, types.Types[types.TUINT8], args[0], args[1])
+               },
+               sys.AMD64, sys.RISCV64)
+       addF("math/bits", "RotateLeft16",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue2(ssa.OpRotateLeft16, types.Types[types.TUINT16], args[0], args[1])
+               },
+               sys.AMD64, sys.RISCV64)
+       addF("math/bits", "RotateLeft32",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue2(ssa.OpRotateLeft32, types.Types[types.TUINT32], args[0], args[1])
+               },
+               sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
+       addF("math/bits", "RotateLeft64",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue2(ssa.OpRotateLeft64, types.Types[types.TUINT64], args[0], args[1])
+               },
+               sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
+       alias("math/bits", "RotateLeft", "math/bits", "RotateLeft64", p8...)
+
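+       // makeOnesCountAMD64 returns an intrinsic builder for a population count
+       // operation. With GOAMD64 >= v2, POPCNT is guaranteed and op is emitted
+       // directly; otherwise the builder branches at runtime on X86HasPOPCNT,
+       // falling back to a call to the pure Go version.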
+       makeOnesCountAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+               return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       if buildcfg.GOAMD64 >= 2 {
+                               return s.newValue1(op, types.Types[types.TINT], args[0])
+                       }
+
+                       v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasPOPCNT)
+                       b := s.endBlock()
+                       b.Kind = ssa.BlockIf
+                       b.SetControl(v)
+                       bTrue := s.f.NewBlock(ssa.BlockPlain)
+                       bFalse := s.f.NewBlock(ssa.BlockPlain)
+                       bEnd := s.f.NewBlock(ssa.BlockPlain)
+                       b.AddEdgeTo(bTrue)
+                       b.AddEdgeTo(bFalse)
+                       b.Likely = ssa.BranchLikely // most machines have POPCNT nowadays
+
+                       // We have the intrinsic - use it directly.
+                       s.startBlock(bTrue)
+                       s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0])
+                       s.endBlock().AddEdgeTo(bEnd)
+
+                       // Call the pure Go version.
+                       s.startBlock(bFalse)
+                       s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT]
+                       s.endBlock().AddEdgeTo(bEnd)
+
+                       // Merge results.
+                       s.startBlock(bEnd)
+                       return s.variable(n, types.Types[types.TINT])
+               }
+       }
+       addF("math/bits", "OnesCount64",
+               makeOnesCountAMD64(ssa.OpPopCount64),
+               sys.AMD64)
+       addF("math/bits", "OnesCount64",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpPopCount64, types.Types[types.TINT], args[0])
+               },
+               sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm)
+       addF("math/bits", "OnesCount32",
+               makeOnesCountAMD64(ssa.OpPopCount32),
+               sys.AMD64)
+       addF("math/bits", "OnesCount32",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpPopCount32, types.Types[types.TINT], args[0])
+               },
+               sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm)
+       addF("math/bits", "OnesCount16",
+               makeOnesCountAMD64(ssa.OpPopCount16),
+               sys.AMD64)
+       addF("math/bits", "OnesCount16",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpPopCount16, types.Types[types.TINT], args[0])
+               },
+               sys.ARM64, sys.S390X, sys.PPC64, sys.Wasm)
+       addF("math/bits", "OnesCount8",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpPopCount8, types.Types[types.TINT], args[0])
+               },
+               sys.S390X, sys.PPC64, sys.Wasm)
+       addF("math/bits", "OnesCount",
+               makeOnesCountAMD64(ssa.OpPopCount64),
+               sys.AMD64)
+       addF("math/bits", "Mul64",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1])
+               },
+               sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.MIPS64, sys.RISCV64, sys.Loong64)
+       alias("math/bits", "Mul", "math/bits", "Mul64", p8...)
+       alias("internal/runtime/math", "Mul64", "math/bits", "Mul64", p8...)
+       addF("math/bits", "Add64",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue3(ssa.OpAdd64carry, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
+               },
+               sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64)
+       alias("math/bits", "Add", "math/bits", "Add64", p8...)
+       alias("internal/runtime/math", "Add64", "math/bits", "Add64", all...)
+       addF("math/bits", "Sub64",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       return s.newValue3(ssa.OpSub64borrow, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
+               },
+               sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64)
+       alias("math/bits", "Sub", "math/bits", "Sub64", p8...)
+       addF("math/bits", "Div64",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       // Check for divide-by-zero/overflow and panic with the appropriate message.
+                       cmpZero := s.newValue2(s.ssaOp(ir.ONE, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[2], s.zeroVal(types.Types[types.TUINT64]))
+                       s.check(cmpZero, ir.Syms.Panicdivide)
+                       cmpOverflow := s.newValue2(s.ssaOp(ir.OLT, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[0], args[2])
+                       s.check(cmpOverflow, ir.Syms.Panicoverflow)
+                       return s.newValue3(ssa.OpDiv128u, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
+               },
+               sys.AMD64)
+       alias("math/bits", "Div", "math/bits", "Div64", sys.ArchAMD64)
+
+       alias("internal/runtime/sys", "TrailingZeros8", "math/bits", "TrailingZeros8", all...)
+       alias("internal/runtime/sys", "TrailingZeros32", "math/bits", "TrailingZeros32", all...)
+       alias("internal/runtime/sys", "TrailingZeros64", "math/bits", "TrailingZeros64", all...)
+       alias("internal/runtime/sys", "Len8", "math/bits", "Len8", all...)
+       alias("internal/runtime/sys", "Len64", "math/bits", "Len64", all...)
+       alias("internal/runtime/sys", "OnesCount64", "math/bits", "OnesCount64", all...)
+
+       /******** sync/atomic ********/
+
+       // Note: these are disabled when the race detector is enabled; see findIntrinsic below.
+       alias("sync/atomic", "LoadInt32", "internal/runtime/atomic", "Load", all...)
+       alias("sync/atomic", "LoadInt64", "internal/runtime/atomic", "Load64", all...)
+       alias("sync/atomic", "LoadPointer", "internal/runtime/atomic", "Loadp", all...)
+       alias("sync/atomic", "LoadUint32", "internal/runtime/atomic", "Load", all...)
+       alias("sync/atomic", "LoadUint64", "internal/runtime/atomic", "Load64", all...)
+       alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load", p4...)
+       alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load64", p8...)
+
+       alias("sync/atomic", "StoreInt32", "internal/runtime/atomic", "Store", all...)
+       alias("sync/atomic", "StoreInt64", "internal/runtime/atomic", "Store64", all...)
+       // Note: not StorePointer; that needs a write barrier. Same below for {CompareAnd}Swap.
+       alias("sync/atomic", "StoreUint32", "internal/runtime/atomic", "Store", all...)
+       alias("sync/atomic", "StoreUint64", "internal/runtime/atomic", "Store64", all...)
+       alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store", p4...)
+       alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store64", p8...)
+
+       alias("sync/atomic", "SwapInt32", "internal/runtime/atomic", "Xchg", all...)
+       alias("sync/atomic", "SwapInt64", "internal/runtime/atomic", "Xchg64", all...)
+       alias("sync/atomic", "SwapUint32", "internal/runtime/atomic", "Xchg", all...)
+       alias("sync/atomic", "SwapUint64", "internal/runtime/atomic", "Xchg64", all...)
+       alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg", p4...)
+       alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg64", p8...)
+
+       alias("sync/atomic", "CompareAndSwapInt32", "internal/runtime/atomic", "Cas", all...)
+       alias("sync/atomic", "CompareAndSwapInt64", "internal/runtime/atomic", "Cas64", all...)
+       alias("sync/atomic", "CompareAndSwapUint32", "internal/runtime/atomic", "Cas", all...)
+       alias("sync/atomic", "CompareAndSwapUint64", "internal/runtime/atomic", "Cas64", all...)
+       alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas", p4...)
+       alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas64", p8...)
+
+       alias("sync/atomic", "AddInt32", "internal/runtime/atomic", "Xadd", all...)
+       alias("sync/atomic", "AddInt64", "internal/runtime/atomic", "Xadd64", all...)
+       alias("sync/atomic", "AddUint32", "internal/runtime/atomic", "Xadd", all...)
+       alias("sync/atomic", "AddUint64", "internal/runtime/atomic", "Xadd64", all...)
+       alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd", p4...)
+       alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd64", p8...)
+
+       alias("sync/atomic", "AndInt32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64)
+       alias("sync/atomic", "AndUint32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64)
+       alias("sync/atomic", "AndInt64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64)
+       alias("sync/atomic", "AndUint64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64)
+       alias("sync/atomic", "AndUintptr", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64)
+       alias("sync/atomic", "OrInt32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64)
+       alias("sync/atomic", "OrUint32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64)
+       alias("sync/atomic", "OrInt64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64)
+       alias("sync/atomic", "OrUint64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64)
+       alias("sync/atomic", "OrUintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64)
+
+       /******** math/big ********/
+       alias("math/big", "mulWW", "math/bits", "Mul64", p8...)
+}
+
+// findIntrinsic returns a function that builds the SSA equivalent of the
+// function identified by the symbol sym. If sym does not name an intrinsic,
+// findIntrinsic returns nil.
+func findIntrinsic(sym *types.Sym) intrinsicBuilder {
+       if sym == nil || sym.Pkg == nil {
+               return nil
+       }
+       pkg := sym.Pkg.Path
+       if sym.Pkg == ir.Pkgs.Runtime {
+               pkg = "runtime"
+       }
+       if base.Flag.Race && pkg == "sync/atomic" {
+               // The race detector needs to be able to intercept these calls.
+               // We can't intrinsify them.
+               return nil
+       }
+       // Skip intrinsifying math functions (which may contain hard-float
+       // instructions) when soft-float is in use.
+       if Arch.SoftFloat && pkg == "math" {
+               return nil
+       }
+
+       fn := sym.Name
+       if ssa.IntrinsicsDisable {
+               if pkg == "runtime" && (fn == "getcallerpc" || fn == "getcallersp" || fn == "getclosureptr") {
+                       // These runtime functions don't have definitions; they must be intrinsics.
+               } else {
+                       return nil
+               }
+       }
+       return intrinsics[intrinsicKey{Arch.LinkArch.Arch, pkg, fn}]
+}
+
+func IsIntrinsicCall(n *ir.CallExpr) bool {
+       if n == nil {
+               return false
+       }
+       name, ok := n.Fun.(*ir.Name)
+       if !ok {
+               return false
+       }
+       return findIntrinsic(name.Sym()) != nil
+}
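// Editorial sketch (hedged, not part of this change; the call-site shape is
// simplified and intrinsicArgs is an assumed helper name): the SSA builder
// is expected to consult these helpers roughly as follows:
//
//	if IsIntrinsicCall(call) {
//		name := call.Fun.(*ir.Name)
//		b := findIntrinsic(name.Sym())
//		return b(s, call, intrinsicArgs(s, call)) // emit SSA in place of the call
//	}
//	// otherwise, compile an ordinary function call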
index c02f5f51291d47f9f1e008d1f53a4d29832f651c..67479ace3b4fd3cbc19a13c00691e2d63c8b6e82 100644 (file)
@@ -221,6 +221,10 @@ func InitConfig() {
        ir.Syms.SigPanic = typecheck.LookupRuntimeFunc("sigpanic")
 }
 
+func InitTables() {
+       initIntrinsics()
+}
+
 // AbiForBodylessFuncStackMap returns the ABI for a bodyless function's stack map.
 // This is not necessarily the ABI used to call it.
 // Currently (1.17 dev) such a stack map is always ABI0;
@@ -4200,1037 +4204,6 @@ func (s *state) split(v *ssa.Value) (*ssa.Value, *ssa.Value) {
        return p0, p1
 }
 
-var intrinsics map[intrinsicKey]intrinsicBuilder
-
-// An intrinsicBuilder converts a call node n into an ssa value that
-// implements that call as an intrinsic. args is a list of arguments to the func.
-type intrinsicBuilder func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value
-
-type intrinsicKey struct {
-       arch *sys.Arch
-       pkg  string
-       fn   string
-}
-
-func InitTables() {
-       intrinsics = map[intrinsicKey]intrinsicBuilder{}
-
-       var p4 []*sys.Arch
-       var p8 []*sys.Arch
-       var lwatomics []*sys.Arch
-       for _, a := range sys.Archs {
-               if a.PtrSize == 4 {
-                       p4 = append(p4, a)
-               } else {
-                       p8 = append(p8, a)
-               }
-               if a.Family != sys.PPC64 {
-                       lwatomics = append(lwatomics, a)
-               }
-       }
-       all := sys.Archs[:]
-
-       // add adds the intrinsic b for pkg.fn for the given list of architectures.
-       add := func(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) {
-               for _, a := range archs {
-                       intrinsics[intrinsicKey{a, pkg, fn}] = b
-               }
-       }
-       // addF does the same as add but operates on architecture families.
-       addF := func(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) {
-               for _, a := range sys.Archs {
-                       if a.InFamily(archFamilies...) {
-                               intrinsics[intrinsicKey{a, pkg, fn}] = b
-                       }
-               }
-       }
-       // alias defines pkg.fn = pkg2.fn2 for all architectures in archs for which pkg2.fn2 exists.
-       alias := func(pkg, fn, pkg2, fn2 string, archs ...*sys.Arch) {
-               aliased := false
-               for _, a := range archs {
-                       if b, ok := intrinsics[intrinsicKey{a, pkg2, fn2}]; ok {
-                               intrinsics[intrinsicKey{a, pkg, fn}] = b
-                               aliased = true
-                       }
-               }
-               if !aliased {
-                       panic(fmt.Sprintf("attempted to alias undefined intrinsic: %s.%s", pkg, fn))
-               }
-       }
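// Editorial note (illustrative only, not part of this change): the three
// registration helpers differ only in how architectures are selected -
// add takes exact *sys.Arch values, addF takes whole sys.ArchFamily groups,
// and alias reuses a builder that is already registered:
//
//	add("runtime", "KeepAlive", b, all...)                 // specific archs
//	addF("math", "sqrt", b, sys.AMD64, sys.ARM64)          // families
//	alias("math/bits", "Mul", "math/bits", "Mul64", p8...) // reuse Mul64
//
// alias panics if the target intrinsic is not registered for any of the
// listed architectures, which catches ordering mistakes in InitTables.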
-
-       /******** runtime ********/
-       if !base.Flag.Cfg.Instrumenting {
-               add("runtime", "slicebytetostringtmp",
-                       func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                               // Compiler frontend optimizations emit OBYTES2STRTMP nodes
-                               // for the backend instead of slicebytetostringtmp calls
-                               // when not instrumenting.
-                               return s.newValue2(ssa.OpStringMake, n.Type(), args[0], args[1])
-                       },
-                       all...)
-       }
-       addF("internal/runtime/math", "MulUintptr",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       if s.config.PtrSize == 4 {
-                               return s.newValue2(ssa.OpMul32uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1])
-                       }
-                       return s.newValue2(ssa.OpMul64uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1])
-               },
-               sys.AMD64, sys.I386, sys.Loong64, sys.MIPS64, sys.RISCV64, sys.ARM64)
-       add("runtime", "KeepAlive",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       data := s.newValue1(ssa.OpIData, s.f.Config.Types.BytePtr, args[0])
-                       s.vars[memVar] = s.newValue2(ssa.OpKeepAlive, types.TypeMem, data, s.mem())
-                       return nil
-               },
-               all...)
-       add("runtime", "getclosureptr",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue0(ssa.OpGetClosurePtr, s.f.Config.Types.Uintptr)
-               },
-               all...)
-
-       add("runtime", "getcallerpc",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue0(ssa.OpGetCallerPC, s.f.Config.Types.Uintptr)
-               },
-               all...)
-
-       add("runtime", "getcallersp",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpGetCallerSP, s.f.Config.Types.Uintptr, s.mem())
-               },
-               all...)
-
-       addF("runtime", "publicationBarrier",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       s.vars[memVar] = s.newValue1(ssa.OpPubBarrier, types.TypeMem, s.mem())
-                       return nil
-               },
-               sys.ARM64, sys.PPC64, sys.RISCV64)
-
-       brev_arch := []sys.ArchFamily{sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X}
-       if buildcfg.GOPPC64 >= 10 {
-               // Use this only on Power10, as the new byte reverse instructions
-               // that Power10 provides make it worthwhile as an intrinsic.
-               brev_arch = append(brev_arch, sys.PPC64)
-       }
-       /******** internal/runtime/sys ********/
-       addF("internal/runtime/sys", "Bswap32",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
-               },
-               brev_arch...)
-       addF("internal/runtime/sys", "Bswap64",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
-               },
-               brev_arch...)
-
-       /****** Prefetch ******/
-       makePrefetchFunc := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-               return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       s.vars[memVar] = s.newValue2(op, types.TypeMem, args[0], s.mem())
-                       return nil
-               }
-       }
-
-       // Make Prefetch intrinsics for supported platforms.
-       // On unsupported platforms the stub function will be eliminated.
-       addF("internal/runtime/sys", "Prefetch", makePrefetchFunc(ssa.OpPrefetchCache),
-               sys.AMD64, sys.ARM64, sys.PPC64)
-       addF("internal/runtime/sys", "PrefetchStreamed", makePrefetchFunc(ssa.OpPrefetchCacheStreamed),
-               sys.AMD64, sys.ARM64, sys.PPC64)
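// Editorial note (illustrative; the Prefetch signature shown is assumed from
// its use here, not confirmed by this change): builders for pure side-effect
// intrinsics return nil and instead thread the memory state through
// s.vars[memVar], so the prefetch stays ordered with other memory operations:
//
//	sys.Prefetch(uintptr(unsafe.Pointer(&buf[next]))) // hint only; no result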
-
-       /******** internal/runtime/atomic ********/
-       addF("internal/runtime/atomic", "Load",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       v := s.newValue2(ssa.OpAtomicLoad32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem())
-                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-                       return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
-               },
-               sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-       addF("internal/runtime/atomic", "Load8",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       v := s.newValue2(ssa.OpAtomicLoad8, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], s.mem())
-                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-                       return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], v)
-               },
-               sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-       addF("internal/runtime/atomic", "Load64",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       v := s.newValue2(ssa.OpAtomicLoad64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem())
-                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-                       return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
-               },
-               sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-       addF("internal/runtime/atomic", "LoadAcq",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       v := s.newValue2(ssa.OpAtomicLoadAcq32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem())
-                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-                       return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
-               },
-               sys.PPC64, sys.S390X)
-       addF("internal/runtime/atomic", "LoadAcq64",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       v := s.newValue2(ssa.OpAtomicLoadAcq64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem())
-                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-                       return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
-               },
-               sys.PPC64)
-       addF("internal/runtime/atomic", "Loadp",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       v := s.newValue2(ssa.OpAtomicLoadPtr, types.NewTuple(s.f.Config.Types.BytePtr, types.TypeMem), args[0], s.mem())
-                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-                       return s.newValue1(ssa.OpSelect0, s.f.Config.Types.BytePtr, v)
-               },
-               sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-
-       addF("internal/runtime/atomic", "Store",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       s.vars[memVar] = s.newValue3(ssa.OpAtomicStore32, types.TypeMem, args[0], args[1], s.mem())
-                       return nil
-               },
-               sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-       addF("internal/runtime/atomic", "Store8",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       s.vars[memVar] = s.newValue3(ssa.OpAtomicStore8, types.TypeMem, args[0], args[1], s.mem())
-                       return nil
-               },
-               sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-       addF("internal/runtime/atomic", "Store64",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64, types.TypeMem, args[0], args[1], s.mem())
-                       return nil
-               },
-               sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-       addF("internal/runtime/atomic", "StorepNoWB",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       s.vars[memVar] = s.newValue3(ssa.OpAtomicStorePtrNoWB, types.TypeMem, args[0], args[1], s.mem())
-                       return nil
-               },
-               sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.RISCV64, sys.S390X)
-       addF("internal/runtime/atomic", "StoreRel",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel32, types.TypeMem, args[0], args[1], s.mem())
-                       return nil
-               },
-               sys.PPC64, sys.S390X)
-       addF("internal/runtime/atomic", "StoreRel64",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel64, types.TypeMem, args[0], args[1], s.mem())
-                       return nil
-               },
-               sys.PPC64)
-
-       addF("internal/runtime/atomic", "Xchg",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       v := s.newValue3(ssa.OpAtomicExchange32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
-                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-                       return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
-               },
-               sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-       addF("internal/runtime/atomic", "Xchg64",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       v := s.newValue3(ssa.OpAtomicExchange64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
-                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-                       return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
-               },
-               sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-
-       type atomicOpEmitter func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool)
-
-       makeAtomicGuardedIntrinsicARM64common := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter, needReturn bool) intrinsicBuilder {
-
-               return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       if buildcfg.GOARM64.LSE {
-                               emit(s, n, args, op1, typ, needReturn)
-                       } else {
-                               // The target's atomics support is identified by dynamic detection.
-                               addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARM64HasATOMICS, s.sb)
-                               v := s.load(types.Types[types.TBOOL], addr)
-                               b := s.endBlock()
-                               b.Kind = ssa.BlockIf
-                               b.SetControl(v)
-                               bTrue := s.f.NewBlock(ssa.BlockPlain)
-                               bFalse := s.f.NewBlock(ssa.BlockPlain)
-                               bEnd := s.f.NewBlock(ssa.BlockPlain)
-                               b.AddEdgeTo(bTrue)
-                               b.AddEdgeTo(bFalse)
-                               b.Likely = ssa.BranchLikely
-
-                               // We have atomic instructions - use them directly.
-                               s.startBlock(bTrue)
-                               emit(s, n, args, op1, typ, needReturn)
-                               s.endBlock().AddEdgeTo(bEnd)
-
-                               // Use original instruction sequence.
-                               s.startBlock(bFalse)
-                               emit(s, n, args, op0, typ, needReturn)
-                               s.endBlock().AddEdgeTo(bEnd)
-
-                               // Merge results.
-                               s.startBlock(bEnd)
-                       }
-                       if needReturn {
-                               return s.variable(n, types.Types[typ])
-                       } else {
-                               return nil
-                       }
-               }
-       }
-       makeAtomicGuardedIntrinsicARM64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
-               return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, true)
-       }
-       makeAtomicGuardedIntrinsicARM64old := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
-               return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, false)
-       }
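// Editorial sketch (simplified, not part of this change): the guarded
// builders above emit the moral equivalent of
//
//	if buildcfg.GOARM64.LSE {            // decided at compile time
//		<emit op1, the LSE variant>
//	} else if ir.Syms.ARM64HasATOMICS {  // checked at run time via a load and branch
//		<emit op1>
//	} else {
//		<emit op0, the LL/SC fallback>
//	}
//
// where the run-time check is real SSA control flow (bTrue/bFalse/bEnd
// above) and the compile-time check happens while the SSA is being built.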
-
-       atomicEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
-               v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem())
-               s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-               if needReturn {
-                       s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
-               }
-       }
-       addF("internal/runtime/atomic", "Xchg",
-               makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange32, ssa.OpAtomicExchange32Variant, types.TUINT32, atomicEmitterARM64),
-               sys.ARM64)
-       addF("internal/runtime/atomic", "Xchg64",
-               makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange64, ssa.OpAtomicExchange64Variant, types.TUINT64, atomicEmitterARM64),
-               sys.ARM64)
-
-       addF("internal/runtime/atomic", "Xadd",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       v := s.newValue3(ssa.OpAtomicAdd32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
-                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-                       return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
-               },
-               sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-       addF("internal/runtime/atomic", "Xadd64",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       v := s.newValue3(ssa.OpAtomicAdd64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
-                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-                       return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
-               },
-               sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-
-       addF("internal/runtime/atomic", "Xadd",
-               makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd32, ssa.OpAtomicAdd32Variant, types.TUINT32, atomicEmitterARM64),
-               sys.ARM64)
-       addF("internal/runtime/atomic", "Xadd64",
-               makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd64, ssa.OpAtomicAdd64Variant, types.TUINT64, atomicEmitterARM64),
-               sys.ARM64)
-
-       addF("internal/runtime/atomic", "Cas",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
-                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-                       return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
-               },
-               sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-       addF("internal/runtime/atomic", "Cas64",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       v := s.newValue4(ssa.OpAtomicCompareAndSwap64, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
-                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-                       return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
-               },
-               sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-       addF("internal/runtime/atomic", "CasRel",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
-                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-                       return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
-               },
-               sys.PPC64)
-
-       atomicCasEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
-               v := s.newValue4(op, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
-               s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-               if needReturn {
-                       s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
-               }
-       }
-
-       addF("internal/runtime/atomic", "Cas",
-               makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, types.TBOOL, atomicCasEmitterARM64),
-               sys.ARM64)
-       addF("internal/runtime/atomic", "Cas64",
-               makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, types.TBOOL, atomicCasEmitterARM64),
-               sys.ARM64)
-
-       // Old-style atomic logical operation API (all supported archs except arm64).
-       addF("internal/runtime/atomic", "And8",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd8, types.TypeMem, args[0], args[1], s.mem())
-                       return nil
-               },
-               sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-       addF("internal/runtime/atomic", "And",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd32, types.TypeMem, args[0], args[1], s.mem())
-                       return nil
-               },
-               sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-       addF("internal/runtime/atomic", "Or8",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       s.vars[memVar] = s.newValue3(ssa.OpAtomicOr8, types.TypeMem, args[0], args[1], s.mem())
-                       return nil
-               },
-               sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-       addF("internal/runtime/atomic", "Or",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       s.vars[memVar] = s.newValue3(ssa.OpAtomicOr32, types.TypeMem, args[0], args[1], s.mem())
-                       return nil
-               },
-               sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-
-       // arm64 always uses the new-style atomic logical operations, for both the
-       // old and new style API.
-       addF("internal/runtime/atomic", "And8",
-               makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd8value, ssa.OpAtomicAnd8valueVariant, types.TUINT8, atomicEmitterARM64),
-               sys.ARM64)
-       addF("internal/runtime/atomic", "Or8",
-               makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr8value, ssa.OpAtomicOr8valueVariant, types.TUINT8, atomicEmitterARM64),
-               sys.ARM64)
-       addF("internal/runtime/atomic", "And64",
-               makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd64value, ssa.OpAtomicAnd64valueVariant, types.TUINT64, atomicEmitterARM64),
-               sys.ARM64)
-       addF("internal/runtime/atomic", "And32",
-               makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64),
-               sys.ARM64)
-       addF("internal/runtime/atomic", "And",
-               makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64),
-               sys.ARM64)
-       addF("internal/runtime/atomic", "Or64",
-               makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr64value, ssa.OpAtomicOr64valueVariant, types.TUINT64, atomicEmitterARM64),
-               sys.ARM64)
-       addF("internal/runtime/atomic", "Or32",
-               makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64),
-               sys.ARM64)
-       addF("internal/runtime/atomic", "Or",
-               makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64),
-               sys.ARM64)
-
-       // New-style atomic logical operations, which return the old memory value.
-       addF("internal/runtime/atomic", "And64",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       v := s.newValue3(ssa.OpAtomicAnd64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
-                       p0, p1 := s.split(v)
-                       s.vars[memVar] = p1
-                       return p0
-               },
-               sys.AMD64)
-       addF("internal/runtime/atomic", "And32",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       v := s.newValue3(ssa.OpAtomicAnd32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
-                       p0, p1 := s.split(v)
-                       s.vars[memVar] = p1
-                       return p0
-               },
-               sys.AMD64)
-       addF("internal/runtime/atomic", "Or64",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       v := s.newValue3(ssa.OpAtomicOr64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
-                       p0, p1 := s.split(v)
-                       s.vars[memVar] = p1
-                       return p0
-               },
-               sys.AMD64)
-       addF("internal/runtime/atomic", "Or32",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       v := s.newValue3(ssa.OpAtomicOr32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
-                       p0, p1 := s.split(v)
-                       s.vars[memVar] = p1
-                       return p0
-               },
-               sys.AMD64)
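// Editorial note (illustrative only, not part of this change): the
// difference between the two atomic logical APIs registered above is the
// result value - the old-style ops only mutate memory, while the new-style
// ops also return the previous value:
//
//	atomic.And(&flags, mask)          // old style: no result
//	old := atomic.And32(&flags, mask) // new style: returns the old value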
-
-       // Aliases for atomic load operations
-       alias("internal/runtime/atomic", "Loadint32", "internal/runtime/atomic", "Load", all...)
-       alias("internal/runtime/atomic", "Loadint64", "internal/runtime/atomic", "Load64", all...)
-       alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load", p4...)
-       alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load64", p8...)
-       alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load", p4...)
-       alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load64", p8...)
-       alias("internal/runtime/atomic", "LoadAcq", "internal/runtime/atomic", "Load", lwatomics...)
-       alias("internal/runtime/atomic", "LoadAcq64", "internal/runtime/atomic", "Load64", lwatomics...)
-       alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...)
-       alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...) // linknamed
-       alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...)
-       alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...) // linknamed
-
-       // Aliases for atomic store operations
-       alias("internal/runtime/atomic", "Storeint32", "internal/runtime/atomic", "Store", all...)
-       alias("internal/runtime/atomic", "Storeint64", "internal/runtime/atomic", "Store64", all...)
-       alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store", p4...)
-       alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store64", p8...)
-       alias("internal/runtime/atomic", "StoreRel", "internal/runtime/atomic", "Store", lwatomics...)
-       alias("internal/runtime/atomic", "StoreRel64", "internal/runtime/atomic", "Store64", lwatomics...)
-       alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...)
-       alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...) // linknamed
-       alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...)
-       alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...) // linknamed
-
-       // Aliases for atomic swap operations
-       alias("internal/runtime/atomic", "Xchgint32", "internal/runtime/atomic", "Xchg", all...)
-       alias("internal/runtime/atomic", "Xchgint64", "internal/runtime/atomic", "Xchg64", all...)
-       alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg", p4...)
-       alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg64", p8...)
-
-       // Aliases for atomic add operations
-       alias("internal/runtime/atomic", "Xaddint32", "internal/runtime/atomic", "Xadd", all...)
-       alias("internal/runtime/atomic", "Xaddint64", "internal/runtime/atomic", "Xadd64", all...)
-       alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd", p4...)
-       alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd64", p8...)
-
-       // Aliases for atomic CAS operations
-       alias("internal/runtime/atomic", "Casint32", "internal/runtime/atomic", "Cas", all...)
-       alias("internal/runtime/atomic", "Casint64", "internal/runtime/atomic", "Cas64", all...)
-       alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas", p4...)
-       alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas64", p8...)
-       alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas", p4...)
-       alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas64", p8...)
-       alias("internal/runtime/atomic", "CasRel", "internal/runtime/atomic", "Cas", lwatomics...)
-
-       // Aliases for atomic And/Or operations
-       alias("internal/runtime/atomic", "Anduintptr", "internal/runtime/atomic", "And64", sys.ArchARM64)
-       alias("internal/runtime/atomic", "Oruintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64)
-
-       /******** math ********/
-       addF("math", "sqrt",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpSqrt, types.Types[types.TFLOAT64], args[0])
-               },
-               sys.I386, sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
-       addF("math", "Trunc",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpTrunc, types.Types[types.TFLOAT64], args[0])
-               },
-               sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
-       addF("math", "Ceil",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpCeil, types.Types[types.TFLOAT64], args[0])
-               },
-               sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
-       addF("math", "Floor",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpFloor, types.Types[types.TFLOAT64], args[0])
-               },
-               sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
-       addF("math", "Round",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpRound, types.Types[types.TFLOAT64], args[0])
-               },
-               sys.ARM64, sys.PPC64, sys.S390X)
-       addF("math", "RoundToEven",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpRoundToEven, types.Types[types.TFLOAT64], args[0])
-               },
-               sys.ARM64, sys.S390X, sys.Wasm)
-       addF("math", "Abs",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpAbs, types.Types[types.TFLOAT64], args[0])
-               },
-               sys.ARM64, sys.ARM, sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm, sys.MIPS, sys.MIPS64)
-       addF("math", "Copysign",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue2(ssa.OpCopysign, types.Types[types.TFLOAT64], args[0], args[1])
-               },
-               sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm)
-       addF("math", "FMA",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
-               },
-               sys.ARM64, sys.PPC64, sys.RISCV64, sys.S390X)
-       addF("math", "FMA",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       if !s.config.UseFMA {
-                               s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
-                               return s.variable(n, types.Types[types.TFLOAT64])
-                       }
-
-                       if buildcfg.GOAMD64 >= 3 {
-                               return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
-                       }
-
-                       v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasFMA)
-                       b := s.endBlock()
-                       b.Kind = ssa.BlockIf
-                       b.SetControl(v)
-                       bTrue := s.f.NewBlock(ssa.BlockPlain)
-                       bFalse := s.f.NewBlock(ssa.BlockPlain)
-                       bEnd := s.f.NewBlock(ssa.BlockPlain)
-                       b.AddEdgeTo(bTrue)
-                       b.AddEdgeTo(bFalse)
-                       b.Likely = ssa.BranchLikely // Haswell and newer CPUs are common
-
-                       // We have the intrinsic - use it directly.
-                       s.startBlock(bTrue)
-                       s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
-                       s.endBlock().AddEdgeTo(bEnd)
-
-                       // Call the pure Go version.
-                       s.startBlock(bFalse)
-                       s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
-                       s.endBlock().AddEdgeTo(bEnd)
-
-                       // Merge results.
-                       s.startBlock(bEnd)
-                       return s.variable(n, types.Types[types.TFLOAT64])
-               },
-               sys.AMD64)
-       addF("math", "FMA",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       if !s.config.UseFMA {
-                               s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
-                               return s.variable(n, types.Types[types.TFLOAT64])
-                       }
-                       addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARMHasVFPv4, s.sb)
-                       v := s.load(types.Types[types.TBOOL], addr)
-                       b := s.endBlock()
-                       b.Kind = ssa.BlockIf
-                       b.SetControl(v)
-                       bTrue := s.f.NewBlock(ssa.BlockPlain)
-                       bFalse := s.f.NewBlock(ssa.BlockPlain)
-                       bEnd := s.f.NewBlock(ssa.BlockPlain)
-                       b.AddEdgeTo(bTrue)
-                       b.AddEdgeTo(bFalse)
-                       b.Likely = ssa.BranchLikely
-
-                       // We have the intrinsic - use it directly.
-                       s.startBlock(bTrue)
-                       s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
-                       s.endBlock().AddEdgeTo(bEnd)
-
-                       // Call the pure Go version.
-                       s.startBlock(bFalse)
-                       s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
-                       s.endBlock().AddEdgeTo(bEnd)
-
-                       // Merge results.
-                       s.startBlock(bEnd)
-                       return s.variable(n, types.Types[types.TFLOAT64])
-               },
-               sys.ARM)
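// Editorial sketch (simplified, not part of this change): all three FMA
// builders above share one shape and differ only in how availability is
// decided (statically when buildcfg.GOAMD64 >= 3, otherwise via the
// X86HasFMA or ARMHasVFPv4 run-time flags). The generated code behaves like:
//
//	if hasFMA {               // constant true when the GOAMD64 level guarantees it
//		r = <fused OpFMA instruction>
//	} else {
//		r = math.FMA(x, y, z) // pure Go fallback via s.callResult
//	}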
-
-       makeRoundAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-               return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       if buildcfg.GOAMD64 >= 2 {
-                               return s.newValue1(op, types.Types[types.TFLOAT64], args[0])
-                       }
-
-                       v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasSSE41)
-                       b := s.endBlock()
-                       b.Kind = ssa.BlockIf
-                       b.SetControl(v)
-                       bTrue := s.f.NewBlock(ssa.BlockPlain)
-                       bFalse := s.f.NewBlock(ssa.BlockPlain)
-                       bEnd := s.f.NewBlock(ssa.BlockPlain)
-                       b.AddEdgeTo(bTrue)
-                       b.AddEdgeTo(bFalse)
-                       b.Likely = ssa.BranchLikely // most machines have sse4.1 nowadays
-
-                       // We have the intrinsic - use it directly.
-                       s.startBlock(bTrue)
-                       s.vars[n] = s.newValue1(op, types.Types[types.TFLOAT64], args[0])
-                       s.endBlock().AddEdgeTo(bEnd)
-
-                       // Call the pure Go version.
-                       s.startBlock(bFalse)
-                       s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
-                       s.endBlock().AddEdgeTo(bEnd)
-
-                       // Merge results.
-                       s.startBlock(bEnd)
-                       return s.variable(n, types.Types[types.TFLOAT64])
-               }
-       }
-       addF("math", "RoundToEven",
-               makeRoundAMD64(ssa.OpRoundToEven),
-               sys.AMD64)
-       addF("math", "Floor",
-               makeRoundAMD64(ssa.OpFloor),
-               sys.AMD64)
-       addF("math", "Ceil",
-               makeRoundAMD64(ssa.OpCeil),
-               sys.AMD64)
-       addF("math", "Trunc",
-               makeRoundAMD64(ssa.OpTrunc),
-               sys.AMD64)
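// Editorial note (the instruction encoding is an assumption, not stated by
// this change): with GOAMD64=v2 or higher, SSE4.1 is guaranteed, so the
// HasCPUFeature branch above disappears and each of these rounding calls
// compiles to a single instruction, e.g.:
//
//	y := math.Floor(x) // expected to lower to ROUNDSD with imm8 = 1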
-
-       /******** math/bits ********/
-       addF("math/bits", "TrailingZeros64",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0])
-               },
-               sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
-       addF("math/bits", "TrailingZeros32",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0])
-               },
-               sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
-       addF("math/bits", "TrailingZeros16",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0])
-                       c := s.constInt32(types.Types[types.TUINT32], 1<<16)
-                       y := s.newValue2(ssa.OpOr32, types.Types[types.TUINT32], x, c)
-                       return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], y)
-               },
-               sys.MIPS)
-       addF("math/bits", "TrailingZeros16",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpCtz16, types.Types[types.TINT], args[0])
-               },
-               sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm)
-       addF("math/bits", "TrailingZeros16",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0])
-                       c := s.constInt64(types.Types[types.TUINT64], 1<<16)
-                       y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c)
-                       return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y)
-               },
-               sys.S390X, sys.PPC64)
-       addF("math/bits", "TrailingZeros8",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       x := s.newValue1(ssa.OpZeroExt8to32, types.Types[types.TUINT32], args[0])
-                       c := s.constInt32(types.Types[types.TUINT32], 1<<8)
-                       y := s.newValue2(ssa.OpOr32, types.Types[types.TUINT32], x, c)
-                       return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], y)
-               },
-               sys.MIPS)
-       addF("math/bits", "TrailingZeros8",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0])
-               },
-               sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm)
-       addF("math/bits", "TrailingZeros8",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0])
-                       c := s.constInt64(types.Types[types.TUINT64], 1<<8)
-                       y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c)
-                       return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y)
-               },
-               sys.S390X)
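// Editorial note (worked example, not part of this change): the widening
// TrailingZeros builders above OR in a guard bit before counting, so a zero
// input counts exactly 16 (or 8) trailing zeros - the narrow type's width -
// rather than the 32 or 64 the widened op would report:
//
//	y := uint32(x16) | 1<<16 // x16 == 0 => y == 0x10000 => Ctz32(y) == 16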
-       alias("math/bits", "ReverseBytes64", "internal/runtime/sys", "Bswap64", all...)
-       alias("math/bits", "ReverseBytes32", "internal/runtime/sys", "Bswap32", all...)
-       // ReverseBytes inlines correctly; there is no need to intrinsify it.
-       // Nothing special is needed for targets where ReverseBytes16 lowers to a rotate.
-       // On Power10, a 16-bit rotate is not available, so use the BRH instruction.
-       if buildcfg.GOPPC64 >= 10 {
-               addF("math/bits", "ReverseBytes16",
-                       func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                               return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT], args[0])
-                       },
-                       sys.PPC64)
-       }
-
-       addF("math/bits", "Len64",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0])
-               },
-               sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
-       addF("math/bits", "Len32",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
-               },
-               sys.AMD64, sys.ARM64, sys.PPC64)
-       addF("math/bits", "Len32",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       if s.config.PtrSize == 4 {
-                               return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
-                       }
-                       x := s.newValue1(ssa.OpZeroExt32to64, types.Types[types.TUINT64], args[0])
-                       return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x)
-               },
-               sys.ARM, sys.S390X, sys.MIPS, sys.Wasm)
-       addF("math/bits", "Len16",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       if s.config.PtrSize == 4 {
-                               x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0])
-                               return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x)
-                       }
-                       x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0])
-                       return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x)
-               },
-               sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
-       addF("math/bits", "Len16",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpBitLen16, types.Types[types.TINT], args[0])
-               },
-               sys.AMD64)
-       addF("math/bits", "Len8",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       if s.config.PtrSize == 4 {
-                               x := s.newValue1(ssa.OpZeroExt8to32, types.Types[types.TUINT32], args[0])
-                               return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x)
-                       }
-                       x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0])
-                       return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x)
-               },
-               sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
-       addF("math/bits", "Len8",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0])
-               },
-               sys.AMD64)
-       addF("math/bits", "Len",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       if s.config.PtrSize == 4 {
-                               return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
-                       }
-                       return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0])
-               },
-               sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
-       // LeadingZeros is handled because it trivially calls Len.
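// Editorial note (grounded in the math/bits source, not part of this
// change): LeadingZeros needs no entry of its own because it is a trivial
// wrapper that inlines into the Len intrinsic registered above:
//
//	func LeadingZeros64(x uint64) int { return 64 - Len64(x) }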
-       addF("math/bits", "Reverse64",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0])
-               },
-               sys.ARM64)
-       addF("math/bits", "Reverse32",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpBitRev32, types.Types[types.TINT], args[0])
-               },
-               sys.ARM64)
-       addF("math/bits", "Reverse16",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpBitRev16, types.Types[types.TINT], args[0])
-               },
-               sys.ARM64)
-       addF("math/bits", "Reverse8",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpBitRev8, types.Types[types.TINT], args[0])
-               },
-               sys.ARM64)
-       addF("math/bits", "Reverse",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0])
-               },
-               sys.ARM64)
-       addF("math/bits", "RotateLeft8",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue2(ssa.OpRotateLeft8, types.Types[types.TUINT8], args[0], args[1])
-               },
-               sys.AMD64, sys.RISCV64)
-       addF("math/bits", "RotateLeft16",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue2(ssa.OpRotateLeft16, types.Types[types.TUINT16], args[0], args[1])
-               },
-               sys.AMD64, sys.RISCV64)
-       addF("math/bits", "RotateLeft32",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue2(ssa.OpRotateLeft32, types.Types[types.TUINT32], args[0], args[1])
-               },
-               sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
-       addF("math/bits", "RotateLeft64",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue2(ssa.OpRotateLeft64, types.Types[types.TUINT64], args[0], args[1])
-               },
-               sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
-       alias("math/bits", "RotateLeft", "math/bits", "RotateLeft64", p8...)
-
-       makeOnesCountAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-               return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       if buildcfg.GOAMD64 >= 2 {
-                               return s.newValue1(op, types.Types[types.TINT], args[0])
-                       }
-
-                       v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasPOPCNT)
-                       b := s.endBlock()
-                       b.Kind = ssa.BlockIf
-                       b.SetControl(v)
-                       bTrue := s.f.NewBlock(ssa.BlockPlain)
-                       bFalse := s.f.NewBlock(ssa.BlockPlain)
-                       bEnd := s.f.NewBlock(ssa.BlockPlain)
-                       b.AddEdgeTo(bTrue)
-                       b.AddEdgeTo(bFalse)
-                       b.Likely = ssa.BranchLikely // most machines have popcnt nowadays
-
-                       // We have the intrinsic - use it directly.
-                       s.startBlock(bTrue)
-                       s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0])
-                       s.endBlock().AddEdgeTo(bEnd)
-
-                       // Call the pure Go version.
-                       s.startBlock(bFalse)
-                       s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT]
-                       s.endBlock().AddEdgeTo(bEnd)
-
-                       // Merge results.
-                       s.startBlock(bEnd)
-                       return s.variable(n, types.Types[types.TINT])
-               }
-       }
-       addF("math/bits", "OnesCount64",
-               makeOnesCountAMD64(ssa.OpPopCount64),
-               sys.AMD64)
-       addF("math/bits", "OnesCount64",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpPopCount64, types.Types[types.TINT], args[0])
-               },
-               sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm)
-       addF("math/bits", "OnesCount32",
-               makeOnesCountAMD64(ssa.OpPopCount32),
-               sys.AMD64)
-       addF("math/bits", "OnesCount32",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpPopCount32, types.Types[types.TINT], args[0])
-               },
-               sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm)
-       addF("math/bits", "OnesCount16",
-               makeOnesCountAMD64(ssa.OpPopCount16),
-               sys.AMD64)
-       addF("math/bits", "OnesCount16",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpPopCount16, types.Types[types.TINT], args[0])
-               },
-               sys.ARM64, sys.S390X, sys.PPC64, sys.Wasm)
-       addF("math/bits", "OnesCount8",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue1(ssa.OpPopCount8, types.Types[types.TINT], args[0])
-               },
-               sys.S390X, sys.PPC64, sys.Wasm)
-       addF("math/bits", "OnesCount",
-               makeOnesCountAMD64(ssa.OpPopCount64),
-               sys.AMD64)
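-       // Mul64 lowers to a widening multiply: OpMul64uhilo yields a
-       // (hi, lo) tuple holding the full 128-bit product.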
-       addF("math/bits", "Mul64",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1])
-               },
-               sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.MIPS64, sys.RISCV64, sys.Loong64)
-       alias("math/bits", "Mul", "math/bits", "Mul64", p8...)
-       alias("internal/runtime/math", "Mul64", "math/bits", "Mul64", p8...)
-       addF("math/bits", "Add64",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue3(ssa.OpAdd64carry, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
-               },
-               sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64)
-       alias("math/bits", "Add", "math/bits", "Add64", p8...)
-       alias("internal/runtime/math", "Add64", "math/bits", "Add64", all...)
-       addF("math/bits", "Sub64",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       return s.newValue3(ssa.OpSub64borrow, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
-               },
-               sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64)
-       alias("math/bits", "Sub", "math/bits", "Sub64", p8...)
-       addF("math/bits", "Div64",
-               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-                       // Check for divide-by-zero (y == 0) and quotient overflow
-                       // (hi >= y), and panic with the appropriate runtime error.
-                       cmpZero := s.newValue2(s.ssaOp(ir.ONE, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[2], s.zeroVal(types.Types[types.TUINT64]))
-                       s.check(cmpZero, ir.Syms.Panicdivide)
-                       cmpOverflow := s.newValue2(s.ssaOp(ir.OLT, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[0], args[2])
-                       s.check(cmpOverflow, ir.Syms.Panicoverflow)
-                       return s.newValue3(ssa.OpDiv128u, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
-               },
-               sys.AMD64)
-       alias("math/bits", "Div", "math/bits", "Div64", sys.ArchAMD64)
-
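-       // The runtime reuses the math/bits lowerings for its
-       // internal/runtime/sys helpers on every architecture.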
-       alias("internal/runtime/sys", "TrailingZeros8", "math/bits", "TrailingZeros8", all...)
-       alias("internal/runtime/sys", "TrailingZeros32", "math/bits", "TrailingZeros32", all...)
-       alias("internal/runtime/sys", "TrailingZeros64", "math/bits", "TrailingZeros64", all...)
-       alias("internal/runtime/sys", "Len8", "math/bits", "Len8", all...)
-       alias("internal/runtime/sys", "Len64", "math/bits", "Len64", all...)
-       alias("internal/runtime/sys", "OnesCount64", "math/bits", "OnesCount64", all...)
-
-       /******** sync/atomic ********/
-
-       // Note: these are disabled by flag_race in findIntrinsic below.
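-       // The Uintptr variants dispatch on pointer size: 32-bit atomics on
-       // 4-byte-pointer architectures (p4), 64-bit atomics on 8-byte (p8).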
-       alias("sync/atomic", "LoadInt32", "internal/runtime/atomic", "Load", all...)
-       alias("sync/atomic", "LoadInt64", "internal/runtime/atomic", "Load64", all...)
-       alias("sync/atomic", "LoadPointer", "internal/runtime/atomic", "Loadp", all...)
-       alias("sync/atomic", "LoadUint32", "internal/runtime/atomic", "Load", all...)
-       alias("sync/atomic", "LoadUint64", "internal/runtime/atomic", "Load64", all...)
-       alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load", p4...)
-       alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load64", p8...)
-
-       alias("sync/atomic", "StoreInt32", "internal/runtime/atomic", "Store", all...)
-       alias("sync/atomic", "StoreInt64", "internal/runtime/atomic", "Store64", all...)
-       // Note: not StorePointer; that needs a write barrier. The same
-       // applies below to {CompareAnd}Swap.
-       alias("sync/atomic", "StoreUint32", "internal/runtime/atomic", "Store", all...)
-       alias("sync/atomic", "StoreUint64", "internal/runtime/atomic", "Store64", all...)
-       alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store", p4...)
-       alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store64", p8...)
-
-       alias("sync/atomic", "SwapInt32", "internal/runtime/atomic", "Xchg", all...)
-       alias("sync/atomic", "SwapInt64", "internal/runtime/atomic", "Xchg64", all...)
-       alias("sync/atomic", "SwapUint32", "internal/runtime/atomic", "Xchg", all...)
-       alias("sync/atomic", "SwapUint64", "internal/runtime/atomic", "Xchg64", all...)
-       alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg", p4...)
-       alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg64", p8...)
-
-       alias("sync/atomic", "CompareAndSwapInt32", "internal/runtime/atomic", "Cas", all...)
-       alias("sync/atomic", "CompareAndSwapInt64", "internal/runtime/atomic", "Cas64", all...)
-       alias("sync/atomic", "CompareAndSwapUint32", "internal/runtime/atomic", "Cas", all...)
-       alias("sync/atomic", "CompareAndSwapUint64", "internal/runtime/atomic", "Cas64", all...)
-       alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas", p4...)
-       alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas64", p8...)
-
-       alias("sync/atomic", "AddInt32", "internal/runtime/atomic", "Xadd", all...)
-       alias("sync/atomic", "AddInt64", "internal/runtime/atomic", "Xadd64", all...)
-       alias("sync/atomic", "AddUint32", "internal/runtime/atomic", "Xadd", all...)
-       alias("sync/atomic", "AddUint64", "internal/runtime/atomic", "Xadd64", all...)
-       alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd", p4...)
-       alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd64", p8...)
-
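-       // The atomic And/Or intrinsics are currently wired up only for ARM64
-       // and AMD64.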
-       alias("sync/atomic", "AndInt32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64)
-       alias("sync/atomic", "AndUint32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64)
-       alias("sync/atomic", "AndInt64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64)
-       alias("sync/atomic", "AndUint64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64)
-       alias("sync/atomic", "AndUintptr", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64)
-       alias("sync/atomic", "OrInt32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64)
-       alias("sync/atomic", "OrUint32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64)
-       alias("sync/atomic", "OrInt64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64)
-       alias("sync/atomic", "OrUint64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64)
-       alias("sync/atomic", "OrUintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64)
-
-       /******** math/big ********/
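-       // math/big's mulWW returns the double-width product of two words;
-       // reuse the Mul64 lowering on 64-bit platforms.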
-       alias("math/big", "mulWW", "math/bits", "Mul64", p8...)
-}
-
-// findIntrinsic returns a function which builds the SSA equivalent of the
-// call to the function identified by the symbol sym. If sym does not name
-// an intrinsic for the current architecture, it returns nil.
-func findIntrinsic(sym *types.Sym) intrinsicBuilder {
-       if sym == nil || sym.Pkg == nil {
-               return nil
-       }
-       pkg := sym.Pkg.Path
-       if sym.Pkg == ir.Pkgs.Runtime {
-               pkg = "runtime"
-       }
-       if base.Flag.Race && pkg == "sync/atomic" {
-               // The race detector needs to be able to intercept these calls.
-               // We can't intrinsify them.
-               return nil
-       }
-       // Skip intrinsifying math functions (which may contain hard-float
-       // instructions) when compiling in soft-float mode.
-       if Arch.SoftFloat && pkg == "math" {
-               return nil
-       }
-
-       fn := sym.Name
-       if ssa.IntrinsicsDisable {
-               if pkg == "runtime" && (fn == "getcallerpc" || fn == "getcallersp" || fn == "getclosureptr") {
-                       // These runtime functions have no Go definitions, so
-                       // they must be intrinsified even when intrinsics are
-                       // otherwise disabled.
-               } else {
-                       return nil
-               }
-       }
-       return intrinsics[intrinsicKey{Arch.LinkArch.Arch, pkg, fn}]
-}
-
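-// IsIntrinsicCall reports whether n is a call that can be lowered to an
-// intrinsic SSA operation.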
-func IsIntrinsicCall(n *ir.CallExpr) bool {
-       if n == nil {
-               return false
-       }
-       name, ok := n.Fun.(*ir.Name)
-       if !ok {
-               return false
-       }
-       return findIntrinsic(name.Sym()) != nil
-}
-
 // intrinsicCall converts a call to a recognized intrinsic function into the intrinsic SSA operation.
 func (s *state) intrinsicCall(n *ir.CallExpr) *ssa.Value {
        v := findIntrinsic(n.Fun.Sym())(s, n, s.intrinsicArgs(n))