Source file src/cmd/compile/internal/ssagen/intrinsics.go

     1  // Copyright 2024 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package ssagen
     6  
     7  import (
     8  	"fmt"
     9  	"internal/abi"
    10  	"internal/buildcfg"
    11  
    12  	"cmd/compile/internal/base"
    13  	"cmd/compile/internal/ir"
    14  	"cmd/compile/internal/ssa"
    15  	"cmd/compile/internal/types"
    16  	"cmd/internal/sys"
    17  )
    18  
    19  var intrinsics intrinsicBuilders
    20  
    21  // An intrinsicBuilder converts a call node n into an ssa value that
    22  // implements that call as an intrinsic. args is a list of arguments to the func.
    23  type intrinsicBuilder func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value
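         // As an illustrative sketch (not a builder registered in this file), a
         // minimal intrinsicBuilder for a unary operation could look like:
         //
         //	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
         //		return s.newValue1(ssa.OpSqrt, types.Types[types.TFLOAT64], args[0])
         //	}
         //
         // Builders that exist only for their side effects update s.vars[memVar]
         // and return nil.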
    24  
    25  type intrinsicKey struct {
    26  	arch *sys.Arch
    27  	pkg  string
    28  	fn   string
    29  }
    30  
    31  // intrinsicBuildConfig specifies the config to use for intrinsic building.
    32  type intrinsicBuildConfig struct {
    33  	instrumenting bool
    34  
    35  	go386     string
    36  	goamd64   int
    37  	goarm     buildcfg.GoarmFeatures
    38  	goarm64   buildcfg.Goarm64Features
    39  	gomips    string
    40  	gomips64  string
    41  	goppc64   int
    42  	goriscv64 int
    43  }
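         // When the buildcfg defaults are not wanted (e.g. in tests), a config can be
         // constructed directly and passed to initIntrinsics. A hypothetical,
         // illustrative example:
         //
         //	cfg := &intrinsicBuildConfig{
         //		goamd64: 3,
         //		goppc64: 10,
         //	}
         //	initIntrinsics(cfg)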
    44  
    45  type intrinsicBuilders map[intrinsicKey]intrinsicBuilder
    46  
    47  // add adds the intrinsic builder b for pkg.fn for the given architecture.
    48  func (ib intrinsicBuilders) add(arch *sys.Arch, pkg, fn string, b intrinsicBuilder) {
    49  	if _, found := ib[intrinsicKey{arch, pkg, fn}]; found {
    50  		panic(fmt.Sprintf("intrinsic already exists for %v.%v on %v", pkg, fn, arch.Name))
    51  	}
    52  	ib[intrinsicKey{arch, pkg, fn}] = b
    53  }
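         // For example, a hypothetical single-architecture registration (illustrative
         // only, not one made in this file) would be:
         //
         //	ib.add(sys.ArchAMD64, "example/pkg", "Ident",
         //		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
         //			return args[0]
         //		})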
    54  
    55  // addForArchs adds the intrinsic builder b for pkg.fn for the given architectures.
    56  func (ib intrinsicBuilders) addForArchs(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) {
    57  	for _, arch := range archs {
    58  		ib.add(arch, pkg, fn, b)
    59  	}
    60  }
    61  
    62  // addForFamilies does the same as addForArchs but operates on architecture families.
    63  func (ib intrinsicBuilders) addForFamilies(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) {
    64  	for _, arch := range sys.Archs {
    65  		if arch.InFamily(archFamilies...) {
    66  			intrinsics.add(arch, pkg, fn, b)
    67  		}
    68  	}
    69  }
    70  
    71  // alias aliases pkg.fn to targetPkg.targetFn for all architectures in archs
    72  // for which targetPkg.targetFn already exists.
    73  func (ib intrinsicBuilders) alias(pkg, fn, targetPkg, targetFn string, archs ...*sys.Arch) {
    74  	// TODO(jsing): Consider making this work even if the alias is added
    75  	// before the intrinsic.
    76  	aliased := false
    77  	for _, arch := range archs {
    78  		if b := intrinsics.lookup(arch, targetPkg, targetFn); b != nil {
    79  			intrinsics.add(arch, pkg, fn, b)
    80  			aliased = true
    81  		}
    82  	}
    83  	if !aliased {
    84  		panic(fmt.Sprintf("attempted to alias undefined intrinsic: %s.%s", pkg, fn))
    85  	}
    86  }
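         // For instance, initIntrinsics below aliases Loadint64 to Load64 on every
         // architecture for which Load64 has already been registered:
         //
         //	alias("internal/runtime/atomic", "Loadint64", "internal/runtime/atomic", "Load64", all...)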
    87  
    88  // lookup looks up the intrinsic for a pkg.fn on the specified architecture.
    89  func (ib intrinsicBuilders) lookup(arch *sys.Arch, pkg, fn string) intrinsicBuilder {
    90  	return intrinsics[intrinsicKey{arch, pkg, fn}]
    91  }
    92  
    93  func initIntrinsics(cfg *intrinsicBuildConfig) {
    94  	if cfg == nil {
    95  		cfg = &intrinsicBuildConfig{
    96  			instrumenting: base.Flag.Cfg.Instrumenting,
    97  			go386:         buildcfg.GO386,
    98  			goamd64:       buildcfg.GOAMD64,
    99  			goarm:         buildcfg.GOARM,
   100  			goarm64:       buildcfg.GOARM64,
   101  			gomips:        buildcfg.GOMIPS,
   102  			gomips64:      buildcfg.GOMIPS64,
   103  			goppc64:       buildcfg.GOPPC64,
   104  			goriscv64:     buildcfg.GORISCV64,
   105  		}
   106  	}
   107  	intrinsics = intrinsicBuilders{}
   108  
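         	// p4 and p8 collect the architectures with 4-byte and 8-byte pointers,
         	// lwatomics collects those (everything except PPC64) where the
         	// acquire/release atomics are simply aliased to the plain ones, and all
         	// covers every architecture. These slices are used below when registering
         	// intrinsics and aliases.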
   109  	var p4 []*sys.Arch
   110  	var p8 []*sys.Arch
   111  	var lwatomics []*sys.Arch
   112  	for _, a := range sys.Archs {
   113  		if a.PtrSize == 4 {
   114  			p4 = append(p4, a)
   115  		} else {
   116  			p8 = append(p8, a)
   117  		}
   118  		if a.Family != sys.PPC64 {
   119  			lwatomics = append(lwatomics, a)
   120  		}
   121  	}
   122  	all := sys.Archs[:]
   123  
   124  	add := func(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) {
   125  		intrinsics.addForArchs(pkg, fn, b, archs...)
   126  	}
   127  	addF := func(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) {
   128  		intrinsics.addForFamilies(pkg, fn, b, archFamilies...)
   129  	}
   130  	alias := func(pkg, fn, pkg2, fn2 string, archs ...*sys.Arch) {
   131  		intrinsics.alias(pkg, fn, pkg2, fn2, archs...)
   132  	}
   133  
   134  	/******** runtime ********/
   135  	if !cfg.instrumenting {
   136  		add("runtime", "slicebytetostringtmp",
   137  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   138  				// Compiler frontend optimizations emit OBYTES2STRTMP nodes
   139  				// for the backend instead of slicebytetostringtmp calls
   140  				// when not instrumenting.
   141  				return s.newValue2(ssa.OpStringMake, n.Type(), args[0], args[1])
   142  			},
   143  			all...)
   144  	}
   145  	addF("internal/runtime/math", "MulUintptr",
   146  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   147  			if s.config.PtrSize == 4 {
   148  				return s.newValue2(ssa.OpMul32uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1])
   149  			}
   150  			return s.newValue2(ssa.OpMul64uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1])
   151  		},
   152  		sys.AMD64, sys.I386, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.ARM64)
   153  	add("runtime", "KeepAlive",
   154  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   155  			data := s.newValue1(ssa.OpIData, s.f.Config.Types.BytePtr, args[0])
   156  			s.vars[memVar] = s.newValue2(ssa.OpKeepAlive, types.TypeMem, data, s.mem())
   157  			return nil
   158  		},
   159  		all...)
   160  
   161  	addF("runtime", "publicationBarrier",
   162  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   163  			s.vars[memVar] = s.newValue1(ssa.OpPubBarrier, types.TypeMem, s.mem())
   164  			return nil
   165  		},
   166  		sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64)
   167  
   168  	/******** internal/runtime/sys ********/
   169  	add("internal/runtime/sys", "GetCallerPC",
   170  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   171  			return s.newValue0(ssa.OpGetCallerPC, s.f.Config.Types.Uintptr)
   172  		},
   173  		all...)
   174  
   175  	add("internal/runtime/sys", "GetCallerSP",
   176  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   177  			return s.newValue1(ssa.OpGetCallerSP, s.f.Config.Types.Uintptr, s.mem())
   178  		},
   179  		all...)
   180  
   181  	add("internal/runtime/sys", "GetClosurePtr",
   182  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   183  			return s.newValue0(ssa.OpGetClosurePtr, s.f.Config.Types.Uintptr)
   184  		},
   185  		all...)
   186  
   187  	addF("internal/runtime/sys", "Bswap32",
   188  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   189  			return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
   190  		},
   191  		sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X)
   192  	addF("internal/runtime/sys", "Bswap64",
   193  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   194  			return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
   195  		},
    196  		sys.AMD64, sys.ARM64, sys.Loong64, sys.S390X)
   197  
   198  	if cfg.goppc64 >= 10 {
    199  		// Use only on Power10, as the new byte-reverse instructions it provides
    200  		// make it worthwhile as an intrinsic.
   201  		addF("internal/runtime/sys", "Bswap32",
   202  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   203  				return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
   204  			},
   205  			sys.PPC64)
   206  		addF("internal/runtime/sys", "Bswap64",
   207  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   208  				return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
   209  			},
   210  			sys.PPC64)
   211  	}
   212  
   213  	if cfg.goriscv64 >= 22 {
   214  		addF("internal/runtime/sys", "Bswap32",
   215  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   216  				return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
   217  			},
   218  			sys.RISCV64)
   219  		addF("internal/runtime/sys", "Bswap64",
   220  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   221  				return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
   222  			},
   223  			sys.RISCV64)
   224  	}
   225  
   226  	/****** Prefetch ******/
   227  	makePrefetchFunc := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   228  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   229  			s.vars[memVar] = s.newValue2(op, types.TypeMem, args[0], s.mem())
   230  			return nil
   231  		}
   232  	}
   233  
    234  	// Make Prefetch intrinsics for supported platforms.
    235  	// On unsupported platforms the stub function will be eliminated.
   236  	addF("internal/runtime/sys", "Prefetch", makePrefetchFunc(ssa.OpPrefetchCache),
   237  		sys.AMD64, sys.ARM64, sys.PPC64)
   238  	addF("internal/runtime/sys", "PrefetchStreamed", makePrefetchFunc(ssa.OpPrefetchCacheStreamed),
   239  		sys.AMD64, sys.ARM64, sys.PPC64)
   240  
   241  	/******** internal/runtime/atomic ********/
   242  	type atomicOpEmitter func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool)
   243  
   244  	addF("internal/runtime/atomic", "Load",
   245  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   246  			v := s.newValue2(ssa.OpAtomicLoad32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem())
   247  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   248  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
   249  		},
   250  		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   251  	addF("internal/runtime/atomic", "Load8",
   252  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   253  			v := s.newValue2(ssa.OpAtomicLoad8, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], s.mem())
   254  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   255  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], v)
   256  		},
   257  		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   258  	addF("internal/runtime/atomic", "Load64",
   259  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   260  			v := s.newValue2(ssa.OpAtomicLoad64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem())
   261  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   262  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
   263  		},
   264  		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   265  	addF("internal/runtime/atomic", "LoadAcq",
   266  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   267  			v := s.newValue2(ssa.OpAtomicLoadAcq32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem())
   268  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   269  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
   270  		},
   271  		sys.PPC64)
   272  	addF("internal/runtime/atomic", "LoadAcq64",
   273  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   274  			v := s.newValue2(ssa.OpAtomicLoadAcq64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem())
   275  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   276  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
   277  		},
   278  		sys.PPC64)
   279  	addF("internal/runtime/atomic", "Loadp",
   280  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   281  			v := s.newValue2(ssa.OpAtomicLoadPtr, types.NewTuple(s.f.Config.Types.BytePtr, types.TypeMem), args[0], s.mem())
   282  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   283  			return s.newValue1(ssa.OpSelect0, s.f.Config.Types.BytePtr, v)
   284  		},
   285  		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   286  
   287  	addF("internal/runtime/atomic", "Store",
   288  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   289  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore32, types.TypeMem, args[0], args[1], s.mem())
   290  			return nil
   291  		},
   292  		sys.AMD64, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   293  	addF("internal/runtime/atomic", "Store8",
   294  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   295  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore8, types.TypeMem, args[0], args[1], s.mem())
   296  			return nil
   297  		},
   298  		sys.AMD64, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   299  	addF("internal/runtime/atomic", "Store64",
   300  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   301  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64, types.TypeMem, args[0], args[1], s.mem())
   302  			return nil
   303  		},
   304  		sys.AMD64, sys.ARM64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   305  	addF("internal/runtime/atomic", "StorepNoWB",
   306  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   307  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStorePtrNoWB, types.TypeMem, args[0], args[1], s.mem())
   308  			return nil
   309  		},
   310  		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.RISCV64, sys.S390X)
   311  	addF("internal/runtime/atomic", "StoreRel",
   312  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   313  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel32, types.TypeMem, args[0], args[1], s.mem())
   314  			return nil
   315  		},
   316  		sys.PPC64)
   317  	addF("internal/runtime/atomic", "StoreRel64",
   318  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   319  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel64, types.TypeMem, args[0], args[1], s.mem())
   320  			return nil
   321  		},
   322  		sys.PPC64)
   323  
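         	// makeAtomicStoreGuardedIntrinsicLoong64 builds an atomic store intrinsic
         	// that tests ir.Syms.Loong64HasLAM_BH at run time: when the feature is
         	// present it emits op1 (the new variant), otherwise op0 (the baseline
         	// sequence). Both paths go through emit and merge into a single block.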
   324  	makeAtomicStoreGuardedIntrinsicLoong64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
   325  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   326  			// Target Atomic feature is identified by dynamic detection
   327  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAM_BH, s.sb)
   328  			v := s.load(types.Types[types.TBOOL], addr)
   329  			b := s.endBlock()
   330  			b.Kind = ssa.BlockIf
   331  			b.SetControl(v)
   332  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   333  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   334  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   335  			b.AddEdgeTo(bTrue)
   336  			b.AddEdgeTo(bFalse)
   337  			b.Likely = ssa.BranchLikely
   338  
    339  			// We have the atomic instruction - use it directly.
   340  			s.startBlock(bTrue)
   341  			emit(s, n, args, op1, typ, false)
   342  			s.endBlock().AddEdgeTo(bEnd)
   343  
   344  			// Use original instruction sequence.
   345  			s.startBlock(bFalse)
   346  			emit(s, n, args, op0, typ, false)
   347  			s.endBlock().AddEdgeTo(bEnd)
   348  
   349  			// Merge results.
   350  			s.startBlock(bEnd)
   351  
   352  			return nil
   353  		}
   354  	}
   355  
   356  	atomicStoreEmitterLoong64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
   357  		v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem())
   358  		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   359  		if needReturn {
   360  			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
   361  		}
   362  	}
   363  
   364  	addF("internal/runtime/atomic", "Store8",
   365  		makeAtomicStoreGuardedIntrinsicLoong64(ssa.OpAtomicStore8, ssa.OpAtomicStore8Variant, types.TUINT8, atomicStoreEmitterLoong64),
   366  		sys.Loong64)
   367  	addF("internal/runtime/atomic", "Store",
   368  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   369  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore32Variant, types.TypeMem, args[0], args[1], s.mem())
   370  			return nil
   371  		},
   372  		sys.Loong64)
   373  	addF("internal/runtime/atomic", "Store64",
   374  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   375  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64Variant, types.TypeMem, args[0], args[1], s.mem())
   376  			return nil
   377  		},
   378  		sys.Loong64)
   379  
   380  	addF("internal/runtime/atomic", "Xchg8",
   381  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   382  			v := s.newValue3(ssa.OpAtomicExchange8, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], args[1], s.mem())
   383  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   384  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], v)
   385  		},
   386  		sys.AMD64, sys.PPC64)
   387  	addF("internal/runtime/atomic", "Xchg",
   388  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   389  			v := s.newValue3(ssa.OpAtomicExchange32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
   390  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   391  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
   392  		},
   393  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   394  	addF("internal/runtime/atomic", "Xchg64",
   395  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   396  			v := s.newValue3(ssa.OpAtomicExchange64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
   397  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   398  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
   399  		},
   400  		sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   401  
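         	// makeAtomicGuardedIntrinsicARM64common builds an arm64 atomic intrinsic.
         	// When GOARM64 guarantees LSE, op1 is emitted unconditionally; otherwise
         	// ir.Syms.ARM64HasATOMICS is checked at run time and the builder branches
         	// between op1 and the baseline op0. When needReturn is set, the merged
         	// result is read back with s.variable.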
   402  	makeAtomicGuardedIntrinsicARM64common := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter, needReturn bool) intrinsicBuilder {
   403  
   404  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   405  			if cfg.goarm64.LSE {
   406  				emit(s, n, args, op1, typ, needReturn)
   407  			} else {
    408  				// Support for the new atomic instructions is detected dynamically at run time.
   409  				addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARM64HasATOMICS, s.sb)
   410  				v := s.load(types.Types[types.TBOOL], addr)
   411  				b := s.endBlock()
   412  				b.Kind = ssa.BlockIf
   413  				b.SetControl(v)
   414  				bTrue := s.f.NewBlock(ssa.BlockPlain)
   415  				bFalse := s.f.NewBlock(ssa.BlockPlain)
   416  				bEnd := s.f.NewBlock(ssa.BlockPlain)
   417  				b.AddEdgeTo(bTrue)
   418  				b.AddEdgeTo(bFalse)
   419  				b.Likely = ssa.BranchLikely
   420  
    421  				// We have the atomic instruction - use it directly.
   422  				s.startBlock(bTrue)
   423  				emit(s, n, args, op1, typ, needReturn)
   424  				s.endBlock().AddEdgeTo(bEnd)
   425  
   426  				// Use original instruction sequence.
   427  				s.startBlock(bFalse)
   428  				emit(s, n, args, op0, typ, needReturn)
   429  				s.endBlock().AddEdgeTo(bEnd)
   430  
   431  				// Merge results.
   432  				s.startBlock(bEnd)
   433  			}
   434  			if needReturn {
   435  				return s.variable(n, types.Types[typ])
   436  			} else {
   437  				return nil
   438  			}
   439  		}
   440  	}
   441  	makeAtomicGuardedIntrinsicARM64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
   442  		return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, true)
   443  	}
   444  	makeAtomicGuardedIntrinsicARM64old := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
   445  		return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, false)
   446  	}
   447  
   448  	atomicEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
   449  		v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem())
   450  		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   451  		if needReturn {
   452  			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
   453  		}
   454  	}
   455  	addF("internal/runtime/atomic", "Xchg8",
   456  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange8, ssa.OpAtomicExchange8Variant, types.TUINT8, atomicEmitterARM64),
   457  		sys.ARM64)
   458  	addF("internal/runtime/atomic", "Xchg",
   459  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange32, ssa.OpAtomicExchange32Variant, types.TUINT32, atomicEmitterARM64),
   460  		sys.ARM64)
   461  	addF("internal/runtime/atomic", "Xchg64",
   462  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange64, ssa.OpAtomicExchange64Variant, types.TUINT64, atomicEmitterARM64),
   463  		sys.ARM64)
   464  
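         	// makeAtomicXchg8GuardedIntrinsicLoong64 guards the 8-bit exchange on
         	// ir.Syms.Loong64HasLAM_BH. Unlike the store helper above, its fallback
         	// path does not emit a baseline op: it calls the pure Go implementation
         	// via s.callResult, and the two results are merged with s.variable.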
   465  	makeAtomicXchg8GuardedIntrinsicLoong64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   466  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   467  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAM_BH, s.sb)
   468  			v := s.load(types.Types[types.TBOOL], addr)
   469  			b := s.endBlock()
   470  			b.Kind = ssa.BlockIf
   471  			b.SetControl(v)
   472  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   473  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   474  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   475  			b.AddEdgeTo(bTrue)
   476  			b.AddEdgeTo(bFalse)
    477  			b.Likely = ssa.BranchLikely // most loong64 machines support the amswapdb.b instruction
   478  
   479  			// We have the intrinsic - use it directly.
   480  			s.startBlock(bTrue)
   481  			s.vars[n] = s.newValue3(op, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], args[1], s.mem())
   482  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, s.vars[n])
   483  			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], s.vars[n])
   484  			s.endBlock().AddEdgeTo(bEnd)
   485  
   486  			// Call the pure Go version.
   487  			s.startBlock(bFalse)
   488  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TUINT8]
   489  			s.endBlock().AddEdgeTo(bEnd)
   490  
   491  			// Merge results.
   492  			s.startBlock(bEnd)
   493  			return s.variable(n, types.Types[types.TUINT8])
   494  		}
   495  	}
   496  	addF("internal/runtime/atomic", "Xchg8",
   497  		makeAtomicXchg8GuardedIntrinsicLoong64(ssa.OpAtomicExchange8Variant),
   498  		sys.Loong64)
   499  
   500  	addF("internal/runtime/atomic", "Xadd",
   501  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   502  			v := s.newValue3(ssa.OpAtomicAdd32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
   503  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   504  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
   505  		},
   506  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   507  	addF("internal/runtime/atomic", "Xadd64",
   508  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   509  			v := s.newValue3(ssa.OpAtomicAdd64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
   510  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   511  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
   512  		},
   513  		sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   514  
   515  	addF("internal/runtime/atomic", "Xadd",
   516  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd32, ssa.OpAtomicAdd32Variant, types.TUINT32, atomicEmitterARM64),
   517  		sys.ARM64)
   518  	addF("internal/runtime/atomic", "Xadd64",
   519  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd64, ssa.OpAtomicAdd64Variant, types.TUINT64, atomicEmitterARM64),
   520  		sys.ARM64)
   521  
   522  	addF("internal/runtime/atomic", "Cas",
   523  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   524  			v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
   525  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   526  			return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
   527  		},
   528  		sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   529  	addF("internal/runtime/atomic", "Cas64",
   530  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   531  			v := s.newValue4(ssa.OpAtomicCompareAndSwap64, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
   532  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   533  			return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
   534  		},
   535  		sys.AMD64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   536  	addF("internal/runtime/atomic", "CasRel",
   537  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
    538  			v := s.newValue4(ssa.OpAtomicCompareAndSwapRel32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
   539  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   540  			return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
   541  		},
   542  		sys.PPC64)
   543  
   544  	atomicCasEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
   545  		v := s.newValue4(op, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
   546  		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   547  		if needReturn {
   548  			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
   549  		}
   550  	}
   551  
   552  	addF("internal/runtime/atomic", "Cas",
   553  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, types.TBOOL, atomicCasEmitterARM64),
   554  		sys.ARM64)
   555  	addF("internal/runtime/atomic", "Cas64",
   556  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, types.TBOOL, atomicCasEmitterARM64),
   557  		sys.ARM64)
   558  
   559  	atomicCasEmitterLoong64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
   560  		v := s.newValue4(op, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
   561  		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   562  		if needReturn {
   563  			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
   564  		}
   565  	}
   566  
   567  	makeAtomicCasGuardedIntrinsicLoong64 := func(op0, op1 ssa.Op, emit atomicOpEmitter) intrinsicBuilder {
   568  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   569  			// Target Atomic feature is identified by dynamic detection
   570  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAMCAS, s.sb)
   571  			v := s.load(types.Types[types.TBOOL], addr)
   572  			b := s.endBlock()
   573  			b.Kind = ssa.BlockIf
   574  			b.SetControl(v)
   575  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   576  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   577  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   578  			b.AddEdgeTo(bTrue)
   579  			b.AddEdgeTo(bFalse)
   580  			b.Likely = ssa.BranchLikely
   581  
    582  			// We have the atomic instruction - use it directly.
   583  			s.startBlock(bTrue)
   584  			emit(s, n, args, op1, types.TBOOL, true)
   585  			s.endBlock().AddEdgeTo(bEnd)
   586  
   587  			// Use original instruction sequence.
   588  			s.startBlock(bFalse)
   589  			emit(s, n, args, op0, types.TBOOL, true)
   590  			s.endBlock().AddEdgeTo(bEnd)
   591  
   592  			// Merge results.
   593  			s.startBlock(bEnd)
   594  
   595  			return s.variable(n, types.Types[types.TBOOL])
   596  		}
   597  	}
   598  
   599  	addF("internal/runtime/atomic", "Cas",
   600  		makeAtomicCasGuardedIntrinsicLoong64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, atomicCasEmitterLoong64),
   601  		sys.Loong64)
   602  	addF("internal/runtime/atomic", "Cas64",
   603  		makeAtomicCasGuardedIntrinsicLoong64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, atomicCasEmitterLoong64),
   604  		sys.Loong64)
   605  
   606  	// Old-style atomic logical operation API (all supported archs except arm64).
   607  	addF("internal/runtime/atomic", "And8",
   608  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   609  			s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd8, types.TypeMem, args[0], args[1], s.mem())
   610  			return nil
   611  		},
   612  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   613  	addF("internal/runtime/atomic", "And",
   614  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   615  			s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd32, types.TypeMem, args[0], args[1], s.mem())
   616  			return nil
   617  		},
   618  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   619  	addF("internal/runtime/atomic", "Or8",
   620  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   621  			s.vars[memVar] = s.newValue3(ssa.OpAtomicOr8, types.TypeMem, args[0], args[1], s.mem())
   622  			return nil
   623  		},
   624  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   625  	addF("internal/runtime/atomic", "Or",
   626  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   627  			s.vars[memVar] = s.newValue3(ssa.OpAtomicOr32, types.TypeMem, args[0], args[1], s.mem())
   628  			return nil
   629  		},
   630  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   631  
   632  	// arm64 always uses the new-style atomic logical operations, for both the
   633  	// old and new style API.
   634  	addF("internal/runtime/atomic", "And8",
   635  		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd8value, ssa.OpAtomicAnd8valueVariant, types.TUINT8, atomicEmitterARM64),
   636  		sys.ARM64)
   637  	addF("internal/runtime/atomic", "Or8",
   638  		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr8value, ssa.OpAtomicOr8valueVariant, types.TUINT8, atomicEmitterARM64),
   639  		sys.ARM64)
   640  	addF("internal/runtime/atomic", "And64",
   641  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd64value, ssa.OpAtomicAnd64valueVariant, types.TUINT64, atomicEmitterARM64),
   642  		sys.ARM64)
   643  	addF("internal/runtime/atomic", "And32",
   644  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64),
   645  		sys.ARM64)
   646  	addF("internal/runtime/atomic", "And",
   647  		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64),
   648  		sys.ARM64)
   649  	addF("internal/runtime/atomic", "Or64",
   650  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr64value, ssa.OpAtomicOr64valueVariant, types.TUINT64, atomicEmitterARM64),
   651  		sys.ARM64)
   652  	addF("internal/runtime/atomic", "Or32",
   653  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64),
   654  		sys.ARM64)
   655  	addF("internal/runtime/atomic", "Or",
   656  		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64),
   657  		sys.ARM64)
   658  
   659  	// New-style atomic logical operations, which return the old memory value.
   660  	addF("internal/runtime/atomic", "And64",
   661  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   662  			v := s.newValue3(ssa.OpAtomicAnd64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
   663  			p0, p1 := s.split(v)
   664  			s.vars[memVar] = p1
   665  			return p0
   666  		},
   667  		sys.AMD64, sys.Loong64)
   668  	addF("internal/runtime/atomic", "And32",
   669  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   670  			v := s.newValue3(ssa.OpAtomicAnd32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
   671  			p0, p1 := s.split(v)
   672  			s.vars[memVar] = p1
   673  			return p0
   674  		},
   675  		sys.AMD64, sys.Loong64)
   676  	addF("internal/runtime/atomic", "Or64",
   677  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   678  			v := s.newValue3(ssa.OpAtomicOr64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
   679  			p0, p1 := s.split(v)
   680  			s.vars[memVar] = p1
   681  			return p0
   682  		},
   683  		sys.AMD64, sys.Loong64)
   684  	addF("internal/runtime/atomic", "Or32",
   685  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   686  			v := s.newValue3(ssa.OpAtomicOr32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
   687  			p0, p1 := s.split(v)
   688  			s.vars[memVar] = p1
   689  			return p0
   690  		},
   691  		sys.AMD64, sys.Loong64)
   692  
   693  	// Aliases for atomic load operations
   694  	alias("internal/runtime/atomic", "Loadint32", "internal/runtime/atomic", "Load", all...)
   695  	alias("internal/runtime/atomic", "Loadint64", "internal/runtime/atomic", "Load64", all...)
   696  	alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load", p4...)
   697  	alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load64", p8...)
   698  	alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load", p4...)
   699  	alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load64", p8...)
   700  	alias("internal/runtime/atomic", "LoadAcq", "internal/runtime/atomic", "Load", lwatomics...)
   701  	alias("internal/runtime/atomic", "LoadAcq64", "internal/runtime/atomic", "Load64", lwatomics...)
   702  	alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...)
   703  	alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...) // linknamed
   704  	alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...)
   705  	alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...) // linknamed
   706  
   707  	// Aliases for atomic store operations
   708  	alias("internal/runtime/atomic", "Storeint32", "internal/runtime/atomic", "Store", all...)
   709  	alias("internal/runtime/atomic", "Storeint64", "internal/runtime/atomic", "Store64", all...)
   710  	alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store", p4...)
   711  	alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store64", p8...)
   712  	alias("internal/runtime/atomic", "StoreRel", "internal/runtime/atomic", "Store", lwatomics...)
   713  	alias("internal/runtime/atomic", "StoreRel64", "internal/runtime/atomic", "Store64", lwatomics...)
   714  	alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...)
   715  	alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...) // linknamed
   716  	alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...)
   717  	alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...) // linknamed
   718  
   719  	// Aliases for atomic swap operations
   720  	alias("internal/runtime/atomic", "Xchgint32", "internal/runtime/atomic", "Xchg", all...)
   721  	alias("internal/runtime/atomic", "Xchgint64", "internal/runtime/atomic", "Xchg64", all...)
   722  	alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg", p4...)
   723  	alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg64", p8...)
   724  
   725  	// Aliases for atomic add operations
   726  	alias("internal/runtime/atomic", "Xaddint32", "internal/runtime/atomic", "Xadd", all...)
   727  	alias("internal/runtime/atomic", "Xaddint64", "internal/runtime/atomic", "Xadd64", all...)
   728  	alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd", p4...)
   729  	alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd64", p8...)
   730  
   731  	// Aliases for atomic CAS operations
   732  	alias("internal/runtime/atomic", "Casint32", "internal/runtime/atomic", "Cas", all...)
   733  	alias("internal/runtime/atomic", "Casint64", "internal/runtime/atomic", "Cas64", all...)
   734  	alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas", p4...)
   735  	alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas64", p8...)
   736  	alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas", p4...)
   737  	alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas64", p8...)
   738  	alias("internal/runtime/atomic", "CasRel", "internal/runtime/atomic", "Cas", lwatomics...)
   739  
   740  	// Aliases for atomic And/Or operations
   741  	alias("internal/runtime/atomic", "Anduintptr", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchLoong64)
   742  	alias("internal/runtime/atomic", "Oruintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchLoong64)
   743  
   744  	/******** math ********/
   745  	addF("math", "sqrt",
   746  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   747  			return s.newValue1(ssa.OpSqrt, types.Types[types.TFLOAT64], args[0])
   748  		},
   749  		sys.I386, sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
   750  	addF("math", "Trunc",
   751  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   752  			return s.newValue1(ssa.OpTrunc, types.Types[types.TFLOAT64], args[0])
   753  		},
   754  		sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
   755  	addF("math", "Ceil",
   756  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   757  			return s.newValue1(ssa.OpCeil, types.Types[types.TFLOAT64], args[0])
   758  		},
   759  		sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
   760  	addF("math", "Floor",
   761  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   762  			return s.newValue1(ssa.OpFloor, types.Types[types.TFLOAT64], args[0])
   763  		},
   764  		sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
   765  	addF("math", "Round",
   766  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   767  			return s.newValue1(ssa.OpRound, types.Types[types.TFLOAT64], args[0])
   768  		},
   769  		sys.ARM64, sys.PPC64, sys.S390X)
   770  	addF("math", "RoundToEven",
   771  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   772  			return s.newValue1(ssa.OpRoundToEven, types.Types[types.TFLOAT64], args[0])
   773  		},
   774  		sys.ARM64, sys.S390X, sys.Wasm)
   775  	addF("math", "Abs",
   776  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   777  			return s.newValue1(ssa.OpAbs, types.Types[types.TFLOAT64], args[0])
   778  		},
   779  		sys.ARM64, sys.ARM, sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm, sys.MIPS, sys.MIPS64)
   780  	addF("math", "Copysign",
   781  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   782  			return s.newValue2(ssa.OpCopysign, types.Types[types.TFLOAT64], args[0], args[1])
   783  		},
   784  		sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm)
   785  	addF("math", "FMA",
   786  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   787  			return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
   788  		},
   789  		sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X)
   790  	addF("math", "FMA",
   791  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   792  			if cfg.goamd64 >= 3 {
   793  				return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
   794  			}
   795  
   796  			v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasFMA)
   797  			b := s.endBlock()
   798  			b.Kind = ssa.BlockIf
   799  			b.SetControl(v)
   800  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   801  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   802  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   803  			b.AddEdgeTo(bTrue)
   804  			b.AddEdgeTo(bFalse)
   805  			b.Likely = ssa.BranchLikely // >= haswell cpus are common
   806  
   807  			// We have the intrinsic - use it directly.
   808  			s.startBlock(bTrue)
   809  			s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
   810  			s.endBlock().AddEdgeTo(bEnd)
   811  
   812  			// Call the pure Go version.
   813  			s.startBlock(bFalse)
   814  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
   815  			s.endBlock().AddEdgeTo(bEnd)
   816  
   817  			// Merge results.
   818  			s.startBlock(bEnd)
   819  			return s.variable(n, types.Types[types.TFLOAT64])
   820  		},
   821  		sys.AMD64)
   822  	addF("math", "FMA",
   823  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   824  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARMHasVFPv4, s.sb)
   825  			v := s.load(types.Types[types.TBOOL], addr)
   826  			b := s.endBlock()
   827  			b.Kind = ssa.BlockIf
   828  			b.SetControl(v)
   829  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   830  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   831  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   832  			b.AddEdgeTo(bTrue)
   833  			b.AddEdgeTo(bFalse)
   834  			b.Likely = ssa.BranchLikely
   835  
   836  			// We have the intrinsic - use it directly.
   837  			s.startBlock(bTrue)
   838  			s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
   839  			s.endBlock().AddEdgeTo(bEnd)
   840  
   841  			// Call the pure Go version.
   842  			s.startBlock(bFalse)
   843  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
   844  			s.endBlock().AddEdgeTo(bEnd)
   845  
   846  			// Merge results.
   847  			s.startBlock(bEnd)
   848  			return s.variable(n, types.Types[types.TFLOAT64])
   849  		},
   850  		sys.ARM)
   851  
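         	// makeRoundAMD64 returns a builder for the SSE4.1 rounding ops. With
         	// GOAMD64 >= 2 the instruction is guaranteed to exist, so it is emitted
         	// directly; otherwise ir.Syms.X86HasSSE41 is tested at run time, with a
         	// fallback call to the pure Go version of the function.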
   852  	makeRoundAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   853  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   854  			if cfg.goamd64 >= 2 {
   855  				return s.newValue1(op, types.Types[types.TFLOAT64], args[0])
   856  			}
   857  
   858  			v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasSSE41)
   859  			b := s.endBlock()
   860  			b.Kind = ssa.BlockIf
   861  			b.SetControl(v)
   862  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   863  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   864  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   865  			b.AddEdgeTo(bTrue)
   866  			b.AddEdgeTo(bFalse)
   867  			b.Likely = ssa.BranchLikely // most machines have sse4.1 nowadays
   868  
   869  			// We have the intrinsic - use it directly.
   870  			s.startBlock(bTrue)
   871  			s.vars[n] = s.newValue1(op, types.Types[types.TFLOAT64], args[0])
   872  			s.endBlock().AddEdgeTo(bEnd)
   873  
   874  			// Call the pure Go version.
   875  			s.startBlock(bFalse)
   876  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
   877  			s.endBlock().AddEdgeTo(bEnd)
   878  
   879  			// Merge results.
   880  			s.startBlock(bEnd)
   881  			return s.variable(n, types.Types[types.TFLOAT64])
   882  		}
   883  	}
   884  	addF("math", "RoundToEven",
   885  		makeRoundAMD64(ssa.OpRoundToEven),
   886  		sys.AMD64)
   887  	addF("math", "Floor",
   888  		makeRoundAMD64(ssa.OpFloor),
   889  		sys.AMD64)
   890  	addF("math", "Ceil",
   891  		makeRoundAMD64(ssa.OpCeil),
   892  		sys.AMD64)
   893  	addF("math", "Trunc",
   894  		makeRoundAMD64(ssa.OpTrunc),
   895  		sys.AMD64)
   896  
   897  	/******** math/bits ********/
   898  	addF("math/bits", "TrailingZeros64",
   899  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   900  			return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0])
   901  		},
   902  		sys.AMD64, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
   903  	addF("math/bits", "TrailingZeros64",
   904  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   905  			lo := s.newValue1(ssa.OpInt64Lo, types.Types[types.TUINT32], args[0])
   906  			hi := s.newValue1(ssa.OpInt64Hi, types.Types[types.TUINT32], args[0])
   907  			return s.newValue2(ssa.OpCtz64On32, types.Types[types.TINT], lo, hi)
   908  		},
   909  		sys.I386)
   910  	addF("math/bits", "TrailingZeros32",
   911  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   912  			return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0])
   913  		},
   914  		sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
   915  	addF("math/bits", "TrailingZeros16",
   916  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   917  			return s.newValue1(ssa.OpCtz16, types.Types[types.TINT], args[0])
   918  		},
   919  		sys.AMD64, sys.ARM, sys.ARM64, sys.I386, sys.MIPS, sys.Loong64, sys.PPC64, sys.S390X, sys.Wasm)
   920  	addF("math/bits", "TrailingZeros8",
   921  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   922  			return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0])
   923  		},
   924  		sys.AMD64, sys.ARM, sys.ARM64, sys.I386, sys.MIPS, sys.Loong64, sys.PPC64, sys.S390X, sys.Wasm)
   925  
   926  	if cfg.goriscv64 >= 22 {
   927  		addF("math/bits", "TrailingZeros64",
   928  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   929  				return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0])
   930  			},
   931  			sys.RISCV64)
   932  		addF("math/bits", "TrailingZeros32",
   933  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   934  				return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0])
   935  			},
   936  			sys.RISCV64)
   937  		addF("math/bits", "TrailingZeros16",
   938  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   939  				return s.newValue1(ssa.OpCtz16, types.Types[types.TINT], args[0])
   940  			},
   941  			sys.RISCV64)
   942  		addF("math/bits", "TrailingZeros8",
   943  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   944  				return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0])
   945  			},
   946  			sys.RISCV64)
   947  	}
   948  
   949  	// ReverseBytes inlines correctly, no need to intrinsify it.
   950  	alias("math/bits", "ReverseBytes64", "internal/runtime/sys", "Bswap64", all...)
   951  	alias("math/bits", "ReverseBytes32", "internal/runtime/sys", "Bswap32", all...)
   952  	// Nothing special is needed for targets where ReverseBytes16 lowers to a rotate
   953  	addF("math/bits", "ReverseBytes16",
   954  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   955  			return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT16], args[0])
   956  		},
   957  		sys.Loong64)
   958  	if cfg.goppc64 >= 10 {
    959  		// On Power10, a 16-bit rotate is not available, so use the BRH instruction.
   960  		addF("math/bits", "ReverseBytes16",
   961  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   962  				return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT], args[0])
   963  			},
   964  			sys.PPC64)
   965  	}
   966  	if cfg.goriscv64 >= 22 {
   967  		addF("math/bits", "ReverseBytes16",
   968  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   969  				return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT16], args[0])
   970  			},
   971  			sys.RISCV64)
   972  	}
   973  
   974  	addF("math/bits", "Len64",
   975  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   976  			return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0])
   977  		},
   978  		sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm)
   979  	addF("math/bits", "Len32",
   980  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   981  			return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
   982  		},
   983  		sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm)
   984  	addF("math/bits", "Len16",
   985  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   986  			return s.newValue1(ssa.OpBitLen16, types.Types[types.TINT], args[0])
   987  		},
   988  		sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm)
   989  	addF("math/bits", "Len8",
   990  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   991  			return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0])
   992  		},
   993  		sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm)
   994  
   995  	if cfg.goriscv64 >= 22 {
   996  		addF("math/bits", "Len64",
   997  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   998  				return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0])
   999  			},
  1000  			sys.RISCV64)
  1001  		addF("math/bits", "Len32",
  1002  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1003  				return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
  1004  			},
  1005  			sys.RISCV64)
  1006  		addF("math/bits", "Len16",
  1007  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1008  				return s.newValue1(ssa.OpBitLen16, types.Types[types.TINT], args[0])
  1009  			},
  1010  			sys.RISCV64)
  1011  		addF("math/bits", "Len8",
  1012  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1013  				return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0])
  1014  			},
  1015  			sys.RISCV64)
  1016  	}
  1017  
  1018  	alias("math/bits", "Len", "math/bits", "Len64", p8...)
  1019  	alias("math/bits", "Len", "math/bits", "Len32", p4...)
  1020  
   1021  	// LeadingZeros needs no intrinsic: it trivially calls Len, which is intrinsified above.
  1022  	addF("math/bits", "Reverse64",
  1023  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1024  			return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0])
  1025  		},
  1026  		sys.ARM64, sys.Loong64)
  1027  	addF("math/bits", "Reverse32",
  1028  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1029  			return s.newValue1(ssa.OpBitRev32, types.Types[types.TINT], args[0])
  1030  		},
  1031  		sys.ARM64, sys.Loong64)
  1032  	addF("math/bits", "Reverse16",
  1033  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1034  			return s.newValue1(ssa.OpBitRev16, types.Types[types.TINT], args[0])
  1035  		},
  1036  		sys.ARM64, sys.Loong64)
  1037  	addF("math/bits", "Reverse8",
  1038  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1039  			return s.newValue1(ssa.OpBitRev8, types.Types[types.TINT], args[0])
  1040  		},
  1041  		sys.ARM64, sys.Loong64)
  1042  	addF("math/bits", "Reverse",
  1043  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1044  			return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0])
  1045  		},
  1046  		sys.ARM64, sys.Loong64)
  1047  	addF("math/bits", "RotateLeft8",
  1048  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1049  			return s.newValue2(ssa.OpRotateLeft8, types.Types[types.TUINT8], args[0], args[1])
  1050  		},
  1051  		sys.AMD64, sys.RISCV64)
  1052  	addF("math/bits", "RotateLeft16",
  1053  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1054  			return s.newValue2(ssa.OpRotateLeft16, types.Types[types.TUINT16], args[0], args[1])
  1055  		},
  1056  		sys.AMD64, sys.RISCV64)
  1057  	addF("math/bits", "RotateLeft32",
  1058  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1059  			return s.newValue2(ssa.OpRotateLeft32, types.Types[types.TUINT32], args[0], args[1])
  1060  		},
  1061  		sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
  1062  	addF("math/bits", "RotateLeft64",
  1063  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1064  			return s.newValue2(ssa.OpRotateLeft64, types.Types[types.TUINT64], args[0], args[1])
  1065  		},
  1066  		sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
  1067  	alias("math/bits", "RotateLeft", "math/bits", "RotateLeft64", p8...)
  1068  
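         	// makeOnesCountAMD64 follows the same pattern as makeRoundAMD64 above:
         	// with GOAMD64 >= 2, POPCNT is guaranteed and emitted directly; otherwise
         	// ir.Syms.X86HasPOPCNT is checked at run time, falling back to a call to
         	// the pure Go implementation.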
  1069  	makeOnesCountAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1070  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1071  			if cfg.goamd64 >= 2 {
  1072  				return s.newValue1(op, types.Types[types.TINT], args[0])
  1073  			}
  1074  
  1075  			v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasPOPCNT)
  1076  			b := s.endBlock()
  1077  			b.Kind = ssa.BlockIf
  1078  			b.SetControl(v)
  1079  			bTrue := s.f.NewBlock(ssa.BlockPlain)
  1080  			bFalse := s.f.NewBlock(ssa.BlockPlain)
  1081  			bEnd := s.f.NewBlock(ssa.BlockPlain)
  1082  			b.AddEdgeTo(bTrue)
  1083  			b.AddEdgeTo(bFalse)
  1084  			b.Likely = ssa.BranchLikely // most machines have popcnt nowadays
  1085  
  1086  			// We have the intrinsic - use it directly.
  1087  			s.startBlock(bTrue)
  1088  			s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0])
  1089  			s.endBlock().AddEdgeTo(bEnd)
  1090  
  1091  			// Call the pure Go version.
  1092  			s.startBlock(bFalse)
  1093  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT]
  1094  			s.endBlock().AddEdgeTo(bEnd)
  1095  
  1096  			// Merge results.
  1097  			s.startBlock(bEnd)
  1098  			return s.variable(n, types.Types[types.TINT])
  1099  		}
  1100  	}
  1101  
  1102  	makeOnesCountLoong64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1103  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1104  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLSX, s.sb)
  1105  			v := s.load(types.Types[types.TBOOL], addr)
  1106  			b := s.endBlock()
  1107  			b.Kind = ssa.BlockIf
  1108  			b.SetControl(v)
  1109  			bTrue := s.f.NewBlock(ssa.BlockPlain)
  1110  			bFalse := s.f.NewBlock(ssa.BlockPlain)
  1111  			bEnd := s.f.NewBlock(ssa.BlockPlain)
  1112  			b.AddEdgeTo(bTrue)
  1113  			b.AddEdgeTo(bFalse)
   1114  			b.Likely = ssa.BranchLikely // most loong64 machines support the LSX extension
  1115  
  1116  			// We have the intrinsic - use it directly.
  1117  			s.startBlock(bTrue)
  1118  			s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0])
  1119  			s.endBlock().AddEdgeTo(bEnd)
  1120  
  1121  			// Call the pure Go version.
  1122  			s.startBlock(bFalse)
  1123  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT]
  1124  			s.endBlock().AddEdgeTo(bEnd)
  1125  
  1126  			// Merge results.
  1127  			s.startBlock(bEnd)
  1128  			return s.variable(n, types.Types[types.TINT])
  1129  		}
  1130  	}
  1131  
  1132  	makeOnesCountRISCV64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1133  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1134  			if cfg.goriscv64 >= 22 {
  1135  				return s.newValue1(op, types.Types[types.TINT], args[0])
  1136  			}
  1137  
  1138  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.RISCV64HasZbb, s.sb)
  1139  			v := s.load(types.Types[types.TBOOL], addr)
  1140  			b := s.endBlock()
  1141  			b.Kind = ssa.BlockIf
  1142  			b.SetControl(v)
  1143  			bTrue := s.f.NewBlock(ssa.BlockPlain)
  1144  			bFalse := s.f.NewBlock(ssa.BlockPlain)
  1145  			bEnd := s.f.NewBlock(ssa.BlockPlain)
  1146  			b.AddEdgeTo(bTrue)
  1147  			b.AddEdgeTo(bFalse)
  1148  			b.Likely = ssa.BranchLikely // The majority of RISC-V machines support Zbb.
  1149  
  1150  			// We have the intrinsic - use it directly.
  1151  			s.startBlock(bTrue)
  1152  			s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0])
  1153  			s.endBlock().AddEdgeTo(bEnd)
  1154  
  1155  			// Call the pure Go version.
  1156  			s.startBlock(bFalse)
  1157  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT]
  1158  			s.endBlock().AddEdgeTo(bEnd)
  1159  
  1160  			// Merge results.
  1161  			s.startBlock(bEnd)
  1162  			return s.variable(n, types.Types[types.TINT])
  1163  		}
  1164  	}
  1165  
  1166  	addF("math/bits", "OnesCount64",
  1167  		makeOnesCountAMD64(ssa.OpPopCount64),
  1168  		sys.AMD64)
  1169  	addF("math/bits", "OnesCount64",
  1170  		makeOnesCountLoong64(ssa.OpPopCount64),
  1171  		sys.Loong64)
  1172  	addF("math/bits", "OnesCount64",
  1173  		makeOnesCountRISCV64(ssa.OpPopCount64),
  1174  		sys.RISCV64)
  1175  	addF("math/bits", "OnesCount64",
  1176  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1177  			return s.newValue1(ssa.OpPopCount64, types.Types[types.TINT], args[0])
  1178  		},
  1179  		sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm)
  1180  	addF("math/bits", "OnesCount32",
  1181  		makeOnesCountAMD64(ssa.OpPopCount32),
  1182  		sys.AMD64)
  1183  	addF("math/bits", "OnesCount32",
  1184  		makeOnesCountLoong64(ssa.OpPopCount32),
  1185  		sys.Loong64)
  1186  	addF("math/bits", "OnesCount32",
  1187  		makeOnesCountRISCV64(ssa.OpPopCount32),
  1188  		sys.RISCV64)
  1189  	addF("math/bits", "OnesCount32",
  1190  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1191  			return s.newValue1(ssa.OpPopCount32, types.Types[types.TINT], args[0])
  1192  		},
  1193  		sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm)
  1194  	addF("math/bits", "OnesCount16",
  1195  		makeOnesCountAMD64(ssa.OpPopCount16),
  1196  		sys.AMD64)
  1197  	addF("math/bits", "OnesCount16",
  1198  		makeOnesCountLoong64(ssa.OpPopCount16),
  1199  		sys.Loong64)
  1200  	addF("math/bits", "OnesCount16",
  1201  		makeOnesCountRISCV64(ssa.OpPopCount16),
  1202  		sys.RISCV64)
  1203  	addF("math/bits", "OnesCount16",
  1204  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1205  			return s.newValue1(ssa.OpPopCount16, types.Types[types.TINT], args[0])
  1206  		},
  1207  		sys.ARM64, sys.S390X, sys.PPC64, sys.Wasm)
  1208  	addF("math/bits", "OnesCount8",
  1209  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1210  			return s.newValue1(ssa.OpPopCount8, types.Types[types.TINT], args[0])
  1211  		},
  1212  		sys.S390X, sys.PPC64, sys.Wasm)
  1213  
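        	// OnesCount8 on riscv64 is only intrinsified when goriscv64 >= 22
        	// guarantees Zbb; in that case makeOnesCountRISCV64 emits the
        	// popcount directly, without a runtime feature check.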
  1214  	if cfg.goriscv64 >= 22 {
  1215  		addF("math/bits", "OnesCount8",
  1216  			makeOnesCountRISCV64(ssa.OpPopCount8),
  1217  			sys.RISCV64)
  1218  	}
  1219  
  1220  	alias("math/bits", "OnesCount", "math/bits", "OnesCount64", p8...)
  1221  
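        	// Mul64 computes the full 128-bit product via OpMul64uhilo,
        	// returned as a (hi, lo) tuple. For example, bits.Mul64(1<<32, 1<<32)
        	// is 2^64, i.e. hi = 1 and lo = 0.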
  1222  	addF("math/bits", "Mul64",
  1223  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1224  			return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1])
  1225  		},
  1226  		sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.MIPS64, sys.RISCV64, sys.Loong64)
  1227  	alias("math/bits", "Mul", "math/bits", "Mul64", p8...)
  1228  	alias("internal/runtime/math", "Mul64", "math/bits", "Mul64", p8...)
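        	// Add64 and Sub64 (below) map to full-width add-with-carry and
        	// subtract-with-borrow. For example, bits.Add64(^uint64(0), 1, 0)
        	// wraps around to sum = 0 with carry-out = 1, and
        	// bits.Sub64(0, 1, 0) yields diff = ^uint64(0) with borrow-out = 1.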
  1229  	addF("math/bits", "Add64",
  1230  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1231  			return s.newValue3(ssa.OpAdd64carry, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
  1232  		},
  1233  		sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64)
  1234  	alias("math/bits", "Add", "math/bits", "Add64", p8...)
  1235  	alias("internal/runtime/math", "Add64", "math/bits", "Add64", all...)
  1236  	addF("math/bits", "Sub64",
  1237  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1238  			return s.newValue3(ssa.OpSub64borrow, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
  1239  		},
  1240  		sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64)
  1241  	alias("math/bits", "Sub", "math/bits", "Sub64", p8...)
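        	// Div64 divides the 128-bit value hi:lo (args[0], args[1]) by y
        	// (args[2]). The checks below mirror the documented preconditions
        	// of bits.Div64: y must be non-zero and greater than hi, or the
        	// quotient would not fit in 64 bits. For example, Div64(1, 0, 2)
        	// divides 2^64 by 2 and returns quo = 1<<63, rem = 0, while
        	// Div64(1, 0, 1) panics with an overflow error.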
  1242  	addF("math/bits", "Div64",
  1243  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1244  			// check for divide-by-zero/overflow and panic with appropriate message
  1245  			cmpZero := s.newValue2(s.ssaOp(ir.ONE, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[2], s.zeroVal(types.Types[types.TUINT64]))
  1246  			s.check(cmpZero, ir.Syms.Panicdivide)
  1247  			cmpOverflow := s.newValue2(s.ssaOp(ir.OLT, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[0], args[2])
  1248  			s.check(cmpOverflow, ir.Syms.Panicoverflow)
  1249  			return s.newValue3(ssa.OpDiv128u, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
  1250  		},
  1251  		sys.AMD64)
  1252  	alias("math/bits", "Div", "math/bits", "Div64", sys.ArchAMD64)
  1253  
  1254  	alias("internal/runtime/sys", "TrailingZeros8", "math/bits", "TrailingZeros8", all...)
  1255  	alias("internal/runtime/sys", "TrailingZeros32", "math/bits", "TrailingZeros32", all...)
  1256  	alias("internal/runtime/sys", "TrailingZeros64", "math/bits", "TrailingZeros64", all...)
  1257  	alias("internal/runtime/sys", "Len8", "math/bits", "Len8", all...)
  1258  	alias("internal/runtime/sys", "Len64", "math/bits", "Len64", all...)
  1259  	alias("internal/runtime/sys", "OnesCount64", "math/bits", "OnesCount64", all...)
  1260  
  1261  	/******** sync/atomic ********/
  1262  
  1263  	// Note: these are disabled by flag_race in findIntrinsic below.
  1264  	alias("sync/atomic", "LoadInt32", "internal/runtime/atomic", "Load", all...)
  1265  	alias("sync/atomic", "LoadInt64", "internal/runtime/atomic", "Load64", all...)
  1266  	alias("sync/atomic", "LoadPointer", "internal/runtime/atomic", "Loadp", all...)
  1267  	alias("sync/atomic", "LoadUint32", "internal/runtime/atomic", "Load", all...)
  1268  	alias("sync/atomic", "LoadUint64", "internal/runtime/atomic", "Load64", all...)
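        	// uintptr is pointer-sized, so the Uintptr variants alias the
        	// 32-bit operations on the p4 (4-byte-pointer) architectures and
        	// the 64-bit operations on p8. The same split applies to Store,
        	// Swap, CompareAndSwap and Add below.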
  1269  	alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load", p4...)
  1270  	alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load64", p8...)
  1271  
  1272  	alias("sync/atomic", "StoreInt32", "internal/runtime/atomic", "Store", all...)
  1273  	alias("sync/atomic", "StoreInt64", "internal/runtime/atomic", "Store64", all...)
  1274  	// Note: not StorePointer, that needs a write barrier.  Same below for {CompareAnd}Swap.
  1275  	alias("sync/atomic", "StoreUint32", "internal/runtime/atomic", "Store", all...)
  1276  	alias("sync/atomic", "StoreUint64", "internal/runtime/atomic", "Store64", all...)
  1277  	alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store", p4...)
  1278  	alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store64", p8...)
  1279  
  1280  	alias("sync/atomic", "SwapInt32", "internal/runtime/atomic", "Xchg", all...)
  1281  	alias("sync/atomic", "SwapInt64", "internal/runtime/atomic", "Xchg64", all...)
  1282  	alias("sync/atomic", "SwapUint32", "internal/runtime/atomic", "Xchg", all...)
  1283  	alias("sync/atomic", "SwapUint64", "internal/runtime/atomic", "Xchg64", all...)
  1284  	alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg", p4...)
  1285  	alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg64", p8...)
  1286  
  1287  	alias("sync/atomic", "CompareAndSwapInt32", "internal/runtime/atomic", "Cas", all...)
  1288  	alias("sync/atomic", "CompareAndSwapInt64", "internal/runtime/atomic", "Cas64", all...)
  1289  	alias("sync/atomic", "CompareAndSwapUint32", "internal/runtime/atomic", "Cas", all...)
  1290  	alias("sync/atomic", "CompareAndSwapUint64", "internal/runtime/atomic", "Cas64", all...)
  1291  	alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas", p4...)
  1292  	alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas64", p8...)
  1293  
  1294  	alias("sync/atomic", "AddInt32", "internal/runtime/atomic", "Xadd", all...)
  1295  	alias("sync/atomic", "AddInt64", "internal/runtime/atomic", "Xadd64", all...)
  1296  	alias("sync/atomic", "AddUint32", "internal/runtime/atomic", "Xadd", all...)
  1297  	alias("sync/atomic", "AddUint64", "internal/runtime/atomic", "Xadd64", all...)
  1298  	alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd", p4...)
  1299  	alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd64", p8...)
  1300  
  1301  	alias("sync/atomic", "AndInt32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1302  	alias("sync/atomic", "AndUint32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1303  	alias("sync/atomic", "AndInt64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1304  	alias("sync/atomic", "AndUint64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1305  	alias("sync/atomic", "AndUintptr", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1306  	alias("sync/atomic", "OrInt32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1307  	alias("sync/atomic", "OrUint32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1308  	alias("sync/atomic", "OrInt64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1309  	alias("sync/atomic", "OrUint64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1310  	alias("sync/atomic", "OrUintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1311  
  1312  	/******** math/big ********/
  1313  	alias("math/big", "mulWW", "math/bits", "Mul64", p8...)
  1314  
  1315  	/******** internal/runtime/maps ********/
  1316  
  1317  	// Important: The intrinsic implementations below return a packed
  1318  	// bitset, while the portable Go implementation uses an unpacked
  1319  	// representation (one bit set in each byte).
  1320  	//
  1321  	// Thus we must replace most bitset methods with implementations that
  1322  	// work with the packed representation.
  1323  	//
  1324  	// TODO(prattmic): The bitset implementations don't use SIMD, so they
  1325  	// could be handled with build tags (though that would break
  1326  	// -d=ssa/intrinsics/off=1).
  1327  
  1328  	// With a packed representation we no longer need to shift the result
  1329  	// of TrailingZeros64.
  1330  	alias("internal/runtime/maps", "bitsetFirst", "internal/runtime/sys", "TrailingZeros64", sys.ArchAMD64)
  1331  
  1332  	addF("internal/runtime/maps", "bitsetRemoveBelow",
  1333  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1334  			b := args[0]
  1335  			i := args[1]
  1336  
  1337  			// Clear the lower i bits in b.
  1338  			//
  1339  			// out = b &^ ((1 << i) - 1)
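        			//
        			// For example, b = 0b1011_0100 and i = 3 give
        			// mask = 0b0000_0111, so the result is
        			// b &^ mask = 0b1011_0000.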
  1340  
  1341  			one := s.constInt64(types.Types[types.TUINT64], 1)
  1342  
  1343  			mask := s.newValue2(ssa.OpLsh8x8, types.Types[types.TUINT64], one, i)
  1344  			mask = s.newValue2(ssa.OpSub64, types.Types[types.TUINT64], mask, one)
  1345  			mask = s.newValue1(ssa.OpCom64, types.Types[types.TUINT64], mask)
  1346  
  1347  			return s.newValue2(ssa.OpAnd64, types.Types[types.TUINT64], b, mask)
  1348  		},
  1349  		sys.AMD64)
  1350  
  1351  	addF("internal/runtime/maps", "bitsetLowestSet",
  1352  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1353  			b := args[0]
  1354  
  1355  			// Test the lowest bit in b.
  1356  			//
  1357  			// out = (b & 1) == 1
  1358  
  1359  			one := s.constInt64(types.Types[types.TUINT64], 1)
  1360  			and := s.newValue2(ssa.OpAnd64, types.Types[types.TUINT64], b, one)
  1361  			return s.newValue2(ssa.OpEq64, types.Types[types.TBOOL], and, one)
  1362  		},
  1363  		sys.AMD64)
  1364  
  1365  	addF("internal/runtime/maps", "bitsetShiftOutLowest",
  1366  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1367  			b := args[0]
  1368  
  1369  			// Right shift out the lowest bit in b.
  1370  			//
  1371  			// out = b >> 1
  1372  
  1373  			one := s.constInt64(types.Types[types.TUINT64], 1)
  1374  			return s.newValue2(ssa.OpRsh64Ux64, types.Types[types.TUINT64], b, one)
  1375  		},
  1376  		sys.AMD64)
  1377  
  1378  	addF("internal/runtime/maps", "ctrlGroupMatchH2",
  1379  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1380  			g := args[0]
  1381  			h := args[1]
  1382  
  1383  			// Explicit copies to fp registers. See
  1384  			// https://go.dev/issue/70451.
  1385  			gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
  1386  			hfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, h)
  1387  
  1388  			// Broadcast h2 into each byte of a word.
  1389  			var broadcast *ssa.Value
  1390  			if buildcfg.GOAMD64 >= 4 {
  1391  				// VPBROADCASTB saves 1 instruction vs PSHUFB
  1392  				// because the input can come from a GP
  1393  				// register, while PSHUFB requires moving into
  1394  				// an FP register first.
  1395  				//
  1396  				// Nominally PSHUFB would require a second
  1397  				// additional instruction to load the control
  1398  				// mask into a FP register. But broadcast uses
  1399  				// a control mask of 0, and the register ABI
  1400  				// already defines X15 as a zero register.
  1401  				broadcast = s.newValue1(ssa.OpAMD64VPBROADCASTB, types.TypeInt128, h) // use gp copy of h
  1402  			} else if buildcfg.GOAMD64 >= 2 {
  1403  				// PSHUFB performs a byte broadcast when given
  1404  				// a control input of 0.
  1405  				broadcast = s.newValue1(ssa.OpAMD64PSHUFBbroadcast, types.TypeInt128, hfp)
  1406  			} else {
  1407  				// No direct byte broadcast. First we must
  1408  				// duplicate the lower byte and then do a
  1409  				// 16-bit broadcast.
  1410  
  1411  				// "Unpack" h2 with itself. This duplicates the
  1412  				// input, resulting in h2 in the lower two
  1413  				// bytes.
  1414  				unpack := s.newValue2(ssa.OpAMD64PUNPCKLBW, types.TypeInt128, hfp, hfp)
  1415  
  1416  				// Copy the lower 16-bits of unpack into every
  1417  				// 16-bit slot in the lower 64-bits of the
  1418  				// output register. Note that immediate 0
  1419  				// selects the low word as the source for every
  1420  				// destination slot.
  1421  				broadcast = s.newValue1I(ssa.OpAMD64PSHUFLW, types.TypeInt128, 0, unpack)
  1422  
  1423  				// No need to broadcast into the upper 64-bits,
  1424  				// as we don't use those.
  1425  			}
  1426  
  1427  			// Compare each byte of the control word with h2. Each
  1428  			// matching byte has every bit set.
  1429  			eq := s.newValue2(ssa.OpAMD64PCMPEQB, types.TypeInt128, broadcast, gfp)
  1430  
  1431  			// Construct a "byte mask": each output bit is equal to
  1432  			// the sign bit of each input byte.
  1433  			//
  1434  			// This results in a packed output (bit N set means
  1435  			// byte N matched).
  1436  			//
  1437  			// NOTE: See comment above on bitsetFirst.
  1438  			out := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], eq)
  1439  
  1440  			// g is only 64-bits so the upper 64-bits of the
  1441  			// 128-bit register will be zero. If h2 is also zero,
  1442  			// then we'll get matches on those bytes. Truncate the
  1443  			// upper bits to ignore such matches.
  1444  			ret := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
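        			// For example, with h2 = 0x0a and group bytes (index 0
        			// first) [0x0a 0x3f 0x0a 0x80 0x0a 0x12 0xfe 0x0a], the
        			// matching bytes compare equal as 0xff, and the returned
        			// bitset is 0b1001_0101 (bits 0, 2, 4 and 7 set).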
  1445  
  1446  			return ret
  1447  		},
  1448  		sys.AMD64)
  1449  
  1450  	addF("internal/runtime/maps", "ctrlGroupMatchEmpty",
  1451  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1452  			// An empty slot is   1000 0000
  1453  			// A deleted slot is  1111 1110
  1454  			// A full slot is     0??? ????
  1455  
  1456  			g := args[0]
  1457  
  1458  			// Explicit copy to fp register. See
  1459  			// https://go.dev/issue/70451.
  1460  			gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
  1461  
  1462  			if buildcfg.GOAMD64 >= 2 {
  1463  				// "PSIGNB negates each data element of the
  1464  				// destination operand (the first operand) if
  1465  				// the signed integer value of the
  1466  				// corresponding data element in the source
  1467  				// operand (the second operand) is less than
  1468  				// zero. If the signed integer value of a data
  1469  				// element in the source operand is positive,
  1470  				// the corresponding data element in the
  1471  				// destination operand is unchanged. If a data
  1472  				// element in the source operand is zero, the
  1473  				// corresponding data element in the
  1474  				// destination operand is set to zero" - Intel SDM
  1475  				//
  1476  				// If we pass the group control word as both
  1477  				// arguments:
  1478  				// - Full slots are unchanged.
  1479  				// - Deleted slots are negated, becoming
  1480  				//   0000 0010.
  1481  				// - Empty slots are negated, becoming
  1482  				//   1000 0000 (unchanged!).
  1483  				//
  1484  				// The result is that only empty slots have the
  1485  				// sign bit set. We then use PMOVMSKB to
  1486  				// extract the sign bits.
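        				//
        				// For example, a group of
        				// [full, empty, deleted, empty, full, full, empty, full]
        				// slots yields the bitset 0b0100_1010: only the
        				// empty slots at indices 1, 3 and 6 are reported.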
  1487  				sign := s.newValue2(ssa.OpAMD64PSIGNB, types.TypeInt128, gfp, gfp)
  1488  
  1489  				// Construct a "byte mask": each output bit is
  1490  				// equal to the sign bit of each input byte. The
  1491  				// sign bit is only set for empty or deleted
  1492  				// slots.
  1493  				//
  1494  				// This results in a packed output (bit N set
  1495  				// means byte N matched).
  1496  				//
  1497  				// NOTE: See comment above on bitsetFirst.
  1498  				ret := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], sign)
  1499  
  1500  				// g is only 64-bits so the upper 64-bits of
  1501  				// the 128-bit register will be zero. PSIGNB
  1502  				// will keep all of these bytes zero, so no
  1503  				// need to truncate.
  1504  
  1505  				return ret
  1506  			}
  1507  
  1508  			// No PSIGNB, simply do byte equality with ctrlEmpty.
  1509  
  1510  			// Load ctrlEmpty into each byte of a control word.
  1511  			var ctrlsEmpty uint64 = abi.SwissMapCtrlEmpty
  1512  			e := s.constInt64(types.Types[types.TUINT64], int64(ctrlsEmpty))
  1513  			// Explicit copy to fp register. See
  1514  			// https://go.dev/issue/70451.
  1515  			efp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, e)
  1516  
  1517  			// Compare each byte of the control word with ctrlEmpty. Each
  1518  			// matching byte has every bit set.
  1519  			eq := s.newValue2(ssa.OpAMD64PCMPEQB, types.TypeInt128, efp, gfp)
  1520  
  1521  			// Construct a "byte mask": each output bit is equal to
  1522  			// the sign bit of each input byte.
  1523  			//
  1524  			// This results in a packed output (bit N set means
  1525  			// byte N matched).
  1526  			//
  1527  			// NOTE: See comment above on bitsetFirst.
  1528  			out := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], eq)
  1529  
  1530  			// g is only 64-bits so the upper 64-bits of the
  1531  			// 128-bit register will be zero. The upper 64-bits of
  1532  			// efp are also zero, so we'll get matches on those
  1533  			// bytes. Truncate the upper bits to ignore such
  1534  			// matches.
  1535  			return s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
  1536  		},
  1537  		sys.AMD64)
  1538  
  1539  	addF("internal/runtime/maps", "ctrlGroupMatchEmptyOrDeleted",
  1540  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1541  			// An empty slot is   1000 0000
  1542  			// A deleted slot is  1111 1110
  1543  			// A full slot is     0??? ????
  1544  			//
  1545  			// A slot is empty or deleted iff bit 7 (sign bit) is
  1546  			// set.
  1547  
  1548  			g := args[0]
  1549  
  1550  			// Explicit copy to fp register. See
  1551  			// https://go.dev/issue/70451.
  1552  			gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
  1553  
  1554  			// Construct a "byte mask": each output bit is equal to
  1555  			// the sign bit of each input byte. The sign bit is only
  1556  			// set for empty or deleted slots.
  1557  			//
  1558  			// This results in a packed output (bit N set means
  1559  			// byte N matched).
  1560  			//
  1561  			// NOTE: See comment above on bitsetFirst.
  1562  			ret := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], gfp)
  1563  
  1564  			// g is only 64-bits so the upper 64-bits of the
  1565  			// 128-bit register will be zero. Zero will never match
  1566  			// ctrlEmpty or ctrlDeleted, so no need to truncate.
  1567  
  1568  			return ret
  1569  		},
  1570  		sys.AMD64)
  1571  
  1572  	addF("internal/runtime/maps", "ctrlGroupMatchFull",
  1573  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1574  			// An empty slot is   1000 0000
  1575  			// A deleted slot is  1111 1110
  1576  			// A full slot is     0??? ????
  1577  			//
  1578  			// A slot is full iff bit 7 (sign bit) is unset.
  1579  
  1580  			g := args[0]
  1581  
  1582  			// Explicit copy to fp register. See
  1583  			// https://go.dev/issue/70451.
  1584  			gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
  1585  
  1586  			// Construct a "byte mask": each output bit is equal to
  1587  			// the sign bit of each input byte. The sign bit is only
  1588  			// set for empty or deleted slots.
  1589  			//
  1590  			// This results in a packed output (bit N set means
  1591  			// byte N matched).
  1592  			//
  1593  			// NOTE: See comment above on bitsetFirst.
  1594  			mask := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], gfp)
  1595  
  1596  			// Invert the mask to set the bits for the full slots.
  1597  			out := s.newValue1(ssa.OpCom16, types.Types[types.TUINT16], mask)
  1598  
  1599  			// g is only 64-bits so the upper 64-bits of the
  1600  			// 128-bit register will be zero, with bit 7 unset.
  1601  			// Truncate the upper bits to ignore these.
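        			//
        			// For example, a group of eight full slots produces
        			// mask = 0; Com16 turns that into 0xffff and the
        			// zero-extension keeps only the low byte, returning 0xff.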
  1602  			return s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
  1603  		},
  1604  		sys.AMD64)
  1605  }
  1606  
  1607  // findIntrinsic returns a function which builds the SSA equivalent of the
  1608  // function identified by the symbol sym. If sym does not name an intrinsic, it returns nil.
  1609  func findIntrinsic(sym *types.Sym) intrinsicBuilder {
  1610  	if sym == nil || sym.Pkg == nil {
  1611  		return nil
  1612  	}
  1613  	pkg := sym.Pkg.Path
  1614  	if sym.Pkg == ir.Pkgs.Runtime {
  1615  		pkg = "runtime"
  1616  	}
  1617  	if base.Flag.Race && pkg == "sync/atomic" {
  1618  		// The race detector needs to be able to intercept these calls.
  1619  		// We can't intrinsify them.
  1620  		return nil
  1621  	}
  1622  	// Skip intrinsifying math functions (which may contain hard-float
  1623  	// instructions) when compiling in soft-float mode.
  1624  	if Arch.SoftFloat && pkg == "math" {
  1625  		return nil
  1626  	}
  1627  
  1628  	fn := sym.Name
  1629  	if ssa.IntrinsicsDisable {
  1630  		if pkg == "internal/runtime/sys" && (fn == "GetCallerPC" || fn == "GetCallerSP" || fn == "GetClosurePtr") {
  1631  			// These runtime functions don't have definitions; they must be intrinsics.
  1632  		} else {
  1633  			return nil
  1634  		}
  1635  	}
  1636  	return intrinsics.lookup(Arch.LinkArch.Arch, pkg, fn)
  1637  }
  1638  
  1639  func IsIntrinsicCall(n *ir.CallExpr) bool {
  1640  	if n == nil {
  1641  		return false
  1642  	}
  1643  	name, ok := n.Fun.(*ir.Name)
  1644  	if !ok {
  1645  		return false
  1646  	}
  1647  	return findIntrinsic(name.Sym()) != nil
  1648  }
  1649  
