Source file src/cmd/compile/internal/ssa/config.go

// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package ssa

import (
	"cmd/compile/internal/abi"
	"cmd/compile/internal/base"
	"cmd/compile/internal/ir"
	"cmd/compile/internal/types"
	"cmd/internal/obj"
	"cmd/internal/src"
)

// A Config holds readonly compilation information.
// It is created once, early during compilation,
// and shared across all compilations.
type Config struct {
	arch           string // "amd64", etc.
	PtrSize        int64  // 4 or 8; copy of cmd/internal/sys.Arch.PtrSize
	RegSize        int64  // 4 or 8; copy of cmd/internal/sys.Arch.RegSize
	Types          Types
	lowerBlock     blockRewriter  // block lowering function, first round
	lowerValue     valueRewriter  // value lowering function, first round
	lateLowerBlock blockRewriter  // block lowering function that needs to be run after the first round; only used on some architectures
	lateLowerValue valueRewriter  // value lowering function that needs to be run after the first round; only used on some architectures
	splitLoad      valueRewriter  // function for splitting merged load ops; only used on some architectures
	registers      []Register     // machine registers
	gpRegMask      regMask        // general purpose integer register mask
	fpRegMask      regMask        // floating point register mask
	fp32RegMask    regMask        // 32-bit floating point register mask (currently only set for wasm)
	fp64RegMask    regMask        // 64-bit floating point register mask (currently only set for wasm)
	specialRegMask regMask        // special register mask
	intParamRegs   []int8         // register numbers of integer param (in/out) registers
	floatParamRegs []int8         // register numbers of floating param (in/out) registers
	ABI1           *abi.ABIConfig // "ABIInternal" under development // TODO change comment when this becomes current
	ABI0           *abi.ABIConfig
	FPReg          int8      // register number of frame pointer, -1 if not used
	LinkReg        int8      // register number of link register if it is a general purpose register, -1 if not used
	hasGReg        bool      // has hardware g register
	ctxt           *obj.Link // Generic arch information
	optimize       bool      // Do optimization
	SoftFloat      bool      // emulate floating point with software routines
	Race           bool      // race detector enabled
	BigEndian      bool      // target is big-endian
	unalignedOK    bool      // Unaligned loads/stores are ok
	haveBswap64    bool      // architecture implements Bswap64
	haveBswap32    bool      // architecture implements Bswap32
	haveBswap16    bool      // architecture implements Bswap16
	haveCondSelect bool      // architecture implements CondSelect

	// mulRecipes[x] = function to build v * x from v.
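	// For example, on amd64 mulRecipes[5] can build v*5 with a single
	// LEAQ4 (v + 4*v); see buildRecipes below for how the map is filled.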
	mulRecipes map[int64]mulRecipe
}

type mulRecipe struct {
	cost  int
	build func(*Value, *Value) *Value // build(m, v) returns v * x built at m.
}

type (
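	// Both rewriters report whether they rewrote anything; the lowering
	// passes apply them repeatedly until no more rewrites fire.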
	blockRewriter func(*Block) bool
	valueRewriter func(*Value) bool
)

type Types struct {
	Bool       *types.Type
	Int8       *types.Type
	Int16      *types.Type
	Int32      *types.Type
	Int64      *types.Type
	UInt8      *types.Type
	UInt16     *types.Type
	UInt32     *types.Type
	UInt64     *types.Type
	Int        *types.Type
	Float32    *types.Type
	Float64    *types.Type
	UInt       *types.Type
	Uintptr    *types.Type
	String     *types.Type
	BytePtr    *types.Type // TODO: use unsafe.Pointer instead?
	Int32Ptr   *types.Type
	UInt32Ptr  *types.Type
	IntPtr     *types.Type
	UintptrPtr *types.Type
	Float32Ptr *types.Type
	Float64Ptr *types.Type
	BytePtrPtr *types.Type
	Vec128     *types.Type
	Vec256     *types.Type
	Vec512     *types.Type
	Mask       *types.Type
}

// NewTypes creates and populates a Types.
func NewTypes() *Types {
	t := new(Types)
	t.SetTypPtrs()
	return t
}

// SetTypPtrs populates t.
func (t *Types) SetTypPtrs() {
	t.Bool = types.Types[types.TBOOL]
	t.Int8 = types.Types[types.TINT8]
	t.Int16 = types.Types[types.TINT16]
	t.Int32 = types.Types[types.TINT32]
	t.Int64 = types.Types[types.TINT64]
	t.UInt8 = types.Types[types.TUINT8]
	t.UInt16 = types.Types[types.TUINT16]
	t.UInt32 = types.Types[types.TUINT32]
	t.UInt64 = types.Types[types.TUINT64]
	t.Int = types.Types[types.TINT]
	t.Float32 = types.Types[types.TFLOAT32]
	t.Float64 = types.Types[types.TFLOAT64]
	t.UInt = types.Types[types.TUINT]
	t.Uintptr = types.Types[types.TUINTPTR]
	t.String = types.Types[types.TSTRING]
	t.BytePtr = types.NewPtr(types.Types[types.TUINT8])
	t.Int32Ptr = types.NewPtr(types.Types[types.TINT32])
	t.UInt32Ptr = types.NewPtr(types.Types[types.TUINT32])
	t.IntPtr = types.NewPtr(types.Types[types.TINT])
	t.UintptrPtr = types.NewPtr(types.Types[types.TUINTPTR])
	t.Float32Ptr = types.NewPtr(types.Types[types.TFLOAT32])
	t.Float64Ptr = types.NewPtr(types.Types[types.TFLOAT64])
	t.BytePtrPtr = types.NewPtr(types.NewPtr(types.Types[types.TUINT8]))
	t.Vec128 = types.TypeVec128
	t.Vec256 = types.TypeVec256
	t.Vec512 = types.TypeVec512
	t.Mask = types.TypeMask
}

type Logger interface {
	// Logf logs a message from the compiler.
	Logf(string, ...any)

	// Log reports whether logging is not a no-op;
	// some logging calls account for more than a few heap allocations.
	Log() bool

	// Fatalf reports a compiler error and exits.
	Fatalf(pos src.XPos, msg string, args ...any)

	// Warnl writes compiler messages in the form expected by "errorcheck" tests.
	Warnl(pos src.XPos, fmt_ string, args ...any)

	// Debug_checknil forwards the corresponding debug flag from gc.
	Debug_checknil() bool
}

type Frontend interface {
	Logger

	// StringData returns a symbol pointing to the given string's contents.
	StringData(string) *obj.LSym

	// SplitSlot returns the name we should use for a part of a
	// compound type, given the name of the parent compound type.
	SplitSlot(parent *LocalSlot, suffix string, offset int64, t *types.Type) LocalSlot

	// Syslook returns a symbol of the runtime function/variable with the
	// given name.
	Syslook(string) *obj.LSym

	// UseWriteBarrier reports whether the write barrier is enabled.
	UseWriteBarrier() bool

	// Func returns the ir.Func of the function being compiled.
	Func() *ir.Func
}

// NewConfig returns a new configuration object for the given architecture.
func NewConfig(arch string, types Types, ctxt *obj.Link, optimize, softfloat bool) *Config {
	c := &Config{arch: arch, Types: types}
	switch arch {
	case "amd64":
		c.PtrSize = 8
		c.RegSize = 8
		c.lowerBlock = rewriteBlockAMD64
		c.lowerValue = rewriteValueAMD64
		c.lateLowerBlock = rewriteBlockAMD64latelower
		c.lateLowerValue = rewriteValueAMD64latelower
		c.splitLoad = rewriteValueAMD64splitload
		c.registers = registersAMD64[:]
		c.gpRegMask = gpRegMaskAMD64
		c.fpRegMask = fpRegMaskAMD64
		c.specialRegMask = specialRegMaskAMD64
		c.intParamRegs = paramIntRegAMD64
		c.floatParamRegs = paramFloatRegAMD64
		c.FPReg = framepointerRegAMD64
		c.LinkReg = linkRegAMD64
		c.hasGReg = true
		c.unalignedOK = true
		c.haveBswap64 = true
		c.haveBswap32 = true
		c.haveBswap16 = true
		c.haveCondSelect = true
	case "386":
		c.PtrSize = 4
		c.RegSize = 4
		c.lowerBlock = rewriteBlock386
		c.lowerValue = rewriteValue386
		c.splitLoad = rewriteValue386splitload
		c.registers = registers386[:]
		c.gpRegMask = gpRegMask386
		c.fpRegMask = fpRegMask386
		c.FPReg = framepointerReg386
		c.LinkReg = linkReg386
		c.hasGReg = false
		c.unalignedOK = true
		c.haveBswap32 = true
		c.haveBswap16 = true
	case "arm":
		c.PtrSize = 4
		c.RegSize = 4
		c.lowerBlock = rewriteBlockARM
		c.lowerValue = rewriteValueARM
		c.registers = registersARM[:]
		c.gpRegMask = gpRegMaskARM
		c.fpRegMask = fpRegMaskARM
		c.FPReg = framepointerRegARM
		c.LinkReg = linkRegARM
		c.hasGReg = true
	case "arm64":
		c.PtrSize = 8
		c.RegSize = 8
		c.lowerBlock = rewriteBlockARM64
		c.lowerValue = rewriteValueARM64
		c.lateLowerBlock = rewriteBlockARM64latelower
		c.lateLowerValue = rewriteValueARM64latelower
		c.registers = registersARM64[:]
		c.gpRegMask = gpRegMaskARM64
		c.fpRegMask = fpRegMaskARM64
		c.intParamRegs = paramIntRegARM64
		c.floatParamRegs = paramFloatRegARM64
		c.FPReg = framepointerRegARM64
		c.LinkReg = linkRegARM64
		c.hasGReg = true
		c.unalignedOK = true
		c.haveBswap64 = true
		c.haveBswap32 = true
		c.haveBswap16 = true
		c.haveCondSelect = true
	case "ppc64":
		c.BigEndian = true
		fallthrough
	case "ppc64le":
		c.PtrSize = 8
		c.RegSize = 8
		c.lowerBlock = rewriteBlockPPC64
		c.lowerValue = rewriteValuePPC64
		c.lateLowerBlock = rewriteBlockPPC64latelower
		c.lateLowerValue = rewriteValuePPC64latelower
		c.registers = registersPPC64[:]
		c.gpRegMask = gpRegMaskPPC64
		c.fpRegMask = fpRegMaskPPC64
		c.specialRegMask = specialRegMaskPPC64
		c.intParamRegs = paramIntRegPPC64
		c.floatParamRegs = paramFloatRegPPC64
		c.FPReg = framepointerRegPPC64
		c.LinkReg = linkRegPPC64
		c.hasGReg = true
		c.unalignedOK = true
		// Note: ppc64 has register bswap ops only when GOPPC64>=10.
		// But it has bswap+load and bswap+store ops for all ppc64 variants.
		// That is the sense we're using them here - they are only used
		// in contexts where they can be merged with a load or store.
		c.haveBswap64 = true
		c.haveBswap32 = true
		c.haveBswap16 = true
		c.haveCondSelect = true
	case "mips64":
		c.BigEndian = true
		fallthrough
	case "mips64le":
		c.PtrSize = 8
		c.RegSize = 8
		c.lowerBlock = rewriteBlockMIPS64
		c.lowerValue = rewriteValueMIPS64
		c.lateLowerBlock = rewriteBlockMIPS64latelower
		c.lateLowerValue = rewriteValueMIPS64latelower
		c.registers = registersMIPS64[:]
		c.gpRegMask = gpRegMaskMIPS64
		c.fpRegMask = fpRegMaskMIPS64
		c.specialRegMask = specialRegMaskMIPS64
		c.FPReg = framepointerRegMIPS64
		c.LinkReg = linkRegMIPS64
		c.hasGReg = true
	case "loong64":
		c.PtrSize = 8
		c.RegSize = 8
		c.lowerBlock = rewriteBlockLOONG64
		c.lowerValue = rewriteValueLOONG64
		c.lateLowerBlock = rewriteBlockLOONG64latelower
		c.lateLowerValue = rewriteValueLOONG64latelower
		c.registers = registersLOONG64[:]
		c.gpRegMask = gpRegMaskLOONG64
		c.fpRegMask = fpRegMaskLOONG64
		c.intParamRegs = paramIntRegLOONG64
		c.floatParamRegs = paramFloatRegLOONG64
		c.FPReg = framepointerRegLOONG64
		c.LinkReg = linkRegLOONG64
		c.hasGReg = true
		c.unalignedOK = true
		c.haveBswap64 = true
		c.haveBswap32 = true
		c.haveBswap16 = true
		c.haveCondSelect = true
	case "s390x":
		c.PtrSize = 8
		c.RegSize = 8
		c.lowerBlock = rewriteBlockS390X
		c.lowerValue = rewriteValueS390X
		c.registers = registersS390X[:]
		c.gpRegMask = gpRegMaskS390X
		c.fpRegMask = fpRegMaskS390X
		c.intParamRegs = paramIntRegS390X
		c.floatParamRegs = paramFloatRegS390X
		c.FPReg = framepointerRegS390X
		c.LinkReg = linkRegS390X
		c.hasGReg = true
		c.BigEndian = true
		c.unalignedOK = true
		c.haveBswap64 = true
		c.haveBswap32 = true
		c.haveBswap16 = true // only for loads&stores, see ppc64 comment
	case "mips":
		c.BigEndian = true
		fallthrough
	case "mipsle":
		c.PtrSize = 4
		c.RegSize = 4
		c.lowerBlock = rewriteBlockMIPS
		c.lowerValue = rewriteValueMIPS
		c.registers = registersMIPS[:]
		c.gpRegMask = gpRegMaskMIPS
		c.fpRegMask = fpRegMaskMIPS
		c.specialRegMask = specialRegMaskMIPS
		c.FPReg = framepointerRegMIPS
		c.LinkReg = linkRegMIPS
		c.hasGReg = true
	case "riscv64":
		c.PtrSize = 8
		c.RegSize = 8
		c.lowerBlock = rewriteBlockRISCV64
		c.lowerValue = rewriteValueRISCV64
		c.lateLowerBlock = rewriteBlockRISCV64latelower
		c.lateLowerValue = rewriteValueRISCV64latelower
		c.registers = registersRISCV64[:]
		c.gpRegMask = gpRegMaskRISCV64
		c.fpRegMask = fpRegMaskRISCV64
		c.intParamRegs = paramIntRegRISCV64
		c.floatParamRegs = paramFloatRegRISCV64
		c.FPReg = framepointerRegRISCV64
		c.hasGReg = true
	case "wasm":
		c.PtrSize = 8
		c.RegSize = 8
		c.lowerBlock = rewriteBlockWasm
		c.lowerValue = rewriteValueWasm
		c.registers = registersWasm[:]
		c.gpRegMask = gpRegMaskWasm
		c.fpRegMask = fpRegMaskWasm
		c.fp32RegMask = fp32RegMaskWasm
		c.fp64RegMask = fp64RegMaskWasm
		c.FPReg = framepointerRegWasm
		c.LinkReg = linkRegWasm
		c.hasGReg = true
		c.unalignedOK = true
		c.haveCondSelect = true
	default:
		ctxt.Diag("arch %s not implemented", arch)
	}
	c.ctxt = ctxt
	c.optimize = optimize
	c.SoftFloat = softfloat
	if softfloat {
		c.floatParamRegs = nil // no FP registers in softfloat mode
	}

	c.ABI0 = abi.NewABIConfig(0, 0, ctxt.Arch.FixedFrameSize, 0)
	c.ABI1 = abi.NewABIConfig(len(c.intParamRegs), len(c.floatParamRegs), ctxt.Arch.FixedFrameSize, 1)

	if ctxt.Flag_shared {
		// LoweredWB is secretly a CALL and CALLs on 386 in
		// shared mode get rewritten by obj6.go to go through
		// the GOT, which clobbers BX.
		opcodeTable[Op386LoweredWB].reg.clobbers |= 1 << 3 // BX
	}

	c.buildRecipes(arch)

	return c
}

func (c *Config) Ctxt() *obj.Link { return c.ctxt }

func (c *Config) haveByteSwap(size int64) bool {
	switch size {
	case 8:
		return c.haveBswap64
	case 4:
		return c.haveBswap32
	case 2:
		return c.haveBswap16
	default:
		base.Fatalf("bad size %d\n", size)
		return false
	}
}

func (c *Config) buildRecipes(arch string) {
	// Information for strength-reducing multiplies.
	type linearCombo struct {
		// we can compute a*x+b*y in one instruction
		a, b int64
		// cost, in arbitrary units (tenths of cycles, usually)
		cost int
		// builds SSA value for a*x+b*y. Use the position
		// information from m.
		build func(m, x, y *Value) *Value
	}

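	// For example, amd64's LEAQ2 computes x + 2*y in a single instruction,
	// which is described here as linearCombo{a: 1, b: 2}.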
	// List all the linear combination instructions we have.
	var linearCombos []linearCombo
	r := func(a, b int64, cost int, build func(m, x, y *Value) *Value) {
		linearCombos = append(linearCombos, linearCombo{a: a, b: b, cost: cost, build: build})
	}
	var mulCost int
	switch arch {
	case "amd64":
		// Assume the following costs, from https://gmplib.org/~tege/x86-timing.pdf:
		//    1 - addq, shlq, leaq, negq, subq
		//    3 - imulq
		// These costs limit the rewrites to two instructions.
		// Operations which have to happen in place (and thus
		// may require a reg-reg move) score slightly higher.
		mulCost = 30
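		// (Costs are in tenths of a cycle, so 30 is the 3-cycle imulq;
		// a recipe is only used if its total cost is strictly below this.)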
		// add
		r(1, 1, 10,
			func(m, x, y *Value) *Value {
				v := m.Block.NewValue2(m.Pos, OpAMD64ADDQ, m.Type, x, y)
				if m.Type.Size() == 4 {
					v.Op = OpAMD64ADDL
				}
				return v
			})
		// neg
		r(-1, 0, 11,
			func(m, x, y *Value) *Value {
				v := m.Block.NewValue1(m.Pos, OpAMD64NEGQ, m.Type, x)
				if m.Type.Size() == 4 {
					v.Op = OpAMD64NEGL
				}
				return v
			})
		// sub
		r(1, -1, 11,
			func(m, x, y *Value) *Value {
				v := m.Block.NewValue2(m.Pos, OpAMD64SUBQ, m.Type, x, y)
				if m.Type.Size() == 4 {
					v.Op = OpAMD64SUBL
				}
				return v
			})
		// lea
		r(1, 2, 10,
			func(m, x, y *Value) *Value {
				v := m.Block.NewValue2(m.Pos, OpAMD64LEAQ2, m.Type, x, y)
				if m.Type.Size() == 4 {
					v.Op = OpAMD64LEAL2
				}
				return v
			})
		r(1, 4, 10,
			func(m, x, y *Value) *Value {
				v := m.Block.NewValue2(m.Pos, OpAMD64LEAQ4, m.Type, x, y)
				if m.Type.Size() == 4 {
					v.Op = OpAMD64LEAL4
				}
				return v
			})
		r(1, 8, 10,
			func(m, x, y *Value) *Value {
				v := m.Block.NewValue2(m.Pos, OpAMD64LEAQ8, m.Type, x, y)
				if m.Type.Size() == 4 {
					v.Op = OpAMD64LEAL8
				}
				return v
			})
		// regular shifts
		for i := 2; i < 64; i++ {
			r(1<<i, 0, 11,
				func(m, x, y *Value) *Value {
					v := m.Block.NewValue1I(m.Pos, OpAMD64SHLQconst, m.Type, int64(i), x)
					if m.Type.Size() == 4 {
						v.Op = OpAMD64SHLLconst
					}
					return v
				})
		}

	case "arm64":
		// Rationale (for M2 ultra):
		// - multiply is 3 cycles.
		// - add/neg/sub/shift are 1 cycle.
		// - add/neg/sub+shiftLL are 2 cycles.
		// We break ties against the multiply because using a
		// multiply also needs to load the constant into a register.
		// (It's 3 cycles and 2 instructions either way, but the
		// linear combo one might use 1 less register.)
		// The multiply constant might get lifted out of a loop though. Hmm....
		// Other arm64 chips have different tradeoffs.
		// Some chips' add+shift instructions are 1 cycle for shifts up to 4
		// and 2 cycles for shifts bigger than 4. So weight the larger shifts
		// a bit more.
		// TODO: figure out a happy medium.
		mulCost = 35
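		// (3-cycle multiply = 30; the extra 5 lets a cost-30 linear combo
		// still win the tie against the multiply, per the note above.)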
		// add
		r(1, 1, 10,
			func(m, x, y *Value) *Value {
				return m.Block.NewValue2(m.Pos, OpARM64ADD, m.Type, x, y)
			})
		// neg
		r(-1, 0, 10,
			func(m, x, y *Value) *Value {
				return m.Block.NewValue1(m.Pos, OpARM64NEG, m.Type, x)
			})
		// sub
		r(1, -1, 10,
			func(m, x, y *Value) *Value {
				return m.Block.NewValue2(m.Pos, OpARM64SUB, m.Type, x, y)
			})
		// regular shifts
		for i := 1; i < 64; i++ {
			c := 10
			if i == 1 {
				// Prefer x<<1 over x+x.
				// Note that we eventually reverse this decision in ARM64latelower.rules,
				// but this makes shift combining rules in ARM64.rules simpler.
				c--
			}
			r(1<<i, 0, c,
				func(m, x, y *Value) *Value {
					return m.Block.NewValue1I(m.Pos, OpARM64SLLconst, m.Type, int64(i), x)
				})
		}
		// ADDshiftLL
		for i := 1; i < 64; i++ {
			c := 20
			if i > 4 {
				c++
			}
			r(1, 1<<i, c,
				func(m, x, y *Value) *Value {
					return m.Block.NewValue2I(m.Pos, OpARM64ADDshiftLL, m.Type, int64(i), x, y)
				})
		}
		// NEGshiftLL
		for i := 1; i < 64; i++ {
			c := 20
			if i > 4 {
				c++
			}
			r(-1<<i, 0, c,
				func(m, x, y *Value) *Value {
					return m.Block.NewValue1I(m.Pos, OpARM64NEGshiftLL, m.Type, int64(i), x)
				})
		}
		// SUBshiftLL
		for i := 1; i < 64; i++ {
			c := 20
			if i > 4 {
				c++
			}
			r(1, -1<<i, c,
				func(m, x, y *Value) *Value {
					return m.Block.NewValue2I(m.Pos, OpARM64SUBshiftLL, m.Type, int64(i), x, y)
				})
		}
	case "loong64":
		// - multiply is 4 cycles.
		// - add/sub/shift/alsl are 1 cycle.
		// On loong64, using a multiply also needs to load the constant into a register.
		// TODO: figure out a happy medium.
		mulCost = 45
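		// (4-cycle multiply = 40, plus the same tie-breaking margin as above.)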

		// add
		r(1, 1, 10,
			func(m, x, y *Value) *Value {
				return m.Block.NewValue2(m.Pos, OpLOONG64ADDV, m.Type, x, y)
			})
		// neg
		r(-1, 0, 10,
			func(m, x, y *Value) *Value {
				return m.Block.NewValue1(m.Pos, OpLOONG64NEGV, m.Type, x)
			})
		// sub
		r(1, -1, 10,
			func(m, x, y *Value) *Value {
				return m.Block.NewValue2(m.Pos, OpLOONG64SUBV, m.Type, x, y)
			})

		// regular shifts
		for i := 1; i < 64; i++ {
			c := 10
			if i == 1 {
				// Prefer x<<1 over x+x.
				// Note that we eventually reverse this decision in LOONG64latelower.rules,
				// but this makes shift combining rules in LOONG64.rules simpler.
				c--
			}
			r(1<<i, 0, c,
				func(m, x, y *Value) *Value {
					return m.Block.NewValue1I(m.Pos, OpLOONG64SLLVconst, m.Type, int64(i), x)
				})
		}

		// ADDshiftLLV
		for i := 1; i < 5; i++ {
			c := 10
			r(1, 1<<i, c,
				func(m, x, y *Value) *Value {
					return m.Block.NewValue2I(m.Pos, OpLOONG64ADDshiftLLV, m.Type, int64(i), x, y)
				})
		}
	}

	c.mulRecipes = map[int64]mulRecipe{}

	// Single-instruction recipes.
	// The only option for the input value(s) is v.
	for _, combo := range linearCombos {
		x := combo.a + combo.b
		cost := combo.cost
		old := c.mulRecipes[x]
		if (old.build == nil || cost < old.cost) && cost < mulCost {
			c.mulRecipes[x] = mulRecipe{cost: cost, build: func(m, v *Value) *Value {
				return combo.build(m, v, v)
			}}
		}
	}
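	// Since both inputs are v, a*x+b*y collapses to (a+b)*v here; for
	// example, the amd64 (a=1, b=4) LEAQ4 combo registers as mulRecipes[5].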
	// Two-instruction recipes.
	// A: Both of the outer's inputs are from the same single-instruction recipe.
	// B: First input is v and the second is from a single-instruction recipe.
	// C: Second input is v and the first is from a single-instruction recipe.
	// A is slightly preferred because it often needs 1 less register, so it
	// goes first.
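	// For illustration (amd64; not necessarily the recipes actually kept):
	//   A: v*45 = LEAQ8(t, t) with t = LEAQ4(v, v)   (5 * 9)
	//   B: v*11 = LEAQ2(v, LEAQ4(v, v))              (1 + 2*5)
	//   C: v*7  = LEAQ2(LEAQ4(v, v), v)              (5 + 2)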

	// A
	for _, inner := range linearCombos {
		for _, outer := range linearCombos {
			x := (inner.a + inner.b) * (outer.a + outer.b)
			cost := inner.cost + outer.cost
			old := c.mulRecipes[x]
			if (old.build == nil || cost < old.cost) && cost < mulCost {
				c.mulRecipes[x] = mulRecipe{cost: cost, build: func(m, v *Value) *Value {
					v = inner.build(m, v, v)
					return outer.build(m, v, v)
				}}
			}
		}
	}

	// B
	for _, inner := range linearCombos {
		for _, outer := range linearCombos {
			x := outer.a + outer.b*(inner.a+inner.b)
			cost := inner.cost + outer.cost
			old := c.mulRecipes[x]
			if (old.build == nil || cost < old.cost) && cost < mulCost {
				c.mulRecipes[x] = mulRecipe{cost: cost, build: func(m, v *Value) *Value {
					return outer.build(m, v, inner.build(m, v, v))
				}}
			}
		}
	}

	// C
	for _, inner := range linearCombos {
		for _, outer := range linearCombos {
			x := outer.a*(inner.a+inner.b) + outer.b
			cost := inner.cost + outer.cost
			old := c.mulRecipes[x]
			if (old.build == nil || cost < old.cost) && cost < mulCost {
				c.mulRecipes[x] = mulRecipe{cost: cost, build: func(m, v *Value) *Value {
					return outer.build(m, inner.build(m, v, v), v)
				}}
			}
		}
	}

	// Currently we only build three-instruction recipes for loong64.
	if arch == "loong64" {
		// Three-instruction recipes.
		// D: The first and second recipes are both single-instruction recipes, and their results are the third's two inputs.
		// E: The first recipe's result feeds the second, and the second's result feeds the third; v supplies the other input at each step.
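		// For example (pattern D, illustrative): v1 = v<<3 = 8*v and
		// v2 = v + v<<2 = 5*v combine as v1 + v2<<4 = 88*v.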

		// D
		for _, first := range linearCombos {
			for _, second := range linearCombos {
				for _, third := range linearCombos {
					x := third.a*(first.a+first.b) + third.b*(second.a+second.b)
					cost := first.cost + second.cost + third.cost
					old := c.mulRecipes[x]
					if (old.build == nil || cost < old.cost) && cost < mulCost {
						c.mulRecipes[x] = mulRecipe{cost: cost, build: func(m, v *Value) *Value {
							v1 := first.build(m, v, v)
							v2 := second.build(m, v, v)
							return third.build(m, v1, v2)
						}}
					}
				}
			}
		}

		// E
		for _, first := range linearCombos {
			for _, second := range linearCombos {
				for _, third := range linearCombos {
					x := third.a*(second.a*(first.a+first.b)+second.b) + third.b
					cost := first.cost + second.cost + third.cost
					old := c.mulRecipes[x]
					if (old.build == nil || cost < old.cost) && cost < mulCost {
						c.mulRecipes[x] = mulRecipe{cost: cost, build: func(m, v *Value) *Value {
							v1 := first.build(m, v, v)
							v2 := second.build(m, v1, v)
							return third.build(m, v2, v)
						}}
					}
				}
			}
		}
	}

	// These cases should be handled specially by rewrite rules.
	// (Otherwise v * 1 == (neg (neg v)))
	delete(c.mulRecipes, 0)
	delete(c.mulRecipes, 1)

	// Currently:
	// len(c.mulRecipes) == 5984 on arm64
	//                       680 on amd64
	//                      9738 on loong64
	// This function takes ~2.5ms on arm64.
	//println(len(c.mulRecipes))
}
