// Copyright 2026 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package arm64 import ( "cmd/internal/obj" "fmt" "iter" "math" "math/bits" ) // instEncoder represents an instruction encoder. type instEncoder struct { goOp obj.As // Go opcode mnemonic fixedBits uint32 // Known bits args []operand // Operands, in Go order } type varBits struct { // The low and high bit index in the binary encoding, exclusive on hi lo, hi int encoded bool // If true then its value is already encoded bits uint32 } // component is the component of an binary encoding. // e.g. for operand ., 's encoding function might be described as: // // For the "Byte and halfword" variant: is the size specifier, // sz // 0 B // 1 H // bit range mappings: // sz: [22:23) // // Then sz is the component of the binary encoding. type component uint16 type elemEncoder struct { fn func(uint32) (uint32, bool) // comp is the component of the binary encoding. comp component } // operand is the operand type of an instruction. type operand struct { class AClass // Operand class, register, constant, memory operation etc. // The elements that this operand includes, this only includes the encoding-related parts // They are represented as a list of pointers to the encoding functions. // The first returned value is the encoded binary, the second is the ok signal. // The encoding functions return the ok signal for deduplication purposes: // For example: // SDOT ., ., . // SDOT .H, .B, .B // SDOT .S, .H, .H // // and are specified in the encoding text, that there is a constraint "T = 4*Tb". // We don't know this fact by looking at the encoding format solely, without this information // the first encoding domain entails the other 2. And at instruction matching phase we simply // cannot deduplicate them. So we defer this deduplication to the encoding phase. // We need the ok signal with [elemEncoder.comp] field to deduplicate them. elemEncoders []elemEncoder } // opsInProg returns an iterator over the operands ([Addr]) of p func opsInProg(p *obj.Prog) iter.Seq[*obj.Addr] { return func(yield func(*obj.Addr) bool) { // Go order: From, Reg, RestArgs..., To // For SVE, Reg is unused as it's so common that registers have arrangements. if p.From.Type != obj.TYPE_NONE { if !yield(&p.From) { return } } for j := range p.RestArgs { if !yield(&p.RestArgs[j].Addr) { return } } if p.To.Type != obj.TYPE_NONE { if !yield(&p.To) { return } } } } // aclass returns the AClass of an Addr. func aclass(a *obj.Addr) AClass { if a.Type == obj.TYPE_REG { if a.Offset&(int64(1)<<62) != 0 { return AC_PREGSEL } if a.Reg >= REG_Z0 && a.Reg <= REG_Z31 { return AC_ZREG } if a.Reg >= REG_P0 && a.Reg <= REG_PN15 { return AC_PREG } if a.Reg >= REG_ARNG && a.Reg < REG_ELEM { return AC_ARNG } if a.Reg >= REG_ZARNG && a.Reg < REG_ZARNGELEM { return AC_ARNG } if a.Reg >= REG_ZARNGELEM && a.Reg < REG_PZELEM { return AC_ARNGIDX } if a.Reg >= REG_PZELEM && a.Reg < REG_PARNGZM { if a.Reg&(1<<5) == 0 { return AC_ZREGIDX } else { return AC_PREGIDX } } if a.Reg >= REG_PARNGZM && a.Reg < REG_PARNGZM_END { switch (a.Reg >> 5) & 15 { case PRED_M, PRED_Z: return AC_PREGZM default: return AC_ARNG } } if a.Reg >= REG_V0 && a.Reg <= REG_V31 { return AC_VREG } if a.Reg >= REG_R0 && a.Reg <= REG_R31 || a.Reg == REG_RSP { return AC_SPZGREG } } if a.Type == obj.TYPE_CONST || a.Type == obj.TYPE_FCONST { return AC_IMM } if a.Type == obj.TYPE_REGLIST { if a.Scale > 0 { return AC_REGLIST_RANGE } switch (a.Offset >> 12) & 0xf { case 0x7: return AC_REGLIST1 case 0xa: return AC_REGLIST2 case 0x6: return AC_REGLIST3 case 0x2: return AC_REGLIST4 } } if a.Type == obj.TYPE_MEM { if a.Index == 0 { if a.Scale&-32768 != 0 { return AC_MEMOFFMULVL } return AC_MEMOFF } return AC_MEMEXT } if a.Type == obj.TYPE_SPECIAL { return AC_SPECIAL } panic(fmt.Errorf("unknown AClass, addr = %v\n", a)) } // addrComponent returns the binary (component) of the stored element in a at index, for operand // of type aclass. // // For example, for operand of type AC_ARNG, it has 2 permissible components (identified by index) // 0. register: // 1. arrangement: // // They are stored in a.Reg as: // // reg | (arrangement << 5) // // More details are in the comments in the switch cases of this function. func addrComponent(a *obj.Addr, acl AClass, index int) uint32 { switch acl { // AClass: AC_PREGSEL // GNU mnemonic: .[, ] // Go mnemonic: // [selreg, $idximm](preg.T) // Encoding: // Type = TYPE_REG // Offset = packed bits: preg (5 bits) | T (4 bits) | selreg (5 bits) | idximm (6 bits) | sentinel (bit 62) case AC_PREGSEL: switch index { case 0: return uint32(a.Offset & 31) case 1: return uint32((a.Offset >> 5) & 15) case 2: return uint32((a.Offset>>9)&31 + REG_R0) // encoding functions assume a ARM64 register case 3: // This is to check the width of selreg, which is guaranteed to be W in AC_PREGSEL. // W check always returns true as it's resolved in the generator already. // So we just return a default value 0 here. return 0 case 4: return uint32((a.Offset >> 14) & 63) default: panic(fmt.Errorf("unknown elm index at %d in AClass %d", index, acl)) } // AClass: AC_ARNG, AC_PREG, AC_PREGZ, AC_PREGM, AC_ZREG // GNU mnemonic: . Or / (T is M or Z) // Go mnemonic: // reg. // Encoding: // Type = TYPE_REG // Reg = reg | (arrangement or predication << 5) case AC_ARNG, AC_PREG, AC_PREGZM, AC_ZREG: switch index { case 0: return uint32(a.Reg & 31) case 1: return uint32((a.Reg >> 5) & 15) default: panic(fmt.Errorf("unknown elm index at %d in AClass %d", index, acl)) } // AClass: AC_ARNGIDX, AC_PREGIDX, AC_ZREGIDX // GNU mnemonic: .[] // Go mnemonic: // reg.T[index] // Encoding: // Type = TYPE_REG // Reg = reg | (arrangement << 5) // Index = index case AC_ARNGIDX, AC_PREGIDX, AC_ZREGIDX: switch index { case 0: return uint32(a.Reg & 31) case 1: // Arrangement return uint32((a.Reg >> 5) & 15) case 2: // Index return uint32(a.Index) default: panic(fmt.Errorf("unknown elm index at %d in AClass %d", index, acl)) } // AClass: AC_SPZGREG, AC_VREG // GNU mnemonic: // Go mnemonic: // reg (the width is already represented in the opcode) // Encoding: // Type = TYPE_REG // Reg = reg case AC_SPZGREG, AC_VREG: switch index { case 0: // These are all width checks, they should map to no-op checks altogether. return 0 case 1: return uint32(a.Reg) default: panic(fmt.Errorf("unknown elm index at %d in AClass %d", index, acl)) } // AClass: AC_IMM // GNU mnemonic: , // Go mnemonic: // $imm<> 32 & 0b11 sum := 32 if prefix == 2 { sum = 16 } switch acl { case AC_REGLIST1: if index > 2 { panic(fmt.Errorf("unknown elm index at %d in AClass %d", index, acl)) } case AC_REGLIST2: if index > 4 { panic(fmt.Errorf("unknown elm index at %d in AClass %d", index, acl)) } case AC_REGLIST3: if index > 6 { panic(fmt.Errorf("unknown elm index at %d in AClass %d", index, acl)) } case AC_REGLIST4: if index > 8 { panic(fmt.Errorf("unknown elm index at %d in AClass %d", index, acl)) } case AC_REGLIST_RANGE: // It behaves just like a AC_REGLIST2 if index > 4 { panic(fmt.Errorf("unknown elm index at %d in AClass %d", index, acl)) } } switch index % 2 { case 0: // register if a.Scale > 0 { // For register ranges in SVE we allow discontiguous registers. return uint32((firstReg + (index/2)*int(a.Scale)) % sum) } return uint32((firstReg + index/2) % sum) case 1: // arrangement curQ := a.Offset >> 30 & 0b11 curSize := a.Offset >> 10 & 0b11 switch curQ { case 0: switch curSize { case 0: return ARNG_8B case 1: return ARNG_4H case 2: return ARNG_2S case 3: return ARNG_1D default: panic(fmt.Errorf("unknown size value at %d in AClass %d", index, acl)) } case 1: switch curSize { case 0: return ARNG_16B case 1: return ARNG_8H case 2: return ARNG_4S case 3: return ARNG_2D default: panic(fmt.Errorf("unknown size value at %d in AClass %d", index, acl)) } case 2: switch curSize { case 1: return ARNG_B case 2: return ARNG_H case 3: return ARNG_S default: panic(fmt.Errorf("unknown size value at %d in AClass %d", index, acl)) } case 3: switch curSize { case 1: return ARNG_D case 2: return ARNG_Q default: panic(fmt.Errorf("unknown size value at %d in AClass %d", index, acl)) } default: panic(fmt.Errorf("unknown Q value at %d in AClass %d", index, acl)) } default: panic(fmt.Errorf("unknown elm index at %d in AClass %d", index, acl)) } // AClass: AC_SPECIAL // GNU mnemonic: // Go mnemonic: // special // Encoding: // Type = TYPE_SPECIAL // Offset = SpecialOperand enum value case AC_SPECIAL: switch index { case 0: return uint32(a.Offset) default: panic(fmt.Errorf("unknown elm index at %d in AClass %d", index, acl)) } // AClass: AC_MEMOFF, AC_MEMOFFMULVL // GNU mnemonic: [., #] // Go mnemonic: // imm(reg.T) // Encoding: // Type = TYPE_MEM // Reg = Base register (with arrangement if applicable) // Offset = Immediate offset case AC_MEMOFF, AC_MEMOFFMULVL: switch index { case 0: return uint32(a.Reg & 31) case 1: return uint32((a.Reg >> 5) & 15) case 2: return uint32(a.Offset) default: panic(fmt.Errorf("unknown elm index at %d in AClass %d", index, acl)) } // AClass: AC_MEMEXT // GNU mnemonic: [., ., ] // Go mnemonic: // (reg2.T2.mod<> 5) & 15) case 2: return uint32(a.Reg) case 3: return uint32((a.Reg >> 5) & 15) case 4: // mod is either 1 (UXTW), 2 (SXTW), or 4 (LSL) mod := uint32((a.Scale >> 9) & 0x7) amount := uint32((a.Scale >> 12) & 0x7) if mod == 0 && amount > 0 { // LSL is implied when no extension is specified but amount > 0 mod |= 1 << 2 } return mod case 5: return uint32((a.Scale >> 12) & 0x7) default: panic(fmt.Errorf("unknown elm index at %d in AClass %d", index, acl)) } } // TODO: handle more AClasses. panic(fmt.Errorf("unknown AClass %d", acl)) } var codeI1Tsz uint32 = 0xffffffff var codeImm2Tsz uint32 = 0xfffffffe var codeShift161919212223 uint32 = 0xfffffffd var codeShift161919212224 uint32 = 0xfffffffc var codeShift588102224 uint32 = 0xfffffffb var codeLogicalImmArrEncoding uint32 = 0xfffffffa var codeImm3Tsize1621 uint32 = 0xfffffff9 var codeShiftI1TszhTszl uint32 = 0xfffffff8 var codeNoOp uint32 = 0xfffffff7 // encodeI1Tsz is the implementation of the following encoding logic: // Is the immediate index, in the range 0 to one less than the number of elements in 128 bits, encoded in "i1:tsz". // bit range mappings: // i1: [20:21) // tsz: [16:20) // Note: // // arr is the arrangement. // This encoding is aligned to the high bit of the box, according to the spec. func encodeI1Tsz(v, arr uint32) (uint32, bool) { switch arr { case ARNG_B: if v > 15 { return 0, false } return v << 17, true case ARNG_H: if v > 7 { return 0, false } return v << 18, true case ARNG_S: if v > 3 { return 0, false } return v << 19, true case ARNG_D: if v > 1 { return 0, false } return v << 20, true case ARNG_Q: if v > 0 { return 0, false } return 0, true default: return 0, false } } // encodeImm2Tsz is the implementation of the following encoding logic: // Is the immediate index, in the range 0 to one less than the number of elements in 512 bits, encoded in "imm2:tsz". // bit range mappings: // imm2: [22:24) // tsz: [16:21) // Note: // // arr is the arrangement. // This encoding is aligned to the high bit of the box, according to the spec. func encodeImm2Tsz(v, arr uint32) (uint32, bool) { switch arr { case ARNG_B: if v > 63 { return 0, false } v <<= 1 return (v&31)<<16 | (v>>5)<<22, true case ARNG_H: if v > 31 { return 0, false } v <<= 2 return (v&31)<<16 | (v>>5)<<22, true case ARNG_S: if v > 15 { return 0, false } v <<= 3 return (v&31)<<16 | (v>>5)<<22, true case ARNG_D: if v > 7 { return 0, false } v <<= 4 return (v&31)<<16 | (v>>5)<<22, true case ARNG_Q: if v > 3 { return 0, false } v <<= 5 return (v&31)<<16 | (v>>5)<<22, true default: return 0, false } } type arrAlignType int const ( arrAlignBHSD arrAlignType = iota arrAlignHSD arrAlignBHS ) // encodeShiftTriple encodes an shift immediate value in "tszh:tszl:imm3". // tszh, tszl, imm3 are in ranges, sorted by bit position. // These shifts are also bounded by arrangement element size. func encodeShiftTriple(v uint32, r [6]int, prevAddr *obj.Addr, op obj.As) (uint32, bool) { // The previous op must be a scalable vector, and we need its arrangement. acl := aclass(prevAddr) if acl != AC_ARNG { return 0, false } arr := addrComponent(prevAddr, acl, 1) // Get arrangement elemBits := uint32(0) switch arr { case ARNG_B: elemBits = 8 case ARNG_H: elemBits = 16 case ARNG_S: elemBits = 32 case ARNG_D: elemBits = 64 default: return 0, false } if v >= elemBits { return 0, false } var C uint32 // Unfortunately these information are in the decoding ASL. // For these instructions, the esize (see comment in the switch below) // is derived from the destination arrangement, however how this function is called is deriving // the esize from one of the source. // We need to address this discrepancy. effectiveEsize := elemBits switch op { case AZRSHRNB, AZRSHRNT, AZSHRNB, AZSHRNT, AZSQRSHRNB, AZSQRSHRNT, AZSQRSHRUNB, AZSQRSHRUNT, AZSQSHRNB, AZSQSHRNT, AZSQSHRUNB, AZSQSHRUNT, AZUQRSHRNB, AZUQRSHRNT, AZUQSHRNB, AZUQSHRNT: effectiveEsize = elemBits / 2 } switch op { case AZASR, AZLSR, AZURSHR, AZASRD, AZRSHRNB, AZRSHRNT, AZSHRNB, AZSHRNT, AZSQRSHRNB, AZSQRSHRNT, AZSQRSHRUNB, AZSQRSHRUNT, AZSQSHRNB, AZSQSHRNT, AZSQSHRUNB, AZSQSHRUNT, AZSRSHR, AZUQRSHRNB, AZUQRSHRNT, AZUQSHRNB, AZUQSHRNT, AZURSRA, AZUSRA, AZXAR, AZSRI, AZSRSRA, AZSSRA: // ASL: let shift : integer = (2 * esize) - UInt(tsize::imm3); if v == 0 { return 0, false } C = (2 * effectiveEsize) - v default: // ASL: let shift : integer = UInt(tsize::imm3) - esize; C = effectiveEsize + v } var chunks [3]uint32 for i := 0; i < 6; i += 2 { chunks[i/2] = C & ((1 << (r[i+1] - r[i])) - 1) C >>= (r[i+1] - r[i]) } return uint32((chunks[0] << r[0]) | (chunks[1] << r[2]) | (chunks[2] << r[4])), true } // encodeLogicalImmEncoding is the implementation of the following encoding logic: // Is the size specifier, // imm13 // 0xxxxxx0xxxxx S // 0xxxxxx10xxxx H // 0xxxxxx110xxx B // 0xxxxxx1110xx B // 0xxxxxx11110x B // 0xxxxxx11111x RESERVED // 1xxxxxxxxxxxx D // At the meantime: // Is a 64, 32, 16 or 8-bit bitmask consisting of replicated 2, 4, 8, 16, 32 or 64 bit fields, // each field containing a rotated run of non-zero bits, encoded in the "imm13" field. // // bit range mappings: // imm13: [5:18) // // ARM created a "clever" recipe that can generate useful repeating 8-64 bit bitmasks. // Instead of storing the literal binary number, the processor reads a 13-bit recipe // using three fields (bits from high to low): // N (1 bit), immr (6 bits), and imms (6 bits). // // How the recipe works: // Every logical immediate represents a repeating pattern (like repeating tiles). The processor // uses the three fields to figure out the size of the tile, how many 1s are in the tile, and // how far to rotate it. // The N bit combined with the upper bits of imms determines the width of the repeating block. // Depending on these bits, the fundamental block can be 2, 4, 8, 16, 32, or 64 bits wide. // The lower bits of imms dictate exactly how many contiguous 1s exist inside that block. // The immr value tells the processor how many bits to rotate that block to the right. // Finally, the resulting block is duplicated to fill a standard 64-bit lane. func encodeLogicalImmArrEncoding(v uint64, adjacentAddr *obj.Addr) (uint32, bool) { acl := aclass(adjacentAddr) if acl != AC_ARNG { return 0, false } arr := addrComponent(adjacentAddr, acl, 1) // Replicate the given immediate to fill a full 64-bit lane. // This ensures our pattern-shrinking logic naturally respects the vector lane bounds. var val uint64 switch arr { case ARNG_B: // 8-bit lane v8 := uint64(v & 0xFF) val = v8 * 0x0101010101010101 case ARNG_H: // 16-bit lane v16 := uint64(v & 0xFFFF) val = v16 * 0x0001000100010001 case ARNG_S: // 32-bit lane v32 := uint64(v) val = v32 | (v32 << 32) case ARNG_D: // 64-bit lane val = uint64(v) default: return 0, false } // Reject all zeros or all ones (handled by MOV/EOR, invalid for AND/ORR immediates) if val == 0 || val == ^uint64(0) { return 0, false } // Find the absolute smallest repeating pattern size (64 down to 2) size := uint64(64) for size > 2 { half := size / 2 mask := (uint64(1) << half) - 1 lower := val & mask upper := (val >> half) & mask // If the top half matches the bottom half, shrink our window if lower == upper { size = half val = lower } else { break } } // Count the contiguous ones in this minimal pattern mask := (uint64(1) << size) - 1 val &= mask ones := bits.OnesCount64(val) // Find the right-rotation (rot) needed to align the 1s at the bottom expected := (uint64(1) << ones) - 1 rot := -1 for r := 0; r < int(size); r++ { // Right rotate 'val' by 'r' bits within a 'size'-bit window rotated := ((val >> r) | (val << (int(size) - r))) & mask if rotated == expected { rot = r break } } if rot == -1 { return 0, false } // immr is the amount the hardware must right-rotate the base pattern. // Since 'rot' is how much we right-rotated the target to find the base, // the hardware needs the inverse rotation. immr := uint32((int(size) - rot) % int(size)) // If we couldn't find a rotation that forms a perfect contiguous block of 1s, it's invalid. if rot == -1 { return 0, false } // Encode N, immr, and imms n := uint32(0) if size == 64 { n = 1 } // The imms prefix is mathematically generated by (~(size*2 - 1) & 0x3F). // We then OR it with the number of ones (minus 1). imms := (uint32(^(size*2 - 1)) & 0x3F) | uint32(ones-1) // Construct the final 13-bit field: N (1) | immr (6) | imms (6) imm13 := (n << 12) | (immr << 6) | imms // Shift by 5 to place imm13 into instruction bits [5:17] return imm13 << 5, true } // encodeImm3Tsize1621 is the implementation of the following encoding logic: // Is the immediate shift amount, in the range 1 to number of bits per element, encoded in "tsize:imm3". // bit range mappings: // imm3: [16:19) // tsize: [19:21) // // srcArr is the in the source reglist (ranged). func encodeImm3Tsize1621(v uint32, srcArr uint32) (uint32, bool) { // From ARM ASL: let shift : integer = (2 * esize) - UInt(tsize::imm3); // Then tsize::imm3 = (2 * esize) - shift. var size uint32 switch srcArr { case ARNG_H: // It's the destination size, which is half the source. size = 8 case ARNG_S: size = 16 default: return 0, false } if v < 1 || v > size { return 0, false } return (((2*size - v) & 0x1f) << 16), true } // encodeShiftI1TszhTszl is the implementation of the following encoding logic: // Is the element index, in the range 0 to one less than the number of vector elements in a 128-bit vector register, encoded in "i1:tszh:tszl". // bit range mappings: // i1: [23:24) // tszh: [22:23) // tszl: [18:21) // // arr is the arrangement func encodeShiftI1TszhTszl(v uint32, arr uint32) (uint32, bool) { var shift, max uint32 switch arr { case ARNG_B: shift = 1 max = 16 case ARNG_H: shift = 2 max = 8 case ARNG_S: shift = 3 max = 4 case ARNG_D: shift = 4 max = 2 default: return 0, false } if v >= max { return 0, false } v <<= shift return ((v & 0x7) << 18) | ((v >> 3) << 22), true } // tryEncode tries to encode p with i, it returns the encoded binary and ok signal. func (i *instEncoder) tryEncode(p *obj.Prog) (uint32, bool) { bin := i.fixedBits // Some elements are encoded in the same component, they need to be equal. // For example { .-. }. // The 2 instances of must encode to the same value. encoded := map[component]uint32{} var addrs []*obj.Addr for addr := range opsInProg(p) { addrs = append(addrs, addr) } if len(addrs) != len(i.args) { return 0, false } for opIdx, addr := range addrs { if opIdx >= len(i.args) { return 0, false } op := i.args[opIdx] acl := aclass(addr) if acl != op.class { return 0, false } for i, enc := range op.elemEncoders { val := addrComponent(addr, acl, i) if (p.As == AZFCPY || p.As == AZFDUP) && acl == AC_IMM { // These instructions expects ARM's 8-bit float encoding. // Reinterpret the uint32 bits back as a float32, then convert to float64 for chipfloat7 fval := float64(math.Float32frombits(val)) encode := (&ctxt7{}).chipfloat7(fval) if encode == -1 { // Handle error or return false to indicate mismatch return 0, false } val = uint32(encode) } if b, ok := enc.fn(val); ok || b != 0 { specialB := uint32(b) if !ok { specialB = b switch b { case codeI1Tsz: b, ok = encodeI1Tsz(val, addrComponent(addr, acl, i-1)) case codeImm2Tsz: b, ok = encodeImm2Tsz(val, addrComponent(addr, acl, i-1)) case codeShift161919212223: b, ok = encodeShiftTriple(val, [6]int{16, 19, 19, 21, 22, 23}, addrs[opIdx+1], p.As) case codeShift161919212224: b, ok = encodeShiftTriple(val, [6]int{16, 19, 19, 21, 22, 24}, addrs[opIdx+1], p.As) case codeShift588102224: b, ok = encodeShiftTriple(val, [6]int{5, 8, 8, 10, 22, 24}, addrs[opIdx+1], p.As) case codeLogicalImmArrEncoding: // Now that we know this is an immediate. // ARM64 allows imm13 to encode up to 64 bits of immediates. // addrComponent is not the right fit here, we need to extract [Offset] fields manually. b, ok = encodeLogicalImmArrEncoding(uint64(addr.Offset), addrs[opIdx+1]) case codeImm3Tsize1621: b, ok = encodeImm3Tsize1621(val, addrComponent(addrs[opIdx+1], aclass(addrs[opIdx+1]), 1)) case codeShiftI1TszhTszl: b, ok = encodeShiftI1TszhTszl(val, addrComponent(addr, AC_PREGSEL, 1)) case codeNoOp: b, ok = 0, true default: panic(fmt.Errorf("unknown encoding function code %d", b)) } } if !ok { return 0, false } bin |= b if _, ok := encoded[enc.comp]; ok && b != encoded[enc.comp] { if specialB == codeNoOp { // NoOp encodings don't need checks. continue } return 0, false } if enc.comp != enc_NIL && specialB != codeNoOp { // NoOp encodings don't need bookkeeping. encoded[enc.comp] = b } } else { return 0, false } } } return bin, true }