diff options
Diffstat (limited to 'vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend')
51 files changed, 25568 insertions, 0 deletions
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/abi.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/abi.go new file mode 100644 index 000000000..cf91c6b7a --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/abi.go @@ -0,0 +1,170 @@ +package backend + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +type ( + // FunctionABI represents the ABI information for a function which corresponds to a ssa.Signature. + FunctionABI struct { + Initialized bool + + Args, Rets []ABIArg + ArgStackSize, RetStackSize int64 + + ArgIntRealRegs byte + ArgFloatRealRegs byte + RetIntRealRegs byte + RetFloatRealRegs byte + } + + // ABIArg represents either argument or return value's location. + ABIArg struct { + // Index is the index of the argument. + Index int + // Kind is the kind of the argument. + Kind ABIArgKind + // Reg is valid if Kind == ABIArgKindReg. + // This VReg must be based on RealReg. + Reg regalloc.VReg + // Offset is valid if Kind == ABIArgKindStack. + // This is the offset from the beginning of either arg or ret stack slot. + Offset int64 + // Type is the type of the argument. + Type ssa.Type + } + + // ABIArgKind is the kind of ABI argument. + ABIArgKind byte +) + +const ( + // ABIArgKindReg represents an argument passed in a register. + ABIArgKindReg = iota + // ABIArgKindStack represents an argument passed in the stack. + ABIArgKindStack +) + +// String implements fmt.Stringer. +func (a *ABIArg) String() string { + return fmt.Sprintf("args[%d]: %s", a.Index, a.Kind) +} + +// String implements fmt.Stringer. +func (a ABIArgKind) String() string { + switch a { + case ABIArgKindReg: + return "reg" + case ABIArgKindStack: + return "stack" + default: + panic("BUG") + } +} + +// Init initializes the abiImpl for the given signature. 
+func (a *FunctionABI) Init(sig *ssa.Signature, argResultInts, argResultFloats []regalloc.RealReg) { + if len(a.Rets) < len(sig.Results) { + a.Rets = make([]ABIArg, len(sig.Results)) + } + a.Rets = a.Rets[:len(sig.Results)] + a.RetStackSize = a.setABIArgs(a.Rets, sig.Results, argResultInts, argResultFloats) + if argsNum := len(sig.Params); len(a.Args) < argsNum { + a.Args = make([]ABIArg, argsNum) + } + a.Args = a.Args[:len(sig.Params)] + a.ArgStackSize = a.setABIArgs(a.Args, sig.Params, argResultInts, argResultFloats) + + // Gather the real registers usages in arg/return. + a.ArgIntRealRegs, a.ArgFloatRealRegs = 0, 0 + a.RetIntRealRegs, a.RetFloatRealRegs = 0, 0 + for i := range a.Rets { + r := &a.Rets[i] + if r.Kind == ABIArgKindReg { + if r.Type.IsInt() { + a.RetIntRealRegs++ + } else { + a.RetFloatRealRegs++ + } + } + } + for i := range a.Args { + arg := &a.Args[i] + if arg.Kind == ABIArgKindReg { + if arg.Type.IsInt() { + a.ArgIntRealRegs++ + } else { + a.ArgFloatRealRegs++ + } + } + } + + a.Initialized = true +} + +// setABIArgs sets the ABI arguments in the given slice. This assumes that len(s) >= len(types) +// where if len(s) > len(types), the last elements of s is for the multi-return slot. +func (a *FunctionABI) setABIArgs(s []ABIArg, types []ssa.Type, ints, floats []regalloc.RealReg) (stackSize int64) { + il, fl := len(ints), len(floats) + + var stackOffset int64 + intParamIndex, floatParamIndex := 0, 0 + for i, typ := range types { + arg := &s[i] + arg.Index = i + arg.Type = typ + if typ.IsInt() { + if intParamIndex >= il { + arg.Kind = ABIArgKindStack + const slotSize = 8 // Align 8 bytes. + arg.Offset = stackOffset + stackOffset += slotSize + } else { + arg.Kind = ABIArgKindReg + arg.Reg = regalloc.FromRealReg(ints[intParamIndex], regalloc.RegTypeInt) + intParamIndex++ + } + } else { + if floatParamIndex >= fl { + arg.Kind = ABIArgKindStack + slotSize := int64(8) // Align at least 8 bytes. + if typ.Bits() == 128 { // Vector. 
+ slotSize = 16 + } + arg.Offset = stackOffset + stackOffset += slotSize + } else { + arg.Kind = ABIArgKindReg + arg.Reg = regalloc.FromRealReg(floats[floatParamIndex], regalloc.RegTypeFloat) + floatParamIndex++ + } + } + } + return stackOffset +} + +func (a *FunctionABI) AlignedArgResultStackSlotSize() uint32 { + stackSlotSize := a.RetStackSize + a.ArgStackSize + // Align stackSlotSize to 16 bytes. + stackSlotSize = (stackSlotSize + 15) &^ 15 + // Check overflow 32-bit. + if stackSlotSize > 0xFFFFFFFF { + panic("ABI stack slot size overflow") + } + return uint32(stackSlotSize) +} + +func (a *FunctionABI) ABIInfoAsUint64() uint64 { + return uint64(a.ArgIntRealRegs)<<56 | + uint64(a.ArgFloatRealRegs)<<48 | + uint64(a.RetIntRealRegs)<<40 | + uint64(a.RetFloatRealRegs)<<32 | + uint64(a.AlignedArgResultStackSlotSize()) +} + +func ABIInfoFromUint64(info uint64) (argIntRealRegs, argFloatRealRegs, retIntRealRegs, retFloatRealRegs byte, stackSlotSize uint32) { + return byte(info >> 56), byte(info >> 48), byte(info >> 40), byte(info >> 32), uint32(info) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/backend.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/backend.go new file mode 100644 index 000000000..dd67da43e --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/backend.go @@ -0,0 +1,3 @@ +// Package backend must be free of Wasm-specific concept. In other words, +// this package must not import internal/wasm package. 
+package backend diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler.go new file mode 100644 index 000000000..59bbfe02d --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler.go @@ -0,0 +1,417 @@ +package backend + +import ( + "context" + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// NewCompiler returns a new Compiler that can generate a machine code. +func NewCompiler(ctx context.Context, mach Machine, builder ssa.Builder) Compiler { + return newCompiler(ctx, mach, builder) +} + +func newCompiler(_ context.Context, mach Machine, builder ssa.Builder) *compiler { + argResultInts, argResultFloats := mach.ArgsResultsRegs() + c := &compiler{ + mach: mach, ssaBuilder: builder, + nextVRegID: regalloc.VRegIDNonReservedBegin, + argResultInts: argResultInts, + argResultFloats: argResultFloats, + } + mach.SetCompiler(c) + return c +} + +// Compiler is the backend of wazevo which takes ssa.Builder and Machine, +// use the information there to emit the final machine code. +type Compiler interface { + // SSABuilder returns the ssa.Builder used by this compiler. + SSABuilder() ssa.Builder + + // Compile executes the following steps: + // 1. Lower() + // 2. RegAlloc() + // 3. Finalize() + // 4. Encode() + // + // Each step can be called individually for testing purpose, therefore they are exposed in this interface too. + // + // The returned byte slices are the machine code and the relocation information for the machine code. + // The caller is responsible for copying them immediately since the compiler may reuse the buffer. 
+ Compile(ctx context.Context) (_ []byte, _ []RelocationInfo, _ error) + + // Lower lowers the given ssa.Instruction to the machine-specific instructions. + Lower() + + // RegAlloc performs the register allocation after Lower is called. + RegAlloc() + + // Finalize performs the finalization of the compilation, including machine code emission. + // This must be called after RegAlloc. + Finalize(ctx context.Context) error + + // Buf returns the buffer of the encoded machine code. This is only used for testing purpose. + Buf() []byte + + BufPtr() *[]byte + + // Format returns the debug string of the current state of the compiler. + Format() string + + // Init initializes the internal state of the compiler for the next compilation. + Init() + + // AllocateVReg allocates a new virtual register of the given type. + AllocateVReg(typ ssa.Type) regalloc.VReg + + // ValueDefinition returns the definition of the given value. + ValueDefinition(ssa.Value) *SSAValueDefinition + + // VRegOf returns the virtual register of the given ssa.Value. + VRegOf(value ssa.Value) regalloc.VReg + + // TypeOf returns the ssa.Type of the given virtual register. + TypeOf(regalloc.VReg) ssa.Type + + // MatchInstr returns true if the given definition is from an instruction with the given opcode, the current group ID, + // and a refcount of 1. That means, the instruction can be merged/swapped within the current instruction group. + MatchInstr(def *SSAValueDefinition, opcode ssa.Opcode) bool + + // MatchInstrOneOf is the same as MatchInstr but for multiple opcodes. If it matches one of ssa.Opcode, + // this returns the opcode. Otherwise, this returns ssa.OpcodeInvalid. + // + // Note: caller should be careful to avoid excessive allocation on opcodes slice. + MatchInstrOneOf(def *SSAValueDefinition, opcodes []ssa.Opcode) ssa.Opcode + + // AddRelocationInfo appends the relocation information for the function reference at the current buffer offset. 
+ AddRelocationInfo(funcRef ssa.FuncRef) + + // AddSourceOffsetInfo appends the source offset information for the given offset. + AddSourceOffsetInfo(executableOffset int64, sourceOffset ssa.SourceOffset) + + // SourceOffsetInfo returns the source offset information for the current buffer offset. + SourceOffsetInfo() []SourceOffsetInfo + + // EmitByte appends a byte to the buffer. Used during the code emission. + EmitByte(b byte) + + // Emit4Bytes appends 4 bytes to the buffer. Used during the code emission. + Emit4Bytes(b uint32) + + // Emit8Bytes appends 8 bytes to the buffer. Used during the code emission. + Emit8Bytes(b uint64) + + // GetFunctionABI returns the ABI information for the given signature. + GetFunctionABI(sig *ssa.Signature) *FunctionABI +} + +// RelocationInfo represents the relocation information for a call instruction. +type RelocationInfo struct { + // Offset represents the offset from the beginning of the machine code of either a function or the entire module. + Offset int64 + // Target is the target function of the call instruction. + FuncRef ssa.FuncRef +} + +// compiler implements Compiler. +type compiler struct { + mach Machine + currentGID ssa.InstructionGroupID + ssaBuilder ssa.Builder + // nextVRegID is the next virtual register ID to be allocated. + nextVRegID regalloc.VRegID + // ssaValueToVRegs maps ssa.ValueID to regalloc.VReg. + ssaValueToVRegs [] /* VRegID to */ regalloc.VReg + // ssaValueDefinitions maps ssa.ValueID to its definition. + ssaValueDefinitions []SSAValueDefinition + // ssaValueRefCounts is a cached list obtained by ssa.Builder.ValueRefCounts(). + ssaValueRefCounts []int + // returnVRegs is the list of virtual registers that store the return values. 
+ returnVRegs []regalloc.VReg + varEdges [][2]regalloc.VReg + varEdgeTypes []ssa.Type + constEdges []struct { + cInst *ssa.Instruction + dst regalloc.VReg + } + vRegSet []bool + vRegIDs []regalloc.VRegID + tempRegs []regalloc.VReg + tmpVals []ssa.Value + ssaTypeOfVRegID [] /* VRegID to */ ssa.Type + buf []byte + relocations []RelocationInfo + sourceOffsets []SourceOffsetInfo + // abis maps ssa.SignatureID to the ABI implementation. + abis []FunctionABI + argResultInts, argResultFloats []regalloc.RealReg +} + +// SourceOffsetInfo is a data to associate the source offset with the executable offset. +type SourceOffsetInfo struct { + // SourceOffset is the source offset in the original source code. + SourceOffset ssa.SourceOffset + // ExecutableOffset is the offset in the compiled executable. + ExecutableOffset int64 +} + +// Compile implements Compiler.Compile. +func (c *compiler) Compile(ctx context.Context) ([]byte, []RelocationInfo, error) { + c.Lower() + if wazevoapi.PrintSSAToBackendIRLowering && wazevoapi.PrintEnabledIndex(ctx) { + fmt.Printf("[[[after lowering for %s ]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), c.Format()) + } + if wazevoapi.DeterministicCompilationVerifierEnabled { + wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "After lowering to ISA specific IR", c.Format()) + } + c.RegAlloc() + if wazevoapi.PrintRegisterAllocated && wazevoapi.PrintEnabledIndex(ctx) { + fmt.Printf("[[[after regalloc for %s]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), c.Format()) + } + if wazevoapi.DeterministicCompilationVerifierEnabled { + wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "After Register Allocation", c.Format()) + } + if err := c.Finalize(ctx); err != nil { + return nil, nil, err + } + if wazevoapi.PrintFinalizedMachineCode && wazevoapi.PrintEnabledIndex(ctx) { + fmt.Printf("[[[after finalize for %s]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), c.Format()) + } + if wazevoapi.DeterministicCompilationVerifierEnabled { + 
wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "After Finalization", c.Format()) + } + return c.buf, c.relocations, nil +} + +// RegAlloc implements Compiler.RegAlloc. +func (c *compiler) RegAlloc() { + c.mach.RegAlloc() +} + +// Finalize implements Compiler.Finalize. +func (c *compiler) Finalize(ctx context.Context) error { + c.mach.PostRegAlloc() + return c.mach.Encode(ctx) +} + +// setCurrentGroupID sets the current instruction group ID. +func (c *compiler) setCurrentGroupID(gid ssa.InstructionGroupID) { + c.currentGID = gid +} + +// assignVirtualRegisters assigns a virtual register to each ssa.ValueID Valid in the ssa.Builder. +func (c *compiler) assignVirtualRegisters() { + builder := c.ssaBuilder + refCounts := builder.ValueRefCounts() + c.ssaValueRefCounts = refCounts + + need := len(refCounts) + if need >= len(c.ssaValueToVRegs) { + c.ssaValueToVRegs = append(c.ssaValueToVRegs, make([]regalloc.VReg, need+1)...) + } + if need >= len(c.ssaValueDefinitions) { + c.ssaValueDefinitions = append(c.ssaValueDefinitions, make([]SSAValueDefinition, need+1)...) + } + + for blk := builder.BlockIteratorReversePostOrderBegin(); blk != nil; blk = builder.BlockIteratorReversePostOrderNext() { + // First we assign a virtual register to each parameter. + for i := 0; i < blk.Params(); i++ { + p := blk.Param(i) + pid := p.ID() + typ := p.Type() + vreg := c.AllocateVReg(typ) + c.ssaValueToVRegs[pid] = vreg + c.ssaValueDefinitions[pid] = SSAValueDefinition{BlockParamValue: p, BlkParamVReg: vreg} + c.ssaTypeOfVRegID[vreg.ID()] = p.Type() + } + + // Assigns each value to a virtual register produced by instructions. 
+ for cur := blk.Root(); cur != nil; cur = cur.Next() { + r, rs := cur.Returns() + var N int + if r.Valid() { + id := r.ID() + ssaTyp := r.Type() + typ := r.Type() + vReg := c.AllocateVReg(typ) + c.ssaValueToVRegs[id] = vReg + c.ssaValueDefinitions[id] = SSAValueDefinition{ + Instr: cur, + N: 0, + RefCount: refCounts[id], + } + c.ssaTypeOfVRegID[vReg.ID()] = ssaTyp + N++ + } + for _, r := range rs { + id := r.ID() + ssaTyp := r.Type() + vReg := c.AllocateVReg(ssaTyp) + c.ssaValueToVRegs[id] = vReg + c.ssaValueDefinitions[id] = SSAValueDefinition{ + Instr: cur, + N: N, + RefCount: refCounts[id], + } + c.ssaTypeOfVRegID[vReg.ID()] = ssaTyp + N++ + } + } + } + + for i, retBlk := 0, builder.ReturnBlock(); i < retBlk.Params(); i++ { + typ := retBlk.Param(i).Type() + vReg := c.AllocateVReg(typ) + c.returnVRegs = append(c.returnVRegs, vReg) + c.ssaTypeOfVRegID[vReg.ID()] = typ + } +} + +// AllocateVReg implements Compiler.AllocateVReg. +func (c *compiler) AllocateVReg(typ ssa.Type) regalloc.VReg { + regType := regalloc.RegTypeOf(typ) + r := regalloc.VReg(c.nextVRegID).SetRegType(regType) + + id := r.ID() + if int(id) >= len(c.ssaTypeOfVRegID) { + c.ssaTypeOfVRegID = append(c.ssaTypeOfVRegID, make([]ssa.Type, id+1)...) + } + c.ssaTypeOfVRegID[id] = typ + c.nextVRegID++ + return r +} + +// Init implements Compiler.Init. +func (c *compiler) Init() { + c.currentGID = 0 + c.nextVRegID = regalloc.VRegIDNonReservedBegin + c.returnVRegs = c.returnVRegs[:0] + c.mach.Reset() + c.varEdges = c.varEdges[:0] + c.constEdges = c.constEdges[:0] + c.buf = c.buf[:0] + c.sourceOffsets = c.sourceOffsets[:0] + c.relocations = c.relocations[:0] +} + +// ValueDefinition implements Compiler.ValueDefinition. +func (c *compiler) ValueDefinition(value ssa.Value) *SSAValueDefinition { + return &c.ssaValueDefinitions[value.ID()] +} + +// VRegOf implements Compiler.VRegOf. 
+func (c *compiler) VRegOf(value ssa.Value) regalloc.VReg { + return c.ssaValueToVRegs[value.ID()] +} + +// Format implements Compiler.Format. +func (c *compiler) Format() string { + return c.mach.Format() +} + +// TypeOf implements Compiler.Format. +func (c *compiler) TypeOf(v regalloc.VReg) ssa.Type { + return c.ssaTypeOfVRegID[v.ID()] +} + +// MatchInstr implements Compiler.MatchInstr. +func (c *compiler) MatchInstr(def *SSAValueDefinition, opcode ssa.Opcode) bool { + instr := def.Instr + return def.IsFromInstr() && + instr.Opcode() == opcode && + instr.GroupID() == c.currentGID && + def.RefCount < 2 +} + +// MatchInstrOneOf implements Compiler.MatchInstrOneOf. +func (c *compiler) MatchInstrOneOf(def *SSAValueDefinition, opcodes []ssa.Opcode) ssa.Opcode { + instr := def.Instr + if !def.IsFromInstr() { + return ssa.OpcodeInvalid + } + + if instr.GroupID() != c.currentGID { + return ssa.OpcodeInvalid + } + + if def.RefCount >= 2 { + return ssa.OpcodeInvalid + } + + opcode := instr.Opcode() + for _, op := range opcodes { + if opcode == op { + return opcode + } + } + return ssa.OpcodeInvalid +} + +// SSABuilder implements Compiler .SSABuilder. +func (c *compiler) SSABuilder() ssa.Builder { + return c.ssaBuilder +} + +// AddSourceOffsetInfo implements Compiler.AddSourceOffsetInfo. +func (c *compiler) AddSourceOffsetInfo(executableOffset int64, sourceOffset ssa.SourceOffset) { + c.sourceOffsets = append(c.sourceOffsets, SourceOffsetInfo{ + SourceOffset: sourceOffset, + ExecutableOffset: executableOffset, + }) +} + +// SourceOffsetInfo implements Compiler.SourceOffsetInfo. +func (c *compiler) SourceOffsetInfo() []SourceOffsetInfo { + return c.sourceOffsets +} + +// AddRelocationInfo implements Compiler.AddRelocationInfo. +func (c *compiler) AddRelocationInfo(funcRef ssa.FuncRef) { + c.relocations = append(c.relocations, RelocationInfo{ + Offset: int64(len(c.buf)), + FuncRef: funcRef, + }) +} + +// Emit8Bytes implements Compiler.Emit8Bytes. 
+func (c *compiler) Emit8Bytes(b uint64) { + c.buf = append(c.buf, byte(b), byte(b>>8), byte(b>>16), byte(b>>24), byte(b>>32), byte(b>>40), byte(b>>48), byte(b>>56)) +} + +// Emit4Bytes implements Compiler.Emit4Bytes. +func (c *compiler) Emit4Bytes(b uint32) { + c.buf = append(c.buf, byte(b), byte(b>>8), byte(b>>16), byte(b>>24)) +} + +// EmitByte implements Compiler.EmitByte. +func (c *compiler) EmitByte(b byte) { + c.buf = append(c.buf, b) +} + +// Buf implements Compiler.Buf. +func (c *compiler) Buf() []byte { + return c.buf +} + +// BufPtr implements Compiler.BufPtr. +func (c *compiler) BufPtr() *[]byte { + return &c.buf +} + +func (c *compiler) GetFunctionABI(sig *ssa.Signature) *FunctionABI { + if int(sig.ID) >= len(c.abis) { + c.abis = append(c.abis, make([]FunctionABI, int(sig.ID)+1)...) + } + + abi := &c.abis[sig.ID] + if abi.Initialized { + return abi + } + + abi.Init(sig, c.argResultInts, c.argResultFloats) + return abi +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler_lower.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler_lower.go new file mode 100644 index 000000000..80e65668a --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler_lower.go @@ -0,0 +1,226 @@ +package backend + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// Lower implements Compiler.Lower. +func (c *compiler) Lower() { + c.assignVirtualRegisters() + c.mach.SetCurrentABI(c.GetFunctionABI(c.ssaBuilder.Signature())) + c.mach.ExecutableContext().StartLoweringFunction(c.ssaBuilder.BlockIDMax()) + c.lowerBlocks() +} + +// lowerBlocks lowers each block in the ssa.Builder. 
+func (c *compiler) lowerBlocks() { + builder := c.ssaBuilder + for blk := builder.BlockIteratorReversePostOrderBegin(); blk != nil; blk = builder.BlockIteratorReversePostOrderNext() { + c.lowerBlock(blk) + } + + ectx := c.mach.ExecutableContext() + // After lowering all blocks, we need to link adjacent blocks to layout one single instruction list. + var prev ssa.BasicBlock + for next := builder.BlockIteratorReversePostOrderBegin(); next != nil; next = builder.BlockIteratorReversePostOrderNext() { + if prev != nil { + ectx.LinkAdjacentBlocks(prev, next) + } + prev = next + } +} + +func (c *compiler) lowerBlock(blk ssa.BasicBlock) { + mach := c.mach + ectx := mach.ExecutableContext() + ectx.StartBlock(blk) + + // We traverse the instructions in reverse order because we might want to lower multiple + // instructions together. + cur := blk.Tail() + + // First gather the branching instructions at the end of the blocks. + var br0, br1 *ssa.Instruction + if cur.IsBranching() { + br0 = cur + cur = cur.Prev() + if cur != nil && cur.IsBranching() { + br1 = cur + cur = cur.Prev() + } + } + + if br0 != nil { + c.lowerBranches(br0, br1) + } + + if br1 != nil && br0 == nil { + panic("BUG? when a block has conditional branch but doesn't end with an unconditional branch?") + } + + // Now start lowering the non-branching instructions. + for ; cur != nil; cur = cur.Prev() { + c.setCurrentGroupID(cur.GroupID()) + if cur.Lowered() { + continue + } + + switch cur.Opcode() { + case ssa.OpcodeReturn: + rets := cur.ReturnVals() + if len(rets) > 0 { + c.mach.LowerReturns(rets) + } + c.mach.InsertReturn() + default: + mach.LowerInstr(cur) + } + ectx.FlushPendingInstructions() + } + + // Finally, if this is the entry block, we have to insert copies of arguments from the real location to the VReg. 
+ if blk.EntryBlock() { + c.lowerFunctionArguments(blk) + } + + ectx.EndBlock() +} + +// lowerBranches is called right after StartBlock and before any LowerInstr call if +// there are branches to the given block. br0 is the very end of the block and b1 is the before the br0 if it exists. +// At least br0 is not nil, but br1 can be nil if there's no branching before br0. +// +// See ssa.Instruction IsBranching, and the comment on ssa.BasicBlock. +func (c *compiler) lowerBranches(br0, br1 *ssa.Instruction) { + ectx := c.mach.ExecutableContext() + + c.setCurrentGroupID(br0.GroupID()) + c.mach.LowerSingleBranch(br0) + ectx.FlushPendingInstructions() + if br1 != nil { + c.setCurrentGroupID(br1.GroupID()) + c.mach.LowerConditionalBranch(br1) + ectx.FlushPendingInstructions() + } + + if br0.Opcode() == ssa.OpcodeJump { + _, args, target := br0.BranchData() + argExists := len(args) != 0 + if argExists && br1 != nil { + panic("BUG: critical edge split failed") + } + if argExists && target.ReturnBlock() { + if len(args) > 0 { + c.mach.LowerReturns(args) + } + } else if argExists { + c.lowerBlockArguments(args, target) + } + } + ectx.FlushPendingInstructions() +} + +func (c *compiler) lowerFunctionArguments(entry ssa.BasicBlock) { + ectx := c.mach.ExecutableContext() + + c.tmpVals = c.tmpVals[:0] + for i := 0; i < entry.Params(); i++ { + p := entry.Param(i) + if c.ssaValueRefCounts[p.ID()] > 0 { + c.tmpVals = append(c.tmpVals, p) + } else { + // If the argument is not used, we can just pass an invalid value. + c.tmpVals = append(c.tmpVals, ssa.ValueInvalid) + } + } + c.mach.LowerParams(c.tmpVals) + ectx.FlushPendingInstructions() +} + +// lowerBlockArguments lowers how to pass arguments to the given successor block. 
+func (c *compiler) lowerBlockArguments(args []ssa.Value, succ ssa.BasicBlock) { + if len(args) != succ.Params() { + panic("BUG: mismatched number of arguments") + } + + c.varEdges = c.varEdges[:0] + c.varEdgeTypes = c.varEdgeTypes[:0] + c.constEdges = c.constEdges[:0] + for i := 0; i < len(args); i++ { + dst := succ.Param(i) + src := args[i] + + dstReg := c.VRegOf(dst) + srcDef := c.ssaValueDefinitions[src.ID()] + if srcDef.IsFromInstr() && srcDef.Instr.Constant() { + c.constEdges = append(c.constEdges, struct { + cInst *ssa.Instruction + dst regalloc.VReg + }{cInst: srcDef.Instr, dst: dstReg}) + } else { + srcReg := c.VRegOf(src) + // Even when the src=dst, insert the move so that we can keep such registers keep-alive. + c.varEdges = append(c.varEdges, [2]regalloc.VReg{srcReg, dstReg}) + c.varEdgeTypes = append(c.varEdgeTypes, src.Type()) + } + } + + // Check if there's an overlap among the dsts and srcs in varEdges. + c.vRegIDs = c.vRegIDs[:0] + for _, edge := range c.varEdges { + src := edge[0].ID() + if int(src) >= len(c.vRegSet) { + c.vRegSet = append(c.vRegSet, make([]bool, src+1)...) + } + c.vRegSet[src] = true + c.vRegIDs = append(c.vRegIDs, src) + } + separated := true + for _, edge := range c.varEdges { + dst := edge[1].ID() + if int(dst) >= len(c.vRegSet) { + c.vRegSet = append(c.vRegSet, make([]bool, dst+1)...) + } else { + if c.vRegSet[dst] { + separated = false + break + } + } + } + for _, id := range c.vRegIDs { + c.vRegSet[id] = false // reset for the next use. + } + + if separated { + // If there's no overlap, we can simply move the source to destination. + for i, edge := range c.varEdges { + src, dst := edge[0], edge[1] + c.mach.InsertMove(dst, src, c.varEdgeTypes[i]) + } + } else { + // Otherwise, we allocate a temporary registers and move the source to the temporary register, + // + // First move all of them to temporary registers. 
+ c.tempRegs = c.tempRegs[:0] + for i, edge := range c.varEdges { + src := edge[0] + typ := c.varEdgeTypes[i] + temp := c.AllocateVReg(typ) + c.tempRegs = append(c.tempRegs, temp) + c.mach.InsertMove(temp, src, typ) + } + // Then move the temporary registers to the destination. + for i, edge := range c.varEdges { + temp := c.tempRegs[i] + dst := edge[1] + c.mach.InsertMove(dst, temp, c.varEdgeTypes[i]) + } + } + + // Finally, move the constants. + for _, edge := range c.constEdges { + cInst, dst := edge.cInst, edge.dst + c.mach.InsertLoadConstantBlockArg(cInst, dst) + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go new file mode 100644 index 000000000..81c6a6b62 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go @@ -0,0 +1,219 @@ +package backend + +import ( + "fmt" + "math" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +type ExecutableContext interface { + // StartLoweringFunction is called when the lowering of the given function is started. + // maximumBlockID is the maximum value of ssa.BasicBlockID existing in the function. + StartLoweringFunction(maximumBlockID ssa.BasicBlockID) + + // LinkAdjacentBlocks is called after finished lowering all blocks in order to create one single instruction list. + LinkAdjacentBlocks(prev, next ssa.BasicBlock) + + // StartBlock is called when the compilation of the given block is started. + // The order of this being called is the reverse post order of the ssa.BasicBlock(s) as we iterate with + // ssa.Builder BlockIteratorReversePostOrderBegin and BlockIteratorReversePostOrderEnd. + StartBlock(ssa.BasicBlock) + + // EndBlock is called when the compilation of the current block is finished. 
+ EndBlock() + + // FlushPendingInstructions flushes the pending instructions to the buffer. + // This will be called after the lowering of each SSA Instruction. + FlushPendingInstructions() +} + +type ExecutableContextT[Instr any] struct { + CurrentSSABlk ssa.BasicBlock + + // InstrPool is the InstructionPool of instructions. + InstructionPool wazevoapi.Pool[Instr] + asNop func(*Instr) + setNext func(*Instr, *Instr) + setPrev func(*Instr, *Instr) + + // RootInstr is the root instruction of the executable. + RootInstr *Instr + labelPositionPool wazevoapi.Pool[LabelPosition[Instr]] + NextLabel Label + // LabelPositions maps a label to the instructions of the region which the label represents. + LabelPositions map[Label]*LabelPosition[Instr] + OrderedBlockLabels []*LabelPosition[Instr] + + // PerBlockHead and PerBlockEnd are the head and tail of the instruction list per currently-compiled ssa.BasicBlock. + PerBlockHead, PerBlockEnd *Instr + // PendingInstructions are the instructions which are not yet emitted into the instruction list. + PendingInstructions []*Instr + + // SsaBlockIDToLabels maps an SSA block ID to the label. + SsaBlockIDToLabels []Label +} + +func NewExecutableContextT[Instr any]( + resetInstruction func(*Instr), + setNext func(*Instr, *Instr), + setPrev func(*Instr, *Instr), + asNop func(*Instr), +) *ExecutableContextT[Instr] { + return &ExecutableContextT[Instr]{ + InstructionPool: wazevoapi.NewPool[Instr](resetInstruction), + asNop: asNop, + setNext: setNext, + setPrev: setPrev, + labelPositionPool: wazevoapi.NewPool[LabelPosition[Instr]](resetLabelPosition[Instr]), + LabelPositions: make(map[Label]*LabelPosition[Instr]), + NextLabel: LabelInvalid, + } +} + +func resetLabelPosition[T any](l *LabelPosition[T]) { + *l = LabelPosition[T]{} +} + +// StartLoweringFunction implements ExecutableContext. 
+func (e *ExecutableContextT[Instr]) StartLoweringFunction(max ssa.BasicBlockID) { + imax := int(max) + if len(e.SsaBlockIDToLabels) <= imax { + // Eagerly allocate labels for the blocks since the underlying slice will be used for the next iteration. + e.SsaBlockIDToLabels = append(e.SsaBlockIDToLabels, make([]Label, imax+1)...) + } +} + +func (e *ExecutableContextT[Instr]) StartBlock(blk ssa.BasicBlock) { + e.CurrentSSABlk = blk + + l := e.SsaBlockIDToLabels[e.CurrentSSABlk.ID()] + if l == LabelInvalid { + l = e.AllocateLabel() + e.SsaBlockIDToLabels[blk.ID()] = l + } + + end := e.allocateNop0() + e.PerBlockHead, e.PerBlockEnd = end, end + + labelPos, ok := e.LabelPositions[l] + if !ok { + labelPos = e.AllocateLabelPosition(l) + e.LabelPositions[l] = labelPos + } + e.OrderedBlockLabels = append(e.OrderedBlockLabels, labelPos) + labelPos.Begin, labelPos.End = end, end + labelPos.SB = blk +} + +// EndBlock implements ExecutableContext. +func (e *ExecutableContextT[T]) EndBlock() { + // Insert nop0 as the head of the block for convenience to simplify the logic of inserting instructions. + e.insertAtPerBlockHead(e.allocateNop0()) + + l := e.SsaBlockIDToLabels[e.CurrentSSABlk.ID()] + e.LabelPositions[l].Begin = e.PerBlockHead + + if e.CurrentSSABlk.EntryBlock() { + e.RootInstr = e.PerBlockHead + } +} + +func (e *ExecutableContextT[T]) insertAtPerBlockHead(i *T) { + if e.PerBlockHead == nil { + e.PerBlockHead = i + e.PerBlockEnd = i + return + } + e.setNext(i, e.PerBlockHead) + e.setPrev(e.PerBlockHead, i) + e.PerBlockHead = i +} + +// FlushPendingInstructions implements ExecutableContext. +func (e *ExecutableContextT[T]) FlushPendingInstructions() { + l := len(e.PendingInstructions) + if l == 0 { + return + } + for i := l - 1; i >= 0; i-- { // reverse because we lower instructions in reverse order. 
+ e.insertAtPerBlockHead(e.PendingInstructions[i]) + } + e.PendingInstructions = e.PendingInstructions[:0] +} + +func (e *ExecutableContextT[T]) Reset() { + e.labelPositionPool.Reset() + e.InstructionPool.Reset() + for l := Label(0); l <= e.NextLabel; l++ { + delete(e.LabelPositions, l) + } + e.PendingInstructions = e.PendingInstructions[:0] + e.OrderedBlockLabels = e.OrderedBlockLabels[:0] + e.RootInstr = nil + e.SsaBlockIDToLabels = e.SsaBlockIDToLabels[:0] + e.PerBlockHead, e.PerBlockEnd = nil, nil + e.NextLabel = LabelInvalid +} + +// AllocateLabel allocates an unused label. +func (e *ExecutableContextT[T]) AllocateLabel() Label { + e.NextLabel++ + return e.NextLabel +} + +func (e *ExecutableContextT[T]) AllocateLabelPosition(la Label) *LabelPosition[T] { + l := e.labelPositionPool.Allocate() + l.L = la + return l +} + +func (e *ExecutableContextT[T]) GetOrAllocateSSABlockLabel(blk ssa.BasicBlock) Label { + if blk.ReturnBlock() { + return LabelReturn + } + l := e.SsaBlockIDToLabels[blk.ID()] + if l == LabelInvalid { + l = e.AllocateLabel() + e.SsaBlockIDToLabels[blk.ID()] = l + } + return l +} + +func (e *ExecutableContextT[T]) allocateNop0() *T { + i := e.InstructionPool.Allocate() + e.asNop(i) + return i +} + +// LinkAdjacentBlocks implements backend.Machine. +func (e *ExecutableContextT[T]) LinkAdjacentBlocks(prev, next ssa.BasicBlock) { + prevLabelPos := e.LabelPositions[e.GetOrAllocateSSABlockLabel(prev)] + nextLabelPos := e.LabelPositions[e.GetOrAllocateSSABlockLabel(next)] + e.setNext(prevLabelPos.End, nextLabelPos.Begin) +} + +// LabelPosition represents the regions of the generated code which the label represents. +type LabelPosition[Instr any] struct { + SB ssa.BasicBlock + L Label + Begin, End *Instr + BinaryOffset int64 +} + +// Label represents a position in the generated code which is either +// a real instruction or the constant InstructionPool (e.g. jump tables). +// +// This is exactly the same as the traditional "label" in assembly code. 
+type Label uint32 + +const ( + LabelInvalid Label = 0 + LabelReturn Label = math.MaxUint32 +) + +// String implements backend.Machine. +func (l Label) String() string { + return fmt.Sprintf("L%d", l) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/go_call.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/go_call.go new file mode 100644 index 000000000..6fe6d7b3c --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/go_call.go @@ -0,0 +1,33 @@ +package backend + +import "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + +// GoFunctionCallRequiredStackSize returns the size of the stack required for the Go function call. +// argBegin is the index of the first argument in the signature which is not either execution context or module context. +func GoFunctionCallRequiredStackSize(sig *ssa.Signature, argBegin int) (ret, retUnaligned int64) { + var paramNeededInBytes, resultNeededInBytes int64 + for _, p := range sig.Params[argBegin:] { + s := int64(p.Size()) + if s < 8 { + s = 8 // We use uint64 for all basic types, except SIMD v128. + } + paramNeededInBytes += s + } + for _, r := range sig.Results { + s := int64(r.Size()) + if s < 8 { + s = 8 // We use uint64 for all basic types, except SIMD v128. + } + resultNeededInBytes += s + } + + if paramNeededInBytes > resultNeededInBytes { + ret = paramNeededInBytes + } else { + ret = resultNeededInBytes + } + retUnaligned = ret + // Align to 16 bytes. 
+ ret = (ret + 15) &^ 15 + return +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi.go new file mode 100644 index 000000000..130f8c621 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi.go @@ -0,0 +1,186 @@ +package amd64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// For the details of the ABI, see: +// https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/cmd/compile/abi-internal.md#amd64-architecture + +var ( + intArgResultRegs = []regalloc.RealReg{rax, rbx, rcx, rdi, rsi, r8, r9, r10, r11} + floatArgResultRegs = []regalloc.RealReg{xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7} +) + +var regInfo = &regalloc.RegisterInfo{ + AllocatableRegisters: [regalloc.NumRegType][]regalloc.RealReg{ + regalloc.RegTypeInt: { + rax, rcx, rdx, rbx, rsi, rdi, r8, r9, r10, r11, r12, r13, r14, r15, + }, + regalloc.RegTypeFloat: { + xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, + }, + }, + CalleeSavedRegisters: regalloc.NewRegSet( + rdx, r12, r13, r14, r15, + xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, + ), + CallerSavedRegisters: regalloc.NewRegSet( + rax, rcx, rbx, rsi, rdi, r8, r9, r10, r11, + xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, + ), + RealRegToVReg: []regalloc.VReg{ + rax: raxVReg, rcx: rcxVReg, rdx: rdxVReg, rbx: rbxVReg, rsp: rspVReg, rbp: rbpVReg, rsi: rsiVReg, rdi: rdiVReg, + r8: r8VReg, r9: r9VReg, r10: r10VReg, r11: r11VReg, r12: r12VReg, r13: r13VReg, r14: r14VReg, r15: r15VReg, + xmm0: xmm0VReg, xmm1: xmm1VReg, xmm2: xmm2VReg, xmm3: xmm3VReg, xmm4: xmm4VReg, xmm5: xmm5VReg, xmm6: xmm6VReg, + xmm7: xmm7VReg, xmm8: 
xmm8VReg, xmm9: xmm9VReg, xmm10: xmm10VReg, xmm11: xmm11VReg, xmm12: xmm12VReg, + xmm13: xmm13VReg, xmm14: xmm14VReg, xmm15: xmm15VReg, + }, + RealRegName: func(r regalloc.RealReg) string { return regNames[r] }, + RealRegType: func(r regalloc.RealReg) regalloc.RegType { + if r < xmm0 { + return regalloc.RegTypeInt + } + return regalloc.RegTypeFloat + }, +} + +// ArgsResultsRegs implements backend.Machine. +func (m *machine) ArgsResultsRegs() (argResultInts, argResultFloats []regalloc.RealReg) { + return intArgResultRegs, floatArgResultRegs +} + +// LowerParams implements backend.Machine. +func (m *machine) LowerParams(args []ssa.Value) { + a := m.currentABI + + for i, ssaArg := range args { + if !ssaArg.Valid() { + continue + } + reg := m.c.VRegOf(ssaArg) + arg := &a.Args[i] + if arg.Kind == backend.ABIArgKindReg { + m.InsertMove(reg, arg.Reg, arg.Type) + } else { + // + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | ReturnAddress | + // | Caller_RBP | + // +-----------------+ <-- RBP + // | ........... | + // | clobbered M | + // | ............ | + // | clobbered 0 | + // | spill slot N | + // | ........... | + // | spill slot 0 | + // RSP--> +-----------------+ + // (low address) + + // Load the value from the arg stack slot above the current RBP. + load := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmRBPReg(uint32(arg.Offset + 16))) + switch arg.Type { + case ssa.TypeI32: + load.asMovzxRmR(extModeLQ, mem, reg) + case ssa.TypeI64: + load.asMov64MR(mem, reg) + case ssa.TypeF32: + load.asXmmUnaryRmR(sseOpcodeMovss, mem, reg) + case ssa.TypeF64: + load.asXmmUnaryRmR(sseOpcodeMovsd, mem, reg) + case ssa.TypeV128: + load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, reg) + default: + panic("BUG") + } + m.insert(load) + } + } +} + +// LowerReturns implements backend.Machine. 
+func (m *machine) LowerReturns(rets []ssa.Value) { + // Load the XMM registers first as it might need a temporary register to inline + // constant return. + a := m.currentABI + for i, ret := range rets { + r := &a.Rets[i] + if !r.Type.IsInt() { + m.LowerReturn(ret, r) + } + } + // Then load the GPR registers. + for i, ret := range rets { + r := &a.Rets[i] + if r.Type.IsInt() { + m.LowerReturn(ret, r) + } + } +} + +func (m *machine) LowerReturn(ret ssa.Value, r *backend.ABIArg) { + reg := m.c.VRegOf(ret) + if def := m.c.ValueDefinition(ret); def.IsFromInstr() { + // Constant instructions are inlined. + if inst := def.Instr; inst.Constant() { + m.insertLoadConstant(inst, reg) + } + } + if r.Kind == backend.ABIArgKindReg { + m.InsertMove(r.Reg, reg, ret.Type()) + } else { + // + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | ReturnAddress | + // | Caller_RBP | + // +-----------------+ <-- RBP + // | ........... | + // | clobbered M | + // | ............ | + // | clobbered 0 | + // | spill slot N | + // | ........... | + // | spill slot 0 | + // RSP--> +-----------------+ + // (low address) + + // Store the value to the return stack slot above the current RBP. 
+ store := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmRBPReg(uint32(m.currentABI.ArgStackSize + 16 + r.Offset))) + switch r.Type { + case ssa.TypeI32: + store.asMovRM(reg, mem, 4) + case ssa.TypeI64: + store.asMovRM(reg, mem, 8) + case ssa.TypeF32: + store.asXmmMovRM(sseOpcodeMovss, reg, mem) + case ssa.TypeF64: + store.asXmmMovRM(sseOpcodeMovsd, reg, mem) + case ssa.TypeV128: + store.asXmmMovRM(sseOpcodeMovdqu, reg, mem) + } + m.insert(store) + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.go new file mode 100644 index 000000000..cbf1cfdc5 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.go @@ -0,0 +1,9 @@ +package amd64 + +// entrypoint enters the machine code generated by this backend which begins with the preamble generated by functionABI.EmitGoEntryPreamble below. +// This implements wazevo.entrypoint, and see the comments there for detail. +func entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultPtr *uint64, goAllocatedStackSlicePtr uintptr) + +// afterGoFunctionCallEntrypoint enters the machine code after growing the stack. +// This implements wazevo.afterGoFunctionCallEntrypoint, and see the comments there for detail. 
+func afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.s b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.s new file mode 100644 index 000000000..e9cb131d1 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.s @@ -0,0 +1,29 @@ +#include "funcdata.h" +#include "textflag.h" + +// entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultPtr *uint64, goAllocatedStackSlicePtr uintptr +TEXT ·entrypoint(SB), NOSPLIT|NOFRAME, $0-48 + MOVQ preambleExecutable+0(FP), R11 + MOVQ functionExectuable+8(FP), R14 + MOVQ executionContextPtr+16(FP), AX // First argument is passed in AX. + MOVQ moduleContextPtr+24(FP), BX // Second argument is passed in BX. + MOVQ paramResultSlicePtr+32(FP), R12 + MOVQ goAllocatedStackSlicePtr+40(FP), R13 + JMP R11 + +// afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr) +TEXT ·afterGoFunctionCallEntrypoint(SB), NOSPLIT|NOFRAME, $0-32 + MOVQ executable+0(FP), CX + MOVQ executionContextPtr+8(FP), AX // First argument is passed in AX. + + // Save the stack pointer and frame pointer. + MOVQ BP, 16(AX) // 16 == ExecutionContextOffsetOriginalFramePointer + MOVQ SP, 24(AX) // 24 == ExecutionContextOffsetOriginalStackPointer + + // Then set the stack pointer and frame pointer to the values we got from the Go runtime. + MOVQ framePointer+24(FP), BP + + // WARNING: do not update SP before BP, because the Go translates (FP) as (SP) + 8. 
+ MOVQ stackPointer+16(FP), SP + + JMP CX diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_preamble.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_preamble.go new file mode 100644 index 000000000..882d06c06 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_preamble.go @@ -0,0 +1,248 @@ +package amd64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +var ( + executionContextPtrReg = raxVReg + + // Followings are callee saved registers. They can be used freely in the entry preamble + // since the preamble is called via Go assembly function which has stack-based ABI. + + // savedExecutionContextPtr also must be a callee-saved reg so that they can be used in the prologue and epilogue. + savedExecutionContextPtr = rdxVReg + // paramResultSlicePtr must match with entrypoint function in abi_entry_amd64.s. + paramResultSlicePtr = r12VReg + // goAllocatedStackPtr must match with entrypoint function in abi_entry_amd64.s. + goAllocatedStackPtr = r13VReg + // functionExecutable must match with entrypoint function in abi_entry_amd64.s. + functionExecutable = r14VReg + tmpIntReg = r15VReg + tmpXmmReg = xmm15VReg +) + +// CompileEntryPreamble implements backend.Machine. 
+func (m *machine) CompileEntryPreamble(sig *ssa.Signature) []byte { + root := m.compileEntryPreamble(sig) + m.encodeWithoutSSA(root) + buf := m.c.Buf() + return buf +} + +func (m *machine) compileEntryPreamble(sig *ssa.Signature) *instruction { + abi := backend.FunctionABI{} + abi.Init(sig, intArgResultRegs, floatArgResultRegs) + + root := m.allocateNop() + + //// ----------------------------------- prologue ----------------------------------- //// + + // First, we save executionContextPtrReg into a callee-saved register so that it can be used in epilogue as well. + // mov %executionContextPtrReg, %savedExecutionContextPtr + cur := m.move64(executionContextPtrReg, savedExecutionContextPtr, root) + + // Next is to save the original RBP and RSP into the execution context. + cur = m.saveOriginalRSPRBP(cur) + + // Now set the RSP to the Go-allocated stack pointer. + // mov %goAllocatedStackPtr, %rsp + cur = m.move64(goAllocatedStackPtr, rspVReg, cur) + + if stackSlotSize := abi.AlignedArgResultStackSlotSize(); stackSlotSize > 0 { + // Allocate stack slots for the arguments and return values. + // sub $stackSlotSize, %rsp + spDec := m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(uint32(stackSlotSize)), rspVReg, true) + cur = linkInstr(cur, spDec) + } + + var offset uint32 + for i := range abi.Args { + if i < 2 { + // execution context ptr and module context ptr are already passed in rax and rbx, respectively, by the Go assembly function. + continue + } + arg := &abi.Args[i] + cur = m.goEntryPreamblePassArg(cur, paramResultSlicePtr, offset, arg) + if arg.Type == ssa.TypeV128 { + offset += 16 + } else { + offset += 8 + } + } + + // Zero out RBP so that the unwind/stack growth code can correctly detect the end of the stack. + zerosRbp := m.allocateInstr().asAluRmiR(aluRmiROpcodeXor, newOperandReg(rbpVReg), rbpVReg, true) + cur = linkInstr(cur, zerosRbp) + + // Now ready to call the real function. 
Note that at this point stack pointer is already set to the Go-allocated, + // which is aligned to 16 bytes. + call := m.allocateInstr().asCallIndirect(newOperandReg(functionExecutable), &abi) + cur = linkInstr(cur, call) + + //// ----------------------------------- epilogue ----------------------------------- //// + + // Read the results from regs and the stack, and set them correctly into the paramResultSlicePtr. + offset = 0 + for i := range abi.Rets { + r := &abi.Rets[i] + cur = m.goEntryPreamblePassResult(cur, paramResultSlicePtr, offset, r, uint32(abi.ArgStackSize)) + if r.Type == ssa.TypeV128 { + offset += 16 + } else { + offset += 8 + } + } + + // Finally, restore the original RBP and RSP. + cur = m.restoreOriginalRSPRBP(cur) + + ret := m.allocateInstr().asRet() + linkInstr(cur, ret) + return root +} + +// saveOriginalRSPRBP saves the original RSP and RBP into the execution context. +func (m *machine) saveOriginalRSPRBP(cur *instruction) *instruction { + // mov %rbp, wazevoapi.ExecutionContextOffsetOriginalFramePointer(%executionContextPtrReg) + // mov %rsp, wazevoapi.ExecutionContextOffsetOriginalStackPointer(%executionContextPtrReg) + cur = m.loadOrStore64AtExecutionCtx(executionContextPtrReg, wazevoapi.ExecutionContextOffsetOriginalFramePointer, rbpVReg, true, cur) + cur = m.loadOrStore64AtExecutionCtx(executionContextPtrReg, wazevoapi.ExecutionContextOffsetOriginalStackPointer, rspVReg, true, cur) + return cur +} + +// restoreOriginalRSPRBP restores the original RSP and RBP from the execution context. 
+func (m *machine) restoreOriginalRSPRBP(cur *instruction) *instruction { + // mov wazevoapi.ExecutionContextOffsetOriginalFramePointer(%savedExecutionContextPtr), %rbp + // mov wazevoapi.ExecutionContextOffsetOriginalStackPointer(%savedExecutionContextPtr), %rsp + cur = m.loadOrStore64AtExecutionCtx(savedExecutionContextPtr, wazevoapi.ExecutionContextOffsetOriginalFramePointer, rbpVReg, false, cur) + cur = m.loadOrStore64AtExecutionCtx(savedExecutionContextPtr, wazevoapi.ExecutionContextOffsetOriginalStackPointer, rspVReg, false, cur) + return cur +} + +func (m *machine) move64(src, dst regalloc.VReg, prev *instruction) *instruction { + mov := m.allocateInstr().asMovRR(src, dst, true) + return linkInstr(prev, mov) +} + +func (m *machine) loadOrStore64AtExecutionCtx(execCtx regalloc.VReg, offset wazevoapi.Offset, r regalloc.VReg, store bool, prev *instruction) *instruction { + mem := newOperandMem(m.newAmodeImmReg(offset.U32(), execCtx)) + instr := m.allocateInstr() + if store { + instr.asMovRM(r, mem, 8) + } else { + instr.asMov64MR(mem, r) + } + return linkInstr(prev, instr) +} + +// This is for debugging. 
+func (m *machine) linkUD2(cur *instruction) *instruction { //nolint + return linkInstr(cur, m.allocateInstr().asUD2()) +} + +func (m *machine) goEntryPreamblePassArg(cur *instruction, paramSlicePtr regalloc.VReg, offsetInParamSlice uint32, arg *backend.ABIArg) *instruction { + var dst regalloc.VReg + argTyp := arg.Type + if arg.Kind == backend.ABIArgKindStack { + // Stack-passed arguments go through a callee-saved temporary (tmpIntReg or tmpXmmReg), which can be used freely in the entry preamble. + switch argTyp { + case ssa.TypeI32, ssa.TypeI64: + dst = tmpIntReg + case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: + dst = tmpXmmReg + default: + panic("BUG") + } + } else { + dst = arg.Reg + } + + load := m.allocateInstr() + a := newOperandMem(m.newAmodeImmReg(offsetInParamSlice, paramSlicePtr)) + switch arg.Type { + case ssa.TypeI32: + load.asMovzxRmR(extModeLQ, a, dst) + case ssa.TypeI64: + load.asMov64MR(a, dst) + case ssa.TypeF32: + load.asXmmUnaryRmR(sseOpcodeMovss, a, dst) + case ssa.TypeF64: + load.asXmmUnaryRmR(sseOpcodeMovsd, a, dst) + case ssa.TypeV128: + load.asXmmUnaryRmR(sseOpcodeMovdqu, a, dst) + } + + cur = linkInstr(cur, load) + if arg.Kind == backend.ABIArgKindStack { + // Store back to the stack. + store := m.allocateInstr() + a := newOperandMem(m.newAmodeImmReg(uint32(arg.Offset), rspVReg)) + switch arg.Type { + case ssa.TypeI32: + store.asMovRM(dst, a, 4) + case ssa.TypeI64: + store.asMovRM(dst, a, 8) + case ssa.TypeF32: + store.asXmmMovRM(sseOpcodeMovss, dst, a) + case ssa.TypeF64: + store.asXmmMovRM(sseOpcodeMovsd, dst, a) + case ssa.TypeV128: + store.asXmmMovRM(sseOpcodeMovdqu, dst, a) + } + cur = linkInstr(cur, store) + } + return cur +} + +func (m *machine) goEntryPreamblePassResult(cur *instruction, resultSlicePtr regalloc.VReg, offsetInResultSlice uint32, result *backend.ABIArg, resultStackSlotBeginOffset uint32) *instruction { + var r regalloc.VReg + if result.Kind == backend.ABIArgKindStack { + // Load the value to the temporary. 
+ load := m.allocateInstr() + offset := resultStackSlotBeginOffset + uint32(result.Offset) + a := newOperandMem(m.newAmodeImmReg(offset, rspVReg)) + switch result.Type { + case ssa.TypeI32: + r = tmpIntReg + load.asMovzxRmR(extModeLQ, a, r) + case ssa.TypeI64: + r = tmpIntReg + load.asMov64MR(a, r) + case ssa.TypeF32: + r = tmpXmmReg + load.asXmmUnaryRmR(sseOpcodeMovss, a, r) + case ssa.TypeF64: + r = tmpXmmReg + load.asXmmUnaryRmR(sseOpcodeMovsd, a, r) + case ssa.TypeV128: + r = tmpXmmReg + load.asXmmUnaryRmR(sseOpcodeMovdqu, a, r) + default: + panic("BUG") + } + cur = linkInstr(cur, load) + } else { + r = result.Reg + } + + store := m.allocateInstr() + a := newOperandMem(m.newAmodeImmReg(offsetInResultSlice, resultSlicePtr)) + switch result.Type { + case ssa.TypeI32: + store.asMovRM(r, a, 4) + case ssa.TypeI64: + store.asMovRM(r, a, 8) + case ssa.TypeF32: + store.asXmmMovRM(sseOpcodeMovss, r, a) + case ssa.TypeF64: + store.asXmmMovRM(sseOpcodeMovsd, r, a) + case ssa.TypeV128: + store.asXmmMovRM(sseOpcodeMovdqu, r, a) + } + + return linkInstr(cur, store) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_go_call.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_go_call.go new file mode 100644 index 000000000..751050aff --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_go_call.go @@ -0,0 +1,443 @@ +package amd64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +var calleeSavedVRegs = []regalloc.VReg{ + rdxVReg, r12VReg, r13VReg, r14VReg, r15VReg, + xmm8VReg, xmm9VReg, xmm10VReg, xmm11VReg, xmm12VReg, xmm13VReg, xmm14VReg, xmm15VReg, +} + +// CompileGoFunctionTrampoline implements backend.Machine. 
+func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte { + ectx := m.ectx + argBegin := 1 // Skips exec context by default. + if needModuleContextPtr { + argBegin++ + } + + abi := &backend.FunctionABI{} + abi.Init(sig, intArgResultRegs, floatArgResultRegs) + m.currentABI = abi + + cur := m.allocateNop() + ectx.RootInstr = cur + + // Execution context is always the first argument. + execCtrPtr := raxVReg + + // First we update RBP and RSP just like the normal prologue. + // + // (high address) (high address) + // RBP ----> +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | ====> | ....... | + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | Return Addr | | Return Addr | + // RSP ----> +-----------------+ | Caller_RBP | + // (low address) +-----------------+ <----- RSP, RBP + // + cur = m.setupRBPRSP(cur) + + goSliceSizeAligned, goSliceSizeAlignedUnaligned := backend.GoFunctionCallRequiredStackSize(sig, argBegin) + cur = m.insertStackBoundsCheck(goSliceSizeAligned+8 /* size of the Go slice */, cur) + + // Save the callee saved registers. + cur = m.saveRegistersInExecutionContext(cur, execCtrPtr, calleeSavedVRegs) + + if needModuleContextPtr { + moduleCtrPtr := rbxVReg // Module context is always the second argument. + mem := m.newAmodeImmReg( + wazevoapi.ExecutionContextOffsetGoFunctionCallCalleeModuleContextOpaque.U32(), + execCtrPtr) + store := m.allocateInstr().asMovRM(moduleCtrPtr, newOperandMem(mem), 8) + cur = linkInstr(cur, store) + } + + // Now let's advance the RSP to the stack slot for the arguments. + // + // (high address) (high address) + // +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... 
| =======> | ....... | + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | Return Addr | | Return Addr | + // | Caller_RBP | | Caller_RBP | + // RBP,RSP --> +-----------------+ +-----------------+ <----- RBP + // (low address) | arg[N]/ret[M] | + // | .......... | + // | arg[1]/ret[1] | + // | arg[0]/ret[0] | + // +-----------------+ <----- RSP + // (low address) + // + // where the region of "arg[0]/ret[0] ... arg[N]/ret[M]" is the stack used by the Go functions, + // therefore will be accessed as the usual []uint64. So that's where we need to pass/receive + // the arguments/return values to/from Go function. + cur = m.addRSP(-int32(goSliceSizeAligned), cur) + + // Next, we need to store all the arguments to the stack in the typical Wasm stack style. + var offsetInGoSlice int32 + for i := range abi.Args[argBegin:] { + arg := &abi.Args[argBegin+i] + var v regalloc.VReg + if arg.Kind == backend.ABIArgKindReg { + v = arg.Reg + } else { + // We have saved callee saved registers, so we can use them. + if arg.Type.IsInt() { + v = r15VReg + } else { + v = xmm15VReg + } + mem := newOperandMem(m.newAmodeImmReg(uint32(arg.Offset+16 /* to skip caller_rbp and ret_addr */), rbpVReg)) + load := m.allocateInstr() + switch arg.Type { + case ssa.TypeI32: + load.asMovzxRmR(extModeLQ, mem, v) + case ssa.TypeI64: + load.asMov64MR(mem, v) + case ssa.TypeF32: + load.asXmmUnaryRmR(sseOpcodeMovss, mem, v) + case ssa.TypeF64: + load.asXmmUnaryRmR(sseOpcodeMovsd, mem, v) + case ssa.TypeV128: + load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v) + default: + panic("BUG") + } + cur = linkInstr(cur, load) + } + + store := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmReg(uint32(offsetInGoSlice), rspVReg)) + switch arg.Type { + case ssa.TypeI32: + store.asMovRM(v, mem, 4) + offsetInGoSlice += 8 // always uint64 rep. 
+ case ssa.TypeI64: + store.asMovRM(v, mem, 8) + offsetInGoSlice += 8 + case ssa.TypeF32: + store.asXmmMovRM(sseOpcodeMovss, v, mem) + offsetInGoSlice += 8 // always uint64 rep. + case ssa.TypeF64: + store.asXmmMovRM(sseOpcodeMovsd, v, mem) + offsetInGoSlice += 8 + case ssa.TypeV128: + store.asXmmMovRM(sseOpcodeMovdqu, v, mem) + offsetInGoSlice += 16 + default: + panic("BUG") + } + cur = linkInstr(cur, store) + } + + // Finally we push the size of the slice to the stack so the stack looks like: + // + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | Return Addr | + // | Caller_RBP | + // +-----------------+ <----- RBP + // | arg[N]/ret[M] | + // | .......... | + // | arg[1]/ret[1] | + // | arg[0]/ret[0] | + // | slice size | + // +-----------------+ <----- RSP + // (low address) + // + // push $sliceSize + cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandImm32(uint32(goSliceSizeAlignedUnaligned)))) + + // Load the exitCode to the register. + exitCodeReg := r12VReg // Callee saved which is already saved. + cur = linkInstr(cur, m.allocateInstr().asImm(exitCodeReg, uint64(exitCode), false)) + + saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtrPtr, exitCodeReg) + cur = linkInstr(cur, setExitCode) + cur = linkInstr(cur, saveRsp) + cur = linkInstr(cur, saveRbp) + + // Ready to exit the execution. + cur = m.storeReturnAddressAndExit(cur, execCtrPtr) + + // We don't need the slice size anymore, so pop it. + cur = m.addRSP(8, cur) + + // Ready to set up the results. + offsetInGoSlice = 0 + // To avoid overwriting with the execution context pointer by the result, we need to track the offset, + // and defer the restoration of the result to the end of this function. 
+ var argOverlapWithExecCtxOffset int32 = -1 + for i := range abi.Rets { + r := &abi.Rets[i] + var v regalloc.VReg + isRegResult := r.Kind == backend.ABIArgKindReg + if isRegResult { + v = r.Reg + if v.RealReg() == execCtrPtr.RealReg() { + argOverlapWithExecCtxOffset = offsetInGoSlice + offsetInGoSlice += 8 // always uint64 rep. + continue + } + } else { + if r.Type.IsInt() { + v = r15VReg + } else { + v = xmm15VReg + } + } + + load := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmReg(uint32(offsetInGoSlice), rspVReg)) + switch r.Type { + case ssa.TypeI32: + load.asMovzxRmR(extModeLQ, mem, v) + offsetInGoSlice += 8 // always uint64 rep. + case ssa.TypeI64: + load.asMov64MR(mem, v) + offsetInGoSlice += 8 + case ssa.TypeF32: + load.asXmmUnaryRmR(sseOpcodeMovss, mem, v) + offsetInGoSlice += 8 // always uint64 rep. + case ssa.TypeF64: + load.asXmmUnaryRmR(sseOpcodeMovsd, mem, v) + offsetInGoSlice += 8 + case ssa.TypeV128: + load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v) + offsetInGoSlice += 16 + default: + panic("BUG") + } + cur = linkInstr(cur, load) + + if !isRegResult { + // We need to store it back to the result slot above rbp. + store := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmReg(uint32(abi.ArgStackSize+r.Offset+16 /* to skip caller_rbp and ret_addr */), rbpVReg)) + switch r.Type { + case ssa.TypeI32: + store.asMovRM(v, mem, 4) + case ssa.TypeI64: + store.asMovRM(v, mem, 8) + case ssa.TypeF32: + store.asXmmMovRM(sseOpcodeMovss, v, mem) + case ssa.TypeF64: + store.asXmmMovRM(sseOpcodeMovsd, v, mem) + case ssa.TypeV128: + store.asXmmMovRM(sseOpcodeMovdqu, v, mem) + default: + panic("BUG") + } + cur = linkInstr(cur, store) + } + } + + // Before return, we need to restore the callee saved registers. 
+ cur = m.restoreRegistersInExecutionContext(cur, execCtrPtr, calleeSavedVRegs) + + if argOverlapWithExecCtxOffset >= 0 { + // At this point execCtrPtr is not used anymore, so we can finally load the + // result into the register which overlaps with the execution context pointer. + mem := newOperandMem(m.newAmodeImmReg(uint32(argOverlapWithExecCtxOffset), rspVReg)) + load := m.allocateInstr().asMov64MR(mem, execCtrPtr) + cur = linkInstr(cur, load) + } + + // Finally ready to return. + cur = m.revertRBPRSP(cur) + linkInstr(cur, m.allocateInstr().asRet()) + + m.encodeWithoutSSA(ectx.RootInstr) + return m.c.Buf() +} + +func (m *machine) saveRegistersInExecutionContext(cur *instruction, execCtx regalloc.VReg, regs []regalloc.VReg) *instruction { + offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64() + for _, v := range regs { + store := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmReg(uint32(offset), execCtx)) + switch v.RegType() { + case regalloc.RegTypeInt: + store.asMovRM(v, mem, 8) + case regalloc.RegTypeFloat: + store.asXmmMovRM(sseOpcodeMovdqu, v, mem) + default: + panic("BUG") + } + cur = linkInstr(cur, store) + offset += 16 // See execution context struct. Each register is 16 bytes-aligned unconditionally. + } + return cur +} + +func (m *machine) restoreRegistersInExecutionContext(cur *instruction, execCtx regalloc.VReg, regs []regalloc.VReg) *instruction { + offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64() + for _, v := range regs { + load := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmReg(uint32(offset), execCtx)) + switch v.RegType() { + case regalloc.RegTypeInt: + load.asMov64MR(mem, v) + case regalloc.RegTypeFloat: + load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v) + default: + panic("BUG") + } + cur = linkInstr(cur, load) + offset += 16 // See execution context struct. Each register is 16 bytes-aligned unconditionally. 
+ } + return cur +} + +func (m *machine) storeReturnAddressAndExit(cur *instruction, execCtx regalloc.VReg) *instruction { + readRip := m.allocateInstr() + cur = linkInstr(cur, readRip) + + ripReg := r12VReg // Callee saved which is already saved. + saveRip := m.allocateInstr().asMovRM( + ripReg, + newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetGoCallReturnAddress.U32(), execCtx)), + 8, + ) + cur = linkInstr(cur, saveRip) + + exit := m.allocateExitSeq(execCtx) + cur = linkInstr(cur, exit) + + nop, l := m.allocateBrTarget() + cur = linkInstr(cur, nop) + readRip.asLEA(newOperandLabel(l), ripReg) + return cur +} + +// stackGrowSaveVRegs is the set of registers that must be saved/restored while growing the stack when there's insufficient +// stack space left. Basically these are all the allocatable registers (which exclude RSP and RBP) except for RAX, which contains the +// execution context pointer. ExecCtx pointer is always the first argument so we don't need to save it. +var stackGrowSaveVRegs = []regalloc.VReg{ + rdxVReg, r12VReg, r13VReg, r14VReg, r15VReg, + rcxVReg, rbxVReg, rsiVReg, rdiVReg, r8VReg, r9VReg, r10VReg, r11VReg, + xmm8VReg, xmm9VReg, xmm10VReg, xmm11VReg, xmm12VReg, xmm13VReg, xmm14VReg, xmm15VReg, + xmm0VReg, xmm1VReg, xmm2VReg, xmm3VReg, xmm4VReg, xmm5VReg, xmm6VReg, xmm7VReg, +} + +// CompileStackGrowCallSequence implements backend.Machine. +func (m *machine) CompileStackGrowCallSequence() []byte { + ectx := m.ectx + + cur := m.allocateNop() + ectx.RootInstr = cur + + cur = m.setupRBPRSP(cur) + + // Execution context is always the first argument. + execCtrPtr := raxVReg + + // Save the callee saved and argument registers. + cur = m.saveRegistersInExecutionContext(cur, execCtrPtr, stackGrowSaveVRegs) + + // Load the exitCode to the register. + exitCodeReg := r12VReg // Already saved. 
+ cur = linkInstr(cur, m.allocateInstr().asImm(exitCodeReg, uint64(wazevoapi.ExitCodeGrowStack), false)) + + saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtrPtr, exitCodeReg) + cur = linkInstr(cur, setExitCode) + cur = linkInstr(cur, saveRsp) + cur = linkInstr(cur, saveRbp) + + // Ready to exit the execution. + cur = m.storeReturnAddressAndExit(cur, execCtrPtr) + + // After the exit, restore the saved registers. + cur = m.restoreRegistersInExecutionContext(cur, execCtrPtr, stackGrowSaveVRegs) + + // Finally ready to return. + cur = m.revertRBPRSP(cur) + linkInstr(cur, m.allocateInstr().asRet()) + + m.encodeWithoutSSA(ectx.RootInstr) + return m.c.Buf() +} + +// insertStackBoundsCheck will insert the instructions after `cur` to check the +// stack bounds, and if there is not sufficient space for the function, +// exit the execution and try growing it in Go world. +func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instruction) *instruction { + // sub $requiredStackSize, %rsp ;; Temporarily lower the sp (addRSP with a negative offset). + // cmp ExecutionContextOffsetStackBottomPtr(%rax), %rsp ;; Compare the stack bottom and the sp. + // ja .ok + // add $requiredStackSize, %rsp ;; Reverse the temporary update. + // pushq r15 ;; save the temporary. + // mov $requiredStackSize, %r15 + // mov %r15, ExecutionContextOffsetStackGrowRequiredSize(%rax) ;; Set the required size in the execution context. + // popq r15 ;; restore the temporary. + // callq *ExecutionContextOffsetStackGrowCallTrampolineAddress(%rax) ;; Call the Go function to grow the stack. + // jmp .cont + // .ok: + // add $requiredStackSize, %rsp ;; Reverse the temporary update. 
+ // .cont: + cur = m.addRSP(-int32(requiredStackSize), cur) + cur = linkInstr(cur, m.allocateInstr().asCmpRmiR(true, + newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackBottomPtr.U32(), raxVReg)), + rspVReg, true)) + + ja := m.allocateInstr() + cur = linkInstr(cur, ja) + + cur = m.addRSP(int32(requiredStackSize), cur) + + // Save the temporary. + + cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(r15VReg))) + // Load the required size to the temporary. + cur = linkInstr(cur, m.allocateInstr().asImm(r15VReg, uint64(requiredStackSize), true)) + // Set the required size in the execution context. + cur = linkInstr(cur, m.allocateInstr().asMovRM(r15VReg, + newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.U32(), raxVReg)), 8)) + // Restore the temporary. + cur = linkInstr(cur, m.allocateInstr().asPop64(r15VReg)) + // Call the Go function to grow the stack. + cur = linkInstr(cur, m.allocateInstr().asCallIndirect(newOperandMem(m.newAmodeImmReg( + wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.U32(), raxVReg)), nil)) + // Jump to the continuation. + jmpToCont := m.allocateInstr() + cur = linkInstr(cur, jmpToCont) + + // .ok: + okInstr, ok := m.allocateBrTarget() + cur = linkInstr(cur, okInstr) + ja.asJmpIf(condNBE, newOperandLabel(ok)) + // On the ok path, we only need to reverse the temporary update. 
+ cur = m.addRSP(int32(requiredStackSize), cur) + + // .cont: + contInstr, cont := m.allocateBrTarget() + cur = linkInstr(cur, contInstr) + jmpToCont.asJmp(newOperandLabel(cont)) + + return cur +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/cond.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/cond.go new file mode 100644 index 000000000..75cbeab75 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/cond.go @@ -0,0 +1,168 @@ +package amd64 + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +type cond byte + +const ( + // condO represents (overflow) condition. + condO cond = iota + // condNO represents (no overflow) condition. + condNO + // condB represents (< unsigned) condition. + condB + // condNB represents (>= unsigned) condition. + condNB + // condZ represents (zero) condition. + condZ + // condNZ represents (not-zero) condition. + condNZ + // condBE represents (<= unsigned) condition. + condBE + // condNBE represents (> unsigned) condition. + condNBE + // condS represents (negative) condition. + condS + // condNS represents (not-negative) condition. + condNS + // condP represents (parity) condition. + condP + // condNP represents (not parity) condition. + condNP + // condL represents (< signed) condition. + condL + // condNL represents (>= signed) condition. + condNL + // condLE represents (<= signed) condition. + condLE + // condNLE represents (> signed) condition. 
+ condNLE + + condInvalid +) + +func (c cond) String() string { + switch c { + case condO: + return "o" + case condNO: + return "no" + case condB: + return "b" + case condNB: + return "nb" + case condZ: + return "z" + case condNZ: + return "nz" + case condBE: + return "be" + case condNBE: + return "nbe" + case condS: + return "s" + case condNS: + return "ns" + case condL: + return "l" + case condNL: + return "nl" + case condLE: + return "le" + case condNLE: + return "nle" + case condP: + return "p" + case condNP: + return "np" + default: + panic("unreachable") + } +} + +func condFromSSAIntCmpCond(origin ssa.IntegerCmpCond) cond { + switch origin { + case ssa.IntegerCmpCondEqual: + return condZ + case ssa.IntegerCmpCondNotEqual: + return condNZ + case ssa.IntegerCmpCondSignedLessThan: + return condL + case ssa.IntegerCmpCondSignedGreaterThanOrEqual: + return condNL + case ssa.IntegerCmpCondSignedGreaterThan: + return condNLE + case ssa.IntegerCmpCondSignedLessThanOrEqual: + return condLE + case ssa.IntegerCmpCondUnsignedLessThan: + return condB + case ssa.IntegerCmpCondUnsignedGreaterThanOrEqual: + return condNB + case ssa.IntegerCmpCondUnsignedGreaterThan: + return condNBE + case ssa.IntegerCmpCondUnsignedLessThanOrEqual: + return condBE + default: + panic("unreachable") + } +} + +func condFromSSAFloatCmpCond(origin ssa.FloatCmpCond) cond { + switch origin { + case ssa.FloatCmpCondGreaterThanOrEqual: + return condNB + case ssa.FloatCmpCondGreaterThan: + return condNBE + case ssa.FloatCmpCondEqual, ssa.FloatCmpCondNotEqual, ssa.FloatCmpCondLessThan, ssa.FloatCmpCondLessThanOrEqual: + panic(fmt.Sprintf("cond %s must be treated as a special case", origin)) + default: + panic("unreachable") + } +} + +func (c cond) encoding() byte { + return byte(c) +} + +func (c cond) invert() cond { + switch c { + case condO: + return condNO + case condNO: + return condO + case condB: + return condNB + case condNB: + return condB + case condZ: + return condNZ + case condNZ: + return 
condZ + case condBE: + return condNBE + case condNBE: + return condBE + case condS: + return condNS + case condNS: + return condS + case condP: + return condNP + case condNP: + return condP + case condL: + return condNL + case condNL: + return condL + case condLE: + return condNLE + case condNLE: + return condLE + default: + panic("unreachable") + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/ext.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/ext.go new file mode 100644 index 000000000..5e731e822 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/ext.go @@ -0,0 +1,35 @@ +package amd64 + +// extMode represents the mode of extension in movzx/movsx. +type extMode byte + +const ( + // extModeBL represents Byte -> Longword. + extModeBL extMode = iota + // extModeBQ represents Byte -> Quadword. + extModeBQ + // extModeWL represents Word -> Longword. + extModeWL + // extModeWQ represents Word -> Quadword. + extModeWQ + // extModeLQ represents Longword -> Quadword. + extModeLQ +) + +// String implements fmt.Stringer. 
+func (e extMode) String() string { + switch e { + case extModeBL: + return "bl" + case extModeBQ: + return "bq" + case extModeWL: + return "wl" + case extModeWQ: + return "wq" + case extModeLQ: + return "lq" + default: + panic("BUG: invalid ext mode") + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr.go new file mode 100644 index 000000000..d27e79c0e --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr.go @@ -0,0 +1,2472 @@ +package amd64 + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +type instruction struct { + prev, next *instruction + op1, op2 operand + u1, u2 uint64 + b1 bool + addedBeforeRegAlloc bool + kind instructionKind +} + +// Next implements regalloc.Instr. +func (i *instruction) Next() regalloc.Instr { + return i.next +} + +// Prev implements regalloc.Instr. +func (i *instruction) Prev() regalloc.Instr { + return i.prev +} + +// IsCall implements regalloc.Instr. +func (i *instruction) IsCall() bool { return i.kind == call } + +// IsIndirectCall implements regalloc.Instr. +func (i *instruction) IsIndirectCall() bool { return i.kind == callIndirect } + +// IsReturn implements regalloc.Instr. +func (i *instruction) IsReturn() bool { return i.kind == ret } + +// AddedBeforeRegAlloc implements regalloc.Instr. +func (i *instruction) AddedBeforeRegAlloc() bool { return i.addedBeforeRegAlloc } + +// String implements regalloc.Instr. 
//
// The text is an assembly-like rendering selected by i.kind. Kinds whose
// printer has not been written yet panic with "TODO", and a kind that has no
// case at all panics with "BUG: <kind number>".
func (i *instruction) String() string {
	switch i.kind {
	case nop0:
		return "nop"
	case sourceOffsetInfo:
		return fmt.Sprintf("source_offset_info %d", i.u1)
	case ret:
		return "ret"
	case imm:
		// b1 selects the 64-bit form; u1 carries the immediate and is printed
		// as a signed value of the matching width.
		if i.b1 {
			return fmt.Sprintf("movabsq $%d, %s", int64(i.u1), i.op2.format(true))
		} else {
			return fmt.Sprintf("movl $%d, %s", int32(i.u1), i.op2.format(false))
		}
	case aluRmiR:
		return fmt.Sprintf("%s %s, %s", aluRmiROpcode(i.u1), i.op1.format(i.b1), i.op2.format(i.b1))
	case movRR:
		if i.b1 {
			return fmt.Sprintf("movq %s, %s", i.op1.format(true), i.op2.format(true))
		} else {
			return fmt.Sprintf("movl %s, %s", i.op1.format(false), i.op2.format(false))
		}
	case xmmRmR:
		return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(false), i.op2.format(false))
	case gprToXmm:
		return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(i.b1), i.op2.format(i.b1))
	case xmmUnaryRmR:
		return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(false), i.op2.format(false))
	case xmmUnaryRmRImm:
		return fmt.Sprintf("%s $%d, %s, %s", sseOpcode(i.u1), roundingMode(i.u2), i.op1.format(false), i.op2.format(false))
	case unaryRmR:
		var suffix string
		if i.b1 {
			suffix = "q"
		} else {
			suffix = "l"
		}
		return fmt.Sprintf("%s%s %s, %s", unaryRmROpcode(i.u1), suffix, i.op1.format(i.b1), i.op2.format(i.b1))
	case not:
		var op string
		if i.b1 {
			op = "notq"
		} else {
			op = "notl"
		}
		return fmt.Sprintf("%s %s", op, i.op1.format(i.b1))
	case neg:
		var op string
		if i.b1 {
			op = "negq"
		} else {
			op = "negl"
		}
		return fmt.Sprintf("%s %s", op, i.op1.format(i.b1))
	case div:
		var prefix string
		var op string
		if i.b1 {
			op = "divq"
		} else {
			op = "divl"
		}
		// A non-zero u1 prints the "i" prefix, i.e. idivq/idivl.
		if i.u1 != 0 {
			prefix = "i"
		}
		return fmt.Sprintf("%s%s %s", prefix, op, i.op1.format(i.b1))
	case mulHi:
		// u1 != 0 means signed, b1 means 64-bit.
		signed, _64 := i.u1 != 0, i.b1
		var op string
		switch {
		case signed && _64:
			op = "imulq"
		case !signed && _64:
			op = "mulq"
		case signed && !_64:
			op = "imull"
		case !signed && !_64:
			op = "mull"
		}
		return fmt.Sprintf("%s %s", op, i.op1.format(i.b1))
	case signExtendData:
		var op string
		if i.b1 {
			op = "cqo"
		} else {
			op = "cdq"
		}
		return op
	case movzxRmR:
		return fmt.Sprintf("movzx.%s %s, %s", extMode(i.u1), i.op1.format(true), i.op2.format(true))
	case mov64MR:
		return fmt.Sprintf("movq %s, %s", i.op1.format(true), i.op2.format(true))
	case lea:
		return fmt.Sprintf("lea %s, %s", i.op1.format(true), i.op2.format(true))
	case movsxRmR:
		return fmt.Sprintf("movsx.%s %s, %s", extMode(i.u1), i.op1.format(true), i.op2.format(true))
	case movRM:
		// u1 is switched on as 1, 2, 4 or 8 to pick the b/w/l/q suffix; any
		// other value leaves the suffix empty.
		var suffix string
		switch i.u1 {
		case 1:
			suffix = "b"
		case 2:
			suffix = "w"
		case 4:
			suffix = "l"
		case 8:
			suffix = "q"
		}
		return fmt.Sprintf("mov.%s %s, %s", suffix, i.op1.format(true), i.op2.format(true))
	case shiftR:
		var suffix string
		if i.b1 {
			suffix = "q"
		} else {
			suffix = "l"
		}
		// op1 is always formatted with false, regardless of b1.
		return fmt.Sprintf("%s%s %s, %s", shiftROp(i.u1), suffix, i.op1.format(false), i.op2.format(i.b1))
	case xmmRmiReg:
		return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(true), i.op2.format(true))
	case cmpRmiR:
		// A non-zero u1 prints "cmp", zero prints "test".
		var op, suffix string
		if i.u1 != 0 {
			op = "cmp"
		} else {
			op = "test"
		}
		if i.b1 {
			suffix = "q"
		} else {
			suffix = "l"
		}
		if op == "test" && i.op1.kind == operandKindMem {
			// Print consistently with AT&T syntax.
			return fmt.Sprintf("%s%s %s, %s", op, suffix, i.op2.format(i.b1), i.op1.format(i.b1))
		}
		return fmt.Sprintf("%s%s %s, %s", op, suffix, i.op1.format(i.b1), i.op2.format(i.b1))
	case setcc:
		return fmt.Sprintf("set%s %s", cond(i.u1), i.op2.format(true))
	case cmove:
		var suffix string
		if i.b1 {
			suffix = "q"
		} else {
			suffix = "l"
		}
		return fmt.Sprintf("cmov%s%s %s, %s", cond(i.u1), suffix, i.op1.format(i.b1), i.op2.format(i.b1))
	case push64:
		return fmt.Sprintf("pushq %s", i.op1.format(true))
	case pop64:
		return fmt.Sprintf("popq %s", i.op1.format(true))
	case xmmMovRM:
		return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(true), i.op2.format(true))
	case xmmLoadConst:
		panic("TODO")
	case xmmToGpr:
		return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(i.b1), i.op2.format(i.b1))
	case cvtUint64ToFloatSeq:
		panic("TODO")
	case cvtFloatToSintSeq:
		panic("TODO")
	case cvtFloatToUintSeq:
		panic("TODO")
	case xmmMinMaxSeq:
		panic("TODO")
	case xmmCmpRmR:
		return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(false), i.op2.format(false))
	case xmmRmRImm:
		// pextrq and pinsrq format both operands with true; every other
		// opcode formats them with false.
		op := sseOpcode(i.u1)
		r1, r2 := i.op1.format(op == sseOpcodePextrq || op == sseOpcodePinsrq),
			i.op2.format(op == sseOpcodePextrq || op == sseOpcodePinsrq)
		return fmt.Sprintf("%s $%d, %s, %s", op, i.u2, r1, r2)
	case jmp:
		return fmt.Sprintf("jmp %s", i.op1.format(true))
	case jmpIf:
		return fmt.Sprintf("j%s %s", cond(i.u1), i.op1.format(true))
	case jmpTableIsland:
		return fmt.Sprintf("jump_table_island: jmp_table_index=%d", i.u1)
	case exitSequence:
		return fmt.Sprintf("exit_sequence %s", i.op1.format(true))
	case ud2:
		return "ud2"
	case call:
		return fmt.Sprintf("call %s", ssa.FuncRef(i.u1))
	case callIndirect:
		return fmt.Sprintf("callq *%s", i.op1.format(true))
	case xchg:
		var suffix string
		switch i.u1 {
		case 1:
			suffix = "b"
		case 2:
			suffix = "w"
		case 4:
			suffix = "l"
		case 8:
			suffix = "q"
		}
		return fmt.Sprintf("xchg.%s %s, %s", suffix, i.op1.format(true), i.op2.format(true))
	case zeros:
		// Only op2 is printed, as both operands of the xor.
		return fmt.Sprintf("xor %s, %s", i.op2.format(true), i.op2.format(true))
	case fcvtToSintSequence:
		execCtx, src, tmpGp, tmpGp2, tmpXmm, src64, dst64, sat := i.fcvtToSintSequenceData()
		return fmt.Sprintf(
			"fcvtToSintSequence execCtx=%s, src=%s, tmpGp=%s, tmpGp2=%s, tmpXmm=%s, src64=%v, dst64=%v, sat=%v",
			formatVRegSized(execCtx, true),
			formatVRegSized(src, true),
			formatVRegSized(tmpGp, true),
			formatVRegSized(tmpGp2, true),
			formatVRegSized(tmpXmm, true), src64, dst64, sat)
	case fcvtToUintSequence:
		execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2, src64, dst64, sat := i.fcvtToUintSequenceData()
		return fmt.Sprintf(
			"fcvtToUintSequence execCtx=%s, src=%s, tmpGp=%s, tmpGp2=%s, tmpXmm=%s, tmpXmm2=%s, src64=%v, dst64=%v, sat=%v",
			formatVRegSized(execCtx, true),
			formatVRegSized(src, true),
			formatVRegSized(tmpGp, true),
			formatVRegSized(tmpGp2, true),
			formatVRegSized(tmpXmm, true),
			formatVRegSized(tmpXmm2, true), src64, dst64, sat)
	case idivRemSequence:
		execCtx, divisor, tmpGp, isDiv, signed, _64 := i.idivRemSequenceData()
		return fmt.Sprintf("idivRemSequence execCtx=%s, divisor=%s, tmpGp=%s, isDiv=%v, signed=%v, _64=%v",
			formatVRegSized(execCtx, true), formatVRegSized(divisor, _64), formatVRegSized(tmpGp, _64), isDiv, signed, _64)
	case defineUninitializedReg:
		return fmt.Sprintf("defineUninitializedReg %s", i.op2.format(true))
	case xmmCMov:
		return fmt.Sprintf("xmmcmov%s %s, %s", cond(i.u1), i.op1.format(true), i.op2.format(true))
	case blendvpd:
		return fmt.Sprintf("blendvpd %s, %s, %%xmm0", i.op1.format(false), i.op2.format(false))
	case mfence:
		return "mfence"
	case lockcmpxchg:
		var suffix string
		switch i.u1 {
		case 1:
			suffix = "b"
		case 2:
			suffix = "w"
		case 4:
			suffix = "l"
		case 8:
			suffix = "q"
		}
		return fmt.Sprintf("lock cmpxchg.%s %s, %s", suffix, i.op1.format(true), i.op2.format(true))
	case lockxadd:
		var suffix string
		switch i.u1 {
		case 1:
			suffix = "b"
		case 2:
			suffix = "w"
		case 4:
			suffix = "l"
		case 8:
			suffix = "q"
		}
		return fmt.Sprintf("lock xadd.%s %s, %s", suffix, i.op1.format(true), i.op2.format(true))

	case nopUseReg:
		return fmt.Sprintf("nop_use_reg %s", i.op1.format(true))

	default:
		panic(fmt.Sprintf("BUG: %d", int(i.kind)))
	}
}

// Defs implements regalloc.Instr.
//
// It truncates *regs to length zero, appends the registers this instruction
// defines as selected by defKinds[i.kind], and returns the resulting slice.
// An unhandled def kind panics.
func (i *instruction) Defs(regs *[]regalloc.VReg) []regalloc.VReg {
	*regs = (*regs)[:0]
	switch dk := defKinds[i.kind]; dk {
	case defKindNone:
	case defKindOp2:
		*regs = append(*regs, i.op2.reg())
	case defKindCall:
		// The counts of integer/float result registers are decoded from u2.
		// Note that the loop variable i shadows the receiver inside the loops.
		_, _, retIntRealRegs, retFloatRealRegs, _ := backend.ABIInfoFromUint64(i.u2)
		for i := byte(0); i < retIntRealRegs; i++ {
			*regs = append(*regs, regInfo.RealRegToVReg[intArgResultRegs[i]])
		}
		for i := byte(0); i < retFloatRealRegs; i++ {
			*regs = append(*regs, regInfo.RealRegToVReg[floatArgResultRegs[i]])
		}
	case defKindDivRem:
		// A division defines rax, a remainder defines rdx.
		_, _, _, isDiv, _, _ := i.idivRemSequenceData()
		if isDiv {
			*regs = append(*regs, raxVReg)
		} else {
			*regs = append(*regs, rdxVReg)
		}
	default:
		panic(fmt.Sprintf("BUG: invalid defKind \"%s\" for %s", dk, i))
	}
	return *regs
}

// Uses implements regalloc.Instr.
//
// It truncates *regs to length zero, appends the registers this instruction
// reads as selected by useKinds[i.kind], and returns the resulting slice.
// The index cases in AssignUse mirror the append order used here (see e.g.
// useKindDivRem), so the two must be changed together.
func (i *instruction) Uses(regs *[]regalloc.VReg) []regalloc.VReg {
	*regs = (*regs)[:0]
	switch uk := useKinds[i.kind]; uk {
	case useKindNone:
	case useKindOp1Op2Reg, useKindOp1RegOp2:
		opAny, opReg := &i.op1, &i.op2
		if uk == useKindOp1RegOp2 {
			// Swapped roles: op1 must be the register, op2 may be anything.
			opAny, opReg = opReg, opAny
		}
		// The destination operand (op2) can be only reg,
		// the source operand (op1) can be imm32, reg or mem.
		switch opAny.kind {
		case operandKindReg:
			*regs = append(*regs, opAny.reg())
		case operandKindMem:
			opAny.addressMode().uses(regs)
		case operandKindImm32:
		default:
			panic(fmt.Sprintf("BUG: invalid operand: %s", i))
		}
		if opReg.kind != operandKindReg {
			panic(fmt.Sprintf("BUG: invalid operand: %s", i))
		}
		*regs = append(*regs, opReg.reg())
	case useKindOp1:
		op := i.op1
		switch op.kind {
		case operandKindReg:
			*regs = append(*regs, op.reg())
		case operandKindMem:
			op.addressMode().uses(regs)
		case operandKindImm32, operandKindLabel:
			// Immediates and labels contribute no registers.
		default:
			panic(fmt.Sprintf("BUG: invalid operand: %s", i))
		}
	case useKindCallInd:
		op := i.op1
		switch op.kind {
		case operandKindReg:
			*regs = append(*regs, op.reg())
		case operandKindMem:
			op.addressMode().uses(regs)
		default:
			panic(fmt.Sprintf("BUG: invalid operand: %s", i))
		}
		// An indirect call additionally uses the argument registers, exactly
		// like a direct call.
		fallthrough
	case useKindCall:
		// The counts of integer/float argument registers are decoded from u2.
		argIntRealRegs, argFloatRealRegs, _, _, _ := backend.ABIInfoFromUint64(i.u2)
		for i := byte(0); i < argIntRealRegs; i++ {
			*regs = append(*regs, regInfo.RealRegToVReg[intArgResultRegs[i]])
		}
		for i := byte(0); i < argFloatRealRegs; i++ {
			*regs = append(*regs, regInfo.RealRegToVReg[floatArgResultRegs[i]])
		}
	case useKindFcvtToSintSequence:
		execCtx, src, tmpGp, tmpGp2, tmpXmm, _, _, _ := i.fcvtToSintSequenceData()
		*regs = append(*regs, execCtx, src, tmpGp, tmpGp2, tmpXmm)
	case useKindFcvtToUintSequence:
		execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2, _, _, _ := i.fcvtToUintSequenceData()
		*regs = append(*regs, execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2)
	case useKindDivRem:
		execCtx, divisor, tmpGp, _, _, _ := i.idivRemSequenceData()
		// idiv uses rax and rdx as implicit operands.
		*regs = append(*regs, raxVReg, rdxVReg, execCtx, divisor, tmpGp)
	case useKindBlendvpd:
		// xmm0 is always reported first, ahead of the explicit operands.
		*regs = append(*regs, xmm0VReg)

		opAny, opReg := &i.op1, &i.op2
		switch opAny.kind {
		case operandKindReg:
			*regs = append(*regs, opAny.reg())
		case operandKindMem:
			opAny.addressMode().uses(regs)
		default:
			panic(fmt.Sprintf("BUG: invalid operand: %s", i))
		}
		if opReg.kind != operandKindReg {
			panic(fmt.Sprintf("BUG: invalid operand: %s", i))
		}
		*regs = append(*regs, opReg.reg())

	case useKindRaxOp1RegOp2:
		// Order: rax, then op1 (must be a register), then op2's registers.
		// NOTE(review): opReg.reg() is read before the kind check at the end
		// of this case.
		opReg, opAny := &i.op1, &i.op2
		*regs = append(*regs, raxVReg, opReg.reg())
		switch opAny.kind {
		case operandKindReg:
			*regs = append(*regs, opAny.reg())
		case operandKindMem:
			opAny.addressMode().uses(regs)
		default:
			panic(fmt.Sprintf("BUG: invalid operand: %s", i))
		}
		if opReg.kind != operandKindReg {
			panic(fmt.Sprintf("BUG: invalid operand: %s", i))
		}

	default:
		panic(fmt.Sprintf("BUG: invalid useKind %s for %s", uk, i))
	}
	return *regs
}

// AssignUse implements regalloc.Instr.
+func (i *instruction) AssignUse(index int, v regalloc.VReg) { + switch uk := useKinds[i.kind]; uk { + case useKindNone: + case useKindCallInd: + if index != 0 { + panic("BUG") + } + op := &i.op1 + switch op.kind { + case operandKindReg: + op.setReg(v) + case operandKindMem: + op.addressMode().assignUses(index, v) + default: + panic("BUG") + } + case useKindOp1Op2Reg, useKindOp1RegOp2: + op, opMustBeReg := &i.op1, &i.op2 + if uk == useKindOp1RegOp2 { + op, opMustBeReg = opMustBeReg, op + } + switch op.kind { + case operandKindReg: + if index == 0 { + op.setReg(v) + } else if index == 1 { + opMustBeReg.setReg(v) + } else { + panic("BUG") + } + case operandKindMem: + nregs := op.addressMode().nregs() + if index < nregs { + op.addressMode().assignUses(index, v) + } else if index == nregs { + opMustBeReg.setReg(v) + } else { + panic("BUG") + } + case operandKindImm32: + if index == 0 { + opMustBeReg.setReg(v) + } else { + panic("BUG") + } + default: + panic(fmt.Sprintf("BUG: invalid operand pair: %s", i)) + } + case useKindOp1: + op := &i.op1 + switch op.kind { + case operandKindReg: + if index != 0 { + panic("BUG") + } + op.setReg(v) + case operandKindMem: + op.addressMode().assignUses(index, v) + default: + panic(fmt.Sprintf("BUG: invalid operand: %s", i)) + } + case useKindFcvtToSintSequence: + switch index { + case 0: + i.op1.addressMode().base = v + case 1: + i.op1.addressMode().index = v + case 2: + i.op2.addressMode().base = v + case 3: + i.op2.addressMode().index = v + case 4: + i.u1 = uint64(v) + default: + panic("BUG") + } + case useKindFcvtToUintSequence: + switch index { + case 0: + i.op1.addressMode().base = v + case 1: + i.op1.addressMode().index = v + case 2: + i.op2.addressMode().base = v + case 3: + i.op2.addressMode().index = v + case 4: + i.u1 = uint64(v) + case 5: + i.u2 = uint64(v) + default: + panic("BUG") + } + case useKindDivRem: + switch index { + case 0: + if v != raxVReg { + panic("BUG") + } + case 1: + if v != rdxVReg { + panic("BUG") + } + 
case 2: + i.op1.setReg(v) + case 3: + i.op2.setReg(v) + case 4: + i.u1 = uint64(v) + default: + panic("BUG") + } + case useKindBlendvpd: + op, opMustBeReg := &i.op1, &i.op2 + if index == 0 { + if v.RealReg() != xmm0 { + panic("BUG") + } + } else { + switch op.kind { + case operandKindReg: + switch index { + case 1: + op.setReg(v) + case 2: + opMustBeReg.setReg(v) + default: + panic("BUG") + } + case operandKindMem: + nregs := op.addressMode().nregs() + index-- + if index < nregs { + op.addressMode().assignUses(index, v) + } else if index == nregs { + opMustBeReg.setReg(v) + } else { + panic("BUG") + } + default: + panic(fmt.Sprintf("BUG: invalid operand pair: %s", i)) + } + } + + case useKindRaxOp1RegOp2: + switch index { + case 0: + if v.RealReg() != rax { + panic("BUG") + } + case 1: + i.op1.setReg(v) + default: + op := &i.op2 + switch op.kind { + case operandKindReg: + switch index { + case 1: + op.setReg(v) + case 2: + op.setReg(v) + default: + panic("BUG") + } + case operandKindMem: + nregs := op.addressMode().nregs() + index -= 2 + if index < nregs { + op.addressMode().assignUses(index, v) + } else if index == nregs { + op.setReg(v) + } else { + panic("BUG") + } + default: + panic(fmt.Sprintf("BUG: invalid operand pair: %s", i)) + } + } + default: + panic(fmt.Sprintf("BUG: invalid useKind %s for %s", uk, i)) + } +} + +// AssignDef implements regalloc.Instr. +func (i *instruction) AssignDef(reg regalloc.VReg) { + switch dk := defKinds[i.kind]; dk { + case defKindNone: + case defKindOp2: + i.op2.setReg(reg) + default: + panic(fmt.Sprintf("BUG: invalid defKind \"%s\" for %s", dk, i)) + } +} + +// IsCopy implements regalloc.Instr. 
+func (i *instruction) IsCopy() bool { + k := i.kind + if k == movRR { + return true + } + if k == xmmUnaryRmR { + if i.op1.kind == operandKindReg { + sse := sseOpcode(i.u1) + return sse == sseOpcodeMovss || sse == sseOpcodeMovsd || sse == sseOpcodeMovdqu + } + } + return false +} + +func resetInstruction(i *instruction) { + *i = instruction{} +} + +func setNext(i *instruction, next *instruction) { + i.next = next +} + +func setPrev(i *instruction, prev *instruction) { + i.prev = prev +} + +func asNop(i *instruction) { + i.kind = nop0 +} + +func (i *instruction) asNop0WithLabel(label backend.Label) *instruction { //nolint + i.kind = nop0 + i.u1 = uint64(label) + return i +} + +func (i *instruction) nop0Label() backend.Label { + return backend.Label(i.u1) +} + +type instructionKind byte + +const ( + nop0 instructionKind = iota + 1 + + // Integer arithmetic/bit-twiddling: (add sub and or xor mul, etc.) (32 64) (reg addr imm) reg + aluRmiR + + // Instructions on GPR that only read src and defines dst (dst is not modified): bsr, etc. + unaryRmR + + // Bitwise not + not + + // Integer negation + neg + + // Integer quotient and remainder: (div idiv) $rax $rdx (reg addr) + div + + // The high bits (RDX) of a (un)signed multiply: RDX:RAX := RAX * rhs. + mulHi + + // Do a sign-extend based on the sign of the value in rax into rdx: (cwd cdq cqo) + // or al into ah: (cbw) + signExtendData + + // Constant materialization: (imm32 imm64) reg. + // Either: movl $imm32, %reg32 or movabsq $imm64, %reg64. + imm + + // GPR to GPR move: mov (64 32) reg reg. + movRR + + // movzxRmR is zero-extended loads or move (R to R), except for 64 bits: movz (bl bq wl wq lq) addr reg. + // Note that the lq variant doesn't really exist since the default zero-extend rule makes it + // unnecessary. For that case we emit the equivalent "movl AM, reg32". + movzxRmR + + // mov64MR is a plain 64-bit integer load, since movzxRmR can't represent that. 
+ mov64MR + + // Loads the memory address of addr into dst. + lea + + // Sign-extended loads and moves: movs (bl bq wl wq lq) addr reg. + movsxRmR + + // Integer stores: mov (b w l q) reg addr. + movRM + + // Arithmetic shifts: (shl shr sar) (b w l q) imm reg. + shiftR + + // Arithmetic SIMD shifts. + xmmRmiReg + + // Integer comparisons/tests: cmp or test (b w l q) (reg addr imm) reg. + cmpRmiR + + // Materializes the requested condition code in the destination reg. + setcc + + // Integer conditional move. + // Overwrites the destination register. + cmove + + // pushq (reg addr imm) + push64 + + // popq reg + pop64 + + // XMM (scalar or vector) binary op: (add sub and or xor mul adc? sbb?) (32 64) (reg addr) reg + xmmRmR + + // XMM (scalar or vector) unary op: mov between XMM registers (32 64) (reg addr) reg. + // + // This differs from xmmRmR in that the dst register of xmmUnaryRmR is not used in the + // computation of the instruction dst value and so does not have to be a previously valid + // value. This is characteristic of mov instructions. + xmmUnaryRmR + + // XMM (scalar or vector) unary op with immediate: roundss, roundsd, etc. + // + // This differs from XMM_RM_R_IMM in that the dst register of + // XmmUnaryRmRImm is not used in the computation of the instruction dst + // value and so does not have to be a previously valid value. + xmmUnaryRmRImm + + // XMM (scalar or vector) unary op (from xmm to mem): stores, movd, movq + xmmMovRM + + // XMM (vector) unary op (to move a constant value into an xmm register): movups + xmmLoadConst + + // XMM (scalar) unary op (from xmm to integer reg): movd, movq, cvtts{s,d}2si + xmmToGpr + + // XMM (scalar) unary op (from integer to float reg): movd, movq, cvtsi2s{s,d} + gprToXmm + + // Converts an unsigned int64 to a float32/float64. + cvtUint64ToFloatSeq + + // Converts a scalar xmm to a signed int32/int64. + cvtFloatToSintSeq + + // Converts a scalar xmm to an unsigned int32/int64. 
+ cvtFloatToUintSeq + + // A sequence to compute min/max with the proper NaN semantics for xmm registers. + xmmMinMaxSeq + + // Float comparisons/tests: cmp (b w l q) (reg addr imm) reg. + xmmCmpRmR + + // A binary XMM instruction with an 8-bit immediate: e.g. cmp (ps pd) imm (reg addr) reg + xmmRmRImm + + // Direct call: call simm32. + // Note that the offset is the relative to the *current RIP*, which points to the first byte of the next instruction. + call + + // Indirect call: callq (reg mem). + callIndirect + + // Return. + ret + + // Jump: jmp (reg, mem, imm32 or label) + jmp + + // Jump conditionally: jcond cond label. + jmpIf + + // jmpTableIsland is to emit the jump table. + jmpTableIsland + + // exitSequence exits the execution and go back to the Go world. + exitSequence + + // An instruction that will always trigger the illegal instruction exception. + ud2 + + // xchg is described in https://www.felixcloutier.com/x86/xchg. + // This instruction uses two operands, where one of them can be a memory address, and swaps their values. + // If the dst is a memory address, the execution is atomic. + xchg + + // lockcmpxchg is the cmpxchg instruction https://www.felixcloutier.com/x86/cmpxchg with a lock prefix. + lockcmpxchg + + // zeros puts zeros into the destination register. This is implemented as xor reg, reg for + // either integer or XMM registers. The reason why we have this instruction instead of using aluRmiR + // is that it requires the already-defined registers. From reg alloc's perspective, this defines + // the destination register and takes no inputs. + zeros + + // sourceOffsetInfo is a dummy instruction to emit source offset info. + // The existence of this instruction does not affect the execution. + sourceOffsetInfo + + // defineUninitializedReg is a no-op instruction that defines a register without a defining instruction. + defineUninitializedReg + + // fcvtToSintSequence is a sequence of instructions to convert a float to a signed integer. 
+ fcvtToSintSequence + + // fcvtToUintSequence is a sequence of instructions to convert a float to an unsigned integer. + fcvtToUintSequence + + // xmmCMov is a conditional move instruction for XMM registers. Lowered after register allocation. + xmmCMov + + // idivRemSequence is a sequence of instructions to compute both the quotient and remainder of a division. + idivRemSequence + + // blendvpd is https://www.felixcloutier.com/x86/blendvpd. + blendvpd + + // mfence is https://www.felixcloutier.com/x86/mfence + mfence + + // lockxadd is xadd https://www.felixcloutier.com/x86/xadd with a lock prefix. + lockxadd + + // nopUseReg is a meta instruction that uses one register and does nothing. + nopUseReg + + instrMax +) + +func (i *instruction) asMFence() *instruction { + i.kind = mfence + return i +} + +func (i *instruction) asNopUseReg(r regalloc.VReg) *instruction { + i.kind = nopUseReg + i.op1 = newOperandReg(r) + return i +} + +func (i *instruction) asIdivRemSequence(execCtx, divisor, tmpGp regalloc.VReg, isDiv, signed, _64 bool) *instruction { + i.kind = idivRemSequence + i.op1 = newOperandReg(execCtx) + i.op2 = newOperandReg(divisor) + i.u1 = uint64(tmpGp) + if isDiv { + i.u2 |= 1 + } + if signed { + i.u2 |= 2 + } + if _64 { + i.u2 |= 4 + } + return i +} + +func (i *instruction) idivRemSequenceData() ( + execCtx, divisor, tmpGp regalloc.VReg, isDiv, signed, _64 bool, +) { + if i.kind != idivRemSequence { + panic("BUG") + } + return i.op1.reg(), i.op2.reg(), regalloc.VReg(i.u1), i.u2&1 != 0, i.u2&2 != 0, i.u2&4 != 0 +} + +func (i *instruction) asXmmCMov(cc cond, x operand, rd regalloc.VReg, size byte) *instruction { + i.kind = xmmCMov + i.op1 = x + i.op2 = newOperandReg(rd) + i.u1 = uint64(cc) + i.u2 = uint64(size) + return i +} + +func (i *instruction) asDefineUninitializedReg(r regalloc.VReg) *instruction { + i.kind = defineUninitializedReg + i.op2 = newOperandReg(r) + return i +} + +func (m *machine) allocateFcvtToUintSequence( + execCtx, src, tmpGp, tmpGp2, 
tmpXmm, tmpXmm2 regalloc.VReg, + src64, dst64, sat bool, +) *instruction { + i := m.allocateInstr() + i.kind = fcvtToUintSequence + op1a := m.amodePool.Allocate() + op2a := m.amodePool.Allocate() + i.op1 = newOperandMem(op1a) + i.op2 = newOperandMem(op2a) + if src64 { + op1a.imm32 = 1 + } else { + op1a.imm32 = 0 + } + if dst64 { + op1a.imm32 |= 2 + } + if sat { + op1a.imm32 |= 4 + } + + op1a.base = execCtx + op1a.index = src + op2a.base = tmpGp + op2a.index = tmpGp2 + i.u1 = uint64(tmpXmm) + i.u2 = uint64(tmpXmm2) + return i +} + +func (i *instruction) fcvtToUintSequenceData() ( + execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2 regalloc.VReg, src64, dst64, sat bool, +) { + if i.kind != fcvtToUintSequence { + panic("BUG") + } + op1a := i.op1.addressMode() + op2a := i.op2.addressMode() + return op1a.base, op1a.index, op2a.base, op2a.index, regalloc.VReg(i.u1), regalloc.VReg(i.u2), + op1a.imm32&1 != 0, op1a.imm32&2 != 0, op1a.imm32&4 != 0 +} + +func (m *machine) allocateFcvtToSintSequence( + execCtx, src, tmpGp, tmpGp2, tmpXmm regalloc.VReg, + src64, dst64, sat bool, +) *instruction { + i := m.allocateInstr() + i.kind = fcvtToSintSequence + op1a := m.amodePool.Allocate() + op2a := m.amodePool.Allocate() + i.op1 = newOperandMem(op1a) + i.op2 = newOperandMem(op2a) + op1a.base = execCtx + op1a.index = src + op2a.base = tmpGp + op2a.index = tmpGp2 + i.u1 = uint64(tmpXmm) + if src64 { + i.u2 = 1 + } else { + i.u2 = 0 + } + if dst64 { + i.u2 |= 2 + } + if sat { + i.u2 |= 4 + } + return i +} + +func (i *instruction) fcvtToSintSequenceData() ( + execCtx, src, tmpGp, tmpGp2, tmpXmm regalloc.VReg, src64, dst64, sat bool, +) { + if i.kind != fcvtToSintSequence { + panic("BUG") + } + op1a := i.op1.addressMode() + op2a := i.op2.addressMode() + return op1a.base, op1a.index, op2a.base, op2a.index, regalloc.VReg(i.u1), + i.u2&1 != 0, i.u2&2 != 0, i.u2&4 != 0 +} + +func (k instructionKind) String() string { + switch k { + case nop0: + return "nop" + case ret: + return "ret" + case imm: + 
return "imm" + case aluRmiR: + return "aluRmiR" + case movRR: + return "movRR" + case xmmRmR: + return "xmmRmR" + case gprToXmm: + return "gprToXmm" + case xmmUnaryRmR: + return "xmmUnaryRmR" + case xmmUnaryRmRImm: + return "xmmUnaryRmRImm" + case unaryRmR: + return "unaryRmR" + case not: + return "not" + case neg: + return "neg" + case div: + return "div" + case mulHi: + return "mulHi" + case signExtendData: + return "signExtendData" + case movzxRmR: + return "movzxRmR" + case mov64MR: + return "mov64MR" + case lea: + return "lea" + case movsxRmR: + return "movsxRmR" + case movRM: + return "movRM" + case shiftR: + return "shiftR" + case xmmRmiReg: + return "xmmRmiReg" + case cmpRmiR: + return "cmpRmiR" + case setcc: + return "setcc" + case cmove: + return "cmove" + case push64: + return "push64" + case pop64: + return "pop64" + case xmmMovRM: + return "xmmMovRM" + case xmmLoadConst: + return "xmmLoadConst" + case xmmToGpr: + return "xmmToGpr" + case cvtUint64ToFloatSeq: + return "cvtUint64ToFloatSeq" + case cvtFloatToSintSeq: + return "cvtFloatToSintSeq" + case cvtFloatToUintSeq: + return "cvtFloatToUintSeq" + case xmmMinMaxSeq: + return "xmmMinMaxSeq" + case xmmCmpRmR: + return "xmmCmpRmR" + case xmmRmRImm: + return "xmmRmRImm" + case jmpIf: + return "jmpIf" + case jmp: + return "jmp" + case jmpTableIsland: + return "jmpTableIsland" + case exitSequence: + return "exit_sequence" + case ud2: + return "ud2" + case xchg: + return "xchg" + case zeros: + return "zeros" + case fcvtToSintSequence: + return "fcvtToSintSequence" + case fcvtToUintSequence: + return "fcvtToUintSequence" + case xmmCMov: + return "xmmCMov" + case idivRemSequence: + return "idivRemSequence" + case mfence: + return "mfence" + case lockcmpxchg: + return "lockcmpxchg" + case lockxadd: + return "lockxadd" + default: + panic("BUG") + } +} + +type aluRmiROpcode byte + +const ( + aluRmiROpcodeAdd aluRmiROpcode = iota + 1 + aluRmiROpcodeSub + aluRmiROpcodeAnd + aluRmiROpcodeOr + aluRmiROpcodeXor + 
aluRmiROpcodeMul +) + +func (a aluRmiROpcode) String() string { + switch a { + case aluRmiROpcodeAdd: + return "add" + case aluRmiROpcodeSub: + return "sub" + case aluRmiROpcodeAnd: + return "and" + case aluRmiROpcodeOr: + return "or" + case aluRmiROpcodeXor: + return "xor" + case aluRmiROpcodeMul: + return "imul" + default: + panic("BUG") + } +} + +func (i *instruction) asJmpIf(cond cond, target operand) *instruction { + i.kind = jmpIf + i.u1 = uint64(cond) + i.op1 = target + return i +} + +// asJmpTableSequence is used to emit the jump table. +// targetSliceIndex is the index of the target slice in machine.jmpTableTargets. +func (i *instruction) asJmpTableSequence(targetSliceIndex int, targetCount int) *instruction { + i.kind = jmpTableIsland + i.u1 = uint64(targetSliceIndex) + i.u2 = uint64(targetCount) + return i +} + +func (i *instruction) asJmp(target operand) *instruction { + i.kind = jmp + i.op1 = target + return i +} + +func (i *instruction) jmpLabel() backend.Label { + switch i.kind { + case jmp, jmpIf, lea, xmmUnaryRmR: + return i.op1.label() + default: + panic("BUG") + } +} + +func (i *instruction) asLEA(target operand, rd regalloc.VReg) *instruction { + i.kind = lea + i.op1 = target + i.op2 = newOperandReg(rd) + return i +} + +func (i *instruction) asCall(ref ssa.FuncRef, abi *backend.FunctionABI) *instruction { + i.kind = call + i.u1 = uint64(ref) + if abi != nil { + i.u2 = abi.ABIInfoAsUint64() + } + return i +} + +func (i *instruction) asCallIndirect(ptr operand, abi *backend.FunctionABI) *instruction { + if ptr.kind != operandKindReg && ptr.kind != operandKindMem { + panic("BUG") + } + i.kind = callIndirect + i.op1 = ptr + if abi != nil { + i.u2 = abi.ABIInfoAsUint64() + } + return i +} + +func (i *instruction) asRet() *instruction { + i.kind = ret + return i +} + +func (i *instruction) asImm(dst regalloc.VReg, value uint64, _64 bool) *instruction { + i.kind = imm + i.op2 = newOperandReg(dst) + i.u1 = value + i.b1 = _64 + return i +} + +func (i 
*instruction) asAluRmiR(op aluRmiROpcode, rm operand, rd regalloc.VReg, _64 bool) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem && rm.kind != operandKindImm32 { + panic("BUG") + } + i.kind = aluRmiR + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + i.b1 = _64 + return i +} + +func (i *instruction) asZeros(dst regalloc.VReg) *instruction { + i.kind = zeros + i.op2 = newOperandReg(dst) + return i +} + +func (i *instruction) asBlendvpd(rm operand, rd regalloc.VReg) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = blendvpd + i.op1 = rm + i.op2 = newOperandReg(rd) + return i +} + +func (i *instruction) asXmmRmR(op sseOpcode, rm operand, rd regalloc.VReg) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = xmmRmR + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + return i +} + +func (i *instruction) asXmmRmRImm(op sseOpcode, imm uint8, rm operand, rd regalloc.VReg) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = xmmRmRImm + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + i.u2 = uint64(imm) + return i +} + +func (i *instruction) asGprToXmm(op sseOpcode, rm operand, rd regalloc.VReg, _64 bool) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = gprToXmm + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + i.b1 = _64 + return i +} + +func (i *instruction) asEmitSourceOffsetInfo(l ssa.SourceOffset) *instruction { + i.kind = sourceOffsetInfo + i.u1 = uint64(l) + return i +} + +func (i *instruction) sourceOffsetInfo() ssa.SourceOffset { + return ssa.SourceOffset(i.u1) +} + +func (i *instruction) asXmmToGpr(op sseOpcode, rm, rd regalloc.VReg, _64 bool) *instruction { + i.kind = xmmToGpr + i.op1 = newOperandReg(rm) + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + i.b1 = _64 + 
return i +} + +func (i *instruction) asMovRM(rm regalloc.VReg, rd operand, size byte) *instruction { + if rd.kind != operandKindMem { + panic("BUG") + } + i.kind = movRM + i.op1 = newOperandReg(rm) + i.op2 = rd + i.u1 = uint64(size) + return i +} + +func (i *instruction) asMovsxRmR(ext extMode, src operand, rd regalloc.VReg) *instruction { + if src.kind != operandKindReg && src.kind != operandKindMem { + panic("BUG") + } + i.kind = movsxRmR + i.op1 = src + i.op2 = newOperandReg(rd) + i.u1 = uint64(ext) + return i +} + +func (i *instruction) asMovzxRmR(ext extMode, src operand, rd regalloc.VReg) *instruction { + if src.kind != operandKindReg && src.kind != operandKindMem { + panic("BUG") + } + i.kind = movzxRmR + i.op1 = src + i.op2 = newOperandReg(rd) + i.u1 = uint64(ext) + return i +} + +func (i *instruction) asSignExtendData(_64 bool) *instruction { + i.kind = signExtendData + i.b1 = _64 + return i +} + +func (i *instruction) asUD2() *instruction { + i.kind = ud2 + return i +} + +func (i *instruction) asDiv(rn operand, signed bool, _64 bool) *instruction { + i.kind = div + i.op1 = rn + i.b1 = _64 + if signed { + i.u1 = 1 + } + return i +} + +func (i *instruction) asMov64MR(rm operand, rd regalloc.VReg) *instruction { + if rm.kind != operandKindMem { + panic("BUG") + } + i.kind = mov64MR + i.op1 = rm + i.op2 = newOperandReg(rd) + return i +} + +func (i *instruction) asMovRR(rm, rd regalloc.VReg, _64 bool) *instruction { + i.kind = movRR + i.op1 = newOperandReg(rm) + i.op2 = newOperandReg(rd) + i.b1 = _64 + return i +} + +func (i *instruction) asNot(rm operand, _64 bool) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = not + i.op1 = rm + i.b1 = _64 + return i +} + +func (i *instruction) asNeg(rm operand, _64 bool) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = neg + i.op1 = rm + i.b1 = _64 + return i +} + +func (i *instruction) asMulHi(rm operand, 
signed, _64 bool) *instruction { + if rm.kind != operandKindReg && (rm.kind != operandKindMem) { + panic("BUG") + } + i.kind = mulHi + i.op1 = rm + i.b1 = _64 + if signed { + i.u1 = 1 + } + return i +} + +func (i *instruction) asUnaryRmR(op unaryRmROpcode, rm operand, rd regalloc.VReg, _64 bool) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = unaryRmR + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + i.b1 = _64 + return i +} + +func (i *instruction) asShiftR(op shiftROp, amount operand, rd regalloc.VReg, _64 bool) *instruction { + if amount.kind != operandKindReg && amount.kind != operandKindImm32 { + panic("BUG") + } + i.kind = shiftR + i.op1 = amount + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + i.b1 = _64 + return i +} + +func (i *instruction) asXmmRmiReg(op sseOpcode, rm operand, rd regalloc.VReg) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindImm32 && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = xmmRmiReg + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + return i +} + +func (i *instruction) asCmpRmiR(cmp bool, rm operand, rn regalloc.VReg, _64 bool) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindImm32 && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = cmpRmiR + i.op1 = rm + i.op2 = newOperandReg(rn) + if cmp { + i.u1 = 1 + } + i.b1 = _64 + return i +} + +func (i *instruction) asSetcc(c cond, rd regalloc.VReg) *instruction { + i.kind = setcc + i.op2 = newOperandReg(rd) + i.u1 = uint64(c) + return i +} + +func (i *instruction) asCmove(c cond, rm operand, rd regalloc.VReg, _64 bool) *instruction { + i.kind = cmove + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(c) + i.b1 = _64 + return i +} + +func (m *machine) allocateExitSeq(execCtx regalloc.VReg) *instruction { + i := m.allocateInstr() + i.kind = exitSequence + i.op1 = newOperandReg(execCtx) + // Allocate the address mode that will be used in 
encoding the exit sequence. + i.op2 = newOperandMem(m.amodePool.Allocate()) + return i +} + +func (i *instruction) asXmmUnaryRmR(op sseOpcode, rm operand, rd regalloc.VReg) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = xmmUnaryRmR + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + return i +} + +func (i *instruction) asXmmUnaryRmRImm(op sseOpcode, imm byte, rm operand, rd regalloc.VReg) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = xmmUnaryRmRImm + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + i.u2 = uint64(imm) + return i +} + +func (i *instruction) asXmmCmpRmR(op sseOpcode, rm operand, rd regalloc.VReg) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = xmmCmpRmR + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + return i +} + +func (i *instruction) asXmmMovRM(op sseOpcode, rm regalloc.VReg, rd operand) *instruction { + if rd.kind != operandKindMem { + panic("BUG") + } + i.kind = xmmMovRM + i.op1 = newOperandReg(rm) + i.op2 = rd + i.u1 = uint64(op) + return i +} + +func (i *instruction) asPop64(rm regalloc.VReg) *instruction { + i.kind = pop64 + i.op1 = newOperandReg(rm) + return i +} + +func (i *instruction) asPush64(op operand) *instruction { + if op.kind != operandKindReg && op.kind != operandKindMem && op.kind != operandKindImm32 { + panic("BUG") + } + i.kind = push64 + i.op1 = op + return i +} + +func (i *instruction) asXCHG(rm regalloc.VReg, rd operand, size byte) *instruction { + i.kind = xchg + i.op1 = newOperandReg(rm) + i.op2 = rd + i.u1 = uint64(size) + return i +} + +func (i *instruction) asLockCmpXCHG(rm regalloc.VReg, rd *amode, size byte) *instruction { + i.kind = lockcmpxchg + i.op1 = newOperandReg(rm) + i.op2 = newOperandMem(rd) + i.u1 = uint64(size) + return i +} + +func (i *instruction) asLockXAdd(rm regalloc.VReg, rd *amode, size 
byte) *instruction { + i.kind = lockxadd + i.op1 = newOperandReg(rm) + i.op2 = newOperandMem(rd) + i.u1 = uint64(size) + return i +} + +type unaryRmROpcode byte + +const ( + unaryRmROpcodeBsr unaryRmROpcode = iota + unaryRmROpcodeBsf + unaryRmROpcodeLzcnt + unaryRmROpcodeTzcnt + unaryRmROpcodePopcnt +) + +func (u unaryRmROpcode) String() string { + switch u { + case unaryRmROpcodeBsr: + return "bsr" + case unaryRmROpcodeBsf: + return "bsf" + case unaryRmROpcodeLzcnt: + return "lzcnt" + case unaryRmROpcodeTzcnt: + return "tzcnt" + case unaryRmROpcodePopcnt: + return "popcnt" + default: + panic("BUG") + } +} + +type shiftROp byte + +const ( + shiftROpRotateLeft shiftROp = 0 + shiftROpRotateRight shiftROp = 1 + shiftROpShiftLeft shiftROp = 4 + shiftROpShiftRightLogical shiftROp = 5 + shiftROpShiftRightArithmetic shiftROp = 7 +) + +func (s shiftROp) String() string { + switch s { + case shiftROpRotateLeft: + return "rol" + case shiftROpRotateRight: + return "ror" + case shiftROpShiftLeft: + return "shl" + case shiftROpShiftRightLogical: + return "shr" + case shiftROpShiftRightArithmetic: + return "sar" + default: + panic("BUG") + } +} + +type sseOpcode byte + +const ( + sseOpcodeInvalid sseOpcode = iota + sseOpcodeAddps + sseOpcodeAddpd + sseOpcodeAddss + sseOpcodeAddsd + sseOpcodeAndps + sseOpcodeAndpd + sseOpcodeAndnps + sseOpcodeAndnpd + sseOpcodeBlendvps + sseOpcodeBlendvpd + sseOpcodeComiss + sseOpcodeComisd + sseOpcodeCmpps + sseOpcodeCmppd + sseOpcodeCmpss + sseOpcodeCmpsd + sseOpcodeCvtdq2ps + sseOpcodeCvtdq2pd + sseOpcodeCvtsd2ss + sseOpcodeCvtsd2si + sseOpcodeCvtsi2ss + sseOpcodeCvtsi2sd + sseOpcodeCvtss2si + sseOpcodeCvtss2sd + sseOpcodeCvttps2dq + sseOpcodeCvttss2si + sseOpcodeCvttsd2si + sseOpcodeDivps + sseOpcodeDivpd + sseOpcodeDivss + sseOpcodeDivsd + sseOpcodeInsertps + sseOpcodeMaxps + sseOpcodeMaxpd + sseOpcodeMaxss + sseOpcodeMaxsd + sseOpcodeMinps + sseOpcodeMinpd + sseOpcodeMinss + sseOpcodeMinsd + sseOpcodeMovaps + sseOpcodeMovapd + sseOpcodeMovd 
+ sseOpcodeMovdqa + sseOpcodeMovdqu + sseOpcodeMovlhps + sseOpcodeMovmskps + sseOpcodeMovmskpd + sseOpcodeMovq + sseOpcodeMovss + sseOpcodeMovsd + sseOpcodeMovups + sseOpcodeMovupd + sseOpcodeMulps + sseOpcodeMulpd + sseOpcodeMulss + sseOpcodeMulsd + sseOpcodeOrps + sseOpcodeOrpd + sseOpcodePabsb + sseOpcodePabsw + sseOpcodePabsd + sseOpcodePackssdw + sseOpcodePacksswb + sseOpcodePackusdw + sseOpcodePackuswb + sseOpcodePaddb + sseOpcodePaddd + sseOpcodePaddq + sseOpcodePaddw + sseOpcodePaddsb + sseOpcodePaddsw + sseOpcodePaddusb + sseOpcodePaddusw + sseOpcodePalignr + sseOpcodePand + sseOpcodePandn + sseOpcodePavgb + sseOpcodePavgw + sseOpcodePcmpeqb + sseOpcodePcmpeqw + sseOpcodePcmpeqd + sseOpcodePcmpeqq + sseOpcodePcmpgtb + sseOpcodePcmpgtw + sseOpcodePcmpgtd + sseOpcodePcmpgtq + sseOpcodePextrb + sseOpcodePextrw + sseOpcodePextrd + sseOpcodePextrq + sseOpcodePinsrb + sseOpcodePinsrw + sseOpcodePinsrd + sseOpcodePinsrq + sseOpcodePmaddwd + sseOpcodePmaxsb + sseOpcodePmaxsw + sseOpcodePmaxsd + sseOpcodePmaxub + sseOpcodePmaxuw + sseOpcodePmaxud + sseOpcodePminsb + sseOpcodePminsw + sseOpcodePminsd + sseOpcodePminub + sseOpcodePminuw + sseOpcodePminud + sseOpcodePmovmskb + sseOpcodePmovsxbd + sseOpcodePmovsxbw + sseOpcodePmovsxbq + sseOpcodePmovsxwd + sseOpcodePmovsxwq + sseOpcodePmovsxdq + sseOpcodePmovzxbd + sseOpcodePmovzxbw + sseOpcodePmovzxbq + sseOpcodePmovzxwd + sseOpcodePmovzxwq + sseOpcodePmovzxdq + sseOpcodePmulld + sseOpcodePmullw + sseOpcodePmuludq + sseOpcodePor + sseOpcodePshufb + sseOpcodePshufd + sseOpcodePsllw + sseOpcodePslld + sseOpcodePsllq + sseOpcodePsraw + sseOpcodePsrad + sseOpcodePsrlw + sseOpcodePsrld + sseOpcodePsrlq + sseOpcodePsubb + sseOpcodePsubd + sseOpcodePsubq + sseOpcodePsubw + sseOpcodePsubsb + sseOpcodePsubsw + sseOpcodePsubusb + sseOpcodePsubusw + sseOpcodePtest + sseOpcodePunpckhbw + sseOpcodePunpcklbw + sseOpcodePxor + sseOpcodeRcpss + sseOpcodeRoundps + sseOpcodeRoundpd + sseOpcodeRoundss + sseOpcodeRoundsd + 
sseOpcodeRsqrtss + sseOpcodeSqrtps + sseOpcodeSqrtpd + sseOpcodeSqrtss + sseOpcodeSqrtsd + sseOpcodeSubps + sseOpcodeSubpd + sseOpcodeSubss + sseOpcodeSubsd + sseOpcodeUcomiss + sseOpcodeUcomisd + sseOpcodeXorps + sseOpcodeXorpd + sseOpcodePmulhrsw + sseOpcodeUnpcklps + sseOpcodeCvtps2pd + sseOpcodeCvtpd2ps + sseOpcodeCvttpd2dq + sseOpcodeShufps + sseOpcodePmaddubsw +) + +func (s sseOpcode) String() string { + switch s { + case sseOpcodeInvalid: + return "invalid" + case sseOpcodeAddps: + return "addps" + case sseOpcodeAddpd: + return "addpd" + case sseOpcodeAddss: + return "addss" + case sseOpcodeAddsd: + return "addsd" + case sseOpcodeAndps: + return "andps" + case sseOpcodeAndpd: + return "andpd" + case sseOpcodeAndnps: + return "andnps" + case sseOpcodeAndnpd: + return "andnpd" + case sseOpcodeBlendvps: + return "blendvps" + case sseOpcodeBlendvpd: + return "blendvpd" + case sseOpcodeComiss: + return "comiss" + case sseOpcodeComisd: + return "comisd" + case sseOpcodeCmpps: + return "cmpps" + case sseOpcodeCmppd: + return "cmppd" + case sseOpcodeCmpss: + return "cmpss" + case sseOpcodeCmpsd: + return "cmpsd" + case sseOpcodeCvtdq2ps: + return "cvtdq2ps" + case sseOpcodeCvtdq2pd: + return "cvtdq2pd" + case sseOpcodeCvtsd2ss: + return "cvtsd2ss" + case sseOpcodeCvtsd2si: + return "cvtsd2si" + case sseOpcodeCvtsi2ss: + return "cvtsi2ss" + case sseOpcodeCvtsi2sd: + return "cvtsi2sd" + case sseOpcodeCvtss2si: + return "cvtss2si" + case sseOpcodeCvtss2sd: + return "cvtss2sd" + case sseOpcodeCvttps2dq: + return "cvttps2dq" + case sseOpcodeCvttss2si: + return "cvttss2si" + case sseOpcodeCvttsd2si: + return "cvttsd2si" + case sseOpcodeDivps: + return "divps" + case sseOpcodeDivpd: + return "divpd" + case sseOpcodeDivss: + return "divss" + case sseOpcodeDivsd: + return "divsd" + case sseOpcodeInsertps: + return "insertps" + case sseOpcodeMaxps: + return "maxps" + case sseOpcodeMaxpd: + return "maxpd" + case sseOpcodeMaxss: + return "maxss" + case sseOpcodeMaxsd: + return 
"maxsd" + case sseOpcodeMinps: + return "minps" + case sseOpcodeMinpd: + return "minpd" + case sseOpcodeMinss: + return "minss" + case sseOpcodeMinsd: + return "minsd" + case sseOpcodeMovaps: + return "movaps" + case sseOpcodeMovapd: + return "movapd" + case sseOpcodeMovd: + return "movd" + case sseOpcodeMovdqa: + return "movdqa" + case sseOpcodeMovdqu: + return "movdqu" + case sseOpcodeMovlhps: + return "movlhps" + case sseOpcodeMovmskps: + return "movmskps" + case sseOpcodeMovmskpd: + return "movmskpd" + case sseOpcodeMovq: + return "movq" + case sseOpcodeMovss: + return "movss" + case sseOpcodeMovsd: + return "movsd" + case sseOpcodeMovups: + return "movups" + case sseOpcodeMovupd: + return "movupd" + case sseOpcodeMulps: + return "mulps" + case sseOpcodeMulpd: + return "mulpd" + case sseOpcodeMulss: + return "mulss" + case sseOpcodeMulsd: + return "mulsd" + case sseOpcodeOrps: + return "orps" + case sseOpcodeOrpd: + return "orpd" + case sseOpcodePabsb: + return "pabsb" + case sseOpcodePabsw: + return "pabsw" + case sseOpcodePabsd: + return "pabsd" + case sseOpcodePackssdw: + return "packssdw" + case sseOpcodePacksswb: + return "packsswb" + case sseOpcodePackusdw: + return "packusdw" + case sseOpcodePackuswb: + return "packuswb" + case sseOpcodePaddb: + return "paddb" + case sseOpcodePaddd: + return "paddd" + case sseOpcodePaddq: + return "paddq" + case sseOpcodePaddw: + return "paddw" + case sseOpcodePaddsb: + return "paddsb" + case sseOpcodePaddsw: + return "paddsw" + case sseOpcodePaddusb: + return "paddusb" + case sseOpcodePaddusw: + return "paddusw" + case sseOpcodePalignr: + return "palignr" + case sseOpcodePand: + return "pand" + case sseOpcodePandn: + return "pandn" + case sseOpcodePavgb: + return "pavgb" + case sseOpcodePavgw: + return "pavgw" + case sseOpcodePcmpeqb: + return "pcmpeqb" + case sseOpcodePcmpeqw: + return "pcmpeqw" + case sseOpcodePcmpeqd: + return "pcmpeqd" + case sseOpcodePcmpeqq: + return "pcmpeqq" + case sseOpcodePcmpgtb: + return 
"pcmpgtb" + case sseOpcodePcmpgtw: + return "pcmpgtw" + case sseOpcodePcmpgtd: + return "pcmpgtd" + case sseOpcodePcmpgtq: + return "pcmpgtq" + case sseOpcodePextrb: + return "pextrb" + case sseOpcodePextrw: + return "pextrw" + case sseOpcodePextrd: + return "pextrd" + case sseOpcodePextrq: + return "pextrq" + case sseOpcodePinsrb: + return "pinsrb" + case sseOpcodePinsrw: + return "pinsrw" + case sseOpcodePinsrd: + return "pinsrd" + case sseOpcodePinsrq: + return "pinsrq" + case sseOpcodePmaddwd: + return "pmaddwd" + case sseOpcodePmaxsb: + return "pmaxsb" + case sseOpcodePmaxsw: + return "pmaxsw" + case sseOpcodePmaxsd: + return "pmaxsd" + case sseOpcodePmaxub: + return "pmaxub" + case sseOpcodePmaxuw: + return "pmaxuw" + case sseOpcodePmaxud: + return "pmaxud" + case sseOpcodePminsb: + return "pminsb" + case sseOpcodePminsw: + return "pminsw" + case sseOpcodePminsd: + return "pminsd" + case sseOpcodePminub: + return "pminub" + case sseOpcodePminuw: + return "pminuw" + case sseOpcodePminud: + return "pminud" + case sseOpcodePmovmskb: + return "pmovmskb" + case sseOpcodePmovsxbd: + return "pmovsxbd" + case sseOpcodePmovsxbw: + return "pmovsxbw" + case sseOpcodePmovsxbq: + return "pmovsxbq" + case sseOpcodePmovsxwd: + return "pmovsxwd" + case sseOpcodePmovsxwq: + return "pmovsxwq" + case sseOpcodePmovsxdq: + return "pmovsxdq" + case sseOpcodePmovzxbd: + return "pmovzxbd" + case sseOpcodePmovzxbw: + return "pmovzxbw" + case sseOpcodePmovzxbq: + return "pmovzxbq" + case sseOpcodePmovzxwd: + return "pmovzxwd" + case sseOpcodePmovzxwq: + return "pmovzxwq" + case sseOpcodePmovzxdq: + return "pmovzxdq" + case sseOpcodePmulld: + return "pmulld" + case sseOpcodePmullw: + return "pmullw" + case sseOpcodePmuludq: + return "pmuludq" + case sseOpcodePor: + return "por" + case sseOpcodePshufb: + return "pshufb" + case sseOpcodePshufd: + return "pshufd" + case sseOpcodePsllw: + return "psllw" + case sseOpcodePslld: + return "pslld" + case sseOpcodePsllq: + return "psllq" + case 
sseOpcodePsraw: + return "psraw" + case sseOpcodePsrad: + return "psrad" + case sseOpcodePsrlw: + return "psrlw" + case sseOpcodePsrld: + return "psrld" + case sseOpcodePsrlq: + return "psrlq" + case sseOpcodePsubb: + return "psubb" + case sseOpcodePsubd: + return "psubd" + case sseOpcodePsubq: + return "psubq" + case sseOpcodePsubw: + return "psubw" + case sseOpcodePsubsb: + return "psubsb" + case sseOpcodePsubsw: + return "psubsw" + case sseOpcodePsubusb: + return "psubusb" + case sseOpcodePsubusw: + return "psubusw" + case sseOpcodePtest: + return "ptest" + case sseOpcodePunpckhbw: + return "punpckhbw" + case sseOpcodePunpcklbw: + return "punpcklbw" + case sseOpcodePxor: + return "pxor" + case sseOpcodeRcpss: + return "rcpss" + case sseOpcodeRoundps: + return "roundps" + case sseOpcodeRoundpd: + return "roundpd" + case sseOpcodeRoundss: + return "roundss" + case sseOpcodeRoundsd: + return "roundsd" + case sseOpcodeRsqrtss: + return "rsqrtss" + case sseOpcodeSqrtps: + return "sqrtps" + case sseOpcodeSqrtpd: + return "sqrtpd" + case sseOpcodeSqrtss: + return "sqrtss" + case sseOpcodeSqrtsd: + return "sqrtsd" + case sseOpcodeSubps: + return "subps" + case sseOpcodeSubpd: + return "subpd" + case sseOpcodeSubss: + return "subss" + case sseOpcodeSubsd: + return "subsd" + case sseOpcodeUcomiss: + return "ucomiss" + case sseOpcodeUcomisd: + return "ucomisd" + case sseOpcodeXorps: + return "xorps" + case sseOpcodeXorpd: + return "xorpd" + case sseOpcodePmulhrsw: + return "pmulhrsw" + case sseOpcodeUnpcklps: + return "unpcklps" + case sseOpcodeCvtps2pd: + return "cvtps2pd" + case sseOpcodeCvtpd2ps: + return "cvtpd2ps" + case sseOpcodeCvttpd2dq: + return "cvttpd2dq" + case sseOpcodeShufps: + return "shufps" + case sseOpcodePmaddubsw: + return "pmaddubsw" + default: + panic("BUG") + } +} + +type roundingMode uint8 + +const ( + roundingModeNearest roundingMode = iota + roundingModeDown + roundingModeUp + roundingModeZero +) + +func (r roundingMode) String() string { + switch 
r { + case roundingModeNearest: + return "nearest" + case roundingModeDown: + return "down" + case roundingModeUp: + return "up" + case roundingModeZero: + return "zero" + default: + panic("BUG") + } +} + +// cmpPred is the immediate value for a comparison operation in xmmRmRImm. +type cmpPred uint8 + +const ( + // cmpPredEQ_OQ is Equal (ordered, non-signaling) + cmpPredEQ_OQ cmpPred = iota + // cmpPredLT_OS is Less-than (ordered, signaling) + cmpPredLT_OS + // cmpPredLE_OS is Less-than-or-equal (ordered, signaling) + cmpPredLE_OS + // cmpPredUNORD_Q is Unordered (non-signaling) + cmpPredUNORD_Q + // cmpPredNEQ_UQ is Not-equal (unordered, non-signaling) + cmpPredNEQ_UQ + // cmpPredNLT_US is Not-less-than (unordered, signaling) + cmpPredNLT_US + // cmpPredNLE_US is Not-less-than-or-equal (unordered, signaling) + cmpPredNLE_US + // cmpPredORD_Q is Ordered (non-signaling) + cmpPredORD_Q + // cmpPredEQ_UQ is Equal (unordered, non-signaling) + cmpPredEQ_UQ + // cmpPredNGE_US is Not-greater-than-or-equal (unordered, signaling) + cmpPredNGE_US + // cmpPredNGT_US is Not-greater-than (unordered, signaling) + cmpPredNGT_US + // cmpPredFALSE_OQ is False (ordered, non-signaling) + cmpPredFALSE_OQ + // cmpPredNEQ_OQ is Not-equal (ordered, non-signaling) + cmpPredNEQ_OQ + // cmpPredGE_OS is Greater-than-or-equal (ordered, signaling) + cmpPredGE_OS + // cmpPredGT_OS is Greater-than (ordered, signaling) + cmpPredGT_OS + // cmpPredTRUE_UQ is True (unordered, non-signaling) + cmpPredTRUE_UQ + // Equal (ordered, signaling) + cmpPredEQ_OS + // Less-than (ordered, nonsignaling) + cmpPredLT_OQ + // Less-than-or-equal (ordered, nonsignaling) + cmpPredLE_OQ + // Unordered (signaling) + cmpPredUNORD_S + // Not-equal (unordered, signaling) + cmpPredNEQ_US + // Not-less-than (unordered, nonsignaling) + cmpPredNLT_UQ + // Not-less-than-or-equal (unordered, nonsignaling) + cmpPredNLE_UQ + // Ordered (signaling) + cmpPredORD_S + // Equal (unordered, signaling) + cmpPredEQ_US + // 
Not-greater-than-or-equal (unordered, non-signaling) + cmpPredNGE_UQ + // Not-greater-than (unordered, nonsignaling) + cmpPredNGT_UQ + // False (ordered, signaling) + cmpPredFALSE_OS + // Not-equal (ordered, signaling) + cmpPredNEQ_OS + // Greater-than-or-equal (ordered, nonsignaling) + cmpPredGE_OQ + // Greater-than (ordered, nonsignaling) + cmpPredGT_OQ + // True (unordered, signaling) + cmpPredTRUE_US +) + +func (r cmpPred) String() string { + switch r { + case cmpPredEQ_OQ: + return "eq_oq" + case cmpPredLT_OS: + return "lt_os" + case cmpPredLE_OS: + return "le_os" + case cmpPredUNORD_Q: + return "unord_q" + case cmpPredNEQ_UQ: + return "neq_uq" + case cmpPredNLT_US: + return "nlt_us" + case cmpPredNLE_US: + return "nle_us" + case cmpPredORD_Q: + return "ord_q" + case cmpPredEQ_UQ: + return "eq_uq" + case cmpPredNGE_US: + return "nge_us" + case cmpPredNGT_US: + return "ngt_us" + case cmpPredFALSE_OQ: + return "false_oq" + case cmpPredNEQ_OQ: + return "neq_oq" + case cmpPredGE_OS: + return "ge_os" + case cmpPredGT_OS: + return "gt_os" + case cmpPredTRUE_UQ: + return "true_uq" + case cmpPredEQ_OS: + return "eq_os" + case cmpPredLT_OQ: + return "lt_oq" + case cmpPredLE_OQ: + return "le_oq" + case cmpPredUNORD_S: + return "unord_s" + case cmpPredNEQ_US: + return "neq_us" + case cmpPredNLT_UQ: + return "nlt_uq" + case cmpPredNLE_UQ: + return "nle_uq" + case cmpPredORD_S: + return "ord_s" + case cmpPredEQ_US: + return "eq_us" + case cmpPredNGE_UQ: + return "nge_uq" + case cmpPredNGT_UQ: + return "ngt_uq" + case cmpPredFALSE_OS: + return "false_os" + case cmpPredNEQ_OS: + return "neq_os" + case cmpPredGE_OQ: + return "ge_oq" + case cmpPredGT_OQ: + return "gt_oq" + case cmpPredTRUE_US: + return "true_us" + default: + panic("BUG") + } +} + +func linkInstr(prev, next *instruction) *instruction { + prev.next = next + next.prev = prev + return next +} + +type defKind byte + +const ( + defKindNone defKind = iota + 1 + defKindOp2 + defKindCall + defKindDivRem +) + +var 
defKinds = [instrMax]defKind{ + nop0: defKindNone, + ret: defKindNone, + movRR: defKindOp2, + movRM: defKindNone, + xmmMovRM: defKindNone, + aluRmiR: defKindNone, + shiftR: defKindNone, + imm: defKindOp2, + unaryRmR: defKindOp2, + xmmRmiReg: defKindNone, + xmmUnaryRmR: defKindOp2, + xmmUnaryRmRImm: defKindOp2, + xmmCmpRmR: defKindNone, + xmmRmR: defKindNone, + xmmRmRImm: defKindNone, + mov64MR: defKindOp2, + movsxRmR: defKindOp2, + movzxRmR: defKindOp2, + gprToXmm: defKindOp2, + xmmToGpr: defKindOp2, + cmove: defKindNone, + call: defKindCall, + callIndirect: defKindCall, + ud2: defKindNone, + jmp: defKindNone, + jmpIf: defKindNone, + jmpTableIsland: defKindNone, + cmpRmiR: defKindNone, + exitSequence: defKindNone, + lea: defKindOp2, + setcc: defKindOp2, + zeros: defKindOp2, + sourceOffsetInfo: defKindNone, + fcvtToSintSequence: defKindNone, + defineUninitializedReg: defKindOp2, + fcvtToUintSequence: defKindNone, + xmmCMov: defKindOp2, + idivRemSequence: defKindDivRem, + blendvpd: defKindNone, + mfence: defKindNone, + xchg: defKindNone, + lockcmpxchg: defKindNone, + lockxadd: defKindNone, + neg: defKindNone, + nopUseReg: defKindNone, +} + +// String implements fmt.Stringer. +func (d defKind) String() string { + switch d { + case defKindNone: + return "none" + case defKindOp2: + return "op2" + case defKindCall: + return "call" + case defKindDivRem: + return "divrem" + default: + return "invalid" + } +} + +type useKind byte + +const ( + useKindNone useKind = iota + 1 + useKindOp1 + // useKindOp1Op2Reg is Op1 can be any operand, Op2 must be a register. + useKindOp1Op2Reg + // useKindOp1RegOp2 is Op1 must be a register, Op2 can be any operand. + useKindOp1RegOp2 + // useKindRaxOp1RegOp2 is Op1 must be a register, Op2 can be any operand, and RAX is used. 
+ useKindRaxOp1RegOp2 + useKindDivRem + useKindBlendvpd + useKindCall + useKindCallInd + useKindFcvtToSintSequence + useKindFcvtToUintSequence +) + +var useKinds = [instrMax]useKind{ + nop0: useKindNone, + ret: useKindNone, + movRR: useKindOp1, + movRM: useKindOp1RegOp2, + xmmMovRM: useKindOp1RegOp2, + cmove: useKindOp1Op2Reg, + aluRmiR: useKindOp1Op2Reg, + shiftR: useKindOp1Op2Reg, + imm: useKindNone, + unaryRmR: useKindOp1, + xmmRmiReg: useKindOp1Op2Reg, + xmmUnaryRmR: useKindOp1, + xmmUnaryRmRImm: useKindOp1, + xmmCmpRmR: useKindOp1Op2Reg, + xmmRmR: useKindOp1Op2Reg, + xmmRmRImm: useKindOp1Op2Reg, + mov64MR: useKindOp1, + movzxRmR: useKindOp1, + movsxRmR: useKindOp1, + gprToXmm: useKindOp1, + xmmToGpr: useKindOp1, + call: useKindCall, + callIndirect: useKindCallInd, + ud2: useKindNone, + jmpIf: useKindOp1, + jmp: useKindOp1, + cmpRmiR: useKindOp1Op2Reg, + exitSequence: useKindOp1, + lea: useKindOp1, + jmpTableIsland: useKindNone, + setcc: useKindNone, + zeros: useKindNone, + sourceOffsetInfo: useKindNone, + fcvtToSintSequence: useKindFcvtToSintSequence, + defineUninitializedReg: useKindNone, + fcvtToUintSequence: useKindFcvtToUintSequence, + xmmCMov: useKindOp1, + idivRemSequence: useKindDivRem, + blendvpd: useKindBlendvpd, + mfence: useKindNone, + xchg: useKindOp1RegOp2, + lockcmpxchg: useKindRaxOp1RegOp2, + lockxadd: useKindOp1RegOp2, + neg: useKindOp1, + nopUseReg: useKindOp1, +} + +func (u useKind) String() string { + switch u { + case useKindNone: + return "none" + case useKindOp1: + return "op1" + case useKindOp1Op2Reg: + return "op1op2Reg" + case useKindOp1RegOp2: + return "op1RegOp2" + case useKindCall: + return "call" + case useKindCallInd: + return "callInd" + default: + return "invalid" + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr_encoding.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr_encoding.go new file mode 100644 index 000000000..6637b428c --- 
/dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr_encoding.go @@ -0,0 +1,1683 @@ +package amd64 + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +func (i *instruction) encode(c backend.Compiler) (needsLabelResolution bool) { + switch kind := i.kind; kind { + case nop0, sourceOffsetInfo, defineUninitializedReg, fcvtToSintSequence, fcvtToUintSequence, nopUseReg: + case ret: + encodeRet(c) + case imm: + dst := regEncodings[i.op2.reg().RealReg()] + con := i.u1 + if i.b1 { // 64 bit. + if lower32willSignExtendTo64(con) { + // Sign extend mov(imm32). + encodeRegReg(c, + legacyPrefixesNone, + 0xc7, 1, + 0, + dst, + rexInfo(0).setW(), + ) + c.Emit4Bytes(uint32(con)) + } else { + c.EmitByte(rexEncodingW | dst.rexBit()) + c.EmitByte(0xb8 | dst.encoding()) + c.Emit8Bytes(con) + } + } else { + if dst.rexBit() > 0 { + c.EmitByte(rexEncodingDefault | 0x1) + } + c.EmitByte(0xb8 | dst.encoding()) + c.Emit4Bytes(uint32(con)) + } + + case aluRmiR: + var rex rexInfo + if i.b1 { + rex = rex.setW() + } else { + rex = rex.clearW() + } + + dst := regEncodings[i.op2.reg().RealReg()] + + aluOp := aluRmiROpcode(i.u1) + if aluOp == aluRmiROpcodeMul { + op1 := i.op1 + const regMemOpc, regMemOpcNum = 0x0FAF, 2 + switch op1.kind { + case operandKindReg: + src := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, legacyPrefixesNone, regMemOpc, regMemOpcNum, dst, src, rex) + case operandKindMem: + m := i.op1.addressMode() + encodeRegMem(c, legacyPrefixesNone, regMemOpc, regMemOpcNum, dst, m, rex) + case operandKindImm32: + imm8 := lower8willSignExtendTo32(op1.imm32()) + var opc uint32 + if imm8 { + opc = 0x6b + } else { + opc = 0x69 + } + encodeRegReg(c, legacyPrefixesNone, opc, 1, dst, dst, rex) + if imm8 
{ + c.EmitByte(byte(op1.imm32())) + } else { + c.Emit4Bytes(op1.imm32()) + } + default: + panic("BUG: invalid operand kind") + } + } else { + const opcodeNum = 1 + var opcR, opcM, subOpcImm uint32 + switch aluOp { + case aluRmiROpcodeAdd: + opcR, opcM, subOpcImm = 0x01, 0x03, 0x0 + case aluRmiROpcodeSub: + opcR, opcM, subOpcImm = 0x29, 0x2b, 0x5 + case aluRmiROpcodeAnd: + opcR, opcM, subOpcImm = 0x21, 0x23, 0x4 + case aluRmiROpcodeOr: + opcR, opcM, subOpcImm = 0x09, 0x0b, 0x1 + case aluRmiROpcodeXor: + opcR, opcM, subOpcImm = 0x31, 0x33, 0x6 + default: + panic("BUG: invalid aluRmiROpcode") + } + + op1 := i.op1 + switch op1.kind { + case operandKindReg: + src := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, legacyPrefixesNone, opcR, opcodeNum, src, dst, rex) + case operandKindMem: + m := i.op1.addressMode() + encodeRegMem(c, legacyPrefixesNone, opcM, opcodeNum, dst, m, rex) + case operandKindImm32: + imm8 := lower8willSignExtendTo32(op1.imm32()) + var opc uint32 + if imm8 { + opc = 0x83 + } else { + opc = 0x81 + } + encodeRegReg(c, legacyPrefixesNone, opc, opcodeNum, regEnc(subOpcImm), dst, rex) + if imm8 { + c.EmitByte(byte(op1.imm32())) + } else { + c.Emit4Bytes(op1.imm32()) + } + default: + panic("BUG: invalid operand kind") + } + } + + case movRR: + src := regEncodings[i.op1.reg().RealReg()] + dst := regEncodings[i.op2.reg().RealReg()] + var rex rexInfo + if i.b1 { + rex = rex.setW() + } else { + rex = rex.clearW() + } + encodeRegReg(c, legacyPrefixesNone, 0x89, 1, src, dst, rex) + + case xmmRmR, blendvpd: + op := sseOpcode(i.u1) + var legPrex legacyPrefixes + var opcode uint32 + var opcodeNum uint32 + switch op { + case sseOpcodeAddps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F58, 2 + case sseOpcodeAddpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F58, 2 + case sseOpcodeAddss: + legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F58, 2 + case sseOpcodeAddsd: + legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F58, 2 + case 
sseOpcodeAndps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F54, 2 + case sseOpcodeAndpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F54, 2 + case sseOpcodeAndnps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F55, 2 + case sseOpcodeAndnpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F55, 2 + case sseOpcodeBlendvps: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3814, 3 + case sseOpcodeBlendvpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3815, 3 + case sseOpcodeDivps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F5E, 2 + case sseOpcodeDivpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F5E, 2 + case sseOpcodeDivss: + legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5E, 2 + case sseOpcodeDivsd: + legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F5E, 2 + case sseOpcodeMaxps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F5F, 2 + case sseOpcodeMaxpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F5F, 2 + case sseOpcodeMaxss: + legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5F, 2 + case sseOpcodeMaxsd: + legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F5F, 2 + case sseOpcodeMinps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F5D, 2 + case sseOpcodeMinpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F5D, 2 + case sseOpcodeMinss: + legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5D, 2 + case sseOpcodeMinsd: + legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F5D, 2 + case sseOpcodeMovlhps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F16, 2 + case sseOpcodeMovsd: + legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F10, 2 + case sseOpcodeMulps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F59, 2 + case sseOpcodeMulpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F59, 2 + case sseOpcodeMulss: + legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F59, 2 + case sseOpcodeMulsd: + legPrex, opcode, opcodeNum = 
legacyPrefixes0xF2, 0x0F59, 2 + case sseOpcodeOrpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F56, 2 + case sseOpcodeOrps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F56, 2 + case sseOpcodePackssdw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F6B, 2 + case sseOpcodePacksswb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F63, 2 + case sseOpcodePackusdw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F382B, 3 + case sseOpcodePackuswb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F67, 2 + case sseOpcodePaddb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FFC, 2 + case sseOpcodePaddd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FFE, 2 + case sseOpcodePaddq: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FD4, 2 + case sseOpcodePaddw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FFD, 2 + case sseOpcodePaddsb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FEC, 2 + case sseOpcodePaddsw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FED, 2 + case sseOpcodePaddusb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDC, 2 + case sseOpcodePaddusw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDD, 2 + case sseOpcodePand: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDB, 2 + case sseOpcodePandn: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDF, 2 + case sseOpcodePavgb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FE0, 2 + case sseOpcodePavgw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FE3, 2 + case sseOpcodePcmpeqb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F74, 2 + case sseOpcodePcmpeqw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F75, 2 + case sseOpcodePcmpeqd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F76, 2 + case sseOpcodePcmpeqq: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3829, 3 + case sseOpcodePcmpgtb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F64, 2 + case 
sseOpcodePcmpgtw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F65, 2 + case sseOpcodePcmpgtd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F66, 2 + case sseOpcodePcmpgtq: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3837, 3 + case sseOpcodePmaddwd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FF5, 2 + case sseOpcodePmaxsb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383C, 3 + case sseOpcodePmaxsw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FEE, 2 + case sseOpcodePmaxsd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383D, 3 + case sseOpcodePmaxub: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDE, 2 + case sseOpcodePmaxuw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383E, 3 + case sseOpcodePmaxud: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383F, 3 + case sseOpcodePminsb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3838, 3 + case sseOpcodePminsw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FEA, 2 + case sseOpcodePminsd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3839, 3 + case sseOpcodePminub: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDA, 2 + case sseOpcodePminuw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383A, 3 + case sseOpcodePminud: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383B, 3 + case sseOpcodePmulld: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3840, 3 + case sseOpcodePmullw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FD5, 2 + case sseOpcodePmuludq: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FF4, 2 + case sseOpcodePor: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FEB, 2 + case sseOpcodePshufb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3800, 3 + case sseOpcodePsubb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FF8, 2 + case sseOpcodePsubd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FFA, 2 + case sseOpcodePsubq: + 
legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FFB, 2 + case sseOpcodePsubw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FF9, 2 + case sseOpcodePsubsb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FE8, 2 + case sseOpcodePsubsw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FE9, 2 + case sseOpcodePsubusb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FD8, 2 + case sseOpcodePsubusw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FD9, 2 + case sseOpcodePunpckhbw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F68, 2 + case sseOpcodePunpcklbw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F60, 2 + case sseOpcodePxor: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FEF, 2 + case sseOpcodeSubps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F5C, 2 + case sseOpcodeSubpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F5C, 2 + case sseOpcodeSubss: + legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5C, 2 + case sseOpcodeSubsd: + legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F5C, 2 + case sseOpcodeXorps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F57, 2 + case sseOpcodeXorpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F57, 2 + case sseOpcodePmulhrsw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F380B, 3 + case sseOpcodeUnpcklps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F14, 2 + case sseOpcodePmaddubsw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3804, 3 + default: + if kind == blendvpd { + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3815, 3 + } else { + panic(fmt.Sprintf("Unsupported sseOpcode: %s", op)) + } + } + + dst := regEncodings[i.op2.reg().RealReg()] + + rex := rexInfo(0).clearW() + op1 := i.op1 + if op1.kind == operandKindReg { + src := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, legPrex, opcode, opcodeNum, dst, src, rex) + } else if i.op1.kind == operandKindMem { + m := i.op1.addressMode() + 
encodeRegMem(c, legPrex, opcode, opcodeNum, dst, m, rex) + } else { + panic("BUG: invalid operand kind") + } + + case gprToXmm: + var legPrefix legacyPrefixes + var opcode uint32 + const opcodeNum = 2 + switch sseOpcode(i.u1) { + case sseOpcodeMovd, sseOpcodeMovq: + legPrefix, opcode = legacyPrefixes0x66, 0x0f6e + case sseOpcodeCvtsi2ss: + legPrefix, opcode = legacyPrefixes0xF3, 0x0f2a + case sseOpcodeCvtsi2sd: + legPrefix, opcode = legacyPrefixes0xF2, 0x0f2a + default: + panic(fmt.Sprintf("Unsupported sseOpcode: %s", sseOpcode(i.u1))) + } + + var rex rexInfo + if i.b1 { + rex = rex.setW() + } else { + rex = rex.clearW() + } + dst := regEncodings[i.op2.reg().RealReg()] + + op1 := i.op1 + if op1.kind == operandKindReg { + src := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, legPrefix, opcode, opcodeNum, dst, src, rex) + } else if i.op1.kind == operandKindMem { + m := i.op1.addressMode() + encodeRegMem(c, legPrefix, opcode, opcodeNum, dst, m, rex) + } else { + panic("BUG: invalid operand kind") + } + + case xmmUnaryRmR: + var prefix legacyPrefixes + var opcode uint32 + var opcodeNum uint32 + op := sseOpcode(i.u1) + switch op { + case sseOpcodeCvtss2sd: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5A, 2 + case sseOpcodeCvtsd2ss: + prefix, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F5A, 2 + case sseOpcodeMovaps: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F28, 2 + case sseOpcodeMovapd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F28, 2 + case sseOpcodeMovdqa: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F6F, 2 + case sseOpcodeMovdqu: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F6F, 2 + case sseOpcodeMovsd: + prefix, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F10, 2 + case sseOpcodeMovss: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F10, 2 + case sseOpcodeMovups: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F10, 2 + case sseOpcodeMovupd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F10, 2 + case 
sseOpcodePabsb: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F381C, 3 + case sseOpcodePabsw: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F381D, 3 + case sseOpcodePabsd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F381E, 3 + case sseOpcodePmovsxbd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3821, 3 + case sseOpcodePmovsxbw: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3820, 3 + case sseOpcodePmovsxbq: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3822, 3 + case sseOpcodePmovsxwd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3823, 3 + case sseOpcodePmovsxwq: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3824, 3 + case sseOpcodePmovsxdq: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3825, 3 + case sseOpcodePmovzxbd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3831, 3 + case sseOpcodePmovzxbw: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3830, 3 + case sseOpcodePmovzxbq: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3832, 3 + case sseOpcodePmovzxwd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3833, 3 + case sseOpcodePmovzxwq: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3834, 3 + case sseOpcodePmovzxdq: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3835, 3 + case sseOpcodeSqrtps: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F51, 2 + case sseOpcodeSqrtpd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F51, 2 + case sseOpcodeSqrtss: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F51, 2 + case sseOpcodeSqrtsd: + prefix, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F51, 2 + case sseOpcodeXorps: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F57, 2 + case sseOpcodeXorpd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F57, 2 + case sseOpcodeCvtdq2ps: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F5B, 2 + case sseOpcodeCvtdq2pd: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0FE6, 2 + case 
sseOpcodeCvtps2pd: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F5A, 2 + case sseOpcodeCvtpd2ps: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F5A, 2 + case sseOpcodeCvttps2dq: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5B, 2 + case sseOpcodeCvttpd2dq: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0FE6, 2 + default: + panic(fmt.Sprintf("Unsupported sseOpcode: %s", op)) + } + + dst := regEncodings[i.op2.reg().RealReg()] + + rex := rexInfo(0).clearW() + op1 := i.op1 + if op1.kind == operandKindReg { + src := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, prefix, opcode, opcodeNum, dst, src, rex) + } else if i.op1.kind == operandKindMem { + m := i.op1.addressMode() + needsLabelResolution = encodeRegMem(c, prefix, opcode, opcodeNum, dst, m, rex) + } else { + panic("BUG: invalid operand kind") + } + + case xmmUnaryRmRImm: + var prefix legacyPrefixes + var opcode uint32 + var opcodeNum uint32 + op := sseOpcode(i.u1) + switch op { + case sseOpcodeRoundps: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f3a08, 3 + case sseOpcodeRoundss: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f3a0a, 3 + case sseOpcodeRoundpd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f3a09, 3 + case sseOpcodeRoundsd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f3a0b, 3 + } + rex := rexInfo(0).clearW() + dst := regEncodings[i.op2.reg().RealReg()] + op1 := i.op1 + if op1.kind == operandKindReg { + src := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, prefix, opcode, opcodeNum, dst, src, rex) + } else if i.op1.kind == operandKindMem { + m := i.op1.addressMode() + encodeRegMem(c, prefix, opcode, opcodeNum, dst, m, rex) + } else { + panic("BUG: invalid operand kind") + } + + c.EmitByte(byte(i.u2)) + + case unaryRmR: + var prefix legacyPrefixes + var opcode uint32 + var opcodeNum uint32 + op := unaryRmROpcode(i.u1) + // We assume size is either 32 or 64. 
+ switch op { + case unaryRmROpcodeBsr: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0fbd, 2 + case unaryRmROpcodeBsf: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0fbc, 2 + case unaryRmROpcodeLzcnt: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0fbd, 2 + case unaryRmROpcodeTzcnt: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0fbc, 2 + case unaryRmROpcodePopcnt: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0fb8, 2 + default: + panic(fmt.Sprintf("Unsupported unaryRmROpcode: %s", op)) + } + + dst := regEncodings[i.op2.reg().RealReg()] + + rex := rexInfo(0) + if i.b1 { // 64 bit. + rex = rexInfo(0).setW() + } else { + rex = rexInfo(0).clearW() + } + op1 := i.op1 + if op1.kind == operandKindReg { + src := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, prefix, opcode, opcodeNum, dst, src, rex) + } else if i.op1.kind == operandKindMem { + m := i.op1.addressMode() + encodeRegMem(c, prefix, opcode, opcodeNum, dst, m, rex) + } else { + panic("BUG: invalid operand kind") + } + + case not: + var prefix legacyPrefixes + src := regEncodings[i.op1.reg().RealReg()] + rex := rexInfo(0) + if i.b1 { // 64 bit. + rex = rexInfo(0).setW() + } else { + rex = rexInfo(0).clearW() + } + subopcode := uint8(2) + encodeEncEnc(c, prefix, 0xf7, 1, subopcode, uint8(src), rex) + + case neg: + var prefix legacyPrefixes + src := regEncodings[i.op1.reg().RealReg()] + rex := rexInfo(0) + if i.b1 { // 64 bit. + rex = rexInfo(0).setW() + } else { + rex = rexInfo(0).clearW() + } + subopcode := uint8(3) + encodeEncEnc(c, prefix, 0xf7, 1, subopcode, uint8(src), rex) + + case div: + rex := rexInfo(0) + if i.b1 { // 64 bit. + rex = rexInfo(0).setW() + } else { + rex = rexInfo(0).clearW() + } + var subopcode uint8 + if i.u1 != 0 { // Signed. 
+ subopcode = 7 + } else { + subopcode = 6 + } + + divisor := i.op1 + if divisor.kind == operandKindReg { + src := regEncodings[divisor.reg().RealReg()] + encodeEncEnc(c, legacyPrefixesNone, 0xf7, 1, subopcode, uint8(src), rex) + } else if divisor.kind == operandKindMem { + m := divisor.addressMode() + encodeEncMem(c, legacyPrefixesNone, 0xf7, 1, subopcode, m, rex) + } else { + panic("BUG: invalid operand kind") + } + + case mulHi: + var prefix legacyPrefixes + rex := rexInfo(0) + if i.b1 { // 64 bit. + rex = rexInfo(0).setW() + } else { + rex = rexInfo(0).clearW() + } + + signed := i.u1 != 0 + var subopcode uint8 + if signed { + subopcode = 5 + } else { + subopcode = 4 + } + + // src1 is implicitly rax, + // dst_lo is implicitly rax, + // dst_hi is implicitly rdx. + src2 := i.op1 + if src2.kind == operandKindReg { + src := regEncodings[src2.reg().RealReg()] + encodeEncEnc(c, prefix, 0xf7, 1, subopcode, uint8(src), rex) + } else if src2.kind == operandKindMem { + m := src2.addressMode() + encodeEncMem(c, prefix, 0xf7, 1, subopcode, m, rex) + } else { + panic("BUG: invalid operand kind") + } + + case signExtendData: + if i.b1 { // 64 bit. 
+ c.EmitByte(0x48) + c.EmitByte(0x99) + } else { + c.EmitByte(0x99) + } + case movzxRmR, movsxRmR: + signed := i.kind == movsxRmR + + ext := extMode(i.u1) + var opcode uint32 + var opcodeNum uint32 + var rex rexInfo + switch ext { + case extModeBL: + if signed { + opcode, opcodeNum, rex = 0x0fbe, 2, rex.clearW() + } else { + opcode, opcodeNum, rex = 0x0fb6, 2, rex.clearW() + } + case extModeBQ: + if signed { + opcode, opcodeNum, rex = 0x0fbe, 2, rex.setW() + } else { + opcode, opcodeNum, rex = 0x0fb6, 2, rex.setW() + } + case extModeWL: + if signed { + opcode, opcodeNum, rex = 0x0fbf, 2, rex.clearW() + } else { + opcode, opcodeNum, rex = 0x0fb7, 2, rex.clearW() + } + case extModeWQ: + if signed { + opcode, opcodeNum, rex = 0x0fbf, 2, rex.setW() + } else { + opcode, opcodeNum, rex = 0x0fb7, 2, rex.setW() + } + case extModeLQ: + if signed { + opcode, opcodeNum, rex = 0x63, 1, rex.setW() + } else { + opcode, opcodeNum, rex = 0x8b, 1, rex.clearW() + } + default: + panic("BUG: invalid extMode") + } + + op := i.op1 + dst := regEncodings[i.op2.reg().RealReg()] + switch op.kind { + case operandKindReg: + src := regEncodings[op.reg().RealReg()] + if ext == extModeBL || ext == extModeBQ { + // Some destinations must be encoded with REX.R = 1. 
+ if e := src.encoding(); e >= 4 && e <= 7 { + rex = rex.always() + } + } + encodeRegReg(c, legacyPrefixesNone, opcode, opcodeNum, dst, src, rex) + case operandKindMem: + m := op.addressMode() + encodeRegMem(c, legacyPrefixesNone, opcode, opcodeNum, dst, m, rex) + default: + panic("BUG: invalid operand kind") + } + + case mov64MR: + m := i.op1.addressMode() + encodeLoad64(c, m, i.op2.reg().RealReg()) + + case lea: + needsLabelResolution = true + dst := regEncodings[i.op2.reg().RealReg()] + rex := rexInfo(0).setW() + const opcode, opcodeNum = 0x8d, 1 + switch i.op1.kind { + case operandKindMem: + a := i.op1.addressMode() + encodeRegMem(c, legacyPrefixesNone, opcode, opcodeNum, dst, a, rex) + case operandKindLabel: + rex.encode(c, regRexBit(byte(dst)), 0) + c.EmitByte(byte((opcode) & 0xff)) + + // Indicate "LEAQ [RIP + 32bit displacement]. + // https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing + c.EmitByte(encodeModRM(0b00, dst.encoding(), 0b101)) + + // This will be resolved later, so we just emit a placeholder (0xffffffff for testing). + c.Emit4Bytes(0xffffffff) + default: + panic("BUG: invalid operand kind") + } + + case movRM: + m := i.op2.addressMode() + src := regEncodings[i.op1.reg().RealReg()] + + var rex rexInfo + switch i.u1 { + case 1: + if e := src.encoding(); e >= 4 && e <= 7 { + rex = rex.always() + } + encodeRegMem(c, legacyPrefixesNone, 0x88, 1, src, m, rex.clearW()) + case 2: + encodeRegMem(c, legacyPrefixes0x66, 0x89, 1, src, m, rex.clearW()) + case 4: + encodeRegMem(c, legacyPrefixesNone, 0x89, 1, src, m, rex.clearW()) + case 8: + encodeRegMem(c, legacyPrefixesNone, 0x89, 1, src, m, rex.setW()) + default: + panic(fmt.Sprintf("BUG: invalid size %d: %s", i.u1, i.String())) + } + + case shiftR: + src := regEncodings[i.op2.reg().RealReg()] + amount := i.op1 + + var opcode uint32 + var prefix legacyPrefixes + rex := rexInfo(0) + if i.b1 { // 64 bit. 
+ rex = rexInfo(0).setW() + } else { + rex = rexInfo(0).clearW() + } + + switch amount.kind { + case operandKindReg: + if amount.reg() != rcxVReg { + panic("BUG: invalid reg operand: must be rcx") + } + opcode, prefix = 0xd3, legacyPrefixesNone + encodeEncEnc(c, prefix, opcode, 1, uint8(i.u1), uint8(src), rex) + case operandKindImm32: + opcode, prefix = 0xc1, legacyPrefixesNone + encodeEncEnc(c, prefix, opcode, 1, uint8(i.u1), uint8(src), rex) + c.EmitByte(byte(amount.imm32())) + default: + panic("BUG: invalid operand kind") + } + case xmmRmiReg: + const legPrefix = legacyPrefixes0x66 + rex := rexInfo(0).clearW() + dst := regEncodings[i.op2.reg().RealReg()] + + var opcode uint32 + var regDigit uint8 + + op := sseOpcode(i.u1) + op1 := i.op1 + if i.op1.kind == operandKindImm32 { + switch op { + case sseOpcodePsllw: + opcode, regDigit = 0x0f71, 6 + case sseOpcodePslld: + opcode, regDigit = 0x0f72, 6 + case sseOpcodePsllq: + opcode, regDigit = 0x0f73, 6 + case sseOpcodePsraw: + opcode, regDigit = 0x0f71, 4 + case sseOpcodePsrad: + opcode, regDigit = 0x0f72, 4 + case sseOpcodePsrlw: + opcode, regDigit = 0x0f71, 2 + case sseOpcodePsrld: + opcode, regDigit = 0x0f72, 2 + case sseOpcodePsrlq: + opcode, regDigit = 0x0f73, 2 + default: + panic("invalid opcode") + } + + encodeEncEnc(c, legPrefix, opcode, 2, regDigit, uint8(dst), rex) + imm32 := op1.imm32() + if imm32 > 0xff&imm32 { + panic("immediate value does not fit 1 byte") + } + c.EmitByte(uint8(imm32)) + } else { + switch op { + case sseOpcodePsllw: + opcode = 0x0ff1 + case sseOpcodePslld: + opcode = 0x0ff2 + case sseOpcodePsllq: + opcode = 0x0ff3 + case sseOpcodePsraw: + opcode = 0x0fe1 + case sseOpcodePsrad: + opcode = 0x0fe2 + case sseOpcodePsrlw: + opcode = 0x0fd1 + case sseOpcodePsrld: + opcode = 0x0fd2 + case sseOpcodePsrlq: + opcode = 0x0fd3 + default: + panic("invalid opcode") + } + + if op1.kind == operandKindReg { + reg := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, legPrefix, opcode, 2, dst, reg, rex) 
+ } else if op1.kind == operandKindMem { + m := op1.addressMode() + encodeRegMem(c, legPrefix, opcode, 2, dst, m, rex) + } else { + panic("BUG: invalid operand kind") + } + } + + case cmpRmiR: + var opcode uint32 + isCmp := i.u1 != 0 + rex := rexInfo(0) + _64 := i.b1 + if _64 { // 64 bit. + rex = rex.setW() + } else { + rex = rex.clearW() + } + dst := regEncodings[i.op2.reg().RealReg()] + op1 := i.op1 + switch op1.kind { + case operandKindReg: + reg := regEncodings[op1.reg().RealReg()] + if isCmp { + opcode = 0x39 + } else { + opcode = 0x85 + } + // Here we swap the encoding of the operands for CMP to be consistent with the output of LLVM/GCC. + encodeRegReg(c, legacyPrefixesNone, opcode, 1, reg, dst, rex) + + case operandKindMem: + if isCmp { + opcode = 0x3b + } else { + opcode = 0x85 + } + m := op1.addressMode() + encodeRegMem(c, legacyPrefixesNone, opcode, 1, dst, m, rex) + + case operandKindImm32: + imm32 := op1.imm32() + useImm8 := isCmp && lower8willSignExtendTo32(imm32) + var subopcode uint8 + + switch { + case isCmp && useImm8: + opcode, subopcode = 0x83, 7 + case isCmp && !useImm8: + opcode, subopcode = 0x81, 7 + default: + opcode, subopcode = 0xf7, 0 + } + encodeEncEnc(c, legacyPrefixesNone, opcode, 1, subopcode, uint8(dst), rex) + if useImm8 { + c.EmitByte(uint8(imm32)) + } else { + c.Emit4Bytes(imm32) + } + + default: + panic("BUG: invalid operand kind") + } + case setcc: + cc := cond(i.u1) + dst := regEncodings[i.op2.reg().RealReg()] + rex := rexInfo(0).clearW().always() + opcode := uint32(0x0f90) + uint32(cc) + encodeEncEnc(c, legacyPrefixesNone, opcode, 2, 0, uint8(dst), rex) + case cmove: + cc := cond(i.u1) + dst := regEncodings[i.op2.reg().RealReg()] + rex := rexInfo(0) + if i.b1 { // 64 bit. 
+ rex = rex.setW() + } else { + rex = rex.clearW() + } + opcode := uint32(0x0f40) + uint32(cc) + src := i.op1 + switch src.kind { + case operandKindReg: + srcReg := regEncodings[src.reg().RealReg()] + encodeRegReg(c, legacyPrefixesNone, opcode, 2, dst, srcReg, rex) + case operandKindMem: + m := src.addressMode() + encodeRegMem(c, legacyPrefixesNone, opcode, 2, dst, m, rex) + default: + panic("BUG: invalid operand kind") + } + case push64: + op := i.op1 + + switch op.kind { + case operandKindReg: + dst := regEncodings[op.reg().RealReg()] + if dst.rexBit() > 0 { + c.EmitByte(rexEncodingDefault | 0x1) + } + c.EmitByte(0x50 | dst.encoding()) + case operandKindMem: + m := op.addressMode() + encodeRegMem( + c, legacyPrefixesNone, 0xff, 1, regEnc(6), m, rexInfo(0).clearW(), + ) + case operandKindImm32: + c.EmitByte(0x68) + c.Emit4Bytes(op.imm32()) + default: + panic("BUG: invalid operand kind") + } + + case pop64: + dst := regEncodings[i.op1.reg().RealReg()] + if dst.rexBit() > 0 { + c.EmitByte(rexEncodingDefault | 0x1) + } + c.EmitByte(0x58 | dst.encoding()) + + case xmmMovRM: + var legPrefix legacyPrefixes + var opcode uint32 + const opcodeNum = 2 + switch sseOpcode(i.u1) { + case sseOpcodeMovaps: + legPrefix, opcode = legacyPrefixesNone, 0x0f29 + case sseOpcodeMovapd: + legPrefix, opcode = legacyPrefixes0x66, 0x0f29 + case sseOpcodeMovdqa: + legPrefix, opcode = legacyPrefixes0x66, 0x0f7f + case sseOpcodeMovdqu: + legPrefix, opcode = legacyPrefixes0xF3, 0x0f7f + case sseOpcodeMovss: + legPrefix, opcode = legacyPrefixes0xF3, 0x0f11 + case sseOpcodeMovsd: + legPrefix, opcode = legacyPrefixes0xF2, 0x0f11 + case sseOpcodeMovups: + legPrefix, opcode = legacyPrefixesNone, 0x0f11 + case sseOpcodeMovupd: + legPrefix, opcode = legacyPrefixes0x66, 0x0f11 + default: + panic(fmt.Sprintf("Unsupported sseOpcode: %s", sseOpcode(i.u1))) + } + + dst := regEncodings[i.op1.reg().RealReg()] + encodeRegMem(c, legPrefix, opcode, opcodeNum, dst, i.op2.addressMode(), rexInfo(0).clearW()) + 
case xmmLoadConst: + panic("TODO") + case xmmToGpr: + var legPrefix legacyPrefixes + var opcode uint32 + var argSwap bool + const opcodeNum = 2 + switch sseOpcode(i.u1) { + case sseOpcodeMovd, sseOpcodeMovq: + legPrefix, opcode, argSwap = legacyPrefixes0x66, 0x0f7e, false + case sseOpcodeMovmskps: + legPrefix, opcode, argSwap = legacyPrefixesNone, 0x0f50, true + case sseOpcodeMovmskpd: + legPrefix, opcode, argSwap = legacyPrefixes0x66, 0x0f50, true + case sseOpcodePmovmskb: + legPrefix, opcode, argSwap = legacyPrefixes0x66, 0x0fd7, true + case sseOpcodeCvttss2si: + legPrefix, opcode, argSwap = legacyPrefixes0xF3, 0x0f2c, true + case sseOpcodeCvttsd2si: + legPrefix, opcode, argSwap = legacyPrefixes0xF2, 0x0f2c, true + default: + panic(fmt.Sprintf("Unsupported sseOpcode: %s", sseOpcode(i.u1))) + } + + var rex rexInfo + if i.b1 { + rex = rex.setW() + } else { + rex = rex.clearW() + } + src := regEncodings[i.op1.reg().RealReg()] + dst := regEncodings[i.op2.reg().RealReg()] + if argSwap { + src, dst = dst, src + } + encodeRegReg(c, legPrefix, opcode, opcodeNum, src, dst, rex) + + case cvtUint64ToFloatSeq: + panic("TODO") + case cvtFloatToSintSeq: + panic("TODO") + case cvtFloatToUintSeq: + panic("TODO") + case xmmMinMaxSeq: + panic("TODO") + case xmmCmpRmR: + var prefix legacyPrefixes + var opcode uint32 + var opcodeNum uint32 + rex := rexInfo(0) + _64 := i.b1 + if _64 { // 64 bit. 
+ rex = rex.setW() + } else { + rex = rex.clearW() + } + + op := sseOpcode(i.u1) + switch op { + case sseOpcodePtest: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f3817, 3 + case sseOpcodeUcomisd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f2e, 2 + case sseOpcodeUcomiss: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0f2e, 2 + default: + panic(fmt.Sprintf("Unsupported sseOpcode: %s", op)) + } + + dst := regEncodings[i.op2.reg().RealReg()] + op1 := i.op1 + switch op1.kind { + case operandKindReg: + reg := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, prefix, opcode, opcodeNum, dst, reg, rex) + + case operandKindMem: + m := op1.addressMode() + encodeRegMem(c, prefix, opcode, opcodeNum, dst, m, rex) + + default: + panic("BUG: invalid operand kind") + } + case xmmRmRImm: + op := sseOpcode(i.u1) + var legPrex legacyPrefixes + var opcode uint32 + var opcodeNum uint32 + var swap bool + switch op { + case sseOpcodeCmpps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0FC2, 2 + case sseOpcodeCmppd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FC2, 2 + case sseOpcodeCmpss: + legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0FC2, 2 + case sseOpcodeCmpsd: + legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0FC2, 2 + case sseOpcodeInsertps: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A21, 3 + case sseOpcodePalignr: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A0F, 3 + case sseOpcodePinsrb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A20, 3 + case sseOpcodePinsrw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FC4, 2 + case sseOpcodePinsrd, sseOpcodePinsrq: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A22, 3 + case sseOpcodePextrb: + swap = true + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A14, 3 + case sseOpcodePextrw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FC5, 2 + case sseOpcodePextrd, sseOpcodePextrq: + swap = true + legPrex, opcode, opcodeNum = 
legacyPrefixes0x66, 0x0F3A16, 3 + case sseOpcodePshufd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F70, 2 + case sseOpcodeRoundps: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A08, 3 + case sseOpcodeRoundpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A09, 3 + case sseOpcodeShufps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0FC6, 2 + default: + panic(fmt.Sprintf("Unsupported sseOpcode: %s", op)) + } + + dst := regEncodings[i.op2.reg().RealReg()] + + var rex rexInfo + if op == sseOpcodePextrq || op == sseOpcodePinsrq { + rex = rexInfo(0).setW() + } else { + rex = rexInfo(0).clearW() + } + op1 := i.op1 + if op1.kind == operandKindReg { + src := regEncodings[op1.reg().RealReg()] + if swap { + src, dst = dst, src + } + encodeRegReg(c, legPrex, opcode, opcodeNum, dst, src, rex) + } else if i.op1.kind == operandKindMem { + if swap { + panic("BUG: this is not possible to encode") + } + m := i.op1.addressMode() + encodeRegMem(c, legPrex, opcode, opcodeNum, dst, m, rex) + } else { + panic("BUG: invalid operand kind") + } + + c.EmitByte(byte(i.u2)) + + case jmp: + const ( + regMemOpcode = 0xff + regMemOpcodeNum = 1 + regMemSubOpcode = 4 + ) + op := i.op1 + switch op.kind { + case operandKindLabel: + needsLabelResolution = true + fallthrough + case operandKindImm32: + c.EmitByte(0xe9) + c.Emit4Bytes(op.imm32()) + case operandKindMem: + m := op.addressMode() + encodeRegMem(c, + legacyPrefixesNone, + regMemOpcode, regMemOpcodeNum, + regMemSubOpcode, m, rexInfo(0).clearW(), + ) + case operandKindReg: + r := op.reg().RealReg() + encodeRegReg( + c, + legacyPrefixesNone, + regMemOpcode, regMemOpcodeNum, + regMemSubOpcode, + regEncodings[r], rexInfo(0).clearW(), + ) + default: + panic("BUG: invalid operand kind") + } + + case jmpIf: + op := i.op1 + switch op.kind { + case operandKindLabel: + needsLabelResolution = true + fallthrough + case operandKindImm32: + c.EmitByte(0x0f) + c.EmitByte(0x80 | cond(i.u1).encoding()) + 
c.Emit4Bytes(op.imm32()) + default: + panic("BUG: invalid operand kind") + } + + case jmpTableIsland: + needsLabelResolution = true + for tc := uint64(0); tc < i.u2; tc++ { + c.Emit8Bytes(0) + } + + case exitSequence: + execCtx := i.op1.reg() + allocatedAmode := i.op2.addressMode() + + // Restore the RBP, RSP, and return to the Go code: + *allocatedAmode = amode{ + kindWithShift: uint32(amodeImmReg), base: execCtx, + imm32: wazevoapi.ExecutionContextOffsetOriginalFramePointer.U32(), + } + encodeLoad64(c, allocatedAmode, rbp) + allocatedAmode.imm32 = wazevoapi.ExecutionContextOffsetOriginalStackPointer.U32() + encodeLoad64(c, allocatedAmode, rsp) + encodeRet(c) + + case ud2: + c.EmitByte(0x0f) + c.EmitByte(0x0b) + + case call: + c.EmitByte(0xe8) + // Meaning that the call target is a function value, and requires relocation. + c.AddRelocationInfo(ssa.FuncRef(i.u1)) + // Note that this is zero as a placeholder for the call target if it's a function value. + c.Emit4Bytes(uint32(i.u2)) + + case callIndirect: + op := i.op1 + + const opcodeNum = 1 + const opcode = 0xff + rex := rexInfo(0).clearW() + switch op.kind { + case operandKindReg: + dst := regEncodings[op.reg().RealReg()] + encodeRegReg(c, + legacyPrefixesNone, + opcode, opcodeNum, + regEnc(2), + dst, + rex, + ) + case operandKindMem: + m := op.addressMode() + encodeRegMem(c, + legacyPrefixesNone, + opcode, opcodeNum, + regEnc(2), + m, + rex, + ) + default: + panic("BUG: invalid operand kind") + } + + case xchg: + src, dst := regEncodings[i.op1.reg().RealReg()], i.op2 + size := i.u1 + + var rex rexInfo + var opcode uint32 + lp := legacyPrefixesNone + switch size { + case 8: + opcode = 0x87 + rex = rexInfo(0).setW() + case 4: + opcode = 0x87 + rex = rexInfo(0).clearW() + case 2: + lp = legacyPrefixes0x66 + opcode = 0x87 + rex = rexInfo(0).clearW() + case 1: + opcode = 0x86 + if i.op2.kind == operandKindReg { + panic("TODO?: xchg on two 1-byte registers") + } + // Some destinations must be encoded with REX.R = 1. 
+ if e := src.encoding(); e >= 4 && e <= 7 { + rex = rexInfo(0).always() + } + default: + panic(fmt.Sprintf("BUG: invalid size %d: %s", size, i.String())) + } + + switch dst.kind { + case operandKindMem: + m := dst.addressMode() + encodeRegMem(c, lp, opcode, 1, src, m, rex) + case operandKindReg: + r := dst.reg().RealReg() + encodeRegReg(c, lp, opcode, 1, src, regEncodings[r], rex) + default: + panic("BUG: invalid operand kind") + } + + case lockcmpxchg: + src, dst := regEncodings[i.op1.reg().RealReg()], i.op2 + size := i.u1 + + var rex rexInfo + var opcode uint32 + lp := legacyPrefixes0xF0 // Lock prefix. + switch size { + case 8: + opcode = 0x0FB1 + rex = rexInfo(0).setW() + case 4: + opcode = 0x0FB1 + rex = rexInfo(0).clearW() + case 2: + lp = legacyPrefixes0x660xF0 // Legacy prefix + Lock prefix. + opcode = 0x0FB1 + rex = rexInfo(0).clearW() + case 1: + opcode = 0x0FB0 + // Some destinations must be encoded with REX.R = 1. + if e := src.encoding(); e >= 4 && e <= 7 { + rex = rexInfo(0).always() + } + default: + panic(fmt.Sprintf("BUG: invalid size %d: %s", size, i.String())) + } + + switch dst.kind { + case operandKindMem: + m := dst.addressMode() + encodeRegMem(c, lp, opcode, 2, src, m, rex) + default: + panic("BUG: invalid operand kind") + } + + case lockxadd: + src, dst := regEncodings[i.op1.reg().RealReg()], i.op2 + size := i.u1 + + var rex rexInfo + var opcode uint32 + lp := legacyPrefixes0xF0 // Lock prefix. + switch size { + case 8: + opcode = 0x0FC1 + rex = rexInfo(0).setW() + case 4: + opcode = 0x0FC1 + rex = rexInfo(0).clearW() + case 2: + lp = legacyPrefixes0x660xF0 // Legacy prefix + Lock prefix. + opcode = 0x0FC1 + rex = rexInfo(0).clearW() + case 1: + opcode = 0x0FC0 + // Some destinations must be encoded with REX.R = 1. 
+ if e := src.encoding(); e >= 4 && e <= 7 { + rex = rexInfo(0).always() + } + default: + panic(fmt.Sprintf("BUG: invalid size %d: %s", size, i.String())) + } + + switch dst.kind { + case operandKindMem: + m := dst.addressMode() + encodeRegMem(c, lp, opcode, 2, src, m, rex) + default: + panic("BUG: invalid operand kind") + } + + case zeros: + r := i.op2.reg() + if r.RegType() == regalloc.RegTypeInt { + i.asAluRmiR(aluRmiROpcodeXor, newOperandReg(r), r, true) + } else { + i.asXmmRmR(sseOpcodePxor, newOperandReg(r), r) + } + i.encode(c) + + case mfence: + // https://www.felixcloutier.com/x86/mfence + c.EmitByte(0x0f) + c.EmitByte(0xae) + c.EmitByte(0xf0) + + default: + panic(fmt.Sprintf("TODO: %v", i.kind)) + } + return +} + +func encodeLoad64(c backend.Compiler, m *amode, rd regalloc.RealReg) { + dst := regEncodings[rd] + encodeRegMem(c, legacyPrefixesNone, 0x8b, 1, dst, m, rexInfo(0).setW()) +} + +func encodeRet(c backend.Compiler) { + c.EmitByte(0xc3) +} + +func encodeEncEnc( + c backend.Compiler, + legPrefixes legacyPrefixes, + opcodes uint32, + opcodeNum uint32, + r uint8, + rm uint8, + rex rexInfo, +) { + legPrefixes.encode(c) + rex.encode(c, r>>3, rm>>3) + + for opcodeNum > 0 { + opcodeNum-- + c.EmitByte(byte((opcodes >> (opcodeNum << 3)) & 0xff)) + } + c.EmitByte(encodeModRM(3, r&7, rm&7)) +} + +func encodeRegReg( + c backend.Compiler, + legPrefixes legacyPrefixes, + opcodes uint32, + opcodeNum uint32, + r regEnc, + rm regEnc, + rex rexInfo, +) { + encodeEncEnc(c, legPrefixes, opcodes, opcodeNum, uint8(r), uint8(rm), rex) +} + +func encodeModRM(mod byte, reg byte, rm byte) byte { + return mod<<6 | reg<<3 | rm +} + +func encodeSIB(shift byte, encIndex byte, encBase byte) byte { + return shift<<6 | encIndex<<3 | encBase +} + +func encodeRegMem( + c backend.Compiler, legPrefixes legacyPrefixes, opcodes uint32, opcodeNum uint32, r regEnc, m *amode, rex rexInfo, +) (needsLabelResolution bool) { + needsLabelResolution = encodeEncMem(c, legPrefixes, opcodes, 
opcodeNum, uint8(r), m, rex) + return +} + +func encodeEncMem( + c backend.Compiler, legPrefixes legacyPrefixes, opcodes uint32, opcodeNum uint32, r uint8, m *amode, rex rexInfo, +) (needsLabelResolution bool) { + legPrefixes.encode(c) + + const ( + modNoDisplacement = 0b00 + modShortDisplacement = 0b01 + modLongDisplacement = 0b10 + + useSBI = 4 // the encoding of rsp or r12 register. + ) + + switch m.kind() { + case amodeImmReg, amodeImmRBP: + base := m.base.RealReg() + baseEnc := regEncodings[base] + + rex.encode(c, regRexBit(r), baseEnc.rexBit()) + + for opcodeNum > 0 { + opcodeNum-- + c.EmitByte(byte((opcodes >> (opcodeNum << 3)) & 0xff)) + } + + // SIB byte is the last byte of the memory encoding before the displacement + const sibByte = 0x24 // == encodeSIB(0, 4, 4) + + immZero, baseRbp, baseR13 := m.imm32 == 0, base == rbp, base == r13 + short := lower8willSignExtendTo32(m.imm32) + rspOrR12 := base == rsp || base == r12 + + if immZero && !baseRbp && !baseR13 { // rbp or r13 can't be used as base for without displacement encoding. + c.EmitByte(encodeModRM(modNoDisplacement, regEncoding(r), baseEnc.encoding())) + if rspOrR12 { + c.EmitByte(sibByte) + } + } else if short { // Note: this includes the case where m.imm32 == 0 && base == rbp || base == r13. 
+ c.EmitByte(encodeModRM(modShortDisplacement, regEncoding(r), baseEnc.encoding())) + if rspOrR12 { + c.EmitByte(sibByte) + } + c.EmitByte(byte(m.imm32)) + } else { + c.EmitByte(encodeModRM(modLongDisplacement, regEncoding(r), baseEnc.encoding())) + if rspOrR12 { + c.EmitByte(sibByte) + } + c.Emit4Bytes(m.imm32) + } + + case amodeRegRegShift: + base := m.base.RealReg() + baseEnc := regEncodings[base] + index := m.index.RealReg() + indexEnc := regEncodings[index] + + if index == rsp { + panic("BUG: rsp can't be used as index of addressing mode") + } + + rex.encodeForIndex(c, regEnc(r), indexEnc, baseEnc) + + for opcodeNum > 0 { + opcodeNum-- + c.EmitByte(byte((opcodes >> (opcodeNum << 3)) & 0xff)) + } + + immZero, baseRbp, baseR13 := m.imm32 == 0, base == rbp, base == r13 + if immZero && !baseRbp && !baseR13 { // rbp or r13 can't be used as base for without displacement encoding. (curious why? because it's interpreted as RIP relative addressing). + c.EmitByte(encodeModRM(modNoDisplacement, regEncoding(r), useSBI)) + c.EmitByte(encodeSIB(m.shift(), indexEnc.encoding(), baseEnc.encoding())) + } else if lower8willSignExtendTo32(m.imm32) { + c.EmitByte(encodeModRM(modShortDisplacement, regEncoding(r), useSBI)) + c.EmitByte(encodeSIB(m.shift(), indexEnc.encoding(), baseEnc.encoding())) + c.EmitByte(byte(m.imm32)) + } else { + c.EmitByte(encodeModRM(modLongDisplacement, regEncoding(r), useSBI)) + c.EmitByte(encodeSIB(m.shift(), indexEnc.encoding(), baseEnc.encoding())) + c.Emit4Bytes(m.imm32) + } + + case amodeRipRel: + rex.encode(c, regRexBit(r), 0) + for opcodeNum > 0 { + opcodeNum-- + c.EmitByte(byte((opcodes >> (opcodeNum << 3)) & 0xff)) + } + + // Indicate "LEAQ [RIP + 32bit displacement]. + // https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing + c.EmitByte(encodeModRM(0b00, regEncoding(r), 0b101)) + + // This will be resolved later, so we just emit a placeholder. 
+ needsLabelResolution = true + c.Emit4Bytes(0) + + default: + panic("BUG: invalid addressing mode") + } + return +} + +const ( + rexEncodingDefault byte = 0x40 + rexEncodingW = rexEncodingDefault | 0x08 +) + +// rexInfo is a bit set to indicate: +// +// 0x01: W bit must be set. +// 0x02: REX prefix must be emitted. +type rexInfo byte + +func (ri rexInfo) setW() rexInfo { + return ri | 0x01 +} + +func (ri rexInfo) clearW() rexInfo { + return ri & 0x02 +} + +func (ri rexInfo) always() rexInfo { + return ri | 0x02 +} + +func (ri rexInfo) notAlways() rexInfo { //nolint + return ri & 0x01 +} + +func (ri rexInfo) encode(c backend.Compiler, r uint8, b uint8) { + var w byte = 0 + if ri&0x01 != 0 { + w = 0x01 + } + rex := rexEncodingDefault | w<<3 | r<<2 | b + if rex != rexEncodingDefault || ri&0x02 != 0 { + c.EmitByte(rex) + } +} + +func (ri rexInfo) encodeForIndex(c backend.Compiler, encR regEnc, encIndex regEnc, encBase regEnc) { + var w byte = 0 + if ri&0x01 != 0 { + w = 0x01 + } + r := encR.rexBit() + x := encIndex.rexBit() + b := encBase.rexBit() + rex := byte(0x40) | w<<3 | r<<2 | x<<1 | b + if rex != 0x40 || ri&0x02 != 0 { + c.EmitByte(rex) + } +} + +type regEnc byte + +func (r regEnc) rexBit() byte { + return regRexBit(byte(r)) +} + +func (r regEnc) encoding() byte { + return regEncoding(byte(r)) +} + +func regRexBit(r byte) byte { + return r >> 3 +} + +func regEncoding(r byte) byte { + return r & 0x07 +} + +var regEncodings = [...]regEnc{ + rax: 0b000, + rcx: 0b001, + rdx: 0b010, + rbx: 0b011, + rsp: 0b100, + rbp: 0b101, + rsi: 0b110, + rdi: 0b111, + r8: 0b1000, + r9: 0b1001, + r10: 0b1010, + r11: 0b1011, + r12: 0b1100, + r13: 0b1101, + r14: 0b1110, + r15: 0b1111, + xmm0: 0b000, + xmm1: 0b001, + xmm2: 0b010, + xmm3: 0b011, + xmm4: 0b100, + xmm5: 0b101, + xmm6: 0b110, + xmm7: 0b111, + xmm8: 0b1000, + xmm9: 0b1001, + xmm10: 0b1010, + xmm11: 0b1011, + xmm12: 0b1100, + xmm13: 0b1101, + xmm14: 0b1110, + xmm15: 0b1111, +} + +type legacyPrefixes byte + +const ( + 
legacyPrefixesNone legacyPrefixes = iota + legacyPrefixes0x66 + legacyPrefixes0xF0 + legacyPrefixes0x660xF0 + legacyPrefixes0xF2 + legacyPrefixes0xF3 +) + +func (p legacyPrefixes) encode(c backend.Compiler) { + switch p { + case legacyPrefixesNone: + case legacyPrefixes0x66: + c.EmitByte(0x66) + case legacyPrefixes0xF0: + c.EmitByte(0xf0) + case legacyPrefixes0x660xF0: + c.EmitByte(0x66) + c.EmitByte(0xf0) + case legacyPrefixes0xF2: + c.EmitByte(0xf2) + case legacyPrefixes0xF3: + c.EmitByte(0xf3) + default: + panic("BUG: invalid legacy prefix") + } +} + +func lower32willSignExtendTo64(x uint64) bool { + xs := int64(x) + return xs == int64(uint64(int32(xs))) +} + +func lower8willSignExtendTo32(x uint32) bool { + xs := int32(x) + return xs == ((xs << 24) >> 24) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_constant.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_constant.go new file mode 100644 index 000000000..55d05ef63 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_constant.go @@ -0,0 +1,71 @@ +package amd64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// lowerConstant allocates a new VReg and inserts the instruction to load the constant value. +func (m *machine) lowerConstant(instr *ssa.Instruction) (vr regalloc.VReg) { + val := instr.Return() + valType := val.Type() + + vr = m.c.AllocateVReg(valType) + m.insertLoadConstant(instr, vr) + return +} + +// InsertLoadConstantBlockArg implements backend.Machine. 
+func (m *machine) InsertLoadConstantBlockArg(instr *ssa.Instruction, vr regalloc.VReg) { + m.insertLoadConstant(instr, vr) +} + +func (m *machine) insertLoadConstant(instr *ssa.Instruction, vr regalloc.VReg) { + val := instr.Return() + valType := val.Type() + v := instr.ConstantVal() + + bits := valType.Bits() + if bits < 64 { // Clear the redundant bits just in case it's unexpectedly sign-extended, etc. + v = v & ((1 << valType.Bits()) - 1) + } + + switch valType { + case ssa.TypeF32, ssa.TypeF64: + m.lowerFconst(vr, v, bits == 64) + case ssa.TypeI32, ssa.TypeI64: + m.lowerIconst(vr, v, bits == 64) + default: + panic("BUG") + } +} + +func (m *machine) lowerFconst(dst regalloc.VReg, c uint64, _64 bool) { + if c == 0 { + xor := m.allocateInstr().asZeros(dst) + m.insert(xor) + } else { + var tmpType ssa.Type + if _64 { + tmpType = ssa.TypeI64 + } else { + tmpType = ssa.TypeI32 + } + tmpInt := m.c.AllocateVReg(tmpType) + loadToGP := m.allocateInstr().asImm(tmpInt, c, _64) + m.insert(loadToGP) + + movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpInt), dst, _64) + m.insert(movToXmm) + } +} + +func (m *machine) lowerIconst(dst regalloc.VReg, c uint64, _64 bool) { + i := m.allocateInstr() + if c == 0 { + i.asZeros(dst) + } else { + i.asImm(dst, c, _64) + } + m.insert(i) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_mem.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_mem.go new file mode 100644 index 000000000..bee673d25 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_mem.go @@ -0,0 +1,187 @@ +package amd64 + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +var addendsMatchOpcodes = [...]ssa.Opcode{ssa.OpcodeUExtend, 
ssa.OpcodeSExtend, ssa.OpcodeIadd, ssa.OpcodeIconst, ssa.OpcodeIshl} + +type addend struct { + r regalloc.VReg + off int64 + shift byte +} + +func (a addend) String() string { + return fmt.Sprintf("addend{r=%s, off=%d, shift=%d}", a.r, a.off, a.shift) +} + +// lowerToAddressMode converts a pointer to an addressMode that can be used as an operand for load/store instructions. +func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32) (am *amode) { + def := m.c.ValueDefinition(ptr) + + if offsetBase&0x80000000 != 0 { + // Special casing the huge base offset whose MSB is set. In x64, the immediate is always + // sign-extended, but our IR semantics requires the offset base is always unsigned. + // Note that this should be extremely rare or even this shouldn't hit in the real application, + // therefore we don't need to optimize this case in my opinion. + + a := m.lowerAddend(def) + off64 := a.off + int64(offsetBase) + offsetBaseReg := m.c.AllocateVReg(ssa.TypeI64) + m.lowerIconst(offsetBaseReg, uint64(off64), true) + if a.r != regalloc.VRegInvalid { + return m.newAmodeRegRegShift(0, offsetBaseReg, a.r, a.shift) + } else { + return m.newAmodeImmReg(0, offsetBaseReg) + } + } + + if op := m.c.MatchInstrOneOf(def, addendsMatchOpcodes[:]); op == ssa.OpcodeIadd { + add := def.Instr + x, y := add.Arg2() + xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) + ax := m.lowerAddend(xDef) + ay := m.lowerAddend(yDef) + add.MarkLowered() + return m.lowerAddendsToAmode(ax, ay, offsetBase) + } else { + // If it is not an Iadd, then we lower the one addend. + a := m.lowerAddend(def) + // off is always 0 if r is valid. 
+ if a.r != regalloc.VRegInvalid { + if a.shift != 0 { + tmpReg := m.c.AllocateVReg(ssa.TypeI64) + m.lowerIconst(tmpReg, 0, true) + return m.newAmodeRegRegShift(offsetBase, tmpReg, a.r, a.shift) + } + return m.newAmodeImmReg(offsetBase, a.r) + } else { + off64 := a.off + int64(offsetBase) + tmpReg := m.c.AllocateVReg(ssa.TypeI64) + m.lowerIconst(tmpReg, uint64(off64), true) + return m.newAmodeImmReg(0, tmpReg) + } + } +} + +func (m *machine) lowerAddendsToAmode(x, y addend, offBase uint32) *amode { + if x.r != regalloc.VRegInvalid && x.off != 0 || y.r != regalloc.VRegInvalid && y.off != 0 { + panic("invalid input") + } + + u64 := uint64(x.off+y.off) + uint64(offBase) + if u64 != 0 { + if _, ok := asImm32(u64, false); !ok { + tmpReg := m.c.AllocateVReg(ssa.TypeI64) + m.lowerIconst(tmpReg, u64, true) + // Blank u64 as it has been already lowered. + u64 = 0 + + if x.r == regalloc.VRegInvalid { + x.r = tmpReg + } else if y.r == regalloc.VRegInvalid { + y.r = tmpReg + } else { + // We already know that either rx or ry is invalid, + // so we overwrite it with the temporary register. + panic("BUG") + } + } + } + + u32 := uint32(u64) + switch { + // We assume rx, ry are valid iff offx, offy are 0. + case x.r != regalloc.VRegInvalid && y.r != regalloc.VRegInvalid: + switch { + case x.shift != 0 && y.shift != 0: + // Cannot absorb two shifted registers, must lower one to a shift instruction. + shifted := m.allocateInstr() + shifted.asShiftR(shiftROpShiftLeft, newOperandImm32(uint32(x.shift)), x.r, true) + m.insert(shifted) + + return m.newAmodeRegRegShift(u32, x.r, y.r, y.shift) + case x.shift != 0 && y.shift == 0: + // Swap base and index. 
+ x, y = y, x + fallthrough + default: + return m.newAmodeRegRegShift(u32, x.r, y.r, y.shift) + } + case x.r == regalloc.VRegInvalid && y.r != regalloc.VRegInvalid: + x, y = y, x + fallthrough + case x.r != regalloc.VRegInvalid && y.r == regalloc.VRegInvalid: + if x.shift != 0 { + zero := m.c.AllocateVReg(ssa.TypeI64) + m.lowerIconst(zero, 0, true) + return m.newAmodeRegRegShift(u32, zero, x.r, x.shift) + } + return m.newAmodeImmReg(u32, x.r) + default: // Both are invalid: use the offset. + tmpReg := m.c.AllocateVReg(ssa.TypeI64) + m.lowerIconst(tmpReg, u64, true) + return m.newAmodeImmReg(0, tmpReg) + } +} + +func (m *machine) lowerAddend(x *backend.SSAValueDefinition) addend { + if x.IsFromBlockParam() { + return addend{x.BlkParamVReg, 0, 0} + } + // Ensure the addend is not referenced in multiple places; we will discard nested Iadds. + op := m.c.MatchInstrOneOf(x, addendsMatchOpcodes[:]) + if op != ssa.OpcodeInvalid && op != ssa.OpcodeIadd { + return m.lowerAddendFromInstr(x.Instr) + } + p := m.getOperand_Reg(x) + return addend{p.reg(), 0, 0} +} + +// lowerAddendFromInstr takes an instruction returns a Vreg and an offset that can be used in an address mode. +// The Vreg is regalloc.VRegInvalid if the addend cannot be lowered to a register. +// The offset is 0 if the addend can be lowered to a register. +func (m *machine) lowerAddendFromInstr(instr *ssa.Instruction) addend { + instr.MarkLowered() + switch op := instr.Opcode(); op { + case ssa.OpcodeIconst: + u64 := instr.ConstantVal() + if instr.Return().Type().Bits() == 32 { + return addend{regalloc.VRegInvalid, int64(int32(u64)), 0} // sign-extend. 
+ } else { + return addend{regalloc.VRegInvalid, int64(u64), 0} + } + case ssa.OpcodeUExtend, ssa.OpcodeSExtend: + input := instr.Arg() + inputDef := m.c.ValueDefinition(input) + if input.Type().Bits() != 32 { + panic("BUG: invalid input type " + input.Type().String()) + } + constInst := inputDef.IsFromInstr() && inputDef.Instr.Constant() + switch { + case constInst && op == ssa.OpcodeSExtend: + return addend{regalloc.VRegInvalid, int64(uint32(inputDef.Instr.ConstantVal())), 0} + case constInst && op == ssa.OpcodeUExtend: + return addend{regalloc.VRegInvalid, int64(int32(inputDef.Instr.ConstantVal())), 0} // sign-extend! + default: + r := m.getOperand_Reg(inputDef) + return addend{r.reg(), 0, 0} + } + case ssa.OpcodeIshl: + // If the addend is a shift, we can only handle it if the shift amount is a constant. + x, amount := instr.Arg2() + amountDef := m.c.ValueDefinition(amount) + if amountDef.IsFromInstr() && amountDef.Instr.Constant() && amountDef.Instr.ConstantVal() <= 3 { + r := m.getOperand_Reg(m.c.ValueDefinition(x)) + return addend{r.reg(), 0, uint8(amountDef.Instr.ConstantVal())} + } + r := m.getOperand_Reg(m.c.ValueDefinition(x)) + return addend{r.reg(), 0, 0} + } + panic("BUG: invalid opcode") +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go new file mode 100644 index 000000000..310ad2203 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go @@ -0,0 +1,3611 @@ +package amd64 + +import ( + "context" + "encoding/binary" + "fmt" + "math" + "strings" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" + 
"github.com/tetratelabs/wazero/internal/platform" +) + +// NewBackend returns a new backend for amd64. +func NewBackend() backend.Machine { + ectx := backend.NewExecutableContextT[instruction]( + resetInstruction, + setNext, + setPrev, + asNop, + ) + return &machine{ + ectx: ectx, + cpuFeatures: platform.CpuFeatures, + regAlloc: regalloc.NewAllocator(regInfo), + spillSlots: map[regalloc.VRegID]int64{}, + amodePool: wazevoapi.NewPool[amode](nil), + constSwizzleMaskConstIndex: -1, + constSqmulRoundSatIndex: -1, + constI8x16SHLMaskTableIndex: -1, + constI8x16LogicalSHRMaskTableIndex: -1, + constF64x2CvtFromIMaskIndex: -1, + constTwop52Index: -1, + constI32sMaxOnF64x2Index: -1, + constI32uMaxOnF64x2Index: -1, + constAllOnesI8x16Index: -1, + constAllOnesI16x8Index: -1, + constExtAddPairwiseI16x8uMask1Index: -1, + constExtAddPairwiseI16x8uMask2Index: -1, + } +} + +type ( + // machine implements backend.Machine for amd64. + machine struct { + c backend.Compiler + ectx *backend.ExecutableContextT[instruction] + stackBoundsCheckDisabled bool + + amodePool wazevoapi.Pool[amode] + + cpuFeatures platform.CpuFeatureFlags + + regAlloc regalloc.Allocator + regAllocFn *backend.RegAllocFunction[*instruction, *machine] + regAllocStarted bool + + spillSlotSize int64 + spillSlots map[regalloc.VRegID]int64 + currentABI *backend.FunctionABI + clobberedRegs []regalloc.VReg + + maxRequiredStackSizeForCalls int64 + + labelResolutionPends []labelResolutionPend + + jmpTableTargets [][]uint32 + consts []_const + + constSwizzleMaskConstIndex, constSqmulRoundSatIndex, + constI8x16SHLMaskTableIndex, constI8x16LogicalSHRMaskTableIndex, + constF64x2CvtFromIMaskIndex, constTwop52Index, + constI32sMaxOnF64x2Index, constI32uMaxOnF64x2Index, + constAllOnesI8x16Index, constAllOnesI16x8Index, + constExtAddPairwiseI16x8uMask1Index, constExtAddPairwiseI16x8uMask2Index int + } + + _const struct { + lo, hi uint64 + _var []byte + label *labelPosition + } + + labelResolutionPend struct { + instr *instruction 
+ instrOffset int64 + // imm32Offset is the offset of the last 4 bytes of the instruction. + imm32Offset int64 + } + + labelPosition = backend.LabelPosition[instruction] +) + +func (m *machine) getOrAllocateConstLabel(i *int, _var []byte) backend.Label { + index := *i + if index == -1 { + label := m.allocateLabel() + index = len(m.consts) + m.consts = append(m.consts, _const{ + _var: _var, + label: label, + }) + *i = index + } + return m.consts[index].label.L +} + +// Reset implements backend.Machine. +func (m *machine) Reset() { + m.consts = m.consts[:0] + m.clobberedRegs = m.clobberedRegs[:0] + for key := range m.spillSlots { + m.clobberedRegs = append(m.clobberedRegs, regalloc.VReg(key)) + } + for _, key := range m.clobberedRegs { + delete(m.spillSlots, regalloc.VRegID(key)) + } + + m.stackBoundsCheckDisabled = false + m.ectx.Reset() + + m.regAllocFn.Reset() + m.regAlloc.Reset() + m.regAllocStarted = false + m.clobberedRegs = m.clobberedRegs[:0] + + m.spillSlotSize = 0 + m.maxRequiredStackSizeForCalls = 0 + + m.amodePool.Reset() + m.jmpTableTargets = m.jmpTableTargets[:0] + m.constSwizzleMaskConstIndex = -1 + m.constSqmulRoundSatIndex = -1 + m.constI8x16SHLMaskTableIndex = -1 + m.constI8x16LogicalSHRMaskTableIndex = -1 + m.constF64x2CvtFromIMaskIndex = -1 + m.constTwop52Index = -1 + m.constI32sMaxOnF64x2Index = -1 + m.constI32uMaxOnF64x2Index = -1 + m.constAllOnesI8x16Index = -1 + m.constAllOnesI16x8Index = -1 + m.constExtAddPairwiseI16x8uMask1Index = -1 + m.constExtAddPairwiseI16x8uMask2Index = -1 +} + +// ExecutableContext implements backend.Machine. +func (m *machine) ExecutableContext() backend.ExecutableContext { return m.ectx } + +// DisableStackCheck implements backend.Machine. +func (m *machine) DisableStackCheck() { m.stackBoundsCheckDisabled = true } + +// SetCompiler implements backend.Machine. 
+func (m *machine) SetCompiler(c backend.Compiler) { + m.c = c + m.regAllocFn = backend.NewRegAllocFunction[*instruction, *machine](m, c.SSABuilder(), c) +} + +// SetCurrentABI implements backend.Machine. +func (m *machine) SetCurrentABI(abi *backend.FunctionABI) { + m.currentABI = abi +} + +// RegAlloc implements backend.Machine. +func (m *machine) RegAlloc() { + rf := m.regAllocFn + for _, pos := range m.ectx.OrderedBlockLabels { + rf.AddBlock(pos.SB, pos.L, pos.Begin, pos.End) + } + + m.regAllocStarted = true + m.regAlloc.DoAllocation(rf) + // Now that we know the final spill slot size, we must align spillSlotSize to 16 bytes. + m.spillSlotSize = (m.spillSlotSize + 15) &^ 15 +} + +// InsertReturn implements backend.Machine. +func (m *machine) InsertReturn() { + i := m.allocateInstr().asRet() + m.insert(i) +} + +// LowerSingleBranch implements backend.Machine. +func (m *machine) LowerSingleBranch(b *ssa.Instruction) { + ectx := m.ectx + switch b.Opcode() { + case ssa.OpcodeJump: + _, _, targetBlk := b.BranchData() + if b.IsFallthroughJump() { + return + } + jmp := m.allocateInstr() + target := ectx.GetOrAllocateSSABlockLabel(targetBlk) + if target == backend.LabelReturn { + jmp.asRet() + } else { + jmp.asJmp(newOperandLabel(target)) + } + m.insert(jmp) + case ssa.OpcodeBrTable: + index, target := b.BrTableData() + m.lowerBrTable(index, target) + default: + panic("BUG: unexpected branch opcode" + b.Opcode().String()) + } +} + +func (m *machine) addJmpTableTarget(targets []ssa.BasicBlock) (index int) { + // TODO: reuse the slice! 
+ labels := make([]uint32, len(targets)) + for j, target := range targets { + labels[j] = uint32(m.ectx.GetOrAllocateSSABlockLabel(target)) + } + index = len(m.jmpTableTargets) + m.jmpTableTargets = append(m.jmpTableTargets, labels) + return +} + +var condBranchMatches = [...]ssa.Opcode{ssa.OpcodeIcmp, ssa.OpcodeFcmp} + +func (m *machine) lowerBrTable(index ssa.Value, targets []ssa.BasicBlock) { + _v := m.getOperand_Reg(m.c.ValueDefinition(index)) + v := m.copyToTmp(_v.reg()) + + // First, we need to do the bounds check. + maxIndex := m.c.AllocateVReg(ssa.TypeI32) + m.lowerIconst(maxIndex, uint64(len(targets)-1), false) + cmp := m.allocateInstr().asCmpRmiR(true, newOperandReg(maxIndex), v, false) + m.insert(cmp) + + // Then do the conditional move maxIndex to v if v > maxIndex. + cmov := m.allocateInstr().asCmove(condNB, newOperandReg(maxIndex), v, false) + m.insert(cmov) + + // Now that v has the correct index. Load the address of the jump table into the addr. + addr := m.c.AllocateVReg(ssa.TypeI64) + leaJmpTableAddr := m.allocateInstr() + m.insert(leaJmpTableAddr) + + // Then add the target's offset into jmpTableAddr. + loadTargetOffsetFromJmpTable := m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, + // Shift by 3 because each entry is 8 bytes. + newOperandMem(m.newAmodeRegRegShift(0, addr, v, 3)), addr, true) + m.insert(loadTargetOffsetFromJmpTable) + + // Now ready to jump. + jmp := m.allocateInstr().asJmp(newOperandReg(addr)) + m.insert(jmp) + + jmpTableBegin, jmpTableBeginLabel := m.allocateBrTarget() + m.insert(jmpTableBegin) + leaJmpTableAddr.asLEA(newOperandLabel(jmpTableBeginLabel), addr) + + jmpTable := m.allocateInstr() + targetSliceIndex := m.addJmpTableTarget(targets) + jmpTable.asJmpTableSequence(targetSliceIndex, len(targets)) + m.insert(jmpTable) +} + +// LowerConditionalBranch implements backend.Machine. 
+func (m *machine) LowerConditionalBranch(b *ssa.Instruction) { + exctx := m.ectx + cval, args, targetBlk := b.BranchData() + if len(args) > 0 { + panic(fmt.Sprintf( + "conditional branch shouldn't have args; likely a bug in critical edge splitting: from %s to %s", + exctx.CurrentSSABlk, + targetBlk, + )) + } + + target := exctx.GetOrAllocateSSABlockLabel(targetBlk) + cvalDef := m.c.ValueDefinition(cval) + + switch m.c.MatchInstrOneOf(cvalDef, condBranchMatches[:]) { + case ssa.OpcodeIcmp: + cvalInstr := cvalDef.Instr + x, y, c := cvalInstr.IcmpData() + + cc := condFromSSAIntCmpCond(c) + if b.Opcode() == ssa.OpcodeBrz { + cc = cc.invert() + } + + // First, perform the comparison and set the flag. + xd, yd := m.c.ValueDefinition(x), m.c.ValueDefinition(y) + if !m.tryLowerBandToFlag(xd, yd) { + m.lowerIcmpToFlag(xd, yd, x.Type() == ssa.TypeI64) + } + + // Then perform the conditional branch. + m.insert(m.allocateInstr().asJmpIf(cc, newOperandLabel(target))) + cvalDef.Instr.MarkLowered() + case ssa.OpcodeFcmp: + cvalInstr := cvalDef.Instr + + f1, f2, and := m.lowerFcmpToFlags(cvalInstr) + isBrz := b.Opcode() == ssa.OpcodeBrz + if isBrz { + f1 = f1.invert() + } + if f2 == condInvalid { + m.insert(m.allocateInstr().asJmpIf(f1, newOperandLabel(target))) + } else { + if isBrz { + f2 = f2.invert() + and = !and + } + jmp1, jmp2 := m.allocateInstr(), m.allocateInstr() + m.insert(jmp1) + m.insert(jmp2) + notTaken, notTakenLabel := m.allocateBrTarget() + m.insert(notTaken) + if and { + jmp1.asJmpIf(f1.invert(), newOperandLabel(notTakenLabel)) + jmp2.asJmpIf(f2, newOperandLabel(target)) + } else { + jmp1.asJmpIf(f1, newOperandLabel(target)) + jmp2.asJmpIf(f2, newOperandLabel(target)) + } + } + + cvalDef.Instr.MarkLowered() + default: + v := m.getOperand_Reg(cvalDef) + + var cc cond + if b.Opcode() == ssa.OpcodeBrz { + cc = condZ + } else { + cc = condNZ + } + + // Perform test %v, %v to set the flag. 
+ cmp := m.allocateInstr().asCmpRmiR(false, v, v.reg(), false) + m.insert(cmp) + m.insert(m.allocateInstr().asJmpIf(cc, newOperandLabel(target))) + } +} + +// LowerInstr implements backend.Machine. +func (m *machine) LowerInstr(instr *ssa.Instruction) { + if l := instr.SourceOffset(); l.Valid() { + info := m.allocateInstr().asEmitSourceOffsetInfo(l) + m.insert(info) + } + + switch op := instr.Opcode(); op { + case ssa.OpcodeBrz, ssa.OpcodeBrnz, ssa.OpcodeJump, ssa.OpcodeBrTable: + panic("BUG: branching instructions are handled by LowerBranches") + case ssa.OpcodeReturn: + panic("BUG: return must be handled by backend.Compiler") + case ssa.OpcodeIconst, ssa.OpcodeF32const, ssa.OpcodeF64const: // Constant instructions are inlined. + case ssa.OpcodeCall, ssa.OpcodeCallIndirect: + m.lowerCall(instr) + case ssa.OpcodeStore, ssa.OpcodeIstore8, ssa.OpcodeIstore16, ssa.OpcodeIstore32: + m.lowerStore(instr) + case ssa.OpcodeIadd: + m.lowerAluRmiROp(instr, aluRmiROpcodeAdd) + case ssa.OpcodeIsub: + m.lowerAluRmiROp(instr, aluRmiROpcodeSub) + case ssa.OpcodeImul: + m.lowerAluRmiROp(instr, aluRmiROpcodeMul) + case ssa.OpcodeSdiv, ssa.OpcodeUdiv, ssa.OpcodeSrem, ssa.OpcodeUrem: + isDiv := op == ssa.OpcodeSdiv || op == ssa.OpcodeUdiv + isSigned := op == ssa.OpcodeSdiv || op == ssa.OpcodeSrem + m.lowerIDivRem(instr, isDiv, isSigned) + case ssa.OpcodeBand: + m.lowerAluRmiROp(instr, aluRmiROpcodeAnd) + case ssa.OpcodeBor: + m.lowerAluRmiROp(instr, aluRmiROpcodeOr) + case ssa.OpcodeBxor: + m.lowerAluRmiROp(instr, aluRmiROpcodeXor) + case ssa.OpcodeIshl: + m.lowerShiftR(instr, shiftROpShiftLeft) + case ssa.OpcodeSshr: + m.lowerShiftR(instr, shiftROpShiftRightArithmetic) + case ssa.OpcodeUshr: + m.lowerShiftR(instr, shiftROpShiftRightLogical) + case ssa.OpcodeRotl: + m.lowerShiftR(instr, shiftROpRotateLeft) + case ssa.OpcodeRotr: + m.lowerShiftR(instr, shiftROpRotateRight) + case ssa.OpcodeClz: + m.lowerClz(instr) + case ssa.OpcodeCtz: + m.lowerCtz(instr) + case ssa.OpcodePopcnt: + 
m.lowerUnaryRmR(instr, unaryRmROpcodePopcnt) + case ssa.OpcodeFadd, ssa.OpcodeFsub, ssa.OpcodeFmul, ssa.OpcodeFdiv: + m.lowerXmmRmR(instr) + case ssa.OpcodeFabs: + m.lowerFabsFneg(instr) + case ssa.OpcodeFneg: + m.lowerFabsFneg(instr) + case ssa.OpcodeCeil: + m.lowerRound(instr, roundingModeUp) + case ssa.OpcodeFloor: + m.lowerRound(instr, roundingModeDown) + case ssa.OpcodeTrunc: + m.lowerRound(instr, roundingModeZero) + case ssa.OpcodeNearest: + m.lowerRound(instr, roundingModeNearest) + case ssa.OpcodeFmin, ssa.OpcodeFmax: + m.lowerFminFmax(instr) + case ssa.OpcodeFcopysign: + m.lowerFcopysign(instr) + case ssa.OpcodeBitcast: + m.lowerBitcast(instr) + case ssa.OpcodeSqrt: + m.lowerSqrt(instr) + case ssa.OpcodeFpromote: + v := instr.Arg() + rn := m.getOperand_Reg(m.c.ValueDefinition(v)) + rd := m.c.VRegOf(instr.Return()) + cnt := m.allocateInstr() + cnt.asXmmUnaryRmR(sseOpcodeCvtss2sd, rn, rd) + m.insert(cnt) + case ssa.OpcodeFdemote: + v := instr.Arg() + rn := m.getOperand_Reg(m.c.ValueDefinition(v)) + rd := m.c.VRegOf(instr.Return()) + cnt := m.allocateInstr() + cnt.asXmmUnaryRmR(sseOpcodeCvtsd2ss, rn, rd) + m.insert(cnt) + case ssa.OpcodeFcvtToSint, ssa.OpcodeFcvtToSintSat: + x, ctx := instr.Arg2() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + ctxVReg := m.c.VRegOf(ctx) + m.lowerFcvtToSint(ctxVReg, rn.reg(), rd, x.Type() == ssa.TypeF64, + instr.Return().Type().Bits() == 64, op == ssa.OpcodeFcvtToSintSat) + case ssa.OpcodeFcvtToUint, ssa.OpcodeFcvtToUintSat: + x, ctx := instr.Arg2() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + ctxVReg := m.c.VRegOf(ctx) + m.lowerFcvtToUint(ctxVReg, rn.reg(), rd, x.Type() == ssa.TypeF64, + instr.Return().Type().Bits() == 64, op == ssa.OpcodeFcvtToUintSat) + case ssa.OpcodeFcvtFromSint: + x := instr.Arg() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := newOperandReg(m.c.VRegOf(instr.Return())) + m.lowerFcvtFromSint(rn, rd, + x.Type() == 
ssa.TypeI64, instr.Return().Type().Bits() == 64) + case ssa.OpcodeFcvtFromUint: + x := instr.Arg() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := newOperandReg(m.c.VRegOf(instr.Return())) + m.lowerFcvtFromUint(rn, rd, x.Type() == ssa.TypeI64, + instr.Return().Type().Bits() == 64) + case ssa.OpcodeVanyTrue: + m.lowerVanyTrue(instr) + case ssa.OpcodeVallTrue: + m.lowerVallTrue(instr) + case ssa.OpcodeVhighBits: + m.lowerVhighBits(instr) + case ssa.OpcodeVbnot: + m.lowerVbnot(instr) + case ssa.OpcodeVband: + x, y := instr.Arg2() + m.lowerVbBinOp(sseOpcodePand, x, y, instr.Return()) + case ssa.OpcodeVbor: + x, y := instr.Arg2() + m.lowerVbBinOp(sseOpcodePor, x, y, instr.Return()) + case ssa.OpcodeVbxor: + x, y := instr.Arg2() + m.lowerVbBinOp(sseOpcodePxor, x, y, instr.Return()) + case ssa.OpcodeVbandnot: + m.lowerVbandnot(instr, sseOpcodePandn) + case ssa.OpcodeVbitselect: + m.lowerVbitselect(instr) + case ssa.OpcodeVIadd: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePaddb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePaddw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePaddd + case ssa.VecLaneI64x2: + vecOp = sseOpcodePaddq + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVSaddSat: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePaddsb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePaddsw + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVUaddSat: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePaddusb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePaddusw + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVIsub: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePsubb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePsubw + case 
ssa.VecLaneI32x4: + vecOp = sseOpcodePsubd + case ssa.VecLaneI64x2: + vecOp = sseOpcodePsubq + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVSsubSat: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePsubsb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePsubsw + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVUsubSat: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePsubusb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePsubusw + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVImul: + m.lowerVImul(instr) + case ssa.OpcodeVIneg: + x, lane := instr.ArgWithLane() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePsubb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePsubw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePsubd + case ssa.VecLaneI64x2: + vecOp = sseOpcodePsubq + default: + panic("BUG") + } + + tmp := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asZeros(tmp)) + + i := m.allocateInstr() + i.asXmmRmR(vecOp, rn, tmp) + m.insert(i) + + m.copyTo(tmp, rd) + case ssa.OpcodeVFadd: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneF32x4: + vecOp = sseOpcodeAddps + case ssa.VecLaneF64x2: + vecOp = sseOpcodeAddpd + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVFsub: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneF32x4: + vecOp = sseOpcodeSubps + case ssa.VecLaneF64x2: + vecOp = sseOpcodeSubpd + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVFdiv: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneF32x4: + vecOp = sseOpcodeDivps + case ssa.VecLaneF64x2: + vecOp = sseOpcodeDivpd + } + 
m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVFmul: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneF32x4: + vecOp = sseOpcodeMulps + case ssa.VecLaneF64x2: + vecOp = sseOpcodeMulpd + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVFneg: + x, lane := instr.ArgWithLane() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + + tmp := m.c.AllocateVReg(ssa.TypeV128) + + var shiftOp, xorOp sseOpcode + var shiftAmt uint32 + switch lane { + case ssa.VecLaneF32x4: + shiftOp, shiftAmt, xorOp = sseOpcodePslld, 31, sseOpcodeXorps + case ssa.VecLaneF64x2: + shiftOp, shiftAmt, xorOp = sseOpcodePsllq, 63, sseOpcodeXorpd + } + + zero := m.allocateInstr() + zero.asZeros(tmp) + m.insert(zero) + + // Set all bits on tmp by CMPPD with arg=0 (== pseudo CMPEQPD instruction). + // See https://www.felixcloutier.com/x86/cmpps + // + // Note: if we do not clear all the bits ^ with XORPS, this might end up not setting ones on some lane + // if the lane is NaN. + cmp := m.allocateInstr() + cmp.asXmmRmRImm(sseOpcodeCmppd, uint8(cmpPredEQ_UQ), newOperandReg(tmp), tmp) + m.insert(cmp) + + // Do the left shift on each lane to set only the most significant bit in each. + i := m.allocateInstr() + i.asXmmRmiReg(shiftOp, newOperandImm32(shiftAmt), tmp) + m.insert(i) + + // Get the negated result by XOR on each lane with tmp. 
+ i = m.allocateInstr() + i.asXmmRmR(xorOp, rn, tmp) + m.insert(i) + + m.copyTo(tmp, rd) + + case ssa.OpcodeVSqrt: + x, lane := instr.ArgWithLane() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + + var vecOp sseOpcode + switch lane { + case ssa.VecLaneF32x4: + vecOp = sseOpcodeSqrtps + case ssa.VecLaneF64x2: + vecOp = sseOpcodeSqrtpd + } + i := m.allocateInstr() + i.asXmmUnaryRmR(vecOp, rn, rd) + m.insert(i) + + case ssa.OpcodeVImin: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePminsb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePminsw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePminsd + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVUmin: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePminub + case ssa.VecLaneI16x8: + vecOp = sseOpcodePminuw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePminud + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVImax: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePmaxsb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePmaxsw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePmaxsd + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVUmax: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePmaxub + case ssa.VecLaneI16x8: + vecOp = sseOpcodePmaxuw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePmaxud + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVAvgRound: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePavgb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePavgw + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVIcmp: + x, y, c, lane := instr.VIcmpData() + 
+ m.lowerVIcmp(x, y, c, instr.Return(), lane) + + case ssa.OpcodeVFcmp: + x, y, c, lane := instr.VFcmpData() + m.lowerVFcmp(x, y, c, instr.Return(), lane) + + case ssa.OpcodeExtractlane: + x, index, signed, lane := instr.ExtractlaneData() + m.lowerExtractLane(x, index, signed, instr.Return(), lane) + + case ssa.OpcodeInsertlane: + x, y, index, lane := instr.InsertlaneData() + m.lowerInsertLane(x, y, index, instr.Return(), lane) + + case ssa.OpcodeSwizzle: + x, y, _ := instr.Arg2WithLane() + m.lowerSwizzle(x, y, instr.Return()) + + case ssa.OpcodeShuffle: + x, y, lo, hi := instr.ShuffleData() + m.lowerShuffle(x, y, lo, hi, instr.Return()) + + case ssa.OpcodeSplat: + x, lane := instr.ArgWithLane() + m.lowerSplat(x, instr.Return(), lane) + + case ssa.OpcodeSqmulRoundSat: + x, y := instr.Arg2() + m.lowerSqmulRoundSat(x, y, instr.Return()) + + case ssa.OpcodeVZeroExtLoad: + ptr, offset, typ := instr.VZeroExtLoadData() + var sseOp sseOpcode + // Both movss and movsd clear the higher bits of the destination register up to 128 bits.
+ // https://www.felixcloutier.com/x86/movss + // https://www.felixcloutier.com/x86/movsd + if typ == ssa.TypeF32 { + sseOp = sseOpcodeMovss + } else { + sseOp = sseOpcodeMovsd + } + mem := m.lowerToAddressMode(ptr, offset) + dst := m.c.VRegOf(instr.Return()) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, newOperandMem(mem), dst)) + + case ssa.OpcodeVMinPseudo: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneF32x4: + vecOp = sseOpcodeMinps + case ssa.VecLaneF64x2: + vecOp = sseOpcodeMinpd + default: + panic("BUG: unexpected lane type") + } + m.lowerVbBinOpUnaligned(vecOp, y, x, instr.Return()) + + case ssa.OpcodeVMaxPseudo: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneF32x4: + vecOp = sseOpcodeMaxps + case ssa.VecLaneF64x2: + vecOp = sseOpcodeMaxpd + default: + panic("BUG: unexpected lane type") + } + m.lowerVbBinOpUnaligned(vecOp, y, x, instr.Return()) + + case ssa.OpcodeVIshl: + x, y, lane := instr.Arg2WithLane() + m.lowerVIshl(x, y, instr.Return(), lane) + + case ssa.OpcodeVSshr: + x, y, lane := instr.Arg2WithLane() + m.lowerVSshr(x, y, instr.Return(), lane) + + case ssa.OpcodeVUshr: + x, y, lane := instr.Arg2WithLane() + m.lowerVUshr(x, y, instr.Return(), lane) + + case ssa.OpcodeVCeil: + x, lane := instr.ArgWithLane() + m.lowerVRound(x, instr.Return(), 0x2, lane == ssa.VecLaneF64x2) + + case ssa.OpcodeVFloor: + x, lane := instr.ArgWithLane() + m.lowerVRound(x, instr.Return(), 0x1, lane == ssa.VecLaneF64x2) + + case ssa.OpcodeVTrunc: + x, lane := instr.ArgWithLane() + m.lowerVRound(x, instr.Return(), 0x3, lane == ssa.VecLaneF64x2) + + case ssa.OpcodeVNearest: + x, lane := instr.ArgWithLane() + m.lowerVRound(x, instr.Return(), 0x0, lane == ssa.VecLaneF64x2) + + case ssa.OpcodeExtIaddPairwise: + x, lane, signed := instr.ExtIaddPairwiseData() + m.lowerExtIaddPairwise(x, instr.Return(), lane, signed) + + case ssa.OpcodeUwidenLow, ssa.OpcodeSwidenLow: + x, lane := 
instr.ArgWithLane() + m.lowerWidenLow(x, instr.Return(), lane, op == ssa.OpcodeSwidenLow) + + case ssa.OpcodeUwidenHigh, ssa.OpcodeSwidenHigh: + x, lane := instr.ArgWithLane() + m.lowerWidenHigh(x, instr.Return(), lane, op == ssa.OpcodeSwidenHigh) + + case ssa.OpcodeLoadSplat: + ptr, offset, lane := instr.LoadSplatData() + m.lowerLoadSplat(ptr, offset, instr.Return(), lane) + + case ssa.OpcodeVFcvtFromUint, ssa.OpcodeVFcvtFromSint: + x, lane := instr.ArgWithLane() + m.lowerVFcvtFromInt(x, instr.Return(), lane, op == ssa.OpcodeVFcvtFromSint) + + case ssa.OpcodeVFcvtToSintSat, ssa.OpcodeVFcvtToUintSat: + x, lane := instr.ArgWithLane() + m.lowerVFcvtToIntSat(x, instr.Return(), lane, op == ssa.OpcodeVFcvtToSintSat) + + case ssa.OpcodeSnarrow, ssa.OpcodeUnarrow: + x, y, lane := instr.Arg2WithLane() + m.lowerNarrow(x, y, instr.Return(), lane, op == ssa.OpcodeSnarrow) + + case ssa.OpcodeFvpromoteLow: + x := instr.Arg() + src := m.getOperand_Reg(m.c.ValueDefinition(x)) + dst := m.c.VRegOf(instr.Return()) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtps2pd, src, dst)) + + case ssa.OpcodeFvdemote: + x := instr.Arg() + src := m.getOperand_Reg(m.c.ValueDefinition(x)) + dst := m.c.VRegOf(instr.Return()) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtpd2ps, src, dst)) + + case ssa.OpcodeWideningPairwiseDotProductS: + x, y := instr.Arg2() + m.lowerWideningPairwiseDotProductS(x, y, instr.Return()) + + case ssa.OpcodeVIabs: + m.lowerVIabs(instr) + case ssa.OpcodeVIpopcnt: + m.lowerVIpopcnt(instr) + case ssa.OpcodeVFmin: + m.lowerVFmin(instr) + case ssa.OpcodeVFmax: + m.lowerVFmax(instr) + case ssa.OpcodeVFabs: + m.lowerVFabs(instr) + case ssa.OpcodeUndefined: + m.insert(m.allocateInstr().asUD2()) + case ssa.OpcodeExitWithCode: + execCtx, code := instr.ExitWithCodeData() + m.lowerExitWithCode(m.c.VRegOf(execCtx), code) + case ssa.OpcodeExitIfTrueWithCode: + execCtx, c, code := instr.ExitIfTrueWithCodeData() + m.lowerExitIfTrueWithCode(m.c.VRegOf(execCtx), c, code) + 
case ssa.OpcodeLoad: + ptr, offset, typ := instr.LoadData() + dst := m.c.VRegOf(instr.Return()) + m.lowerLoad(ptr, offset, typ, dst) + case ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32, ssa.OpcodeSload8, ssa.OpcodeSload16, ssa.OpcodeSload32: + ptr, offset, _ := instr.LoadData() + ret := m.c.VRegOf(instr.Return()) + m.lowerExtLoad(op, ptr, offset, ret) + case ssa.OpcodeVconst: + result := m.c.VRegOf(instr.Return()) + lo, hi := instr.VconstData() + m.lowerVconst(result, lo, hi) + case ssa.OpcodeSExtend, ssa.OpcodeUExtend: + from, to, signed := instr.ExtendData() + m.lowerExtend(instr.Arg(), instr.Return(), from, to, signed) + case ssa.OpcodeIcmp: + m.lowerIcmp(instr) + case ssa.OpcodeFcmp: + m.lowerFcmp(instr) + case ssa.OpcodeSelect: + cval, x, y := instr.SelectData() + m.lowerSelect(x, y, cval, instr.Return()) + case ssa.OpcodeIreduce: + rn := m.getOperand_Mem_Reg(m.c.ValueDefinition(instr.Arg())) + retVal := instr.Return() + rd := m.c.VRegOf(retVal) + + if retVal.Type() != ssa.TypeI32 { + panic("TODO?: Ireduce to non-i32") + } + m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, rn, rd)) + + case ssa.OpcodeAtomicLoad: + ptr := instr.Arg() + size := instr.AtomicTargetSize() + dst := m.c.VRegOf(instr.Return()) + + // At this point, the ptr is ensured to be aligned, so using a normal load is atomic. 
+ // https://github.com/golang/go/blob/adead1a93f472affa97c494ef19f2f492ee6f34a/src/runtime/internal/atomic/atomic_amd64.go#L30 + mem := newOperandMem(m.lowerToAddressMode(ptr, 0)) + load := m.allocateInstr() + switch size { + case 8: + load.asMov64MR(mem, dst) + case 4: + load.asMovzxRmR(extModeLQ, mem, dst) + case 2: + load.asMovzxRmR(extModeWQ, mem, dst) + case 1: + load.asMovzxRmR(extModeBQ, mem, dst) + default: + panic("BUG") + } + m.insert(load) + + case ssa.OpcodeFence: + m.insert(m.allocateInstr().asMFence()) + + case ssa.OpcodeAtomicStore: + ptr, _val := instr.Arg2() + size := instr.AtomicTargetSize() + + val := m.getOperand_Reg(m.c.ValueDefinition(_val)) + // The content on the val register will be overwritten by xchg, so we need to copy it to a temporary register. + copied := m.copyToTmp(val.reg()) + + mem := newOperandMem(m.lowerToAddressMode(ptr, 0)) + store := m.allocateInstr().asXCHG(copied, mem, byte(size)) + m.insert(store) + + case ssa.OpcodeAtomicCas: + addr, exp, repl := instr.Arg3() + size := instr.AtomicTargetSize() + m.lowerAtomicCas(addr, exp, repl, size, instr.Return()) + + case ssa.OpcodeAtomicRmw: + addr, val := instr.Arg2() + atomicOp, size := instr.AtomicRmwData() + m.lowerAtomicRmw(atomicOp, addr, val, size, instr.Return()) + + default: + panic("TODO: lowering " + op.String()) + } +} + +func (m *machine) lowerAtomicRmw(op ssa.AtomicRmwOp, addr, val ssa.Value, size uint64, ret ssa.Value) { + mem := m.lowerToAddressMode(addr, 0) + _val := m.getOperand_Reg(m.c.ValueDefinition(val)) + + switch op { + case ssa.AtomicRmwOpAdd, ssa.AtomicRmwOpSub: + valCopied := m.copyToTmp(_val.reg()) + if op == ssa.AtomicRmwOpSub { + // Negate the value. 
+ m.insert(m.allocateInstr().asNeg(newOperandReg(valCopied), true)) + } + m.insert(m.allocateInstr().asLockXAdd(valCopied, mem, byte(size))) + m.clearHigherBitsForAtomic(valCopied, size, ret.Type()) + m.copyTo(valCopied, m.c.VRegOf(ret)) + + case ssa.AtomicRmwOpAnd, ssa.AtomicRmwOpOr, ssa.AtomicRmwOpXor: + accumulator := raxVReg + // Reserve rax for the accumulator to make regalloc happy. + // Note: do this initialization before defining valCopied, because it might be the same register and + // if that happens, the unnecessary load/store will be performed inside the loop. + // This can be mitigated in any way once the register allocator is clever enough. + m.insert(m.allocateInstr().asDefineUninitializedReg(accumulator)) + + // Copy the value to a temporary register. + valCopied := m.copyToTmp(_val.reg()) + m.clearHigherBitsForAtomic(valCopied, size, ret.Type()) + + memOp := newOperandMem(mem) + tmp := m.c.AllocateVReg(ssa.TypeI64) + beginLoop, beginLoopLabel := m.allocateBrTarget() + { + m.insert(beginLoop) + // Reset the value on tmp by the original value. + m.copyTo(valCopied, tmp) + // Load the current value at the memory location into accumulator. + switch size { + case 1: + m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, memOp, accumulator)) + case 2: + m.insert(m.allocateInstr().asMovzxRmR(extModeWQ, memOp, accumulator)) + case 4: + m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, memOp, accumulator)) + case 8: + m.insert(m.allocateInstr().asMov64MR(memOp, accumulator)) + default: + panic("BUG") + } + // Then perform the logical operation on the accumulator and the value on tmp. 
+ switch op { + case ssa.AtomicRmwOpAnd: + m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, newOperandReg(accumulator), tmp, true)) + case ssa.AtomicRmwOpOr: + m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeOr, newOperandReg(accumulator), tmp, true)) + case ssa.AtomicRmwOpXor: + m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeXor, newOperandReg(accumulator), tmp, true)) + default: + panic("BUG") + } + // Finally, try compare-exchange the value at the memory location with the tmp. + m.insert(m.allocateInstr().asLockCmpXCHG(tmp, memOp.addressMode(), byte(size))) + // If it succeeds, ZF will be set, and we can break the loop. + m.insert(m.allocateInstr().asJmpIf(condNZ, newOperandLabel(beginLoopLabel))) + } + + // valCopied must be alive at the end of the loop. + m.insert(m.allocateInstr().asNopUseReg(valCopied)) + + // At this point, accumulator contains the result. + m.clearHigherBitsForAtomic(accumulator, size, ret.Type()) + m.copyTo(accumulator, m.c.VRegOf(ret)) + + case ssa.AtomicRmwOpXchg: + valCopied := m.copyToTmp(_val.reg()) + + m.insert(m.allocateInstr().asXCHG(valCopied, newOperandMem(mem), byte(size))) + m.clearHigherBitsForAtomic(valCopied, size, ret.Type()) + m.copyTo(valCopied, m.c.VRegOf(ret)) + + default: + panic("BUG") + } +} + +func (m *machine) lowerAtomicCas(addr, exp, repl ssa.Value, size uint64, ret ssa.Value) { + mem := m.lowerToAddressMode(addr, 0) + expOp := m.getOperand_Reg(m.c.ValueDefinition(exp)) + replOp := m.getOperand_Reg(m.c.ValueDefinition(repl)) + + accumulator := raxVReg + m.copyTo(expOp.reg(), accumulator) + m.insert(m.allocateInstr().asLockCmpXCHG(replOp.reg(), mem, byte(size))) + m.clearHigherBitsForAtomic(accumulator, size, ret.Type()) + m.copyTo(accumulator, m.c.VRegOf(ret)) +} + +func (m *machine) clearHigherBitsForAtomic(r regalloc.VReg, valSize uint64, resultType ssa.Type) { + switch resultType { + case ssa.TypeI32: + switch valSize { + case 1: + m.insert(m.allocateInstr().asMovzxRmR(extModeBL, newOperandReg(r), 
r)) + case 2: + m.insert(m.allocateInstr().asMovzxRmR(extModeWL, newOperandReg(r), r)) + } + case ssa.TypeI64: + switch valSize { + case 1: + m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(r), r)) + case 2: + m.insert(m.allocateInstr().asMovzxRmR(extModeWQ, newOperandReg(r), r)) + case 4: + m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, newOperandReg(r), r)) + } + } +} + +func (m *machine) lowerFcmp(instr *ssa.Instruction) { + f1, f2, and := m.lowerFcmpToFlags(instr) + rd := m.c.VRegOf(instr.Return()) + if f2 == condInvalid { + tmp := m.c.AllocateVReg(ssa.TypeI32) + m.insert(m.allocateInstr().asSetcc(f1, tmp)) + // On amd64, setcc only sets the first byte of the register, so we need to zero extend it to match + // the semantics of Icmp that sets either 0 or 1. + m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(tmp), rd)) + } else { + tmp1, tmp2 := m.c.AllocateVReg(ssa.TypeI32), m.c.AllocateVReg(ssa.TypeI32) + m.insert(m.allocateInstr().asSetcc(f1, tmp1)) + m.insert(m.allocateInstr().asSetcc(f2, tmp2)) + var op aluRmiROpcode + if and { + op = aluRmiROpcodeAnd + } else { + op = aluRmiROpcodeOr + } + m.insert(m.allocateInstr().asAluRmiR(op, newOperandReg(tmp1), tmp2, false)) + m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(tmp2), rd)) + } +} + +func (m *machine) lowerIcmp(instr *ssa.Instruction) { + x, y, c := instr.IcmpData() + m.lowerIcmpToFlag(m.c.ValueDefinition(x), m.c.ValueDefinition(y), x.Type() == ssa.TypeI64) + rd := m.c.VRegOf(instr.Return()) + tmp := m.c.AllocateVReg(ssa.TypeI32) + m.insert(m.allocateInstr().asSetcc(condFromSSAIntCmpCond(c), tmp)) + // On amd64, setcc only sets the first byte of the register, so we need to zero extend it to match + // the semantics of Icmp that sets either 0 or 1. 
+ m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(tmp), rd)) +} + +// lowerSelect lowers a select whose result ret takes x when the condition derived from cval holds, and y otherwise. +// y is first moved into a temporary, x is conditionally moved over it, and the temporary is then copied to ret's register. +// When cval is produced by a matching Icmp, the comparison is lowered straight to flags; otherwise cval is tested against itself. +func (m *machine) lowerSelect(x, y, cval, ret ssa.Value) { + xo, yo := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)), m.getOperand_Reg(m.c.ValueDefinition(y)) + rd := m.c.VRegOf(ret) + + var cond cond + cvalDef := m.c.ValueDefinition(cval) + switch m.c.MatchInstrOneOf(cvalDef, condBranchMatches[:]) { + case ssa.OpcodeIcmp: + icmp := cvalDef.Instr + xc, yc, cc := icmp.IcmpData() + m.lowerIcmpToFlag(m.c.ValueDefinition(xc), m.c.ValueDefinition(yc), xc.Type() == ssa.TypeI64) + cond = condFromSSAIntCmpCond(cc) + // The comparison is consumed through the flags set above, so mark the Icmp as lowered, + // in the same way LowerConditionalBranch does for its fused comparison. + icmp.MarkLowered() + default: // TODO: match ssa.OpcodeFcmp for optimization, but seems a bit complex. + cv := m.getOperand_Reg(cvalDef) + test := m.allocateInstr().asCmpRmiR(false, cv, cv.reg(), false) + m.insert(test) + cond = condNZ + } + + if typ := x.Type(); typ.IsInt() { + _64 := typ.Bits() == 64 + mov := m.allocateInstr() + tmp := m.c.AllocateVReg(typ) + switch yo.kind { + case operandKindReg: + mov.asMovRR(yo.reg(), tmp, _64) + case operandKindMem: + if _64 { + mov.asMov64MR(yo, tmp) + } else { + mov.asMovzxRmR(extModeLQ, yo, tmp) + } + default: + panic("BUG") + } + m.insert(mov) + cmov := m.allocateInstr().asCmove(cond, xo, tmp, _64) + m.insert(cmov) + m.insert(m.allocateInstr().asMovRR(tmp, rd, _64)) + } else { + mov := m.allocateInstr() + tmp := m.c.AllocateVReg(typ) + switch typ { + case ssa.TypeF32: + mov.asXmmUnaryRmR(sseOpcodeMovss, yo, tmp) + case ssa.TypeF64: + mov.asXmmUnaryRmR(sseOpcodeMovsd, yo, tmp) + case ssa.TypeV128: + mov.asXmmUnaryRmR(sseOpcodeMovdqu, yo, tmp) + default: + panic("BUG") + } + m.insert(mov) + + cmov := m.allocateInstr().asXmmCMov(cond, xo, tmp, typ.Size()) + m.insert(cmov) + + m.copyTo(tmp, rd) + } +} + +func (m *machine) lowerXmmCmovAfterRegAlloc(i *instruction) { + x := i.op1 + rd := i.op2.reg() + cond := cond(i.u1) + + jcc := m.allocateInstr() + m.insert(jcc) + + mov := m.allocateInstr() + switch i.u2 { + case 4: +
mov.asXmmUnaryRmR(sseOpcodeMovss, x, rd) + case 8: + mov.asXmmUnaryRmR(sseOpcodeMovsd, x, rd) + case 16: + mov.asXmmUnaryRmR(sseOpcodeMovdqu, x, rd) + default: + panic("BUG") + } + m.insert(mov) + + nop, end := m.allocateBrTarget() + m.insert(nop) + jcc.asJmpIf(cond.invert(), newOperandLabel(end)) +} + +func (m *machine) lowerExtend(_arg, ret ssa.Value, from, to byte, signed bool) { + rd0 := m.c.VRegOf(ret) + arg := m.getOperand_Mem_Reg(m.c.ValueDefinition(_arg)) + + rd := m.c.AllocateVReg(ret.Type()) + + ext := m.allocateInstr() + switch { + case from == 8 && to == 16 && signed: + ext.asMovsxRmR(extModeBQ, arg, rd) + case from == 8 && to == 16 && !signed: + ext.asMovzxRmR(extModeBL, arg, rd) + case from == 8 && to == 32 && signed: + ext.asMovsxRmR(extModeBL, arg, rd) + case from == 8 && to == 32 && !signed: + ext.asMovzxRmR(extModeBQ, arg, rd) + case from == 8 && to == 64 && signed: + ext.asMovsxRmR(extModeBQ, arg, rd) + case from == 8 && to == 64 && !signed: + ext.asMovzxRmR(extModeBQ, arg, rd) + case from == 16 && to == 32 && signed: + ext.asMovsxRmR(extModeWL, arg, rd) + case from == 16 && to == 32 && !signed: + ext.asMovzxRmR(extModeWL, arg, rd) + case from == 16 && to == 64 && signed: + ext.asMovsxRmR(extModeWQ, arg, rd) + case from == 16 && to == 64 && !signed: + ext.asMovzxRmR(extModeWQ, arg, rd) + case from == 32 && to == 64 && signed: + ext.asMovsxRmR(extModeLQ, arg, rd) + case from == 32 && to == 64 && !signed: + ext.asMovzxRmR(extModeLQ, arg, rd) + default: + panic(fmt.Sprintf("BUG: unhandled extend: from=%d, to=%d, signed=%t", from, to, signed)) + } + m.insert(ext) + + m.copyTo(rd, rd0) +} + +func (m *machine) lowerVconst(dst regalloc.VReg, lo, hi uint64) { + if lo == 0 && hi == 0 { + m.insert(m.allocateInstr().asZeros(dst)) + return + } + + load := m.allocateInstr() + constLabel := m.allocateLabel() + m.consts = append(m.consts, _const{label: constLabel, lo: lo, hi: hi}) + load.asXmmUnaryRmR(sseOpcodeMovdqu, 
newOperandMem(m.newAmodeRipRel(constLabel.L)), dst) + m.insert(load) +} + +func (m *machine) lowerCtz(instr *ssa.Instruction) { + if m.cpuFeatures.HasExtra(platform.CpuExtraFeatureAmd64ABM) { + m.lowerUnaryRmR(instr, unaryRmROpcodeTzcnt) + } else { + // On processors that do not support TZCNT, the BSF instruction is + // executed instead. The key difference between TZCNT and BSF + // instruction is that if source operand is zero, the content of + // destination operand is undefined. + // https://www.felixcloutier.com/x86/tzcnt.html + + x := instr.Arg() + if !x.Type().IsInt() { + panic("BUG?") + } + _64 := x.Type().Bits() == 64 + + xDef := m.c.ValueDefinition(x) + tmp := m.c.AllocateVReg(x.Type()) + rm := m.getOperand_Reg(xDef) + + // First, we have to check if the target is non-zero. + test := m.allocateInstr() + test.asCmpRmiR(false, rm, rm.reg(), _64) + m.insert(test) + + jmpNz := m.allocateInstr() + m.insert(jmpNz) + + // If the value is zero, we just push the const value. + m.lowerIconst(tmp, uint64(x.Type().Bits()), _64) + + // Now jump right after the non-zero case. + jmpAtEnd := m.allocateInstr() + m.insert(jmpAtEnd) + + // jmpNz target label is set here. + nop, nz := m.allocateBrTarget() + jmpNz.asJmpIf(condNZ, newOperandLabel(nz)) + m.insert(nop) + + // Emit the non-zero case. + bsr := m.allocateInstr() + bsr.asUnaryRmR(unaryRmROpcodeBsf, rm, tmp, _64) + m.insert(bsr) + + // jmpAtEnd target label is set here. + nopEnd, end := m.allocateBrTarget() + jmpAtEnd.asJmp(newOperandLabel(end)) + m.insert(nopEnd) + + m.copyTo(tmp, m.c.VRegOf(instr.Return())) + } +} + +func (m *machine) lowerClz(instr *ssa.Instruction) { + if m.cpuFeatures.HasExtra(platform.CpuExtraFeatureAmd64ABM) { + m.lowerUnaryRmR(instr, unaryRmROpcodeLzcnt) + } else { + // On processors that do not support LZCNT, we combine BSR (calculating + // most significant set bit) with XOR. 
This logic is described in + // "Replace Raw Assembly Code with Builtin Intrinsics" section in: + // https://developer.apple.com/documentation/apple-silicon/addressing-architectural-differences-in-your-macos-code. + + x := instr.Arg() + if !x.Type().IsInt() { + panic("BUG?") + } + _64 := x.Type().Bits() == 64 + + xDef := m.c.ValueDefinition(x) + rm := m.getOperand_Reg(xDef) + tmp := m.c.AllocateVReg(x.Type()) + + // First, we have to check if the rm is non-zero as BSR is undefined + // on zero. See https://www.felixcloutier.com/x86/bsr. + test := m.allocateInstr() + test.asCmpRmiR(false, rm, rm.reg(), _64) + m.insert(test) + + jmpNz := m.allocateInstr() + m.insert(jmpNz) + + // If the value is zero, we just push the const value. + m.lowerIconst(tmp, uint64(x.Type().Bits()), _64) + + // Now jump right after the non-zero case. + jmpAtEnd := m.allocateInstr() + m.insert(jmpAtEnd) + + // jmpNz target label is set here. + nop, nz := m.allocateBrTarget() + jmpNz.asJmpIf(condNZ, newOperandLabel(nz)) + m.insert(nop) + + // Emit the non-zero case. + bsr := m.allocateInstr() + bsr.asUnaryRmR(unaryRmROpcodeBsr, rm, tmp, _64) + m.insert(bsr) + + // Now we XOR the value with the bit length minus one. + xor := m.allocateInstr() + xor.asAluRmiR(aluRmiROpcodeXor, newOperandImm32(uint32(x.Type().Bits()-1)), tmp, _64) + m.insert(xor) + + // jmpAtEnd target label is set here. 
+ nopEnd, end := m.allocateBrTarget() + jmpAtEnd.asJmp(newOperandLabel(end)) + m.insert(nopEnd) + + m.copyTo(tmp, m.c.VRegOf(instr.Return())) + } +} + +func (m *machine) lowerUnaryRmR(si *ssa.Instruction, op unaryRmROpcode) { + x := si.Arg() + if !x.Type().IsInt() { + panic("BUG?") + } + _64 := x.Type().Bits() == 64 + + xDef := m.c.ValueDefinition(x) + rm := m.getOperand_Mem_Reg(xDef) + rd := m.c.VRegOf(si.Return()) + + instr := m.allocateInstr() + instr.asUnaryRmR(op, rm, rd, _64) + m.insert(instr) +} + +func (m *machine) lowerLoad(ptr ssa.Value, offset uint32, typ ssa.Type, dst regalloc.VReg) { + mem := newOperandMem(m.lowerToAddressMode(ptr, offset)) + load := m.allocateInstr() + switch typ { + case ssa.TypeI32: + load.asMovzxRmR(extModeLQ, mem, dst) + case ssa.TypeI64: + load.asMov64MR(mem, dst) + case ssa.TypeF32: + load.asXmmUnaryRmR(sseOpcodeMovss, mem, dst) + case ssa.TypeF64: + load.asXmmUnaryRmR(sseOpcodeMovsd, mem, dst) + case ssa.TypeV128: + load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, dst) + default: + panic("BUG") + } + m.insert(load) +} + +func (m *machine) lowerExtLoad(op ssa.Opcode, ptr ssa.Value, offset uint32, dst regalloc.VReg) { + mem := newOperandMem(m.lowerToAddressMode(ptr, offset)) + load := m.allocateInstr() + switch op { + case ssa.OpcodeUload8: + load.asMovzxRmR(extModeBQ, mem, dst) + case ssa.OpcodeUload16: + load.asMovzxRmR(extModeWQ, mem, dst) + case ssa.OpcodeUload32: + load.asMovzxRmR(extModeLQ, mem, dst) + case ssa.OpcodeSload8: + load.asMovsxRmR(extModeBQ, mem, dst) + case ssa.OpcodeSload16: + load.asMovsxRmR(extModeWQ, mem, dst) + case ssa.OpcodeSload32: + load.asMovsxRmR(extModeLQ, mem, dst) + default: + panic("BUG") + } + m.insert(load) +} + +func (m *machine) lowerExitIfTrueWithCode(execCtx regalloc.VReg, cond ssa.Value, code wazevoapi.ExitCode) { + condDef := m.c.ValueDefinition(cond) + if !m.c.MatchInstr(condDef, ssa.OpcodeIcmp) { + panic("TODO: ExitIfTrue must come after Icmp at the moment: " + condDef.Instr.Opcode().String()) 
+ } + cvalInstr := condDef.Instr + cvalInstr.MarkLowered() + + // We need to copy the execution context to a temp register, because if it's spilled, + // it might end up being reloaded inside the exiting branch. + execCtxTmp := m.copyToTmp(execCtx) + + x, y, c := cvalInstr.IcmpData() + xx, yy := m.c.ValueDefinition(x), m.c.ValueDefinition(y) + if !m.tryLowerBandToFlag(xx, yy) { + m.lowerIcmpToFlag(xx, yy, x.Type() == ssa.TypeI64) + } + + jmpIf := m.allocateInstr() + m.insert(jmpIf) + l := m.lowerExitWithCode(execCtxTmp, code) + jmpIf.asJmpIf(condFromSSAIntCmpCond(c).invert(), newOperandLabel(l)) +} + +func (m *machine) tryLowerBandToFlag(x, y *backend.SSAValueDefinition) (ok bool) { + var target *backend.SSAValueDefinition + if x.IsFromInstr() && x.Instr.Constant() && x.Instr.ConstantVal() == 0 { + if m.c.MatchInstr(y, ssa.OpcodeBand) { + target = y + } + } + + if y.IsFromInstr() && y.Instr.Constant() && y.Instr.ConstantVal() == 0 { + if m.c.MatchInstr(x, ssa.OpcodeBand) { + target = x + } + } + + if target == nil { + return false + } + + bandInstr := target.Instr + bandX, bandY := bandInstr.Arg2() + + xx := m.getOperand_Reg(m.c.ValueDefinition(bandX)) + yy := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(bandY)) + test := m.allocateInstr().asCmpRmiR(false, yy, xx.reg(), bandX.Type() == ssa.TypeI64) + m.insert(test) + bandInstr.MarkLowered() + return true +} + +func (m *machine) allocateExitInstructions(execCtx, exitCodeReg regalloc.VReg) (saveRsp, saveRbp, setExitCode *instruction) { + saveRsp = m.allocateInstr().asMovRM( + rspVReg, + newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.U32(), execCtx)), + 8, + ) + + saveRbp = m.allocateInstr().asMovRM( + rbpVReg, + newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetFramePointerBeforeGoCall.U32(), execCtx)), + 8, + ) + setExitCode = m.allocateInstr().asMovRM( + exitCodeReg, + newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetExitCodeOffset.U32(), 
execCtx)), + 4, + ) + return +} + +func (m *machine) lowerExitWithCode(execCtx regalloc.VReg, code wazevoapi.ExitCode) (afterLabel backend.Label) { + exitCodeReg := rbpVReg + saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtx, exitCodeReg) + + // Set save RSP, RBP, and write exit code. + m.insert(saveRsp) + m.insert(saveRbp) + m.lowerIconst(exitCodeReg, uint64(code), false) + m.insert(setExitCode) + + ripReg := rbpVReg + + // Next is to save the current address for stack unwinding. + nop, currentAddrLabel := m.allocateBrTarget() + m.insert(nop) + readRip := m.allocateInstr().asLEA(newOperandLabel(currentAddrLabel), ripReg) + m.insert(readRip) + saveRip := m.allocateInstr().asMovRM( + ripReg, + newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetGoCallReturnAddress.U32(), execCtx)), + 8, + ) + m.insert(saveRip) + + // Finally exit. + exitSq := m.allocateExitSeq(execCtx) + m.insert(exitSq) + + // Return the label for continuation. + continuation, afterLabel := m.allocateBrTarget() + m.insert(continuation) + return afterLabel +} + +func (m *machine) lowerAluRmiROp(si *ssa.Instruction, op aluRmiROpcode) { + x, y := si.Arg2() + if !x.Type().IsInt() { + panic("BUG?") + } + + _64 := x.Type().Bits() == 64 + + xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) + + // TODO: commutative args can be swapped if one of them is an immediate. + rn := m.getOperand_Reg(xDef) + rm := m.getOperand_Mem_Imm32_Reg(yDef) + rd := m.c.VRegOf(si.Return()) + + // rn is being overwritten, so we first copy its value to a temp register, + // in case it is referenced again later. + tmp := m.copyToTmp(rn.reg()) + + alu := m.allocateInstr() + alu.asAluRmiR(op, rm, tmp, _64) + m.insert(alu) + + // tmp now contains the result, we copy it to the dest register. 
+ m.copyTo(tmp, rd) +} + +func (m *machine) lowerShiftR(si *ssa.Instruction, op shiftROp) { + x, amt := si.Arg2() + if !x.Type().IsInt() { + panic("BUG?") + } + _64 := x.Type().Bits() == 64 + + xDef, amtDef := m.c.ValueDefinition(x), m.c.ValueDefinition(amt) + + opAmt := m.getOperand_Imm32_Reg(amtDef) + rx := m.getOperand_Reg(xDef) + rd := m.c.VRegOf(si.Return()) + + // rx is being overwritten, so we first copy its value to a temp register, + // in case it is referenced again later. + tmpDst := m.copyToTmp(rx.reg()) + + if opAmt.kind == operandKindReg { + // If opAmt is a register we must copy its value to rcx, + // because shiftR encoding mandates that the shift amount is in rcx. + m.copyTo(opAmt.reg(), rcxVReg) + + alu := m.allocateInstr() + alu.asShiftR(op, newOperandReg(rcxVReg), tmpDst, _64) + m.insert(alu) + + } else { + alu := m.allocateInstr() + alu.asShiftR(op, opAmt, tmpDst, _64) + m.insert(alu) + } + + // tmp now contains the result, we copy it to the dest register. + m.copyTo(tmpDst, rd) +} + +func (m *machine) lowerXmmRmR(instr *ssa.Instruction) { + x, y := instr.Arg2() + if !x.Type().IsFloat() { + panic("BUG?") + } + _64 := x.Type().Bits() == 64 + + var op sseOpcode + if _64 { + switch instr.Opcode() { + case ssa.OpcodeFadd: + op = sseOpcodeAddsd + case ssa.OpcodeFsub: + op = sseOpcodeSubsd + case ssa.OpcodeFmul: + op = sseOpcodeMulsd + case ssa.OpcodeFdiv: + op = sseOpcodeDivsd + default: + panic("BUG") + } + } else { + switch instr.Opcode() { + case ssa.OpcodeFadd: + op = sseOpcodeAddss + case ssa.OpcodeFsub: + op = sseOpcodeSubss + case ssa.OpcodeFmul: + op = sseOpcodeMulss + case ssa.OpcodeFdiv: + op = sseOpcodeDivss + default: + panic("BUG") + } + } + + xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) + rn := m.getOperand_Reg(yDef) + rm := m.getOperand_Reg(xDef) + rd := m.c.VRegOf(instr.Return()) + + // rm is being overwritten, so we first copy its value to a temp register, + // in case it is referenced again later. 
+ tmp := m.copyToTmp(rm.reg()) + + xmm := m.allocateInstr().asXmmRmR(op, rn, tmp) + m.insert(xmm) + + m.copyTo(tmp, rd) +} + +func (m *machine) lowerSqrt(instr *ssa.Instruction) { + x := instr.Arg() + if !x.Type().IsFloat() { + panic("BUG") + } + _64 := x.Type().Bits() == 64 + var op sseOpcode + if _64 { + op = sseOpcodeSqrtsd + } else { + op = sseOpcodeSqrtss + } + + xDef := m.c.ValueDefinition(x) + rm := m.getOperand_Mem_Reg(xDef) + rd := m.c.VRegOf(instr.Return()) + + xmm := m.allocateInstr().asXmmUnaryRmR(op, rm, rd) + m.insert(xmm) +} + +func (m *machine) lowerFabsFneg(instr *ssa.Instruction) { + x := instr.Arg() + if !x.Type().IsFloat() { + panic("BUG") + } + _64 := x.Type().Bits() == 64 + var op sseOpcode + var mask uint64 + if _64 { + switch instr.Opcode() { + case ssa.OpcodeFabs: + mask, op = 0x7fffffffffffffff, sseOpcodeAndpd + case ssa.OpcodeFneg: + mask, op = 0x8000000000000000, sseOpcodeXorpd + } + } else { + switch instr.Opcode() { + case ssa.OpcodeFabs: + mask, op = 0x7fffffff, sseOpcodeAndps + case ssa.OpcodeFneg: + mask, op = 0x80000000, sseOpcodeXorps + } + } + + tmp := m.c.AllocateVReg(x.Type()) + + xDef := m.c.ValueDefinition(x) + rm := m.getOperand_Reg(xDef) + rd := m.c.VRegOf(instr.Return()) + + m.lowerFconst(tmp, mask, _64) + + xmm := m.allocateInstr().asXmmRmR(op, rm, tmp) + m.insert(xmm) + + m.copyTo(tmp, rd) +} + +func (m *machine) lowerStore(si *ssa.Instruction) { + value, ptr, offset, storeSizeInBits := si.StoreData() + rm := m.getOperand_Reg(m.c.ValueDefinition(value)) + mem := newOperandMem(m.lowerToAddressMode(ptr, offset)) + + store := m.allocateInstr() + switch value.Type() { + case ssa.TypeI32: + store.asMovRM(rm.reg(), mem, storeSizeInBits/8) + case ssa.TypeI64: + store.asMovRM(rm.reg(), mem, storeSizeInBits/8) + case ssa.TypeF32: + store.asXmmMovRM(sseOpcodeMovss, rm.reg(), mem) + case ssa.TypeF64: + store.asXmmMovRM(sseOpcodeMovsd, rm.reg(), mem) + case ssa.TypeV128: + store.asXmmMovRM(sseOpcodeMovdqu, rm.reg(), mem) + default: 
+ panic("BUG") + } + m.insert(store) +} + +func (m *machine) lowerCall(si *ssa.Instruction) { + isDirectCall := si.Opcode() == ssa.OpcodeCall + var indirectCalleePtr ssa.Value + var directCallee ssa.FuncRef + var sigID ssa.SignatureID + var args []ssa.Value + var isMemmove bool + if isDirectCall { + directCallee, sigID, args = si.CallData() + } else { + indirectCalleePtr, sigID, args, isMemmove = si.CallIndirectData() + } + calleeABI := m.c.GetFunctionABI(m.c.SSABuilder().ResolveSignature(sigID)) + + stackSlotSize := int64(calleeABI.AlignedArgResultStackSlotSize()) + if m.maxRequiredStackSizeForCalls < stackSlotSize+16 { + m.maxRequiredStackSizeForCalls = stackSlotSize + 16 // 16 == return address + RBP. + } + + // Note: See machine.SetupPrologue for the stack layout. + // The stack pointer decrease/increase will be inserted later in the compilation. + + for i, arg := range args { + reg := m.c.VRegOf(arg) + def := m.c.ValueDefinition(arg) + m.callerGenVRegToFunctionArg(calleeABI, i, reg, def, stackSlotSize) + } + + if isMemmove { + // Go's memmove *might* use all xmm0-xmm15, so we need to release them. 
+ // https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/cmd/compile/abi-internal.md#architecture-specifics + // https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/runtime/memmove_amd64.s#L271-L286 + for i := regalloc.RealReg(0); i < 16; i++ { + m.insert(m.allocateInstr().asDefineUninitializedReg(regInfo.RealRegToVReg[xmm0+i])) + } + } + + if isDirectCall { + call := m.allocateInstr().asCall(directCallee, calleeABI) + m.insert(call) + } else { + ptrOp := m.getOperand_Mem_Reg(m.c.ValueDefinition(indirectCalleePtr)) + callInd := m.allocateInstr().asCallIndirect(ptrOp, calleeABI) + m.insert(callInd) + } + + if isMemmove { + for i := regalloc.RealReg(0); i < 16; i++ { + m.insert(m.allocateInstr().asNopUseReg(regInfo.RealRegToVReg[xmm0+i])) + } + } + + var index int + r1, rs := si.Returns() + if r1.Valid() { + m.callerGenFunctionReturnVReg(calleeABI, 0, m.c.VRegOf(r1), stackSlotSize) + index++ + } + + for _, r := range rs { + m.callerGenFunctionReturnVReg(calleeABI, index, m.c.VRegOf(r), stackSlotSize) + index++ + } +} + +// callerGenVRegToFunctionArg is the opposite of GenFunctionArgToVReg, which is used to generate the +// caller side of the function call. +func (m *machine) callerGenVRegToFunctionArg(a *backend.FunctionABI, argIndex int, reg regalloc.VReg, def *backend.SSAValueDefinition, stackSlotSize int64) { + arg := &a.Args[argIndex] + if def != nil && def.IsFromInstr() { + // Constant instructions are inlined. + if inst := def.Instr; inst.Constant() { + m.insertLoadConstant(inst, reg) + } + } + if arg.Kind == backend.ABIArgKindReg { + m.InsertMove(arg.Reg, reg, arg.Type) + } else { + store := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmReg( + // -stackSlotSize because the stack pointer is not yet decreased. 
+ uint32(arg.Offset-stackSlotSize), rspVReg)) + switch arg.Type { + case ssa.TypeI32: + store.asMovRM(reg, mem, 4) + case ssa.TypeI64: + store.asMovRM(reg, mem, 8) + case ssa.TypeF32: + store.asXmmMovRM(sseOpcodeMovss, reg, mem) + case ssa.TypeF64: + store.asXmmMovRM(sseOpcodeMovsd, reg, mem) + case ssa.TypeV128: + store.asXmmMovRM(sseOpcodeMovdqu, reg, mem) + default: + panic("BUG") + } + m.insert(store) + } +} + +func (m *machine) callerGenFunctionReturnVReg(a *backend.FunctionABI, retIndex int, reg regalloc.VReg, stackSlotSize int64) { + r := &a.Rets[retIndex] + if r.Kind == backend.ABIArgKindReg { + m.InsertMove(reg, r.Reg, r.Type) + } else { + load := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmReg( + // -stackSlotSize because the stack pointer is not yet decreased. + uint32(a.ArgStackSize+r.Offset-stackSlotSize), rspVReg)) + switch r.Type { + case ssa.TypeI32: + load.asMovzxRmR(extModeLQ, mem, reg) + case ssa.TypeI64: + load.asMov64MR(mem, reg) + case ssa.TypeF32: + load.asXmmUnaryRmR(sseOpcodeMovss, mem, reg) + case ssa.TypeF64: + load.asXmmUnaryRmR(sseOpcodeMovsd, mem, reg) + case ssa.TypeV128: + load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, reg) + default: + panic("BUG") + } + m.insert(load) + } +} + +// InsertMove implements backend.Machine. +func (m *machine) InsertMove(dst, src regalloc.VReg, typ ssa.Type) { + switch typ { + case ssa.TypeI32, ssa.TypeI64: + i := m.allocateInstr().asMovRR(src, dst, typ.Bits() == 64) + m.insert(i) + case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: + var op sseOpcode + switch typ { + case ssa.TypeF32: + op = sseOpcodeMovss + case ssa.TypeF64: + op = sseOpcodeMovsd + case ssa.TypeV128: + op = sseOpcodeMovdqa + } + i := m.allocateInstr().asXmmUnaryRmR(op, newOperandReg(src), dst) + m.insert(i) + default: + panic("BUG") + } +} + +// Format implements backend.Machine. 
+func (m *machine) Format() string { + ectx := m.ectx + begins := map[*instruction]backend.Label{} + for l, pos := range ectx.LabelPositions { + begins[pos.Begin] = l + } + + irBlocks := map[backend.Label]ssa.BasicBlockID{} + for i, l := range ectx.SsaBlockIDToLabels { + irBlocks[l] = ssa.BasicBlockID(i) + } + + var lines []string + for cur := ectx.RootInstr; cur != nil; cur = cur.next { + if l, ok := begins[cur]; ok { + var labelStr string + if blkID, ok := irBlocks[l]; ok { + labelStr = fmt.Sprintf("%s (SSA Block: %s):", l, blkID) + } else { + labelStr = fmt.Sprintf("%s:", l) + } + lines = append(lines, labelStr) + } + if cur.kind == nop0 { + continue + } + lines = append(lines, "\t"+cur.String()) + } + for _, vc := range m.consts { + if vc._var == nil { + lines = append(lines, fmt.Sprintf("%s: const [%d %d]", vc.label.L, vc.lo, vc.hi)) + } else { + lines = append(lines, fmt.Sprintf("%s: const %#x", vc.label.L, vc._var)) + } + } + return "\n" + strings.Join(lines, "\n") + "\n" +} + +func (m *machine) encodeWithoutSSA(root *instruction) { + m.labelResolutionPends = m.labelResolutionPends[:0] + ectx := m.ectx + + bufPtr := m.c.BufPtr() + for cur := root; cur != nil; cur = cur.next { + offset := int64(len(*bufPtr)) + if cur.kind == nop0 { + l := cur.nop0Label() + if pos, ok := ectx.LabelPositions[l]; ok { + pos.BinaryOffset = offset + } + } + + needLabelResolution := cur.encode(m.c) + if needLabelResolution { + m.labelResolutionPends = append(m.labelResolutionPends, + labelResolutionPend{instr: cur, imm32Offset: int64(len(*bufPtr)) - 4}, + ) + } + } + + for i := range m.labelResolutionPends { + p := &m.labelResolutionPends[i] + switch p.instr.kind { + case jmp, jmpIf, lea: + target := p.instr.jmpLabel() + targetOffset := ectx.LabelPositions[target].BinaryOffset + imm32Offset := p.imm32Offset + jmpOffset := int32(targetOffset - (p.imm32Offset + 4)) // +4 because RIP points to the next instruction. 
+ binary.LittleEndian.PutUint32((*bufPtr)[imm32Offset:], uint32(jmpOffset)) + default: + panic("BUG") + } + } +} + +// Encode implements backend.Machine Encode. +func (m *machine) Encode(ctx context.Context) (err error) { + ectx := m.ectx + bufPtr := m.c.BufPtr() + + var fn string + var fnIndex int + var labelToSSABlockID map[backend.Label]ssa.BasicBlockID + if wazevoapi.PerfMapEnabled { + fn = wazevoapi.GetCurrentFunctionName(ctx) + labelToSSABlockID = make(map[backend.Label]ssa.BasicBlockID) + for i, l := range ectx.SsaBlockIDToLabels { + labelToSSABlockID[l] = ssa.BasicBlockID(i) + } + fnIndex = wazevoapi.GetCurrentFunctionIndex(ctx) + } + + m.labelResolutionPends = m.labelResolutionPends[:0] + for _, pos := range ectx.OrderedBlockLabels { + offset := int64(len(*bufPtr)) + pos.BinaryOffset = offset + for cur := pos.Begin; cur != pos.End.next; cur = cur.next { + offset := int64(len(*bufPtr)) + + switch cur.kind { + case nop0: + l := cur.nop0Label() + if pos, ok := ectx.LabelPositions[l]; ok { + pos.BinaryOffset = offset + } + case sourceOffsetInfo: + m.c.AddSourceOffsetInfo(offset, cur.sourceOffsetInfo()) + } + + needLabelResolution := cur.encode(m.c) + if needLabelResolution { + m.labelResolutionPends = append(m.labelResolutionPends, + labelResolutionPend{instr: cur, instrOffset: offset, imm32Offset: int64(len(*bufPtr)) - 4}, + ) + } + } + + if wazevoapi.PerfMapEnabled { + l := pos.L + var labelStr string + if blkID, ok := labelToSSABlockID[l]; ok { + labelStr = fmt.Sprintf("%s::SSA_Block[%s]", l, blkID) + } else { + labelStr = l.String() + } + size := int64(len(*bufPtr)) - offset + wazevoapi.PerfMap.AddModuleEntry(fnIndex, offset, uint64(size), fmt.Sprintf("%s:::::%s", fn, labelStr)) + } + } + + for i := range m.consts { + offset := int64(len(*bufPtr)) + vc := &m.consts[i] + vc.label.BinaryOffset = offset + if vc._var == nil { + lo, hi := vc.lo, vc.hi + m.c.Emit8Bytes(lo) + m.c.Emit8Bytes(hi) + } else { + for _, b := range vc._var { + m.c.EmitByte(b) + } + } + 
} + + buf := *bufPtr + for i := range m.labelResolutionPends { + p := &m.labelResolutionPends[i] + switch p.instr.kind { + case jmp, jmpIf, lea, xmmUnaryRmR: + target := p.instr.jmpLabel() + targetOffset := ectx.LabelPositions[target].BinaryOffset + imm32Offset := p.imm32Offset + jmpOffset := int32(targetOffset - (p.imm32Offset + 4)) // +4 because RIP points to the next instruction. + binary.LittleEndian.PutUint32(buf[imm32Offset:], uint32(jmpOffset)) + case jmpTableIsland: + tableBegin := p.instrOffset + // Each entry is the offset from the beginning of the jmpTableIsland instruction in 8 bytes. + targets := m.jmpTableTargets[p.instr.u1] + for i, l := range targets { + targetOffset := ectx.LabelPositions[backend.Label(l)].BinaryOffset + jmpOffset := targetOffset - tableBegin + binary.LittleEndian.PutUint64(buf[tableBegin+int64(i)*8:], uint64(jmpOffset)) + } + default: + panic("BUG") + } + } + return +} + +// ResolveRelocations implements backend.Machine. +func (m *machine) ResolveRelocations(refToBinaryOffset []int, binary []byte, relocations []backend.RelocationInfo, _ []int) { + for _, r := range relocations { + offset := r.Offset + calleeFnOffset := refToBinaryOffset[r.FuncRef] + // offset is the offset of the last 4 bytes of the call instruction. + callInstrOffsetBytes := binary[offset : offset+4] + diff := int64(calleeFnOffset) - (offset + 4) // +4 because we want the offset of the next instruction (In x64, RIP always points to the next instruction). + callInstrOffsetBytes[0] = byte(diff) + callInstrOffsetBytes[1] = byte(diff >> 8) + callInstrOffsetBytes[2] = byte(diff >> 16) + callInstrOffsetBytes[3] = byte(diff >> 24) + } +} + +// CallTrampolineIslandInfo implements backend.Machine CallTrampolineIslandInfo. 
+func (m *machine) CallTrampolineIslandInfo(_ int) (_, _ int, _ error) { return } + +func (m *machine) lowerIcmpToFlag(xd, yd *backend.SSAValueDefinition, _64 bool) { + x := m.getOperand_Reg(xd) + y := m.getOperand_Mem_Imm32_Reg(yd) + cmp := m.allocateInstr().asCmpRmiR(true, y, x.reg(), _64) + m.insert(cmp) +} + +func (m *machine) lowerFcmpToFlags(instr *ssa.Instruction) (f1, f2 cond, and bool) { + x, y, c := instr.FcmpData() + switch c { + case ssa.FloatCmpCondEqual: + f1, f2 = condNP, condZ + and = true + case ssa.FloatCmpCondNotEqual: + f1, f2 = condP, condNZ + case ssa.FloatCmpCondLessThan: + f1 = condFromSSAFloatCmpCond(ssa.FloatCmpCondGreaterThan) + f2 = condInvalid + x, y = y, x + case ssa.FloatCmpCondLessThanOrEqual: + f1 = condFromSSAFloatCmpCond(ssa.FloatCmpCondGreaterThanOrEqual) + f2 = condInvalid + x, y = y, x + default: + f1 = condFromSSAFloatCmpCond(c) + f2 = condInvalid + } + + var opc sseOpcode + if x.Type() == ssa.TypeF32 { + opc = sseOpcodeUcomiss + } else { + opc = sseOpcodeUcomisd + } + + xr := m.getOperand_Reg(m.c.ValueDefinition(x)) + yr := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + m.insert(m.allocateInstr().asXmmCmpRmR(opc, yr, xr.reg())) + return +} + +// allocateInstr allocates an instruction. 
+func (m *machine) allocateInstr() *instruction { + instr := m.ectx.InstructionPool.Allocate() + if !m.regAllocStarted { + instr.addedBeforeRegAlloc = true + } + return instr +} + +func (m *machine) allocateNop() *instruction { + instr := m.allocateInstr() + instr.kind = nop0 + return instr +} + +func (m *machine) insert(i *instruction) { + ectx := m.ectx + ectx.PendingInstructions = append(ectx.PendingInstructions, i) +} + +func (m *machine) allocateBrTarget() (nop *instruction, l backend.Label) { //nolint + pos := m.allocateLabel() + l = pos.L + nop = m.allocateInstr() + nop.asNop0WithLabel(l) + pos.Begin, pos.End = nop, nop + return +} + +func (m *machine) allocateLabel() *labelPosition { + ectx := m.ectx + l := ectx.AllocateLabel() + pos := ectx.AllocateLabelPosition(l) + ectx.LabelPositions[l] = pos + return pos +} + +func (m *machine) getVRegSpillSlotOffsetFromSP(id regalloc.VRegID, size byte) int64 { + offset, ok := m.spillSlots[id] + if !ok { + offset = m.spillSlotSize + m.spillSlots[id] = offset + m.spillSlotSize += int64(size) + } + return offset +} + +func (m *machine) copyTo(src regalloc.VReg, dst regalloc.VReg) { + mov := m.allocateInstr() + if src.RegType() == regalloc.RegTypeInt { + mov.asMovRR(src, dst, true) + } else { + mov.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(src), dst) + } + m.insert(mov) +} + +func (m *machine) copyToTmp(v regalloc.VReg) regalloc.VReg { + typ := m.c.TypeOf(v) + tmp := m.c.AllocateVReg(typ) + m.copyTo(v, tmp) + return tmp +} + +func (m *machine) requiredStackSize() int64 { + return m.maxRequiredStackSizeForCalls + + m.frameSize() + + 16 + // Need for stack checking. + 16 // return address and the caller RBP. 
+} + +func (m *machine) frameSize() int64 { + s := m.clobberedRegSlotSize() + m.spillSlotSize + if s&0xf != 0 { + panic(fmt.Errorf("BUG: frame size %d is not 16-byte aligned", s)) + } + return s +} + +func (m *machine) clobberedRegSlotSize() int64 { + return int64(len(m.clobberedRegs) * 16) +} + +func (m *machine) lowerIDivRem(si *ssa.Instruction, isDiv bool, signed bool) { + x, y, execCtx := si.Arg3() + + dividend := m.getOperand_Reg(m.c.ValueDefinition(x)) + divisor := m.getOperand_Reg(m.c.ValueDefinition(y)) + ctxVReg := m.c.VRegOf(execCtx) + tmpGp := m.c.AllocateVReg(si.Return().Type()) + + m.copyTo(dividend.reg(), raxVReg) + m.insert(m.allocateInstr().asDefineUninitializedReg(rdxVReg)) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp)) + seq := m.allocateInstr().asIdivRemSequence(ctxVReg, divisor.reg(), tmpGp, isDiv, signed, x.Type().Bits() == 64) + m.insert(seq) + rd := m.c.VRegOf(si.Return()) + if isDiv { + m.copyTo(raxVReg, rd) + } else { + m.copyTo(rdxVReg, rd) + } +} + +func (m *machine) lowerIDivRemSequenceAfterRegAlloc(i *instruction) { + execCtx, divisor, tmpGp, isDiv, signed, _64 := i.idivRemSequenceData() + + dividend := raxVReg + + // Ensure yr is not zero. + test := m.allocateInstr() + test.asCmpRmiR(false, newOperandReg(divisor), divisor, _64) + m.insert(test) + + jnz := m.allocateInstr() + m.insert(jnz) + + nz := m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerDivisionByZero) + + // If not zero, we can proceed with the division. + jnz.asJmpIf(condNZ, newOperandLabel(nz)) + + var ifRemNeg1 *instruction + if signed { + var neg1 uint64 + if _64 { + neg1 = 0xffffffffffffffff + } else { + neg1 = 0xffffffff + } + m.lowerIconst(tmpGp, neg1, _64) + + if isDiv { + // For signed division, we have to have branches for "math.MinInt{32,64} / -1" + // case which results in the floating point exception via division error as + // the resulting value exceeds the maximum of signed int. + + // First, we check if the divisor is -1. 
+ cmp := m.allocateInstr() + cmp.asCmpRmiR(true, newOperandReg(tmpGp), divisor, _64) + m.insert(cmp) + + ifNotNeg1 := m.allocateInstr() + m.insert(ifNotNeg1) + + var minInt uint64 + if _64 { + minInt = 0x8000000000000000 + } else { + minInt = 0x80000000 + } + m.lowerIconst(tmpGp, minInt, _64) + + // Next we check if the quotient is the most negative value for the signed integer, i.e. + // if we are trying to do (math.MinInt32 / -1) or (math.MinInt64 / -1) respectively. + cmp2 := m.allocateInstr() + cmp2.asCmpRmiR(true, newOperandReg(tmpGp), dividend, _64) + m.insert(cmp2) + + ifNotMinInt := m.allocateInstr() + m.insert(ifNotMinInt) + + // Trap if we are trying to do (math.MinInt32 / -1) or (math.MinInt64 / -1), + // as that is the overflow in division as the result becomes 2^31 which is larger than + // the maximum of signed 32-bit int (2^31-1). + end := m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) + ifNotNeg1.asJmpIf(condNZ, newOperandLabel(end)) + ifNotMinInt.asJmpIf(condNZ, newOperandLabel(end)) + } else { + // If it is remainder, zeros DX register and compare the divisor to -1. + xor := m.allocateInstr().asZeros(rdxVReg) + m.insert(xor) + + // We check if the divisor is -1. + cmp := m.allocateInstr() + cmp.asCmpRmiR(true, newOperandReg(tmpGp), divisor, _64) + m.insert(cmp) + + ifRemNeg1 = m.allocateInstr() + m.insert(ifRemNeg1) + } + + // Sign-extend DX register to have 2*x.Type().Bits() dividend over DX and AX registers. + sed := m.allocateInstr() + sed.asSignExtendData(_64) + m.insert(sed) + } else { + // Zeros DX register to have 2*x.Type().Bits() dividend over DX and AX registers. + zeros := m.allocateInstr().asZeros(rdxVReg) + m.insert(zeros) + } + + div := m.allocateInstr() + div.asDiv(newOperandReg(divisor), signed, _64) + m.insert(div) + + nop, end := m.allocateBrTarget() + m.insert(nop) + // If we are compiling a Rem instruction, when the divisor is -1 we land at the end of the function. 
+ if ifRemNeg1 != nil { + ifRemNeg1.asJmpIf(condZ, newOperandLabel(end)) + } +} + +func (m *machine) lowerRound(instr *ssa.Instruction, imm roundingMode) { + x := instr.Arg() + if !x.Type().IsFloat() { + panic("BUG?") + } + var op sseOpcode + if x.Type().Bits() == 64 { + op = sseOpcodeRoundsd + } else { + op = sseOpcodeRoundss + } + + xDef := m.c.ValueDefinition(x) + rm := m.getOperand_Mem_Reg(xDef) + rd := m.c.VRegOf(instr.Return()) + + xmm := m.allocateInstr().asXmmUnaryRmRImm(op, uint8(imm), rm, rd) + m.insert(xmm) +} + +func (m *machine) lowerFminFmax(instr *ssa.Instruction) { + x, y := instr.Arg2() + if !x.Type().IsFloat() { + panic("BUG?") + } + + _64 := x.Type().Bits() == 64 + isMin := instr.Opcode() == ssa.OpcodeFmin + var minMaxOp sseOpcode + + switch { + case _64 && isMin: + minMaxOp = sseOpcodeMinpd + case _64 && !isMin: + minMaxOp = sseOpcodeMaxpd + case !_64 && isMin: + minMaxOp = sseOpcodeMinps + case !_64 && !isMin: + minMaxOp = sseOpcodeMaxps + } + + xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) + rm := m.getOperand_Reg(xDef) + // We cannot ensure that y is aligned to 16 bytes, so we have to use it on reg. + rn := m.getOperand_Reg(yDef) + rd := m.c.VRegOf(instr.Return()) + + tmp := m.copyToTmp(rm.reg()) + + // Check if this is (either x1 or x2 is NaN) or (x1 equals x2) case. + cmp := m.allocateInstr() + if _64 { + cmp.asXmmCmpRmR(sseOpcodeUcomisd, rn, tmp) + } else { + cmp.asXmmCmpRmR(sseOpcodeUcomiss, rn, tmp) + } + m.insert(cmp) + + // At this point, we have the three cases of conditional flags below + // (See https://www.felixcloutier.com/x86/ucomiss#operation for detail.) + // + // 1) Two values are NaN-free and different: All flags are cleared. + // 2) Two values are NaN-free and equal: Only ZF flags is set. + // 3) One of Two values is NaN: ZF, PF and CF flags are set. + + // Jump instruction to handle 1) case by checking the ZF flag + // as ZF is only set for 2) and 3) cases. 
+ nanFreeOrDiffJump := m.allocateInstr() + m.insert(nanFreeOrDiffJump) + + // Start handling 2) and 3). + + // Jump if one of two values is NaN by checking the parity flag (PF). + ifIsNan := m.allocateInstr() + m.insert(ifIsNan) + + // Start handling 2) NaN-free and equal. + + // Before we exit this case, we have to ensure that positive zero (or negative zero for min instruction) is + // returned if two values are positive and negative zeros. + var op sseOpcode + switch { + case !_64 && isMin: + op = sseOpcodeOrps + case _64 && isMin: + op = sseOpcodeOrpd + case !_64 && !isMin: + op = sseOpcodeAndps + case _64 && !isMin: + op = sseOpcodeAndpd + } + orAnd := m.allocateInstr() + orAnd.asXmmRmR(op, rn, tmp) + m.insert(orAnd) + + // Done, jump to end. + sameExitJump := m.allocateInstr() + m.insert(sameExitJump) + + // Start handling 3) either is NaN. + isNanTarget, isNan := m.allocateBrTarget() + m.insert(isNanTarget) + ifIsNan.asJmpIf(condP, newOperandLabel(isNan)) + + // We emit the ADD instruction to produce the NaN in tmp. + add := m.allocateInstr() + if _64 { + add.asXmmRmR(sseOpcodeAddsd, rn, tmp) + } else { + add.asXmmRmR(sseOpcodeAddss, rn, tmp) + } + m.insert(add) + + // Exit from the NaN case branch. + nanExitJmp := m.allocateInstr() + m.insert(nanExitJmp) + + // Start handling 1). + doMinMaxTarget, doMinMax := m.allocateBrTarget() + m.insert(doMinMaxTarget) + nanFreeOrDiffJump.asJmpIf(condNZ, newOperandLabel(doMinMax)) + + // Now handle the NaN-free and different values case. 
+ minMax := m.allocateInstr() + minMax.asXmmRmR(minMaxOp, rn, tmp) + m.insert(minMax) + + endNop, end := m.allocateBrTarget() + m.insert(endNop) + nanExitJmp.asJmp(newOperandLabel(end)) + sameExitJump.asJmp(newOperandLabel(end)) + + m.copyTo(tmp, rd) +} + +func (m *machine) lowerFcopysign(instr *ssa.Instruction) { + x, y := instr.Arg2() + if !x.Type().IsFloat() { + panic("BUG") + } + + _64 := x.Type().Bits() == 64 + + xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) + rm := m.getOperand_Reg(xDef) + rn := m.getOperand_Reg(yDef) + rd := m.c.VRegOf(instr.Return()) + + // Clear the non-sign bits of src via AND with the mask. + var opAnd, opOr sseOpcode + var signMask uint64 + if _64 { + signMask, opAnd, opOr = 0x8000000000000000, sseOpcodeAndpd, sseOpcodeOrpd + } else { + signMask, opAnd, opOr = 0x80000000, sseOpcodeAndps, sseOpcodeOrps + } + + signBitReg := m.c.AllocateVReg(x.Type()) + m.lowerFconst(signBitReg, signMask, _64) + nonSignBitReg := m.c.AllocateVReg(x.Type()) + m.lowerFconst(nonSignBitReg, ^signMask, _64) + + // Extract the sign bits of rn. + and := m.allocateInstr().asXmmRmR(opAnd, rn, signBitReg) + m.insert(and) + + // Clear the sign bit of dst via AND with the non-sign bit mask. + xor := m.allocateInstr().asXmmRmR(opAnd, rm, nonSignBitReg) + m.insert(xor) + + // Copy the sign bits of src to dst via OR. 
+ or := m.allocateInstr().asXmmRmR(opOr, newOperandReg(signBitReg), nonSignBitReg) + m.insert(or) + + m.copyTo(nonSignBitReg, rd) +} + +func (m *machine) lowerBitcast(instr *ssa.Instruction) { + x, dstTyp := instr.BitcastData() + srcTyp := x.Type() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + switch { + case srcTyp == ssa.TypeF32 && dstTyp == ssa.TypeI32: + cvt := m.allocateInstr().asXmmToGpr(sseOpcodeMovd, rn.reg(), rd, false) + m.insert(cvt) + case srcTyp == ssa.TypeI32 && dstTyp == ssa.TypeF32: + cvt := m.allocateInstr().asGprToXmm(sseOpcodeMovd, rn, rd, false) + m.insert(cvt) + case srcTyp == ssa.TypeF64 && dstTyp == ssa.TypeI64: + cvt := m.allocateInstr().asXmmToGpr(sseOpcodeMovq, rn.reg(), rd, true) + m.insert(cvt) + case srcTyp == ssa.TypeI64 && dstTyp == ssa.TypeF64: + cvt := m.allocateInstr().asGprToXmm(sseOpcodeMovq, rn, rd, true) + m.insert(cvt) + default: + panic(fmt.Sprintf("invalid bitcast from %s to %s", srcTyp, dstTyp)) + } +} + +func (m *machine) lowerFcvtToSint(ctxVReg, rn, rd regalloc.VReg, src64, dst64, sat bool) { + var tmpXmm regalloc.VReg + if dst64 { + tmpXmm = m.c.AllocateVReg(ssa.TypeF64) + } else { + tmpXmm = m.c.AllocateVReg(ssa.TypeF32) + } + + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpXmm)) + tmpGp, tmpGp2 := m.c.AllocateVReg(ssa.TypeI64), m.c.AllocateVReg(ssa.TypeI64) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp)) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp2)) + + m.insert(m.allocateFcvtToSintSequence(ctxVReg, rn, tmpGp, tmpGp2, tmpXmm, src64, dst64, sat)) + m.copyTo(tmpGp, rd) +} + +func (m *machine) lowerFcvtToSintSequenceAfterRegalloc(i *instruction) { + execCtx, src, tmpGp, tmpGp2, tmpXmm, src64, dst64, sat := i.fcvtToSintSequenceData() + var cmpOp, truncOp sseOpcode + if src64 { + cmpOp, truncOp = sseOpcodeUcomisd, sseOpcodeCvttsd2si + } else { + cmpOp, truncOp = sseOpcodeUcomiss, sseOpcodeCvttss2si + } + + trunc := m.allocateInstr() + 
trunc.asXmmToGpr(truncOp, src, tmpGp, dst64) + m.insert(trunc) + + // Check if the dst operand was INT_MIN, by checking it against 1. + cmp1 := m.allocateInstr() + cmp1.asCmpRmiR(true, newOperandImm32(1), tmpGp, dst64) + m.insert(cmp1) + + // If no overflow, then we are done. + doneTarget, done := m.allocateBrTarget() + ifNoOverflow := m.allocateInstr() + ifNoOverflow.asJmpIf(condNO, newOperandLabel(done)) + m.insert(ifNoOverflow) + + // Now, check for NaN. + cmpNan := m.allocateInstr() + cmpNan.asXmmCmpRmR(cmpOp, newOperandReg(src), src) + m.insert(cmpNan) + + // We allocate the "non-nan target" here, but we will insert it later. + notNanTarget, notNaN := m.allocateBrTarget() + ifNotNan := m.allocateInstr() + ifNotNan.asJmpIf(condNP, newOperandLabel(notNaN)) + m.insert(ifNotNan) + + if sat { + // If NaN and saturating, return 0. + zeroDst := m.allocateInstr().asZeros(tmpGp) + m.insert(zeroDst) + + jmpEnd := m.allocateInstr() + jmpEnd.asJmp(newOperandLabel(done)) + m.insert(jmpEnd) + + // Otherwise: + m.insert(notNanTarget) + + // Zero-out the tmp register. + zero := m.allocateInstr().asZeros(tmpXmm) + m.insert(zero) + + cmpXmm := m.allocateInstr().asXmmCmpRmR(cmpOp, newOperandReg(tmpXmm), src) + m.insert(cmpXmm) + + // if >= jump to end. + jmpEnd2 := m.allocateInstr() + jmpEnd2.asJmpIf(condB, newOperandLabel(done)) + m.insert(jmpEnd2) + + // Otherwise, saturate to INT_MAX. + if dst64 { + m.lowerIconst(tmpGp, math.MaxInt64, dst64) + } else { + m.lowerIconst(tmpGp, math.MaxInt32, dst64) + } + + } else { + + // If non-sat, NaN, trap. + m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeInvalidConversionToInteger) + + // Otherwise, we will jump here. + m.insert(notNanTarget) + + // jump over trap if src larger than threshold + condAboveThreshold := condNB + + // The magic constants are various combination of minInt for int[32|64] represented as float[32|64]. 
+ var minInt uint64 + switch { + case src64 && dst64: + minInt = 0xc3e0000000000000 + case src64 && !dst64: + condAboveThreshold = condNBE + minInt = 0xC1E0_0000_0020_0000 + case !src64 && dst64: + minInt = 0xDF00_0000 + case !src64 && !dst64: + minInt = 0xCF00_0000 + } + + loadToGP := m.allocateInstr().asImm(tmpGp2, minInt, src64) + m.insert(loadToGP) + + movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp2), tmpXmm, src64) + m.insert(movToXmm) + + cmpXmm := m.allocateInstr().asXmmCmpRmR(cmpOp, newOperandReg(tmpXmm), src) + m.insert(cmpXmm) + + jmpIfLarger := m.allocateInstr() + checkPositiveTarget, checkPositive := m.allocateBrTarget() + jmpIfLarger.asJmpIf(condAboveThreshold, newOperandLabel(checkPositive)) + m.insert(jmpIfLarger) + + m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) + + // If positive, it was a real overflow. + m.insert(checkPositiveTarget) + + // Zero out the temp register. + xorpd := m.allocateInstr() + xorpd.asXmmRmR(sseOpcodeXorpd, newOperandReg(tmpXmm), tmpXmm) + m.insert(xorpd) + + pos := m.allocateInstr() + pos.asXmmCmpRmR(cmpOp, newOperandReg(src), tmpXmm) + m.insert(pos) + + // If >= jump to end. 
+ jmp := m.allocateInstr().asJmpIf(condNB, newOperandLabel(done)) + m.insert(jmp) + m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) + } + + m.insert(doneTarget) +} + +func (m *machine) lowerFcvtToUint(ctxVReg, rn, rd regalloc.VReg, src64, dst64, sat bool) { + tmpXmm, tmpXmm2 := m.c.AllocateVReg(ssa.TypeF64), m.c.AllocateVReg(ssa.TypeF64) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpXmm)) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpXmm2)) + tmpGp, tmpGp2 := m.c.AllocateVReg(ssa.TypeI64), m.c.AllocateVReg(ssa.TypeI64) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp)) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp2)) + + m.insert(m.allocateFcvtToUintSequence( + ctxVReg, rn, tmpGp, tmpGp2, tmpXmm, tmpXmm2, src64, dst64, sat, + )) + m.copyTo(tmpGp, rd) +} + +func (m *machine) lowerFcvtToUintSequenceAfterRegalloc(i *instruction) { + execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2, src64, dst64, sat := i.fcvtToUintSequenceData() + + var subOp, cmpOp, truncOp sseOpcode + if src64 { + subOp, cmpOp, truncOp = sseOpcodeSubsd, sseOpcodeUcomisd, sseOpcodeCvttsd2si + } else { + subOp, cmpOp, truncOp = sseOpcodeSubss, sseOpcodeUcomiss, sseOpcodeCvttss2si + } + + doneTarget, done := m.allocateBrTarget() + + switch { + case src64 && dst64: + loadToGP := m.allocateInstr().asImm(tmpGp, 0x43e0000000000000, true) + m.insert(loadToGP) + movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, true) + m.insert(movToXmm) + case src64 && !dst64: + loadToGP := m.allocateInstr().asImm(tmpGp, 0x41e0000000000000, true) + m.insert(loadToGP) + movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, true) + m.insert(movToXmm) + case !src64 && dst64: + loadToGP := m.allocateInstr().asImm(tmpGp, 0x5f000000, false) + m.insert(loadToGP) + movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, false) + m.insert(movToXmm) + case !src64 && !dst64: + 
loadToGP := m.allocateInstr().asImm(tmpGp, 0x4f000000, false) + m.insert(loadToGP) + movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, false) + m.insert(movToXmm) + } + + cmp := m.allocateInstr() + cmp.asXmmCmpRmR(cmpOp, newOperandReg(tmpXmm), src) + m.insert(cmp) + + // If above `tmp` ("large threshold"), jump to `ifAboveThreshold` + ifAboveThresholdTarget, ifAboveThreshold := m.allocateBrTarget() + jmpIfAboveThreshold := m.allocateInstr() + jmpIfAboveThreshold.asJmpIf(condNB, newOperandLabel(ifAboveThreshold)) + m.insert(jmpIfAboveThreshold) + + ifNotNaNTarget, ifNotNaN := m.allocateBrTarget() + jmpIfNotNaN := m.allocateInstr() + jmpIfNotNaN.asJmpIf(condNP, newOperandLabel(ifNotNaN)) + m.insert(jmpIfNotNaN) + + // If NaN, handle the error condition. + if sat { + // On NaN, saturating, we just return 0. + zeros := m.allocateInstr().asZeros(tmpGp) + m.insert(zeros) + + jmpEnd := m.allocateInstr() + jmpEnd.asJmp(newOperandLabel(done)) + m.insert(jmpEnd) + } else { + // On NaN, non-saturating, we trap. + m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeInvalidConversionToInteger) + } + + // If not NaN, land here. + m.insert(ifNotNaNTarget) + + // Truncation happens here. + + trunc := m.allocateInstr() + trunc.asXmmToGpr(truncOp, src, tmpGp, dst64) + m.insert(trunc) + + // Check if the result is negative. + cmpNeg := m.allocateInstr() + cmpNeg.asCmpRmiR(true, newOperandImm32(0), tmpGp, dst64) + m.insert(cmpNeg) + + // If non-neg, jump to end. + jmpIfNonNeg := m.allocateInstr() + jmpIfNonNeg.asJmpIf(condNL, newOperandLabel(done)) + m.insert(jmpIfNonNeg) + + if sat { + // If the input was "small" (< 2**(width -1)), the only way to get an integer + // overflow is because the input was too small: saturate to the min value, i.e. 0. + zeros := m.allocateInstr().asZeros(tmpGp) + m.insert(zeros) + + jmpEnd := m.allocateInstr() + jmpEnd.asJmp(newOperandLabel(done)) + m.insert(jmpEnd) + } else { + // If not saturating, trap. 
+ m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) + } + + // If above the threshold, land here. + m.insert(ifAboveThresholdTarget) + + // tmpXmm2 := src - threshold (tmpXmm holds the threshold loaded above). + copySrc := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(src), tmpXmm2) + m.insert(copySrc) + + sub := m.allocateInstr() + sub.asXmmRmR(subOp, newOperandReg(tmpXmm), tmpXmm2) // must be -0x8000000000000000 + m.insert(sub) + + trunc2 := m.allocateInstr() + trunc2.asXmmToGpr(truncOp, tmpXmm2, tmpGp, dst64) + m.insert(trunc2) + + // Check if the result is negative. + cmpNeg2 := m.allocateInstr().asCmpRmiR(true, newOperandImm32(0), tmpGp, dst64) + m.insert(cmpNeg2) + + ifNextLargeTarget, ifNextLarge := m.allocateBrTarget() + jmpIfNextLarge := m.allocateInstr() + jmpIfNextLarge.asJmpIf(condNL, newOperandLabel(ifNextLarge)) + m.insert(jmpIfNextLarge) + + if sat { + // The input was "large" (>= maxInt), so the only way to get an integer + // overflow is because the input was too large: saturate to the max value. + var maxInt uint64 + if dst64 { + maxInt = math.MaxUint64 + } else { + maxInt = math.MaxUint32 + } + m.lowerIconst(tmpGp, maxInt, dst64) + + jmpToEnd := m.allocateInstr() + jmpToEnd.asJmp(newOperandLabel(done)) + m.insert(jmpToEnd) + } else { + // If not saturating, trap. 
+ m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) + } + + m.insert(ifNextLargeTarget) + + var op operand + if dst64 { + m.lowerIconst(tmpGp2, 0x8000000000000000, true) + op = newOperandReg(tmpGp2) + } else { + op = newOperandImm32(0x80000000) + } + + add := m.allocateInstr() + add.asAluRmiR(aluRmiROpcodeAdd, op, tmpGp, dst64) + m.insert(add) + + m.insert(doneTarget) +} + +func (m *machine) lowerFcvtFromSint(rn, rd operand, src64, dst64 bool) { + var op sseOpcode + if dst64 { + op = sseOpcodeCvtsi2sd + } else { + op = sseOpcodeCvtsi2ss + } + + trunc := m.allocateInstr() + trunc.asGprToXmm(op, rn, rd.reg(), src64) + m.insert(trunc) +} + +func (m *machine) lowerFcvtFromUint(rn, rd operand, src64, dst64 bool) { + var op sseOpcode + if dst64 { + op = sseOpcodeCvtsi2sd + } else { + op = sseOpcodeCvtsi2ss + } + + // Src is 32 bit, then we just perform the conversion with 64 bit width. + // + // See the following link for why we use 64bit conversion for unsigned 32bit integer sources: + // https://stackoverflow.com/questions/41495498/fpu-operations-generated-by-gcc-during-casting-integer-to-float. + // + // Here's the summary: + // >> CVTSI2SS is indeed designed for converting a signed integer to a scalar single-precision float, + // >> not an unsigned integer like you have here. So what gives? Well, a 64-bit processor has 64-bit wide + // >> registers available, so the unsigned 32-bit input values can be stored as signed 64-bit intermediate values, + // >> which allows CVTSI2SS to be used after all. + // + if !src64 { + // Before we convert, we have to clear the higher 32-bits of the 64-bit register + // to get the correct result. + tmp := m.c.AllocateVReg(ssa.TypeI32) + m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, rn, tmp)) + m.insert(m.allocateInstr().asGprToXmm(op, newOperandReg(tmp), rd.reg(), true)) + return + } + + // If uint64, we have to do a bit more work. 
+ endTarget, end := m.allocateBrTarget() + + var tmpXmm regalloc.VReg + if dst64 { + tmpXmm = m.c.AllocateVReg(ssa.TypeF64) + } else { + tmpXmm = m.c.AllocateVReg(ssa.TypeF32) + } + + // Check if the most significant bit (sign bit) is set. + test := m.allocateInstr() + test.asCmpRmiR(false, rn, rn.reg(), src64) + m.insert(test) + + // Jump if the sign bit is set. + ifSignTarget, ifSign := m.allocateBrTarget() + jmpIfNeg := m.allocateInstr() + jmpIfNeg.asJmpIf(condS, newOperandLabel(ifSign)) + m.insert(jmpIfNeg) + + // If the sign bit is not set, we could fit the unsigned int into float32/float64. + // So, we convert it to float and emit jump instruction to exit from this branch. + cvt := m.allocateInstr() + cvt.asGprToXmm(op, rn, tmpXmm, src64) + m.insert(cvt) + + // We are done, jump to end. + jmpEnd := m.allocateInstr() + jmpEnd.asJmp(newOperandLabel(end)) + m.insert(jmpEnd) + + // Now handling the case where sign-bit is set. + // We emit the following sequences: + // mov %rn, %tmp + // shr 1, %tmp + // mov %rn, %tmp2 + // and 1, %tmp2 + // or %tmp2, %tmp + // cvtsi2ss %tmp, %xmm0 + // addsd %xmm0, %xmm0 + m.insert(ifSignTarget) + + tmp := m.copyToTmp(rn.reg()) + shr := m.allocateInstr() + shr.asShiftR(shiftROpShiftRightLogical, newOperandImm32(1), tmp, src64) + m.insert(shr) + + tmp2 := m.copyToTmp(rn.reg()) + and := m.allocateInstr() + and.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), tmp2, src64) + m.insert(and) + + or := m.allocateInstr() + or.asAluRmiR(aluRmiROpcodeOr, newOperandReg(tmp2), tmp, src64) + m.insert(or) + + cvt2 := m.allocateInstr() + cvt2.asGprToXmm(op, newOperandReg(tmp), tmpXmm, src64) + m.insert(cvt2) + + addsd := m.allocateInstr() + if dst64 { + addsd.asXmmRmR(sseOpcodeAddsd, newOperandReg(tmpXmm), tmpXmm) + } else { + addsd.asXmmRmR(sseOpcodeAddss, newOperandReg(tmpXmm), tmpXmm) + } + m.insert(addsd) + + m.insert(endTarget) + m.copyTo(tmpXmm, rd.reg()) +} + +func (m *machine) lowerVanyTrue(instr *ssa.Instruction) { + x := instr.Arg() + 
rm := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + + tmp := m.c.AllocateVReg(ssa.TypeI32) + + cmp := m.allocateInstr() + cmp.asXmmCmpRmR(sseOpcodePtest, rm, rm.reg()) + m.insert(cmp) + + setcc := m.allocateInstr() + setcc.asSetcc(condNZ, tmp) + m.insert(setcc) + + // Clear the irrelevant bits. + and := m.allocateInstr() + and.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), tmp, false) + m.insert(and) + + m.copyTo(tmp, rd) +} + +func (m *machine) lowerVallTrue(instr *ssa.Instruction) { + x, lane := instr.ArgWithLane() + var op sseOpcode + switch lane { + case ssa.VecLaneI8x16: + op = sseOpcodePcmpeqb + case ssa.VecLaneI16x8: + op = sseOpcodePcmpeqw + case ssa.VecLaneI32x4: + op = sseOpcodePcmpeqd + case ssa.VecLaneI64x2: + op = sseOpcodePcmpeqq + } + rm := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + + tmp := m.c.AllocateVReg(ssa.TypeV128) + + zeros := m.allocateInstr() + zeros.asZeros(tmp) + m.insert(zeros) + + pcmp := m.allocateInstr() + pcmp.asXmmRmR(op, rm, tmp) + m.insert(pcmp) + + test := m.allocateInstr() + test.asXmmCmpRmR(sseOpcodePtest, newOperandReg(tmp), tmp) + m.insert(test) + + tmp2 := m.c.AllocateVReg(ssa.TypeI32) + + setcc := m.allocateInstr() + setcc.asSetcc(condZ, tmp2) + m.insert(setcc) + + // Clear the irrelevant bits. 
+ and := m.allocateInstr() + and.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), tmp2, false) + m.insert(and) + + m.copyTo(tmp2, rd) +} + +func (m *machine) lowerVhighBits(instr *ssa.Instruction) { + x, lane := instr.ArgWithLane() + rm := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + switch lane { + case ssa.VecLaneI8x16: + mov := m.allocateInstr() + mov.asXmmToGpr(sseOpcodePmovmskb, rm.reg(), rd, false) + m.insert(mov) + + case ssa.VecLaneI16x8: + // When we have: + // R1 = [R1(w1), R1(w2), R1(w3), R1(w4), R1(w5), R1(w6), R1(w7), R1(w8)] + // R2 = [R2(w1), R2(w2), R2(w3), R2(w4), R2(w5), R2(w6), R2(w7), R2(w8)] + // where RX(wn) is n-th signed word (16-bit) of RX register, + // + // "PACKSSWB R1, R2" produces + // R1 = [ + // byte_sat(R1(w1)), byte_sat(R1(w2)), byte_sat(R1(w3)), byte_sat(R1(w4)), + // byte_sat(R1(w5)), byte_sat(R1(w6)), byte_sat(R1(w7)), byte_sat(R1(w8)), + // byte_sat(R2(w1)), byte_sat(R2(w2)), byte_sat(R2(w3)), byte_sat(R2(w4)), + // byte_sat(R2(w5)), byte_sat(R2(w6)), byte_sat(R2(w7)), byte_sat(R2(w8)), + // ] + // where R1 is the destination register, and + // byte_sat(w) = int8(w) if w fits as signed 8-bit, + // 0x80 (-128) if w is less than -128 + // 0x7F (127) if w is greater than 127 + // + // See https://www.felixcloutier.com/x86/packsswb:packssdw for detail. + // + // Therefore, packing the input with itself and applying PMOVMSKB yields a mask with the i-th and (i+8)-th bits set if the i-th lane is negative (for i in 0..8). + tmp := m.copyToTmp(rm.reg()) + res := m.c.AllocateVReg(ssa.TypeI32) + + pak := m.allocateInstr() + pak.asXmmRmR(sseOpcodePacksswb, rm, tmp) + m.insert(pak) + + mov := m.allocateInstr() + mov.asXmmToGpr(sseOpcodePmovmskb, tmp, res, false) + m.insert(mov) + + // Shift right by 8 to discard the duplicated low half of the mask, leaving one bit per lane. 
+ shr := m.allocateInstr() + shr.asShiftR(shiftROpShiftRightLogical, newOperandImm32(8), res, false) + m.insert(shr) + + m.copyTo(res, rd) + + case ssa.VecLaneI32x4: + mov := m.allocateInstr() + mov.asXmmToGpr(sseOpcodeMovmskps, rm.reg(), rd, true) + m.insert(mov) + + case ssa.VecLaneI64x2: + mov := m.allocateInstr() + mov.asXmmToGpr(sseOpcodeMovmskpd, rm.reg(), rd, true) + m.insert(mov) + } +} + +func (m *machine) lowerVbnot(instr *ssa.Instruction) { + x := instr.Arg() + xDef := m.c.ValueDefinition(x) + rm := m.getOperand_Reg(xDef) + rd := m.c.VRegOf(instr.Return()) + + tmp := m.copyToTmp(rm.reg()) + tmp2 := m.c.AllocateVReg(ssa.TypeV128) + + // Ensure tmp2 is considered defined by regalloc. + m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2)) + + // Set all bits on tmp register. + pak := m.allocateInstr() + pak.asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp2), tmp2) + m.insert(pak) + + // Then XOR with tmp to reverse all bits on v.register. + xor := m.allocateInstr() + xor.asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp) + m.insert(xor) + + m.copyTo(tmp, rd) +} + +func (m *machine) lowerSplat(x, ret ssa.Value, lane ssa.VecLane) { + tmpDst := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst)) + + switch lane { + case ssa.VecLaneI8x16: + tmp := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmp)) + xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, 0, xx, tmpDst)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpDst)) + case ssa.VecLaneI16x8: + xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 0, xx, tmpDst)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 1, xx, tmpDst)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, 
newOperandReg(tmpDst), tmpDst)) + case ssa.VecLaneI32x4: + xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, 0, xx, tmpDst)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst)) + case ssa.VecLaneI64x2: + xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, xx, tmpDst)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, xx, tmpDst)) + case ssa.VecLaneF32x4: + xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeInsertps, 0, xx, tmpDst)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst)) + case ssa.VecLaneF64x2: + xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, xx, tmpDst)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMovlhps, xx, tmpDst)) + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + m.copyTo(tmpDst, m.c.VRegOf(ret)) +} + +func (m *machine) lowerShuffle(x, y ssa.Value, lo, hi uint64, ret ssa.Value) { + var xMask, yMask [2]uint64 + for i := 0; i < 8; i++ { + loLane := byte(lo >> (i * 8)) + if loLane < 16 { + xMask[0] |= uint64(loLane) << (i * 8) + yMask[0] |= uint64(0x80) << (i * 8) + } else { + xMask[0] |= uint64(0x80) << (i * 8) + yMask[0] |= uint64(loLane-16) << (i * 8) + } + hiLane := byte(hi >> (i * 8)) + if hiLane < 16 { + xMask[1] |= uint64(hiLane) << (i * 8) + yMask[1] |= uint64(0x80) << (i * 8) + } else { + xMask[1] |= uint64(0x80) << (i * 8) + yMask[1] |= uint64(hiLane-16) << (i * 8) + } + } + + xmaskLabel := m.allocateLabel() + m.consts = append(m.consts, _const{lo: xMask[0], hi: xMask[1], label: xmaskLabel}) + ymaskLabel := m.allocateLabel() + m.consts = append(m.consts, _const{lo: yMask[0], hi: yMask[1], label: ymaskLabel}) + + xx, yy := m.getOperand_Reg(m.c.ValueDefinition(x)), m.getOperand_Reg(m.c.ValueDefinition(y)) + 
tmpX, tmpY := m.copyToTmp(xx.reg()), m.copyToTmp(yy.reg()) + + // Apply mask to X. + tmp := m.c.AllocateVReg(ssa.TypeV128) + loadMaskLo := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(xmaskLabel.L)), tmp) + m.insert(loadMaskLo) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpX)) + + // Apply mask to Y. + loadMaskHi := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(ymaskLabel.L)), tmp) + m.insert(loadMaskHi) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpY)) + + // Combine the results. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeOrps, newOperandReg(tmpX), tmpY)) + + m.copyTo(tmpY, m.c.VRegOf(ret)) +} + +func (m *machine) lowerVbBinOpUnaligned(op sseOpcode, x, y, ret ssa.Value) { + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rm := m.getOperand_Reg(m.c.ValueDefinition(y)) + rd := m.c.VRegOf(ret) + + tmp := m.copyToTmp(rn.reg()) + + binOp := m.allocateInstr() + binOp.asXmmRmR(op, rm, tmp) + m.insert(binOp) + + m.copyTo(tmp, rd) +} + +func (m *machine) lowerVbBinOp(op sseOpcode, x, y, ret ssa.Value) { + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rm := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + rd := m.c.VRegOf(ret) + + tmp := m.copyToTmp(rn.reg()) + + binOp := m.allocateInstr() + binOp.asXmmRmR(op, rm, tmp) + m.insert(binOp) + + m.copyTo(tmp, rd) +} + +func (m *machine) lowerVFcmp(x, y ssa.Value, c ssa.FloatCmpCond, ret ssa.Value, lane ssa.VecLane) { + var cmpOp sseOpcode + switch lane { + case ssa.VecLaneF32x4: + cmpOp = sseOpcodeCmpps + case ssa.VecLaneF64x2: + cmpOp = sseOpcodeCmppd + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + xx, yy := m.c.ValueDefinition(x), m.c.ValueDefinition(y) + var cmpImm cmpPred + switch c { + case ssa.FloatCmpCondGreaterThan: + yy, xx = xx, yy + cmpImm = cmpPredLT_OS + case ssa.FloatCmpCondGreaterThanOrEqual: + yy, xx = xx, yy + cmpImm = cmpPredLE_OS + case 
ssa.FloatCmpCondEqual: + cmpImm = cmpPredEQ_OQ + case ssa.FloatCmpCondNotEqual: + cmpImm = cmpPredNEQ_UQ + case ssa.FloatCmpCondLessThan: + cmpImm = cmpPredLT_OS + case ssa.FloatCmpCondLessThanOrEqual: + cmpImm = cmpPredLE_OS + default: + panic(fmt.Sprintf("invalid float comparison condition: %s", c)) + } + + tmp := m.c.AllocateVReg(ssa.TypeV128) + xxx := m.getOperand_Mem_Reg(xx) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, xxx, tmp)) + + rm := m.getOperand_Mem_Reg(yy) + m.insert(m.allocateInstr().asXmmRmRImm(cmpOp, byte(cmpImm), rm, tmp)) + + m.copyTo(tmp, m.c.VRegOf(ret)) +} + +func (m *machine) lowerVIcmp(x, y ssa.Value, c ssa.IntegerCmpCond, ret ssa.Value, lane ssa.VecLane) { + var eq, gt, maxu, minu, mins sseOpcode + switch lane { + case ssa.VecLaneI8x16: + eq, gt, maxu, minu, mins = sseOpcodePcmpeqb, sseOpcodePcmpgtb, sseOpcodePmaxub, sseOpcodePminub, sseOpcodePminsb + case ssa.VecLaneI16x8: + eq, gt, maxu, minu, mins = sseOpcodePcmpeqw, sseOpcodePcmpgtw, sseOpcodePmaxuw, sseOpcodePminuw, sseOpcodePminsw + case ssa.VecLaneI32x4: + eq, gt, maxu, minu, mins = sseOpcodePcmpeqd, sseOpcodePcmpgtd, sseOpcodePmaxud, sseOpcodePminud, sseOpcodePminsd + case ssa.VecLaneI64x2: + eq, gt = sseOpcodePcmpeqq, sseOpcodePcmpgtq + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + tmp := m.c.AllocateVReg(ssa.TypeV128) + var op operand + switch c { + case ssa.IntegerCmpCondSignedLessThanOrEqual: + if lane == ssa.VecLaneI64x2 { + x := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + // Copy x to tmp. + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, x, tmp)) + op = m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + } else { + y := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + // Copy y to tmp. 
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, y, tmp)) + op = m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + } + case ssa.IntegerCmpCondSignedGreaterThanOrEqual: + if lane == ssa.VecLaneI64x2 { + y := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + // Copy y to tmp. + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, y, tmp)) + op = m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + } else { + x := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + // Copy x to tmp. + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, x, tmp)) + op = m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + } + case ssa.IntegerCmpCondSignedLessThan, ssa.IntegerCmpCondUnsignedLessThan, ssa.IntegerCmpCondUnsignedLessThanOrEqual: + y := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + // Copy y to tmp. + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, y, tmp)) + op = m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + default: + x := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + // Copy x to tmp. + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, x, tmp)) + op = m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + } + + switch c { + case ssa.IntegerCmpCondEqual: + m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) + case ssa.IntegerCmpCondNotEqual: + // First we compare for equality. + m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) + // Then flip the bits. To do so, we set all bits on tmp2. + tmp2 := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2)) + m.insert(m.allocateInstr().asXmmRmR(eq, newOperandReg(tmp2), tmp2)) + // And then xor with tmp. 
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp)) + case ssa.IntegerCmpCondSignedGreaterThan, ssa.IntegerCmpCondSignedLessThan: + m.insert(m.allocateInstr().asXmmRmR(gt, op, tmp)) + case ssa.IntegerCmpCondSignedGreaterThanOrEqual, ssa.IntegerCmpCondSignedLessThanOrEqual: + if lane == ssa.VecLaneI64x2 { + m.insert(m.allocateInstr().asXmmRmR(gt, op, tmp)) + // Then flip the bits. To do so, we set all bits on tmp2. + tmp2 := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2)) + m.insert(m.allocateInstr().asXmmRmR(eq, newOperandReg(tmp2), tmp2)) + // And then xor with tmp. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp)) + } else { + // First take min of x and y. + m.insert(m.allocateInstr().asXmmRmR(mins, op, tmp)) + // Then compare for equality. + m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) + } + case ssa.IntegerCmpCondUnsignedGreaterThan, ssa.IntegerCmpCondUnsignedLessThan: + // First maxu of x and y. + m.insert(m.allocateInstr().asXmmRmR(maxu, op, tmp)) + // Then compare for equality. + m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) + // Then flip the bits. To do so, we set all bits on tmp2. + tmp2 := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2)) + m.insert(m.allocateInstr().asXmmRmR(eq, newOperandReg(tmp2), tmp2)) + // And then xor with tmp. 
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp)) + case ssa.IntegerCmpCondUnsignedGreaterThanOrEqual, ssa.IntegerCmpCondUnsignedLessThanOrEqual: + m.insert(m.allocateInstr().asXmmRmR(minu, op, tmp)) + m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) + default: + panic("BUG") + } + + m.copyTo(tmp, m.c.VRegOf(ret)) +} + +func (m *machine) lowerVbandnot(instr *ssa.Instruction, op sseOpcode) { + x, y := instr.Arg2() + xDef := m.c.ValueDefinition(x) + yDef := m.c.ValueDefinition(y) + rm, rn := m.getOperand_Reg(xDef), m.getOperand_Reg(yDef) + rd := m.c.VRegOf(instr.Return()) + + tmp := m.copyToTmp(rn.reg()) + + // pandn between rn, rm. + pand := m.allocateInstr() + pand.asXmmRmR(sseOpcodePandn, rm, tmp) + m.insert(pand) + + m.copyTo(tmp, rd) +} + +func (m *machine) lowerVbitselect(instr *ssa.Instruction) { + c, x, y := instr.SelectData() + xDef := m.c.ValueDefinition(x) + yDef := m.c.ValueDefinition(y) + rm, rn := m.getOperand_Reg(xDef), m.getOperand_Reg(yDef) + creg := m.getOperand_Reg(m.c.ValueDefinition(c)) + rd := m.c.VRegOf(instr.Return()) + + tmpC := m.copyToTmp(creg.reg()) + tmpX := m.copyToTmp(rm.reg()) + + // And between c, x (overwrites x). + pand := m.allocateInstr() + pand.asXmmRmR(sseOpcodePand, creg, tmpX) + m.insert(pand) + + // Andn between y, c (overwrites c). 
+ pandn := m.allocateInstr() + pandn.asXmmRmR(sseOpcodePandn, rn, tmpC) + m.insert(pandn) + + por := m.allocateInstr() + por.asXmmRmR(sseOpcodePor, newOperandReg(tmpC), tmpX) + m.insert(por) + + m.copyTo(tmpX, rd) +} + +func (m *machine) lowerVFmin(instr *ssa.Instruction) { + x, y, lane := instr.Arg2WithLane() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rm := m.getOperand_Reg(m.c.ValueDefinition(y)) + rd := m.c.VRegOf(instr.Return()) + + var min, cmp, andn, or, srl /* shift right logical */ sseOpcode + var shiftNumToInverseNaN uint32 + if lane == ssa.VecLaneF32x4 { + min, cmp, andn, or, srl, shiftNumToInverseNaN = sseOpcodeMinps, sseOpcodeCmpps, sseOpcodeAndnps, sseOpcodeOrps, sseOpcodePsrld, 0xa + } else { + min, cmp, andn, or, srl, shiftNumToInverseNaN = sseOpcodeMinpd, sseOpcodeCmppd, sseOpcodeAndnpd, sseOpcodeOrpd, sseOpcodePsrlq, 0xd + } + + tmp1 := m.copyToTmp(rn.reg()) + tmp2 := m.copyToTmp(rm.reg()) + + // tmp2=min(rm, rn) (tmp2 starts as a copy of rm and is overwritten) + minIns1 := m.allocateInstr() + minIns1.asXmmRmR(min, rn, tmp2) + m.insert(minIns1) + + // tmp1=min(rn, rm) (tmp1 starts as a copy of rn and is overwritten) + minIns2 := m.allocateInstr() + minIns2.asXmmRmR(min, rm, tmp1) + m.insert(minIns2) + + // tmp3:=tmp1=min(rn, rm) + tmp3 := m.copyToTmp(tmp1) + + // tmp1 = -0 if (rn == -0 || rm == -0) && rn != NaN && rm !=NaN + // NaN if rn == NaN || rm == NaN + // min(rn, rm) otherwise + orIns := m.allocateInstr() + orIns.asXmmRmR(or, newOperandReg(tmp2), tmp1) + m.insert(orIns) + + // tmp3 is originally min(rn,rm). 
+ // tmp3 = 0^ (set all bits) if rn == NaN || rm == NaN + // 0 otherwise + cmpIns := m.allocateInstr() + cmpIns.asXmmRmRImm(cmp, uint8(cmpPredUNORD_Q), newOperandReg(tmp2), tmp3) + m.insert(cmpIns) + + // tmp1 = -0 if (rn == -0 || rm == -0) && rn != NaN && rm !=NaN + // ^0 if rn == NaN || rm == NaN + // min(v1, v2) otherwise + orIns2 := m.allocateInstr() + orIns2.asXmmRmR(or, newOperandReg(tmp3), tmp1) + m.insert(orIns2) + + // tmp3 = set all bits on the mantissa bits + // 0 otherwise + shift := m.allocateInstr() + shift.asXmmRmiReg(srl, newOperandImm32(shiftNumToInverseNaN), tmp3) + m.insert(shift) + + // tmp3 = tmp1 and !tmp3 + // = -0 if (rn == -0 || rm == -0) && rn != NaN && rm !=NaN + // set all bits on exponential and sign bit (== NaN) if rn == NaN || rm == NaN + // min(rn, rm) otherwise + andnIns := m.allocateInstr() + andnIns.asXmmRmR(andn, newOperandReg(tmp1), tmp3) + m.insert(andnIns) + + m.copyTo(tmp3, rd) +} + +func (m *machine) lowerVFmax(instr *ssa.Instruction) { + x, y, lane := instr.Arg2WithLane() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rm := m.getOperand_Reg(m.c.ValueDefinition(y)) + rd := m.c.VRegOf(instr.Return()) + + var max, cmp, andn, or, xor, sub, srl /* shift right logical */ sseOpcode + var shiftNumToInverseNaN uint32 + if lane == ssa.VecLaneF32x4 { + max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = sseOpcodeMaxps, sseOpcodeCmpps, sseOpcodeAndnps, sseOpcodeOrps, sseOpcodeXorps, sseOpcodeSubps, sseOpcodePsrld, 0xa + } else { + max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = sseOpcodeMaxpd, sseOpcodeCmppd, sseOpcodeAndnpd, sseOpcodeOrpd, sseOpcodeXorpd, sseOpcodeSubpd, sseOpcodePsrlq, 0xd + } + + tmp0 := m.copyToTmp(rm.reg()) + tmp1 := m.copyToTmp(rn.reg()) + + // tmp0=max(rn, rm) + maxIns1 := m.allocateInstr() + maxIns1.asXmmRmR(max, rn, tmp0) + m.insert(maxIns1) + + // tmp1=max(rm, rn) + maxIns2 := m.allocateInstr() + maxIns2.asXmmRmR(max, rm, tmp1) + m.insert(maxIns2) + + // tmp2=max(rm, rn) + tmp2 := 
m.copyToTmp(tmp1) + + // tmp2 = -0 if (rn == -0 && rm == 0) || (rn == 0 && rm == -0) + // 0 if (rn == 0 && rm == 0) + // -0 if (rn == -0 && rm == -0) + // v1^v2 if rn == NaN || rm == NaN + // 0 otherwise + xorInstr := m.allocateInstr() + xorInstr.asXmmRmR(xor, newOperandReg(tmp0), tmp2) + m.insert(xorInstr) + // tmp1 = -0 if (rn == -0 && rm == 0) || (rn == 0 && rm == -0) + // 0 if (rn == 0 && rm == 0) + // -0 if (rn == -0 && rm == -0) + // NaN if rn == NaN || rm == NaN + // max(v1, v2) otherwise + orInstr := m.allocateInstr() + orInstr.asXmmRmR(or, newOperandReg(tmp2), tmp1) + m.insert(orInstr) + + tmp3 := m.copyToTmp(tmp1) + + // tmp3 = 0 if (rn == -0 && rm == 0) || (rn == 0 && rm == -0) || (rn == 0 && rm == 0) + // -0 if (rn == -0 && rm == -0) + // NaN if rn == NaN || rm == NaN + // max(v1, v2) otherwise + // + // Note: -0 - (-0) = 0 (!= -0) in floating point operation. + subIns := m.allocateInstr() + subIns.asXmmRmR(sub, newOperandReg(tmp2), tmp3) + m.insert(subIns) + + // tmp1 = 0^ if rn == NaN || rm == NaN + cmpIns := m.allocateInstr() + cmpIns.asXmmRmRImm(cmp, uint8(cmpPredUNORD_Q), newOperandReg(tmp1), tmp1) + m.insert(cmpIns) + + // tmp1 = set all bits on the mantissa bits + // 0 otherwise + shift := m.allocateInstr() + shift.asXmmRmiReg(srl, newOperandImm32(shiftNumToInverseNaN), tmp1) + m.insert(shift) + + andnIns := m.allocateInstr() + andnIns.asXmmRmR(andn, newOperandReg(tmp3), tmp1) + m.insert(andnIns) + + m.copyTo(tmp1, rd) +} + +func (m *machine) lowerVFabs(instr *ssa.Instruction) { + x, lane := instr.ArgWithLane() + rm := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + + tmp := m.c.AllocateVReg(ssa.TypeV128) + + def := m.allocateInstr() + def.asDefineUninitializedReg(tmp) + m.insert(def) + + // Set all bits on tmp. 
+ pcmp := m.allocateInstr() + pcmp.asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp), tmp) + m.insert(pcmp) + + switch lane { + case ssa.VecLaneF32x4: + // Shift each 32-bit lane of the all-ones mask right by 1 to clear its sign bit. + shift := m.allocateInstr() + shift.asXmmRmiReg(sseOpcodePsrld, newOperandImm32(1), tmp) + m.insert(shift) + // AND rm into the mask, clearing the sign bits; the result stays in tmp. NOTE(review): ANDPD here and ANDPS in the F64x2 case look swapped relative to the lane type - confirm this is intentional. + andp := m.allocateInstr() + andp.asXmmRmR(sseOpcodeAndpd, rm, tmp) + m.insert(andp) + case ssa.VecLaneF64x2: + // Shift each 64-bit lane of the all-ones mask right by 1 to clear its sign bit. + shift := m.allocateInstr() + shift.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(1), tmp) + m.insert(shift) + // AND rm into the mask, clearing the sign bits; the result stays in tmp. + andp := m.allocateInstr() + andp.asXmmRmR(sseOpcodeAndps, rm, tmp) + m.insert(andp) + } + + m.copyTo(tmp, rd) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_pro_epi_logue.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_pro_epi_logue.go new file mode 100644 index 000000000..8fa974c66 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_pro_epi_logue.go @@ -0,0 +1,304 @@ +package amd64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" +) + +// PostRegAlloc implements backend.Machine. +func (m *machine) PostRegAlloc() { + m.setupPrologue() + m.postRegAlloc() +} + +func (m *machine) setupPrologue() { + cur := m.ectx.RootInstr + prevInitInst := cur.next + + // At this point, we have the stack layout as follows: + // + // (high address) + // +-----------------+ <----- RBP (somewhere in the middle of the stack) + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... 
| + // | arg 1 | + // | arg 0 | + // | Return Addr | + // RSP ----> +-----------------+ + // (low address) + + // First, we push the RBP, and update the RBP to the current RSP. + // + // (high address) (high address) + // RBP ----> +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | ====> | ....... | + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | Return Addr | | Return Addr | + // RSP ----> +-----------------+ | Caller_RBP | + // (low address) +-----------------+ <----- RSP, RBP + // + cur = m.setupRBPRSP(cur) + + if !m.stackBoundsCheckDisabled { + cur = m.insertStackBoundsCheck(m.requiredStackSize(), cur) + } + + // + // (high address) + // +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | | ....... | + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | xxxxx | | xxxxx | + // | Return Addr | | Return Addr | + // | Caller_RBP | ====> | Caller_RBP | + // RBP,RSP->+-----------------+ +-----------------+ <----- RBP + // (low address) | clobbered M | + // | clobbered 1 | + // | ........... | + // | clobbered 0 | + // +-----------------+ <----- RSP + // + if regs := m.clobberedRegs; len(regs) > 0 { + for i := range regs { + r := regs[len(regs)-1-i] // Reverse order. + if r.RegType() == regalloc.RegTypeInt { + cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(r))) + } else { + // Push the XMM register is not supported by the PUSH instruction. + cur = m.addRSP(-16, cur) + push := m.allocateInstr().asXmmMovRM( + sseOpcodeMovdqu, r, newOperandMem(m.newAmodeImmReg(0, rspVReg)), + ) + cur = linkInstr(cur, push) + } + } + } + + if size := m.spillSlotSize; size > 0 { + // Simply decrease the RSP to allocate the spill slots. 
+ // sub $size, %rsp + cur = linkInstr(cur, m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(uint32(size)), rspVReg, true)) + + // At this point, we have the stack layout as follows: + // + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | ReturnAddress | + // | Caller_RBP | + // +-----------------+ <--- RBP + // | clobbered M | + // | ............ | + // | clobbered 1 | + // | clobbered 0 | + // | spill slot N | + // | ............ | + // | spill slot 0 | + // +-----------------+ <--- RSP + // (low address) + } + + linkInstr(cur, prevInitInst) +} + +// postRegAlloc does multiple things while walking through the instructions: +// 1. Inserts the epilogue code. +// 2. Removes the redundant copy instruction. +// 3. Inserts the dec/inc RSP instruction right before/after the call instruction. +// 4. Lowering that is supposed to be done after regalloc. +func (m *machine) postRegAlloc() { + ectx := m.ectx + for cur := ectx.RootInstr; cur != nil; cur = cur.next { + switch k := cur.kind; k { + case ret: + m.setupEpilogueAfter(cur.prev) + continue + case fcvtToSintSequence, fcvtToUintSequence: + m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0] + if k == fcvtToSintSequence { + m.lowerFcvtToSintSequenceAfterRegalloc(cur) + } else { + m.lowerFcvtToUintSequenceAfterRegalloc(cur) + } + prev := cur.prev + next := cur.next + cur := prev + for _, instr := range m.ectx.PendingInstructions { + cur = linkInstr(cur, instr) + } + linkInstr(cur, next) + continue + case xmmCMov: + m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0] + m.lowerXmmCmovAfterRegAlloc(cur) + prev := cur.prev + next := cur.next + cur := prev + for _, instr := range m.ectx.PendingInstructions { + cur = linkInstr(cur, instr) + } + linkInstr(cur, next) + continue + case idivRemSequence: + m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0] + 
m.lowerIDivRemSequenceAfterRegAlloc(cur) + prev := cur.prev + next := cur.next + cur := prev + for _, instr := range m.ectx.PendingInstructions { + cur = linkInstr(cur, instr) + } + linkInstr(cur, next) + continue + case call, callIndirect: + // At this point, reg alloc is done, therefore we can safely insert dec/inc RPS instruction + // right before/after the call instruction. If this is done before reg alloc, the stack slot + // can point to the wrong location and therefore results in a wrong value. + call := cur + next := call.next + _, _, _, _, size := backend.ABIInfoFromUint64(call.u2) + if size > 0 { + dec := m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(size), rspVReg, true) + linkInstr(call.prev, dec) + linkInstr(dec, call) + inc := m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(size), rspVReg, true) + linkInstr(call, inc) + linkInstr(inc, next) + } + continue + } + + // Removes the redundant copy instruction. + if cur.IsCopy() && cur.op1.reg().RealReg() == cur.op2.reg().RealReg() { + prev, next := cur.prev, cur.next + // Remove the copy instruction. + prev.next = next + if next != nil { + next.prev = prev + } + } + } +} + +func (m *machine) setupEpilogueAfter(cur *instruction) { + prevNext := cur.next + + // At this point, we have the stack layout as follows: + // + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | ReturnAddress | + // | Caller_RBP | + // +-----------------+ <--- RBP + // | clobbered M | + // | ............ | + // | clobbered 1 | + // | clobbered 0 | + // | spill slot N | + // | ............ | + // | spill slot 0 | + // +-----------------+ <--- RSP + // (low address) + + if size := m.spillSlotSize; size > 0 { + // Simply increase the RSP to free the spill slots. 
+ // add $size, %rsp + cur = linkInstr(cur, m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(uint32(size)), rspVReg, true)) + } + + // + // (high address) + // +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | | ....... | + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | ReturnAddress | | ReturnAddress | + // | Caller_RBP | | Caller_RBP | + // RBP ---> +-----------------+ ========> +-----------------+ <---- RSP, RBP + // | clobbered M | + // | ............ | + // | clobbered 1 | + // | clobbered 0 | + // RSP ---> +-----------------+ + // (low address) + // + if regs := m.clobberedRegs; len(regs) > 0 { + for _, r := range regs { + if r.RegType() == regalloc.RegTypeInt { + cur = linkInstr(cur, m.allocateInstr().asPop64(r)) + } else { + // Pop the XMM register is not supported by the POP instruction. + pop := m.allocateInstr().asXmmUnaryRmR( + sseOpcodeMovdqu, newOperandMem(m.newAmodeImmReg(0, rspVReg)), r, + ) + cur = linkInstr(cur, pop) + cur = m.addRSP(16, cur) + } + } + } + + // Now roll back the RSP to RBP, and pop the caller's RBP. 
+ cur = m.revertRBPRSP(cur) + + linkInstr(cur, prevNext) +} + +func (m *machine) addRSP(offset int32, cur *instruction) *instruction { + if offset == 0 { + return cur + } + opcode := aluRmiROpcodeAdd + if offset < 0 { + opcode = aluRmiROpcodeSub + offset = -offset + } + return linkInstr(cur, m.allocateInstr().asAluRmiR(opcode, newOperandImm32(uint32(offset)), rspVReg, true)) +} + +func (m *machine) setupRBPRSP(cur *instruction) *instruction { + cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(rbpVReg))) + cur = linkInstr(cur, m.allocateInstr().asMovRR(rspVReg, rbpVReg, true)) + return cur +} + +func (m *machine) revertRBPRSP(cur *instruction) *instruction { + cur = linkInstr(cur, m.allocateInstr().asMovRR(rbpVReg, rspVReg, true)) + cur = linkInstr(cur, m.allocateInstr().asPop64(rbpVReg)) + return cur +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_regalloc.go new file mode 100644 index 000000000..0bb28ee9e --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_regalloc.go @@ -0,0 +1,153 @@ +package amd64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// InsertMoveBefore implements backend.RegAllocFunctionMachine. 
+func (m *machine) InsertMoveBefore(dst, src regalloc.VReg, instr *instruction) { + typ := src.RegType() + if typ != dst.RegType() { + panic("BUG: src and dst must have the same type") + } + + mov := m.allocateInstr() + if typ == regalloc.RegTypeInt { + mov.asMovRR(src, dst, true) + } else { + mov.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(src), dst) + } + + cur := instr.prev + prevNext := cur.next + cur = linkInstr(cur, mov) + linkInstr(cur, prevNext) +} + +// InsertStoreRegisterAt implements backend.RegAllocFunctionMachine. +func (m *machine) InsertStoreRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction { + if !v.IsRealReg() { + panic("BUG: VReg must be backed by real reg to be stored") + } + + typ := m.c.TypeOf(v) + + var prevNext, cur *instruction + if after { + cur, prevNext = instr, instr.next + } else { + cur, prevNext = instr.prev, instr + } + + offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size()) + store := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmReg(uint32(offsetFromSP), rspVReg)) + switch typ { + case ssa.TypeI32: + store.asMovRM(v, mem, 4) + case ssa.TypeI64: + store.asMovRM(v, mem, 8) + case ssa.TypeF32: + store.asXmmMovRM(sseOpcodeMovss, v, mem) + case ssa.TypeF64: + store.asXmmMovRM(sseOpcodeMovsd, v, mem) + case ssa.TypeV128: + store.asXmmMovRM(sseOpcodeMovdqu, v, mem) + } + + cur = linkInstr(cur, store) + return linkInstr(cur, prevNext) +} + +// InsertReloadRegisterAt implements backend.RegAllocFunctionMachine. +func (m *machine) InsertReloadRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction { + if !v.IsRealReg() { + panic("BUG: VReg must be backed by real reg to be stored") + } + + typ := m.c.TypeOf(v) + var prevNext, cur *instruction + if after { + cur, prevNext = instr, instr.next + } else { + cur, prevNext = instr.prev, instr + } + + // Load the value to the temporary. 
+ load := m.allocateInstr() + offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size()) + a := newOperandMem(m.newAmodeImmReg(uint32(offsetFromSP), rspVReg)) + switch typ { + case ssa.TypeI32: + load.asMovzxRmR(extModeLQ, a, v) + case ssa.TypeI64: + load.asMov64MR(a, v) + case ssa.TypeF32: + load.asXmmUnaryRmR(sseOpcodeMovss, a, v) + case ssa.TypeF64: + load.asXmmUnaryRmR(sseOpcodeMovsd, a, v) + case ssa.TypeV128: + load.asXmmUnaryRmR(sseOpcodeMovdqu, a, v) + default: + panic("BUG") + } + + cur = linkInstr(cur, load) + return linkInstr(cur, prevNext) +} + +// ClobberedRegisters implements backend.RegAllocFunctionMachine. +func (m *machine) ClobberedRegisters(regs []regalloc.VReg) { + m.clobberedRegs = append(m.clobberedRegs[:0], regs...) +} + +// Swap implements backend.RegAllocFunctionMachine. +func (m *machine) Swap(cur *instruction, x1, x2, tmp regalloc.VReg) { + if x1.RegType() == regalloc.RegTypeInt { + prevNext := cur.next + xc := m.allocateInstr().asXCHG(x1, newOperandReg(x2), 8) + cur = linkInstr(cur, xc) + linkInstr(cur, prevNext) + } else { + if tmp.Valid() { + prevNext := cur.next + m.InsertMoveBefore(tmp, x1, prevNext) + m.InsertMoveBefore(x1, x2, prevNext) + m.InsertMoveBefore(x2, tmp, prevNext) + } else { + prevNext := cur.next + r2 := x2.RealReg() + // Temporarily spill x1 to stack. + cur = m.InsertStoreRegisterAt(x1, cur, true).prev + // Then move x2 to x1. + cur = linkInstr(cur, m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqa, newOperandReg(x2), x1)) + linkInstr(cur, prevNext) + // Then reload the original value on x1 from stack to r2. + m.InsertReloadRegisterAt(x1.SetRealReg(r2), cur, true) + } + } +} + +// LastInstrForInsertion implements backend.RegAllocFunctionMachine. 
+func (m *machine) LastInstrForInsertion(begin, end *instruction) *instruction { + cur := end + for cur.kind == nop0 { + cur = cur.prev + if cur == begin { + return end + } + } + switch cur.kind { + case jmp: + return cur + default: + return end + } +} + +// SSABlockLabel implements backend.RegAllocFunctionMachine. +func (m *machine) SSABlockLabel(id ssa.BasicBlockID) backend.Label { + return m.ectx.SsaBlockIDToLabels[id] +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_vec.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_vec.go new file mode 100644 index 000000000..539a8b754 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_vec.go @@ -0,0 +1,992 @@ +package amd64 + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +var swizzleMask = [16]byte{ + 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, + 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, +} + +func (m *machine) lowerSwizzle(x, y ssa.Value, ret ssa.Value) { + masklabel := m.getOrAllocateConstLabel(&m.constSwizzleMaskConstIndex, swizzleMask[:]) + + // Load mask to maskReg. + maskReg := m.c.AllocateVReg(ssa.TypeV128) + loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(masklabel)), maskReg) + m.insert(loadMask) + + // Copy x and y to tmp registers. + xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + tmpDst := m.copyToTmp(xx.reg()) + yy := m.getOperand_Reg(m.c.ValueDefinition(y)) + tmpX := m.copyToTmp(yy.reg()) + + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddusb, newOperandReg(maskReg), tmpX)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmpX), tmpDst)) + + // Copy the result to the destination register. 
+ m.copyTo(tmpDst, m.c.VRegOf(ret)) +} + +func (m *machine) lowerInsertLane(x, y ssa.Value, index byte, ret ssa.Value, lane ssa.VecLane) { + // Copy x to tmp. + tmpDst := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, m.getOperand_Mem_Reg(m.c.ValueDefinition(x)), tmpDst)) + + yy := m.getOperand_Reg(m.c.ValueDefinition(y)) + switch lane { + case ssa.VecLaneI8x16: + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, index, yy, tmpDst)) + case ssa.VecLaneI16x8: + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, index, yy, tmpDst)) + case ssa.VecLaneI32x4: + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, index, yy, tmpDst)) + case ssa.VecLaneI64x2: + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, index, yy, tmpDst)) + case ssa.VecLaneF32x4: + // In INSERTPS instruction, the destination index is encoded at 4 and 5 bits of the argument. + // See https://www.felixcloutier.com/x86/insertps + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeInsertps, index<<4, yy, tmpDst)) + case ssa.VecLaneF64x2: + if index == 0 { + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, yy, tmpDst)) + } else { + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMovlhps, yy, tmpDst)) + } + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + m.copyTo(tmpDst, m.c.VRegOf(ret)) +} + +func (m *machine) lowerExtractLane(x ssa.Value, index byte, signed bool, ret ssa.Value, lane ssa.VecLane) { + // Pextr variants are used to extract a lane from a vector register. 
+ xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + + tmpDst := m.c.AllocateVReg(ret.Type()) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst)) + switch lane { + case ssa.VecLaneI8x16: + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrb, index, xx, tmpDst)) + if signed { + m.insert(m.allocateInstr().asMovsxRmR(extModeBL, newOperandReg(tmpDst), tmpDst)) + } else { + m.insert(m.allocateInstr().asMovzxRmR(extModeBL, newOperandReg(tmpDst), tmpDst)) + } + case ssa.VecLaneI16x8: + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrw, index, xx, tmpDst)) + if signed { + m.insert(m.allocateInstr().asMovsxRmR(extModeWL, newOperandReg(tmpDst), tmpDst)) + } else { + m.insert(m.allocateInstr().asMovzxRmR(extModeWL, newOperandReg(tmpDst), tmpDst)) + } + case ssa.VecLaneI32x4: + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrd, index, xx, tmpDst)) + case ssa.VecLaneI64x2: + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, index, xx, tmpDst)) + case ssa.VecLaneF32x4: + if index == 0 { + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovss, xx, tmpDst)) + } else { + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, index, xx, tmpDst)) + } + case ssa.VecLaneF64x2: + if index == 0 { + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, xx, tmpDst)) + } else { + m.copyTo(xx.reg(), tmpDst) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0b00_00_11_10, newOperandReg(tmpDst), tmpDst)) + } + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + m.copyTo(tmpDst, m.c.VRegOf(ret)) +} + +var sqmulRoundSat = [16]byte{ + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, +} + +func (m *machine) lowerSqmulRoundSat(x, y, ret ssa.Value) { + // See https://github.com/WebAssembly/simd/pull/365 for the following logic. 
+ maskLabel := m.getOrAllocateConstLabel(&m.constSqmulRoundSatIndex, sqmulRoundSat[:]) + + tmp := m.c.AllocateVReg(ssa.TypeV128) + loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), tmp) + m.insert(loadMask) + + xx, yy := m.getOperand_Reg(m.c.ValueDefinition(x)), m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + tmpX := m.copyToTmp(xx.reg()) + + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmulhrsw, yy, tmpX)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmpX), tmp)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmpX)) + + m.copyTo(tmpX, m.c.VRegOf(ret)) +} + +func (m *machine) lowerVUshr(x, y, ret ssa.Value, lane ssa.VecLane) { + switch lane { + case ssa.VecLaneI8x16: + m.lowerVUshri8x16(x, y, ret) + case ssa.VecLaneI16x8, ssa.VecLaneI32x4, ssa.VecLaneI64x2: + m.lowerShr(x, y, ret, lane, false) + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } +} + +// i8x16LogicalSHRMaskTable is necessary for emulating non-existent packed bytes logical right shifts on amd64. +// The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits. +var i8x16LogicalSHRMaskTable = [8 * 16]byte{ // (the number of possible shift amount 0, 1, ..., 7.) * 16 bytes. 
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // for 1 shift + 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, // for 2 shift + 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, // for 3 shift + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, // for 4 shift + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, // for 5 shift + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, // for 6 shift + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // for 7 shift +} + +func (m *machine) lowerVUshri8x16(x, y, ret ssa.Value) { + tmpGpReg := m.c.AllocateVReg(ssa.TypeI32) + // Load the modulo 8 mask to tmpReg. + m.lowerIconst(tmpGpReg, 0x7, false) + // Take the modulo 8 of the shift amount. + shiftAmt := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)) + m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, shiftAmt, tmpGpReg, false)) + + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xx := m.copyToTmp(_xx.reg()) + + vecTmp := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), vecTmp, false)) + m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrlw, newOperandReg(vecTmp), xx)) + + maskTableLabel := m.getOrAllocateConstLabel(&m.constI8x16LogicalSHRMaskTableIndex, i8x16LogicalSHRMaskTable[:]) + base := m.c.AllocateVReg(ssa.TypeI64) + lea := m.allocateInstr().asLEA(newOperandLabel(maskTableLabel), base) + m.insert(lea) + + // Shift tmpGpReg by 4 to multiply the shift amount by 16. 
+ m.insert(m.allocateInstr().asShiftR(shiftROpShiftLeft, newOperandImm32(4), tmpGpReg, false)) + + mem := m.newAmodeRegRegShift(0, base, tmpGpReg, 0) + loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(mem), vecTmp) + m.insert(loadMask) + + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePand, newOperandReg(vecTmp), xx)) + m.copyTo(xx, m.c.VRegOf(ret)) +} + +func (m *machine) lowerVSshr(x, y, ret ssa.Value, lane ssa.VecLane) { + switch lane { + case ssa.VecLaneI8x16: + m.lowerVSshri8x16(x, y, ret) + case ssa.VecLaneI16x8, ssa.VecLaneI32x4: + m.lowerShr(x, y, ret, lane, true) + case ssa.VecLaneI64x2: + m.lowerVSshri64x2(x, y, ret) + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } +} + +func (m *machine) lowerVSshri8x16(x, y, ret ssa.Value) { + shiftAmtReg := m.c.AllocateVReg(ssa.TypeI32) + // Load the modulo 8 mask to tmpReg. + m.lowerIconst(shiftAmtReg, 0x7, false) + // Take the modulo 8 of the shift amount. + shiftAmt := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)) + m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, shiftAmt, shiftAmtReg, false)) + + // Copy the x value to two temporary registers. + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xx := m.copyToTmp(_xx.reg()) + vecTmp := m.c.AllocateVReg(ssa.TypeV128) + m.copyTo(xx, vecTmp) + + // Assuming that we have + // xx = [b1, ..., b16] + // vecTmp = [b1, ..., b16] + // at this point, then we use PUNPCKLBW and PUNPCKHBW to produce: + // xx = [b1, b1, b2, b2, ..., b8, b8] + // vecTmp = [b9, b9, b10, b10, ..., b16, b16] + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePunpcklbw, newOperandReg(xx), xx)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePunpckhbw, newOperandReg(vecTmp), vecTmp)) + + // Adding 8 to the shift amount, and then move the amount to vecTmp2. 
+ vecTmp2 := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(8), shiftAmtReg, false)) + m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(shiftAmtReg), vecTmp2, false)) + + // Perform the word packed arithmetic right shifts on vreg and vecTmp. + // This changes these two registers as: + // xx = [xxx, b1 >> s, xxx, b2 >> s, ..., xxx, b8 >> s] + // vecTmp = [xxx, b9 >> s, xxx, b10 >> s, ..., xxx, b16 >> s] + // where xxx is 1 or 0 depending on each byte's sign, and ">>" is the arithmetic shift on a byte. + m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsraw, newOperandReg(vecTmp2), xx)) + m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsraw, newOperandReg(vecTmp2), vecTmp)) + + // Finally, we can get the result by packing these two word vectors. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePacksswb, newOperandReg(vecTmp), xx)) + + m.copyTo(xx, m.c.VRegOf(ret)) +} + +func (m *machine) lowerVSshri64x2(x, y, ret ssa.Value) { + // Load the shift amount to RCX. 
+ shiftAmt := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, shiftAmt, rcxVReg)) + + tmpGp := m.c.AllocateVReg(ssa.TypeI64) + + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xxReg := m.copyToTmp(_xx.reg()) + + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, 0, newOperandReg(xxReg), tmpGp)) + m.insert(m.allocateInstr().asShiftR(shiftROpShiftRightArithmetic, newOperandReg(rcxVReg), tmpGp, true)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, newOperandReg(tmpGp), xxReg)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, 1, newOperandReg(xxReg), tmpGp)) + m.insert(m.allocateInstr().asShiftR(shiftROpShiftRightArithmetic, newOperandReg(rcxVReg), tmpGp, true)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, newOperandReg(tmpGp), xxReg)) + + m.copyTo(xxReg, m.c.VRegOf(ret)) +} + +func (m *machine) lowerShr(x, y, ret ssa.Value, lane ssa.VecLane, signed bool) { + var modulo uint64 + var shiftOp sseOpcode + switch lane { + case ssa.VecLaneI16x8: + modulo = 0xf + if signed { + shiftOp = sseOpcodePsraw + } else { + shiftOp = sseOpcodePsrlw + } + case ssa.VecLaneI32x4: + modulo = 0x1f + if signed { + shiftOp = sseOpcodePsrad + } else { + shiftOp = sseOpcodePsrld + } + case ssa.VecLaneI64x2: + modulo = 0x3f + if signed { + panic("BUG") + } + shiftOp = sseOpcodePsrlq + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xx := m.copyToTmp(_xx.reg()) + + tmpGpReg := m.c.AllocateVReg(ssa.TypeI32) + // Load the modulo 8 mask to tmpReg. + m.lowerIconst(tmpGpReg, modulo, false) + // Take the modulo 8 of the shift amount. + m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, + m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)), tmpGpReg, false)) + // And move it to a xmm register. 
+ tmpVec := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), tmpVec, false)) + + // Then do the actual shift. + m.insert(m.allocateInstr().asXmmRmiReg(shiftOp, newOperandReg(tmpVec), xx)) + + m.copyTo(xx, m.c.VRegOf(ret)) +} + +func (m *machine) lowerVIshl(x, y, ret ssa.Value, lane ssa.VecLane) { + var modulo uint64 + var shiftOp sseOpcode + var isI8x16 bool + switch lane { + case ssa.VecLaneI8x16: + isI8x16 = true + modulo = 0x7 + shiftOp = sseOpcodePsllw + case ssa.VecLaneI16x8: + modulo = 0xf + shiftOp = sseOpcodePsllw + case ssa.VecLaneI32x4: + modulo = 0x1f + shiftOp = sseOpcodePslld + case ssa.VecLaneI64x2: + modulo = 0x3f + shiftOp = sseOpcodePsllq + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xx := m.copyToTmp(_xx.reg()) + + tmpGpReg := m.c.AllocateVReg(ssa.TypeI32) + // Load the modulo 8 mask to tmpReg. + m.lowerIconst(tmpGpReg, modulo, false) + // Take the modulo 8 of the shift amount. + m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, + m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)), tmpGpReg, false)) + // And move it to a xmm register. + tmpVec := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), tmpVec, false)) + + // Then do the actual shift. + m.insert(m.allocateInstr().asXmmRmiReg(shiftOp, newOperandReg(tmpVec), xx)) + + if isI8x16 { + maskTableLabel := m.getOrAllocateConstLabel(&m.constI8x16SHLMaskTableIndex, i8x16SHLMaskTable[:]) + base := m.c.AllocateVReg(ssa.TypeI64) + lea := m.allocateInstr().asLEA(newOperandLabel(maskTableLabel), base) + m.insert(lea) + + // Shift tmpGpReg by 4 to multiply the shift amount by 16. 
+ m.insert(m.allocateInstr().asShiftR(shiftROpShiftLeft, newOperandImm32(4), tmpGpReg, false)) + + mem := m.newAmodeRegRegShift(0, base, tmpGpReg, 0) + loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(mem), tmpVec) + m.insert(loadMask) + + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePand, newOperandReg(tmpVec), xx)) + } + + m.copyTo(xx, m.c.VRegOf(ret)) +} + +// i8x16SHLMaskTable is necessary for emulating non-existent packed bytes left shifts on amd64. +// The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits. +var i8x16SHLMaskTable = [8 * 16]byte{ // (the number of possible shift amount 0, 1, ..., 7.) * 16 bytes. + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, // for 1 shift + 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, // for 2 shift + 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, // for 3 shift + 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // for 4 shift + 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, // for 5 shift + 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, // for 6 shift + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, // for 7 shift +} + +func (m *machine) lowerVRound(x, ret ssa.Value, imm byte, _64 bool) { + xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + var round sseOpcode + if _64 { + round = sseOpcodeRoundpd + } else { + round = sseOpcodeRoundps + } + m.insert(m.allocateInstr().asXmmUnaryRmRImm(round, imm, xx, m.c.VRegOf(ret))) +} + +var ( + allOnesI8x16 = [16]byte{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 
0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1} + allOnesI16x8 = [16]byte{0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0} + extAddPairwiseI16x8uMask1 = [16]byte{0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80} + extAddPairwiseI16x8uMask2 = [16]byte{0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00} +) + +func (m *machine) lowerExtIaddPairwise(x, ret ssa.Value, srcLane ssa.VecLane, signed bool) { + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xx := m.copyToTmp(_xx.reg()) + switch srcLane { + case ssa.VecLaneI8x16: + allOneReg := m.c.AllocateVReg(ssa.TypeV128) + mask := m.getOrAllocateConstLabel(&m.constAllOnesI8x16Index, allOnesI8x16[:]) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), allOneReg)) + + var resultReg regalloc.VReg + if signed { + resultReg = allOneReg + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddubsw, newOperandReg(xx), resultReg)) + } else { + // Interpreter tmp (all ones) as signed byte meaning that all the multiply-add is unsigned. + resultReg = xx + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddubsw, newOperandReg(allOneReg), resultReg)) + } + m.copyTo(resultReg, m.c.VRegOf(ret)) + + case ssa.VecLaneI16x8: + if signed { + allOnesReg := m.c.AllocateVReg(ssa.TypeV128) + mask := m.getOrAllocateConstLabel(&m.constAllOnesI16x8Index, allOnesI16x8[:]) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), allOnesReg)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, newOperandReg(allOnesReg), xx)) + m.copyTo(xx, m.c.VRegOf(ret)) + } else { + maskReg := m.c.AllocateVReg(ssa.TypeV128) + mask := m.getOrAllocateConstLabel(&m.constExtAddPairwiseI16x8uMask1Index, extAddPairwiseI16x8uMask1[:]) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg)) + + // Flip the sign bits on xx. 
+ // + // Assuming that xx = [w1, ..., w8], now we have, + // xx[i] = int8(-w1) for i = 0...8 + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(maskReg), xx)) + + mask = m.getOrAllocateConstLabel(&m.constAllOnesI16x8Index, allOnesI16x8[:]) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg)) + + // For i = 0,..4 (as this results in i32x4 lanes), now we have + // xx[i] = int32(-wn + -w(n+1)) = int32(-(wn + w(n+1))) + // c.assembler.CompileRegisterToRegister(amd64.PMADDWD, tmp, vr) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, newOperandReg(maskReg), xx)) + + mask = m.getOrAllocateConstLabel(&m.constExtAddPairwiseI16x8uMask2Index, extAddPairwiseI16x8uMask2[:]) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg)) + + // vr[i] = int32(-(wn + w(n+1))) + int32(math.MaxInt16+1) = int32((wn + w(n+1))) = uint32(wn + w(n+1)). + // c.assembler.CompileRegisterToRegister(amd64.PADDD, tmp, vr) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddd, newOperandReg(maskReg), xx)) + + m.copyTo(xx, m.c.VRegOf(ret)) + } + default: + panic(fmt.Sprintf("invalid lane type: %s", srcLane)) + } +} + +func (m *machine) lowerWidenLow(x, ret ssa.Value, lane ssa.VecLane, signed bool) { + var sseOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + if signed { + sseOp = sseOpcodePmovsxbw + } else { + sseOp = sseOpcodePmovzxbw + } + case ssa.VecLaneI16x8: + if signed { + sseOp = sseOpcodePmovsxwd + } else { + sseOp = sseOpcodePmovzxwd + } + case ssa.VecLaneI32x4: + if signed { + sseOp = sseOpcodePmovsxdq + } else { + sseOp = sseOpcodePmovzxdq + } + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, xx, m.c.VRegOf(ret))) +} + +func (m *machine) lowerWidenHigh(x, ret ssa.Value, lane ssa.VecLane, signed bool) { + tmp := 
m.c.AllocateVReg(ssa.TypeV128) + xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + m.copyTo(xx.reg(), tmp) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePalignr, 8, newOperandReg(tmp), tmp)) + + var sseOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + if signed { + sseOp = sseOpcodePmovsxbw + } else { + sseOp = sseOpcodePmovzxbw + } + case ssa.VecLaneI16x8: + if signed { + sseOp = sseOpcodePmovsxwd + } else { + sseOp = sseOpcodePmovzxwd + } + case ssa.VecLaneI32x4: + if signed { + sseOp = sseOpcodePmovsxdq + } else { + sseOp = sseOpcodePmovzxdq + } + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, newOperandReg(tmp), m.c.VRegOf(ret))) +} + +func (m *machine) lowerLoadSplat(ptr ssa.Value, offset uint32, ret ssa.Value, lane ssa.VecLane) { + tmpDst, tmpGp := m.c.AllocateVReg(ssa.TypeV128), m.c.AllocateVReg(ssa.TypeI64) + am := newOperandMem(m.lowerToAddressMode(ptr, offset)) + + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst)) + switch lane { + case ssa.VecLaneI8x16: + m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, am, tmpGp)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, 0, newOperandReg(tmpGp), tmpDst)) + tmpZeroVec := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asZeros(tmpZeroVec)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmpZeroVec), tmpDst)) + case ssa.VecLaneI16x8: + m.insert(m.allocateInstr().asMovzxRmR(extModeWQ, am, tmpGp)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 0, newOperandReg(tmpGp), tmpDst)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 1, newOperandReg(tmpGp), tmpDst)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst)) + case ssa.VecLaneI32x4: + m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, am, tmpGp)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, 0, newOperandReg(tmpGp), tmpDst)) + 
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst)) + case ssa.VecLaneI64x2: + m.insert(m.allocateInstr().asMov64MR(am, tmpGp)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, newOperandReg(tmpGp), tmpDst)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, newOperandReg(tmpGp), tmpDst)) + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + m.copyTo(tmpDst, m.c.VRegOf(ret)) +} + +var f64x2CvtFromIMask = [16]byte{ + 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +} + +func (m *machine) lowerVFcvtFromInt(x, ret ssa.Value, lane ssa.VecLane, signed bool) { + switch lane { + case ssa.VecLaneF32x4: + if signed { + xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, xx, m.c.VRegOf(ret))) + } else { + xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + // Copy the value to two temporary registers. + tmp := m.copyToTmp(xx.reg()) + tmp2 := m.copyToTmp(xx.reg()) + + // Clear the higher bits of each 32-bit element in tmp (shift left, then right, by 0xa). NOTE(review): this comment used to say "16 bits" but the shift amount is 0xa (10) - confirm the intended width. + m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePslld, newOperandImm32(0xa), tmp)) + m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(0xa), tmp)) + + // Subtract the lower bits (held in tmp) from tmp2: this clears the lower bits of tmp2. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePsubd, newOperandReg(tmp), tmp2)) + + // Convert the lower bits in tmp. + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp), tmp)) + + // Right shift by one and convert tmp2, meaning that tmp2 holds the halved conversion result of its higher bits. + m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(1), tmp2)) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp2), tmp2)) + + // Double the converted halved higher 16bits. 
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddps, newOperandReg(tmp2), tmp2)) + + // Get the conversion result by add tmp (holding lower 16-bit conversion) into tmp2. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddps, newOperandReg(tmp), tmp2)) + + m.copyTo(tmp2, m.c.VRegOf(ret)) + } + case ssa.VecLaneF64x2: + if signed { + xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2pd, xx, m.c.VRegOf(ret))) + } else { + maskReg := m.c.AllocateVReg(ssa.TypeV128) + maskLabel := m.getOrAllocateConstLabel(&m.constF64x2CvtFromIMaskIndex, f64x2CvtFromIMask[:]) + // maskReg = [0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00] + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), maskReg)) + + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xx := m.copyToTmp(_xx.reg()) + + // Given that we have xx = [d1, d2, d3, d4], this results in + // xx = [d1, [0x00, 0x00, 0x30, 0x43], d2, [0x00, 0x00, 0x30, 0x43]] + // = [float64(uint32(d1)) + 0x1.0p52, float64(uint32(d2)) + 0x1.0p52] + // ^See https://stackoverflow.com/questions/13269523/can-all-32-bit-ints-be-exactly-represented-as-a-double + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeUnpcklps, newOperandReg(maskReg), xx)) + + // maskReg = [float64(0x1.0p52), float64(0x1.0p52)] + maskLabel = m.getOrAllocateConstLabel(&m.constTwop52Index, twop52[:]) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), maskReg)) + + // Now, we get the result as + // xx = [float64(uint32(d1)), float64(uint32(d2))] + // because the following equality always satisfies: + // float64(0x1.0p52 + float64(uint32(x))) - float64(0x1.0p52 + float64(uint32(y))) = float64(uint32(x)) - float64(uint32(y)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeSubpd, newOperandReg(maskReg), xx)) + + m.copyTo(xx, m.c.VRegOf(ret)) + } + default: + 
panic(fmt.Sprintf("invalid lane type: %s", lane)) + } +} + +var ( + // i32sMaxOnF64x2 holds math.MaxInt32(=2147483647.0) on two f64 lanes. + i32sMaxOnF64x2 = [16]byte{ + 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0) + 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0) + } + + // i32uMaxOnF64x2 holds math.MaxUint32(=4294967295.0) on two f64 lanes. + i32uMaxOnF64x2 = [16]byte{ + 0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0) + 0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0) + } + + // twop52 holds two float64(0x1.0p52) on two f64 lanes. 0x1.0p52 is special in the sense that + // with this exponent, the mantissa represents a corresponding uint32 number, and arithmetics, + // like addition or subtraction, the resulted floating point holds exactly the same + // bit representations in 32-bit integer on its mantissa. + // + // Note: the name twop52 is common across various compiler ecosystem. + // E.g. https://github.com/llvm/llvm-project/blob/92ab024f81e5b64e258b7c3baaf213c7c26fcf40/compiler-rt/lib/builtins/floatdidf.c#L28 + // E.g. https://opensource.apple.com/source/clang/clang-425.0.24/src/projects/compiler-rt/lib/floatdidf.c.auto.html + twop52 = [16]byte{ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52) + } +) + +func (m *machine) lowerVFcvtToIntSat(x, ret ssa.Value, lane ssa.VecLane, signed bool) { + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xx := m.copyToTmp(_xx.reg()) + + switch lane { + case ssa.VecLaneF32x4: + if signed { + tmp := m.copyToTmp(xx) + + // Assuming we have xx = [v1, v2, v3, v4]. + // + // Set all bits if lane is not NaN on tmp. 
+ // tmp[i] = 0xffffffff if vi != NaN + // = 0 if vi == NaN + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmpps, uint8(cmpPredEQ_OQ), newOperandReg(tmp), tmp)) + + // Clear NaN lanes on xx, meaning that + // xx[i] = vi if vi != NaN + // 0 if vi == NaN + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(tmp), xx)) + + // tmp[i] = ^vi if vi != NaN + // = 0xffffffff if vi == NaN + // which means that tmp[i] & 0x80000000 != 0 if and only if vi is negative. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeXorps, newOperandReg(xx), tmp)) + + // xx[i] = int32(vi) if vi != NaN and xx is not overflowing. + // = 0x80000000 if vi != NaN and xx is overflowing (See https://www.felixcloutier.com/x86/cvttps2dq) + // = 0 if vi == NaN + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(xx), xx)) + + // Below, we have to convert 0x80000000 into 0x7FFFFFFF for positive overflowing lane. + // + // tmp[i] = 0x80000000 if vi is positive + // = any satisfying any&0x80000000 = 0 if vi is negative or zero. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(xx), tmp)) + + // Arithmetic right shifting tmp by 31, meaning that we have + // tmp[i] = 0xffffffff if vi is positive, 0 otherwise. + m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrad, newOperandImm32(0x1f), tmp)) + + // Flipping 0x80000000 if vi is positive, otherwise keep intact. 
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), xx)) + } else { + tmp := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asZeros(tmp)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMaxps, newOperandReg(tmp), xx)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp), tmp)) + m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(0x1), tmp)) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp), tmp)) + tmp2 := m.copyToTmp(xx) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(xx), xx)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeSubps, newOperandReg(tmp), tmp2)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmpps, uint8(cmpPredLE_OS), newOperandReg(tmp2), tmp)) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(tmp2), tmp2)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp2)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaxsd, newOperandReg(tmp), tmp2)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddd, newOperandReg(tmp2), xx)) + } + + case ssa.VecLaneF64x2: + tmp2 := m.c.AllocateVReg(ssa.TypeV128) + if signed { + tmp := m.copyToTmp(xx) + + // Set all bits for non-NaN lanes, zeros otherwise. + // I.e. tmp[i] = 0xffffffff_ffffffff if vi != NaN, 0 otherwise. + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmppd, uint8(cmpPredEQ_OQ), newOperandReg(tmp), tmp)) + + maskLabel := m.getOrAllocateConstLabel(&m.constI32sMaxOnF64x2Index, i32sMaxOnF64x2[:]) + // Load the 2147483647 into tmp2's each lane. + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), tmp2)) + + // tmp[i] = 2147483647 if vi != NaN, 0 otherwise. 
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(tmp2), tmp)) + + // MINPD returns the source register's value as-is, so we have + // xx[i] = vi if vi != NaN + // = 0 if vi == NaN + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMinpd, newOperandReg(tmp), xx)) + + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttpd2dq, newOperandReg(xx), xx)) + } else { + tmp := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asZeros(tmp)) + + // xx[i] = vi if vi != NaN && vi > 0 + // = 0 if vi == NaN || vi <= 0 + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMaxpd, newOperandReg(tmp), xx)) + + // tmp2[i] = float64(math.MaxUint32) = math.MaxUint32 + maskIndex := m.getOrAllocateConstLabel(&m.constI32uMaxOnF64x2Index, i32uMaxOnF64x2[:]) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskIndex)), tmp2)) + + // xx[i] = vi if vi != NaN && vi > 0 && vi <= math.MaxUint32 + // = 0 otherwise + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMinpd, newOperandReg(tmp2), xx)) + + // Round the floating points into integer. + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeRoundpd, 0x3, newOperandReg(xx), xx)) + + // tmp2[i] = float64(0x1.0p52) + maskIndex = m.getOrAllocateConstLabel(&m.constTwop52Index, twop52[:]) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskIndex)), tmp2)) + + // xx[i] = float64(0x1.0p52) + float64(uint32(vi)) if vi != NaN && vi > 0 && vi <= math.MaxUint32 + // = 0 otherwise + // + // This means that xx[i] holds exactly the same bit of uint32(vi) in its lower 32-bits. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddpd, newOperandReg(tmp2), xx)) + + // At this point, we have + // xx = [uint32(v0), float64(0x1.0p52), uint32(v1), float64(0x1.0p52)] + // tmp = [0, 0, 0, 0] + // as 32x4 lanes. 
Therefore, SHUFPS with 0b00_00_10_00 results in + // xx = [xx[00], xx[10], tmp[00], tmp[00]] = [xx[00], xx[10], 0, 0] + // meaning that for i = 0 and 1, we have + // xx[i] = uint32(vi) if vi != NaN && vi > 0 && vi <= math.MaxUint32 + // = 0 otherwise. + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeShufps, 0b00_00_10_00, newOperandReg(tmp), xx)) + } + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + m.copyTo(xx, m.c.VRegOf(ret)) +} + +func (m *machine) lowerNarrow(x, y, ret ssa.Value, lane ssa.VecLane, signed bool) { + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xx := m.copyToTmp(_xx.reg()) + yy := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + + var sseOp sseOpcode + switch lane { + case ssa.VecLaneI16x8: + if signed { + sseOp = sseOpcodePacksswb + } else { + sseOp = sseOpcodePackuswb + } + case ssa.VecLaneI32x4: + if signed { + sseOp = sseOpcodePackssdw + } else { + sseOp = sseOpcodePackusdw + } + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + m.insert(m.allocateInstr().asXmmRmR(sseOp, yy, xx)) + m.copyTo(xx, m.c.VRegOf(ret)) +} + +func (m *machine) lowerWideningPairwiseDotProductS(x, y, ret ssa.Value) { + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xx := m.copyToTmp(_xx.reg()) + yy := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, yy, xx)) + m.copyTo(xx, m.c.VRegOf(ret)) +} + +func (m *machine) lowerVIabs(instr *ssa.Instruction) { + x, lane := instr.ArgWithLane() + rd := m.c.VRegOf(instr.Return()) + + if lane == ssa.VecLaneI64x2 { + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + + blendReg := xmm0VReg + m.insert(m.allocateInstr().asDefineUninitializedReg(blendReg)) + + tmp := m.copyToTmp(_xx.reg()) + xx := m.copyToTmp(_xx.reg()) + + // Clear all bits on blendReg. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(blendReg), blendReg)) + // Subtract xx from blendMaskReg. 
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePsubq, newOperandReg(xx), blendReg)) + // Copy the subtracted value ^^ (held in blendReg) into xx; tmp still holds the original input. + m.copyTo(blendReg, xx) + + m.insert(m.allocateInstr().asBlendvpd(newOperandReg(tmp), xx)) + + m.copyTo(xx, rd) + } else { + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePabsb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePabsw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePabsd + } + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + + i := m.allocateInstr() + i.asXmmUnaryRmR(vecOp, rn, rd) + m.insert(i) + } +} + +func (m *machine) lowerVIpopcnt(instr *ssa.Instruction) { + x := instr.Arg() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + + tmp1 := m.c.AllocateVReg(ssa.TypeV128) + m.lowerVconst(tmp1, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f) + + // Copy input into tmp2. + tmp2 := m.copyToTmp(rn.reg()) + + // Given that we have: + // rn = [b1, ..., b16] where bn = hn:ln and hn and ln are higher and lower 4-bits of bn. + // + // Take PAND on tmp1 and tmp2, so that we mask out all the higher bits. + // tmp2 = [l1, ..., l16]. + pand := m.allocateInstr() + pand.asXmmRmR(sseOpcodePand, newOperandReg(tmp1), tmp2) + m.insert(pand) + + // Do logical (packed word) right shift by 4 on a copy of rn and PAND against the mask (tmp1); meaning that we have + // tmp3 = [h1, ...., h16]. + tmp3 := m.copyToTmp(rn.reg()) + psrlw := m.allocateInstr() + psrlw.asXmmRmiReg(sseOpcodePsrlw, newOperandImm32(4), tmp3) + m.insert(psrlw) + + pand2 := m.allocateInstr() + pand2.asXmmRmR(sseOpcodePand, newOperandReg(tmp1), tmp3) + m.insert(pand2) + + // Read the popcntTable into tmp4, and we have + // tmp4 = [0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04] + tmp4 := m.c.AllocateVReg(ssa.TypeV128) + m.lowerVconst(tmp4, 0x03_02_02_01_02_01_01_00, 0x04_03_03_02_03_02_02_01) + + // Make a copy for later. 
+ tmp5 := m.copyToTmp(tmp4) + + // tmp4 = [popcnt(l1), ..., popcnt(l16)]. + pshufb := m.allocateInstr() + pshufb.asXmmRmR(sseOpcodePshufb, newOperandReg(tmp2), tmp4) + m.insert(pshufb) + + pshufb2 := m.allocateInstr() + pshufb2.asXmmRmR(sseOpcodePshufb, newOperandReg(tmp3), tmp5) + m.insert(pshufb2) + + // tmp4 + tmp5 is the result. + paddb := m.allocateInstr() + paddb.asXmmRmR(sseOpcodePaddb, newOperandReg(tmp4), tmp5) + m.insert(paddb) + + m.copyTo(tmp5, rd) +} + +func (m *machine) lowerVImul(instr *ssa.Instruction) { + x, y, lane := instr.Arg2WithLane() + rd := m.c.VRegOf(instr.Return()) + if lane == ssa.VecLaneI64x2 { + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rm := m.getOperand_Reg(m.c.ValueDefinition(y)) + // Assuming that we have + // rm = [p1, p2] = [p1_lo, p1_hi, p2_lo, p2_high] + // rn = [q1, q2] = [q1_lo, q1_hi, q2_lo, q2_high] + // where pN and qN are 64-bit (quad word) lane, and pN_lo, pN_hi, qN_lo and qN_hi are 32-bit (double word) lane. + + // Copy rn into tmp1. + tmp1 := m.copyToTmp(rn.reg()) + + // And do the logical right shift by 32-bit on tmp1, which makes tmp1 = [0, p1_high, 0, p2_high] + shift := m.allocateInstr() + shift.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(32), tmp1) + m.insert(shift) + + // Execute "pmuludq rm,tmp1", which makes tmp1 = [p1_high*q1_lo, p2_high*q2_lo] where each lane is 64-bit. + mul := m.allocateInstr() + mul.asXmmRmR(sseOpcodePmuludq, rm, tmp1) + m.insert(mul) + + // Copy rm value into tmp2. + tmp2 := m.copyToTmp(rm.reg()) + + // And do the logical right shift by 32-bit on tmp2, which makes tmp2 = [0, q1_high, 0, q2_high] + shift2 := m.allocateInstr() + shift2.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(32), tmp2) + m.insert(shift2) + + // Execute "pmuludq rm,tmp2", which makes tmp2 = [p1_lo*q1_high, p2_lo*q2_high] where each lane is 64-bit. 
+ mul2 := m.allocateInstr() + mul2.asXmmRmR(sseOpcodePmuludq, rn, tmp2) + m.insert(mul2) + + // Adds tmp1 and tmp2 and do the logical left shift by 32-bit, + // which makes tmp1 = [(p1_lo*q1_high+p1_high*q1_lo)<<32, (p2_lo*q2_high+p2_high*q2_lo)<<32] + add := m.allocateInstr() + add.asXmmRmR(sseOpcodePaddq, newOperandReg(tmp2), tmp1) + m.insert(add) + + shift3 := m.allocateInstr() + shift3.asXmmRmiReg(sseOpcodePsllq, newOperandImm32(32), tmp1) + m.insert(shift3) + + // Copy rm value into tmp3. + tmp3 := m.copyToTmp(rm.reg()) + + // "pmuludq rm,tmp3" makes tmp3 = [p1_lo*q1_lo, p2_lo*q2_lo] where each lane is 64-bit. + mul3 := m.allocateInstr() + mul3.asXmmRmR(sseOpcodePmuludq, rn, tmp3) + m.insert(mul3) + + // Finally, we get the result by computing tmp1 + tmp3, + // which makes tmp1 = [(p1_lo*q1_high+p1_high*q1_lo)<<32+p1_lo*q1_lo, (p2_lo*q2_high+p2_high*q2_lo)<<32+p2_lo*q2_lo] + add2 := m.allocateInstr() + add2.asXmmRmR(sseOpcodePaddq, newOperandReg(tmp3), tmp1) + m.insert(add2) + + m.copyTo(tmp1, rd) + + } else { + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI16x8: + vecOp = sseOpcodePmullw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePmulld + default: + panic("unsupported: " + lane.String()) + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/operands.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/operands.go new file mode 100644 index 000000000..c6fcb8673 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/operands.go @@ -0,0 +1,346 @@ +package amd64 + +import ( + "fmt" + "unsafe" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +type operand struct { + 
kind operandKind + data uint64 +} + +type operandKind byte + +const ( + // operandKindReg is an operand which is an integer Register. + operandKindReg operandKind = iota + 1 + + // operandKindMem is a value in Memory. + // 32, 64, or 128 bit value. + operandKindMem + + // operandKindImm32 is a signed-32-bit integer immediate value. + operandKindImm32 + + // operandKindLabel is a label. + operandKindLabel +) + +// String implements fmt.Stringer. +func (o operandKind) String() string { + switch o { + case operandKindReg: + return "reg" + case operandKindMem: + return "mem" + case operandKindImm32: + return "imm32" + case operandKindLabel: + return "label" + default: + panic("BUG: invalid operand kind") + } +} + +// format returns the string representation of the operand. +// _64 is only for the case where the operand is a register, and it's integer. +func (o *operand) format(_64 bool) string { + switch o.kind { + case operandKindReg: + return formatVRegSized(o.reg(), _64) + case operandKindMem: + return o.addressMode().String() + case operandKindImm32: + return fmt.Sprintf("$%d", int32(o.imm32())) + case operandKindLabel: + return backend.Label(o.imm32()).String() + default: + panic(fmt.Sprintf("BUG: invalid operand: %s", o.kind)) + } +} + +//go:inline +func (o *operand) reg() regalloc.VReg { + return regalloc.VReg(o.data) +} + +//go:inline +func (o *operand) setReg(r regalloc.VReg) { + o.data = uint64(r) +} + +//go:inline +func (o *operand) addressMode() *amode { + return wazevoapi.PtrFromUintptr[amode](uintptr(o.data)) +} + +//go:inline +func (o *operand) imm32() uint32 { + return uint32(o.data) +} + +func (o *operand) label() backend.Label { + switch o.kind { + case operandKindLabel: + return backend.Label(o.data) + case operandKindMem: + mem := o.addressMode() + if mem.kind() != amodeRipRel { + panic("BUG: invalid label") + } + return backend.Label(mem.imm32) + default: + panic("BUG: invalid operand kind") + } +} + +func newOperandLabel(label backend.Label) 
operand { + return operand{kind: operandKindLabel, data: uint64(label)} +} + +func newOperandReg(r regalloc.VReg) operand { + return operand{kind: operandKindReg, data: uint64(r)} +} + +func newOperandImm32(imm32 uint32) operand { + return operand{kind: operandKindImm32, data: uint64(imm32)} +} + +func newOperandMem(amode *amode) operand { + return operand{kind: operandKindMem, data: uint64(uintptr(unsafe.Pointer(amode)))} +} + +// amode is a memory operand (addressing mode). +type amode struct { + kindWithShift uint32 + imm32 uint32 + base regalloc.VReg + + // For amodeRegRegShift: + index regalloc.VReg +} + +type amodeKind byte + +const ( + // amodeImmReg calculates sign-extend-32-to-64(Immediate) + base + amodeImmReg amodeKind = iota + 1 + + // amodeImmRBP is the same as amodeImmReg, but the base register is fixed to RBP. + // The only difference is that it doesn't tell the register allocator to use RBP which is distracting for the + // register allocator. + amodeImmRBP + + // amodeRegRegShift calculates sign-extend-32-to-64(Immediate) + base + (Register2 << Shift) + amodeRegRegShift + + // amodeRipRel is a RIP-relative addressing mode specified by the label. + amodeRipRel + + // TODO: there are other addressing modes such as the one without base register. 
+) + +func (a *amode) kind() amodeKind { + return amodeKind(a.kindWithShift & 0xff) +} + +func (a *amode) shift() byte { + return byte(a.kindWithShift >> 8) +} + +func (a *amode) uses(rs *[]regalloc.VReg) { + switch a.kind() { + case amodeImmReg: + *rs = append(*rs, a.base) + case amodeRegRegShift: + *rs = append(*rs, a.base, a.index) + case amodeImmRBP, amodeRipRel: + default: + panic("BUG: invalid amode kind") + } +} + +func (a *amode) nregs() int { + switch a.kind() { + case amodeImmReg: + return 1 + case amodeRegRegShift: + return 2 + case amodeImmRBP, amodeRipRel: + return 0 + default: + panic("BUG: invalid amode kind") + } +} + +func (a *amode) assignUses(i int, reg regalloc.VReg) { + switch a.kind() { + case amodeImmReg: + if i == 0 { + a.base = reg + } else { + panic("BUG: invalid amode assignment") + } + case amodeRegRegShift: + if i == 0 { + a.base = reg + } else if i == 1 { + a.index = reg + } else { + panic("BUG: invalid amode assignment") + } + default: + panic("BUG: invalid amode assignment") + } +} + +func (m *machine) newAmodeImmReg(imm32 uint32, base regalloc.VReg) *amode { + ret := m.amodePool.Allocate() + *ret = amode{kindWithShift: uint32(amodeImmReg), imm32: imm32, base: base} + return ret +} + +func (m *machine) newAmodeImmRBPReg(imm32 uint32) *amode { + ret := m.amodePool.Allocate() + *ret = amode{kindWithShift: uint32(amodeImmRBP), imm32: imm32, base: rbpVReg} + return ret +} + +func (m *machine) newAmodeRegRegShift(imm32 uint32, base, index regalloc.VReg, shift byte) *amode { + if shift > 3 { + panic(fmt.Sprintf("BUG: invalid shift (must be 3>=): %d", shift)) + } + ret := m.amodePool.Allocate() + *ret = amode{kindWithShift: uint32(amodeRegRegShift) | uint32(shift)<<8, imm32: imm32, base: base, index: index} + return ret +} + +func (m *machine) newAmodeRipRel(label backend.Label) *amode { + ret := m.amodePool.Allocate() + *ret = amode{kindWithShift: uint32(amodeRipRel), imm32: uint32(label)} + return ret +} + +// String implements 
fmt.Stringer. +func (a *amode) String() string { + switch a.kind() { + case amodeImmReg, amodeImmRBP: + if a.imm32 == 0 { + return fmt.Sprintf("(%s)", formatVRegSized(a.base, true)) + } + return fmt.Sprintf("%d(%s)", int32(a.imm32), formatVRegSized(a.base, true)) + case amodeRegRegShift: + shift := 1 << a.shift() + if a.imm32 == 0 { + return fmt.Sprintf( + "(%s,%s,%d)", + formatVRegSized(a.base, true), formatVRegSized(a.index, true), shift) + } + return fmt.Sprintf( + "%d(%s,%s,%d)", + int32(a.imm32), formatVRegSized(a.base, true), formatVRegSized(a.index, true), shift) + case amodeRipRel: + return fmt.Sprintf("%s(%%rip)", backend.Label(a.imm32)) + default: + panic("BUG: invalid amode kind") + } +} + +func (m *machine) getOperand_Mem_Reg(def *backend.SSAValueDefinition) (op operand) { + if def.IsFromBlockParam() { + return newOperandReg(def.BlkParamVReg) + } + + if def.SSAValue().Type() == ssa.TypeV128 { + // SIMD instructions require strict memory alignment, so we don't support the memory operand for V128 at the moment. + return m.getOperand_Reg(def) + } + + if m.c.MatchInstr(def, ssa.OpcodeLoad) { + instr := def.Instr + ptr, offset, _ := instr.LoadData() + op = newOperandMem(m.lowerToAddressMode(ptr, offset)) + instr.MarkLowered() + return op + } + return m.getOperand_Reg(def) +} + +func (m *machine) getOperand_Mem_Imm32_Reg(def *backend.SSAValueDefinition) (op operand) { + if def.IsFromBlockParam() { + return newOperandReg(def.BlkParamVReg) + } + + if m.c.MatchInstr(def, ssa.OpcodeLoad) { + instr := def.Instr + ptr, offset, _ := instr.LoadData() + op = newOperandMem(m.lowerToAddressMode(ptr, offset)) + instr.MarkLowered() + return op + } + return m.getOperand_Imm32_Reg(def) +} + +func (m *machine) getOperand_Imm32_Reg(def *backend.SSAValueDefinition) (op operand) { + if def.IsFromBlockParam() { + return newOperandReg(def.BlkParamVReg) + } + + instr := def.Instr + if instr.Constant() { + // If the operation is 64-bit, x64 sign-extends the 32-bit immediate value. 
+ // Therefore, we need to check if the immediate value is within the 32-bit range and if the sign bit is set, + // we should not use the immediate value. + if op, ok := asImm32Operand(instr.ConstantVal(), instr.Return().Type() == ssa.TypeI32); ok { + instr.MarkLowered() + return op + } + } + return m.getOperand_Reg(def) +} + +func asImm32Operand(val uint64, allowSignExt bool) (operand, bool) { + if imm32, ok := asImm32(val, allowSignExt); ok { + return newOperandImm32(imm32), true + } + return operand{}, false +} + +func asImm32(val uint64, allowSignExt bool) (uint32, bool) { + u32val := uint32(val) + if uint64(u32val) != val { + return 0, false + } + if !allowSignExt && u32val&0x80000000 != 0 { + return 0, false + } + return u32val, true +} + +func (m *machine) getOperand_Reg(def *backend.SSAValueDefinition) (op operand) { + var v regalloc.VReg + if def.IsFromBlockParam() { + v = def.BlkParamVReg + } else { + instr := def.Instr + if instr.Constant() { + // We inline all the constant instructions so that we could reduce the register usage. + v = m.lowerConstant(instr) + instr.MarkLowered() + } else { + if n := def.N; n == 0 { + v = m.c.VRegOf(instr.Return()) + } else { + _, rs := instr.Returns() + v = m.c.VRegOf(rs[n-1]) + } + } + } + return newOperandReg(v) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect.go new file mode 100644 index 000000000..5219837e3 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect.go @@ -0,0 +1,11 @@ +//go:build !tinygo + +package amd64 + +import "reflect" + +// setSliceLimits sets both Cap and Len for the given reflected slice. 
+func setSliceLimits(s *reflect.SliceHeader, limit uintptr) { + s.Len = int(limit) + s.Cap = int(limit) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect_tinygo.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect_tinygo.go new file mode 100644 index 000000000..df4cf46ec --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect_tinygo.go @@ -0,0 +1,11 @@ +//go:build tinygo + +package amd64 + +import "reflect" + +// setSliceLimits sets both Cap and Len for the given reflected slice. +func setSliceLimits(s *reflect.SliceHeader, limit uintptr) { + s.Len = limit + s.Cap = limit +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reg.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reg.go new file mode 100644 index 000000000..4aec856fa --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reg.go @@ -0,0 +1,181 @@ +package amd64 + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" +) + +// Amd64-specific registers. +const ( + // rax is a gp register. + rax = regalloc.RealRegInvalid + 1 + iota + // rcx is a gp register. + rcx + // rdx is a gp register. + rdx + // rbx is a gp register. + rbx + // rsp is a gp register. + rsp + // rbp is a gp register. + rbp + // rsi is a gp register. + rsi + // rdi is a gp register. + rdi + // r8 is a gp register. + r8 + // r9 is a gp register. + r9 + // r10 is a gp register. + r10 + // r11 is a gp register. + r11 + // r12 is a gp register. + r12 + // r13 is a gp register. + r13 + // r14 is a gp register. + r14 + // r15 is a gp register. + r15 + + // xmm0 is a vector register. + xmm0 + // xmm1 is a vector register. + xmm1 + // xmm2 is a vector register. + xmm2 + // xmm3 is a vector register. + xmm3 + // xmm4 is a vector register. 
+ xmm4 + // xmm5 is a vector register. + xmm5 + // xmm6 is a vector register. + xmm6 + // xmm7 is a vector register. + xmm7 + // xmm8 is a vector register. + xmm8 + // xmm9 is a vector register. + xmm9 + // xmm10 is a vector register. + xmm10 + // xmm11 is a vector register. + xmm11 + // xmm12 is a vector register. + xmm12 + // xmm13 is a vector register. + xmm13 + // xmm14 is a vector register. + xmm14 + // xmm15 is a vector register. + xmm15 +) + +var ( + raxVReg = regalloc.FromRealReg(rax, regalloc.RegTypeInt) + rcxVReg = regalloc.FromRealReg(rcx, regalloc.RegTypeInt) + rdxVReg = regalloc.FromRealReg(rdx, regalloc.RegTypeInt) + rbxVReg = regalloc.FromRealReg(rbx, regalloc.RegTypeInt) + rspVReg = regalloc.FromRealReg(rsp, regalloc.RegTypeInt) + rbpVReg = regalloc.FromRealReg(rbp, regalloc.RegTypeInt) + rsiVReg = regalloc.FromRealReg(rsi, regalloc.RegTypeInt) + rdiVReg = regalloc.FromRealReg(rdi, regalloc.RegTypeInt) + r8VReg = regalloc.FromRealReg(r8, regalloc.RegTypeInt) + r9VReg = regalloc.FromRealReg(r9, regalloc.RegTypeInt) + r10VReg = regalloc.FromRealReg(r10, regalloc.RegTypeInt) + r11VReg = regalloc.FromRealReg(r11, regalloc.RegTypeInt) + r12VReg = regalloc.FromRealReg(r12, regalloc.RegTypeInt) + r13VReg = regalloc.FromRealReg(r13, regalloc.RegTypeInt) + r14VReg = regalloc.FromRealReg(r14, regalloc.RegTypeInt) + r15VReg = regalloc.FromRealReg(r15, regalloc.RegTypeInt) + + xmm0VReg = regalloc.FromRealReg(xmm0, regalloc.RegTypeFloat) + xmm1VReg = regalloc.FromRealReg(xmm1, regalloc.RegTypeFloat) + xmm2VReg = regalloc.FromRealReg(xmm2, regalloc.RegTypeFloat) + xmm3VReg = regalloc.FromRealReg(xmm3, regalloc.RegTypeFloat) + xmm4VReg = regalloc.FromRealReg(xmm4, regalloc.RegTypeFloat) + xmm5VReg = regalloc.FromRealReg(xmm5, regalloc.RegTypeFloat) + xmm6VReg = regalloc.FromRealReg(xmm6, regalloc.RegTypeFloat) + xmm7VReg = regalloc.FromRealReg(xmm7, regalloc.RegTypeFloat) + xmm8VReg = regalloc.FromRealReg(xmm8, regalloc.RegTypeFloat) + xmm9VReg = 
regalloc.FromRealReg(xmm9, regalloc.RegTypeFloat) + xmm10VReg = regalloc.FromRealReg(xmm10, regalloc.RegTypeFloat) + xmm11VReg = regalloc.FromRealReg(xmm11, regalloc.RegTypeFloat) + xmm12VReg = regalloc.FromRealReg(xmm12, regalloc.RegTypeFloat) + xmm13VReg = regalloc.FromRealReg(xmm13, regalloc.RegTypeFloat) + xmm14VReg = regalloc.FromRealReg(xmm14, regalloc.RegTypeFloat) + xmm15VReg = regalloc.FromRealReg(xmm15, regalloc.RegTypeFloat) +) + +var regNames = [...]string{ + rax: "rax", + rcx: "rcx", + rdx: "rdx", + rbx: "rbx", + rsp: "rsp", + rbp: "rbp", + rsi: "rsi", + rdi: "rdi", + r8: "r8", + r9: "r9", + r10: "r10", + r11: "r11", + r12: "r12", + r13: "r13", + r14: "r14", + r15: "r15", + xmm0: "xmm0", + xmm1: "xmm1", + xmm2: "xmm2", + xmm3: "xmm3", + xmm4: "xmm4", + xmm5: "xmm5", + xmm6: "xmm6", + xmm7: "xmm7", + xmm8: "xmm8", + xmm9: "xmm9", + xmm10: "xmm10", + xmm11: "xmm11", + xmm12: "xmm12", + xmm13: "xmm13", + xmm14: "xmm14", + xmm15: "xmm15", +} + +func formatVRegSized(r regalloc.VReg, _64 bool) string { + if r.IsRealReg() { + if r.RegType() == regalloc.RegTypeInt { + rr := r.RealReg() + orig := regNames[rr] + if rr <= rdi { + if _64 { + return "%" + orig + } else { + return "%e" + orig[1:] + } + } else { + if _64 { + return "%" + orig + } else { + return "%" + orig + "d" + } + } + } else { + return "%" + regNames[r.RealReg()] + } + } else { + if r.RegType() == regalloc.RegTypeInt { + if _64 { + return fmt.Sprintf("%%r%d?", r.ID()) + } else { + return fmt.Sprintf("%%r%dd?", r.ID()) + } + } else { + return fmt.Sprintf("%%xmm%d?", r.ID()) + } + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/stack.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/stack.go new file mode 100644 index 000000000..05ba5f027 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/stack.go @@ -0,0 +1,128 @@ +package amd64 + +import ( + "encoding/binary" + "reflect" 
// GoCallStackView implements wazevo.goCallStackView.
//
// stackPointerBeforeGoCall points at a uint64 holding SizeInBytes; the
// arg/ret slots start 8 bytes above it:
//
//	(high address)
//	| xxxxxxxxxxx     |  ;; optional unused space to make it 16-byte aligned.
//	| arg[N]/ret[M]   |
//	| ............    |  SizeInBytes/8 elements
//	| arg[1]/ret[1]   |
//	| arg[0]/ret[0]   |
//	| SizeInBytes     |  <---- stackPointerBeforeGoCall
//	(low address)
//
// The returned slice aliases that memory (no copy) and has SizeInBytes/8
// elements.
func GoCallStackView(stackPointerBeforeGoCall *uint64) []uint64 {
	elems := int(*stackPointerBeforeGoCall / 8)
	// Skip the 8-byte SizeInBytes word to reach arg[0]/ret[0].
	first := (*uint64)(unsafe.Add(unsafe.Pointer(stackPointerBeforeGoCall), 8))
	return unsafe.Slice(first, elems)
}
+ break + } + if i64 := int64(callerRBP); i64 < int64(oldRsp) || i64 >= int64(oldTop) { + panic("BUG: callerRBP is out of range") + } + if int(callerRBP) < 0 { + panic("BUG: callerRBP is negative") + } + adjustedCallerRBP := callerRBP + diff + if int(adjustedCallerRBP) < 0 { + panic("BUG: adjustedCallerRBP is negative") + } + binary.LittleEndian.PutUint64(newBuf[i:], adjustedCallerRBP) + i = adjustedCallerRBP - uint64(rbp) + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go new file mode 100644 index 000000000..6615471c6 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go @@ -0,0 +1,332 @@ +package arm64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// References: +// * https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/cmd/compile/abi-internal.md#arm64-architecture +// * https://developer.arm.com/documentation/102374/0101/Procedure-Call-Standard + +var ( + intParamResultRegs = []regalloc.RealReg{x0, x1, x2, x3, x4, x5, x6, x7} + floatParamResultRegs = []regalloc.RealReg{v0, v1, v2, v3, v4, v5, v6, v7} +) + +var regInfo = ®alloc.RegisterInfo{ + AllocatableRegisters: [regalloc.NumRegType][]regalloc.RealReg{ + // We don't allocate: + // - x18: Reserved by the macOS: https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms#Respect-the-purpose-of-specific-CPU-registers + // - x28: Reserved by Go runtime. + // - x27(=tmpReg): because of the reason described on tmpReg. + regalloc.RegTypeInt: { + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x19, x20, x21, x22, x23, x24, x25, + x26, x29, x30, + // These are the argument/return registers. 
Less preferred in the allocation. + x7, x6, x5, x4, x3, x2, x1, x0, + }, + regalloc.RegTypeFloat: { + v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, + // These are the argument/return registers. Less preferred in the allocation. + v7, v6, v5, v4, v3, v2, v1, v0, + }, + }, + CalleeSavedRegisters: regalloc.NewRegSet( + x19, x20, x21, x22, x23, x24, x25, x26, x28, + v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, + ), + CallerSavedRegisters: regalloc.NewRegSet( + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x29, x30, + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, + ), + RealRegToVReg: []regalloc.VReg{ + x0: x0VReg, x1: x1VReg, x2: x2VReg, x3: x3VReg, x4: x4VReg, x5: x5VReg, x6: x6VReg, x7: x7VReg, x8: x8VReg, x9: x9VReg, x10: x10VReg, x11: x11VReg, x12: x12VReg, x13: x13VReg, x14: x14VReg, x15: x15VReg, x16: x16VReg, x17: x17VReg, x18: x18VReg, x19: x19VReg, x20: x20VReg, x21: x21VReg, x22: x22VReg, x23: x23VReg, x24: x24VReg, x25: x25VReg, x26: x26VReg, x27: x27VReg, x28: x28VReg, x29: x29VReg, x30: x30VReg, + v0: v0VReg, v1: v1VReg, v2: v2VReg, v3: v3VReg, v4: v4VReg, v5: v5VReg, v6: v6VReg, v7: v7VReg, v8: v8VReg, v9: v9VReg, v10: v10VReg, v11: v11VReg, v12: v12VReg, v13: v13VReg, v14: v14VReg, v15: v15VReg, v16: v16VReg, v17: v17VReg, v18: v18VReg, v19: v19VReg, v20: v20VReg, v21: v21VReg, v22: v22VReg, v23: v23VReg, v24: v24VReg, v25: v25VReg, v26: v26VReg, v27: v27VReg, v28: v28VReg, v29: v29VReg, v30: v30VReg, v31: v31VReg, + }, + RealRegName: func(r regalloc.RealReg) string { return regNames[r] }, + RealRegType: func(r regalloc.RealReg) regalloc.RegType { + if r < v0 { + return regalloc.RegTypeInt + } + return regalloc.RegTypeFloat + }, +} + +// ArgsResultsRegs implements backend.Machine. 
+func (m *machine) ArgsResultsRegs() (argResultInts, argResultFloats []regalloc.RealReg) { + return intParamResultRegs, floatParamResultRegs +} + +// LowerParams implements backend.FunctionABI. +func (m *machine) LowerParams(args []ssa.Value) { + a := m.currentABI + + for i, ssaArg := range args { + if !ssaArg.Valid() { + continue + } + reg := m.compiler.VRegOf(ssaArg) + arg := &a.Args[i] + if arg.Kind == backend.ABIArgKindReg { + m.InsertMove(reg, arg.Reg, arg.Type) + } else { + // TODO: we could use pair load if there's consecutive loads for the same type. + // + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | <-| + // | ReturnAddress | | + // +-----------------+ | + // | ........... | | + // | clobbered M | | argStackOffset: is unknown at this point of compilation. + // | ............ | | + // | clobbered 0 | | + // | spill slot N | | + // | ........... | | + // | spill slot 0 | | + // SP---> +-----------------+ <-+ + // (low address) + + bits := arg.Type.Bits() + // At this point of compilation, we don't yet know how much space exist below the return address. + // So we instruct the address mode to add the `argStackOffset` to the offset at the later phase of compilation. + amode := addressMode{imm: arg.Offset, rn: spVReg, kind: addressModeKindArgStackSpace} + load := m.allocateInstr() + switch arg.Type { + case ssa.TypeI32, ssa.TypeI64: + load.asULoad(operandNR(reg), amode, bits) + case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: + load.asFpuLoad(operandNR(reg), amode, bits) + default: + panic("BUG") + } + m.insert(load) + m.unresolvedAddressModes = append(m.unresolvedAddressModes, load) + } + } +} + +// LowerReturns lowers the given returns. 
+func (m *machine) LowerReturns(rets []ssa.Value) { + a := m.currentABI + + l := len(rets) - 1 + for i := range rets { + // Reverse order in order to avoid overwriting the stack returns existing in the return registers. + ret := rets[l-i] + r := &a.Rets[l-i] + reg := m.compiler.VRegOf(ret) + if def := m.compiler.ValueDefinition(ret); def.IsFromInstr() { + // Constant instructions are inlined. + if inst := def.Instr; inst.Constant() { + val := inst.Return() + valType := val.Type() + v := inst.ConstantVal() + m.insertLoadConstant(v, valType, reg) + } + } + if r.Kind == backend.ABIArgKindReg { + m.InsertMove(r.Reg, reg, ret.Type()) + } else { + // TODO: we could use pair store if there's consecutive stores for the same type. + // + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | <-+ + // | arg X | | + // | ....... | | + // | arg 1 | | + // | arg 0 | | + // | ReturnAddress | | + // +-----------------+ | + // | ........... | | + // | spill slot M | | retStackOffset: is unknown at this point of compilation. + // | ............ | | + // | spill slot 2 | | + // | spill slot 1 | | + // | clobbered 0 | | + // | clobbered 1 | | + // | ........... | | + // | clobbered N | | + // SP---> +-----------------+ <-+ + // (low address) + + bits := r.Type.Bits() + + // At this point of compilation, we don't yet know how much space exist below the return address. + // So we instruct the address mode to add the `retStackOffset` to the offset at the later phase of compilation. + amode := addressMode{imm: r.Offset, rn: spVReg, kind: addressModeKindResultStackSpace} + store := m.allocateInstr() + store.asStore(operandNR(reg), amode, bits) + m.insert(store) + m.unresolvedAddressModes = append(m.unresolvedAddressModes, store) + } + } +} + +// callerGenVRegToFunctionArg is the opposite of GenFunctionArgToVReg, which is used to generate the +// caller side of the function call. 
+func (m *machine) callerGenVRegToFunctionArg(a *backend.FunctionABI, argIndex int, reg regalloc.VReg, def *backend.SSAValueDefinition, slotBegin int64) { + arg := &a.Args[argIndex] + if def != nil && def.IsFromInstr() { + // Constant instructions are inlined. + if inst := def.Instr; inst.Constant() { + val := inst.Return() + valType := val.Type() + v := inst.ConstantVal() + m.insertLoadConstant(v, valType, reg) + } + } + if arg.Kind == backend.ABIArgKindReg { + m.InsertMove(arg.Reg, reg, arg.Type) + } else { + // TODO: we could use pair store if there's consecutive stores for the same type. + // + // Note that at this point, stack pointer is already adjusted. + bits := arg.Type.Bits() + amode := m.resolveAddressModeForOffset(arg.Offset-slotBegin, bits, spVReg, false) + store := m.allocateInstr() + store.asStore(operandNR(reg), amode, bits) + m.insert(store) + } +} + +func (m *machine) callerGenFunctionReturnVReg(a *backend.FunctionABI, retIndex int, reg regalloc.VReg, slotBegin int64) { + r := &a.Rets[retIndex] + if r.Kind == backend.ABIArgKindReg { + m.InsertMove(reg, r.Reg, r.Type) + } else { + // TODO: we could use pair load if there's consecutive loads for the same type. 
+ amode := m.resolveAddressModeForOffset(a.ArgStackSize+r.Offset-slotBegin, r.Type.Bits(), spVReg, false) + ldr := m.allocateInstr() + switch r.Type { + case ssa.TypeI32, ssa.TypeI64: + ldr.asULoad(operandNR(reg), amode, r.Type.Bits()) + case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: + ldr.asFpuLoad(operandNR(reg), amode, r.Type.Bits()) + default: + panic("BUG") + } + m.insert(ldr) + } +} + +func (m *machine) resolveAddressModeForOffsetAndInsert(cur *instruction, offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) (*instruction, addressMode) { + exct := m.executableContext + exct.PendingInstructions = exct.PendingInstructions[:0] + mode := m.resolveAddressModeForOffset(offset, dstBits, rn, allowTmpRegUse) + for _, instr := range exct.PendingInstructions { + cur = linkInstr(cur, instr) + } + return cur, mode +} + +func (m *machine) resolveAddressModeForOffset(offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) addressMode { + if rn.RegType() != regalloc.RegTypeInt { + panic("BUG: rn should be a pointer: " + formatVRegSized(rn, 64)) + } + var amode addressMode + if offsetFitsInAddressModeKindRegUnsignedImm12(dstBits, offset) { + amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: rn, imm: offset} + } else if offsetFitsInAddressModeKindRegSignedImm9(offset) { + amode = addressMode{kind: addressModeKindRegSignedImm9, rn: rn, imm: offset} + } else { + var indexReg regalloc.VReg + if allowTmpRegUse { + m.lowerConstantI64(tmpRegVReg, offset) + indexReg = tmpRegVReg + } else { + indexReg = m.compiler.AllocateVReg(ssa.TypeI64) + m.lowerConstantI64(indexReg, offset) + } + amode = addressMode{kind: addressModeKindRegReg, rn: rn, rm: indexReg, extOp: extendOpUXTX /* indicates index rm is 64-bit */} + } + return amode +} + +func (m *machine) lowerCall(si *ssa.Instruction) { + isDirectCall := si.Opcode() == ssa.OpcodeCall + var indirectCalleePtr ssa.Value + var directCallee ssa.FuncRef + var sigID ssa.SignatureID + var args []ssa.Value + if 
isDirectCall { + directCallee, sigID, args = si.CallData() + } else { + indirectCalleePtr, sigID, args, _ /* on arm64, the calling convention is compatible with the Go runtime */ = si.CallIndirectData() + } + calleeABI := m.compiler.GetFunctionABI(m.compiler.SSABuilder().ResolveSignature(sigID)) + + stackSlotSize := int64(calleeABI.AlignedArgResultStackSlotSize()) + if m.maxRequiredStackSizeForCalls < stackSlotSize+16 { + m.maxRequiredStackSizeForCalls = stackSlotSize + 16 // return address frame. + } + + for i, arg := range args { + reg := m.compiler.VRegOf(arg) + def := m.compiler.ValueDefinition(arg) + m.callerGenVRegToFunctionArg(calleeABI, i, reg, def, stackSlotSize) + } + + if isDirectCall { + call := m.allocateInstr() + call.asCall(directCallee, calleeABI) + m.insert(call) + } else { + ptr := m.compiler.VRegOf(indirectCalleePtr) + callInd := m.allocateInstr() + callInd.asCallIndirect(ptr, calleeABI) + m.insert(callInd) + } + + var index int + r1, rs := si.Returns() + if r1.Valid() { + m.callerGenFunctionReturnVReg(calleeABI, 0, m.compiler.VRegOf(r1), stackSlotSize) + index++ + } + + for _, r := range rs { + m.callerGenFunctionReturnVReg(calleeABI, index, m.compiler.VRegOf(r), stackSlotSize) + index++ + } +} + +func (m *machine) insertAddOrSubStackPointer(rd regalloc.VReg, diff int64, add bool) { + if imm12Operand, ok := asImm12Operand(uint64(diff)); ok { + alu := m.allocateInstr() + var ao aluOp + if add { + ao = aluOpAdd + } else { + ao = aluOpSub + } + alu.asALU(ao, operandNR(rd), operandNR(spVReg), imm12Operand, true) + m.insert(alu) + } else { + m.lowerConstantI64(tmpRegVReg, diff) + alu := m.allocateInstr() + var ao aluOp + if add { + ao = aluOpAdd + } else { + ao = aluOpSub + } + alu.asALU(ao, operandNR(rd), operandNR(spVReg), operandNR(tmpRegVReg), true) + m.insert(alu) + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.go 
b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.go new file mode 100644 index 000000000..5f0c613df --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.go @@ -0,0 +1,9 @@ +package arm64 + +// entrypoint enters the machine code generated by this backend which begins with the preamble generated by functionABI.EmitGoEntryPreamble below. +// This implements wazevo.entrypoint, and see the comments there for detail. +func entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultPtr *uint64, goAllocatedStackSlicePtr uintptr) + +// afterGoFunctionCallEntrypoint enters the machine code after growing the stack. +// This implements wazevo.afterGoFunctionCallEntrypoint, and see the comments there for detail. +func afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.s b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.s new file mode 100644 index 000000000..0b579f852 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.s @@ -0,0 +1,29 @@ +//go:build arm64 + +#include "funcdata.h" +#include "textflag.h" + +// See the comments on EmitGoEntryPreamble for what this function is supposed to do. 
// entrypoint loads its Go arguments into the registers that the preamble
// produced by CompileEntryPreamble expects (x0, x1, x19, x24, x26) and jumps
// to the preamble.
//
// The names used with (FP) below match the parameter names of the Go
// declaration of entrypoint, as `go vet` (asmdecl) requires.
TEXT ·entrypoint(SB), NOSPLIT|NOFRAME, $0-48
	MOVD preambleExecutable+0(FP), R27
	MOVD functionExecutable+8(FP), R24
	MOVD executionContextPtr+16(FP), R0
	MOVD moduleContextPtr+24(FP), R1
	MOVD paramResultPtr+32(FP), R19
	MOVD goAllocatedStackSlicePtr+40(FP), R26
	JMP  (R27)

// afterGoFunctionCallEntrypoint saves the current FP, SP and LR into the
// execution context, switches SP to the given stack pointer and jumps to
// `executable`. The framePointer argument is not read here.
TEXT ·afterGoFunctionCallEntrypoint(SB), NOSPLIT|NOFRAME, $0-32
	MOVD executable+0(FP), R20
	MOVD executionContextPtr+8(FP), R0
	MOVD stackPointer+16(FP), R19

	// Save the current FP(R29), SP and LR(R30) into the wazevo.executionContext (stored in R0).
	MOVD R29, 16(R0) // Store FP(R29) into [R0, #ExecutionContextOffsets.OriginalFramePointer]
	MOVD RSP, R27    // Move SP to R27 (temporary register) since SP cannot be stored directly in str instructions.
	MOVD R27, 24(R0) // Store R27 into [R0, #ExecutionContextOffsets.OriginalStackPointer]
	MOVD R30, 32(R0) // Store R30 into [R0, #ExecutionContextOffsets.GoReturnAddress]

	// Load the new stack pointer (which sits somewhere in Go-allocated stack) into SP.
	MOVD R19, RSP
	JMP  (R20)
param/result slice ptr in x19; the pointer to []uint64{} which is used to pass arguments and accept return values. +// 3. Go-allocated stack slice ptr in x26. +// 4. Function executable in x24. +// +// also SP and FP are correct Go-runtime-based values, and LR is the return address to the Go-side caller. +func (m *machine) CompileEntryPreamble(signature *ssa.Signature) []byte { + root := m.constructEntryPreamble(signature) + m.encode(root) + return m.compiler.Buf() +} + +var ( + executionContextPtrReg = x0VReg + // callee-saved regs so that they can be used in the prologue and epilogue. + paramResultSlicePtr = x19VReg + savedExecutionContextPtr = x20VReg + // goAllocatedStackPtr is not used in the epilogue. + goAllocatedStackPtr = x26VReg + // paramResultSliceCopied is not used in the epilogue. + paramResultSliceCopied = x25VReg + // tmpRegVReg is not used in the epilogue. + functionExecutable = x24VReg +) + +func (m *machine) goEntryPreamblePassArg(cur *instruction, paramSlicePtr regalloc.VReg, arg *backend.ABIArg, argStartOffsetFromSP int64) *instruction { + typ := arg.Type + bits := typ.Bits() + isStackArg := arg.Kind == backend.ABIArgKindStack + + var loadTargetReg operand + if !isStackArg { + loadTargetReg = operandNR(arg.Reg) + } else { + switch typ { + case ssa.TypeI32, ssa.TypeI64: + loadTargetReg = operandNR(x15VReg) + case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: + loadTargetReg = operandNR(v15VReg) + default: + panic("TODO?") + } + } + + var postIndexImm int64 + if typ == ssa.TypeV128 { + postIndexImm = 16 // v128 is represented as 2x64-bit in Go slice. 
+ } else { + postIndexImm = 8 + } + loadMode := addressMode{kind: addressModeKindPostIndex, rn: paramSlicePtr, imm: postIndexImm} + + instr := m.allocateInstr() + switch typ { + case ssa.TypeI32: + instr.asULoad(loadTargetReg, loadMode, 32) + case ssa.TypeI64: + instr.asULoad(loadTargetReg, loadMode, 64) + case ssa.TypeF32: + instr.asFpuLoad(loadTargetReg, loadMode, 32) + case ssa.TypeF64: + instr.asFpuLoad(loadTargetReg, loadMode, 64) + case ssa.TypeV128: + instr.asFpuLoad(loadTargetReg, loadMode, 128) + } + cur = linkInstr(cur, instr) + + if isStackArg { + var storeMode addressMode + cur, storeMode = m.resolveAddressModeForOffsetAndInsert(cur, argStartOffsetFromSP+arg.Offset, bits, spVReg, true) + toStack := m.allocateInstr() + toStack.asStore(loadTargetReg, storeMode, bits) + cur = linkInstr(cur, toStack) + } + return cur +} + +func (m *machine) goEntryPreamblePassResult(cur *instruction, resultSlicePtr regalloc.VReg, result *backend.ABIArg, resultStartOffsetFromSP int64) *instruction { + isStackArg := result.Kind == backend.ABIArgKindStack + typ := result.Type + bits := typ.Bits() + + var storeTargetReg operand + if !isStackArg { + storeTargetReg = operandNR(result.Reg) + } else { + switch typ { + case ssa.TypeI32, ssa.TypeI64: + storeTargetReg = operandNR(x15VReg) + case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: + storeTargetReg = operandNR(v15VReg) + default: + panic("TODO?") + } + } + + var postIndexImm int64 + if typ == ssa.TypeV128 { + postIndexImm = 16 // v128 is represented as 2x64-bit in Go slice. 
// constructEntryPreamble builds the instruction list of the Go-to-Wasm entry
// preamble for the given signature and returns its root (a nop).
//
// The generated code saves FP/SP/LR into the execution context, switches SP to
// the Go-allocated stack, passes the arguments from the param/result slice,
// calls the function, writes the results back into the slice, then restores
// FP/SP/LR and returns to Go.
func (m *machine) constructEntryPreamble(sig *ssa.Signature) (root *instruction) {
	abi := backend.FunctionABI{}
	abi.Init(sig, intParamResultRegs, floatParamResultRegs)

	root = m.allocateNop()

	//// ----------------------------------- prologue ----------------------------------- ////

	// First, we save executionContextPtrReg into a callee-saved register so that it can be used in epilogue as well.
	// 		mov savedExecutionContextPtr, x0
	cur := m.move64(savedExecutionContextPtr, executionContextPtrReg, root)

	// Next, save the current FP, SP and LR into the wazevo.executionContext:
	// 		str fp, [savedExecutionContextPtr, #OriginalFramePointer]
	//      mov tmp, sp ;; sp cannot be str'ed directly.
	// 		str tmp, [savedExecutionContextPtr, #OriginalStackPointer]
	// 		str lr, [savedExecutionContextPtr, #GoReturnAddress]
	cur = m.loadOrStoreAtExecutionContext(fpVReg, wazevoapi.ExecutionContextOffsetOriginalFramePointer, true, cur)
	cur = m.move64(tmpRegVReg, spVReg, cur)
	cur = m.loadOrStoreAtExecutionContext(tmpRegVReg, wazevoapi.ExecutionContextOffsetOriginalStackPointer, true, cur)
	cur = m.loadOrStoreAtExecutionContext(lrVReg, wazevoapi.ExecutionContextOffsetGoReturnAddress, true, cur)

	// Then, move the Go-allocated stack pointer to SP:
	// 		mov sp, goAllocatedStackPtr
	cur = m.move64(spVReg, goAllocatedStackPtr, cur)

	prReg := paramResultSlicePtr
	if len(abi.Args) > 2 && len(abi.Rets) > 0 {
		// paramResultSlicePtr is modified during the execution of goEntryPreamblePassArg,
		// so copy it to another reg. The copy is only needed when there are both
		// arguments to read (beyond the two context pointers) and results to write.
		cur = m.move64(paramResultSliceCopied, paramResultSlicePtr, cur)
		prReg = paramResultSliceCopied
	}

	stackSlotSize := int64(abi.AlignedArgResultStackSlotSize())
	for i := range abi.Args {
		if i < 2 {
			// execution context ptr and module context ptr are passed in x0 and x1 by the Go assembly function.
			continue
		}
		arg := &abi.Args[i]
		cur = m.goEntryPreamblePassArg(cur, prReg, arg, -stackSlotSize)
	}

	// Call the real function.
	bl := m.allocateInstr()
	bl.asCallIndirect(functionExecutable, &abi)
	cur = linkInstr(cur, bl)

	///// ----------------------------------- epilogue ----------------------------------- /////

	// Store the results (register- or stack-passed) into the slice pointed to by paramResultSlicePtr.
	for i := range abi.Rets {
		cur = m.goEntryPreamblePassResult(cur, paramResultSlicePtr, &abi.Rets[i], abi.ArgStackSize-stackSlotSize)
	}

	// Finally, restore the FP, SP and LR, and return to the Go code.
	// 		ldr fp, [savedExecutionContextPtr, #OriginalFramePointer]
	// 		ldr tmp, [savedExecutionContextPtr, #OriginalStackPointer]
	//      mov sp, tmp ;; sp is restored via tmp, mirroring the prologue.
	// 		ldr lr, [savedExecutionContextPtr, #GoReturnAddress]
	// 		ret ;; --> return to the Go code
	cur = m.loadOrStoreAtExecutionContext(fpVReg, wazevoapi.ExecutionContextOffsetOriginalFramePointer, false, cur)
	cur = m.loadOrStoreAtExecutionContext(tmpRegVReg, wazevoapi.ExecutionContextOffsetOriginalStackPointer, false, cur)
	cur = m.move64(spVReg, tmpRegVReg, cur)
	cur = m.loadOrStoreAtExecutionContext(lrVReg, wazevoapi.ExecutionContextOffsetGoReturnAddress, false, cur)
	retInst := m.allocateInstr()
	retInst.asRet()
	linkInstr(cur, retInst)
	return
}

// move64 allocates a 64-bit register move `dst <- src`, links it after prev and
// returns the new instruction.
func (m *machine) move64(dst, src regalloc.VReg, prev *instruction) *instruction {
	instr := m.allocateInstr()
	instr.asMove64(dst, src)
	return linkInstr(prev, instr)
}

// loadOrStoreAtExecutionContext allocates a 64-bit store of d (store == true)
// or load into d (store == false) at [savedExecutionContextPtr + offset], links
// it after prev and returns the new instruction.
func (m *machine) loadOrStoreAtExecutionContext(d regalloc.VReg, offset wazevoapi.Offset, store bool, prev *instruction) *instruction {
	instr := m.allocateInstr()
	mode := addressMode{kind: addressModeKindRegUnsignedImm12, rn: savedExecutionContextPtr, imm: offset.I64()}
	if store {
		instr.asStore(operandNR(d), mode, 64)
	} else {
		instr.asULoad(operandNR(d), mode, 64)
	}
	return linkInstr(prev, instr)
}

// linkInstr links next directly after prev in the doubly-linked instruction
// list and returns next, so calls can be chained.
func linkInstr(prev, next *instruction) *instruction {
	prev.next = next
	next.prev = prev
	return next
}
[]regalloc.VReg{
	x19VReg, x20VReg, x21VReg, x22VReg, x23VReg, x24VReg, x25VReg, x26VReg, x28VReg,
	v18VReg, v19VReg, v20VReg, v21VReg, v22VReg, v23VReg, v24VReg, v25VReg, v26VReg, v27VReg, v28VReg, v29VReg, v30VReg, v31VReg,
}

// CompileGoFunctionTrampoline implements backend.Machine.
//
// It builds, encodes and returns the machine code of a trampoline for a Go function with the
// signature sig. The generated code spills the Wasm-level arguments into a stack region laid out
// as a []uint64, records exitCode in the execution context and exits the execution. The code that
// follows the exit sequence restores the callee saved registers, loads the results back from the
// same region into their ABI locations and returns to the caller.
//
// When needModuleContextPtr is true, the second argument is treated as the module context pointer:
// it is stored into the execution context and excluded from the arguments copied to the stack.
func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte {
	exct := m.executableContext
	argBegin := 1 // Skips exec context by default.
	if needModuleContextPtr {
		argBegin++
	}

	abi := &backend.FunctionABI{}
	abi.Init(sig, intParamResultRegs, floatParamResultRegs)
	m.currentABI = abi

	// The instruction list starts with a nop that serves as the root; everything else is linked after it.
	cur := m.allocateInstr()
	cur.asNop0()
	exct.RootInstr = cur

	// Execution context is always the first argument.
	execCtrPtr := x0VReg

	// In the following, we create the following stack layout:
	//
	//                   (high address)
	//     SP ------> +-----------------+  <----+
	//                |     .......     |       |
	//                |      ret Y      |       |
	//                |     .......     |       |
	//                |      ret 0      |       |
	//                |      arg X      |       |  size_of_arg_ret
	//                |     .......     |       |
	//                |      arg 1      |       |
	//                |      arg 0      |  <----+ <-------- originalArg0Reg
	//                | size_of_arg_ret |
	//                |  ReturnAddress  |
	//                +-----------------+ <----+
	//                |      xxxx       |      |  ;; might be padded to make it 16-byte aligned.
	//           +--->|  arg[N]/ret[M]  |      |
	//  sliceSize|    |   ............  |      | goCallStackSize
	//           |    |  arg[1]/ret[1]  |      |
	//           +--->|  arg[0]/ret[0]  | <----+ <-------- arg0ret0AddrReg
	//                |    sliceSize    |
	//                |   frame_size    |
	//                +-----------------+
	//                   (low address)
	//
	// where the region of "arg[0]/ret[0] ... arg[N]/ret[M]" is the stack used by the Go functions,
	// therefore will be accessed as the usual []uint64. So that's where we need to pass/receive
	// the arguments/return values.

	// First of all, to update the SP, and create "ReturnAddress + size_of_arg_ret".
	cur = m.createReturnAddrAndSizeOfArgRetSlot(cur)

	const frameInfoSize = 16 // == frame_size + sliceSize.

	// Next, we should allocate the stack for the Go function call if necessary.
	goCallStackSize, sliceSizeInBytes := backend.GoFunctionCallRequiredStackSize(sig, argBegin)
	cur = m.insertStackBoundsCheck(goCallStackSize+frameInfoSize, cur)

	originalArg0Reg := x17VReg // Caller save, so we can use it for whatever we want.
	if m.currentABI.AlignedArgResultStackSlotSize() > 0 {
		// At this point, SP points to `ReturnAddress`, so add 16 to get the original arg 0 slot.
		cur = m.addsAddOrSubStackPointer(cur, originalArg0Reg, frameInfoSize, true)
	}

	// Save the callee saved registers.
	cur = m.saveRegistersInExecutionContext(cur, calleeSavedRegistersSorted)

	if needModuleContextPtr {
		offset := wazevoapi.ExecutionContextOffsetGoFunctionCallCalleeModuleContextOpaque.I64()
		if !offsetFitsInAddressModeKindRegUnsignedImm12(64, offset) {
			panic("BUG: too large or un-aligned offset for goFunctionCallCalleeModuleContextOpaque in execution context")
		}

		// Module context is always the second argument.
		moduleCtrPtr := x1VReg
		store := m.allocateInstr()
		amode := addressMode{kind: addressModeKindRegUnsignedImm12, rn: execCtrPtr, imm: offset}
		store.asStore(operandNR(moduleCtrPtr), amode, 64)
		cur = linkInstr(cur, store)
	}

	// Advances the stack pointer.
	cur = m.addsAddOrSubStackPointer(cur, spVReg, goCallStackSize, false)

	// Copy the pointer to x15VReg.
	arg0ret0AddrReg := x15VReg // Caller save, so we can use it for whatever we want.
	copySp := m.allocateInstr()
	copySp.asMove64(arg0ret0AddrReg, spVReg)
	cur = linkInstr(cur, copySp)

	// Next, we need to store all the arguments to the stack in the typical Wasm stack style.
	for i := range abi.Args[argBegin:] {
		arg := &abi.Args[argBegin+i]
		store := m.allocateInstr()
		var v regalloc.VReg
		if arg.Kind == backend.ABIArgKindReg {
			v = arg.Reg
		} else {
			// Stack-passed arguments are first loaded into a scratch register.
			cur, v = m.goFunctionCallLoadStackArg(cur, originalArg0Reg, arg,
				// Caller save, so we can use it for whatever we want.
				x11VReg, v11VReg)
		}

		// Every argument occupies one 8-byte slot, except v128 which occupies 16 bytes.
		var sizeInBits byte
		if arg.Type == ssa.TypeV128 {
			sizeInBits = 128
		} else {
			sizeInBits = 64
		}
		// Post-index store: arg0ret0AddrReg is advanced by the slot size after each store.
		store.asStore(operandNR(v),
			addressMode{
				kind: addressModeKindPostIndex,
				rn:   arg0ret0AddrReg, imm: int64(sizeInBits / 8),
			}, sizeInBits)
		cur = linkInstr(cur, store)
	}

	// Finally, now that we've advanced SP to arg[0]/ret[0], we allocate `frame_size + sliceSize`.
	var frameSizeReg, sliceSizeReg regalloc.VReg
	if goCallStackSize > 0 {
		cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, goCallStackSize)
		frameSizeReg = tmpRegVReg
		cur = m.lowerConstantI64AndInsert(cur, x16VReg, sliceSizeInBytes/8)
		sliceSizeReg = x16VReg
	} else {
		// Nothing to pass or receive: both values are zero, so the zero register is stored.
		frameSizeReg = xzrVReg
		sliceSizeReg = xzrVReg
	}
	_amode := addressModePreOrPostIndex(spVReg, -16, true)
	storeP := m.allocateInstr()
	storeP.asStorePair64(frameSizeReg, sliceSizeReg, _amode)
	cur = linkInstr(cur, storeP)

	// Set the exit status on the execution context.
	cur = m.setExitCode(cur, x0VReg, exitCode)

	// Save the current stack pointer.
	cur = m.saveCurrentStackPointer(cur, x0VReg)

	// Exit the execution.
	cur = m.storeReturnAddressAndExit(cur)

	// After the call, we need to restore the callee saved registers.
	cur = m.restoreRegistersInExecutionContext(cur, calleeSavedRegistersSorted)

	// Get the pointer to the arg[0]/ret[0]: We need to skip `frame_size + sliceSize`.
	if len(abi.Rets) > 0 {
		cur = m.addsAddOrSubStackPointer(cur, arg0ret0AddrReg, frameInfoSize, true)
	}

	// Advances the SP so that it points to `ReturnAddress`.
	cur = m.addsAddOrSubStackPointer(cur, spVReg, frameInfoSize+goCallStackSize, true)
	ldr := m.allocateInstr()
	// And load the return address.
	ldr.asULoad(operandNR(lrVReg),
		addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64)
	cur = linkInstr(cur, ldr)

	originalRet0Reg := x17VReg // Caller save, so we can use it for whatever we want.
	if m.currentABI.RetStackSize > 0 {
		cur = m.addsAddOrSubStackPointer(cur, originalRet0Reg, m.currentABI.ArgStackSize, true)
	}

	// Make the SP point to the original address (above the result slot).
	if s := int64(m.currentABI.AlignedArgResultStackSlotSize()); s > 0 {
		cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true)
	}

	// Move each result from the Go-call stack region to its ABI location (register or stack slot).
	for i := range abi.Rets {
		r := &abi.Rets[i]
		if r.Kind == backend.ABIArgKindReg {
			loadIntoReg := m.allocateInstr()
			mode := addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg}
			switch r.Type {
			case ssa.TypeI32:
				mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
				loadIntoReg.asULoad(operandNR(r.Reg), mode, 32)
			case ssa.TypeI64:
				mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
				loadIntoReg.asULoad(operandNR(r.Reg), mode, 64)
			case ssa.TypeF32:
				mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
				loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 32)
			case ssa.TypeF64:
				mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
				loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 64)
			case ssa.TypeV128:
				mode.imm = 16
				loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 128)
			default:
				panic("TODO")
			}
			cur = linkInstr(cur, loadIntoReg)
		} else {
			// First we need to load the value to a temporary just like ^^.
			intTmp, floatTmp := x11VReg, v11VReg
			loadIntoTmpReg := m.allocateInstr()
			mode := addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg}
			var resultReg regalloc.VReg
			switch r.Type {
			case ssa.TypeI32:
				mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
				loadIntoTmpReg.asULoad(operandNR(intTmp), mode, 32)
				resultReg = intTmp
			case ssa.TypeI64:
				mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
				loadIntoTmpReg.asULoad(operandNR(intTmp), mode, 64)
				resultReg = intTmp
			case ssa.TypeF32:
				mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
				loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 32)
				resultReg = floatTmp
			case ssa.TypeF64:
				mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
				loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 64)
				resultReg = floatTmp
			case ssa.TypeV128:
				mode.imm = 16
				loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 128)
				resultReg = floatTmp
			default:
				panic("TODO")
			}
			cur = linkInstr(cur, loadIntoTmpReg)
			// Then store the temporary into the caller-visible result slot.
			cur = m.goFunctionCallStoreStackResult(cur, originalRet0Reg, r, resultReg)
		}
	}

	ret := m.allocateInstr()
	ret.asRet()
	linkInstr(cur, ret)

	m.encode(m.executableContext.RootInstr)
	return m.compiler.Buf()
}

// saveRegistersInExecutionContext appends, after cur, one store per register in regs and returns
// the last appended instruction. Registers are written relative to x0 (the execution context),
// starting at wazevoapi.ExecutionContextOffsetSavedRegistersBegin and advancing 16 bytes per
// register: integer registers are stored as 64 bits and float registers as 128 bits.
func (m *machine) saveRegistersInExecutionContext(cur *instruction, regs []regalloc.VReg) *instruction {
	offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64()
	for _, v := range regs {
		store := m.allocateInstr()
		var sizeInBits byte
		switch v.RegType() {
		case regalloc.RegTypeInt:
			sizeInBits = 64
		case regalloc.RegTypeFloat:
			sizeInBits = 128
		}
		store.asStore(operandNR(v),
			addressMode{
				kind: addressModeKindRegUnsignedImm12,
				// Execution context is always the first argument.
				rn: x0VReg, imm: offset,
			}, sizeInBits)
		// Link the store after cur by hand.
		store.prev = cur
		cur.next = store
		cur = store
		offset += 16 // Imm12 must be aligned 16 for vector regs, so we unconditionally store regs at the offset of multiple of 16.
	}
	return cur
}

// restoreRegistersInExecutionContext is the counterpart of saveRegistersInExecutionContext: it
// appends, after cur, one load per register in regs using the same 16-byte-strided offsets
// relative to x0, and returns the last appended instruction.
func (m *machine) restoreRegistersInExecutionContext(cur *instruction, regs []regalloc.VReg) *instruction {
	offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64()
	for _, v := range regs {
		load := m.allocateInstr()
		// Pick the load constructor and width matching the register class.
		var as func(dst operand, amode addressMode, sizeInBits byte)
		var sizeInBits byte
		switch v.RegType() {
		case regalloc.RegTypeInt:
			as = load.asULoad
			sizeInBits = 64
		case regalloc.RegTypeFloat:
			as = load.asFpuLoad
			sizeInBits = 128
		}
		as(operandNR(v),
			addressMode{
				kind: addressModeKindRegUnsignedImm12,
				// Execution context is always the first argument.
				rn: x0VReg, imm: offset,
			}, sizeInBits)
		cur = linkInstr(cur, load)
		offset += 16 // Imm12 must be aligned 16 for vector regs, so we unconditionally load regs at the offset of multiple of 16.
	}
	return cur
}

// lowerConstantI64AndInsert lowers the 64-bit constant v into dst and links the resulting
// instructions after cur, returning the last one. The pending-instruction buffer of the
// executable context is reset first and used to collect what lowerConstantI64 produces.
func (m *machine) lowerConstantI64AndInsert(cur *instruction, dst regalloc.VReg, v int64) *instruction {
	exct := m.executableContext
	exct.PendingInstructions = exct.PendingInstructions[:0]
	m.lowerConstantI64(dst, v)
	for _, instr := range exct.PendingInstructions {
		cur = linkInstr(cur, instr)
	}
	return cur
}

// lowerConstantI32AndInsert is the 32-bit variant of lowerConstantI64AndInsert.
func (m *machine) lowerConstantI32AndInsert(cur *instruction, dst regalloc.VReg, v int32) *instruction {
	exct := m.executableContext
	exct.PendingInstructions = exct.PendingInstructions[:0]
	m.lowerConstantI32(dst, v)
	for _, instr := range exct.PendingInstructions {
		cur = linkInstr(cur, instr)
	}
	return cur
}

// setExitCode appends, after cur, instructions that materialize exitCode in x17 and store it as a
// 32-bit value at wazevoapi.ExecutionContextOffsetExitCodeOffset relative to execCtr.
// It returns the last appended instruction.
func (m *machine) setExitCode(cur *instruction, execCtr regalloc.VReg, exitCode wazevoapi.ExitCode) *instruction {
	constReg := x17VReg // caller-saved, so we can use it.
	cur = m.lowerConstantI32AndInsert(cur, constReg, int32(exitCode))

	// Set the exit status on the execution context.
	setExistStatus := m.allocateInstr()
	setExistStatus.asStore(operandNR(constReg),
		addressMode{
			kind: addressModeKindRegUnsignedImm12,
			rn:   execCtr, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(),
		}, 32)
	cur = linkInstr(cur, setExistStatus)
	return cur
}

// storeReturnAddressAndExit appends, after cur, instructions that compute an address with ADR,
// store it at wazevoapi.ExecutionContextOffsetGoCallReturnAddress relative to x0, and then run the
// exit sequence. It returns the last appended instruction.
func (m *machine) storeReturnAddressAndExit(cur *instruction) *instruction {
	// Read the return address into tmp, and store it in the execution context.
	adr := m.allocateInstr()
	// NOTE(review): the +8 presumably skips the two 4-byte instructions (this adr and the store
	// below) so that the address lands right after the exit sequence — confirm against the encoder.
	adr.asAdr(tmpRegVReg, exitSequenceSize+8)
	cur = linkInstr(cur, adr)

	storeReturnAddr := m.allocateInstr()
	storeReturnAddr.asStore(operandNR(tmpRegVReg),
		addressMode{
			kind: addressModeKindRegUnsignedImm12,
			// Execution context is always the first argument.
			rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(),
		}, 64)
	cur = linkInstr(cur, storeReturnAddr)

	// Exit the execution.
	trapSeq := m.allocateInstr()
	trapSeq.asExitSequence(x0VReg)
	cur = linkInstr(cur, trapSeq)
	return cur
}

// saveCurrentStackPointer appends, after cur, instructions that copy SP into the temporary
// register and store it at wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall relative to
// execCtr. It returns the last appended instruction.
func (m *machine) saveCurrentStackPointer(cur *instruction, execCtr regalloc.VReg) *instruction {
	// Save the current stack pointer:
	// 	mov tmp, sp,
	// 	str tmp, [exec_ctx, #stackPointerBeforeGoCall]
	movSp := m.allocateInstr()
	movSp.asMove64(tmpRegVReg, spVReg)
	cur = linkInstr(cur, movSp)

	strSp := m.allocateInstr()
	strSp.asStore(operandNR(tmpRegVReg),
		addressMode{
			kind: addressModeKindRegUnsignedImm12,
			rn:   execCtr, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(),
		}, 64)
	cur = linkInstr(cur, strSp)
	return cur
}

// goFunctionCallLoadStackArg appends, after cur, a post-index load of the stack-passed argument
// arg from the address held in originalArg0Reg, which is advanced by 8 bytes (16 for v128) by the
// load. Integer types are loaded into intVReg and float/vector types into floatVReg.
// It returns the appended instruction and the register that now holds the value.
// It panics for any other type.
func (m *machine) goFunctionCallLoadStackArg(cur *instruction, originalArg0Reg regalloc.VReg, arg *backend.ABIArg, intVReg, floatVReg regalloc.VReg) (*instruction, regalloc.VReg) {
	load := m.allocateInstr()
	var result regalloc.VReg
	mode := addressMode{kind: addressModeKindPostIndex, rn: originalArg0Reg}
	switch arg.Type {
	case ssa.TypeI32:
		mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
		load.asULoad(operandNR(intVReg), mode, 32)
		result = intVReg
	case ssa.TypeI64:
		mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
		load.asULoad(operandNR(intVReg), mode, 64)
		result = intVReg
	case ssa.TypeF32:
		mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
		load.asFpuLoad(operandNR(floatVReg), mode, 32)
		result = floatVReg
	case ssa.TypeF64:
		mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
		load.asFpuLoad(operandNR(floatVReg), mode, 64)
		result = floatVReg
	case ssa.TypeV128:
		mode.imm = 16
		load.asFpuLoad(operandNR(floatVReg), mode, 128)
		result = floatVReg
	default:
		panic("TODO")
	}

	cur = linkInstr(cur, load)
	return cur, result
}

// goFunctionCallStoreStackResult appends, after cur, a post-index store of resultVReg to the
// address held in originalRet0Reg, which is advanced by 8 bytes (16 for v128) by the store.
// The store width follows result.Type: 32 bits for i32/f32, 64 bits for i64/f64, 128 bits for v128.
// It returns the appended instruction and panics for any other type.
func (m *machine) goFunctionCallStoreStackResult(cur *instruction, originalRet0Reg regalloc.VReg, result *backend.ABIArg, resultVReg regalloc.VReg) *instruction {
	store := m.allocateInstr()
	mode := addressMode{kind: addressModeKindPostIndex, rn: originalRet0Reg}
	var sizeInBits byte
	switch result.Type {
	case ssa.TypeI32, ssa.TypeF32:
		mode.imm = 8
		sizeInBits = 32
	case ssa.TypeI64, ssa.TypeF64:
		mode.imm = 8
		sizeInBits = 64
	case ssa.TypeV128:
		mode.imm = 16
		sizeInBits = 128
	default:
		panic("TODO")
	}
	store.asStore(operandNR(resultVReg), mode, sizeInBits)
	return linkInstr(cur, store)
}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/cond.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/cond.go
new file mode 100644
index 000000000..6f6cdd1b2
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/cond.go
@@ -0,0 +1,215 @@
package arm64

import (
	"strconv"

	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
	"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)

type (
	// cond packs a condKind in its two lowest bits and the kind-specific payload
	// (a register or a condFlag) in the remaining upper bits.
	cond uint64
	// condKind selects how the payload of a cond is interpreted.
	condKind byte
)

const (
	// condKindRegisterZero represents a condition which checks if the register is zero.
	// This indicates that the instruction must be encoded as CBZ:
	// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/CBZ--Compare-and-Branch-on-Zero-
	condKindRegisterZero condKind = iota
	// condKindRegisterNotZero indicates that the instruction must be encoded as CBNZ:
	// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/CBNZ--Compare-and-Branch-on-Nonzero-
	condKindRegisterNotZero
	// condKindCondFlagSet indicates that the instruction must be encoded as B.cond:
	// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/B-cond--Branch-conditionally-
	condKindCondFlagSet
)

// kind returns the kind of condition which is stored in the first two bits.
func (c cond) kind() condKind {
	return condKind(c & 0b11)
}

// asUint64 returns the raw packed representation of the condition.
func (c cond) asUint64() uint64 {
	return uint64(c)
}

// register returns the register for register conditions.
// This panics if the condition is not a register condition (condKindRegisterZero or condKindRegisterNotZero).
func (c cond) register() regalloc.VReg {
	if c.kind() != condKindRegisterZero && c.kind() != condKindRegisterNotZero {
		panic("condition is not a register")
	}
	return regalloc.VReg(c >> 2)
}

// registerAsRegZeroCond packs r into a cond of kind condKindRegisterZero.
func registerAsRegZeroCond(r regalloc.VReg) cond {
	return cond(r)<<2 | cond(condKindRegisterZero)
}

// registerAsRegNotZeroCond packs r into a cond of kind condKindRegisterNotZero.
func registerAsRegNotZeroCond(r regalloc.VReg) cond {
	return cond(r)<<2 | cond(condKindRegisterNotZero)
}

// flag returns the condition flag for flag conditions.
// This panics if the condition is not of kind condKindCondFlagSet.
func (c cond) flag() condFlag {
	if c.kind() != condKindCondFlagSet {
		panic("condition is not a flag")
	}
	return condFlag(c >> 2)
}

// asCond packs the flag into a cond of kind condKindCondFlagSet.
func (c condFlag) asCond() cond {
	return cond(c)<<2 | cond(condKindCondFlagSet)
}

// condFlag represents a condition flag for conditional branches.
// The value matches the encoding of condition flags in the ARM64 instruction set.
// https://developer.arm.com/documentation/den0024/a/The-A64-instruction-set/Data-processing-instructions/Conditional-instructions
type condFlag uint8

const (
	eq condFlag = iota // eq represents "equal"
	ne                 // ne represents "not equal"
	hs                 // hs represents "higher or same"
	lo                 // lo represents "lower"
	mi                 // mi represents "minus or negative result"
	pl                 // pl represents "plus or positive result"
	vs                 // vs represents "overflow set"
	vc                 // vc represents "overflow clear"
	hi                 // hi represents "higher"
	ls                 // ls represents "lower or same"
	ge                 // ge represents "greater or equal"
	lt                 // lt represents "less than"
	gt                 // gt represents "greater than"
	le                 // le represents "less than or equal"
	al                 // al represents "always"
	nv                 // nv represents "never"
)

// invert returns the inverted condition.
// It panics with the flag value if c is not one of the flags declared above.
func (c condFlag) invert() condFlag {
	switch c {
	case eq:
		return ne
	case ne:
		return eq
	case hs:
		return lo
	case lo:
		return hs
	case mi:
		return pl
	case pl:
		return mi
	case vs:
		return vc
	case vc:
		return vs
	case hi:
		return ls
	case ls:
		return hi
	case ge:
		return lt
	case lt:
		return ge
	case gt:
		return le
	case le:
		return gt
	case al:
		return nv
	case nv:
		return al
	default:
		panic(c)
	}
}

// String implements fmt.Stringer.
// It panics with the decimal value of c if c is not one of the flags declared above.
func (c condFlag) String() string {
	switch c {
	case eq:
		return "eq"
	case ne:
		return "ne"
	case hs:
		return "hs"
	case lo:
		return "lo"
	case mi:
		return "mi"
	case pl:
		return "pl"
	case vs:
		return "vs"
	case vc:
		return "vc"
	case hi:
		return "hi"
	case ls:
		return "ls"
	case ge:
		return "ge"
	case lt:
		return "lt"
	case gt:
		return "gt"
	case le:
		return "le"
	case al:
		return "al"
	case nv:
		return "nv"
	default:
		panic(strconv.Itoa(int(c)))
	}
}

// condFlagFromSSAIntegerCmpCond returns the condition flag for the given ssa.IntegerCmpCond.
+func condFlagFromSSAIntegerCmpCond(c ssa.IntegerCmpCond) condFlag { + switch c { + case ssa.IntegerCmpCondEqual: + return eq + case ssa.IntegerCmpCondNotEqual: + return ne + case ssa.IntegerCmpCondSignedLessThan: + return lt + case ssa.IntegerCmpCondSignedGreaterThanOrEqual: + return ge + case ssa.IntegerCmpCondSignedGreaterThan: + return gt + case ssa.IntegerCmpCondSignedLessThanOrEqual: + return le + case ssa.IntegerCmpCondUnsignedLessThan: + return lo + case ssa.IntegerCmpCondUnsignedGreaterThanOrEqual: + return hs + case ssa.IntegerCmpCondUnsignedGreaterThan: + return hi + case ssa.IntegerCmpCondUnsignedLessThanOrEqual: + return ls + default: + panic(c) + } +} + +// condFlagFromSSAFloatCmpCond returns the condition flag for the given ssa.FloatCmpCond. +func condFlagFromSSAFloatCmpCond(c ssa.FloatCmpCond) condFlag { + switch c { + case ssa.FloatCmpCondEqual: + return eq + case ssa.FloatCmpCondNotEqual: + return ne + case ssa.FloatCmpCondLessThan: + return mi + case ssa.FloatCmpCondLessThanOrEqual: + return ls + case ssa.FloatCmpCondGreaterThan: + return gt + case ssa.FloatCmpCondGreaterThanOrEqual: + return ge + default: + panic(c) + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go new file mode 100644 index 000000000..8aabc5997 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go @@ -0,0 +1,2545 @@ +package arm64 + +import ( + "fmt" + "math" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +type ( + // instruction represents either a real instruction in arm64, or the meta instructions + // that are convenient for code generation. For example, inline constants are also treated + // as instructions. 
+ // + // Basically, each instruction knows how to get encoded in binaries. Hence, the final output of compilation + // can be considered equivalent to the sequence of such instructions. + // + // Each field is interpreted depending on the kind. + // + // TODO: optimize the layout later once the impl settles. + instruction struct { + prev, next *instruction + u1, u2, u3 uint64 + rd, rm, rn, ra operand + amode addressMode + kind instructionKind + addedBeforeRegAlloc bool + } + + // instructionKind represents the kind of instruction. + // This controls how the instruction struct is interpreted. + instructionKind byte +) + +func asNop0(i *instruction) { + i.kind = nop0 +} + +func setNext(i, next *instruction) { + i.next = next +} + +func setPrev(i, prev *instruction) { + i.prev = prev +} + +// IsCall implements regalloc.Instr IsCall. +func (i *instruction) IsCall() bool { + return i.kind == call +} + +// IsIndirectCall implements regalloc.Instr IsIndirectCall. +func (i *instruction) IsIndirectCall() bool { + return i.kind == callInd +} + +// IsReturn implements regalloc.Instr IsReturn. +func (i *instruction) IsReturn() bool { + return i.kind == ret +} + +// Next implements regalloc.Instr Next. +func (i *instruction) Next() regalloc.Instr { + return i.next +} + +// Prev implements regalloc.Instr Prev. +func (i *instruction) Prev() regalloc.Instr { + return i.prev +} + +// AddedBeforeRegAlloc implements regalloc.Instr AddedBeforeRegAlloc. 
func (i *instruction) AddedBeforeRegAlloc() bool {
	return i.addedBeforeRegAlloc
}

// defKind describes which registers, if any, an instruction kind defines.
type defKind byte

const (
	// defKindNone means the instruction defines no register.
	// The values start at 1 so that an instruction kind missing from defKinds (zero value)
	// falls into the panicking default branch of Defs and AssignDef.
	defKindNone defKind = iota + 1
	// defKindRD means the instruction defines the register held in rd.
	defKindRD
	// defKindCall means the instruction defines the result registers described by the ABI info in u2.
	defKindCall
)

// defKinds maps each instruction kind to its defKind.
var defKinds = [numInstructionKinds]defKind{
	adr:                  defKindRD,
	aluRRR:               defKindRD,
	aluRRRR:              defKindRD,
	aluRRImm12:           defKindRD,
	aluRRBitmaskImm:      defKindRD,
	aluRRRShift:          defKindRD,
	aluRRImmShift:        defKindRD,
	aluRRRExtend:         defKindRD,
	bitRR:                defKindRD,
	movZ:                 defKindRD,
	movK:                 defKindRD,
	movN:                 defKindRD,
	mov32:                defKindRD,
	mov64:                defKindRD,
	fpuMov64:             defKindRD,
	fpuMov128:            defKindRD,
	fpuRR:                defKindRD,
	fpuRRR:               defKindRD,
	nop0:                 defKindNone,
	call:                 defKindCall,
	callInd:              defKindCall,
	ret:                  defKindNone,
	store8:               defKindNone,
	store16:              defKindNone,
	store32:              defKindNone,
	store64:              defKindNone,
	exitSequence:         defKindNone,
	condBr:               defKindNone,
	br:                   defKindNone,
	brTableSequence:      defKindNone,
	cSet:                 defKindRD,
	extend:               defKindRD,
	fpuCmp:               defKindNone,
	uLoad8:               defKindRD,
	uLoad16:              defKindRD,
	uLoad32:              defKindRD,
	sLoad8:               defKindRD,
	sLoad16:              defKindRD,
	sLoad32:              defKindRD,
	uLoad64:              defKindRD,
	fpuLoad32:            defKindRD,
	fpuLoad64:            defKindRD,
	fpuLoad128:           defKindRD,
	vecLoad1R:            defKindRD,
	loadFpuConst32:       defKindRD,
	loadFpuConst64:       defKindRD,
	loadFpuConst128:      defKindRD,
	fpuStore32:           defKindNone,
	fpuStore64:           defKindNone,
	fpuStore128:          defKindNone,
	udf:                  defKindNone,
	cSel:                 defKindRD,
	fpuCSel:              defKindRD,
	movToVec:             defKindRD,
	movFromVec:           defKindRD,
	movFromVecSigned:     defKindRD,
	vecDup:               defKindRD,
	vecDupElement:        defKindRD,
	vecExtract:           defKindRD,
	vecMisc:              defKindRD,
	vecMovElement:        defKindRD,
	vecLanes:             defKindRD,
	vecShiftImm:          defKindRD,
	vecTbl:               defKindRD,
	vecTbl2:              defKindRD,
	vecPermute:           defKindRD,
	vecRRR:               defKindRD,
	vecRRRRewrite:        defKindNone,
	fpuToInt:             defKindRD,
	intToFpu:             defKindRD,
	cCmpImm:              defKindNone,
	movToFPSR:            defKindNone,
	movFromFPSR:          defKindRD,
	emitSourceOffsetInfo: defKindNone,
	atomicRmw:            defKindRD,
	atomicCas:            defKindNone,
	atomicLoad:           defKindRD,
	atomicStore:          defKindNone,
	dmb:                  defKindNone,
	loadConstBlockArg:    defKindRD,
}

// Defs returns the list of regalloc.VReg that are defined by the instruction.
// In order to reduce the number of allocations, the caller can pass the slice to be used.
// The slice pointed to by regs is truncated and reused; the returned slice aliases it.
// It panics if the instruction kind has no entry in defKinds.
func (i *instruction) Defs(regs *[]regalloc.VReg) []regalloc.VReg {
	*regs = (*regs)[:0]
	switch defKinds[i.kind] {
	case defKindNone:
	case defKindRD:
		*regs = append(*regs, i.rd.nr())
	case defKindCall:
		// The call defines the first N integer and float result registers of the ABI,
		// where the counts are decoded from the ABI info stored in u2.
		// Note that the loop variable i shadows the receiver inside the loops.
		_, _, retIntRealRegs, retFloatRealRegs, _ := backend.ABIInfoFromUint64(i.u2)
		for i := byte(0); i < retIntRealRegs; i++ {
			*regs = append(*regs, regInfo.RealRegToVReg[intParamResultRegs[i]])
		}
		for i := byte(0); i < retFloatRealRegs; i++ {
			*regs = append(*regs, regInfo.RealRegToVReg[floatParamResultRegs[i]])
		}
	default:
		panic(fmt.Sprintf("defKind for %v not defined", i))
	}
	return *regs
}

// AssignDef implements regalloc.Instr AssignDef.
// For defKindRD instructions it replaces the register in rd with reg; it is a no-op for
// defKindNone and panics for call instructions and for kinds without a defKinds entry.
func (i *instruction) AssignDef(reg regalloc.VReg) {
	switch defKinds[i.kind] {
	case defKindNone:
	case defKindRD:
		i.rd = i.rd.assignReg(reg)
	case defKindCall:
		panic("BUG: call instructions shouldn't be assigned")
	default:
		panic(fmt.Sprintf("defKind for %v not defined", i))
	}
}

// useKind describes which operands of an instruction kind are read as registers.
// The order in which Uses reports registers for a useKind is the index space of AssignUse.
type useKind byte

const (
	// useKindNone means the instruction uses no register.
	// The values start at 1 so that an instruction kind missing from useKinds (zero value)
	// falls into the panicking default branch of Uses and AssignUse.
	useKindNone useKind = iota + 1
	// useKindRN means the instruction uses rn.
	useKindRN
	// useKindRNRM means the instruction uses rn and rm.
	useKindRNRM
	// useKindRNRMRA means the instruction uses rn, rm and ra.
	useKindRNRMRA
	// useKindRNRN1RM means the instruction uses rn, the register following rn, and rm.
	useKindRNRN1RM
	// useKindCall means the instruction uses the argument registers described by the ABI info in u2.
	useKindCall
	// useKindCallInd is like useKindCall, with the addition of the function pointer held in rn.
	useKindCallInd
	// useKindAMode means the instruction uses the registers of its address mode.
	useKindAMode
	// useKindRNAMode means the instruction uses rn and the registers of its address mode.
	useKindRNAMode
	// useKindCond means the instruction uses the register embedded in its condition, if any.
	useKindCond
	// useKindRDRewrite indicates an instruction where RD is used both as a source and destination.
	// A temporary register for RD must be allocated explicitly with the source copied to this
	// register before the instruction and the value copied from this register to the instruction
	// return register.
	useKindRDRewrite
)

// useKinds maps each instruction kind to its useKind.
var useKinds = [numInstructionKinds]useKind{
	udf:                  useKindNone,
	aluRRR:               useKindRNRM,
	aluRRRR:              useKindRNRMRA,
	aluRRImm12:           useKindRN,
	aluRRBitmaskImm:      useKindRN,
	aluRRRShift:          useKindRNRM,
	aluRRImmShift:        useKindRN,
	aluRRRExtend:         useKindRNRM,
	bitRR:                useKindRN,
	movZ:                 useKindNone,
	movK:                 useKindNone,
	movN:                 useKindNone,
	mov32:                useKindRN,
	mov64:                useKindRN,
	fpuMov64:             useKindRN,
	fpuMov128:            useKindRN,
	fpuRR:                useKindRN,
	fpuRRR:               useKindRNRM,
	nop0:                 useKindNone,
	call:                 useKindCall,
	callInd:              useKindCallInd,
	ret:                  useKindNone,
	store8:               useKindRNAMode,
	store16:              useKindRNAMode,
	store32:              useKindRNAMode,
	store64:              useKindRNAMode,
	exitSequence:         useKindRN,
	condBr:               useKindCond,
	br:                   useKindNone,
	brTableSequence:      useKindRN,
	cSet:                 useKindNone,
	extend:               useKindRN,
	fpuCmp:               useKindRNRM,
	uLoad8:               useKindAMode,
	uLoad16:              useKindAMode,
	uLoad32:              useKindAMode,
	sLoad8:               useKindAMode,
	sLoad16:              useKindAMode,
	sLoad32:              useKindAMode,
	uLoad64:              useKindAMode,
	fpuLoad32:            useKindAMode,
	fpuLoad64:            useKindAMode,
	fpuLoad128:           useKindAMode,
	fpuStore32:           useKindRNAMode,
	fpuStore64:           useKindRNAMode,
	fpuStore128:          useKindRNAMode,
	loadFpuConst32:       useKindNone,
	loadFpuConst64:       useKindNone,
	loadFpuConst128:      useKindNone,
	vecLoad1R:            useKindRN,
	cSel:                 useKindRNRM,
	fpuCSel:              useKindRNRM,
	movToVec:             useKindRN,
	movFromVec:           useKindRN,
	movFromVecSigned:     useKindRN,
	vecDup:               useKindRN,
	vecDupElement:        useKindRN,
	vecExtract:           useKindRNRM,
	cCmpImm:              useKindRN,
	vecMisc:              useKindRN,
	vecMovElement:        useKindRN,
	vecLanes:             useKindRN,
	vecShiftImm:          useKindRN,
	vecTbl:               useKindRNRM,
	vecTbl2:              useKindRNRN1RM,
	vecRRR:               useKindRNRM,
	vecRRRRewrite:        useKindRDRewrite,
	vecPermute:           useKindRNRM,
	fpuToInt:             useKindRN,
	intToFpu:             useKindRN,
	movToFPSR:            useKindRN,
	movFromFPSR:          useKindNone,
	adr:                  useKindNone,
	emitSourceOffsetInfo: useKindNone,
	atomicRmw:            useKindRNRM,
	atomicCas:            useKindRDRewrite,
	atomicLoad:           useKindRN,
	atomicStore:          useKindRNRM,
	loadConstBlockArg:    useKindNone,
	dmb:                  useKindNone,
}

// Uses returns the list of regalloc.VReg that are used by the instruction.
// In order to reduce the number of allocations, the caller can pass the slice to be used.
// The slice pointed to by regs is truncated and reused; the returned slice aliases it.
// It panics if the instruction kind has no entry in useKinds.
func (i *instruction) Uses(regs *[]regalloc.VReg) []regalloc.VReg {
	*regs = (*regs)[:0]
	switch useKinds[i.kind] {
	case useKindNone:
	case useKindRN:
		if rn := i.rn.reg(); rn.Valid() {
			*regs = append(*regs, rn)
		}
	case useKindRNRM:
		if rn := i.rn.reg(); rn.Valid() {
			*regs = append(*regs, rn)
		}
		if rm := i.rm.reg(); rm.Valid() {
			*regs = append(*regs, rm)
		}
	case useKindRNRMRA:
		if rn := i.rn.reg(); rn.Valid() {
			*regs = append(*regs, rn)
		}
		if rm := i.rm.reg(); rm.Valid() {
			*regs = append(*regs, rm)
		}
		if ra := i.ra.reg(); ra.Valid() {
			*regs = append(*regs, ra)
		}
	case useKindRNRN1RM:
		// rn and its successor are reported only when rn is a real register, since the
		// successor is derived from rn's real register number.
		if rn := i.rn.reg(); rn.Valid() && rn.IsRealReg() {
			rn1 := regalloc.FromRealReg(rn.RealReg()+1, rn.RegType())
			*regs = append(*regs, rn, rn1)
		}
		if rm := i.rm.reg(); rm.Valid() {
			*regs = append(*regs, rm)
		}
	case useKindAMode:
		if amodeRN := i.amode.rn; amodeRN.Valid() {
			*regs = append(*regs, amodeRN)
		}
		if amodeRM := i.amode.rm; amodeRM.Valid() {
			*regs = append(*regs, amodeRM)
		}
	case useKindRNAMode:
		// The stored value (rn) always comes first, followed by the address registers.
		*regs = append(*regs, i.rn.reg())
		if amodeRN := i.amode.rn; amodeRN.Valid() {
			*regs = append(*regs, amodeRN)
		}
		if amodeRM := i.amode.rm; amodeRM.Valid() {
			*regs = append(*regs, amodeRM)
		}
	case useKindCond:
		// Only register conditions (CBZ/CBNZ style) carry a register; flag conditions use none.
		cnd := cond(i.u1)
		if cnd.kind() != condKindCondFlagSet {
			*regs = append(*regs, cnd.register())
		}
	case useKindCallInd:
		// The function pointer comes first, then the same argument registers as a direct call.
		*regs = append(*regs, i.rn.nr())
		fallthrough
	case useKindCall:
		// Note that the loop variable i shadows the receiver inside the loops.
		argIntRealRegs, argFloatRealRegs, _, _, _ := backend.ABIInfoFromUint64(i.u2)
		for i := byte(0); i < argIntRealRegs; i++ {
			*regs = append(*regs, regInfo.RealRegToVReg[intParamResultRegs[i]])
		}
		for i := byte(0); i < argFloatRealRegs; i++ {
			*regs = append(*regs, regInfo.RealRegToVReg[floatParamResultRegs[i]])
		}
	case useKindRDRewrite:
		// rd is reported as a use as well because it is read before being overwritten.
		*regs = append(*regs, i.rn.reg())
		*regs = append(*regs, i.rm.reg())
		*regs = append(*regs, i.rd.reg())
	default:
		panic(fmt.Sprintf("useKind for %v not defined", i))
	}
	return *regs
}

// AssignUse replaces the index-th used register of the instruction with reg, where index
// refers to the position of the register in the slice returned by Uses.
// It panics for direct call instructions and for kinds without a useKinds entry.
func (i *instruction) AssignUse(index int, reg regalloc.VReg) {
	switch useKinds[i.kind] {
	case useKindNone:
	case useKindRN:
		if rn := i.rn.reg(); rn.Valid() {
			i.rn = i.rn.assignReg(reg)
		}
	case useKindRNRM:
		if index == 0 {
			if rn := i.rn.reg(); rn.Valid() {
				i.rn = i.rn.assignReg(reg)
			}
		} else {
			if rm := i.rm.reg(); rm.Valid() {
				i.rm = i.rm.assignReg(reg)
			}
		}
	case useKindRDRewrite:
		if index == 0 {
			if rn := i.rn.reg(); rn.Valid() {
				i.rn = i.rn.assignReg(reg)
			}
		} else if index == 1 {
			if rm := i.rm.reg(); rm.Valid() {
				i.rm = i.rm.assignReg(reg)
			}
		} else {
			if rd := i.rd.reg(); rd.Valid() {
				i.rd = i.rd.assignReg(reg)
			}
		}
	case useKindRNRN1RM:
		if index == 0 {
			if rn := i.rn.reg(); rn.Valid() {
				i.rn = i.rn.assignReg(reg)
			}
			// NOTE(review): this writes reg+1 into rm, not into a field for rn+1. Uses reports
			// rn, rn+1, rm, so later calls with index 1 and 2 both take the else branch and the
			// last one overwrites rm with the real rm assignment. Confirm that callers always
			// assign every index in order; otherwise rm is left holding reg+1.
			if rn1 := i.rn.reg() + 1; rn1.Valid() {
				i.rm = i.rm.assignReg(reg + 1)
			}
		} else {
			if rm := i.rm.reg(); rm.Valid() {
				i.rm = i.rm.assignReg(reg)
			}
		}
	case useKindRNRMRA:
		if index == 0 {
			if rn := i.rn.reg(); rn.Valid() {
				i.rn = i.rn.assignReg(reg)
			}
		} else if index == 1 {
			if rm := i.rm.reg(); rm.Valid() {
				i.rm = i.rm.assignReg(reg)
			}
		} else {
			if ra := i.ra.reg(); ra.Valid() {
				i.ra = i.ra.assignReg(reg)
			}
		}
	case useKindAMode:
		if index == 0 {
			if amodeRN := i.amode.rn; amodeRN.Valid() {
				i.amode.rn = reg
			}
		} else {
			if amodeRM := i.amode.rm; amodeRM.Valid() {
				i.amode.rm = reg
			}
		}
	case useKindRNAMode:
		// Index 0 is the stored value; indices 1 and 2 are the address mode registers, which
		// must exist if the register allocator is asking to assign them.
		if index == 0 {
			i.rn = i.rn.assignReg(reg)
		} else if index == 1 {
			if amodeRN := i.amode.rn; amodeRN.Valid() {
				i.amode.rn = reg
			} else {
				panic("BUG")
			}
		} else {
			if amodeRM := i.amode.rm; amodeRM.Valid() {
				i.amode.rm = reg
			} else {
				panic("BUG")
			}
		}
	case useKindCond:
		// Re-pack the condition with the new register, preserving its kind.
		// Flag conditions carry no register and are left untouched.
		c := cond(i.u1)
		switch c.kind() {
		case condKindRegisterZero:
			i.u1 = uint64(registerAsRegZeroCond(reg))
		case condKindRegisterNotZero:
			i.u1 = uint64(registerAsRegNotZeroCond(reg))
		}
	case useKindCall:
		panic("BUG: call instructions shouldn't be assigned")
	case useKindCallInd:
		i.rn = i.rn.assignReg(reg)
	default:
		panic(fmt.Sprintf("useKind for %v not defined", i))
	}
}

// asCall makes i a direct call to ref. When abi is non-nil, its packed ABI info is stored in u2,
// which is what Uses and Defs decode to find the argument and result registers.
func (i *instruction) asCall(ref ssa.FuncRef, abi *backend.FunctionABI) {
	i.kind = call
	i.u1 = uint64(ref)
	if abi != nil {
		i.u2 = abi.ABIInfoAsUint64()
	}
}

// asCallIndirect makes i an indirect call through the function pointer held in ptr.
// When abi is non-nil, its packed ABI info is stored in u2 just like asCall.
func (i *instruction) asCallIndirect(ptr regalloc.VReg, abi *backend.FunctionABI) {
	i.kind = callInd
	i.rn = operandNR(ptr)
	if abi != nil {
		i.u2 = abi.ABIInfoAsUint64()
	}
}

// callFuncRef returns the function reference stored by asCall.
func (i *instruction) callFuncRef() ssa.FuncRef {
	return ssa.FuncRef(i.u1)
}

// asMOVZ makes i a movZ instruction writing imm into dst.
//
// shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false)
func (i *instruction) asMOVZ(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) {
	i.kind = movZ
	i.rd = operandNR(dst)
	i.u1 = imm
	i.u2 = shift
	if dst64bit {
		i.u3 = 1
	}
}

// asMOVK makes i a movK instruction writing imm into dst.
//
// shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false)
func (i *instruction) asMOVK(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) {
	i.kind = movK
	i.rd = operandNR(dst)
	i.u1 = imm
	i.u2 = shift
	if dst64bit {
		i.u3 = 1
	}
}

// asMOVN makes i a movN instruction writing imm into dst.
//
// shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false)
func (i *instruction) asMOVN(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) {
	i.kind = movN
	i.rd = operandNR(dst)
	i.u1 = imm
	i.u2 = shift
	if dst64bit {
		i.u3 = 1
	}
}

// asNop0 makes i a nop0 instruction and returns i to allow chaining.
func (i *instruction) asNop0() *instruction {
	i.kind = nop0
	return i
}

// asNop0WithLabel makes i a nop0 instruction that carries the label l in u1.
func (i *instruction) asNop0WithLabel(l label) {
	i.kind = nop0
	i.u1 = uint64(l)
}

// nop0Label returns the label stored by asNop0WithLabel.
func (i *instruction) nop0Label() label {
	return label(i.u1)
}

// asRet makes i a return instruction.
func (i *instruction) asRet() {
	i.kind = ret
}

// asStorePair64 makes i a 64-bit store-pair instruction writing src1 and src2 to amode.
func (i *instruction) asStorePair64(src1, src2 regalloc.VReg, amode addressMode) {
	i.kind = storeP64
	i.rn = operandNR(src1)
	i.rm = operandNR(src2)
	i.amode = amode
}

// asLoadPair64 makes i a 64-bit load-pair instruction using amode.
// NOTE(review): the parameters are named src1/src2 but are presumably the destination
// registers of the load — confirm against the encoder.
func (i *instruction) asLoadPair64(src1, src2 regalloc.VReg, amode addressMode) {
	i.kind = loadP64
	i.rn = operandNR(src1)
	i.rm = operandNR(src2)
	i.amode = amode
}

// asStore makes i a store of src to amode with the given width. For 32- and 64-bit widths the
// integer or FPU variant is picked from the register type of src; 128 bits is always an FPU store.
// NOTE(review): unlike asSLoad, an unsupported sizeInBits leaves kind unchanged without panicking.
func (i *instruction) asStore(src operand, amode addressMode, sizeInBits byte) {
	switch sizeInBits {
	case 8:
		i.kind = store8
	case 16:
		i.kind = store16
	case 32:
		if src.reg().RegType() == regalloc.RegTypeInt {
			i.kind = store32
		} else {
			i.kind = fpuStore32
		}
	case 64:
		if src.reg().RegType() == regalloc.RegTypeInt {
			i.kind = store64
		} else {
			i.kind = fpuStore64
		}
	case 128:
		i.kind = fpuStore128
	}
	i.rn = src
	i.amode = amode
}

// asSLoad makes i a sign-extending load from amode into dst.
// sizeInBits must be 8, 16 or 32; it panics otherwise.
func (i *instruction) asSLoad(dst operand, amode addressMode, sizeInBits byte) {
	switch sizeInBits {
	case 8:
		i.kind = sLoad8
	case 16:
		i.kind = sLoad16
	case 32:
		i.kind = sLoad32
	default:
		panic("BUG")
	}
	i.rd = dst
	i.amode = amode
}

// asULoad makes i an unsigned (zero-extending) load from amode into dst for widths 8, 16, 32 or 64.
// NOTE(review): unlike asSLoad, an unsupported sizeInBits leaves kind unchanged without panicking.
func (i *instruction) asULoad(dst operand, amode addressMode, sizeInBits byte) {
	switch sizeInBits {
	case 8:
		i.kind = uLoad8
	case 16:
		i.kind = uLoad16
	case 32:
		i.kind = uLoad32
	case 64:
		i.kind = uLoad64
	}
	i.rd = dst
	i.amode = amode
}

// asFpuLoad makes i an FPU/vector load from amode into dst for widths 32, 64 or 128.
// NOTE(review): unlike asSLoad, an unsupported sizeInBits leaves kind unchanged without panicking.
func (i *instruction) asFpuLoad(dst operand, amode addressMode, sizeInBits byte) {
	switch sizeInBits {
	case 32:
		i.kind = fpuLoad32
	case 64:
		i.kind = fpuLoad64
	case 128:
		i.kind = fpuLoad128
	}
	i.rd = dst
	i.amode = amode
}

// asVecLoad1R makes i a vecLoad1R instruction loading from the address in rn into rd
// with the vector arrangement arr.
func (i *instruction) asVecLoad1R(rd, rn operand, arr vecArrangement) {
	// NOTE: currently only has support for no-offset loads, though it is suspicious that
	// we would need to support offset load (that is only available for post-index).
	i.kind = vecLoad1R
	i.rd = rd
	i.rn = rn
	i.u1 = uint64(arr)
}

// asCSet makes i a cSet instruction writing into rd according to the condition flag c.
// mask is recorded in u2 (1 when true).
func (i *instruction) asCSet(rd regalloc.VReg, mask bool, c condFlag) {
	i.kind = cSet
	i.rd = operandNR(rd)
	i.u1 = uint64(c)
	if mask {
		i.u2 = 1
	}
}

// asCSel makes i a conditional select: rd receives rn or rm depending on the condition flag c.
// _64bit is recorded in u3 (1 when true).
func (i *instruction) asCSel(rd, rn, rm operand, c condFlag, _64bit bool) {
	i.kind = cSel
	i.rd = rd
	i.rn = rn
	i.rm = rm
	i.u1 = uint64(c)
	if _64bit {
		i.u3 = 1
	}
}

// asFpuCSel is the FPU register variant of asCSel.
func (i *instruction) asFpuCSel(rd, rn, rm operand, c condFlag, _64bit bool) {
	i.kind = fpuCSel
	i.rd = rd
	i.rn = rn
	i.rm = rm
	i.u1 = uint64(c)
	if _64bit {
		i.u3 = 1
	}
}

// asBr makes i an unconditional branch to target.
// It panics if target is labelReturn, which the call site is expected to special-case.
func (i *instruction) asBr(target label) {
	if target == labelReturn {
		panic("BUG: call site should special case for returnLabel")
	}
	i.kind = br
	i.u1 = uint64(target)
}

// asBrTableSequence makes i a branch-table sequence indexed by indexReg.
// targetIndex and targetCounts are recorded in u1 and u2 respectively.
func (i *instruction) asBrTableSequence(indexReg regalloc.VReg, targetIndex, targetCounts int) {
	i.kind = brTableSequence
	i.rn = operandNR(indexReg)
	i.u1 = uint64(targetIndex)
	i.u2 = uint64(targetCounts)
}

// brTableSequenceOffsetsResolved marks the branch-table offsets as resolved.
func (i *instruction) brTableSequenceOffsetsResolved() {
	i.u3 = 1 // indicate that the offsets are resolved, for debugging.
}

// brLabel returns the target label stored by asBr.
func (i *instruction) brLabel() label {
	return label(i.u1)
}

// brOffsetResolve is called when the target label is resolved.
// It records the resolved offset in u2.
func (i *instruction) brOffsetResolve(offset int64) {
	i.u2 = uint64(offset)
	i.u3 = 1 // indicate that the offset is resolved, for debugging.
}

// brOffset returns the offset recorded by brOffsetResolve.
func (i *instruction) brOffset() int64 {
	return int64(i.u2)
}

// asCondBr encodes a conditional branch instruction. is64bit is only needed when cond is not flag.
// The condition is stored in u1 and the target label in u2.
func (i *instruction) asCondBr(c cond, target label, is64bit bool) {
	i.kind = condBr
	i.u1 = c.asUint64()
	i.u2 = uint64(target)
	if is64bit {
		i.u3 = 1
	}
}

// setCondBrTargets replaces the target label of a conditional branch.
func (i *instruction) setCondBrTargets(target label) {
	i.u2 = uint64(target)
}

// condBrLabel returns the target label of a conditional branch.
func (i *instruction) condBrLabel() label {
	return label(i.u2)
}

// condBrOffsetResolve is called when the target label is resolved.
+func (i *instruction) condBrOffsetResolve(offset int64) { + i.rd.data = uint64(offset) + i.rd.data2 = 1 // indicate that the offset is resolved, for debugging. +} + +// condBrOffsetResolved returns true if condBrOffsetResolve is already called. +func (i *instruction) condBrOffsetResolved() bool { + return i.rd.data2 == 1 +} + +func (i *instruction) condBrOffset() int64 { + return int64(i.rd.data) +} + +func (i *instruction) condBrCond() cond { + return cond(i.u1) +} + +func (i *instruction) condBr64bit() bool { + return i.u3 == 1 +} + +func (i *instruction) asLoadFpuConst32(rd regalloc.VReg, raw uint64) { + i.kind = loadFpuConst32 + i.u1 = raw + i.rd = operandNR(rd) +} + +func (i *instruction) asLoadFpuConst64(rd regalloc.VReg, raw uint64) { + i.kind = loadFpuConst64 + i.u1 = raw + i.rd = operandNR(rd) +} + +func (i *instruction) asLoadFpuConst128(rd regalloc.VReg, lo, hi uint64) { + i.kind = loadFpuConst128 + i.u1 = lo + i.u2 = hi + i.rd = operandNR(rd) +} + +func (i *instruction) asFpuCmp(rn, rm operand, is64bit bool) { + i.kind = fpuCmp + i.rn, i.rm = rn, rm + if is64bit { + i.u3 = 1 + } +} + +func (i *instruction) asCCmpImm(rn operand, imm uint64, c condFlag, flag byte, is64bit bool) { + i.kind = cCmpImm + i.rn = rn + i.rm.data = imm + i.u1 = uint64(c) + i.u2 = uint64(flag) + if is64bit { + i.u3 = 1 + } +} + +// asALU setups a basic ALU instruction. +func (i *instruction) asALU(aluOp aluOp, rd, rn, rm operand, dst64bit bool) { + switch rm.kind { + case operandKindNR: + i.kind = aluRRR + case operandKindSR: + i.kind = aluRRRShift + case operandKindER: + i.kind = aluRRRExtend + case operandKindImm12: + i.kind = aluRRImm12 + default: + panic("BUG") + } + i.u1 = uint64(aluOp) + i.rd, i.rn, i.rm = rd, rn, rm + if dst64bit { + i.u3 = 1 + } +} + +// asALU setups a basic ALU instruction. 
+func (i *instruction) asALURRRR(aluOp aluOp, rd, rn, rm, ra operand, dst64bit bool) { + i.kind = aluRRRR + i.u1 = uint64(aluOp) + i.rd, i.rn, i.rm, i.ra = rd, rn, rm, ra + if dst64bit { + i.u3 = 1 + } +} + +// asALUShift setups a shift based ALU instruction. +func (i *instruction) asALUShift(aluOp aluOp, rd, rn, rm operand, dst64bit bool) { + switch rm.kind { + case operandKindNR: + i.kind = aluRRR // If the shift amount op is a register, then the instruction is encoded as a normal ALU instruction with two register operands. + case operandKindShiftImm: + i.kind = aluRRImmShift + default: + panic("BUG") + } + i.u1 = uint64(aluOp) + i.rd, i.rn, i.rm = rd, rn, rm + if dst64bit { + i.u3 = 1 + } +} + +func (i *instruction) asALUBitmaskImm(aluOp aluOp, rd, rn regalloc.VReg, imm uint64, dst64bit bool) { + i.kind = aluRRBitmaskImm + i.u1 = uint64(aluOp) + i.rn, i.rd = operandNR(rn), operandNR(rd) + i.u2 = imm + if dst64bit { + i.u3 = 1 + } +} + +func (i *instruction) asMovToFPSR(rn regalloc.VReg) { + i.kind = movToFPSR + i.rn = operandNR(rn) +} + +func (i *instruction) asMovFromFPSR(rd regalloc.VReg) { + i.kind = movFromFPSR + i.rd = operandNR(rd) +} + +func (i *instruction) asBitRR(bitOp bitOp, rd, rn regalloc.VReg, is64bit bool) { + i.kind = bitRR + i.rn, i.rd = operandNR(rn), operandNR(rd) + i.u1 = uint64(bitOp) + if is64bit { + i.u2 = 1 + } +} + +func (i *instruction) asFpuRRR(op fpuBinOp, rd, rn, rm operand, dst64bit bool) { + i.kind = fpuRRR + i.u1 = uint64(op) + i.rd, i.rn, i.rm = rd, rn, rm + if dst64bit { + i.u3 = 1 + } +} + +func (i *instruction) asFpuRR(op fpuUniOp, rd, rn operand, dst64bit bool) { + i.kind = fpuRR + i.u1 = uint64(op) + i.rd, i.rn = rd, rn + if dst64bit { + i.u3 = 1 + } +} + +func (i *instruction) asExtend(rd, rn regalloc.VReg, fromBits, toBits byte, signed bool) { + i.kind = extend + i.rn, i.rd = operandNR(rn), operandNR(rd) + i.u1 = uint64(fromBits) + i.u2 = uint64(toBits) + if signed { + i.u3 = 1 + } +} + +func (i *instruction) asMove32(rd, 
rn regalloc.VReg) { + i.kind = mov32 + i.rn, i.rd = operandNR(rn), operandNR(rd) +} + +func (i *instruction) asMove64(rd, rn regalloc.VReg) *instruction { + i.kind = mov64 + i.rn, i.rd = operandNR(rn), operandNR(rd) + return i +} + +func (i *instruction) asFpuMov64(rd, rn regalloc.VReg) { + i.kind = fpuMov64 + i.rn, i.rd = operandNR(rn), operandNR(rd) +} + +func (i *instruction) asFpuMov128(rd, rn regalloc.VReg) *instruction { + i.kind = fpuMov128 + i.rn, i.rd = operandNR(rn), operandNR(rd) + return i +} + +func (i *instruction) asMovToVec(rd, rn operand, arr vecArrangement, index vecIndex) { + i.kind = movToVec + i.rd = rd + i.rn = rn + i.u1, i.u2 = uint64(arr), uint64(index) +} + +func (i *instruction) asMovFromVec(rd, rn operand, arr vecArrangement, index vecIndex, signed bool) { + if signed { + i.kind = movFromVecSigned + } else { + i.kind = movFromVec + } + i.rd = rd + i.rn = rn + i.u1, i.u2 = uint64(arr), uint64(index) +} + +func (i *instruction) asVecDup(rd, rn operand, arr vecArrangement) { + i.kind = vecDup + i.u1 = uint64(arr) + i.rn, i.rd = rn, rd +} + +func (i *instruction) asVecDupElement(rd, rn operand, arr vecArrangement, index vecIndex) { + i.kind = vecDupElement + i.u1 = uint64(arr) + i.rn, i.rd = rn, rd + i.u2 = uint64(index) +} + +func (i *instruction) asVecExtract(rd, rn, rm operand, arr vecArrangement, index uint32) { + i.kind = vecExtract + i.u1 = uint64(arr) + i.rn, i.rm, i.rd = rn, rm, rd + i.u2 = uint64(index) +} + +func (i *instruction) asVecMovElement(rd, rn operand, arr vecArrangement, rdIndex, rnIndex vecIndex) { + i.kind = vecMovElement + i.u1 = uint64(arr) + i.u2, i.u3 = uint64(rdIndex), uint64(rnIndex) + i.rn, i.rd = rn, rd +} + +func (i *instruction) asVecMisc(op vecOp, rd, rn operand, arr vecArrangement) { + i.kind = vecMisc + i.u1 = uint64(op) + i.rn, i.rd = rn, rd + i.u2 = uint64(arr) +} + +func (i *instruction) asVecLanes(op vecOp, rd, rn operand, arr vecArrangement) { + i.kind = vecLanes + i.u1 = uint64(op) + i.rn, i.rd = rn, 
rd + i.u2 = uint64(arr) +} + +func (i *instruction) asVecShiftImm(op vecOp, rd, rn, rm operand, arr vecArrangement) *instruction { + i.kind = vecShiftImm + i.u1 = uint64(op) + i.rn, i.rm, i.rd = rn, rm, rd + i.u2 = uint64(arr) + return i +} + +func (i *instruction) asVecTbl(nregs byte, rd, rn, rm operand, arr vecArrangement) { + switch nregs { + case 0, 1: + i.kind = vecTbl + case 2: + i.kind = vecTbl2 + if !rn.reg().IsRealReg() { + panic("rn is not a RealReg") + } + if rn.realReg() == v31 { + panic("rn cannot be v31") + } + default: + panic(fmt.Sprintf("unsupported number of registers %d", nregs)) + } + i.rn, i.rm, i.rd = rn, rm, rd + i.u2 = uint64(arr) +} + +func (i *instruction) asVecPermute(op vecOp, rd, rn, rm operand, arr vecArrangement) { + i.kind = vecPermute + i.u1 = uint64(op) + i.rn, i.rm, i.rd = rn, rm, rd + i.u2 = uint64(arr) +} + +func (i *instruction) asVecRRR(op vecOp, rd, rn, rm operand, arr vecArrangement) *instruction { + i.kind = vecRRR + i.u1 = uint64(op) + i.rn, i.rd, i.rm = rn, rd, rm + i.u2 = uint64(arr) + return i +} + +// asVecRRRRewrite encodes a vector instruction that rewrites the destination register. +// IMPORTANT: the destination register must be already defined before this instruction. +func (i *instruction) asVecRRRRewrite(op vecOp, rd, rn, rm operand, arr vecArrangement) { + i.kind = vecRRRRewrite + i.u1 = uint64(op) + i.rn, i.rd, i.rm = rn, rd, rm + i.u2 = uint64(arr) +} + +func (i *instruction) IsCopy() bool { + op := i.kind + // We do not include mov32 as it is not a copy instruction in the sense that it does not preserve the upper 32 bits, + // and it is only used in the translation of IReduce, not the actual copy indeed. + return op == mov64 || op == fpuMov64 || op == fpuMov128 +} + +// String implements fmt.Stringer. 
+func (i *instruction) String() (str string) { + is64SizeBitToSize := func(u3 uint64) byte { + if u3 == 0 { + return 32 + } + return 64 + } + + switch i.kind { + case nop0: + if i.u1 != 0 { + l := label(i.u1) + str = fmt.Sprintf("%s:", l) + } else { + str = "nop0" + } + case aluRRR: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(), + formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), + i.rm.format(size)) + case aluRRRR: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("%s %s, %s, %s, %s", aluOp(i.u1).String(), + formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size), formatVRegSized(i.ra.nr(), size)) + case aluRRImm12: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(), + formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), i.rm.format(size)) + case aluRRBitmaskImm: + size := is64SizeBitToSize(i.u3) + rd, rn := formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size) + if size == 32 { + str = fmt.Sprintf("%s %s, %s, #%#x", aluOp(i.u1).String(), rd, rn, uint32(i.u2)) + } else { + str = fmt.Sprintf("%s %s, %s, #%#x", aluOp(i.u1).String(), rd, rn, i.u2) + } + case aluRRImmShift: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("%s %s, %s, %#x", + aluOp(i.u1).String(), + formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rn.nr(), size), + i.rm.shiftImm(), + ) + case aluRRRShift: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("%s %s, %s, %s", + aluOp(i.u1).String(), + formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rn.nr(), size), + i.rm.format(size), + ) + case aluRRRExtend: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(), + formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rn.nr(), size), + // Regardless of the source size, the register is formatted in 32-bit. 
+ i.rm.format(32), + ) + case bitRR: + size := is64SizeBitToSize(i.u2) + str = fmt.Sprintf("%s %s, %s", + bitOp(i.u1), + formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rn.nr(), size), + ) + case uLoad8: + str = fmt.Sprintf("ldrb %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + case sLoad8: + str = fmt.Sprintf("ldrsb %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + case uLoad16: + str = fmt.Sprintf("ldrh %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + case sLoad16: + str = fmt.Sprintf("ldrsh %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + case uLoad32: + str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + case sLoad32: + str = fmt.Sprintf("ldrs %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + case uLoad64: + str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 64), i.amode.format(64)) + case store8: + str = fmt.Sprintf("strb %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(8)) + case store16: + str = fmt.Sprintf("strh %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(16)) + case store32: + str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(32)) + case store64: + str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.amode.format(64)) + case storeP64: + str = fmt.Sprintf("stp %s, %s, %s", + formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.amode.format(64)) + case loadP64: + str = fmt.Sprintf("ldp %s, %s, %s", + formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.amode.format(64)) + case mov64: + str = fmt.Sprintf("mov %s, %s", + formatVRegSized(i.rd.nr(), 64), + formatVRegSized(i.rn.nr(), 64)) + case mov32: + str = fmt.Sprintf("mov %s, %s", formatVRegSized(i.rd.nr(), 32), formatVRegSized(i.rn.nr(), 32)) + case movZ: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("movz %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16) + case movN: + size 
:= is64SizeBitToSize(i.u3) + str = fmt.Sprintf("movn %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16) + case movK: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("movk %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16) + case extend: + fromBits, toBits := byte(i.u1), byte(i.u2) + + var signedStr string + if i.u3 == 1 { + signedStr = "s" + } else { + signedStr = "u" + } + var fromStr string + switch fromBits { + case 8: + fromStr = "b" + case 16: + fromStr = "h" + case 32: + fromStr = "w" + } + str = fmt.Sprintf("%sxt%s %s, %s", signedStr, fromStr, formatVRegSized(i.rd.nr(), toBits), formatVRegSized(i.rn.nr(), 32)) + case cSel: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("csel %s, %s, %s, %s", + formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rn.nr(), size), + formatVRegSized(i.rm.nr(), size), + condFlag(i.u1), + ) + case cSet: + if i.u2 != 0 { + str = fmt.Sprintf("csetm %s, %s", formatVRegSized(i.rd.nr(), 64), condFlag(i.u1)) + } else { + str = fmt.Sprintf("cset %s, %s", formatVRegSized(i.rd.nr(), 64), condFlag(i.u1)) + } + case cCmpImm: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("ccmp %s, #%#x, #%#x, %s", + formatVRegSized(i.rn.nr(), size), i.rm.data, + i.u2&0b1111, + condFlag(i.u1)) + case fpuMov64: + str = fmt.Sprintf("mov %s, %s", + formatVRegVec(i.rd.nr(), vecArrangement8B, vecIndexNone), + formatVRegVec(i.rn.nr(), vecArrangement8B, vecIndexNone)) + case fpuMov128: + str = fmt.Sprintf("mov %s, %s", + formatVRegVec(i.rd.nr(), vecArrangement16B, vecIndexNone), + formatVRegVec(i.rn.nr(), vecArrangement16B, vecIndexNone)) + case fpuMovFromVec: + panic("TODO") + case fpuRR: + dstSz := is64SizeBitToSize(i.u3) + srcSz := dstSz + op := fpuUniOp(i.u1) + switch op { + case fpuUniOpCvt32To64: + srcSz = 32 + case fpuUniOpCvt64To32: + srcSz = 64 + } + str = fmt.Sprintf("%s %s, %s", op.String(), + formatVRegSized(i.rd.nr(), dstSz), formatVRegSized(i.rn.nr(), srcSz)) + case fpuRRR: + size 
:= is64SizeBitToSize(i.u3) + str = fmt.Sprintf("%s %s, %s, %s", fpuBinOp(i.u1).String(), + formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size)) + case fpuRRI: + panic("TODO") + case fpuRRRR: + panic("TODO") + case fpuCmp: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("fcmp %s, %s", + formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size)) + case fpuLoad32: + str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + case fpuStore32: + str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(64)) + case fpuLoad64: + str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 64), i.amode.format(64)) + case fpuStore64: + str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.amode.format(64)) + case fpuLoad128: + str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 128), i.amode.format(64)) + case fpuStore128: + str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 128), i.amode.format(64)) + case loadFpuConst32: + str = fmt.Sprintf("ldr %s, #8; b 8; data.f32 %f", formatVRegSized(i.rd.nr(), 32), math.Float32frombits(uint32(i.u1))) + case loadFpuConst64: + str = fmt.Sprintf("ldr %s, #8; b 16; data.f64 %f", formatVRegSized(i.rd.nr(), 64), math.Float64frombits(i.u1)) + case loadFpuConst128: + str = fmt.Sprintf("ldr %s, #8; b 32; data.v128 %016x %016x", + formatVRegSized(i.rd.nr(), 128), i.u1, i.u2) + case fpuToInt: + var op, src, dst string + if signed := i.u1 == 1; signed { + op = "fcvtzs" + } else { + op = "fcvtzu" + } + if src64 := i.u2 == 1; src64 { + src = formatVRegWidthVec(i.rn.nr(), vecArrangementD) + } else { + src = formatVRegWidthVec(i.rn.nr(), vecArrangementS) + } + if dst64 := i.u3 == 1; dst64 { + dst = formatVRegSized(i.rd.nr(), 64) + } else { + dst = formatVRegSized(i.rd.nr(), 32) + } + str = fmt.Sprintf("%s %s, %s", op, dst, src) + + case intToFpu: + var op, src, dst string + if signed := i.u1 == 1; signed { + op = 
"scvtf" + } else { + op = "ucvtf" + } + if src64 := i.u2 == 1; src64 { + src = formatVRegSized(i.rn.nr(), 64) + } else { + src = formatVRegSized(i.rn.nr(), 32) + } + if dst64 := i.u3 == 1; dst64 { + dst = formatVRegWidthVec(i.rd.nr(), vecArrangementD) + } else { + dst = formatVRegWidthVec(i.rd.nr(), vecArrangementS) + } + str = fmt.Sprintf("%s %s, %s", op, dst, src) + case fpuCSel: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("fcsel %s, %s, %s, %s", + formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rn.nr(), size), + formatVRegSized(i.rm.nr(), size), + condFlag(i.u1), + ) + case movToVec: + var size byte + arr := vecArrangement(i.u1) + switch arr { + case vecArrangementB, vecArrangementH, vecArrangementS: + size = 32 + case vecArrangementD: + size = 64 + default: + panic("unsupported arrangement " + arr.String()) + } + str = fmt.Sprintf("ins %s, %s", formatVRegVec(i.rd.nr(), arr, vecIndex(i.u2)), formatVRegSized(i.rn.nr(), size)) + case movFromVec, movFromVecSigned: + var size byte + var opcode string + arr := vecArrangement(i.u1) + signed := i.kind == movFromVecSigned + switch arr { + case vecArrangementB, vecArrangementH, vecArrangementS: + size = 32 + if signed { + opcode = "smov" + } else { + opcode = "umov" + } + case vecArrangementD: + size = 64 + if signed { + opcode = "smov" + } else { + opcode = "mov" + } + default: + panic("unsupported arrangement " + arr.String()) + } + str = fmt.Sprintf("%s %s, %s", opcode, formatVRegSized(i.rd.nr(), size), formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2))) + case vecDup: + str = fmt.Sprintf("dup %s, %s", + formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone), + formatVRegSized(i.rn.nr(), 64), + ) + case vecDupElement: + arr := vecArrangement(i.u1) + str = fmt.Sprintf("dup %s, %s", + formatVRegVec(i.rd.nr(), arr, vecIndexNone), + formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2)), + ) + case vecDupFromFpu: + panic("TODO") + case vecExtract: + str = fmt.Sprintf("ext %s, %s, %s, #%d", + 
formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone), + formatVRegVec(i.rn.nr(), vecArrangement(i.u1), vecIndexNone), + formatVRegVec(i.rm.nr(), vecArrangement(i.u1), vecIndexNone), + uint32(i.u2), + ) + case vecExtend: + panic("TODO") + case vecMovElement: + str = fmt.Sprintf("mov %s, %s", + formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndex(i.u2)), + formatVRegVec(i.rn.nr(), vecArrangement(i.u1), vecIndex(i.u3)), + ) + case vecMiscNarrow: + panic("TODO") + case vecRRR, vecRRRRewrite: + str = fmt.Sprintf("%s %s, %s, %s", + vecOp(i.u1), + formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone), + formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone), + formatVRegVec(i.rm.nr(), vecArrangement(i.u2), vecIndexNone), + ) + case vecMisc: + vop := vecOp(i.u1) + if vop == vecOpCmeq0 { + str = fmt.Sprintf("cmeq %s, %s, #0", + formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone), + formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone)) + } else { + str = fmt.Sprintf("%s %s, %s", + vop, + formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone), + formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone)) + } + case vecLanes: + arr := vecArrangement(i.u2) + var destArr vecArrangement + switch arr { + case vecArrangement8B, vecArrangement16B: + destArr = vecArrangementH + case vecArrangement4H, vecArrangement8H: + destArr = vecArrangementS + case vecArrangement4S: + destArr = vecArrangementD + default: + panic("invalid arrangement " + arr.String()) + } + str = fmt.Sprintf("%s %s, %s", + vecOp(i.u1), + formatVRegWidthVec(i.rd.nr(), destArr), + formatVRegVec(i.rn.nr(), arr, vecIndexNone)) + case vecShiftImm: + arr := vecArrangement(i.u2) + str = fmt.Sprintf("%s %s, %s, #%d", + vecOp(i.u1), + formatVRegVec(i.rd.nr(), arr, vecIndexNone), + formatVRegVec(i.rn.nr(), arr, vecIndexNone), + i.rm.shiftImm()) + case vecTbl: + arr := vecArrangement(i.u2) + str = fmt.Sprintf("tbl %s, { %s }, %s", + formatVRegVec(i.rd.nr(), arr, vecIndexNone), 
+ formatVRegVec(i.rn.nr(), vecArrangement16B, vecIndexNone), + formatVRegVec(i.rm.nr(), arr, vecIndexNone)) + case vecTbl2: + arr := vecArrangement(i.u2) + rd, rn, rm := i.rd.nr(), i.rn.nr(), i.rm.nr() + rn1 := regalloc.FromRealReg(rn.RealReg()+1, rn.RegType()) + str = fmt.Sprintf("tbl %s, { %s, %s }, %s", + formatVRegVec(rd, arr, vecIndexNone), + formatVRegVec(rn, vecArrangement16B, vecIndexNone), + formatVRegVec(rn1, vecArrangement16B, vecIndexNone), + formatVRegVec(rm, arr, vecIndexNone)) + case vecPermute: + arr := vecArrangement(i.u2) + str = fmt.Sprintf("%s %s, %s, %s", + vecOp(i.u1), + formatVRegVec(i.rd.nr(), arr, vecIndexNone), + formatVRegVec(i.rn.nr(), arr, vecIndexNone), + formatVRegVec(i.rm.nr(), arr, vecIndexNone)) + case movToFPSR: + str = fmt.Sprintf("msr fpsr, %s", formatVRegSized(i.rn.nr(), 64)) + case movFromFPSR: + str = fmt.Sprintf("mrs %s fpsr", formatVRegSized(i.rd.nr(), 64)) + case call: + str = fmt.Sprintf("bl %s", ssa.FuncRef(i.u1)) + case callInd: + str = fmt.Sprintf("bl %s", formatVRegSized(i.rn.nr(), 64)) + case ret: + str = "ret" + case br: + target := label(i.u1) + if i.u3 != 0 { + str = fmt.Sprintf("b #%#x (%s)", i.brOffset(), target.String()) + } else { + str = fmt.Sprintf("b %s", target.String()) + } + case condBr: + size := is64SizeBitToSize(i.u3) + c := cond(i.u1) + target := label(i.u2) + switch c.kind() { + case condKindRegisterZero: + if !i.condBrOffsetResolved() { + str = fmt.Sprintf("cbz %s, (%s)", formatVRegSized(c.register(), size), target.String()) + } else { + str = fmt.Sprintf("cbz %s, #%#x %s", formatVRegSized(c.register(), size), i.condBrOffset(), target.String()) + } + case condKindRegisterNotZero: + if offset := i.condBrOffset(); offset != 0 { + str = fmt.Sprintf("cbnz %s, #%#x (%s)", formatVRegSized(c.register(), size), offset, target.String()) + } else { + str = fmt.Sprintf("cbnz %s, %s", formatVRegSized(c.register(), size), target.String()) + } + case condKindCondFlagSet: + if offset := i.condBrOffset(); offset 
!= 0 { + if target == labelInvalid { + str = fmt.Sprintf("b.%s #%#x", c.flag(), offset) + } else { + str = fmt.Sprintf("b.%s #%#x, (%s)", c.flag(), offset, target.String()) + } + } else { + str = fmt.Sprintf("b.%s %s", c.flag(), target.String()) + } + } + case adr: + str = fmt.Sprintf("adr %s, #%#x", formatVRegSized(i.rd.nr(), 64), int64(i.u1)) + case brTableSequence: + targetIndex := i.u1 + str = fmt.Sprintf("br_table_sequence %s, table_index=%d", formatVRegSized(i.rn.nr(), 64), targetIndex) + case exitSequence: + str = fmt.Sprintf("exit_sequence %s", formatVRegSized(i.rn.nr(), 64)) + case atomicRmw: + m := atomicRmwOp(i.u1).String() + size := byte(32) + switch i.u2 { + case 8: + size = 64 + case 2: + m = m + "h" + case 1: + m = m + "b" + } + str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), 64)) + case atomicCas: + m := "casal" + size := byte(32) + switch i.u2 { + case 8: + size = 64 + case 2: + m = m + "h" + case 1: + m = m + "b" + } + str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rn.nr(), 64)) + case atomicLoad: + m := "ldar" + size := byte(32) + switch i.u2 { + case 8: + size = 64 + case 2: + m = m + "h" + case 1: + m = m + "b" + } + str = fmt.Sprintf("%s %s, %s", m, formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), 64)) + case atomicStore: + m := "stlr" + size := byte(32) + switch i.u2 { + case 8: + size = 64 + case 2: + m = m + "h" + case 1: + m = m + "b" + } + str = fmt.Sprintf("%s %s, %s", m, formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rn.nr(), 64)) + case dmb: + str = "dmb" + case udf: + str = "udf" + case emitSourceOffsetInfo: + str = fmt.Sprintf("source_offset_info %d", ssa.SourceOffset(i.u1)) + case vecLoad1R: + str = fmt.Sprintf("ld1r {%s}, [%s]", formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone), formatVRegSized(i.rn.nr(), 64)) + case loadConstBlockArg: + str = 
fmt.Sprintf("load_const_block_arg %s, %#x", formatVRegSized(i.rd.nr(), 64), i.u1) + default: + panic(i.kind) + } + return +} + +func (i *instruction) asAdr(rd regalloc.VReg, offset int64) { + i.kind = adr + i.rd = operandNR(rd) + i.u1 = uint64(offset) +} + +func (i *instruction) asAtomicRmw(op atomicRmwOp, rn, rs, rt operand, size uint64) { + i.kind = atomicRmw + i.rd, i.rn, i.rm = rt, rn, rs + i.u1 = uint64(op) + i.u2 = size +} + +func (i *instruction) asAtomicCas(rn, rs, rt operand, size uint64) { + i.kind = atomicCas + i.rm, i.rn, i.rd = rt, rn, rs + i.u2 = size +} + +func (i *instruction) asAtomicLoad(rn, rt operand, size uint64) { + i.kind = atomicLoad + i.rn, i.rd = rn, rt + i.u2 = size +} + +func (i *instruction) asAtomicStore(rn, rt operand, size uint64) { + i.kind = atomicStore + i.rn, i.rm = rn, rt + i.u2 = size +} + +func (i *instruction) asDMB() { + i.kind = dmb +} + +// TODO: delete unnecessary things. +const ( + // nop0 represents a no-op of zero size. + nop0 instructionKind = iota + 1 + // aluRRR represents an ALU operation with two register sources and a register destination. + aluRRR + // aluRRRR represents an ALU operation with three register sources and a register destination. + aluRRRR + // aluRRImm12 represents an ALU operation with a register source and an immediate-12 source, with a register destination. + aluRRImm12 + // aluRRBitmaskImm represents an ALU operation with a register source and a bitmask immediate, with a register destination. + aluRRBitmaskImm + // aluRRImmShift represents an ALU operation with a register source and an immediate-shifted source, with a register destination. + aluRRImmShift + // aluRRRShift represents an ALU operation with two register sources, one of which can be shifted, with a register destination. + aluRRRShift + // aluRRRExtend represents an ALU operation with two register sources, one of which can be extended, with a register destination. 
+ aluRRRExtend + // bitRR represents a bit op instruction with a single register source. + bitRR + // uLoad8 represents an unsigned 8-bit load. + uLoad8 + // sLoad8 represents a signed 8-bit load into 64-bit register. + sLoad8 + // uLoad16 represents an unsigned 16-bit load into 64-bit register. + uLoad16 + // sLoad16 represents a signed 16-bit load into 64-bit register. + sLoad16 + // uLoad32 represents an unsigned 32-bit load into 64-bit register. + uLoad32 + // sLoad32 represents a signed 32-bit load into 64-bit register. + sLoad32 + // uLoad64 represents a 64-bit load. + uLoad64 + // store8 represents an 8-bit store. + store8 + // store16 represents a 16-bit store. + store16 + // store32 represents a 32-bit store. + store32 + // store64 represents a 64-bit store. + store64 + // storeP64 represents a store of a pair of registers. + storeP64 + // loadP64 represents a load of a pair of registers. + loadP64 + // mov64 represents a MOV instruction. These are encoded as ORR's but we keep them separate for better handling. + mov64 + // mov32 represents a 32-bit MOV. This zeroes the top 32 bits of the destination. + mov32 + // movZ represents a MOVZ with a 16-bit immediate. + movZ + // movN represents a MOVN with a 16-bit immediate. + movN + // movK represents a MOVK with a 16-bit immediate. + movK + // extend represents a sign- or zero-extend operation. + extend + // cSel represents a conditional-select operation. + cSel + // cSet represents a conditional-set operation. + cSet + // cCmpImm represents a conditional comparison with an immediate. + cCmpImm + // fpuMov64 represents a FPU move. Distinct from a vector-register move; moving just 64 bits appears to be significantly faster. + fpuMov64 + // fpuMov128 represents a vector register move. + fpuMov128 + // fpuMovFromVec represents a move to scalar from a vector element. + fpuMovFromVec + // fpuRR represents a 1-op FPU instruction. + fpuRR + // fpuRRR represents a 2-op FPU instruction. 
+ fpuRRR + // fpuRRI represents a 2-op FPU instruction with immediate value. + fpuRRI + // fpuRRRR represents a 3-op FPU instruction. + fpuRRRR + // fpuCmp represents a FPU comparison, either 32 or 64 bit. + fpuCmp + // fpuLoad32 represents a floating-point load, single-precision (32 bit). + fpuLoad32 + // fpuStore32 represents a floating-point store, single-precision (32 bit). + fpuStore32 + // fpuLoad64 represents a floating-point load, double-precision (64 bit). + fpuLoad64 + // fpuStore64 represents a floating-point store, double-precision (64 bit). + fpuStore64 + // fpuLoad128 represents a floating-point/vector load, 128 bit. + fpuLoad128 + // fpuStore128 represents a floating-point/vector store, 128 bit. + fpuStore128 + // loadFpuConst32 represents a load of a 32-bit floating-point constant. + loadFpuConst32 + // loadFpuConst64 represents a load of a 64-bit floating-point constant. + loadFpuConst64 + // loadFpuConst128 represents a load of a 128-bit floating-point constant. + loadFpuConst128 + // vecLoad1R represents a load of a one single-element structure that replicates to all lanes of a vector. + vecLoad1R + // fpuToInt represents a conversion from FP to integer. + fpuToInt + // intToFpu represents a conversion from integer to FP. + intToFpu + // fpuCSel represents a 32/64-bit FP conditional select. + fpuCSel + // movToVec represents a move to a vector element from a GPR. + movToVec + // movFromVec represents an unsigned move from a vector element to a GPR. + movFromVec + // movFromVecSigned represents a signed move from a vector element to a GPR. + movFromVecSigned + // vecDup represents a duplication of general-purpose register to vector. + vecDup + // vecDupElement represents a duplication of a vector element to vector or scalar. + vecDupElement + // vecDupFromFpu represents a duplication of scalar to vector. + vecDupFromFpu + // vecExtract represents a vector extraction operation. + vecExtract + // vecExtend represents a vector extension operation. 
+ vecExtend + // vecMovElement represents a move vector element to another vector element operation. + vecMovElement + // vecMiscNarrow represents a vector narrowing operation. + vecMiscNarrow + // vecRRR represents a vector ALU operation. + vecRRR + // vecRRRRewrite is exactly the same as vecRRR except that this rewrites the destination register. + // For example, BSL instruction rewrites the destination register, and the existing value influences the result. + // Therefore, the "destination" register in vecRRRRewrite will be treated as "use" which makes the register outlive + // the instruction while this instruction doesn't have "def" in the context of register allocation. + vecRRRRewrite + // vecMisc represents a vector two register miscellaneous instruction. + vecMisc + // vecLanes represents a vector instruction across lanes. + vecLanes + // vecShiftImm represents a SIMD scalar shift by immediate instruction. + vecShiftImm + // vecTbl represents a table vector lookup - single register table. + vecTbl + // vecTbl2 represents a table vector lookup - two register table. + vecTbl2 + // vecPermute represents a vector permute instruction. + vecPermute + // movToNZCV represents a move to the FPSR. + movToFPSR + // movFromNZCV represents a move from the FPSR. + movFromFPSR + // call represents a machine call instruction. + call + // callInd represents a machine indirect-call instruction. + callInd + // ret represents a machine return instruction. + ret + // br represents an unconditional branch. + br + // condBr represents a conditional branch. + condBr + // adr represents a compute the address (using a PC-relative offset) of a memory location. + adr + // brTableSequence represents a jump-table sequence. + brTableSequence + // exitSequence consists of multiple instructions, and exits the execution immediately. + // See encodeExitSequence. 
+ exitSequence + // atomicRmw represents an atomic read-modify-write operation with two register sources and a register destination. + atomicRmw + // atomicCas represents an atomic compare-and-swap operation with three register sources. The value is loaded to + // the source register containing the comparison value. + atomicCas + // atomicLoad represents an atomic load with one source register and a register destination. + atomicLoad + // atomicStore represents an atomic store with two source registers and no destination. + atomicStore + // dmb represents the data memory barrier instruction in inner-shareable (ish) mode. + dmb + // UDF is the undefined instruction. For debugging only. + udf + // loadConstBlockArg represents a load of a constant block argument. + loadConstBlockArg + + // emitSourceOffsetInfo is a dummy instruction to emit source offset info. + // The existence of this instruction does not affect the execution. + emitSourceOffsetInfo + + // ------------------- do not define below this line ------------------- + numInstructionKinds +) + +func (i *instruction) asLoadConstBlockArg(v uint64, typ ssa.Type, dst regalloc.VReg) *instruction { + i.kind = loadConstBlockArg + i.u1 = v + i.u2 = uint64(typ) + i.rd = operandNR(dst) + return i +} + +func (i *instruction) loadConstBlockArgData() (v uint64, typ ssa.Type, dst regalloc.VReg) { + return i.u1, ssa.Type(i.u2), i.rd.nr() +} + +func (i *instruction) asEmitSourceOffsetInfo(l ssa.SourceOffset) *instruction { + i.kind = emitSourceOffsetInfo + i.u1 = uint64(l) + return i +} + +func (i *instruction) sourceOffsetInfo() ssa.SourceOffset { + return ssa.SourceOffset(i.u1) +} + +func (i *instruction) asUDF() *instruction { + i.kind = udf + return i +} + +func (i *instruction) asFpuToInt(rd, rn operand, rdSigned, src64bit, dst64bit bool) { + i.kind = fpuToInt + i.rn = rn + i.rd = rd + if rdSigned { + i.u1 = 1 + } + if src64bit { + i.u2 = 1 + } + if dst64bit { + i.u3 = 1 + } +} + +func (i *instruction) asIntToFpu(rd, 
rn operand, rnSigned, src64bit, dst64bit bool) { + i.kind = intToFpu + i.rn = rn + i.rd = rd + if rnSigned { + i.u1 = 1 + } + if src64bit { + i.u2 = 1 + } + if dst64bit { + i.u3 = 1 + } +} + +func (i *instruction) asExitSequence(ctx regalloc.VReg) *instruction { + i.kind = exitSequence + i.rn = operandNR(ctx) + return i +} + +// aluOp determines the type of ALU operation. Instructions whose kind is one of +// aluRRR, aluRRRR, aluRRImm12, aluRRBitmaskImm, aluRRImmShift, aluRRRShift and aluRRRExtend +// would use this type. +type aluOp int + +func (a aluOp) String() string { + switch a { + case aluOpAdd: + return "add" + case aluOpSub: + return "sub" + case aluOpOrr: + return "orr" + case aluOpOrn: + return "orn" + case aluOpAnd: + return "and" + case aluOpAnds: + return "ands" + case aluOpBic: + return "bic" + case aluOpEor: + return "eor" + case aluOpAddS: + return "adds" + case aluOpSubS: + return "subs" + case aluOpSMulH: + return "sMulH" + case aluOpUMulH: + return "uMulH" + case aluOpSDiv: + return "sdiv" + case aluOpUDiv: + return "udiv" + case aluOpRotR: + return "ror" + case aluOpLsr: + return "lsr" + case aluOpAsr: + return "asr" + case aluOpLsl: + return "lsl" + case aluOpMAdd: + return "madd" + case aluOpMSub: + return "msub" + } + panic(int(a)) +} + +const ( + // 32/64-bit Add. + aluOpAdd aluOp = iota + // 32/64-bit Subtract. + aluOpSub + // 32/64-bit Bitwise OR. + aluOpOrr + // 32/64-bit Bitwise OR NOT. + aluOpOrn + // 32/64-bit Bitwise AND. + aluOpAnd + // 32/64-bit Bitwise ANDS. + aluOpAnds + // 32/64-bit Bitwise AND NOT. + aluOpBic + // 32/64-bit Bitwise XOR (Exclusive OR). + aluOpEor + // 32/64-bit Add setting flags. + aluOpAddS + // 32/64-bit Subtract setting flags. + aluOpSubS + // Signed multiply, high-word result. + aluOpSMulH + // Unsigned multiply, high-word result. + aluOpUMulH + // 64-bit Signed divide. + aluOpSDiv + // 64-bit Unsigned divide. + aluOpUDiv + // 32/64-bit Rotate right. + aluOpRotR + // 32/64-bit Logical shift right. 
+ aluOpLsr + // 32/64-bit Arithmetic shift right. + aluOpAsr + // 32/64-bit Logical shift left. + aluOpLsl + + // MAdd (multiply-add) and MSub (multiply-subtract) are only applicable for aluRRRR. + aluOpMAdd + aluOpMSub +) + +// vecOp determines the type of vector operation. Instructions whose kind is one of the vector kinds +// (e.g. vecRRR, vecRRRRewrite, vecMisc, vecLanes, vecShiftImm, vecPermute) would use this type. +type vecOp int + +// String implements fmt.Stringer. NOTE(review): vecOpSqadd, vecOpUqadd, vecOpSqsub and vecOpUqsub have no case here and reach the panic. +func (b vecOp) String() string { + switch b { + case vecOpCnt: + return "cnt" + case vecOpCmeq: + return "cmeq" + case vecOpCmgt: + return "cmgt" + case vecOpCmhi: + return "cmhi" + case vecOpCmge: + return "cmge" + case vecOpCmhs: + return "cmhs" + case vecOpFcmeq: + return "fcmeq" + case vecOpFcmgt: + return "fcmgt" + case vecOpFcmge: + return "fcmge" + case vecOpCmeq0: + return "cmeq0" + case vecOpUaddlv: + return "uaddlv" + case vecOpBit: + return "bit" + case vecOpBic: + return "bic" + case vecOpBsl: + return "bsl" + case vecOpNot: + return "not" + case vecOpAnd: + return "and" + case vecOpOrr: + return "orr" + case vecOpEOR: + return "eor" + case vecOpFadd: + return "fadd" + case vecOpAdd: + return "add" + case vecOpAddp: + return "addp" + case vecOpAddv: + return "addv" + case vecOpSub: + return "sub" + case vecOpFsub: + return "fsub" + case vecOpSmin: + return "smin" + case vecOpUmin: + return "umin" + case vecOpUminv: + return "uminv" + case vecOpSmax: + return "smax" + case vecOpUmax: + return "umax" + case vecOpUmaxp: + return "umaxp" + case vecOpUrhadd: + return "urhadd" + case vecOpFmul: + return "fmul" + case vecOpSqrdmulh: + return "sqrdmulh" + case vecOpMul: + return "mul" + case vecOpUmlal: + return "umlal" + case vecOpFdiv: + return "fdiv" + case vecOpFsqrt: + return "fsqrt" + case vecOpAbs: + return "abs" + case vecOpFabs: + return "fabs" + case vecOpNeg: + return "neg" + case vecOpFneg: + return "fneg" + case vecOpFrintp: + return "frintp" + case vecOpFrintm: + return "frintm" + case vecOpFrintn: + return "frintn" + case vecOpFrintz: + return "frintz" + case vecOpFcvtl: + 
return "fcvtl" + case vecOpFcvtn: + return "fcvtn" + case vecOpFcvtzu: + return "fcvtzu" + case vecOpFcvtzs: + return "fcvtzs" + case vecOpScvtf: + return "scvtf" + case vecOpUcvtf: + return "ucvtf" + case vecOpSqxtn: + return "sqxtn" + case vecOpUqxtn: + return "uqxtn" + case vecOpSqxtun: + return "sqxtun" + case vecOpRev64: + return "rev64" + case vecOpXtn: + return "xtn" + case vecOpShll: + return "shll" + case vecOpSshl: + return "sshl" + case vecOpSshll: + return "sshll" + case vecOpUshl: + return "ushl" + case vecOpUshll: + return "ushll" + case vecOpSshr: + return "sshr" + case vecOpZip1: + return "zip1" + case vecOpFmin: + return "fmin" + case vecOpFmax: + return "fmax" + case vecOpSmull: + return "smull" + case vecOpSmull2: + return "smull2" + } + panic(int(b)) +} + +const ( + vecOpCnt vecOp = iota + vecOpCmeq0 + vecOpCmeq + vecOpCmgt + vecOpCmhi + vecOpCmge + vecOpCmhs + vecOpFcmeq + vecOpFcmgt + vecOpFcmge + vecOpUaddlv + vecOpBit + vecOpBic + vecOpBsl + vecOpNot + vecOpAnd + vecOpOrr + vecOpEOR + vecOpAdd + vecOpFadd + vecOpAddv + vecOpSqadd + vecOpUqadd + vecOpAddp + vecOpSub + vecOpFsub + vecOpSqsub + vecOpUqsub + vecOpSmin + vecOpUmin + vecOpUminv + vecOpFmin + vecOpSmax + vecOpUmax + vecOpUmaxp + vecOpFmax + vecOpUrhadd + vecOpMul + vecOpFmul + vecOpSqrdmulh + vecOpUmlal + vecOpFdiv + vecOpFsqrt + vecOpAbs + vecOpFabs + vecOpNeg + vecOpFneg + vecOpFrintm + vecOpFrintn + vecOpFrintp + vecOpFrintz + vecOpFcvtl + vecOpFcvtn + vecOpFcvtzs + vecOpFcvtzu + vecOpScvtf + vecOpUcvtf + vecOpSqxtn + vecOpSqxtun + vecOpUqxtn + vecOpRev64 + vecOpXtn + vecOpShll + vecOpSshl + vecOpSshll + vecOpUshl + vecOpUshll + vecOpSshr + vecOpZip1 + vecOpSmull + vecOpSmull2 +) + +// bitOp determines the type of bitwise operation. Instructions whose kind is one of +// bitOpRbit and bitOpClz would use this type. +type bitOp int + +// String implements fmt.Stringer. 
+func (b bitOp) String() string { + switch b { + case bitOpRbit: + return "rbit" + case bitOpClz: + return "clz" + } + panic(int(b)) +} + +const ( + // 32/64-bit Rbit. + bitOpRbit bitOp = iota + // 32/64-bit Clz. + bitOpClz +) + +// fpuUniOp represents a unary floating-point unit (FPU) operation. +type fpuUniOp byte + +const ( + fpuUniOpNeg fpuUniOp = iota + fpuUniOpCvt32To64 + fpuUniOpCvt64To32 + fpuUniOpSqrt + fpuUniOpRoundPlus + fpuUniOpRoundMinus + fpuUniOpRoundZero + fpuUniOpRoundNearest + fpuUniOpAbs +) + +// String implements fmt.Stringer. +func (f fpuUniOp) String() string { + switch f { + case fpuUniOpNeg: + return "fneg" + case fpuUniOpCvt32To64: + return "fcvt" + case fpuUniOpCvt64To32: + return "fcvt" + case fpuUniOpSqrt: + return "fsqrt" + case fpuUniOpRoundPlus: + return "frintp" + case fpuUniOpRoundMinus: + return "frintm" + case fpuUniOpRoundZero: + return "frintz" + case fpuUniOpRoundNearest: + return "frintn" + case fpuUniOpAbs: + return "fabs" + } + panic(int(f)) +} + +// fpuBinOp represents a binary floating-point unit (FPU) operation. +type fpuBinOp byte + +const ( + fpuBinOpAdd = iota + fpuBinOpSub + fpuBinOpMul + fpuBinOpDiv + fpuBinOpMax + fpuBinOpMin +) + +// String implements fmt.Stringer. +func (f fpuBinOp) String() string { + switch f { + case fpuBinOpAdd: + return "fadd" + case fpuBinOpSub: + return "fsub" + case fpuBinOpMul: + return "fmul" + case fpuBinOpDiv: + return "fdiv" + case fpuBinOpMax: + return "fmax" + case fpuBinOpMin: + return "fmin" + } + panic(int(f)) +} + +// extMode represents the mode of a register operand extension. +// For example, aluRRRExtend instructions need this info to determine the extensions. +type extMode byte + +const ( + extModeNone extMode = iota + // extModeZeroExtend32 suggests a zero-extension to 32 bits if the original bit size is less than 32. + extModeZeroExtend32 + // extModeSignExtend32 stands for a sign-extension to 32 bits if the original bit size is less than 32. 
+ extModeSignExtend32 + // extModeZeroExtend64 suggests a zero-extension to 64 bits if the original bit size is less than 64. + extModeZeroExtend64 + // extModeSignExtend64 stands for a sign-extension to 64 bits if the original bit size is less than 64. + extModeSignExtend64 +) + +func (e extMode) bits() byte { + switch e { + case extModeZeroExtend32, extModeSignExtend32: + return 32 + case extModeZeroExtend64, extModeSignExtend64: + return 64 + default: + return 0 + } +} + +func (e extMode) signed() bool { + switch e { + case extModeSignExtend32, extModeSignExtend64: + return true + default: + return false + } +} + +func extModeOf(t ssa.Type, signed bool) extMode { + switch t.Bits() { + case 32: + if signed { + return extModeSignExtend32 + } + return extModeZeroExtend32 + case 64: + if signed { + return extModeSignExtend64 + } + return extModeZeroExtend64 + default: + panic("TODO? do we need narrower than 32 bits?") + } +} + +type extendOp byte + +const ( + extendOpUXTB extendOp = 0b000 + extendOpUXTH extendOp = 0b001 + extendOpUXTW extendOp = 0b010 + // extendOpUXTX does nothing, but convenient symbol that officially exists. See: + // https://stackoverflow.com/questions/72041372/what-do-the-uxtx-and-sxtx-extensions-mean-for-32-bit-aarch64-adds-instruct + extendOpUXTX extendOp = 0b011 + extendOpSXTB extendOp = 0b100 + extendOpSXTH extendOp = 0b101 + extendOpSXTW extendOp = 0b110 + // extendOpSXTX does nothing, but convenient symbol that officially exists. 
See: + // https://stackoverflow.com/questions/72041372/what-do-the-uxtx-and-sxtx-extensions-mean-for-32-bit-aarch64-adds-instruct + extendOpSXTX extendOp = 0b111 + extendOpNone extendOp = 0xff +) + +func (e extendOp) srcBits() byte { + switch e { + case extendOpUXTB, extendOpSXTB: + return 8 + case extendOpUXTH, extendOpSXTH: + return 16 + case extendOpUXTW, extendOpSXTW: + return 32 + case extendOpUXTX, extendOpSXTX: + return 64 + } + panic(int(e)) +} + +func (e extendOp) String() string { + switch e { + case extendOpUXTB: + return "UXTB" + case extendOpUXTH: + return "UXTH" + case extendOpUXTW: + return "UXTW" + case extendOpUXTX: + return "UXTX" + case extendOpSXTB: + return "SXTB" + case extendOpSXTH: + return "SXTH" + case extendOpSXTW: + return "SXTW" + case extendOpSXTX: + return "SXTX" + } + panic(int(e)) +} + +func extendOpFrom(signed bool, from byte) extendOp { + switch from { + case 8: + if signed { + return extendOpSXTB + } + return extendOpUXTB + case 16: + if signed { + return extendOpSXTH + } + return extendOpUXTH + case 32: + if signed { + return extendOpSXTW + } + return extendOpUXTW + case 64: + if signed { + return extendOpSXTX + } + return extendOpUXTX + } + panic("invalid extendOpFrom") +} + +type shiftOp byte + +const ( + shiftOpLSL shiftOp = 0b00 + shiftOpLSR shiftOp = 0b01 + shiftOpASR shiftOp = 0b10 + shiftOpROR shiftOp = 0b11 +) + +func (s shiftOp) String() string { + switch s { + case shiftOpLSL: + return "lsl" + case shiftOpLSR: + return "lsr" + case shiftOpASR: + return "asr" + case shiftOpROR: + return "ror" + } + panic(int(s)) +} + +const exitSequenceSize = 6 * 4 // 6 instructions as in encodeExitSequence. + +// size returns the size of the instruction in encoded bytes. +func (i *instruction) size() int64 { + switch i.kind { + case exitSequence: + return exitSequenceSize // 6 instructions as in encodeExitSequence. 
+ case nop0, loadConstBlockArg: + return 0 + case emitSourceOffsetInfo: + return 0 + case loadFpuConst32: + if i.u1 == 0 { + return 4 // zero loading can be encoded as a single instruction. + } + return 4 + 4 + 4 + case loadFpuConst64: + if i.u1 == 0 { + return 4 // zero loading can be encoded as a single instruction. + } + return 4 + 4 + 8 + case loadFpuConst128: + if i.u1 == 0 && i.u2 == 0 { + return 4 // zero loading can be encoded as a single instruction. + } + return 4 + 4 + 16 + case brTableSequence: + return 4*4 + int64(i.u2)*4 + default: + return 4 + } +} + +// vecArrangement is the arrangement of data within a vector register. +type vecArrangement byte + +const ( + // vecArrangementNone is an arrangement indicating no data is stored. + vecArrangementNone vecArrangement = iota + // vecArrangement8B is an arrangement of 8 bytes (64-bit vector) + vecArrangement8B + // vecArrangement16B is an arrangement of 16 bytes (128-bit vector) + vecArrangement16B + // vecArrangement4H is an arrangement of 4 half precisions (64-bit vector) + vecArrangement4H + // vecArrangement8H is an arrangement of 8 half precisions (128-bit vector) + vecArrangement8H + // vecArrangement2S is an arrangement of 2 single precisions (64-bit vector) + vecArrangement2S + // vecArrangement4S is an arrangement of 4 single precisions (128-bit vector) + vecArrangement4S + // vecArrangement1D is an arrangement of 1 double precision (64-bit vector) + vecArrangement1D + // vecArrangement2D is an arrangement of 2 double precisions (128-bit vector) + vecArrangement2D + + // Assign each vector size specifier to a vector arrangement ID. + // Instructions can only have an arrangement or a size specifier, but not both, so it + // simplifies the internal representation of vector instructions by being able to + // store either into the same field. 
+ + // vecArrangementB is a size specifier of byte (8-bit) + vecArrangementB + // vecArrangementH is a size specifier of halfword (16-bit) + vecArrangementH + // vecArrangementS is a size specifier of word (32-bit) + vecArrangementS + // vecArrangementD is a size specifier of doubleword (64-bit) + vecArrangementD + // vecArrangementQ is a size specifier of the entire vector (128-bit) + vecArrangementQ +) + +// String implements fmt.Stringer +func (v vecArrangement) String() (ret string) { + switch v { + case vecArrangement8B: + ret = "8B" + case vecArrangement16B: + ret = "16B" + case vecArrangement4H: + ret = "4H" + case vecArrangement8H: + ret = "8H" + case vecArrangement2S: + ret = "2S" + case vecArrangement4S: + ret = "4S" + case vecArrangement1D: + ret = "1D" + case vecArrangement2D: + ret = "2D" + case vecArrangementB: + ret = "B" + case vecArrangementH: + ret = "H" + case vecArrangementS: + ret = "S" + case vecArrangementD: + ret = "D" + case vecArrangementQ: + ret = "Q" + case vecArrangementNone: + ret = "none" + default: + panic(v) + } + return +} + +// vecIndex is the index of an element of a vector register +type vecIndex byte + +// vecIndexNone indicates no vector index specified. +const vecIndexNone = ^vecIndex(0) + +func ssaLaneToArrangement(lane ssa.VecLane) vecArrangement { + switch lane { + case ssa.VecLaneI8x16: + return vecArrangement16B + case ssa.VecLaneI16x8: + return vecArrangement8H + case ssa.VecLaneI32x4: + return vecArrangement4S + case ssa.VecLaneI64x2: + return vecArrangement2D + case ssa.VecLaneF32x4: + return vecArrangement4S + case ssa.VecLaneF64x2: + return vecArrangement2D + default: + panic(lane) + } +} + +// atomicRmwOp is the type of atomic read-modify-write operation. +type atomicRmwOp byte + +const ( + // atomicRmwOpAdd is an atomic add operation. + atomicRmwOpAdd atomicRmwOp = iota + // atomicRmwOpClr is an atomic clear operation, i.e. AND NOT. + atomicRmwOpClr + // atomicRmwOpSet is an atomic set operation, i.e. OR. 
+ atomicRmwOpSet + // atomicRmwOpEor is an atomic exclusive OR operation. + atomicRmwOpEor + // atomicRmwOpSwp is an atomic swap operation. + atomicRmwOpSwp +) + +// String implements fmt.Stringer +func (a atomicRmwOp) String() string { + switch a { + case atomicRmwOpAdd: + return "ldaddal" + case atomicRmwOpClr: + return "ldclral" + case atomicRmwOpSet: + return "ldsetal" + case atomicRmwOpEor: + return "ldeoral" + case atomicRmwOpSwp: + return "swpal" + } + panic(fmt.Sprintf("unknown atomicRmwOp: %d", a)) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go new file mode 100644 index 000000000..227a96474 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go @@ -0,0 +1,2351 @@ +package arm64 + +import ( + "context" + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// Encode implements backend.Machine Encode. 
+func (m *machine) Encode(ctx context.Context) error { + m.resolveRelativeAddresses(ctx) + m.encode(m.executableContext.RootInstr) + if l := len(m.compiler.Buf()); l > maxFunctionExecutableSize { + return fmt.Errorf("function size exceeds the limit: %d > %d", l, maxFunctionExecutableSize) + } + return nil +} + +func (m *machine) encode(root *instruction) { + for cur := root; cur != nil; cur = cur.next { + cur.encode(m) + } +} + +func (i *instruction) encode(m *machine) { + c := m.compiler + switch kind := i.kind; kind { + case nop0, emitSourceOffsetInfo, loadConstBlockArg: + case exitSequence: + encodeExitSequence(c, i.rn.reg()) + case ret: + // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/RET--Return-from-subroutine-?lang=en + c.Emit4Bytes(encodeRet()) + case br: + imm := i.brOffset() + c.Emit4Bytes(encodeUnconditionalBranch(false, imm)) + case call: + // We still don't know the exact address of the function to call, so we emit a placeholder. + c.AddRelocationInfo(i.callFuncRef()) + c.Emit4Bytes(encodeUnconditionalBranch(true, 0)) // 0 = placeholder + case callInd: + c.Emit4Bytes(encodeUnconditionalBranchReg(regNumberInEncoding[i.rn.realReg()], true)) + case store8, store16, store32, store64, fpuStore32, fpuStore64, fpuStore128: + c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rn.realReg()], i.amode)) + case uLoad8, uLoad16, uLoad32, uLoad64, sLoad8, sLoad16, sLoad32, fpuLoad32, fpuLoad64, fpuLoad128: + c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rd.realReg()], i.amode)) + case vecLoad1R: + c.Emit4Bytes(encodeVecLoad1R( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + vecArrangement(i.u1))) + case condBr: + imm19 := i.condBrOffset() + if imm19%4 != 0 { + panic("imm26 for branch must be a multiple of 4") + } + + imm19U32 := uint32(imm19/4) & 0b111_11111111_11111111 + brCond := i.condBrCond() + switch brCond.kind() { + case condKindRegisterZero: + rt := 
regNumberInEncoding[brCond.register().RealReg()] + c.Emit4Bytes(encodeCBZCBNZ(rt, false, imm19U32, i.condBr64bit())) + case condKindRegisterNotZero: + rt := regNumberInEncoding[brCond.register().RealReg()] + c.Emit4Bytes(encodeCBZCBNZ(rt, true, imm19U32, i.condBr64bit())) + case condKindCondFlagSet: + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/B-cond--Branch-conditionally- + fl := brCond.flag() + c.Emit4Bytes(0b01010100<<24 | (imm19U32 << 5) | uint32(fl)) + default: + panic("BUG") + } + case movN: + c.Emit4Bytes(encodeMoveWideImmediate(0b00, regNumberInEncoding[i.rd.realReg()], i.u1, i.u2, i.u3)) + case movZ: + c.Emit4Bytes(encodeMoveWideImmediate(0b10, regNumberInEncoding[i.rd.realReg()], i.u1, i.u2, i.u3)) + case movK: + c.Emit4Bytes(encodeMoveWideImmediate(0b11, regNumberInEncoding[i.rd.realReg()], i.u1, i.u2, i.u3)) + case mov32: + to, from := i.rd.realReg(), i.rn.realReg() + c.Emit4Bytes(encodeAsMov32(regNumberInEncoding[from], regNumberInEncoding[to])) + case mov64: + to, from := i.rd.realReg(), i.rn.realReg() + toIsSp := to == sp + fromIsSp := from == sp + c.Emit4Bytes(encodeMov64(regNumberInEncoding[to], regNumberInEncoding[from], toIsSp, fromIsSp)) + case loadP64, storeP64: + rt, rt2 := regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()] + amode := i.amode + rn := regNumberInEncoding[amode.rn.RealReg()] + var pre bool + switch amode.kind { + case addressModeKindPostIndex: + case addressModeKindPreIndex: + pre = true + default: + panic("BUG") + } + c.Emit4Bytes(encodePreOrPostIndexLoadStorePair64(pre, kind == loadP64, rn, rt, rt2, amode.imm)) + case loadFpuConst32: + rd := regNumberInEncoding[i.rd.realReg()] + if i.u1 == 0 { + c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement8B)) + } else { + encodeLoadFpuConst32(c, rd, i.u1) + } + case loadFpuConst64: + rd := regNumberInEncoding[i.rd.realReg()] + if i.u1 == 0 { + c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement8B)) + } else { 
+ encodeLoadFpuConst64(c, regNumberInEncoding[i.rd.realReg()], i.u1) + } + case loadFpuConst128: + rd := regNumberInEncoding[i.rd.realReg()] + lo, hi := i.u1, i.u2 + if lo == 0 && hi == 0 { + c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement16B)) + } else { + encodeLoadFpuConst128(c, rd, lo, hi) + } + case aluRRRR: + c.Emit4Bytes(encodeAluRRRR( + aluOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + regNumberInEncoding[i.ra.realReg()], + uint32(i.u3), + )) + case aluRRImmShift: + c.Emit4Bytes(encodeAluRRImm( + aluOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + uint32(i.rm.shiftImm()), + uint32(i.u3), + )) + case aluRRR: + rn := i.rn.realReg() + c.Emit4Bytes(encodeAluRRR( + aluOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[rn], + regNumberInEncoding[i.rm.realReg()], + i.u3 == 1, + rn == sp, + )) + case aluRRRExtend: + rm, exo, to := i.rm.er() + c.Emit4Bytes(encodeAluRRRExtend( + aluOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[rm.RealReg()], + exo, + to, + )) + case aluRRRShift: + r, amt, sop := i.rm.sr() + c.Emit4Bytes(encodeAluRRRShift( + aluOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[r.RealReg()], + uint32(amt), + sop, + i.u3 == 1, + )) + case aluRRBitmaskImm: + c.Emit4Bytes(encodeAluBitmaskImmediate( + aluOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + i.u2, + i.u3 == 1, + )) + case bitRR: + c.Emit4Bytes(encodeBitRR( + bitOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + uint32(i.u2)), + ) + case aluRRImm12: + imm12, shift := i.rm.imm12() + c.Emit4Bytes(encodeAluRRImm12( + aluOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + imm12, shift, + i.u3 == 1, + )) + 
case fpuRRR: + c.Emit4Bytes(encodeFpuRRR( + fpuBinOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + i.u3 == 1, + )) + case fpuMov64, fpuMov128: + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/MOV--vector---Move-vector--an-alias-of-ORR--vector--register-- + rd := regNumberInEncoding[i.rd.realReg()] + rn := regNumberInEncoding[i.rn.realReg()] + var q uint32 + if kind == fpuMov128 { + q = 0b1 + } + c.Emit4Bytes(q<<30 | 0b1110101<<21 | rn<<16 | 0b000111<<10 | rn<<5 | rd) + case cSet: + rd := regNumberInEncoding[i.rd.realReg()] + cf := condFlag(i.u1) + if i.u2 == 1 { + // https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/CSETM--Conditional-Set-Mask--an-alias-of-CSINV- + // Note that we set 64bit version here. + c.Emit4Bytes(0b1101101010011111<<16 | uint32(cf.invert())<<12 | 0b011111<<5 | rd) + } else { + // https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/CSET--Conditional-Set--an-alias-of-CSINC- + // Note that we set 64bit version here. + c.Emit4Bytes(0b1001101010011111<<16 | uint32(cf.invert())<<12 | 0b111111<<5 | rd) + } + case extend: + c.Emit4Bytes(encodeExtend(i.u3 == 1, byte(i.u1), byte(i.u2), regNumberInEncoding[i.rd.realReg()], regNumberInEncoding[i.rn.realReg()])) + case fpuCmp: + // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/FCMP--Floating-point-quiet-Compare--scalar--?lang=en + rn, rm := regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()] + var ftype uint32 + if i.u3 == 1 { + ftype = 0b01 // double precision. 
+ } + c.Emit4Bytes(0b1111<<25 | ftype<<22 | 1<<21 | rm<<16 | 0b1<<13 | rn<<5) + case udf: + // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/UDF--Permanently-Undefined-?lang=en + if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable { + c.Emit4Bytes(dummyInstruction) + } else { + c.Emit4Bytes(0) + } + case adr: + c.Emit4Bytes(encodeAdr(regNumberInEncoding[i.rd.realReg()], uint32(i.u1))) + case cSel: + c.Emit4Bytes(encodeConditionalSelect( + kind, + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + condFlag(i.u1), + i.u3 == 1, + )) + case fpuCSel: + c.Emit4Bytes(encodeFpuCSel( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + condFlag(i.u1), + i.u3 == 1, + )) + case movToVec: + c.Emit4Bytes(encodeMoveToVec( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + vecArrangement(byte(i.u1)), + vecIndex(i.u2), + )) + case movFromVec, movFromVecSigned: + c.Emit4Bytes(encodeMoveFromVec( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + vecArrangement(byte(i.u1)), + vecIndex(i.u2), + i.kind == movFromVecSigned, + )) + case vecDup: + c.Emit4Bytes(encodeVecDup( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + vecArrangement(byte(i.u1)))) + case vecDupElement: + c.Emit4Bytes(encodeVecDupElement( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + vecArrangement(byte(i.u1)), + vecIndex(i.u2))) + case vecExtract: + c.Emit4Bytes(encodeVecExtract( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + vecArrangement(byte(i.u1)), + uint32(i.u2))) + case vecPermute: + c.Emit4Bytes(encodeVecPermute( + vecOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + 
vecArrangement(byte(i.u2)))) + case vecMovElement: + c.Emit4Bytes(encodeVecMovElement( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + vecArrangement(i.u1), + uint32(i.u2), uint32(i.u3), + )) + case vecMisc: + c.Emit4Bytes(encodeAdvancedSIMDTwoMisc( + vecOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + vecArrangement(i.u2), + )) + case vecLanes: + c.Emit4Bytes(encodeVecLanes( + vecOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + vecArrangement(i.u2), + )) + case vecShiftImm: + c.Emit4Bytes(encodeVecShiftImm( + vecOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + uint32(i.rm.shiftImm()), + vecArrangement(i.u2), + )) + case vecTbl: + c.Emit4Bytes(encodeVecTbl( + 1, + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + vecArrangement(i.u2)), + ) + case vecTbl2: + c.Emit4Bytes(encodeVecTbl( + 2, + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + vecArrangement(i.u2)), + ) + case brTableSequence: + targets := m.jmpTableTargets[i.u1] + encodeBrTableSequence(c, i.rn.reg(), targets) + case fpuToInt, intToFpu: + c.Emit4Bytes(encodeCnvBetweenFloatInt(i)) + case fpuRR: + c.Emit4Bytes(encodeFloatDataOneSource( + fpuUniOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + i.u3 == 1, + )) + case vecRRR: + if op := vecOp(i.u1); op == vecOpBsl || op == vecOpBit || op == vecOpUmlal { + panic(fmt.Sprintf("vecOp %s must use vecRRRRewrite instead of vecRRR", op.String())) + } + fallthrough + case vecRRRRewrite: + c.Emit4Bytes(encodeVecRRR( + vecOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + vecArrangement(i.u2), + )) + case cCmpImm: + // Conditional compare (immediate) in 
https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en + sf := uint32(i.u3 & 0b1) + nzcv := uint32(i.u2 & 0b1111) + cond := uint32(condFlag(i.u1)) + imm := uint32(i.rm.data & 0b11111) + rn := regNumberInEncoding[i.rn.realReg()] + c.Emit4Bytes( + sf<<31 | 0b111101001<<22 | imm<<16 | cond<<12 | 0b1<<11 | rn<<5 | nzcv, + ) + case movFromFPSR: + rt := regNumberInEncoding[i.rd.realReg()] + c.Emit4Bytes(encodeSystemRegisterMove(rt, true)) + case movToFPSR: + rt := regNumberInEncoding[i.rn.realReg()] + c.Emit4Bytes(encodeSystemRegisterMove(rt, false)) + case atomicRmw: + c.Emit4Bytes(encodeAtomicRmw( + atomicRmwOp(i.u1), + regNumberInEncoding[i.rm.realReg()], + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + uint32(i.u2), + )) + case atomicCas: + c.Emit4Bytes(encodeAtomicCas( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rm.realReg()], + regNumberInEncoding[i.rn.realReg()], + uint32(i.u2), + )) + case atomicLoad: + c.Emit4Bytes(encodeAtomicLoadStore( + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rd.realReg()], + uint32(i.u2), + 1, + )) + case atomicStore: + c.Emit4Bytes(encodeAtomicLoadStore( + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + uint32(i.u2), + 0, + )) + case dmb: + c.Emit4Bytes(encodeDMB()) + default: + panic(i.String()) + } +} + +func encodeMov64(rd, rn uint32, toIsSp, fromIsSp bool) uint32 { + if toIsSp || fromIsSp { + // This is an alias of ADD (immediate): + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/MOV--to-from-SP---Move-between-register-and-stack-pointer--an-alias-of-ADD--immediate-- + return encodeAddSubtractImmediate(0b100, 0, 0, rn, rd) + } else { + // This is an alias of ORR (shifted register): + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/MOV--register---Move--register---an-alias-of-ORR--shifted-register-- + return 
encodeLogicalShiftedRegister(0b101, 0, rn, 0, regNumberInEncoding[xzr], rd) + } +} + +// encodeSystemRegisterMove encodes as "System register move" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en +// +// Note that currently we only support read/write of FPSR. +func encodeSystemRegisterMove(rt uint32, fromSystem bool) uint32 { + ret := 0b11010101<<24 | 0b11011<<16 | 0b01000100<<8 | 0b001<<5 | rt + if fromSystem { + ret |= 0b1 << 21 + } + return ret +} + +// encodeVecRRR encodes as one of "Advanced SIMD three *" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func encodeVecRRR(op vecOp, rd, rn, rm uint32, arr vecArrangement) uint32 { + switch op { + case vecOpBit: + _, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, 0b10 /* always has size 0b10 */, 0b1, q) + case vecOpBic: + if arr > vecArrangement16B { + panic("unsupported arrangement: " + arr.String()) + } + _, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, 0b01 /* always has size 0b01 */, 0b0, q) + case vecOpBsl: + if arr > vecArrangement16B { + panic("unsupported arrangement: " + arr.String()) + } + _, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, 0b01 /* always has size 0b01 */, 0b1, q) + case vecOpAnd: + if arr > vecArrangement16B { + panic("unsupported arrangement: " + arr.String()) + } + _, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, 0b00 /* always has size 0b00 */, 0b0, q) + case vecOpOrr: + _, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, 0b10 /* always has size 0b10 */, 0b0, q) + case vecOpEOR: + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, size, 0b1, q) + case 
vecOpCmeq: + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10001, size, 0b1, q) + case vecOpCmgt: + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00110, size, 0b0, q) + case vecOpCmhi: + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00110, size, 0b1, q) + case vecOpCmge: + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00111, size, 0b0, q) + case vecOpCmhs: + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00111, size, 0b1, q) + case vecOpFcmeq: + var size, q uint32 + switch arr { + case vecArrangement4S: + size, q = 0b00, 0b1 + case vecArrangement2S: + size, q = 0b00, 0b0 + case vecArrangement2D: + size, q = 0b01, 0b1 + default: + panic("unsupported arrangement: " + arr.String()) + } + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11100, size, 0b0, q) + case vecOpFcmgt: + if arr < vecArrangement2S || arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11100, size, 0b1, q) + case vecOpFcmge: + var size, q uint32 + switch arr { + case vecArrangement4S: + size, q = 0b00, 0b1 + case vecArrangement2S: + size, q = 0b00, 0b0 + case vecArrangement2D: + size, q = 0b01, 0b1 + default: + panic("unsupported arrangement: " + arr.String()) + } + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11100, size, 0b1, q) + case vecOpAdd: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10000, size, 0b0, q) + case vecOpSqadd: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00001, size, 0b0, q) + case vecOpUqadd: + if arr == 
vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00001, size, 0b1, q) + case vecOpAddp: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10111, size, 0b0, q) + case vecOpSqsub: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00101, size, 0b0, q) + case vecOpUqsub: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00101, size, 0b1, q) + case vecOpSub: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10000, size, 0b1, q) + case vecOpFmin: + if arr < vecArrangement2S || arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11110, size, 0b0, q) + case vecOpSmin: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01101, size, 0b0, q) + case vecOpUmin: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01101, size, 0b1, q) + case vecOpFmax: + var size, q uint32 + switch arr { + case vecArrangement4S: + size, q = 0b00, 0b1 + case vecArrangement2S: + size, q = 0b00, 0b0 + case vecArrangement2D: + size, q = 0b01, 0b1 + default: + panic("unsupported arrangement: " + arr.String()) + } + return 
encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11110, size, 0b0, q) + case vecOpFadd: + var size, q uint32 + switch arr { + case vecArrangement4S: + size, q = 0b00, 0b1 + case vecArrangement2S: + size, q = 0b00, 0b0 + case vecArrangement2D: + size, q = 0b01, 0b1 + default: + panic("unsupported arrangement: " + arr.String()) + } + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11010, size, 0b0, q) + case vecOpFsub: + if arr < vecArrangement2S || arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11010, size, 0b0, q) + case vecOpFmul: + var size, q uint32 + switch arr { + case vecArrangement4S: + size, q = 0b00, 0b1 + case vecArrangement2S: + size, q = 0b00, 0b0 + case vecArrangement2D: + size, q = 0b01, 0b1 + default: + panic("unsupported arrangement: " + arr.String()) + } + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11011, size, 0b1, q) + case vecOpSqrdmulh: + if arr < vecArrangement4H || arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10110, size, 0b1, q) + case vecOpFdiv: + var size, q uint32 + switch arr { + case vecArrangement4S: + size, q = 0b00, 0b1 + case vecArrangement2S: + size, q = 0b00, 0b0 + case vecArrangement2D: + size, q = 0b01, 0b1 + default: + panic("unsupported arrangement: " + arr.String()) + } + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11111, size, 0b1, q) + case vecOpSmax: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01100, size, 0b0, q) + case vecOpUmax: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01100, size, 0b1, q) + case vecOpUmaxp: + if arr > 
vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10100, size, 0b1, q) + case vecOpUrhadd: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00010, size, 0b1, q) + case vecOpMul: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10011, size, 0b0, q) + case vecOpUmlal: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeDifferent(rd, rn, rm, 0b1000, size, 0b1, q) + case vecOpSshl: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01000, size, 0b0, q) + case vecOpUshl: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01000, size, 0b1, q) + + case vecOpSmull: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, _ := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeDifferent(rd, rn, rm, 0b1100, size, 0b0, 0b0) + + case vecOpSmull2: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, _ := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeDifferent(rd, rn, rm, 0b1100, size, 0b0, 0b1) + + default: + panic("TODO: " + op.String()) + } +} + +func arrToSizeQEncoded(arr vecArrangement) (size, q uint32) { + switch arr { + case vecArrangement16B: + q = 0b1 + fallthrough + case vecArrangement8B: + size = 0b00 + case vecArrangement8H: + q = 0b1 + fallthrough + case vecArrangement4H: + size = 
0b01 + case vecArrangement4S: + q = 0b1 + fallthrough + case vecArrangement2S: + size = 0b10 + case vecArrangement2D: + q = 0b1 + fallthrough + case vecArrangement1D: + size = 0b11 + default: + panic("BUG") + } + return +} + +// encodeAdvancedSIMDThreeSame encodes as "Advanced SIMD three same" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func encodeAdvancedSIMDThreeSame(rd, rn, rm, opcode, size, U, Q uint32) uint32 { + return Q<<30 | U<<29 | 0b111<<25 | size<<22 | 0b1<<21 | rm<<16 | opcode<<11 | 0b1<<10 | rn<<5 | rd +} + +// encodeAdvancedSIMDThreeDifferent encodes as "Advanced SIMD three different" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func encodeAdvancedSIMDThreeDifferent(rd, rn, rm, opcode, size, U, Q uint32) uint32 { + return Q<<30 | U<<29 | 0b111<<25 | size<<22 | 0b1<<21 | rm<<16 | opcode<<12 | rn<<5 | rd +} + +// encodeFloatDataOneSource encodes as "Floating-point data-processing (1 source)" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp +func encodeFloatDataOneSource(op fpuUniOp, rd, rn uint32, dst64bit bool) uint32 { + var opcode, ptype uint32 + switch op { + case fpuUniOpCvt32To64: + opcode = 0b000101 + case fpuUniOpCvt64To32: + opcode = 0b000100 + ptype = 0b01 + case fpuUniOpNeg: + opcode = 0b000010 + if dst64bit { + ptype = 0b01 + } + case fpuUniOpSqrt: + opcode = 0b000011 + if dst64bit { + ptype = 0b01 + } + case fpuUniOpRoundPlus: + opcode = 0b001001 + if dst64bit { + ptype = 0b01 + } + case fpuUniOpRoundMinus: + opcode = 0b001010 + if dst64bit { + ptype = 0b01 + } + case fpuUniOpRoundZero: + opcode = 0b001011 + if dst64bit { + ptype = 0b01 + } + case fpuUniOpRoundNearest: + opcode = 0b001000 + if dst64bit { + ptype = 0b01 + } + case 
fpuUniOpAbs: + opcode = 0b000001 + if dst64bit { + ptype = 0b01 + } + default: + panic("BUG") + } + return 0b1111<<25 | ptype<<22 | 0b1<<21 | opcode<<15 | 0b1<<14 | rn<<5 | rd +} + +// encodeCnvBetweenFloatInt encodes as "Conversion between floating-point and integer" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func encodeCnvBetweenFloatInt(i *instruction) uint32 { + rd := regNumberInEncoding[i.rd.realReg()] + rn := regNumberInEncoding[i.rn.realReg()] + + var opcode uint32 + var rmode uint32 + var ptype uint32 + var sf uint32 + switch i.kind { + case intToFpu: // Either UCVTF or SCVTF. + rmode = 0b00 + + signed := i.u1 == 1 + src64bit := i.u2 == 1 + dst64bit := i.u3 == 1 + if signed { + opcode = 0b010 + } else { + opcode = 0b011 + } + if src64bit { + sf = 0b1 + } + if dst64bit { + ptype = 0b01 + } else { + ptype = 0b00 + } + case fpuToInt: // Either FCVTZU or FCVTZS. + rmode = 0b11 + + signed := i.u1 == 1 + src64bit := i.u2 == 1 + dst64bit := i.u3 == 1 + + if signed { + opcode = 0b000 + } else { + opcode = 0b001 + } + if dst64bit { + sf = 0b1 + } + if src64bit { + ptype = 0b01 + } else { + ptype = 0b00 + } + } + return sf<<31 | 0b1111<<25 | ptype<<22 | 0b1<<21 | rmode<<19 | opcode<<16 | rn<<5 | rd +} + +// encodeAdr encodes a PC-relative ADR instruction. 
+// https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/ADR--Form-PC-relative-address- +func encodeAdr(rd uint32, offset uint32) uint32 { + if offset >= 1<<20 { + panic("BUG: too large adr instruction") + } + return offset&0b11<<29 | 0b1<<28 | offset&0b1111111111_1111111100<<3 | rd +} + +// encodeFpuCSel encodes as "Floating-point conditional select" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func encodeFpuCSel(rd, rn, rm uint32, c condFlag, _64bit bool) uint32 { + var ftype uint32 + if _64bit { + ftype = 0b01 // double precision. + } + return 0b1111<<25 | ftype<<22 | 0b1<<21 | rm<<16 | uint32(c)<<12 | 0b11<<10 | rn<<5 | rd +} + +// encodeMoveToVec encodes as "Move general-purpose register to a vector element" (represented as `ins`) in +// https://developer.arm.com/documentation/dui0801/g/A64-SIMD-Vector-Instructions/MOV--vector--from-general- +// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/MOV--from-general---Move-general-purpose-register-to-a-vector-element--an-alias-of-INS--general--?lang=en +func encodeMoveToVec(rd, rn uint32, arr vecArrangement, index vecIndex) uint32 { + var imm5 uint32 + switch arr { + case vecArrangementB: + imm5 |= 0b1 + imm5 |= uint32(index) << 1 + if index > 0b1111 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 15", index)) + } + case vecArrangementH: + imm5 |= 0b10 + imm5 |= uint32(index) << 2 + if index > 0b111 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 7", index)) + } + case vecArrangementS: + imm5 |= 0b100 + imm5 |= uint32(index) << 3 + if index > 0b11 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 3", index)) + } + case vecArrangementD: + imm5 |= 0b1000 + imm5 |= uint32(index) << 4 + if index > 0b1 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 1", index)) + } 
+ default: + panic("Unsupported arrangement " + arr.String()) + } + + return 0b01001110000<<21 | imm5<<16 | 0b000111<<10 | rn<<5 | rd +} + +// encodeMoveToVec encodes as "Move vector element to another vector element, mov (element)" (represented as `ins`) in +// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/MOV--element---Move-vector-element-to-another-vector-element--an-alias-of-INS--element--?lang=en +// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/INS--element---Insert-vector-element-from-another-vector-element-?lang=en +func encodeVecMovElement(rd, rn uint32, arr vecArrangement, srcIndex, dstIndex uint32) uint32 { + var imm4, imm5 uint32 + switch arr { + case vecArrangementB: + imm5 |= 0b1 + imm5 |= srcIndex << 1 + imm4 = dstIndex + if srcIndex > 0b1111 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 15", srcIndex)) + } + case vecArrangementH: + imm5 |= 0b10 + imm5 |= srcIndex << 2 + imm4 = dstIndex << 1 + if srcIndex > 0b111 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 7", srcIndex)) + } + case vecArrangementS: + imm5 |= 0b100 + imm5 |= srcIndex << 3 + imm4 = dstIndex << 2 + if srcIndex > 0b11 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 3", srcIndex)) + } + case vecArrangementD: + imm5 |= 0b1000 + imm5 |= srcIndex << 4 + imm4 = dstIndex << 3 + if srcIndex > 0b1 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 1", srcIndex)) + } + default: + panic("Unsupported arrangement " + arr.String()) + } + + return 0b01101110000<<21 | imm5<<16 | imm4<<11 | 0b1<<10 | rn<<5 | rd +} + +// encodeUnconditionalBranchReg encodes as "Unconditional branch (register)" in: +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en +func encodeUnconditionalBranchReg(rn uint32, link bool) uint32 { + var opc uint32 + if link { + 
opc = 0b0001 + } + return 0b1101011<<25 | opc<<21 | 0b11111<<16 | rn<<5 +} + +// encodeMoveFromVec encodes as "Move vector element to a general-purpose register" +// (represented as `umov` when dest is 32-bit, `umov` otherwise) in +// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/UMOV--Unsigned-Move-vector-element-to-general-purpose-register-?lang=en +// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/MOV--to-general---Move-vector-element-to-general-purpose-register--an-alias-of-UMOV-?lang=en +func encodeMoveFromVec(rd, rn uint32, arr vecArrangement, index vecIndex, signed bool) uint32 { + var op, imm4, q, imm5 uint32 + switch { + case arr == vecArrangementB: + imm5 |= 0b1 + imm5 |= uint32(index) << 1 + if index > 0b1111 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 15", index)) + } + case arr == vecArrangementH: + imm5 |= 0b10 + imm5 |= uint32(index) << 2 + if index > 0b111 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 7", index)) + } + case arr == vecArrangementS && signed: + q = 0b1 + fallthrough + case arr == vecArrangementS: + imm5 |= 0b100 + imm5 |= uint32(index) << 3 + if index > 0b11 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 3", index)) + } + case arr == vecArrangementD && !signed: + imm5 |= 0b1000 + imm5 |= uint32(index) << 4 + q = 0b1 + if index > 0b1 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 1", index)) + } + default: + panic("Unsupported arrangement " + arr.String()) + } + if signed { + op, imm4 = 0, 0b0101 + } else { + op, imm4 = 0, 0b0111 + } + return op<<29 | 0b01110000<<21 | q<<30 | imm5<<16 | imm4<<11 | 1<<10 | rn<<5 | rd +} + +// encodeVecDup encodes as "Duplicate general-purpose register to vector" DUP (general) +// (represented as `dup`) +// 
https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/DUP--general---Duplicate-general-purpose-register-to-vector-?lang=en +func encodeVecDup(rd, rn uint32, arr vecArrangement) uint32 { + var q, imm5 uint32 + switch arr { + case vecArrangement8B: + q, imm5 = 0b0, 0b1 + case vecArrangement16B: + q, imm5 = 0b1, 0b1 + case vecArrangement4H: + q, imm5 = 0b0, 0b10 + case vecArrangement8H: + q, imm5 = 0b1, 0b10 + case vecArrangement2S: + q, imm5 = 0b0, 0b100 + case vecArrangement4S: + q, imm5 = 0b1, 0b100 + case vecArrangement2D: + q, imm5 = 0b1, 0b1000 + default: + panic("Unsupported arrangement " + arr.String()) + } + return q<<30 | 0b001110000<<21 | imm5<<16 | 0b000011<<10 | rn<<5 | rd +} + +// encodeVecDup encodes as "Duplicate vector element to vector or scalar" DUP (element). +// (represented as `dup`) +// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/DUP--element---Duplicate-vector-element-to-vector-or-scalar- +func encodeVecDupElement(rd, rn uint32, arr vecArrangement, srcIndex vecIndex) uint32 { + var q, imm5 uint32 + q = 0b1 + switch arr { + case vecArrangementB: + imm5 |= 0b1 + imm5 |= uint32(srcIndex) << 1 + case vecArrangementH: + imm5 |= 0b10 + imm5 |= uint32(srcIndex) << 2 + case vecArrangementS: + imm5 |= 0b100 + imm5 |= uint32(srcIndex) << 3 + case vecArrangementD: + imm5 |= 0b1000 + imm5 |= uint32(srcIndex) << 4 + default: + panic("unsupported arrangement" + arr.String()) + } + + return q<<30 | 0b001110000<<21 | imm5<<16 | 0b1<<10 | rn<<5 | rd +} + +// encodeVecExtract encodes as "Advanced SIMD extract." +// Currently only `ext` is defined. 
+// https://developer.arm.com/documentation/ddi0602/2023-06/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp +// https://developer.arm.com/documentation/ddi0602/2023-06/SIMD-FP-Instructions/EXT--Extract-vector-from-pair-of-vectors-?lang=en +func encodeVecExtract(rd, rn, rm uint32, arr vecArrangement, index uint32) uint32 { + var q, imm4 uint32 + switch arr { + case vecArrangement8B: + q, imm4 = 0, 0b0111&uint32(index) + case vecArrangement16B: + q, imm4 = 1, 0b1111&uint32(index) + default: + panic("Unsupported arrangement " + arr.String()) + } + return q<<30 | 0b101110000<<21 | rm<<16 | imm4<<11 | rn<<5 | rd +} + +// encodeVecPermute encodes as "Advanced SIMD permute." +// https://developer.arm.com/documentation/ddi0602/2023-06/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp +func encodeVecPermute(op vecOp, rd, rn, rm uint32, arr vecArrangement) uint32 { + var q, size, opcode uint32 + switch op { + case vecOpZip1: + opcode = 0b011 + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q = arrToSizeQEncoded(arr) + default: + panic("TODO: " + op.String()) + } + return q<<30 | 0b001110<<24 | size<<22 | rm<<16 | opcode<<12 | 0b10<<10 | rn<<5 | rd +} + +// encodeConditionalSelect encodes as "Conditional select" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en#condsel +func encodeConditionalSelect(kind instructionKind, rd, rn, rm uint32, c condFlag, _64bit bool) uint32 { + if kind != cSel { + panic("TODO: support other conditional select") + } + + ret := 0b110101<<23 | rm<<16 | uint32(c)<<12 | rn<<5 | rd + if _64bit { + ret |= 0b1 << 31 + } + return ret +} + +const dummyInstruction uint32 = 0x14000000 // "b 0" + +// encodeLoadFpuConst32 encodes the following three instructions: +// +// ldr s8, #8 ;; literal load of data.f32 +// b 8 ;; skip the data +// data.f32 xxxxxxx +func 
encodeLoadFpuConst32(c backend.Compiler, rd uint32, rawF32 uint64) { + c.Emit4Bytes( + // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/LDR--literal--SIMD-FP---Load-SIMD-FP-Register--PC-relative-literal--?lang=en + 0b111<<26 | (0x8/4)<<5 | rd, + ) + c.Emit4Bytes(encodeUnconditionalBranch(false, 8)) // b 8 + if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable { + // Inlined data.f32 cannot be disassembled, so we add a dummy instruction here. + c.Emit4Bytes(dummyInstruction) + } else { + c.Emit4Bytes(uint32(rawF32)) // data.f32 xxxxxxx + } +} + +// encodeLoadFpuConst64 encodes the following three instructions: +// +// ldr d8, #8 ;; literal load of data.f64 +// b 12 ;; skip the data +// data.f64 xxxxxxx +func encodeLoadFpuConst64(c backend.Compiler, rd uint32, rawF64 uint64) { + c.Emit4Bytes( + // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/LDR--literal--SIMD-FP---Load-SIMD-FP-Register--PC-relative-literal--?lang=en + 0b1<<30 | 0b111<<26 | (0x8/4)<<5 | rd, + ) + c.Emit4Bytes(encodeUnconditionalBranch(false, 12)) // b 12 + if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable { + // Inlined data.f64 cannot be disassembled, so we add dummy instructions here. 
+ c.Emit4Bytes(dummyInstruction) + c.Emit4Bytes(dummyInstruction) + } else { + // data.f64 xxxxxxx + c.Emit4Bytes(uint32(rawF64)) + c.Emit4Bytes(uint32(rawF64 >> 32)) + } +} + +// encodeLoadFpuConst128 encodes the following three instructions: +// +// ldr v8, #8 ;; literal load of data.f64 +// b 20 ;; skip the data +// data.v128 xxxxxxx +func encodeLoadFpuConst128(c backend.Compiler, rd uint32, lo, hi uint64) { + c.Emit4Bytes( + // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/LDR--literal--SIMD-FP---Load-SIMD-FP-Register--PC-relative-literal--?lang=en + 0b1<<31 | 0b111<<26 | (0x8/4)<<5 | rd, + ) + c.Emit4Bytes(encodeUnconditionalBranch(false, 20)) // b 20 + if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable { + // Inlined data.v128 cannot be disassembled, so we add dummy instructions here. + c.Emit4Bytes(dummyInstruction) + c.Emit4Bytes(dummyInstruction) + c.Emit4Bytes(dummyInstruction) + c.Emit4Bytes(dummyInstruction) + } else { + // data.v128 xxxxxxx + c.Emit4Bytes(uint32(lo)) + c.Emit4Bytes(uint32(lo >> 32)) + c.Emit4Bytes(uint32(hi)) + c.Emit4Bytes(uint32(hi >> 32)) + } +} + +// encodeAluRRRR encodes as Data-processing (3 source) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en +func encodeAluRRRR(op aluOp, rd, rn, rm, ra, _64bit uint32) uint32 { + var oO, op31 uint32 + switch op { + case aluOpMAdd: + op31, oO = 0b000, 0b0 + case aluOpMSub: + op31, oO = 0b000, 0b1 + default: + panic("TODO/BUG") + } + return _64bit<<31 | 0b11011<<24 | op31<<21 | rm<<16 | oO<<15 | ra<<10 | rn<<5 | rd +} + +// encodeBitRR encodes as Data-processing (1 source) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en +func encodeBitRR(op bitOp, rd, rn, _64bit uint32) uint32 { + var opcode2, opcode uint32 + switch op { + case bitOpRbit: + opcode2, opcode = 0b00000, 0b000000 + case bitOpClz: + opcode2, opcode = 0b00000, 
0b000100 + default: + panic("TODO/BUG") + } + return _64bit<<31 | 0b1_0_11010110<<21 | opcode2<<15 | opcode<<10 | rn<<5 | rd +} + +func encodeAsMov32(rn, rd uint32) uint32 { + // This is an alias of ORR (shifted register): + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/MOV--register---Move--register---an-alias-of-ORR--shifted-register-- + return encodeLogicalShiftedRegister(0b001, 0, rn, 0, regNumberInEncoding[xzr], rd) +} + +// encodeExtend encodes extension instructions. +func encodeExtend(signed bool, from, to byte, rd, rn uint32) uint32 { + // UTXB: https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/UXTB--Unsigned-Extend-Byte--an-alias-of-UBFM-?lang=en + // UTXH: https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/UXTH--Unsigned-Extend-Halfword--an-alias-of-UBFM-?lang=en + // STXB: https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/SXTB--Signed-Extend-Byte--an-alias-of-SBFM- + // STXH: https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/SXTH--Sign-Extend-Halfword--an-alias-of-SBFM- + // STXW: https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/SXTW--Sign-Extend-Word--an-alias-of-SBFM- + var _31to10 uint32 + switch { + case !signed && from == 8 && to == 32: + // 32-bit UXTB + _31to10 = 0b0101001100000000000111 + case !signed && from == 16 && to == 32: + // 32-bit UXTH + _31to10 = 0b0101001100000000001111 + case !signed && from == 8 && to == 64: + // 64-bit UXTB + _31to10 = 0b0101001100000000000111 + case !signed && from == 16 && to == 64: + // 64-bit UXTH + _31to10 = 0b0101001100000000001111 + case !signed && from == 32 && to == 64: + return encodeAsMov32(rn, rd) + case signed && from == 8 && to == 32: + // 32-bit SXTB + _31to10 = 0b0001001100000000000111 + case signed && from == 16 && to == 32: + // 32-bit SXTH + _31to10 = 0b0001001100000000001111 + case signed && from == 8 && to == 64: + // 64-bit SXTB + _31to10 = 
0b1001001101000000000111 + case signed && from == 16 && to == 64: + // 64-bit SXTH + _31to10 = 0b1001001101000000001111 + case signed && from == 32 && to == 64: + // SXTW + _31to10 = 0b1001001101000000011111 + default: + panic("BUG") + } + return _31to10<<10 | rn<<5 | rd +} + +func encodeLoadOrStore(kind instructionKind, rt uint32, amode addressMode) uint32 { + var _22to31 uint32 + var bits int64 + switch kind { + case uLoad8: + _22to31 = 0b0011100001 + bits = 8 + case sLoad8: + _22to31 = 0b0011100010 + bits = 8 + case uLoad16: + _22to31 = 0b0111100001 + bits = 16 + case sLoad16: + _22to31 = 0b0111100010 + bits = 16 + case uLoad32: + _22to31 = 0b1011100001 + bits = 32 + case sLoad32: + _22to31 = 0b1011100010 + bits = 32 + case uLoad64: + _22to31 = 0b1111100001 + bits = 64 + case fpuLoad32: + _22to31 = 0b1011110001 + bits = 32 + case fpuLoad64: + _22to31 = 0b1111110001 + bits = 64 + case fpuLoad128: + _22to31 = 0b0011110011 + bits = 128 + case store8: + _22to31 = 0b0011100000 + bits = 8 + case store16: + _22to31 = 0b0111100000 + bits = 16 + case store32: + _22to31 = 0b1011100000 + bits = 32 + case store64: + _22to31 = 0b1111100000 + bits = 64 + case fpuStore32: + _22to31 = 0b1011110000 + bits = 32 + case fpuStore64: + _22to31 = 0b1111110000 + bits = 64 + case fpuStore128: + _22to31 = 0b0011110010 + bits = 128 + default: + panic("BUG") + } + + switch amode.kind { + case addressModeKindRegScaledExtended: + return encodeLoadOrStoreExtended(_22to31, + regNumberInEncoding[amode.rn.RealReg()], + regNumberInEncoding[amode.rm.RealReg()], + rt, true, amode.extOp) + case addressModeKindRegScaled: + return encodeLoadOrStoreExtended(_22to31, + regNumberInEncoding[amode.rn.RealReg()], regNumberInEncoding[amode.rm.RealReg()], + rt, true, extendOpNone) + case addressModeKindRegExtended: + return encodeLoadOrStoreExtended(_22to31, + regNumberInEncoding[amode.rn.RealReg()], regNumberInEncoding[amode.rm.RealReg()], + rt, false, amode.extOp) + case addressModeKindRegReg: + return 
encodeLoadOrStoreExtended(_22to31, + regNumberInEncoding[amode.rn.RealReg()], regNumberInEncoding[amode.rm.RealReg()], + rt, false, extendOpNone) + case addressModeKindRegSignedImm9: + // e.g. https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDUR--Load-Register--unscaled-- + return encodeLoadOrStoreSIMM9(_22to31, 0b00 /* unscaled */, regNumberInEncoding[amode.rn.RealReg()], rt, amode.imm) + case addressModeKindPostIndex: + return encodeLoadOrStoreSIMM9(_22to31, 0b01 /* post index */, regNumberInEncoding[amode.rn.RealReg()], rt, amode.imm) + case addressModeKindPreIndex: + return encodeLoadOrStoreSIMM9(_22to31, 0b11 /* pre index */, regNumberInEncoding[amode.rn.RealReg()], rt, amode.imm) + case addressModeKindRegUnsignedImm12: + // "unsigned immediate" in https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Loads-and-Stores?lang=en + rn := regNumberInEncoding[amode.rn.RealReg()] + imm := amode.imm + div := bits / 8 + if imm != 0 && !offsetFitsInAddressModeKindRegUnsignedImm12(byte(bits), imm) { + panic("BUG") + } + imm /= div + return _22to31<<22 | 0b1<<24 | uint32(imm&0b111111111111)<<10 | rn<<5 | rt + default: + panic("BUG") + } +} + +// encodeVecLoad1R encodes as Load one single-element structure and Replicate to all lanes (of one register) in +// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/LD1R--Load-one-single-element-structure-and-Replicate-to-all-lanes--of-one-register--?lang=en#sa_imm +func encodeVecLoad1R(rt, rn uint32, arr vecArrangement) uint32 { + size, q := arrToSizeQEncoded(arr) + return q<<30 | 0b001101010000001100<<12 | size<<10 | rn<<5 | rt +} + +// encodeAluBitmaskImmediate encodes as Logical (immediate) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en +func encodeAluBitmaskImmediate(op aluOp, rd, rn uint32, imm uint64, _64bit bool) uint32 { + var _31to23 uint32 + switch op { + case aluOpAnd: + _31to23 
= 0b00_100100 + case aluOpOrr: + _31to23 = 0b01_100100 + case aluOpEor: + _31to23 = 0b10_100100 + case aluOpAnds: + _31to23 = 0b11_100100 + default: + panic("BUG") + } + if _64bit { + _31to23 |= 0b1 << 8 + } + immr, imms, N := bitmaskImmediate(imm, _64bit) + return _31to23<<23 | uint32(N)<<22 | uint32(immr)<<16 | uint32(imms)<<10 | rn<<5 | rd +} + +func bitmaskImmediate(c uint64, is64bit bool) (immr, imms, N byte) { + var size uint32 + switch { + case c != c>>32|c<<32: + size = 64 + case c != c>>16|c<<48: + size = 32 + c = uint64(int32(c)) + case c != c>>8|c<<56: + size = 16 + c = uint64(int16(c)) + case c != c>>4|c<<60: + size = 8 + c = uint64(int8(c)) + case c != c>>2|c<<62: + size = 4 + c = uint64(int64(c<<60) >> 60) + default: + size = 2 + c = uint64(int64(c<<62) >> 62) + } + + neg := false + if int64(c) < 0 { + c = ^c + neg = true + } + + onesSize, nonZeroPos := getOnesSequenceSize(c) + if neg { + nonZeroPos = onesSize + nonZeroPos + onesSize = size - onesSize + } + + var mode byte = 32 + if is64bit && size == 64 { + N, mode = 0b1, 64 + } + + immr = byte((size - nonZeroPos) & (size - 1) & uint32(mode-1)) + imms = byte((onesSize - 1) | 63&^(size<<1-1)) + return +} + +func getOnesSequenceSize(x uint64) (size, nonZeroPos uint32) { + // Take 0b00111000 for example: + y := getLowestBit(x) // = 0b0000100 + nonZeroPos = setBitPos(y) // = 2 + size = setBitPos(x+y) - nonZeroPos // = setBitPos(0b0100000) - 2 = 5 - 2 = 3 + return +} + +func setBitPos(x uint64) (ret uint32) { + for ; ; ret++ { + if x == 0b1 { + break + } + x = x >> 1 + } + return +} + +// encodeLoadOrStoreExtended encodes store/load instruction as "extended register offset" in Load/store register (register offset): +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Loads-and-Stores?lang=en +func encodeLoadOrStoreExtended(_22to32 uint32, rn, rm, rt uint32, scaled bool, extOp extendOp) uint32 { + var option uint32 + switch extOp { + case extendOpUXTW: + option = 0b010 + case 
extendOpSXTW: + option = 0b110 + case extendOpNone: + option = 0b111 + default: + panic("BUG") + } + var s uint32 + if scaled { + s = 0b1 + } + return _22to32<<22 | 0b1<<21 | rm<<16 | option<<13 | s<<12 | 0b10<<10 | rn<<5 | rt +} + +// encodeLoadOrStoreSIMM9 encodes store/load instruction as one of post-index, pre-index or unscaled immediate as in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Loads-and-Stores?lang=en +func encodeLoadOrStoreSIMM9(_22to32, _1011 uint32, rn, rt uint32, imm9 int64) uint32 { + return _22to32<<22 | (uint32(imm9)&0b111111111)<<12 | _1011<<10 | rn<<5 | rt +} + +// encodeFpuRRR encodes as single or double precision (depending on `_64bit`) of Floating-point data-processing (2 source) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func encodeFpuRRR(op fpuBinOp, rd, rn, rm uint32, _64bit bool) (ret uint32) { + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADD--vector--Add-vectors--scalar--floating-point-and-integer- + var opcode uint32 + switch op { + case fpuBinOpAdd: + opcode = 0b0010 + case fpuBinOpSub: + opcode = 0b0011 + case fpuBinOpMul: + opcode = 0b0000 + case fpuBinOpDiv: + opcode = 0b0001 + case fpuBinOpMax: + opcode = 0b0100 + case fpuBinOpMin: + opcode = 0b0101 + default: + panic("BUG") + } + var ptype uint32 + if _64bit { + ptype = 0b01 + } + return 0b1111<<25 | ptype<<22 | 0b1<<21 | rm<<16 | opcode<<12 | 0b1<<11 | rn<<5 | rd +} + +// encodeAluRRImm12 encodes as Add/subtract (immediate) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en +func encodeAluRRImm12(op aluOp, rd, rn uint32, imm12 uint16, shiftBit byte, _64bit bool) uint32 { + var _31to24 uint32 + switch op { + case aluOpAdd: + _31to24 = 0b00_10001 + case aluOpAddS: + _31to24 = 0b01_10001 + case aluOpSub: + _31to24 = 0b10_10001 + case 
aluOpSubS: + _31to24 = 0b11_10001 + default: + panic("BUG") + } + if _64bit { + _31to24 |= 0b1 << 7 + } + return _31to24<<24 | uint32(shiftBit)<<22 | uint32(imm12&0b111111111111)<<10 | rn<<5 | rd +} + +// encodeAluRRR encodes as Data Processing (shifted register), depending on aluOp. +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_shift +func encodeAluRRRShift(op aluOp, rd, rn, rm, amount uint32, shiftOp shiftOp, _64bit bool) uint32 { + var _31to24 uint32 + var opc, n uint32 + switch op { + case aluOpAdd: + _31to24 = 0b00001011 + case aluOpAddS: + _31to24 = 0b00101011 + case aluOpSub: + _31to24 = 0b01001011 + case aluOpSubS: + _31to24 = 0b01101011 + case aluOpAnd, aluOpOrr, aluOpEor, aluOpAnds: + // "Logical (shifted register)". + switch op { + case aluOpAnd: + // all zeros + case aluOpOrr: + opc = 0b01 + case aluOpEor: + opc = 0b10 + case aluOpAnds: + opc = 0b11 + } + _31to24 = 0b000_01010 + default: + panic(op.String()) + } + + if _64bit { + _31to24 |= 0b1 << 7 + } + + var shift uint32 + switch shiftOp { + case shiftOpLSL: + shift = 0b00 + case shiftOpLSR: + shift = 0b01 + case shiftOpASR: + shift = 0b10 + default: + panic(shiftOp.String()) + } + return opc<<29 | n<<21 | _31to24<<24 | shift<<22 | rm<<16 | (amount << 10) | (rn << 5) | rd +} + +// "Add/subtract (extended register)" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_ext +func encodeAluRRRExtend(ao aluOp, rd, rn, rm uint32, extOp extendOp, to byte) uint32 { + var s, op uint32 + switch ao { + case aluOpAdd: + op = 0b0 + case aluOpAddS: + op, s = 0b0, 0b1 + case aluOpSub: + op = 0b1 + case aluOpSubS: + op, s = 0b1, 0b1 + default: + panic("BUG: extended register operand can be used only for add/sub") + } + + var sf uint32 + if to == 64 { + sf = 0b1 + } + + var option uint32 + switch extOp { + case extendOpUXTB: + option = 0b000 + case extendOpUXTH: + option = 
0b001 + case extendOpUXTW: + option = 0b010 + case extendOpSXTB: + option = 0b100 + case extendOpSXTH: + option = 0b101 + case extendOpSXTW: + option = 0b110 + case extendOpSXTX, extendOpUXTX: + panic(fmt.Sprintf("%s is essentially noop, and should be handled much earlier than encoding", extOp.String())) + } + return sf<<31 | op<<30 | s<<29 | 0b1011001<<21 | rm<<16 | option<<13 | rn<<5 | rd +} + +// encodeAluRRR encodes as Data Processing (register), depending on aluOp. +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en +func encodeAluRRR(op aluOp, rd, rn, rm uint32, _64bit, isRnSp bool) uint32 { + var _31to21, _15to10 uint32 + switch op { + case aluOpAdd: + if isRnSp { + // "Extended register" with UXTW. + _31to21 = 0b00001011_001 + _15to10 = 0b011000 + } else { + // "Shifted register" with shift = 0 + _31to21 = 0b00001011_000 + } + case aluOpAddS: + if isRnSp { + panic("TODO") + } + // "Shifted register" with shift = 0 + _31to21 = 0b00101011_000 + case aluOpSub: + if isRnSp { + // "Extended register" with UXTW. + _31to21 = 0b01001011_001 + _15to10 = 0b011000 + } else { + // "Shifted register" with shift = 0 + _31to21 = 0b01001011_000 + } + case aluOpSubS: + if isRnSp { + panic("TODO") + } + // "Shifted register" with shift = 0 + _31to21 = 0b01101011_000 + case aluOpAnd, aluOpOrr, aluOpOrn, aluOpEor, aluOpAnds: + // "Logical (shifted register)". + var opc, n uint32 + switch op { + case aluOpAnd: + // all zeros + case aluOpOrr: + opc = 0b01 + case aluOpOrn: + opc = 0b01 + n = 1 + case aluOpEor: + opc = 0b10 + case aluOpAnds: + opc = 0b11 + } + _31to21 = 0b000_01010_000 | opc<<8 | n + case aluOpLsl, aluOpAsr, aluOpLsr, aluOpRotR: + // "Data-processing (2 source)". 
+ _31to21 = 0b00011010_110 + switch op { + case aluOpLsl: + _15to10 = 0b001000 + case aluOpLsr: + _15to10 = 0b001001 + case aluOpAsr: + _15to10 = 0b001010 + case aluOpRotR: + _15to10 = 0b001011 + } + case aluOpSDiv: + // "Data-processing (2 source)". + _31to21 = 0b11010110 + _15to10 = 0b000011 + case aluOpUDiv: + // "Data-processing (2 source)". + _31to21 = 0b11010110 + _15to10 = 0b000010 + default: + panic(op.String()) + } + if _64bit { + _31to21 |= 0b1 << 10 + } + return _31to21<<21 | rm<<16 | (_15to10 << 10) | (rn << 5) | rd +} + +// encodeLogicalShiftedRegister encodes as Logical (shifted register) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en +func encodeLogicalShiftedRegister(sf_opc uint32, shift_N uint32, rm uint32, imm6 uint32, rn, rd uint32) (ret uint32) { + ret = sf_opc << 29 + ret |= 0b01010 << 24 + ret |= shift_N << 21 + ret |= rm << 16 + ret |= imm6 << 10 + ret |= rn << 5 + ret |= rd + return +} + +// encodeAddSubtractImmediate encodes as Add/subtract (immediate) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en +func encodeAddSubtractImmediate(sf_op_s uint32, sh uint32, imm12 uint32, rn, rd uint32) (ret uint32) { + ret = sf_op_s << 29 + ret |= 0b100010 << 23 + ret |= sh << 22 + ret |= imm12 << 10 + ret |= rn << 5 + ret |= rd + return +} + +// encodePreOrPostIndexLoadStorePair64 encodes as Load/store pair (pre/post-indexed) in +// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers- +// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STP--Store-Pair-of-Registers- +func encodePreOrPostIndexLoadStorePair64(pre bool, load bool, rn, rt, rt2 uint32, imm7 int64) (ret uint32) { + if imm7%8 != 0 { + panic("imm7 for pair load/store must be a multiple of 8") + } + imm7 /= 8 + ret = rt + ret |= rn << 5 + ret |= rt2 << 10 + ret |= (uint32(imm7) & 
0b1111111) << 15 + if load { + ret |= 0b1 << 22 + } + ret |= 0b101010001 << 23 + if pre { + ret |= 0b1 << 24 + } + return +} + +// encodeUnconditionalBranch encodes as B or BL instructions: +// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/B--Branch- +// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/BL--Branch-with-Link- +func encodeUnconditionalBranch(link bool, imm26 int64) (ret uint32) { + if imm26%4 != 0 { + panic("imm26 for branch must be a multiple of 4") + } + imm26 /= 4 + ret = uint32(imm26 & 0b11_11111111_11111111_11111111) + ret |= 0b101 << 26 + if link { + ret |= 0b1 << 31 + } + return +} + +// encodeCBZCBNZ encodes as either CBZ or CBNZ: +// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CBZ--Compare-and-Branch-on-Zero- +// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CBNZ--Compare-and-Branch-on-Nonzero- +func encodeCBZCBNZ(rt uint32, nz bool, imm19 uint32, _64bit bool) (ret uint32) { + ret = rt + ret |= imm19 << 5 + if nz { + ret |= 1 << 24 + } + ret |= 0b11010 << 25 + if _64bit { + ret |= 1 << 31 + } + return +} + +// encodeMoveWideImmediate encodes as either MOVZ, MOVN or MOVK, as Move wide (immediate) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en +// +// "shift" must have been divided by 16 at this point. 
func encodeMoveWideImmediate(opc uint32, rd uint32, imm, shift, _64bit uint64) (ret uint32) {
	// Field layout, low to high: rd at bit 0, the low 16 bits of imm at bit 5, shift at bit 21,
	// the fixed 0b100101 pattern at bit 23, opc at bit 29 and the _64bit flag at bit 31.
	ret = rd
	ret |= uint32(imm&0xffff) << 5
	ret |= (uint32(shift)) << 21
	ret |= 0b100101 << 23
	ret |= opc << 29
	ret |= uint32(_64bit) << 31
	return
}

// encodeAluRRImm encodes a shift-by-immediate (LSL, LSR or ASR) as "Bitfield" in
// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en#log_imm
//
// _64bit is used as a 0/1 value: it is compared against 1 and also shifted directly into the encoding.
// Any aluOp other than aluOpLsl, aluOpLsr and aluOpAsr panics with the op's name.
func encodeAluRRImm(op aluOp, rd, rn, amount, _64bit uint32) uint32 {
	var opc uint32
	var immr, imms uint32
	switch op {
	case aluOpLsl:
		// LSL (immediate) is an alias for UBFM.
		// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/UBFM--Unsigned-Bitfield-Move-?lang=en
		opc = 0b10
		if amount == 0 {
			// This can be encoded as NOP, but we don't do it for consistency: lsr xn, xm, #0
			immr = 0
			if _64bit == 1 {
				imms = 0b111111
			} else {
				imms = 0b11111
			}
		} else {
			if _64bit == 1 {
				immr = 64 - amount
			} else {
				immr = (32 - amount) & 0b11111
			}
			imms = immr - 1
		}
	case aluOpLsr:
		// LSR (immediate) is an alias for UBFM.
		// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LSR--immediate---Logical-Shift-Right--immediate---an-alias-of-UBFM-?lang=en
		opc = 0b10
		imms, immr = 0b011111|_64bit<<5, amount
	case aluOpAsr:
		// ASR (immediate) is an alias for SBFM.
		// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/SBFM--Signed-Bitfield-Move-?lang=en
		opc = 0b00
		imms, immr = 0b011111|_64bit<<5, amount
	default:
		panic(op.String())
	}
	// _64bit is placed at both bit 31 and bit 22.
	return _64bit<<31 | opc<<29 | 0b100110<<23 | _64bit<<22 | immr<<16 | imms<<10 | rn<<5 | rd
}

// encodeVecLanes encodes as Data Processing (Advanced SIMD across lanes) depending on vecOp in
// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
//
// Only the 8B, 16B, 4H, 8H and 4S arrangements and the uaddlv, uminv and addv ops are accepted;
// anything else panics.
func encodeVecLanes(op vecOp, rd uint32, rn uint32, arr vecArrangement) uint32 {
	var u, q, size, opcode uint32
	switch arr {
	case vecArrangement8B:
		q, size = 0b0, 0b00
	case vecArrangement16B:
		q, size = 0b1, 0b00
	case vecArrangement4H:
		q, size = 0, 0b01
	case vecArrangement8H:
		q, size = 1, 0b01
	case vecArrangement4S:
		q, size = 1, 0b10
	default:
		panic("unsupported arrangement: " + arr.String())
	}
	switch op {
	case vecOpUaddlv:
		u, opcode = 1, 0b00011
	case vecOpUminv:
		u, opcode = 1, 0b11010
	case vecOpAddv:
		u, opcode = 0, 0b11011
	default:
		panic("unsupported or illegal vecOp: " + op.String())
	}
	return q<<30 | u<<29 | 0b1110<<24 | size<<22 | 0b11000<<17 | opcode<<12 | 0b10<<10 | rn<<5 | rd
}

// encodeVecShiftImm encodes sshll, ushll or sshr with an immediate shift amount, as
// Data Processing (Advanced SIMD shift by immediate) depending on vecOp in
// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
//
// For every arrangement the immh:immb pair is derived from (element size - amount), where the
// amount is first masked to the element's bit width.
// NOTE(review): for the 8B/16B arrangements immb is not masked to three bits, so amount%8 == 0
// yields immb == 8, whose extra bit lands on the already-set low bit of immh — confirm this is intended.
func encodeVecShiftImm(op vecOp, rd uint32, rn, amount uint32, arr vecArrangement) uint32 {
	var u, q, immh, immb, opcode uint32
	switch op {
	case vecOpSshll:
		u, opcode = 0b0, 0b10100
	case vecOpUshll:
		u, opcode = 0b1, 0b10100
	case vecOpSshr:
		u, opcode = 0, 0b00000
	default:
		panic("unsupported or illegal vecOp: " + op.String())
	}
	// The wider arrangement of each pair sets q and then falls through to share the immh/immb
	// computation with its narrower sibling.
	switch arr {
	case vecArrangement16B:
		q = 0b1
		fallthrough
	case vecArrangement8B:
		immh = 0b0001
		immb = 8 - uint32(amount&0b111)
	case vecArrangement8H:
		q = 0b1
		fallthrough
	case vecArrangement4H:
		v := 16 - uint32(amount&0b1111)
		immb = v & 0b111
		immh = 0b0010 | (v >> 3)
	case vecArrangement4S:
		q = 0b1
		fallthrough
	case vecArrangement2S:
		v := 32 - uint32(amount&0b11111)
		immb = v & 0b111
		immh = 0b0100 | (v >> 3)
	case vecArrangement2D:
		q = 0b1
		v := 64 - uint32(amount&0b111111)
		immb = v & 0b111
		immh = 0b1000 | (v >> 3)
	default:
		panic("unsupported arrangement: " + arr.String())
	}
	// Note: `0b000001<<10` and `0b1<<10` below set the same bit; the duplication is harmless.
	return q<<30 | u<<29 | 0b011110<<23 | immh<<19 | immb<<16 | 0b000001<<10 | opcode<<11 | 0b1<<10 | rn<<5 | rd
}

// encodeVecTbl encodes as Data Processing (Advanced SIMD table lookup) in
// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp
//
// nregs selects the number of table registers (1 or 2); any other value panics, as does any
// arrangement other than 8B or 16B. The op2 and op fields are never assigned and are therefore
// always encoded as zero.
func encodeVecTbl(nregs, rd, rn, rm uint32, arr vecArrangement) uint32 {
	var q, op2, len, op uint32

	switch nregs {
	case 1:
		// tbl: single-register
		len = 0b00
	case 2:
		// tbl2: 2-register table
		len = 0b01
	default:
		panic(fmt.Sprintf("unsupported number or registers %d", nregs))
	}
	switch arr {
	case vecArrangement8B:
		q = 0b0
	case vecArrangement16B:
		q = 0b1
	default:
		panic("unsupported arrangement: " + arr.String())
	}

	return q<<30 | 0b001110<<24 | op2<<22 | rm<<16 | len<<13 | op<<12 | rn<<5 | rd
}

// encodeAdvancedSIMDTwoMisc encodes as Data Processing (Advanced SIMD two-register miscellaneous) depending on vecOp in
// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp
//
// Each case picks the u bit and the opcode for the op, then derives q and size from the arrangement,
// either via an explicit per-op table or via arrToSizeQEncoded. Unsupported ops or arrangements panic.
func encodeAdvancedSIMDTwoMisc(op vecOp, rd, rn uint32, arr vecArrangement) uint32 {
	var q, u, size, opcode uint32
	switch op {
	case vecOpCnt:
		opcode = 0b00101
		switch arr {
		case vecArrangement8B:
			q, size = 0b0, 0b00
		case vecArrangement16B:
			q, size = 0b1, 0b00
		default:
			panic("unsupported arrangement: " + arr.String())
		}
	case vecOpCmeq0:
		if arr == vecArrangement1D {
			panic("unsupported arrangement: " + arr.String())
		}
		opcode = 0b01001
		size, q = arrToSizeQEncoded(arr)
	case vecOpNot:
		u = 1
		opcode = 0b00101
		switch arr {
		case vecArrangement8B:
			q, size = 0b0, 0b00
		case vecArrangement16B:
			q, size = 0b1, 0b00
		default:
			panic("unsupported arrangement: " + arr.String())
		}
	case vecOpAbs:
		if arr == vecArrangement1D {
			panic("unsupported arrangement: " + arr.String())
		}
		opcode = 0b01011
		u = 0b0
		size, q = arrToSizeQEncoded(arr)
	case vecOpNeg:
		if arr == vecArrangement1D {
			panic("unsupported arrangement: " + arr.String())
		}
		opcode = 0b01011
		u = 0b1
		size, q = arrToSizeQEncoded(arr)
	case vecOpFabs:
		if arr < vecArrangement2S || arr == vecArrangement1D {
			panic("unsupported arrangement: " + arr.String())
		}
		opcode = 0b01111
		u = 0b0
		size, q = arrToSizeQEncoded(arr)
	case vecOpFneg:
		if arr < vecArrangement2S || arr == vecArrangement1D {
			panic("unsupported arrangement: " + arr.String())
		}
		opcode = 0b01111
		u = 0b1
		size, q = arrToSizeQEncoded(arr)
	case vecOpFrintm:
		u = 0b0
		opcode = 0b11001
		switch arr {
		case vecArrangement2S:
			q, size = 0b0, 0b00
		case vecArrangement4S:
			q, size = 0b1, 0b00
		case vecArrangement2D:
			q, size = 0b1, 0b01
		default:
			panic("unsupported arrangement: " + arr.String())
		}
	case vecOpFrintn:
		u = 0b0
		opcode = 0b11000
		switch arr {
		case vecArrangement2S:
			q, size = 0b0, 0b00
		case vecArrangement4S:
			q, size = 0b1, 0b00
		case vecArrangement2D:
			q, size = 0b1, 0b01
		default:
			panic("unsupported arrangement: " + arr.String())
		}
	case vecOpFrintp:
		u = 0b0
		opcode = 0b11000
		if arr < vecArrangement2S || arr == vecArrangement1D {
			panic("unsupported arrangement: " + arr.String())
		}
		size, q = arrToSizeQEncoded(arr)
	case vecOpFrintz:
		u = 0b0
		opcode = 0b11001
		if arr < vecArrangement2S || arr == vecArrangement1D {
			panic("unsupported arrangement: " + arr.String())
		}
		size, q = arrToSizeQEncoded(arr)
	case vecOpFsqrt:
		if arr < vecArrangement2S || arr == vecArrangement1D {
			panic("unsupported arrangement: " + arr.String())
		}
		opcode = 0b11111
		u = 0b1
		size, q = arrToSizeQEncoded(arr)
	case vecOpFcvtl:
		opcode = 0b10111
		u = 0b0
		switch arr {
		case vecArrangement2S:
			size, q = 0b01, 0b0
		case vecArrangement4H:
			size, q = 0b00, 0b0
		default:
			panic("unsupported arrangement: " + arr.String())
		}
	case vecOpFcvtn:
		opcode = 0b10110
		u = 0b0
		switch arr {
		case vecArrangement2S:
			size, q = 0b01, 0b0
		case vecArrangement4H:
			size, q = 0b00, 0b0
		default:
			panic("unsupported arrangement: " + arr.String())
		}
	case vecOpFcvtzs:
		opcode = 0b11011
		u = 0b0
		switch arr {
		case vecArrangement2S:
			q, size = 0b0, 0b10
		case vecArrangement4S:
			q, size = 0b1, 0b10
		case vecArrangement2D:
			q, size = 0b1, 0b11
		default:
			panic("unsupported arrangement: " + arr.String())
		}
	case vecOpFcvtzu:
		opcode = 0b11011
		u = 0b1
		switch arr {
		case vecArrangement2S:
			q, size = 0b0, 0b10
		case vecArrangement4S:
			q, size = 0b1, 0b10
		case vecArrangement2D:
			q, size = 0b1, 0b11
		default:
			panic("unsupported arrangement: " + arr.String())
		}
	case vecOpScvtf:
		opcode = 0b11101
		u = 0b0
		switch arr {
		case vecArrangement4S:
			q, size = 0b1, 0b00
		case vecArrangement2S:
			q, size = 0b0, 0b00
		case vecArrangement2D:
			q, size = 0b1, 0b01
		default:
			panic("unsupported arrangement: " + arr.String())
		}
	case vecOpUcvtf:
		opcode = 0b11101
		u = 0b1
		switch arr {
		case vecArrangement4S:
			q, size = 0b1, 0b00
		case vecArrangement2S:
			q, size = 0b0, 0b00
		case vecArrangement2D:
			q, size = 0b1, 0b01
		default:
			panic("unsupported arrangement: " + arr.String())
		}
	case vecOpSqxtn:
		// When q == 1 it encodes sqxtn2 (operates on upper 64 bits).
		opcode = 0b10100
		u = 0b0
		if arr > vecArrangement4S {
			panic("unsupported arrangement: " + arr.String())
		}
		size, q = arrToSizeQEncoded(arr)
	case vecOpUqxtn:
		// When q == 1 it encodes uqxtn2 (operates on upper 64 bits).
		opcode = 0b10100
		u = 0b1
		if arr > vecArrangement4S {
			panic("unsupported arrangement: " + arr.String())
		}
		size, q = arrToSizeQEncoded(arr)
	case vecOpSqxtun:
		// When q == 1 it encodes sqxtun2 (operates on upper 64 bits).
		opcode = 0b10010 // 0b10100
		u = 0b1
		if arr > vecArrangement4S {
			panic("unsupported arrangement: " + arr.String())
		}
		size, q = arrToSizeQEncoded(arr)
	case vecOpRev64:
		opcode = 0b00000
		size, q = arrToSizeQEncoded(arr)
	case vecOpXtn:
		u = 0b0
		opcode = 0b10010
		size, q = arrToSizeQEncoded(arr)
	case vecOpShll:
		u = 0b1
		opcode = 0b10011
		switch arr {
		case vecArrangement8B:
			q, size = 0b0, 0b00
		case vecArrangement4H:
			q, size = 0b0, 0b01
		case vecArrangement2S:
			q, size = 0b0, 0b10
		default:
			panic("unsupported arrangement: " + arr.String())
		}
	default:
		panic("unsupported or illegal vecOp: " + op.String())
	}
	return q<<30 | u<<29 | 0b01110<<24 | size<<22 | 0b10000<<17 | opcode<<12 | 0b10<<10 | rn<<5 | rd
}

// brTableSequenceOffsetTableBegin is the offset inside the brTableSequence where the table begins after 4 instructions
const brTableSequenceOffsetTableBegin = 16

// encodeBrTableSequence emits the four-instruction jump-table dispatch followed by one 4-byte
// entry per element of targets.
func encodeBrTableSequence(c backend.Compiler, index regalloc.VReg, targets []uint32) {
	tmpRegNumber := regNumberInEncoding[tmp]
	indexNumber := regNumberInEncoding[index.RealReg()]

	// adr tmpReg, PC+16 (PC+16 is the address of the first label offset)
	// ldrsw index, [tmpReg, index, UXTW 2] ;; index = int64(*(tmpReg + index*4)); each table entry is 4 bytes.
	// add tmpReg, tmpReg, index
	// br tmpReg
	// [offset_to_l1, offset_to_l2, ..., offset_to_lN]
	c.Emit4Bytes(encodeAdr(tmpRegNumber, 16))
	c.Emit4Bytes(encodeLoadOrStore(sLoad32, indexNumber,
		addressMode{kind: addressModeKindRegScaledExtended, rn: tmpRegVReg, rm: index, extOp: extendOpUXTW},
	))
	c.Emit4Bytes(encodeAluRRR(aluOpAdd, tmpRegNumber, tmpRegNumber, indexNumber, true, false))
	c.Emit4Bytes(encodeUnconditionalBranchReg(tmpRegNumber, false))

	// Offsets are resolved in ResolveRelativeAddress phase.
	for _, offset := range targets {
		if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable {
			// Inlined offset tables cannot be disassembled properly, so pad dummy instructions to make the debugging easier.
			c.Emit4Bytes(dummyInstruction)
		} else {
			c.Emit4Bytes(offset)
		}
	}
}

// encodeExitSequence matches the implementation detail of functionABI.emitGoEntryPreamble.
//
// It always emits exactly six 4-byte words: either a leading mov (when ctxReg is fp or lr) or a
// trailing dummy instruction pads the five-instruction restore-and-return sequence.
func encodeExitSequence(c backend.Compiler, ctxReg regalloc.VReg) {
	// Restore the FP, SP and LR, and return to the Go code:
	// 		ldr fp,  [ctxReg, #OriginalFramePointer]
	// 		ldr lr,  [ctxReg, #GoReturnAddress]
	// 		ldr tmp, [ctxReg, #OriginalStackPointer]
	//      mov sp, tmp ;; sp cannot be str'ed directly.
	// 		ret ;; --> return to the Go code

	var ctxEvicted bool
	if ctx := ctxReg.RealReg(); ctx == fp || ctx == lr {
		// In order to avoid overwriting the context register, we move ctxReg to tmp.
		c.Emit4Bytes(encodeMov64(regNumberInEncoding[tmp], regNumberInEncoding[ctx], false, false))
		ctxReg = tmpRegVReg
		ctxEvicted = true
	}

	restoreLr := encodeLoadOrStore(
		uLoad64,
		regNumberInEncoding[lr],
		addressMode{
			kind: addressModeKindRegUnsignedImm12,
			rn:   ctxReg,
			imm:  wazevoapi.ExecutionContextOffsetGoReturnAddress.I64(),
		},
	)

	restoreFp := encodeLoadOrStore(
		uLoad64,
		regNumberInEncoding[fp],
		addressMode{
			kind: addressModeKindRegUnsignedImm12,
			rn:   ctxReg,
			imm:  wazevoapi.ExecutionContextOffsetOriginalFramePointer.I64(),
		},
	)

	restoreSpToTmp := encodeLoadOrStore(
		uLoad64,
		regNumberInEncoding[tmp],
		addressMode{
			kind: addressModeKindRegUnsignedImm12,
			rn:   ctxReg,
			imm:  wazevoapi.ExecutionContextOffsetOriginalStackPointer.I64(),
		},
	)

	// "mov sp, tmp" is expressed as an add-immediate with a zero immediate.
	movTmpToSp := encodeAddSubtractImmediate(0b100, 0, 0,
		regNumberInEncoding[tmp], regNumberInEncoding[sp])

	// The load into tmp is emitted last of the three loads: when ctxReg has been evicted into tmp,
	// that load overwrites the context pointer itself.
	c.Emit4Bytes(restoreFp)
	c.Emit4Bytes(restoreLr)
	c.Emit4Bytes(restoreSpToTmp)
	c.Emit4Bytes(movTmpToSp)
	c.Emit4Bytes(encodeRet())
	if !ctxEvicted {
		// In order to have the fixed-length exit sequence, we need to pad the binary.
		// Since this will never be reached, we insert a dummy instruction.
		c.Emit4Bytes(dummyInstruction)
	}
}

// encodeRet encodes RET using the link register as the return target.
func encodeRet() uint32 {
	// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/RET--Return-from-subroutine-?lang=en
	return 0b1101011001011111<<16 | regNumberInEncoding[lr]<<5
}

// encodeAtomicRmw encodes an atomic read-modify-write instruction for the given op.
//
// size is the access width in bytes (1, 2, 4 or 8). Neither switch has a default case, so an
// unlisted size or op silently leaves its field as zero, i.e. it is encoded like size 1 / atomicRmwOpAdd.
func encodeAtomicRmw(op atomicRmwOp, rs, rt, rn uint32, size uint32) uint32 {
	var _31to21, _15to10, sz uint32

	switch size {
	case 8:
		sz = 0b11
	case 4:
		sz = 0b10
	case 2:
		sz = 0b01
	case 1:
		sz = 0b00
	}

	_31to21 = 0b00111000_111 | sz<<9

	switch op {
	case atomicRmwOpAdd:
		_15to10 = 0b000000
	case atomicRmwOpClr:
		_15to10 = 0b000100
	case atomicRmwOpSet:
		_15to10 = 0b001100
	case atomicRmwOpEor:
		_15to10 = 0b001000
	case atomicRmwOpSwp:
		_15to10 = 0b100000
	}

	return _31to21<<21 | rs<<16 | _15to10<<10 | rn<<5 | rt
}

// encodeAtomicCas encodes an atomic compare-and-swap.
//
// size is the access width in bytes (1, 2, 4 or 8); an unlisted size is silently encoded like size 1.
func encodeAtomicCas(rs, rt, rn uint32, size uint32) uint32 {
	var _31to21, _15to10, sz uint32

	switch size {
	case 8:
		sz = 0b11
	case 4:
		sz = 0b10
	case 2:
		sz = 0b01
	case 1:
		sz = 0b00
	}

	_31to21 = 0b00001000_111 | sz<<9
	_15to10 = 0b111111

	return _31to21<<21 | rs<<16 | _15to10<<10 | rn<<5 | rt
}

// encodeAtomicLoadStore encodes an atomic load or store.
//
// size is the access width in bytes (1, 2, 4 or 8); an unlisted size is silently encoded like size 1.
// l is OR-ed into the encoding at bit 22 (presumably 1 for load and 0 for store — confirm against callers).
func encodeAtomicLoadStore(rn, rt, size, l uint32) uint32 {
	var _31to21, _20to16, _15to10, sz uint32

	switch size {
	case 8:
		sz = 0b11
	case 4:
		sz = 0b10
	case 2:
		sz = 0b01
	case 1:
		sz = 0b00
	}

	_31to21 = 0b00001000_100 | sz<<9 | l<<1
	_20to16 = 0b11111
	_15to10 = 0b111111

	return _31to21<<21 | _20to16<<16 | _15to10<<10 | rn<<5 | rt
}

// encodeDMB returns the fixed encoding of the data memory barrier used by this backend.
func encodeDMB() uint32 {
	return 0b11010101000000110011101110111111
}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go new file mode 100644 index 000000000..698b382d4 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go @@ -0,0 +1,301 @@
package arm64

import (
	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
	"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)

// lowerConstant allocates a new VReg and inserts the instruction to load the constant value.
func (m *machine) lowerConstant(instr *ssa.Instruction) (vr regalloc.VReg) {
	val := instr.Return()
	valType := val.Type()

	vr = m.compiler.AllocateVReg(valType)
	v := instr.ConstantVal()
	m.insertLoadConstant(v, valType, vr)
	return
}

// InsertLoadConstantBlockArg implements backend.Machine.
//
// It only records the constant, its type and the destination in a placeholder instruction;
// the placeholder is expanded by lowerLoadConstantBlockArgAfterRegAlloc.
func (m *machine) InsertLoadConstantBlockArg(instr *ssa.Instruction, vr regalloc.VReg) {
	val := instr.Return()
	valType := val.Type()
	v := instr.ConstantVal()
	load := m.allocateInstr()
	load.asLoadConstBlockArg(v, valType, vr)
	m.insert(load)
}

// lowerLoadConstantBlockArgAfterRegAlloc expands the placeholder created by
// InsertLoadConstantBlockArg into the actual constant-loading instructions.
func (m *machine) lowerLoadConstantBlockArgAfterRegAlloc(i *instruction) {
	v, typ, dst := i.loadConstBlockArgData()
	m.insertLoadConstant(v, typ, dst)
}

// insertLoadConstant inserts the instructions that materialize the constant v of type valType into vr.
//
// Float constants use the FPU constant-load instructions, integer zero is a move from the zero
// register, and other integers go through lowerConstantI32 / lowerConstantI64.
// Any other type panics with "TODO".
func (m *machine) insertLoadConstant(v uint64, valType ssa.Type, vr regalloc.VReg) {
	if valType.Bits() < 64 { // Clear the redundant bits just in case it's unexpectedly sign-extended, etc.
		v = v & ((1 << valType.Bits()) - 1)
	}

	switch valType {
	case ssa.TypeF32:
		loadF := m.allocateInstr()
		loadF.asLoadFpuConst32(vr, v)
		m.insert(loadF)
	case ssa.TypeF64:
		loadF := m.allocateInstr()
		loadF.asLoadFpuConst64(vr, v)
		m.insert(loadF)
	case ssa.TypeI32:
		if v == 0 {
			m.InsertMove(vr, xzrVReg, ssa.TypeI32)
		} else {
			m.lowerConstantI32(vr, int32(v))
		}
	case ssa.TypeI64:
		if v == 0 {
			m.InsertMove(vr, xzrVReg, ssa.TypeI64)
		} else {
			m.lowerConstantI64(vr, int64(v))
		}
	default:
		panic("TODO")
	}
}

// The following logics are based on the old asm/arm64 package.
+// https://github.com/tetratelabs/wazero/blob/39f2ff23a6d609e10c82b9cc0b981f6de5b87a9c/internal/asm/arm64/impl.go + +func (m *machine) lowerConstantI32(dst regalloc.VReg, c int32) { + // Following the logic here: + // https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1637 + ic := int64(uint32(c)) + if ic >= 0 && (ic <= 0xfff || (ic&0xfff) == 0 && (uint64(ic>>12) <= 0xfff)) { + if isBitMaskImmediate(uint64(c), false) { + m.lowerConstViaBitMaskImmediate(uint64(uint32(c)), dst, false) + return + } + } + + if t := const16bitAligned(int64(uint32(c))); t >= 0 { + // If the const can fit within 16-bit alignment, for example, 0xffff, 0xffff_0000 or 0xffff_0000_0000_0000 + // We could load it into temporary with movk. + m.insertMOVZ(dst, uint64(uint32(c)>>(16*t)), t, false) + } else if t := const16bitAligned(int64(^c)); t >= 0 { + // Also, if the inverse of the const can fit within 16-bit range, do the same ^^. + m.insertMOVN(dst, uint64(^c>>(16*t)), t, false) + } else if isBitMaskImmediate(uint64(uint32(c)), false) { + m.lowerConstViaBitMaskImmediate(uint64(c), dst, false) + } else { + // Otherwise, we use MOVZ and MOVK to load it. + c16 := uint16(c) + m.insertMOVZ(dst, uint64(c16), 0, false) + c16 = uint16(uint32(c) >> 16) + m.insertMOVK(dst, uint64(c16), 1, false) + } +} + +func (m *machine) lowerConstantI64(dst regalloc.VReg, c int64) { + // Following the logic here: + // https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1798-L1852 + if c >= 0 && (c <= 0xfff || (c&0xfff) == 0 && (uint64(c>>12) <= 0xfff)) { + if isBitMaskImmediate(uint64(c), true) { + m.lowerConstViaBitMaskImmediate(uint64(c), dst, true) + return + } + } + + if t := const16bitAligned(c); t >= 0 { + // If the const can fit within 16-bit alignment, for example, 0xffff, 0xffff_0000 or 0xffff_0000_0000_0000 + // We could load it into temporary with movk. 
+ m.insertMOVZ(dst, uint64(c)>>(16*t), t, true) + } else if t := const16bitAligned(^c); t >= 0 { + // Also, if the reverse of the const can fit within 16-bit range, do the same ^^. + m.insertMOVN(dst, uint64(^c)>>(16*t), t, true) + } else if isBitMaskImmediate(uint64(c), true) { + m.lowerConstViaBitMaskImmediate(uint64(c), dst, true) + } else { + m.load64bitConst(c, dst) + } +} + +func (m *machine) lowerConstViaBitMaskImmediate(c uint64, dst regalloc.VReg, b64 bool) { + instr := m.allocateInstr() + instr.asALUBitmaskImm(aluOpOrr, dst, xzrVReg, c, b64) + m.insert(instr) +} + +// isBitMaskImmediate determines if the value can be encoded as "bitmask immediate". +// +// Such an immediate is a 32-bit or 64-bit pattern viewed as a vector of identical elements of size e = 2, 4, 8, 16, 32, or 64 bits. +// Each element contains the same sub-pattern: a single run of 1 to e-1 non-zero bits, rotated by 0 to e-1 bits. +// +// See https://developer.arm.com/documentation/dui0802/b/A64-General-Instructions/MOV--bitmask-immediate- +func isBitMaskImmediate(x uint64, _64 bool) bool { + // All zeros and ones are not "bitmask immediate" by definition. + if x == 0 || (_64 && x == 0xffff_ffff_ffff_ffff) || (!_64 && x == 0xffff_ffff) { + return false + } + + switch { + case x != x>>32|x<<32: + // e = 64 + case x != x>>16|x<<48: + // e = 32 (x == x>>32|x<<32). + // e.g. 0x00ff_ff00_00ff_ff00 + x = uint64(int32(x)) + case x != x>>8|x<<56: + // e = 16 (x == x>>16|x<<48). + // e.g. 0x00ff_00ff_00ff_00ff + x = uint64(int16(x)) + case x != x>>4|x<<60: + // e = 8 (x == x>>8|x<<56). + // e.g. 0x0f0f_0f0f_0f0f_0f0f + x = uint64(int8(x)) + default: + // e = 4 or 2. + return true + } + return sequenceOfSetbits(x) || sequenceOfSetbits(^x) +} + +// sequenceOfSetbits returns true if the number's binary representation is the sequence set bit (1). 
+// For example: 0b1110 -> true, 0b1010 -> false +func sequenceOfSetbits(x uint64) bool { + y := getLowestBit(x) + // If x is a sequence of set bit, this should results in the number + // with only one set bit (i.e. power of two). + y += x + return (y-1)&y == 0 +} + +func getLowestBit(x uint64) uint64 { + return x & (^x + 1) +} + +// const16bitAligned check if the value is on the 16-bit alignment. +// If so, returns the shift num divided by 16, and otherwise -1. +func const16bitAligned(v int64) (ret int) { + ret = -1 + for s := 0; s < 64; s += 16 { + if (uint64(v) &^ (uint64(0xffff) << uint(s))) == 0 { + ret = s / 16 + break + } + } + return +} + +// load64bitConst loads a 64-bit constant into the register, following the same logic to decide how to load large 64-bit +// consts as in the Go assembler. +// +// See https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L6632-L6759 +func (m *machine) load64bitConst(c int64, dst regalloc.VReg) { + var bits [4]uint64 + var zeros, negs int + for i := 0; i < 4; i++ { + bits[i] = uint64(c) >> uint(i*16) & 0xffff + if v := bits[i]; v == 0 { + zeros++ + } else if v == 0xffff { + negs++ + } + } + + if zeros == 3 { + // one MOVZ instruction. + for i, v := range bits { + if v != 0 { + m.insertMOVZ(dst, v, i, true) + } + } + } else if negs == 3 { + // one MOVN instruction. + for i, v := range bits { + if v != 0xffff { + v = ^v + m.insertMOVN(dst, v, i, true) + } + } + } else if zeros == 2 { + // one MOVZ then one OVK. + var movz bool + for i, v := range bits { + if !movz && v != 0 { // MOVZ. + m.insertMOVZ(dst, v, i, true) + movz = true + } else if v != 0 { + m.insertMOVK(dst, v, i, true) + } + } + + } else if negs == 2 { + // one MOVN then one or two MOVK. + var movn bool + for i, v := range bits { // Emit MOVN. 
+ if !movn && v != 0xffff { + v = ^v + // https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN + m.insertMOVN(dst, v, i, true) + movn = true + } else if v != 0xffff { + m.insertMOVK(dst, v, i, true) + } + } + + } else if zeros == 1 { + // one MOVZ then two MOVK. + var movz bool + for i, v := range bits { + if !movz && v != 0 { // MOVZ. + m.insertMOVZ(dst, v, i, true) + movz = true + } else if v != 0 { + m.insertMOVK(dst, v, i, true) + } + } + + } else if negs == 1 { + // one MOVN then two MOVK. + var movn bool + for i, v := range bits { // Emit MOVN. + if !movn && v != 0xffff { + v = ^v + // https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN + m.insertMOVN(dst, v, i, true) + movn = true + } else if v != 0xffff { + m.insertMOVK(dst, v, i, true) + } + } + + } else { + // one MOVZ then up to three MOVK. + var movz bool + for i, v := range bits { + if !movz && v != 0 { // MOVZ. + m.insertMOVZ(dst, v, i, true) + movz = true + } else if v != 0 { + m.insertMOVK(dst, v, i, true) + } + } + } +} + +func (m *machine) insertMOVZ(dst regalloc.VReg, v uint64, shift int, dst64 bool) { + instr := m.allocateInstr() + instr.asMOVZ(dst, v, uint64(shift), dst64) + m.insert(instr) +} + +func (m *machine) insertMOVK(dst regalloc.VReg, v uint64, shift int, dst64 bool) { + instr := m.allocateInstr() + instr.asMOVK(dst, v, uint64(shift), dst64) + m.insert(instr) +} + +func (m *machine) insertMOVN(dst regalloc.VReg, v uint64, shift int, dst64 bool) { + instr := m.allocateInstr() + instr.asMOVN(dst, v, uint64(shift), dst64) + m.insert(instr) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go new file mode 100644 index 000000000..2bb234e8c --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go @@ -0,0 +1,2221 @@ +package arm64 + 
+// Files prefixed as lower_instr** do the instruction selection, meaning that lowering SSA level instructions +// into machine specific instructions. +// +// Importantly, what the lower** functions does includes tree-matching; find the pattern from the given instruction tree, +// and merge the multiple instructions if possible. It can be considered as "N:1" instruction selection. + +import ( + "fmt" + "math" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// LowerSingleBranch implements backend.Machine. +func (m *machine) LowerSingleBranch(br *ssa.Instruction) { + ectx := m.executableContext + switch br.Opcode() { + case ssa.OpcodeJump: + _, _, targetBlk := br.BranchData() + if br.IsFallthroughJump() { + return + } + b := m.allocateInstr() + target := ectx.GetOrAllocateSSABlockLabel(targetBlk) + if target == labelReturn { + b.asRet() + } else { + b.asBr(target) + } + m.insert(b) + case ssa.OpcodeBrTable: + m.lowerBrTable(br) + default: + panic("BUG: unexpected branch opcode" + br.Opcode().String()) + } +} + +func (m *machine) lowerBrTable(i *ssa.Instruction) { + index, targets := i.BrTableData() + indexOperand := m.getOperand_NR(m.compiler.ValueDefinition(index), extModeNone) + + // Firstly, we have to do the bounds check of the index, and + // set it to the default target (sitting at the end of the list) if it's out of bounds. + + // mov maxIndexReg #maximum_index + // subs wzr, index, maxIndexReg + // csel adjustedIndex, maxIndexReg, index, hs ;; if index is higher or equal than maxIndexReg. 
+ maxIndexReg := m.compiler.AllocateVReg(ssa.TypeI32) + m.lowerConstantI32(maxIndexReg, int32(len(targets)-1)) + subs := m.allocateInstr() + subs.asALU(aluOpSubS, operandNR(xzrVReg), indexOperand, operandNR(maxIndexReg), false) + m.insert(subs) + csel := m.allocateInstr() + adjustedIndex := m.compiler.AllocateVReg(ssa.TypeI32) + csel.asCSel(operandNR(adjustedIndex), operandNR(maxIndexReg), indexOperand, hs, false) + m.insert(csel) + + brSequence := m.allocateInstr() + + tableIndex := m.addJmpTableTarget(targets) + brSequence.asBrTableSequence(adjustedIndex, tableIndex, len(targets)) + m.insert(brSequence) +} + +// LowerConditionalBranch implements backend.Machine. +func (m *machine) LowerConditionalBranch(b *ssa.Instruction) { + exctx := m.executableContext + cval, args, targetBlk := b.BranchData() + if len(args) > 0 { + panic(fmt.Sprintf( + "conditional branch shouldn't have args; likely a bug in critical edge splitting: from %s to %s", + exctx.CurrentSSABlk, + targetBlk, + )) + } + + target := exctx.GetOrAllocateSSABlockLabel(targetBlk) + cvalDef := m.compiler.ValueDefinition(cval) + + switch { + case m.compiler.MatchInstr(cvalDef, ssa.OpcodeIcmp): // This case, we can use the ALU flag set by SUBS instruction. + cvalInstr := cvalDef.Instr + x, y, c := cvalInstr.IcmpData() + cc, signed := condFlagFromSSAIntegerCmpCond(c), c.Signed() + if b.Opcode() == ssa.OpcodeBrz { + cc = cc.invert() + } + + if !m.tryLowerBandToFlag(x, y) { + m.lowerIcmpToFlag(x, y, signed) + } + cbr := m.allocateInstr() + cbr.asCondBr(cc.asCond(), target, false /* ignored */) + m.insert(cbr) + cvalDef.Instr.MarkLowered() + case m.compiler.MatchInstr(cvalDef, ssa.OpcodeFcmp): // This case we can use the Fpu flag directly. 
+ cvalInstr := cvalDef.Instr + x, y, c := cvalInstr.FcmpData() + cc := condFlagFromSSAFloatCmpCond(c) + if b.Opcode() == ssa.OpcodeBrz { + cc = cc.invert() + } + m.lowerFcmpToFlag(x, y) + cbr := m.allocateInstr() + cbr.asCondBr(cc.asCond(), target, false /* ignored */) + m.insert(cbr) + cvalDef.Instr.MarkLowered() + default: + rn := m.getOperand_NR(cvalDef, extModeNone) + var c cond + if b.Opcode() == ssa.OpcodeBrz { + c = registerAsRegZeroCond(rn.nr()) + } else { + c = registerAsRegNotZeroCond(rn.nr()) + } + cbr := m.allocateInstr() + cbr.asCondBr(c, target, false) + m.insert(cbr) + } +} + +func (m *machine) tryLowerBandToFlag(x, y ssa.Value) (ok bool) { + xx := m.compiler.ValueDefinition(x) + yy := m.compiler.ValueDefinition(y) + if xx.IsFromInstr() && xx.Instr.Constant() && xx.Instr.ConstantVal() == 0 { + if m.compiler.MatchInstr(yy, ssa.OpcodeBand) { + bandInstr := yy.Instr + m.lowerBitwiseAluOp(bandInstr, aluOpAnds, true) + ok = true + bandInstr.MarkLowered() + return + } + } + + if yy.IsFromInstr() && yy.Instr.Constant() && yy.Instr.ConstantVal() == 0 { + if m.compiler.MatchInstr(xx, ssa.OpcodeBand) { + bandInstr := xx.Instr + m.lowerBitwiseAluOp(bandInstr, aluOpAnds, true) + ok = true + bandInstr.MarkLowered() + return + } + } + return +} + +// LowerInstr implements backend.Machine. 
+func (m *machine) LowerInstr(instr *ssa.Instruction) { + if l := instr.SourceOffset(); l.Valid() { + info := m.allocateInstr().asEmitSourceOffsetInfo(l) + m.insert(info) + } + + switch op := instr.Opcode(); op { + case ssa.OpcodeBrz, ssa.OpcodeBrnz, ssa.OpcodeJump, ssa.OpcodeBrTable: + panic("BUG: branching instructions are handled by LowerBranches") + case ssa.OpcodeReturn: + panic("BUG: return must be handled by backend.Compiler") + case ssa.OpcodeIadd, ssa.OpcodeIsub: + m.lowerSubOrAdd(instr, op == ssa.OpcodeIadd) + case ssa.OpcodeFadd, ssa.OpcodeFsub, ssa.OpcodeFmul, ssa.OpcodeFdiv, ssa.OpcodeFmax, ssa.OpcodeFmin: + m.lowerFpuBinOp(instr) + case ssa.OpcodeIconst, ssa.OpcodeF32const, ssa.OpcodeF64const: // Constant instructions are inlined. + case ssa.OpcodeExitWithCode: + execCtx, code := instr.ExitWithCodeData() + m.lowerExitWithCode(m.compiler.VRegOf(execCtx), code) + case ssa.OpcodeExitIfTrueWithCode: + execCtx, c, code := instr.ExitIfTrueWithCodeData() + m.lowerExitIfTrueWithCode(m.compiler.VRegOf(execCtx), c, code) + case ssa.OpcodeStore, ssa.OpcodeIstore8, ssa.OpcodeIstore16, ssa.OpcodeIstore32: + m.lowerStore(instr) + case ssa.OpcodeLoad: + dst := instr.Return() + ptr, offset, typ := instr.LoadData() + m.lowerLoad(ptr, offset, typ, dst) + case ssa.OpcodeVZeroExtLoad: + dst := instr.Return() + ptr, offset, typ := instr.VZeroExtLoadData() + m.lowerLoad(ptr, offset, typ, dst) + case ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32, ssa.OpcodeSload8, ssa.OpcodeSload16, ssa.OpcodeSload32: + ptr, offset, _ := instr.LoadData() + ret := m.compiler.VRegOf(instr.Return()) + m.lowerExtLoad(op, ptr, offset, ret) + case ssa.OpcodeCall, ssa.OpcodeCallIndirect: + m.lowerCall(instr) + case ssa.OpcodeIcmp: + m.lowerIcmp(instr) + case ssa.OpcodeVIcmp: + m.lowerVIcmp(instr) + case ssa.OpcodeVFcmp: + m.lowerVFcmp(instr) + case ssa.OpcodeVCeil: + m.lowerVecMisc(vecOpFrintp, instr) + case ssa.OpcodeVFloor: + m.lowerVecMisc(vecOpFrintm, instr) + case ssa.OpcodeVTrunc: + 
m.lowerVecMisc(vecOpFrintz, instr) + case ssa.OpcodeVNearest: + m.lowerVecMisc(vecOpFrintn, instr) + case ssa.OpcodeVMaxPseudo: + m.lowerVMinMaxPseudo(instr, true) + case ssa.OpcodeVMinPseudo: + m.lowerVMinMaxPseudo(instr, false) + case ssa.OpcodeBand: + m.lowerBitwiseAluOp(instr, aluOpAnd, false) + case ssa.OpcodeBor: + m.lowerBitwiseAluOp(instr, aluOpOrr, false) + case ssa.OpcodeBxor: + m.lowerBitwiseAluOp(instr, aluOpEor, false) + case ssa.OpcodeIshl: + m.lowerShifts(instr, extModeNone, aluOpLsl) + case ssa.OpcodeSshr: + if instr.Return().Type().Bits() == 64 { + m.lowerShifts(instr, extModeSignExtend64, aluOpAsr) + } else { + m.lowerShifts(instr, extModeSignExtend32, aluOpAsr) + } + case ssa.OpcodeUshr: + if instr.Return().Type().Bits() == 64 { + m.lowerShifts(instr, extModeZeroExtend64, aluOpLsr) + } else { + m.lowerShifts(instr, extModeZeroExtend32, aluOpLsr) + } + case ssa.OpcodeRotl: + m.lowerRotl(instr) + case ssa.OpcodeRotr: + m.lowerRotr(instr) + case ssa.OpcodeSExtend, ssa.OpcodeUExtend: + from, to, signed := instr.ExtendData() + m.lowerExtend(instr.Arg(), instr.Return(), from, to, signed) + case ssa.OpcodeFcmp: + x, y, c := instr.FcmpData() + m.lowerFcmp(x, y, instr.Return(), c) + case ssa.OpcodeImul: + x, y := instr.Arg2() + result := instr.Return() + m.lowerImul(x, y, result) + case ssa.OpcodeUndefined: + undef := m.allocateInstr() + undef.asUDF() + m.insert(undef) + case ssa.OpcodeSelect: + c, x, y := instr.SelectData() + if x.Type() == ssa.TypeV128 { + rc := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.lowerSelectVec(rc, rn, rm, rd) + } else { + m.lowerSelect(c, x, y, instr.Return()) + } + case ssa.OpcodeClz: + x := instr.Arg() + result := instr.Return() + m.lowerClz(x, result) + case ssa.OpcodeCtz: + x := instr.Arg() + result := instr.Return() 
+ m.lowerCtz(x, result) + case ssa.OpcodePopcnt: + x := instr.Arg() + result := instr.Return() + m.lowerPopcnt(x, result) + case ssa.OpcodeFcvtToSint, ssa.OpcodeFcvtToSintSat: + x, ctx := instr.Arg2() + result := instr.Return() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(result)) + ctxVReg := m.compiler.VRegOf(ctx) + m.lowerFpuToInt(rd, rn, ctxVReg, true, x.Type() == ssa.TypeF64, + result.Type().Bits() == 64, op == ssa.OpcodeFcvtToSintSat) + case ssa.OpcodeFcvtToUint, ssa.OpcodeFcvtToUintSat: + x, ctx := instr.Arg2() + result := instr.Return() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(result)) + ctxVReg := m.compiler.VRegOf(ctx) + m.lowerFpuToInt(rd, rn, ctxVReg, false, x.Type() == ssa.TypeF64, + result.Type().Bits() == 64, op == ssa.OpcodeFcvtToUintSat) + case ssa.OpcodeFcvtFromSint: + x := instr.Arg() + result := instr.Return() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(result)) + m.lowerIntToFpu(rd, rn, true, x.Type() == ssa.TypeI64, result.Type().Bits() == 64) + case ssa.OpcodeFcvtFromUint: + x := instr.Arg() + result := instr.Return() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(result)) + m.lowerIntToFpu(rd, rn, false, x.Type() == ssa.TypeI64, result.Type().Bits() == 64) + case ssa.OpcodeFdemote: + v := instr.Arg() + rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + cnt := m.allocateInstr() + cnt.asFpuRR(fpuUniOpCvt64To32, rd, rn, false) + m.insert(cnt) + case ssa.OpcodeFpromote: + v := instr.Arg() + rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + cnt := m.allocateInstr() + cnt.asFpuRR(fpuUniOpCvt32To64, rd, rn, true) + m.insert(cnt) + case ssa.OpcodeIreduce: + rn := 
m.getOperand_NR(m.compiler.ValueDefinition(instr.Arg()), extModeNone) + retVal := instr.Return() + rd := m.compiler.VRegOf(retVal) + + if retVal.Type() != ssa.TypeI32 { + panic("TODO?: Ireduce to non-i32") + } + mov := m.allocateInstr() + mov.asMove32(rd, rn.reg()) + m.insert(mov) + case ssa.OpcodeFneg: + m.lowerFpuUniOp(fpuUniOpNeg, instr.Arg(), instr.Return()) + case ssa.OpcodeSqrt: + m.lowerFpuUniOp(fpuUniOpSqrt, instr.Arg(), instr.Return()) + case ssa.OpcodeCeil: + m.lowerFpuUniOp(fpuUniOpRoundPlus, instr.Arg(), instr.Return()) + case ssa.OpcodeFloor: + m.lowerFpuUniOp(fpuUniOpRoundMinus, instr.Arg(), instr.Return()) + case ssa.OpcodeTrunc: + m.lowerFpuUniOp(fpuUniOpRoundZero, instr.Arg(), instr.Return()) + case ssa.OpcodeNearest: + m.lowerFpuUniOp(fpuUniOpRoundNearest, instr.Arg(), instr.Return()) + case ssa.OpcodeFabs: + m.lowerFpuUniOp(fpuUniOpAbs, instr.Arg(), instr.Return()) + case ssa.OpcodeBitcast: + m.lowerBitcast(instr) + case ssa.OpcodeFcopysign: + x, y := instr.Arg2() + m.lowerFcopysign(x, y, instr.Return()) + case ssa.OpcodeSdiv, ssa.OpcodeUdiv: + x, y, ctx := instr.Arg3() + ctxVReg := m.compiler.VRegOf(ctx) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.lowerIDiv(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSdiv) + case ssa.OpcodeSrem, ssa.OpcodeUrem: + x, y, ctx := instr.Arg3() + ctxVReg := m.compiler.VRegOf(ctx) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.lowerIRem(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSrem) + case ssa.OpcodeVconst: + result := m.compiler.VRegOf(instr.Return()) + lo, hi := instr.VconstData() + v := m.allocateInstr() + v.asLoadFpuConst128(result, lo, hi) + m.insert(v) + case ssa.OpcodeVbnot: + x := 
instr.Arg() + ins := m.allocateInstr() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + ins.asVecMisc(vecOpNot, rd, rn, vecArrangement16B) + m.insert(ins) + case ssa.OpcodeVbxor: + x, y := instr.Arg2() + m.lowerVecRRR(vecOpEOR, x, y, instr.Return(), vecArrangement16B) + case ssa.OpcodeVbor: + x, y := instr.Arg2() + m.lowerVecRRR(vecOpOrr, x, y, instr.Return(), vecArrangement16B) + case ssa.OpcodeVband: + x, y := instr.Arg2() + m.lowerVecRRR(vecOpAnd, x, y, instr.Return(), vecArrangement16B) + case ssa.OpcodeVbandnot: + x, y := instr.Arg2() + m.lowerVecRRR(vecOpBic, x, y, instr.Return(), vecArrangement16B) + case ssa.OpcodeVbitselect: + c, x, y := instr.SelectData() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + creg := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone) + tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + // creg is overwritten by BSL, so we need to move it to the result register before the instruction + // in case when it is used somewhere else. 
+ mov := m.allocateInstr() + mov.asFpuMov128(tmp.nr(), creg.nr()) + m.insert(mov) + + ins := m.allocateInstr() + ins.asVecRRRRewrite(vecOpBsl, tmp, rn, rm, vecArrangement16B) + m.insert(ins) + + mov2 := m.allocateInstr() + rd := m.compiler.VRegOf(instr.Return()) + mov2.asFpuMov128(rd, tmp.nr()) + m.insert(mov2) + case ssa.OpcodeVanyTrue, ssa.OpcodeVallTrue: + x, lane := instr.ArgWithLane() + var arr vecArrangement + if op == ssa.OpcodeVallTrue { + arr = ssaLaneToArrangement(lane) + } + rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.lowerVcheckTrue(op, rm, rd, arr) + case ssa.OpcodeVhighBits: + x, lane := instr.ArgWithLane() + rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + arr := ssaLaneToArrangement(lane) + m.lowerVhighBits(rm, rd, arr) + case ssa.OpcodeVIadd: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpAdd, x, y, instr.Return(), arr) + case ssa.OpcodeExtIaddPairwise: + v, lane, signed := instr.ExtIaddPairwiseData() + vv := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) + + tmpLo, tmpHi := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)), operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + var widen vecOp + if signed { + widen = vecOpSshll + } else { + widen = vecOpUshll + } + + var loArr, hiArr, dstArr vecArrangement + switch lane { + case ssa.VecLaneI8x16: + loArr, hiArr, dstArr = vecArrangement8B, vecArrangement16B, vecArrangement8H + case ssa.VecLaneI16x8: + loArr, hiArr, dstArr = vecArrangement4H, vecArrangement8H, vecArrangement4S + case ssa.VecLaneI32x4: + loArr, hiArr, dstArr = vecArrangement2S, vecArrangement4S, vecArrangement2D + default: + panic("unsupported lane " + lane.String()) + } + + widenLo := m.allocateInstr().asVecShiftImm(widen, tmpLo, vv, operandShiftImm(0), loArr) + widenHi := m.allocateInstr().asVecShiftImm(widen, tmpHi, vv, 
operandShiftImm(0), hiArr) + addp := m.allocateInstr().asVecRRR(vecOpAddp, operandNR(m.compiler.VRegOf(instr.Return())), tmpLo, tmpHi, dstArr) + m.insert(widenLo) + m.insert(widenHi) + m.insert(addp) + + case ssa.OpcodeVSaddSat: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpSqadd, x, y, instr.Return(), arr) + case ssa.OpcodeVUaddSat: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpUqadd, x, y, instr.Return(), arr) + case ssa.OpcodeVIsub: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpSub, x, y, instr.Return(), arr) + case ssa.OpcodeVSsubSat: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpSqsub, x, y, instr.Return(), arr) + case ssa.OpcodeVUsubSat: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpUqsub, x, y, instr.Return(), arr) + case ssa.OpcodeVImin: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpSmin, x, y, instr.Return(), arr) + case ssa.OpcodeVUmin: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpUmin, x, y, instr.Return(), arr) + case ssa.OpcodeVImax: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpSmax, x, y, instr.Return(), arr) + case ssa.OpcodeVUmax: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpUmax, x, y, instr.Return(), arr) + case ssa.OpcodeVAvgRound: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpUrhadd, x, y, instr.Return(), arr) + case ssa.OpcodeVImul: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := 
operandNR(m.compiler.VRegOf(instr.Return())) + m.lowerVIMul(rd, rn, rm, arr) + case ssa.OpcodeVIabs: + m.lowerVecMisc(vecOpAbs, instr) + case ssa.OpcodeVIneg: + m.lowerVecMisc(vecOpNeg, instr) + case ssa.OpcodeVIpopcnt: + m.lowerVecMisc(vecOpCnt, instr) + case ssa.OpcodeVIshl, + ssa.OpcodeVSshr, ssa.OpcodeVUshr: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.lowerVShift(op, rd, rn, rm, arr) + case ssa.OpcodeVSqrt: + m.lowerVecMisc(vecOpFsqrt, instr) + case ssa.OpcodeVFabs: + m.lowerVecMisc(vecOpFabs, instr) + case ssa.OpcodeVFneg: + m.lowerVecMisc(vecOpFneg, instr) + case ssa.OpcodeVFmin: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpFmin, x, y, instr.Return(), arr) + case ssa.OpcodeVFmax: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpFmax, x, y, instr.Return(), arr) + case ssa.OpcodeVFadd: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpFadd, x, y, instr.Return(), arr) + case ssa.OpcodeVFsub: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpFsub, x, y, instr.Return(), arr) + case ssa.OpcodeVFmul: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpFmul, x, y, instr.Return(), arr) + case ssa.OpcodeSqmulRoundSat: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpSqrdmulh, x, y, instr.Return(), arr) + case ssa.OpcodeVFdiv: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpFdiv, x, y, instr.Return(), arr) + case ssa.OpcodeVFcvtToSintSat, ssa.OpcodeVFcvtToUintSat: + x, lane := instr.ArgWithLane() + arr := ssaLaneToArrangement(lane) + rn := 
m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.lowerVfpuToInt(rd, rn, arr, op == ssa.OpcodeVFcvtToSintSat) + case ssa.OpcodeVFcvtFromSint, ssa.OpcodeVFcvtFromUint: + x, lane := instr.ArgWithLane() + arr := ssaLaneToArrangement(lane) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.lowerVfpuFromInt(rd, rn, arr, op == ssa.OpcodeVFcvtFromSint) + case ssa.OpcodeSwidenLow, ssa.OpcodeUwidenLow: + x, lane := instr.ArgWithLane() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + + var arr vecArrangement + switch lane { + case ssa.VecLaneI8x16: + arr = vecArrangement8B + case ssa.VecLaneI16x8: + arr = vecArrangement4H + case ssa.VecLaneI32x4: + arr = vecArrangement2S + } + + shll := m.allocateInstr() + if signed := op == ssa.OpcodeSwidenLow; signed { + shll.asVecShiftImm(vecOpSshll, rd, rn, operandShiftImm(0), arr) + } else { + shll.asVecShiftImm(vecOpUshll, rd, rn, operandShiftImm(0), arr) + } + m.insert(shll) + case ssa.OpcodeSwidenHigh, ssa.OpcodeUwidenHigh: + x, lane := instr.ArgWithLane() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + + arr := ssaLaneToArrangement(lane) + + shll := m.allocateInstr() + if signed := op == ssa.OpcodeSwidenHigh; signed { + shll.asVecShiftImm(vecOpSshll, rd, rn, operandShiftImm(0), arr) + } else { + shll.asVecShiftImm(vecOpUshll, rd, rn, operandShiftImm(0), arr) + } + m.insert(shll) + + case ssa.OpcodeSnarrow, ssa.OpcodeUnarrow: + x, y, lane := instr.Arg2WithLane() + var arr, arr2 vecArrangement + switch lane { + case ssa.VecLaneI16x8: // I16x8 + arr = vecArrangement8B + arr2 = vecArrangement16B // Implies sqxtn2. + case ssa.VecLaneI32x4: + arr = vecArrangement4H + arr2 = vecArrangement8H // Implies sqxtn2. 
+ default: + panic("unsupported lane " + lane.String()) + } + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + + tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + loQxtn := m.allocateInstr() + hiQxtn := m.allocateInstr() + if signed := op == ssa.OpcodeSnarrow; signed { + // Narrow lanes on rn and write them into lower-half of rd. + loQxtn.asVecMisc(vecOpSqxtn, tmp, rn, arr) // low + // Narrow lanes on rm and write them into higher-half of rd. + hiQxtn.asVecMisc(vecOpSqxtn, tmp, rm, arr2) // high (sqxtn2) + } else { + // Narrow lanes on rn and write them into lower-half of rd. + loQxtn.asVecMisc(vecOpSqxtun, tmp, rn, arr) // low + // Narrow lanes on rm and write them into higher-half of rd. + hiQxtn.asVecMisc(vecOpSqxtun, tmp, rm, arr2) // high (sqxtn2) + } + m.insert(loQxtn) + m.insert(hiQxtn) + + mov := m.allocateInstr() + mov.asFpuMov128(rd.nr(), tmp.nr()) + m.insert(mov) + case ssa.OpcodeFvpromoteLow: + x, lane := instr.ArgWithLane() + if lane != ssa.VecLaneF32x4 { + panic("unsupported lane type " + lane.String()) + } + ins := m.allocateInstr() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + ins.asVecMisc(vecOpFcvtl, rd, rn, vecArrangement2S) + m.insert(ins) + case ssa.OpcodeFvdemote: + x, lane := instr.ArgWithLane() + if lane != ssa.VecLaneF64x2 { + panic("unsupported lane type " + lane.String()) + } + ins := m.allocateInstr() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + ins.asVecMisc(vecOpFcvtn, rd, rn, vecArrangement2S) + m.insert(ins) + case ssa.OpcodeExtractlane: + x, index, signed, lane := instr.ExtractlaneData() + + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + + mov := 
m.allocateInstr() + switch lane { + case ssa.VecLaneI8x16: + mov.asMovFromVec(rd, rn, vecArrangementB, vecIndex(index), signed) + case ssa.VecLaneI16x8: + mov.asMovFromVec(rd, rn, vecArrangementH, vecIndex(index), signed) + case ssa.VecLaneI32x4: + mov.asMovFromVec(rd, rn, vecArrangementS, vecIndex(index), signed) + case ssa.VecLaneI64x2: + mov.asMovFromVec(rd, rn, vecArrangementD, vecIndex(index), signed) + case ssa.VecLaneF32x4: + mov.asVecMovElement(rd, rn, vecArrangementS, vecIndex(0), vecIndex(index)) + case ssa.VecLaneF64x2: + mov.asVecMovElement(rd, rn, vecArrangementD, vecIndex(0), vecIndex(index)) + default: + panic("unsupported lane: " + lane.String()) + } + + m.insert(mov) + + case ssa.OpcodeInsertlane: + x, y, index, lane := instr.InsertlaneData() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + // Initially mov rn to tmp. + mov1 := m.allocateInstr() + mov1.asFpuMov128(tmpReg.nr(), rn.nr()) + m.insert(mov1) + + // movToVec and vecMovElement do not clear the remaining bits to zero, + // thus, we can mov rm in-place to tmp. + mov2 := m.allocateInstr() + switch lane { + case ssa.VecLaneI8x16: + mov2.asMovToVec(tmpReg, rm, vecArrangementB, vecIndex(index)) + case ssa.VecLaneI16x8: + mov2.asMovToVec(tmpReg, rm, vecArrangementH, vecIndex(index)) + case ssa.VecLaneI32x4: + mov2.asMovToVec(tmpReg, rm, vecArrangementS, vecIndex(index)) + case ssa.VecLaneI64x2: + mov2.asMovToVec(tmpReg, rm, vecArrangementD, vecIndex(index)) + case ssa.VecLaneF32x4: + mov2.asVecMovElement(tmpReg, rm, vecArrangementS, vecIndex(index), vecIndex(0)) + case ssa.VecLaneF64x2: + mov2.asVecMovElement(tmpReg, rm, vecArrangementD, vecIndex(index), vecIndex(0)) + } + m.insert(mov2) + + // Finally mov tmp to rd. 
+ mov3 := m.allocateInstr() + mov3.asFpuMov128(rd.nr(), tmpReg.nr()) + m.insert(mov3) + + case ssa.OpcodeSwizzle: + x, y, lane := instr.Arg2WithLane() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + + arr := ssaLaneToArrangement(lane) + + // tbl <rd>.<arr>, { <rn>.<arr> }, <rm>.<arr> + tbl1 := m.allocateInstr() + tbl1.asVecTbl(1, rd, rn, rm, arr) + m.insert(tbl1) + + case ssa.OpcodeShuffle: + x, y, lane1, lane2 := instr.ShuffleData() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + + m.lowerShuffle(rd, rn, rm, lane1, lane2) + + case ssa.OpcodeSplat: + x, lane := instr.ArgWithLane() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + + dup := m.allocateInstr() + switch lane { + case ssa.VecLaneI8x16: + dup.asVecDup(rd, rn, vecArrangement16B) + case ssa.VecLaneI16x8: + dup.asVecDup(rd, rn, vecArrangement8H) + case ssa.VecLaneI32x4: + dup.asVecDup(rd, rn, vecArrangement4S) + case ssa.VecLaneI64x2: + dup.asVecDup(rd, rn, vecArrangement2D) + case ssa.VecLaneF32x4: + dup.asVecDupElement(rd, rn, vecArrangementS, vecIndex(0)) + case ssa.VecLaneF64x2: + dup.asVecDupElement(rd, rn, vecArrangementD, vecIndex(0)) + } + m.insert(dup) + + case ssa.OpcodeWideningPairwiseDotProductS: + x, y := instr.Arg2() + xx, yy := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone), + m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + tmp, tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)), operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + m.insert(m.allocateInstr().asVecRRR(vecOpSmull, tmp, xx, yy, vecArrangement8H)) + m.insert(m.allocateInstr().asVecRRR(vecOpSmull2, tmp2, xx, yy, vecArrangement8H)) + 
m.insert(m.allocateInstr().asVecRRR(vecOpAddp, tmp, tmp, tmp2, vecArrangement4S)) + + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.insert(m.allocateInstr().asFpuMov128(rd.nr(), tmp.nr())) + + case ssa.OpcodeLoadSplat: + ptr, offset, lane := instr.LoadSplatData() + m.lowerLoadSplat(ptr, offset, lane, instr.Return()) + + case ssa.OpcodeAtomicRmw: + m.lowerAtomicRmw(instr) + + case ssa.OpcodeAtomicCas: + m.lowerAtomicCas(instr) + + case ssa.OpcodeAtomicLoad: + m.lowerAtomicLoad(instr) + + case ssa.OpcodeAtomicStore: + m.lowerAtomicStore(instr) + + case ssa.OpcodeFence: + instr := m.allocateInstr() + instr.asDMB() + m.insert(instr) + + default: + panic("TODO: lowering " + op.String()) + } + m.executableContext.FlushPendingInstructions() +} + +func (m *machine) lowerShuffle(rd, rn, rm operand, lane1, lane2 uint64) { + // `tbl2` requires 2 consecutive registers, so we arbitrarily pick v29, v30. + vReg, wReg := v29VReg, v30VReg + + // Initialize v29, v30 to rn, rm. + movv := m.allocateInstr() + movv.asFpuMov128(vReg, rn.nr()) + m.insert(movv) + + movw := m.allocateInstr() + movw.asFpuMov128(wReg, rm.nr()) + m.insert(movw) + + // `lane1`, `lane2` are already encoded as two u64s with the right layout: + // lane1 := lane[7]<<56 | ... | lane[1]<<8 | lane[0] + // lane2 := lane[15]<<56 | ... | lane[9]<<8 | lane[8] + // Thus, we can use loadFpuConst128. + tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + lfc := m.allocateInstr() + lfc.asLoadFpuConst128(tmp.nr(), lane1, lane2) + m.insert(lfc) + + // tbl <rd>.16b, { <vReg>.16B, <wReg>.16b }, <tmp>.16b + tbl2 := m.allocateInstr() + tbl2.asVecTbl(2, rd, operandNR(vReg), tmp, vecArrangement16B) + m.insert(tbl2) +} + +func (m *machine) lowerVShift(op ssa.Opcode, rd, rn, rm operand, arr vecArrangement) { + var modulo byte + switch arr { + case vecArrangement16B: + modulo = 0x7 // Modulo 8. + case vecArrangement8H: + modulo = 0xf // Modulo 16. + case vecArrangement4S: + modulo = 0x1f // Modulo 32. 
+ case vecArrangement2D: + modulo = 0x3f // Modulo 64. + default: + panic("unsupported arrangment " + arr.String()) + } + + rtmp := operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) + vtmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + and := m.allocateInstr() + and.asALUBitmaskImm(aluOpAnd, rtmp.nr(), rm.nr(), uint64(modulo), true) + m.insert(and) + + if op != ssa.OpcodeVIshl { + // Negate the amount to make this as right shift. + neg := m.allocateInstr() + neg.asALU(aluOpSub, rtmp, operandNR(xzrVReg), rtmp, true) + m.insert(neg) + } + + // Copy the shift amount into a vector register as sshl/ushl requires it to be there. + dup := m.allocateInstr() + dup.asVecDup(vtmp, rtmp, arr) + m.insert(dup) + + if op == ssa.OpcodeVIshl || op == ssa.OpcodeVSshr { + sshl := m.allocateInstr() + sshl.asVecRRR(vecOpSshl, rd, rn, vtmp, arr) + m.insert(sshl) + } else { + ushl := m.allocateInstr() + ushl.asVecRRR(vecOpUshl, rd, rn, vtmp, arr) + m.insert(ushl) + } +} + +func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangement) { + tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + // Special case VallTrue for i64x2. + if op == ssa.OpcodeVallTrue && arr == vecArrangement2D { + // cmeq v3?.2d, v2?.2d, #0 + // addp v3?.2d, v3?.2d, v3?.2d + // fcmp v3?, v3? + // cset dst, eq + + ins := m.allocateInstr() + ins.asVecMisc(vecOpCmeq0, tmp, rm, vecArrangement2D) + m.insert(ins) + + addp := m.allocateInstr() + addp.asVecRRR(vecOpAddp, tmp, tmp, tmp, vecArrangement2D) + m.insert(addp) + + fcmp := m.allocateInstr() + fcmp.asFpuCmp(tmp, tmp, true) + m.insert(fcmp) + + cset := m.allocateInstr() + cset.asCSet(rd.nr(), false, eq) + m.insert(cset) + + return + } + + // Create a scalar value with umaxp or uminv, then compare it against zero. 
+ ins := m.allocateInstr() + if op == ssa.OpcodeVanyTrue { + // umaxp v4?.16b, v2?.16b, v2?.16b + ins.asVecRRR(vecOpUmaxp, tmp, rm, rm, vecArrangement16B) + } else { + // uminv d4?, v2?.4s + ins.asVecLanes(vecOpUminv, tmp, rm, arr) + } + m.insert(ins) + + // mov x3?, v4?.d[0] + // ccmp x3?, #0x0, #0x0, al + // cset x3?, ne + // mov x0, x3? + + movv := m.allocateInstr() + movv.asMovFromVec(rd, tmp, vecArrangementD, vecIndex(0), false) + m.insert(movv) + + fc := m.allocateInstr() + fc.asCCmpImm(rd, uint64(0), al, 0, true) + m.insert(fc) + + cset := m.allocateInstr() + cset.asCSet(rd.nr(), false, ne) + m.insert(cset) +} + +func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) { + r0 := operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) + v0 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + v1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + switch arr { + case vecArrangement16B: + // sshr v6?.16b, v2?.16b, #7 + // movz x4?, #0x201, lsl 0 + // movk x4?, #0x804, lsl 16 + // movk x4?, #0x2010, lsl 32 + // movk x4?, #0x8040, lsl 48 + // dup v5?.2d, x4? + // and v6?.16b, v6?.16b, v5?.16b + // ext v5?.16b, v6?.16b, v6?.16b, #8 + // zip1 v5?.16b, v6?.16b, v5?.16b + // addv s5?, v5?.8h + // umov s3?, v5?.h[0] + + // Right arithmetic shift on the original vector and store the result into v1. So we have: + // v1[i] = 0xff if vi<0, 0 otherwise. + sshr := m.allocateInstr() + sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(7), vecArrangement16B) + m.insert(sshr) + + // Load the bit mask 0x8040201008040201 into r0, 16 bits at a time (one distinct bit per byte). + m.insertMOVZ(r0.nr(), 0x0201, 0, true) + m.insertMOVK(r0.nr(), 0x0804, 1, true) + m.insertMOVK(r0.nr(), 0x2010, 2, true) + m.insertMOVK(r0.nr(), 0x8040, 3, true) + + // dup r0 to v0. + dup := m.allocateInstr() + dup.asVecDup(v0, r0, vecArrangement2D) + m.insert(dup) + + // Lane-wise logical AND with the bit mask, meaning that we have + // v[i] = (1 << i) if vi<0, 0 otherwise.
+ // + // Below, we use the following notation: + // wi := (1 << i) if vi<0, 0 otherwise. + and := m.allocateInstr() + and.asVecRRR(vecOpAnd, v1, v1, v0, vecArrangement16B) + m.insert(and) + + // Swap the lower and higher 8 bytes of v1 and write the result into v0, meaning that we have + // v0[i] = w(i+8) if i < 8, w(i-8) otherwise. + ext := m.allocateInstr() + ext.asVecExtract(v0, v1, v1, vecArrangement16B, uint32(8)) + m.insert(ext) + + // v = [w0, w8, ..., w7, w15] + zip1 := m.allocateInstr() + zip1.asVecPermute(vecOpZip1, v0, v1, v0, vecArrangement16B) + m.insert(zip1) + + // v.h[0] = w0 + ... + w15 + addv := m.allocateInstr() + addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H) + m.insert(addv) + + // Extract the v.h[0] as the result. + movfv := m.allocateInstr() + movfv.asMovFromVec(rd, v0, vecArrangementH, vecIndex(0), false) + m.insert(movfv) + case vecArrangement8H: + // sshr v6?.8h, v2?.8h, #15 + // movz x4?, #0x1, lsl 0 + // movk x4?, #0x2, lsl 16 + // movk x4?, #0x4, lsl 32 + // movk x4?, #0x8, lsl 48 + // dup v5?.2d, x4? + // lsl x4?, x4?, 0x4 + // ins v5?.d[1], x4? + // and v5?.16b, v6?.16b, v5?.16b + // addv s5?, v5?.8h + // umov s3?, v5?.h[0] + + // Right arithmetic shift on the original vector and store the result into v1. So we have: + // v[i] = 0xffff if vi<0, 0 otherwise. + sshr := m.allocateInstr() + sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(15), vecArrangement8H) + m.insert(sshr) + + // Load the bit mask into r0. + m.lowerConstantI64(r0.nr(), 0x0008000400020001) + + // dup r0 to vector v0.
+ dup := m.allocateInstr() + dup.asVecDup(v0, r0, vecArrangement2D) + m.insert(dup) + + lsl := m.allocateInstr() + lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(4), true) + m.insert(lsl) + + movv := m.allocateInstr() + movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1)) + m.insert(movv) + + // Lane-wise logical AND with the bitmask, meaning that we have + // v[i] = (1 << i) if vi<0, 0 otherwise for i=0..3 (mask 0x0008000400020001) + // = (1 << i) if vi<0, 0 otherwise for i=4..7 (the same mask shifted left by 4) + and := m.allocateInstr() + and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B) + m.insert(and) + + // v.h[0] = sum of all lanes, i.e. the collected sign bits. + addv := m.allocateInstr() + addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H) + m.insert(addv) + + // Extract v.h[0] as the result. + movfv := m.allocateInstr() + movfv.asMovFromVec(rd, v0, vecArrangementH, vecIndex(0), false) + m.insert(movfv) + case vecArrangement4S: + // sshr v6?.4s, v2?.4s, #31 + // (load the constant 0x0000000200000001 into x4?) + // dup v5?.2d, x4? + // lsl x4?, x4?, 0x2 + // ins v5?.d[1], x4? + // and v5?.16b, v6?.16b, v5?.16b + // addv s5?, v5?.4s + // umov s3?, v5?.s[0] + + // Right arithmetic shift on the original vector and store the result into v1. So we have: + // v[i] = 0xffffffff if vi<0, 0 otherwise. + sshr := m.allocateInstr() + sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(31), vecArrangement4S) + m.insert(sshr) + + // Load the bit mask into r0. + m.lowerConstantI64(r0.nr(), 0x0000000200000001) + + // dup r0 to vector v0.
+ dup := m.allocateInstr() + dup.asVecDup(v0, r0, vecArrangement2D) + m.insert(dup) + + lsl := m.allocateInstr() + lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(2), true) + m.insert(lsl) + + movv := m.allocateInstr() + movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1)) + m.insert(movv) + + // Lane-wise logical AND with the bitmask, meaning that we have + // v[i] = (1 << i) if vi<0, 0 otherwise for i in [0, 1] (mask 0x0000000200000001) + // = (1 << i) if vi<0, 0 otherwise for i in [2, 3] (the same mask shifted left by 2) + and := m.allocateInstr() + and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B) + m.insert(and) + + // v.s[0] = sum of all lanes, i.e. the collected sign bits. + addv := m.allocateInstr() + addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement4S) + m.insert(addv) + + // Extract v.s[0] as the result. + movfv := m.allocateInstr() + movfv.asMovFromVec(rd, v0, vecArrangementS, vecIndex(0), false) + m.insert(movfv) + case vecArrangement2D: + // mov d3?, v2?.d[0] + // mov x4?, v2?.d[1] + // lsr x4?, x4?, 0x3f + // lsr d3?, d3?, 0x3f + // add s3?, s3?, w4?, lsl #1 + + // Move the lower 64-bit int into result. + movv0 := m.allocateInstr() + movv0.asMovFromVec(rd, rm, vecArrangementD, vecIndex(0), false) + m.insert(movv0) + + // Move the higher 64-bit int into r0. + movv1 := m.allocateInstr() + movv1.asMovFromVec(r0, rm, vecArrangementD, vecIndex(1), false) + m.insert(movv1) + + // Move the sign bit into the least significant bit.
+ lsr1 := m.allocateInstr() + lsr1.asALUShift(aluOpLsr, r0, r0, operandShiftImm(63), true) + m.insert(lsr1) + + lsr2 := m.allocateInstr() + lsr2.asALUShift(aluOpLsr, rd, rd, operandShiftImm(63), true) + m.insert(lsr2) + + // rd = (r0<<1) | rd + // This is emitted as an ADD: after the two shifts by 63 above, rd and r0 each hold a single bit, + // so (r0<<1) and rd never overlap and the addition equals the bitwise OR. + lsl := m.allocateInstr() + lsl.asALU(aluOpAdd, rd, rd, operandSR(r0.nr(), 1, shiftOpLSL), false) + m.insert(lsl) + default: + panic("Unsupported " + arr.String()) + } +} + +func (m *machine) lowerVecMisc(op vecOp, instr *ssa.Instruction) { + x, lane := instr.ArgWithLane() + arr := ssaLaneToArrangement(lane) + ins := m.allocateInstr() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + ins.asVecMisc(op, rd, rn, arr) + m.insert(ins) +} + +func (m *machine) lowerVecRRR(op vecOp, x, y, ret ssa.Value, arr vecArrangement) { + ins := m.allocateInstr() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(ret)) + ins.asVecRRR(op, rd, rn, rm, arr) + m.insert(ins) +} + +func (m *machine) lowerVIMul(rd, rn, rm operand, arr vecArrangement) { + if arr != vecArrangement2D { + mul := m.allocateInstr() + mul.asVecRRR(vecOpMul, rd, rn, rm, arr) + m.insert(mul) + } else { + tmp1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + tmp3 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + tmpRes := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + // Following the algorithm in https://chromium-review.googlesource.com/c/v8/v8/+/1781696 + rev64 := m.allocateInstr() + rev64.asVecMisc(vecOpRev64, tmp2, rm, vecArrangement4S) + m.insert(rev64) + + mul := m.allocateInstr() + mul.asVecRRR(vecOpMul, tmp2, tmp2, rn, vecArrangement4S) + m.insert(mul) + + xtn1 := m.allocateInstr() + xtn1.asVecMisc(vecOpXtn, tmp1, rn, vecArrangement2S) + m.insert(xtn1) + + addp := m.allocateInstr() +
addp.asVecRRR(vecOpAddp, tmp2, tmp2, tmp2, vecArrangement4S) + m.insert(addp) + + xtn2 := m.allocateInstr() + xtn2.asVecMisc(vecOpXtn, tmp3, rm, vecArrangement2S) + m.insert(xtn2) + + // Note: do not write the result directly into rd yet. This is for the same reason as in bsl. + // In short, in the UMLAL instruction the result register is also one of the source registers, and + // the value in the result register is significant. Hence tmpRes is used and copied to rd at the end. + shll := m.allocateInstr() + shll.asVecMisc(vecOpShll, tmpRes, tmp2, vecArrangement2S) + m.insert(shll) + + umlal := m.allocateInstr() + umlal.asVecRRRRewrite(vecOpUmlal, tmpRes, tmp3, tmp1, vecArrangement2S) + m.insert(umlal) + + mov := m.allocateInstr() + mov.asFpuMov128(rd.nr(), tmpRes.nr()) + m.insert(mov) + } +} + +func (m *machine) lowerVMinMaxPseudo(instr *ssa.Instruction, max bool) { + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + + // Note: this usage of tmp is important. + // BSL modifies the destination register, so we need to use a temporary register so that + // the actual definition of the destination register happens *after* the BSL instruction. + // That way, we can force the spill instruction to be inserted after the BSL instruction. + tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + fcmgt := m.allocateInstr() + if max { + fcmgt.asVecRRR(vecOpFcmgt, tmp, rm, rn, arr) + } else { + // If min, swap the args.
+ fcmgt.asVecRRR(vecOpFcmgt, tmp, rn, rm, arr) + } + m.insert(fcmgt) + + bsl := m.allocateInstr() + bsl.asVecRRRRewrite(vecOpBsl, tmp, rm, rn, vecArrangement16B) + m.insert(bsl) + + res := operandNR(m.compiler.VRegOf(instr.Return())) + mov2 := m.allocateInstr() + mov2.asFpuMov128(res.nr(), tmp.nr()) + m.insert(mov2) +} + +func (m *machine) lowerIRem(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) { + div := m.allocateInstr() + + if signed { + div.asALU(aluOpSDiv, rd, rn, rm, _64bit) + } else { + div.asALU(aluOpUDiv, rd, rn, rm, _64bit) + } + m.insert(div) + + // Check if rm is zero (note that this check is emitted after the division instruction): + m.exitIfNot(execCtxVReg, registerAsRegNotZeroCond(rm.nr()), _64bit, wazevoapi.ExitCodeIntegerDivisionByZero) + + // rd = rn-rd*rm by MSUB instruction. + msub := m.allocateInstr() + msub.asALURRRR(aluOpMSub, rd, rd, rm, rn, _64bit) + m.insert(msub) +} + +func (m *machine) lowerIDiv(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) { + div := m.allocateInstr() + + if signed { + div.asALU(aluOpSDiv, rd, rn, rm, _64bit) + } else { + div.asALU(aluOpUDiv, rd, rn, rm, _64bit) + } + m.insert(div) + + // Check if rm is zero (note that this check is emitted after the division instruction): + m.exitIfNot(execCtxVReg, registerAsRegNotZeroCond(rm.nr()), _64bit, wazevoapi.ExitCodeIntegerDivisionByZero) + + if signed { + // We need to check the signed overflow which happens iff "math.MinInt{32,64} / -1" + minusOneCheck := m.allocateInstr() + // Sets eq condition if rm == -1. + minusOneCheck.asALU(aluOpAddS, operandNR(xzrVReg), rm, operandImm12(1, 0), _64bit) + m.insert(minusOneCheck) + + ccmp := m.allocateInstr() + // If eq condition is set, sets the flag by the result based on "rn - 1", otherwise clears the flag. + ccmp.asCCmpImm(rn, 1, eq, 0, _64bit) + m.insert(ccmp) + + // Check the overflow flag. + m.exitIfNot(execCtxVReg, vs.invert().asCond(), false, wazevoapi.ExitCodeIntegerOverflow) + } +} + +// exitIfNot emits a conditional branch to exit if the condition is not met.
+// If `c` (cond type) is a register, `cond64bit` must be chosen to indicate whether the register is 32-bit or 64-bit. +// Otherwise, `cond64bit` is ignored. +func (m *machine) exitIfNot(execCtxVReg regalloc.VReg, c cond, cond64bit bool, code wazevoapi.ExitCode) { + execCtxTmp := m.copyToTmp(execCtxVReg) + + // The branch is inserted first and only configured at the end, once the label after the exit sequence exists. + cbr := m.allocateInstr() + m.insert(cbr) + m.lowerExitWithCode(execCtxTmp, code) + // Conditional branch target is after exit. + l := m.insertBrTargetLabel() + cbr.asCondBr(c, l, cond64bit) +} + +func (m *machine) lowerFcopysign(x, y, ret ssa.Value) { + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + var tmpI, tmpF operand + _64 := x.Type() == ssa.TypeF64 + if _64 { + tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) + tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) + } else { + tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF32)) + tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI32)) + } + rd := m.compiler.VRegOf(ret) + m.lowerFcopysignImpl(operandNR(rd), rn, rm, tmpI, tmpF, _64) +} + +func (m *machine) lowerFcopysignImpl(rd, rn, rm, tmpI, tmpF operand, _64bit bool) { + // This is exactly the same code emitted by GCC for "__builtin_copysign": + // + // mov x0, -9223372036854775808 + // fmov d2, x0 + // vbit v0.8b, v1.8b, v2.8b + // + + // tmpI receives the sign-bit constant, which is then moved into lane 0 of tmpF as the bit-select mask. + setMSB := m.allocateInstr() + if _64bit { + m.lowerConstantI64(tmpI.nr(), math.MinInt64) + setMSB.asMovToVec(tmpF, tmpI, vecArrangementD, vecIndex(0)) + } else { + m.lowerConstantI32(tmpI.nr(), math.MinInt32) + setMSB.asMovToVec(tmpF, tmpI, vecArrangementS, vecIndex(0)) + } + m.insert(setMSB) + + // vbit is emitted via asVecRRRRewrite, so it operates on tmpReg (a copy of rn) + // rather than on rn or rd directly; the result is copied to rd afterwards. + tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) + + mov := m.allocateInstr() + mov.asFpuMov64(tmpReg.nr(), rn.nr()) + m.insert(mov) + + vbit := m.allocateInstr() + vbit.asVecRRRRewrite(vecOpBit, tmpReg, rm, tmpF, vecArrangement8B) + m.insert(vbit) + + movDst := m.allocateInstr() + movDst.asFpuMov64(rd.nr(), tmpReg.nr()) +
m.insert(movDst) +} + +func (m *machine) lowerBitcast(instr *ssa.Instruction) { + v, dstType := instr.BitcastData() + srcType := v.Type() + rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + srcInt := srcType.IsInt() + dstInt := dstType.IsInt() + switch { + case srcInt && !dstInt: // Int to Float: + mov := m.allocateInstr() + var arr vecArrangement + if srcType.Bits() == 64 { + arr = vecArrangementD + } else { + arr = vecArrangementS + } + mov.asMovToVec(rd, rn, arr, vecIndex(0)) + m.insert(mov) + case !srcInt && dstInt: // Float to Int: + mov := m.allocateInstr() + var arr vecArrangement + if dstType.Bits() == 64 { + arr = vecArrangementD + } else { + arr = vecArrangementS + } + mov.asMovFromVec(rd, rn, arr, vecIndex(0), false) + m.insert(mov) + default: + // Int-to-int and float-to-float bitcasts are not handled here. + panic("TODO?BUG?") + } +} + +func (m *machine) lowerFpuUniOp(op fpuUniOp, in, out ssa.Value) { + rn := m.getOperand_NR(m.compiler.ValueDefinition(in), extModeNone) + rd := operandNR(m.compiler.VRegOf(out)) + + // NOTE: despite its name, `neg` carries whichever unary op the caller passed in `op`. + neg := m.allocateInstr() + neg.asFpuRR(op, rd, rn, in.Type().Bits() == 64) + m.insert(neg) +} + +func (m *machine) lowerFpuToInt(rd, rn operand, ctx regalloc.VReg, signed, src64bit, dst64bit, nonTrapping bool) { + if !nonTrapping { + // First of all, we have to clear the FPU flags. + flagClear := m.allocateInstr() + flagClear.asMovToFPSR(xzrVReg) + m.insert(flagClear) + } + + // Then, do the conversion which doesn't trap inherently. + cvt := m.allocateInstr() + cvt.asFpuToInt(rd, rn, signed, src64bit, dst64bit) + m.insert(cvt) + + if !nonTrapping { + tmpReg := m.compiler.AllocateVReg(ssa.TypeI64) + + // After the conversion, check the FPU flags. + getFlag := m.allocateInstr() + getFlag.asMovFromFPSR(tmpReg) + m.insert(getFlag) + + execCtx := m.copyToTmp(ctx) + _rn := operandNR(m.copyToTmp(rn.nr())) + + // Check if the conversion was undefined by comparing the status with 1.
+ // See https://developer.arm.com/documentation/ddi0595/2020-12/AArch64-Registers/FPSR--Floating-point-Status-Register + alu := m.allocateInstr() + alu.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpReg), operandImm12(1, 0), true) + m.insert(alu) + + // If it is not undefined, we can return the result. + // The branch is inserted here and configured at the end, once its target label exists. + ok := m.allocateInstr() + m.insert(ok) + + // Otherwise, we have to choose the exit code depending on whether it is an overflow or a NaN conversion. + + // Comparing itself to check if it is a NaN. + fpuCmp := m.allocateInstr() + fpuCmp.asFpuCmp(_rn, _rn, src64bit) + m.insert(fpuCmp) + // If the VC flag is not set (== VS flag is set), it is a NaN. + m.exitIfNot(execCtx, vc.asCond(), false, wazevoapi.ExitCodeInvalidConversionToInteger) + // Otherwise, it is an overflow. + m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) + + // Conditional branch target is after exit. + l := m.insertBrTargetLabel() + ok.asCondBr(ne.asCond(), l, false /* ignored */) + } +} + +func (m *machine) lowerIntToFpu(rd, rn operand, signed, src64bit, dst64bit bool) { + cvt := m.allocateInstr() + cvt.asIntToFpu(rd, rn, signed, src64bit, dst64bit) + m.insert(cvt) +} + +func (m *machine) lowerFpuBinOp(si *ssa.Instruction) { + instr := m.allocateInstr() + // NOTE(review): the switch below has no default case, so any other opcode leaves op at its zero value. + var op fpuBinOp + switch si.Opcode() { + case ssa.OpcodeFadd: + op = fpuBinOpAdd + case ssa.OpcodeFsub: + op = fpuBinOpSub + case ssa.OpcodeFmul: + op = fpuBinOpMul + case ssa.OpcodeFdiv: + op = fpuBinOpDiv + case ssa.OpcodeFmax: + op = fpuBinOpMax + case ssa.OpcodeFmin: + op = fpuBinOpMin + } + x, y := si.Arg2() + xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) + rn := m.getOperand_NR(xDef, extModeNone) + rm := m.getOperand_NR(yDef, extModeNone) + rd := operandNR(m.compiler.VRegOf(si.Return())) + instr.asFpuRRR(op, rd, rn, rm, x.Type().Bits() == 64) + m.insert(instr) +} + +func (m *machine) lowerSubOrAdd(si *ssa.Instruction, add bool) { + x, y := si.Arg2() + if !x.Type().IsInt() { + panic("BUG?") + } + + xDef, yDef
:= m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) + rn := m.getOperand_NR(xDef, extModeNone) + rm, yNegated := m.getOperand_MaybeNegatedImm12_ER_SR_NR(yDef, extModeNone) + + var aop aluOp + switch { + case add && !yNegated: // rn+rm = x+y + aop = aluOpAdd + case add && yNegated: // rn-rm = x-(-y) = x+y + aop = aluOpSub + case !add && !yNegated: // rn-rm = x-y + aop = aluOpSub + case !add && yNegated: // rn+rm = x+(-y) = x-y + aop = aluOpAdd + } + rd := operandNR(m.compiler.VRegOf(si.Return())) + alu := m.allocateInstr() + alu.asALU(aop, rd, rn, rm, x.Type().Bits() == 64) + m.insert(alu) +} + +// InsertMove implements backend.Machine. +func (m *machine) InsertMove(dst, src regalloc.VReg, typ ssa.Type) { + instr := m.allocateInstr() + switch typ { + case ssa.TypeI32, ssa.TypeI64: + // Both 32-bit and 64-bit integers are copied with a 64-bit move. + instr.asMove64(dst, src) + case ssa.TypeF32, ssa.TypeF64: + // Both 32-bit and 64-bit floats are copied with a 64-bit FPU move. + instr.asFpuMov64(dst, src) + case ssa.TypeV128: + instr.asFpuMov128(dst, src) + default: + panic("TODO") + } + m.insert(instr) +} + +func (m *machine) lowerIcmp(si *ssa.Instruction) { + x, y, c := si.IcmpData() + flag := condFlagFromSSAIntegerCmpCond(c) + + // Pick the extension mode from the operand width and the signedness of the comparison. + in64bit := x.Type().Bits() == 64 + var ext extMode + if in64bit { + if c.Signed() { + ext = extModeSignExtend64 + } else { + ext = extModeZeroExtend64 + } + } else { + if c.Signed() { + ext = extModeSignExtend32 + } else { + ext = extModeZeroExtend32 + } + } + + // subs zr, rn, rm sets the flags; cset then materializes the condition into the result register. + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext) + rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), ext) + alu := m.allocateInstr() + alu.asALU(aluOpSubS, operandNR(xzrVReg), rn, rm, in64bit) + m.insert(alu) + + cset := m.allocateInstr() + cset.asCSet(m.compiler.VRegOf(si.Return()), false, flag) + m.insert(cset) +} + +func (m *machine) lowerVIcmp(si *ssa.Instruction) { + x, y, c, lane := si.VIcmpData() + flag := condFlagFromSSAIntegerCmpCond(c) + arr := ssaLaneToArrangement(lane) + + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm :=
m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(si.Return())) + + switch flag { + case eq: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr) + m.insert(cmp) + case ne: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr) + m.insert(cmp) + not := m.allocateInstr() + not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B) + m.insert(not) + case ge: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmge, rd, rn, rm, arr) + m.insert(cmp) + case gt: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmgt, rd, rn, rm, arr) + m.insert(cmp) + case le: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmge, rd, rm, rn, arr) // rm, rn are swapped: x <= y is computed as y >= x. + m.insert(cmp) + case lt: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmgt, rd, rm, rn, arr) // rm, rn are swapped: x < y is computed as y > x. + m.insert(cmp) + case hs: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmhs, rd, rn, rm, arr) + m.insert(cmp) + case hi: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmhi, rd, rn, rm, arr) + m.insert(cmp) + case ls: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmhs, rd, rm, rn, arr) // rm, rn are swapped: cmhs with reversed operands. + m.insert(cmp) + case lo: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmhi, rd, rm, rn, arr) // rm, rn are swapped: cmhi with reversed operands. + m.insert(cmp) + } +} + +func (m *machine) lowerVFcmp(si *ssa.Instruction) { + x, y, c, lane := si.VFcmpData() + flag := condFlagFromSSAFloatCmpCond(c) + arr := ssaLaneToArrangement(lane) + + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(si.Return())) + + switch flag { + case eq: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr) + m.insert(cmp) + case ne: + // "not equal" is computed as fcmeq followed by a bitwise NOT of the whole vector. + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr) + m.insert(cmp) + not := m.allocateInstr() + not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B) + m.insert(not) + case ge: + cmp := m.allocateInstr() +
cmp.asVecRRR(vecOpFcmge, rd, rn, rm, arr) + m.insert(cmp) + case gt: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpFcmgt, rd, rn, rm, arr) + m.insert(cmp) + case mi: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpFcmgt, rd, rm, rn, arr) // rm, rn are swapped: fcmgt with reversed operands. + m.insert(cmp) + case ls: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpFcmge, rd, rm, rn, arr) // rm, rn are swapped: fcmge with reversed operands. + m.insert(cmp) + } +} + +func (m *machine) lowerVfpuToInt(rd, rn operand, arr vecArrangement, signed bool) { + cvt := m.allocateInstr() + if signed { + cvt.asVecMisc(vecOpFcvtzs, rd, rn, arr) + } else { + cvt.asVecMisc(vecOpFcvtzu, rd, rn, arr) + } + m.insert(cvt) + + // For the 2D arrangement, the converted lanes are additionally narrowed in place to 2S with sqxtn/uqxtn. + if arr == vecArrangement2D { + narrow := m.allocateInstr() + if signed { + narrow.asVecMisc(vecOpSqxtn, rd, rd, vecArrangement2S) + } else { + narrow.asVecMisc(vecOpUqxtn, rd, rd, vecArrangement2S) + } + m.insert(narrow) + } +} + +func (m *machine) lowerVfpuFromInt(rd, rn operand, arr vecArrangement, signed bool) { + cvt := m.allocateInstr() + if signed { + cvt.asVecMisc(vecOpScvtf, rd, rn, arr) + } else { + cvt.asVecMisc(vecOpUcvtf, rd, rn, arr) + } + m.insert(cvt) +} + +func (m *machine) lowerShifts(si *ssa.Instruction, ext extMode, aluOp aluOp) { + x, amount := si.Arg2() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext) + rm := m.getOperand_ShiftImm_NR(m.compiler.ValueDefinition(amount), ext, x.Type().Bits()) + rd := operandNR(m.compiler.VRegOf(si.Return())) + + alu := m.allocateInstr() + alu.asALUShift(aluOp, rd, rn, rm, x.Type().Bits() == 64) + m.insert(alu) +} + +func (m *machine) lowerBitwiseAluOp(si *ssa.Instruction, op aluOp, ignoreResult bool) { + x, y := si.Arg2() + + xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) + rn := m.getOperand_NR(xDef, extModeNone) + + // When ignoreResult is set, the destination is the zero register instead of the instruction's return value. + var rd operand + if ignoreResult { + rd = operandNR(xzrVReg) + } else { + rd = operandNR(m.compiler.VRegOf(si.Return())) + } + + _64 := x.Type().Bits() == 64 + alu := m.allocateInstr() + if instr := yDef.Instr; instr != nil
&& instr.Constant() { + c := instr.ConstantVal() + if isBitMaskImmediate(c, _64) { + // A bitwise operation against a constant that is encodable as a bitmask immediate + // can be lowered to a single instruction. + alu.asALUBitmaskImm(op, rd.nr(), rn.nr(), c, _64) + m.insert(alu) + return + } + } + + // Otherwise, fall back to the register (optionally shifted-register) form. + rm := m.getOperand_SR_NR(yDef, extModeNone) + alu.asALU(op, rd, rn, rm, _64) + m.insert(alu) +} + +func (m *machine) lowerRotl(si *ssa.Instruction) { + x, y := si.Arg2() + r := si.Return() + _64 := r.Type().Bits() == 64 + + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + // tmp holds the negated rotation amount and has the same width as the result. + var tmp operand + if _64 { + tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) + } else { + tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI32)) + } + rd := operandNR(m.compiler.VRegOf(r)) + + // Encode rotl as neg + rotr: neg is a sub against the zero-reg. + m.lowerRotlImpl(rd, rn, rm, tmp, _64) +} + +func (m *machine) lowerRotlImpl(rd, rn, rm, tmp operand, is64bit bool) { + // Encode rotl as neg + rotr: neg is a sub against the zero-reg.
+ neg := m.allocateInstr() + neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), rm, is64bit) + m.insert(neg) + alu := m.allocateInstr() + alu.asALU(aluOpRotR, rd, rn, tmp, is64bit) + m.insert(alu) +} + +func (m *machine) lowerRotr(si *ssa.Instruction) { + x, y := si.Arg2() + + xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) + rn := m.getOperand_NR(xDef, extModeNone) + rm := m.getOperand_NR(yDef, extModeNone) + rd := operandNR(m.compiler.VRegOf(si.Return())) + + alu := m.allocateInstr() + alu.asALU(aluOpRotR, rd, rn, rm, si.Return().Type().Bits() == 64) + m.insert(alu) +} + +func (m *machine) lowerExtend(arg, ret ssa.Value, from, to byte, signed bool) { + rd := m.compiler.VRegOf(ret) + def := m.compiler.ValueDefinition(arg) + + if instr := def.Instr; !signed && from == 32 && instr != nil { + // We can optimize out the unsigned extend because: + // Writes to the W register set bits [63:32] of the X register to zero + // https://developer.arm.com/documentation/den0024/a/An-Introduction-to-the-ARMv8-Instruction-Sets/The-ARMv8-instruction-sets/Distinguishing-between-32-bit-and-64-bit-A64-instructions + switch instr.Opcode() { + case + ssa.OpcodeIadd, ssa.OpcodeIsub, ssa.OpcodeLoad, + ssa.OpcodeBand, ssa.OpcodeBor, ssa.OpcodeBnot, + ssa.OpcodeIshl, ssa.OpcodeUshr, ssa.OpcodeSshr, + ssa.OpcodeRotl, ssa.OpcodeRotr, + ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32: + // So, if the argument is the result of a 32-bit operation, we can just copy the register. + // It is highly likely that this copy will be optimized out after register allocation. + rn := m.compiler.VRegOf(arg) + mov := m.allocateInstr() + // Note: do not use move32 here, as it would be lowered to a 32-bit move, which is not a plain copy (that is actually the impl of UExtend).
+ mov.asMove64(rd, rn) + m.insert(mov) + return + default: + } + } + rn := m.getOperand_NR(def, extModeNone) + + ext := m.allocateInstr() + ext.asExtend(rd, rn.nr(), from, to, signed) + m.insert(ext) +} + +func (m *machine) lowerFcmp(x, y, result ssa.Value, c ssa.FloatCmpCond) { + rn, rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone), m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + + fc := m.allocateInstr() + fc.asFpuCmp(rn, rm, x.Type().Bits() == 64) + m.insert(fc) + + cset := m.allocateInstr() + cset.asCSet(m.compiler.VRegOf(result), false, condFlagFromSSAFloatCmpCond(c)) + m.insert(cset) +} + +func (m *machine) lowerImul(x, y, result ssa.Value) { + rd := m.compiler.VRegOf(result) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + + // TODO: if this comes before Add/Sub, we could merge it by putting it into the place of xzrVReg. + + // The multiplication is emitted as MADD with the zero register as the addend. + mul := m.allocateInstr() + mul.asALURRRR(aluOpMAdd, operandNR(rd), rn, rm, operandNR(xzrVReg), x.Type().Bits() == 64) + m.insert(mul) +} + +func (m *machine) lowerClz(x, result ssa.Value) { + rd := m.compiler.VRegOf(result) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + clz := m.allocateInstr() + clz.asBitRR(bitOpClz, rd, rn.nr(), x.Type().Bits() == 64) + m.insert(clz) +} + +func (m *machine) lowerCtz(x, result ssa.Value) { + rd := m.compiler.VRegOf(result) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rbit := m.allocateInstr() + _64 := x.Type().Bits() == 64 + var tmpReg regalloc.VReg + if _64 { + tmpReg = m.compiler.AllocateVReg(ssa.TypeI64) + } else { + tmpReg = m.compiler.AllocateVReg(ssa.TypeI32) + } + // ctz(x) is computed as clz(rbit(x)): reverse the bits into tmpReg, then count the leading zeros. + rbit.asBitRR(bitOpRbit, tmpReg, rn.nr(), _64) + m.insert(rbit) + + clz := m.allocateInstr() + clz.asBitRR(bitOpClz, rd, tmpReg, _64) + m.insert(clz) +} + +func (m *machine) lowerPopcnt(x, result ssa.Value) { + // arm64 doesn't have an instruction for population count on
scalar register, + // so we use the vector instruction `cnt`. + // This is exactly how the official Go compiler implements bits.OnesCount. + // For example, "func () int { return bits.OnesCount(10) }" is compiled as + // + // MOVD $10, R0 ;; Load 10. + // FMOVD R0, F0 + // VCNT V0.B8, V0.B8 + // UADDLV V0.B8, V0 + // + // In aarch64 asm, FMOVD is encoded as `ins`, VCNT is `cnt`, + // and the registers may use different names. In our encoding we use the following + // instructions: + // + // ins v0.d[0], x0 ;; mov from GPR to vec (FMOV above) is encoded as INS + // cnt v0.16b, v0.16b ;; we use vec arrangement 16b + // uaddlv h0, v0.8b ;; h0 is still v0 with the dest width specifier 'H', implied when src arrangement is 8b + // mov x5, v0.d[0] ;; finally we mov the result back to a GPR + // + + rd := operandNR(m.compiler.VRegOf(result)) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + + // Each step below writes into a fresh virtual register (rf1, rf2, rf3). + rf1 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) + ins := m.allocateInstr() + ins.asMovToVec(rf1, rn, vecArrangementD, vecIndex(0)) + m.insert(ins) + + rf2 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) + cnt := m.allocateInstr() + cnt.asVecMisc(vecOpCnt, rf2, rf1, vecArrangement16B) + m.insert(cnt) + + rf3 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) + uaddlv := m.allocateInstr() + uaddlv.asVecLanes(vecOpUaddlv, rf3, rf2, vecArrangement8B) + m.insert(uaddlv) + + mov := m.allocateInstr() + mov.asMovFromVec(rd, rf3, vecArrangementD, vecIndex(0), false) + m.insert(mov) +} + +// lowerExitWithCode lowers the exit sequence with the given exit code. It takes the execution context pointer as argument.
+func (m *machine) lowerExitWithCode(execCtxVReg regalloc.VReg, code wazevoapi.ExitCode) { + tmpReg1 := m.compiler.AllocateVReg(ssa.TypeI32) + loadExitCodeConst := m.allocateInstr() + loadExitCodeConst.asMOVZ(tmpReg1, uint64(code), 0, true) + + setExitCode := m.allocateInstr() + setExitCode.asStore(operandNR(tmpReg1), + addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(), + }, 32) + + // In order to unwind the stack, we also need to store the current stack pointer into the execution context: + tmp2 := m.compiler.AllocateVReg(ssa.TypeI64) + movSpToTmp := m.allocateInstr() + movSpToTmp.asMove64(tmp2, spVReg) + strSpToExecCtx := m.allocateInstr() + strSpToExecCtx.asStore(operandNR(tmp2), + addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(), + }, 64) + // Also the address of this exit. + tmp3 := m.compiler.AllocateVReg(ssa.TypeI64) + currentAddrToTmp := m.allocateInstr() + currentAddrToTmp.asAdr(tmp3, 0) + storeCurrentAddrToExecCtx := m.allocateInstr() + storeCurrentAddrToExecCtx.asStore(operandNR(tmp3), + addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(), + }, 64) + + exitSeq := m.allocateInstr() + exitSeq.asExitSequence(execCtxVReg) + + // All instructions were prepared above; they are inserted here in their execution order. + m.insert(loadExitCodeConst) + m.insert(setExitCode) + m.insert(movSpToTmp) + m.insert(strSpToExecCtx) + m.insert(currentAddrToTmp) + m.insert(storeCurrentAddrToExecCtx) + m.insert(exitSeq) +} + +func (m *machine) lowerIcmpToFlag(x, y ssa.Value, signed bool) { + if x.Type() != y.Type() { + panic( + fmt.Sprintf("TODO(maybe): support icmp with different types: v%d=%s != v%d=%s", + x.ID(), x.Type(), y.ID(), y.Type())) + } + + extMod := extModeOf(x.Type(), signed) + + // First operand must be in pure register form.
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extMod) + // Second operand can be in any of Imm12, ER, SR, or NR form supported by the SUBS instructions. + rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), extMod) + + alu := m.allocateInstr() + // subs zr, rn, rm + alu.asALU( + aluOpSubS, + // We don't need the result, just need to set flags. + operandNR(xzrVReg), + rn, + rm, + x.Type().Bits() == 64, + ) + m.insert(alu) +} + +func (m *machine) lowerFcmpToFlag(x, y ssa.Value) { + if x.Type() != y.Type() { + // NOTE(review): the message below says icmp, although this is the fcmp path. + panic("TODO(maybe): support icmp with different types") + } + + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + cmp := m.allocateInstr() + cmp.asFpuCmp(rn, rm, x.Type().Bits() == 64) + m.insert(cmp) +} + +func (m *machine) lowerExitIfTrueWithCode(execCtxVReg regalloc.VReg, cond ssa.Value, code wazevoapi.ExitCode) { + condDef := m.compiler.ValueDefinition(cond) + if !m.compiler.MatchInstr(condDef, ssa.OpcodeIcmp) { + panic("TODO: OpcodeExitIfTrueWithCode must come after Icmp at the moment: " + condDef.Instr.Opcode().String()) + } + condDef.Instr.MarkLowered() + + cvalInstr := condDef.Instr + x, y, c := cvalInstr.IcmpData() + signed := c.Signed() + + // Set the flags via tryLowerBandToFlag if it succeeds; otherwise fall back to a plain compare. + if !m.tryLowerBandToFlag(x, y) { + m.lowerIcmpToFlag(x, y, signed) + } + + // We need to copy the execution context to a temp register, because if it's spilled, + // it might end up being reloaded inside the exiting branch. + execCtxTmp := m.copyToTmp(execCtxVReg) + + // We have to skip the entire exit sequence if the condition is false. + cbr := m.allocateInstr() + m.insert(cbr) + m.lowerExitWithCode(execCtxTmp, code) + // conditional branch target is after exit.
// lowerSelectVec lowers a select whose condition `rc` is a scalar and whose candidates `rn` and `rm` are vectors,
// writing the chosen vector into `rd`.
//
// The scalar condition is turned into a mask held in a temporary vector register, and the choice is then made
// with a single bsl instruction.
// NOTE(review): presumably `rn` is chosen when `rc` is non-zero and `rm` otherwise — confirm against the operand
// order expected by asVecRRRRewrite for vecOpBsl.
func (m *machine) lowerSelectVec(rc, rn, rm, rd operand) {
	// First check if `rc` is zero or not: subs zr, rc, zr only sets the flags.
	// The trailing `false` is the argument that is `Bits() == 64` at other asALU call sites in this file.
	checkZero := m.allocateInstr()
	checkZero.asALU(aluOpSubS, operandNR(xzrVReg), rc, operandNR(xzrVReg), false)
	m.insert(checkZero)

	// Then use CSETM to set all bits to one if `rc` is non-zero (the `ne` condition), or to zero otherwise.
	allOnesOrZero := m.compiler.AllocateVReg(ssa.TypeI64)
	cset := m.allocateInstr()
	cset.asCSet(allOnesOrZero, true, ne)
	m.insert(cset)

	// Then duplicate the mask into a temporary vector register.
	tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
	dup := m.allocateInstr()
	dup.asVecDup(tmp2, operandNR(allOnesOrZero), vecArrangement2D)
	m.insert(dup)

	// Now that `tmp2` has either all bits one or zero depending on `rc`,
	// we can use bsl to select between `rn` and `rm`. `tmp2` is both the mask and the result of this instruction.
	ins := m.allocateInstr()
	ins.asVecRRRRewrite(vecOpBsl, tmp2, rn, rm, vecArrangement16B)
	m.insert(ins)

	// Finally, move the result to the destination register.
	mov2 := m.allocateInstr()
	mov2.asFpuMov128(rd.nr(), tmp2.nr())
	m.insert(mov2)
}
m.lowerAtomicRmwImpl(op, rn, rs, rt, tmp, size, negateArg, flipArg, _64) +} + +func (m *machine) lowerAtomicRmwImpl(op atomicRmwOp, rn, rs, rt, tmp operand, size uint64, negateArg, flipArg, dst64bit bool) { + switch { + case negateArg: + neg := m.allocateInstr() + neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), rs, dst64bit) + m.insert(neg) + case flipArg: + flip := m.allocateInstr() + flip.asALU(aluOpOrn, tmp, operandNR(xzrVReg), rs, dst64bit) + m.insert(flip) + default: + tmp = rs + } + + rmw := m.allocateInstr() + rmw.asAtomicRmw(op, rn, tmp, rt, size) + m.insert(rmw) +} + +func (m *machine) lowerAtomicCas(si *ssa.Instruction) { + addr, exp, repl := si.Arg3() + size := si.AtomicTargetSize() + + addrDef, expDef, replDef := m.compiler.ValueDefinition(addr), m.compiler.ValueDefinition(exp), m.compiler.ValueDefinition(repl) + rn := m.getOperand_NR(addrDef, extModeNone) + rt := m.getOperand_NR(replDef, extModeNone) + rs := m.getOperand_NR(expDef, extModeNone) + tmp := operandNR(m.compiler.AllocateVReg(si.Return().Type())) + + _64 := si.Return().Type().Bits() == 64 + // rs is overwritten by CAS, so we need to move it to the result register before the instruction + // in case when it is used somewhere else. 
+ mov := m.allocateInstr() + if _64 { + mov.asMove64(tmp.nr(), rs.nr()) + } else { + mov.asMove32(tmp.nr(), rs.nr()) + } + m.insert(mov) + + m.lowerAtomicCasImpl(rn, tmp, rt, size) + + mov2 := m.allocateInstr() + rd := m.compiler.VRegOf(si.Return()) + if _64 { + mov2.asMove64(rd, tmp.nr()) + } else { + mov2.asMove32(rd, tmp.nr()) + } + m.insert(mov2) +} + +func (m *machine) lowerAtomicCasImpl(rn, rs, rt operand, size uint64) { + cas := m.allocateInstr() + cas.asAtomicCas(rn, rs, rt, size) + m.insert(cas) +} + +func (m *machine) lowerAtomicLoad(si *ssa.Instruction) { + addr := si.Arg() + size := si.AtomicTargetSize() + + addrDef := m.compiler.ValueDefinition(addr) + rn := m.getOperand_NR(addrDef, extModeNone) + rt := operandNR(m.compiler.VRegOf(si.Return())) + + m.lowerAtomicLoadImpl(rn, rt, size) +} + +func (m *machine) lowerAtomicLoadImpl(rn, rt operand, size uint64) { + ld := m.allocateInstr() + ld.asAtomicLoad(rn, rt, size) + m.insert(ld) +} + +func (m *machine) lowerAtomicStore(si *ssa.Instruction) { + addr, val := si.Arg2() + size := si.AtomicTargetSize() + + addrDef := m.compiler.ValueDefinition(addr) + valDef := m.compiler.ValueDefinition(val) + rn := m.getOperand_NR(addrDef, extModeNone) + rt := m.getOperand_NR(valDef, extModeNone) + + m.lowerAtomicStoreImpl(rn, rt, size) +} + +func (m *machine) lowerAtomicStoreImpl(rn, rt operand, size uint64) { + ld := m.allocateInstr() + ld.asAtomicStore(rn, rt, size) + m.insert(ld) +} + +// copyToTmp copies the given regalloc.VReg to a temporary register. This is called before cbr to avoid the regalloc issue +// e.g. 
reload happening in the middle of the exit sequence which is not the path the normal path executes +func (m *machine) copyToTmp(v regalloc.VReg) regalloc.VReg { + typ := m.compiler.TypeOf(v) + mov := m.allocateInstr() + tmp := m.compiler.AllocateVReg(typ) + if typ.IsInt() { + mov.asMove64(tmp, v) + } else { + mov.asFpuMov128(tmp, v) + } + m.insert(mov) + return tmp +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr_operands.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr_operands.go new file mode 100644 index 000000000..d9fbf1789 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr_operands.go @@ -0,0 +1,350 @@ +package arm64 + +// This file contains the logic to "find and determine operands" for instructions. +// In order to finalize the form of an operand, we might end up merging/eliminating +// the source instructions into an operand whenever possible. + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +type ( + // operand represents an operand of an instruction whose type is determined by the kind. + operand struct { + kind operandKind + data, data2 uint64 + } + operandKind byte +) + +// Here's the list of operand kinds. We use the abbreviation of the kind name not only for these consts, +// but also names of functions which return the operand of the kind. +const ( + // operandKindNR represents "NormalRegister" (NR). This is literally the register without any special operation unlike others. + operandKindNR operandKind = iota + // operandKindSR represents "Shifted Register" (SR). This is a register which is shifted by a constant. + // Some of the arm64 instructions can take this kind of operand. 
+ operandKindSR + // operandKindER represents "Extended Register (ER). This is a register which is sign/zero-extended to a larger size. + // Some of the arm64 instructions can take this kind of operand. + operandKindER + // operandKindImm12 represents "Immediate 12" (Imm12). This is a 12-bit immediate value which can be either shifted by 12 or not. + // See asImm12 function for detail. + operandKindImm12 + // operandKindShiftImm represents "Shifted Immediate" (ShiftImm) used by shift operations. + operandKindShiftImm +) + +// String implements fmt.Stringer for debugging. +func (o operand) format(size byte) string { + switch o.kind { + case operandKindNR: + return formatVRegSized(o.nr(), size) + case operandKindSR: + r, amt, sop := o.sr() + return fmt.Sprintf("%s, %s #%d", formatVRegSized(r, size), sop, amt) + case operandKindER: + r, eop, _ := o.er() + return fmt.Sprintf("%s %s", formatVRegSized(r, size), eop) + case operandKindImm12: + imm12, shiftBit := o.imm12() + if shiftBit == 1 { + return fmt.Sprintf("#%#x", uint64(imm12)<<12) + } else { + return fmt.Sprintf("#%#x", imm12) + } + default: + panic(fmt.Sprintf("unknown operand kind: %d", o.kind)) + } +} + +// operandNR encodes the given VReg as an operand of operandKindNR. +func operandNR(r regalloc.VReg) operand { + return operand{kind: operandKindNR, data: uint64(r)} +} + +// nr decodes the underlying VReg assuming the operand is of operandKindNR. +func (o operand) nr() regalloc.VReg { + return regalloc.VReg(o.data) +} + +// operandER encodes the given VReg as an operand of operandKindER. +func operandER(r regalloc.VReg, eop extendOp, to byte) operand { + if to < 32 { + panic("TODO?BUG?: when we need to extend to less than 32 bits?") + } + return operand{kind: operandKindER, data: uint64(r), data2: uint64(eop)<<32 | uint64(to)} +} + +// er decodes the underlying VReg, extend operation, and the target size assuming the operand is of operandKindER. 
+func (o operand) er() (r regalloc.VReg, eop extendOp, to byte) { + return regalloc.VReg(o.data), extendOp(o.data2>>32) & 0xff, byte(o.data2 & 0xff) +} + +// operandSR encodes the given VReg as an operand of operandKindSR. +func operandSR(r regalloc.VReg, amt byte, sop shiftOp) operand { + return operand{kind: operandKindSR, data: uint64(r), data2: uint64(amt)<<32 | uint64(sop)} +} + +// sr decodes the underlying VReg, shift amount, and shift operation assuming the operand is of operandKindSR. +func (o operand) sr() (r regalloc.VReg, amt byte, sop shiftOp) { + return regalloc.VReg(o.data), byte(o.data2>>32) & 0xff, shiftOp(o.data2) & 0xff +} + +// operandImm12 encodes the given imm12 as an operand of operandKindImm12. +func operandImm12(imm12 uint16, shiftBit byte) operand { + return operand{kind: operandKindImm12, data: uint64(imm12) | uint64(shiftBit)<<32} +} + +// imm12 decodes the underlying imm12 data assuming the operand is of operandKindImm12. +func (o operand) imm12() (v uint16, shiftBit byte) { + return uint16(o.data), byte(o.data >> 32) +} + +// operandShiftImm encodes the given amount as an operand of operandKindShiftImm. +func operandShiftImm(amount byte) operand { + return operand{kind: operandKindShiftImm, data: uint64(amount)} +} + +// shiftImm decodes the underlying shift amount data assuming the operand is of operandKindShiftImm. +func (o operand) shiftImm() byte { + return byte(o.data) +} + +// reg returns the register of the operand if applicable. +func (o operand) reg() regalloc.VReg { + switch o.kind { + case operandKindNR: + return o.nr() + case operandKindSR: + r, _, _ := o.sr() + return r + case operandKindER: + r, _, _ := o.er() + return r + case operandKindImm12: + // Does not have a register. + case operandKindShiftImm: + // Does not have a register. 
// getOperand_Imm12_ER_SR_NR returns an operand of either operandKindImm12, operandKindER, operandKindSR, or
// operandKindNR from the given value (defined by `def`).
//
// `mode` is used to extend the operand if the bit length is smaller than mode.bits().
// If the operand can be expressed as operandKindImm12, `mode` is ignored.
func (m *machine) getOperand_Imm12_ER_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) {
	// Block parameters have no defining instruction to merge, so they are used as a plain register.
	if def.IsFromBlockParam() {
		return operandNR(def.BlkParamVReg)
	}

	instr := def.Instr
	if instr.Opcode() == ssa.OpcodeIconst {
		// A constant that fits the 12-bit immediate form (optionally shifted by 12) becomes an immediate operand.
		if imm12Op, ok := asImm12Operand(instr.ConstantVal()); ok {
			// The value is now carried by the operand itself, so the Iconst is marked as lowered.
			instr.MarkLowered()
			return imm12Op
		}
	}
	// Otherwise fall back to the register-based operand forms.
	return m.getOperand_ER_SR_NR(def, mode)
}
func (m *machine) getOperand_MaybeNegatedImm12_ER_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand, negatedImm12 bool) {
	// Block parameters have no defining instruction to merge, so they are used as a plain register.
	if def.IsFromBlockParam() {
		return operandNR(def.BlkParamVReg), false
	}

	instr := def.Instr
	if instr.Opcode() == ssa.OpcodeIconst {
		c := instr.ConstantVal()
		// First try the constant as-is.
		if imm12Op, ok := asImm12Operand(c); ok {
			instr.MarkLowered()
			return imm12Op, false
		}

		// Then try the negated constant. A 32-bit constant is sign-extended to 64 bits first
		// (shift left then arithmetic shift right by 32), so that the negation is computed on its signed value.
		signExtended := int64(c)
		if def.SSAValue().Type().Bits() == 32 {
			signExtended = (signExtended << 32) >> 32
		}
		negatedWithoutSign := -signExtended
		if imm12Op, ok := asImm12Operand(uint64(negatedWithoutSign)); ok {
			instr.MarkLowered()
			// The returned immediate holds the negated value; the flag tells the caller so.
			return imm12Op, true
		}
	}
	// Not an immediate in either form: fall back to the register-based operand forms.
	return m.getOperand_ER_SR_NR(def, mode), false
}
// getOperand_SR_NR returns an operand of either operandKindSR or operandKindNR from the given value (defined by `def`).
//
// `mode` is used to extend the operand if the bit length is smaller than mode.bits().
// Note that `mode` is only consulted on the operandKindNR fallback path; the block-parameter and
// shifted-register paths below return without looking at it.
func (m *machine) getOperand_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) {
	if def.IsFromBlockParam() {
		return operandNR(def.BlkParamVReg)
	}

	if m.compiler.MatchInstr(def, ssa.OpcodeIshl) {
		// Check if the shift amount is constant instruction.
		targetVal, amountVal := def.Instr.Arg2()
		targetVReg := m.getOperand_NR(m.compiler.ValueDefinition(targetVal), extModeNone).nr()
		amountDef := m.compiler.ValueDefinition(amountVal)
		if amountDef.IsFromInstr() && amountDef.Instr.Constant() {
			// If that is the case, we can use the shifted register operand (SR).
			// The amount is taken modulo the bit width of the shifted value.
			c := byte(amountDef.Instr.ConstantVal()) & (targetVal.Type().Bits() - 1) // Clears the unnecessary bits.
			// Both the shift and its constant amount are merged into the operand.
			def.Instr.MarkLowered()
			amountDef.Instr.MarkLowered()
			return operandSR(targetVReg, c, shiftOpLSL)
		}
	}
	return m.getOperand_NR(def, mode)
}
// getOperand_NR returns an operand of operandKindNR from the given value (defined by `def`).
//
// `mode` is used to extend the operand if the bit length is smaller than mode.bits():
// a 32-bit value requested as a 64-bit zero- or sign-extended operand is extended into a new 64-bit register.
// Every other combination handled below returns the value's register unchanged.
func (m *machine) getOperand_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) {
	// Step 1: find (or materialize) the register holding the value.
	var v regalloc.VReg
	if def.IsFromBlockParam() {
		v = def.BlkParamVReg
	} else {
		instr := def.Instr
		if instr.Constant() {
			// We inline all the constant instructions so that we could reduce the register usage.
			v = m.lowerConstant(instr)
			instr.MarkLowered()
		} else {
			// def.N selects which result of the instruction is meant: 0 is the first return value,
			// n > 0 indexes into the remaining return values.
			if n := def.N; n == 0 {
				v = m.compiler.VRegOf(instr.Return())
			} else {
				_, rs := instr.Returns()
				v = m.compiler.VRegOf(rs[n-1])
			}
		}
	}

	// Step 2: extend if the requested mode is wider than the value. Empty cases mean "nothing to do".
	r := v
	switch inBits := def.SSAValue().Type().Bits(); {
	case mode == extModeNone:
	case inBits == 32 && (mode == extModeZeroExtend32 || mode == extModeSignExtend32):
	case inBits == 32 && mode == extModeZeroExtend64:
		extended := m.compiler.AllocateVReg(ssa.TypeI64)
		ext := m.allocateInstr()
		ext.asExtend(extended, v, 32, 64, false)
		m.insert(ext)
		r = extended
	case inBits == 32 && mode == extModeSignExtend64:
		extended := m.compiler.AllocateVReg(ssa.TypeI64)
		ext := m.allocateInstr()
		ext.asExtend(extended, v, 32, 64, true)
		m.insert(ext)
		r = extended
	case inBits == 64 && (mode == extModeZeroExtend64 || mode == extModeSignExtend64):
	}
	return operandNR(r)
}
// asImm12 tries to express val as a 12-bit immediate, optionally shifted left by 12 bits.
//
// On success ok is true, v holds the 12-bit payload, and shiftBit is 1 when the payload must be
// shifted left by 12 to recover val (0 otherwise). The unshifted form wins when both would apply (val == 0).
// When val fits neither form, all results are zero values.
func asImm12(val uint64) (v uint16, shiftBit byte, ok bool) {
	const (
		lowBits  uint64 = 0xfff     // bits [0, 12)
		highBits uint64 = 0xfff_000 // bits [12, 24)
	)
	switch {
	case val&lowBits == val:
		return uint16(val), 0, true
	case val&highBits == val:
		return uint16(val >> 12), 1, true
	default:
		return 0, 0, false
	}
}
+ // - ldrh w1, [x2, w3, SXTW #1] ;; sign-extended and scaled by 2 (== LSL #1) + // - strh w1, [x2, w3, UXTW #1] ;; zero-extended and scaled by 2 (== LSL #1) + // - ldr w1, [x2, w3, SXTW #2] ;; sign-extended and scaled by 4 (== LSL #2) + // - str x1, [x2, w3, UXTW #3] ;; zero-extended and scaled by 8 (== LSL #3) + // + // See the following pages: + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--register---Load-Register-Halfword--register-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--register---Load-Register--register-- + addressModeKindRegScaledExtended addressModeKind = iota + + // addressModeKindRegScaled is the same as addressModeKindRegScaledExtended, but without extension factor. + addressModeKindRegScaled + + // addressModeKindRegScaled is the same as addressModeKindRegScaledExtended, but without scale factor. + addressModeKindRegExtended + + // addressModeKindRegReg takes a base register and an index register. The index register is not either scaled or extended. + addressModeKindRegReg + + // addressModeKindRegSignedImm9 takes a base register and a 9-bit "signed" immediate offset (-256 to 255). + // The immediate will be sign-extended, and be added to the base register. + // This is a.k.a. "unscaled" since the immediate is not scaled. + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDUR--Load-Register--unscaled-- + addressModeKindRegSignedImm9 + + // addressModeKindRegUnsignedImm12 takes a base register and a 12-bit "unsigned" immediate offset. scaled by + // the size of the type. In other words, the actual offset will be imm12 * bits(type)/8. 
+ // See "Unsigned offset" in the following pages: + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate-- + addressModeKindRegUnsignedImm12 + + // addressModePostIndex takes a base register and a 9-bit "signed" immediate offset. + // After the load/store, the base register will be updated by the offset. + // + // Note that when this is used for pair load/store, the offset will be 7-bit "signed" immediate offset. + // + // See "Post-index" in the following pages for examples: + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers- + addressModeKindPostIndex + + // addressModePostIndex takes a base register and a 9-bit "signed" immediate offset. + // Before the load/store, the base register will be updated by the offset. + // + // Note that when this is used for pair load/store, the offset will be 7-bit "signed" immediate offset. 
+ // + // See "Pre-index" in the following pages for examples: + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers- + addressModeKindPreIndex + + // addressModeKindArgStackSpace is used to resolve the address of the argument stack space + // exiting right above the stack pointer. Since we don't know the exact stack space needed for a function + // at a compilation phase, this is used as a placeholder and further lowered to a real addressing mode like above. + addressModeKindArgStackSpace + + // addressModeKindResultStackSpace is used to resolve the address of the result stack space + // exiting right above the stack pointer. Since we don't know the exact stack space needed for a function + // at a compilation phase, this is used as a placeholder and further lowered to a real addressing mode like above. 
+ addressModeKindResultStackSpace +) + +func (a addressMode) format(dstSizeBits byte) (ret string) { + base := formatVRegSized(a.rn, 64) + if rn := a.rn; rn.RegType() != regalloc.RegTypeInt { + panic("invalid base register type: " + a.rn.RegType().String()) + } else if rn.IsRealReg() && v0 <= a.rn.RealReg() && a.rn.RealReg() <= v30 { + panic("BUG: likely a bug in reg alloc or reset behavior") + } + + switch a.kind { + case addressModeKindRegScaledExtended: + amount := a.sizeInBitsToShiftAmount(dstSizeBits) + ret = fmt.Sprintf("[%s, %s, %s #%#x]", base, formatVRegSized(a.rm, a.indexRegBits()), a.extOp, amount) + case addressModeKindRegScaled: + amount := a.sizeInBitsToShiftAmount(dstSizeBits) + ret = fmt.Sprintf("[%s, %s, lsl #%#x]", base, formatVRegSized(a.rm, a.indexRegBits()), amount) + case addressModeKindRegExtended: + ret = fmt.Sprintf("[%s, %s, %s]", base, formatVRegSized(a.rm, a.indexRegBits()), a.extOp) + case addressModeKindRegReg: + ret = fmt.Sprintf("[%s, %s]", base, formatVRegSized(a.rm, a.indexRegBits())) + case addressModeKindRegSignedImm9: + if a.imm != 0 { + ret = fmt.Sprintf("[%s, #%#x]", base, a.imm) + } else { + ret = fmt.Sprintf("[%s]", base) + } + case addressModeKindRegUnsignedImm12: + if a.imm != 0 { + ret = fmt.Sprintf("[%s, #%#x]", base, a.imm) + } else { + ret = fmt.Sprintf("[%s]", base) + } + case addressModeKindPostIndex: + ret = fmt.Sprintf("[%s], #%#x", base, a.imm) + case addressModeKindPreIndex: + ret = fmt.Sprintf("[%s, #%#x]!", base, a.imm) + case addressModeKindArgStackSpace: + ret = fmt.Sprintf("[#arg_space, #%#x]", a.imm) + case addressModeKindResultStackSpace: + ret = fmt.Sprintf("[#ret_space, #%#x]", a.imm) + } + return +} + +func addressModePreOrPostIndex(rn regalloc.VReg, imm int64, preIndex bool) addressMode { + if !offsetFitsInAddressModeKindRegSignedImm9(imm) { + panic(fmt.Sprintf("BUG: offset %#x does not fit in addressModeKindRegSignedImm9", imm)) + } + if preIndex { + return addressMode{kind: addressModeKindPreIndex, 
// offsetFitsInAddressModeKindRegUnsignedImm12 reports whether offset can be used as the immediate of
// addressModeKindRegUnsignedImm12 for an access of dstSizeInBits bits.
//
// The offset must be strictly positive, a multiple of the access size in bytes, and less than 4096 such units.
// A zero offset is deliberately reported as not fitting.
func offsetFitsInAddressModeKindRegUnsignedImm12(dstSizeInBits byte, offset int64) bool {
	if offset <= 0 {
		return false
	}
	unitBytes := int64(dstSizeInBits) / 8
	if offset%unitBytes != 0 {
		return false
	}
	return offset/unitBytes < 4096
}
// lowerLoadSplat lowers a load-and-splat: the value at ptr+offset is loaded with a single vecLoad1R instruction
// into the vector register of `ret`, using the arrangement derived from `lane`.
//
// The effective address is always computed into a register first (base + offset), even when offset is zero.
func (m *machine) lowerLoadSplat(ptr ssa.Value, offset uint32, lane ssa.VecLane, ret ssa.Value) {
	// vecLoad1R has offset address mode (base+imm) only for post index, so we simply add the offset to the base.
	base := m.getOperand_NR(m.compiler.ValueDefinition(ptr), extModeNone).nr()
	// The 32-bit unsigned offset is materialized as a 64-bit constant, then added to the base.
	offsetReg := m.compiler.AllocateVReg(ssa.TypeI64)
	m.lowerConstantI64(offsetReg, int64(offset))
	addedBase := m.addReg64ToReg64(base, offsetReg)

	rd := operandNR(m.compiler.VRegOf(ret))

	ld1r := m.allocateInstr()
	ld1r.asVecLoad1R(rd, operandNR(addedBase), ssaLaneToArrangement(lane))
	m.insert(ld1r)
}
+// +// Extracted as a separate function for easy testing. +func (m *machine) lowerToAddressModeFromAddends(a32s *wazevoapi.Queue[addend32], a64s *wazevoapi.Queue[regalloc.VReg], size byte, offset int64) (amode addressMode) { + switch a64sExist, a32sExist := !a64s.Empty(), !a32s.Empty(); { + case a64sExist && a32sExist: + var base regalloc.VReg + base = a64s.Dequeue() + var a32 addend32 + a32 = a32s.Dequeue() + amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: a32.r, extOp: a32.ext} + case a64sExist && offsetFitsInAddressModeKindRegUnsignedImm12(size, offset): + var base regalloc.VReg + base = a64s.Dequeue() + amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: offset} + offset = 0 + case a64sExist && offsetFitsInAddressModeKindRegSignedImm9(offset): + var base regalloc.VReg + base = a64s.Dequeue() + amode = addressMode{kind: addressModeKindRegSignedImm9, rn: base, imm: offset} + offset = 0 + case a64sExist: + var base regalloc.VReg + base = a64s.Dequeue() + if !a64s.Empty() { + index := a64s.Dequeue() + amode = addressMode{kind: addressModeKindRegReg, rn: base, rm: index, extOp: extendOpUXTX /* indicates index reg is 64-bit */} + } else { + amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0} + } + case a32sExist: + base32 := a32s.Dequeue() + + // First we need 64-bit base. + base := m.compiler.AllocateVReg(ssa.TypeI64) + baseExt := m.allocateInstr() + var signed bool + if base32.ext == extendOpSXTW { + signed = true + } + baseExt.asExtend(base, base32.r, 32, 64, signed) + m.insert(baseExt) + + if !a32s.Empty() { + index := a32s.Dequeue() + amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: index.r, extOp: index.ext} + } else { + amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0} + } + default: // Only static offsets. 
+ tmpReg := m.compiler.AllocateVReg(ssa.TypeI64) + m.lowerConstantI64(tmpReg, offset) + amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpReg, imm: 0} + offset = 0 + } + + baseReg := amode.rn + if offset > 0 { + baseReg = m.addConstToReg64(baseReg, offset) // baseReg += offset + } + + for !a64s.Empty() { + a64 := a64s.Dequeue() + baseReg = m.addReg64ToReg64(baseReg, a64) // baseReg += a64 + } + + for !a32s.Empty() { + a32 := a32s.Dequeue() + baseReg = m.addRegToReg64Ext(baseReg, a32.r, a32.ext) // baseReg += (a32 extended to 64-bit) + } + amode.rn = baseReg + return +} + +var addendsMatchOpcodes = [4]ssa.Opcode{ssa.OpcodeUExtend, ssa.OpcodeSExtend, ssa.OpcodeIadd, ssa.OpcodeIconst} + +func (m *machine) collectAddends(ptr ssa.Value) (addends32 *wazevoapi.Queue[addend32], addends64 *wazevoapi.Queue[regalloc.VReg], offset int64) { + m.addendsWorkQueue.Reset() + m.addends32.Reset() + m.addends64.Reset() + m.addendsWorkQueue.Enqueue(ptr) + + for !m.addendsWorkQueue.Empty() { + v := m.addendsWorkQueue.Dequeue() + + def := m.compiler.ValueDefinition(v) + switch op := m.compiler.MatchInstrOneOf(def, addendsMatchOpcodes[:]); op { + case ssa.OpcodeIadd: + // If the addend is an add, we recursively collect its operands. + x, y := def.Instr.Arg2() + m.addendsWorkQueue.Enqueue(x) + m.addendsWorkQueue.Enqueue(y) + def.Instr.MarkLowered() + case ssa.OpcodeIconst: + // If the addend is constant, we just statically merge it into the offset. + ic := def.Instr + u64 := ic.ConstantVal() + if ic.Return().Type().Bits() == 32 { + offset += int64(int32(u64)) // sign-extend. 
+ } else { + offset += int64(u64) + } + def.Instr.MarkLowered() + case ssa.OpcodeUExtend, ssa.OpcodeSExtend: + input := def.Instr.Arg() + if input.Type().Bits() != 32 { + panic("illegal size: " + input.Type().String()) + } + + var ext extendOp + if op == ssa.OpcodeUExtend { + ext = extendOpUXTW + } else { + ext = extendOpSXTW + } + + inputDef := m.compiler.ValueDefinition(input) + constInst := inputDef.IsFromInstr() && inputDef.Instr.Constant() + switch { + case constInst && ext == extendOpUXTW: + // Zero-extension of a 32-bit constant can be merged into the offset. + offset += int64(uint32(inputDef.Instr.ConstantVal())) + case constInst && ext == extendOpSXTW: + // Sign-extension of a 32-bit constant can be merged into the offset. + offset += int64(int32(inputDef.Instr.ConstantVal())) // sign-extend! + default: + m.addends32.Enqueue(addend32{r: m.getOperand_NR(inputDef, extModeNone).nr(), ext: ext}) + } + def.Instr.MarkLowered() + continue + default: + // If the addend is not one of them, we simply use it as-is (without merging!), optionally zero-extending it. 
+ m.addends64.Enqueue(m.getOperand_NR(def, extModeZeroExtend64 /* optional zero ext */).nr()) + } + } + return &m.addends32, &m.addends64, offset +} + +func (m *machine) addConstToReg64(r regalloc.VReg, c int64) (rd regalloc.VReg) { + rd = m.compiler.AllocateVReg(ssa.TypeI64) + alu := m.allocateInstr() + if imm12Op, ok := asImm12Operand(uint64(c)); ok { + alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), imm12Op, true) + } else if imm12Op, ok = asImm12Operand(uint64(-c)); ok { + alu.asALU(aluOpSub, operandNR(rd), operandNR(r), imm12Op, true) + } else { + tmp := m.compiler.AllocateVReg(ssa.TypeI64) + m.load64bitConst(c, tmp) + alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), operandNR(tmp), true) + } + m.insert(alu) + return +} + +func (m *machine) addReg64ToReg64(rn, rm regalloc.VReg) (rd regalloc.VReg) { + rd = m.compiler.AllocateVReg(ssa.TypeI64) + alu := m.allocateInstr() + alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandNR(rm), true) + m.insert(alu) + return +} + +func (m *machine) addRegToReg64Ext(rn, rm regalloc.VReg, ext extendOp) (rd regalloc.VReg) { + rd = m.compiler.AllocateVReg(ssa.TypeI64) + alu := m.allocateInstr() + alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandER(rm, ext, 64), true) + m.insert(alu) + return +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go new file mode 100644 index 000000000..b435d9ba9 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go @@ -0,0 +1,515 @@ +package arm64 + +import ( + "context" + "fmt" + "strings" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +type ( + // machine implements 
backend.Machine. + machine struct { + compiler backend.Compiler + executableContext *backend.ExecutableContextT[instruction] + currentABI *backend.FunctionABI + + regAlloc regalloc.Allocator + regAllocFn *backend.RegAllocFunction[*instruction, *machine] + + // addendsWorkQueue is used during address lowering, defined here for reuse. + addendsWorkQueue wazevoapi.Queue[ssa.Value] + addends32 wazevoapi.Queue[addend32] + // addends64 is used during address lowering, defined here for reuse. + addends64 wazevoapi.Queue[regalloc.VReg] + unresolvedAddressModes []*instruction + + // condBrRelocs holds the conditional branches which need offset relocation. + condBrRelocs []condBrReloc + + // jmpTableTargets holds the labels of the jump table targets. + jmpTableTargets [][]uint32 + + // spillSlotSize is the size of the stack slot in bytes used for spilling registers. + // During the execution of the function, the stack looks like: + // + // + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | xxxxx | + // | ReturnAddress | + // +-----------------+ <<-| + // | ........... | | + // | spill slot M | | <--- spillSlotSize + // | ............ | | + // | spill slot 2 | | + // | spill slot 1 | <<-+ + // | clobbered N | + // | ........... | + // | clobbered 1 | + // | clobbered 0 | + // SP---> +-----------------+ + // (low address) + // + // and it represents the size of the space between FP and the first spilled slot. This must be a multiple of 16. + // Also note that this is only known after register allocation. + spillSlotSize int64 + spillSlots map[regalloc.VRegID]int64 // regalloc.VRegID to offset. + // clobberedRegs holds real-register backed VRegs saved at the function prologue, and restored at the epilogue. 
+ clobberedRegs []regalloc.VReg + + maxRequiredStackSizeForCalls int64 + stackBoundsCheckDisabled bool + + regAllocStarted bool + } + + addend32 struct { + r regalloc.VReg + ext extendOp + } + + condBrReloc struct { + cbr *instruction + // currentLabelPos is the labelPosition within which condBr is defined. + currentLabelPos *labelPosition + // Next block's labelPosition. + nextLabel label + offset int64 + } + + labelPosition = backend.LabelPosition[instruction] + label = backend.Label +) + +const ( + labelReturn = backend.LabelReturn + labelInvalid = backend.LabelInvalid +) + +// NewBackend returns a new backend for arm64. +func NewBackend() backend.Machine { + m := &machine{ + spillSlots: make(map[regalloc.VRegID]int64), + executableContext: newExecutableContext(), + regAlloc: regalloc.NewAllocator(regInfo), + } + return m +} + +func newExecutableContext() *backend.ExecutableContextT[instruction] { + return backend.NewExecutableContextT[instruction](resetInstruction, setNext, setPrev, asNop0) +} + +// ExecutableContext implements backend.Machine. +func (m *machine) ExecutableContext() backend.ExecutableContext { + return m.executableContext +} + +// RegAlloc implements backend.Machine Function. +func (m *machine) RegAlloc() { + rf := m.regAllocFn + for _, pos := range m.executableContext.OrderedBlockLabels { + rf.AddBlock(pos.SB, pos.L, pos.Begin, pos.End) + } + + m.regAllocStarted = true + m.regAlloc.DoAllocation(rf) + // Now that we know the final spill slot size, we must align spillSlotSize to 16 bytes. + m.spillSlotSize = (m.spillSlotSize + 15) &^ 15 +} + +// Reset implements backend.Machine. 
+func (m *machine) Reset() { + m.clobberedRegs = m.clobberedRegs[:0] + for key := range m.spillSlots { + m.clobberedRegs = append(m.clobberedRegs, regalloc.VReg(key)) + } + for _, key := range m.clobberedRegs { + delete(m.spillSlots, regalloc.VRegID(key)) + } + m.clobberedRegs = m.clobberedRegs[:0] + m.regAllocStarted = false + m.regAlloc.Reset() + m.regAllocFn.Reset() + m.spillSlotSize = 0 + m.unresolvedAddressModes = m.unresolvedAddressModes[:0] + m.maxRequiredStackSizeForCalls = 0 + m.executableContext.Reset() + m.jmpTableTargets = m.jmpTableTargets[:0] +} + +// SetCurrentABI implements backend.Machine SetCurrentABI. +func (m *machine) SetCurrentABI(abi *backend.FunctionABI) { + m.currentABI = abi +} + +// DisableStackCheck implements backend.Machine DisableStackCheck. +func (m *machine) DisableStackCheck() { + m.stackBoundsCheckDisabled = true +} + +// SetCompiler implements backend.Machine. +func (m *machine) SetCompiler(ctx backend.Compiler) { + m.compiler = ctx + m.regAllocFn = backend.NewRegAllocFunction[*instruction, *machine](m, ctx.SSABuilder(), ctx) +} + +func (m *machine) insert(i *instruction) { + ectx := m.executableContext + ectx.PendingInstructions = append(ectx.PendingInstructions, i) +} + +func (m *machine) insertBrTargetLabel() label { + nop, l := m.allocateBrTarget() + m.insert(nop) + return l +} + +func (m *machine) allocateBrTarget() (nop *instruction, l label) { + ectx := m.executableContext + l = ectx.AllocateLabel() + nop = m.allocateInstr() + nop.asNop0WithLabel(l) + pos := ectx.AllocateLabelPosition(l) + pos.Begin, pos.End = nop, nop + ectx.LabelPositions[l] = pos + return +} + +// allocateInstr allocates an instruction. 
+func (m *machine) allocateInstr() *instruction { + instr := m.executableContext.InstructionPool.Allocate() + if !m.regAllocStarted { + instr.addedBeforeRegAlloc = true + } + return instr +} + +func resetInstruction(i *instruction) { + *i = instruction{} +} + +func (m *machine) allocateNop() *instruction { + instr := m.allocateInstr() + instr.asNop0() + return instr +} + +func (m *machine) resolveAddressingMode(arg0offset, ret0offset int64, i *instruction) { + amode := &i.amode + switch amode.kind { + case addressModeKindResultStackSpace: + amode.imm += ret0offset + case addressModeKindArgStackSpace: + amode.imm += arg0offset + default: + panic("BUG") + } + + var sizeInBits byte + switch i.kind { + case store8, uLoad8: + sizeInBits = 8 + case store16, uLoad16: + sizeInBits = 16 + case store32, fpuStore32, uLoad32, fpuLoad32: + sizeInBits = 32 + case store64, fpuStore64, uLoad64, fpuLoad64: + sizeInBits = 64 + case fpuStore128, fpuLoad128: + sizeInBits = 128 + default: + panic("BUG") + } + + if offsetFitsInAddressModeKindRegUnsignedImm12(sizeInBits, amode.imm) { + amode.kind = addressModeKindRegUnsignedImm12 + } else { + // This case, we load the offset into the temporary register, + // and then use it as the index register. + newPrev := m.lowerConstantI64AndInsert(i.prev, tmpRegVReg, amode.imm) + linkInstr(newPrev, i) + *amode = addressMode{kind: addressModeKindRegReg, rn: amode.rn, rm: tmpRegVReg, extOp: extendOpUXTX /* indicates rm reg is 64-bit */} + } +} + +// resolveRelativeAddresses resolves the relative addresses before encoding. +func (m *machine) resolveRelativeAddresses(ctx context.Context) { + ectx := m.executableContext + for { + if len(m.unresolvedAddressModes) > 0 { + arg0offset, ret0offset := m.arg0OffsetFromSP(), m.ret0OffsetFromSP() + for _, i := range m.unresolvedAddressModes { + m.resolveAddressingMode(arg0offset, ret0offset, i) + } + } + + // Reuse the slice to gather the unresolved conditional branches. 
+ m.condBrRelocs = m.condBrRelocs[:0] + + var fn string + var fnIndex int + var labelToSSABlockID map[label]ssa.BasicBlockID + if wazevoapi.PerfMapEnabled { + fn = wazevoapi.GetCurrentFunctionName(ctx) + labelToSSABlockID = make(map[label]ssa.BasicBlockID) + for i, l := range ectx.SsaBlockIDToLabels { + labelToSSABlockID[l] = ssa.BasicBlockID(i) + } + fnIndex = wazevoapi.GetCurrentFunctionIndex(ctx) + } + + // Next, in order to determine the offsets of relative jumps, we have to calculate the size of each label. + var offset int64 + for i, pos := range ectx.OrderedBlockLabels { + pos.BinaryOffset = offset + var size int64 + for cur := pos.Begin; ; cur = cur.next { + switch cur.kind { + case nop0: + l := cur.nop0Label() + if pos, ok := ectx.LabelPositions[l]; ok { + pos.BinaryOffset = offset + size + } + case condBr: + if !cur.condBrOffsetResolved() { + var nextLabel label + if i < len(ectx.OrderedBlockLabels)-1 { + // Note: this is only used when the block ends with fallthrough, + // therefore can be safely assumed that the next block exists when it's needed. + nextLabel = ectx.OrderedBlockLabels[i+1].L + } + m.condBrRelocs = append(m.condBrRelocs, condBrReloc{ + cbr: cur, currentLabelPos: pos, offset: offset + size, + nextLabel: nextLabel, + }) + } + } + size += cur.size() + if cur == pos.End { + break + } + } + + if wazevoapi.PerfMapEnabled { + if size > 0 { + l := pos.L + var labelStr string + if blkID, ok := labelToSSABlockID[l]; ok { + labelStr = fmt.Sprintf("%s::SSA_Block[%s]", l, blkID) + } else { + labelStr = l.String() + } + wazevoapi.PerfMap.AddModuleEntry(fnIndex, offset, uint64(size), fmt.Sprintf("%s:::::%s", fn, labelStr)) + } + } + offset += size + } + + // Before resolving any offsets, we need to check if all the conditional branches can be resolved. 
+ var needRerun bool + for i := range m.condBrRelocs { + reloc := &m.condBrRelocs[i] + cbr := reloc.cbr + offset := reloc.offset + + target := cbr.condBrLabel() + offsetOfTarget := ectx.LabelPositions[target].BinaryOffset + diff := offsetOfTarget - offset + if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 { + // This case the conditional branch is too huge. We place the trampoline instructions at the end of the current block, + // and jump to it. + m.insertConditionalJumpTrampoline(cbr, reloc.currentLabelPos, reloc.nextLabel) + // Then, we need to recall this function to fix up the label offsets + // as they have changed after the trampoline is inserted. + needRerun = true + } + } + if needRerun { + if wazevoapi.PerfMapEnabled { + wazevoapi.PerfMap.Clear() + } + } else { + break + } + } + + var currentOffset int64 + for cur := ectx.RootInstr; cur != nil; cur = cur.next { + switch cur.kind { + case br: + target := cur.brLabel() + offsetOfTarget := ectx.LabelPositions[target].BinaryOffset + diff := offsetOfTarget - currentOffset + divided := diff >> 2 + if divided < minSignedInt26 || divided > maxSignedInt26 { + // This means the currently compiled single function is extremely large. 
+ panic("too large function that requires branch relocation of large unconditional branch larger than 26-bit range") + } + cur.brOffsetResolve(diff) + case condBr: + if !cur.condBrOffsetResolved() { + target := cur.condBrLabel() + offsetOfTarget := ectx.LabelPositions[target].BinaryOffset + diff := offsetOfTarget - currentOffset + if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 { + panic("BUG: branch relocation for large conditional branch larger than 19-bit range must be handled properly") + } + cur.condBrOffsetResolve(diff) + } + case brTableSequence: + tableIndex := cur.u1 + targets := m.jmpTableTargets[tableIndex] + for i := range targets { + l := label(targets[i]) + offsetOfTarget := ectx.LabelPositions[l].BinaryOffset + diff := offsetOfTarget - (currentOffset + brTableSequenceOffsetTableBegin) + targets[i] = uint32(diff) + } + cur.brTableSequenceOffsetsResolved() + case emitSourceOffsetInfo: + m.compiler.AddSourceOffsetInfo(currentOffset, cur.sourceOffsetInfo()) + } + currentOffset += cur.size() + } +} + +const ( + maxSignedInt26 = 1<<25 - 1 + minSignedInt26 = -(1 << 25) + + maxSignedInt19 = 1<<18 - 1 + minSignedInt19 = -(1 << 18) +) + +func (m *machine) insertConditionalJumpTrampoline(cbr *instruction, currentBlk *labelPosition, nextLabel label) { + cur := currentBlk.End + originalTarget := cbr.condBrLabel() + endNext := cur.next + + if cur.kind != br { + // If the current block ends with a conditional branch, we can just insert the trampoline after it. + // Otherwise, we need to insert "skip" instruction to skip the trampoline instructions. 
+ skip := m.allocateInstr() + skip.asBr(nextLabel) + cur = linkInstr(cur, skip) + } + + cbrNewTargetInstr, cbrNewTargetLabel := m.allocateBrTarget() + cbr.setCondBrTargets(cbrNewTargetLabel) + cur = linkInstr(cur, cbrNewTargetInstr) + + // Then insert the unconditional branch to the original, which should be possible to get encoded + // as 26-bit offset should be enough for any practical application. + br := m.allocateInstr() + br.asBr(originalTarget) + cur = linkInstr(cur, br) + + // Update the end of the current block. + currentBlk.End = cur + + linkInstr(cur, endNext) +} + +// Format implements backend.Machine. +func (m *machine) Format() string { + ectx := m.executableContext + begins := map[*instruction]label{} + for l, pos := range ectx.LabelPositions { + begins[pos.Begin] = l + } + + irBlocks := map[label]ssa.BasicBlockID{} + for i, l := range ectx.SsaBlockIDToLabels { + irBlocks[l] = ssa.BasicBlockID(i) + } + + var lines []string + for cur := ectx.RootInstr; cur != nil; cur = cur.next { + if l, ok := begins[cur]; ok { + var labelStr string + if blkID, ok := irBlocks[l]; ok { + labelStr = fmt.Sprintf("%s (SSA Block: %s):", l, blkID) + } else { + labelStr = fmt.Sprintf("%s:", l) + } + lines = append(lines, labelStr) + } + if cur.kind == nop0 { + continue + } + lines = append(lines, "\t"+cur.String()) + } + return "\n" + strings.Join(lines, "\n") + "\n" +} + +// InsertReturn implements backend.Machine. +func (m *machine) InsertReturn() { + i := m.allocateInstr() + i.asRet() + m.insert(i) +} + +func (m *machine) getVRegSpillSlotOffsetFromSP(id regalloc.VRegID, size byte) int64 { + offset, ok := m.spillSlots[id] + if !ok { + offset = m.spillSlotSize + // TODO: this should be aligned depending on the `size` to use Imm12 offset load/store as much as possible. + m.spillSlots[id] = offset + m.spillSlotSize += int64(size) + } + return offset + 16 // spill slot starts above the clobbered registers and the frame size. 
+} + +func (m *machine) clobberedRegSlotSize() int64 { + return int64(len(m.clobberedRegs) * 16) +} + +func (m *machine) arg0OffsetFromSP() int64 { + return m.frameSize() + + 16 + // 16-byte aligned return address + 16 // frame size saved below the clobbered registers. +} + +func (m *machine) ret0OffsetFromSP() int64 { + return m.arg0OffsetFromSP() + m.currentABI.ArgStackSize +} + +func (m *machine) requiredStackSize() int64 { + return m.maxRequiredStackSizeForCalls + + m.frameSize() + + 16 + // 16-byte aligned return address. + 16 // frame size saved below the clobbered registers. +} + +func (m *machine) frameSize() int64 { + s := m.clobberedRegSlotSize() + m.spillSlotSize + if s&0xf != 0 { + panic(fmt.Errorf("BUG: frame size %d is not 16-byte aligned", s)) + } + return s +} + +func (m *machine) addJmpTableTarget(targets []ssa.BasicBlock) (index int) { + // TODO: reuse the slice! + labels := make([]uint32, len(targets)) + for j, target := range targets { + labels[j] = uint32(m.executableContext.GetOrAllocateSSABlockLabel(target)) + } + index = len(m.jmpTableTargets) + m.jmpTableTargets = append(m.jmpTableTargets, labels) + return +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go new file mode 100644 index 000000000..466fac464 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go @@ -0,0 +1,469 @@ +package arm64 + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// PostRegAlloc implements backend.Machine. +func (m *machine) PostRegAlloc() { + m.setupPrologue() + m.postRegAlloc() +} + +// setupPrologue initializes the prologue of the function. 
func (m *machine) setupPrologue() {
	ectx := m.executableContext

	// The prologue is spliced in right after the root instruction: remember what originally
	// followed it so that it can be re-linked after the last prologue instruction.
	cur := ectx.RootInstr
	prevInitInst := cur.next

	//
	//                   (high address)                    (high address)
	//         SP----> +-----------------+               +------------------+ <----+
	//                 |     .......     |               |     .......      |      |
	//                 |      ret Y      |               |      ret Y       |      |
	//                 |     .......     |               |     .......      |      |
	//                 |      ret 0      |               |      ret 0       |      |
	//                 |      arg X      |               |      arg X       |      |  size_of_arg_ret.
	//                 |     .......     |     ====>     |     .......      |      |
	//                 |      arg 1      |               |      arg 1       |      |
	//                 |      arg 0      |               |      arg 0       | <----+
	//                 |-----------------|               |  size_of_arg_ret |
	//                                                   |  return address  |
	//                                                   +------------------+ <---- SP
	//                    (low address)                     (low address)

	// Saves the return address (lr) and the size_of_arg_ret below the SP.
	// size_of_arg_ret is used for stack unwinding.
	cur = m.createReturnAddrAndSizeOfArgRetSlot(cur)

	if !m.stackBoundsCheckDisabled {
		cur = m.insertStackBoundsCheck(m.requiredStackSize(), cur)
	}

	// Sanity check: spill slots cannot have been assigned while the total spill slot size is zero.
	if m.spillSlotSize == 0 && len(m.spillSlots) != 0 {
		panic(fmt.Sprintf("BUG: spillSlotSize=%d, spillSlots=%v\n", m.spillSlotSize, m.spillSlots))
	}

	if regs := m.clobberedRegs; len(regs) > 0 {
		//
		//            (high address)                  (high address)
		//          +-----------------+             +-----------------+
		//          |     .......     |             |     .......     |
		//          |      ret Y      |             |      ret Y      |
		//          |     .......     |             |     .......     |
		//          |      ret 0      |             |      ret 0      |
		//          |      arg X      |             |      arg X      |
		//          |     .......     |             |     .......     |
		//          |      arg 1      |             |      arg 1      |
		//          |      arg 0      |             |      arg 0      |
		//          | size_of_arg_ret |             | size_of_arg_ret |
		//          |  ReturnAddress  |             |  ReturnAddress  |
		//  SP----> +-----------------+    ====>    +-----------------+
		//             (low address)                |   clobbered M   |
		//                                          |   ............  |
		//                                          |   clobbered 0   |
		//                                          +-----------------+ <----- SP
		//                                             (low address)
		//
		// Every clobbered register is pushed into its own 16-byte slot, regardless of its size.
		_amode := addressModePreOrPostIndex(spVReg,
			-16,  // stack pointer must be 16-byte aligned.
			true, // Decrement before store.
		)
		for _, vr := range regs {
			// TODO: pair stores to reduce the number of instructions.
			store := m.allocateInstr()
			store.asStore(operandNR(vr), _amode, regTypeToRegisterSizeInBits(vr.RegType()))
			cur = linkInstr(cur, store)
		}
	}

	// Reserve the spill slot area by moving SP down by spillSlotSize.
	if size := m.spillSlotSize; size > 0 {
		// Check if size is 16-byte aligned.
		if size&0xf != 0 {
			panic(fmt.Errorf("BUG: spill slot size %d is not 16-byte aligned", size))
		}

		cur = m.addsAddOrSubStackPointer(cur, spVReg, size, false)

		// At this point, the stack looks like:
		//
		//            (high address)
		//          +------------------+
		//          |     .......      |
		//          |      ret Y       |
		//          |     .......      |
		//          |      ret 0       |
		//          |      arg X       |
		//          |     .......      |
		//          |      arg 1       |
		//          |      arg 0       |
		//          | size_of_arg_ret  |
		//          |   ReturnAddress  |
		//          +------------------+
		//          |    clobbered M   |
		//          |   ............   |
		//          |    clobbered 0   |
		//          |   spill slot N   |
		//          |   ............   |
		//          |   spill slot 2   |
		//          |   spill slot 0   |
		//  SP----> +------------------+
		//             (low address)
	}

	// We push the frame size into the stack to make it possible to unwind stack:
	//
	//
	//                   (high address)                     (high address)
	//                 +-----------------+                +-----------------+
	//                 |     .......     |                |     .......     |
	//                 |      ret Y      |                |      ret Y      |
	//                 |     .......     |                |     .......     |
	//                 |      ret 0      |                |      ret 0      |
	//                 |      arg X      |                |      arg X      |
	//                 |     .......     |                |     .......     |
	//                 |      arg 1      |                |      arg 1      |
	//                 |      arg 0      |                |      arg 0      |
	//                 | size_of_arg_ret |                | size_of_arg_ret |
	//                 |  ReturnAddress  |                |  ReturnAddress  |
	//                 +-----------------+      ==>       +-----------------+ <----+
	//                 |   clobbered  M  |                |   clobbered  M  |      |
	//                 |   ............  |                |   ............  |      |
	//                 |   clobbered  2  |                |   clobbered  2  |      |
	//                 |   clobbered  1  |                |   clobbered  1  |      | frame size
	//                 |   clobbered  0  |                |   clobbered  0  |      |
	//                 |   spill slot N  |                |   spill slot N  |      |
	//                 |   ............  |                |   ............  |      |
	//                 |   spill slot 0  |                |   spill slot 0  | <----+
	//         SP--->  +-----------------+                |     xxxxxx      |  ;; unused space to make it 16-byte aligned.
	//                                                    |   frame_size    |
	//                                                    +-----------------+ <---- SP
	//                    (low address)                      (low address)
	//
	cur = m.createFrameSizeSlot(cur, m.frameSize())

	// Re-attach the original instruction stream after the prologue.
	linkInstr(cur, prevInitInst)
}

// createReturnAddrAndSizeOfArgRetSlot links, after cur, the instructions that move SP below the
// argument/result stack area and then push the pair (lr, size_of_arg_ret) into a new 16-byte slot.
// It returns the last instruction linked.
func (m *machine) createReturnAddrAndSizeOfArgRetSlot(cur *instruction) *instruction {
	// First we decrement the stack pointer to point the arg0 slot.
	var sizeOfArgRetReg regalloc.VReg
	s := int64(m.currentABI.AlignedArgResultStackSlotSize())
	if s > 0 {
		// The size is materialized in the temporary register, which is then both subtracted
		// from SP and stored as the size_of_arg_ret value below.
		cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s)
		sizeOfArgRetReg = tmpRegVReg

		subSp := m.allocateInstr()
		subSp.asALU(aluOpSub, operandNR(spVReg), operandNR(spVReg), operandNR(sizeOfArgRetReg), true)
		cur = linkInstr(cur, subSp)
	} else {
		// No stack-passed arguments or results: SP is left as-is and xzrVReg is stored as the size.
		sizeOfArgRetReg = xzrVReg
	}

	// Saves the return address (lr) and the size_of_arg_ret below the SP.
	// size_of_arg_ret is used for stack unwinding.
	pstr := m.allocateInstr()
	amode := addressModePreOrPostIndex(spVReg, -16, true /* decrement before store */)
	pstr.asStorePair64(lrVReg, sizeOfArgRetReg, amode)
	cur = linkInstr(cur, pstr)
	return cur
}

// createFrameSizeSlot links, after cur, the instructions that push the frame size s into a new
// 16-byte slot below SP, and returns the last instruction linked.
func (m *machine) createFrameSizeSlot(cur *instruction, s int64) *instruction {
	var frameSizeReg regalloc.VReg
	if s > 0 {
		cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s)
		frameSizeReg = tmpRegVReg
	} else {
		// An empty frame is recorded by storing xzrVReg, so no constant needs to be materialized.
		frameSizeReg = xzrVReg
	}
	_amode := addressModePreOrPostIndex(spVReg,
		-16,  // stack pointer must be 16-byte aligned.
		true, // Decrement before store.
	)
	store := m.allocateInstr()
	store.asStore(operandNR(frameSizeReg), _amode, 64)
	cur = linkInstr(cur, store)
	return cur
}

// postRegAlloc does multiple things while walking through the instructions:
// 1. Removes the redundant copy instruction.
// 2. Inserts the epilogue.
+func (m *machine) postRegAlloc() { + ectx := m.executableContext + for cur := ectx.RootInstr; cur != nil; cur = cur.next { + switch cur.kind { + case ret: + m.setupEpilogueAfter(cur.prev) + case loadConstBlockArg: + lc := cur + next := lc.next + m.executableContext.PendingInstructions = m.executableContext.PendingInstructions[:0] + m.lowerLoadConstantBlockArgAfterRegAlloc(lc) + for _, instr := range m.executableContext.PendingInstructions { + cur = linkInstr(cur, instr) + } + linkInstr(cur, next) + m.executableContext.PendingInstructions = m.executableContext.PendingInstructions[:0] + default: + // Removes the redundant copy instruction. + if cur.IsCopy() && cur.rn.realReg() == cur.rd.realReg() { + prev, next := cur.prev, cur.next + // Remove the copy instruction. + prev.next = next + if next != nil { + next.prev = prev + } + } + } + } +} + +func (m *machine) setupEpilogueAfter(cur *instruction) { + prevNext := cur.next + + // We've stored the frame size in the prologue, and now that we are about to return from this function, we won't need it anymore. + cur = m.addsAddOrSubStackPointer(cur, spVReg, 16, true) + + if s := m.spillSlotSize; s > 0 { + // Adjust SP to the original value: + // + // (high address) (high address) + // +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | | ....... | + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | xxxxx | | xxxxx | + // | ReturnAddress | | ReturnAddress | + // +-----------------+ ====> +-----------------+ + // | clobbered M | | clobbered M | + // | ............ | | ............ | + // | clobbered 1 | | clobbered 1 | + // | clobbered 0 | | clobbered 0 | + // | spill slot N | +-----------------+ <---- SP + // | ............ 
| + // | spill slot 0 | + // SP---> +-----------------+ + // (low address) + // + cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true) + } + + // First we need to restore the clobbered registers. + if len(m.clobberedRegs) > 0 { + // (high address) + // +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | | ....... | + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | xxxxx | | xxxxx | + // | ReturnAddress | | ReturnAddress | + // +-----------------+ ========> +-----------------+ <---- SP + // | clobbered M | + // | ........... | + // | clobbered 1 | + // | clobbered 0 | + // SP---> +-----------------+ + // (low address) + + l := len(m.clobberedRegs) - 1 + for i := range m.clobberedRegs { + vr := m.clobberedRegs[l-i] // reverse order to restore. + load := m.allocateInstr() + amode := addressModePreOrPostIndex(spVReg, + 16, // stack pointer must be 16-byte aligned. + false, // Increment after store. + ) + // TODO: pair loads to reduce the number of instructions. + switch regTypeToRegisterSizeInBits(vr.RegType()) { + case 64: // save int reg. + load.asULoad(operandNR(vr), amode, 64) + case 128: // save vector reg. + load.asFpuLoad(operandNR(vr), amode, 128) + } + cur = linkInstr(cur, load) + } + } + + // Reload the return address (lr). + // + // +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | ===> | ....... | + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | xxxxx | +-----------------+ <---- SP + // | ReturnAddress | + // SP----> +-----------------+ + + ldr := m.allocateInstr() + ldr.asULoad(operandNR(lrVReg), + addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. 
*/, false /* increment after loads */), 64) + cur = linkInstr(cur, ldr) + + if s := int64(m.currentABI.AlignedArgResultStackSlotSize()); s > 0 { + cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true) + } + + linkInstr(cur, prevNext) +} + +// saveRequiredRegs is the set of registers that must be saved/restored during growing stack when there's insufficient +// stack space left. Basically this is the combination of CalleeSavedRegisters plus argument registers execpt for x0, +// which always points to the execution context whenever the native code is entered from Go. +var saveRequiredRegs = []regalloc.VReg{ + x1VReg, x2VReg, x3VReg, x4VReg, x5VReg, x6VReg, x7VReg, + x19VReg, x20VReg, x21VReg, x22VReg, x23VReg, x24VReg, x25VReg, x26VReg, x28VReg, lrVReg, + v0VReg, v1VReg, v2VReg, v3VReg, v4VReg, v5VReg, v6VReg, v7VReg, + v18VReg, v19VReg, v20VReg, v21VReg, v22VReg, v23VReg, v24VReg, v25VReg, v26VReg, v27VReg, v28VReg, v29VReg, v30VReg, v31VReg, +} + +// insertStackBoundsCheck will insert the instructions after `cur` to check the +// stack bounds, and if there's no sufficient spaces required for the function, +// exit the execution and try growing it in Go world. +// +// TODO: we should be able to share the instructions across all the functions to reduce the size of compiled executable. +func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instruction) *instruction { + if requiredStackSize%16 != 0 { + panic("BUG") + } + + if immm12op, ok := asImm12Operand(uint64(requiredStackSize)); ok { + // sub tmp, sp, #requiredStackSize + sub := m.allocateInstr() + sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), immm12op, true) + cur = linkInstr(cur, sub) + } else { + // This case, we first load the requiredStackSize into the temporary register, + cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize) + // Then subtract it. 
+ sub := m.allocateInstr() + sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), operandNR(tmpRegVReg), true) + cur = linkInstr(cur, sub) + } + + tmp2 := x11VReg // Caller save, so it is safe to use it here in the prologue. + + // ldr tmp2, [executionContext #StackBottomPtr] + ldr := m.allocateInstr() + ldr.asULoad(operandNR(tmp2), addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: x0VReg, // execution context is always the first argument. + imm: wazevoapi.ExecutionContextOffsetStackBottomPtr.I64(), + }, 64) + cur = linkInstr(cur, ldr) + + // subs xzr, tmp, tmp2 + subs := m.allocateInstr() + subs.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpRegVReg), operandNR(tmp2), true) + cur = linkInstr(cur, subs) + + // b.ge #imm + cbr := m.allocateInstr() + cbr.asCondBr(ge.asCond(), labelInvalid, false /* ignored */) + cur = linkInstr(cur, cbr) + + // Set the required stack size and set it to the exec context. + { + // First load the requiredStackSize into the temporary register, + cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize) + setRequiredStackSize := m.allocateInstr() + setRequiredStackSize.asStore(operandNR(tmpRegVReg), + addressMode{ + kind: addressModeKindRegUnsignedImm12, + // Execution context is always the first argument. + rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.I64(), + }, 64) + + cur = linkInstr(cur, setRequiredStackSize) + } + + ldrAddress := m.allocateInstr() + ldrAddress.asULoad(operandNR(tmpRegVReg), addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: x0VReg, // execution context is always the first argument + imm: wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.I64(), + }, 64) + cur = linkInstr(cur, ldrAddress) + + // Then jumps to the stack grow call sequence's address, meaning + // transferring the control to the code compiled by CompileStackGrowCallSequence. 
+ bl := m.allocateInstr() + bl.asCallIndirect(tmpRegVReg, nil) + cur = linkInstr(cur, bl) + + // Now that we know the entire code, we can finalize how many bytes + // we have to skip when the stack size is sufficient. + var cbrOffset int64 + for _cur := cbr; ; _cur = _cur.next { + cbrOffset += _cur.size() + if _cur == cur { + break + } + } + cbr.condBrOffsetResolve(cbrOffset) + return cur +} + +// CompileStackGrowCallSequence implements backend.Machine. +func (m *machine) CompileStackGrowCallSequence() []byte { + ectx := m.executableContext + + cur := m.allocateInstr() + cur.asNop0() + ectx.RootInstr = cur + + // Save the callee saved and argument registers. + cur = m.saveRegistersInExecutionContext(cur, saveRequiredRegs) + + // Save the current stack pointer. + cur = m.saveCurrentStackPointer(cur, x0VReg) + + // Set the exit status on the execution context. + cur = m.setExitCode(cur, x0VReg, wazevoapi.ExitCodeGrowStack) + + // Exit the execution. + cur = m.storeReturnAddressAndExit(cur) + + // After the exit, restore the saved registers. + cur = m.restoreRegistersInExecutionContext(cur, saveRequiredRegs) + + // Then goes back the original address of this stack grow call. 
+ ret := m.allocateInstr() + ret.asRet() + linkInstr(cur, ret) + + m.encode(ectx.RootInstr) + return m.compiler.Buf() +} + +func (m *machine) addsAddOrSubStackPointer(cur *instruction, rd regalloc.VReg, diff int64, add bool) *instruction { + ectx := m.executableContext + + ectx.PendingInstructions = ectx.PendingInstructions[:0] + m.insertAddOrSubStackPointer(rd, diff, add) + for _, inserted := range ectx.PendingInstructions { + cur = linkInstr(cur, inserted) + } + return cur +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go new file mode 100644 index 000000000..1c8793b73 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go @@ -0,0 +1,152 @@ +package arm64 + +// This file implements the interfaces required for register allocations. See backend.RegAllocFunctionMachine. + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// ClobberedRegisters implements backend.RegAllocFunctionMachine. +func (m *machine) ClobberedRegisters(regs []regalloc.VReg) { + m.clobberedRegs = append(m.clobberedRegs[:0], regs...) +} + +// Swap implements backend.RegAllocFunctionMachine. 
func (m *machine) Swap(cur *instruction, x1, x2, tmp regalloc.VReg) {
	// Swap exchanges the contents of x1 and x2 by linking new instructions right after cur.
	// The instruction that originally followed cur is remembered so the list can be re-joined afterwards.
	prevNext := cur.next
	var mov1, mov2, mov3 *instruction
	if x1.RegType() == regalloc.RegTypeInt {
		// Integer case: when the caller supplies no valid temporary, fall back to tmpRegVReg.
		if !tmp.Valid() {
			tmp = tmpRegVReg
		}
		// Three-move swap: tmp <- x1, x1 <- x2, x2 <- tmp.
		mov1 = m.allocateInstr().asMove64(tmp, x1)
		mov2 = m.allocateInstr().asMove64(x1, x2)
		mov3 = m.allocateInstr().asMove64(x2, tmp)
		cur = linkInstr(cur, mov1)
		cur = linkInstr(cur, mov2)
		cur = linkInstr(cur, mov3)
		linkInstr(cur, prevNext)
	} else {
		if !tmp.Valid() {
			// Non-integer case without a temporary: x1 takes a round trip through its spill slot instead.
			r2 := x2.RealReg()
			// Temporarily spill x1 to stack.
			// NOTE(review): InsertStoreRegisterAt returns the result of its last linkInstr call; taking .prev is
			// presumably meant to land on the last instruction it inserted. Confirm against linkInstr.
			cur = m.InsertStoreRegisterAt(x1, cur, true).prev
			// Then move x2 to x1.
			cur = linkInstr(cur, m.allocateInstr().asFpuMov128(x1, x2))
			linkInstr(cur, prevNext)
			// Then reload the original value on x1 from stack to r2.
			// x1.SetRealReg(r2) is x1's VReg re-targeted at x2's real register, so the value spilled from x1
			// is loaded back into the register that x2 occupied.
			m.InsertReloadRegisterAt(x1.SetRealReg(r2), cur, true)
		} else {
			// Non-integer case with a temporary: the same three-move swap, using 128-bit FPU moves.
			mov1 = m.allocateInstr().asFpuMov128(tmp, x1)
			mov2 = m.allocateInstr().asFpuMov128(x1, x2)
			mov3 = m.allocateInstr().asFpuMov128(x2, tmp)
			cur = linkInstr(cur, mov1)
			cur = linkInstr(cur, mov2)
			cur = linkInstr(cur, mov3)
			linkInstr(cur, prevNext)
		}
	}
}

// InsertMoveBefore implements backend.RegAllocFunctionMachine.
//
// It links a single register-to-register move (dst <- src) immediately before instr.
// It panics if src and dst have different register types.
func (m *machine) InsertMoveBefore(dst, src regalloc.VReg, instr *instruction) {
	typ := src.RegType()
	if typ != dst.RegType() {
		panic("BUG: src and dst must have the same type")
	}

	// Integer registers use a 64-bit move; every other register type uses a 128-bit FPU move.
	mov := m.allocateInstr()
	if typ == regalloc.RegTypeInt {
		mov.asMove64(dst, src)
	} else {
		mov.asFpuMov128(dst, src)
	}

	// Splice the move between instr.prev and whatever followed it.
	cur := instr.prev
	prevNext := cur.next
	cur = linkInstr(cur, mov)
	linkInstr(cur, prevNext)
}

// SSABlockLabel implements backend.RegAllocFunctionMachine.
//
// It returns the entry of the executable context's SsaBlockIDToLabels table for the given block ID.
func (m *machine) SSABlockLabel(id ssa.BasicBlockID) backend.Label {
	return m.executableContext.SsaBlockIDToLabels[id]
}

// InsertStoreRegisterAt implements backend.RegAllocFunctionMachine.
+func (m *machine) InsertStoreRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction { + if !v.IsRealReg() { + panic("BUG: VReg must be backed by real reg to be stored") + } + + typ := m.compiler.TypeOf(v) + + var prevNext, cur *instruction + if after { + cur, prevNext = instr, instr.next + } else { + cur, prevNext = instr.prev, instr + } + + offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size()) + var amode addressMode + cur, amode = m.resolveAddressModeForOffsetAndInsert(cur, offsetFromSP, typ.Bits(), spVReg, true) + store := m.allocateInstr() + store.asStore(operandNR(v), amode, typ.Bits()) + + cur = linkInstr(cur, store) + return linkInstr(cur, prevNext) +} + +// InsertReloadRegisterAt implements backend.RegAllocFunctionMachine. +func (m *machine) InsertReloadRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction { + if !v.IsRealReg() { + panic("BUG: VReg must be backed by real reg to be stored") + } + + typ := m.compiler.TypeOf(v) + + var prevNext, cur *instruction + if after { + cur, prevNext = instr, instr.next + } else { + cur, prevNext = instr.prev, instr + } + + offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size()) + var amode addressMode + cur, amode = m.resolveAddressModeForOffsetAndInsert(cur, offsetFromSP, typ.Bits(), spVReg, true) + load := m.allocateInstr() + switch typ { + case ssa.TypeI32, ssa.TypeI64: + load.asULoad(operandNR(v), amode, typ.Bits()) + case ssa.TypeF32, ssa.TypeF64: + load.asFpuLoad(operandNR(v), amode, typ.Bits()) + case ssa.TypeV128: + load.asFpuLoad(operandNR(v), amode, 128) + default: + panic("TODO") + } + + cur = linkInstr(cur, load) + return linkInstr(cur, prevNext) +} + +// LastInstrForInsertion implements backend.RegAllocFunctionMachine. 
+func (m *machine) LastInstrForInsertion(begin, end *instruction) *instruction { + cur := end + for cur.kind == nop0 { + cur = cur.prev + if cur == begin { + return end + } + } + switch cur.kind { + case br: + return cur + default: + return end + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_relocation.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_relocation.go new file mode 100644 index 000000000..83902d927 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_relocation.go @@ -0,0 +1,117 @@ +package arm64 + +import ( + "encoding/binary" + "fmt" + "math" + "sort" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" +) + +const ( + // trampolineCallSize is the size of the trampoline instruction sequence for each function in an island. + trampolineCallSize = 4*4 + 4 // Four instructions + 32-bit immediate. + + // Unconditional branch offset is encoded as divided by 4 in imm26. + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/BL--Branch-with-Link-?lang=en + + maxUnconditionalBranchOffset = maxSignedInt26 * 4 + minUnconditionalBranchOffset = minSignedInt26 * 4 + + // trampolineIslandInterval is the range of the trampoline island. + // Half of the range is used for the trampoline island, and the other half is used for the function. + trampolineIslandInterval = maxUnconditionalBranchOffset / 2 + + // maxNumFunctions explicitly specifies the maximum number of functions that can be allowed in a single executable. + maxNumFunctions = trampolineIslandInterval >> 6 + + // maxFunctionExecutableSize is the maximum size of a function that can exist in a trampoline island. + // Conservatively set to 1/4 of the trampoline island interval. + maxFunctionExecutableSize = trampolineIslandInterval >> 2 +) + +// CallTrampolineIslandInfo implements backend.Machine CallTrampolineIslandInfo. 
+func (m *machine) CallTrampolineIslandInfo(numFunctions int) (interval, size int, err error) { + if numFunctions > maxNumFunctions { + return 0, 0, fmt.Errorf("too many functions: %d > %d", numFunctions, maxNumFunctions) + } + return trampolineIslandInterval, trampolineCallSize * numFunctions, nil +} + +// ResolveRelocations implements backend.Machine ResolveRelocations. +func (m *machine) ResolveRelocations( + refToBinaryOffset []int, + executable []byte, + relocations []backend.RelocationInfo, + callTrampolineIslandOffsets []int, +) { + for _, islandOffset := range callTrampolineIslandOffsets { + encodeCallTrampolineIsland(refToBinaryOffset, islandOffset, executable) + } + + for _, r := range relocations { + instrOffset := r.Offset + calleeFnOffset := refToBinaryOffset[r.FuncRef] + diff := int64(calleeFnOffset) - (instrOffset) + // Check if the diff is within the range of the branch instruction. + if diff < minUnconditionalBranchOffset || diff > maxUnconditionalBranchOffset { + // Find the near trampoline island from callTrampolineIslandOffsets. + islandOffset := searchTrampolineIsland(callTrampolineIslandOffsets, int(instrOffset)) + islandTargetOffset := islandOffset + trampolineCallSize*int(r.FuncRef) + diff = int64(islandTargetOffset) - (instrOffset) + if diff < minUnconditionalBranchOffset || diff > maxUnconditionalBranchOffset { + panic("BUG in trampoline placement") + } + } + binary.LittleEndian.PutUint32(executable[instrOffset:instrOffset+4], encodeUnconditionalBranch(true, diff)) + } +} + +// encodeCallTrampolineIsland encodes a trampoline island for the given functions. +// Each island consists of a trampoline instruction sequence for each function. +// Each trampoline instruction sequence consists of 4 instructions + 32-bit immediate. 
+func encodeCallTrampolineIsland(refToBinaryOffset []int, islandOffset int, executable []byte) { + for i := 0; i < len(refToBinaryOffset); i++ { + trampolineOffset := islandOffset + trampolineCallSize*i + + fnOffset := refToBinaryOffset[i] + diff := fnOffset - (trampolineOffset + 16) + if diff > math.MaxInt32 || diff < math.MinInt32 { + // This case even amd64 can't handle. 4GB is too big. + panic("too big binary") + } + + // The tmpReg, tmpReg2 is safe to overwrite (in fact any caller-saved register is safe to use). + tmpReg, tmpReg2 := regNumberInEncoding[tmpRegVReg.RealReg()], regNumberInEncoding[x11] + + // adr tmpReg, PC+16: load the address of #diff into tmpReg. + binary.LittleEndian.PutUint32(executable[trampolineOffset:], encodeAdr(tmpReg, 16)) + // ldrsw tmpReg2, [tmpReg]: Load #diff into tmpReg2. + binary.LittleEndian.PutUint32(executable[trampolineOffset+4:], + encodeLoadOrStore(sLoad32, tmpReg2, addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpRegVReg})) + // add tmpReg, tmpReg2, tmpReg: add #diff to the address of #diff, getting the absolute address of the function. + binary.LittleEndian.PutUint32(executable[trampolineOffset+8:], + encodeAluRRR(aluOpAdd, tmpReg, tmpReg, tmpReg2, true, false)) + // br tmpReg: branch to the function without overwriting the link register. + binary.LittleEndian.PutUint32(executable[trampolineOffset+12:], encodeUnconditionalBranchReg(tmpReg, false)) + // #diff + binary.LittleEndian.PutUint32(executable[trampolineOffset+16:], uint32(diff)) + } +} + +// searchTrampolineIsland finds the nearest trampoline island from callTrampolineIslandOffsets. +// Note that even if the offset is in the middle of two islands, it returns the latter one. +// That is ok because the island is always placed in the middle of the range. +// +// precondition: callTrampolineIslandOffsets is sorted in ascending order. 
+func searchTrampolineIsland(callTrampolineIslandOffsets []int, offset int) int { + l := len(callTrampolineIslandOffsets) + n := sort.Search(l, func(i int) bool { + return callTrampolineIslandOffsets[i] >= offset + }) + if n == l { + n = l - 1 + } + return callTrampolineIslandOffsets[n] +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/reg.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/reg.go new file mode 100644 index 000000000..45737516d --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/reg.go @@ -0,0 +1,397 @@ +package arm64 + +import ( + "fmt" + "strconv" + "strings" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" +) + +// Arm64-specific registers. +// +// See https://developer.arm.com/documentation/dui0801/a/Overview-of-AArch64-state/Predeclared-core-register-names-in-AArch64-state + +const ( + // General purpose registers. Note that we do not distinguish wn and xn registers + // because they are the same from the perspective of register allocator, and + // the size can be determined by the type of the instruction. + + x0 = regalloc.RealRegInvalid + 1 + iota + x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11 + x12 + x13 + x14 + x15 + x16 + x17 + x18 + x19 + x20 + x21 + x22 + x23 + x24 + x25 + x26 + x27 + x28 + x29 + x30 + + // Vector registers. Note that we do not distinguish vn and dn, ... registers + // because they are the same from the perspective of register allocator, and + // the size can be determined by the type of the instruction. 
+ + v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7 + v8 + v9 + v10 + v11 + v12 + v13 + v14 + v15 + v16 + v17 + v18 + v19 + v20 + v21 + v22 + v23 + v24 + v25 + v26 + v27 + v28 + v29 + v30 + v31 + + // Special registers + + xzr + sp + lr = x30 + fp = x29 + tmp = x27 +) + +var ( + x0VReg = regalloc.FromRealReg(x0, regalloc.RegTypeInt) + x1VReg = regalloc.FromRealReg(x1, regalloc.RegTypeInt) + x2VReg = regalloc.FromRealReg(x2, regalloc.RegTypeInt) + x3VReg = regalloc.FromRealReg(x3, regalloc.RegTypeInt) + x4VReg = regalloc.FromRealReg(x4, regalloc.RegTypeInt) + x5VReg = regalloc.FromRealReg(x5, regalloc.RegTypeInt) + x6VReg = regalloc.FromRealReg(x6, regalloc.RegTypeInt) + x7VReg = regalloc.FromRealReg(x7, regalloc.RegTypeInt) + x8VReg = regalloc.FromRealReg(x8, regalloc.RegTypeInt) + x9VReg = regalloc.FromRealReg(x9, regalloc.RegTypeInt) + x10VReg = regalloc.FromRealReg(x10, regalloc.RegTypeInt) + x11VReg = regalloc.FromRealReg(x11, regalloc.RegTypeInt) + x12VReg = regalloc.FromRealReg(x12, regalloc.RegTypeInt) + x13VReg = regalloc.FromRealReg(x13, regalloc.RegTypeInt) + x14VReg = regalloc.FromRealReg(x14, regalloc.RegTypeInt) + x15VReg = regalloc.FromRealReg(x15, regalloc.RegTypeInt) + x16VReg = regalloc.FromRealReg(x16, regalloc.RegTypeInt) + x17VReg = regalloc.FromRealReg(x17, regalloc.RegTypeInt) + x18VReg = regalloc.FromRealReg(x18, regalloc.RegTypeInt) + x19VReg = regalloc.FromRealReg(x19, regalloc.RegTypeInt) + x20VReg = regalloc.FromRealReg(x20, regalloc.RegTypeInt) + x21VReg = regalloc.FromRealReg(x21, regalloc.RegTypeInt) + x22VReg = regalloc.FromRealReg(x22, regalloc.RegTypeInt) + x23VReg = regalloc.FromRealReg(x23, regalloc.RegTypeInt) + x24VReg = regalloc.FromRealReg(x24, regalloc.RegTypeInt) + x25VReg = regalloc.FromRealReg(x25, regalloc.RegTypeInt) + x26VReg = regalloc.FromRealReg(x26, regalloc.RegTypeInt) + x27VReg = regalloc.FromRealReg(x27, regalloc.RegTypeInt) + x28VReg = regalloc.FromRealReg(x28, regalloc.RegTypeInt) + x29VReg = regalloc.FromRealReg(x29, 
regalloc.RegTypeInt) + x30VReg = regalloc.FromRealReg(x30, regalloc.RegTypeInt) + v0VReg = regalloc.FromRealReg(v0, regalloc.RegTypeFloat) + v1VReg = regalloc.FromRealReg(v1, regalloc.RegTypeFloat) + v2VReg = regalloc.FromRealReg(v2, regalloc.RegTypeFloat) + v3VReg = regalloc.FromRealReg(v3, regalloc.RegTypeFloat) + v4VReg = regalloc.FromRealReg(v4, regalloc.RegTypeFloat) + v5VReg = regalloc.FromRealReg(v5, regalloc.RegTypeFloat) + v6VReg = regalloc.FromRealReg(v6, regalloc.RegTypeFloat) + v7VReg = regalloc.FromRealReg(v7, regalloc.RegTypeFloat) + v8VReg = regalloc.FromRealReg(v8, regalloc.RegTypeFloat) + v9VReg = regalloc.FromRealReg(v9, regalloc.RegTypeFloat) + v10VReg = regalloc.FromRealReg(v10, regalloc.RegTypeFloat) + v11VReg = regalloc.FromRealReg(v11, regalloc.RegTypeFloat) + v12VReg = regalloc.FromRealReg(v12, regalloc.RegTypeFloat) + v13VReg = regalloc.FromRealReg(v13, regalloc.RegTypeFloat) + v14VReg = regalloc.FromRealReg(v14, regalloc.RegTypeFloat) + v15VReg = regalloc.FromRealReg(v15, regalloc.RegTypeFloat) + v16VReg = regalloc.FromRealReg(v16, regalloc.RegTypeFloat) + v17VReg = regalloc.FromRealReg(v17, regalloc.RegTypeFloat) + v18VReg = regalloc.FromRealReg(v18, regalloc.RegTypeFloat) + v19VReg = regalloc.FromRealReg(v19, regalloc.RegTypeFloat) + v20VReg = regalloc.FromRealReg(v20, regalloc.RegTypeFloat) + v21VReg = regalloc.FromRealReg(v21, regalloc.RegTypeFloat) + v22VReg = regalloc.FromRealReg(v22, regalloc.RegTypeFloat) + v23VReg = regalloc.FromRealReg(v23, regalloc.RegTypeFloat) + v24VReg = regalloc.FromRealReg(v24, regalloc.RegTypeFloat) + v25VReg = regalloc.FromRealReg(v25, regalloc.RegTypeFloat) + v26VReg = regalloc.FromRealReg(v26, regalloc.RegTypeFloat) + v27VReg = regalloc.FromRealReg(v27, regalloc.RegTypeFloat) + // lr (link register) holds the return address at the function entry. + lrVReg = x30VReg + // tmpReg is used to perform spill/load on large stack offsets, and load large constants. 
+ // Therefore, be cautious to use this register in the middle of the compilation, especially before the register allocation. + // This is the same as golang/go, but it's only described in the source code: + // https://github.com/golang/go/blob/18e17e2cb12837ea2c8582ecdb0cc780f49a1aac/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go#L59 + // https://github.com/golang/go/blob/18e17e2cb12837ea2c8582ecdb0cc780f49a1aac/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go#L13-L15 + tmpRegVReg = regalloc.FromRealReg(tmp, regalloc.RegTypeInt) + v28VReg = regalloc.FromRealReg(v28, regalloc.RegTypeFloat) + v29VReg = regalloc.FromRealReg(v29, regalloc.RegTypeFloat) + v30VReg = regalloc.FromRealReg(v30, regalloc.RegTypeFloat) + v31VReg = regalloc.FromRealReg(v31, regalloc.RegTypeFloat) + xzrVReg = regalloc.FromRealReg(xzr, regalloc.RegTypeInt) + spVReg = regalloc.FromRealReg(sp, regalloc.RegTypeInt) + fpVReg = regalloc.FromRealReg(fp, regalloc.RegTypeInt) +) + +var regNames = [...]string{ + x0: "x0", + x1: "x1", + x2: "x2", + x3: "x3", + x4: "x4", + x5: "x5", + x6: "x6", + x7: "x7", + x8: "x8", + x9: "x9", + x10: "x10", + x11: "x11", + x12: "x12", + x13: "x13", + x14: "x14", + x15: "x15", + x16: "x16", + x17: "x17", + x18: "x18", + x19: "x19", + x20: "x20", + x21: "x21", + x22: "x22", + x23: "x23", + x24: "x24", + x25: "x25", + x26: "x26", + x27: "x27", + x28: "x28", + x29: "x29", + x30: "x30", + xzr: "xzr", + sp: "sp", + v0: "v0", + v1: "v1", + v2: "v2", + v3: "v3", + v4: "v4", + v5: "v5", + v6: "v6", + v7: "v7", + v8: "v8", + v9: "v9", + v10: "v10", + v11: "v11", + v12: "v12", + v13: "v13", + v14: "v14", + v15: "v15", + v16: "v16", + v17: "v17", + v18: "v18", + v19: "v19", + v20: "v20", + v21: "v21", + v22: "v22", + v23: "v23", + v24: "v24", + v25: "v25", + v26: "v26", + v27: "v27", + v28: "v28", + v29: "v29", + v30: "v30", + v31: "v31", +} + +func formatVRegSized(r regalloc.VReg, size byte) (ret string) { + if r.IsRealReg() { + ret = regNames[r.RealReg()] + switch ret[0] { + case 
'x': + switch size { + case 32: + ret = strings.Replace(ret, "x", "w", 1) + case 64: + default: + panic("BUG: invalid register size: " + strconv.Itoa(int(size))) + } + case 'v': + switch size { + case 32: + ret = strings.Replace(ret, "v", "s", 1) + case 64: + ret = strings.Replace(ret, "v", "d", 1) + case 128: + ret = strings.Replace(ret, "v", "q", 1) + default: + panic("BUG: invalid register size") + } + } + } else { + switch r.RegType() { + case regalloc.RegTypeInt: + switch size { + case 32: + ret = fmt.Sprintf("w%d?", r.ID()) + case 64: + ret = fmt.Sprintf("x%d?", r.ID()) + default: + panic("BUG: invalid register size: " + strconv.Itoa(int(size))) + } + case regalloc.RegTypeFloat: + switch size { + case 32: + ret = fmt.Sprintf("s%d?", r.ID()) + case 64: + ret = fmt.Sprintf("d%d?", r.ID()) + case 128: + ret = fmt.Sprintf("q%d?", r.ID()) + default: + panic("BUG: invalid register size") + } + default: + panic(fmt.Sprintf("BUG: invalid register type: %d for %s", r.RegType(), r)) + } + } + return +} + +func formatVRegWidthVec(r regalloc.VReg, width vecArrangement) (ret string) { + var id string + wspec := strings.ToLower(width.String()) + if r.IsRealReg() { + id = regNames[r.RealReg()][1:] + } else { + id = fmt.Sprintf("%d?", r.ID()) + } + ret = fmt.Sprintf("%s%s", wspec, id) + return +} + +func formatVRegVec(r regalloc.VReg, arr vecArrangement, index vecIndex) (ret string) { + id := fmt.Sprintf("v%d?", r.ID()) + if r.IsRealReg() { + id = regNames[r.RealReg()] + } + ret = fmt.Sprintf("%s.%s", id, strings.ToLower(arr.String())) + if index != vecIndexNone { + ret += fmt.Sprintf("[%d]", index) + } + return +} + +func regTypeToRegisterSizeInBits(r regalloc.RegType) byte { + switch r { + case regalloc.RegTypeInt: + return 64 + case regalloc.RegTypeFloat: + return 128 + default: + panic("BUG: invalid register type") + } +} + +var regNumberInEncoding = [...]uint32{ + x0: 0, + x1: 1, + x2: 2, + x3: 3, + x4: 4, + x5: 5, + x6: 6, + x7: 7, + x8: 8, + x9: 9, + x10: 10, + x11: 
11, + x12: 12, + x13: 13, + x14: 14, + x15: 15, + x16: 16, + x17: 17, + x18: 18, + x19: 19, + x20: 20, + x21: 21, + x22: 22, + x23: 23, + x24: 24, + x25: 25, + x26: 26, + x27: 27, + x28: 28, + x29: 29, + x30: 30, + xzr: 31, + sp: 31, + v0: 0, + v1: 1, + v2: 2, + v3: 3, + v4: 4, + v5: 5, + v6: 6, + v7: 7, + v8: 8, + v9: 9, + v10: 10, + v11: 11, + v12: 12, + v13: 13, + v14: 14, + v15: 15, + v16: 16, + v17: 17, + v18: 18, + v19: 19, + v20: 20, + v21: 21, + v22: 22, + v23: 23, + v24: 24, + v25: 25, + v26: 26, + v27: 27, + v28: 28, + v29: 29, + v30: 30, + v31: 31, +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/unwind_stack.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/unwind_stack.go new file mode 100644 index 000000000..edb0e36e3 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/unwind_stack.go @@ -0,0 +1,90 @@ +package arm64 + +import ( + "encoding/binary" + "reflect" + "unsafe" + + "github.com/tetratelabs/wazero/internal/wasmdebug" +) + +// UnwindStack implements wazevo.unwindStack. +func UnwindStack(sp, _, top uintptr, returnAddresses []uintptr) []uintptr { + l := int(top - sp) + + var stackBuf []byte + { + // TODO: use unsafe.Slice after floor version is set to Go 1.20. + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&stackBuf)) + hdr.Data = sp + hdr.Len = l + hdr.Cap = l + } + + for i := uint64(0); i < uint64(l); { + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | <----+ + // | ....... | | + // | ret 0 | | + // | arg X | | size_of_arg_ret + // | ....... | | + // | arg 1 | | + // | arg 0 | <----+ + // | size_of_arg_ret | + // | ReturnAddress | + // +-----------------+ <----+ + // | ........... | | + // | spill slot M | | + // | ............ | | + // | spill slot 2 | | + // | spill slot 1 | | frame size + // | spill slot 1 | | + // | clobbered N | | + // | ............ 
| | + // | clobbered 0 | <----+ + // | xxxxxx | ;; unused space to make it 16-byte aligned. + // | frame_size | + // +-----------------+ <---- SP + // (low address) + + frameSize := binary.LittleEndian.Uint64(stackBuf[i:]) + i += frameSize + + 16 // frame size + aligned space. + retAddr := binary.LittleEndian.Uint64(stackBuf[i:]) + i += 8 // ret addr. + sizeOfArgRet := binary.LittleEndian.Uint64(stackBuf[i:]) + i += 8 + sizeOfArgRet + returnAddresses = append(returnAddresses, uintptr(retAddr)) + if len(returnAddresses) == wasmdebug.MaxFrames { + break + } + } + return returnAddresses +} + +// GoCallStackView implements wazevo.goCallStackView. +func GoCallStackView(stackPointerBeforeGoCall *uint64) []uint64 { + // (high address) + // +-----------------+ <----+ + // | xxxxxxxxxxx | | ;; optional unused space to make it 16-byte aligned. + // ^ | arg[N]/ret[M] | | + // sliceSize | | ............ | | sliceSize + // | | arg[1]/ret[1] | | + // v | arg[0]/ret[0] | <----+ + // | sliceSize | + // | frame_size | + // +-----------------+ <---- stackPointerBeforeGoCall + // (low address) + ptr := unsafe.Pointer(stackPointerBeforeGoCall) + size := *(*uint64)(unsafe.Add(ptr, 8)) + var view []uint64 + { + sh := (*reflect.SliceHeader)(unsafe.Pointer(&view)) + sh.Data = uintptr(unsafe.Add(ptr, 16)) // skips the (frame_size, sliceSize). 
+ sh.Len = int(size) + sh.Cap = int(size) + } + return view +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/machine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/machine.go new file mode 100644 index 000000000..54ce89e46 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/machine.go @@ -0,0 +1,100 @@ +package backend + +import ( + "context" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +type ( + // Machine is a backend for a specific ISA machine. + Machine interface { + ExecutableContext() ExecutableContext + + // DisableStackCheck disables the stack check for the current compilation for debugging/testing. + DisableStackCheck() + + // SetCurrentABI initializes the FunctionABI for the given signature. + SetCurrentABI(abi *FunctionABI) + + // SetCompiler sets the compilation context used for the lifetime of Machine. + // This is only called once per Machine, i.e. before the first compilation. + SetCompiler(Compiler) + + // LowerSingleBranch is called when the compilation of the given single branch is started. + LowerSingleBranch(b *ssa.Instruction) + + // LowerConditionalBranch is called when the compilation of the given conditional branch is started. + LowerConditionalBranch(b *ssa.Instruction) + + // LowerInstr is called for each instruction in the given block except for the ones marked as already lowered + // via Compiler.MarkLowered. The order is reverse, i.e. from the last instruction to the first one. + // + // Note: this can lower multiple instructions (which produce the inputs) at once whenever it's possible + // for optimization. + LowerInstr(*ssa.Instruction) + + // Reset resets the machine state for the next compilation. 
+ Reset() + + // InsertMove inserts a move instruction from src to dst whose type is typ. + InsertMove(dst, src regalloc.VReg, typ ssa.Type) + + // InsertReturn inserts the return instruction to return from the current function. + InsertReturn() + + // InsertLoadConstantBlockArg inserts the instruction(s) to load the constant value into the given regalloc.VReg. + InsertLoadConstantBlockArg(instr *ssa.Instruction, vr regalloc.VReg) + + // Format returns the string representation of the currently compiled machine code. + // This is only for testing purpose. + Format() string + + // RegAlloc does the register allocation after lowering. + RegAlloc() + + // PostRegAlloc does the post register allocation, e.g. setting up prologue/epilogue, redundant move elimination, etc. + PostRegAlloc() + + // ResolveRelocations resolves the relocations after emitting machine code. + // * refToBinaryOffset: the map from the function reference (ssa.FuncRef) to the executable offset. + // * executable: the binary to resolve the relocations. + // * relocations: the relocations to resolve. + // * callTrampolineIslandOffsets: the offsets of the trampoline islands in the executable. + ResolveRelocations( + refToBinaryOffset []int, + executable []byte, + relocations []RelocationInfo, + callTrampolineIslandOffsets []int, + ) + + // Encode encodes the machine instructions to the Compiler. + Encode(ctx context.Context) error + + // CompileGoFunctionTrampoline compiles the trampoline function to call a Go function of the given exit code and signature. + CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte + + // CompileStackGrowCallSequence returns the sequence of instructions shared by all functions to + // call the stack grow builtin function. + CompileStackGrowCallSequence() []byte + + // CompileEntryPreamble returns the sequence of instructions shared by multiple functions to + // enter the function from Go. 
+ CompileEntryPreamble(signature *ssa.Signature) []byte + + // LowerParams lowers the given parameters. + LowerParams(params []ssa.Value) + + // LowerReturns lowers the given returns. + LowerReturns(returns []ssa.Value) + + // ArgsResultsRegs returns the registers used for arguments and return values. + ArgsResultsRegs() (argResultInts, argResultFloats []regalloc.RealReg) + + // CallTrampolineIslandInfo returns the interval of the offset where the trampoline island is placed, and + // the size of the trampoline island. If islandSize is zero, the trampoline island is not used on this machine. + CallTrampolineIslandInfo(numFunctions int) (interval, islandSize int, err error) + } +) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go new file mode 100644 index 000000000..3f36c84e5 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go @@ -0,0 +1,319 @@ +package backend + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// RegAllocFunctionMachine is the interface for the machine specific logic that will be used in RegAllocFunction. +type RegAllocFunctionMachine[I regalloc.InstrConstraint] interface { + // InsertMoveBefore inserts the move instruction from src to dst before the given instruction. + InsertMoveBefore(dst, src regalloc.VReg, instr I) + // InsertStoreRegisterAt inserts the instruction(s) to store the given virtual register at the given instruction. + // If after is true, the instruction(s) will be inserted after the given instruction, otherwise before. + InsertStoreRegisterAt(v regalloc.VReg, instr I, after bool) I + // InsertReloadRegisterAt inserts the instruction(s) to reload the given virtual register at the given instruction. 
+ // If after is true, the instruction(s) will be inserted after the given instruction, otherwise before. + InsertReloadRegisterAt(v regalloc.VReg, instr I, after bool) I + // ClobberedRegisters is called when the register allocation is done and the clobbered registers are known. + ClobberedRegisters(regs []regalloc.VReg) + // Swap swaps the two virtual registers after the given instruction. + Swap(cur I, x1, x2, tmp regalloc.VReg) + // LastInstrForInsertion implements LastInstrForInsertion of regalloc.Function. See its comment for details. + LastInstrForInsertion(begin, end I) I + // SSABlockLabel returns the label of the given ssa.BasicBlockID. + SSABlockLabel(id ssa.BasicBlockID) Label +} + +type ( + // RegAllocFunction implements regalloc.Function. + RegAllocFunction[I regalloc.InstrConstraint, m RegAllocFunctionMachine[I]] struct { + m m + ssb ssa.Builder + c Compiler + // iter is the iterator for reversePostOrderBlocks + iter int + reversePostOrderBlocks []RegAllocBlock[I, m] + // labelToRegAllocBlockIndex maps label to the index of reversePostOrderBlocks. + labelToRegAllocBlockIndex map[Label]int + loopNestingForestRoots []ssa.BasicBlock + } + + // RegAllocBlock implements regalloc.Block. + RegAllocBlock[I regalloc.InstrConstraint, m RegAllocFunctionMachine[I]] struct { + // f is the function this instruction belongs to. Used to reuse the regAllocFunctionImpl.predsSlice slice for Defs() and Uses(). + f *RegAllocFunction[I, m] + sb ssa.BasicBlock + l Label + begin, end I + loopNestingForestChildren []ssa.BasicBlock + cur I + id int + cachedLastInstrForInsertion I + } +) + +// NewRegAllocFunction returns a new RegAllocFunction. +func NewRegAllocFunction[I regalloc.InstrConstraint, M RegAllocFunctionMachine[I]](m M, ssb ssa.Builder, c Compiler) *RegAllocFunction[I, M] { + return &RegAllocFunction[I, M]{ + m: m, + ssb: ssb, + c: c, + labelToRegAllocBlockIndex: make(map[Label]int), + } +} + +// AddBlock adds a new block to the function. 
+func (f *RegAllocFunction[I, M]) AddBlock(sb ssa.BasicBlock, l Label, begin, end I) { + i := len(f.reversePostOrderBlocks) + f.reversePostOrderBlocks = append(f.reversePostOrderBlocks, RegAllocBlock[I, M]{ + f: f, + sb: sb, + l: l, + begin: begin, + end: end, + id: int(sb.ID()), + }) + f.labelToRegAllocBlockIndex[l] = i +} + +// Reset resets the function for the next compilation. +func (f *RegAllocFunction[I, M]) Reset() { + f.reversePostOrderBlocks = f.reversePostOrderBlocks[:0] + f.iter = 0 +} + +// StoreRegisterAfter implements regalloc.Function StoreRegisterAfter. +func (f *RegAllocFunction[I, M]) StoreRegisterAfter(v regalloc.VReg, instr regalloc.Instr) { + m := f.m + m.InsertStoreRegisterAt(v, instr.(I), true) +} + +// ReloadRegisterBefore implements regalloc.Function ReloadRegisterBefore. +func (f *RegAllocFunction[I, M]) ReloadRegisterBefore(v regalloc.VReg, instr regalloc.Instr) { + m := f.m + m.InsertReloadRegisterAt(v, instr.(I), false) +} + +// ReloadRegisterAfter implements regalloc.Function ReloadRegisterAfter. +func (f *RegAllocFunction[I, M]) ReloadRegisterAfter(v regalloc.VReg, instr regalloc.Instr) { + m := f.m + m.InsertReloadRegisterAt(v, instr.(I), true) +} + +// StoreRegisterBefore implements regalloc.Function StoreRegisterBefore. +func (f *RegAllocFunction[I, M]) StoreRegisterBefore(v regalloc.VReg, instr regalloc.Instr) { + m := f.m + m.InsertStoreRegisterAt(v, instr.(I), false) +} + +// ClobberedRegisters implements regalloc.Function ClobberedRegisters. +func (f *RegAllocFunction[I, M]) ClobberedRegisters(regs []regalloc.VReg) { + f.m.ClobberedRegisters(regs) +} + +// SwapBefore implements regalloc.Function SwapBefore. +func (f *RegAllocFunction[I, M]) SwapBefore(x1, x2, tmp regalloc.VReg, instr regalloc.Instr) { + f.m.Swap(instr.Prev().(I), x1, x2, tmp) +} + +// PostOrderBlockIteratorBegin implements regalloc.Function PostOrderBlockIteratorBegin. 
+func (f *RegAllocFunction[I, M]) PostOrderBlockIteratorBegin() regalloc.Block { + f.iter = len(f.reversePostOrderBlocks) - 1 + return f.PostOrderBlockIteratorNext() +} + +// PostOrderBlockIteratorNext implements regalloc.Function PostOrderBlockIteratorNext. +func (f *RegAllocFunction[I, M]) PostOrderBlockIteratorNext() regalloc.Block { + if f.iter < 0 { + return nil + } + b := &f.reversePostOrderBlocks[f.iter] + f.iter-- + return b +} + +// ReversePostOrderBlockIteratorBegin implements regalloc.Function ReversePostOrderBlockIteratorBegin. +func (f *RegAllocFunction[I, M]) ReversePostOrderBlockIteratorBegin() regalloc.Block { + f.iter = 0 + return f.ReversePostOrderBlockIteratorNext() +} + +// ReversePostOrderBlockIteratorNext implements regalloc.Function ReversePostOrderBlockIteratorNext. +func (f *RegAllocFunction[I, M]) ReversePostOrderBlockIteratorNext() regalloc.Block { + if f.iter >= len(f.reversePostOrderBlocks) { + return nil + } + b := &f.reversePostOrderBlocks[f.iter] + f.iter++ + return b +} + +// LoopNestingForestRoots implements regalloc.Function LoopNestingForestRoots. +func (f *RegAllocFunction[I, M]) LoopNestingForestRoots() int { + f.loopNestingForestRoots = f.ssb.LoopNestingForestRoots() + return len(f.loopNestingForestRoots) +} + +// LoopNestingForestRoot implements regalloc.Function LoopNestingForestRoot. +func (f *RegAllocFunction[I, M]) LoopNestingForestRoot(i int) regalloc.Block { + blk := f.loopNestingForestRoots[i] + l := f.m.SSABlockLabel(blk.ID()) + index := f.labelToRegAllocBlockIndex[l] + return &f.reversePostOrderBlocks[index] +} + +// InsertMoveBefore implements regalloc.Function InsertMoveBefore. +func (f *RegAllocFunction[I, M]) InsertMoveBefore(dst, src regalloc.VReg, instr regalloc.Instr) { + f.m.InsertMoveBefore(dst, src, instr.(I)) +} + +// LowestCommonAncestor implements regalloc.Function LowestCommonAncestor. 
+func (f *RegAllocFunction[I, M]) LowestCommonAncestor(blk1, blk2 regalloc.Block) regalloc.Block { + ret := f.ssb.LowestCommonAncestor(blk1.(*RegAllocBlock[I, M]).sb, blk2.(*RegAllocBlock[I, M]).sb) + l := f.m.SSABlockLabel(ret.ID()) + index := f.labelToRegAllocBlockIndex[l] + return &f.reversePostOrderBlocks[index] +} + +// Idom implements regalloc.Function Idom. +func (f *RegAllocFunction[I, M]) Idom(blk regalloc.Block) regalloc.Block { + builder := f.ssb + idom := builder.Idom(blk.(*RegAllocBlock[I, M]).sb) + if idom == nil { + panic("BUG: idom must not be nil") + } + l := f.m.SSABlockLabel(idom.ID()) + index := f.labelToRegAllocBlockIndex[l] + return &f.reversePostOrderBlocks[index] +} + +// ID implements regalloc.Block. +func (r *RegAllocBlock[I, m]) ID() int32 { return int32(r.id) } + +// BlockParams implements regalloc.Block. +func (r *RegAllocBlock[I, m]) BlockParams(regs *[]regalloc.VReg) []regalloc.VReg { + c := r.f.c + *regs = (*regs)[:0] + for i := 0; i < r.sb.Params(); i++ { + v := c.VRegOf(r.sb.Param(i)) + *regs = append(*regs, v) + } + return *regs +} + +// InstrIteratorBegin implements regalloc.Block. +func (r *RegAllocBlock[I, m]) InstrIteratorBegin() regalloc.Instr { + r.cur = r.begin + return r.cur +} + +// InstrIteratorNext implements regalloc.Block. +func (r *RegAllocBlock[I, m]) InstrIteratorNext() regalloc.Instr { + for { + if r.cur == r.end { + return nil + } + instr := r.cur.Next() + r.cur = instr.(I) + if instr == nil { + return nil + } else if instr.AddedBeforeRegAlloc() { + // Only concerned about the instruction added before regalloc. + return instr + } + } +} + +// InstrRevIteratorBegin implements regalloc.Block. +func (r *RegAllocBlock[I, m]) InstrRevIteratorBegin() regalloc.Instr { + r.cur = r.end + return r.cur +} + +// InstrRevIteratorNext implements regalloc.Block. 
+func (r *RegAllocBlock[I, m]) InstrRevIteratorNext() regalloc.Instr { + for { + if r.cur == r.begin { + return nil + } + instr := r.cur.Prev() + r.cur = instr.(I) + if instr == nil { + return nil + } else if instr.AddedBeforeRegAlloc() { + // Only concerned about the instruction added before regalloc. + return instr + } + } +} + +// FirstInstr implements regalloc.Block. +func (r *RegAllocBlock[I, m]) FirstInstr() regalloc.Instr { + return r.begin +} + +// EndInstr implements regalloc.Block. +func (r *RegAllocBlock[I, m]) EndInstr() regalloc.Instr { + return r.end +} + +// LastInstrForInsertion implements regalloc.Block. +func (r *RegAllocBlock[I, m]) LastInstrForInsertion() regalloc.Instr { + var nil I + if r.cachedLastInstrForInsertion == nil { + r.cachedLastInstrForInsertion = r.f.m.LastInstrForInsertion(r.begin, r.end) + } + return r.cachedLastInstrForInsertion +} + +// Preds implements regalloc.Block. +func (r *RegAllocBlock[I, m]) Preds() int { return r.sb.Preds() } + +// Pred implements regalloc.Block. +func (r *RegAllocBlock[I, m]) Pred(i int) regalloc.Block { + sb := r.sb + pred := sb.Pred(i) + l := r.f.m.SSABlockLabel(pred.ID()) + index := r.f.labelToRegAllocBlockIndex[l] + return &r.f.reversePostOrderBlocks[index] +} + +// Entry implements regalloc.Block. +func (r *RegAllocBlock[I, m]) Entry() bool { return r.sb.EntryBlock() } + +// Succs implements regalloc.Block. +func (r *RegAllocBlock[I, m]) Succs() int { + return r.sb.Succs() +} + +// Succ implements regalloc.Block. +func (r *RegAllocBlock[I, m]) Succ(i int) regalloc.Block { + sb := r.sb + succ := sb.Succ(i) + if succ.ReturnBlock() { + return nil + } + l := r.f.m.SSABlockLabel(succ.ID()) + index := r.f.labelToRegAllocBlockIndex[l] + return &r.f.reversePostOrderBlocks[index] +} + +// LoopHeader implements regalloc.Block. +func (r *RegAllocBlock[I, m]) LoopHeader() bool { + return r.sb.LoopHeader() +} + +// LoopNestingForestChildren implements regalloc.Block. 
+func (r *RegAllocBlock[I, m]) LoopNestingForestChildren() int { + r.loopNestingForestChildren = r.sb.LoopNestingForestChildren() + return len(r.loopNestingForestChildren) +} + +// LoopNestingForestChild implements regalloc.Block. +func (r *RegAllocBlock[I, m]) LoopNestingForestChild(i int) regalloc.Block { + blk := r.loopNestingForestChildren[i] + l := r.f.m.SSABlockLabel(blk.ID()) + index := r.f.labelToRegAllocBlockIndex[l] + return &r.f.reversePostOrderBlocks[index] +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/api.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/api.go new file mode 100644 index 000000000..23157b478 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/api.go @@ -0,0 +1,136 @@ +package regalloc + +import "fmt" + +// These interfaces are implemented by ISA-specific backends to abstract away the details, and allow the register +// allocators to work on any ISA. +// +// TODO: the interfaces are not stabilized yet, especially x64 will need some changes. E.g. x64 has an addressing mode +// where index can be in memory. That kind of info will be useful to reduce the register pressure, and should be leveraged +// by the register allocators, like https://docs.rs/regalloc2/latest/regalloc2/enum.OperandConstraint.html + +type ( + // Function is the top-level interface to do register allocation, which corresponds to a CFG containing + // Blocks(s). + Function interface { + // PostOrderBlockIteratorBegin returns the first block in the post-order traversal of the CFG. + // In other words, the last blocks in the CFG will be returned first. + PostOrderBlockIteratorBegin() Block + // PostOrderBlockIteratorNext returns the next block in the post-order traversal of the CFG. + PostOrderBlockIteratorNext() Block + // ReversePostOrderBlockIteratorBegin returns the first block in the reverse post-order traversal of the CFG. 
+ // In other words, the first blocks in the CFG will be returned first. + ReversePostOrderBlockIteratorBegin() Block + // ReversePostOrderBlockIteratorNext returns the next block in the reverse post-order traversal of the CFG. + ReversePostOrderBlockIteratorNext() Block + // ClobberedRegisters tell the clobbered registers by this function. + ClobberedRegisters([]VReg) + // LoopNestingForestRoots returns the number of roots of the loop nesting forest in a function. + LoopNestingForestRoots() int + // LoopNestingForestRoot returns the i-th root of the loop nesting forest in a function. + LoopNestingForestRoot(i int) Block + // LowestCommonAncestor returns the lowest common ancestor of two blocks in the dominator tree. + LowestCommonAncestor(blk1, blk2 Block) Block + // Idom returns the immediate dominator of the given block. + Idom(blk Block) Block + + // Followings are for rewriting the function. + + // SwapAtEndOfBlock swaps the two virtual registers at the end of the given block. + SwapBefore(x1, x2, tmp VReg, instr Instr) + // StoreRegisterBefore inserts store instruction(s) before the given instruction for the given virtual register. + StoreRegisterBefore(v VReg, instr Instr) + // StoreRegisterAfter inserts store instruction(s) after the given instruction for the given virtual register. + StoreRegisterAfter(v VReg, instr Instr) + // ReloadRegisterBefore inserts reload instruction(s) before the given instruction for the given virtual register. + ReloadRegisterBefore(v VReg, instr Instr) + // ReloadRegisterAfter inserts reload instruction(s) after the given instruction for the given virtual register. + ReloadRegisterAfter(v VReg, instr Instr) + // InsertMoveBefore inserts move instruction(s) before the given instruction for the given virtual registers. + InsertMoveBefore(dst, src VReg, instr Instr) + } + + // Block is a basic block in the CFG of a function, and it consists of multiple instructions, and predecessor Block(s). 
+ Block interface { + // ID returns the unique identifier of this block which is ordered in the reverse post-order traversal of the CFG. + ID() int32 + // BlockParams returns the virtual registers used as the parameters of this block. + BlockParams(*[]VReg) []VReg + // InstrIteratorBegin returns the first instruction in this block. Instructions added after lowering must be skipped. + // Note: multiple Instr(s) will not be held at the same time, so it's safe to use the same impl for the return Instr. + InstrIteratorBegin() Instr + // InstrIteratorNext returns the next instruction in this block. Instructions added after lowering must be skipped. + // Note: multiple Instr(s) will not be held at the same time, so it's safe to use the same impl for the return Instr. + InstrIteratorNext() Instr + // InstrRevIteratorBegin is the same as InstrIteratorBegin, but in the reverse order. + InstrRevIteratorBegin() Instr + // InstrRevIteratorNext is the same as InstrIteratorNext, but in the reverse order. + InstrRevIteratorNext() Instr + // FirstInstr returns the fist instruction in this block where instructions will be inserted after it. + FirstInstr() Instr + // EndInstr returns the end instruction in this block. + EndInstr() Instr + // LastInstrForInsertion returns the last instruction in this block where instructions will be inserted before it. + // Such insertions only happen when we need to insert spill/reload instructions to adjust the merge edges. + // At the time of register allocation, all the critical edges are already split, so there is no need + // to worry about the case where branching instruction has multiple successors. + // Therefore, usually, it is the nop instruction, but if the block ends with an unconditional branching, then it returns + // the unconditional branch, not the nop. In other words it is either nop or unconditional branch. + LastInstrForInsertion() Instr + // Preds returns the number of predecessors of this block in the CFG. 
+ Preds() int + // Pred returns the i-th predecessor of this block in the CFG. + Pred(i int) Block + // Entry returns true if the block is for the entry block. + Entry() bool + // Succs returns the number of successors of this block in the CFG. + Succs() int + // Succ returns the i-th successor of this block in the CFG. + Succ(i int) Block + // LoopHeader returns true if this block is a loop header. + LoopHeader() bool + // LoopNestingForestChildren returns the number of children of this block in the loop nesting forest. + LoopNestingForestChildren() int + // LoopNestingForestChild returns the i-th child of this block in the loop nesting forest. + LoopNestingForestChild(i int) Block + } + + // Instr is an instruction in a block, abstracting away the underlying ISA. + Instr interface { + fmt.Stringer + // Next returns the next instruction in the same block. + Next() Instr + // Prev returns the previous instruction in the same block. + Prev() Instr + // Defs returns the virtual registers defined by this instruction. + Defs(*[]VReg) []VReg + // Uses returns the virtual registers used by this instruction. + // Note: multiple returned []VReg will not be held at the same time, so it's safe to use the same slice for this. + Uses(*[]VReg) []VReg + // AssignUse assigns the RealReg-allocated virtual register used by this instruction at the given index. + AssignUse(index int, v VReg) + // AssignDef assigns a RealReg-allocated virtual register defined by this instruction. + // This only accepts one register because we don't allocate registers for multi-def instructions (i.e. call instruction) + AssignDef(VReg) + // IsCopy returns true if this instruction is a move instruction between two registers. + // If true, the instruction is of the form of dst = src, and if the src and dst do not interfere with each other, + // we could coalesce them, and hence the copy can be eliminated from the final code. 
+ IsCopy() bool + // IsCall returns true if this instruction is a call instruction. The result is used to insert + // caller saved register spills and restores. + IsCall() bool + // IsIndirectCall returns true if this instruction is an indirect call instruction which calls a function pointer. + // The result is used to insert caller saved register spills and restores. + IsIndirectCall() bool + // IsReturn returns true if this instruction is a return instruction. + IsReturn() bool + // AddedBeforeRegAlloc returns true if this instruction is added before register allocation. + AddedBeforeRegAlloc() bool + } + + // InstrConstraint is an interface for arch-specific instruction constraints. + InstrConstraint interface { + comparable + Instr + } +) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/reg.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/reg.go new file mode 100644 index 000000000..46df807e6 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/reg.go @@ -0,0 +1,123 @@ +package regalloc + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// VReg represents a register which is assigned to an SSA value. This is used to represent a register in the backend. +// A VReg may or may not be a physical register, and the info of physical register can be obtained by RealReg. +type VReg uint64 + +// VRegID is the lower 32bit of VReg, which is the pure identifier of VReg without RealReg info. +type VRegID uint32 + +// RealReg returns the RealReg of this VReg. +func (v VReg) RealReg() RealReg { + return RealReg(v >> 32) +} + +// IsRealReg returns true if this VReg is backed by a physical register. +func (v VReg) IsRealReg() bool { + return v.RealReg() != RealRegInvalid +} + +// FromRealReg returns a VReg from the given RealReg and RegType. +// This is used to represent a specific pre-colored register in the backend. 
+func FromRealReg(r RealReg, typ RegType) VReg { + rid := VRegID(r) + if rid > vRegIDReservedForRealNum { + panic(fmt.Sprintf("invalid real reg %d", r)) + } + return VReg(r).SetRealReg(r).SetRegType(typ) +} + +// SetRealReg sets the RealReg of this VReg and returns the updated VReg. +func (v VReg) SetRealReg(r RealReg) VReg { + return VReg(r)<<32 | (v & 0xff_00_ffffffff) +} + +// RegType returns the RegType of this VReg. +func (v VReg) RegType() RegType { + return RegType(v >> 40) +} + +// SetRegType sets the RegType of this VReg and returns the updated VReg. +func (v VReg) SetRegType(t RegType) VReg { + return VReg(t)<<40 | (v & 0x00_ff_ffffffff) +} + +// ID returns the VRegID of this VReg. +func (v VReg) ID() VRegID { + return VRegID(v & 0xffffffff) +} + +// Valid returns true if this VReg is Valid. +func (v VReg) Valid() bool { + return v.ID() != vRegIDInvalid && v.RegType() != RegTypeInvalid +} + +// RealReg represents a physical register. +type RealReg byte + +const RealRegInvalid RealReg = 0 + +const ( + vRegIDInvalid VRegID = 1 << 31 + VRegIDNonReservedBegin = vRegIDReservedForRealNum + vRegIDReservedForRealNum VRegID = 128 + VRegInvalid = VReg(vRegIDInvalid) +) + +// String implements fmt.Stringer. +func (r RealReg) String() string { + switch r { + case RealRegInvalid: + return "invalid" + default: + return fmt.Sprintf("r%d", r) + } +} + +// String implements fmt.Stringer. +func (v VReg) String() string { + if v.IsRealReg() { + return fmt.Sprintf("r%d", v.ID()) + } + return fmt.Sprintf("v%d?", v.ID()) +} + +// RegType represents the type of a register. +type RegType byte + +const ( + RegTypeInvalid RegType = iota + RegTypeInt + RegTypeFloat + NumRegType +) + +// String implements fmt.Stringer. +func (r RegType) String() string { + switch r { + case RegTypeInt: + return "int" + case RegTypeFloat: + return "float" + default: + return "invalid" + } +} + +// RegTypeOf returns the RegType of the given ssa.Type. 
+func RegTypeOf(p ssa.Type) RegType { + switch p { + case ssa.TypeI32, ssa.TypeI64: + return RegTypeInt + case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: + return RegTypeFloat + default: + panic("invalid type") + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go new file mode 100644 index 000000000..b4450d56f --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go @@ -0,0 +1,1212 @@ +// Package regalloc performs register allocation. The algorithm can work on any ISA by implementing the interfaces in +// api.go. +// +// References: +// - https://web.stanford.edu/class/archive/cs/cs143/cs143.1128/lectures/17/Slides17.pdf +// - https://en.wikipedia.org/wiki/Chaitin%27s_algorithm +// - https://llvm.org/ProjectsWithLLVM/2004-Fall-CS426-LS.pdf +// - https://pfalcon.github.io/ssabook/latest/book-full.pdf: Chapter 9. for liveness analysis. +// - https://github.com/golang/go/blob/release-branch.go1.21/src/cmd/compile/internal/ssa/regalloc.go +package regalloc + +import ( + "fmt" + "math" + "strings" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// NewAllocator returns a new Allocator. +func NewAllocator(allocatableRegs *RegisterInfo) Allocator { + a := Allocator{ + regInfo: allocatableRegs, + phiDefInstListPool: wazevoapi.NewPool[phiDefInstList](resetPhiDefInstList), + blockStates: wazevoapi.NewIDedPool[blockState](resetBlockState), + } + a.state.vrStates = wazevoapi.NewIDedPool[vrState](resetVrState) + a.state.reset() + for _, regs := range allocatableRegs.AllocatableRegisters { + for _, r := range regs { + a.allocatableSet = a.allocatableSet.add(r) + } + } + return a +} + +type ( + // RegisterInfo holds the statically-known ISA-specific register information. 
+ RegisterInfo struct { + // AllocatableRegisters is a 2D array of allocatable RealReg, indexed by regTypeNum and regNum. + // The order matters: the first element is the most preferred one when allocating. + AllocatableRegisters [NumRegType][]RealReg + CalleeSavedRegisters RegSet + CallerSavedRegisters RegSet + RealRegToVReg []VReg + // RealRegName returns the name of the given RealReg for debugging. + RealRegName func(r RealReg) string + RealRegType func(r RealReg) RegType + } + + // Allocator is a register allocator. + Allocator struct { + // regInfo is static per ABI/ISA, and is initialized by the machine during Machine.PrepareRegisterAllocator. + regInfo *RegisterInfo + // allocatableSet is a set of allocatable RealReg derived from regInfo. Static per ABI/ISA. + allocatableSet RegSet + allocatedCalleeSavedRegs []VReg + vs []VReg + vs2 []VRegID + phiDefInstListPool wazevoapi.Pool[phiDefInstList] + + // Followings are re-used during various places. + blks []Block + reals []RealReg + currentOccupants regInUseSet + + // Following two fields are updated while iterating the blocks in the reverse postorder. + state state + blockStates wazevoapi.IDedPool[blockState] + } + + // programCounter represents an opaque index into the program which is used to represents a LiveInterval of a VReg. + programCounter int32 + + state struct { + argRealRegs []VReg + regsInUse regInUseSet + vrStates wazevoapi.IDedPool[vrState] + + currentBlockID int32 + + // allocatedRegSet is a set of RealReg that are allocated during the allocation phase. This is reset per function. + allocatedRegSet RegSet + } + + blockState struct { + // liveIns is a list of VReg that are live at the beginning of the block. + liveIns []VRegID + // seen is true if the block is visited during the liveness analysis. + seen bool + // visited is true if the block is visited during the allocation phase. 
+ visited bool + startFromPredIndex int + // startRegs is a list of RealReg that are used at the beginning of the block. This is used to fix the merge edges. + startRegs regInUseSet + // endRegs is a list of RealReg that are used at the end of the block. This is used to fix the merge edges. + endRegs regInUseSet + } + + vrState struct { + v VReg + r RealReg + // defInstr is the instruction that defines this value. If this is the phi value and not the entry block, this is nil. + defInstr Instr + // defBlk is the block that defines this value. If this is the phi value, this is the block whose arguments contain this value. + defBlk Block + // lca = lowest common ancestor. This is the block that is the lowest common ancestor of all the blocks that + // reloads this value. This is used to determine the spill location. Only valid if spilled=true. + lca Block + // lastUse is the program counter of the last use of this value. This changes while iterating the block, and + // should not be used across the blocks as it becomes invalid. To check the validity, use lastUseUpdatedAtBlockID. + lastUse programCounter + lastUseUpdatedAtBlockID int32 + // spilled is true if this value is spilled i.e. the value is reload from the stack somewhere in the program. + // + // Note that this field is used during liveness analysis for different purpose. This is used to determine the + // value is live-in or not. + spilled bool + // isPhi is true if this is a phi value. + isPhi bool + desiredLoc desiredLoc + // phiDefInstList is a list of instructions that defines this phi value. + // This is used to determine the spill location, and only valid if isPhi=true. + *phiDefInstList + } + + // phiDefInstList is a linked list of instructions that defines a phi value. + phiDefInstList struct { + instr Instr + v VReg + next *phiDefInstList + } + + // desiredLoc represents a desired location for a VReg. + desiredLoc uint16 + // desiredLocKind is a kind of desired location for a VReg. 
+ desiredLocKind uint16 +) + +const ( + // desiredLocKindUnspecified is a kind of desired location for a VReg that is not specified. + desiredLocKindUnspecified desiredLocKind = iota + // desiredLocKindStack is a kind of desired location for a VReg that is on the stack, only used for the phi values. + desiredLocKindStack + // desiredLocKindReg is a kind of desired location for a VReg that is in a register. + desiredLocKindReg + desiredLocUnspecified = desiredLoc(desiredLocKindUnspecified) + desiredLocStack = desiredLoc(desiredLocKindStack) +) + +func newDesiredLocReg(r RealReg) desiredLoc { + return desiredLoc(desiredLocKindReg) | desiredLoc(r<<2) +} + +func (d desiredLoc) realReg() RealReg { + return RealReg(d >> 2) +} + +func (d desiredLoc) stack() bool { + return d&3 == desiredLoc(desiredLocKindStack) +} + +func resetPhiDefInstList(l *phiDefInstList) { + l.instr = nil + l.next = nil + l.v = VRegInvalid +} + +func (s *state) dump(info *RegisterInfo) { //nolint:unused + fmt.Println("\t\tstate:") + fmt.Println("\t\t\targRealRegs:", s.argRealRegs) + fmt.Println("\t\t\tregsInUse", s.regsInUse.format(info)) + fmt.Println("\t\t\tallocatedRegSet:", s.allocatedRegSet.format(info)) + fmt.Println("\t\t\tused:", s.regsInUse.format(info)) + var strs []string + for i := 0; i <= s.vrStates.MaxIDEncountered(); i++ { + vs := s.vrStates.Get(i) + if vs == nil { + continue + } + if vs.r != RealRegInvalid { + strs = append(strs, fmt.Sprintf("(v%d: %s)", vs.v.ID(), info.RealRegName(vs.r))) + } + } + fmt.Println("\t\t\tvrStates:", strings.Join(strs, ", ")) +} + +func (s *state) reset() { + s.argRealRegs = s.argRealRegs[:0] + s.vrStates.Reset() + s.allocatedRegSet = RegSet(0) + s.regsInUse.reset() + s.currentBlockID = -1 +} + +func (s *state) setVRegState(v VReg, r RealReg) { + id := int(v.ID()) + st := s.vrStates.GetOrAllocate(id) + st.r = r + st.v = v +} + +func resetVrState(vs *vrState) { + vs.v = VRegInvalid + vs.r = RealRegInvalid + vs.defInstr = nil + vs.defBlk = nil + vs.spilled 
= false + vs.lastUse = -1 + vs.lastUseUpdatedAtBlockID = -1 + vs.lca = nil + vs.isPhi = false + vs.phiDefInstList = nil + vs.desiredLoc = desiredLocUnspecified +} + +func (s *state) getVRegState(v VRegID) *vrState { + return s.vrStates.GetOrAllocate(int(v)) +} + +func (s *state) useRealReg(r RealReg, v VReg) { + if s.regsInUse.has(r) { + panic("BUG: useRealReg: the given real register is already used") + } + s.regsInUse.add(r, v) + s.setVRegState(v, r) + s.allocatedRegSet = s.allocatedRegSet.add(r) +} + +func (s *state) releaseRealReg(r RealReg) { + current := s.regsInUse.get(r) + if current.Valid() { + s.regsInUse.remove(r) + s.setVRegState(current, RealRegInvalid) + } +} + +// recordReload records that the given VReg is reloaded in the given block. +// This is used to determine the spill location by tracking the lowest common ancestor of all the blocks that reloads the value. +func (vs *vrState) recordReload(f Function, blk Block) { + vs.spilled = true + if vs.lca == nil { + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\t\tv%d is reloaded in blk%d,\n", vs.v.ID(), blk.ID()) + } + vs.lca = blk + } else { + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\t\tv%d is reloaded in blk%d, lca=%d\n", vs.v.ID(), blk.ID(), vs.lca.ID()) + } + vs.lca = f.LowestCommonAncestor(vs.lca, blk) + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("updated lca=%d\n", vs.lca.ID()) + } + } +} + +func (s *state) findOrSpillAllocatable(a *Allocator, allocatable []RealReg, forbiddenMask RegSet, preferred RealReg) (r RealReg) { + r = RealRegInvalid + // First, check if the preferredMask has any allocatable register. + if preferred != RealRegInvalid && !forbiddenMask.has(preferred) && !s.regsInUse.has(preferred) { + for _, candidateReal := range allocatable { + // TODO: we should ensure the preferred register is in the allocatable set in the first place, + // but right now, just in case, we check it here. 
+ if candidateReal == preferred { + return preferred + } + } + } + + var lastUseAt programCounter + var spillVReg VReg + for _, candidateReal := range allocatable { + if forbiddenMask.has(candidateReal) { + continue + } + + using := s.regsInUse.get(candidateReal) + if using == VRegInvalid { + // This is not used at this point. + return candidateReal + } + + // Real registers in use should not be spilled, so we skip them. + // For example, if the register is used as an argument register, and it might be + // spilled and not reloaded when it ends up being used as a temporary to pass + // stack based argument. + if using.IsRealReg() { + continue + } + + isPreferred := candidateReal == preferred + + // last == -1 means the value won't be used anymore. + if last := s.getVRegState(using.ID()).lastUse; r == RealRegInvalid || isPreferred || last == -1 || (lastUseAt != -1 && last > lastUseAt) { + lastUseAt = last + r = candidateReal + spillVReg = using + if isPreferred { + break + } + } + } + + if r == RealRegInvalid { + panic("not found any allocatable register") + } + + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\tspilling v%d when lastUseAt=%d and regsInUse=%s\n", spillVReg.ID(), lastUseAt, s.regsInUse.format(a.regInfo)) + } + s.releaseRealReg(r) + return r +} + +func (s *state) findAllocatable(allocatable []RealReg, forbiddenMask RegSet) RealReg { + for _, r := range allocatable { + if !s.regsInUse.has(r) && !forbiddenMask.has(r) { + return r + } + } + return RealRegInvalid +} + +func (s *state) resetAt(bs *blockState) { + s.regsInUse.range_(func(_ RealReg, vr VReg) { + s.setVRegState(vr, RealRegInvalid) + }) + s.regsInUse.reset() + bs.endRegs.range_(func(r RealReg, v VReg) { + id := int(v.ID()) + st := s.vrStates.GetOrAllocate(id) + if st.lastUseUpdatedAtBlockID == s.currentBlockID && st.lastUse == programCounterLiveIn { + s.regsInUse.add(r, v) + s.setVRegState(v, r) + } + }) +} + +func resetBlockState(b *blockState) { + b.seen = false + b.visited = false + 
b.endRegs.reset() + b.startRegs.reset() + b.startFromPredIndex = -1 + b.liveIns = b.liveIns[:0] +} + +func (b *blockState) dump(a *RegisterInfo) { + fmt.Println("\t\tblockState:") + fmt.Println("\t\t\tstartRegs:", b.startRegs.format(a)) + fmt.Println("\t\t\tendRegs:", b.endRegs.format(a)) + fmt.Println("\t\t\tstartFromPredIndex:", b.startFromPredIndex) + fmt.Println("\t\t\tvisited:", b.visited) +} + +// DoAllocation performs register allocation on the given Function. +func (a *Allocator) DoAllocation(f Function) { + a.livenessAnalysis(f) + a.alloc(f) + a.determineCalleeSavedRealRegs(f) +} + +func (a *Allocator) determineCalleeSavedRealRegs(f Function) { + a.allocatedCalleeSavedRegs = a.allocatedCalleeSavedRegs[:0] + a.state.allocatedRegSet.Range(func(allocatedRealReg RealReg) { + if a.regInfo.CalleeSavedRegisters.has(allocatedRealReg) { + a.allocatedCalleeSavedRegs = append(a.allocatedCalleeSavedRegs, a.regInfo.RealRegToVReg[allocatedRealReg]) + } + }) + f.ClobberedRegisters(a.allocatedCalleeSavedRegs) +} + +func (a *Allocator) getOrAllocateBlockState(blockID int32) *blockState { + return a.blockStates.GetOrAllocate(int(blockID)) +} + +// phiBlk returns the block that defines the given phi value, nil otherwise. +func (s *state) phiBlk(v VRegID) Block { + vs := s.getVRegState(v) + if vs.isPhi { + return vs.defBlk + } + return nil +} + +const ( + programCounterLiveIn = math.MinInt32 + programCounterLiveOut = math.MaxInt32 +) + +// liveAnalysis constructs Allocator.blockLivenessData. +// The algorithm here is described in https://pfalcon.github.io/ssabook/latest/book-full.pdf Chapter 9.2. +func (a *Allocator) livenessAnalysis(f Function) { + s := &a.state + for blk := f.PostOrderBlockIteratorBegin(); blk != nil; blk = f.PostOrderBlockIteratorNext() { // Order doesn't matter. + + // We should gather phi value data. 
+ for _, p := range blk.BlockParams(&a.vs) { + vs := s.getVRegState(p.ID()) + vs.isPhi = true + vs.defBlk = blk + } + } + + for blk := f.PostOrderBlockIteratorBegin(); blk != nil; blk = f.PostOrderBlockIteratorNext() { + blkID := blk.ID() + info := a.getOrAllocateBlockState(blkID) + + a.vs2 = a.vs2[:0] + const ( + flagDeleted = false + flagLive = true + ) + ns := blk.Succs() + for i := 0; i < ns; i++ { + succ := blk.Succ(i) + if succ == nil { + continue + } + + succID := succ.ID() + succInfo := a.getOrAllocateBlockState(succID) + if !succInfo.seen { // This means the back edge. + continue + } + + for _, v := range succInfo.liveIns { + if s.phiBlk(v) != succ { + st := s.getVRegState(v) + // We use .spilled field to store the flag. + st.spilled = flagLive + a.vs2 = append(a.vs2, v) + } + } + } + + for instr := blk.InstrRevIteratorBegin(); instr != nil; instr = blk.InstrRevIteratorNext() { + + var use, def VReg + for _, def = range instr.Defs(&a.vs) { + if !def.IsRealReg() { + id := def.ID() + st := s.getVRegState(id) + // We use .spilled field to store the flag. + st.spilled = flagDeleted + a.vs2 = append(a.vs2, id) + } + } + for _, use = range instr.Uses(&a.vs) { + if !use.IsRealReg() { + id := use.ID() + st := s.getVRegState(id) + // We use .spilled field to store the flag. + st.spilled = flagLive + a.vs2 = append(a.vs2, id) + } + } + + if def.Valid() && s.phiBlk(def.ID()) != nil { + if use.Valid() && use.IsRealReg() { + // If the destination is a phi value, and the source is a real register, this is the beginning of the function. + a.state.argRealRegs = append(a.state.argRealRegs, use) + } + } + } + + for _, v := range a.vs2 { + st := s.getVRegState(v) + // We use .spilled field to store the flag. 
+ if st.spilled == flagLive { //nolint:gosimple + info.liveIns = append(info.liveIns, v) + st.spilled = false + } + } + + info.seen = true + } + + nrs := f.LoopNestingForestRoots() + for i := 0; i < nrs; i++ { + root := f.LoopNestingForestRoot(i) + a.loopTreeDFS(root) + } +} + +// loopTreeDFS implements the Algorithm 9.3 in the book in an iterative way. +func (a *Allocator) loopTreeDFS(entry Block) { + a.blks = a.blks[:0] + a.blks = append(a.blks, entry) + + s := &a.state + for len(a.blks) > 0 { + tail := len(a.blks) - 1 + loop := a.blks[tail] + a.blks = a.blks[:tail] + a.vs2 = a.vs2[:0] + const ( + flagDone = false + flagPending = true + ) + info := a.getOrAllocateBlockState(loop.ID()) + for _, v := range info.liveIns { + if s.phiBlk(v) != loop { + a.vs2 = append(a.vs2, v) + st := s.getVRegState(v) + // We use .spilled field to store the flag. + st.spilled = flagPending + } + } + + var siblingAddedView []VRegID + cn := loop.LoopNestingForestChildren() + for i := 0; i < cn; i++ { + child := loop.LoopNestingForestChild(i) + childID := child.ID() + childInfo := a.getOrAllocateBlockState(childID) + + if i == 0 { + begin := len(childInfo.liveIns) + for _, v := range a.vs2 { + st := s.getVRegState(v) + // We use .spilled field to store the flag. + if st.spilled == flagPending { //nolint:gosimple + st.spilled = flagDone + // TODO: deduplicate, though I don't think it has much impact. + childInfo.liveIns = append(childInfo.liveIns, v) + } + } + siblingAddedView = childInfo.liveIns[begin:] + } else { + // TODO: deduplicate, though I don't think it has much impact. + childInfo.liveIns = append(childInfo.liveIns, siblingAddedView...) + } + + if child.LoopHeader() { + a.blks = append(a.blks, child) + } + } + + if cn == 0 { + // If there's no forest child, we haven't cleared the .spilled field at this point. 
+ for _, v := range a.vs2 { + st := s.getVRegState(v) + st.spilled = false + } + } + } +} + +// alloc allocates registers for the given function by iterating the blocks in the reverse postorder. +// The algorithm here is derived from the Go compiler's allocator https://github.com/golang/go/blob/release-branch.go1.21/src/cmd/compile/internal/ssa/regalloc.go +// In short, this is a simply linear scan register allocation where each block inherits the register allocation state from +// one of its predecessors. Each block inherits the selected state and starts allocation from there. +// If there's a discrepancy in the end states between predecessors, the adjustments are made to ensure consistency after allocation is done (which we call "fixing merge state"). +// The spill instructions (store into the dedicated slots) are inserted after all the allocations and fixing merge states. That is because +// at the point, we all know where the reloads happen, and therefore we can know the best place to spill the values. More precisely, +// the spill happens in the block that is the lowest common ancestor of all the blocks that reloads the value. +// +// All of these logics are almost the same as Go's compiler which has a dedicated description in the source file ^^. +func (a *Allocator) alloc(f Function) { + // First we allocate each block in the reverse postorder (at least one predecessor should be allocated for each block). + for blk := f.ReversePostOrderBlockIteratorBegin(); blk != nil; blk = f.ReversePostOrderBlockIteratorNext() { + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("========== allocating blk%d ========\n", blk.ID()) + } + if blk.Entry() { + a.finalizeStartReg(blk) + } + a.allocBlock(f, blk) + } + // After the allocation, we all know the start and end state of each block. So we can fix the merge states. 
+ for blk := f.ReversePostOrderBlockIteratorBegin(); blk != nil; blk = f.ReversePostOrderBlockIteratorNext() { + a.fixMergeState(f, blk) + } + // Finally, we insert the spill instructions as we know all the places where the reloads happen. + a.scheduleSpills(f) +} + +func (a *Allocator) updateLiveInVRState(liveness *blockState) { + currentBlockID := a.state.currentBlockID + for _, v := range liveness.liveIns { + vs := a.state.getVRegState(v) + vs.lastUse = programCounterLiveIn + vs.lastUseUpdatedAtBlockID = currentBlockID + } +} + +func (a *Allocator) finalizeStartReg(blk Block) { + bID := blk.ID() + liveness := a.getOrAllocateBlockState(bID) + s := &a.state + currentBlkState := a.getOrAllocateBlockState(bID) + if currentBlkState.startFromPredIndex > -1 { + return + } + + s.currentBlockID = bID + a.updateLiveInVRState(liveness) + + preds := blk.Preds() + var predState *blockState + switch preds { + case 0: // This is the entry block. + case 1: + predID := blk.Pred(0).ID() + predState = a.getOrAllocateBlockState(predID) + currentBlkState.startFromPredIndex = 0 + default: + // TODO: there should be some better heuristic to choose the predecessor. 
+ for i := 0; i < preds; i++ { + predID := blk.Pred(i).ID() + if _predState := a.getOrAllocateBlockState(predID); _predState.visited { + predState = _predState + currentBlkState.startFromPredIndex = i + break + } + } + } + if predState == nil { + if !blk.Entry() { + panic(fmt.Sprintf("BUG: at lease one predecessor should be visited for blk%d", blk.ID())) + } + for _, u := range s.argRealRegs { + s.useRealReg(u.RealReg(), u) + } + currentBlkState.startFromPredIndex = 0 + } else if predState != nil { + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("allocating blk%d starting from blk%d (on index=%d) \n", + bID, blk.Pred(currentBlkState.startFromPredIndex).ID(), currentBlkState.startFromPredIndex) + } + s.resetAt(predState) + } + + s.regsInUse.range_(func(allocated RealReg, v VReg) { + currentBlkState.startRegs.add(allocated, v) + }) + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("finalized start reg for blk%d: %s\n", blk.ID(), currentBlkState.startRegs.format(a.regInfo)) + } +} + +func (a *Allocator) allocBlock(f Function, blk Block) { + bID := blk.ID() + s := &a.state + currentBlkState := a.getOrAllocateBlockState(bID) + s.currentBlockID = bID + + if currentBlkState.startFromPredIndex < 0 { + panic("BUG: startFromPredIndex should be set in finalizeStartReg prior to allocBlock") + } + + // Clears the previous state. + s.regsInUse.range_(func(allocatedRealReg RealReg, vr VReg) { + s.setVRegState(vr, RealRegInvalid) + }) + s.regsInUse.reset() + // Then set the start state. + currentBlkState.startRegs.range_(func(allocatedRealReg RealReg, vr VReg) { + s.useRealReg(allocatedRealReg, vr) + }) + + desiredUpdated := a.vs2[:0] + + // Update the last use of each VReg. 
+ var pc programCounter + for instr := blk.InstrIteratorBegin(); instr != nil; instr = blk.InstrIteratorNext() { + var use, def VReg + for _, use = range instr.Uses(&a.vs) { + if !use.IsRealReg() { + s.getVRegState(use.ID()).lastUse = pc + } + } + + if instr.IsCopy() { + def = instr.Defs(&a.vs)[0] + r := def.RealReg() + if r != RealRegInvalid { + useID := use.ID() + vs := s.getVRegState(useID) + if !vs.isPhi { // TODO: no idea why do we need this. + vs.desiredLoc = newDesiredLocReg(r) + desiredUpdated = append(desiredUpdated, useID) + } + } + } + pc++ + } + + // Mark all live-out values by checking live-in of the successors. + // While doing so, we also update the desired register values. + var succ Block + for i, ns := 0, blk.Succs(); i < ns; i++ { + succ = blk.Succ(i) + if succ == nil { + continue + } + + succID := succ.ID() + succState := a.getOrAllocateBlockState(succID) + for _, v := range succState.liveIns { + if s.phiBlk(v) != succ { + st := s.getVRegState(v) + st.lastUse = programCounterLiveOut + } + } + + if succState.startFromPredIndex > -1 { + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("blk%d -> blk%d: start_regs: %s\n", bID, succID, succState.startRegs.format(a.regInfo)) + } + succState.startRegs.range_(func(allocatedRealReg RealReg, vr VReg) { + vs := s.getVRegState(vr.ID()) + vs.desiredLoc = newDesiredLocReg(allocatedRealReg) + desiredUpdated = append(desiredUpdated, vr.ID()) + }) + for _, p := range succ.BlockParams(&a.vs) { + vs := s.getVRegState(p.ID()) + if vs.desiredLoc.realReg() == RealRegInvalid { + vs.desiredLoc = desiredLocStack + desiredUpdated = append(desiredUpdated, p.ID()) + } + } + } + } + + // Propagate the desired register values from the end of the block to the beginning. 
+ for instr := blk.InstrRevIteratorBegin(); instr != nil; instr = blk.InstrRevIteratorNext() { + if instr.IsCopy() { + def := instr.Defs(&a.vs)[0] + defState := s.getVRegState(def.ID()) + desired := defState.desiredLoc.realReg() + if desired == RealRegInvalid { + continue + } + + use := instr.Uses(&a.vs)[0] + useID := use.ID() + useState := s.getVRegState(useID) + if s.phiBlk(useID) != succ && useState.desiredLoc == desiredLocUnspecified { + useState.desiredLoc = newDesiredLocReg(desired) + desiredUpdated = append(desiredUpdated, useID) + } + } + } + + pc = 0 + for instr := blk.InstrIteratorBegin(); instr != nil; instr = blk.InstrIteratorNext() { + if wazevoapi.RegAllocLoggingEnabled { + fmt.Println(instr) + } + + var currentUsedSet RegSet + killSet := a.reals[:0] + + // Gather the set of registers that will be used in the current instruction. + for _, use := range instr.Uses(&a.vs) { + if use.IsRealReg() { + r := use.RealReg() + currentUsedSet = currentUsedSet.add(r) + if a.allocatableSet.has(r) { + killSet = append(killSet, r) + } + } else { + vs := s.getVRegState(use.ID()) + if r := vs.r; r != RealRegInvalid { + currentUsedSet = currentUsedSet.add(r) + } + } + } + + for i, use := range instr.Uses(&a.vs) { + if !use.IsRealReg() { + vs := s.getVRegState(use.ID()) + killed := vs.lastUse == pc + r := vs.r + + if r == RealRegInvalid { + r = s.findOrSpillAllocatable(a, a.regInfo.AllocatableRegisters[use.RegType()], currentUsedSet, + // Prefer the desired register if it's available. 
+ vs.desiredLoc.realReg()) + vs.recordReload(f, blk) + f.ReloadRegisterBefore(use.SetRealReg(r), instr) + s.useRealReg(r, use) + } + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\ttrying to use v%v on %s\n", use.ID(), a.regInfo.RealRegName(r)) + } + instr.AssignUse(i, use.SetRealReg(r)) + currentUsedSet = currentUsedSet.add(r) + if killed { + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\tkill v%d with %s\n", use.ID(), a.regInfo.RealRegName(r)) + } + killSet = append(killSet, r) + } + } + } + + isIndirect := instr.IsIndirectCall() + call := instr.IsCall() || isIndirect + if call { + addr := RealRegInvalid + if instr.IsIndirectCall() { + addr = a.vs[0].RealReg() + } + a.releaseCallerSavedRegs(addr) + } + + for _, r := range killSet { + s.releaseRealReg(r) + } + a.reals = killSet + + defs := instr.Defs(&a.vs) + switch { + case len(defs) > 1: + // Some instructions define multiple values on real registers. + // E.g. call instructions (following calling convention) / div instruction on x64 that defines both rax and rdx. + // + // Note that currently I assume that such instructions define only the pre colored real registers, not the VRegs + // that require allocations. If we need to support such case, we need to add the logic to handle it here, + // though is there any such instruction? 
+ for _, def := range defs { + if !def.IsRealReg() { + panic("BUG: multiple defs should be on real registers") + } + r := def.RealReg() + if s.regsInUse.has(r) { + s.releaseRealReg(r) + } + s.useRealReg(r, def) + } + case len(defs) == 1: + def := defs[0] + if def.IsRealReg() { + r := def.RealReg() + if a.allocatableSet.has(r) { + if s.regsInUse.has(r) { + s.releaseRealReg(r) + } + s.useRealReg(r, def) + } + } else { + vState := s.getVRegState(def.ID()) + r := vState.r + + if desired := vState.desiredLoc.realReg(); desired != RealRegInvalid { + if r != desired { + if (vState.isPhi && vState.defBlk == succ) || + // If this is not a phi and it's already assigned a real reg, + // this value has multiple definitions, hence we cannot assign the desired register. + (!s.regsInUse.has(desired) && r == RealRegInvalid) { + // If the phi value is passed via a real register, we force the value to be in the desired register. + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\t\tv%d is phi and desiredReg=%s\n", def.ID(), a.regInfo.RealRegName(desired)) + } + if r != RealRegInvalid { + // If the value is already in a different real register, we release it to change the state. + // Otherwise, multiple registers might have the same values at the end, which results in + // messing up the merge state reconciliation. + s.releaseRealReg(r) + } + r = desired + s.releaseRealReg(r) + s.useRealReg(r, def) + } + } + } + + // Allocate a new real register if `def` is not currently assigned one. + // It can happen when multiple instructions define the same VReg (e.g. const loads). 
+ if r == RealRegInvalid { + if instr.IsCopy() { + copySrc := instr.Uses(&a.vs)[0].RealReg() + if a.allocatableSet.has(copySrc) && !s.regsInUse.has(copySrc) { + r = copySrc + } + } + if r == RealRegInvalid { + typ := def.RegType() + r = s.findOrSpillAllocatable(a, a.regInfo.AllocatableRegisters[typ], RegSet(0), RealRegInvalid) + } + s.useRealReg(r, def) + } + dr := def.SetRealReg(r) + instr.AssignDef(dr) + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\tdefining v%d with %s\n", def.ID(), a.regInfo.RealRegName(r)) + } + if vState.isPhi { + if vState.desiredLoc.stack() { // Stack based phi value. + f.StoreRegisterAfter(dr, instr) + // Release the real register as it's not used anymore. + s.releaseRealReg(r) + } else { + // Only the register based phis are necessary to track the defining instructions + // since the stack-based phis are already having stores inserted ^. + n := a.phiDefInstListPool.Allocate() + n.instr = instr + n.next = vState.phiDefInstList + n.v = dr + vState.phiDefInstList = n + } + } else { + vState.defInstr = instr + vState.defBlk = blk + } + } + } + if wazevoapi.RegAllocLoggingEnabled { + fmt.Println(instr) + } + pc++ + } + + s.regsInUse.range_(func(allocated RealReg, v VReg) { + currentBlkState.endRegs.add(allocated, v) + }) + + currentBlkState.visited = true + if wazevoapi.RegAllocLoggingEnabled { + currentBlkState.dump(a.regInfo) + } + + // Reset the desired end location. + for _, v := range desiredUpdated { + vs := s.getVRegState(v) + vs.desiredLoc = desiredLocUnspecified + } + a.vs2 = desiredUpdated[:0] + + for i := 0; i < blk.Succs(); i++ { + succ := blk.Succ(i) + if succ == nil { + continue + } + // If the successor is not visited yet, finalize the start state. + a.finalizeStartReg(succ) + } +} + +func (a *Allocator) releaseCallerSavedRegs(addrReg RealReg) { + s := &a.state + + for i := 0; i < 64; i++ { + allocated := RealReg(i) + if allocated == addrReg { // If this is the call indirect, we should not touch the addr register. 
+ continue + } + if v := s.regsInUse.get(allocated); v.Valid() { + if v.IsRealReg() { + continue // This is the argument register as it's already used by VReg backed by the corresponding RealReg. + } + if !a.regInfo.CallerSavedRegisters.has(allocated) { + // If this is not a caller-saved register, it is safe to keep it across the call. + continue + } + s.releaseRealReg(allocated) + } + } +} + +func (a *Allocator) fixMergeState(f Function, blk Block) { + preds := blk.Preds() + if preds <= 1 { + return + } + + s := &a.state + + // Restores the state at the beginning of the block. + bID := blk.ID() + blkSt := a.getOrAllocateBlockState(bID) + desiredOccupants := &blkSt.startRegs + aliveOnRegVRegs := make(map[VReg]RealReg) + for i := 0; i < 64; i++ { + r := RealReg(i) + if v := blkSt.startRegs.get(r); v.Valid() { + aliveOnRegVRegs[v] = r + } + } + + if wazevoapi.RegAllocLoggingEnabled { + fmt.Println("fixMergeState", blk.ID(), ":", desiredOccupants.format(a.regInfo)) + } + + s.currentBlockID = bID + a.updateLiveInVRState(a.getOrAllocateBlockState(bID)) + + currentOccupants := &a.currentOccupants + for i := 0; i < preds; i++ { + currentOccupants.reset() + if i == blkSt.startFromPredIndex { + continue + } + + currentOccupantsRev := make(map[VReg]RealReg) + pred := blk.Pred(i) + predSt := a.getOrAllocateBlockState(pred.ID()) + for ii := 0; ii < 64; ii++ { + r := RealReg(ii) + if v := predSt.endRegs.get(r); v.Valid() { + if _, ok := aliveOnRegVRegs[v]; !ok { + continue + } + currentOccupants.add(r, v) + currentOccupantsRev[v] = r + } + } + + s.resetAt(predSt) + + // Finds the free registers if any. 
+ intTmp, floatTmp := VRegInvalid, VRegInvalid + if intFree := s.findAllocatable( + a.regInfo.AllocatableRegisters[RegTypeInt], desiredOccupants.set, + ); intFree != RealRegInvalid { + intTmp = FromRealReg(intFree, RegTypeInt) + } + if floatFree := s.findAllocatable( + a.regInfo.AllocatableRegisters[RegTypeFloat], desiredOccupants.set, + ); floatFree != RealRegInvalid { + floatTmp = FromRealReg(floatFree, RegTypeFloat) + } + + if wazevoapi.RegAllocLoggingEnabled { + fmt.Println("\t", pred.ID(), ":", currentOccupants.format(a.regInfo)) + } + + for ii := 0; ii < 64; ii++ { + r := RealReg(ii) + desiredVReg := desiredOccupants.get(r) + if !desiredVReg.Valid() { + continue + } + + currentVReg := currentOccupants.get(r) + if desiredVReg.ID() == currentVReg.ID() { + continue + } + + typ := desiredVReg.RegType() + var tmpRealReg VReg + if typ == RegTypeInt { + tmpRealReg = intTmp + } else { + tmpRealReg = floatTmp + } + a.reconcileEdge(f, r, pred, currentOccupants, currentOccupantsRev, currentVReg, desiredVReg, tmpRealReg, typ) + } + } +} + +func (a *Allocator) reconcileEdge(f Function, + r RealReg, + pred Block, + currentOccupants *regInUseSet, + currentOccupantsRev map[VReg]RealReg, + currentVReg, desiredVReg VReg, + freeReg VReg, + typ RegType, +) { + s := &a.state + if currentVReg.Valid() { + // Both are on reg. + er, ok := currentOccupantsRev[desiredVReg] + if !ok { + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\t\tv%d is desired to be on %s, but currently on the stack\n", + desiredVReg.ID(), a.regInfo.RealRegName(r), + ) + } + // This case is that the desired value is on the stack, but currentVReg is on the target register. + // We need to move the current value to the stack, and reload the desired value. + // TODO: we can do better here. 
+ f.StoreRegisterBefore(currentVReg.SetRealReg(r), pred.LastInstrForInsertion()) + delete(currentOccupantsRev, currentVReg) + + s.getVRegState(desiredVReg.ID()).recordReload(f, pred) + f.ReloadRegisterBefore(desiredVReg.SetRealReg(r), pred.LastInstrForInsertion()) + currentOccupants.add(r, desiredVReg) + currentOccupantsRev[desiredVReg] = r + return + } + + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\t\tv%d is desired to be on %s, but currently on %s\n", + desiredVReg.ID(), a.regInfo.RealRegName(r), a.regInfo.RealRegName(er), + ) + } + f.SwapBefore( + currentVReg.SetRealReg(r), + desiredVReg.SetRealReg(er), + freeReg, + pred.LastInstrForInsertion(), + ) + s.allocatedRegSet = s.allocatedRegSet.add(freeReg.RealReg()) + currentOccupantsRev[desiredVReg] = r + currentOccupantsRev[currentVReg] = er + currentOccupants.add(r, desiredVReg) + currentOccupants.add(er, currentVReg) + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\t\tv%d previously on %s moved to %s\n", currentVReg.ID(), a.regInfo.RealRegName(r), a.regInfo.RealRegName(er)) + } + } else { + // Desired is on reg, but currently the target register is not used. 
+ if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\t\tv%d is desired to be on %s, current not used\n", + desiredVReg.ID(), a.regInfo.RealRegName(r), + ) + } + if currentReg, ok := currentOccupantsRev[desiredVReg]; ok { + f.InsertMoveBefore( + FromRealReg(r, typ), + desiredVReg.SetRealReg(currentReg), + pred.LastInstrForInsertion(), + ) + currentOccupants.remove(currentReg) + } else { + s.getVRegState(desiredVReg.ID()).recordReload(f, pred) + f.ReloadRegisterBefore(desiredVReg.SetRealReg(r), pred.LastInstrForInsertion()) + } + currentOccupantsRev[desiredVReg] = r + currentOccupants.add(r, desiredVReg) + } + + if wazevoapi.RegAllocLoggingEnabled { + fmt.Println("\t", pred.ID(), ":", currentOccupants.format(a.regInfo)) + } +} + +func (a *Allocator) scheduleSpills(f Function) { + states := a.state.vrStates + for i := 0; i <= states.MaxIDEncountered(); i++ { + vs := states.Get(i) + if vs == nil { + continue + } + if vs.spilled { + a.scheduleSpill(f, vs) + } + } +} + +func (a *Allocator) scheduleSpill(f Function, vs *vrState) { + v := vs.v + // If the value is the phi value, we need to insert a spill after each phi definition. + if vs.isPhi { + for defInstr := vs.phiDefInstList; defInstr != nil; defInstr = defInstr.next { + f.StoreRegisterAfter(defInstr.v, defInstr.instr) + } + return + } + + pos := vs.lca + definingBlk := vs.defBlk + r := RealRegInvalid + if definingBlk == nil { + panic(fmt.Sprintf("BUG: definingBlk should not be nil for %s. This is likley a bug in backend lowering logic", vs.v.String())) + } + if pos == nil { + panic(fmt.Sprintf("BUG: pos should not be nil for %s. 
This is likley a bug in backend lowering logic", vs.v.String())) + } + + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("v%d is spilled in blk%d, lca=blk%d\n", v.ID(), definingBlk.ID(), pos.ID()) + } + for pos != definingBlk { + st := a.getOrAllocateBlockState(pos.ID()) + for ii := 0; ii < 64; ii++ { + rr := RealReg(ii) + if st.startRegs.get(rr) == v { + r = rr + // Already in the register, so we can place the spill at the beginning of the block. + break + } + } + + if r != RealRegInvalid { + break + } + + pos = f.Idom(pos) + } + + if pos == definingBlk { + defInstr := vs.defInstr + defInstr.Defs(&a.vs) + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("schedule spill v%d after %v\n", v.ID(), defInstr) + } + f.StoreRegisterAfter(a.vs[0], defInstr) + } else { + // Found an ancestor block that holds the value in the register at the beginning of the block. + // We need to insert a spill before the last use. + first := pos.FirstInstr() + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("schedule spill v%d before %v\n", v.ID(), first) + } + f.StoreRegisterAfter(v.SetRealReg(r), first) + } +} + +// Reset resets the allocator's internal state so that it can be reused. +func (a *Allocator) Reset() { + a.state.reset() + a.blockStates.Reset() + a.phiDefInstListPool.Reset() + a.vs = a.vs[:0] +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go new file mode 100644 index 000000000..e9bf60661 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go @@ -0,0 +1,108 @@ +package regalloc + +import ( + "fmt" + "strings" +) + +// NewRegSet returns a new RegSet with the given registers. +func NewRegSet(regs ...RealReg) RegSet { + var ret RegSet + for _, r := range regs { + ret = ret.add(r) + } + return ret +} + +// RegSet represents a set of registers. 
+type RegSet uint64 + +func (rs RegSet) format(info *RegisterInfo) string { //nolint:unused + var ret []string + for i := 0; i < 64; i++ { + if rs&(1<<uint(i)) != 0 { + ret = append(ret, info.RealRegName(RealReg(i))) + } + } + return strings.Join(ret, ", ") +} + +func (rs RegSet) has(r RealReg) bool { + return rs&(1<<uint(r)) != 0 +} + +func (rs RegSet) add(r RealReg) RegSet { + if r >= 64 { + return rs + } + return rs | 1<<uint(r) +} + +func (rs RegSet) Range(f func(allocatedRealReg RealReg)) { + for i := 0; i < 64; i++ { + if rs&(1<<uint(i)) != 0 { + f(RealReg(i)) + } + } +} + +type regInUseSet struct { + set RegSet + vrs [64]VReg +} + +func (rs *regInUseSet) reset() { + rs.set = 0 + for i := range rs.vrs { + rs.vrs[i] = VRegInvalid + } +} + +func (rs *regInUseSet) format(info *RegisterInfo) string { //nolint:unused + var ret []string + for i := 0; i < 64; i++ { + if rs.set&(1<<uint(i)) != 0 { + vr := rs.vrs[i] + ret = append(ret, fmt.Sprintf("(%s->v%d)", info.RealRegName(RealReg(i)), vr.ID())) + } + } + return strings.Join(ret, ", ") +} + +func (rs *regInUseSet) has(r RealReg) bool { + if r >= 64 { + return false + } + return rs.set&(1<<uint(r)) != 0 +} + +func (rs *regInUseSet) get(r RealReg) VReg { + if r >= 64 { + return VRegInvalid + } + return rs.vrs[r] +} + +func (rs *regInUseSet) remove(r RealReg) { + if r >= 64 { + return + } + rs.set &= ^(1 << uint(r)) + rs.vrs[r] = VRegInvalid +} + +func (rs *regInUseSet) add(r RealReg, vr VReg) { + if r >= 64 { + return + } + rs.set |= 1 << uint(r) + rs.vrs[r] = vr +} + +func (rs *regInUseSet) range_(f func(allocatedRealReg RealReg, vr VReg)) { + for i := 0; i < 64; i++ { + if rs.set&(1<<uint(i)) != 0 { + f(RealReg(i), rs.vrs[i]) + } + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/vdef.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/vdef.go new file mode 100644 index 000000000..edfa962b5 --- /dev/null +++ 
b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/vdef.go @@ -0,0 +1,43 @@ +package backend + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// SSAValueDefinition represents a definition of an SSA value. +type SSAValueDefinition struct { + // BlockParamValue is valid if Instr == nil + BlockParamValue ssa.Value + + // BlkParamVReg is valid if Instr == nil + BlkParamVReg regalloc.VReg + + // Instr is not nil if this is a definition from an instruction. + Instr *ssa.Instruction + // N is the index of the return value in the instr's return values list. + N int + // RefCount is the number of references to the result. + RefCount int +} + +func (d *SSAValueDefinition) IsFromInstr() bool { + return d.Instr != nil +} + +func (d *SSAValueDefinition) IsFromBlockParam() bool { + return d.Instr == nil +} + +func (d *SSAValueDefinition) SSAValue() ssa.Value { + if d.IsFromBlockParam() { + return d.BlockParamValue + } else { + r, rs := d.Instr.Returns() + if d.N == 0 { + return r + } else { + return rs[d.N-1] + } + } +} |