path: root/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend
author kim <89579420+NyaaaWhatsUpDoc@users.noreply.github.com> 2024-05-27 15:46:15 +0000
committer GitHub <noreply@github.com> 2024-05-27 17:46:15 +0200
commit 1e7b32490dfdccddd04f46d4b0416b48d749d51b
tree 62a11365933a5a11e0800af64cbdf9172e5e6e7a /vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend
parent [chore] Small styling + link issues (#2933)
download gotosocial-1e7b32490dfdccddd04f46d4b0416b48d749d51b.tar.xz
[experiment] add alternative wasm sqlite3 implementation available via build-tag (#2863)
This allows for building GoToSocial with [SQLite transpiled to WASM](https://github.com/ncruces/go-sqlite3) and accessed through [Wazero](https://wazero.io/).
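For context on the build-tag mechanism mentioned in the title, here is a minimal sketch of how a Go build tag typically gates an alternative implementation. The tag name wasmsqlite3, the package name db, and the function below are illustrative placeholders, not the identifiers actually used by this pull request.

// driver_wasm.go: compiled only when the binary is built with a command like
// `go build -tags wasmsqlite3 ./...`. A mirror file guarded by
// `//go:build !wasmsqlite3` would provide the default (non-WASM) implementation.
// Tag, package, and function names here are placeholders for illustration.

//go:build wasmsqlite3

package db

// SQLiteFlavor reports which SQLite implementation was compiled into the binary.
func SQLiteFlavor() string { return "wasm (go-sqlite3 run via wazero)" }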
Diffstat (limited to 'vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend')
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/abi.go | 170
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/backend.go | 3
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler.go | 417
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler_lower.go | 226
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go | 219
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/go_call.go | 33
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi.go | 186
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.go | 9
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.s | 29
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_preamble.go | 248
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_go_call.go | 443
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/cond.go | 168
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/ext.go | 35
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr.go | 2472
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr_encoding.go | 1683
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_constant.go | 71
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_mem.go | 187
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go | 3611
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_pro_epi_logue.go | 304
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_regalloc.go | 153
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_vec.go | 992
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/operands.go | 346
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect.go | 11
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect_tinygo.go | 11
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reg.go | 181
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/stack.go | 128
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go | 332
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.go | 9
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.s | 29
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go | 230
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go | 428
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/cond.go | 215
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go | 2545
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go | 2351
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go | 301
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go | 2221
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr_operands.go | 350
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go | 440
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go | 515
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go | 469
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go | 152
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_relocation.go | 117
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/reg.go | 397
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/unwind_stack.go | 90
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/machine.go | 100
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go | 319
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/api.go | 136
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/reg.go | 123
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go | 1212
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go | 108
-rw-r--r-- vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/vdef.go | 43
51 files changed, 25568 insertions, 0 deletions
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/abi.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/abi.go
new file mode 100644
index 000000000..cf91c6b7a
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/abi.go
@@ -0,0 +1,170 @@
+package backend
+
+import (
+ "fmt"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+)
+
+type (
+ // FunctionABI represents the ABI information for a function which corresponds to a ssa.Signature.
+ FunctionABI struct {
+ Initialized bool
+
+ Args, Rets []ABIArg
+ ArgStackSize, RetStackSize int64
+
+ ArgIntRealRegs byte
+ ArgFloatRealRegs byte
+ RetIntRealRegs byte
+ RetFloatRealRegs byte
+ }
+
+ // ABIArg represents the location of either an argument or a return value.
+ ABIArg struct {
+ // Index is the index of the argument.
+ Index int
+ // Kind is the kind of the argument.
+ Kind ABIArgKind
+ // Reg is valid if Kind == ABIArgKindReg.
+ // This VReg must be based on RealReg.
+ Reg regalloc.VReg
+ // Offset is valid if Kind == ABIArgKindStack.
+ // This is the offset from the beginning of either arg or ret stack slot.
+ Offset int64
+ // Type is the type of the argument.
+ Type ssa.Type
+ }
+
+ // ABIArgKind is the kind of ABI argument.
+ ABIArgKind byte
+)
+
+const (
+ // ABIArgKindReg represents an argument passed in a register.
+ ABIArgKindReg = iota
+ // ABIArgKindStack represents an argument passed on the stack.
+ ABIArgKindStack
+)
+
+// String implements fmt.Stringer.
+func (a *ABIArg) String() string {
+ return fmt.Sprintf("args[%d]: %s", a.Index, a.Kind)
+}
+
+// String implements fmt.Stringer.
+func (a ABIArgKind) String() string {
+ switch a {
+ case ABIArgKindReg:
+ return "reg"
+ case ABIArgKindStack:
+ return "stack"
+ default:
+ panic("BUG")
+ }
+}
+
+// Init initializes the FunctionABI for the given signature.
+func (a *FunctionABI) Init(sig *ssa.Signature, argResultInts, argResultFloats []regalloc.RealReg) {
+ if len(a.Rets) < len(sig.Results) {
+ a.Rets = make([]ABIArg, len(sig.Results))
+ }
+ a.Rets = a.Rets[:len(sig.Results)]
+ a.RetStackSize = a.setABIArgs(a.Rets, sig.Results, argResultInts, argResultFloats)
+ if argsNum := len(sig.Params); len(a.Args) < argsNum {
+ a.Args = make([]ABIArg, argsNum)
+ }
+ a.Args = a.Args[:len(sig.Params)]
+ a.ArgStackSize = a.setABIArgs(a.Args, sig.Params, argResultInts, argResultFloats)
+
+ // Gather the real register usage in args/returns.
+ a.ArgIntRealRegs, a.ArgFloatRealRegs = 0, 0
+ a.RetIntRealRegs, a.RetFloatRealRegs = 0, 0
+ for i := range a.Rets {
+ r := &a.Rets[i]
+ if r.Kind == ABIArgKindReg {
+ if r.Type.IsInt() {
+ a.RetIntRealRegs++
+ } else {
+ a.RetFloatRealRegs++
+ }
+ }
+ }
+ for i := range a.Args {
+ arg := &a.Args[i]
+ if arg.Kind == ABIArgKindReg {
+ if arg.Type.IsInt() {
+ a.ArgIntRealRegs++
+ } else {
+ a.ArgFloatRealRegs++
+ }
+ }
+ }
+
+ a.Initialized = true
+}
+
+// setABIArgs sets the ABI arguments in the given slice. This assumes that len(s) >= len(types)
+// where if len(s) > len(types), the last elements of s are for the multi-return slot.
+func (a *FunctionABI) setABIArgs(s []ABIArg, types []ssa.Type, ints, floats []regalloc.RealReg) (stackSize int64) {
+ il, fl := len(ints), len(floats)
+
+ var stackOffset int64
+ intParamIndex, floatParamIndex := 0, 0
+ for i, typ := range types {
+ arg := &s[i]
+ arg.Index = i
+ arg.Type = typ
+ if typ.IsInt() {
+ if intParamIndex >= il {
+ arg.Kind = ABIArgKindStack
+ const slotSize = 8 // Align 8 bytes.
+ arg.Offset = stackOffset
+ stackOffset += slotSize
+ } else {
+ arg.Kind = ABIArgKindReg
+ arg.Reg = regalloc.FromRealReg(ints[intParamIndex], regalloc.RegTypeInt)
+ intParamIndex++
+ }
+ } else {
+ if floatParamIndex >= fl {
+ arg.Kind = ABIArgKindStack
+ slotSize := int64(8) // Align at least 8 bytes.
+ if typ.Bits() == 128 { // Vector.
+ slotSize = 16
+ }
+ arg.Offset = stackOffset
+ stackOffset += slotSize
+ } else {
+ arg.Kind = ABIArgKindReg
+ arg.Reg = regalloc.FromRealReg(floats[floatParamIndex], regalloc.RegTypeFloat)
+ floatParamIndex++
+ }
+ }
+ }
+ return stackOffset
+}
+
+func (a *FunctionABI) AlignedArgResultStackSlotSize() uint32 {
+ stackSlotSize := a.RetStackSize + a.ArgStackSize
+ // Align stackSlotSize to 16 bytes.
+ stackSlotSize = (stackSlotSize + 15) &^ 15
+ // Check for 32-bit overflow.
+ if stackSlotSize > 0xFFFFFFFF {
+ panic("ABI stack slot size overflow")
+ }
+ return uint32(stackSlotSize)
+}
+
+func (a *FunctionABI) ABIInfoAsUint64() uint64 {
+ return uint64(a.ArgIntRealRegs)<<56 |
+ uint64(a.ArgFloatRealRegs)<<48 |
+ uint64(a.RetIntRealRegs)<<40 |
+ uint64(a.RetFloatRealRegs)<<32 |
+ uint64(a.AlignedArgResultStackSlotSize())
+}
+
+func ABIInfoFromUint64(info uint64) (argIntRealRegs, argFloatRealRegs, retIntRealRegs, retFloatRealRegs byte, stackSlotSize uint32) {
+ return byte(info >> 56), byte(info >> 48), byte(info >> 40), byte(info >> 32), uint32(info)
+}
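The two functions above pack and unpack the same 64-bit layout: the four per-kind register counts occupy the top four bytes, and the 16-byte-aligned stack slot size occupies the low 32 bits. Below is a minimal standalone sketch of that round trip; the helper names are ours, only the bit layout mirrors the code above.

package main

import "fmt"

// packABIInfo mirrors FunctionABI.ABIInfoAsUint64 above: four byte-sized
// register counts in the top four bytes, the 32-bit stack slot size below.
func packABIInfo(argInt, argFloat, retInt, retFloat byte, stackSlotSize uint32) uint64 {
	return uint64(argInt)<<56 |
		uint64(argFloat)<<48 |
		uint64(retInt)<<40 |
		uint64(retFloat)<<32 |
		uint64(stackSlotSize)
}

// unpackABIInfo mirrors ABIInfoFromUint64 above.
func unpackABIInfo(info uint64) (argInt, argFloat, retInt, retFloat byte, stackSlotSize uint32) {
	return byte(info >> 56), byte(info >> 48), byte(info >> 40), byte(info >> 32), uint32(info)
}

func main() {
	info := packABIInfo(2, 1, 1, 0, 32)
	fmt.Printf("packed: %#x\n", info) // 0x201010000000020
	fmt.Println(unpackABIInfo(info))  // 2 1 1 0 32
}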
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/backend.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/backend.go
new file mode 100644
index 000000000..dd67da43e
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/backend.go
@@ -0,0 +1,3 @@
+// Package backend must be free of Wasm-specific concepts. In other words,
+// this package must not import internal/wasm package.
+package backend
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler.go
new file mode 100644
index 000000000..59bbfe02d
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler.go
@@ -0,0 +1,417 @@
+package backend
+
+import (
+ "context"
+ "fmt"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
+)
+
+// NewCompiler returns a new Compiler that can generate machine code.
+func NewCompiler(ctx context.Context, mach Machine, builder ssa.Builder) Compiler {
+ return newCompiler(ctx, mach, builder)
+}
+
+func newCompiler(_ context.Context, mach Machine, builder ssa.Builder) *compiler {
+ argResultInts, argResultFloats := mach.ArgsResultsRegs()
+ c := &compiler{
+ mach: mach, ssaBuilder: builder,
+ nextVRegID: regalloc.VRegIDNonReservedBegin,
+ argResultInts: argResultInts,
+ argResultFloats: argResultFloats,
+ }
+ mach.SetCompiler(c)
+ return c
+}
+
+// Compiler is the backend of wazevo which takes ssa.Builder and Machine and
+// uses the information there to emit the final machine code.
+type Compiler interface {
+ // SSABuilder returns the ssa.Builder used by this compiler.
+ SSABuilder() ssa.Builder
+
+ // Compile executes the following steps:
+ // 1. Lower()
+ // 2. RegAlloc()
+ // 3. Finalize()
+ // 4. Encode()
+ //
+ // Each step can be called individually for testing purposes, therefore they are exposed in this interface too.
+ //
+ // The returned byte slices are the machine code and the relocation information for the machine code.
+ // The caller is responsible for copying them immediately since the compiler may reuse the buffer.
+ Compile(ctx context.Context) (_ []byte, _ []RelocationInfo, _ error)
+
+ // Lower lowers the given ssa.Instruction to the machine-specific instructions.
+ Lower()
+
+ // RegAlloc performs the register allocation after Lower is called.
+ RegAlloc()
+
+ // Finalize performs the finalization of the compilation, including machine code emission.
+ // This must be called after RegAlloc.
+ Finalize(ctx context.Context) error
+
+ // Buf returns the buffer of the encoded machine code. This is only used for testing purposes.
+ Buf() []byte
+
+ BufPtr() *[]byte
+
+ // Format returns the debug string of the current state of the compiler.
+ Format() string
+
+ // Init initializes the internal state of the compiler for the next compilation.
+ Init()
+
+ // AllocateVReg allocates a new virtual register of the given type.
+ AllocateVReg(typ ssa.Type) regalloc.VReg
+
+ // ValueDefinition returns the definition of the given value.
+ ValueDefinition(ssa.Value) *SSAValueDefinition
+
+ // VRegOf returns the virtual register of the given ssa.Value.
+ VRegOf(value ssa.Value) regalloc.VReg
+
+ // TypeOf returns the ssa.Type of the given virtual register.
+ TypeOf(regalloc.VReg) ssa.Type
+
+ // MatchInstr returns true if the given definition is from an instruction with the given opcode, the current group ID,
+ // and a refcount of 1. That means, the instruction can be merged/swapped within the current instruction group.
+ MatchInstr(def *SSAValueDefinition, opcode ssa.Opcode) bool
+
+ // MatchInstrOneOf is the same as MatchInstr but for multiple opcodes. If it matches one of ssa.Opcode,
+ // this returns the opcode. Otherwise, this returns ssa.OpcodeInvalid.
+ //
+ // Note: caller should be careful to avoid excessive allocation on opcodes slice.
+ MatchInstrOneOf(def *SSAValueDefinition, opcodes []ssa.Opcode) ssa.Opcode
+
+ // AddRelocationInfo appends the relocation information for the function reference at the current buffer offset.
+ AddRelocationInfo(funcRef ssa.FuncRef)
+
+ // AddSourceOffsetInfo appends the source offset information for the given offset.
+ AddSourceOffsetInfo(executableOffset int64, sourceOffset ssa.SourceOffset)
+
+ // SourceOffsetInfo returns the source offset information for the current buffer offset.
+ SourceOffsetInfo() []SourceOffsetInfo
+
+ // EmitByte appends a byte to the buffer. Used during the code emission.
+ EmitByte(b byte)
+
+ // Emit4Bytes appends 4 bytes to the buffer. Used during the code emission.
+ Emit4Bytes(b uint32)
+
+ // Emit8Bytes appends 8 bytes to the buffer. Used during the code emission.
+ Emit8Bytes(b uint64)
+
+ // GetFunctionABI returns the ABI information for the given signature.
+ GetFunctionABI(sig *ssa.Signature) *FunctionABI
+}
+
+// RelocationInfo represents the relocation information for a call instruction.
+type RelocationInfo struct {
+ // Offset represents the offset from the beginning of the machine code of either a function or the entire module.
+ Offset int64
+ // Target is the target function of the call instruction.
+ FuncRef ssa.FuncRef
+}
+
+// compiler implements Compiler.
+type compiler struct {
+ mach Machine
+ currentGID ssa.InstructionGroupID
+ ssaBuilder ssa.Builder
+ // nextVRegID is the next virtual register ID to be allocated.
+ nextVRegID regalloc.VRegID
+ // ssaValueToVRegs maps ssa.ValueID to regalloc.VReg.
+ ssaValueToVRegs [] /* VRegID to */ regalloc.VReg
+ // ssaValueDefinitions maps ssa.ValueID to its definition.
+ ssaValueDefinitions []SSAValueDefinition
+ // ssaValueRefCounts is a cached list obtained by ssa.Builder.ValueRefCounts().
+ ssaValueRefCounts []int
+ // returnVRegs is the list of virtual registers that store the return values.
+ returnVRegs []regalloc.VReg
+ varEdges [][2]regalloc.VReg
+ varEdgeTypes []ssa.Type
+ constEdges []struct {
+ cInst *ssa.Instruction
+ dst regalloc.VReg
+ }
+ vRegSet []bool
+ vRegIDs []regalloc.VRegID
+ tempRegs []regalloc.VReg
+ tmpVals []ssa.Value
+ ssaTypeOfVRegID [] /* VRegID to */ ssa.Type
+ buf []byte
+ relocations []RelocationInfo
+ sourceOffsets []SourceOffsetInfo
+ // abis maps ssa.SignatureID to the ABI implementation.
+ abis []FunctionABI
+ argResultInts, argResultFloats []regalloc.RealReg
+}
+
+// SourceOffsetInfo associates a source code offset with an offset in the compiled executable.
+type SourceOffsetInfo struct {
+ // SourceOffset is the source offset in the original source code.
+ SourceOffset ssa.SourceOffset
+ // ExecutableOffset is the offset in the compiled executable.
+ ExecutableOffset int64
+}
+
+// Compile implements Compiler.Compile.
+func (c *compiler) Compile(ctx context.Context) ([]byte, []RelocationInfo, error) {
+ c.Lower()
+ if wazevoapi.PrintSSAToBackendIRLowering && wazevoapi.PrintEnabledIndex(ctx) {
+ fmt.Printf("[[[after lowering for %s ]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), c.Format())
+ }
+ if wazevoapi.DeterministicCompilationVerifierEnabled {
+ wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "After lowering to ISA specific IR", c.Format())
+ }
+ c.RegAlloc()
+ if wazevoapi.PrintRegisterAllocated && wazevoapi.PrintEnabledIndex(ctx) {
+ fmt.Printf("[[[after regalloc for %s]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), c.Format())
+ }
+ if wazevoapi.DeterministicCompilationVerifierEnabled {
+ wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "After Register Allocation", c.Format())
+ }
+ if err := c.Finalize(ctx); err != nil {
+ return nil, nil, err
+ }
+ if wazevoapi.PrintFinalizedMachineCode && wazevoapi.PrintEnabledIndex(ctx) {
+ fmt.Printf("[[[after finalize for %s]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), c.Format())
+ }
+ if wazevoapi.DeterministicCompilationVerifierEnabled {
+ wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "After Finalization", c.Format())
+ }
+ return c.buf, c.relocations, nil
+}
+
+// RegAlloc implements Compiler.RegAlloc.
+func (c *compiler) RegAlloc() {
+ c.mach.RegAlloc()
+}
+
+// Finalize implements Compiler.Finalize.
+func (c *compiler) Finalize(ctx context.Context) error {
+ c.mach.PostRegAlloc()
+ return c.mach.Encode(ctx)
+}
+
+// setCurrentGroupID sets the current instruction group ID.
+func (c *compiler) setCurrentGroupID(gid ssa.InstructionGroupID) {
+ c.currentGID = gid
+}
+
+// assignVirtualRegisters assigns a virtual register to each valid ssa.ValueID in the ssa.Builder.
+func (c *compiler) assignVirtualRegisters() {
+ builder := c.ssaBuilder
+ refCounts := builder.ValueRefCounts()
+ c.ssaValueRefCounts = refCounts
+
+ need := len(refCounts)
+ if need >= len(c.ssaValueToVRegs) {
+ c.ssaValueToVRegs = append(c.ssaValueToVRegs, make([]regalloc.VReg, need+1)...)
+ }
+ if need >= len(c.ssaValueDefinitions) {
+ c.ssaValueDefinitions = append(c.ssaValueDefinitions, make([]SSAValueDefinition, need+1)...)
+ }
+
+ for blk := builder.BlockIteratorReversePostOrderBegin(); blk != nil; blk = builder.BlockIteratorReversePostOrderNext() {
+ // First we assign a virtual register to each parameter.
+ for i := 0; i < blk.Params(); i++ {
+ p := blk.Param(i)
+ pid := p.ID()
+ typ := p.Type()
+ vreg := c.AllocateVReg(typ)
+ c.ssaValueToVRegs[pid] = vreg
+ c.ssaValueDefinitions[pid] = SSAValueDefinition{BlockParamValue: p, BlkParamVReg: vreg}
+ c.ssaTypeOfVRegID[vreg.ID()] = p.Type()
+ }
+
+ // Then we assign a virtual register to each value produced by the instructions.
+ for cur := blk.Root(); cur != nil; cur = cur.Next() {
+ r, rs := cur.Returns()
+ var N int
+ if r.Valid() {
+ id := r.ID()
+ ssaTyp := r.Type()
+ typ := r.Type()
+ vReg := c.AllocateVReg(typ)
+ c.ssaValueToVRegs[id] = vReg
+ c.ssaValueDefinitions[id] = SSAValueDefinition{
+ Instr: cur,
+ N: 0,
+ RefCount: refCounts[id],
+ }
+ c.ssaTypeOfVRegID[vReg.ID()] = ssaTyp
+ N++
+ }
+ for _, r := range rs {
+ id := r.ID()
+ ssaTyp := r.Type()
+ vReg := c.AllocateVReg(ssaTyp)
+ c.ssaValueToVRegs[id] = vReg
+ c.ssaValueDefinitions[id] = SSAValueDefinition{
+ Instr: cur,
+ N: N,
+ RefCount: refCounts[id],
+ }
+ c.ssaTypeOfVRegID[vReg.ID()] = ssaTyp
+ N++
+ }
+ }
+ }
+
+ for i, retBlk := 0, builder.ReturnBlock(); i < retBlk.Params(); i++ {
+ typ := retBlk.Param(i).Type()
+ vReg := c.AllocateVReg(typ)
+ c.returnVRegs = append(c.returnVRegs, vReg)
+ c.ssaTypeOfVRegID[vReg.ID()] = typ
+ }
+}
+
+// AllocateVReg implements Compiler.AllocateVReg.
+func (c *compiler) AllocateVReg(typ ssa.Type) regalloc.VReg {
+ regType := regalloc.RegTypeOf(typ)
+ r := regalloc.VReg(c.nextVRegID).SetRegType(regType)
+
+ id := r.ID()
+ if int(id) >= len(c.ssaTypeOfVRegID) {
+ c.ssaTypeOfVRegID = append(c.ssaTypeOfVRegID, make([]ssa.Type, id+1)...)
+ }
+ c.ssaTypeOfVRegID[id] = typ
+ c.nextVRegID++
+ return r
+}
+
+// Init implements Compiler.Init.
+func (c *compiler) Init() {
+ c.currentGID = 0
+ c.nextVRegID = regalloc.VRegIDNonReservedBegin
+ c.returnVRegs = c.returnVRegs[:0]
+ c.mach.Reset()
+ c.varEdges = c.varEdges[:0]
+ c.constEdges = c.constEdges[:0]
+ c.buf = c.buf[:0]
+ c.sourceOffsets = c.sourceOffsets[:0]
+ c.relocations = c.relocations[:0]
+}
+
+// ValueDefinition implements Compiler.ValueDefinition.
+func (c *compiler) ValueDefinition(value ssa.Value) *SSAValueDefinition {
+ return &c.ssaValueDefinitions[value.ID()]
+}
+
+// VRegOf implements Compiler.VRegOf.
+func (c *compiler) VRegOf(value ssa.Value) regalloc.VReg {
+ return c.ssaValueToVRegs[value.ID()]
+}
+
+// Format implements Compiler.Format.
+func (c *compiler) Format() string {
+ return c.mach.Format()
+}
+
+// TypeOf implements Compiler.TypeOf.
+func (c *compiler) TypeOf(v regalloc.VReg) ssa.Type {
+ return c.ssaTypeOfVRegID[v.ID()]
+}
+
+// MatchInstr implements Compiler.MatchInstr.
+func (c *compiler) MatchInstr(def *SSAValueDefinition, opcode ssa.Opcode) bool {
+ instr := def.Instr
+ return def.IsFromInstr() &&
+ instr.Opcode() == opcode &&
+ instr.GroupID() == c.currentGID &&
+ def.RefCount < 2
+}
+
+// MatchInstrOneOf implements Compiler.MatchInstrOneOf.
+func (c *compiler) MatchInstrOneOf(def *SSAValueDefinition, opcodes []ssa.Opcode) ssa.Opcode {
+ instr := def.Instr
+ if !def.IsFromInstr() {
+ return ssa.OpcodeInvalid
+ }
+
+ if instr.GroupID() != c.currentGID {
+ return ssa.OpcodeInvalid
+ }
+
+ if def.RefCount >= 2 {
+ return ssa.OpcodeInvalid
+ }
+
+ opcode := instr.Opcode()
+ for _, op := range opcodes {
+ if opcode == op {
+ return opcode
+ }
+ }
+ return ssa.OpcodeInvalid
+}
+
+// SSABuilder implements Compiler.SSABuilder.
+func (c *compiler) SSABuilder() ssa.Builder {
+ return c.ssaBuilder
+}
+
+// AddSourceOffsetInfo implements Compiler.AddSourceOffsetInfo.
+func (c *compiler) AddSourceOffsetInfo(executableOffset int64, sourceOffset ssa.SourceOffset) {
+ c.sourceOffsets = append(c.sourceOffsets, SourceOffsetInfo{
+ SourceOffset: sourceOffset,
+ ExecutableOffset: executableOffset,
+ })
+}
+
+// SourceOffsetInfo implements Compiler.SourceOffsetInfo.
+func (c *compiler) SourceOffsetInfo() []SourceOffsetInfo {
+ return c.sourceOffsets
+}
+
+// AddRelocationInfo implements Compiler.AddRelocationInfo.
+func (c *compiler) AddRelocationInfo(funcRef ssa.FuncRef) {
+ c.relocations = append(c.relocations, RelocationInfo{
+ Offset: int64(len(c.buf)),
+ FuncRef: funcRef,
+ })
+}
+
+// Emit8Bytes implements Compiler.Emit8Bytes.
+func (c *compiler) Emit8Bytes(b uint64) {
+ c.buf = append(c.buf, byte(b), byte(b>>8), byte(b>>16), byte(b>>24), byte(b>>32), byte(b>>40), byte(b>>48), byte(b>>56))
+}
+
+// Emit4Bytes implements Compiler.Emit4Bytes.
+func (c *compiler) Emit4Bytes(b uint32) {
+ c.buf = append(c.buf, byte(b), byte(b>>8), byte(b>>16), byte(b>>24))
+}
+
+// EmitByte implements Compiler.EmitByte.
+func (c *compiler) EmitByte(b byte) {
+ c.buf = append(c.buf, b)
+}
+
+// Buf implements Compiler.Buf.
+func (c *compiler) Buf() []byte {
+ return c.buf
+}
+
+// BufPtr implements Compiler.BufPtr.
+func (c *compiler) BufPtr() *[]byte {
+ return &c.buf
+}
+
+func (c *compiler) GetFunctionABI(sig *ssa.Signature) *FunctionABI {
+ if int(sig.ID) >= len(c.abis) {
+ c.abis = append(c.abis, make([]FunctionABI, int(sig.ID)+1)...)
+ }
+
+ abi := &c.abis[sig.ID]
+ if abi.Initialized {
+ return abi
+ }
+
+ abi.Init(sig, c.argResultInts, c.argResultFloats)
+ return abi
+}
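Emit4Bytes and Emit8Bytes above append the value least significant byte first, i.e. little-endian, which is the byte order the x86-64 and arm64 encoders in this backend expect. A small standalone sketch (helper name ours) checking that against encoding/binary:

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

// emit4 mirrors compiler.Emit4Bytes above: least-significant byte first.
func emit4(buf []byte, b uint32) []byte {
	return append(buf, byte(b), byte(b>>8), byte(b>>16), byte(b>>24))
}

func main() {
	const word = uint32(0xD503201F) // arbitrary 32-bit value (an arm64 NOP encoding)
	got := emit4(nil, word)

	want := make([]byte, 4)
	binary.LittleEndian.PutUint32(want, word)

	fmt.Printf("emitted bytes: % x\n", got)                             // 1f 20 03 d5
	fmt.Println("same as binary.LittleEndian:", bytes.Equal(got, want)) // true
}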
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler_lower.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler_lower.go
new file mode 100644
index 000000000..80e65668a
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler_lower.go
@@ -0,0 +1,226 @@
+package backend
+
+import (
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+)
+
+// Lower implements Compiler.Lower.
+func (c *compiler) Lower() {
+ c.assignVirtualRegisters()
+ c.mach.SetCurrentABI(c.GetFunctionABI(c.ssaBuilder.Signature()))
+ c.mach.ExecutableContext().StartLoweringFunction(c.ssaBuilder.BlockIDMax())
+ c.lowerBlocks()
+}
+
+// lowerBlocks lowers each block in the ssa.Builder.
+func (c *compiler) lowerBlocks() {
+ builder := c.ssaBuilder
+ for blk := builder.BlockIteratorReversePostOrderBegin(); blk != nil; blk = builder.BlockIteratorReversePostOrderNext() {
+ c.lowerBlock(blk)
+ }
+
+ ectx := c.mach.ExecutableContext()
+ // After lowering all blocks, we need to link adjacent blocks to lay out one single instruction list.
+ var prev ssa.BasicBlock
+ for next := builder.BlockIteratorReversePostOrderBegin(); next != nil; next = builder.BlockIteratorReversePostOrderNext() {
+ if prev != nil {
+ ectx.LinkAdjacentBlocks(prev, next)
+ }
+ prev = next
+ }
+}
+
+func (c *compiler) lowerBlock(blk ssa.BasicBlock) {
+ mach := c.mach
+ ectx := mach.ExecutableContext()
+ ectx.StartBlock(blk)
+
+ // We traverse the instructions in reverse order because we might want to lower multiple
+ // instructions together.
+ cur := blk.Tail()
+
+ // First gather the branching instructions at the end of the blocks.
+ var br0, br1 *ssa.Instruction
+ if cur.IsBranching() {
+ br0 = cur
+ cur = cur.Prev()
+ if cur != nil && cur.IsBranching() {
+ br1 = cur
+ cur = cur.Prev()
+ }
+ }
+
+ if br0 != nil {
+ c.lowerBranches(br0, br1)
+ }
+
+ if br1 != nil && br0 == nil {
+ panic("BUG? when a block has conditional branch but doesn't end with an unconditional branch?")
+ }
+
+ // Now start lowering the non-branching instructions.
+ for ; cur != nil; cur = cur.Prev() {
+ c.setCurrentGroupID(cur.GroupID())
+ if cur.Lowered() {
+ continue
+ }
+
+ switch cur.Opcode() {
+ case ssa.OpcodeReturn:
+ rets := cur.ReturnVals()
+ if len(rets) > 0 {
+ c.mach.LowerReturns(rets)
+ }
+ c.mach.InsertReturn()
+ default:
+ mach.LowerInstr(cur)
+ }
+ ectx.FlushPendingInstructions()
+ }
+
+ // Finally, if this is the entry block, we have to insert copies of the arguments from their real locations to the VRegs.
+ if blk.EntryBlock() {
+ c.lowerFunctionArguments(blk)
+ }
+
+ ectx.EndBlock()
+}
+
+// lowerBranches is called right after StartBlock and before any LowerInstr call if
+// there are branches to the given block. br0 is the branch at the very end of the block, and br1 is the one immediately before br0 if it exists.
+// At least br0 is not nil, but br1 can be nil if there's no branching before br0.
+//
+// See ssa.Instruction IsBranching, and the comment on ssa.BasicBlock.
+func (c *compiler) lowerBranches(br0, br1 *ssa.Instruction) {
+ ectx := c.mach.ExecutableContext()
+
+ c.setCurrentGroupID(br0.GroupID())
+ c.mach.LowerSingleBranch(br0)
+ ectx.FlushPendingInstructions()
+ if br1 != nil {
+ c.setCurrentGroupID(br1.GroupID())
+ c.mach.LowerConditionalBranch(br1)
+ ectx.FlushPendingInstructions()
+ }
+
+ if br0.Opcode() == ssa.OpcodeJump {
+ _, args, target := br0.BranchData()
+ argExists := len(args) != 0
+ if argExists && br1 != nil {
+ panic("BUG: critical edge split failed")
+ }
+ if argExists && target.ReturnBlock() {
+ if len(args) > 0 {
+ c.mach.LowerReturns(args)
+ }
+ } else if argExists {
+ c.lowerBlockArguments(args, target)
+ }
+ }
+ ectx.FlushPendingInstructions()
+}
+
+func (c *compiler) lowerFunctionArguments(entry ssa.BasicBlock) {
+ ectx := c.mach.ExecutableContext()
+
+ c.tmpVals = c.tmpVals[:0]
+ for i := 0; i < entry.Params(); i++ {
+ p := entry.Param(i)
+ if c.ssaValueRefCounts[p.ID()] > 0 {
+ c.tmpVals = append(c.tmpVals, p)
+ } else {
+ // If the argument is not used, we can just pass an invalid value.
+ c.tmpVals = append(c.tmpVals, ssa.ValueInvalid)
+ }
+ }
+ c.mach.LowerParams(c.tmpVals)
+ ectx.FlushPendingInstructions()
+}
+
+// lowerBlockArguments lowers how to pass arguments to the given successor block.
+func (c *compiler) lowerBlockArguments(args []ssa.Value, succ ssa.BasicBlock) {
+ if len(args) != succ.Params() {
+ panic("BUG: mismatched number of arguments")
+ }
+
+ c.varEdges = c.varEdges[:0]
+ c.varEdgeTypes = c.varEdgeTypes[:0]
+ c.constEdges = c.constEdges[:0]
+ for i := 0; i < len(args); i++ {
+ dst := succ.Param(i)
+ src := args[i]
+
+ dstReg := c.VRegOf(dst)
+ srcDef := c.ssaValueDefinitions[src.ID()]
+ if srcDef.IsFromInstr() && srcDef.Instr.Constant() {
+ c.constEdges = append(c.constEdges, struct {
+ cInst *ssa.Instruction
+ dst regalloc.VReg
+ }{cInst: srcDef.Instr, dst: dstReg})
+ } else {
+ srcReg := c.VRegOf(src)
+ // Even when src == dst, insert the move so that such registers are kept alive.
+ c.varEdges = append(c.varEdges, [2]regalloc.VReg{srcReg, dstReg})
+ c.varEdgeTypes = append(c.varEdgeTypes, src.Type())
+ }
+ }
+
+ // Check if there's an overlap among the dsts and srcs in varEdges.
+ c.vRegIDs = c.vRegIDs[:0]
+ for _, edge := range c.varEdges {
+ src := edge[0].ID()
+ if int(src) >= len(c.vRegSet) {
+ c.vRegSet = append(c.vRegSet, make([]bool, src+1)...)
+ }
+ c.vRegSet[src] = true
+ c.vRegIDs = append(c.vRegIDs, src)
+ }
+ separated := true
+ for _, edge := range c.varEdges {
+ dst := edge[1].ID()
+ if int(dst) >= len(c.vRegSet) {
+ c.vRegSet = append(c.vRegSet, make([]bool, dst+1)...)
+ } else {
+ if c.vRegSet[dst] {
+ separated = false
+ break
+ }
+ }
+ }
+ for _, id := range c.vRegIDs {
+ c.vRegSet[id] = false // reset for the next use.
+ }
+
+ if separated {
+ // If there's no overlap, we can simply move the source to destination.
+ for i, edge := range c.varEdges {
+ src, dst := edge[0], edge[1]
+ c.mach.InsertMove(dst, src, c.varEdgeTypes[i])
+ }
+ } else {
+ // Otherwise, we route the values through temporary registers so that a destination that is also a source is not clobbered.
+ //
+ // First move all of them to temporary registers.
+ c.tempRegs = c.tempRegs[:0]
+ for i, edge := range c.varEdges {
+ src := edge[0]
+ typ := c.varEdgeTypes[i]
+ temp := c.AllocateVReg(typ)
+ c.tempRegs = append(c.tempRegs, temp)
+ c.mach.InsertMove(temp, src, typ)
+ }
+ // Then move the temporary registers to the destination.
+ for i, edge := range c.varEdges {
+ temp := c.tempRegs[i]
+ dst := edge[1]
+ c.mach.InsertMove(dst, temp, c.varEdgeTypes[i])
+ }
+ }
+
+ // Finally, move the constants.
+ for _, edge := range c.constEdges {
+ cInst, dst := edge.cInst, edge.dst
+ c.mach.InsertLoadConstantBlockArg(cInst, dst)
+ }
+}
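lowerBlockArguments above falls back to temporary registers whenever a destination VReg also appears as a source, because a direct in-order copy could overwrite a value before it is read (the classic case is a swap between two block parameters). A small standalone sketch of that two-phase copy, using a plain map in place of registers; the names are ours:

package main

import "fmt"

// moveThroughTemps mirrors the fallback path in lowerBlockArguments above:
// every source is first copied to a fresh temporary, and only then are the
// temporaries moved into their destinations.
func moveThroughTemps(regs map[string]int, edges [][2]string) {
	temps := make([]int, len(edges))
	for i, e := range edges {
		temps[i] = regs[e[0]] // phase 1: copy every source into a temporary
	}
	for i, e := range edges {
		regs[e[1]] = temps[i] // phase 2: move the temporaries into the destinations
	}
}

func main() {
	regs := map[string]int{"v1": 10, "v2": 20}
	// A swap: v1 -> v2 and v2 -> v1. A naive in-order copy would leave both at 10.
	moveThroughTemps(regs, [][2]string{{"v1", "v2"}, {"v2", "v1"}})
	fmt.Println(regs["v1"], regs["v2"]) // 20 10
}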
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go
new file mode 100644
index 000000000..81c6a6b62
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go
@@ -0,0 +1,219 @@
+package backend
+
+import (
+ "fmt"
+ "math"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
+)
+
+type ExecutableContext interface {
+ // StartLoweringFunction is called when the lowering of the given function is started.
+ // maximumBlockID is the maximum value of ssa.BasicBlockID existing in the function.
+ StartLoweringFunction(maximumBlockID ssa.BasicBlockID)
+
+ // LinkAdjacentBlocks is called after all blocks have been lowered in order to create one single instruction list.
+ LinkAdjacentBlocks(prev, next ssa.BasicBlock)
+
+ // StartBlock is called when the compilation of the given block is started.
+ // The order of this being called is the reverse post order of the ssa.BasicBlock(s) as we iterate with
+ // ssa.Builder BlockIteratorReversePostOrderBegin and BlockIteratorReversePostOrderEnd.
+ StartBlock(ssa.BasicBlock)
+
+ // EndBlock is called when the compilation of the current block is finished.
+ EndBlock()
+
+ // FlushPendingInstructions flushes the pending instructions to the buffer.
+ // This will be called after the lowering of each SSA Instruction.
+ FlushPendingInstructions()
+}
+
+type ExecutableContextT[Instr any] struct {
+ CurrentSSABlk ssa.BasicBlock
+
+ // InstructionPool is the pool of instructions.
+ InstructionPool wazevoapi.Pool[Instr]
+ asNop func(*Instr)
+ setNext func(*Instr, *Instr)
+ setPrev func(*Instr, *Instr)
+
+ // RootInstr is the root instruction of the executable.
+ RootInstr *Instr
+ labelPositionPool wazevoapi.Pool[LabelPosition[Instr]]
+ NextLabel Label
+ // LabelPositions maps a label to the instructions of the region which the label represents.
+ LabelPositions map[Label]*LabelPosition[Instr]
+ OrderedBlockLabels []*LabelPosition[Instr]
+
+ // PerBlockHead and PerBlockEnd are the head and tail of the instruction list per currently-compiled ssa.BasicBlock.
+ PerBlockHead, PerBlockEnd *Instr
+ // PendingInstructions are the instructions which are not yet emitted into the instruction list.
+ PendingInstructions []*Instr
+
+ // SsaBlockIDToLabels maps an SSA block ID to the label.
+ SsaBlockIDToLabels []Label
+}
+
+func NewExecutableContextT[Instr any](
+ resetInstruction func(*Instr),
+ setNext func(*Instr, *Instr),
+ setPrev func(*Instr, *Instr),
+ asNop func(*Instr),
+) *ExecutableContextT[Instr] {
+ return &ExecutableContextT[Instr]{
+ InstructionPool: wazevoapi.NewPool[Instr](resetInstruction),
+ asNop: asNop,
+ setNext: setNext,
+ setPrev: setPrev,
+ labelPositionPool: wazevoapi.NewPool[LabelPosition[Instr]](resetLabelPosition[Instr]),
+ LabelPositions: make(map[Label]*LabelPosition[Instr]),
+ NextLabel: LabelInvalid,
+ }
+}
+
+func resetLabelPosition[T any](l *LabelPosition[T]) {
+ *l = LabelPosition[T]{}
+}
+
+// StartLoweringFunction implements ExecutableContext.
+func (e *ExecutableContextT[Instr]) StartLoweringFunction(max ssa.BasicBlockID) {
+ imax := int(max)
+ if len(e.SsaBlockIDToLabels) <= imax {
+ // Eagerly allocate labels for the blocks since the underlying slice will be used for the next iteration.
+ e.SsaBlockIDToLabels = append(e.SsaBlockIDToLabels, make([]Label, imax+1)...)
+ }
+}
+
+func (e *ExecutableContextT[Instr]) StartBlock(blk ssa.BasicBlock) {
+ e.CurrentSSABlk = blk
+
+ l := e.SsaBlockIDToLabels[e.CurrentSSABlk.ID()]
+ if l == LabelInvalid {
+ l = e.AllocateLabel()
+ e.SsaBlockIDToLabels[blk.ID()] = l
+ }
+
+ end := e.allocateNop0()
+ e.PerBlockHead, e.PerBlockEnd = end, end
+
+ labelPos, ok := e.LabelPositions[l]
+ if !ok {
+ labelPos = e.AllocateLabelPosition(l)
+ e.LabelPositions[l] = labelPos
+ }
+ e.OrderedBlockLabels = append(e.OrderedBlockLabels, labelPos)
+ labelPos.Begin, labelPos.End = end, end
+ labelPos.SB = blk
+}
+
+// EndBlock implements ExecutableContext.
+func (e *ExecutableContextT[T]) EndBlock() {
+ // Insert nop0 as the head of the block for convenience to simplify the logic of inserting instructions.
+ e.insertAtPerBlockHead(e.allocateNop0())
+
+ l := e.SsaBlockIDToLabels[e.CurrentSSABlk.ID()]
+ e.LabelPositions[l].Begin = e.PerBlockHead
+
+ if e.CurrentSSABlk.EntryBlock() {
+ e.RootInstr = e.PerBlockHead
+ }
+}
+
+func (e *ExecutableContextT[T]) insertAtPerBlockHead(i *T) {
+ if e.PerBlockHead == nil {
+ e.PerBlockHead = i
+ e.PerBlockEnd = i
+ return
+ }
+ e.setNext(i, e.PerBlockHead)
+ e.setPrev(e.PerBlockHead, i)
+ e.PerBlockHead = i
+}
+
+// FlushPendingInstructions implements ExecutableContext.
+func (e *ExecutableContextT[T]) FlushPendingInstructions() {
+ l := len(e.PendingInstructions)
+ if l == 0 {
+ return
+ }
+ for i := l - 1; i >= 0; i-- { // reverse because we lower instructions in reverse order.
+ e.insertAtPerBlockHead(e.PendingInstructions[i])
+ }
+ e.PendingInstructions = e.PendingInstructions[:0]
+}
+
+func (e *ExecutableContextT[T]) Reset() {
+ e.labelPositionPool.Reset()
+ e.InstructionPool.Reset()
+ for l := Label(0); l <= e.NextLabel; l++ {
+ delete(e.LabelPositions, l)
+ }
+ e.PendingInstructions = e.PendingInstructions[:0]
+ e.OrderedBlockLabels = e.OrderedBlockLabels[:0]
+ e.RootInstr = nil
+ e.SsaBlockIDToLabels = e.SsaBlockIDToLabels[:0]
+ e.PerBlockHead, e.PerBlockEnd = nil, nil
+ e.NextLabel = LabelInvalid
+}
+
+// AllocateLabel allocates an unused label.
+func (e *ExecutableContextT[T]) AllocateLabel() Label {
+ e.NextLabel++
+ return e.NextLabel
+}
+
+func (e *ExecutableContextT[T]) AllocateLabelPosition(la Label) *LabelPosition[T] {
+ l := e.labelPositionPool.Allocate()
+ l.L = la
+ return l
+}
+
+func (e *ExecutableContextT[T]) GetOrAllocateSSABlockLabel(blk ssa.BasicBlock) Label {
+ if blk.ReturnBlock() {
+ return LabelReturn
+ }
+ l := e.SsaBlockIDToLabels[blk.ID()]
+ if l == LabelInvalid {
+ l = e.AllocateLabel()
+ e.SsaBlockIDToLabels[blk.ID()] = l
+ }
+ return l
+}
+
+func (e *ExecutableContextT[T]) allocateNop0() *T {
+ i := e.InstructionPool.Allocate()
+ e.asNop(i)
+ return i
+}
+
+// LinkAdjacentBlocks implements ExecutableContext.
+func (e *ExecutableContextT[T]) LinkAdjacentBlocks(prev, next ssa.BasicBlock) {
+ prevLabelPos := e.LabelPositions[e.GetOrAllocateSSABlockLabel(prev)]
+ nextLabelPos := e.LabelPositions[e.GetOrAllocateSSABlockLabel(next)]
+ e.setNext(prevLabelPos.End, nextLabelPos.Begin)
+}
+
+// LabelPosition represents the region of the generated code that the label corresponds to.
+type LabelPosition[Instr any] struct {
+ SB ssa.BasicBlock
+ L Label
+ Begin, End *Instr
+ BinaryOffset int64
+}
+
+// Label represents a position in the generated code which is either
+// a real instruction or constant data (e.g. jump tables).
+//
+// This is exactly the same as the traditional "label" in assembly code.
+type Label uint32
+
+const (
+ LabelInvalid Label = 0
+ LabelReturn Label = math.MaxUint32
+)
+
+// String implements fmt.Stringer.
+func (l Label) String() string {
+ return fmt.Sprintf("L%d", l)
+}
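FlushPendingInstructions above inserts the pending instructions in reverse at the head of the per-block list. Because the lowering itself walks each block from tail to head, this double reversal leaves the machine instructions in forward program order. A small standalone sketch of that ordering, using strings in place of instructions (names ours):

package main

import "fmt"

// flushPending mirrors ExecutableContextT.FlushPendingInstructions above:
// the pending instructions are prepended in reverse, so they end up in
// forward order at the head of the per-block list.
func flushPending(block []string, pending []string) []string {
	for i := len(pending) - 1; i >= 0; i-- {
		block = append([]string{pending[i]}, block...) // insertAtPerBlockHead
	}
	return block
}

func main() {
	var block []string
	// The block's SSA instructions are lowered in reverse order: C, then B, then A,
	// and each one's lowered machine instructions are flushed right away.
	block = flushPending(block, []string{"c1", "c2"}) // lowering of C
	block = flushPending(block, []string{"b1"})       // lowering of B
	block = flushPending(block, []string{"a1", "a2"}) // lowering of A
	fmt.Println(block) // [a1 a2 b1 c1 c2]
}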
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/go_call.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/go_call.go
new file mode 100644
index 000000000..6fe6d7b3c
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/go_call.go
@@ -0,0 +1,33 @@
+package backend
+
+import "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+
+// GoFunctionCallRequiredStackSize returns the size of the stack required for the Go function call.
+// argBegin is the index of the first argument in the signature which is not either execution context or module context.
+func GoFunctionCallRequiredStackSize(sig *ssa.Signature, argBegin int) (ret, retUnaligned int64) {
+ var paramNeededInBytes, resultNeededInBytes int64
+ for _, p := range sig.Params[argBegin:] {
+ s := int64(p.Size())
+ if s < 8 {
+ s = 8 // We use uint64 for all basic types, except SIMD v128.
+ }
+ paramNeededInBytes += s
+ }
+ for _, r := range sig.Results {
+ s := int64(r.Size())
+ if s < 8 {
+ s = 8 // We use uint64 for all basic types, except SIMD v128.
+ }
+ resultNeededInBytes += s
+ }
+
+ if paramNeededInBytes > resultNeededInBytes {
+ ret = paramNeededInBytes
+ } else {
+ ret = resultNeededInBytes
+ }
+ retUnaligned = ret
+ // Align to 16 bytes.
+ ret = (ret + 15) &^ 15
+ return
+}
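GoFunctionCallRequiredStackSize above sizes a single region that is reused for both the parameters and the results of a Go function call: every value is widened to at least 8 bytes, the larger of the two totals wins, and the final figure is rounded up to a 16-byte boundary. A standalone sketch of the same arithmetic on raw byte sizes (function name ours):

package main

import "fmt"

// requiredStackSize mirrors GoFunctionCallRequiredStackSize above for a
// hypothetical signature given as parameter/result sizes in bytes.
func requiredStackSize(paramSizes, resultSizes []int64) (aligned, unaligned int64) {
	sum := func(sizes []int64) (n int64) {
		for _, s := range sizes {
			if s < 8 {
				s = 8 // everything except v128 is widened to a uint64 slot
			}
			n += s
		}
		return
	}
	p, r := sum(paramSizes), sum(resultSizes)
	unaligned = p
	if r > p {
		unaligned = r
	}
	aligned = (unaligned + 15) &^ 15 // round up to a 16-byte multiple
	return
}

func main() {
	// e.g. params i32, i64, i64 (each widened to 8 bytes) and a single v128 result.
	aligned, unaligned := requiredStackSize([]int64{4, 8, 8}, []int64{16})
	fmt.Println(aligned, unaligned) // 32 24
}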
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi.go
new file mode 100644
index 000000000..130f8c621
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi.go
@@ -0,0 +1,186 @@
+package amd64
+
+import (
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+)
+
+// For the details of the ABI, see:
+// https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/cmd/compile/abi-internal.md#amd64-architecture
+
+var (
+ intArgResultRegs = []regalloc.RealReg{rax, rbx, rcx, rdi, rsi, r8, r9, r10, r11}
+ floatArgResultRegs = []regalloc.RealReg{xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7}
+)
+
+var regInfo = &regalloc.RegisterInfo{
+ AllocatableRegisters: [regalloc.NumRegType][]regalloc.RealReg{
+ regalloc.RegTypeInt: {
+ rax, rcx, rdx, rbx, rsi, rdi, r8, r9, r10, r11, r12, r13, r14, r15,
+ },
+ regalloc.RegTypeFloat: {
+ xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
+ },
+ },
+ CalleeSavedRegisters: regalloc.NewRegSet(
+ rdx, r12, r13, r14, r15,
+ xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
+ ),
+ CallerSavedRegisters: regalloc.NewRegSet(
+ rax, rcx, rbx, rsi, rdi, r8, r9, r10, r11,
+ xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
+ ),
+ RealRegToVReg: []regalloc.VReg{
+ rax: raxVReg, rcx: rcxVReg, rdx: rdxVReg, rbx: rbxVReg, rsp: rspVReg, rbp: rbpVReg, rsi: rsiVReg, rdi: rdiVReg,
+ r8: r8VReg, r9: r9VReg, r10: r10VReg, r11: r11VReg, r12: r12VReg, r13: r13VReg, r14: r14VReg, r15: r15VReg,
+ xmm0: xmm0VReg, xmm1: xmm1VReg, xmm2: xmm2VReg, xmm3: xmm3VReg, xmm4: xmm4VReg, xmm5: xmm5VReg, xmm6: xmm6VReg,
+ xmm7: xmm7VReg, xmm8: xmm8VReg, xmm9: xmm9VReg, xmm10: xmm10VReg, xmm11: xmm11VReg, xmm12: xmm12VReg,
+ xmm13: xmm13VReg, xmm14: xmm14VReg, xmm15: xmm15VReg,
+ },
+ RealRegName: func(r regalloc.RealReg) string { return regNames[r] },
+ RealRegType: func(r regalloc.RealReg) regalloc.RegType {
+ if r < xmm0 {
+ return regalloc.RegTypeInt
+ }
+ return regalloc.RegTypeFloat
+ },
+}
+
+// ArgsResultsRegs implements backend.Machine.
+func (m *machine) ArgsResultsRegs() (argResultInts, argResultFloats []regalloc.RealReg) {
+ return intArgResultRegs, floatArgResultRegs
+}
+
+// LowerParams implements backend.Machine.
+func (m *machine) LowerParams(args []ssa.Value) {
+ a := m.currentABI
+
+ for i, ssaArg := range args {
+ if !ssaArg.Valid() {
+ continue
+ }
+ reg := m.c.VRegOf(ssaArg)
+ arg := &a.Args[i]
+ if arg.Kind == backend.ABIArgKindReg {
+ m.InsertMove(reg, arg.Reg, arg.Type)
+ } else {
+ //
+ // (high address)
+ // +-----------------+
+ // | ....... |
+ // | ret Y |
+ // | ....... |
+ // | ret 0 |
+ // | arg X |
+ // | ....... |
+ // | arg 1 |
+ // | arg 0 |
+ // | ReturnAddress |
+ // | Caller_RBP |
+ // +-----------------+ <-- RBP
+ // | ........... |
+ // | clobbered M |
+ // | ............ |
+ // | clobbered 0 |
+ // | spill slot N |
+ // | ........... |
+ // | spill slot 0 |
+ // RSP--> +-----------------+
+ // (low address)
+
+ // Load the value from the arg stack slot above the current RBP.
+ load := m.allocateInstr()
+ mem := newOperandMem(m.newAmodeImmRBPReg(uint32(arg.Offset + 16)))
+ switch arg.Type {
+ case ssa.TypeI32:
+ load.asMovzxRmR(extModeLQ, mem, reg)
+ case ssa.TypeI64:
+ load.asMov64MR(mem, reg)
+ case ssa.TypeF32:
+ load.asXmmUnaryRmR(sseOpcodeMovss, mem, reg)
+ case ssa.TypeF64:
+ load.asXmmUnaryRmR(sseOpcodeMovsd, mem, reg)
+ case ssa.TypeV128:
+ load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, reg)
+ default:
+ panic("BUG")
+ }
+ m.insert(load)
+ }
+ }
+}
+
+// LowerReturns implements backend.Machine.
+func (m *machine) LowerReturns(rets []ssa.Value) {
+ // Load the XMM registers first as it might need a temporary register to inline
+ // a constant return value.
+ a := m.currentABI
+ for i, ret := range rets {
+ r := &a.Rets[i]
+ if !r.Type.IsInt() {
+ m.LowerReturn(ret, r)
+ }
+ }
+ // Then load the GPR registers.
+ for i, ret := range rets {
+ r := &a.Rets[i]
+ if r.Type.IsInt() {
+ m.LowerReturn(ret, r)
+ }
+ }
+}
+
+func (m *machine) LowerReturn(ret ssa.Value, r *backend.ABIArg) {
+ reg := m.c.VRegOf(ret)
+ if def := m.c.ValueDefinition(ret); def.IsFromInstr() {
+ // Constant instructions are inlined.
+ if inst := def.Instr; inst.Constant() {
+ m.insertLoadConstant(inst, reg)
+ }
+ }
+ if r.Kind == backend.ABIArgKindReg {
+ m.InsertMove(r.Reg, reg, ret.Type())
+ } else {
+ //
+ // (high address)
+ // +-----------------+
+ // | ....... |
+ // | ret Y |
+ // | ....... |
+ // | ret 0 |
+ // | arg X |
+ // | ....... |
+ // | arg 1 |
+ // | arg 0 |
+ // | ReturnAddress |
+ // | Caller_RBP |
+ // +-----------------+ <-- RBP
+ // | ........... |
+ // | clobbered M |
+ // | ............ |
+ // | clobbered 0 |
+ // | spill slot N |
+ // | ........... |
+ // | spill slot 0 |
+ // RSP--> +-----------------+
+ // (low address)
+
+ // Store the value to the return stack slot above the current RBP.
+ store := m.allocateInstr()
+ mem := newOperandMem(m.newAmodeImmRBPReg(uint32(m.currentABI.ArgStackSize + 16 + r.Offset)))
+ switch r.Type {
+ case ssa.TypeI32:
+ store.asMovRM(reg, mem, 4)
+ case ssa.TypeI64:
+ store.asMovRM(reg, mem, 8)
+ case ssa.TypeF32:
+ store.asXmmMovRM(sseOpcodeMovss, reg, mem)
+ case ssa.TypeF64:
+ store.asXmmMovRM(sseOpcodeMovsd, reg, mem)
+ case ssa.TypeV128:
+ store.asXmmMovRM(sseOpcodeMovdqu, reg, mem)
+ }
+ m.insert(store)
+ }
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.go
new file mode 100644
index 000000000..cbf1cfdc5
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.go
@@ -0,0 +1,9 @@
+package amd64
+
+// entrypoint enters the machine code generated by this backend, which begins with the entry preamble generated by machine.CompileEntryPreamble.
+// This implements wazevo.entrypoint; see the comments there for details.
+func entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultPtr *uint64, goAllocatedStackSlicePtr uintptr)
+
+// afterGoFunctionCallEntrypoint enters the machine code after growing the stack.
+// This implements wazevo.afterGoFunctionCallEntrypoint, and see the comments there for detail.
+func afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr)
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.s b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.s
new file mode 100644
index 000000000..e9cb131d1
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.s
@@ -0,0 +1,29 @@
+#include "funcdata.h"
+#include "textflag.h"
+
+// entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultPtr *uint64, goAllocatedStackSlicePtr uintptr)
+TEXT ·entrypoint(SB), NOSPLIT|NOFRAME, $0-48
+ MOVQ preambleExecutable+0(FP), R11
+ MOVQ functionExecutable+8(FP), R14
+ MOVQ executionContextPtr+16(FP), AX // First argument is passed in AX.
+ MOVQ moduleContextPtr+24(FP), BX // Second argument is passed in BX.
+ MOVQ paramResultSlicePtr+32(FP), R12
+ MOVQ goAllocatedStackSlicePtr+40(FP), R13
+ JMP R11
+
+// afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr)
+TEXT ·afterGoFunctionCallEntrypoint(SB), NOSPLIT|NOFRAME, $0-32
+ MOVQ executable+0(FP), CX
+ MOVQ executionContextPtr+8(FP), AX // First argument is passed in AX.
+
+ // Save the stack pointer and frame pointer.
+ MOVQ BP, 16(AX) // 16 == ExecutionContextOffsetOriginalFramePointer
+ MOVQ SP, 24(AX) // 24 == ExecutionContextOffsetOriginalStackPointer
+
+ // Then set the stack pointer and frame pointer to the values we got from the Go runtime.
+ MOVQ framePointer+24(FP), BP
+
+ // WARNING: do not update SP before BP, because the Go assembler translates (FP) as (SP) + 8.
+ MOVQ stackPointer+16(FP), SP
+
+ JMP CX
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_preamble.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_preamble.go
new file mode 100644
index 000000000..882d06c06
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_preamble.go
@@ -0,0 +1,248 @@
+package amd64
+
+import (
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
+)
+
+var (
+ executionContextPtrReg = raxVReg
+
+ // The following are callee-saved registers. They can be used freely in the entry preamble
+ // since the preamble is called via a Go assembly function which has a stack-based ABI.
+
+ // savedExecutionContextPtr must also be a callee-saved reg so that it can be used in the prologue and epilogue.
+ savedExecutionContextPtr = rdxVReg
+ // paramResultSlicePtr must match with entrypoint function in abi_entry_amd64.s.
+ paramResultSlicePtr = r12VReg
+ // goAllocatedStackPtr must match with entrypoint function in abi_entry_amd64.s.
+ goAllocatedStackPtr = r13VReg
+ // functionExecutable must match with entrypoint function in abi_entry_amd64.s.
+ functionExecutable = r14VReg
+ tmpIntReg = r15VReg
+ tmpXmmReg = xmm15VReg
+)
+
+// CompileEntryPreamble implements backend.Machine.
+func (m *machine) CompileEntryPreamble(sig *ssa.Signature) []byte {
+ root := m.compileEntryPreamble(sig)
+ m.encodeWithoutSSA(root)
+ buf := m.c.Buf()
+ return buf
+}
+
+func (m *machine) compileEntryPreamble(sig *ssa.Signature) *instruction {
+ abi := backend.FunctionABI{}
+ abi.Init(sig, intArgResultRegs, floatArgResultRegs)
+
+ root := m.allocateNop()
+
+ //// ----------------------------------- prologue ----------------------------------- ////
+
+ // First, we save executionContextPtrReg into a callee-saved register so that it can be used in epilogue as well.
+ // mov %executionContextPtrReg, %savedExecutionContextPtr
+ cur := m.move64(executionContextPtrReg, savedExecutionContextPtr, root)
+
+ // Next is to save the original RBP and RSP into the execution context.
+ cur = m.saveOriginalRSPRBP(cur)
+
+ // Now set the RSP to the Go-allocated stack pointer.
+ // mov %goAllocatedStackPtr, %rsp
+ cur = m.move64(goAllocatedStackPtr, rspVReg, cur)
+
+ if stackSlotSize := abi.AlignedArgResultStackSlotSize(); stackSlotSize > 0 {
+ // Allocate stack slots for the arguments and return values.
+ // sub $stackSlotSize, %rsp
+ spDec := m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(uint32(stackSlotSize)), rspVReg, true)
+ cur = linkInstr(cur, spDec)
+ }
+
+ var offset uint32
+ for i := range abi.Args {
+ if i < 2 {
+ // execution context ptr and module context ptr are passed in rax and rbx by the Go assembly function.
+ continue
+ }
+ arg := &abi.Args[i]
+ cur = m.goEntryPreamblePassArg(cur, paramResultSlicePtr, offset, arg)
+ if arg.Type == ssa.TypeV128 {
+ offset += 16
+ } else {
+ offset += 8
+ }
+ }
+
+ // Zero out RBP so that the unwind/stack growth code can correctly detect the end of the stack.
+ zerosRbp := m.allocateInstr().asAluRmiR(aluRmiROpcodeXor, newOperandReg(rbpVReg), rbpVReg, true)
+ cur = linkInstr(cur, zerosRbp)
+
+ // Now we are ready to call the real function. Note that at this point the stack pointer is already set to the
+ // Go-allocated stack, which is aligned to 16 bytes.
+ call := m.allocateInstr().asCallIndirect(newOperandReg(functionExecutable), &abi)
+ cur = linkInstr(cur, call)
+
+ //// ----------------------------------- epilogue ----------------------------------- ////
+
+ // Read the results from regs and the stack, and set them correctly into the paramResultSlicePtr.
+ offset = 0
+ for i := range abi.Rets {
+ r := &abi.Rets[i]
+ cur = m.goEntryPreamblePassResult(cur, paramResultSlicePtr, offset, r, uint32(abi.ArgStackSize))
+ if r.Type == ssa.TypeV128 {
+ offset += 16
+ } else {
+ offset += 8
+ }
+ }
+
+ // Finally, restore the original RBP and RSP.
+ cur = m.restoreOriginalRSPRBP(cur)
+
+ ret := m.allocateInstr().asRet()
+ linkInstr(cur, ret)
+ return root
+}
+
+// saveOriginalRSPRBP saves the original RSP and RBP into the execution context.
+func (m *machine) saveOriginalRSPRBP(cur *instruction) *instruction {
+ // mov %rbp, wazevoapi.ExecutionContextOffsetOriginalFramePointer(%executionContextPtrReg)
+ // mov %rsp, wazevoapi.ExecutionContextOffsetOriginalStackPointer(%executionContextPtrReg)
+ cur = m.loadOrStore64AtExecutionCtx(executionContextPtrReg, wazevoapi.ExecutionContextOffsetOriginalFramePointer, rbpVReg, true, cur)
+ cur = m.loadOrStore64AtExecutionCtx(executionContextPtrReg, wazevoapi.ExecutionContextOffsetOriginalStackPointer, rspVReg, true, cur)
+ return cur
+}
+
+// restoreOriginalRSPRBP restores the original RSP and RBP from the execution context.
+func (m *machine) restoreOriginalRSPRBP(cur *instruction) *instruction {
+ // mov wazevoapi.ExecutionContextOffsetOriginalFramePointer(%executionContextPtrReg), %rbp
+ // mov wazevoapi.ExecutionContextOffsetOriginalStackPointer(%executionContextPtrReg), %rsp
+ cur = m.loadOrStore64AtExecutionCtx(savedExecutionContextPtr, wazevoapi.ExecutionContextOffsetOriginalFramePointer, rbpVReg, false, cur)
+ cur = m.loadOrStore64AtExecutionCtx(savedExecutionContextPtr, wazevoapi.ExecutionContextOffsetOriginalStackPointer, rspVReg, false, cur)
+ return cur
+}
+
+func (m *machine) move64(src, dst regalloc.VReg, prev *instruction) *instruction {
+ mov := m.allocateInstr().asMovRR(src, dst, true)
+ return linkInstr(prev, mov)
+}
+
+func (m *machine) loadOrStore64AtExecutionCtx(execCtx regalloc.VReg, offset wazevoapi.Offset, r regalloc.VReg, store bool, prev *instruction) *instruction {
+ mem := newOperandMem(m.newAmodeImmReg(offset.U32(), execCtx))
+ instr := m.allocateInstr()
+ if store {
+ instr.asMovRM(r, mem, 8)
+ } else {
+ instr.asMov64MR(mem, r)
+ }
+ return linkInstr(prev, instr)
+}
+
+// This is for debugging.
+func (m *machine) linkUD2(cur *instruction) *instruction { //nolint
+ return linkInstr(cur, m.allocateInstr().asUD2())
+}
+
+func (m *machine) goEntryPreamblePassArg(cur *instruction, paramSlicePtr regalloc.VReg, offsetInParamSlice uint32, arg *backend.ABIArg) *instruction {
+ var dst regalloc.VReg
+ argTyp := arg.Type
+ if arg.Kind == backend.ABIArgKindStack {
+ // The argument is passed on the stack, so load it into a temporary register first.
+ switch argTyp {
+ case ssa.TypeI32, ssa.TypeI64:
+ dst = tmpIntReg
+ case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
+ dst = tmpXmmReg
+ default:
+ panic("BUG")
+ }
+ } else {
+ dst = arg.Reg
+ }
+
+ load := m.allocateInstr()
+ a := newOperandMem(m.newAmodeImmReg(offsetInParamSlice, paramSlicePtr))
+ switch arg.Type {
+ case ssa.TypeI32:
+ load.asMovzxRmR(extModeLQ, a, dst)
+ case ssa.TypeI64:
+ load.asMov64MR(a, dst)
+ case ssa.TypeF32:
+ load.asXmmUnaryRmR(sseOpcodeMovss, a, dst)
+ case ssa.TypeF64:
+ load.asXmmUnaryRmR(sseOpcodeMovsd, a, dst)
+ case ssa.TypeV128:
+ load.asXmmUnaryRmR(sseOpcodeMovdqu, a, dst)
+ }
+
+ cur = linkInstr(cur, load)
+ if arg.Kind == backend.ABIArgKindStack {
+ // Store back to the stack.
+ store := m.allocateInstr()
+ a := newOperandMem(m.newAmodeImmReg(uint32(arg.Offset), rspVReg))
+ switch arg.Type {
+ case ssa.TypeI32:
+ store.asMovRM(dst, a, 4)
+ case ssa.TypeI64:
+ store.asMovRM(dst, a, 8)
+ case ssa.TypeF32:
+ store.asXmmMovRM(sseOpcodeMovss, dst, a)
+ case ssa.TypeF64:
+ store.asXmmMovRM(sseOpcodeMovsd, dst, a)
+ case ssa.TypeV128:
+ store.asXmmMovRM(sseOpcodeMovdqu, dst, a)
+ }
+ cur = linkInstr(cur, store)
+ }
+ return cur
+}
+
+func (m *machine) goEntryPreamblePassResult(cur *instruction, resultSlicePtr regalloc.VReg, offsetInResultSlice uint32, result *backend.ABIArg, resultStackSlotBeginOffset uint32) *instruction {
+ var r regalloc.VReg
+ if result.Kind == backend.ABIArgKindStack {
+ // Load the value to the temporary.
+ load := m.allocateInstr()
+ offset := resultStackSlotBeginOffset + uint32(result.Offset)
+ a := newOperandMem(m.newAmodeImmReg(offset, rspVReg))
+ switch result.Type {
+ case ssa.TypeI32:
+ r = tmpIntReg
+ load.asMovzxRmR(extModeLQ, a, r)
+ case ssa.TypeI64:
+ r = tmpIntReg
+ load.asMov64MR(a, r)
+ case ssa.TypeF32:
+ r = tmpXmmReg
+ load.asXmmUnaryRmR(sseOpcodeMovss, a, r)
+ case ssa.TypeF64:
+ r = tmpXmmReg
+ load.asXmmUnaryRmR(sseOpcodeMovsd, a, r)
+ case ssa.TypeV128:
+ r = tmpXmmReg
+ load.asXmmUnaryRmR(sseOpcodeMovdqu, a, r)
+ default:
+ panic("BUG")
+ }
+ cur = linkInstr(cur, load)
+ } else {
+ r = result.Reg
+ }
+
+ store := m.allocateInstr()
+ a := newOperandMem(m.newAmodeImmReg(offsetInResultSlice, resultSlicePtr))
+ switch result.Type {
+ case ssa.TypeI32:
+ store.asMovRM(r, a, 4)
+ case ssa.TypeI64:
+ store.asMovRM(r, a, 8)
+ case ssa.TypeF32:
+ store.asXmmMovRM(sseOpcodeMovss, r, a)
+ case ssa.TypeF64:
+ store.asXmmMovRM(sseOpcodeMovsd, r, a)
+ case ssa.TypeV128:
+ store.asXmmMovRM(sseOpcodeMovdqu, r, a)
+ }
+
+ return linkInstr(cur, store)
+}
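+
+// exampleParamResultSliceOffsets is an illustrative sketch and is not part of the upstream wazero
+// source. It mirrors the offset bookkeeping used by goEntryPreamblePassArg and goEntryPreamblePassResult
+// above: each scalar (i32/i64/f32/f64) occupies 8 bytes of the flat Go-side slice, while each v128
+// value occupies 16 bytes.
+func exampleParamResultSliceOffsets(types []ssa.Type) (offsets []uint32) {
+	var offset uint32
+	for _, t := range types {
+		offsets = append(offsets, offset)
+		if t == ssa.TypeV128 {
+			offset += 16
+		} else {
+			offset += 8
+		}
+	}
+	// e.g. (i64, v128, f32) yields offsets 0, 8, 24.
+	return offsets
+}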
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_go_call.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_go_call.go
new file mode 100644
index 000000000..751050aff
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_go_call.go
@@ -0,0 +1,443 @@
+package amd64
+
+import (
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
+)
+
+var calleeSavedVRegs = []regalloc.VReg{
+ rdxVReg, r12VReg, r13VReg, r14VReg, r15VReg,
+ xmm8VReg, xmm9VReg, xmm10VReg, xmm11VReg, xmm12VReg, xmm13VReg, xmm14VReg, xmm15VReg,
+}
+
+// CompileGoFunctionTrampoline implements backend.Machine.
+func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte {
+ ectx := m.ectx
+ argBegin := 1 // Skips exec context by default.
+ if needModuleContextPtr {
+ argBegin++
+ }
+
+ abi := &backend.FunctionABI{}
+ abi.Init(sig, intArgResultRegs, floatArgResultRegs)
+ m.currentABI = abi
+
+ cur := m.allocateNop()
+ ectx.RootInstr = cur
+
+ // Execution context is always the first argument.
+ execCtrPtr := raxVReg
+
+ // First we update RBP and RSP just like the normal prologue.
+ //
+ // (high address) (high address)
+ // RBP ----> +-----------------+ +-----------------+
+ // | ....... | | ....... |
+ // | ret Y | | ret Y |
+ // | ....... | | ....... |
+ // | ret 0 | | ret 0 |
+ // | arg X | | arg X |
+ // | ....... | ====> | ....... |
+ // | arg 1 | | arg 1 |
+ // | arg 0 | | arg 0 |
+ // | Return Addr | | Return Addr |
+ // RSP ----> +-----------------+ | Caller_RBP |
+ // (low address) +-----------------+ <----- RSP, RBP
+ //
+ cur = m.setupRBPRSP(cur)
+
+ goSliceSizeAligned, goSliceSizeAlignedUnaligned := backend.GoFunctionCallRequiredStackSize(sig, argBegin)
+ cur = m.insertStackBoundsCheck(goSliceSizeAligned+8 /* size of the Go slice */, cur)
+
+ // Save the callee saved registers.
+ cur = m.saveRegistersInExecutionContext(cur, execCtrPtr, calleeSavedVRegs)
+
+ if needModuleContextPtr {
+ moduleCtrPtr := rbxVReg // Module context is always the second argument.
+ mem := m.newAmodeImmReg(
+ wazevoapi.ExecutionContextOffsetGoFunctionCallCalleeModuleContextOpaque.U32(),
+ execCtrPtr)
+ store := m.allocateInstr().asMovRM(moduleCtrPtr, newOperandMem(mem), 8)
+ cur = linkInstr(cur, store)
+ }
+
+ // Now let's advance the RSP to the stack slot for the arguments.
+ //
+ // (high address) (high address)
+ // +-----------------+ +-----------------+
+ // | ....... | | ....... |
+ // | ret Y | | ret Y |
+ // | ....... | | ....... |
+ // | ret 0 | | ret 0 |
+ // | arg X | | arg X |
+ // | ....... | =======> | ....... |
+ // | arg 1 | | arg 1 |
+ // | arg 0 | | arg 0 |
+ // | Return Addr | | Return Addr |
+ // | Caller_RBP | | Caller_RBP |
+ // RBP,RSP --> +-----------------+ +-----------------+ <----- RBP
+ // (low address) | arg[N]/ret[M] |
+ // | .......... |
+ // | arg[1]/ret[1] |
+ // | arg[0]/ret[0] |
+ // +-----------------+ <----- RSP
+ // (low address)
+ //
+	// where the region "arg[0]/ret[0] ... arg[N]/ret[M]" is the stack space used by the Go function,
+	// and is therefore accessed as the usual []uint64. That is where we pass arguments to, and receive
+	// return values from, the Go function.
+ cur = m.addRSP(-int32(goSliceSizeAligned), cur)
+
+ // Next, we need to store all the arguments to the stack in the typical Wasm stack style.
+ var offsetInGoSlice int32
+ for i := range abi.Args[argBegin:] {
+ arg := &abi.Args[argBegin+i]
+ var v regalloc.VReg
+ if arg.Kind == backend.ABIArgKindReg {
+ v = arg.Reg
+ } else {
+ // We have saved callee saved registers, so we can use them.
+ if arg.Type.IsInt() {
+ v = r15VReg
+ } else {
+ v = xmm15VReg
+ }
+ mem := newOperandMem(m.newAmodeImmReg(uint32(arg.Offset+16 /* to skip caller_rbp and ret_addr */), rbpVReg))
+ load := m.allocateInstr()
+ switch arg.Type {
+ case ssa.TypeI32:
+ load.asMovzxRmR(extModeLQ, mem, v)
+ case ssa.TypeI64:
+ load.asMov64MR(mem, v)
+ case ssa.TypeF32:
+ load.asXmmUnaryRmR(sseOpcodeMovss, mem, v)
+ case ssa.TypeF64:
+ load.asXmmUnaryRmR(sseOpcodeMovsd, mem, v)
+ case ssa.TypeV128:
+ load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v)
+ default:
+ panic("BUG")
+ }
+ cur = linkInstr(cur, load)
+ }
+
+ store := m.allocateInstr()
+ mem := newOperandMem(m.newAmodeImmReg(uint32(offsetInGoSlice), rspVReg))
+ switch arg.Type {
+ case ssa.TypeI32:
+ store.asMovRM(v, mem, 4)
+ offsetInGoSlice += 8 // always uint64 rep.
+ case ssa.TypeI64:
+ store.asMovRM(v, mem, 8)
+ offsetInGoSlice += 8
+ case ssa.TypeF32:
+ store.asXmmMovRM(sseOpcodeMovss, v, mem)
+ offsetInGoSlice += 8 // always uint64 rep.
+ case ssa.TypeF64:
+ store.asXmmMovRM(sseOpcodeMovsd, v, mem)
+ offsetInGoSlice += 8
+ case ssa.TypeV128:
+ store.asXmmMovRM(sseOpcodeMovdqu, v, mem)
+ offsetInGoSlice += 16
+ default:
+ panic("BUG")
+ }
+ cur = linkInstr(cur, store)
+ }
+
+ // Finally we push the size of the slice to the stack so the stack looks like:
+ //
+ // (high address)
+ // +-----------------+
+ // | ....... |
+ // | ret Y |
+ // | ....... |
+ // | ret 0 |
+ // | arg X |
+ // | ....... |
+ // | arg 1 |
+ // | arg 0 |
+ // | Return Addr |
+ // | Caller_RBP |
+ // +-----------------+ <----- RBP
+ // | arg[N]/ret[M] |
+ // | .......... |
+ // | arg[1]/ret[1] |
+ // | arg[0]/ret[0] |
+ // | slice size |
+ // +-----------------+ <----- RSP
+ // (low address)
+ //
+ // push $sliceSize
+ cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandImm32(uint32(goSliceSizeAlignedUnaligned))))
+
+ // Load the exitCode to the register.
+ exitCodeReg := r12VReg // Callee saved which is already saved.
+ cur = linkInstr(cur, m.allocateInstr().asImm(exitCodeReg, uint64(exitCode), false))
+
+ saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtrPtr, exitCodeReg)
+ cur = linkInstr(cur, setExitCode)
+ cur = linkInstr(cur, saveRsp)
+ cur = linkInstr(cur, saveRbp)
+
+ // Ready to exit the execution.
+ cur = m.storeReturnAddressAndExit(cur, execCtrPtr)
+
+ // We don't need the slice size anymore, so pop it.
+ cur = m.addRSP(8, cur)
+
+ // Ready to set up the results.
+ offsetInGoSlice = 0
+	// If a result is assigned to the register that currently holds the execution context pointer, we must
+	// not overwrite it yet: record its offset here and defer loading that result until the end of this function.
+ var argOverlapWithExecCtxOffset int32 = -1
+ for i := range abi.Rets {
+ r := &abi.Rets[i]
+ var v regalloc.VReg
+ isRegResult := r.Kind == backend.ABIArgKindReg
+ if isRegResult {
+ v = r.Reg
+ if v.RealReg() == execCtrPtr.RealReg() {
+ argOverlapWithExecCtxOffset = offsetInGoSlice
+ offsetInGoSlice += 8 // always uint64 rep.
+ continue
+ }
+ } else {
+ if r.Type.IsInt() {
+ v = r15VReg
+ } else {
+ v = xmm15VReg
+ }
+ }
+
+ load := m.allocateInstr()
+ mem := newOperandMem(m.newAmodeImmReg(uint32(offsetInGoSlice), rspVReg))
+ switch r.Type {
+ case ssa.TypeI32:
+ load.asMovzxRmR(extModeLQ, mem, v)
+ offsetInGoSlice += 8 // always uint64 rep.
+ case ssa.TypeI64:
+ load.asMov64MR(mem, v)
+ offsetInGoSlice += 8
+ case ssa.TypeF32:
+ load.asXmmUnaryRmR(sseOpcodeMovss, mem, v)
+ offsetInGoSlice += 8 // always uint64 rep.
+ case ssa.TypeF64:
+ load.asXmmUnaryRmR(sseOpcodeMovsd, mem, v)
+ offsetInGoSlice += 8
+ case ssa.TypeV128:
+ load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v)
+ offsetInGoSlice += 16
+ default:
+ panic("BUG")
+ }
+ cur = linkInstr(cur, load)
+
+ if !isRegResult {
+ // We need to store it back to the result slot above rbp.
+ store := m.allocateInstr()
+ mem := newOperandMem(m.newAmodeImmReg(uint32(abi.ArgStackSize+r.Offset+16 /* to skip caller_rbp and ret_addr */), rbpVReg))
+ switch r.Type {
+ case ssa.TypeI32:
+ store.asMovRM(v, mem, 4)
+ case ssa.TypeI64:
+ store.asMovRM(v, mem, 8)
+ case ssa.TypeF32:
+ store.asXmmMovRM(sseOpcodeMovss, v, mem)
+ case ssa.TypeF64:
+ store.asXmmMovRM(sseOpcodeMovsd, v, mem)
+ case ssa.TypeV128:
+ store.asXmmMovRM(sseOpcodeMovdqu, v, mem)
+ default:
+ panic("BUG")
+ }
+ cur = linkInstr(cur, store)
+ }
+ }
+
+ // Before return, we need to restore the callee saved registers.
+ cur = m.restoreRegistersInExecutionContext(cur, execCtrPtr, calleeSavedVRegs)
+
+ if argOverlapWithExecCtxOffset >= 0 {
+		// At this point the execution context pointer is no longer needed, so we can finally load the
+		// deferred result into the register that overlaps with it.
+ mem := newOperandMem(m.newAmodeImmReg(uint32(argOverlapWithExecCtxOffset), rspVReg))
+ load := m.allocateInstr().asMov64MR(mem, execCtrPtr)
+ cur = linkInstr(cur, load)
+ }
+
+ // Finally ready to return.
+ cur = m.revertRBPRSP(cur)
+ linkInstr(cur, m.allocateInstr().asRet())
+
+ m.encodeWithoutSSA(ectx.RootInstr)
+ return m.c.Buf()
+}
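+
+// exampleGoCallSlotBytes is an illustrative sketch and is not part of the upstream wazero source.
+// It mirrors the offsetInGoSlice accounting in CompileGoFunctionTrampoline above: every argument and
+// result exchanged with the Go function occupies one uint64 slot (8 bytes) of the Go-visible []uint64
+// region, except v128 values, which take two slots (16 bytes). The authoritative stack-size computation
+// is backend.GoFunctionCallRequiredStackSize.
+func exampleGoCallSlotBytes(types []ssa.Type) (total int32) {
+	for _, t := range types {
+		if t == ssa.TypeV128 {
+			total += 16
+		} else {
+			total += 8 // always the uint64 representation.
+		}
+	}
+	return total
+}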
+
+func (m *machine) saveRegistersInExecutionContext(cur *instruction, execCtx regalloc.VReg, regs []regalloc.VReg) *instruction {
+ offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64()
+ for _, v := range regs {
+ store := m.allocateInstr()
+ mem := newOperandMem(m.newAmodeImmReg(uint32(offset), execCtx))
+ switch v.RegType() {
+ case regalloc.RegTypeInt:
+ store.asMovRM(v, mem, 8)
+ case regalloc.RegTypeFloat:
+ store.asXmmMovRM(sseOpcodeMovdqu, v, mem)
+ default:
+ panic("BUG")
+ }
+ cur = linkInstr(cur, store)
+		offset += 16 // See the execution context struct: each register gets its own 16-byte slot unconditionally.
+ }
+ return cur
+}
+
+func (m *machine) restoreRegistersInExecutionContext(cur *instruction, execCtx regalloc.VReg, regs []regalloc.VReg) *instruction {
+ offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64()
+ for _, v := range regs {
+ load := m.allocateInstr()
+ mem := newOperandMem(m.newAmodeImmReg(uint32(offset), execCtx))
+ switch v.RegType() {
+ case regalloc.RegTypeInt:
+ load.asMov64MR(mem, v)
+ case regalloc.RegTypeFloat:
+ load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v)
+ default:
+ panic("BUG")
+ }
+ cur = linkInstr(cur, load)
+		offset += 16 // See the execution context struct: each register gets its own 16-byte slot unconditionally.
+ }
+ return cur
+}
+
+func (m *machine) storeReturnAddressAndExit(cur *instruction, execCtx regalloc.VReg) *instruction {
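+	// Note: readRip below is allocated empty and patched at the end of this function (via asLEA) to load
+	// the address of the label placed right after the exit sequence, i.e. where execution resumes once the
+	// Go call returns.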
+ readRip := m.allocateInstr()
+ cur = linkInstr(cur, readRip)
+
+ ripReg := r12VReg // Callee saved which is already saved.
+ saveRip := m.allocateInstr().asMovRM(
+ ripReg,
+ newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetGoCallReturnAddress.U32(), execCtx)),
+ 8,
+ )
+ cur = linkInstr(cur, saveRip)
+
+ exit := m.allocateExitSeq(execCtx)
+ cur = linkInstr(cur, exit)
+
+ nop, l := m.allocateBrTarget()
+ cur = linkInstr(cur, nop)
+ readRip.asLEA(newOperandLabel(l), ripReg)
+ return cur
+}
+
+// stackGrowSaveVRegs is the set of registers that must be saved and restored when growing the stack because
+// there is insufficient stack space left. These are all the allocatable registers except for RSP, RBP, and RAX,
+// which holds the execution context pointer. Since the execution context pointer is always the first argument,
+// it does not need to be saved here.
+var stackGrowSaveVRegs = []regalloc.VReg{
+ rdxVReg, r12VReg, r13VReg, r14VReg, r15VReg,
+ rcxVReg, rbxVReg, rsiVReg, rdiVReg, r8VReg, r9VReg, r10VReg, r11VReg,
+ xmm8VReg, xmm9VReg, xmm10VReg, xmm11VReg, xmm12VReg, xmm13VReg, xmm14VReg, xmm15VReg,
+ xmm0VReg, xmm1VReg, xmm2VReg, xmm3VReg, xmm4VReg, xmm5VReg, xmm6VReg, xmm7VReg,
+}
+
+// CompileStackGrowCallSequence implements backend.Machine.
+func (m *machine) CompileStackGrowCallSequence() []byte {
+ ectx := m.ectx
+
+ cur := m.allocateNop()
+ ectx.RootInstr = cur
+
+ cur = m.setupRBPRSP(cur)
+
+ // Execution context is always the first argument.
+ execCtrPtr := raxVReg
+
+ // Save the callee saved and argument registers.
+ cur = m.saveRegistersInExecutionContext(cur, execCtrPtr, stackGrowSaveVRegs)
+
+ // Load the exitCode to the register.
+ exitCodeReg := r12VReg // Already saved.
+ cur = linkInstr(cur, m.allocateInstr().asImm(exitCodeReg, uint64(wazevoapi.ExitCodeGrowStack), false))
+
+ saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtrPtr, exitCodeReg)
+ cur = linkInstr(cur, setExitCode)
+ cur = linkInstr(cur, saveRsp)
+ cur = linkInstr(cur, saveRbp)
+
+ // Ready to exit the execution.
+ cur = m.storeReturnAddressAndExit(cur, execCtrPtr)
+
+ // After the exit, restore the saved registers.
+ cur = m.restoreRegistersInExecutionContext(cur, execCtrPtr, stackGrowSaveVRegs)
+
+ // Finally ready to return.
+ cur = m.revertRBPRSP(cur)
+ linkInstr(cur, m.allocateInstr().asRet())
+
+ m.encodeWithoutSSA(ectx.RootInstr)
+ return m.c.Buf()
+}
+
+// insertStackBoundsCheck inserts instructions after `cur` that check the stack bounds and, if there is not
+// enough stack space left for the function, exit the execution so that the stack can be grown in the Go world.
+func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instruction) *instruction {
+	// sub $requiredStackSize, %rsp ;; Temporarily update the sp.
+	// cmp ExecutionContextOffsetStackBottomPtr(%rax), %rsp ;; Compare the stack bottom and the sp.
+	// ja .ok
+	// add $requiredStackSize, %rsp ;; Reverse the temporary update.
+	// pushq %r15 ;; Save the temporary.
+	// mov $requiredStackSize, %r15
+	// mov %r15, ExecutionContextOffsetStackGrowRequiredSize(%rax) ;; Set the required size in the execution context.
+	// popq %r15 ;; Restore the temporary.
+	// callq *ExecutionContextOffsetStackGrowCallTrampolineAddress(%rax) ;; Call the Go function to grow the stack.
+	// jmp .cont
+	// .ok:
+	// add $requiredStackSize, %rsp ;; Reverse the temporary update.
+	// .cont:
+ cur = m.addRSP(-int32(requiredStackSize), cur)
+ cur = linkInstr(cur, m.allocateInstr().asCmpRmiR(true,
+ newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackBottomPtr.U32(), raxVReg)),
+ rspVReg, true))
+
+ ja := m.allocateInstr()
+ cur = linkInstr(cur, ja)
+
+ cur = m.addRSP(int32(requiredStackSize), cur)
+
+	// Save the temporary.
+	cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(r15VReg)))
+ // Load the required size to the temporary.
+ cur = linkInstr(cur, m.allocateInstr().asImm(r15VReg, uint64(requiredStackSize), true))
+ // Set the required size in the execution context.
+ cur = linkInstr(cur, m.allocateInstr().asMovRM(r15VReg,
+ newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.U32(), raxVReg)), 8))
+ // Restore the temporary.
+ cur = linkInstr(cur, m.allocateInstr().asPop64(r15VReg))
+ // Call the Go function to grow the stack.
+ cur = linkInstr(cur, m.allocateInstr().asCallIndirect(newOperandMem(m.newAmodeImmReg(
+ wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.U32(), raxVReg)), nil))
+ // Jump to the continuation.
+ jmpToCont := m.allocateInstr()
+ cur = linkInstr(cur, jmpToCont)
+
+ // .ok:
+ okInstr, ok := m.allocateBrTarget()
+ cur = linkInstr(cur, okInstr)
+ ja.asJmpIf(condNBE, newOperandLabel(ok))
+ // On the ok path, we only need to reverse the temporary update.
+ cur = m.addRSP(int32(requiredStackSize), cur)
+
+ // .cont:
+ contInstr, cont := m.allocateBrTarget()
+ cur = linkInstr(cur, contInstr)
+ jmpToCont.asJmp(newOperandLabel(cont))
+
+ return cur
+}
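+
+// exampleStackHasRoom is an illustrative sketch and is not part of the upstream wazero source.
+// It expresses the predicate evaluated by the machine code emitted in insertStackBoundsCheck above:
+// the stack pointer is tentatively decremented by the required size and compared (unsigned, hence `ja`)
+// against the stack-bottom pointer held in the execution context; only if it is still strictly above the
+// bottom does execution continue without calling into Go to grow the stack.
+func exampleStackHasRoom(sp, stackBottom uintptr, requiredStackSize int64) bool {
+	return sp-uintptr(requiredStackSize) > stackBottom
+}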
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/cond.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/cond.go
new file mode 100644
index 000000000..75cbeab75
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/cond.go
@@ -0,0 +1,168 @@
+package amd64
+
+import (
+ "fmt"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+)
+
+type cond byte
+
+const (
+ // condO represents (overflow) condition.
+ condO cond = iota
+ // condNO represents (no overflow) condition.
+ condNO
+ // condB represents (< unsigned) condition.
+ condB
+ // condNB represents (>= unsigned) condition.
+ condNB
+ // condZ represents (zero) condition.
+ condZ
+ // condNZ represents (not-zero) condition.
+ condNZ
+ // condBE represents (<= unsigned) condition.
+ condBE
+ // condNBE represents (> unsigned) condition.
+ condNBE
+ // condS represents (negative) condition.
+ condS
+ // condNS represents (not-negative) condition.
+ condNS
+ // condP represents (parity) condition.
+ condP
+ // condNP represents (not parity) condition.
+ condNP
+ // condL represents (< signed) condition.
+ condL
+ // condNL represents (>= signed) condition.
+ condNL
+ // condLE represents (<= signed) condition.
+ condLE
+ // condNLE represents (> signed) condition.
+ condNLE
+
+ condInvalid
+)
+
+func (c cond) String() string {
+ switch c {
+ case condO:
+ return "o"
+ case condNO:
+ return "no"
+ case condB:
+ return "b"
+ case condNB:
+ return "nb"
+ case condZ:
+ return "z"
+ case condNZ:
+ return "nz"
+ case condBE:
+ return "be"
+ case condNBE:
+ return "nbe"
+ case condS:
+ return "s"
+ case condNS:
+ return "ns"
+ case condL:
+ return "l"
+ case condNL:
+ return "nl"
+ case condLE:
+ return "le"
+ case condNLE:
+ return "nle"
+ case condP:
+ return "p"
+ case condNP:
+ return "np"
+ default:
+ panic("unreachable")
+ }
+}
+
+func condFromSSAIntCmpCond(origin ssa.IntegerCmpCond) cond {
+ switch origin {
+ case ssa.IntegerCmpCondEqual:
+ return condZ
+ case ssa.IntegerCmpCondNotEqual:
+ return condNZ
+ case ssa.IntegerCmpCondSignedLessThan:
+ return condL
+ case ssa.IntegerCmpCondSignedGreaterThanOrEqual:
+ return condNL
+ case ssa.IntegerCmpCondSignedGreaterThan:
+ return condNLE
+ case ssa.IntegerCmpCondSignedLessThanOrEqual:
+ return condLE
+ case ssa.IntegerCmpCondUnsignedLessThan:
+ return condB
+ case ssa.IntegerCmpCondUnsignedGreaterThanOrEqual:
+ return condNB
+ case ssa.IntegerCmpCondUnsignedGreaterThan:
+ return condNBE
+ case ssa.IntegerCmpCondUnsignedLessThanOrEqual:
+ return condBE
+ default:
+ panic("unreachable")
+ }
+}
+
+func condFromSSAFloatCmpCond(origin ssa.FloatCmpCond) cond {
+ switch origin {
+ case ssa.FloatCmpCondGreaterThanOrEqual:
+ return condNB
+ case ssa.FloatCmpCondGreaterThan:
+ return condNBE
+ case ssa.FloatCmpCondEqual, ssa.FloatCmpCondNotEqual, ssa.FloatCmpCondLessThan, ssa.FloatCmpCondLessThanOrEqual:
+ panic(fmt.Sprintf("cond %s must be treated as a special case", origin))
+ default:
+ panic("unreachable")
+ }
+}
+
+func (c cond) encoding() byte {
+ return byte(c)
+}
+
+func (c cond) invert() cond {
+ switch c {
+ case condO:
+ return condNO
+ case condNO:
+ return condO
+ case condB:
+ return condNB
+ case condNB:
+ return condB
+ case condZ:
+ return condNZ
+ case condNZ:
+ return condZ
+ case condBE:
+ return condNBE
+ case condNBE:
+ return condBE
+ case condS:
+ return condNS
+ case condNS:
+ return condS
+ case condP:
+ return condNP
+ case condNP:
+ return condP
+ case condL:
+ return condNL
+ case condNL:
+ return condL
+ case condLE:
+ return condNLE
+ case condNLE:
+ return condLE
+ default:
+ panic("unreachable")
+ }
+}
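+
+// exampleInvertCond is an illustrative note and is not part of the upstream wazero source.
+// Because the cond constants above follow the x86 condition-code encoding (o=0, no=1, b=2, nb=3, ...),
+// a condition and its inverse differ only in the least significant bit, so for every valid cond the
+// switch in invert above is equivalent to this one-liner.
+func exampleInvertCond(c cond) cond { return c ^ 1 }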
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/ext.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/ext.go
new file mode 100644
index 000000000..5e731e822
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/ext.go
@@ -0,0 +1,35 @@
+package amd64
+
+// extMode represents the mode of extension in movzx/movsx.
+type extMode byte
+
+const (
+ // extModeBL represents Byte -> Longword.
+ extModeBL extMode = iota
+ // extModeBQ represents Byte -> Quadword.
+ extModeBQ
+ // extModeWL represents Word -> Longword.
+ extModeWL
+ // extModeWQ represents Word -> Quadword.
+ extModeWQ
+ // extModeLQ represents Longword -> Quadword.
+ extModeLQ
+)
+
+// String implements fmt.Stringer.
+func (e extMode) String() string {
+ switch e {
+ case extModeBL:
+ return "bl"
+ case extModeBQ:
+ return "bq"
+ case extModeWL:
+ return "wl"
+ case extModeWQ:
+ return "wq"
+ case extModeLQ:
+ return "lq"
+ default:
+ panic("BUG: invalid ext mode")
+ }
+}
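+
+// exampleExtModeTo64 is an illustrative sketch and is not part of the upstream wazero source.
+// A lowering step that widens an integer to 64 bits would pick the extension mode from the source
+// width, using the Byte/Word/Longword -> Quadword modes defined above.
+func exampleExtModeTo64(srcBits byte) extMode {
+	switch srcBits {
+	case 8:
+		return extModeBQ
+	case 16:
+		return extModeWQ
+	case 32:
+		return extModeLQ
+	default:
+		panic("BUG: unsupported source width")
+	}
+}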
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr.go
new file mode 100644
index 000000000..d27e79c0e
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr.go
@@ -0,0 +1,2472 @@
+package amd64
+
+import (
+ "fmt"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+)
+
+type instruction struct {
+ prev, next *instruction
+ op1, op2 operand
+ u1, u2 uint64
+ b1 bool
+ addedBeforeRegAlloc bool
+ kind instructionKind
+}
+
+// Next implements regalloc.Instr.
+func (i *instruction) Next() regalloc.Instr {
+ return i.next
+}
+
+// Prev implements regalloc.Instr.
+func (i *instruction) Prev() regalloc.Instr {
+ return i.prev
+}
+
+// IsCall implements regalloc.Instr.
+func (i *instruction) IsCall() bool { return i.kind == call }
+
+// IsIndirectCall implements regalloc.Instr.
+func (i *instruction) IsIndirectCall() bool { return i.kind == callIndirect }
+
+// IsReturn implements regalloc.Instr.
+func (i *instruction) IsReturn() bool { return i.kind == ret }
+
+// AddedBeforeRegAlloc implements regalloc.Instr.
+func (i *instruction) AddedBeforeRegAlloc() bool { return i.addedBeforeRegAlloc }
+
+// String implements regalloc.Instr.
+func (i *instruction) String() string {
+ switch i.kind {
+ case nop0:
+ return "nop"
+ case sourceOffsetInfo:
+ return fmt.Sprintf("source_offset_info %d", i.u1)
+ case ret:
+ return "ret"
+ case imm:
+ if i.b1 {
+ return fmt.Sprintf("movabsq $%d, %s", int64(i.u1), i.op2.format(true))
+ } else {
+ return fmt.Sprintf("movl $%d, %s", int32(i.u1), i.op2.format(false))
+ }
+ case aluRmiR:
+ return fmt.Sprintf("%s %s, %s", aluRmiROpcode(i.u1), i.op1.format(i.b1), i.op2.format(i.b1))
+ case movRR:
+ if i.b1 {
+ return fmt.Sprintf("movq %s, %s", i.op1.format(true), i.op2.format(true))
+ } else {
+ return fmt.Sprintf("movl %s, %s", i.op1.format(false), i.op2.format(false))
+ }
+ case xmmRmR:
+ return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(false), i.op2.format(false))
+ case gprToXmm:
+ return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(i.b1), i.op2.format(i.b1))
+ case xmmUnaryRmR:
+ return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(false), i.op2.format(false))
+ case xmmUnaryRmRImm:
+ return fmt.Sprintf("%s $%d, %s, %s", sseOpcode(i.u1), roundingMode(i.u2), i.op1.format(false), i.op2.format(false))
+ case unaryRmR:
+ var suffix string
+ if i.b1 {
+ suffix = "q"
+ } else {
+ suffix = "l"
+ }
+ return fmt.Sprintf("%s%s %s, %s", unaryRmROpcode(i.u1), suffix, i.op1.format(i.b1), i.op2.format(i.b1))
+ case not:
+ var op string
+ if i.b1 {
+ op = "notq"
+ } else {
+ op = "notl"
+ }
+ return fmt.Sprintf("%s %s", op, i.op1.format(i.b1))
+ case neg:
+ var op string
+ if i.b1 {
+ op = "negq"
+ } else {
+ op = "negl"
+ }
+ return fmt.Sprintf("%s %s", op, i.op1.format(i.b1))
+ case div:
+ var prefix string
+ var op string
+ if i.b1 {
+ op = "divq"
+ } else {
+ op = "divl"
+ }
+ if i.u1 != 0 {
+ prefix = "i"
+ }
+ return fmt.Sprintf("%s%s %s", prefix, op, i.op1.format(i.b1))
+ case mulHi:
+ signed, _64 := i.u1 != 0, i.b1
+ var op string
+ switch {
+ case signed && _64:
+ op = "imulq"
+ case !signed && _64:
+ op = "mulq"
+ case signed && !_64:
+ op = "imull"
+ case !signed && !_64:
+ op = "mull"
+ }
+ return fmt.Sprintf("%s %s", op, i.op1.format(i.b1))
+ case signExtendData:
+ var op string
+ if i.b1 {
+ op = "cqo"
+ } else {
+ op = "cdq"
+ }
+ return op
+ case movzxRmR:
+ return fmt.Sprintf("movzx.%s %s, %s", extMode(i.u1), i.op1.format(true), i.op2.format(true))
+ case mov64MR:
+ return fmt.Sprintf("movq %s, %s", i.op1.format(true), i.op2.format(true))
+ case lea:
+ return fmt.Sprintf("lea %s, %s", i.op1.format(true), i.op2.format(true))
+ case movsxRmR:
+ return fmt.Sprintf("movsx.%s %s, %s", extMode(i.u1), i.op1.format(true), i.op2.format(true))
+ case movRM:
+ var suffix string
+ switch i.u1 {
+ case 1:
+ suffix = "b"
+ case 2:
+ suffix = "w"
+ case 4:
+ suffix = "l"
+ case 8:
+ suffix = "q"
+ }
+ return fmt.Sprintf("mov.%s %s, %s", suffix, i.op1.format(true), i.op2.format(true))
+ case shiftR:
+ var suffix string
+ if i.b1 {
+ suffix = "q"
+ } else {
+ suffix = "l"
+ }
+ return fmt.Sprintf("%s%s %s, %s", shiftROp(i.u1), suffix, i.op1.format(false), i.op2.format(i.b1))
+ case xmmRmiReg:
+ return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(true), i.op2.format(true))
+ case cmpRmiR:
+ var op, suffix string
+ if i.u1 != 0 {
+ op = "cmp"
+ } else {
+ op = "test"
+ }
+ if i.b1 {
+ suffix = "q"
+ } else {
+ suffix = "l"
+ }
+ if op == "test" && i.op1.kind == operandKindMem {
+ // Print consistently with AT&T syntax.
+ return fmt.Sprintf("%s%s %s, %s", op, suffix, i.op2.format(i.b1), i.op1.format(i.b1))
+ }
+ return fmt.Sprintf("%s%s %s, %s", op, suffix, i.op1.format(i.b1), i.op2.format(i.b1))
+ case setcc:
+ return fmt.Sprintf("set%s %s", cond(i.u1), i.op2.format(true))
+ case cmove:
+ var suffix string
+ if i.b1 {
+ suffix = "q"
+ } else {
+ suffix = "l"
+ }
+ return fmt.Sprintf("cmov%s%s %s, %s", cond(i.u1), suffix, i.op1.format(i.b1), i.op2.format(i.b1))
+ case push64:
+ return fmt.Sprintf("pushq %s", i.op1.format(true))
+ case pop64:
+ return fmt.Sprintf("popq %s", i.op1.format(true))
+ case xmmMovRM:
+ return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(true), i.op2.format(true))
+ case xmmLoadConst:
+ panic("TODO")
+ case xmmToGpr:
+ return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(i.b1), i.op2.format(i.b1))
+ case cvtUint64ToFloatSeq:
+ panic("TODO")
+ case cvtFloatToSintSeq:
+ panic("TODO")
+ case cvtFloatToUintSeq:
+ panic("TODO")
+ case xmmMinMaxSeq:
+ panic("TODO")
+ case xmmCmpRmR:
+ return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(false), i.op2.format(false))
+ case xmmRmRImm:
+ op := sseOpcode(i.u1)
+ r1, r2 := i.op1.format(op == sseOpcodePextrq || op == sseOpcodePinsrq),
+ i.op2.format(op == sseOpcodePextrq || op == sseOpcodePinsrq)
+ return fmt.Sprintf("%s $%d, %s, %s", op, i.u2, r1, r2)
+ case jmp:
+ return fmt.Sprintf("jmp %s", i.op1.format(true))
+ case jmpIf:
+ return fmt.Sprintf("j%s %s", cond(i.u1), i.op1.format(true))
+ case jmpTableIsland:
+ return fmt.Sprintf("jump_table_island: jmp_table_index=%d", i.u1)
+ case exitSequence:
+ return fmt.Sprintf("exit_sequence %s", i.op1.format(true))
+ case ud2:
+ return "ud2"
+ case call:
+ return fmt.Sprintf("call %s", ssa.FuncRef(i.u1))
+ case callIndirect:
+ return fmt.Sprintf("callq *%s", i.op1.format(true))
+ case xchg:
+ var suffix string
+ switch i.u1 {
+ case 1:
+ suffix = "b"
+ case 2:
+ suffix = "w"
+ case 4:
+ suffix = "l"
+ case 8:
+ suffix = "q"
+ }
+ return fmt.Sprintf("xchg.%s %s, %s", suffix, i.op1.format(true), i.op2.format(true))
+ case zeros:
+ return fmt.Sprintf("xor %s, %s", i.op2.format(true), i.op2.format(true))
+ case fcvtToSintSequence:
+ execCtx, src, tmpGp, tmpGp2, tmpXmm, src64, dst64, sat := i.fcvtToSintSequenceData()
+ return fmt.Sprintf(
+ "fcvtToSintSequence execCtx=%s, src=%s, tmpGp=%s, tmpGp2=%s, tmpXmm=%s, src64=%v, dst64=%v, sat=%v",
+ formatVRegSized(execCtx, true),
+ formatVRegSized(src, true),
+ formatVRegSized(tmpGp, true),
+ formatVRegSized(tmpGp2, true),
+ formatVRegSized(tmpXmm, true), src64, dst64, sat)
+ case fcvtToUintSequence:
+ execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2, src64, dst64, sat := i.fcvtToUintSequenceData()
+ return fmt.Sprintf(
+ "fcvtToUintSequence execCtx=%s, src=%s, tmpGp=%s, tmpGp2=%s, tmpXmm=%s, tmpXmm2=%s, src64=%v, dst64=%v, sat=%v",
+ formatVRegSized(execCtx, true),
+ formatVRegSized(src, true),
+ formatVRegSized(tmpGp, true),
+ formatVRegSized(tmpGp2, true),
+ formatVRegSized(tmpXmm, true),
+ formatVRegSized(tmpXmm2, true), src64, dst64, sat)
+ case idivRemSequence:
+ execCtx, divisor, tmpGp, isDiv, signed, _64 := i.idivRemSequenceData()
+ return fmt.Sprintf("idivRemSequence execCtx=%s, divisor=%s, tmpGp=%s, isDiv=%v, signed=%v, _64=%v",
+ formatVRegSized(execCtx, true), formatVRegSized(divisor, _64), formatVRegSized(tmpGp, _64), isDiv, signed, _64)
+ case defineUninitializedReg:
+ return fmt.Sprintf("defineUninitializedReg %s", i.op2.format(true))
+ case xmmCMov:
+ return fmt.Sprintf("xmmcmov%s %s, %s", cond(i.u1), i.op1.format(true), i.op2.format(true))
+ case blendvpd:
+ return fmt.Sprintf("blendvpd %s, %s, %%xmm0", i.op1.format(false), i.op2.format(false))
+ case mfence:
+ return "mfence"
+ case lockcmpxchg:
+ var suffix string
+ switch i.u1 {
+ case 1:
+ suffix = "b"
+ case 2:
+ suffix = "w"
+ case 4:
+ suffix = "l"
+ case 8:
+ suffix = "q"
+ }
+ return fmt.Sprintf("lock cmpxchg.%s %s, %s", suffix, i.op1.format(true), i.op2.format(true))
+ case lockxadd:
+ var suffix string
+ switch i.u1 {
+ case 1:
+ suffix = "b"
+ case 2:
+ suffix = "w"
+ case 4:
+ suffix = "l"
+ case 8:
+ suffix = "q"
+ }
+ return fmt.Sprintf("lock xadd.%s %s, %s", suffix, i.op1.format(true), i.op2.format(true))
+
+ case nopUseReg:
+ return fmt.Sprintf("nop_use_reg %s", i.op1.format(true))
+
+ default:
+ panic(fmt.Sprintf("BUG: %d", int(i.kind)))
+ }
+}
+
+// Defs implements regalloc.Instr.
+func (i *instruction) Defs(regs *[]regalloc.VReg) []regalloc.VReg {
+ *regs = (*regs)[:0]
+ switch dk := defKinds[i.kind]; dk {
+ case defKindNone:
+ case defKindOp2:
+ *regs = append(*regs, i.op2.reg())
+ case defKindCall:
+ _, _, retIntRealRegs, retFloatRealRegs, _ := backend.ABIInfoFromUint64(i.u2)
+ for i := byte(0); i < retIntRealRegs; i++ {
+ *regs = append(*regs, regInfo.RealRegToVReg[intArgResultRegs[i]])
+ }
+ for i := byte(0); i < retFloatRealRegs; i++ {
+ *regs = append(*regs, regInfo.RealRegToVReg[floatArgResultRegs[i]])
+ }
+ case defKindDivRem:
+ _, _, _, isDiv, _, _ := i.idivRemSequenceData()
+ if isDiv {
+ *regs = append(*regs, raxVReg)
+ } else {
+ *regs = append(*regs, rdxVReg)
+ }
+ default:
+ panic(fmt.Sprintf("BUG: invalid defKind \"%s\" for %s", dk, i))
+ }
+ return *regs
+}
+
+// Uses implements regalloc.Instr.
+func (i *instruction) Uses(regs *[]regalloc.VReg) []regalloc.VReg {
+ *regs = (*regs)[:0]
+ switch uk := useKinds[i.kind]; uk {
+ case useKindNone:
+ case useKindOp1Op2Reg, useKindOp1RegOp2:
+ opAny, opReg := &i.op1, &i.op2
+ if uk == useKindOp1RegOp2 {
+ opAny, opReg = opReg, opAny
+ }
+		// The destination operand (op2) can only be a register,
+		// while the source operand (op1) can be an imm32, a register, or memory.
+ switch opAny.kind {
+ case operandKindReg:
+ *regs = append(*regs, opAny.reg())
+ case operandKindMem:
+ opAny.addressMode().uses(regs)
+ case operandKindImm32:
+ default:
+ panic(fmt.Sprintf("BUG: invalid operand: %s", i))
+ }
+ if opReg.kind != operandKindReg {
+ panic(fmt.Sprintf("BUG: invalid operand: %s", i))
+ }
+ *regs = append(*regs, opReg.reg())
+ case useKindOp1:
+ op := i.op1
+ switch op.kind {
+ case operandKindReg:
+ *regs = append(*regs, op.reg())
+ case operandKindMem:
+ op.addressMode().uses(regs)
+ case operandKindImm32, operandKindLabel:
+ default:
+ panic(fmt.Sprintf("BUG: invalid operand: %s", i))
+ }
+ case useKindCallInd:
+ op := i.op1
+ switch op.kind {
+ case operandKindReg:
+ *regs = append(*regs, op.reg())
+ case operandKindMem:
+ op.addressMode().uses(regs)
+ default:
+ panic(fmt.Sprintf("BUG: invalid operand: %s", i))
+ }
+ fallthrough
+ case useKindCall:
+ argIntRealRegs, argFloatRealRegs, _, _, _ := backend.ABIInfoFromUint64(i.u2)
+ for i := byte(0); i < argIntRealRegs; i++ {
+ *regs = append(*regs, regInfo.RealRegToVReg[intArgResultRegs[i]])
+ }
+ for i := byte(0); i < argFloatRealRegs; i++ {
+ *regs = append(*regs, regInfo.RealRegToVReg[floatArgResultRegs[i]])
+ }
+ case useKindFcvtToSintSequence:
+ execCtx, src, tmpGp, tmpGp2, tmpXmm, _, _, _ := i.fcvtToSintSequenceData()
+ *regs = append(*regs, execCtx, src, tmpGp, tmpGp2, tmpXmm)
+ case useKindFcvtToUintSequence:
+ execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2, _, _, _ := i.fcvtToUintSequenceData()
+ *regs = append(*regs, execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2)
+ case useKindDivRem:
+ execCtx, divisor, tmpGp, _, _, _ := i.idivRemSequenceData()
+ // idiv uses rax and rdx as implicit operands.
+ *regs = append(*regs, raxVReg, rdxVReg, execCtx, divisor, tmpGp)
+ case useKindBlendvpd:
+ *regs = append(*regs, xmm0VReg)
+
+ opAny, opReg := &i.op1, &i.op2
+ switch opAny.kind {
+ case operandKindReg:
+ *regs = append(*regs, opAny.reg())
+ case operandKindMem:
+ opAny.addressMode().uses(regs)
+ default:
+ panic(fmt.Sprintf("BUG: invalid operand: %s", i))
+ }
+ if opReg.kind != operandKindReg {
+ panic(fmt.Sprintf("BUG: invalid operand: %s", i))
+ }
+ *regs = append(*regs, opReg.reg())
+
+ case useKindRaxOp1RegOp2:
+ opReg, opAny := &i.op1, &i.op2
+ *regs = append(*regs, raxVReg, opReg.reg())
+ switch opAny.kind {
+ case operandKindReg:
+ *regs = append(*regs, opAny.reg())
+ case operandKindMem:
+ opAny.addressMode().uses(regs)
+ default:
+ panic(fmt.Sprintf("BUG: invalid operand: %s", i))
+ }
+ if opReg.kind != operandKindReg {
+ panic(fmt.Sprintf("BUG: invalid operand: %s", i))
+ }
+
+ default:
+ panic(fmt.Sprintf("BUG: invalid useKind %s for %s", uk, i))
+ }
+ return *regs
+}
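+
+// exampleCollectRegs is an illustrative sketch and is not part of the upstream wazero source.
+// Defs and Uses above deliberately truncate and reuse the caller-provided slice instead of allocating
+// a new one, so the register allocator can query every instruction without per-call heap allocations.
+// This helper shows the same reuse pattern in isolation.
+func exampleCollectRegs(dst *[]regalloc.VReg, vs ...regalloc.VReg) []regalloc.VReg {
+	*dst = (*dst)[:0] // keep the capacity, drop the previous contents.
+	*dst = append(*dst, vs...)
+	return *dst
+}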
+
+// AssignUse implements regalloc.Instr.
+func (i *instruction) AssignUse(index int, v regalloc.VReg) {
+ switch uk := useKinds[i.kind]; uk {
+ case useKindNone:
+ case useKindCallInd:
+ if index != 0 {
+ panic("BUG")
+ }
+ op := &i.op1
+ switch op.kind {
+ case operandKindReg:
+ op.setReg(v)
+ case operandKindMem:
+ op.addressMode().assignUses(index, v)
+ default:
+ panic("BUG")
+ }
+ case useKindOp1Op2Reg, useKindOp1RegOp2:
+ op, opMustBeReg := &i.op1, &i.op2
+ if uk == useKindOp1RegOp2 {
+ op, opMustBeReg = opMustBeReg, op
+ }
+ switch op.kind {
+ case operandKindReg:
+ if index == 0 {
+ op.setReg(v)
+ } else if index == 1 {
+ opMustBeReg.setReg(v)
+ } else {
+ panic("BUG")
+ }
+ case operandKindMem:
+ nregs := op.addressMode().nregs()
+ if index < nregs {
+ op.addressMode().assignUses(index, v)
+ } else if index == nregs {
+ opMustBeReg.setReg(v)
+ } else {
+ panic("BUG")
+ }
+ case operandKindImm32:
+ if index == 0 {
+ opMustBeReg.setReg(v)
+ } else {
+ panic("BUG")
+ }
+ default:
+ panic(fmt.Sprintf("BUG: invalid operand pair: %s", i))
+ }
+ case useKindOp1:
+ op := &i.op1
+ switch op.kind {
+ case operandKindReg:
+ if index != 0 {
+ panic("BUG")
+ }
+ op.setReg(v)
+ case operandKindMem:
+ op.addressMode().assignUses(index, v)
+ default:
+ panic(fmt.Sprintf("BUG: invalid operand: %s", i))
+ }
+ case useKindFcvtToSintSequence:
+ switch index {
+ case 0:
+ i.op1.addressMode().base = v
+ case 1:
+ i.op1.addressMode().index = v
+ case 2:
+ i.op2.addressMode().base = v
+ case 3:
+ i.op2.addressMode().index = v
+ case 4:
+ i.u1 = uint64(v)
+ default:
+ panic("BUG")
+ }
+ case useKindFcvtToUintSequence:
+ switch index {
+ case 0:
+ i.op1.addressMode().base = v
+ case 1:
+ i.op1.addressMode().index = v
+ case 2:
+ i.op2.addressMode().base = v
+ case 3:
+ i.op2.addressMode().index = v
+ case 4:
+ i.u1 = uint64(v)
+ case 5:
+ i.u2 = uint64(v)
+ default:
+ panic("BUG")
+ }
+ case useKindDivRem:
+ switch index {
+ case 0:
+ if v != raxVReg {
+ panic("BUG")
+ }
+ case 1:
+ if v != rdxVReg {
+ panic("BUG")
+ }
+ case 2:
+ i.op1.setReg(v)
+ case 3:
+ i.op2.setReg(v)
+ case 4:
+ i.u1 = uint64(v)
+ default:
+ panic("BUG")
+ }
+ case useKindBlendvpd:
+ op, opMustBeReg := &i.op1, &i.op2
+ if index == 0 {
+ if v.RealReg() != xmm0 {
+ panic("BUG")
+ }
+ } else {
+ switch op.kind {
+ case operandKindReg:
+ switch index {
+ case 1:
+ op.setReg(v)
+ case 2:
+ opMustBeReg.setReg(v)
+ default:
+ panic("BUG")
+ }
+ case operandKindMem:
+ nregs := op.addressMode().nregs()
+ index--
+ if index < nregs {
+ op.addressMode().assignUses(index, v)
+ } else if index == nregs {
+ opMustBeReg.setReg(v)
+ } else {
+ panic("BUG")
+ }
+ default:
+ panic(fmt.Sprintf("BUG: invalid operand pair: %s", i))
+ }
+ }
+
+ case useKindRaxOp1RegOp2:
+ switch index {
+ case 0:
+ if v.RealReg() != rax {
+ panic("BUG")
+ }
+ case 1:
+ i.op1.setReg(v)
+ default:
+ op := &i.op2
+ switch op.kind {
+ case operandKindReg:
+ switch index {
+ case 1:
+ op.setReg(v)
+ case 2:
+ op.setReg(v)
+ default:
+ panic("BUG")
+ }
+ case operandKindMem:
+ nregs := op.addressMode().nregs()
+ index -= 2
+ if index < nregs {
+ op.addressMode().assignUses(index, v)
+ } else if index == nregs {
+ op.setReg(v)
+ } else {
+ panic("BUG")
+ }
+ default:
+ panic(fmt.Sprintf("BUG: invalid operand pair: %s", i))
+ }
+ }
+ default:
+ panic(fmt.Sprintf("BUG: invalid useKind %s for %s", uk, i))
+ }
+}
+
+// AssignDef implements regalloc.Instr.
+func (i *instruction) AssignDef(reg regalloc.VReg) {
+ switch dk := defKinds[i.kind]; dk {
+ case defKindNone:
+ case defKindOp2:
+ i.op2.setReg(reg)
+ default:
+ panic(fmt.Sprintf("BUG: invalid defKind \"%s\" for %s", dk, i))
+ }
+}
+
+// IsCopy implements regalloc.Instr.
+func (i *instruction) IsCopy() bool {
+ k := i.kind
+ if k == movRR {
+ return true
+ }
+ if k == xmmUnaryRmR {
+ if i.op1.kind == operandKindReg {
+ sse := sseOpcode(i.u1)
+ return sse == sseOpcodeMovss || sse == sseOpcodeMovsd || sse == sseOpcodeMovdqu
+ }
+ }
+ return false
+}
+
+func resetInstruction(i *instruction) {
+ *i = instruction{}
+}
+
+func setNext(i *instruction, next *instruction) {
+ i.next = next
+}
+
+func setPrev(i *instruction, prev *instruction) {
+ i.prev = prev
+}
+
+func asNop(i *instruction) {
+ i.kind = nop0
+}
+
+func (i *instruction) asNop0WithLabel(label backend.Label) *instruction { //nolint
+ i.kind = nop0
+ i.u1 = uint64(label)
+ return i
+}
+
+func (i *instruction) nop0Label() backend.Label {
+ return backend.Label(i.u1)
+}
+
+type instructionKind byte
+
+const (
+ nop0 instructionKind = iota + 1
+
+ // Integer arithmetic/bit-twiddling: (add sub and or xor mul, etc.) (32 64) (reg addr imm) reg
+ aluRmiR
+
+ // Instructions on GPR that only read src and defines dst (dst is not modified): bsr, etc.
+ unaryRmR
+
+ // Bitwise not
+ not
+
+ // Integer negation
+ neg
+
+ // Integer quotient and remainder: (div idiv) $rax $rdx (reg addr)
+ div
+
+ // The high bits (RDX) of a (un)signed multiply: RDX:RAX := RAX * rhs.
+ mulHi
+
+ // Do a sign-extend based on the sign of the value in rax into rdx: (cwd cdq cqo)
+ // or al into ah: (cbw)
+ signExtendData
+
+ // Constant materialization: (imm32 imm64) reg.
+ // Either: movl $imm32, %reg32 or movabsq $imm64, %reg64.
+ imm
+
+ // GPR to GPR move: mov (64 32) reg reg.
+ movRR
+
+	// movzxRmR represents zero-extending loads or register-to-register moves, except for 64 bits: movz (bl bq wl wq lq) addr reg.
+ // Note that the lq variant doesn't really exist since the default zero-extend rule makes it
+ // unnecessary. For that case we emit the equivalent "movl AM, reg32".
+ movzxRmR
+
+ // mov64MR is a plain 64-bit integer load, since movzxRmR can't represent that.
+ mov64MR
+
+ // Loads the memory address of addr into dst.
+ lea
+
+ // Sign-extended loads and moves: movs (bl bq wl wq lq) addr reg.
+ movsxRmR
+
+ // Integer stores: mov (b w l q) reg addr.
+ movRM
+
+ // Arithmetic shifts: (shl shr sar) (b w l q) imm reg.
+ shiftR
+
+ // Arithmetic SIMD shifts.
+ xmmRmiReg
+
+ // Integer comparisons/tests: cmp or test (b w l q) (reg addr imm) reg.
+ cmpRmiR
+
+ // Materializes the requested condition code in the destination reg.
+ setcc
+
+ // Integer conditional move.
+ // Overwrites the destination register.
+ cmove
+
+ // pushq (reg addr imm)
+ push64
+
+ // popq reg
+ pop64
+
+ // XMM (scalar or vector) binary op: (add sub and or xor mul adc? sbb?) (32 64) (reg addr) reg
+ xmmRmR
+
+ // XMM (scalar or vector) unary op: mov between XMM registers (32 64) (reg addr) reg.
+ //
+ // This differs from xmmRmR in that the dst register of xmmUnaryRmR is not used in the
+ // computation of the instruction dst value and so does not have to be a previously valid
+ // value. This is characteristic of mov instructions.
+ xmmUnaryRmR
+
+ // XMM (scalar or vector) unary op with immediate: roundss, roundsd, etc.
+ //
+	// This differs from xmmRmRImm in that the dst register of
+	// xmmUnaryRmRImm is not used in the computation of the instruction dst
+	// value and so does not have to be a previously valid value.
+ xmmUnaryRmRImm
+
+ // XMM (scalar or vector) unary op (from xmm to mem): stores, movd, movq
+ xmmMovRM
+
+ // XMM (vector) unary op (to move a constant value into an xmm register): movups
+ xmmLoadConst
+
+ // XMM (scalar) unary op (from xmm to integer reg): movd, movq, cvtts{s,d}2si
+ xmmToGpr
+
+ // XMM (scalar) unary op (from integer to float reg): movd, movq, cvtsi2s{s,d}
+ gprToXmm
+
+ // Converts an unsigned int64 to a float32/float64.
+ cvtUint64ToFloatSeq
+
+ // Converts a scalar xmm to a signed int32/int64.
+ cvtFloatToSintSeq
+
+ // Converts a scalar xmm to an unsigned int32/int64.
+ cvtFloatToUintSeq
+
+ // A sequence to compute min/max with the proper NaN semantics for xmm registers.
+ xmmMinMaxSeq
+
+ // Float comparisons/tests: cmp (b w l q) (reg addr imm) reg.
+ xmmCmpRmR
+
+ // A binary XMM instruction with an 8-bit immediate: e.g. cmp (ps pd) imm (reg addr) reg
+ xmmRmRImm
+
+ // Direct call: call simm32.
+	// Note that the offset is relative to the *current RIP*, which points to the first byte of the next instruction.
+ call
+
+ // Indirect call: callq (reg mem).
+ callIndirect
+
+ // Return.
+ ret
+
+ // Jump: jmp (reg, mem, imm32 or label)
+ jmp
+
+ // Jump conditionally: jcond cond label.
+ jmpIf
+
+ // jmpTableIsland is to emit the jump table.
+ jmpTableIsland
+
+ // exitSequence exits the execution and go back to the Go world.
+ exitSequence
+
+ // An instruction that will always trigger the illegal instruction exception.
+ ud2
+
+ // xchg is described in https://www.felixcloutier.com/x86/xchg.
+ // This instruction uses two operands, where one of them can be a memory address, and swaps their values.
+ // If the dst is a memory address, the execution is atomic.
+ xchg
+
+ // lockcmpxchg is the cmpxchg instruction https://www.felixcloutier.com/x86/cmpxchg with a lock prefix.
+ lockcmpxchg
+
+	// zeros puts zeros into the destination register. This is implemented as xor reg, reg for
+	// either integer or XMM registers. The reason we have this instruction instead of using aluRmiR
+	// is that aluRmiR requires its register operands to be already defined; from the register
+	// allocator's perspective, zeros defines the destination register and takes no inputs.
+ zeros
+
+ // sourceOffsetInfo is a dummy instruction to emit source offset info.
+ // The existence of this instruction does not affect the execution.
+ sourceOffsetInfo
+
+ // defineUninitializedReg is a no-op instruction that defines a register without a defining instruction.
+ defineUninitializedReg
+
+ // fcvtToSintSequence is a sequence of instructions to convert a float to a signed integer.
+ fcvtToSintSequence
+
+ // fcvtToUintSequence is a sequence of instructions to convert a float to an unsigned integer.
+ fcvtToUintSequence
+
+ // xmmCMov is a conditional move instruction for XMM registers. Lowered after register allocation.
+ xmmCMov
+
+ // idivRemSequence is a sequence of instructions to compute both the quotient and remainder of a division.
+ idivRemSequence
+
+ // blendvpd is https://www.felixcloutier.com/x86/blendvpd.
+ blendvpd
+
+ // mfence is https://www.felixcloutier.com/x86/mfence
+ mfence
+
+ // lockxadd is xadd https://www.felixcloutier.com/x86/xadd with a lock prefix.
+ lockxadd
+
+ // nopUseReg is a meta instruction that uses one register and does nothing.
+ nopUseReg
+
+ instrMax
+)
+
+func (i *instruction) asMFence() *instruction {
+ i.kind = mfence
+ return i
+}
+
+func (i *instruction) asNopUseReg(r regalloc.VReg) *instruction {
+ i.kind = nopUseReg
+ i.op1 = newOperandReg(r)
+ return i
+}
+
+func (i *instruction) asIdivRemSequence(execCtx, divisor, tmpGp regalloc.VReg, isDiv, signed, _64 bool) *instruction {
+ i.kind = idivRemSequence
+ i.op1 = newOperandReg(execCtx)
+ i.op2 = newOperandReg(divisor)
+ i.u1 = uint64(tmpGp)
+ if isDiv {
+ i.u2 |= 1
+ }
+ if signed {
+ i.u2 |= 2
+ }
+ if _64 {
+ i.u2 |= 4
+ }
+ return i
+}
+
+func (i *instruction) idivRemSequenceData() (
+ execCtx, divisor, tmpGp regalloc.VReg, isDiv, signed, _64 bool,
+) {
+ if i.kind != idivRemSequence {
+ panic("BUG")
+ }
+ return i.op1.reg(), i.op2.reg(), regalloc.VReg(i.u1), i.u2&1 != 0, i.u2&2 != 0, i.u2&4 != 0
+}
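+
+// examplePackIdivRemFlags and exampleUnpackIdivRemFlags are an illustrative sketch and are not part of
+// the upstream wazero source. asIdivRemSequence and idivRemSequenceData above pack the three booleans
+// into u2 as bit flags (1=isDiv, 2=signed, 4=_64); these helpers show the same round-trip on a plain uint64.
+func examplePackIdivRemFlags(isDiv, signed, is64 bool) (flags uint64) {
+	if isDiv {
+		flags |= 1
+	}
+	if signed {
+		flags |= 2
+	}
+	if is64 {
+		flags |= 4
+	}
+	return
+}
+
+func exampleUnpackIdivRemFlags(flags uint64) (isDiv, signed, is64 bool) {
+	return flags&1 != 0, flags&2 != 0, flags&4 != 0
+}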
+
+func (i *instruction) asXmmCMov(cc cond, x operand, rd regalloc.VReg, size byte) *instruction {
+ i.kind = xmmCMov
+ i.op1 = x
+ i.op2 = newOperandReg(rd)
+ i.u1 = uint64(cc)
+ i.u2 = uint64(size)
+ return i
+}
+
+func (i *instruction) asDefineUninitializedReg(r regalloc.VReg) *instruction {
+ i.kind = defineUninitializedReg
+ i.op2 = newOperandReg(r)
+ return i
+}
+
+func (m *machine) allocateFcvtToUintSequence(
+ execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2 regalloc.VReg,
+ src64, dst64, sat bool,
+) *instruction {
+ i := m.allocateInstr()
+ i.kind = fcvtToUintSequence
+ op1a := m.amodePool.Allocate()
+ op2a := m.amodePool.Allocate()
+ i.op1 = newOperandMem(op1a)
+ i.op2 = newOperandMem(op2a)
+ if src64 {
+ op1a.imm32 = 1
+ } else {
+ op1a.imm32 = 0
+ }
+ if dst64 {
+ op1a.imm32 |= 2
+ }
+ if sat {
+ op1a.imm32 |= 4
+ }
+
+ op1a.base = execCtx
+ op1a.index = src
+ op2a.base = tmpGp
+ op2a.index = tmpGp2
+ i.u1 = uint64(tmpXmm)
+ i.u2 = uint64(tmpXmm2)
+ return i
+}
+
+func (i *instruction) fcvtToUintSequenceData() (
+ execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2 regalloc.VReg, src64, dst64, sat bool,
+) {
+ if i.kind != fcvtToUintSequence {
+ panic("BUG")
+ }
+ op1a := i.op1.addressMode()
+ op2a := i.op2.addressMode()
+ return op1a.base, op1a.index, op2a.base, op2a.index, regalloc.VReg(i.u1), regalloc.VReg(i.u2),
+ op1a.imm32&1 != 0, op1a.imm32&2 != 0, op1a.imm32&4 != 0
+}
+
+func (m *machine) allocateFcvtToSintSequence(
+ execCtx, src, tmpGp, tmpGp2, tmpXmm regalloc.VReg,
+ src64, dst64, sat bool,
+) *instruction {
+ i := m.allocateInstr()
+ i.kind = fcvtToSintSequence
+ op1a := m.amodePool.Allocate()
+ op2a := m.amodePool.Allocate()
+ i.op1 = newOperandMem(op1a)
+ i.op2 = newOperandMem(op2a)
+ op1a.base = execCtx
+ op1a.index = src
+ op2a.base = tmpGp
+ op2a.index = tmpGp2
+ i.u1 = uint64(tmpXmm)
+ if src64 {
+ i.u2 = 1
+ } else {
+ i.u2 = 0
+ }
+ if dst64 {
+ i.u2 |= 2
+ }
+ if sat {
+ i.u2 |= 4
+ }
+ return i
+}
+
+func (i *instruction) fcvtToSintSequenceData() (
+ execCtx, src, tmpGp, tmpGp2, tmpXmm regalloc.VReg, src64, dst64, sat bool,
+) {
+ if i.kind != fcvtToSintSequence {
+ panic("BUG")
+ }
+ op1a := i.op1.addressMode()
+ op2a := i.op2.addressMode()
+ return op1a.base, op1a.index, op2a.base, op2a.index, regalloc.VReg(i.u1),
+ i.u2&1 != 0, i.u2&2 != 0, i.u2&4 != 0
+}
+
+func (k instructionKind) String() string {
+ switch k {
+ case nop0:
+ return "nop"
+ case ret:
+ return "ret"
+ case imm:
+ return "imm"
+ case aluRmiR:
+ return "aluRmiR"
+ case movRR:
+ return "movRR"
+ case xmmRmR:
+ return "xmmRmR"
+ case gprToXmm:
+ return "gprToXmm"
+ case xmmUnaryRmR:
+ return "xmmUnaryRmR"
+ case xmmUnaryRmRImm:
+ return "xmmUnaryRmRImm"
+ case unaryRmR:
+ return "unaryRmR"
+ case not:
+ return "not"
+ case neg:
+ return "neg"
+ case div:
+ return "div"
+ case mulHi:
+ return "mulHi"
+ case signExtendData:
+ return "signExtendData"
+ case movzxRmR:
+ return "movzxRmR"
+ case mov64MR:
+ return "mov64MR"
+ case lea:
+ return "lea"
+ case movsxRmR:
+ return "movsxRmR"
+ case movRM:
+ return "movRM"
+ case shiftR:
+ return "shiftR"
+ case xmmRmiReg:
+ return "xmmRmiReg"
+ case cmpRmiR:
+ return "cmpRmiR"
+ case setcc:
+ return "setcc"
+ case cmove:
+ return "cmove"
+ case push64:
+ return "push64"
+ case pop64:
+ return "pop64"
+ case xmmMovRM:
+ return "xmmMovRM"
+ case xmmLoadConst:
+ return "xmmLoadConst"
+ case xmmToGpr:
+ return "xmmToGpr"
+ case cvtUint64ToFloatSeq:
+ return "cvtUint64ToFloatSeq"
+ case cvtFloatToSintSeq:
+ return "cvtFloatToSintSeq"
+ case cvtFloatToUintSeq:
+ return "cvtFloatToUintSeq"
+ case xmmMinMaxSeq:
+ return "xmmMinMaxSeq"
+ case xmmCmpRmR:
+ return "xmmCmpRmR"
+ case xmmRmRImm:
+ return "xmmRmRImm"
+ case jmpIf:
+ return "jmpIf"
+ case jmp:
+ return "jmp"
+ case jmpTableIsland:
+ return "jmpTableIsland"
+ case exitSequence:
+ return "exit_sequence"
+ case ud2:
+ return "ud2"
+ case xchg:
+ return "xchg"
+ case zeros:
+ return "zeros"
+ case fcvtToSintSequence:
+ return "fcvtToSintSequence"
+ case fcvtToUintSequence:
+ return "fcvtToUintSequence"
+ case xmmCMov:
+ return "xmmCMov"
+ case idivRemSequence:
+ return "idivRemSequence"
+ case mfence:
+ return "mfence"
+ case lockcmpxchg:
+ return "lockcmpxchg"
+ case lockxadd:
+ return "lockxadd"
+ default:
+ panic("BUG")
+ }
+}
+
+type aluRmiROpcode byte
+
+const (
+ aluRmiROpcodeAdd aluRmiROpcode = iota + 1
+ aluRmiROpcodeSub
+ aluRmiROpcodeAnd
+ aluRmiROpcodeOr
+ aluRmiROpcodeXor
+ aluRmiROpcodeMul
+)
+
+func (a aluRmiROpcode) String() string {
+ switch a {
+ case aluRmiROpcodeAdd:
+ return "add"
+ case aluRmiROpcodeSub:
+ return "sub"
+ case aluRmiROpcodeAnd:
+ return "and"
+ case aluRmiROpcodeOr:
+ return "or"
+ case aluRmiROpcodeXor:
+ return "xor"
+ case aluRmiROpcodeMul:
+ return "imul"
+ default:
+ panic("BUG")
+ }
+}
+
+func (i *instruction) asJmpIf(cond cond, target operand) *instruction {
+ i.kind = jmpIf
+ i.u1 = uint64(cond)
+ i.op1 = target
+ return i
+}
+
+// asJmpTableSequence is used to emit the jump table.
+// targetSliceIndex is the index of the target slice in machine.jmpTableTargets.
+func (i *instruction) asJmpTableSequence(targetSliceIndex int, targetCount int) *instruction {
+ i.kind = jmpTableIsland
+ i.u1 = uint64(targetSliceIndex)
+ i.u2 = uint64(targetCount)
+ return i
+}
+
+func (i *instruction) asJmp(target operand) *instruction {
+ i.kind = jmp
+ i.op1 = target
+ return i
+}
+
+func (i *instruction) jmpLabel() backend.Label {
+ switch i.kind {
+ case jmp, jmpIf, lea, xmmUnaryRmR:
+ return i.op1.label()
+ default:
+ panic("BUG")
+ }
+}
+
+func (i *instruction) asLEA(target operand, rd regalloc.VReg) *instruction {
+ i.kind = lea
+ i.op1 = target
+ i.op2 = newOperandReg(rd)
+ return i
+}
+
+func (i *instruction) asCall(ref ssa.FuncRef, abi *backend.FunctionABI) *instruction {
+ i.kind = call
+ i.u1 = uint64(ref)
+ if abi != nil {
+ i.u2 = abi.ABIInfoAsUint64()
+ }
+ return i
+}
+
+func (i *instruction) asCallIndirect(ptr operand, abi *backend.FunctionABI) *instruction {
+ if ptr.kind != operandKindReg && ptr.kind != operandKindMem {
+ panic("BUG")
+ }
+ i.kind = callIndirect
+ i.op1 = ptr
+ if abi != nil {
+ i.u2 = abi.ABIInfoAsUint64()
+ }
+ return i
+}
+
+func (i *instruction) asRet() *instruction {
+ i.kind = ret
+ return i
+}
+
+func (i *instruction) asImm(dst regalloc.VReg, value uint64, _64 bool) *instruction {
+ i.kind = imm
+ i.op2 = newOperandReg(dst)
+ i.u1 = value
+ i.b1 = _64
+ return i
+}
+
+func (i *instruction) asAluRmiR(op aluRmiROpcode, rm operand, rd regalloc.VReg, _64 bool) *instruction {
+ if rm.kind != operandKindReg && rm.kind != operandKindMem && rm.kind != operandKindImm32 {
+ panic("BUG")
+ }
+ i.kind = aluRmiR
+ i.op1 = rm
+ i.op2 = newOperandReg(rd)
+ i.u1 = uint64(op)
+ i.b1 = _64
+ return i
+}
+
+func (i *instruction) asZeros(dst regalloc.VReg) *instruction {
+ i.kind = zeros
+ i.op2 = newOperandReg(dst)
+ return i
+}
+
+func (i *instruction) asBlendvpd(rm operand, rd regalloc.VReg) *instruction {
+ if rm.kind != operandKindReg && rm.kind != operandKindMem {
+ panic("BUG")
+ }
+ i.kind = blendvpd
+ i.op1 = rm
+ i.op2 = newOperandReg(rd)
+ return i
+}
+
+func (i *instruction) asXmmRmR(op sseOpcode, rm operand, rd regalloc.VReg) *instruction {
+ if rm.kind != operandKindReg && rm.kind != operandKindMem {
+ panic("BUG")
+ }
+ i.kind = xmmRmR
+ i.op1 = rm
+ i.op2 = newOperandReg(rd)
+ i.u1 = uint64(op)
+ return i
+}
+
+func (i *instruction) asXmmRmRImm(op sseOpcode, imm uint8, rm operand, rd regalloc.VReg) *instruction {
+ if rm.kind != operandKindReg && rm.kind != operandKindMem {
+ panic("BUG")
+ }
+ i.kind = xmmRmRImm
+ i.op1 = rm
+ i.op2 = newOperandReg(rd)
+ i.u1 = uint64(op)
+ i.u2 = uint64(imm)
+ return i
+}
+
+func (i *instruction) asGprToXmm(op sseOpcode, rm operand, rd regalloc.VReg, _64 bool) *instruction {
+ if rm.kind != operandKindReg && rm.kind != operandKindMem {
+ panic("BUG")
+ }
+ i.kind = gprToXmm
+ i.op1 = rm
+ i.op2 = newOperandReg(rd)
+ i.u1 = uint64(op)
+ i.b1 = _64
+ return i
+}
+
+func (i *instruction) asEmitSourceOffsetInfo(l ssa.SourceOffset) *instruction {
+ i.kind = sourceOffsetInfo
+ i.u1 = uint64(l)
+ return i
+}
+
+func (i *instruction) sourceOffsetInfo() ssa.SourceOffset {
+ return ssa.SourceOffset(i.u1)
+}
+
+func (i *instruction) asXmmToGpr(op sseOpcode, rm, rd regalloc.VReg, _64 bool) *instruction {
+ i.kind = xmmToGpr
+ i.op1 = newOperandReg(rm)
+ i.op2 = newOperandReg(rd)
+ i.u1 = uint64(op)
+ i.b1 = _64
+ return i
+}
+
+func (i *instruction) asMovRM(rm regalloc.VReg, rd operand, size byte) *instruction {
+ if rd.kind != operandKindMem {
+ panic("BUG")
+ }
+ i.kind = movRM
+ i.op1 = newOperandReg(rm)
+ i.op2 = rd
+ i.u1 = uint64(size)
+ return i
+}
+
+func (i *instruction) asMovsxRmR(ext extMode, src operand, rd regalloc.VReg) *instruction {
+ if src.kind != operandKindReg && src.kind != operandKindMem {
+ panic("BUG")
+ }
+ i.kind = movsxRmR
+ i.op1 = src
+ i.op2 = newOperandReg(rd)
+ i.u1 = uint64(ext)
+ return i
+}
+
+func (i *instruction) asMovzxRmR(ext extMode, src operand, rd regalloc.VReg) *instruction {
+ if src.kind != operandKindReg && src.kind != operandKindMem {
+ panic("BUG")
+ }
+ i.kind = movzxRmR
+ i.op1 = src
+ i.op2 = newOperandReg(rd)
+ i.u1 = uint64(ext)
+ return i
+}
+
+func (i *instruction) asSignExtendData(_64 bool) *instruction {
+ i.kind = signExtendData
+ i.b1 = _64
+ return i
+}
+
+func (i *instruction) asUD2() *instruction {
+ i.kind = ud2
+ return i
+}
+
+func (i *instruction) asDiv(rn operand, signed bool, _64 bool) *instruction {
+ i.kind = div
+ i.op1 = rn
+ i.b1 = _64
+ if signed {
+ i.u1 = 1
+ }
+ return i
+}
+
+func (i *instruction) asMov64MR(rm operand, rd regalloc.VReg) *instruction {
+ if rm.kind != operandKindMem {
+ panic("BUG")
+ }
+ i.kind = mov64MR
+ i.op1 = rm
+ i.op2 = newOperandReg(rd)
+ return i
+}
+
+func (i *instruction) asMovRR(rm, rd regalloc.VReg, _64 bool) *instruction {
+ i.kind = movRR
+ i.op1 = newOperandReg(rm)
+ i.op2 = newOperandReg(rd)
+ i.b1 = _64
+ return i
+}
+
+func (i *instruction) asNot(rm operand, _64 bool) *instruction {
+ if rm.kind != operandKindReg && rm.kind != operandKindMem {
+ panic("BUG")
+ }
+ i.kind = not
+ i.op1 = rm
+ i.b1 = _64
+ return i
+}
+
+func (i *instruction) asNeg(rm operand, _64 bool) *instruction {
+ if rm.kind != operandKindReg && rm.kind != operandKindMem {
+ panic("BUG")
+ }
+ i.kind = neg
+ i.op1 = rm
+ i.b1 = _64
+ return i
+}
+
+func (i *instruction) asMulHi(rm operand, signed, _64 bool) *instruction {
+ if rm.kind != operandKindReg && (rm.kind != operandKindMem) {
+ panic("BUG")
+ }
+ i.kind = mulHi
+ i.op1 = rm
+ i.b1 = _64
+ if signed {
+ i.u1 = 1
+ }
+ return i
+}
+
+func (i *instruction) asUnaryRmR(op unaryRmROpcode, rm operand, rd regalloc.VReg, _64 bool) *instruction {
+ if rm.kind != operandKindReg && rm.kind != operandKindMem {
+ panic("BUG")
+ }
+ i.kind = unaryRmR
+ i.op1 = rm
+ i.op2 = newOperandReg(rd)
+ i.u1 = uint64(op)
+ i.b1 = _64
+ return i
+}
+
+func (i *instruction) asShiftR(op shiftROp, amount operand, rd regalloc.VReg, _64 bool) *instruction {
+ if amount.kind != operandKindReg && amount.kind != operandKindImm32 {
+ panic("BUG")
+ }
+ i.kind = shiftR
+ i.op1 = amount
+ i.op2 = newOperandReg(rd)
+ i.u1 = uint64(op)
+ i.b1 = _64
+ return i
+}
+
+func (i *instruction) asXmmRmiReg(op sseOpcode, rm operand, rd regalloc.VReg) *instruction {
+ if rm.kind != operandKindReg && rm.kind != operandKindImm32 && rm.kind != operandKindMem {
+ panic("BUG")
+ }
+ i.kind = xmmRmiReg
+ i.op1 = rm
+ i.op2 = newOperandReg(rd)
+ i.u1 = uint64(op)
+ return i
+}
+
+func (i *instruction) asCmpRmiR(cmp bool, rm operand, rn regalloc.VReg, _64 bool) *instruction {
+ if rm.kind != operandKindReg && rm.kind != operandKindImm32 && rm.kind != operandKindMem {
+ panic("BUG")
+ }
+ i.kind = cmpRmiR
+ i.op1 = rm
+ i.op2 = newOperandReg(rn)
+ if cmp {
+ i.u1 = 1
+ }
+ i.b1 = _64
+ return i
+}
+
+func (i *instruction) asSetcc(c cond, rd regalloc.VReg) *instruction {
+ i.kind = setcc
+ i.op2 = newOperandReg(rd)
+ i.u1 = uint64(c)
+ return i
+}
+
+func (i *instruction) asCmove(c cond, rm operand, rd regalloc.VReg, _64 bool) *instruction {
+ i.kind = cmove
+ i.op1 = rm
+ i.op2 = newOperandReg(rd)
+ i.u1 = uint64(c)
+ i.b1 = _64
+ return i
+}
+
+func (m *machine) allocateExitSeq(execCtx regalloc.VReg) *instruction {
+ i := m.allocateInstr()
+ i.kind = exitSequence
+ i.op1 = newOperandReg(execCtx)
+ // Allocate the address mode that will be used in encoding the exit sequence.
+ i.op2 = newOperandMem(m.amodePool.Allocate())
+ return i
+}
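+
+// Note (editor's addition): the exitSequence case in encode rewrites this
+// pre-allocated amode in place so that it reloads the original frame and stack
+// pointers from the execution context before returning to Go; allocating the
+// amode here presumably keeps the encoding phase itself allocation-free. With a
+// hypothetical execCtxVReg holding the execution-context pointer:
+//
+//    exit := m.allocateExitSeq(execCtxVReg)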
+
+func (i *instruction) asXmmUnaryRmR(op sseOpcode, rm operand, rd regalloc.VReg) *instruction {
+ if rm.kind != operandKindReg && rm.kind != operandKindMem {
+ panic("BUG")
+ }
+ i.kind = xmmUnaryRmR
+ i.op1 = rm
+ i.op2 = newOperandReg(rd)
+ i.u1 = uint64(op)
+ return i
+}
+
+func (i *instruction) asXmmUnaryRmRImm(op sseOpcode, imm byte, rm operand, rd regalloc.VReg) *instruction {
+ if rm.kind != operandKindReg && rm.kind != operandKindMem {
+ panic("BUG")
+ }
+ i.kind = xmmUnaryRmRImm
+ i.op1 = rm
+ i.op2 = newOperandReg(rd)
+ i.u1 = uint64(op)
+ i.u2 = uint64(imm)
+ return i
+}
+
+func (i *instruction) asXmmCmpRmR(op sseOpcode, rm operand, rd regalloc.VReg) *instruction {
+ if rm.kind != operandKindReg && rm.kind != operandKindMem {
+ panic("BUG")
+ }
+ i.kind = xmmCmpRmR
+ i.op1 = rm
+ i.op2 = newOperandReg(rd)
+ i.u1 = uint64(op)
+ return i
+}
+
+func (i *instruction) asXmmMovRM(op sseOpcode, rm regalloc.VReg, rd operand) *instruction {
+ if rd.kind != operandKindMem {
+ panic("BUG")
+ }
+ i.kind = xmmMovRM
+ i.op1 = newOperandReg(rm)
+ i.op2 = rd
+ i.u1 = uint64(op)
+ return i
+}
+
+func (i *instruction) asPop64(rm regalloc.VReg) *instruction {
+ i.kind = pop64
+ i.op1 = newOperandReg(rm)
+ return i
+}
+
+func (i *instruction) asPush64(op operand) *instruction {
+ if op.kind != operandKindReg && op.kind != operandKindMem && op.kind != operandKindImm32 {
+ panic("BUG")
+ }
+ i.kind = push64
+ i.op1 = op
+ return i
+}
+
+func (i *instruction) asXCHG(rm regalloc.VReg, rd operand, size byte) *instruction {
+ i.kind = xchg
+ i.op1 = newOperandReg(rm)
+ i.op2 = rd
+ i.u1 = uint64(size)
+ return i
+}
+
+func (i *instruction) asLockCmpXCHG(rm regalloc.VReg, rd *amode, size byte) *instruction {
+ i.kind = lockcmpxchg
+ i.op1 = newOperandReg(rm)
+ i.op2 = newOperandMem(rd)
+ i.u1 = uint64(size)
+ return i
+}
+
+func (i *instruction) asLockXAdd(rm regalloc.VReg, rd *amode, size byte) *instruction {
+ i.kind = lockxadd
+ i.op1 = newOperandReg(rm)
+ i.op2 = newOperandMem(rd)
+ i.u1 = uint64(size)
+ return i
+}
+
+type unaryRmROpcode byte
+
+const (
+ unaryRmROpcodeBsr unaryRmROpcode = iota
+ unaryRmROpcodeBsf
+ unaryRmROpcodeLzcnt
+ unaryRmROpcodeTzcnt
+ unaryRmROpcodePopcnt
+)
+
+func (u unaryRmROpcode) String() string {
+ switch u {
+ case unaryRmROpcodeBsr:
+ return "bsr"
+ case unaryRmROpcodeBsf:
+ return "bsf"
+ case unaryRmROpcodeLzcnt:
+ return "lzcnt"
+ case unaryRmROpcodeTzcnt:
+ return "tzcnt"
+ case unaryRmROpcodePopcnt:
+ return "popcnt"
+ default:
+ panic("BUG")
+ }
+}
+
+type shiftROp byte
+
+const (
+ shiftROpRotateLeft shiftROp = 0
+ shiftROpRotateRight shiftROp = 1
+ shiftROpShiftLeft shiftROp = 4
+ shiftROpShiftRightLogical shiftROp = 5
+ shiftROpShiftRightArithmetic shiftROp = 7
+)
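+
+// Note (editor's addition): these explicit values are the ModRM /digit extensions
+// of the x86 shift/rotate group (ROL=/0, ROR=/1, SHL=/4, SHR=/5, SAR=/7), which is
+// why the shiftR case in encode passes uint8(i.u1) straight through as the
+// sub-opcode for opcodes 0xC1 (imm8 amount) and 0xD3 (amount in CL). A sketch with
+// a hypothetical destination dstVReg:
+//
+//    shl := m.allocateInstr().asShiftR(shiftROpShiftLeft, newOperandReg(rcxVReg), dstVReg, true) // shl dst, cl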
+
+func (s shiftROp) String() string {
+ switch s {
+ case shiftROpRotateLeft:
+ return "rol"
+ case shiftROpRotateRight:
+ return "ror"
+ case shiftROpShiftLeft:
+ return "shl"
+ case shiftROpShiftRightLogical:
+ return "shr"
+ case shiftROpShiftRightArithmetic:
+ return "sar"
+ default:
+ panic("BUG")
+ }
+}
+
+type sseOpcode byte
+
+const (
+ sseOpcodeInvalid sseOpcode = iota
+ sseOpcodeAddps
+ sseOpcodeAddpd
+ sseOpcodeAddss
+ sseOpcodeAddsd
+ sseOpcodeAndps
+ sseOpcodeAndpd
+ sseOpcodeAndnps
+ sseOpcodeAndnpd
+ sseOpcodeBlendvps
+ sseOpcodeBlendvpd
+ sseOpcodeComiss
+ sseOpcodeComisd
+ sseOpcodeCmpps
+ sseOpcodeCmppd
+ sseOpcodeCmpss
+ sseOpcodeCmpsd
+ sseOpcodeCvtdq2ps
+ sseOpcodeCvtdq2pd
+ sseOpcodeCvtsd2ss
+ sseOpcodeCvtsd2si
+ sseOpcodeCvtsi2ss
+ sseOpcodeCvtsi2sd
+ sseOpcodeCvtss2si
+ sseOpcodeCvtss2sd
+ sseOpcodeCvttps2dq
+ sseOpcodeCvttss2si
+ sseOpcodeCvttsd2si
+ sseOpcodeDivps
+ sseOpcodeDivpd
+ sseOpcodeDivss
+ sseOpcodeDivsd
+ sseOpcodeInsertps
+ sseOpcodeMaxps
+ sseOpcodeMaxpd
+ sseOpcodeMaxss
+ sseOpcodeMaxsd
+ sseOpcodeMinps
+ sseOpcodeMinpd
+ sseOpcodeMinss
+ sseOpcodeMinsd
+ sseOpcodeMovaps
+ sseOpcodeMovapd
+ sseOpcodeMovd
+ sseOpcodeMovdqa
+ sseOpcodeMovdqu
+ sseOpcodeMovlhps
+ sseOpcodeMovmskps
+ sseOpcodeMovmskpd
+ sseOpcodeMovq
+ sseOpcodeMovss
+ sseOpcodeMovsd
+ sseOpcodeMovups
+ sseOpcodeMovupd
+ sseOpcodeMulps
+ sseOpcodeMulpd
+ sseOpcodeMulss
+ sseOpcodeMulsd
+ sseOpcodeOrps
+ sseOpcodeOrpd
+ sseOpcodePabsb
+ sseOpcodePabsw
+ sseOpcodePabsd
+ sseOpcodePackssdw
+ sseOpcodePacksswb
+ sseOpcodePackusdw
+ sseOpcodePackuswb
+ sseOpcodePaddb
+ sseOpcodePaddd
+ sseOpcodePaddq
+ sseOpcodePaddw
+ sseOpcodePaddsb
+ sseOpcodePaddsw
+ sseOpcodePaddusb
+ sseOpcodePaddusw
+ sseOpcodePalignr
+ sseOpcodePand
+ sseOpcodePandn
+ sseOpcodePavgb
+ sseOpcodePavgw
+ sseOpcodePcmpeqb
+ sseOpcodePcmpeqw
+ sseOpcodePcmpeqd
+ sseOpcodePcmpeqq
+ sseOpcodePcmpgtb
+ sseOpcodePcmpgtw
+ sseOpcodePcmpgtd
+ sseOpcodePcmpgtq
+ sseOpcodePextrb
+ sseOpcodePextrw
+ sseOpcodePextrd
+ sseOpcodePextrq
+ sseOpcodePinsrb
+ sseOpcodePinsrw
+ sseOpcodePinsrd
+ sseOpcodePinsrq
+ sseOpcodePmaddwd
+ sseOpcodePmaxsb
+ sseOpcodePmaxsw
+ sseOpcodePmaxsd
+ sseOpcodePmaxub
+ sseOpcodePmaxuw
+ sseOpcodePmaxud
+ sseOpcodePminsb
+ sseOpcodePminsw
+ sseOpcodePminsd
+ sseOpcodePminub
+ sseOpcodePminuw
+ sseOpcodePminud
+ sseOpcodePmovmskb
+ sseOpcodePmovsxbd
+ sseOpcodePmovsxbw
+ sseOpcodePmovsxbq
+ sseOpcodePmovsxwd
+ sseOpcodePmovsxwq
+ sseOpcodePmovsxdq
+ sseOpcodePmovzxbd
+ sseOpcodePmovzxbw
+ sseOpcodePmovzxbq
+ sseOpcodePmovzxwd
+ sseOpcodePmovzxwq
+ sseOpcodePmovzxdq
+ sseOpcodePmulld
+ sseOpcodePmullw
+ sseOpcodePmuludq
+ sseOpcodePor
+ sseOpcodePshufb
+ sseOpcodePshufd
+ sseOpcodePsllw
+ sseOpcodePslld
+ sseOpcodePsllq
+ sseOpcodePsraw
+ sseOpcodePsrad
+ sseOpcodePsrlw
+ sseOpcodePsrld
+ sseOpcodePsrlq
+ sseOpcodePsubb
+ sseOpcodePsubd
+ sseOpcodePsubq
+ sseOpcodePsubw
+ sseOpcodePsubsb
+ sseOpcodePsubsw
+ sseOpcodePsubusb
+ sseOpcodePsubusw
+ sseOpcodePtest
+ sseOpcodePunpckhbw
+ sseOpcodePunpcklbw
+ sseOpcodePxor
+ sseOpcodeRcpss
+ sseOpcodeRoundps
+ sseOpcodeRoundpd
+ sseOpcodeRoundss
+ sseOpcodeRoundsd
+ sseOpcodeRsqrtss
+ sseOpcodeSqrtps
+ sseOpcodeSqrtpd
+ sseOpcodeSqrtss
+ sseOpcodeSqrtsd
+ sseOpcodeSubps
+ sseOpcodeSubpd
+ sseOpcodeSubss
+ sseOpcodeSubsd
+ sseOpcodeUcomiss
+ sseOpcodeUcomisd
+ sseOpcodeXorps
+ sseOpcodeXorpd
+ sseOpcodePmulhrsw
+ sseOpcodeUnpcklps
+ sseOpcodeCvtps2pd
+ sseOpcodeCvtpd2ps
+ sseOpcodeCvttpd2dq
+ sseOpcodeShufps
+ sseOpcodePmaddubsw
+)
+
+func (s sseOpcode) String() string {
+ switch s {
+ case sseOpcodeInvalid:
+ return "invalid"
+ case sseOpcodeAddps:
+ return "addps"
+ case sseOpcodeAddpd:
+ return "addpd"
+ case sseOpcodeAddss:
+ return "addss"
+ case sseOpcodeAddsd:
+ return "addsd"
+ case sseOpcodeAndps:
+ return "andps"
+ case sseOpcodeAndpd:
+ return "andpd"
+ case sseOpcodeAndnps:
+ return "andnps"
+ case sseOpcodeAndnpd:
+ return "andnpd"
+ case sseOpcodeBlendvps:
+ return "blendvps"
+ case sseOpcodeBlendvpd:
+ return "blendvpd"
+ case sseOpcodeComiss:
+ return "comiss"
+ case sseOpcodeComisd:
+ return "comisd"
+ case sseOpcodeCmpps:
+ return "cmpps"
+ case sseOpcodeCmppd:
+ return "cmppd"
+ case sseOpcodeCmpss:
+ return "cmpss"
+ case sseOpcodeCmpsd:
+ return "cmpsd"
+ case sseOpcodeCvtdq2ps:
+ return "cvtdq2ps"
+ case sseOpcodeCvtdq2pd:
+ return "cvtdq2pd"
+ case sseOpcodeCvtsd2ss:
+ return "cvtsd2ss"
+ case sseOpcodeCvtsd2si:
+ return "cvtsd2si"
+ case sseOpcodeCvtsi2ss:
+ return "cvtsi2ss"
+ case sseOpcodeCvtsi2sd:
+ return "cvtsi2sd"
+ case sseOpcodeCvtss2si:
+ return "cvtss2si"
+ case sseOpcodeCvtss2sd:
+ return "cvtss2sd"
+ case sseOpcodeCvttps2dq:
+ return "cvttps2dq"
+ case sseOpcodeCvttss2si:
+ return "cvttss2si"
+ case sseOpcodeCvttsd2si:
+ return "cvttsd2si"
+ case sseOpcodeDivps:
+ return "divps"
+ case sseOpcodeDivpd:
+ return "divpd"
+ case sseOpcodeDivss:
+ return "divss"
+ case sseOpcodeDivsd:
+ return "divsd"
+ case sseOpcodeInsertps:
+ return "insertps"
+ case sseOpcodeMaxps:
+ return "maxps"
+ case sseOpcodeMaxpd:
+ return "maxpd"
+ case sseOpcodeMaxss:
+ return "maxss"
+ case sseOpcodeMaxsd:
+ return "maxsd"
+ case sseOpcodeMinps:
+ return "minps"
+ case sseOpcodeMinpd:
+ return "minpd"
+ case sseOpcodeMinss:
+ return "minss"
+ case sseOpcodeMinsd:
+ return "minsd"
+ case sseOpcodeMovaps:
+ return "movaps"
+ case sseOpcodeMovapd:
+ return "movapd"
+ case sseOpcodeMovd:
+ return "movd"
+ case sseOpcodeMovdqa:
+ return "movdqa"
+ case sseOpcodeMovdqu:
+ return "movdqu"
+ case sseOpcodeMovlhps:
+ return "movlhps"
+ case sseOpcodeMovmskps:
+ return "movmskps"
+ case sseOpcodeMovmskpd:
+ return "movmskpd"
+ case sseOpcodeMovq:
+ return "movq"
+ case sseOpcodeMovss:
+ return "movss"
+ case sseOpcodeMovsd:
+ return "movsd"
+ case sseOpcodeMovups:
+ return "movups"
+ case sseOpcodeMovupd:
+ return "movupd"
+ case sseOpcodeMulps:
+ return "mulps"
+ case sseOpcodeMulpd:
+ return "mulpd"
+ case sseOpcodeMulss:
+ return "mulss"
+ case sseOpcodeMulsd:
+ return "mulsd"
+ case sseOpcodeOrps:
+ return "orps"
+ case sseOpcodeOrpd:
+ return "orpd"
+ case sseOpcodePabsb:
+ return "pabsb"
+ case sseOpcodePabsw:
+ return "pabsw"
+ case sseOpcodePabsd:
+ return "pabsd"
+ case sseOpcodePackssdw:
+ return "packssdw"
+ case sseOpcodePacksswb:
+ return "packsswb"
+ case sseOpcodePackusdw:
+ return "packusdw"
+ case sseOpcodePackuswb:
+ return "packuswb"
+ case sseOpcodePaddb:
+ return "paddb"
+ case sseOpcodePaddd:
+ return "paddd"
+ case sseOpcodePaddq:
+ return "paddq"
+ case sseOpcodePaddw:
+ return "paddw"
+ case sseOpcodePaddsb:
+ return "paddsb"
+ case sseOpcodePaddsw:
+ return "paddsw"
+ case sseOpcodePaddusb:
+ return "paddusb"
+ case sseOpcodePaddusw:
+ return "paddusw"
+ case sseOpcodePalignr:
+ return "palignr"
+ case sseOpcodePand:
+ return "pand"
+ case sseOpcodePandn:
+ return "pandn"
+ case sseOpcodePavgb:
+ return "pavgb"
+ case sseOpcodePavgw:
+ return "pavgw"
+ case sseOpcodePcmpeqb:
+ return "pcmpeqb"
+ case sseOpcodePcmpeqw:
+ return "pcmpeqw"
+ case sseOpcodePcmpeqd:
+ return "pcmpeqd"
+ case sseOpcodePcmpeqq:
+ return "pcmpeqq"
+ case sseOpcodePcmpgtb:
+ return "pcmpgtb"
+ case sseOpcodePcmpgtw:
+ return "pcmpgtw"
+ case sseOpcodePcmpgtd:
+ return "pcmpgtd"
+ case sseOpcodePcmpgtq:
+ return "pcmpgtq"
+ case sseOpcodePextrb:
+ return "pextrb"
+ case sseOpcodePextrw:
+ return "pextrw"
+ case sseOpcodePextrd:
+ return "pextrd"
+ case sseOpcodePextrq:
+ return "pextrq"
+ case sseOpcodePinsrb:
+ return "pinsrb"
+ case sseOpcodePinsrw:
+ return "pinsrw"
+ case sseOpcodePinsrd:
+ return "pinsrd"
+ case sseOpcodePinsrq:
+ return "pinsrq"
+ case sseOpcodePmaddwd:
+ return "pmaddwd"
+ case sseOpcodePmaxsb:
+ return "pmaxsb"
+ case sseOpcodePmaxsw:
+ return "pmaxsw"
+ case sseOpcodePmaxsd:
+ return "pmaxsd"
+ case sseOpcodePmaxub:
+ return "pmaxub"
+ case sseOpcodePmaxuw:
+ return "pmaxuw"
+ case sseOpcodePmaxud:
+ return "pmaxud"
+ case sseOpcodePminsb:
+ return "pminsb"
+ case sseOpcodePminsw:
+ return "pminsw"
+ case sseOpcodePminsd:
+ return "pminsd"
+ case sseOpcodePminub:
+ return "pminub"
+ case sseOpcodePminuw:
+ return "pminuw"
+ case sseOpcodePminud:
+ return "pminud"
+ case sseOpcodePmovmskb:
+ return "pmovmskb"
+ case sseOpcodePmovsxbd:
+ return "pmovsxbd"
+ case sseOpcodePmovsxbw:
+ return "pmovsxbw"
+ case sseOpcodePmovsxbq:
+ return "pmovsxbq"
+ case sseOpcodePmovsxwd:
+ return "pmovsxwd"
+ case sseOpcodePmovsxwq:
+ return "pmovsxwq"
+ case sseOpcodePmovsxdq:
+ return "pmovsxdq"
+ case sseOpcodePmovzxbd:
+ return "pmovzxbd"
+ case sseOpcodePmovzxbw:
+ return "pmovzxbw"
+ case sseOpcodePmovzxbq:
+ return "pmovzxbq"
+ case sseOpcodePmovzxwd:
+ return "pmovzxwd"
+ case sseOpcodePmovzxwq:
+ return "pmovzxwq"
+ case sseOpcodePmovzxdq:
+ return "pmovzxdq"
+ case sseOpcodePmulld:
+ return "pmulld"
+ case sseOpcodePmullw:
+ return "pmullw"
+ case sseOpcodePmuludq:
+ return "pmuludq"
+ case sseOpcodePor:
+ return "por"
+ case sseOpcodePshufb:
+ return "pshufb"
+ case sseOpcodePshufd:
+ return "pshufd"
+ case sseOpcodePsllw:
+ return "psllw"
+ case sseOpcodePslld:
+ return "pslld"
+ case sseOpcodePsllq:
+ return "psllq"
+ case sseOpcodePsraw:
+ return "psraw"
+ case sseOpcodePsrad:
+ return "psrad"
+ case sseOpcodePsrlw:
+ return "psrlw"
+ case sseOpcodePsrld:
+ return "psrld"
+ case sseOpcodePsrlq:
+ return "psrlq"
+ case sseOpcodePsubb:
+ return "psubb"
+ case sseOpcodePsubd:
+ return "psubd"
+ case sseOpcodePsubq:
+ return "psubq"
+ case sseOpcodePsubw:
+ return "psubw"
+ case sseOpcodePsubsb:
+ return "psubsb"
+ case sseOpcodePsubsw:
+ return "psubsw"
+ case sseOpcodePsubusb:
+ return "psubusb"
+ case sseOpcodePsubusw:
+ return "psubusw"
+ case sseOpcodePtest:
+ return "ptest"
+ case sseOpcodePunpckhbw:
+ return "punpckhbw"
+ case sseOpcodePunpcklbw:
+ return "punpcklbw"
+ case sseOpcodePxor:
+ return "pxor"
+ case sseOpcodeRcpss:
+ return "rcpss"
+ case sseOpcodeRoundps:
+ return "roundps"
+ case sseOpcodeRoundpd:
+ return "roundpd"
+ case sseOpcodeRoundss:
+ return "roundss"
+ case sseOpcodeRoundsd:
+ return "roundsd"
+ case sseOpcodeRsqrtss:
+ return "rsqrtss"
+ case sseOpcodeSqrtps:
+ return "sqrtps"
+ case sseOpcodeSqrtpd:
+ return "sqrtpd"
+ case sseOpcodeSqrtss:
+ return "sqrtss"
+ case sseOpcodeSqrtsd:
+ return "sqrtsd"
+ case sseOpcodeSubps:
+ return "subps"
+ case sseOpcodeSubpd:
+ return "subpd"
+ case sseOpcodeSubss:
+ return "subss"
+ case sseOpcodeSubsd:
+ return "subsd"
+ case sseOpcodeUcomiss:
+ return "ucomiss"
+ case sseOpcodeUcomisd:
+ return "ucomisd"
+ case sseOpcodeXorps:
+ return "xorps"
+ case sseOpcodeXorpd:
+ return "xorpd"
+ case sseOpcodePmulhrsw:
+ return "pmulhrsw"
+ case sseOpcodeUnpcklps:
+ return "unpcklps"
+ case sseOpcodeCvtps2pd:
+ return "cvtps2pd"
+ case sseOpcodeCvtpd2ps:
+ return "cvtpd2ps"
+ case sseOpcodeCvttpd2dq:
+ return "cvttpd2dq"
+ case sseOpcodeShufps:
+ return "shufps"
+ case sseOpcodePmaddubsw:
+ return "pmaddubsw"
+ default:
+ panic("BUG")
+ }
+}
+
+type roundingMode uint8
+
+const (
+ roundingModeNearest roundingMode = iota
+ roundingModeDown
+ roundingModeUp
+ roundingModeZero
+)
+
+func (r roundingMode) String() string {
+ switch r {
+ case roundingModeNearest:
+ return "nearest"
+ case roundingModeDown:
+ return "down"
+ case roundingModeUp:
+ return "up"
+ case roundingModeZero:
+ return "zero"
+ default:
+ panic("BUG")
+ }
+}
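+
+// Note (editor's addition): this numbering matches the rounding-control field in
+// the imm8 of ROUNDSS/ROUNDSD/ROUNDPS/ROUNDPD (00=nearest-even, 01=down, 10=up,
+// 11=toward zero), so a lowering can feed the mode directly into the immediate of
+// an xmmUnaryRmRImm instruction. Sketch with hypothetical srcVReg/dstVReg:
+//
+//    rnd := m.allocateInstr().asXmmUnaryRmRImm(sseOpcodeRoundsd, byte(roundingModeZero), newOperandReg(srcVReg), dstVReg)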
+
+// cmpPred is the immediate value for a comparison operation in xmmRmRImm.
+type cmpPred uint8
+
+const (
+ // cmpPredEQ_OQ is Equal (ordered, non-signaling)
+ cmpPredEQ_OQ cmpPred = iota
+ // cmpPredLT_OS is Less-than (ordered, signaling)
+ cmpPredLT_OS
+ // cmpPredLE_OS is Less-than-or-equal (ordered, signaling)
+ cmpPredLE_OS
+ // cmpPredUNORD_Q is Unordered (non-signaling)
+ cmpPredUNORD_Q
+ // cmpPredNEQ_UQ is Not-equal (unordered, non-signaling)
+ cmpPredNEQ_UQ
+ // cmpPredNLT_US is Not-less-than (unordered, signaling)
+ cmpPredNLT_US
+ // cmpPredNLE_US is Not-less-than-or-equal (unordered, signaling)
+ cmpPredNLE_US
+ // cmpPredORD_Q is Ordered (non-signaling)
+ cmpPredORD_Q
+ // cmpPredEQ_UQ is Equal (unordered, non-signaling)
+ cmpPredEQ_UQ
+ // cmpPredNGE_US is Not-greater-than-or-equal (unordered, signaling)
+ cmpPredNGE_US
+ // cmpPredNGT_US is Not-greater-than (unordered, signaling)
+ cmpPredNGT_US
+ // cmpPredFALSE_OQ is False (ordered, non-signaling)
+ cmpPredFALSE_OQ
+ // cmpPredNEQ_OQ is Not-equal (ordered, non-signaling)
+ cmpPredNEQ_OQ
+ // cmpPredGE_OS is Greater-than-or-equal (ordered, signaling)
+ cmpPredGE_OS
+ // cmpPredGT_OS is Greater-than (ordered, signaling)
+ cmpPredGT_OS
+ // cmpPredTRUE_UQ is True (unordered, non-signaling)
+ cmpPredTRUE_UQ
+ // cmpPredEQ_OS is Equal (ordered, signaling)
+ cmpPredEQ_OS
+ // cmpPredLT_OQ is Less-than (ordered, non-signaling)
+ cmpPredLT_OQ
+ // cmpPredLE_OQ is Less-than-or-equal (ordered, non-signaling)
+ cmpPredLE_OQ
+ // cmpPredUNORD_S is Unordered (signaling)
+ cmpPredUNORD_S
+ // cmpPredNEQ_US is Not-equal (unordered, signaling)
+ cmpPredNEQ_US
+ // cmpPredNLT_UQ is Not-less-than (unordered, non-signaling)
+ cmpPredNLT_UQ
+ // cmpPredNLE_UQ is Not-less-than-or-equal (unordered, non-signaling)
+ cmpPredNLE_UQ
+ // cmpPredORD_S is Ordered (signaling)
+ cmpPredORD_S
+ // cmpPredEQ_US is Equal (unordered, signaling)
+ cmpPredEQ_US
+ // cmpPredNGE_UQ is Not-greater-than-or-equal (unordered, non-signaling)
+ cmpPredNGE_UQ
+ // cmpPredNGT_UQ is Not-greater-than (unordered, non-signaling)
+ cmpPredNGT_UQ
+ // cmpPredFALSE_OS is False (ordered, signaling)
+ cmpPredFALSE_OS
+ // cmpPredNEQ_OS is Not-equal (ordered, signaling)
+ cmpPredNEQ_OS
+ // cmpPredGE_OQ is Greater-than-or-equal (ordered, non-signaling)
+ cmpPredGE_OQ
+ // cmpPredGT_OQ is Greater-than (ordered, non-signaling)
+ cmpPredGT_OQ
+ // cmpPredTRUE_US is True (unordered, signaling)
+ cmpPredTRUE_US
+)
+
+func (r cmpPred) String() string {
+ switch r {
+ case cmpPredEQ_OQ:
+ return "eq_oq"
+ case cmpPredLT_OS:
+ return "lt_os"
+ case cmpPredLE_OS:
+ return "le_os"
+ case cmpPredUNORD_Q:
+ return "unord_q"
+ case cmpPredNEQ_UQ:
+ return "neq_uq"
+ case cmpPredNLT_US:
+ return "nlt_us"
+ case cmpPredNLE_US:
+ return "nle_us"
+ case cmpPredORD_Q:
+ return "ord_q"
+ case cmpPredEQ_UQ:
+ return "eq_uq"
+ case cmpPredNGE_US:
+ return "nge_us"
+ case cmpPredNGT_US:
+ return "ngt_us"
+ case cmpPredFALSE_OQ:
+ return "false_oq"
+ case cmpPredNEQ_OQ:
+ return "neq_oq"
+ case cmpPredGE_OS:
+ return "ge_os"
+ case cmpPredGT_OS:
+ return "gt_os"
+ case cmpPredTRUE_UQ:
+ return "true_uq"
+ case cmpPredEQ_OS:
+ return "eq_os"
+ case cmpPredLT_OQ:
+ return "lt_oq"
+ case cmpPredLE_OQ:
+ return "le_oq"
+ case cmpPredUNORD_S:
+ return "unord_s"
+ case cmpPredNEQ_US:
+ return "neq_us"
+ case cmpPredNLT_UQ:
+ return "nlt_uq"
+ case cmpPredNLE_UQ:
+ return "nle_uq"
+ case cmpPredORD_S:
+ return "ord_s"
+ case cmpPredEQ_US:
+ return "eq_us"
+ case cmpPredNGE_UQ:
+ return "nge_uq"
+ case cmpPredNGT_UQ:
+ return "ngt_uq"
+ case cmpPredFALSE_OS:
+ return "false_os"
+ case cmpPredNEQ_OS:
+ return "neq_os"
+ case cmpPredGE_OQ:
+ return "ge_oq"
+ case cmpPredGT_OQ:
+ return "gt_oq"
+ case cmpPredTRUE_US:
+ return "true_us"
+ default:
+ panic("BUG")
+ }
+}
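+
+// Note (editor's addition): these predicates follow the immediate encoding defined
+// for the (V)CMPPS/CMPPD/CMPSS/CMPSD instructions; the xmmRmRImm case in encode
+// emits the value verbatim as the trailing imm8. A sketch comparing two
+// hypothetical vector registers for equality:
+//
+//    cmp := m.allocateInstr().asXmmRmRImm(sseOpcodeCmppd, uint8(cmpPredEQ_OQ), newOperandReg(rhsVReg), lhsVReg)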
+
+func linkInstr(prev, next *instruction) *instruction {
+ prev.next = next
+ next.prev = prev
+ return next
+}
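+
+// Example (editor's addition): linkInstr stitches two instructions together via
+// their prev/next pointers and returns the second one, so chains can be built
+// left to right:
+//
+//    first := m.allocateInstr().asUD2()
+//    second := linkInstr(first, m.allocateInstr().asUD2())
+//    // first.next == second && second.prev == first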
+
+type defKind byte
+
+const (
+ defKindNone defKind = iota + 1
+ defKindOp2
+ defKindCall
+ defKindDivRem
+)
+
+var defKinds = [instrMax]defKind{
+ nop0: defKindNone,
+ ret: defKindNone,
+ movRR: defKindOp2,
+ movRM: defKindNone,
+ xmmMovRM: defKindNone,
+ aluRmiR: defKindNone,
+ shiftR: defKindNone,
+ imm: defKindOp2,
+ unaryRmR: defKindOp2,
+ xmmRmiReg: defKindNone,
+ xmmUnaryRmR: defKindOp2,
+ xmmUnaryRmRImm: defKindOp2,
+ xmmCmpRmR: defKindNone,
+ xmmRmR: defKindNone,
+ xmmRmRImm: defKindNone,
+ mov64MR: defKindOp2,
+ movsxRmR: defKindOp2,
+ movzxRmR: defKindOp2,
+ gprToXmm: defKindOp2,
+ xmmToGpr: defKindOp2,
+ cmove: defKindNone,
+ call: defKindCall,
+ callIndirect: defKindCall,
+ ud2: defKindNone,
+ jmp: defKindNone,
+ jmpIf: defKindNone,
+ jmpTableIsland: defKindNone,
+ cmpRmiR: defKindNone,
+ exitSequence: defKindNone,
+ lea: defKindOp2,
+ setcc: defKindOp2,
+ zeros: defKindOp2,
+ sourceOffsetInfo: defKindNone,
+ fcvtToSintSequence: defKindNone,
+ defineUninitializedReg: defKindOp2,
+ fcvtToUintSequence: defKindNone,
+ xmmCMov: defKindOp2,
+ idivRemSequence: defKindDivRem,
+ blendvpd: defKindNone,
+ mfence: defKindNone,
+ xchg: defKindNone,
+ lockcmpxchg: defKindNone,
+ lockxadd: defKindNone,
+ neg: defKindNone,
+ nopUseReg: defKindNone,
+}
+
+// String implements fmt.Stringer.
+func (d defKind) String() string {
+ switch d {
+ case defKindNone:
+ return "none"
+ case defKindOp2:
+ return "op2"
+ case defKindCall:
+ return "call"
+ case defKindDivRem:
+ return "divrem"
+ default:
+ return "invalid"
+ }
+}
+
+type useKind byte
+
+const (
+ useKindNone useKind = iota + 1
+ useKindOp1
+ // useKindOp1Op2Reg means Op1 can be any operand while Op2 must be a register.
+ useKindOp1Op2Reg
+ // useKindOp1RegOp2 means Op1 must be a register while Op2 can be any operand.
+ useKindOp1RegOp2
+ // useKindRaxOp1RegOp2 means Op1 must be a register, Op2 can be any operand, and RAX is implicitly used.
+ useKindRaxOp1RegOp2
+ useKindDivRem
+ useKindBlendvpd
+ useKindCall
+ useKindCallInd
+ useKindFcvtToSintSequence
+ useKindFcvtToUintSequence
+)
+
+var useKinds = [instrMax]useKind{
+ nop0: useKindNone,
+ ret: useKindNone,
+ movRR: useKindOp1,
+ movRM: useKindOp1RegOp2,
+ xmmMovRM: useKindOp1RegOp2,
+ cmove: useKindOp1Op2Reg,
+ aluRmiR: useKindOp1Op2Reg,
+ shiftR: useKindOp1Op2Reg,
+ imm: useKindNone,
+ unaryRmR: useKindOp1,
+ xmmRmiReg: useKindOp1Op2Reg,
+ xmmUnaryRmR: useKindOp1,
+ xmmUnaryRmRImm: useKindOp1,
+ xmmCmpRmR: useKindOp1Op2Reg,
+ xmmRmR: useKindOp1Op2Reg,
+ xmmRmRImm: useKindOp1Op2Reg,
+ mov64MR: useKindOp1,
+ movzxRmR: useKindOp1,
+ movsxRmR: useKindOp1,
+ gprToXmm: useKindOp1,
+ xmmToGpr: useKindOp1,
+ call: useKindCall,
+ callIndirect: useKindCallInd,
+ ud2: useKindNone,
+ jmpIf: useKindOp1,
+ jmp: useKindOp1,
+ cmpRmiR: useKindOp1Op2Reg,
+ exitSequence: useKindOp1,
+ lea: useKindOp1,
+ jmpTableIsland: useKindNone,
+ setcc: useKindNone,
+ zeros: useKindNone,
+ sourceOffsetInfo: useKindNone,
+ fcvtToSintSequence: useKindFcvtToSintSequence,
+ defineUninitializedReg: useKindNone,
+ fcvtToUintSequence: useKindFcvtToUintSequence,
+ xmmCMov: useKindOp1,
+ idivRemSequence: useKindDivRem,
+ blendvpd: useKindBlendvpd,
+ mfence: useKindNone,
+ xchg: useKindOp1RegOp2,
+ lockcmpxchg: useKindRaxOp1RegOp2,
+ lockxadd: useKindOp1RegOp2,
+ neg: useKindOp1,
+ nopUseReg: useKindOp1,
+}
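+
+// Note (editor's addition): defKinds and useKinds are indexed by instruction kind
+// and describe where each instruction defines and uses registers, presumably for
+// the register-allocation glue. defKindOp2 means op2 holds the defined register;
+// defKindNone means nothing new is defined, which is how the two-operand x86
+// forms (aluRmiR, xmmRmR, ...) appear here since their destination doubles as a
+// source (compare the useKindOp1Op2Reg entries above). A consumer might branch on
+// the tables roughly like:
+//
+//    switch defKinds[i.kind] {
+//    case defKindOp2:
+//        // i.op2 names the register written by this instruction.
+//    case defKindNone:
+//        // no new definition: stores, branches, and read-modify-write forms.
+//    }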
+
+func (u useKind) String() string {
+ switch u {
+ case useKindNone:
+ return "none"
+ case useKindOp1:
+ return "op1"
+ case useKindOp1Op2Reg:
+ return "op1op2Reg"
+ case useKindOp1RegOp2:
+ return "op1RegOp2"
+ case useKindCall:
+ return "call"
+ case useKindCallInd:
+ return "callInd"
+ default:
+ return "invalid"
+ }
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr_encoding.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr_encoding.go
new file mode 100644
index 000000000..6637b428c
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr_encoding.go
@@ -0,0 +1,1683 @@
+package amd64
+
+import (
+ "fmt"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
+)
+
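+// encode appends the native encoding of i to the compiler's buffer. It reports
+// whether the emitted bytes contain a label-dependent 32-bit placeholder (for
+// example lea on a label operand, jmp/jmpIf to a label, or a jmpTableIsland)
+// that must be patched once labels are resolved.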
+func (i *instruction) encode(c backend.Compiler) (needsLabelResolution bool) {
+ switch kind := i.kind; kind {
+ case nop0, sourceOffsetInfo, defineUninitializedReg, fcvtToSintSequence, fcvtToUintSequence, nopUseReg:
+ case ret:
+ encodeRet(c)
+ case imm:
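+ // mov $imm, reg: for 64-bit destinations whose constant sign-extends from 32
+ // bits, the shorter C7 /0 (REX.W) form with an imm32 is used; otherwise this
+ // falls back to the 10-byte B8+rd form with a full imm64. 32-bit destinations
+ // always use B8+rd with an imm32.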
+ dst := regEncodings[i.op2.reg().RealReg()]
+ con := i.u1
+ if i.b1 { // 64 bit.
+ if lower32willSignExtendTo64(con) {
+ // Sign extend mov(imm32).
+ encodeRegReg(c,
+ legacyPrefixesNone,
+ 0xc7, 1,
+ 0,
+ dst,
+ rexInfo(0).setW(),
+ )
+ c.Emit4Bytes(uint32(con))
+ } else {
+ c.EmitByte(rexEncodingW | dst.rexBit())
+ c.EmitByte(0xb8 | dst.encoding())
+ c.Emit8Bytes(con)
+ }
+ } else {
+ if dst.rexBit() > 0 {
+ c.EmitByte(rexEncodingDefault | 0x1)
+ }
+ c.EmitByte(0xb8 | dst.encoding())
+ c.Emit4Bytes(uint32(con))
+ }
+
+ case aluRmiR:
+ var rex rexInfo
+ if i.b1 {
+ rex = rex.setW()
+ } else {
+ rex = rex.clearW()
+ }
+
+ dst := regEncodings[i.op2.reg().RealReg()]
+
+ aluOp := aluRmiROpcode(i.u1)
+ if aluOp == aluRmiROpcodeMul {
+ op1 := i.op1
+ const regMemOpc, regMemOpcNum = 0x0FAF, 2
+ switch op1.kind {
+ case operandKindReg:
+ src := regEncodings[op1.reg().RealReg()]
+ encodeRegReg(c, legacyPrefixesNone, regMemOpc, regMemOpcNum, dst, src, rex)
+ case operandKindMem:
+ m := i.op1.addressMode()
+ encodeRegMem(c, legacyPrefixesNone, regMemOpc, regMemOpcNum, dst, m, rex)
+ case operandKindImm32:
+ imm8 := lower8willSignExtendTo32(op1.imm32())
+ var opc uint32
+ if imm8 {
+ opc = 0x6b
+ } else {
+ opc = 0x69
+ }
+ encodeRegReg(c, legacyPrefixesNone, opc, 1, dst, dst, rex)
+ if imm8 {
+ c.EmitByte(byte(op1.imm32()))
+ } else {
+ c.Emit4Bytes(op1.imm32())
+ }
+ default:
+ panic("BUG: invalid operand kind")
+ }
+ } else {
+ const opcodeNum = 1
+ var opcR, opcM, subOpcImm uint32
+ switch aluOp {
+ case aluRmiROpcodeAdd:
+ opcR, opcM, subOpcImm = 0x01, 0x03, 0x0
+ case aluRmiROpcodeSub:
+ opcR, opcM, subOpcImm = 0x29, 0x2b, 0x5
+ case aluRmiROpcodeAnd:
+ opcR, opcM, subOpcImm = 0x21, 0x23, 0x4
+ case aluRmiROpcodeOr:
+ opcR, opcM, subOpcImm = 0x09, 0x0b, 0x1
+ case aluRmiROpcodeXor:
+ opcR, opcM, subOpcImm = 0x31, 0x33, 0x6
+ default:
+ panic("BUG: invalid aluRmiROpcode")
+ }
+
+ op1 := i.op1
+ switch op1.kind {
+ case operandKindReg:
+ src := regEncodings[op1.reg().RealReg()]
+ encodeRegReg(c, legacyPrefixesNone, opcR, opcodeNum, src, dst, rex)
+ case operandKindMem:
+ m := i.op1.addressMode()
+ encodeRegMem(c, legacyPrefixesNone, opcM, opcodeNum, dst, m, rex)
+ case operandKindImm32:
+ imm8 := lower8willSignExtendTo32(op1.imm32())
+ var opc uint32
+ if imm8 {
+ opc = 0x83
+ } else {
+ opc = 0x81
+ }
+ encodeRegReg(c, legacyPrefixesNone, opc, opcodeNum, regEnc(subOpcImm), dst, rex)
+ if imm8 {
+ c.EmitByte(byte(op1.imm32()))
+ } else {
+ c.Emit4Bytes(op1.imm32())
+ }
+ default:
+ panic("BUG: invalid operand kind")
+ }
+ }
+
+ case movRR:
+ src := regEncodings[i.op1.reg().RealReg()]
+ dst := regEncodings[i.op2.reg().RealReg()]
+ var rex rexInfo
+ if i.b1 {
+ rex = rex.setW()
+ } else {
+ rex = rex.clearW()
+ }
+ encodeRegReg(c, legacyPrefixesNone, 0x89, 1, src, dst, rex)
+
+ case xmmRmR, blendvpd:
+ op := sseOpcode(i.u1)
+ var legPrex legacyPrefixes
+ var opcode uint32
+ var opcodeNum uint32
+ switch op {
+ case sseOpcodeAddps:
+ legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F58, 2
+ case sseOpcodeAddpd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F58, 2
+ case sseOpcodeAddss:
+ legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F58, 2
+ case sseOpcodeAddsd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F58, 2
+ case sseOpcodeAndps:
+ legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F54, 2
+ case sseOpcodeAndpd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F54, 2
+ case sseOpcodeAndnps:
+ legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F55, 2
+ case sseOpcodeAndnpd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F55, 2
+ case sseOpcodeBlendvps:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3814, 3
+ case sseOpcodeBlendvpd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3815, 3
+ case sseOpcodeDivps:
+ legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F5E, 2
+ case sseOpcodeDivpd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F5E, 2
+ case sseOpcodeDivss:
+ legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5E, 2
+ case sseOpcodeDivsd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F5E, 2
+ case sseOpcodeMaxps:
+ legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F5F, 2
+ case sseOpcodeMaxpd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F5F, 2
+ case sseOpcodeMaxss:
+ legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5F, 2
+ case sseOpcodeMaxsd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F5F, 2
+ case sseOpcodeMinps:
+ legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F5D, 2
+ case sseOpcodeMinpd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F5D, 2
+ case sseOpcodeMinss:
+ legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5D, 2
+ case sseOpcodeMinsd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F5D, 2
+ case sseOpcodeMovlhps:
+ legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F16, 2
+ case sseOpcodeMovsd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F10, 2
+ case sseOpcodeMulps:
+ legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F59, 2
+ case sseOpcodeMulpd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F59, 2
+ case sseOpcodeMulss:
+ legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F59, 2
+ case sseOpcodeMulsd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F59, 2
+ case sseOpcodeOrpd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F56, 2
+ case sseOpcodeOrps:
+ legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F56, 2
+ case sseOpcodePackssdw:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F6B, 2
+ case sseOpcodePacksswb:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F63, 2
+ case sseOpcodePackusdw:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F382B, 3
+ case sseOpcodePackuswb:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F67, 2
+ case sseOpcodePaddb:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FFC, 2
+ case sseOpcodePaddd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FFE, 2
+ case sseOpcodePaddq:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FD4, 2
+ case sseOpcodePaddw:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FFD, 2
+ case sseOpcodePaddsb:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FEC, 2
+ case sseOpcodePaddsw:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FED, 2
+ case sseOpcodePaddusb:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDC, 2
+ case sseOpcodePaddusw:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDD, 2
+ case sseOpcodePand:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDB, 2
+ case sseOpcodePandn:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDF, 2
+ case sseOpcodePavgb:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FE0, 2
+ case sseOpcodePavgw:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FE3, 2
+ case sseOpcodePcmpeqb:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F74, 2
+ case sseOpcodePcmpeqw:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F75, 2
+ case sseOpcodePcmpeqd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F76, 2
+ case sseOpcodePcmpeqq:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3829, 3
+ case sseOpcodePcmpgtb:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F64, 2
+ case sseOpcodePcmpgtw:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F65, 2
+ case sseOpcodePcmpgtd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F66, 2
+ case sseOpcodePcmpgtq:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3837, 3
+ case sseOpcodePmaddwd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FF5, 2
+ case sseOpcodePmaxsb:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383C, 3
+ case sseOpcodePmaxsw:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FEE, 2
+ case sseOpcodePmaxsd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383D, 3
+ case sseOpcodePmaxub:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDE, 2
+ case sseOpcodePmaxuw:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383E, 3
+ case sseOpcodePmaxud:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383F, 3
+ case sseOpcodePminsb:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3838, 3
+ case sseOpcodePminsw:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FEA, 2
+ case sseOpcodePminsd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3839, 3
+ case sseOpcodePminub:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDA, 2
+ case sseOpcodePminuw:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383A, 3
+ case sseOpcodePminud:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383B, 3
+ case sseOpcodePmulld:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3840, 3
+ case sseOpcodePmullw:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FD5, 2
+ case sseOpcodePmuludq:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FF4, 2
+ case sseOpcodePor:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FEB, 2
+ case sseOpcodePshufb:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3800, 3
+ case sseOpcodePsubb:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FF8, 2
+ case sseOpcodePsubd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FFA, 2
+ case sseOpcodePsubq:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FFB, 2
+ case sseOpcodePsubw:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FF9, 2
+ case sseOpcodePsubsb:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FE8, 2
+ case sseOpcodePsubsw:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FE9, 2
+ case sseOpcodePsubusb:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FD8, 2
+ case sseOpcodePsubusw:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FD9, 2
+ case sseOpcodePunpckhbw:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F68, 2
+ case sseOpcodePunpcklbw:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F60, 2
+ case sseOpcodePxor:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FEF, 2
+ case sseOpcodeSubps:
+ legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F5C, 2
+ case sseOpcodeSubpd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F5C, 2
+ case sseOpcodeSubss:
+ legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5C, 2
+ case sseOpcodeSubsd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F5C, 2
+ case sseOpcodeXorps:
+ legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F57, 2
+ case sseOpcodeXorpd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F57, 2
+ case sseOpcodePmulhrsw:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F380B, 3
+ case sseOpcodeUnpcklps:
+ legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F14, 2
+ case sseOpcodePmaddubsw:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3804, 3
+ default:
+ if kind == blendvpd {
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3815, 3
+ } else {
+ panic(fmt.Sprintf("Unsupported sseOpcode: %s", op))
+ }
+ }
+
+ dst := regEncodings[i.op2.reg().RealReg()]
+
+ rex := rexInfo(0).clearW()
+ op1 := i.op1
+ if op1.kind == operandKindReg {
+ src := regEncodings[op1.reg().RealReg()]
+ encodeRegReg(c, legPrex, opcode, opcodeNum, dst, src, rex)
+ } else if i.op1.kind == operandKindMem {
+ m := i.op1.addressMode()
+ encodeRegMem(c, legPrex, opcode, opcodeNum, dst, m, rex)
+ } else {
+ panic("BUG: invalid operand kind")
+ }
+
+ case gprToXmm:
+ var legPrefix legacyPrefixes
+ var opcode uint32
+ const opcodeNum = 2
+ switch sseOpcode(i.u1) {
+ case sseOpcodeMovd, sseOpcodeMovq:
+ legPrefix, opcode = legacyPrefixes0x66, 0x0f6e
+ case sseOpcodeCvtsi2ss:
+ legPrefix, opcode = legacyPrefixes0xF3, 0x0f2a
+ case sseOpcodeCvtsi2sd:
+ legPrefix, opcode = legacyPrefixes0xF2, 0x0f2a
+ default:
+ panic(fmt.Sprintf("Unsupported sseOpcode: %s", sseOpcode(i.u1)))
+ }
+
+ var rex rexInfo
+ if i.b1 {
+ rex = rex.setW()
+ } else {
+ rex = rex.clearW()
+ }
+ dst := regEncodings[i.op2.reg().RealReg()]
+
+ op1 := i.op1
+ if op1.kind == operandKindReg {
+ src := regEncodings[op1.reg().RealReg()]
+ encodeRegReg(c, legPrefix, opcode, opcodeNum, dst, src, rex)
+ } else if i.op1.kind == operandKindMem {
+ m := i.op1.addressMode()
+ encodeRegMem(c, legPrefix, opcode, opcodeNum, dst, m, rex)
+ } else {
+ panic("BUG: invalid operand kind")
+ }
+
+ case xmmUnaryRmR:
+ var prefix legacyPrefixes
+ var opcode uint32
+ var opcodeNum uint32
+ op := sseOpcode(i.u1)
+ switch op {
+ case sseOpcodeCvtss2sd:
+ prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5A, 2
+ case sseOpcodeCvtsd2ss:
+ prefix, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F5A, 2
+ case sseOpcodeMovaps:
+ prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F28, 2
+ case sseOpcodeMovapd:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F28, 2
+ case sseOpcodeMovdqa:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F6F, 2
+ case sseOpcodeMovdqu:
+ prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F6F, 2
+ case sseOpcodeMovsd:
+ prefix, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F10, 2
+ case sseOpcodeMovss:
+ prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F10, 2
+ case sseOpcodeMovups:
+ prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F10, 2
+ case sseOpcodeMovupd:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F10, 2
+ case sseOpcodePabsb:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F381C, 3
+ case sseOpcodePabsw:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F381D, 3
+ case sseOpcodePabsd:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F381E, 3
+ case sseOpcodePmovsxbd:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3821, 3
+ case sseOpcodePmovsxbw:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3820, 3
+ case sseOpcodePmovsxbq:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3822, 3
+ case sseOpcodePmovsxwd:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3823, 3
+ case sseOpcodePmovsxwq:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3824, 3
+ case sseOpcodePmovsxdq:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3825, 3
+ case sseOpcodePmovzxbd:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3831, 3
+ case sseOpcodePmovzxbw:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3830, 3
+ case sseOpcodePmovzxbq:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3832, 3
+ case sseOpcodePmovzxwd:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3833, 3
+ case sseOpcodePmovzxwq:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3834, 3
+ case sseOpcodePmovzxdq:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3835, 3
+ case sseOpcodeSqrtps:
+ prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F51, 2
+ case sseOpcodeSqrtpd:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F51, 2
+ case sseOpcodeSqrtss:
+ prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F51, 2
+ case sseOpcodeSqrtsd:
+ prefix, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F51, 2
+ case sseOpcodeXorps:
+ prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F57, 2
+ case sseOpcodeXorpd:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F57, 2
+ case sseOpcodeCvtdq2ps:
+ prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F5B, 2
+ case sseOpcodeCvtdq2pd:
+ prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0FE6, 2
+ case sseOpcodeCvtps2pd:
+ prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F5A, 2
+ case sseOpcodeCvtpd2ps:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F5A, 2
+ case sseOpcodeCvttps2dq:
+ prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5B, 2
+ case sseOpcodeCvttpd2dq:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0FE6, 2
+ default:
+ panic(fmt.Sprintf("Unsupported sseOpcode: %s", op))
+ }
+
+ dst := regEncodings[i.op2.reg().RealReg()]
+
+ rex := rexInfo(0).clearW()
+ op1 := i.op1
+ if op1.kind == operandKindReg {
+ src := regEncodings[op1.reg().RealReg()]
+ encodeRegReg(c, prefix, opcode, opcodeNum, dst, src, rex)
+ } else if i.op1.kind == operandKindMem {
+ m := i.op1.addressMode()
+ needsLabelResolution = encodeRegMem(c, prefix, opcode, opcodeNum, dst, m, rex)
+ } else {
+ panic("BUG: invalid operand kind")
+ }
+
+ case xmmUnaryRmRImm:
+ var prefix legacyPrefixes
+ var opcode uint32
+ var opcodeNum uint32
+ op := sseOpcode(i.u1)
+ switch op {
+ case sseOpcodeRoundps:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f3a08, 3
+ case sseOpcodeRoundss:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f3a0a, 3
+ case sseOpcodeRoundpd:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f3a09, 3
+ case sseOpcodeRoundsd:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f3a0b, 3
+ }
+ rex := rexInfo(0).clearW()
+ dst := regEncodings[i.op2.reg().RealReg()]
+ op1 := i.op1
+ if op1.kind == operandKindReg {
+ src := regEncodings[op1.reg().RealReg()]
+ encodeRegReg(c, prefix, opcode, opcodeNum, dst, src, rex)
+ } else if i.op1.kind == operandKindMem {
+ m := i.op1.addressMode()
+ encodeRegMem(c, prefix, opcode, opcodeNum, dst, m, rex)
+ } else {
+ panic("BUG: invalid operand kind")
+ }
+
+ c.EmitByte(byte(i.u2))
+
+ case unaryRmR:
+ var prefix legacyPrefixes
+ var opcode uint32
+ var opcodeNum uint32
+ op := unaryRmROpcode(i.u1)
+ // We assume size is either 32 or 64.
+ switch op {
+ case unaryRmROpcodeBsr:
+ prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0fbd, 2
+ case unaryRmROpcodeBsf:
+ prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0fbc, 2
+ case unaryRmROpcodeLzcnt:
+ prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0fbd, 2
+ case unaryRmROpcodeTzcnt:
+ prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0fbc, 2
+ case unaryRmROpcodePopcnt:
+ prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0fb8, 2
+ default:
+ panic(fmt.Sprintf("Unsupported unaryRmROpcode: %s", op))
+ }
+
+ dst := regEncodings[i.op2.reg().RealReg()]
+
+ rex := rexInfo(0)
+ if i.b1 { // 64 bit.
+ rex = rexInfo(0).setW()
+ } else {
+ rex = rexInfo(0).clearW()
+ }
+ op1 := i.op1
+ if op1.kind == operandKindReg {
+ src := regEncodings[op1.reg().RealReg()]
+ encodeRegReg(c, prefix, opcode, opcodeNum, dst, src, rex)
+ } else if i.op1.kind == operandKindMem {
+ m := i.op1.addressMode()
+ encodeRegMem(c, prefix, opcode, opcodeNum, dst, m, rex)
+ } else {
+ panic("BUG: invalid operand kind")
+ }
+
+ case not:
+ var prefix legacyPrefixes
+ src := regEncodings[i.op1.reg().RealReg()]
+ rex := rexInfo(0)
+ if i.b1 { // 64 bit.
+ rex = rexInfo(0).setW()
+ } else {
+ rex = rexInfo(0).clearW()
+ }
+ subopcode := uint8(2)
+ encodeEncEnc(c, prefix, 0xf7, 1, subopcode, uint8(src), rex)
+
+ case neg:
+ var prefix legacyPrefixes
+ src := regEncodings[i.op1.reg().RealReg()]
+ rex := rexInfo(0)
+ if i.b1 { // 64 bit.
+ rex = rexInfo(0).setW()
+ } else {
+ rex = rexInfo(0).clearW()
+ }
+ subopcode := uint8(3)
+ encodeEncEnc(c, prefix, 0xf7, 1, subopcode, uint8(src), rex)
+
+ case div:
+ rex := rexInfo(0)
+ if i.b1 { // 64 bit.
+ rex = rexInfo(0).setW()
+ } else {
+ rex = rexInfo(0).clearW()
+ }
+ var subopcode uint8
+ if i.u1 != 0 { // Signed.
+ subopcode = 7
+ } else {
+ subopcode = 6
+ }
+
+ divisor := i.op1
+ if divisor.kind == operandKindReg {
+ src := regEncodings[divisor.reg().RealReg()]
+ encodeEncEnc(c, legacyPrefixesNone, 0xf7, 1, subopcode, uint8(src), rex)
+ } else if divisor.kind == operandKindMem {
+ m := divisor.addressMode()
+ encodeEncMem(c, legacyPrefixesNone, 0xf7, 1, subopcode, m, rex)
+ } else {
+ panic("BUG: invalid operand kind")
+ }
+
+ case mulHi:
+ var prefix legacyPrefixes
+ rex := rexInfo(0)
+ if i.b1 { // 64 bit.
+ rex = rexInfo(0).setW()
+ } else {
+ rex = rexInfo(0).clearW()
+ }
+
+ signed := i.u1 != 0
+ var subopcode uint8
+ if signed {
+ subopcode = 5
+ } else {
+ subopcode = 4
+ }
+
+ // src1 is implicitly rax,
+ // dst_lo is implicitly rax,
+ // dst_hi is implicitly rdx.
+ src2 := i.op1
+ if src2.kind == operandKindReg {
+ src := regEncodings[src2.reg().RealReg()]
+ encodeEncEnc(c, prefix, 0xf7, 1, subopcode, uint8(src), rex)
+ } else if src2.kind == operandKindMem {
+ m := src2.addressMode()
+ encodeEncMem(c, prefix, 0xf7, 1, subopcode, m, rex)
+ } else {
+ panic("BUG: invalid operand kind")
+ }
+
+ case signExtendData:
+ if i.b1 { // 64 bit.
+ c.EmitByte(0x48)
+ c.EmitByte(0x99)
+ } else {
+ c.EmitByte(0x99)
+ }
+ case movzxRmR, movsxRmR:
+ signed := i.kind == movsxRmR
+
+ ext := extMode(i.u1)
+ var opcode uint32
+ var opcodeNum uint32
+ var rex rexInfo
+ switch ext {
+ case extModeBL:
+ if signed {
+ opcode, opcodeNum, rex = 0x0fbe, 2, rex.clearW()
+ } else {
+ opcode, opcodeNum, rex = 0x0fb6, 2, rex.clearW()
+ }
+ case extModeBQ:
+ if signed {
+ opcode, opcodeNum, rex = 0x0fbe, 2, rex.setW()
+ } else {
+ opcode, opcodeNum, rex = 0x0fb6, 2, rex.setW()
+ }
+ case extModeWL:
+ if signed {
+ opcode, opcodeNum, rex = 0x0fbf, 2, rex.clearW()
+ } else {
+ opcode, opcodeNum, rex = 0x0fb7, 2, rex.clearW()
+ }
+ case extModeWQ:
+ if signed {
+ opcode, opcodeNum, rex = 0x0fbf, 2, rex.setW()
+ } else {
+ opcode, opcodeNum, rex = 0x0fb7, 2, rex.setW()
+ }
+ case extModeLQ:
+ if signed {
+ opcode, opcodeNum, rex = 0x63, 1, rex.setW()
+ } else {
+ opcode, opcodeNum, rex = 0x8b, 1, rex.clearW()
+ }
+ default:
+ panic("BUG: invalid extMode")
+ }
+
+ op := i.op1
+ dst := regEncodings[i.op2.reg().RealReg()]
+ switch op.kind {
+ case operandKindReg:
+ src := regEncodings[op.reg().RealReg()]
+ if ext == extModeBL || ext == extModeBQ {
+ // Byte registers with encodings 4-7 (SPL/BPL/SIL/DIL) can only be selected when a REX prefix is present; without one these encodings mean AH/CH/DH/BH.
+ if e := src.encoding(); e >= 4 && e <= 7 {
+ rex = rex.always()
+ }
+ }
+ encodeRegReg(c, legacyPrefixesNone, opcode, opcodeNum, dst, src, rex)
+ case operandKindMem:
+ m := op.addressMode()
+ encodeRegMem(c, legacyPrefixesNone, opcode, opcodeNum, dst, m, rex)
+ default:
+ panic("BUG: invalid operand kind")
+ }
+
+ case mov64MR:
+ m := i.op1.addressMode()
+ encodeLoad64(c, m, i.op2.reg().RealReg())
+
+ case lea:
+ needsLabelResolution = true
+ dst := regEncodings[i.op2.reg().RealReg()]
+ rex := rexInfo(0).setW()
+ const opcode, opcodeNum = 0x8d, 1
+ switch i.op1.kind {
+ case operandKindMem:
+ a := i.op1.addressMode()
+ encodeRegMem(c, legacyPrefixesNone, opcode, opcodeNum, dst, a, rex)
+ case operandKindLabel:
+ rex.encode(c, regRexBit(byte(dst)), 0)
+ c.EmitByte(byte((opcode) & 0xff))
+
+ // Indicate "LEAQ [RIP + 32bit displacement].
+ // https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing
+ c.EmitByte(encodeModRM(0b00, dst.encoding(), 0b101))
+
+ // The actual displacement is filled in during label resolution; emit 0xffffffff as a placeholder that is easy to spot in tests.
+ c.Emit4Bytes(0xffffffff)
+ default:
+ panic("BUG: invalid operand kind")
+ }
+
+ case movRM:
+ m := i.op2.addressMode()
+ src := regEncodings[i.op1.reg().RealReg()]
+
+ var rex rexInfo
+ switch i.u1 {
+ case 1:
+ if e := src.encoding(); e >= 4 && e <= 7 {
+ rex = rex.always()
+ }
+ encodeRegMem(c, legacyPrefixesNone, 0x88, 1, src, m, rex.clearW())
+ case 2:
+ encodeRegMem(c, legacyPrefixes0x66, 0x89, 1, src, m, rex.clearW())
+ case 4:
+ encodeRegMem(c, legacyPrefixesNone, 0x89, 1, src, m, rex.clearW())
+ case 8:
+ encodeRegMem(c, legacyPrefixesNone, 0x89, 1, src, m, rex.setW())
+ default:
+ panic(fmt.Sprintf("BUG: invalid size %d: %s", i.u1, i.String()))
+ }
+
+ case shiftR:
+ src := regEncodings[i.op2.reg().RealReg()]
+ amount := i.op1
+
+ var opcode uint32
+ var prefix legacyPrefixes
+ rex := rexInfo(0)
+ if i.b1 { // 64 bit.
+ rex = rexInfo(0).setW()
+ } else {
+ rex = rexInfo(0).clearW()
+ }
+
+ switch amount.kind {
+ case operandKindReg:
+ if amount.reg() != rcxVReg {
+ panic("BUG: invalid reg operand: must be rcx")
+ }
+ opcode, prefix = 0xd3, legacyPrefixesNone
+ encodeEncEnc(c, prefix, opcode, 1, uint8(i.u1), uint8(src), rex)
+ case operandKindImm32:
+ opcode, prefix = 0xc1, legacyPrefixesNone
+ encodeEncEnc(c, prefix, opcode, 1, uint8(i.u1), uint8(src), rex)
+ c.EmitByte(byte(amount.imm32()))
+ default:
+ panic("BUG: invalid operand kind")
+ }
+ case xmmRmiReg:
+ const legPrefix = legacyPrefixes0x66
+ rex := rexInfo(0).clearW()
+ dst := regEncodings[i.op2.reg().RealReg()]
+
+ var opcode uint32
+ var regDigit uint8
+
+ op := sseOpcode(i.u1)
+ op1 := i.op1
+ if i.op1.kind == operandKindImm32 {
+ switch op {
+ case sseOpcodePsllw:
+ opcode, regDigit = 0x0f71, 6
+ case sseOpcodePslld:
+ opcode, regDigit = 0x0f72, 6
+ case sseOpcodePsllq:
+ opcode, regDigit = 0x0f73, 6
+ case sseOpcodePsraw:
+ opcode, regDigit = 0x0f71, 4
+ case sseOpcodePsrad:
+ opcode, regDigit = 0x0f72, 4
+ case sseOpcodePsrlw:
+ opcode, regDigit = 0x0f71, 2
+ case sseOpcodePsrld:
+ opcode, regDigit = 0x0f72, 2
+ case sseOpcodePsrlq:
+ opcode, regDigit = 0x0f73, 2
+ default:
+ panic("invalid opcode")
+ }
+
+ encodeEncEnc(c, legPrefix, opcode, 2, regDigit, uint8(dst), rex)
+ imm32 := op1.imm32()
+ if imm32 > 0xff&imm32 {
+ panic("immediate value does not fit 1 byte")
+ }
+ c.EmitByte(uint8(imm32))
+ } else {
+ switch op {
+ case sseOpcodePsllw:
+ opcode = 0x0ff1
+ case sseOpcodePslld:
+ opcode = 0x0ff2
+ case sseOpcodePsllq:
+ opcode = 0x0ff3
+ case sseOpcodePsraw:
+ opcode = 0x0fe1
+ case sseOpcodePsrad:
+ opcode = 0x0fe2
+ case sseOpcodePsrlw:
+ opcode = 0x0fd1
+ case sseOpcodePsrld:
+ opcode = 0x0fd2
+ case sseOpcodePsrlq:
+ opcode = 0x0fd3
+ default:
+ panic("invalid opcode")
+ }
+
+ if op1.kind == operandKindReg {
+ reg := regEncodings[op1.reg().RealReg()]
+ encodeRegReg(c, legPrefix, opcode, 2, dst, reg, rex)
+ } else if op1.kind == operandKindMem {
+ m := op1.addressMode()
+ encodeRegMem(c, legPrefix, opcode, 2, dst, m, rex)
+ } else {
+ panic("BUG: invalid operand kind")
+ }
+ }
+
+ case cmpRmiR:
+ var opcode uint32
+ isCmp := i.u1 != 0
+ rex := rexInfo(0)
+ _64 := i.b1
+ if _64 { // 64 bit.
+ rex = rex.setW()
+ } else {
+ rex = rex.clearW()
+ }
+ dst := regEncodings[i.op2.reg().RealReg()]
+ op1 := i.op1
+ switch op1.kind {
+ case operandKindReg:
+ reg := regEncodings[op1.reg().RealReg()]
+ if isCmp {
+ opcode = 0x39
+ } else {
+ opcode = 0x85
+ }
+ // Here we swap the encoding of the operands for CMP to be consistent with the output of LLVM/GCC.
+ encodeRegReg(c, legacyPrefixesNone, opcode, 1, reg, dst, rex)
+
+ case operandKindMem:
+ if isCmp {
+ opcode = 0x3b
+ } else {
+ opcode = 0x85
+ }
+ m := op1.addressMode()
+ encodeRegMem(c, legacyPrefixesNone, opcode, 1, dst, m, rex)
+
+ case operandKindImm32:
+ imm32 := op1.imm32()
+ useImm8 := isCmp && lower8willSignExtendTo32(imm32)
+ var subopcode uint8
+
+ switch {
+ case isCmp && useImm8:
+ opcode, subopcode = 0x83, 7
+ case isCmp && !useImm8:
+ opcode, subopcode = 0x81, 7
+ default:
+ opcode, subopcode = 0xf7, 0
+ }
+ encodeEncEnc(c, legacyPrefixesNone, opcode, 1, subopcode, uint8(dst), rex)
+ if useImm8 {
+ c.EmitByte(uint8(imm32))
+ } else {
+ c.Emit4Bytes(imm32)
+ }
+
+ default:
+ panic("BUG: invalid operand kind")
+ }
+ case setcc:
+ cc := cond(i.u1)
+ dst := regEncodings[i.op2.reg().RealReg()]
+ rex := rexInfo(0).clearW().always()
+ opcode := uint32(0x0f90) + uint32(cc)
+ encodeEncEnc(c, legacyPrefixesNone, opcode, 2, 0, uint8(dst), rex)
+ case cmove:
+ cc := cond(i.u1)
+ dst := regEncodings[i.op2.reg().RealReg()]
+ rex := rexInfo(0)
+ if i.b1 { // 64 bit.
+ rex = rex.setW()
+ } else {
+ rex = rex.clearW()
+ }
+ opcode := uint32(0x0f40) + uint32(cc)
+ src := i.op1
+ switch src.kind {
+ case operandKindReg:
+ srcReg := regEncodings[src.reg().RealReg()]
+ encodeRegReg(c, legacyPrefixesNone, opcode, 2, dst, srcReg, rex)
+ case operandKindMem:
+ m := src.addressMode()
+ encodeRegMem(c, legacyPrefixesNone, opcode, 2, dst, m, rex)
+ default:
+ panic("BUG: invalid operand kind")
+ }
+ case push64:
+ op := i.op1
+
+ switch op.kind {
+ case operandKindReg:
+ dst := regEncodings[op.reg().RealReg()]
+ if dst.rexBit() > 0 {
+ c.EmitByte(rexEncodingDefault | 0x1)
+ }
+ c.EmitByte(0x50 | dst.encoding())
+ case operandKindMem:
+ m := op.addressMode()
+ encodeRegMem(
+ c, legacyPrefixesNone, 0xff, 1, regEnc(6), m, rexInfo(0).clearW(),
+ )
+ case operandKindImm32:
+ c.EmitByte(0x68)
+ c.Emit4Bytes(op.imm32())
+ default:
+ panic("BUG: invalid operand kind")
+ }
+
+ case pop64:
+ dst := regEncodings[i.op1.reg().RealReg()]
+ if dst.rexBit() > 0 {
+ c.EmitByte(rexEncodingDefault | 0x1)
+ }
+ c.EmitByte(0x58 | dst.encoding())
+
+ case xmmMovRM:
+ var legPrefix legacyPrefixes
+ var opcode uint32
+ const opcodeNum = 2
+ switch sseOpcode(i.u1) {
+ case sseOpcodeMovaps:
+ legPrefix, opcode = legacyPrefixesNone, 0x0f29
+ case sseOpcodeMovapd:
+ legPrefix, opcode = legacyPrefixes0x66, 0x0f29
+ case sseOpcodeMovdqa:
+ legPrefix, opcode = legacyPrefixes0x66, 0x0f7f
+ case sseOpcodeMovdqu:
+ legPrefix, opcode = legacyPrefixes0xF3, 0x0f7f
+ case sseOpcodeMovss:
+ legPrefix, opcode = legacyPrefixes0xF3, 0x0f11
+ case sseOpcodeMovsd:
+ legPrefix, opcode = legacyPrefixes0xF2, 0x0f11
+ case sseOpcodeMovups:
+ legPrefix, opcode = legacyPrefixesNone, 0x0f11
+ case sseOpcodeMovupd:
+ legPrefix, opcode = legacyPrefixes0x66, 0x0f11
+ default:
+ panic(fmt.Sprintf("Unsupported sseOpcode: %s", sseOpcode(i.u1)))
+ }
+
+ dst := regEncodings[i.op1.reg().RealReg()]
+ encodeRegMem(c, legPrefix, opcode, opcodeNum, dst, i.op2.addressMode(), rexInfo(0).clearW())
+ case xmmLoadConst:
+ panic("TODO")
+ case xmmToGpr:
+ var legPrefix legacyPrefixes
+ var opcode uint32
+ var argSwap bool
+ const opcodeNum = 2
+ switch sseOpcode(i.u1) {
+ case sseOpcodeMovd, sseOpcodeMovq:
+ legPrefix, opcode, argSwap = legacyPrefixes0x66, 0x0f7e, false
+ case sseOpcodeMovmskps:
+ legPrefix, opcode, argSwap = legacyPrefixesNone, 0x0f50, true
+ case sseOpcodeMovmskpd:
+ legPrefix, opcode, argSwap = legacyPrefixes0x66, 0x0f50, true
+ case sseOpcodePmovmskb:
+ legPrefix, opcode, argSwap = legacyPrefixes0x66, 0x0fd7, true
+ case sseOpcodeCvttss2si:
+ legPrefix, opcode, argSwap = legacyPrefixes0xF3, 0x0f2c, true
+ case sseOpcodeCvttsd2si:
+ legPrefix, opcode, argSwap = legacyPrefixes0xF2, 0x0f2c, true
+ default:
+ panic(fmt.Sprintf("Unsupported sseOpcode: %s", sseOpcode(i.u1)))
+ }
+
+ var rex rexInfo
+ if i.b1 {
+ rex = rex.setW()
+ } else {
+ rex = rex.clearW()
+ }
+ src := regEncodings[i.op1.reg().RealReg()]
+ dst := regEncodings[i.op2.reg().RealReg()]
+ if argSwap {
+ src, dst = dst, src
+ }
+ encodeRegReg(c, legPrefix, opcode, opcodeNum, src, dst, rex)
+
+ case cvtUint64ToFloatSeq:
+ panic("TODO")
+ case cvtFloatToSintSeq:
+ panic("TODO")
+ case cvtFloatToUintSeq:
+ panic("TODO")
+ case xmmMinMaxSeq:
+ panic("TODO")
+ case xmmCmpRmR:
+ var prefix legacyPrefixes
+ var opcode uint32
+ var opcodeNum uint32
+ rex := rexInfo(0)
+ _64 := i.b1
+ if _64 { // 64 bit.
+ rex = rex.setW()
+ } else {
+ rex = rex.clearW()
+ }
+
+ op := sseOpcode(i.u1)
+ switch op {
+ case sseOpcodePtest:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f3817, 3
+ case sseOpcodeUcomisd:
+ prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f2e, 2
+ case sseOpcodeUcomiss:
+ prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0f2e, 2
+ default:
+ panic(fmt.Sprintf("Unsupported sseOpcode: %s", op))
+ }
+
+ dst := regEncodings[i.op2.reg().RealReg()]
+ op1 := i.op1
+ switch op1.kind {
+ case operandKindReg:
+ reg := regEncodings[op1.reg().RealReg()]
+ encodeRegReg(c, prefix, opcode, opcodeNum, dst, reg, rex)
+
+ case operandKindMem:
+ m := op1.addressMode()
+ encodeRegMem(c, prefix, opcode, opcodeNum, dst, m, rex)
+
+ default:
+ panic("BUG: invalid operand kind")
+ }
+ case xmmRmRImm:
+ op := sseOpcode(i.u1)
+ var legPrex legacyPrefixes
+ var opcode uint32
+ var opcodeNum uint32
+ var swap bool
+ switch op {
+ case sseOpcodeCmpps:
+ legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0FC2, 2
+ case sseOpcodeCmppd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FC2, 2
+ case sseOpcodeCmpss:
+ legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0FC2, 2
+ case sseOpcodeCmpsd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0FC2, 2
+ case sseOpcodeInsertps:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A21, 3
+ case sseOpcodePalignr:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A0F, 3
+ case sseOpcodePinsrb:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A20, 3
+ case sseOpcodePinsrw:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FC4, 2
+ case sseOpcodePinsrd, sseOpcodePinsrq:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A22, 3
+ case sseOpcodePextrb:
+ swap = true
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A14, 3
+ case sseOpcodePextrw:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FC5, 2
+ case sseOpcodePextrd, sseOpcodePextrq:
+ swap = true
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A16, 3
+ case sseOpcodePshufd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F70, 2
+ case sseOpcodeRoundps:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A08, 3
+ case sseOpcodeRoundpd:
+ legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A09, 3
+ case sseOpcodeShufps:
+ legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0FC6, 2
+ default:
+ panic(fmt.Sprintf("Unsupported sseOpcode: %s", op))
+ }
+
+ dst := regEncodings[i.op2.reg().RealReg()]
+
+ var rex rexInfo
+ if op == sseOpcodePextrq || op == sseOpcodePinsrq {
+ rex = rexInfo(0).setW()
+ } else {
+ rex = rexInfo(0).clearW()
+ }
+ op1 := i.op1
+ if op1.kind == operandKindReg {
+ src := regEncodings[op1.reg().RealReg()]
+ if swap {
+ src, dst = dst, src
+ }
+ encodeRegReg(c, legPrex, opcode, opcodeNum, dst, src, rex)
+ } else if i.op1.kind == operandKindMem {
+ if swap {
+ panic("BUG: this is not possible to encode")
+ }
+ m := i.op1.addressMode()
+ encodeRegMem(c, legPrex, opcode, opcodeNum, dst, m, rex)
+ } else {
+ panic("BUG: invalid operand kind")
+ }
+
+ c.EmitByte(byte(i.u2))
+
+ case jmp:
+ const (
+ regMemOpcode = 0xff
+ regMemOpcodeNum = 1
+ regMemSubOpcode = 4
+ )
+ op := i.op1
+ switch op.kind {
+ case operandKindLabel:
+ needsLabelResolution = true
+ fallthrough
+ case operandKindImm32:
+ c.EmitByte(0xe9)
+ c.Emit4Bytes(op.imm32())
+ case operandKindMem:
+ m := op.addressMode()
+ encodeRegMem(c,
+ legacyPrefixesNone,
+ regMemOpcode, regMemOpcodeNum,
+ regMemSubOpcode, m, rexInfo(0).clearW(),
+ )
+ case operandKindReg:
+ r := op.reg().RealReg()
+ encodeRegReg(
+ c,
+ legacyPrefixesNone,
+ regMemOpcode, regMemOpcodeNum,
+ regMemSubOpcode,
+ regEncodings[r], rexInfo(0).clearW(),
+ )
+ default:
+ panic("BUG: invalid operand kind")
+ }
+
+ case jmpIf:
+ op := i.op1
+ switch op.kind {
+ case operandKindLabel:
+ needsLabelResolution = true
+ fallthrough
+ case operandKindImm32:
+ c.EmitByte(0x0f)
+ c.EmitByte(0x80 | cond(i.u1).encoding())
+ c.Emit4Bytes(op.imm32())
+ default:
+ panic("BUG: invalid operand kind")
+ }
+
+ case jmpTableIsland:
+ needsLabelResolution = true
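+ // Reserve one 8-byte zero placeholder per jump-table entry; the entries are filled in
+ // later during label resolution.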
+ for tc := uint64(0); tc < i.u2; tc++ {
+ c.Emit8Bytes(0)
+ }
+
+ case exitSequence:
+ execCtx := i.op1.reg()
+ allocatedAmode := i.op2.addressMode()
+
+ // Restore RBP and RSP, then return to the Go code:
+ *allocatedAmode = amode{
+ kindWithShift: uint32(amodeImmReg), base: execCtx,
+ imm32: wazevoapi.ExecutionContextOffsetOriginalFramePointer.U32(),
+ }
+ encodeLoad64(c, allocatedAmode, rbp)
+ allocatedAmode.imm32 = wazevoapi.ExecutionContextOffsetOriginalStackPointer.U32()
+ encodeLoad64(c, allocatedAmode, rsp)
+ encodeRet(c)
+
+ case ud2:
+ c.EmitByte(0x0f)
+ c.EmitByte(0x0b)
+
+ case call:
+ c.EmitByte(0xe8)
+ // The call target is a function reference and requires relocation.
+ c.AddRelocationInfo(ssa.FuncRef(i.u1))
+ // Note that this emits zero as a placeholder for the call target; it is patched via the relocation recorded above.
+ c.Emit4Bytes(uint32(i.u2))
+
+ case callIndirect:
+ op := i.op1
+
+ const opcodeNum = 1
+ const opcode = 0xff
+ rex := rexInfo(0).clearW()
+ switch op.kind {
+ case operandKindReg:
+ dst := regEncodings[op.reg().RealReg()]
+ encodeRegReg(c,
+ legacyPrefixesNone,
+ opcode, opcodeNum,
+ regEnc(2),
+ dst,
+ rex,
+ )
+ case operandKindMem:
+ m := op.addressMode()
+ encodeRegMem(c,
+ legacyPrefixesNone,
+ opcode, opcodeNum,
+ regEnc(2),
+ m,
+ rex,
+ )
+ default:
+ panic("BUG: invalid operand kind")
+ }
+
+ case xchg:
+ src, dst := regEncodings[i.op1.reg().RealReg()], i.op2
+ size := i.u1
+
+ var rex rexInfo
+ var opcode uint32
+ lp := legacyPrefixesNone
+ switch size {
+ case 8:
+ opcode = 0x87
+ rex = rexInfo(0).setW()
+ case 4:
+ opcode = 0x87
+ rex = rexInfo(0).clearW()
+ case 2:
+ lp = legacyPrefixes0x66
+ opcode = 0x87
+ rex = rexInfo(0).clearW()
+ case 1:
+ opcode = 0x86
+ if i.op2.kind == operandKindReg {
+ panic("TODO?: xchg on two 1-byte registers")
+ }
+ // Byte accesses to SPL, BPL, SIL, or DIL require a REX prefix (even with no bits set);
+ // otherwise the same encodings would select AH/CH/DH/BH.
+ if e := src.encoding(); e >= 4 && e <= 7 {
+ rex = rexInfo(0).always()
+ }
+ default:
+ panic(fmt.Sprintf("BUG: invalid size %d: %s", size, i.String()))
+ }
+
+ switch dst.kind {
+ case operandKindMem:
+ m := dst.addressMode()
+ encodeRegMem(c, lp, opcode, 1, src, m, rex)
+ case operandKindReg:
+ r := dst.reg().RealReg()
+ encodeRegReg(c, lp, opcode, 1, src, regEncodings[r], rex)
+ default:
+ panic("BUG: invalid operand kind")
+ }
+
+ case lockcmpxchg:
+ src, dst := regEncodings[i.op1.reg().RealReg()], i.op2
+ size := i.u1
+
+ var rex rexInfo
+ var opcode uint32
+ lp := legacyPrefixes0xF0 // Lock prefix.
+ switch size {
+ case 8:
+ opcode = 0x0FB1
+ rex = rexInfo(0).setW()
+ case 4:
+ opcode = 0x0FB1
+ rex = rexInfo(0).clearW()
+ case 2:
+ lp = legacyPrefixes0x660xF0 // Legacy prefix + Lock prefix.
+ opcode = 0x0FB1
+ rex = rexInfo(0).clearW()
+ case 1:
+ opcode = 0x0FB0
+ // Byte accesses to SPL, BPL, SIL, or DIL require a REX prefix (even with no bits set);
+ // otherwise the same encodings would select AH/CH/DH/BH.
+ if e := src.encoding(); e >= 4 && e <= 7 {
+ rex = rexInfo(0).always()
+ }
+ default:
+ panic(fmt.Sprintf("BUG: invalid size %d: %s", size, i.String()))
+ }
+
+ switch dst.kind {
+ case operandKindMem:
+ m := dst.addressMode()
+ encodeRegMem(c, lp, opcode, 2, src, m, rex)
+ default:
+ panic("BUG: invalid operand kind")
+ }
+
+ case lockxadd:
+ src, dst := regEncodings[i.op1.reg().RealReg()], i.op2
+ size := i.u1
+
+ var rex rexInfo
+ var opcode uint32
+ lp := legacyPrefixes0xF0 // Lock prefix.
+ switch size {
+ case 8:
+ opcode = 0x0FC1
+ rex = rexInfo(0).setW()
+ case 4:
+ opcode = 0x0FC1
+ rex = rexInfo(0).clearW()
+ case 2:
+ lp = legacyPrefixes0x660xF0 // Legacy prefix + Lock prefix.
+ opcode = 0x0FC1
+ rex = rexInfo(0).clearW()
+ case 1:
+ opcode = 0x0FC0
+ // Byte accesses to SPL, BPL, SIL, or DIL require a REX prefix (even with no bits set);
+ // otherwise the same encodings would select AH/CH/DH/BH.
+ if e := src.encoding(); e >= 4 && e <= 7 {
+ rex = rexInfo(0).always()
+ }
+ default:
+ panic(fmt.Sprintf("BUG: invalid size %d: %s", size, i.String()))
+ }
+
+ switch dst.kind {
+ case operandKindMem:
+ m := dst.addressMode()
+ encodeRegMem(c, lp, opcode, 2, src, m, rex)
+ default:
+ panic("BUG: invalid operand kind")
+ }
+
+ case zeros:
+ r := i.op2.reg()
+ if r.RegType() == regalloc.RegTypeInt {
+ i.asAluRmiR(aluRmiROpcodeXor, newOperandReg(r), r, true)
+ } else {
+ i.asXmmRmR(sseOpcodePxor, newOperandReg(r), r)
+ }
+ i.encode(c)
+
+ case mfence:
+ // https://www.felixcloutier.com/x86/mfence
+ c.EmitByte(0x0f)
+ c.EmitByte(0xae)
+ c.EmitByte(0xf0)
+
+ default:
+ panic(fmt.Sprintf("TODO: %v", i.kind))
+ }
+ return
+}
+
+func encodeLoad64(c backend.Compiler, m *amode, rd regalloc.RealReg) {
+ dst := regEncodings[rd]
+ encodeRegMem(c, legacyPrefixesNone, 0x8b, 1, dst, m, rexInfo(0).setW())
+}
+
+func encodeRet(c backend.Compiler) {
+ c.EmitByte(0xc3)
+}
+
+func encodeEncEnc(
+ c backend.Compiler,
+ legPrefixes legacyPrefixes,
+ opcodes uint32,
+ opcodeNum uint32,
+ r uint8,
+ rm uint8,
+ rex rexInfo,
+) {
+ legPrefixes.encode(c)
+ rex.encode(c, r>>3, rm>>3)
+
+ for opcodeNum > 0 {
+ opcodeNum--
+ c.EmitByte(byte((opcodes >> (opcodeNum << 3)) & 0xff))
+ }
+ c.EmitByte(encodeModRM(3, r&7, rm&7))
+}
+
+func encodeRegReg(
+ c backend.Compiler,
+ legPrefixes legacyPrefixes,
+ opcodes uint32,
+ opcodeNum uint32,
+ r regEnc,
+ rm regEnc,
+ rex rexInfo,
+) {
+ encodeEncEnc(c, legPrefixes, opcodes, opcodeNum, uint8(r), uint8(rm), rex)
+}
+
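+// encodeModRM packs the ModRM byte: mod (2 bits) | reg (3 bits) | r/m (3 bits).
+// For example, encodeModRM(0b11, 0b000, 0b001) == 0xc1, i.e. register-direct rax (reg) with rcx (r/m).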
+func encodeModRM(mod byte, reg byte, rm byte) byte {
+ return mod<<6 | reg<<3 | rm
+}
+
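+// encodeSIB packs the SIB byte: scale (2 bits) | index (3 bits) | base (3 bits).
+// For example, encodeSIB(0b10, 0b001, 0b011) == 0x8b, i.e. [rbx + rcx*4].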
+func encodeSIB(shift byte, encIndex byte, encBase byte) byte {
+ return shift<<6 | encIndex<<3 | encBase
+}
+
+func encodeRegMem(
+ c backend.Compiler, legPrefixes legacyPrefixes, opcodes uint32, opcodeNum uint32, r regEnc, m *amode, rex rexInfo,
+) (needsLabelResolution bool) {
+ needsLabelResolution = encodeEncMem(c, legPrefixes, opcodes, opcodeNum, uint8(r), m, rex)
+ return
+}
+
+func encodeEncMem(
+ c backend.Compiler, legPrefixes legacyPrefixes, opcodes uint32, opcodeNum uint32, r uint8, m *amode, rex rexInfo,
+) (needsLabelResolution bool) {
+ legPrefixes.encode(c)
+
+ const (
+ modNoDisplacement = 0b00
+ modShortDisplacement = 0b01
+ modLongDisplacement = 0b10
+
+ useSBI = 4 // ModRM r/m value of rsp/r12, which signals that a SIB byte follows.
+ )
+
+ switch m.kind() {
+ case amodeImmReg, amodeImmRBP:
+ base := m.base.RealReg()
+ baseEnc := regEncodings[base]
+
+ rex.encode(c, regRexBit(r), baseEnc.rexBit())
+
+ for opcodeNum > 0 {
+ opcodeNum--
+ c.EmitByte(byte((opcodes >> (opcodeNum << 3)) & 0xff))
+ }
+
+ // SIB byte is the last byte of the memory encoding before the displacement
+ const sibByte = 0x24 // == encodeSIB(0, 4, 4)
+
+ immZero, baseRbp, baseR13 := m.imm32 == 0, base == rbp, base == r13
+ short := lower8willSignExtendTo32(m.imm32)
+ rspOrR12 := base == rsp || base == r12
+
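+ // Pick the shortest ModRM form: mod=00 (no displacement) when the offset is zero and the base
+ // is not rbp/r13, mod=01 (8-bit displacement) when the offset fits in a sign-extended byte,
+ // and mod=10 (32-bit displacement) otherwise. rsp/r12 as base additionally require a SIB byte.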
+ if immZero && !baseRbp && !baseR13 { // rbp or r13 can't be the base in the no-displacement encoding.
+ c.EmitByte(encodeModRM(modNoDisplacement, regEncoding(r), baseEnc.encoding()))
+ if rspOrR12 {
+ c.EmitByte(sibByte)
+ }
+ } else if short { // Note: this includes the case where m.imm32 == 0 && (base == rbp || base == r13).
+ c.EmitByte(encodeModRM(modShortDisplacement, regEncoding(r), baseEnc.encoding()))
+ if rspOrR12 {
+ c.EmitByte(sibByte)
+ }
+ c.EmitByte(byte(m.imm32))
+ } else {
+ c.EmitByte(encodeModRM(modLongDisplacement, regEncoding(r), baseEnc.encoding()))
+ if rspOrR12 {
+ c.EmitByte(sibByte)
+ }
+ c.Emit4Bytes(m.imm32)
+ }
+
+ case amodeRegRegShift:
+ base := m.base.RealReg()
+ baseEnc := regEncodings[base]
+ index := m.index.RealReg()
+ indexEnc := regEncodings[index]
+
+ if index == rsp {
+ panic("BUG: rsp can't be used as index of addressing mode")
+ }
+
+ rex.encodeForIndex(c, regEnc(r), indexEnc, baseEnc)
+
+ for opcodeNum > 0 {
+ opcodeNum--
+ c.EmitByte(byte((opcodes >> (opcodeNum << 3)) & 0xff))
+ }
+
+ immZero, baseRbp, baseR13 := m.imm32 == 0, base == rbp, base == r13
+ if immZero && !baseRbp && !baseR13 { // rbp or r13 can't be the base in the no-displacement encoding: with mod=00 and a SIB byte, a base encoding of 0b101 means disp32-only (no base register).
+ c.EmitByte(encodeModRM(modNoDisplacement, regEncoding(r), useSBI))
+ c.EmitByte(encodeSIB(m.shift(), indexEnc.encoding(), baseEnc.encoding()))
+ } else if lower8willSignExtendTo32(m.imm32) {
+ c.EmitByte(encodeModRM(modShortDisplacement, regEncoding(r), useSBI))
+ c.EmitByte(encodeSIB(m.shift(), indexEnc.encoding(), baseEnc.encoding()))
+ c.EmitByte(byte(m.imm32))
+ } else {
+ c.EmitByte(encodeModRM(modLongDisplacement, regEncoding(r), useSBI))
+ c.EmitByte(encodeSIB(m.shift(), indexEnc.encoding(), baseEnc.encoding()))
+ c.Emit4Bytes(m.imm32)
+ }
+
+ case amodeRipRel:
+ rex.encode(c, regRexBit(r), 0)
+ for opcodeNum > 0 {
+ opcodeNum--
+ c.EmitByte(byte((opcodes >> (opcodeNum << 3)) & 0xff))
+ }
+
+ // Indicate RIP-relative addressing: [RIP + 32-bit displacement].
+ // https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing
+ c.EmitByte(encodeModRM(0b00, regEncoding(r), 0b101))
+
+ // This will be resolved later, so we just emit a placeholder.
+ needsLabelResolution = true
+ c.Emit4Bytes(0)
+
+ default:
+ panic("BUG: invalid addressing mode")
+ }
+ return
+}
+
+const (
+ rexEncodingDefault byte = 0x40
+ rexEncodingW = rexEncodingDefault | 0x08
+)
+
+// rexInfo is a bit set to indicate:
+//
+// 0x01: REX.W bit must be set.
+// 0x02: REX prefix must be emitted regardless of the other bits.
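+//
+// For example, encoding rax (no extension bits) after setW() emits the single prefix byte 0x48 (REX.W),
+// while after clearW() no REX prefix is emitted at all for legacy registers.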
+type rexInfo byte
+
+func (ri rexInfo) setW() rexInfo {
+ return ri | 0x01
+}
+
+func (ri rexInfo) clearW() rexInfo {
+ return ri & 0x02
+}
+
+func (ri rexInfo) always() rexInfo {
+ return ri | 0x02
+}
+
+func (ri rexInfo) notAlways() rexInfo { //nolint
+ return ri & 0x01
+}
+
+func (ri rexInfo) encode(c backend.Compiler, r uint8, b uint8) {
+ var w byte = 0
+ if ri&0x01 != 0 {
+ w = 0x01
+ }
+ rex := rexEncodingDefault | w<<3 | r<<2 | b
+ if rex != rexEncodingDefault || ri&0x02 != 0 {
+ c.EmitByte(rex)
+ }
+}
+
+func (ri rexInfo) encodeForIndex(c backend.Compiler, encR regEnc, encIndex regEnc, encBase regEnc) {
+ var w byte = 0
+ if ri&0x01 != 0 {
+ w = 0x01
+ }
+ r := encR.rexBit()
+ x := encIndex.rexBit()
+ b := encBase.rexBit()
+ rex := byte(0x40) | w<<3 | r<<2 | x<<1 | b
+ if rex != 0x40 || ri&0x02 != 0 {
+ c.EmitByte(rex)
+ }
+}
+
+type regEnc byte
+
+func (r regEnc) rexBit() byte {
+ return regRexBit(byte(r))
+}
+
+func (r regEnc) encoding() byte {
+ return regEncoding(byte(r))
+}
+
+func regRexBit(r byte) byte {
+ return r >> 3
+}
+
+func regEncoding(r byte) byte {
+ return r & 0x07
+}
+
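+// regEncodings maps each RealReg to its 4-bit hardware encoding: the low 3 bits go into the
+// ModRM/SIB byte, and the 4th bit (r8-r15 / xmm8-xmm15) is carried by the REX.R/X/B bits.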
+var regEncodings = [...]regEnc{
+ rax: 0b000,
+ rcx: 0b001,
+ rdx: 0b010,
+ rbx: 0b011,
+ rsp: 0b100,
+ rbp: 0b101,
+ rsi: 0b110,
+ rdi: 0b111,
+ r8: 0b1000,
+ r9: 0b1001,
+ r10: 0b1010,
+ r11: 0b1011,
+ r12: 0b1100,
+ r13: 0b1101,
+ r14: 0b1110,
+ r15: 0b1111,
+ xmm0: 0b000,
+ xmm1: 0b001,
+ xmm2: 0b010,
+ xmm3: 0b011,
+ xmm4: 0b100,
+ xmm5: 0b101,
+ xmm6: 0b110,
+ xmm7: 0b111,
+ xmm8: 0b1000,
+ xmm9: 0b1001,
+ xmm10: 0b1010,
+ xmm11: 0b1011,
+ xmm12: 0b1100,
+ xmm13: 0b1101,
+ xmm14: 0b1110,
+ xmm15: 0b1111,
+}
+
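+// legacyPrefixes enumerates the legacy prefix byte sequences emitted before the REX prefix and
+// opcode: 0x66 (operand-size override / mandatory SSE prefix), 0xF0 (LOCK), and 0xF2/0xF3
+// (mandatory SSE prefixes, also REPNE/REP).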
+type legacyPrefixes byte
+
+const (
+ legacyPrefixesNone legacyPrefixes = iota
+ legacyPrefixes0x66
+ legacyPrefixes0xF0
+ legacyPrefixes0x660xF0
+ legacyPrefixes0xF2
+ legacyPrefixes0xF3
+)
+
+func (p legacyPrefixes) encode(c backend.Compiler) {
+ switch p {
+ case legacyPrefixesNone:
+ case legacyPrefixes0x66:
+ c.EmitByte(0x66)
+ case legacyPrefixes0xF0:
+ c.EmitByte(0xf0)
+ case legacyPrefixes0x660xF0:
+ c.EmitByte(0x66)
+ c.EmitByte(0xf0)
+ case legacyPrefixes0xF2:
+ c.EmitByte(0xf2)
+ case legacyPrefixes0xF3:
+ c.EmitByte(0xf3)
+ default:
+ panic("BUG: invalid legacy prefix")
+ }
+}
+
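+// lower32willSignExtendTo64 reports whether the lower 32 bits of x, sign-extended to 64 bits,
+// reproduce x. For example, 0xffff_ffff_8000_0000 qualifies, while 0x8000_0000 does not.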
+func lower32willSignExtendTo64(x uint64) bool {
+ xs := int64(x)
+ return xs == int64(uint64(int32(xs)))
+}
+
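+// lower8willSignExtendTo32 reports whether the lower 8 bits of x, sign-extended to 32 bits,
+// reproduce x. For example, 0xffff_ff80 (-128) qualifies, while 0x80 (128) does not.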
+func lower8willSignExtendTo32(x uint32) bool {
+ xs := int32(x)
+ return xs == ((xs << 24) >> 24)
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_constant.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_constant.go
new file mode 100644
index 000000000..55d05ef63
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_constant.go
@@ -0,0 +1,71 @@
+package amd64
+
+import (
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+)
+
+// lowerConstant allocates a new VReg and inserts the instruction to load the constant value.
+func (m *machine) lowerConstant(instr *ssa.Instruction) (vr regalloc.VReg) {
+ val := instr.Return()
+ valType := val.Type()
+
+ vr = m.c.AllocateVReg(valType)
+ m.insertLoadConstant(instr, vr)
+ return
+}
+
+// InsertLoadConstantBlockArg implements backend.Machine.
+func (m *machine) InsertLoadConstantBlockArg(instr *ssa.Instruction, vr regalloc.VReg) {
+ m.insertLoadConstant(instr, vr)
+}
+
+func (m *machine) insertLoadConstant(instr *ssa.Instruction, vr regalloc.VReg) {
+ val := instr.Return()
+ valType := val.Type()
+ v := instr.ConstantVal()
+
+ bits := valType.Bits()
+ if bits < 64 { // Clear the redundant bits just in case it's unexpectedly sign-extended, etc.
+ v = v & ((1 << valType.Bits()) - 1)
+ }
+
+ switch valType {
+ case ssa.TypeF32, ssa.TypeF64:
+ m.lowerFconst(vr, v, bits == 64)
+ case ssa.TypeI32, ssa.TypeI64:
+ m.lowerIconst(vr, v, bits == 64)
+ default:
+ panic("BUG")
+ }
+}
+
+func (m *machine) lowerFconst(dst regalloc.VReg, c uint64, _64 bool) {
+ if c == 0 {
+ xor := m.allocateInstr().asZeros(dst)
+ m.insert(xor)
+ } else {
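+ // Materialize the constant bit pattern in a general-purpose register first, then move it
+ // into the destination XMM register, since amd64 has no float immediate form.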
+ var tmpType ssa.Type
+ if _64 {
+ tmpType = ssa.TypeI64
+ } else {
+ tmpType = ssa.TypeI32
+ }
+ tmpInt := m.c.AllocateVReg(tmpType)
+ loadToGP := m.allocateInstr().asImm(tmpInt, c, _64)
+ m.insert(loadToGP)
+
+ movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpInt), dst, _64)
+ m.insert(movToXmm)
+ }
+}
+
+func (m *machine) lowerIconst(dst regalloc.VReg, c uint64, _64 bool) {
+ i := m.allocateInstr()
+ if c == 0 {
+ i.asZeros(dst)
+ } else {
+ i.asImm(dst, c, _64)
+ }
+ m.insert(i)
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_mem.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_mem.go
new file mode 100644
index 000000000..bee673d25
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_mem.go
@@ -0,0 +1,187 @@
+package amd64
+
+import (
+ "fmt"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+)
+
+var addendsMatchOpcodes = [...]ssa.Opcode{ssa.OpcodeUExtend, ssa.OpcodeSExtend, ssa.OpcodeIadd, ssa.OpcodeIconst, ssa.OpcodeIshl}
+
+type addend struct {
+ r regalloc.VReg
+ off int64
+ shift byte
+}
+
+func (a addend) String() string {
+ return fmt.Sprintf("addend{r=%s, off=%d, shift=%d}", a.r, a.off, a.shift)
+}
+
+// lowerToAddressMode converts a pointer to an addressMode that can be used as an operand for load/store instructions.
+func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32) (am *amode) {
+ def := m.c.ValueDefinition(ptr)
+
+ if offsetBase&0x80000000 != 0 {
+ // Special-case a huge base offset whose MSB is set. On x64 the immediate is always
+ // sign-extended, but our IR semantics require the offset base to be treated as unsigned.
+ // This should be extremely rare (if it ever happens in a real application),
+ // so there is no need to optimize this case.
+
+ a := m.lowerAddend(def)
+ off64 := a.off + int64(offsetBase)
+ offsetBaseReg := m.c.AllocateVReg(ssa.TypeI64)
+ m.lowerIconst(offsetBaseReg, uint64(off64), true)
+ if a.r != regalloc.VRegInvalid {
+ return m.newAmodeRegRegShift(0, offsetBaseReg, a.r, a.shift)
+ } else {
+ return m.newAmodeImmReg(0, offsetBaseReg)
+ }
+ }
+
+ if op := m.c.MatchInstrOneOf(def, addendsMatchOpcodes[:]); op == ssa.OpcodeIadd {
+ add := def.Instr
+ x, y := add.Arg2()
+ xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y)
+ ax := m.lowerAddend(xDef)
+ ay := m.lowerAddend(yDef)
+ add.MarkLowered()
+ return m.lowerAddendsToAmode(ax, ay, offsetBase)
+ } else {
+ // If it is not an Iadd, then we lower the one addend.
+ a := m.lowerAddend(def)
+ // off is always 0 if r is valid.
+ if a.r != regalloc.VRegInvalid {
+ if a.shift != 0 {
+ tmpReg := m.c.AllocateVReg(ssa.TypeI64)
+ m.lowerIconst(tmpReg, 0, true)
+ return m.newAmodeRegRegShift(offsetBase, tmpReg, a.r, a.shift)
+ }
+ return m.newAmodeImmReg(offsetBase, a.r)
+ } else {
+ off64 := a.off + int64(offsetBase)
+ tmpReg := m.c.AllocateVReg(ssa.TypeI64)
+ m.lowerIconst(tmpReg, uint64(off64), true)
+ return m.newAmodeImmReg(0, tmpReg)
+ }
+ }
+}
+
+func (m *machine) lowerAddendsToAmode(x, y addend, offBase uint32) *amode {
+ if x.r != regalloc.VRegInvalid && x.off != 0 || y.r != regalloc.VRegInvalid && y.off != 0 {
+ panic("invalid input")
+ }
+
+ u64 := uint64(x.off+y.off) + uint64(offBase)
+ if u64 != 0 {
+ if _, ok := asImm32(u64, false); !ok {
+ tmpReg := m.c.AllocateVReg(ssa.TypeI64)
+ m.lowerIconst(tmpReg, u64, true)
+ // Zero u64 since it has already been lowered into tmpReg.
+ u64 = 0
+
+ if x.r == regalloc.VRegInvalid {
+ x.r = tmpReg
+ } else if y.r == regalloc.VRegInvalid {
+ y.r = tmpReg
+ } else {
+ // Both registers are already valid here, which contradicts the invariants
+ // checked above (a valid register implies a zero offset), so this is unreachable.
+ panic("BUG")
+ }
+ }
+ }
+
+ u32 := uint32(u64)
+ switch {
+ // Invariant (checked above): if x.r or y.r is valid, the corresponding offset is zero.
+ case x.r != regalloc.VRegInvalid && y.r != regalloc.VRegInvalid:
+ switch {
+ case x.shift != 0 && y.shift != 0:
+ // Cannot absorb two shifted registers, must lower one to a shift instruction.
+ shifted := m.allocateInstr()
+ shifted.asShiftR(shiftROpShiftLeft, newOperandImm32(uint32(x.shift)), x.r, true)
+ m.insert(shifted)
+
+ return m.newAmodeRegRegShift(u32, x.r, y.r, y.shift)
+ case x.shift != 0 && y.shift == 0:
+ // Swap base and index.
+ x, y = y, x
+ fallthrough
+ default:
+ return m.newAmodeRegRegShift(u32, x.r, y.r, y.shift)
+ }
+ case x.r == regalloc.VRegInvalid && y.r != regalloc.VRegInvalid:
+ x, y = y, x
+ fallthrough
+ case x.r != regalloc.VRegInvalid && y.r == regalloc.VRegInvalid:
+ if x.shift != 0 {
+ zero := m.c.AllocateVReg(ssa.TypeI64)
+ m.lowerIconst(zero, 0, true)
+ return m.newAmodeRegRegShift(u32, zero, x.r, x.shift)
+ }
+ return m.newAmodeImmReg(u32, x.r)
+ default: // Both are invalid: use the offset.
+ tmpReg := m.c.AllocateVReg(ssa.TypeI64)
+ m.lowerIconst(tmpReg, u64, true)
+ return m.newAmodeImmReg(0, tmpReg)
+ }
+}
+
+func (m *machine) lowerAddend(x *backend.SSAValueDefinition) addend {
+ if x.IsFromBlockParam() {
+ return addend{x.BlkParamVReg, 0, 0}
+ }
+ // Ensure the addend is not referenced in multiple places; we will discard nested Iadds.
+ op := m.c.MatchInstrOneOf(x, addendsMatchOpcodes[:])
+ if op != ssa.OpcodeInvalid && op != ssa.OpcodeIadd {
+ return m.lowerAddendFromInstr(x.Instr)
+ }
+ p := m.getOperand_Reg(x)
+ return addend{p.reg(), 0, 0}
+}
+
+// lowerAddendFromInstr takes an instruction and returns a VReg and an offset that can be used in an address mode.
+// The VReg is regalloc.VRegInvalid if the addend cannot be lowered to a register.
+// The offset is 0 if the addend can be lowered to a register.
+func (m *machine) lowerAddendFromInstr(instr *ssa.Instruction) addend {
+ instr.MarkLowered()
+ switch op := instr.Opcode(); op {
+ case ssa.OpcodeIconst:
+ u64 := instr.ConstantVal()
+ if instr.Return().Type().Bits() == 32 {
+ return addend{regalloc.VRegInvalid, int64(int32(u64)), 0} // sign-extend.
+ } else {
+ return addend{regalloc.VRegInvalid, int64(u64), 0}
+ }
+ case ssa.OpcodeUExtend, ssa.OpcodeSExtend:
+ input := instr.Arg()
+ inputDef := m.c.ValueDefinition(input)
+ if input.Type().Bits() != 32 {
+ panic("BUG: invalid input type " + input.Type().String())
+ }
+ constInst := inputDef.IsFromInstr() && inputDef.Instr.Constant()
+ switch {
+ case constInst && op == ssa.OpcodeSExtend:
+ return addend{regalloc.VRegInvalid, int64(uint32(inputDef.Instr.ConstantVal())), 0}
+ case constInst && op == ssa.OpcodeUExtend:
+ return addend{regalloc.VRegInvalid, int64(int32(inputDef.Instr.ConstantVal())), 0} // sign-extend!
+ default:
+ r := m.getOperand_Reg(inputDef)
+ return addend{r.reg(), 0, 0}
+ }
+ case ssa.OpcodeIshl:
+ // If the addend is a shift, we can only handle it if the shift amount is a constant.
+ x, amount := instr.Arg2()
+ amountDef := m.c.ValueDefinition(amount)
+ if amountDef.IsFromInstr() && amountDef.Instr.Constant() && amountDef.Instr.ConstantVal() <= 3 {
+ r := m.getOperand_Reg(m.c.ValueDefinition(x))
+ return addend{r.reg(), 0, uint8(amountDef.Instr.ConstantVal())}
+ }
+ r := m.getOperand_Reg(m.c.ValueDefinition(x))
+ return addend{r.reg(), 0, 0}
+ }
+ panic("BUG: invalid opcode")
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go
new file mode 100644
index 000000000..310ad2203
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go
@@ -0,0 +1,3611 @@
+package amd64
+
+import (
+ "context"
+ "encoding/binary"
+ "fmt"
+ "math"
+ "strings"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
+ "github.com/tetratelabs/wazero/internal/platform"
+)
+
+// NewBackend returns a new backend for amd64.
+func NewBackend() backend.Machine {
+ ectx := backend.NewExecutableContextT[instruction](
+ resetInstruction,
+ setNext,
+ setPrev,
+ asNop,
+ )
+ return &machine{
+ ectx: ectx,
+ cpuFeatures: platform.CpuFeatures,
+ regAlloc: regalloc.NewAllocator(regInfo),
+ spillSlots: map[regalloc.VRegID]int64{},
+ amodePool: wazevoapi.NewPool[amode](nil),
+ constSwizzleMaskConstIndex: -1,
+ constSqmulRoundSatIndex: -1,
+ constI8x16SHLMaskTableIndex: -1,
+ constI8x16LogicalSHRMaskTableIndex: -1,
+ constF64x2CvtFromIMaskIndex: -1,
+ constTwop52Index: -1,
+ constI32sMaxOnF64x2Index: -1,
+ constI32uMaxOnF64x2Index: -1,
+ constAllOnesI8x16Index: -1,
+ constAllOnesI16x8Index: -1,
+ constExtAddPairwiseI16x8uMask1Index: -1,
+ constExtAddPairwiseI16x8uMask2Index: -1,
+ }
+}
+
+type (
+ // machine implements backend.Machine for amd64.
+ machine struct {
+ c backend.Compiler
+ ectx *backend.ExecutableContextT[instruction]
+ stackBoundsCheckDisabled bool
+
+ amodePool wazevoapi.Pool[amode]
+
+ cpuFeatures platform.CpuFeatureFlags
+
+ regAlloc regalloc.Allocator
+ regAllocFn *backend.RegAllocFunction[*instruction, *machine]
+ regAllocStarted bool
+
+ spillSlotSize int64
+ spillSlots map[regalloc.VRegID]int64
+ currentABI *backend.FunctionABI
+ clobberedRegs []regalloc.VReg
+
+ maxRequiredStackSizeForCalls int64
+
+ labelResolutionPends []labelResolutionPend
+
+ jmpTableTargets [][]uint32
+ consts []_const
+
+ constSwizzleMaskConstIndex, constSqmulRoundSatIndex,
+ constI8x16SHLMaskTableIndex, constI8x16LogicalSHRMaskTableIndex,
+ constF64x2CvtFromIMaskIndex, constTwop52Index,
+ constI32sMaxOnF64x2Index, constI32uMaxOnF64x2Index,
+ constAllOnesI8x16Index, constAllOnesI16x8Index,
+ constExtAddPairwiseI16x8uMask1Index, constExtAddPairwiseI16x8uMask2Index int
+ }
+
+ _const struct {
+ lo, hi uint64
+ _var []byte
+ label *labelPosition
+ }
+
+ labelResolutionPend struct {
+ instr *instruction
+ instrOffset int64
+ // imm32Offset is the offset of the last 4 bytes of the instruction.
+ imm32Offset int64
+ }
+
+ labelPosition = backend.LabelPosition[instruction]
+)
+
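+// getOrAllocateConstLabel returns the label of the constant pool entry indexed by *i,
+// allocating the entry (and recording the new index back into *i) on first use.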
+func (m *machine) getOrAllocateConstLabel(i *int, _var []byte) backend.Label {
+ index := *i
+ if index == -1 {
+ label := m.allocateLabel()
+ index = len(m.consts)
+ m.consts = append(m.consts, _const{
+ _var: _var,
+ label: label,
+ })
+ *i = index
+ }
+ return m.consts[index].label.L
+}
+
+// Reset implements backend.Machine.
+func (m *machine) Reset() {
+ m.consts = m.consts[:0]
+ m.clobberedRegs = m.clobberedRegs[:0]
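+ // Reuse clobberedRegs as scratch space to collect the spill-slot keys before deleting them,
+ // so no extra slice is allocated; it is truncated again below.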
+ for key := range m.spillSlots {
+ m.clobberedRegs = append(m.clobberedRegs, regalloc.VReg(key))
+ }
+ for _, key := range m.clobberedRegs {
+ delete(m.spillSlots, regalloc.VRegID(key))
+ }
+
+ m.stackBoundsCheckDisabled = false
+ m.ectx.Reset()
+
+ m.regAllocFn.Reset()
+ m.regAlloc.Reset()
+ m.regAllocStarted = false
+ m.clobberedRegs = m.clobberedRegs[:0]
+
+ m.spillSlotSize = 0
+ m.maxRequiredStackSizeForCalls = 0
+
+ m.amodePool.Reset()
+ m.jmpTableTargets = m.jmpTableTargets[:0]
+ m.constSwizzleMaskConstIndex = -1
+ m.constSqmulRoundSatIndex = -1
+ m.constI8x16SHLMaskTableIndex = -1
+ m.constI8x16LogicalSHRMaskTableIndex = -1
+ m.constF64x2CvtFromIMaskIndex = -1
+ m.constTwop52Index = -1
+ m.constI32sMaxOnF64x2Index = -1
+ m.constI32uMaxOnF64x2Index = -1
+ m.constAllOnesI8x16Index = -1
+ m.constAllOnesI16x8Index = -1
+ m.constExtAddPairwiseI16x8uMask1Index = -1
+ m.constExtAddPairwiseI16x8uMask2Index = -1
+}
+
+// ExecutableContext implements backend.Machine.
+func (m *machine) ExecutableContext() backend.ExecutableContext { return m.ectx }
+
+// DisableStackCheck implements backend.Machine.
+func (m *machine) DisableStackCheck() { m.stackBoundsCheckDisabled = true }
+
+// SetCompiler implements backend.Machine.
+func (m *machine) SetCompiler(c backend.Compiler) {
+ m.c = c
+ m.regAllocFn = backend.NewRegAllocFunction[*instruction, *machine](m, c.SSABuilder(), c)
+}
+
+// SetCurrentABI implements backend.Machine.
+func (m *machine) SetCurrentABI(abi *backend.FunctionABI) {
+ m.currentABI = abi
+}
+
+// RegAlloc implements backend.Machine.
+func (m *machine) RegAlloc() {
+ rf := m.regAllocFn
+ for _, pos := range m.ectx.OrderedBlockLabels {
+ rf.AddBlock(pos.SB, pos.L, pos.Begin, pos.End)
+ }
+
+ m.regAllocStarted = true
+ m.regAlloc.DoAllocation(rf)
+ // Now that we know the final spill slot size, we must align spillSlotSize to 16 bytes.
+ m.spillSlotSize = (m.spillSlotSize + 15) &^ 15
+}
+
+// InsertReturn implements backend.Machine.
+func (m *machine) InsertReturn() {
+ i := m.allocateInstr().asRet()
+ m.insert(i)
+}
+
+// LowerSingleBranch implements backend.Machine.
+func (m *machine) LowerSingleBranch(b *ssa.Instruction) {
+ ectx := m.ectx
+ switch b.Opcode() {
+ case ssa.OpcodeJump:
+ _, _, targetBlk := b.BranchData()
+ if b.IsFallthroughJump() {
+ return
+ }
+ jmp := m.allocateInstr()
+ target := ectx.GetOrAllocateSSABlockLabel(targetBlk)
+ if target == backend.LabelReturn {
+ jmp.asRet()
+ } else {
+ jmp.asJmp(newOperandLabel(target))
+ }
+ m.insert(jmp)
+ case ssa.OpcodeBrTable:
+ index, target := b.BrTableData()
+ m.lowerBrTable(index, target)
+ default:
+ panic("BUG: unexpected branch opcode" + b.Opcode().String())
+ }
+}
+
+func (m *machine) addJmpTableTarget(targets []ssa.BasicBlock) (index int) {
+ // TODO: reuse the slice!
+ labels := make([]uint32, len(targets))
+ for j, target := range targets {
+ labels[j] = uint32(m.ectx.GetOrAllocateSSABlockLabel(target))
+ }
+ index = len(m.jmpTableTargets)
+ m.jmpTableTargets = append(m.jmpTableTargets, labels)
+ return
+}
+
+var condBranchMatches = [...]ssa.Opcode{ssa.OpcodeIcmp, ssa.OpcodeFcmp}
+
+func (m *machine) lowerBrTable(index ssa.Value, targets []ssa.BasicBlock) {
+ _v := m.getOperand_Reg(m.c.ValueDefinition(index))
+ v := m.copyToTmp(_v.reg())
+
+ // First, we need to do the bounds check.
+ maxIndex := m.c.AllocateVReg(ssa.TypeI32)
+ m.lowerIconst(maxIndex, uint64(len(targets)-1), false)
+ cmp := m.allocateInstr().asCmpRmiR(true, newOperandReg(maxIndex), v, false)
+ m.insert(cmp)
+
+ // Then conditionally move maxIndex into v if v > maxIndex, clamping the index.
+ cmov := m.allocateInstr().asCmove(condNB, newOperandReg(maxIndex), v, false)
+ m.insert(cmov)
+
+ // Now that v holds the clamped index, load the address of the jump table into addr.
+ addr := m.c.AllocateVReg(ssa.TypeI64)
+ leaJmpTableAddr := m.allocateInstr()
+ m.insert(leaJmpTableAddr)
+
+ // Then add the target's offset, loaded from the jump table, to addr.
+ loadTargetOffsetFromJmpTable := m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd,
+ // Shift by 3 because each entry is 8 bytes.
+ newOperandMem(m.newAmodeRegRegShift(0, addr, v, 3)), addr, true)
+ m.insert(loadTargetOffsetFromJmpTable)
+
+ // Now ready to jump.
+ jmp := m.allocateInstr().asJmp(newOperandReg(addr))
+ m.insert(jmp)
+
+ jmpTableBegin, jmpTableBeginLabel := m.allocateBrTarget()
+ m.insert(jmpTableBegin)
+ leaJmpTableAddr.asLEA(newOperandLabel(jmpTableBeginLabel), addr)
+
+ jmpTable := m.allocateInstr()
+ targetSliceIndex := m.addJmpTableTarget(targets)
+ jmpTable.asJmpTableSequence(targetSliceIndex, len(targets))
+ m.insert(jmpTable)
+}
+
+// LowerConditionalBranch implements backend.Machine.
+func (m *machine) LowerConditionalBranch(b *ssa.Instruction) {
+ exctx := m.ectx
+ cval, args, targetBlk := b.BranchData()
+ if len(args) > 0 {
+ panic(fmt.Sprintf(
+ "conditional branch shouldn't have args; likely a bug in critical edge splitting: from %s to %s",
+ exctx.CurrentSSABlk,
+ targetBlk,
+ ))
+ }
+
+ target := exctx.GetOrAllocateSSABlockLabel(targetBlk)
+ cvalDef := m.c.ValueDefinition(cval)
+
+ switch m.c.MatchInstrOneOf(cvalDef, condBranchMatches[:]) {
+ case ssa.OpcodeIcmp:
+ cvalInstr := cvalDef.Instr
+ x, y, c := cvalInstr.IcmpData()
+
+ cc := condFromSSAIntCmpCond(c)
+ if b.Opcode() == ssa.OpcodeBrz {
+ cc = cc.invert()
+ }
+
+ // First, perform the comparison and set the flag.
+ xd, yd := m.c.ValueDefinition(x), m.c.ValueDefinition(y)
+ if !m.tryLowerBandToFlag(xd, yd) {
+ m.lowerIcmpToFlag(xd, yd, x.Type() == ssa.TypeI64)
+ }
+
+ // Then perform the conditional branch.
+ m.insert(m.allocateInstr().asJmpIf(cc, newOperandLabel(target)))
+ cvalDef.Instr.MarkLowered()
+ case ssa.OpcodeFcmp:
+ cvalInstr := cvalDef.Instr
+
+ f1, f2, and := m.lowerFcmpToFlags(cvalInstr)
+ isBrz := b.Opcode() == ssa.OpcodeBrz
+ if isBrz {
+ f1 = f1.invert()
+ }
+ if f2 == condInvalid {
+ m.insert(m.allocateInstr().asJmpIf(f1, newOperandLabel(target)))
+ } else {
+ if isBrz {
+ f2 = f2.invert()
+ and = !and
+ }
+ jmp1, jmp2 := m.allocateInstr(), m.allocateInstr()
+ m.insert(jmp1)
+ m.insert(jmp2)
+ notTaken, notTakenLabel := m.allocateBrTarget()
+ m.insert(notTaken)
+ if and {
+ jmp1.asJmpIf(f1.invert(), newOperandLabel(notTakenLabel))
+ jmp2.asJmpIf(f2, newOperandLabel(target))
+ } else {
+ jmp1.asJmpIf(f1, newOperandLabel(target))
+ jmp2.asJmpIf(f2, newOperandLabel(target))
+ }
+ }
+
+ cvalDef.Instr.MarkLowered()
+ default:
+ v := m.getOperand_Reg(cvalDef)
+
+ var cc cond
+ if b.Opcode() == ssa.OpcodeBrz {
+ cc = condZ
+ } else {
+ cc = condNZ
+ }
+
+ // Perform test %v, %v to set the flag.
+ cmp := m.allocateInstr().asCmpRmiR(false, v, v.reg(), false)
+ m.insert(cmp)
+ m.insert(m.allocateInstr().asJmpIf(cc, newOperandLabel(target)))
+ }
+}
+
+// LowerInstr implements backend.Machine.
+func (m *machine) LowerInstr(instr *ssa.Instruction) {
+ if l := instr.SourceOffset(); l.Valid() {
+ info := m.allocateInstr().asEmitSourceOffsetInfo(l)
+ m.insert(info)
+ }
+
+ switch op := instr.Opcode(); op {
+ case ssa.OpcodeBrz, ssa.OpcodeBrnz, ssa.OpcodeJump, ssa.OpcodeBrTable:
+ panic("BUG: branching instructions are handled by LowerBranches")
+ case ssa.OpcodeReturn:
+ panic("BUG: return must be handled by backend.Compiler")
+ case ssa.OpcodeIconst, ssa.OpcodeF32const, ssa.OpcodeF64const: // Constant instructions are inlined.
+ case ssa.OpcodeCall, ssa.OpcodeCallIndirect:
+ m.lowerCall(instr)
+ case ssa.OpcodeStore, ssa.OpcodeIstore8, ssa.OpcodeIstore16, ssa.OpcodeIstore32:
+ m.lowerStore(instr)
+ case ssa.OpcodeIadd:
+ m.lowerAluRmiROp(instr, aluRmiROpcodeAdd)
+ case ssa.OpcodeIsub:
+ m.lowerAluRmiROp(instr, aluRmiROpcodeSub)
+ case ssa.OpcodeImul:
+ m.lowerAluRmiROp(instr, aluRmiROpcodeMul)
+ case ssa.OpcodeSdiv, ssa.OpcodeUdiv, ssa.OpcodeSrem, ssa.OpcodeUrem:
+ isDiv := op == ssa.OpcodeSdiv || op == ssa.OpcodeUdiv
+ isSigned := op == ssa.OpcodeSdiv || op == ssa.OpcodeSrem
+ m.lowerIDivRem(instr, isDiv, isSigned)
+ case ssa.OpcodeBand:
+ m.lowerAluRmiROp(instr, aluRmiROpcodeAnd)
+ case ssa.OpcodeBor:
+ m.lowerAluRmiROp(instr, aluRmiROpcodeOr)
+ case ssa.OpcodeBxor:
+ m.lowerAluRmiROp(instr, aluRmiROpcodeXor)
+ case ssa.OpcodeIshl:
+ m.lowerShiftR(instr, shiftROpShiftLeft)
+ case ssa.OpcodeSshr:
+ m.lowerShiftR(instr, shiftROpShiftRightArithmetic)
+ case ssa.OpcodeUshr:
+ m.lowerShiftR(instr, shiftROpShiftRightLogical)
+ case ssa.OpcodeRotl:
+ m.lowerShiftR(instr, shiftROpRotateLeft)
+ case ssa.OpcodeRotr:
+ m.lowerShiftR(instr, shiftROpRotateRight)
+ case ssa.OpcodeClz:
+ m.lowerClz(instr)
+ case ssa.OpcodeCtz:
+ m.lowerCtz(instr)
+ case ssa.OpcodePopcnt:
+ m.lowerUnaryRmR(instr, unaryRmROpcodePopcnt)
+ case ssa.OpcodeFadd, ssa.OpcodeFsub, ssa.OpcodeFmul, ssa.OpcodeFdiv:
+ m.lowerXmmRmR(instr)
+ case ssa.OpcodeFabs:
+ m.lowerFabsFneg(instr)
+ case ssa.OpcodeFneg:
+ m.lowerFabsFneg(instr)
+ case ssa.OpcodeCeil:
+ m.lowerRound(instr, roundingModeUp)
+ case ssa.OpcodeFloor:
+ m.lowerRound(instr, roundingModeDown)
+ case ssa.OpcodeTrunc:
+ m.lowerRound(instr, roundingModeZero)
+ case ssa.OpcodeNearest:
+ m.lowerRound(instr, roundingModeNearest)
+ case ssa.OpcodeFmin, ssa.OpcodeFmax:
+ m.lowerFminFmax(instr)
+ case ssa.OpcodeFcopysign:
+ m.lowerFcopysign(instr)
+ case ssa.OpcodeBitcast:
+ m.lowerBitcast(instr)
+ case ssa.OpcodeSqrt:
+ m.lowerSqrt(instr)
+ case ssa.OpcodeFpromote:
+ v := instr.Arg()
+ rn := m.getOperand_Reg(m.c.ValueDefinition(v))
+ rd := m.c.VRegOf(instr.Return())
+ cnt := m.allocateInstr()
+ cnt.asXmmUnaryRmR(sseOpcodeCvtss2sd, rn, rd)
+ m.insert(cnt)
+ case ssa.OpcodeFdemote:
+ v := instr.Arg()
+ rn := m.getOperand_Reg(m.c.ValueDefinition(v))
+ rd := m.c.VRegOf(instr.Return())
+ cnt := m.allocateInstr()
+ cnt.asXmmUnaryRmR(sseOpcodeCvtsd2ss, rn, rd)
+ m.insert(cnt)
+ case ssa.OpcodeFcvtToSint, ssa.OpcodeFcvtToSintSat:
+ x, ctx := instr.Arg2()
+ rn := m.getOperand_Reg(m.c.ValueDefinition(x))
+ rd := m.c.VRegOf(instr.Return())
+ ctxVReg := m.c.VRegOf(ctx)
+ m.lowerFcvtToSint(ctxVReg, rn.reg(), rd, x.Type() == ssa.TypeF64,
+ instr.Return().Type().Bits() == 64, op == ssa.OpcodeFcvtToSintSat)
+ case ssa.OpcodeFcvtToUint, ssa.OpcodeFcvtToUintSat:
+ x, ctx := instr.Arg2()
+ rn := m.getOperand_Reg(m.c.ValueDefinition(x))
+ rd := m.c.VRegOf(instr.Return())
+ ctxVReg := m.c.VRegOf(ctx)
+ m.lowerFcvtToUint(ctxVReg, rn.reg(), rd, x.Type() == ssa.TypeF64,
+ instr.Return().Type().Bits() == 64, op == ssa.OpcodeFcvtToUintSat)
+ case ssa.OpcodeFcvtFromSint:
+ x := instr.Arg()
+ rn := m.getOperand_Reg(m.c.ValueDefinition(x))
+ rd := newOperandReg(m.c.VRegOf(instr.Return()))
+ m.lowerFcvtFromSint(rn, rd,
+ x.Type() == ssa.TypeI64, instr.Return().Type().Bits() == 64)
+ case ssa.OpcodeFcvtFromUint:
+ x := instr.Arg()
+ rn := m.getOperand_Reg(m.c.ValueDefinition(x))
+ rd := newOperandReg(m.c.VRegOf(instr.Return()))
+ m.lowerFcvtFromUint(rn, rd, x.Type() == ssa.TypeI64,
+ instr.Return().Type().Bits() == 64)
+ case ssa.OpcodeVanyTrue:
+ m.lowerVanyTrue(instr)
+ case ssa.OpcodeVallTrue:
+ m.lowerVallTrue(instr)
+ case ssa.OpcodeVhighBits:
+ m.lowerVhighBits(instr)
+ case ssa.OpcodeVbnot:
+ m.lowerVbnot(instr)
+ case ssa.OpcodeVband:
+ x, y := instr.Arg2()
+ m.lowerVbBinOp(sseOpcodePand, x, y, instr.Return())
+ case ssa.OpcodeVbor:
+ x, y := instr.Arg2()
+ m.lowerVbBinOp(sseOpcodePor, x, y, instr.Return())
+ case ssa.OpcodeVbxor:
+ x, y := instr.Arg2()
+ m.lowerVbBinOp(sseOpcodePxor, x, y, instr.Return())
+ case ssa.OpcodeVbandnot:
+ m.lowerVbandnot(instr, sseOpcodePandn)
+ case ssa.OpcodeVbitselect:
+ m.lowerVbitselect(instr)
+ case ssa.OpcodeVIadd:
+ x, y, lane := instr.Arg2WithLane()
+ var vecOp sseOpcode
+ switch lane {
+ case ssa.VecLaneI8x16:
+ vecOp = sseOpcodePaddb
+ case ssa.VecLaneI16x8:
+ vecOp = sseOpcodePaddw
+ case ssa.VecLaneI32x4:
+ vecOp = sseOpcodePaddd
+ case ssa.VecLaneI64x2:
+ vecOp = sseOpcodePaddq
+ }
+ m.lowerVbBinOp(vecOp, x, y, instr.Return())
+
+ case ssa.OpcodeVSaddSat:
+ x, y, lane := instr.Arg2WithLane()
+ var vecOp sseOpcode
+ switch lane {
+ case ssa.VecLaneI8x16:
+ vecOp = sseOpcodePaddsb
+ case ssa.VecLaneI16x8:
+ vecOp = sseOpcodePaddsw
+ }
+ m.lowerVbBinOp(vecOp, x, y, instr.Return())
+
+ case ssa.OpcodeVUaddSat:
+ x, y, lane := instr.Arg2WithLane()
+ var vecOp sseOpcode
+ switch lane {
+ case ssa.VecLaneI8x16:
+ vecOp = sseOpcodePaddusb
+ case ssa.VecLaneI16x8:
+ vecOp = sseOpcodePaddusw
+ }
+ m.lowerVbBinOp(vecOp, x, y, instr.Return())
+
+ case ssa.OpcodeVIsub:
+ x, y, lane := instr.Arg2WithLane()
+ var vecOp sseOpcode
+ switch lane {
+ case ssa.VecLaneI8x16:
+ vecOp = sseOpcodePsubb
+ case ssa.VecLaneI16x8:
+ vecOp = sseOpcodePsubw
+ case ssa.VecLaneI32x4:
+ vecOp = sseOpcodePsubd
+ case ssa.VecLaneI64x2:
+ vecOp = sseOpcodePsubq
+ }
+ m.lowerVbBinOp(vecOp, x, y, instr.Return())
+
+ case ssa.OpcodeVSsubSat:
+ x, y, lane := instr.Arg2WithLane()
+ var vecOp sseOpcode
+ switch lane {
+ case ssa.VecLaneI8x16:
+ vecOp = sseOpcodePsubsb
+ case ssa.VecLaneI16x8:
+ vecOp = sseOpcodePsubsw
+ }
+ m.lowerVbBinOp(vecOp, x, y, instr.Return())
+
+ case ssa.OpcodeVUsubSat:
+ x, y, lane := instr.Arg2WithLane()
+ var vecOp sseOpcode
+ switch lane {
+ case ssa.VecLaneI8x16:
+ vecOp = sseOpcodePsubusb
+ case ssa.VecLaneI16x8:
+ vecOp = sseOpcodePsubusw
+ }
+ m.lowerVbBinOp(vecOp, x, y, instr.Return())
+
+ case ssa.OpcodeVImul:
+ m.lowerVImul(instr)
+ case ssa.OpcodeVIneg:
+ x, lane := instr.ArgWithLane()
+ rn := m.getOperand_Reg(m.c.ValueDefinition(x))
+ rd := m.c.VRegOf(instr.Return())
+ var vecOp sseOpcode
+ switch lane {
+ case ssa.VecLaneI8x16:
+ vecOp = sseOpcodePsubb
+ case ssa.VecLaneI16x8:
+ vecOp = sseOpcodePsubw
+ case ssa.VecLaneI32x4:
+ vecOp = sseOpcodePsubd
+ case ssa.VecLaneI64x2:
+ vecOp = sseOpcodePsubq
+ default:
+ panic("BUG")
+ }
+
+ tmp := m.c.AllocateVReg(ssa.TypeV128)
+ m.insert(m.allocateInstr().asZeros(tmp))
+
+ i := m.allocateInstr()
+ i.asXmmRmR(vecOp, rn, tmp)
+ m.insert(i)
+
+ m.copyTo(tmp, rd)
+ case ssa.OpcodeVFadd:
+ x, y, lane := instr.Arg2WithLane()
+ var vecOp sseOpcode
+ switch lane {
+ case ssa.VecLaneF32x4:
+ vecOp = sseOpcodeAddps
+ case ssa.VecLaneF64x2:
+ vecOp = sseOpcodeAddpd
+ }
+ m.lowerVbBinOp(vecOp, x, y, instr.Return())
+
+ case ssa.OpcodeVFsub:
+ x, y, lane := instr.Arg2WithLane()
+ var vecOp sseOpcode
+ switch lane {
+ case ssa.VecLaneF32x4:
+ vecOp = sseOpcodeSubps
+ case ssa.VecLaneF64x2:
+ vecOp = sseOpcodeSubpd
+ }
+ m.lowerVbBinOp(vecOp, x, y, instr.Return())
+
+ case ssa.OpcodeVFdiv:
+ x, y, lane := instr.Arg2WithLane()
+ var vecOp sseOpcode
+ switch lane {
+ case ssa.VecLaneF32x4:
+ vecOp = sseOpcodeDivps
+ case ssa.VecLaneF64x2:
+ vecOp = sseOpcodeDivpd
+ }
+ m.lowerVbBinOp(vecOp, x, y, instr.Return())
+
+ case ssa.OpcodeVFmul:
+ x, y, lane := instr.Arg2WithLane()
+ var vecOp sseOpcode
+ switch lane {
+ case ssa.VecLaneF32x4:
+ vecOp = sseOpcodeMulps
+ case ssa.VecLaneF64x2:
+ vecOp = sseOpcodeMulpd
+ }
+ m.lowerVbBinOp(vecOp, x, y, instr.Return())
+
+ case ssa.OpcodeVFneg:
+ x, lane := instr.ArgWithLane()
+ rn := m.getOperand_Reg(m.c.ValueDefinition(x))
+ rd := m.c.VRegOf(instr.Return())
+
+ tmp := m.c.AllocateVReg(ssa.TypeV128)
+
+ var shiftOp, xorOp sseOpcode
+ var shiftAmt uint32
+ switch lane {
+ case ssa.VecLaneF32x4:
+ shiftOp, shiftAmt, xorOp = sseOpcodePslld, 31, sseOpcodeXorps
+ case ssa.VecLaneF64x2:
+ shiftOp, shiftAmt, xorOp = sseOpcodePsllq, 63, sseOpcodeXorpd
+ }
+
+ zero := m.allocateInstr()
+ zero.asZeros(tmp)
+ m.insert(zero)
+
+ // Set all bits on tmp by comparing it with itself via CMPPD (a pseudo CMPEQPD instruction).
+ // See https://www.felixcloutier.com/x86/cmpps
+ //
+ // Note: if we did not clear all the bits of tmp with the zeroing above, this might end up
+ // not setting all ones on a lane whose value is NaN.
+ cmp := m.allocateInstr()
+ cmp.asXmmRmRImm(sseOpcodeCmppd, uint8(cmpPredEQ_UQ), newOperandReg(tmp), tmp)
+ m.insert(cmp)
+
+ // Do the left shift on each lane to set only the most significant bit in each.
+ i := m.allocateInstr()
+ i.asXmmRmiReg(shiftOp, newOperandImm32(shiftAmt), tmp)
+ m.insert(i)
+
+ // Get the negated result by XOR on each lane with tmp.
+ i = m.allocateInstr()
+ i.asXmmRmR(xorOp, rn, tmp)
+ m.insert(i)
+
+ m.copyTo(tmp, rd)
+
+ case ssa.OpcodeVSqrt:
+ x, lane := instr.ArgWithLane()
+ rn := m.getOperand_Reg(m.c.ValueDefinition(x))
+ rd := m.c.VRegOf(instr.Return())
+
+ var vecOp sseOpcode
+ switch lane {
+ case ssa.VecLaneF32x4:
+ vecOp = sseOpcodeSqrtps
+ case ssa.VecLaneF64x2:
+ vecOp = sseOpcodeSqrtpd
+ }
+ i := m.allocateInstr()
+ i.asXmmUnaryRmR(vecOp, rn, rd)
+ m.insert(i)
+
+ case ssa.OpcodeVImin:
+ x, y, lane := instr.Arg2WithLane()
+ var vecOp sseOpcode
+ switch lane {
+ case ssa.VecLaneI8x16:
+ vecOp = sseOpcodePminsb
+ case ssa.VecLaneI16x8:
+ vecOp = sseOpcodePminsw
+ case ssa.VecLaneI32x4:
+ vecOp = sseOpcodePminsd
+ }
+ m.lowerVbBinOp(vecOp, x, y, instr.Return())
+
+ case ssa.OpcodeVUmin:
+ x, y, lane := instr.Arg2WithLane()
+ var vecOp sseOpcode
+ switch lane {
+ case ssa.VecLaneI8x16:
+ vecOp = sseOpcodePminub
+ case ssa.VecLaneI16x8:
+ vecOp = sseOpcodePminuw
+ case ssa.VecLaneI32x4:
+ vecOp = sseOpcodePminud
+ }
+ m.lowerVbBinOp(vecOp, x, y, instr.Return())
+
+ case ssa.OpcodeVImax:
+ x, y, lane := instr.Arg2WithLane()
+ var vecOp sseOpcode
+ switch lane {
+ case ssa.VecLaneI8x16:
+ vecOp = sseOpcodePmaxsb
+ case ssa.VecLaneI16x8:
+ vecOp = sseOpcodePmaxsw
+ case ssa.VecLaneI32x4:
+ vecOp = sseOpcodePmaxsd
+ }
+ m.lowerVbBinOp(vecOp, x, y, instr.Return())
+
+ case ssa.OpcodeVUmax:
+ x, y, lane := instr.Arg2WithLane()
+ var vecOp sseOpcode
+ switch lane {
+ case ssa.VecLaneI8x16:
+ vecOp = sseOpcodePmaxub
+ case ssa.VecLaneI16x8:
+ vecOp = sseOpcodePmaxuw
+ case ssa.VecLaneI32x4:
+ vecOp = sseOpcodePmaxud
+ }
+ m.lowerVbBinOp(vecOp, x, y, instr.Return())
+
+ case ssa.OpcodeVAvgRound:
+ x, y, lane := instr.Arg2WithLane()
+ var vecOp sseOpcode
+ switch lane {
+ case ssa.VecLaneI8x16:
+ vecOp = sseOpcodePavgb
+ case ssa.VecLaneI16x8:
+ vecOp = sseOpcodePavgw
+ }
+ m.lowerVbBinOp(vecOp, x, y, instr.Return())
+
+ case ssa.OpcodeVIcmp:
+ x, y, c, lane := instr.VIcmpData()
+ m.lowerVIcmp(x, y, c, instr.Return(), lane)
+
+ case ssa.OpcodeVFcmp:
+ x, y, c, lane := instr.VFcmpData()
+ m.lowerVFcmp(x, y, c, instr.Return(), lane)
+
+ case ssa.OpcodeExtractlane:
+ x, index, signed, lane := instr.ExtractlaneData()
+ m.lowerExtractLane(x, index, signed, instr.Return(), lane)
+
+ case ssa.OpcodeInsertlane:
+ x, y, index, lane := instr.InsertlaneData()
+ m.lowerInsertLane(x, y, index, instr.Return(), lane)
+
+ case ssa.OpcodeSwizzle:
+ x, y, _ := instr.Arg2WithLane()
+ m.lowerSwizzle(x, y, instr.Return())
+
+ case ssa.OpcodeShuffle:
+ x, y, lo, hi := instr.ShuffleData()
+ m.lowerShuffle(x, y, lo, hi, instr.Return())
+
+ case ssa.OpcodeSplat:
+ x, lane := instr.ArgWithLane()
+ m.lowerSplat(x, instr.Return(), lane)
+
+ case ssa.OpcodeSqmulRoundSat:
+ x, y := instr.Arg2()
+ m.lowerSqmulRoundSat(x, y, instr.Return())
+
+ case ssa.OpcodeVZeroExtLoad:
+ ptr, offset, typ := instr.VZeroExtLoadData()
+ var sseOp sseOpcode
+ // Both movss and movsd clear the higher bits of the destination register, up to 128 bits.
+ // https://www.felixcloutier.com/x86/movss
+ // https://www.felixcloutier.com/x86/movsd
+ if typ == ssa.TypeF32 {
+ sseOp = sseOpcodeMovss
+ } else {
+ sseOp = sseOpcodeMovsd
+ }
+ mem := m.lowerToAddressMode(ptr, offset)
+ dst := m.c.VRegOf(instr.Return())
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, newOperandMem(mem), dst))
+
+ case ssa.OpcodeVMinPseudo:
+ x, y, lane := instr.Arg2WithLane()
+ var vecOp sseOpcode
+ switch lane {
+ case ssa.VecLaneF32x4:
+ vecOp = sseOpcodeMinps
+ case ssa.VecLaneF64x2:
+ vecOp = sseOpcodeMinpd
+ default:
+ panic("BUG: unexpected lane type")
+ }
+ m.lowerVbBinOpUnaligned(vecOp, y, x, instr.Return())
+
+ case ssa.OpcodeVMaxPseudo:
+ x, y, lane := instr.Arg2WithLane()
+ var vecOp sseOpcode
+ switch lane {
+ case ssa.VecLaneF32x4:
+ vecOp = sseOpcodeMaxps
+ case ssa.VecLaneF64x2:
+ vecOp = sseOpcodeMaxpd
+ default:
+ panic("BUG: unexpected lane type")
+ }
+ m.lowerVbBinOpUnaligned(vecOp, y, x, instr.Return())
+
+ case ssa.OpcodeVIshl:
+ x, y, lane := instr.Arg2WithLane()
+ m.lowerVIshl(x, y, instr.Return(), lane)
+
+ case ssa.OpcodeVSshr:
+ x, y, lane := instr.Arg2WithLane()
+ m.lowerVSshr(x, y, instr.Return(), lane)
+
+ case ssa.OpcodeVUshr:
+ x, y, lane := instr.Arg2WithLane()
+ m.lowerVUshr(x, y, instr.Return(), lane)
+
+ case ssa.OpcodeVCeil:
+ x, lane := instr.ArgWithLane()
+ m.lowerVRound(x, instr.Return(), 0x2, lane == ssa.VecLaneF64x2)
+
+ case ssa.OpcodeVFloor:
+ x, lane := instr.ArgWithLane()
+ m.lowerVRound(x, instr.Return(), 0x1, lane == ssa.VecLaneF64x2)
+
+ case ssa.OpcodeVTrunc:
+ x, lane := instr.ArgWithLane()
+ m.lowerVRound(x, instr.Return(), 0x3, lane == ssa.VecLaneF64x2)
+
+ case ssa.OpcodeVNearest:
+ x, lane := instr.ArgWithLane()
+ m.lowerVRound(x, instr.Return(), 0x0, lane == ssa.VecLaneF64x2)
+
+ case ssa.OpcodeExtIaddPairwise:
+ x, lane, signed := instr.ExtIaddPairwiseData()
+ m.lowerExtIaddPairwise(x, instr.Return(), lane, signed)
+
+ case ssa.OpcodeUwidenLow, ssa.OpcodeSwidenLow:
+ x, lane := instr.ArgWithLane()
+ m.lowerWidenLow(x, instr.Return(), lane, op == ssa.OpcodeSwidenLow)
+
+ case ssa.OpcodeUwidenHigh, ssa.OpcodeSwidenHigh:
+ x, lane := instr.ArgWithLane()
+ m.lowerWidenHigh(x, instr.Return(), lane, op == ssa.OpcodeSwidenHigh)
+
+ case ssa.OpcodeLoadSplat:
+ ptr, offset, lane := instr.LoadSplatData()
+ m.lowerLoadSplat(ptr, offset, instr.Return(), lane)
+
+ case ssa.OpcodeVFcvtFromUint, ssa.OpcodeVFcvtFromSint:
+ x, lane := instr.ArgWithLane()
+ m.lowerVFcvtFromInt(x, instr.Return(), lane, op == ssa.OpcodeVFcvtFromSint)
+
+ case ssa.OpcodeVFcvtToSintSat, ssa.OpcodeVFcvtToUintSat:
+ x, lane := instr.ArgWithLane()
+ m.lowerVFcvtToIntSat(x, instr.Return(), lane, op == ssa.OpcodeVFcvtToSintSat)
+
+ case ssa.OpcodeSnarrow, ssa.OpcodeUnarrow:
+ x, y, lane := instr.Arg2WithLane()
+ m.lowerNarrow(x, y, instr.Return(), lane, op == ssa.OpcodeSnarrow)
+
+ case ssa.OpcodeFvpromoteLow:
+ x := instr.Arg()
+ src := m.getOperand_Reg(m.c.ValueDefinition(x))
+ dst := m.c.VRegOf(instr.Return())
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtps2pd, src, dst))
+
+ case ssa.OpcodeFvdemote:
+ x := instr.Arg()
+ src := m.getOperand_Reg(m.c.ValueDefinition(x))
+ dst := m.c.VRegOf(instr.Return())
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtpd2ps, src, dst))
+
+ case ssa.OpcodeWideningPairwiseDotProductS:
+ x, y := instr.Arg2()
+ m.lowerWideningPairwiseDotProductS(x, y, instr.Return())
+
+ case ssa.OpcodeVIabs:
+ m.lowerVIabs(instr)
+ case ssa.OpcodeVIpopcnt:
+ m.lowerVIpopcnt(instr)
+ case ssa.OpcodeVFmin:
+ m.lowerVFmin(instr)
+ case ssa.OpcodeVFmax:
+ m.lowerVFmax(instr)
+ case ssa.OpcodeVFabs:
+ m.lowerVFabs(instr)
+ case ssa.OpcodeUndefined:
+ m.insert(m.allocateInstr().asUD2())
+ case ssa.OpcodeExitWithCode:
+ execCtx, code := instr.ExitWithCodeData()
+ m.lowerExitWithCode(m.c.VRegOf(execCtx), code)
+ case ssa.OpcodeExitIfTrueWithCode:
+ execCtx, c, code := instr.ExitIfTrueWithCodeData()
+ m.lowerExitIfTrueWithCode(m.c.VRegOf(execCtx), c, code)
+ case ssa.OpcodeLoad:
+ ptr, offset, typ := instr.LoadData()
+ dst := m.c.VRegOf(instr.Return())
+ m.lowerLoad(ptr, offset, typ, dst)
+ case ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32, ssa.OpcodeSload8, ssa.OpcodeSload16, ssa.OpcodeSload32:
+ ptr, offset, _ := instr.LoadData()
+ ret := m.c.VRegOf(instr.Return())
+ m.lowerExtLoad(op, ptr, offset, ret)
+ case ssa.OpcodeVconst:
+ result := m.c.VRegOf(instr.Return())
+ lo, hi := instr.VconstData()
+ m.lowerVconst(result, lo, hi)
+ case ssa.OpcodeSExtend, ssa.OpcodeUExtend:
+ from, to, signed := instr.ExtendData()
+ m.lowerExtend(instr.Arg(), instr.Return(), from, to, signed)
+ case ssa.OpcodeIcmp:
+ m.lowerIcmp(instr)
+ case ssa.OpcodeFcmp:
+ m.lowerFcmp(instr)
+ case ssa.OpcodeSelect:
+ cval, x, y := instr.SelectData()
+ m.lowerSelect(x, y, cval, instr.Return())
+ case ssa.OpcodeIreduce:
+ rn := m.getOperand_Mem_Reg(m.c.ValueDefinition(instr.Arg()))
+ retVal := instr.Return()
+ rd := m.c.VRegOf(retVal)
+
+ if retVal.Type() != ssa.TypeI32 {
+ panic("TODO?: Ireduce to non-i32")
+ }
+ m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, rn, rd))
+
+ case ssa.OpcodeAtomicLoad:
+ ptr := instr.Arg()
+ size := instr.AtomicTargetSize()
+ dst := m.c.VRegOf(instr.Return())
+
+ // At this point, the ptr is ensured to be aligned, so using a normal load is atomic.
+ // https://github.com/golang/go/blob/adead1a93f472affa97c494ef19f2f492ee6f34a/src/runtime/internal/atomic/atomic_amd64.go#L30
+ mem := newOperandMem(m.lowerToAddressMode(ptr, 0))
+ load := m.allocateInstr()
+ switch size {
+ case 8:
+ load.asMov64MR(mem, dst)
+ case 4:
+ load.asMovzxRmR(extModeLQ, mem, dst)
+ case 2:
+ load.asMovzxRmR(extModeWQ, mem, dst)
+ case 1:
+ load.asMovzxRmR(extModeBQ, mem, dst)
+ default:
+ panic("BUG")
+ }
+ m.insert(load)
+
+ case ssa.OpcodeFence:
+ m.insert(m.allocateInstr().asMFence())
+
+ case ssa.OpcodeAtomicStore:
+ ptr, _val := instr.Arg2()
+ size := instr.AtomicTargetSize()
+
+ val := m.getOperand_Reg(m.c.ValueDefinition(_val))
+ // The content of the val register will be overwritten by xchg, so we need to copy it to a temporary register.
+ copied := m.copyToTmp(val.reg())
+
+ mem := newOperandMem(m.lowerToAddressMode(ptr, 0))
+ store := m.allocateInstr().asXCHG(copied, mem, byte(size))
+ m.insert(store)
+
+ case ssa.OpcodeAtomicCas:
+ addr, exp, repl := instr.Arg3()
+ size := instr.AtomicTargetSize()
+ m.lowerAtomicCas(addr, exp, repl, size, instr.Return())
+
+ case ssa.OpcodeAtomicRmw:
+ addr, val := instr.Arg2()
+ atomicOp, size := instr.AtomicRmwData()
+ m.lowerAtomicRmw(atomicOp, addr, val, size, instr.Return())
+
+ default:
+ panic("TODO: lowering " + op.String())
+ }
+}
+
+func (m *machine) lowerAtomicRmw(op ssa.AtomicRmwOp, addr, val ssa.Value, size uint64, ret ssa.Value) {
+ mem := m.lowerToAddressMode(addr, 0)
+ _val := m.getOperand_Reg(m.c.ValueDefinition(val))
+
+ switch op {
+ case ssa.AtomicRmwOpAdd, ssa.AtomicRmwOpSub:
+ valCopied := m.copyToTmp(_val.reg())
+ if op == ssa.AtomicRmwOpSub {
+ // Negate the value.
+ m.insert(m.allocateInstr().asNeg(newOperandReg(valCopied), true))
+ }
+ m.insert(m.allocateInstr().asLockXAdd(valCopied, mem, byte(size)))
+ m.clearHigherBitsForAtomic(valCopied, size, ret.Type())
+ m.copyTo(valCopied, m.c.VRegOf(ret))
+
+ case ssa.AtomicRmwOpAnd, ssa.AtomicRmwOpOr, ssa.AtomicRmwOpXor:
+ accumulator := raxVReg
+ // Reserve rax for the accumulator to make regalloc happy.
+ // Note: do this initialization before defining valCopied, because they might end up being the
+ // same register, in which case an unnecessary load/store would be performed inside the loop.
+ // This could be avoided once the register allocator becomes smarter.
+ m.insert(m.allocateInstr().asDefineUninitializedReg(accumulator))
+
+ // Copy the value to a temporary register.
+ valCopied := m.copyToTmp(_val.reg())
+ m.clearHigherBitsForAtomic(valCopied, size, ret.Type())
+
+ memOp := newOperandMem(mem)
+ tmp := m.c.AllocateVReg(ssa.TypeI64)
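+ // Emit a compare-and-swap loop: load the current memory value into the accumulator, apply the
+ // logical operation into tmp, then LOCK CMPXCHG tmp back; retry until the memory was unchanged.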
+ beginLoop, beginLoopLabel := m.allocateBrTarget()
+ {
+ m.insert(beginLoop)
+ // Reset tmp to the original value.
+ m.copyTo(valCopied, tmp)
+ // Load the current value at the memory location into accumulator.
+ switch size {
+ case 1:
+ m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, memOp, accumulator))
+ case 2:
+ m.insert(m.allocateInstr().asMovzxRmR(extModeWQ, memOp, accumulator))
+ case 4:
+ m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, memOp, accumulator))
+ case 8:
+ m.insert(m.allocateInstr().asMov64MR(memOp, accumulator))
+ default:
+ panic("BUG")
+ }
+ // Then perform the logical operation on the accumulator and the value on tmp.
+ switch op {
+ case ssa.AtomicRmwOpAnd:
+ m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, newOperandReg(accumulator), tmp, true))
+ case ssa.AtomicRmwOpOr:
+ m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeOr, newOperandReg(accumulator), tmp, true))
+ case ssa.AtomicRmwOpXor:
+ m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeXor, newOperandReg(accumulator), tmp, true))
+ default:
+ panic("BUG")
+ }
+ // Finally, try compare-exchange the value at the memory location with the tmp.
+ m.insert(m.allocateInstr().asLockCmpXCHG(tmp, memOp.addressMode(), byte(size)))
+ // If it succeeds, ZF is set and we fall through out of the loop; otherwise retry.
+ m.insert(m.allocateInstr().asJmpIf(condNZ, newOperandLabel(beginLoopLabel)))
+ }
+
+ // valCopied must be alive at the end of the loop.
+ m.insert(m.allocateInstr().asNopUseReg(valCopied))
+
+ // At this point, accumulator contains the result.
+ m.clearHigherBitsForAtomic(accumulator, size, ret.Type())
+ m.copyTo(accumulator, m.c.VRegOf(ret))
+
+ case ssa.AtomicRmwOpXchg:
+ valCopied := m.copyToTmp(_val.reg())
+
+ m.insert(m.allocateInstr().asXCHG(valCopied, newOperandMem(mem), byte(size)))
+ m.clearHigherBitsForAtomic(valCopied, size, ret.Type())
+ m.copyTo(valCopied, m.c.VRegOf(ret))
+
+ default:
+ panic("BUG")
+ }
+}
+
+func (m *machine) lowerAtomicCas(addr, exp, repl ssa.Value, size uint64, ret ssa.Value) {
+ mem := m.lowerToAddressMode(addr, 0)
+ expOp := m.getOperand_Reg(m.c.ValueDefinition(exp))
+ replOp := m.getOperand_Reg(m.c.ValueDefinition(repl))
+
+ accumulator := raxVReg
+ m.copyTo(expOp.reg(), accumulator)
+ m.insert(m.allocateInstr().asLockCmpXCHG(replOp.reg(), mem, byte(size)))
+ m.clearHigherBitsForAtomic(accumulator, size, ret.Type())
+ m.copyTo(accumulator, m.c.VRegOf(ret))
+}
+
+func (m *machine) clearHigherBitsForAtomic(r regalloc.VReg, valSize uint64, resultType ssa.Type) {
+ switch resultType {
+ case ssa.TypeI32:
+ switch valSize {
+ case 1:
+ m.insert(m.allocateInstr().asMovzxRmR(extModeBL, newOperandReg(r), r))
+ case 2:
+ m.insert(m.allocateInstr().asMovzxRmR(extModeWL, newOperandReg(r), r))
+ }
+ case ssa.TypeI64:
+ switch valSize {
+ case 1:
+ m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(r), r))
+ case 2:
+ m.insert(m.allocateInstr().asMovzxRmR(extModeWQ, newOperandReg(r), r))
+ case 4:
+ m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, newOperandReg(r), r))
+ }
+ }
+}
+
+func (m *machine) lowerFcmp(instr *ssa.Instruction) {
+ f1, f2, and := m.lowerFcmpToFlags(instr)
+ rd := m.c.VRegOf(instr.Return())
+ if f2 == condInvalid {
+ tmp := m.c.AllocateVReg(ssa.TypeI32)
+ m.insert(m.allocateInstr().asSetcc(f1, tmp))
+ // On amd64, setcc only sets the first byte of the register, so we need to zero extend it to match
+ // the semantics of Fcmp, which produces either 0 or 1.
+ m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(tmp), rd))
+ } else {
+ tmp1, tmp2 := m.c.AllocateVReg(ssa.TypeI32), m.c.AllocateVReg(ssa.TypeI32)
+ m.insert(m.allocateInstr().asSetcc(f1, tmp1))
+ m.insert(m.allocateInstr().asSetcc(f2, tmp2))
+ var op aluRmiROpcode
+ if and {
+ op = aluRmiROpcodeAnd
+ } else {
+ op = aluRmiROpcodeOr
+ }
+ m.insert(m.allocateInstr().asAluRmiR(op, newOperandReg(tmp1), tmp2, false))
+ m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(tmp2), rd))
+ }
+}
+
+func (m *machine) lowerIcmp(instr *ssa.Instruction) {
+ x, y, c := instr.IcmpData()
+ m.lowerIcmpToFlag(m.c.ValueDefinition(x), m.c.ValueDefinition(y), x.Type() == ssa.TypeI64)
+ rd := m.c.VRegOf(instr.Return())
+ tmp := m.c.AllocateVReg(ssa.TypeI32)
+ m.insert(m.allocateInstr().asSetcc(condFromSSAIntCmpCond(c), tmp))
+	// On amd64, setcc only sets the lowest byte of the register, so we need to zero-extend it
+	// to match the semantics of Icmp, which produces either 0 or 1.
+ m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(tmp), rd))
+}
+
+func (m *machine) lowerSelect(x, y, cval, ret ssa.Value) {
+ xo, yo := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)), m.getOperand_Reg(m.c.ValueDefinition(y))
+ rd := m.c.VRegOf(ret)
+
+ var cond cond
+ cvalDef := m.c.ValueDefinition(cval)
+ switch m.c.MatchInstrOneOf(cvalDef, condBranchMatches[:]) {
+ case ssa.OpcodeIcmp:
+ icmp := cvalDef.Instr
+ xc, yc, cc := icmp.IcmpData()
+ m.lowerIcmpToFlag(m.c.ValueDefinition(xc), m.c.ValueDefinition(yc), xc.Type() == ssa.TypeI64)
+ cond = condFromSSAIntCmpCond(cc)
+ icmp.Lowered()
+ default: // TODO: match ssa.OpcodeFcmp for optimization, but seems a bit complex.
+ cv := m.getOperand_Reg(cvalDef)
+ test := m.allocateInstr().asCmpRmiR(false, cv, cv.reg(), false)
+ m.insert(test)
+ cond = condNZ
+ }
+
+ if typ := x.Type(); typ.IsInt() {
+ _64 := typ.Bits() == 64
+ mov := m.allocateInstr()
+ tmp := m.c.AllocateVReg(typ)
+ switch yo.kind {
+ case operandKindReg:
+ mov.asMovRR(yo.reg(), tmp, _64)
+ case operandKindMem:
+ if _64 {
+ mov.asMov64MR(yo, tmp)
+ } else {
+ mov.asMovzxRmR(extModeLQ, yo, tmp)
+ }
+ default:
+ panic("BUG")
+ }
+ m.insert(mov)
+ cmov := m.allocateInstr().asCmove(cond, xo, tmp, _64)
+ m.insert(cmov)
+ m.insert(m.allocateInstr().asMovRR(tmp, rd, _64))
+ } else {
+ mov := m.allocateInstr()
+ tmp := m.c.AllocateVReg(typ)
+ switch typ {
+ case ssa.TypeF32:
+ mov.asXmmUnaryRmR(sseOpcodeMovss, yo, tmp)
+ case ssa.TypeF64:
+ mov.asXmmUnaryRmR(sseOpcodeMovsd, yo, tmp)
+ case ssa.TypeV128:
+ mov.asXmmUnaryRmR(sseOpcodeMovdqu, yo, tmp)
+ default:
+ panic("BUG")
+ }
+ m.insert(mov)
+
+ cmov := m.allocateInstr().asXmmCMov(cond, xo, tmp, typ.Size())
+ m.insert(cmov)
+
+ m.copyTo(tmp, rd)
+ }
+}
+
+func (m *machine) lowerXmmCmovAfterRegAlloc(i *instruction) {
+ x := i.op1
+ rd := i.op2.reg()
+ cond := cond(i.u1)
+
+ jcc := m.allocateInstr()
+ m.insert(jcc)
+
+ mov := m.allocateInstr()
+ switch i.u2 {
+ case 4:
+ mov.asXmmUnaryRmR(sseOpcodeMovss, x, rd)
+ case 8:
+ mov.asXmmUnaryRmR(sseOpcodeMovsd, x, rd)
+ case 16:
+ mov.asXmmUnaryRmR(sseOpcodeMovdqu, x, rd)
+ default:
+ panic("BUG")
+ }
+ m.insert(mov)
+
+ nop, end := m.allocateBrTarget()
+ m.insert(nop)
+ jcc.asJmpIf(cond.invert(), newOperandLabel(end))
+}
+
+func (m *machine) lowerExtend(_arg, ret ssa.Value, from, to byte, signed bool) {
+ rd0 := m.c.VRegOf(ret)
+ arg := m.getOperand_Mem_Reg(m.c.ValueDefinition(_arg))
+
+ rd := m.c.AllocateVReg(ret.Type())
+
+ ext := m.allocateInstr()
+ switch {
+ case from == 8 && to == 16 && signed:
+ ext.asMovsxRmR(extModeBQ, arg, rd)
+ case from == 8 && to == 16 && !signed:
+ ext.asMovzxRmR(extModeBL, arg, rd)
+ case from == 8 && to == 32 && signed:
+ ext.asMovsxRmR(extModeBL, arg, rd)
+ case from == 8 && to == 32 && !signed:
+ ext.asMovzxRmR(extModeBQ, arg, rd)
+ case from == 8 && to == 64 && signed:
+ ext.asMovsxRmR(extModeBQ, arg, rd)
+ case from == 8 && to == 64 && !signed:
+ ext.asMovzxRmR(extModeBQ, arg, rd)
+ case from == 16 && to == 32 && signed:
+ ext.asMovsxRmR(extModeWL, arg, rd)
+ case from == 16 && to == 32 && !signed:
+ ext.asMovzxRmR(extModeWL, arg, rd)
+ case from == 16 && to == 64 && signed:
+ ext.asMovsxRmR(extModeWQ, arg, rd)
+ case from == 16 && to == 64 && !signed:
+ ext.asMovzxRmR(extModeWQ, arg, rd)
+ case from == 32 && to == 64 && signed:
+ ext.asMovsxRmR(extModeLQ, arg, rd)
+ case from == 32 && to == 64 && !signed:
+ ext.asMovzxRmR(extModeLQ, arg, rd)
+ default:
+ panic(fmt.Sprintf("BUG: unhandled extend: from=%d, to=%d, signed=%t", from, to, signed))
+ }
+ m.insert(ext)
+
+ m.copyTo(rd, rd0)
+}
+
+func (m *machine) lowerVconst(dst regalloc.VReg, lo, hi uint64) {
+ if lo == 0 && hi == 0 {
+ m.insert(m.allocateInstr().asZeros(dst))
+ return
+ }
+
+ load := m.allocateInstr()
+ constLabel := m.allocateLabel()
+ m.consts = append(m.consts, _const{label: constLabel, lo: lo, hi: hi})
+ load.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(constLabel.L)), dst)
+ m.insert(load)
+}
+
+func (m *machine) lowerCtz(instr *ssa.Instruction) {
+ if m.cpuFeatures.HasExtra(platform.CpuExtraFeatureAmd64ABM) {
+ m.lowerUnaryRmR(instr, unaryRmROpcodeTzcnt)
+ } else {
+		// On processors that do not support TZCNT, the BSF instruction is
+		// executed instead. The key difference between TZCNT and BSF is that
+		// BSF leaves the destination undefined when the source operand is zero.
+		// https://www.felixcloutier.com/x86/tzcnt.html
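+		// We therefore branch explicitly: when the input is zero, the result is defined to be
+		// the bit width of the type; otherwise BSF directly yields the trailing-zero count.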
+
+ x := instr.Arg()
+ if !x.Type().IsInt() {
+ panic("BUG?")
+ }
+ _64 := x.Type().Bits() == 64
+
+ xDef := m.c.ValueDefinition(x)
+ tmp := m.c.AllocateVReg(x.Type())
+ rm := m.getOperand_Reg(xDef)
+
+ // First, we have to check if the target is non-zero.
+ test := m.allocateInstr()
+ test.asCmpRmiR(false, rm, rm.reg(), _64)
+ m.insert(test)
+
+ jmpNz := m.allocateInstr()
+ m.insert(jmpNz)
+
+		// If the value is zero, the result is the bit width of the type, so load that constant.
+ m.lowerIconst(tmp, uint64(x.Type().Bits()), _64)
+
+ // Now jump right after the non-zero case.
+ jmpAtEnd := m.allocateInstr()
+ m.insert(jmpAtEnd)
+
+ // jmpNz target label is set here.
+ nop, nz := m.allocateBrTarget()
+ jmpNz.asJmpIf(condNZ, newOperandLabel(nz))
+ m.insert(nop)
+
+ // Emit the non-zero case.
+ bsr := m.allocateInstr()
+ bsr.asUnaryRmR(unaryRmROpcodeBsf, rm, tmp, _64)
+ m.insert(bsr)
+
+ // jmpAtEnd target label is set here.
+ nopEnd, end := m.allocateBrTarget()
+ jmpAtEnd.asJmp(newOperandLabel(end))
+ m.insert(nopEnd)
+
+ m.copyTo(tmp, m.c.VRegOf(instr.Return()))
+ }
+}
+
+func (m *machine) lowerClz(instr *ssa.Instruction) {
+ if m.cpuFeatures.HasExtra(platform.CpuExtraFeatureAmd64ABM) {
+ m.lowerUnaryRmR(instr, unaryRmROpcodeLzcnt)
+ } else {
+		// On processors that do not support LZCNT, we combine BSR (which computes the
+		// index of the most significant set bit) with an XOR. This logic is described in
+		// the "Replace Raw Assembly Code with Builtin Intrinsics" section of:
+		// https://developer.apple.com/documentation/apple-silicon/addressing-architectural-differences-in-your-macos-code.
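+		// Since BSR returns the index of the most significant set bit, for a non-zero x we have
+		// clz(x) = (bits-1) - bsr(x), and because bsr(x) <= bits-1, the subtraction can be done
+		// with an XOR against bits-1.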
+
+ x := instr.Arg()
+ if !x.Type().IsInt() {
+ panic("BUG?")
+ }
+ _64 := x.Type().Bits() == 64
+
+ xDef := m.c.ValueDefinition(x)
+ rm := m.getOperand_Reg(xDef)
+ tmp := m.c.AllocateVReg(x.Type())
+
+ // First, we have to check if the rm is non-zero as BSR is undefined
+ // on zero. See https://www.felixcloutier.com/x86/bsr.
+ test := m.allocateInstr()
+ test.asCmpRmiR(false, rm, rm.reg(), _64)
+ m.insert(test)
+
+ jmpNz := m.allocateInstr()
+ m.insert(jmpNz)
+
+		// If the value is zero, the result is the bit width of the type, so load that constant.
+ m.lowerIconst(tmp, uint64(x.Type().Bits()), _64)
+
+ // Now jump right after the non-zero case.
+ jmpAtEnd := m.allocateInstr()
+ m.insert(jmpAtEnd)
+
+ // jmpNz target label is set here.
+ nop, nz := m.allocateBrTarget()
+ jmpNz.asJmpIf(condNZ, newOperandLabel(nz))
+ m.insert(nop)
+
+ // Emit the non-zero case.
+ bsr := m.allocateInstr()
+ bsr.asUnaryRmR(unaryRmROpcodeBsr, rm, tmp, _64)
+ m.insert(bsr)
+
+ // Now we XOR the value with the bit length minus one.
+ xor := m.allocateInstr()
+ xor.asAluRmiR(aluRmiROpcodeXor, newOperandImm32(uint32(x.Type().Bits()-1)), tmp, _64)
+ m.insert(xor)
+
+ // jmpAtEnd target label is set here.
+ nopEnd, end := m.allocateBrTarget()
+ jmpAtEnd.asJmp(newOperandLabel(end))
+ m.insert(nopEnd)
+
+ m.copyTo(tmp, m.c.VRegOf(instr.Return()))
+ }
+}
+
+func (m *machine) lowerUnaryRmR(si *ssa.Instruction, op unaryRmROpcode) {
+ x := si.Arg()
+ if !x.Type().IsInt() {
+ panic("BUG?")
+ }
+ _64 := x.Type().Bits() == 64
+
+ xDef := m.c.ValueDefinition(x)
+ rm := m.getOperand_Mem_Reg(xDef)
+ rd := m.c.VRegOf(si.Return())
+
+ instr := m.allocateInstr()
+ instr.asUnaryRmR(op, rm, rd, _64)
+ m.insert(instr)
+}
+
+func (m *machine) lowerLoad(ptr ssa.Value, offset uint32, typ ssa.Type, dst regalloc.VReg) {
+ mem := newOperandMem(m.lowerToAddressMode(ptr, offset))
+ load := m.allocateInstr()
+ switch typ {
+ case ssa.TypeI32:
+ load.asMovzxRmR(extModeLQ, mem, dst)
+ case ssa.TypeI64:
+ load.asMov64MR(mem, dst)
+ case ssa.TypeF32:
+ load.asXmmUnaryRmR(sseOpcodeMovss, mem, dst)
+ case ssa.TypeF64:
+ load.asXmmUnaryRmR(sseOpcodeMovsd, mem, dst)
+ case ssa.TypeV128:
+ load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, dst)
+ default:
+ panic("BUG")
+ }
+ m.insert(load)
+}
+
+func (m *machine) lowerExtLoad(op ssa.Opcode, ptr ssa.Value, offset uint32, dst regalloc.VReg) {
+ mem := newOperandMem(m.lowerToAddressMode(ptr, offset))
+ load := m.allocateInstr()
+ switch op {
+ case ssa.OpcodeUload8:
+ load.asMovzxRmR(extModeBQ, mem, dst)
+ case ssa.OpcodeUload16:
+ load.asMovzxRmR(extModeWQ, mem, dst)
+ case ssa.OpcodeUload32:
+ load.asMovzxRmR(extModeLQ, mem, dst)
+ case ssa.OpcodeSload8:
+ load.asMovsxRmR(extModeBQ, mem, dst)
+ case ssa.OpcodeSload16:
+ load.asMovsxRmR(extModeWQ, mem, dst)
+ case ssa.OpcodeSload32:
+ load.asMovsxRmR(extModeLQ, mem, dst)
+ default:
+ panic("BUG")
+ }
+ m.insert(load)
+}
+
+func (m *machine) lowerExitIfTrueWithCode(execCtx regalloc.VReg, cond ssa.Value, code wazevoapi.ExitCode) {
+ condDef := m.c.ValueDefinition(cond)
+ if !m.c.MatchInstr(condDef, ssa.OpcodeIcmp) {
+ panic("TODO: ExitIfTrue must come after Icmp at the moment: " + condDef.Instr.Opcode().String())
+ }
+ cvalInstr := condDef.Instr
+ cvalInstr.MarkLowered()
+
+ // We need to copy the execution context to a temp register, because if it's spilled,
+ // it might end up being reloaded inside the exiting branch.
+ execCtxTmp := m.copyToTmp(execCtx)
+
+ x, y, c := cvalInstr.IcmpData()
+ xx, yy := m.c.ValueDefinition(x), m.c.ValueDefinition(y)
+ if !m.tryLowerBandToFlag(xx, yy) {
+ m.lowerIcmpToFlag(xx, yy, x.Type() == ssa.TypeI64)
+ }
+
+ jmpIf := m.allocateInstr()
+ m.insert(jmpIf)
+ l := m.lowerExitWithCode(execCtxTmp, code)
+ jmpIf.asJmpIf(condFromSSAIntCmpCond(c).invert(), newOperandLabel(l))
+}
+
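+// tryLowerBandToFlag pattern-matches comparisons of the form `band(x, y) == 0` (one side of
+// the Icmp being the constant zero) and lowers them to a single flag-setting compare of the
+// Band operands, so the Band result never needs to be materialized in a register.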
+func (m *machine) tryLowerBandToFlag(x, y *backend.SSAValueDefinition) (ok bool) {
+ var target *backend.SSAValueDefinition
+ if x.IsFromInstr() && x.Instr.Constant() && x.Instr.ConstantVal() == 0 {
+ if m.c.MatchInstr(y, ssa.OpcodeBand) {
+ target = y
+ }
+ }
+
+ if y.IsFromInstr() && y.Instr.Constant() && y.Instr.ConstantVal() == 0 {
+ if m.c.MatchInstr(x, ssa.OpcodeBand) {
+ target = x
+ }
+ }
+
+ if target == nil {
+ return false
+ }
+
+ bandInstr := target.Instr
+ bandX, bandY := bandInstr.Arg2()
+
+ xx := m.getOperand_Reg(m.c.ValueDefinition(bandX))
+ yy := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(bandY))
+ test := m.allocateInstr().asCmpRmiR(false, yy, xx.reg(), bandX.Type() == ssa.TypeI64)
+ m.insert(test)
+ bandInstr.MarkLowered()
+ return true
+}
+
+func (m *machine) allocateExitInstructions(execCtx, exitCodeReg regalloc.VReg) (saveRsp, saveRbp, setExitCode *instruction) {
+ saveRsp = m.allocateInstr().asMovRM(
+ rspVReg,
+ newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.U32(), execCtx)),
+ 8,
+ )
+
+ saveRbp = m.allocateInstr().asMovRM(
+ rbpVReg,
+ newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetFramePointerBeforeGoCall.U32(), execCtx)),
+ 8,
+ )
+ setExitCode = m.allocateInstr().asMovRM(
+ exitCodeReg,
+ newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetExitCodeOffset.U32(), execCtx)),
+ 4,
+ )
+ return
+}
+
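+// lowerExitWithCode emits the sequence that exits from machine code back into the Go runtime:
+// it saves RSP, RBP and the current address into the execution context, stores the exit code,
+// and then executes the exit sequence. The returned label marks the continuation point right
+// after the sequence, which callers can use as a jump target.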
+func (m *machine) lowerExitWithCode(execCtx regalloc.VReg, code wazevoapi.ExitCode) (afterLabel backend.Label) {
+ exitCodeReg := rbpVReg
+ saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtx, exitCodeReg)
+
+	// Save RSP and RBP, and write the exit code.
+ m.insert(saveRsp)
+ m.insert(saveRbp)
+ m.lowerIconst(exitCodeReg, uint64(code), false)
+ m.insert(setExitCode)
+
+ ripReg := rbpVReg
+
+	// Next, save the current address for stack unwinding.
+ nop, currentAddrLabel := m.allocateBrTarget()
+ m.insert(nop)
+ readRip := m.allocateInstr().asLEA(newOperandLabel(currentAddrLabel), ripReg)
+ m.insert(readRip)
+ saveRip := m.allocateInstr().asMovRM(
+ ripReg,
+ newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetGoCallReturnAddress.U32(), execCtx)),
+ 8,
+ )
+ m.insert(saveRip)
+
+ // Finally exit.
+ exitSq := m.allocateExitSeq(execCtx)
+ m.insert(exitSq)
+
+ // Return the label for continuation.
+ continuation, afterLabel := m.allocateBrTarget()
+ m.insert(continuation)
+ return afterLabel
+}
+
+func (m *machine) lowerAluRmiROp(si *ssa.Instruction, op aluRmiROpcode) {
+ x, y := si.Arg2()
+ if !x.Type().IsInt() {
+ panic("BUG?")
+ }
+
+ _64 := x.Type().Bits() == 64
+
+ xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y)
+
+ // TODO: commutative args can be swapped if one of them is an immediate.
+ rn := m.getOperand_Reg(xDef)
+ rm := m.getOperand_Mem_Imm32_Reg(yDef)
+ rd := m.c.VRegOf(si.Return())
+
+ // rn is being overwritten, so we first copy its value to a temp register,
+ // in case it is referenced again later.
+ tmp := m.copyToTmp(rn.reg())
+
+ alu := m.allocateInstr()
+ alu.asAluRmiR(op, rm, tmp, _64)
+ m.insert(alu)
+
+ // tmp now contains the result, we copy it to the dest register.
+ m.copyTo(tmp, rd)
+}
+
+func (m *machine) lowerShiftR(si *ssa.Instruction, op shiftROp) {
+ x, amt := si.Arg2()
+ if !x.Type().IsInt() {
+ panic("BUG?")
+ }
+ _64 := x.Type().Bits() == 64
+
+ xDef, amtDef := m.c.ValueDefinition(x), m.c.ValueDefinition(amt)
+
+ opAmt := m.getOperand_Imm32_Reg(amtDef)
+ rx := m.getOperand_Reg(xDef)
+ rd := m.c.VRegOf(si.Return())
+
+ // rx is being overwritten, so we first copy its value to a temp register,
+ // in case it is referenced again later.
+ tmpDst := m.copyToTmp(rx.reg())
+
+ if opAmt.kind == operandKindReg {
+ // If opAmt is a register we must copy its value to rcx,
+ // because shiftR encoding mandates that the shift amount is in rcx.
+ m.copyTo(opAmt.reg(), rcxVReg)
+
+ alu := m.allocateInstr()
+ alu.asShiftR(op, newOperandReg(rcxVReg), tmpDst, _64)
+ m.insert(alu)
+
+ } else {
+ alu := m.allocateInstr()
+ alu.asShiftR(op, opAmt, tmpDst, _64)
+ m.insert(alu)
+ }
+
+ // tmp now contains the result, we copy it to the dest register.
+ m.copyTo(tmpDst, rd)
+}
+
+func (m *machine) lowerXmmRmR(instr *ssa.Instruction) {
+ x, y := instr.Arg2()
+ if !x.Type().IsFloat() {
+ panic("BUG?")
+ }
+ _64 := x.Type().Bits() == 64
+
+ var op sseOpcode
+ if _64 {
+ switch instr.Opcode() {
+ case ssa.OpcodeFadd:
+ op = sseOpcodeAddsd
+ case ssa.OpcodeFsub:
+ op = sseOpcodeSubsd
+ case ssa.OpcodeFmul:
+ op = sseOpcodeMulsd
+ case ssa.OpcodeFdiv:
+ op = sseOpcodeDivsd
+ default:
+ panic("BUG")
+ }
+ } else {
+ switch instr.Opcode() {
+ case ssa.OpcodeFadd:
+ op = sseOpcodeAddss
+ case ssa.OpcodeFsub:
+ op = sseOpcodeSubss
+ case ssa.OpcodeFmul:
+ op = sseOpcodeMulss
+ case ssa.OpcodeFdiv:
+ op = sseOpcodeDivss
+ default:
+ panic("BUG")
+ }
+ }
+
+ xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y)
+ rn := m.getOperand_Reg(yDef)
+ rm := m.getOperand_Reg(xDef)
+ rd := m.c.VRegOf(instr.Return())
+
+ // rm is being overwritten, so we first copy its value to a temp register,
+ // in case it is referenced again later.
+ tmp := m.copyToTmp(rm.reg())
+
+ xmm := m.allocateInstr().asXmmRmR(op, rn, tmp)
+ m.insert(xmm)
+
+ m.copyTo(tmp, rd)
+}
+
+func (m *machine) lowerSqrt(instr *ssa.Instruction) {
+ x := instr.Arg()
+ if !x.Type().IsFloat() {
+ panic("BUG")
+ }
+ _64 := x.Type().Bits() == 64
+ var op sseOpcode
+ if _64 {
+ op = sseOpcodeSqrtsd
+ } else {
+ op = sseOpcodeSqrtss
+ }
+
+ xDef := m.c.ValueDefinition(x)
+ rm := m.getOperand_Mem_Reg(xDef)
+ rd := m.c.VRegOf(instr.Return())
+
+ xmm := m.allocateInstr().asXmmUnaryRmR(op, rm, rd)
+ m.insert(xmm)
+}
+
+func (m *machine) lowerFabsFneg(instr *ssa.Instruction) {
+ x := instr.Arg()
+ if !x.Type().IsFloat() {
+ panic("BUG")
+ }
+ _64 := x.Type().Bits() == 64
+ var op sseOpcode
+ var mask uint64
+ if _64 {
+ switch instr.Opcode() {
+ case ssa.OpcodeFabs:
+ mask, op = 0x7fffffffffffffff, sseOpcodeAndpd
+ case ssa.OpcodeFneg:
+ mask, op = 0x8000000000000000, sseOpcodeXorpd
+ }
+ } else {
+ switch instr.Opcode() {
+ case ssa.OpcodeFabs:
+ mask, op = 0x7fffffff, sseOpcodeAndps
+ case ssa.OpcodeFneg:
+ mask, op = 0x80000000, sseOpcodeXorps
+ }
+ }
+
+ tmp := m.c.AllocateVReg(x.Type())
+
+ xDef := m.c.ValueDefinition(x)
+ rm := m.getOperand_Reg(xDef)
+ rd := m.c.VRegOf(instr.Return())
+
+ m.lowerFconst(tmp, mask, _64)
+
+ xmm := m.allocateInstr().asXmmRmR(op, rm, tmp)
+ m.insert(xmm)
+
+ m.copyTo(tmp, rd)
+}
+
+func (m *machine) lowerStore(si *ssa.Instruction) {
+ value, ptr, offset, storeSizeInBits := si.StoreData()
+ rm := m.getOperand_Reg(m.c.ValueDefinition(value))
+ mem := newOperandMem(m.lowerToAddressMode(ptr, offset))
+
+ store := m.allocateInstr()
+ switch value.Type() {
+ case ssa.TypeI32:
+ store.asMovRM(rm.reg(), mem, storeSizeInBits/8)
+ case ssa.TypeI64:
+ store.asMovRM(rm.reg(), mem, storeSizeInBits/8)
+ case ssa.TypeF32:
+ store.asXmmMovRM(sseOpcodeMovss, rm.reg(), mem)
+ case ssa.TypeF64:
+ store.asXmmMovRM(sseOpcodeMovsd, rm.reg(), mem)
+ case ssa.TypeV128:
+ store.asXmmMovRM(sseOpcodeMovdqu, rm.reg(), mem)
+ default:
+ panic("BUG")
+ }
+ m.insert(store)
+}
+
+func (m *machine) lowerCall(si *ssa.Instruction) {
+ isDirectCall := si.Opcode() == ssa.OpcodeCall
+ var indirectCalleePtr ssa.Value
+ var directCallee ssa.FuncRef
+ var sigID ssa.SignatureID
+ var args []ssa.Value
+ var isMemmove bool
+ if isDirectCall {
+ directCallee, sigID, args = si.CallData()
+ } else {
+ indirectCalleePtr, sigID, args, isMemmove = si.CallIndirectData()
+ }
+ calleeABI := m.c.GetFunctionABI(m.c.SSABuilder().ResolveSignature(sigID))
+
+ stackSlotSize := int64(calleeABI.AlignedArgResultStackSlotSize())
+ if m.maxRequiredStackSizeForCalls < stackSlotSize+16 {
+ m.maxRequiredStackSizeForCalls = stackSlotSize + 16 // 16 == return address + RBP.
+ }
+
+ // Note: See machine.SetupPrologue for the stack layout.
+ // The stack pointer decrease/increase will be inserted later in the compilation.
+
+ for i, arg := range args {
+ reg := m.c.VRegOf(arg)
+ def := m.c.ValueDefinition(arg)
+ m.callerGenVRegToFunctionArg(calleeABI, i, reg, def, stackSlotSize)
+ }
+
+ if isMemmove {
+		// Go's memmove *might* use all of xmm0-xmm15, so we must treat them as clobbered around the call.
+ // https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/cmd/compile/abi-internal.md#architecture-specifics
+ // https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/runtime/memmove_amd64.s#L271-L286
+ for i := regalloc.RealReg(0); i < 16; i++ {
+ m.insert(m.allocateInstr().asDefineUninitializedReg(regInfo.RealRegToVReg[xmm0+i]))
+ }
+ }
+
+ if isDirectCall {
+ call := m.allocateInstr().asCall(directCallee, calleeABI)
+ m.insert(call)
+ } else {
+ ptrOp := m.getOperand_Mem_Reg(m.c.ValueDefinition(indirectCalleePtr))
+ callInd := m.allocateInstr().asCallIndirect(ptrOp, calleeABI)
+ m.insert(callInd)
+ }
+
+ if isMemmove {
+ for i := regalloc.RealReg(0); i < 16; i++ {
+ m.insert(m.allocateInstr().asNopUseReg(regInfo.RealRegToVReg[xmm0+i]))
+ }
+ }
+
+ var index int
+ r1, rs := si.Returns()
+ if r1.Valid() {
+ m.callerGenFunctionReturnVReg(calleeABI, 0, m.c.VRegOf(r1), stackSlotSize)
+ index++
+ }
+
+ for _, r := range rs {
+ m.callerGenFunctionReturnVReg(calleeABI, index, m.c.VRegOf(r), stackSlotSize)
+ index++
+ }
+}
+
+// callerGenVRegToFunctionArg is the counterpart of GenFunctionArgToVReg: it generates the caller
+// side of the function call, moving each argument value into the register or stack slot where the
+// callee's ABI expects it.
+func (m *machine) callerGenVRegToFunctionArg(a *backend.FunctionABI, argIndex int, reg regalloc.VReg, def *backend.SSAValueDefinition, stackSlotSize int64) {
+ arg := &a.Args[argIndex]
+ if def != nil && def.IsFromInstr() {
+ // Constant instructions are inlined.
+ if inst := def.Instr; inst.Constant() {
+ m.insertLoadConstant(inst, reg)
+ }
+ }
+ if arg.Kind == backend.ABIArgKindReg {
+ m.InsertMove(arg.Reg, reg, arg.Type)
+ } else {
+ store := m.allocateInstr()
+ mem := newOperandMem(m.newAmodeImmReg(
+ // -stackSlotSize because the stack pointer is not yet decreased.
+ uint32(arg.Offset-stackSlotSize), rspVReg))
+ switch arg.Type {
+ case ssa.TypeI32:
+ store.asMovRM(reg, mem, 4)
+ case ssa.TypeI64:
+ store.asMovRM(reg, mem, 8)
+ case ssa.TypeF32:
+ store.asXmmMovRM(sseOpcodeMovss, reg, mem)
+ case ssa.TypeF64:
+ store.asXmmMovRM(sseOpcodeMovsd, reg, mem)
+ case ssa.TypeV128:
+ store.asXmmMovRM(sseOpcodeMovdqu, reg, mem)
+ default:
+ panic("BUG")
+ }
+ m.insert(store)
+ }
+}
+
+func (m *machine) callerGenFunctionReturnVReg(a *backend.FunctionABI, retIndex int, reg regalloc.VReg, stackSlotSize int64) {
+ r := &a.Rets[retIndex]
+ if r.Kind == backend.ABIArgKindReg {
+ m.InsertMove(reg, r.Reg, r.Type)
+ } else {
+ load := m.allocateInstr()
+ mem := newOperandMem(m.newAmodeImmReg(
+ // -stackSlotSize because the stack pointer is not yet decreased.
+ uint32(a.ArgStackSize+r.Offset-stackSlotSize), rspVReg))
+ switch r.Type {
+ case ssa.TypeI32:
+ load.asMovzxRmR(extModeLQ, mem, reg)
+ case ssa.TypeI64:
+ load.asMov64MR(mem, reg)
+ case ssa.TypeF32:
+ load.asXmmUnaryRmR(sseOpcodeMovss, mem, reg)
+ case ssa.TypeF64:
+ load.asXmmUnaryRmR(sseOpcodeMovsd, mem, reg)
+ case ssa.TypeV128:
+ load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, reg)
+ default:
+ panic("BUG")
+ }
+ m.insert(load)
+ }
+}
+
+// InsertMove implements backend.Machine.
+func (m *machine) InsertMove(dst, src regalloc.VReg, typ ssa.Type) {
+ switch typ {
+ case ssa.TypeI32, ssa.TypeI64:
+ i := m.allocateInstr().asMovRR(src, dst, typ.Bits() == 64)
+ m.insert(i)
+ case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
+ var op sseOpcode
+ switch typ {
+ case ssa.TypeF32:
+ op = sseOpcodeMovss
+ case ssa.TypeF64:
+ op = sseOpcodeMovsd
+ case ssa.TypeV128:
+ op = sseOpcodeMovdqa
+ }
+ i := m.allocateInstr().asXmmUnaryRmR(op, newOperandReg(src), dst)
+ m.insert(i)
+ default:
+ panic("BUG")
+ }
+}
+
+// Format implements backend.Machine.
+func (m *machine) Format() string {
+ ectx := m.ectx
+ begins := map[*instruction]backend.Label{}
+ for l, pos := range ectx.LabelPositions {
+ begins[pos.Begin] = l
+ }
+
+ irBlocks := map[backend.Label]ssa.BasicBlockID{}
+ for i, l := range ectx.SsaBlockIDToLabels {
+ irBlocks[l] = ssa.BasicBlockID(i)
+ }
+
+ var lines []string
+ for cur := ectx.RootInstr; cur != nil; cur = cur.next {
+ if l, ok := begins[cur]; ok {
+ var labelStr string
+ if blkID, ok := irBlocks[l]; ok {
+ labelStr = fmt.Sprintf("%s (SSA Block: %s):", l, blkID)
+ } else {
+ labelStr = fmt.Sprintf("%s:", l)
+ }
+ lines = append(lines, labelStr)
+ }
+ if cur.kind == nop0 {
+ continue
+ }
+ lines = append(lines, "\t"+cur.String())
+ }
+ for _, vc := range m.consts {
+ if vc._var == nil {
+ lines = append(lines, fmt.Sprintf("%s: const [%d %d]", vc.label.L, vc.lo, vc.hi))
+ } else {
+ lines = append(lines, fmt.Sprintf("%s: const %#x", vc.label.L, vc._var))
+ }
+ }
+ return "\n" + strings.Join(lines, "\n") + "\n"
+}
+
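+// encodeWithoutSSA encodes the linked list of instructions starting at root. Branches that
+// reference labels are emitted with placeholder 4-byte immediates and recorded in
+// labelResolutionPends; once all label offsets are known, each placeholder is patched with the
+// RIP-relative displacement to its target.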
+func (m *machine) encodeWithoutSSA(root *instruction) {
+ m.labelResolutionPends = m.labelResolutionPends[:0]
+ ectx := m.ectx
+
+ bufPtr := m.c.BufPtr()
+ for cur := root; cur != nil; cur = cur.next {
+ offset := int64(len(*bufPtr))
+ if cur.kind == nop0 {
+ l := cur.nop0Label()
+ if pos, ok := ectx.LabelPositions[l]; ok {
+ pos.BinaryOffset = offset
+ }
+ }
+
+ needLabelResolution := cur.encode(m.c)
+ if needLabelResolution {
+ m.labelResolutionPends = append(m.labelResolutionPends,
+ labelResolutionPend{instr: cur, imm32Offset: int64(len(*bufPtr)) - 4},
+ )
+ }
+ }
+
+ for i := range m.labelResolutionPends {
+ p := &m.labelResolutionPends[i]
+ switch p.instr.kind {
+ case jmp, jmpIf, lea:
+ target := p.instr.jmpLabel()
+ targetOffset := ectx.LabelPositions[target].BinaryOffset
+ imm32Offset := p.imm32Offset
+ jmpOffset := int32(targetOffset - (p.imm32Offset + 4)) // +4 because RIP points to the next instruction.
+ binary.LittleEndian.PutUint32((*bufPtr)[imm32Offset:], uint32(jmpOffset))
+ default:
+ panic("BUG")
+ }
+ }
+}
+
+// Encode implements backend.Machine Encode.
+func (m *machine) Encode(ctx context.Context) (err error) {
+ ectx := m.ectx
+ bufPtr := m.c.BufPtr()
+
+ var fn string
+ var fnIndex int
+ var labelToSSABlockID map[backend.Label]ssa.BasicBlockID
+ if wazevoapi.PerfMapEnabled {
+ fn = wazevoapi.GetCurrentFunctionName(ctx)
+ labelToSSABlockID = make(map[backend.Label]ssa.BasicBlockID)
+ for i, l := range ectx.SsaBlockIDToLabels {
+ labelToSSABlockID[l] = ssa.BasicBlockID(i)
+ }
+ fnIndex = wazevoapi.GetCurrentFunctionIndex(ctx)
+ }
+
+ m.labelResolutionPends = m.labelResolutionPends[:0]
+ for _, pos := range ectx.OrderedBlockLabels {
+ offset := int64(len(*bufPtr))
+ pos.BinaryOffset = offset
+ for cur := pos.Begin; cur != pos.End.next; cur = cur.next {
+ offset := int64(len(*bufPtr))
+
+ switch cur.kind {
+ case nop0:
+ l := cur.nop0Label()
+ if pos, ok := ectx.LabelPositions[l]; ok {
+ pos.BinaryOffset = offset
+ }
+ case sourceOffsetInfo:
+ m.c.AddSourceOffsetInfo(offset, cur.sourceOffsetInfo())
+ }
+
+ needLabelResolution := cur.encode(m.c)
+ if needLabelResolution {
+ m.labelResolutionPends = append(m.labelResolutionPends,
+ labelResolutionPend{instr: cur, instrOffset: offset, imm32Offset: int64(len(*bufPtr)) - 4},
+ )
+ }
+ }
+
+ if wazevoapi.PerfMapEnabled {
+ l := pos.L
+ var labelStr string
+ if blkID, ok := labelToSSABlockID[l]; ok {
+ labelStr = fmt.Sprintf("%s::SSA_Block[%s]", l, blkID)
+ } else {
+ labelStr = l.String()
+ }
+ size := int64(len(*bufPtr)) - offset
+ wazevoapi.PerfMap.AddModuleEntry(fnIndex, offset, uint64(size), fmt.Sprintf("%s:::::%s", fn, labelStr))
+ }
+ }
+
+ for i := range m.consts {
+ offset := int64(len(*bufPtr))
+ vc := &m.consts[i]
+ vc.label.BinaryOffset = offset
+ if vc._var == nil {
+ lo, hi := vc.lo, vc.hi
+ m.c.Emit8Bytes(lo)
+ m.c.Emit8Bytes(hi)
+ } else {
+ for _, b := range vc._var {
+ m.c.EmitByte(b)
+ }
+ }
+ }
+
+ buf := *bufPtr
+ for i := range m.labelResolutionPends {
+ p := &m.labelResolutionPends[i]
+ switch p.instr.kind {
+ case jmp, jmpIf, lea, xmmUnaryRmR:
+ target := p.instr.jmpLabel()
+ targetOffset := ectx.LabelPositions[target].BinaryOffset
+ imm32Offset := p.imm32Offset
+ jmpOffset := int32(targetOffset - (p.imm32Offset + 4)) // +4 because RIP points to the next instruction.
+ binary.LittleEndian.PutUint32(buf[imm32Offset:], uint32(jmpOffset))
+ case jmpTableIsland:
+ tableBegin := p.instrOffset
+			// Each entry is an 8-byte offset from the beginning of the jmpTableIsland instruction.
+ targets := m.jmpTableTargets[p.instr.u1]
+ for i, l := range targets {
+ targetOffset := ectx.LabelPositions[backend.Label(l)].BinaryOffset
+ jmpOffset := targetOffset - tableBegin
+ binary.LittleEndian.PutUint64(buf[tableBegin+int64(i)*8:], uint64(jmpOffset))
+ }
+ default:
+ panic("BUG")
+ }
+ }
+ return
+}
+
+// ResolveRelocations implements backend.Machine.
+func (m *machine) ResolveRelocations(refToBinaryOffset []int, binary []byte, relocations []backend.RelocationInfo, _ []int) {
+ for _, r := range relocations {
+ offset := r.Offset
+ calleeFnOffset := refToBinaryOffset[r.FuncRef]
+ // offset is the offset of the last 4 bytes of the call instruction.
+ callInstrOffsetBytes := binary[offset : offset+4]
+ diff := int64(calleeFnOffset) - (offset + 4) // +4 because we want the offset of the next instruction (In x64, RIP always points to the next instruction).
+ callInstrOffsetBytes[0] = byte(diff)
+ callInstrOffsetBytes[1] = byte(diff >> 8)
+ callInstrOffsetBytes[2] = byte(diff >> 16)
+ callInstrOffsetBytes[3] = byte(diff >> 24)
+ }
+}
+
+// CallTrampolineIslandInfo implements backend.Machine CallTrampolineIslandInfo.
+func (m *machine) CallTrampolineIslandInfo(_ int) (_, _ int, _ error) { return }
+
+func (m *machine) lowerIcmpToFlag(xd, yd *backend.SSAValueDefinition, _64 bool) {
+ x := m.getOperand_Reg(xd)
+ y := m.getOperand_Mem_Imm32_Reg(yd)
+ cmp := m.allocateInstr().asCmpRmiR(true, y, x.reg(), _64)
+ m.insert(cmp)
+}
+
+func (m *machine) lowerFcmpToFlags(instr *ssa.Instruction) (f1, f2 cond, and bool) {
+ x, y, c := instr.FcmpData()
+ switch c {
+ case ssa.FloatCmpCondEqual:
+ f1, f2 = condNP, condZ
+ and = true
+ case ssa.FloatCmpCondNotEqual:
+ f1, f2 = condP, condNZ
+ case ssa.FloatCmpCondLessThan:
+ f1 = condFromSSAFloatCmpCond(ssa.FloatCmpCondGreaterThan)
+ f2 = condInvalid
+ x, y = y, x
+ case ssa.FloatCmpCondLessThanOrEqual:
+ f1 = condFromSSAFloatCmpCond(ssa.FloatCmpCondGreaterThanOrEqual)
+ f2 = condInvalid
+ x, y = y, x
+ default:
+ f1 = condFromSSAFloatCmpCond(c)
+ f2 = condInvalid
+ }
+
+ var opc sseOpcode
+ if x.Type() == ssa.TypeF32 {
+ opc = sseOpcodeUcomiss
+ } else {
+ opc = sseOpcodeUcomisd
+ }
+
+ xr := m.getOperand_Reg(m.c.ValueDefinition(x))
+ yr := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
+ m.insert(m.allocateInstr().asXmmCmpRmR(opc, yr, xr.reg()))
+ return
+}
+
+// allocateInstr allocates an instruction.
+func (m *machine) allocateInstr() *instruction {
+ instr := m.ectx.InstructionPool.Allocate()
+ if !m.regAllocStarted {
+ instr.addedBeforeRegAlloc = true
+ }
+ return instr
+}
+
+func (m *machine) allocateNop() *instruction {
+ instr := m.allocateInstr()
+ instr.kind = nop0
+ return instr
+}
+
+func (m *machine) insert(i *instruction) {
+ ectx := m.ectx
+ ectx.PendingInstructions = append(ectx.PendingInstructions, i)
+}
+
+func (m *machine) allocateBrTarget() (nop *instruction, l backend.Label) { //nolint
+ pos := m.allocateLabel()
+ l = pos.L
+ nop = m.allocateInstr()
+ nop.asNop0WithLabel(l)
+ pos.Begin, pos.End = nop, nop
+ return
+}
+
+func (m *machine) allocateLabel() *labelPosition {
+ ectx := m.ectx
+ l := ectx.AllocateLabel()
+ pos := ectx.AllocateLabelPosition(l)
+ ectx.LabelPositions[l] = pos
+ return pos
+}
+
+func (m *machine) getVRegSpillSlotOffsetFromSP(id regalloc.VRegID, size byte) int64 {
+ offset, ok := m.spillSlots[id]
+ if !ok {
+ offset = m.spillSlotSize
+ m.spillSlots[id] = offset
+ m.spillSlotSize += int64(size)
+ }
+ return offset
+}
+
+func (m *machine) copyTo(src regalloc.VReg, dst regalloc.VReg) {
+ mov := m.allocateInstr()
+ if src.RegType() == regalloc.RegTypeInt {
+ mov.asMovRR(src, dst, true)
+ } else {
+ mov.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(src), dst)
+ }
+ m.insert(mov)
+}
+
+func (m *machine) copyToTmp(v regalloc.VReg) regalloc.VReg {
+ typ := m.c.TypeOf(v)
+ tmp := m.c.AllocateVReg(typ)
+ m.copyTo(v, tmp)
+ return tmp
+}
+
+func (m *machine) requiredStackSize() int64 {
+ return m.maxRequiredStackSizeForCalls +
+ m.frameSize() +
+ 16 + // Need for stack checking.
+ 16 // return address and the caller RBP.
+}
+
+func (m *machine) frameSize() int64 {
+ s := m.clobberedRegSlotSize() + m.spillSlotSize
+ if s&0xf != 0 {
+ panic(fmt.Errorf("BUG: frame size %d is not 16-byte aligned", s))
+ }
+ return s
+}
+
+func (m *machine) clobberedRegSlotSize() int64 {
+ return int64(len(m.clobberedRegs) * 16)
+}
+
+func (m *machine) lowerIDivRem(si *ssa.Instruction, isDiv bool, signed bool) {
+ x, y, execCtx := si.Arg3()
+
+ dividend := m.getOperand_Reg(m.c.ValueDefinition(x))
+ divisor := m.getOperand_Reg(m.c.ValueDefinition(y))
+ ctxVReg := m.c.VRegOf(execCtx)
+ tmpGp := m.c.AllocateVReg(si.Return().Type())
+
+ m.copyTo(dividend.reg(), raxVReg)
+ m.insert(m.allocateInstr().asDefineUninitializedReg(rdxVReg))
+ m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp))
+ seq := m.allocateInstr().asIdivRemSequence(ctxVReg, divisor.reg(), tmpGp, isDiv, signed, x.Type().Bits() == 64)
+ m.insert(seq)
+ rd := m.c.VRegOf(si.Return())
+ if isDiv {
+ m.copyTo(raxVReg, rd)
+ } else {
+ m.copyTo(rdxVReg, rd)
+ }
+}
+
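+// lowerIDivRemSequenceAfterRegAlloc expands the pseudo div/rem sequence after register
+// allocation. It relies on the x86 DIV/IDIV convention: the dividend lives in DX:AX (RDX:RAX),
+// the quotient is written to AX (RAX) and the remainder to DX (RDX), with explicit checks for
+// division by zero and, for signed division, the MinInt / -1 overflow case.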
+func (m *machine) lowerIDivRemSequenceAfterRegAlloc(i *instruction) {
+ execCtx, divisor, tmpGp, isDiv, signed, _64 := i.idivRemSequenceData()
+
+ dividend := raxVReg
+
+	// Ensure the divisor is not zero.
+ test := m.allocateInstr()
+ test.asCmpRmiR(false, newOperandReg(divisor), divisor, _64)
+ m.insert(test)
+
+ jnz := m.allocateInstr()
+ m.insert(jnz)
+
+ nz := m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerDivisionByZero)
+
+ // If not zero, we can proceed with the division.
+ jnz.asJmpIf(condNZ, newOperandLabel(nz))
+
+ var ifRemNeg1 *instruction
+ if signed {
+ var neg1 uint64
+ if _64 {
+ neg1 = 0xffffffffffffffff
+ } else {
+ neg1 = 0xffffffff
+ }
+ m.lowerIconst(tmpGp, neg1, _64)
+
+ if isDiv {
+			// For signed division, we need extra branches for the "math.MinInt{32,64} / -1"
+			// case, which raises a divide error (observed as a floating point exception)
+			// because the result exceeds the maximum representable signed integer.
+
+ // First, we check if the divisor is -1.
+ cmp := m.allocateInstr()
+ cmp.asCmpRmiR(true, newOperandReg(tmpGp), divisor, _64)
+ m.insert(cmp)
+
+ ifNotNeg1 := m.allocateInstr()
+ m.insert(ifNotNeg1)
+
+ var minInt uint64
+ if _64 {
+ minInt = 0x8000000000000000
+ } else {
+ minInt = 0x80000000
+ }
+ m.lowerIconst(tmpGp, minInt, _64)
+
+			// Next we check if the dividend is the most negative value for the signed integer, i.e.
+			// whether we are trying to do (math.MinInt32 / -1) or (math.MinInt64 / -1) respectively.
+ cmp2 := m.allocateInstr()
+ cmp2.asCmpRmiR(true, newOperandReg(tmpGp), dividend, _64)
+ m.insert(cmp2)
+
+ ifNotMinInt := m.allocateInstr()
+ m.insert(ifNotMinInt)
+
+			// Trap if we are trying to do (math.MinInt32 / -1) or (math.MinInt64 / -1),
+			// as that overflows the division: the result would be 2^31 (resp. 2^63), which is
+			// larger than the maximum signed 32-bit (resp. 64-bit) integer.
+ end := m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow)
+ ifNotNeg1.asJmpIf(condNZ, newOperandLabel(end))
+ ifNotMinInt.asJmpIf(condNZ, newOperandLabel(end))
+ } else {
+			// If it is a remainder operation, zero the DX register and compare the divisor to -1.
+ xor := m.allocateInstr().asZeros(rdxVReg)
+ m.insert(xor)
+
+ // We check if the divisor is -1.
+ cmp := m.allocateInstr()
+ cmp.asCmpRmiR(true, newOperandReg(tmpGp), divisor, _64)
+ m.insert(cmp)
+
+ ifRemNeg1 = m.allocateInstr()
+ m.insert(ifRemNeg1)
+ }
+
+		// Sign-extend AX into DX so that the dividend spans the DX:AX register pair (2*x.Type().Bits() wide).
+ sed := m.allocateInstr()
+ sed.asSignExtendData(_64)
+ m.insert(sed)
+ } else {
+		// Zero the DX register so that the dividend spans the DX:AX register pair (2*x.Type().Bits() wide).
+ zeros := m.allocateInstr().asZeros(rdxVReg)
+ m.insert(zeros)
+ }
+
+ div := m.allocateInstr()
+ div.asDiv(newOperandReg(divisor), signed, _64)
+ m.insert(div)
+
+ nop, end := m.allocateBrTarget()
+ m.insert(nop)
+	// If we are compiling a Rem instruction and the divisor was -1, we jump straight here (the end)
+	// with RDX already zeroed, so the remainder is 0.
+ if ifRemNeg1 != nil {
+ ifRemNeg1.asJmpIf(condZ, newOperandLabel(end))
+ }
+}
+
+func (m *machine) lowerRound(instr *ssa.Instruction, imm roundingMode) {
+ x := instr.Arg()
+ if !x.Type().IsFloat() {
+ panic("BUG?")
+ }
+ var op sseOpcode
+ if x.Type().Bits() == 64 {
+ op = sseOpcodeRoundsd
+ } else {
+ op = sseOpcodeRoundss
+ }
+
+ xDef := m.c.ValueDefinition(x)
+ rm := m.getOperand_Mem_Reg(xDef)
+ rd := m.c.VRegOf(instr.Return())
+
+ xmm := m.allocateInstr().asXmmUnaryRmRImm(op, uint8(imm), rm, rd)
+ m.insert(xmm)
+}
+
+func (m *machine) lowerFminFmax(instr *ssa.Instruction) {
+ x, y := instr.Arg2()
+ if !x.Type().IsFloat() {
+ panic("BUG?")
+ }
+
+ _64 := x.Type().Bits() == 64
+ isMin := instr.Opcode() == ssa.OpcodeFmin
+ var minMaxOp sseOpcode
+
+ switch {
+ case _64 && isMin:
+ minMaxOp = sseOpcodeMinpd
+ case _64 && !isMin:
+ minMaxOp = sseOpcodeMaxpd
+ case !_64 && isMin:
+ minMaxOp = sseOpcodeMinps
+ case !_64 && !isMin:
+ minMaxOp = sseOpcodeMaxps
+ }
+
+ xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y)
+ rm := m.getOperand_Reg(xDef)
+	// We cannot ensure that y is 16-byte aligned in memory, so we load it into a register.
+ rn := m.getOperand_Reg(yDef)
+ rd := m.c.VRegOf(instr.Return())
+
+ tmp := m.copyToTmp(rm.reg())
+
+ // Check if this is (either x1 or x2 is NaN) or (x1 equals x2) case.
+ cmp := m.allocateInstr()
+ if _64 {
+ cmp.asXmmCmpRmR(sseOpcodeUcomisd, rn, tmp)
+ } else {
+ cmp.asXmmCmpRmR(sseOpcodeUcomiss, rn, tmp)
+ }
+ m.insert(cmp)
+
+	// At this point, we have the three cases of conditional flags below
+	// (see https://www.felixcloutier.com/x86/ucomiss#operation for detail):
+	//
+	// 1) The two values are NaN-free and different: ZF is cleared.
+	// 2) The two values are NaN-free and equal: only the ZF flag is set.
+	// 3) One of the two values is NaN: the ZF, PF and CF flags are all set.
+
+	// Jump to handle case 1) by checking the ZF flag, since ZF is set only in cases 2) and 3).
+ nanFreeOrDiffJump := m.allocateInstr()
+ m.insert(nanFreeOrDiffJump)
+
+ // Start handling 2) and 3).
+
+ // Jump if one of two values is NaN by checking the parity flag (PF).
+ ifIsNan := m.allocateInstr()
+ m.insert(ifIsNan)
+
+ // Start handling 2) NaN-free and equal.
+
+	// Before we exit this case, we have to ensure that positive zero (or negative zero for the min
+	// instruction) is returned when the two values are positive and negative zeros.
+ var op sseOpcode
+ switch {
+ case !_64 && isMin:
+ op = sseOpcodeOrps
+ case _64 && isMin:
+ op = sseOpcodeOrpd
+ case !_64 && !isMin:
+ op = sseOpcodeAndps
+ case _64 && !isMin:
+ op = sseOpcodeAndpd
+ }
+ orAnd := m.allocateInstr()
+ orAnd.asXmmRmR(op, rn, tmp)
+ m.insert(orAnd)
+
+ // Done, jump to end.
+ sameExitJump := m.allocateInstr()
+ m.insert(sameExitJump)
+
+ // Start handling 3) either is NaN.
+ isNanTarget, isNan := m.allocateBrTarget()
+ m.insert(isNanTarget)
+ ifIsNan.asJmpIf(condP, newOperandLabel(isNan))
+
+ // We emit the ADD instruction to produce the NaN in tmp.
+ add := m.allocateInstr()
+ if _64 {
+ add.asXmmRmR(sseOpcodeAddsd, rn, tmp)
+ } else {
+ add.asXmmRmR(sseOpcodeAddss, rn, tmp)
+ }
+ m.insert(add)
+
+ // Exit from the NaN case branch.
+ nanExitJmp := m.allocateInstr()
+ m.insert(nanExitJmp)
+
+ // Start handling 1).
+ doMinMaxTarget, doMinMax := m.allocateBrTarget()
+ m.insert(doMinMaxTarget)
+ nanFreeOrDiffJump.asJmpIf(condNZ, newOperandLabel(doMinMax))
+
+ // Now handle the NaN-free and different values case.
+ minMax := m.allocateInstr()
+ minMax.asXmmRmR(minMaxOp, rn, tmp)
+ m.insert(minMax)
+
+ endNop, end := m.allocateBrTarget()
+ m.insert(endNop)
+ nanExitJmp.asJmp(newOperandLabel(end))
+ sameExitJump.asJmp(newOperandLabel(end))
+
+ m.copyTo(tmp, rd)
+}
+
+func (m *machine) lowerFcopysign(instr *ssa.Instruction) {
+ x, y := instr.Arg2()
+ if !x.Type().IsFloat() {
+ panic("BUG")
+ }
+
+ _64 := x.Type().Bits() == 64
+
+ xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y)
+ rm := m.getOperand_Reg(xDef)
+ rn := m.getOperand_Reg(yDef)
+ rd := m.c.VRegOf(instr.Return())
+
+	// Copy the sign of y (src) onto x (dst): build the sign-bit mask and its complement, then AND/OR them in.
+ var opAnd, opOr sseOpcode
+ var signMask uint64
+ if _64 {
+ signMask, opAnd, opOr = 0x8000000000000000, sseOpcodeAndpd, sseOpcodeOrpd
+ } else {
+ signMask, opAnd, opOr = 0x80000000, sseOpcodeAndps, sseOpcodeOrps
+ }
+
+ signBitReg := m.c.AllocateVReg(x.Type())
+ m.lowerFconst(signBitReg, signMask, _64)
+ nonSignBitReg := m.c.AllocateVReg(x.Type())
+ m.lowerFconst(nonSignBitReg, ^signMask, _64)
+
+ // Extract the sign bits of rn.
+ and := m.allocateInstr().asXmmRmR(opAnd, rn, signBitReg)
+ m.insert(and)
+
+ // Clear the sign bit of dst via AND with the non-sign bit mask.
+	and2 := m.allocateInstr().asXmmRmR(opAnd, rm, nonSignBitReg)
+	m.insert(and2)
+
+ // Copy the sign bits of src to dst via OR.
+ or := m.allocateInstr().asXmmRmR(opOr, newOperandReg(signBitReg), nonSignBitReg)
+ m.insert(or)
+
+ m.copyTo(nonSignBitReg, rd)
+}
+
+func (m *machine) lowerBitcast(instr *ssa.Instruction) {
+ x, dstTyp := instr.BitcastData()
+ srcTyp := x.Type()
+ rn := m.getOperand_Reg(m.c.ValueDefinition(x))
+ rd := m.c.VRegOf(instr.Return())
+ switch {
+ case srcTyp == ssa.TypeF32 && dstTyp == ssa.TypeI32:
+ cvt := m.allocateInstr().asXmmToGpr(sseOpcodeMovd, rn.reg(), rd, false)
+ m.insert(cvt)
+ case srcTyp == ssa.TypeI32 && dstTyp == ssa.TypeF32:
+ cvt := m.allocateInstr().asGprToXmm(sseOpcodeMovd, rn, rd, false)
+ m.insert(cvt)
+ case srcTyp == ssa.TypeF64 && dstTyp == ssa.TypeI64:
+ cvt := m.allocateInstr().asXmmToGpr(sseOpcodeMovq, rn.reg(), rd, true)
+ m.insert(cvt)
+ case srcTyp == ssa.TypeI64 && dstTyp == ssa.TypeF64:
+ cvt := m.allocateInstr().asGprToXmm(sseOpcodeMovq, rn, rd, true)
+ m.insert(cvt)
+ default:
+ panic(fmt.Sprintf("invalid bitcast from %s to %s", srcTyp, dstTyp))
+ }
+}
+
+func (m *machine) lowerFcvtToSint(ctxVReg, rn, rd regalloc.VReg, src64, dst64, sat bool) {
+ var tmpXmm regalloc.VReg
+ if dst64 {
+ tmpXmm = m.c.AllocateVReg(ssa.TypeF64)
+ } else {
+ tmpXmm = m.c.AllocateVReg(ssa.TypeF32)
+ }
+
+ m.insert(m.allocateInstr().asDefineUninitializedReg(tmpXmm))
+ tmpGp, tmpGp2 := m.c.AllocateVReg(ssa.TypeI64), m.c.AllocateVReg(ssa.TypeI64)
+ m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp))
+ m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp2))
+
+ m.insert(m.allocateFcvtToSintSequence(ctxVReg, rn, tmpGp, tmpGp2, tmpXmm, src64, dst64, sat))
+ m.copyTo(tmpGp, rd)
+}
+
+func (m *machine) lowerFcvtToSintSequenceAfterRegalloc(i *instruction) {
+ execCtx, src, tmpGp, tmpGp2, tmpXmm, src64, dst64, sat := i.fcvtToSintSequenceData()
+ var cmpOp, truncOp sseOpcode
+ if src64 {
+ cmpOp, truncOp = sseOpcodeUcomisd, sseOpcodeCvttsd2si
+ } else {
+ cmpOp, truncOp = sseOpcodeUcomiss, sseOpcodeCvttss2si
+ }
+
+ trunc := m.allocateInstr()
+ trunc.asXmmToGpr(truncOp, src, tmpGp, dst64)
+ m.insert(trunc)
+
+	// Check if the truncation result was INT_MIN (the "integer indefinite" value produced on
+	// overflow or NaN) by comparing it against 1: the subtraction overflows only for INT_MIN.
+ cmp1 := m.allocateInstr()
+ cmp1.asCmpRmiR(true, newOperandImm32(1), tmpGp, dst64)
+ m.insert(cmp1)
+
+ // If no overflow, then we are done.
+ doneTarget, done := m.allocateBrTarget()
+ ifNoOverflow := m.allocateInstr()
+ ifNoOverflow.asJmpIf(condNO, newOperandLabel(done))
+ m.insert(ifNoOverflow)
+
+ // Now, check for NaN.
+ cmpNan := m.allocateInstr()
+ cmpNan.asXmmCmpRmR(cmpOp, newOperandReg(src), src)
+ m.insert(cmpNan)
+
+ // We allocate the "non-nan target" here, but we will insert it later.
+ notNanTarget, notNaN := m.allocateBrTarget()
+ ifNotNan := m.allocateInstr()
+ ifNotNan.asJmpIf(condNP, newOperandLabel(notNaN))
+ m.insert(ifNotNan)
+
+ if sat {
+ // If NaN and saturating, return 0.
+ zeroDst := m.allocateInstr().asZeros(tmpGp)
+ m.insert(zeroDst)
+
+ jmpEnd := m.allocateInstr()
+ jmpEnd.asJmp(newOperandLabel(done))
+ m.insert(jmpEnd)
+
+ // Otherwise:
+ m.insert(notNanTarget)
+
+ // Zero-out the tmp register.
+ zero := m.allocateInstr().asZeros(tmpXmm)
+ m.insert(zero)
+
+ cmpXmm := m.allocateInstr().asXmmCmpRmR(cmpOp, newOperandReg(tmpXmm), src)
+ m.insert(cmpXmm)
+
+		// If src < 0, the INT_MIN already in tmpGp is the correct saturated result, so jump to end.
+ jmpEnd2 := m.allocateInstr()
+ jmpEnd2.asJmpIf(condB, newOperandLabel(done))
+ m.insert(jmpEnd2)
+
+ // Otherwise, saturate to INT_MAX.
+ if dst64 {
+ m.lowerIconst(tmpGp, math.MaxInt64, dst64)
+ } else {
+ m.lowerIconst(tmpGp, math.MaxInt32, dst64)
+ }
+
+ } else {
+
+ // If non-sat, NaN, trap.
+ m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeInvalidConversionToInteger)
+
+ // Otherwise, we will jump here.
+ m.insert(notNanTarget)
+
+		// Jump over the trap if src is larger than the threshold.
+ condAboveThreshold := condNB
+
+		// The magic constants are various combinations of minInt for int[32|64] represented as float[32|64].
+ var minInt uint64
+ switch {
+ case src64 && dst64:
+ minInt = 0xc3e0000000000000
+ case src64 && !dst64:
+ condAboveThreshold = condNBE
+ minInt = 0xC1E0_0000_0020_0000
+ case !src64 && dst64:
+ minInt = 0xDF00_0000
+ case !src64 && !dst64:
+ minInt = 0xCF00_0000
+ }
+
+ loadToGP := m.allocateInstr().asImm(tmpGp2, minInt, src64)
+ m.insert(loadToGP)
+
+ movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp2), tmpXmm, src64)
+ m.insert(movToXmm)
+
+ cmpXmm := m.allocateInstr().asXmmCmpRmR(cmpOp, newOperandReg(tmpXmm), src)
+ m.insert(cmpXmm)
+
+ jmpIfLarger := m.allocateInstr()
+ checkPositiveTarget, checkPositive := m.allocateBrTarget()
+ jmpIfLarger.asJmpIf(condAboveThreshold, newOperandLabel(checkPositive))
+ m.insert(jmpIfLarger)
+
+ m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow)
+
+ // If positive, it was a real overflow.
+ m.insert(checkPositiveTarget)
+
+ // Zero out the temp register.
+ xorpd := m.allocateInstr()
+ xorpd.asXmmRmR(sseOpcodeXorpd, newOperandReg(tmpXmm), tmpXmm)
+ m.insert(xorpd)
+
+ pos := m.allocateInstr()
+ pos.asXmmCmpRmR(cmpOp, newOperandReg(src), tmpXmm)
+ m.insert(pos)
+
+		// If 0 >= src (and src is above the lower threshold), the truncation to INT_MIN was valid; jump to end.
+ jmp := m.allocateInstr().asJmpIf(condNB, newOperandLabel(done))
+ m.insert(jmp)
+ m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow)
+ }
+
+ m.insert(doneTarget)
+}
+
+func (m *machine) lowerFcvtToUint(ctxVReg, rn, rd regalloc.VReg, src64, dst64, sat bool) {
+ tmpXmm, tmpXmm2 := m.c.AllocateVReg(ssa.TypeF64), m.c.AllocateVReg(ssa.TypeF64)
+ m.insert(m.allocateInstr().asDefineUninitializedReg(tmpXmm))
+ m.insert(m.allocateInstr().asDefineUninitializedReg(tmpXmm2))
+ tmpGp, tmpGp2 := m.c.AllocateVReg(ssa.TypeI64), m.c.AllocateVReg(ssa.TypeI64)
+ m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp))
+ m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp2))
+
+ m.insert(m.allocateFcvtToUintSequence(
+ ctxVReg, rn, tmpGp, tmpGp2, tmpXmm, tmpXmm2, src64, dst64, sat,
+ ))
+ m.copyTo(tmpGp, rd)
+}
+
+func (m *machine) lowerFcvtToUintSequenceAfterRegalloc(i *instruction) {
+ execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2, src64, dst64, sat := i.fcvtToUintSequenceData()
+
+ var subOp, cmpOp, truncOp sseOpcode
+ if src64 {
+ subOp, cmpOp, truncOp = sseOpcodeSubsd, sseOpcodeUcomisd, sseOpcodeCvttsd2si
+ } else {
+ subOp, cmpOp, truncOp = sseOpcodeSubss, sseOpcodeUcomiss, sseOpcodeCvttss2si
+ }
+
+ doneTarget, done := m.allocateBrTarget()
+
+ switch {
+ case src64 && dst64:
+ loadToGP := m.allocateInstr().asImm(tmpGp, 0x43e0000000000000, true)
+ m.insert(loadToGP)
+ movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, true)
+ m.insert(movToXmm)
+ case src64 && !dst64:
+ loadToGP := m.allocateInstr().asImm(tmpGp, 0x41e0000000000000, true)
+ m.insert(loadToGP)
+ movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, true)
+ m.insert(movToXmm)
+ case !src64 && dst64:
+ loadToGP := m.allocateInstr().asImm(tmpGp, 0x5f000000, false)
+ m.insert(loadToGP)
+ movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, false)
+ m.insert(movToXmm)
+ case !src64 && !dst64:
+ loadToGP := m.allocateInstr().asImm(tmpGp, 0x4f000000, false)
+ m.insert(loadToGP)
+ movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, false)
+ m.insert(movToXmm)
+ }
+
+ cmp := m.allocateInstr()
+ cmp.asXmmCmpRmR(cmpOp, newOperandReg(tmpXmm), src)
+ m.insert(cmp)
+
+	// If src is at or above the threshold in `tmpXmm`, jump to `ifAboveThreshold`.
+ ifAboveThresholdTarget, ifAboveThreshold := m.allocateBrTarget()
+ jmpIfAboveThreshold := m.allocateInstr()
+ jmpIfAboveThreshold.asJmpIf(condNB, newOperandLabel(ifAboveThreshold))
+ m.insert(jmpIfAboveThreshold)
+
+ ifNotNaNTarget, ifNotNaN := m.allocateBrTarget()
+ jmpIfNotNaN := m.allocateInstr()
+ jmpIfNotNaN.asJmpIf(condNP, newOperandLabel(ifNotNaN))
+ m.insert(jmpIfNotNaN)
+
+ // If NaN, handle the error condition.
+ if sat {
+ // On NaN, saturating, we just return 0.
+ zeros := m.allocateInstr().asZeros(tmpGp)
+ m.insert(zeros)
+
+ jmpEnd := m.allocateInstr()
+ jmpEnd.asJmp(newOperandLabel(done))
+ m.insert(jmpEnd)
+ } else {
+ // On NaN, non-saturating, we trap.
+ m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeInvalidConversionToInteger)
+ }
+
+ // If not NaN, land here.
+ m.insert(ifNotNaNTarget)
+
+ // Truncation happens here.
+
+ trunc := m.allocateInstr()
+ trunc.asXmmToGpr(truncOp, src, tmpGp, dst64)
+ m.insert(trunc)
+
+ // Check if the result is negative.
+ cmpNeg := m.allocateInstr()
+ cmpNeg.asCmpRmiR(true, newOperandImm32(0), tmpGp, dst64)
+ m.insert(cmpNeg)
+
+ // If non-neg, jump to end.
+ jmpIfNonNeg := m.allocateInstr()
+ jmpIfNonNeg.asJmpIf(condNL, newOperandLabel(done))
+ m.insert(jmpIfNonNeg)
+
+ if sat {
+ // If the input was "small" (< 2**(width -1)), the only way to get an integer
+ // overflow is because the input was too small: saturate to the min value, i.e. 0.
+ zeros := m.allocateInstr().asZeros(tmpGp)
+ m.insert(zeros)
+
+ jmpEnd := m.allocateInstr()
+ jmpEnd.asJmp(newOperandLabel(done))
+ m.insert(jmpEnd)
+ } else {
+ // If not saturating, trap.
+ m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow)
+ }
+
+ // If above the threshold, land here.
+ m.insert(ifAboveThresholdTarget)
+
+	// tmpXmm2 := src - threshold (src is known to be >= the threshold here).
+ copySrc := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(src), tmpXmm2)
+ m.insert(copySrc)
+
+ sub := m.allocateInstr()
+ sub.asXmmRmR(subOp, newOperandReg(tmpXmm), tmpXmm2) // must be -0x8000000000000000
+ m.insert(sub)
+
+ trunc2 := m.allocateInstr()
+ trunc2.asXmmToGpr(truncOp, tmpXmm2, tmpGp, dst64)
+ m.insert(trunc2)
+
+ // Check if the result is negative.
+ cmpNeg2 := m.allocateInstr().asCmpRmiR(true, newOperandImm32(0), tmpGp, dst64)
+ m.insert(cmpNeg2)
+
+ ifNextLargeTarget, ifNextLarge := m.allocateBrTarget()
+ jmpIfNextLarge := m.allocateInstr()
+ jmpIfNextLarge.asJmpIf(condNL, newOperandLabel(ifNextLarge))
+ m.insert(jmpIfNextLarge)
+
+ if sat {
+ // The input was "large" (>= maxInt), so the only way to get an integer
+ // overflow is because the input was too large: saturate to the max value.
+ var maxInt uint64
+ if dst64 {
+ maxInt = math.MaxUint64
+ } else {
+ maxInt = math.MaxUint32
+ }
+ m.lowerIconst(tmpGp, maxInt, dst64)
+
+ jmpToEnd := m.allocateInstr()
+ jmpToEnd.asJmp(newOperandLabel(done))
+ m.insert(jmpToEnd)
+ } else {
+ // If not saturating, trap.
+ m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow)
+ }
+
+ m.insert(ifNextLargeTarget)
+
+ var op operand
+ if dst64 {
+ m.lowerIconst(tmpGp2, 0x8000000000000000, true)
+ op = newOperandReg(tmpGp2)
+ } else {
+ op = newOperandImm32(0x80000000)
+ }
+
+ add := m.allocateInstr()
+ add.asAluRmiR(aluRmiROpcodeAdd, op, tmpGp, dst64)
+ m.insert(add)
+
+ m.insert(doneTarget)
+}
+
+func (m *machine) lowerFcvtFromSint(rn, rd operand, src64, dst64 bool) {
+ var op sseOpcode
+ if dst64 {
+ op = sseOpcodeCvtsi2sd
+ } else {
+ op = sseOpcodeCvtsi2ss
+ }
+
+ trunc := m.allocateInstr()
+ trunc.asGprToXmm(op, rn, rd.reg(), src64)
+ m.insert(trunc)
+}
+
+func (m *machine) lowerFcvtFromUint(rn, rd operand, src64, dst64 bool) {
+ var op sseOpcode
+ if dst64 {
+ op = sseOpcodeCvtsi2sd
+ } else {
+ op = sseOpcodeCvtsi2ss
+ }
+
+	// If src is 32-bit, we simply perform the conversion with 64-bit width.
+ //
+ // See the following link for why we use 64bit conversion for unsigned 32bit integer sources:
+ // https://stackoverflow.com/questions/41495498/fpu-operations-generated-by-gcc-during-casting-integer-to-float.
+ //
+ // Here's the summary:
+ // >> CVTSI2SS is indeed designed for converting a signed integer to a scalar single-precision float,
+ // >> not an unsigned integer like you have here. So what gives? Well, a 64-bit processor has 64-bit wide
+ // >> registers available, so the unsigned 32-bit input values can be stored as signed 64-bit intermediate values,
+ // >> which allows CVTSI2SS to be used after all.
+ //
+ if !src64 {
+ // Before we convert, we have to clear the higher 32-bits of the 64-bit register
+ // to get the correct result.
+ tmp := m.c.AllocateVReg(ssa.TypeI32)
+ m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, rn, tmp))
+ m.insert(m.allocateInstr().asGprToXmm(op, newOperandReg(tmp), rd.reg(), true))
+ return
+ }
+
+ // If uint64, we have to do a bit more work.
+ endTarget, end := m.allocateBrTarget()
+
+ var tmpXmm regalloc.VReg
+ if dst64 {
+ tmpXmm = m.c.AllocateVReg(ssa.TypeF64)
+ } else {
+ tmpXmm = m.c.AllocateVReg(ssa.TypeF32)
+ }
+
+ // Check if the most significant bit (sign bit) is set.
+ test := m.allocateInstr()
+ test.asCmpRmiR(false, rn, rn.reg(), src64)
+ m.insert(test)
+
+ // Jump if the sign bit is set.
+ ifSignTarget, ifSign := m.allocateBrTarget()
+ jmpIfNeg := m.allocateInstr()
+ jmpIfNeg.asJmpIf(condS, newOperandLabel(ifSign))
+ m.insert(jmpIfNeg)
+
+	// If the sign bit is not set, the unsigned integer also fits in the signed range,
+	// so we convert it directly and jump to the exit of this branch.
+ cvt := m.allocateInstr()
+ cvt.asGprToXmm(op, rn, tmpXmm, src64)
+ m.insert(cvt)
+
+ // We are done, jump to end.
+ jmpEnd := m.allocateInstr()
+ jmpEnd.asJmp(newOperandLabel(end))
+ m.insert(jmpEnd)
+
+ // Now handling the case where sign-bit is set.
+ // We emit the following sequences:
+ // mov %rn, %tmp
+ // shr 1, %tmp
+ // mov %rn, %tmp2
+ // and 1, %tmp2
+ // or %tmp2, %tmp
+ // cvtsi2ss %tmp, %xmm0
+ // addsd %xmm0, %xmm0
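+	// That is, halve the value while folding its lowest bit back in (round-to-odd), convert the
+	// now in-range value as a signed integer, and finally double the float result.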
+ m.insert(ifSignTarget)
+
+ tmp := m.copyToTmp(rn.reg())
+ shr := m.allocateInstr()
+ shr.asShiftR(shiftROpShiftRightLogical, newOperandImm32(1), tmp, src64)
+ m.insert(shr)
+
+ tmp2 := m.copyToTmp(rn.reg())
+ and := m.allocateInstr()
+ and.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), tmp2, src64)
+ m.insert(and)
+
+ or := m.allocateInstr()
+ or.asAluRmiR(aluRmiROpcodeOr, newOperandReg(tmp2), tmp, src64)
+ m.insert(or)
+
+ cvt2 := m.allocateInstr()
+ cvt2.asGprToXmm(op, newOperandReg(tmp), tmpXmm, src64)
+ m.insert(cvt2)
+
+ addsd := m.allocateInstr()
+ if dst64 {
+ addsd.asXmmRmR(sseOpcodeAddsd, newOperandReg(tmpXmm), tmpXmm)
+ } else {
+ addsd.asXmmRmR(sseOpcodeAddss, newOperandReg(tmpXmm), tmpXmm)
+ }
+ m.insert(addsd)
+
+ m.insert(endTarget)
+ m.copyTo(tmpXmm, rd.reg())
+}
+
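+// lowerVanyTrue lowers vany_true using PTEST: PTEST of a register with itself sets ZF exactly
+// when all 128 bits are zero, so SETNZ yields 1 if any lane has any bit set.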
+func (m *machine) lowerVanyTrue(instr *ssa.Instruction) {
+ x := instr.Arg()
+ rm := m.getOperand_Reg(m.c.ValueDefinition(x))
+ rd := m.c.VRegOf(instr.Return())
+
+ tmp := m.c.AllocateVReg(ssa.TypeI32)
+
+ cmp := m.allocateInstr()
+ cmp.asXmmCmpRmR(sseOpcodePtest, rm, rm.reg())
+ m.insert(cmp)
+
+ setcc := m.allocateInstr()
+ setcc.asSetcc(condNZ, tmp)
+ m.insert(setcc)
+
+ // Clear the irrelevant bits.
+ and := m.allocateInstr()
+ and.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), tmp, false)
+ m.insert(and)
+
+ m.copyTo(tmp, rd)
+}
+
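+// lowerVallTrue lowers vall_true by comparing the input against an all-zeros vector with
+// PCMPEQ{B,W,D,Q}: a lane becomes all-ones only if it was zero, so PTEST sets ZF exactly when
+// no lane was zero, i.e. when all lanes are "true".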
+func (m *machine) lowerVallTrue(instr *ssa.Instruction) {
+ x, lane := instr.ArgWithLane()
+ var op sseOpcode
+ switch lane {
+ case ssa.VecLaneI8x16:
+ op = sseOpcodePcmpeqb
+ case ssa.VecLaneI16x8:
+ op = sseOpcodePcmpeqw
+ case ssa.VecLaneI32x4:
+ op = sseOpcodePcmpeqd
+ case ssa.VecLaneI64x2:
+ op = sseOpcodePcmpeqq
+ }
+ rm := m.getOperand_Reg(m.c.ValueDefinition(x))
+ rd := m.c.VRegOf(instr.Return())
+
+ tmp := m.c.AllocateVReg(ssa.TypeV128)
+
+ zeros := m.allocateInstr()
+ zeros.asZeros(tmp)
+ m.insert(zeros)
+
+ pcmp := m.allocateInstr()
+ pcmp.asXmmRmR(op, rm, tmp)
+ m.insert(pcmp)
+
+ test := m.allocateInstr()
+ test.asXmmCmpRmR(sseOpcodePtest, newOperandReg(tmp), tmp)
+ m.insert(test)
+
+ tmp2 := m.c.AllocateVReg(ssa.TypeI32)
+
+ setcc := m.allocateInstr()
+ setcc.asSetcc(condZ, tmp2)
+ m.insert(setcc)
+
+ // Clear the irrelevant bits.
+ and := m.allocateInstr()
+ and.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), tmp2, false)
+ m.insert(and)
+
+ m.copyTo(tmp2, rd)
+}
+
+func (m *machine) lowerVhighBits(instr *ssa.Instruction) {
+ x, lane := instr.ArgWithLane()
+ rm := m.getOperand_Reg(m.c.ValueDefinition(x))
+ rd := m.c.VRegOf(instr.Return())
+ switch lane {
+ case ssa.VecLaneI8x16:
+ mov := m.allocateInstr()
+ mov.asXmmToGpr(sseOpcodePmovmskb, rm.reg(), rd, false)
+ m.insert(mov)
+
+ case ssa.VecLaneI16x8:
+ // When we have:
+		// R1 = [R1(w1), R1(w2), R1(w3), R1(w4), R1(w5), R1(w6), R1(w7), R1(w8)]
+		// R2 = [R2(w1), R2(w2), R2(w3), R2(w4), R2(w5), R2(w6), R2(w7), R2(w8)]
+ // where RX(wn) is n-th signed word (16-bit) of RX register,
+ //
+ // "PACKSSWB R1, R2" produces
+ // R1 = [
+ // byte_sat(R1(w1)), byte_sat(R1(w2)), byte_sat(R1(w3)), byte_sat(R1(w4)),
+ // byte_sat(R1(w5)), byte_sat(R1(w6)), byte_sat(R1(w7)), byte_sat(R1(w8)),
+ // byte_sat(R2(w1)), byte_sat(R2(w2)), byte_sat(R2(w3)), byte_sat(R2(w4)),
+ // byte_sat(R2(w5)), byte_sat(R2(w6)), byte_sat(R2(w7)), byte_sat(R2(w8)),
+ // ]
+ // where R1 is the destination register, and
+		// byte_sat(w) = int8(w) if w fits as a signed 8-bit integer,
+		//               0x80 (math.MinInt8) if w is less than math.MinInt8,
+		//               0x7F (math.MaxInt8) if w is greater than math.MaxInt8.
+ //
+ // See https://www.felixcloutier.com/x86/packsswb:packssdw for detail.
+ //
+		// Therefore, the mask register ends up having the i-th and (i+8)-th bits set if the i-th lane is negative (for i = 0..7).
+ tmp := m.copyToTmp(rm.reg())
+ res := m.c.AllocateVReg(ssa.TypeI32)
+
+ pak := m.allocateInstr()
+ pak.asXmmRmR(sseOpcodePacksswb, rm, tmp)
+ m.insert(pak)
+
+ mov := m.allocateInstr()
+ mov.asXmmToGpr(sseOpcodePmovmskb, tmp, res, false)
+ m.insert(mov)
+
+		// Shift right by 8 so that only 8 bits, one per 16-bit lane, remain.
+ shr := m.allocateInstr()
+ shr.asShiftR(shiftROpShiftRightLogical, newOperandImm32(8), res, false)
+ m.insert(shr)
+
+ m.copyTo(res, rd)
+
+ case ssa.VecLaneI32x4:
+ mov := m.allocateInstr()
+ mov.asXmmToGpr(sseOpcodeMovmskps, rm.reg(), rd, true)
+ m.insert(mov)
+
+ case ssa.VecLaneI64x2:
+ mov := m.allocateInstr()
+ mov.asXmmToGpr(sseOpcodeMovmskpd, rm.reg(), rd, true)
+ m.insert(mov)
+ }
+}
+
+func (m *machine) lowerVbnot(instr *ssa.Instruction) {
+ x := instr.Arg()
+ xDef := m.c.ValueDefinition(x)
+ rm := m.getOperand_Reg(xDef)
+ rd := m.c.VRegOf(instr.Return())
+
+ tmp := m.copyToTmp(rm.reg())
+ tmp2 := m.c.AllocateVReg(ssa.TypeV128)
+
+ // Ensure tmp2 is considered defined by regalloc.
+ m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2))
+
+	// Set all bits on the tmp2 register.
+ pak := m.allocateInstr()
+ pak.asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp2), tmp2)
+ m.insert(pak)
+
+	// Then XOR tmp with tmp2 (all ones) to invert every bit of tmp.
+ xor := m.allocateInstr()
+ xor.asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp)
+ m.insert(xor)
+
+ m.copyTo(tmp, rd)
+}
+
+func (m *machine) lowerSplat(x, ret ssa.Value, lane ssa.VecLane) {
+ tmpDst := m.c.AllocateVReg(ssa.TypeV128)
+ m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst))
+
+ switch lane {
+ case ssa.VecLaneI8x16:
+ tmp := m.c.AllocateVReg(ssa.TypeV128)
+ m.insert(m.allocateInstr().asDefineUninitializedReg(tmp))
+ xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, 0, xx, tmpDst))
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp))
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpDst))
+ case ssa.VecLaneI16x8:
+ xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 0, xx, tmpDst))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 1, xx, tmpDst))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst))
+ case ssa.VecLaneI32x4:
+ xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, 0, xx, tmpDst))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst))
+ case ssa.VecLaneI64x2:
+ xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, xx, tmpDst))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, xx, tmpDst))
+ case ssa.VecLaneF32x4:
+ xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeInsertps, 0, xx, tmpDst))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst))
+ case ssa.VecLaneF64x2:
+ xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, xx, tmpDst))
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMovlhps, xx, tmpDst))
+ default:
+ panic(fmt.Sprintf("invalid lane type: %s", lane))
+ }
+
+ m.copyTo(tmpDst, m.c.VRegOf(ret))
+}
+
+func (m *machine) lowerShuffle(x, y ssa.Value, lo, hi uint64, ret ssa.Value) {
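+	// lo and hi encode the 16 byte selectors of the shuffle, one byte per lane. They are split into two
+	// PSHUFB masks below: a selector that refers to the other operand is replaced by 0x80, because PSHUFB
+	// zeroes a destination byte whenever bit 7 of its selector byte is set. The two shuffled halves are
+	// then OR'ed together to form the result.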
+ var xMask, yMask [2]uint64
+ for i := 0; i < 8; i++ {
+ loLane := byte(lo >> (i * 8))
+ if loLane < 16 {
+ xMask[0] |= uint64(loLane) << (i * 8)
+ yMask[0] |= uint64(0x80) << (i * 8)
+ } else {
+ xMask[0] |= uint64(0x80) << (i * 8)
+ yMask[0] |= uint64(loLane-16) << (i * 8)
+ }
+ hiLane := byte(hi >> (i * 8))
+ if hiLane < 16 {
+ xMask[1] |= uint64(hiLane) << (i * 8)
+ yMask[1] |= uint64(0x80) << (i * 8)
+ } else {
+ xMask[1] |= uint64(0x80) << (i * 8)
+ yMask[1] |= uint64(hiLane-16) << (i * 8)
+ }
+ }
+
+ xmaskLabel := m.allocateLabel()
+ m.consts = append(m.consts, _const{lo: xMask[0], hi: xMask[1], label: xmaskLabel})
+ ymaskLabel := m.allocateLabel()
+ m.consts = append(m.consts, _const{lo: yMask[0], hi: yMask[1], label: ymaskLabel})
+
+ xx, yy := m.getOperand_Reg(m.c.ValueDefinition(x)), m.getOperand_Reg(m.c.ValueDefinition(y))
+ tmpX, tmpY := m.copyToTmp(xx.reg()), m.copyToTmp(yy.reg())
+
+ // Apply mask to X.
+ tmp := m.c.AllocateVReg(ssa.TypeV128)
+ loadMaskLo := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(xmaskLabel.L)), tmp)
+ m.insert(loadMaskLo)
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpX))
+
+ // Apply mask to Y.
+ loadMaskHi := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(ymaskLabel.L)), tmp)
+ m.insert(loadMaskHi)
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpY))
+
+ // Combine the results.
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeOrps, newOperandReg(tmpX), tmpY))
+
+ m.copyTo(tmpY, m.c.VRegOf(ret))
+}
+
+func (m *machine) lowerVbBinOpUnaligned(op sseOpcode, x, y, ret ssa.Value) {
+ rn := m.getOperand_Reg(m.c.ValueDefinition(x))
+ rm := m.getOperand_Reg(m.c.ValueDefinition(y))
+ rd := m.c.VRegOf(ret)
+
+ tmp := m.copyToTmp(rn.reg())
+
+ binOp := m.allocateInstr()
+ binOp.asXmmRmR(op, rm, tmp)
+ m.insert(binOp)
+
+ m.copyTo(tmp, rd)
+}
+
+func (m *machine) lowerVbBinOp(op sseOpcode, x, y, ret ssa.Value) {
+ rn := m.getOperand_Reg(m.c.ValueDefinition(x))
+ rm := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
+ rd := m.c.VRegOf(ret)
+
+ tmp := m.copyToTmp(rn.reg())
+
+ binOp := m.allocateInstr()
+ binOp.asXmmRmR(op, rm, tmp)
+ m.insert(binOp)
+
+ m.copyTo(tmp, rd)
+}
+
+func (m *machine) lowerVFcmp(x, y ssa.Value, c ssa.FloatCmpCond, ret ssa.Value, lane ssa.VecLane) {
+ var cmpOp sseOpcode
+ switch lane {
+ case ssa.VecLaneF32x4:
+ cmpOp = sseOpcodeCmpps
+ case ssa.VecLaneF64x2:
+ cmpOp = sseOpcodeCmppd
+ default:
+ panic(fmt.Sprintf("invalid lane type: %s", lane))
+ }
+
+ xx, yy := m.c.ValueDefinition(x), m.c.ValueDefinition(y)
+ var cmpImm cmpPred
+ switch c {
+ case ssa.FloatCmpCondGreaterThan:
+ yy, xx = xx, yy
+ cmpImm = cmpPredLT_OS
+ case ssa.FloatCmpCondGreaterThanOrEqual:
+ yy, xx = xx, yy
+ cmpImm = cmpPredLE_OS
+ case ssa.FloatCmpCondEqual:
+ cmpImm = cmpPredEQ_OQ
+ case ssa.FloatCmpCondNotEqual:
+ cmpImm = cmpPredNEQ_UQ
+ case ssa.FloatCmpCondLessThan:
+ cmpImm = cmpPredLT_OS
+ case ssa.FloatCmpCondLessThanOrEqual:
+ cmpImm = cmpPredLE_OS
+ default:
+ panic(fmt.Sprintf("invalid float comparison condition: %s", c))
+ }
+
+ tmp := m.c.AllocateVReg(ssa.TypeV128)
+ xxx := m.getOperand_Mem_Reg(xx)
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, xxx, tmp))
+
+ rm := m.getOperand_Mem_Reg(yy)
+ m.insert(m.allocateInstr().asXmmRmRImm(cmpOp, byte(cmpImm), rm, tmp))
+
+ m.copyTo(tmp, m.c.VRegOf(ret))
+}
+
+func (m *machine) lowerVIcmp(x, y ssa.Value, c ssa.IntegerCmpCond, ret ssa.Value, lane ssa.VecLane) {
+ var eq, gt, maxu, minu, mins sseOpcode
+ switch lane {
+ case ssa.VecLaneI8x16:
+ eq, gt, maxu, minu, mins = sseOpcodePcmpeqb, sseOpcodePcmpgtb, sseOpcodePmaxub, sseOpcodePminub, sseOpcodePminsb
+ case ssa.VecLaneI16x8:
+ eq, gt, maxu, minu, mins = sseOpcodePcmpeqw, sseOpcodePcmpgtw, sseOpcodePmaxuw, sseOpcodePminuw, sseOpcodePminsw
+ case ssa.VecLaneI32x4:
+ eq, gt, maxu, minu, mins = sseOpcodePcmpeqd, sseOpcodePcmpgtd, sseOpcodePmaxud, sseOpcodePminud, sseOpcodePminsd
+ case ssa.VecLaneI64x2:
+ eq, gt = sseOpcodePcmpeqq, sseOpcodePcmpgtq
+ default:
+ panic(fmt.Sprintf("invalid lane type: %s", lane))
+ }
+
+ tmp := m.c.AllocateVReg(ssa.TypeV128)
+ var op operand
+ switch c {
+ case ssa.IntegerCmpCondSignedLessThanOrEqual:
+ if lane == ssa.VecLaneI64x2 {
+ x := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
+ // Copy x to tmp.
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, x, tmp))
+ op = m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
+ } else {
+ y := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
+ // Copy y to tmp.
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, y, tmp))
+ op = m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
+ }
+ case ssa.IntegerCmpCondSignedGreaterThanOrEqual:
+ if lane == ssa.VecLaneI64x2 {
+ y := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
+ // Copy y to tmp.
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, y, tmp))
+ op = m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
+ } else {
+ x := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
+ // Copy x to tmp.
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, x, tmp))
+ op = m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
+ }
+ case ssa.IntegerCmpCondSignedLessThan, ssa.IntegerCmpCondUnsignedLessThan, ssa.IntegerCmpCondUnsignedLessThanOrEqual:
+ y := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
+ // Copy y to tmp.
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, y, tmp))
+ op = m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
+ default:
+ x := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
+ // Copy x to tmp.
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, x, tmp))
+ op = m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
+ }
+
+ switch c {
+ case ssa.IntegerCmpCondEqual:
+ m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp))
+ case ssa.IntegerCmpCondNotEqual:
+ // First we compare for equality.
+ m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp))
+ // Then flip the bits. To do so, we set all bits on tmp2.
+ tmp2 := m.c.AllocateVReg(ssa.TypeV128)
+ m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2))
+ m.insert(m.allocateInstr().asXmmRmR(eq, newOperandReg(tmp2), tmp2))
+ // And then xor with tmp.
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp))
+ case ssa.IntegerCmpCondSignedGreaterThan, ssa.IntegerCmpCondSignedLessThan:
+ m.insert(m.allocateInstr().asXmmRmR(gt, op, tmp))
+ case ssa.IntegerCmpCondSignedGreaterThanOrEqual, ssa.IntegerCmpCondSignedLessThanOrEqual:
+ if lane == ssa.VecLaneI64x2 {
+ m.insert(m.allocateInstr().asXmmRmR(gt, op, tmp))
+ // Then flip the bits. To do so, we set all bits on tmp2.
+ tmp2 := m.c.AllocateVReg(ssa.TypeV128)
+ m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2))
+ m.insert(m.allocateInstr().asXmmRmR(eq, newOperandReg(tmp2), tmp2))
+ // And then xor with tmp.
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp))
+ } else {
+ // First take min of x and y.
+ m.insert(m.allocateInstr().asXmmRmR(mins, op, tmp))
+ // Then compare for equality.
+ m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp))
+ }
+ case ssa.IntegerCmpCondUnsignedGreaterThan, ssa.IntegerCmpCondUnsignedLessThan:
+ // First maxu of x and y.
+ m.insert(m.allocateInstr().asXmmRmR(maxu, op, tmp))
+ // Then compare for equality.
+ m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp))
+ // Then flip the bits. To do so, we set all bits on tmp2.
+ tmp2 := m.c.AllocateVReg(ssa.TypeV128)
+ m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2))
+ m.insert(m.allocateInstr().asXmmRmR(eq, newOperandReg(tmp2), tmp2))
+ // And then xor with tmp.
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp))
+ case ssa.IntegerCmpCondUnsignedGreaterThanOrEqual, ssa.IntegerCmpCondUnsignedLessThanOrEqual:
+ m.insert(m.allocateInstr().asXmmRmR(minu, op, tmp))
+ m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp))
+ default:
+ panic("BUG")
+ }
+
+ m.copyTo(tmp, m.c.VRegOf(ret))
+}
+
+func (m *machine) lowerVbandnot(instr *ssa.Instruction, op sseOpcode) {
+ x, y := instr.Arg2()
+ xDef := m.c.ValueDefinition(x)
+ yDef := m.c.ValueDefinition(y)
+ rm, rn := m.getOperand_Reg(xDef), m.getOperand_Reg(yDef)
+ rd := m.c.VRegOf(instr.Return())
+
+ tmp := m.copyToTmp(rn.reg())
+
+	// PANDN computes tmp = ^tmp & rm, i.e. x AND NOT y.
+ pand := m.allocateInstr()
+ pand.asXmmRmR(sseOpcodePandn, rm, tmp)
+ m.insert(pand)
+
+ m.copyTo(tmp, rd)
+}
+
+func (m *machine) lowerVbitselect(instr *ssa.Instruction) {
+ c, x, y := instr.SelectData()
+ xDef := m.c.ValueDefinition(x)
+ yDef := m.c.ValueDefinition(y)
+ rm, rn := m.getOperand_Reg(xDef), m.getOperand_Reg(yDef)
+ creg := m.getOperand_Reg(m.c.ValueDefinition(c))
+ rd := m.c.VRegOf(instr.Return())
+
+ tmpC := m.copyToTmp(creg.reg())
+ tmpX := m.copyToTmp(rm.reg())
+
+ // And between c, x (overwrites x).
+ pand := m.allocateInstr()
+ pand.asXmmRmR(sseOpcodePand, creg, tmpX)
+ m.insert(pand)
+
+ // Andn between y, c (overwrites c).
+ pandn := m.allocateInstr()
+ pandn.asXmmRmR(sseOpcodePandn, rn, tmpC)
+ m.insert(pandn)
+
+ por := m.allocateInstr()
+ por.asXmmRmR(sseOpcodePor, newOperandReg(tmpC), tmpX)
+ m.insert(por)
+
+ m.copyTo(tmpX, rd)
+}
+
+func (m *machine) lowerVFmin(instr *ssa.Instruction) {
+ x, y, lane := instr.Arg2WithLane()
+ rn := m.getOperand_Reg(m.c.ValueDefinition(x))
+ rm := m.getOperand_Reg(m.c.ValueDefinition(y))
+ rd := m.c.VRegOf(instr.Return())
+
+ var min, cmp, andn, or, srl /* shift right logical */ sseOpcode
+ var shiftNumToInverseNaN uint32
+ if lane == ssa.VecLaneF32x4 {
+ min, cmp, andn, or, srl, shiftNumToInverseNaN = sseOpcodeMinps, sseOpcodeCmpps, sseOpcodeAndnps, sseOpcodeOrps, sseOpcodePsrld, 0xa
+ } else {
+ min, cmp, andn, or, srl, shiftNumToInverseNaN = sseOpcodeMinpd, sseOpcodeCmppd, sseOpcodeAndnpd, sseOpcodeOrpd, sseOpcodePsrlq, 0xd
+ }
+
+ tmp1 := m.copyToTmp(rn.reg())
+ tmp2 := m.copyToTmp(rm.reg())
+
+	// tmp2 = min(tmp2, rn) = min(rm, rn)
+ minIns1 := m.allocateInstr()
+ minIns1.asXmmRmR(min, rn, tmp2)
+ m.insert(minIns1)
+
+	// tmp1 = min(tmp1, rm) = min(rn, rm)
+ minIns2 := m.allocateInstr()
+ minIns2.asXmmRmR(min, rm, tmp1)
+ m.insert(minIns2)
+
+ // tmp3:=tmp1=min(rn, rm)
+ tmp3 := m.copyToTmp(tmp1)
+
+	// tmp1 = -0          if (rn == -0 || rm == -0) && rn != NaN && rm != NaN
+	//        NaN         if rn == NaN || rm == NaN
+	//        min(rn, rm) otherwise
+ orIns := m.allocateInstr()
+ orIns.asXmmRmR(or, newOperandReg(tmp2), tmp1)
+ m.insert(orIns)
+
+ // tmp3 is originally min(rn,rm).
+	// tmp3 = ^0 (all bits set) if rn == NaN || rm == NaN
+ // 0 otherwise
+ cmpIns := m.allocateInstr()
+ cmpIns.asXmmRmRImm(cmp, uint8(cmpPredUNORD_Q), newOperandReg(tmp2), tmp3)
+ m.insert(cmpIns)
+
+	// tmp1 = -0          if (rn == -0 || rm == -0) && rn != NaN && rm != NaN
+	//        ^0          if rn == NaN || rm == NaN
+	//        min(rn, rm) otherwise
+ orIns2 := m.allocateInstr()
+ orIns2.asXmmRmR(or, newOperandReg(tmp3), tmp1)
+ m.insert(orIns2)
+
+	// tmp3 = all bits set on the lower (mantissa) bits if rn == NaN || rm == NaN
+	//        0                                         otherwise
+ shift := m.allocateInstr()
+ shift.asXmmRmiReg(srl, newOperandImm32(shiftNumToInverseNaN), tmp3)
+ m.insert(shift)
+
+ // tmp3 = tmp1 and !tmp3
+	//      = -0                                              if (rn == -0 || rm == -0) && rn != NaN && rm != NaN
+	//        all bits set on exponent and sign bit (== NaN)  if rn == NaN || rm == NaN
+ // min(rn, rm) otherwise
+ andnIns := m.allocateInstr()
+ andnIns.asXmmRmR(andn, newOperandReg(tmp1), tmp3)
+ m.insert(andnIns)
+
+ m.copyTo(tmp3, rd)
+}
+
+func (m *machine) lowerVFmax(instr *ssa.Instruction) {
+ x, y, lane := instr.Arg2WithLane()
+ rn := m.getOperand_Reg(m.c.ValueDefinition(x))
+ rm := m.getOperand_Reg(m.c.ValueDefinition(y))
+ rd := m.c.VRegOf(instr.Return())
+
+ var max, cmp, andn, or, xor, sub, srl /* shift right logical */ sseOpcode
+ var shiftNumToInverseNaN uint32
+ if lane == ssa.VecLaneF32x4 {
+ max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = sseOpcodeMaxps, sseOpcodeCmpps, sseOpcodeAndnps, sseOpcodeOrps, sseOpcodeXorps, sseOpcodeSubps, sseOpcodePsrld, 0xa
+ } else {
+ max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = sseOpcodeMaxpd, sseOpcodeCmppd, sseOpcodeAndnpd, sseOpcodeOrpd, sseOpcodeXorpd, sseOpcodeSubpd, sseOpcodePsrlq, 0xd
+ }
+
+ tmp0 := m.copyToTmp(rm.reg())
+ tmp1 := m.copyToTmp(rn.reg())
+
+	// tmp0 = max(tmp0, rn) = max(rm, rn)
+ maxIns1 := m.allocateInstr()
+ maxIns1.asXmmRmR(max, rn, tmp0)
+ m.insert(maxIns1)
+
+	// tmp1 = max(tmp1, rm) = max(rn, rm)
+ maxIns2 := m.allocateInstr()
+ maxIns2.asXmmRmR(max, rm, tmp1)
+ m.insert(maxIns2)
+
+	// tmp2 := tmp1 = max(rn, rm)
+ tmp2 := m.copyToTmp(tmp1)
+
+ // tmp2 = -0 if (rn == -0 && rm == 0) || (rn == 0 && rm == -0)
+ // 0 if (rn == 0 && rm == 0)
+ // -0 if (rn == -0 && rm == -0)
+	//        rn ^ rm if rn == NaN || rm == NaN
+ // 0 otherwise
+ xorInstr := m.allocateInstr()
+ xorInstr.asXmmRmR(xor, newOperandReg(tmp0), tmp2)
+ m.insert(xorInstr)
+ // tmp1 = -0 if (rn == -0 && rm == 0) || (rn == 0 && rm == -0)
+ // 0 if (rn == 0 && rm == 0)
+ // -0 if (rn == -0 && rm == -0)
+ // NaN if rn == NaN || rm == NaN
+	//        max(rn, rm) otherwise
+ orInstr := m.allocateInstr()
+ orInstr.asXmmRmR(or, newOperandReg(tmp2), tmp1)
+ m.insert(orInstr)
+
+ tmp3 := m.copyToTmp(tmp1)
+
+ // tmp3 = 0 if (rn == -0 && rm == 0) || (rn == 0 && rm == -0) || (rn == 0 && rm == 0)
+ // -0 if (rn == -0 && rm == -0)
+ // NaN if rn == NaN || rm == NaN
+	//        max(rn, rm) otherwise
+ //
+ // Note: -0 - (-0) = 0 (!= -0) in floating point operation.
+ subIns := m.allocateInstr()
+ subIns.asXmmRmR(sub, newOperandReg(tmp2), tmp3)
+ m.insert(subIns)
+
+	// tmp1 = ^0 (all bits set) if rn == NaN || rm == NaN
+ cmpIns := m.allocateInstr()
+ cmpIns.asXmmRmRImm(cmp, uint8(cmpPredUNORD_Q), newOperandReg(tmp1), tmp1)
+ m.insert(cmpIns)
+
+	// tmp1 = all bits set on the lower (mantissa) bits if rn == NaN || rm == NaN
+	//        0                                         otherwise
+ shift := m.allocateInstr()
+ shift.asXmmRmiReg(srl, newOperandImm32(shiftNumToInverseNaN), tmp1)
+ m.insert(shift)
+
+ andnIns := m.allocateInstr()
+ andnIns.asXmmRmR(andn, newOperandReg(tmp3), tmp1)
+ m.insert(andnIns)
+
+ m.copyTo(tmp1, rd)
+}
+
+func (m *machine) lowerVFabs(instr *ssa.Instruction) {
+ x, lane := instr.ArgWithLane()
+ rm := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
+ rd := m.c.VRegOf(instr.Return())
+
+ tmp := m.c.AllocateVReg(ssa.TypeV128)
+
+ def := m.allocateInstr()
+ def.asDefineUninitializedReg(tmp)
+ m.insert(def)
+
+ // Set all bits on tmp.
+ pcmp := m.allocateInstr()
+ pcmp.asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp), tmp)
+ m.insert(pcmp)
+
+ switch lane {
+ case ssa.VecLaneF32x4:
+		// Shift each 32-bit lane of the all-ones tmp right by 1 to build the 0x7fffffff mask.
+ shift := m.allocateInstr()
+ shift.asXmmRmiReg(sseOpcodePsrld, newOperandImm32(1), tmp)
+ m.insert(shift)
+ // Clear the sign bit of rm.
+ andp := m.allocateInstr()
+ andp.asXmmRmR(sseOpcodeAndpd, rm, tmp)
+ m.insert(andp)
+ case ssa.VecLaneF64x2:
+		// Shift each 64-bit lane of the all-ones tmp right by 1 to build the 0x7fffffff_ffffffff mask.
+ shift := m.allocateInstr()
+ shift.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(1), tmp)
+ m.insert(shift)
+ // Clear the sign bit of rm.
+ andp := m.allocateInstr()
+ andp.asXmmRmR(sseOpcodeAndps, rm, tmp)
+ m.insert(andp)
+ }
+
+ m.copyTo(tmp, rd)
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_pro_epi_logue.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_pro_epi_logue.go
new file mode 100644
index 000000000..8fa974c66
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_pro_epi_logue.go
@@ -0,0 +1,304 @@
+package amd64
+
+import (
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+)
+
+// PostRegAlloc implements backend.Machine.
+func (m *machine) PostRegAlloc() {
+ m.setupPrologue()
+ m.postRegAlloc()
+}
+
+func (m *machine) setupPrologue() {
+ cur := m.ectx.RootInstr
+ prevInitInst := cur.next
+
+ // At this point, we have the stack layout as follows:
+ //
+ // (high address)
+ // +-----------------+ <----- RBP (somewhere in the middle of the stack)
+ // | ....... |
+ // | ret Y |
+ // | ....... |
+ // | ret 0 |
+ // | arg X |
+ // | ....... |
+ // | arg 1 |
+ // | arg 0 |
+ // | Return Addr |
+ // RSP ----> +-----------------+
+ // (low address)
+
+ // First, we push the RBP, and update the RBP to the current RSP.
+ //
+ // (high address) (high address)
+ // RBP ----> +-----------------+ +-----------------+
+ // | ....... | | ....... |
+ // | ret Y | | ret Y |
+ // | ....... | | ....... |
+ // | ret 0 | | ret 0 |
+ // | arg X | | arg X |
+ // | ....... | ====> | ....... |
+ // | arg 1 | | arg 1 |
+ // | arg 0 | | arg 0 |
+ // | Return Addr | | Return Addr |
+ // RSP ----> +-----------------+ | Caller_RBP |
+ // (low address) +-----------------+ <----- RSP, RBP
+ //
+ cur = m.setupRBPRSP(cur)
+
+ if !m.stackBoundsCheckDisabled {
+ cur = m.insertStackBoundsCheck(m.requiredStackSize(), cur)
+ }
+
+ //
+ // (high address)
+ // +-----------------+ +-----------------+
+ // | ....... | | ....... |
+ // | ret Y | | ret Y |
+ // | ....... | | ....... |
+ // | ret 0 | | ret 0 |
+ // | arg X | | arg X |
+ // | ....... | | ....... |
+ // | arg 1 | | arg 1 |
+ // | arg 0 | | arg 0 |
+ // | xxxxx | | xxxxx |
+ // | Return Addr | | Return Addr |
+ // | Caller_RBP | ====> | Caller_RBP |
+ // RBP,RSP->+-----------------+ +-----------------+ <----- RBP
+ // (low address) | clobbered M |
+ // | clobbered 1 |
+ // | ........... |
+ // | clobbered 0 |
+ // +-----------------+ <----- RSP
+ //
+ if regs := m.clobberedRegs; len(regs) > 0 {
+ for i := range regs {
+ r := regs[len(regs)-1-i] // Reverse order.
+ if r.RegType() == regalloc.RegTypeInt {
+ cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(r)))
+ } else {
+				// Pushing an XMM register is not supported by the PUSH instruction, so store it via MOVDQU instead.
+ cur = m.addRSP(-16, cur)
+ push := m.allocateInstr().asXmmMovRM(
+ sseOpcodeMovdqu, r, newOperandMem(m.newAmodeImmReg(0, rspVReg)),
+ )
+ cur = linkInstr(cur, push)
+ }
+ }
+ }
+
+ if size := m.spillSlotSize; size > 0 {
+ // Simply decrease the RSP to allocate the spill slots.
+ // sub $size, %rsp
+ cur = linkInstr(cur, m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(uint32(size)), rspVReg, true))
+
+ // At this point, we have the stack layout as follows:
+ //
+ // (high address)
+ // +-----------------+
+ // | ....... |
+ // | ret Y |
+ // | ....... |
+ // | ret 0 |
+ // | arg X |
+ // | ....... |
+ // | arg 1 |
+ // | arg 0 |
+ // | ReturnAddress |
+ // | Caller_RBP |
+ // +-----------------+ <--- RBP
+ // | clobbered M |
+ // | ............ |
+ // | clobbered 1 |
+ // | clobbered 0 |
+ // | spill slot N |
+ // | ............ |
+ // | spill slot 0 |
+ // +-----------------+ <--- RSP
+ // (low address)
+ }
+
+ linkInstr(cur, prevInitInst)
+}
+
+// postRegAlloc does multiple things while walking through the instructions:
+// 1. Inserts the epilogue code.
+// 2. Removes redundant copy instructions.
+// 3. Inserts the RSP decrement/increment right before/after each call instruction.
+// 4. Performs the lowerings that must happen after register allocation.
+func (m *machine) postRegAlloc() {
+ ectx := m.ectx
+ for cur := ectx.RootInstr; cur != nil; cur = cur.next {
+ switch k := cur.kind; k {
+ case ret:
+ m.setupEpilogueAfter(cur.prev)
+ continue
+ case fcvtToSintSequence, fcvtToUintSequence:
+ m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0]
+ if k == fcvtToSintSequence {
+ m.lowerFcvtToSintSequenceAfterRegalloc(cur)
+ } else {
+ m.lowerFcvtToUintSequenceAfterRegalloc(cur)
+ }
+ prev := cur.prev
+ next := cur.next
+ cur := prev
+ for _, instr := range m.ectx.PendingInstructions {
+ cur = linkInstr(cur, instr)
+ }
+ linkInstr(cur, next)
+ continue
+ case xmmCMov:
+ m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0]
+ m.lowerXmmCmovAfterRegAlloc(cur)
+ prev := cur.prev
+ next := cur.next
+ cur := prev
+ for _, instr := range m.ectx.PendingInstructions {
+ cur = linkInstr(cur, instr)
+ }
+ linkInstr(cur, next)
+ continue
+ case idivRemSequence:
+ m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0]
+ m.lowerIDivRemSequenceAfterRegAlloc(cur)
+ prev := cur.prev
+ next := cur.next
+ cur := prev
+ for _, instr := range m.ectx.PendingInstructions {
+ cur = linkInstr(cur, instr)
+ }
+ linkInstr(cur, next)
+ continue
+ case call, callIndirect:
+			// At this point, reg alloc is done, therefore we can safely insert the RSP decrement/increment
+			// right before/after the call instruction. If this were done before reg alloc, the stack slot
+			// offsets could point to the wrong location and therefore result in a wrong value.
+ call := cur
+ next := call.next
+ _, _, _, _, size := backend.ABIInfoFromUint64(call.u2)
+ if size > 0 {
+ dec := m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(size), rspVReg, true)
+ linkInstr(call.prev, dec)
+ linkInstr(dec, call)
+ inc := m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(size), rspVReg, true)
+ linkInstr(call, inc)
+ linkInstr(inc, next)
+ }
+ continue
+ }
+
+ // Removes the redundant copy instruction.
+ if cur.IsCopy() && cur.op1.reg().RealReg() == cur.op2.reg().RealReg() {
+ prev, next := cur.prev, cur.next
+ // Remove the copy instruction.
+ prev.next = next
+ if next != nil {
+ next.prev = prev
+ }
+ }
+ }
+}
+
+func (m *machine) setupEpilogueAfter(cur *instruction) {
+ prevNext := cur.next
+
+ // At this point, we have the stack layout as follows:
+ //
+ // (high address)
+ // +-----------------+
+ // | ....... |
+ // | ret Y |
+ // | ....... |
+ // | ret 0 |
+ // | arg X |
+ // | ....... |
+ // | arg 1 |
+ // | arg 0 |
+ // | ReturnAddress |
+ // | Caller_RBP |
+ // +-----------------+ <--- RBP
+ // | clobbered M |
+ // | ............ |
+ // | clobbered 1 |
+ // | clobbered 0 |
+ // | spill slot N |
+ // | ............ |
+ // | spill slot 0 |
+ // +-----------------+ <--- RSP
+ // (low address)
+
+ if size := m.spillSlotSize; size > 0 {
+ // Simply increase the RSP to free the spill slots.
+ // add $size, %rsp
+ cur = linkInstr(cur, m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(uint32(size)), rspVReg, true))
+ }
+
+ //
+ // (high address)
+ // +-----------------+ +-----------------+
+ // | ....... | | ....... |
+ // | ret Y | | ret Y |
+ // | ....... | | ....... |
+ // | ret 0 | | ret 0 |
+ // | arg X | | arg X |
+ // | ....... | | ....... |
+ // | arg 1 | | arg 1 |
+ // | arg 0 | | arg 0 |
+ // | ReturnAddress | | ReturnAddress |
+ // | Caller_RBP | | Caller_RBP |
+ // RBP ---> +-----------------+ ========> +-----------------+ <---- RSP, RBP
+ // | clobbered M |
+ // | ............ |
+ // | clobbered 1 |
+ // | clobbered 0 |
+ // RSP ---> +-----------------+
+ // (low address)
+ //
+ if regs := m.clobberedRegs; len(regs) > 0 {
+ for _, r := range regs {
+ if r.RegType() == regalloc.RegTypeInt {
+ cur = linkInstr(cur, m.allocateInstr().asPop64(r))
+ } else {
+				// Popping an XMM register is not supported by the POP instruction, so load it via MOVDQU instead.
+ pop := m.allocateInstr().asXmmUnaryRmR(
+ sseOpcodeMovdqu, newOperandMem(m.newAmodeImmReg(0, rspVReg)), r,
+ )
+ cur = linkInstr(cur, pop)
+ cur = m.addRSP(16, cur)
+ }
+ }
+ }
+
+ // Now roll back the RSP to RBP, and pop the caller's RBP.
+ cur = m.revertRBPRSP(cur)
+
+ linkInstr(cur, prevNext)
+}
+
+func (m *machine) addRSP(offset int32, cur *instruction) *instruction {
+ if offset == 0 {
+ return cur
+ }
+ opcode := aluRmiROpcodeAdd
+ if offset < 0 {
+ opcode = aluRmiROpcodeSub
+ offset = -offset
+ }
+ return linkInstr(cur, m.allocateInstr().asAluRmiR(opcode, newOperandImm32(uint32(offset)), rspVReg, true))
+}
+
+func (m *machine) setupRBPRSP(cur *instruction) *instruction {
+ cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(rbpVReg)))
+ cur = linkInstr(cur, m.allocateInstr().asMovRR(rspVReg, rbpVReg, true))
+ return cur
+}
+
+func (m *machine) revertRBPRSP(cur *instruction) *instruction {
+ cur = linkInstr(cur, m.allocateInstr().asMovRR(rbpVReg, rspVReg, true))
+ cur = linkInstr(cur, m.allocateInstr().asPop64(rbpVReg))
+ return cur
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_regalloc.go
new file mode 100644
index 000000000..0bb28ee9e
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_regalloc.go
@@ -0,0 +1,153 @@
+package amd64
+
+import (
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+)
+
+// InsertMoveBefore implements backend.RegAllocFunctionMachine.
+func (m *machine) InsertMoveBefore(dst, src regalloc.VReg, instr *instruction) {
+ typ := src.RegType()
+ if typ != dst.RegType() {
+ panic("BUG: src and dst must have the same type")
+ }
+
+ mov := m.allocateInstr()
+ if typ == regalloc.RegTypeInt {
+ mov.asMovRR(src, dst, true)
+ } else {
+ mov.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(src), dst)
+ }
+
+ cur := instr.prev
+ prevNext := cur.next
+ cur = linkInstr(cur, mov)
+ linkInstr(cur, prevNext)
+}
+
+// InsertStoreRegisterAt implements backend.RegAllocFunctionMachine.
+func (m *machine) InsertStoreRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction {
+ if !v.IsRealReg() {
+ panic("BUG: VReg must be backed by real reg to be stored")
+ }
+
+ typ := m.c.TypeOf(v)
+
+ var prevNext, cur *instruction
+ if after {
+ cur, prevNext = instr, instr.next
+ } else {
+ cur, prevNext = instr.prev, instr
+ }
+
+ offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size())
+ store := m.allocateInstr()
+ mem := newOperandMem(m.newAmodeImmReg(uint32(offsetFromSP), rspVReg))
+ switch typ {
+ case ssa.TypeI32:
+ store.asMovRM(v, mem, 4)
+ case ssa.TypeI64:
+ store.asMovRM(v, mem, 8)
+ case ssa.TypeF32:
+ store.asXmmMovRM(sseOpcodeMovss, v, mem)
+ case ssa.TypeF64:
+ store.asXmmMovRM(sseOpcodeMovsd, v, mem)
+ case ssa.TypeV128:
+ store.asXmmMovRM(sseOpcodeMovdqu, v, mem)
+ }
+
+ cur = linkInstr(cur, store)
+ return linkInstr(cur, prevNext)
+}
+
+// InsertReloadRegisterAt implements backend.RegAllocFunctionMachine.
+func (m *machine) InsertReloadRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction {
+ if !v.IsRealReg() {
+ panic("BUG: VReg must be backed by real reg to be stored")
+ }
+
+ typ := m.c.TypeOf(v)
+ var prevNext, cur *instruction
+ if after {
+ cur, prevNext = instr, instr.next
+ } else {
+ cur, prevNext = instr.prev, instr
+ }
+
+ // Load the value to the temporary.
+ load := m.allocateInstr()
+ offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size())
+ a := newOperandMem(m.newAmodeImmReg(uint32(offsetFromSP), rspVReg))
+ switch typ {
+ case ssa.TypeI32:
+ load.asMovzxRmR(extModeLQ, a, v)
+ case ssa.TypeI64:
+ load.asMov64MR(a, v)
+ case ssa.TypeF32:
+ load.asXmmUnaryRmR(sseOpcodeMovss, a, v)
+ case ssa.TypeF64:
+ load.asXmmUnaryRmR(sseOpcodeMovsd, a, v)
+ case ssa.TypeV128:
+ load.asXmmUnaryRmR(sseOpcodeMovdqu, a, v)
+ default:
+ panic("BUG")
+ }
+
+ cur = linkInstr(cur, load)
+ return linkInstr(cur, prevNext)
+}
+
+// ClobberedRegisters implements backend.RegAllocFunctionMachine.
+func (m *machine) ClobberedRegisters(regs []regalloc.VReg) {
+ m.clobberedRegs = append(m.clobberedRegs[:0], regs...)
+}
+
+// Swap implements backend.RegAllocFunctionMachine.
+func (m *machine) Swap(cur *instruction, x1, x2, tmp regalloc.VReg) {
+ if x1.RegType() == regalloc.RegTypeInt {
+ prevNext := cur.next
+ xc := m.allocateInstr().asXCHG(x1, newOperandReg(x2), 8)
+ cur = linkInstr(cur, xc)
+ linkInstr(cur, prevNext)
+ } else {
+ if tmp.Valid() {
+ prevNext := cur.next
+ m.InsertMoveBefore(tmp, x1, prevNext)
+ m.InsertMoveBefore(x1, x2, prevNext)
+ m.InsertMoveBefore(x2, tmp, prevNext)
+ } else {
+ prevNext := cur.next
+ r2 := x2.RealReg()
+ // Temporarily spill x1 to stack.
+ cur = m.InsertStoreRegisterAt(x1, cur, true).prev
+ // Then move x2 to x1.
+ cur = linkInstr(cur, m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqa, newOperandReg(x2), x1))
+ linkInstr(cur, prevNext)
+ // Then reload the original value on x1 from stack to r2.
+ m.InsertReloadRegisterAt(x1.SetRealReg(r2), cur, true)
+ }
+ }
+}
+
+// LastInstrForInsertion implements backend.RegAllocFunctionMachine.
+func (m *machine) LastInstrForInsertion(begin, end *instruction) *instruction {
+ cur := end
+ for cur.kind == nop0 {
+ cur = cur.prev
+ if cur == begin {
+ return end
+ }
+ }
+ switch cur.kind {
+ case jmp:
+ return cur
+ default:
+ return end
+ }
+}
+
+// SSABlockLabel implements backend.RegAllocFunctionMachine.
+func (m *machine) SSABlockLabel(id ssa.BasicBlockID) backend.Label {
+ return m.ectx.SsaBlockIDToLabels[id]
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_vec.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_vec.go
new file mode 100644
index 000000000..539a8b754
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_vec.go
@@ -0,0 +1,992 @@
+package amd64
+
+import (
+ "fmt"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+)
+
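+// swizzleMask is added to the index vector with a saturating byte add (PADDUSB): indices 0..15 keep their
+// low nibble intact, while any index >= 16 ends up with bit 7 set (saturating at 0xff), which PSHUFB then
+// treats as "zero this byte". This matches the Wasm swizzle semantics for out-of-range indices.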
+var swizzleMask = [16]byte{
+ 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+ 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+}
+
+func (m *machine) lowerSwizzle(x, y ssa.Value, ret ssa.Value) {
+ masklabel := m.getOrAllocateConstLabel(&m.constSwizzleMaskConstIndex, swizzleMask[:])
+
+ // Load mask to maskReg.
+ maskReg := m.c.AllocateVReg(ssa.TypeV128)
+ loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(masklabel)), maskReg)
+ m.insert(loadMask)
+
+ // Copy x and y to tmp registers.
+ xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+ tmpDst := m.copyToTmp(xx.reg())
+ yy := m.getOperand_Reg(m.c.ValueDefinition(y))
+ tmpX := m.copyToTmp(yy.reg())
+
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddusb, newOperandReg(maskReg), tmpX))
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmpX), tmpDst))
+
+ // Copy the result to the destination register.
+ m.copyTo(tmpDst, m.c.VRegOf(ret))
+}
+
+func (m *machine) lowerInsertLane(x, y ssa.Value, index byte, ret ssa.Value, lane ssa.VecLane) {
+ // Copy x to tmp.
+ tmpDst := m.c.AllocateVReg(ssa.TypeV128)
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, m.getOperand_Mem_Reg(m.c.ValueDefinition(x)), tmpDst))
+
+ yy := m.getOperand_Reg(m.c.ValueDefinition(y))
+ switch lane {
+ case ssa.VecLaneI8x16:
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, index, yy, tmpDst))
+ case ssa.VecLaneI16x8:
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, index, yy, tmpDst))
+ case ssa.VecLaneI32x4:
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, index, yy, tmpDst))
+ case ssa.VecLaneI64x2:
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, index, yy, tmpDst))
+ case ssa.VecLaneF32x4:
+		// For INSERTPS, the destination lane index is encoded in bits 5:4 of the immediate.
+ // See https://www.felixcloutier.com/x86/insertps
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeInsertps, index<<4, yy, tmpDst))
+ case ssa.VecLaneF64x2:
+ if index == 0 {
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, yy, tmpDst))
+ } else {
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMovlhps, yy, tmpDst))
+ }
+ default:
+ panic(fmt.Sprintf("invalid lane type: %s", lane))
+ }
+
+ m.copyTo(tmpDst, m.c.VRegOf(ret))
+}
+
+func (m *machine) lowerExtractLane(x ssa.Value, index byte, signed bool, ret ssa.Value, lane ssa.VecLane) {
+ // Pextr variants are used to extract a lane from a vector register.
+ xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+
+ tmpDst := m.c.AllocateVReg(ret.Type())
+ m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst))
+ switch lane {
+ case ssa.VecLaneI8x16:
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrb, index, xx, tmpDst))
+ if signed {
+ m.insert(m.allocateInstr().asMovsxRmR(extModeBL, newOperandReg(tmpDst), tmpDst))
+ } else {
+ m.insert(m.allocateInstr().asMovzxRmR(extModeBL, newOperandReg(tmpDst), tmpDst))
+ }
+ case ssa.VecLaneI16x8:
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrw, index, xx, tmpDst))
+ if signed {
+ m.insert(m.allocateInstr().asMovsxRmR(extModeWL, newOperandReg(tmpDst), tmpDst))
+ } else {
+ m.insert(m.allocateInstr().asMovzxRmR(extModeWL, newOperandReg(tmpDst), tmpDst))
+ }
+ case ssa.VecLaneI32x4:
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrd, index, xx, tmpDst))
+ case ssa.VecLaneI64x2:
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, index, xx, tmpDst))
+ case ssa.VecLaneF32x4:
+ if index == 0 {
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovss, xx, tmpDst))
+ } else {
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, index, xx, tmpDst))
+ }
+ case ssa.VecLaneF64x2:
+ if index == 0 {
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, xx, tmpDst))
+ } else {
+ m.copyTo(xx.reg(), tmpDst)
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0b00_00_11_10, newOperandReg(tmpDst), tmpDst))
+ }
+ default:
+ panic(fmt.Sprintf("invalid lane type: %s", lane))
+ }
+
+ m.copyTo(tmpDst, m.c.VRegOf(ret))
+}
+
+var sqmulRoundSat = [16]byte{
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+}
+
+func (m *machine) lowerSqmulRoundSat(x, y, ret ssa.Value) {
+ // See https://github.com/WebAssembly/simd/pull/365 for the following logic.
+ maskLabel := m.getOrAllocateConstLabel(&m.constSqmulRoundSatIndex, sqmulRoundSat[:])
+
+ tmp := m.c.AllocateVReg(ssa.TypeV128)
+ loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), tmp)
+ m.insert(loadMask)
+
+ xx, yy := m.getOperand_Reg(m.c.ValueDefinition(x)), m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
+ tmpX := m.copyToTmp(xx.reg())
+
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmulhrsw, yy, tmpX))
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmpX), tmp))
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmpX))
+
+ m.copyTo(tmpX, m.c.VRegOf(ret))
+}
+
+func (m *machine) lowerVUshr(x, y, ret ssa.Value, lane ssa.VecLane) {
+ switch lane {
+ case ssa.VecLaneI8x16:
+ m.lowerVUshri8x16(x, y, ret)
+ case ssa.VecLaneI16x8, ssa.VecLaneI32x4, ssa.VecLaneI64x2:
+ m.lowerShr(x, y, ret, lane, false)
+ default:
+ panic(fmt.Sprintf("invalid lane type: %s", lane))
+ }
+}
+
+// i8x16LogicalSHRMaskTable is necessary for emulating the packed byte logical right shift, which does not
+// exist on amd64. The mask is applied after performing packed word shifts on the value to clear out the
+// bits that leaked in from neighboring bytes.
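+// For example, shifting right by 3 with PSRLW shifts whole 16-bit words, so the top 3 bits of the low
+// byte of each word receive bits from its neighboring high byte; ANDing with the "for 3 shift" row
+// (0x1f repeated) clears those leaked bits.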
+var i8x16LogicalSHRMaskTable = [8 * 16]byte{ // (the number of possible shift amount 0, 1, ..., 7.) * 16 bytes.
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // for 1 shift
+ 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, // for 2 shift
+ 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, // for 3 shift
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, // for 4 shift
+ 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, // for 5 shift
+ 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, // for 6 shift
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // for 7 shift
+}
+
+func (m *machine) lowerVUshri8x16(x, y, ret ssa.Value) {
+ tmpGpReg := m.c.AllocateVReg(ssa.TypeI32)
+ // Load the modulo 8 mask to tmpReg.
+ m.lowerIconst(tmpGpReg, 0x7, false)
+ // Take the modulo 8 of the shift amount.
+ shiftAmt := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y))
+ m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, shiftAmt, tmpGpReg, false))
+
+ _xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+ xx := m.copyToTmp(_xx.reg())
+
+ vecTmp := m.c.AllocateVReg(ssa.TypeV128)
+ m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), vecTmp, false))
+ m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrlw, newOperandReg(vecTmp), xx))
+
+ maskTableLabel := m.getOrAllocateConstLabel(&m.constI8x16LogicalSHRMaskTableIndex, i8x16LogicalSHRMaskTable[:])
+ base := m.c.AllocateVReg(ssa.TypeI64)
+ lea := m.allocateInstr().asLEA(newOperandLabel(maskTableLabel), base)
+ m.insert(lea)
+
+ // Shift tmpGpReg by 4 to multiply the shift amount by 16.
+ m.insert(m.allocateInstr().asShiftR(shiftROpShiftLeft, newOperandImm32(4), tmpGpReg, false))
+
+ mem := m.newAmodeRegRegShift(0, base, tmpGpReg, 0)
+ loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(mem), vecTmp)
+ m.insert(loadMask)
+
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePand, newOperandReg(vecTmp), xx))
+ m.copyTo(xx, m.c.VRegOf(ret))
+}
+
+func (m *machine) lowerVSshr(x, y, ret ssa.Value, lane ssa.VecLane) {
+ switch lane {
+ case ssa.VecLaneI8x16:
+ m.lowerVSshri8x16(x, y, ret)
+ case ssa.VecLaneI16x8, ssa.VecLaneI32x4:
+ m.lowerShr(x, y, ret, lane, true)
+ case ssa.VecLaneI64x2:
+ m.lowerVSshri64x2(x, y, ret)
+ default:
+ panic(fmt.Sprintf("invalid lane type: %s", lane))
+ }
+}
+
+func (m *machine) lowerVSshri8x16(x, y, ret ssa.Value) {
+ shiftAmtReg := m.c.AllocateVReg(ssa.TypeI32)
+ // Load the modulo 8 mask to tmpReg.
+ m.lowerIconst(shiftAmtReg, 0x7, false)
+ // Take the modulo 8 of the shift amount.
+ shiftAmt := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y))
+ m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, shiftAmt, shiftAmtReg, false))
+
+ // Copy the x value to two temporary registers.
+ _xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+ xx := m.copyToTmp(_xx.reg())
+ vecTmp := m.c.AllocateVReg(ssa.TypeV128)
+ m.copyTo(xx, vecTmp)
+
+ // Assuming that we have
+ // xx = [b1, ..., b16]
+ // vecTmp = [b1, ..., b16]
+ // at this point, then we use PUNPCKLBW and PUNPCKHBW to produce:
+ // xx = [b1, b1, b2, b2, ..., b8, b8]
+ // vecTmp = [b9, b9, b10, b10, ..., b16, b16]
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePunpcklbw, newOperandReg(xx), xx))
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePunpckhbw, newOperandReg(vecTmp), vecTmp))
+
+	// Add 8 to the shift amount, and then move it to vecTmp2.
+ vecTmp2 := m.c.AllocateVReg(ssa.TypeV128)
+ m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(8), shiftAmtReg, false))
+ m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(shiftAmtReg), vecTmp2, false))
+
+	// Perform packed word arithmetic right shifts on xx and vecTmp.
+ // This changes these two registers as:
+ // xx = [xxx, b1 >> s, xxx, b2 >> s, ..., xxx, b8 >> s]
+ // vecTmp = [xxx, b9 >> s, xxx, b10 >> s, ..., xxx, b16 >> s]
+	// where xxx is 0xff or 0x00 depending on each byte's sign, and ">>" is the arithmetic shift on a byte.
+ m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsraw, newOperandReg(vecTmp2), xx))
+ m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsraw, newOperandReg(vecTmp2), vecTmp))
+
+ // Finally, we can get the result by packing these two word vectors.
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePacksswb, newOperandReg(vecTmp), xx))
+
+ m.copyTo(xx, m.c.VRegOf(ret))
+}
+
+func (m *machine) lowerVSshri64x2(x, y, ret ssa.Value) {
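+	// SSE has no packed 64-bit arithmetic right shift (VPSRAQ requires AVX-512), so each lane is
+	// extracted into a GPR, shifted with SAR, and inserted back.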
+ // Load the shift amount to RCX.
+ shiftAmt := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
+ m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, shiftAmt, rcxVReg))
+
+ tmpGp := m.c.AllocateVReg(ssa.TypeI64)
+
+ _xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+ xxReg := m.copyToTmp(_xx.reg())
+
+ m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, 0, newOperandReg(xxReg), tmpGp))
+ m.insert(m.allocateInstr().asShiftR(shiftROpShiftRightArithmetic, newOperandReg(rcxVReg), tmpGp, true))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, newOperandReg(tmpGp), xxReg))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, 1, newOperandReg(xxReg), tmpGp))
+ m.insert(m.allocateInstr().asShiftR(shiftROpShiftRightArithmetic, newOperandReg(rcxVReg), tmpGp, true))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, newOperandReg(tmpGp), xxReg))
+
+ m.copyTo(xxReg, m.c.VRegOf(ret))
+}
+
+func (m *machine) lowerShr(x, y, ret ssa.Value, lane ssa.VecLane, signed bool) {
+ var modulo uint64
+ var shiftOp sseOpcode
+ switch lane {
+ case ssa.VecLaneI16x8:
+ modulo = 0xf
+ if signed {
+ shiftOp = sseOpcodePsraw
+ } else {
+ shiftOp = sseOpcodePsrlw
+ }
+ case ssa.VecLaneI32x4:
+ modulo = 0x1f
+ if signed {
+ shiftOp = sseOpcodePsrad
+ } else {
+ shiftOp = sseOpcodePsrld
+ }
+ case ssa.VecLaneI64x2:
+ modulo = 0x3f
+ if signed {
+ panic("BUG")
+ }
+ shiftOp = sseOpcodePsrlq
+ default:
+ panic(fmt.Sprintf("invalid lane type: %s", lane))
+ }
+
+ _xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+ xx := m.copyToTmp(_xx.reg())
+
+ tmpGpReg := m.c.AllocateVReg(ssa.TypeI32)
+	// Load the lane-width modulo mask (0xf, 0x1f, or 0x3f) into tmpGpReg.
+ m.lowerIconst(tmpGpReg, modulo, false)
+	// Reduce the shift amount modulo the lane width in bits.
+ m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd,
+ m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)), tmpGpReg, false))
+	// And move it to an xmm register.
+ tmpVec := m.c.AllocateVReg(ssa.TypeV128)
+ m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), tmpVec, false))
+
+ // Then do the actual shift.
+ m.insert(m.allocateInstr().asXmmRmiReg(shiftOp, newOperandReg(tmpVec), xx))
+
+ m.copyTo(xx, m.c.VRegOf(ret))
+}
+
+func (m *machine) lowerVIshl(x, y, ret ssa.Value, lane ssa.VecLane) {
+ var modulo uint64
+ var shiftOp sseOpcode
+ var isI8x16 bool
+ switch lane {
+ case ssa.VecLaneI8x16:
+ isI8x16 = true
+ modulo = 0x7
+ shiftOp = sseOpcodePsllw
+ case ssa.VecLaneI16x8:
+ modulo = 0xf
+ shiftOp = sseOpcodePsllw
+ case ssa.VecLaneI32x4:
+ modulo = 0x1f
+ shiftOp = sseOpcodePslld
+ case ssa.VecLaneI64x2:
+ modulo = 0x3f
+ shiftOp = sseOpcodePsllq
+ default:
+ panic(fmt.Sprintf("invalid lane type: %s", lane))
+ }
+
+ _xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+ xx := m.copyToTmp(_xx.reg())
+
+ tmpGpReg := m.c.AllocateVReg(ssa.TypeI32)
+	// Load the lane-width modulo mask (0x7, 0xf, 0x1f, or 0x3f) into tmpGpReg.
+ m.lowerIconst(tmpGpReg, modulo, false)
+	// Reduce the shift amount modulo the lane width in bits.
+ m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd,
+ m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)), tmpGpReg, false))
+	// And move it to an xmm register.
+ tmpVec := m.c.AllocateVReg(ssa.TypeV128)
+ m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), tmpVec, false))
+
+ // Then do the actual shift.
+ m.insert(m.allocateInstr().asXmmRmiReg(shiftOp, newOperandReg(tmpVec), xx))
+
+ if isI8x16 {
+ maskTableLabel := m.getOrAllocateConstLabel(&m.constI8x16SHLMaskTableIndex, i8x16SHLMaskTable[:])
+ base := m.c.AllocateVReg(ssa.TypeI64)
+ lea := m.allocateInstr().asLEA(newOperandLabel(maskTableLabel), base)
+ m.insert(lea)
+
+ // Shift tmpGpReg by 4 to multiply the shift amount by 16.
+ m.insert(m.allocateInstr().asShiftR(shiftROpShiftLeft, newOperandImm32(4), tmpGpReg, false))
+
+ mem := m.newAmodeRegRegShift(0, base, tmpGpReg, 0)
+ loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(mem), tmpVec)
+ m.insert(loadMask)
+
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePand, newOperandReg(tmpVec), xx))
+ }
+
+ m.copyTo(xx, m.c.VRegOf(ret))
+}
+
+// i8x16SHLMaskTable is necessary for emulating the packed byte left shift, which does not exist on amd64.
+// The mask is applied after performing packed word shifts on the value to clear out the bits that leaked
+// in from neighboring bytes.
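+// For example, shifting left by 3 with PSLLW moves the top 3 bits of each low byte into the bottom of its
+// neighboring high byte; ANDing with the "for 3 shift" row (0xf8 repeated) clears the low bits that must
+// be zero after a per-byte shift.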
+var i8x16SHLMaskTable = [8 * 16]byte{ // (the number of possible shift amount 0, 1, ..., 7.) * 16 bytes.
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift
+ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, // for 1 shift
+ 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, // for 2 shift
+ 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, // for 3 shift
+ 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // for 4 shift
+ 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, // for 5 shift
+ 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, // for 6 shift
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, // for 7 shift
+}
+
+func (m *machine) lowerVRound(x, ret ssa.Value, imm byte, _64 bool) {
+ xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
+ var round sseOpcode
+ if _64 {
+ round = sseOpcodeRoundpd
+ } else {
+ round = sseOpcodeRoundps
+ }
+ m.insert(m.allocateInstr().asXmmUnaryRmRImm(round, imm, xx, m.c.VRegOf(ret)))
+}
+
+var (
+ allOnesI8x16 = [16]byte{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1}
+ allOnesI16x8 = [16]byte{0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0}
+ extAddPairwiseI16x8uMask1 = [16]byte{0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80}
+ extAddPairwiseI16x8uMask2 = [16]byte{0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00}
+)
+
+func (m *machine) lowerExtIaddPairwise(x, ret ssa.Value, srcLane ssa.VecLane, signed bool) {
+ _xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+ xx := m.copyToTmp(_xx.reg())
+ switch srcLane {
+ case ssa.VecLaneI8x16:
+ allOneReg := m.c.AllocateVReg(ssa.TypeV128)
+ mask := m.getOrAllocateConstLabel(&m.constAllOnesI8x16Index, allOnesI8x16[:])
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), allOneReg))
+
+ var resultReg regalloc.VReg
+ if signed {
+ resultReg = allOneReg
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddubsw, newOperandReg(xx), resultReg))
+ } else {
+			// The all-ones register is used as the signed operand of PMADDUBSW, so each byte of xx is treated as unsigned in the multiply-add.
+ resultReg = xx
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddubsw, newOperandReg(allOneReg), resultReg))
+ }
+ m.copyTo(resultReg, m.c.VRegOf(ret))
+
+ case ssa.VecLaneI16x8:
+ if signed {
+ allOnesReg := m.c.AllocateVReg(ssa.TypeV128)
+ mask := m.getOrAllocateConstLabel(&m.constAllOnesI16x8Index, allOnesI16x8[:])
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), allOnesReg))
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, newOperandReg(allOnesReg), xx))
+ m.copyTo(xx, m.c.VRegOf(ret))
+ } else {
+ maskReg := m.c.AllocateVReg(ssa.TypeV128)
+ mask := m.getOrAllocateConstLabel(&m.constExtAddPairwiseI16x8uMask1Index, extAddPairwiseI16x8uMask1[:])
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg))
+
+			// Flip the sign bit of each 16-bit lane of xx.
+			//
+			// Assuming that xx = [w1, ..., w8], now we have,
+			// 	xx[i] = wi - 0x8000 (as a signed 16-bit integer) for i = 1...8
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(maskReg), xx))
+
+ mask = m.getOrAllocateConstLabel(&m.constAllOnesI16x8Index, allOnesI16x8[:])
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg))
+
+			// PMADDWD with the all-ones word mask sums adjacent pairs. For i = 1,...,4 (producing i32x4 lanes), now we have
+			// 	xx[i] = int32((wn - 0x8000) + (w(n+1) - 0x8000)) = int32(wn + w(n+1)) - 0x10000
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, newOperandReg(maskReg), xx))
+
+ mask = m.getOrAllocateConstLabel(&m.constExtAddPairwiseI16x8uMask2Index, extAddPairwiseI16x8uMask2[:])
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg))
+
+			// PADDD adds 0x10000 back to each i32 lane (the mask holds 0x00010000 per lane):
+			// 	xx[i] = int32(wn + w(n+1)) - 0x10000 + 0x10000 = uint32(wn + w(n+1)).
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddd, newOperandReg(maskReg), xx))
+
+ m.copyTo(xx, m.c.VRegOf(ret))
+ }
+ default:
+ panic(fmt.Sprintf("invalid lane type: %s", srcLane))
+ }
+}
+
+func (m *machine) lowerWidenLow(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
+ var sseOp sseOpcode
+ switch lane {
+ case ssa.VecLaneI8x16:
+ if signed {
+ sseOp = sseOpcodePmovsxbw
+ } else {
+ sseOp = sseOpcodePmovzxbw
+ }
+ case ssa.VecLaneI16x8:
+ if signed {
+ sseOp = sseOpcodePmovsxwd
+ } else {
+ sseOp = sseOpcodePmovzxwd
+ }
+ case ssa.VecLaneI32x4:
+ if signed {
+ sseOp = sseOpcodePmovsxdq
+ } else {
+ sseOp = sseOpcodePmovzxdq
+ }
+ default:
+ panic(fmt.Sprintf("invalid lane type: %s", lane))
+ }
+
+ xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, xx, m.c.VRegOf(ret)))
+}
+
+func (m *machine) lowerWidenHigh(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
+ tmp := m.c.AllocateVReg(ssa.TypeV128)
+ xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+ m.copyTo(xx.reg(), tmp)
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePalignr, 8, newOperandReg(tmp), tmp))
+
+ var sseOp sseOpcode
+ switch lane {
+ case ssa.VecLaneI8x16:
+ if signed {
+ sseOp = sseOpcodePmovsxbw
+ } else {
+ sseOp = sseOpcodePmovzxbw
+ }
+ case ssa.VecLaneI16x8:
+ if signed {
+ sseOp = sseOpcodePmovsxwd
+ } else {
+ sseOp = sseOpcodePmovzxwd
+ }
+ case ssa.VecLaneI32x4:
+ if signed {
+ sseOp = sseOpcodePmovsxdq
+ } else {
+ sseOp = sseOpcodePmovzxdq
+ }
+ default:
+ panic(fmt.Sprintf("invalid lane type: %s", lane))
+ }
+
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, newOperandReg(tmp), m.c.VRegOf(ret)))
+}
+
+func (m *machine) lowerLoadSplat(ptr ssa.Value, offset uint32, ret ssa.Value, lane ssa.VecLane) {
+ tmpDst, tmpGp := m.c.AllocateVReg(ssa.TypeV128), m.c.AllocateVReg(ssa.TypeI64)
+ am := newOperandMem(m.lowerToAddressMode(ptr, offset))
+
+ m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst))
+ switch lane {
+ case ssa.VecLaneI8x16:
+ m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, am, tmpGp))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, 0, newOperandReg(tmpGp), tmpDst))
+ tmpZeroVec := m.c.AllocateVReg(ssa.TypeV128)
+ m.insert(m.allocateInstr().asZeros(tmpZeroVec))
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmpZeroVec), tmpDst))
+ case ssa.VecLaneI16x8:
+ m.insert(m.allocateInstr().asMovzxRmR(extModeWQ, am, tmpGp))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 0, newOperandReg(tmpGp), tmpDst))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 1, newOperandReg(tmpGp), tmpDst))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst))
+ case ssa.VecLaneI32x4:
+ m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, am, tmpGp))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, 0, newOperandReg(tmpGp), tmpDst))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst))
+ case ssa.VecLaneI64x2:
+ m.insert(m.allocateInstr().asMov64MR(am, tmpGp))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, newOperandReg(tmpGp), tmpDst))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, newOperandReg(tmpGp), tmpDst))
+ default:
+ panic(fmt.Sprintf("invalid lane type: %s", lane))
+ }
+
+ m.copyTo(tmpDst, m.c.VRegOf(ret))
+}
+
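+// f64x2CvtFromIMask holds 0x43300000 (the upper 32 bits of float64(0x1.0p52)) in its low two 32-bit lanes.
+// Interleaving it with two uint32 values d1, d2 via UNPCKLPS produces the bit patterns of
+// float64(0x1.0p52) + float64(uint32(d1)) and float64(0x1.0p52) + float64(uint32(d2)).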
+var f64x2CvtFromIMask = [16]byte{
+ 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+}
+
+func (m *machine) lowerVFcvtFromInt(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
+ switch lane {
+ case ssa.VecLaneF32x4:
+ if signed {
+ xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, xx, m.c.VRegOf(ret)))
+ } else {
+ xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+ // Copy the value to two temporary registers.
+ tmp := m.copyToTmp(xx.reg())
+ tmp2 := m.copyToTmp(xx.reg())
+
+			// Clear the upper 10 bits of each 32-bit lane, keeping the low 22 bits, which convert to float32 exactly.
+ m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePslld, newOperandImm32(0xa), tmp))
+ m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(0xa), tmp))
+
+			// Subtract the low bits (tmp) from tmp2, leaving only the upper bits of each lane in tmp2.
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePsubd, newOperandReg(tmp), tmp2))
+
+			// Convert the low bits held in tmp (an exact conversion).
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp), tmp))
+
+			// Halve tmp2 with a logical right shift by one, then convert it, so tmp2 holds half of the upper-bits contribution.
+ m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(1), tmp2))
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp2), tmp2))
+
+			// Double it back by adding tmp2 to itself.
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddps, newOperandReg(tmp2), tmp2))
+
+			// Add the low-bits conversion (tmp) into tmp2 to get the final result.
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddps, newOperandReg(tmp), tmp2))
+
+ m.copyTo(tmp2, m.c.VRegOf(ret))
+ }
+ case ssa.VecLaneF64x2:
+ if signed {
+ xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2pd, xx, m.c.VRegOf(ret)))
+ } else {
+ maskReg := m.c.AllocateVReg(ssa.TypeV128)
+ maskLabel := m.getOrAllocateConstLabel(&m.constF64x2CvtFromIMaskIndex, f64x2CvtFromIMask[:])
+ // maskReg = [0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), maskReg))
+
+ _xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+ xx := m.copyToTmp(_xx.reg())
+
+ // Given that we have xx = [d1, d2, d3, d4], this results in
+ // xx = [d1, [0x00, 0x00, 0x30, 0x43], d2, [0x00, 0x00, 0x30, 0x43]]
+ // = [float64(uint32(d1)) + 0x1.0p52, float64(uint32(d2)) + 0x1.0p52]
+ // ^See https://stackoverflow.com/questions/13269523/can-all-32-bit-ints-be-exactly-represented-as-a-double
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeUnpcklps, newOperandReg(maskReg), xx))
+
+ // maskReg = [float64(0x1.0p52), float64(0x1.0p52)]
+ maskLabel = m.getOrAllocateConstLabel(&m.constTwop52Index, twop52[:])
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), maskReg))
+
+ // Now, we get the result as
+ // 	xx = [float64(uint32(d1)), float64(uint32(d2))]
+ // because the following equality always holds exactly for any uint32 value v:
+ // 	(0x1.0p52 + float64(uint32(v))) - 0x1.0p52 = float64(uint32(v))
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeSubpd, newOperandReg(maskReg), xx))
+
+ m.copyTo(xx, m.c.VRegOf(ret))
+ }
+ default:
+ panic(fmt.Sprintf("invalid lane type: %s", lane))
+ }
+}
+
+var (
+ // i32sMaxOnF64x2 holds math.MaxInt32(=2147483647.0) on two f64 lanes.
+ i32sMaxOnF64x2 = [16]byte{
+ 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
+ 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
+ }
+
+ // i32uMaxOnF64x2 holds math.MaxUint32(=4294967295.0) on two f64 lanes.
+ i32uMaxOnF64x2 = [16]byte{
+ 0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
+ 0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
+ }
+
+ // twop52 holds two float64(0x1.0p52) on two f64 lanes. 0x1.0p52 is special in the sense that,
+ // with this exponent, the low 32 bits of the mantissa hold a corresponding uint32 value exactly;
+ // adding a uint32 value to 0x1.0p52 (or subtracting 0x1.0p52 from such a sum) keeps that 32-bit
+ // integer bit pattern exact in the mantissa.
+ //
+ // Note: the name twop52 is common across various compiler ecosystems.
+ // E.g. https://github.com/llvm/llvm-project/blob/92ab024f81e5b64e258b7c3baaf213c7c26fcf40/compiler-rt/lib/builtins/floatdidf.c#L28
+ // E.g. https://opensource.apple.com/source/clang/clang-425.0.24/src/projects/compiler-rt/lib/floatdidf.c.auto.html
+ twop52 = [16]byte{
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
+ }
+)
+
+func (m *machine) lowerVFcvtToIntSat(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
+ _xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+ xx := m.copyToTmp(_xx.reg())
+
+ switch lane {
+ case ssa.VecLaneF32x4:
+ if signed {
+ tmp := m.copyToTmp(xx)
+
+ // Assuming we have xx = [v1, v2, v3, v4].
+ //
+ // Set all bits if lane is not NaN on tmp.
+ // tmp[i] = 0xffffffff if vi != NaN
+ // = 0 if vi == NaN
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmpps, uint8(cmpPredEQ_OQ), newOperandReg(tmp), tmp))
+
+ // Clear NaN lanes on xx, meaning that
+ // xx[i] = vi if vi != NaN
+ // 0 if vi == NaN
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(tmp), xx))
+
+ // tmp[i] = ^vi if vi != NaN
+ //        = 0   if vi == NaN (both tmp and xx are zero in those lanes)
+ // which means that tmp[i] & 0x80000000 != 0 if and only if vi is a non-NaN value whose sign bit is clear (positive or +0).
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeXorps, newOperandReg(xx), tmp))
+
+ // xx[i] = int32(vi) if vi != NaN and the conversion does not overflow.
+ //       = 0x80000000 if vi != NaN and the conversion overflows (See https://www.felixcloutier.com/x86/cvttps2dq)
+ // = 0 if vi == NaN
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(xx), xx))
+
+ // Below, we have to convert 0x80000000 into 0x7FFFFFFF for the positive overflowing lanes.
+ //
+ // After the AND, tmp[i] has its sign bit (0x80000000) set only if vi is positive and the conversion overflowed;
+ // for every other lane, tmp[i]&0x80000000 = 0.
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(xx), tmp))
+
+ // Arithmetic right shifting tmp by 31, meaning that we have
+ // tmp[i] = 0xffffffff if vi is positive and overflowed, 0 otherwise.
+ m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrad, newOperandImm32(0x1f), tmp))
+
+ // Flip 0x80000000 into 0x7FFFFFFF for the positive overflowed lanes; all other lanes are kept intact.
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), xx))
+ } else {
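+ // Unsigned saturating conversion: clamp NaN and negative lanes to zero, then handle values at or
+ // above 0x1p31 (which a single CVTTPS2DQ cannot represent) by also converting (v - 0x1p31) and
+ // adding back the 0x80000000 produced by the overflowing first conversion; lanes at or above
+ // 0x1p32 end up saturated to 0xffffffff.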
+ tmp := m.c.AllocateVReg(ssa.TypeV128)
+ m.insert(m.allocateInstr().asZeros(tmp))
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMaxps, newOperandReg(tmp), xx))
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp), tmp))
+ m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(0x1), tmp))
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp), tmp))
+ tmp2 := m.copyToTmp(xx)
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(xx), xx))
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeSubps, newOperandReg(tmp), tmp2))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmpps, uint8(cmpPredLE_OS), newOperandReg(tmp2), tmp))
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(tmp2), tmp2))
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp2))
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp))
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaxsd, newOperandReg(tmp), tmp2))
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddd, newOperandReg(tmp2), xx))
+ }
+
+ case ssa.VecLaneF64x2:
+ tmp2 := m.c.AllocateVReg(ssa.TypeV128)
+ if signed {
+ tmp := m.copyToTmp(xx)
+
+ // Set all bits for non-NaN lanes, zeros otherwise.
+ // I.e. tmp[i] = 0xffffffff_ffffffff if vi != NaN, 0 otherwise.
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmppd, uint8(cmpPredEQ_OQ), newOperandReg(tmp), tmp))
+
+ maskLabel := m.getOrAllocateConstLabel(&m.constI32sMaxOnF64x2Index, i32sMaxOnF64x2[:])
+ // Load float64(2147483647) into each 64-bit lane of tmp2.
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), tmp2))
+
+ // tmp[i] = 2147483647 if vi != NaN, 0 otherwise.
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(tmp2), tmp))
+
+ // MINPD returns the second (source) operand whenever either input is NaN, so we have
+ // xx[i] = min(vi, 2147483647.0) if vi != NaN
+ //       = 0                     if vi == NaN
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMinpd, newOperandReg(tmp), xx))
+
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttpd2dq, newOperandReg(xx), xx))
+ } else {
+ tmp := m.c.AllocateVReg(ssa.TypeV128)
+ m.insert(m.allocateInstr().asZeros(tmp))
+
+ // xx[i] = vi if vi != NaN && vi > 0
+ // = 0 if vi == NaN || vi <= 0
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMaxpd, newOperandReg(tmp), xx))
+
+ // tmp2[i] = float64(math.MaxUint32) = 4294967295.0
+ maskIndex := m.getOrAllocateConstLabel(&m.constI32uMaxOnF64x2Index, i32uMaxOnF64x2[:])
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskIndex)), tmp2))
+
+ // xx[i] = min(vi, 4294967295.0) if vi != NaN && vi > 0
+ //       = 0                     otherwise (vi == NaN || vi <= 0)
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMinpd, newOperandReg(tmp2), xx))
+
+ // Truncate each lane toward zero to an integral value (ROUNDPD with immediate 0x3).
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeRoundpd, 0x3, newOperandReg(xx), xx))
+
+ // tmp2[i] = float64(0x1.0p52)
+ maskIndex = m.getOrAllocateConstLabel(&m.constTwop52Index, twop52[:])
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskIndex)), tmp2))
+
+ // xx[i] = float64(0x1.0p52) + xx[i], where xx[i] is already integral and clamped to [0, 4294967295] (NaN lanes are 0).
+ //
+ // This means that the lower 32 bits of xx[i]'s bit pattern now hold exactly the saturated uint32 result.
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddpd, newOperandReg(tmp2), xx))
+
+ // At this point, we have
+ // xx = [uint32(v0), 0x43300000, uint32(v1), 0x43300000] (0x43300000 is the upper half of 0x1.0p52's bit pattern)
+ // tmp = [0, 0, 0, 0]
+ // as 32x4 lanes. Therefore, SHUFPS with 0b00_00_10_00 results in
+ // xx = [xx[00], xx[10], tmp[00], tmp[00]] = [xx[00], xx[10], 0, 0]
+ // meaning that for i = 0 and 1, we have
+ // xx[i] = uint32(vi) if vi != NaN && vi > 0 && vi <= math.MaxUint32
+ // = 0 otherwise.
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeShufps, 0b00_00_10_00, newOperandReg(tmp), xx))
+ }
+ default:
+ panic(fmt.Sprintf("invalid lane type: %s", lane))
+ }
+
+ m.copyTo(xx, m.c.VRegOf(ret))
+}
+
+func (m *machine) lowerNarrow(x, y, ret ssa.Value, lane ssa.VecLane, signed bool) {
+ _xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+ xx := m.copyToTmp(_xx.reg())
+ yy := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
+
+ var sseOp sseOpcode
+ switch lane {
+ case ssa.VecLaneI16x8:
+ if signed {
+ sseOp = sseOpcodePacksswb
+ } else {
+ sseOp = sseOpcodePackuswb
+ }
+ case ssa.VecLaneI32x4:
+ if signed {
+ sseOp = sseOpcodePackssdw
+ } else {
+ sseOp = sseOpcodePackusdw
+ }
+ default:
+ panic(fmt.Sprintf("invalid lane type: %s", lane))
+ }
+ m.insert(m.allocateInstr().asXmmRmR(sseOp, yy, xx))
+ m.copyTo(xx, m.c.VRegOf(ret))
+}
+
+func (m *machine) lowerWideningPairwiseDotProductS(x, y, ret ssa.Value) {
+ _xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+ xx := m.copyToTmp(_xx.reg())
+ yy := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
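+ // PMADDWD multiplies the corresponding signed 16-bit lanes and adds each adjacent pair of the
+ // 32-bit products, which is exactly the widening pairwise dot product.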
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, yy, xx))
+ m.copyTo(xx, m.c.VRegOf(ret))
+}
+
+func (m *machine) lowerVIabs(instr *ssa.Instruction) {
+ x, lane := instr.ArgWithLane()
+ rd := m.c.VRegOf(instr.Return())
+
+ if lane == ssa.VecLaneI64x2 {
+ _xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+
+ blendReg := xmm0VReg
+ m.insert(m.allocateInstr().asDefineUninitializedReg(blendReg))
+
+ tmp := m.copyToTmp(_xx.reg())
+ xx := m.copyToTmp(_xx.reg())
+
+ // Clear all bits on blendReg.
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(blendReg), blendReg))
+ // Subtract xx from blendReg, i.e. blendReg = 0 - xx.
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePsubq, newOperandReg(xx), blendReg))
+ // Copy the negated value back into xx (blendReg is xmm0, which BLENDVPD reads implicitly below).
+ m.copyTo(blendReg, xx)
+
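+ // BLENDVPD picks, per 64-bit lane, tmp (the original value) when the sign bit of the corresponding
+ // lane in xmm0 (the negated value) is set, i.e. when the original was positive; otherwise it keeps
+ // the negated value. Either way the lane ends up holding the absolute value.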
+ m.insert(m.allocateInstr().asBlendvpd(newOperandReg(tmp), xx))
+
+ m.copyTo(xx, rd)
+ } else {
+ var vecOp sseOpcode
+ switch lane {
+ case ssa.VecLaneI8x16:
+ vecOp = sseOpcodePabsb
+ case ssa.VecLaneI16x8:
+ vecOp = sseOpcodePabsw
+ case ssa.VecLaneI32x4:
+ vecOp = sseOpcodePabsd
+ }
+ rn := m.getOperand_Reg(m.c.ValueDefinition(x))
+
+ i := m.allocateInstr()
+ i.asXmmUnaryRmR(vecOp, rn, rd)
+ m.insert(i)
+ }
+}
+
+func (m *machine) lowerVIpopcnt(instr *ssa.Instruction) {
+ x := instr.Arg()
+ rn := m.getOperand_Reg(m.c.ValueDefinition(x))
+ rd := m.c.VRegOf(instr.Return())
+
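+ // Compute per-byte population counts with a PSHUFB nibble lookup: split each byte into its low and
+ // high 4-bit halves, look both up in a 16-entry table of nibble popcounts, and add the two results.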
+ tmp1 := m.c.AllocateVReg(ssa.TypeV128)
+ m.lowerVconst(tmp1, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f)
+
+ // Copy input into tmp2.
+ tmp2 := m.copyToTmp(rn.reg())
+
+ // Given that we have:
+ // rn = [b1, ..., b16] where bn = hn:ln and hn and ln are the higher and lower 4 bits of bn.
+ //
+ // Take PAND on tmp1 and tmp2, so that we mask out all the higher bits.
+ // tmp2 = [l1, ..., l16].
+ pand := m.allocateInstr()
+ pand.asXmmRmR(sseOpcodePand, newOperandReg(tmp1), tmp2)
+ m.insert(pand)
+
+ // Do a logical (packed word) right shift by 4 on tmp3 (another copy of rn) and PAND it against the mask (tmp1); meaning that we have
+ // tmp3 = [h1, ...., h16].
+ tmp3 := m.copyToTmp(rn.reg())
+ psrlw := m.allocateInstr()
+ psrlw.asXmmRmiReg(sseOpcodePsrlw, newOperandImm32(4), tmp3)
+ m.insert(psrlw)
+
+ pand2 := m.allocateInstr()
+ pand2.asXmmRmR(sseOpcodePand, newOperandReg(tmp1), tmp3)
+ m.insert(pand2)
+
+ // Read the popcntTable into tmp4, and we have
+ // tmp4 = [0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04]
+ tmp4 := m.c.AllocateVReg(ssa.TypeV128)
+ m.lowerVconst(tmp4, 0x03_02_02_01_02_01_01_00, 0x04_03_03_02_03_02_02_01)
+
+ // Make a copy for later.
+ tmp5 := m.copyToTmp(tmp4)
+
+ // tmp4 = [popcnt(l1), ..., popcnt(l16)].
+ pshufb := m.allocateInstr()
+ pshufb.asXmmRmR(sseOpcodePshufb, newOperandReg(tmp2), tmp4)
+ m.insert(pshufb)
+
+ pshufb2 := m.allocateInstr()
+ pshufb2.asXmmRmR(sseOpcodePshufb, newOperandReg(tmp3), tmp5)
+ m.insert(pshufb2)
+
+ // tmp4 + tmp5 is the result.
+ paddb := m.allocateInstr()
+ paddb.asXmmRmR(sseOpcodePaddb, newOperandReg(tmp4), tmp5)
+ m.insert(paddb)
+
+ m.copyTo(tmp5, rd)
+}
+
+func (m *machine) lowerVImul(instr *ssa.Instruction) {
+ x, y, lane := instr.Arg2WithLane()
+ rd := m.c.VRegOf(instr.Return())
+ if lane == ssa.VecLaneI64x2 {
+ rn := m.getOperand_Reg(m.c.ValueDefinition(x))
+ rm := m.getOperand_Reg(m.c.ValueDefinition(y))
+ // Assuming that we have
+ // 	rm = [p1, p2] = [p1_lo, p1_hi, p2_lo, p2_hi]
+ // 	rn = [q1, q2] = [q1_lo, q1_hi, q2_lo, q2_hi]
+ // where pN and qN are 64-bit (quad word) lanes, and pN_lo, pN_hi, qN_lo and qN_hi are 32-bit (double word) lanes.
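+ // Since (p_hi*2^32 + p_lo)*(q_hi*2^32 + q_lo) mod 2^64 = ((q_hi*p_lo + p_hi*q_lo) << 32) + p_lo*q_lo,
+ // the 64-bit product can be built from three 32x32->64-bit PMULUDQ multiplications below.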
+
+ // Copy rn into tmp1.
+ tmp1 := m.copyToTmp(rn.reg())
+
+ // And do the logical right shift by 32 bits on tmp1, which makes tmp1 = [q1_hi, 0, q2_hi, 0]
+ shift := m.allocateInstr()
+ shift.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(32), tmp1)
+ m.insert(shift)
+
+ // Execute "pmuludq rm,tmp1", which makes tmp1 = [q1_hi*p1_lo, q2_hi*p2_lo] where each lane is 64-bit.
+ mul := m.allocateInstr()
+ mul.asXmmRmR(sseOpcodePmuludq, rm, tmp1)
+ m.insert(mul)
+
+ // Copy rm value into tmp2.
+ tmp2 := m.copyToTmp(rm.reg())
+
+ // And do the logical right shift by 32 bits on tmp2, which makes tmp2 = [p1_hi, 0, p2_hi, 0]
+ shift2 := m.allocateInstr()
+ shift2.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(32), tmp2)
+ m.insert(shift2)
+
+ // Execute "pmuludq rn,tmp2", which makes tmp2 = [p1_hi*q1_lo, p2_hi*q2_lo] where each lane is 64-bit.
+ mul2 := m.allocateInstr()
+ mul2.asXmmRmR(sseOpcodePmuludq, rn, tmp2)
+ m.insert(mul2)
+
+ // Add tmp1 and tmp2, then do the logical left shift by 32 bits,
+ // which makes tmp1 = [(q1_hi*p1_lo+p1_hi*q1_lo)<<32, (q2_hi*p2_lo+p2_hi*q2_lo)<<32]
+ add := m.allocateInstr()
+ add.asXmmRmR(sseOpcodePaddq, newOperandReg(tmp2), tmp1)
+ m.insert(add)
+
+ shift3 := m.allocateInstr()
+ shift3.asXmmRmiReg(sseOpcodePsllq, newOperandImm32(32), tmp1)
+ m.insert(shift3)
+
+ // Copy rm value into tmp3.
+ tmp3 := m.copyToTmp(rm.reg())
+
+ // "pmuludq rn,tmp3" makes tmp3 = [p1_lo*q1_lo, p2_lo*q2_lo] where each lane is 64-bit.
+ mul3 := m.allocateInstr()
+ mul3.asXmmRmR(sseOpcodePmuludq, rn, tmp3)
+ m.insert(mul3)
+
+ // Finally, we get the result by computing tmp1 + tmp3,
+ // which makes tmp1 = [(q1_hi*p1_lo+p1_hi*q1_lo)<<32+p1_lo*q1_lo, (q2_hi*p2_lo+p2_hi*q2_lo)<<32+p2_lo*q2_lo]
+ add2 := m.allocateInstr()
+ add2.asXmmRmR(sseOpcodePaddq, newOperandReg(tmp3), tmp1)
+ m.insert(add2)
+
+ m.copyTo(tmp1, rd)
+
+ } else {
+ var vecOp sseOpcode
+ switch lane {
+ case ssa.VecLaneI16x8:
+ vecOp = sseOpcodePmullw
+ case ssa.VecLaneI32x4:
+ vecOp = sseOpcodePmulld
+ default:
+ panic("unsupported: " + lane.String())
+ }
+ m.lowerVbBinOp(vecOp, x, y, instr.Return())
+ }
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/operands.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/operands.go
new file mode 100644
index 000000000..c6fcb8673
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/operands.go
@@ -0,0 +1,346 @@
+package amd64
+
+import (
+ "fmt"
+ "unsafe"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
+)
+
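+// operand is a union-like representation of an instruction operand: depending on kind, data holds
+// a regalloc.VReg, a pointer to an amode (as a uintptr), a 32-bit immediate, or a backend.Label.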
+type operand struct {
+ kind operandKind
+ data uint64
+}
+
+type operandKind byte
+
+const (
+ // operandKindReg is an operand which is a register (either an integer or a vector register).
+ operandKindReg operandKind = iota + 1
+
+ // operandKindMem is a value in Memory.
+ // 32, 64, or 128 bit value.
+ operandKindMem
+
+ // operandKindImm32 is a signed-32-bit integer immediate value.
+ operandKindImm32
+
+ // operandKindLabel is a label.
+ operandKindLabel
+)
+
+// String implements fmt.Stringer.
+func (o operandKind) String() string {
+ switch o {
+ case operandKindReg:
+ return "reg"
+ case operandKindMem:
+ return "mem"
+ case operandKindImm32:
+ return "imm32"
+ case operandKindLabel:
+ return "label"
+ default:
+ panic("BUG: invalid operand kind")
+ }
+}
+
+// format returns the string representation of the operand.
+// _64 only matters when the operand is an integer register; it selects between the 32-bit and 64-bit register names.
+func (o *operand) format(_64 bool) string {
+ switch o.kind {
+ case operandKindReg:
+ return formatVRegSized(o.reg(), _64)
+ case operandKindMem:
+ return o.addressMode().String()
+ case operandKindImm32:
+ return fmt.Sprintf("$%d", int32(o.imm32()))
+ case operandKindLabel:
+ return backend.Label(o.imm32()).String()
+ default:
+ panic(fmt.Sprintf("BUG: invalid operand: %s", o.kind))
+ }
+}
+
+//go:inline
+func (o *operand) reg() regalloc.VReg {
+ return regalloc.VReg(o.data)
+}
+
+//go:inline
+func (o *operand) setReg(r regalloc.VReg) {
+ o.data = uint64(r)
+}
+
+//go:inline
+func (o *operand) addressMode() *amode {
+ return wazevoapi.PtrFromUintptr[amode](uintptr(o.data))
+}
+
+//go:inline
+func (o *operand) imm32() uint32 {
+ return uint32(o.data)
+}
+
+func (o *operand) label() backend.Label {
+ switch o.kind {
+ case operandKindLabel:
+ return backend.Label(o.data)
+ case operandKindMem:
+ mem := o.addressMode()
+ if mem.kind() != amodeRipRel {
+ panic("BUG: invalid label")
+ }
+ return backend.Label(mem.imm32)
+ default:
+ panic("BUG: invalid operand kind")
+ }
+}
+
+func newOperandLabel(label backend.Label) operand {
+ return operand{kind: operandKindLabel, data: uint64(label)}
+}
+
+func newOperandReg(r regalloc.VReg) operand {
+ return operand{kind: operandKindReg, data: uint64(r)}
+}
+
+func newOperandImm32(imm32 uint32) operand {
+ return operand{kind: operandKindImm32, data: uint64(imm32)}
+}
+
+func newOperandMem(amode *amode) operand {
+ return operand{kind: operandKindMem, data: uint64(uintptr(unsafe.Pointer(amode)))}
+}
+
+// amode is a memory operand (addressing mode).
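+// The kindWithShift field packs the amodeKind into its low 8 bits and, for amodeRegRegShift, the
+// scale shift amount into bits 8-15 (see kind and shift below).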
+type amode struct {
+ kindWithShift uint32
+ imm32 uint32
+ base regalloc.VReg
+
+ // For amodeRegRegShift:
+ index regalloc.VReg
+}
+
+type amodeKind byte
+
+const (
+ // amodeImmReg calculates sign-extend-32-to-64(Immediate) + base
+ amodeImmReg amodeKind = iota + 1
+
+ // amodeImmRBP is the same as amodeImmReg, but the base register is fixed to RBP.
+ // The only difference is that it doesn't tell the register allocator to use RBP, which would only be
+ // distracting for the register allocator.
+ amodeImmRBP
+
+ // amodeRegRegShift calculates sign-extend-32-to-64(Immediate) + base + (Register2 << Shift)
+ amodeRegRegShift
+
+ // amodeRipRel is a RIP-relative addressing mode specified by the label.
+ amodeRipRel
+
+ // TODO: there are other addressing modes such as the one without base register.
+)
+
+func (a *amode) kind() amodeKind {
+ return amodeKind(a.kindWithShift & 0xff)
+}
+
+func (a *amode) shift() byte {
+ return byte(a.kindWithShift >> 8)
+}
+
+func (a *amode) uses(rs *[]regalloc.VReg) {
+ switch a.kind() {
+ case amodeImmReg:
+ *rs = append(*rs, a.base)
+ case amodeRegRegShift:
+ *rs = append(*rs, a.base, a.index)
+ case amodeImmRBP, amodeRipRel:
+ default:
+ panic("BUG: invalid amode kind")
+ }
+}
+
+func (a *amode) nregs() int {
+ switch a.kind() {
+ case amodeImmReg:
+ return 1
+ case amodeRegRegShift:
+ return 2
+ case amodeImmRBP, amodeRipRel:
+ return 0
+ default:
+ panic("BUG: invalid amode kind")
+ }
+}
+
+func (a *amode) assignUses(i int, reg regalloc.VReg) {
+ switch a.kind() {
+ case amodeImmReg:
+ if i == 0 {
+ a.base = reg
+ } else {
+ panic("BUG: invalid amode assignment")
+ }
+ case amodeRegRegShift:
+ if i == 0 {
+ a.base = reg
+ } else if i == 1 {
+ a.index = reg
+ } else {
+ panic("BUG: invalid amode assignment")
+ }
+ default:
+ panic("BUG: invalid amode assignment")
+ }
+}
+
+func (m *machine) newAmodeImmReg(imm32 uint32, base regalloc.VReg) *amode {
+ ret := m.amodePool.Allocate()
+ *ret = amode{kindWithShift: uint32(amodeImmReg), imm32: imm32, base: base}
+ return ret
+}
+
+func (m *machine) newAmodeImmRBPReg(imm32 uint32) *amode {
+ ret := m.amodePool.Allocate()
+ *ret = amode{kindWithShift: uint32(amodeImmRBP), imm32: imm32, base: rbpVReg}
+ return ret
+}
+
+func (m *machine) newAmodeRegRegShift(imm32 uint32, base, index regalloc.VReg, shift byte) *amode {
+ if shift > 3 {
+ panic(fmt.Sprintf("BUG: invalid shift (must be <= 3): %d", shift))
+ }
+ ret := m.amodePool.Allocate()
+ *ret = amode{kindWithShift: uint32(amodeRegRegShift) | uint32(shift)<<8, imm32: imm32, base: base, index: index}
+ return ret
+}
+
+func (m *machine) newAmodeRipRel(label backend.Label) *amode {
+ ret := m.amodePool.Allocate()
+ *ret = amode{kindWithShift: uint32(amodeRipRel), imm32: uint32(label)}
+ return ret
+}
+
+// String implements fmt.Stringer.
+func (a *amode) String() string {
+ switch a.kind() {
+ case amodeImmReg, amodeImmRBP:
+ if a.imm32 == 0 {
+ return fmt.Sprintf("(%s)", formatVRegSized(a.base, true))
+ }
+ return fmt.Sprintf("%d(%s)", int32(a.imm32), formatVRegSized(a.base, true))
+ case amodeRegRegShift:
+ shift := 1 << a.shift()
+ if a.imm32 == 0 {
+ return fmt.Sprintf(
+ "(%s,%s,%d)",
+ formatVRegSized(a.base, true), formatVRegSized(a.index, true), shift)
+ }
+ return fmt.Sprintf(
+ "%d(%s,%s,%d)",
+ int32(a.imm32), formatVRegSized(a.base, true), formatVRegSized(a.index, true), shift)
+ case amodeRipRel:
+ return fmt.Sprintf("%s(%%rip)", backend.Label(a.imm32))
+ default:
+ panic("BUG: invalid amode kind")
+ }
+}
+
+func (m *machine) getOperand_Mem_Reg(def *backend.SSAValueDefinition) (op operand) {
+ if def.IsFromBlockParam() {
+ return newOperandReg(def.BlkParamVReg)
+ }
+
+ if def.SSAValue().Type() == ssa.TypeV128 {
+ // SIMD instructions require strict memory alignment, so we don't support the memory operand for V128 at the moment.
+ return m.getOperand_Reg(def)
+ }
+
+ if m.c.MatchInstr(def, ssa.OpcodeLoad) {
+ instr := def.Instr
+ ptr, offset, _ := instr.LoadData()
+ op = newOperandMem(m.lowerToAddressMode(ptr, offset))
+ instr.MarkLowered()
+ return op
+ }
+ return m.getOperand_Reg(def)
+}
+
+func (m *machine) getOperand_Mem_Imm32_Reg(def *backend.SSAValueDefinition) (op operand) {
+ if def.IsFromBlockParam() {
+ return newOperandReg(def.BlkParamVReg)
+ }
+
+ if m.c.MatchInstr(def, ssa.OpcodeLoad) {
+ instr := def.Instr
+ ptr, offset, _ := instr.LoadData()
+ op = newOperandMem(m.lowerToAddressMode(ptr, offset))
+ instr.MarkLowered()
+ return op
+ }
+ return m.getOperand_Imm32_Reg(def)
+}
+
+func (m *machine) getOperand_Imm32_Reg(def *backend.SSAValueDefinition) (op operand) {
+ if def.IsFromBlockParam() {
+ return newOperandReg(def.BlkParamVReg)
+ }
+
+ instr := def.Instr
+ if instr.Constant() {
+ // If the operation is 64-bit, x64 sign-extends the 32-bit immediate value.
+ // Therefore we can use the immediate only when the value fits in 32 bits and, for 64-bit
+ // operations, its sign bit is clear (otherwise the sign extension would change the value).
+ if op, ok := asImm32Operand(instr.ConstantVal(), instr.Return().Type() == ssa.TypeI32); ok {
+ instr.MarkLowered()
+ return op
+ }
+ }
+ return m.getOperand_Reg(def)
+}
+
+func asImm32Operand(val uint64, allowSignExt bool) (operand, bool) {
+ if imm32, ok := asImm32(val, allowSignExt); ok {
+ return newOperandImm32(imm32), true
+ }
+ return operand{}, false
+}
+
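+// asImm32 returns val as a 32-bit immediate if it fits. When allowSignExt is false, values with the
+// most significant bit set are also rejected, since a 64-bit consumer would sign-extend them and
+// change the value.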
+func asImm32(val uint64, allowSignExt bool) (uint32, bool) {
+ u32val := uint32(val)
+ if uint64(u32val) != val {
+ return 0, false
+ }
+ if !allowSignExt && u32val&0x80000000 != 0 {
+ return 0, false
+ }
+ return u32val, true
+}
+
+func (m *machine) getOperand_Reg(def *backend.SSAValueDefinition) (op operand) {
+ var v regalloc.VReg
+ if def.IsFromBlockParam() {
+ v = def.BlkParamVReg
+ } else {
+ instr := def.Instr
+ if instr.Constant() {
+ // We inline all the constant instructions so that we can reduce register usage.
+ v = m.lowerConstant(instr)
+ instr.MarkLowered()
+ } else {
+ if n := def.N; n == 0 {
+ v = m.c.VRegOf(instr.Return())
+ } else {
+ _, rs := instr.Returns()
+ v = m.c.VRegOf(rs[n-1])
+ }
+ }
+ }
+ return newOperandReg(v)
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect.go
new file mode 100644
index 000000000..5219837e3
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect.go
@@ -0,0 +1,11 @@
+//go:build !tinygo
+
+package amd64
+
+import "reflect"
+
+// setSliceLimits sets both Cap and Len for the given reflected slice.
+func setSliceLimits(s *reflect.SliceHeader, limit uintptr) {
+ s.Len = int(limit)
+ s.Cap = int(limit)
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect_tinygo.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect_tinygo.go
new file mode 100644
index 000000000..df4cf46ec
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect_tinygo.go
@@ -0,0 +1,11 @@
+//go:build tinygo
+
+package amd64
+
+import "reflect"
+
+// setSliceLimits sets both Cap and Len for the given reflected slice.
+func setSliceLimits(s *reflect.SliceHeader, limit uintptr) {
+ s.Len = limit
+ s.Cap = limit
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reg.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reg.go
new file mode 100644
index 000000000..4aec856fa
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reg.go
@@ -0,0 +1,181 @@
+package amd64
+
+import (
+ "fmt"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+)
+
+// Amd64-specific registers.
+const (
+ // rax is a gp register.
+ rax = regalloc.RealRegInvalid + 1 + iota
+ // rcx is a gp register.
+ rcx
+ // rdx is a gp register.
+ rdx
+ // rbx is a gp register.
+ rbx
+ // rsp is a gp register.
+ rsp
+ // rbp is a gp register.
+ rbp
+ // rsi is a gp register.
+ rsi
+ // rdi is a gp register.
+ rdi
+ // r8 is a gp register.
+ r8
+ // r9 is a gp register.
+ r9
+ // r10 is a gp register.
+ r10
+ // r11 is a gp register.
+ r11
+ // r12 is a gp register.
+ r12
+ // r13 is a gp register.
+ r13
+ // r14 is a gp register.
+ r14
+ // r15 is a gp register.
+ r15
+
+ // xmm0 is a vector register.
+ xmm0
+ // xmm1 is a vector register.
+ xmm1
+ // xmm2 is a vector register.
+ xmm2
+ // xmm3 is a vector register.
+ xmm3
+ // xmm4 is a vector register.
+ xmm4
+ // xmm5 is a vector register.
+ xmm5
+ // xmm6 is a vector register.
+ xmm6
+ // xmm7 is a vector register.
+ xmm7
+ // xmm8 is a vector register.
+ xmm8
+ // xmm9 is a vector register.
+ xmm9
+ // xmm10 is a vector register.
+ xmm10
+ // xmm11 is a vector register.
+ xmm11
+ // xmm12 is a vector register.
+ xmm12
+ // xmm13 is a vector register.
+ xmm13
+ // xmm14 is a vector register.
+ xmm14
+ // xmm15 is a vector register.
+ xmm15
+)
+
+var (
+ raxVReg = regalloc.FromRealReg(rax, regalloc.RegTypeInt)
+ rcxVReg = regalloc.FromRealReg(rcx, regalloc.RegTypeInt)
+ rdxVReg = regalloc.FromRealReg(rdx, regalloc.RegTypeInt)
+ rbxVReg = regalloc.FromRealReg(rbx, regalloc.RegTypeInt)
+ rspVReg = regalloc.FromRealReg(rsp, regalloc.RegTypeInt)
+ rbpVReg = regalloc.FromRealReg(rbp, regalloc.RegTypeInt)
+ rsiVReg = regalloc.FromRealReg(rsi, regalloc.RegTypeInt)
+ rdiVReg = regalloc.FromRealReg(rdi, regalloc.RegTypeInt)
+ r8VReg = regalloc.FromRealReg(r8, regalloc.RegTypeInt)
+ r9VReg = regalloc.FromRealReg(r9, regalloc.RegTypeInt)
+ r10VReg = regalloc.FromRealReg(r10, regalloc.RegTypeInt)
+ r11VReg = regalloc.FromRealReg(r11, regalloc.RegTypeInt)
+ r12VReg = regalloc.FromRealReg(r12, regalloc.RegTypeInt)
+ r13VReg = regalloc.FromRealReg(r13, regalloc.RegTypeInt)
+ r14VReg = regalloc.FromRealReg(r14, regalloc.RegTypeInt)
+ r15VReg = regalloc.FromRealReg(r15, regalloc.RegTypeInt)
+
+ xmm0VReg = regalloc.FromRealReg(xmm0, regalloc.RegTypeFloat)
+ xmm1VReg = regalloc.FromRealReg(xmm1, regalloc.RegTypeFloat)
+ xmm2VReg = regalloc.FromRealReg(xmm2, regalloc.RegTypeFloat)
+ xmm3VReg = regalloc.FromRealReg(xmm3, regalloc.RegTypeFloat)
+ xmm4VReg = regalloc.FromRealReg(xmm4, regalloc.RegTypeFloat)
+ xmm5VReg = regalloc.FromRealReg(xmm5, regalloc.RegTypeFloat)
+ xmm6VReg = regalloc.FromRealReg(xmm6, regalloc.RegTypeFloat)
+ xmm7VReg = regalloc.FromRealReg(xmm7, regalloc.RegTypeFloat)
+ xmm8VReg = regalloc.FromRealReg(xmm8, regalloc.RegTypeFloat)
+ xmm9VReg = regalloc.FromRealReg(xmm9, regalloc.RegTypeFloat)
+ xmm10VReg = regalloc.FromRealReg(xmm10, regalloc.RegTypeFloat)
+ xmm11VReg = regalloc.FromRealReg(xmm11, regalloc.RegTypeFloat)
+ xmm12VReg = regalloc.FromRealReg(xmm12, regalloc.RegTypeFloat)
+ xmm13VReg = regalloc.FromRealReg(xmm13, regalloc.RegTypeFloat)
+ xmm14VReg = regalloc.FromRealReg(xmm14, regalloc.RegTypeFloat)
+ xmm15VReg = regalloc.FromRealReg(xmm15, regalloc.RegTypeFloat)
+)
+
+var regNames = [...]string{
+ rax: "rax",
+ rcx: "rcx",
+ rdx: "rdx",
+ rbx: "rbx",
+ rsp: "rsp",
+ rbp: "rbp",
+ rsi: "rsi",
+ rdi: "rdi",
+ r8: "r8",
+ r9: "r9",
+ r10: "r10",
+ r11: "r11",
+ r12: "r12",
+ r13: "r13",
+ r14: "r14",
+ r15: "r15",
+ xmm0: "xmm0",
+ xmm1: "xmm1",
+ xmm2: "xmm2",
+ xmm3: "xmm3",
+ xmm4: "xmm4",
+ xmm5: "xmm5",
+ xmm6: "xmm6",
+ xmm7: "xmm7",
+ xmm8: "xmm8",
+ xmm9: "xmm9",
+ xmm10: "xmm10",
+ xmm11: "xmm11",
+ xmm12: "xmm12",
+ xmm13: "xmm13",
+ xmm14: "xmm14",
+ xmm15: "xmm15",
+}
+
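+// formatVRegSized returns an AT&T-style name for r: real integer registers get their 32- or 64-bit
+// name depending on _64, real vector registers are always printed as %xmmN, and virtual registers
+// are printed with their ID followed by a trailing '?'.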
+func formatVRegSized(r regalloc.VReg, _64 bool) string {
+ if r.IsRealReg() {
+ if r.RegType() == regalloc.RegTypeInt {
+ rr := r.RealReg()
+ orig := regNames[rr]
+ if rr <= rdi {
+ if _64 {
+ return "%" + orig
+ } else {
+ return "%e" + orig[1:]
+ }
+ } else {
+ if _64 {
+ return "%" + orig
+ } else {
+ return "%" + orig + "d"
+ }
+ }
+ } else {
+ return "%" + regNames[r.RealReg()]
+ }
+ } else {
+ if r.RegType() == regalloc.RegTypeInt {
+ if _64 {
+ return fmt.Sprintf("%%r%d?", r.ID())
+ } else {
+ return fmt.Sprintf("%%r%dd?", r.ID())
+ }
+ } else {
+ return fmt.Sprintf("%%xmm%d?", r.ID())
+ }
+ }
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/stack.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/stack.go
new file mode 100644
index 000000000..05ba5f027
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/stack.go
@@ -0,0 +1,128 @@
+package amd64
+
+import (
+ "encoding/binary"
+ "reflect"
+ "unsafe"
+
+ "github.com/tetratelabs/wazero/internal/wasmdebug"
+)
+
+func stackView(rbp, top uintptr) []byte {
+ var stackBuf []byte
+ {
+ // TODO: use unsafe.Slice after floor version is set to Go 1.20.
+ hdr := (*reflect.SliceHeader)(unsafe.Pointer(&stackBuf))
+ hdr.Data = rbp
+ setSliceLimits(hdr, top-rbp)
+ }
+ return stackBuf
+}
+
+// UnwindStack implements wazevo.unwindStack.
+func UnwindStack(_, rbp, top uintptr, returnAddresses []uintptr) []uintptr {
+ stackBuf := stackView(rbp, top)
+
+ for i := uint64(0); i < uint64(len(stackBuf)); {
+ // (high address)
+ // +-----------------+
+ // | ....... |
+ // | ret Y |
+ // | ....... |
+ // | ret 0 |
+ // | arg X |
+ // | ....... |
+ // | arg 1 |
+ // | arg 0 |
+ // | ReturnAddress |
+ // | Caller_RBP |
+ // +-----------------+ <---- Caller_RBP
+ // | ........... |
+ // | clobbered M |
+ // | ............ |
+ // | clobbered 0 |
+ // | spill slot N |
+ // | ............ |
+ // | spill slot 0 |
+ // | ReturnAddress |
+ // | Caller_RBP |
+ // +-----------------+ <---- RBP
+ // (low address)
+
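+ // Each frame base stores the caller's RBP followed by the return address (see the diagram above):
+ // record the return address, then hop to the caller's frame by recomputing the offset from rbp.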
+ callerRBP := binary.LittleEndian.Uint64(stackBuf[i:])
+ retAddr := binary.LittleEndian.Uint64(stackBuf[i+8:])
+ returnAddresses = append(returnAddresses, uintptr(retAddr))
+ i = callerRBP - uint64(rbp)
+ if len(returnAddresses) == wasmdebug.MaxFrames {
+ break
+ }
+ }
+ return returnAddresses
+}
+
+// GoCallStackView implements wazevo.goCallStackView.
+func GoCallStackView(stackPointerBeforeGoCall *uint64) []uint64 {
+ // (high address)
+ // +-----------------+ <----+
+ // | xxxxxxxxxxx | | ;; optional unused space to make it 16-byte aligned.
+ // ^ | arg[N]/ret[M] | |
+ // sliceSize | | ............ | | SizeInBytes/8
+ // | | arg[1]/ret[1] | |
+ // v | arg[0]/ret[0] | <----+
+ // | SizeInBytes |
+ // +-----------------+ <---- stackPointerBeforeGoCall
+ // (low address)
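+ // Skip the SizeInBytes slot sitting right above stackPointerBeforeGoCall; the returned slice then
+ // covers SizeInBytes/8 entries of the argument/result space.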
+ data := unsafe.Pointer(uintptr(unsafe.Pointer(stackPointerBeforeGoCall)) + 8)
+ size := *stackPointerBeforeGoCall / 8
+ return unsafe.Slice((*uint64)(data), int(size))
+}
+
+func AdjustClonedStack(oldRsp, oldTop, rsp, rbp, top uintptr) {
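+ // Walk the frames exactly as UnwindStack does and rebase every saved caller RBP from the old stack
+ // region onto the cloned one by adding the relocation offset between the two stacks.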
+ diff := uint64(rsp - oldRsp)
+
+ newBuf := stackView(rbp, top)
+ for i := uint64(0); i < uint64(len(newBuf)); {
+ // (high address)
+ // +-----------------+
+ // | ....... |
+ // | ret Y |
+ // | ....... |
+ // | ret 0 |
+ // | arg X |
+ // | ....... |
+ // | arg 1 |
+ // | arg 0 |
+ // | ReturnAddress |
+ // | Caller_RBP |
+ // +-----------------+ <---- Caller_RBP
+ // | ........... |
+ // | clobbered M |
+ // | ............ |
+ // | clobbered 0 |
+ // | spill slot N |
+ // | ............ |
+ // | spill slot 0 |
+ // | ReturnAddress |
+ // | Caller_RBP |
+ // +-----------------+ <---- RBP
+ // (low address)
+
+ callerRBP := binary.LittleEndian.Uint64(newBuf[i:])
+ if callerRBP == 0 {
+ // End of stack.
+ break
+ }
+ if i64 := int64(callerRBP); i64 < int64(oldRsp) || i64 >= int64(oldTop) {
+ panic("BUG: callerRBP is out of range")
+ }
+ if int(callerRBP) < 0 {
+ panic("BUG: callerRBP is negative")
+ }
+ adjustedCallerRBP := callerRBP + diff
+ if int(adjustedCallerRBP) < 0 {
+ panic("BUG: adjustedCallerRBP is negative")
+ }
+ binary.LittleEndian.PutUint64(newBuf[i:], adjustedCallerRBP)
+ i = adjustedCallerRBP - uint64(rbp)
+ }
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go
new file mode 100644
index 000000000..6615471c6
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go
@@ -0,0 +1,332 @@
+package arm64
+
+import (
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+)
+
+// References:
+// * https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/cmd/compile/abi-internal.md#arm64-architecture
+// * https://developer.arm.com/documentation/102374/0101/Procedure-Call-Standard
+
+var (
+ intParamResultRegs = []regalloc.RealReg{x0, x1, x2, x3, x4, x5, x6, x7}
+ floatParamResultRegs = []regalloc.RealReg{v0, v1, v2, v3, v4, v5, v6, v7}
+)
+
+var regInfo = &regalloc.RegisterInfo{
+ AllocatableRegisters: [regalloc.NumRegType][]regalloc.RealReg{
+ // We don't allocate:
+ // - x18: Reserved by the macOS: https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms#Respect-the-purpose-of-specific-CPU-registers
+ // - x28: Reserved by Go runtime.
+ // - x27(=tmpReg): because of the reason described on tmpReg.
+ regalloc.RegTypeInt: {
+ x8, x9, x10, x11, x12, x13, x14, x15,
+ x16, x17, x19, x20, x21, x22, x23, x24, x25,
+ x26, x29, x30,
+ // These are the argument/return registers. Less preferred in the allocation.
+ x7, x6, x5, x4, x3, x2, x1, x0,
+ },
+ regalloc.RegTypeFloat: {
+ v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
+ v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+ // These are the argument/return registers. Less preferred in the allocation.
+ v7, v6, v5, v4, v3, v2, v1, v0,
+ },
+ },
+ CalleeSavedRegisters: regalloc.NewRegSet(
+ x19, x20, x21, x22, x23, x24, x25, x26, x28,
+ v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
+ ),
+ CallerSavedRegisters: regalloc.NewRegSet(
+ x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x29, x30,
+ v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17,
+ ),
+ RealRegToVReg: []regalloc.VReg{
+ x0: x0VReg, x1: x1VReg, x2: x2VReg, x3: x3VReg, x4: x4VReg, x5: x5VReg, x6: x6VReg, x7: x7VReg, x8: x8VReg, x9: x9VReg, x10: x10VReg, x11: x11VReg, x12: x12VReg, x13: x13VReg, x14: x14VReg, x15: x15VReg, x16: x16VReg, x17: x17VReg, x18: x18VReg, x19: x19VReg, x20: x20VReg, x21: x21VReg, x22: x22VReg, x23: x23VReg, x24: x24VReg, x25: x25VReg, x26: x26VReg, x27: x27VReg, x28: x28VReg, x29: x29VReg, x30: x30VReg,
+ v0: v0VReg, v1: v1VReg, v2: v2VReg, v3: v3VReg, v4: v4VReg, v5: v5VReg, v6: v6VReg, v7: v7VReg, v8: v8VReg, v9: v9VReg, v10: v10VReg, v11: v11VReg, v12: v12VReg, v13: v13VReg, v14: v14VReg, v15: v15VReg, v16: v16VReg, v17: v17VReg, v18: v18VReg, v19: v19VReg, v20: v20VReg, v21: v21VReg, v22: v22VReg, v23: v23VReg, v24: v24VReg, v25: v25VReg, v26: v26VReg, v27: v27VReg, v28: v28VReg, v29: v29VReg, v30: v30VReg, v31: v31VReg,
+ },
+ RealRegName: func(r regalloc.RealReg) string { return regNames[r] },
+ RealRegType: func(r regalloc.RealReg) regalloc.RegType {
+ if r < v0 {
+ return regalloc.RegTypeInt
+ }
+ return regalloc.RegTypeFloat
+ },
+}
+
+// ArgsResultsRegs implements backend.Machine.
+func (m *machine) ArgsResultsRegs() (argResultInts, argResultFloats []regalloc.RealReg) {
+ return intParamResultRegs, floatParamResultRegs
+}
+
+// LowerParams implements backend.FunctionABI.
+func (m *machine) LowerParams(args []ssa.Value) {
+ a := m.currentABI
+
+ for i, ssaArg := range args {
+ if !ssaArg.Valid() {
+ continue
+ }
+ reg := m.compiler.VRegOf(ssaArg)
+ arg := &a.Args[i]
+ if arg.Kind == backend.ABIArgKindReg {
+ m.InsertMove(reg, arg.Reg, arg.Type)
+ } else {
+ // TODO: we could use pair load if there's consecutive loads for the same type.
+ //
+ // (high address)
+ // +-----------------+
+ // | ....... |
+ // | ret Y |
+ // | ....... |
+ // | ret 0 |
+ // | arg X |
+ // | ....... |
+ // | arg 1 |
+ // | arg 0 | <-|
+ // | ReturnAddress | |
+ // +-----------------+ |
+ // | ........... | |
+ // | clobbered M | | argStackOffset: is unknown at this point of compilation.
+ // | ............ | |
+ // | clobbered 0 | |
+ // | spill slot N | |
+ // | ........... | |
+ // | spill slot 0 | |
+ // SP---> +-----------------+ <-+
+ // (low address)
+
+ bits := arg.Type.Bits()
+ // At this point of compilation, we don't yet know how much space exists below the return address.
+ // So we instruct the address mode to add the `argStackOffset` to the offset at the later phase of compilation.
+ amode := addressMode{imm: arg.Offset, rn: spVReg, kind: addressModeKindArgStackSpace}
+ load := m.allocateInstr()
+ switch arg.Type {
+ case ssa.TypeI32, ssa.TypeI64:
+ load.asULoad(operandNR(reg), amode, bits)
+ case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
+ load.asFpuLoad(operandNR(reg), amode, bits)
+ default:
+ panic("BUG")
+ }
+ m.insert(load)
+ m.unresolvedAddressModes = append(m.unresolvedAddressModes, load)
+ }
+ }
+}
+
+// LowerReturns lowers the given returns.
+func (m *machine) LowerReturns(rets []ssa.Value) {
+ a := m.currentABI
+
+ l := len(rets) - 1
+ for i := range rets {
+ // Iterate in reverse order so that the stack-allocated returns are stored before the return registers get overwritten.
+ ret := rets[l-i]
+ r := &a.Rets[l-i]
+ reg := m.compiler.VRegOf(ret)
+ if def := m.compiler.ValueDefinition(ret); def.IsFromInstr() {
+ // Constant instructions are inlined.
+ if inst := def.Instr; inst.Constant() {
+ val := inst.Return()
+ valType := val.Type()
+ v := inst.ConstantVal()
+ m.insertLoadConstant(v, valType, reg)
+ }
+ }
+ if r.Kind == backend.ABIArgKindReg {
+ m.InsertMove(r.Reg, reg, ret.Type())
+ } else {
+ // TODO: we could use pair store if there's consecutive stores for the same type.
+ //
+ // (high address)
+ // +-----------------+
+ // | ....... |
+ // | ret Y |
+ // | ....... |
+ // | ret 0 | <-+
+ // | arg X | |
+ // | ....... | |
+ // | arg 1 | |
+ // | arg 0 | |
+ // | ReturnAddress | |
+ // +-----------------+ |
+ // | ........... | |
+ // | spill slot M | | retStackOffset: is unknown at this point of compilation.
+ // | ............ | |
+ // | spill slot 2 | |
+ // | spill slot 1 | |
+ // | clobbered 0 | |
+ // | clobbered 1 | |
+ // | ........... | |
+ // | clobbered N | |
+ // SP---> +-----------------+ <-+
+ // (low address)
+
+ bits := r.Type.Bits()
+
+ // At this point of compilation, we don't yet know how much space exists below the return address.
+ // So we instruct the address mode to add the `retStackOffset` to the offset at the later phase of compilation.
+ amode := addressMode{imm: r.Offset, rn: spVReg, kind: addressModeKindResultStackSpace}
+ store := m.allocateInstr()
+ store.asStore(operandNR(reg), amode, bits)
+ m.insert(store)
+ m.unresolvedAddressModes = append(m.unresolvedAddressModes, store)
+ }
+ }
+}
+
+// callerGenVRegToFunctionArg is the opposite of GenFunctionArgToVReg, which is used to generate the
+// caller side of the function call.
+func (m *machine) callerGenVRegToFunctionArg(a *backend.FunctionABI, argIndex int, reg regalloc.VReg, def *backend.SSAValueDefinition, slotBegin int64) {
+ arg := &a.Args[argIndex]
+ if def != nil && def.IsFromInstr() {
+ // Constant instructions are inlined.
+ if inst := def.Instr; inst.Constant() {
+ val := inst.Return()
+ valType := val.Type()
+ v := inst.ConstantVal()
+ m.insertLoadConstant(v, valType, reg)
+ }
+ }
+ if arg.Kind == backend.ABIArgKindReg {
+ m.InsertMove(arg.Reg, reg, arg.Type)
+ } else {
+ // TODO: we could use pair store if there's consecutive stores for the same type.
+ //
+ // Note that at this point, the stack pointer is already adjusted.
+ bits := arg.Type.Bits()
+ amode := m.resolveAddressModeForOffset(arg.Offset-slotBegin, bits, spVReg, false)
+ store := m.allocateInstr()
+ store.asStore(operandNR(reg), amode, bits)
+ m.insert(store)
+ }
+}
+
+func (m *machine) callerGenFunctionReturnVReg(a *backend.FunctionABI, retIndex int, reg regalloc.VReg, slotBegin int64) {
+ r := &a.Rets[retIndex]
+ if r.Kind == backend.ABIArgKindReg {
+ m.InsertMove(reg, r.Reg, r.Type)
+ } else {
+ // TODO: we could use pair load if there's consecutive loads for the same type.
+ amode := m.resolveAddressModeForOffset(a.ArgStackSize+r.Offset-slotBegin, r.Type.Bits(), spVReg, false)
+ ldr := m.allocateInstr()
+ switch r.Type {
+ case ssa.TypeI32, ssa.TypeI64:
+ ldr.asULoad(operandNR(reg), amode, r.Type.Bits())
+ case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
+ ldr.asFpuLoad(operandNR(reg), amode, r.Type.Bits())
+ default:
+ panic("BUG")
+ }
+ m.insert(ldr)
+ }
+}
+
+func (m *machine) resolveAddressModeForOffsetAndInsert(cur *instruction, offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) (*instruction, addressMode) {
+ exct := m.executableContext
+ exct.PendingInstructions = exct.PendingInstructions[:0]
+ mode := m.resolveAddressModeForOffset(offset, dstBits, rn, allowTmpRegUse)
+ for _, instr := range exct.PendingInstructions {
+ cur = linkInstr(cur, instr)
+ }
+ return cur, mode
+}
+
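+// resolveAddressModeForOffset returns an addressing mode for rn+offset: it prefers the scaled unsigned
+// 12-bit immediate form, falls back to the signed 9-bit immediate form, and otherwise materializes the
+// offset into a register (the tmp register if allowTmpRegUse, else a freshly allocated one) and uses
+// the register-register form.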
+func (m *machine) resolveAddressModeForOffset(offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) addressMode {
+ if rn.RegType() != regalloc.RegTypeInt {
+ panic("BUG: rn should be a pointer: " + formatVRegSized(rn, 64))
+ }
+ var amode addressMode
+ if offsetFitsInAddressModeKindRegUnsignedImm12(dstBits, offset) {
+ amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: rn, imm: offset}
+ } else if offsetFitsInAddressModeKindRegSignedImm9(offset) {
+ amode = addressMode{kind: addressModeKindRegSignedImm9, rn: rn, imm: offset}
+ } else {
+ var indexReg regalloc.VReg
+ if allowTmpRegUse {
+ m.lowerConstantI64(tmpRegVReg, offset)
+ indexReg = tmpRegVReg
+ } else {
+ indexReg = m.compiler.AllocateVReg(ssa.TypeI64)
+ m.lowerConstantI64(indexReg, offset)
+ }
+ amode = addressMode{kind: addressModeKindRegReg, rn: rn, rm: indexReg, extOp: extendOpUXTX /* indicates index rm is 64-bit */}
+ }
+ return amode
+}
+
+func (m *machine) lowerCall(si *ssa.Instruction) {
+ isDirectCall := si.Opcode() == ssa.OpcodeCall
+ var indirectCalleePtr ssa.Value
+ var directCallee ssa.FuncRef
+ var sigID ssa.SignatureID
+ var args []ssa.Value
+ if isDirectCall {
+ directCallee, sigID, args = si.CallData()
+ } else {
+ indirectCalleePtr, sigID, args, _ /* on arm64, the calling convention is compatible with the Go runtime */ = si.CallIndirectData()
+ }
+ calleeABI := m.compiler.GetFunctionABI(m.compiler.SSABuilder().ResolveSignature(sigID))
+
+ stackSlotSize := int64(calleeABI.AlignedArgResultStackSlotSize())
+ if m.maxRequiredStackSizeForCalls < stackSlotSize+16 {
+ m.maxRequiredStackSizeForCalls = stackSlotSize + 16 // return address frame.
+ }
+
+ for i, arg := range args {
+ reg := m.compiler.VRegOf(arg)
+ def := m.compiler.ValueDefinition(arg)
+ m.callerGenVRegToFunctionArg(calleeABI, i, reg, def, stackSlotSize)
+ }
+
+ if isDirectCall {
+ call := m.allocateInstr()
+ call.asCall(directCallee, calleeABI)
+ m.insert(call)
+ } else {
+ ptr := m.compiler.VRegOf(indirectCalleePtr)
+ callInd := m.allocateInstr()
+ callInd.asCallIndirect(ptr, calleeABI)
+ m.insert(callInd)
+ }
+
+ var index int
+ r1, rs := si.Returns()
+ if r1.Valid() {
+ m.callerGenFunctionReturnVReg(calleeABI, 0, m.compiler.VRegOf(r1), stackSlotSize)
+ index++
+ }
+
+ for _, r := range rs {
+ m.callerGenFunctionReturnVReg(calleeABI, index, m.compiler.VRegOf(r), stackSlotSize)
+ index++
+ }
+}
+
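+// insertAddOrSubStackPointer emits rd = sp + diff (or sp - diff when add is false), using the
+// immediate-12 form when diff fits into it and otherwise materializing diff into the tmp register first.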
+func (m *machine) insertAddOrSubStackPointer(rd regalloc.VReg, diff int64, add bool) {
+ if imm12Operand, ok := asImm12Operand(uint64(diff)); ok {
+ alu := m.allocateInstr()
+ var ao aluOp
+ if add {
+ ao = aluOpAdd
+ } else {
+ ao = aluOpSub
+ }
+ alu.asALU(ao, operandNR(rd), operandNR(spVReg), imm12Operand, true)
+ m.insert(alu)
+ } else {
+ m.lowerConstantI64(tmpRegVReg, diff)
+ alu := m.allocateInstr()
+ var ao aluOp
+ if add {
+ ao = aluOpAdd
+ } else {
+ ao = aluOpSub
+ }
+ alu.asALU(ao, operandNR(rd), operandNR(spVReg), operandNR(tmpRegVReg), true)
+ m.insert(alu)
+ }
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.go
new file mode 100644
index 000000000..5f0c613df
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.go
@@ -0,0 +1,9 @@
+package arm64
+
+// entrypoint enters the machine code generated by this backend which begins with the preamble generated by functionABI.EmitGoEntryPreamble below.
+// This implements wazevo.entrypoint, and see the comments there for detail.
+func entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultPtr *uint64, goAllocatedStackSlicePtr uintptr)
+
+// afterGoFunctionCallEntrypoint enters the machine code after growing the stack.
+// This implements wazevo.afterGoFunctionCallEntrypoint, and see the comments there for detail.
+func afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr)
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.s b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.s
new file mode 100644
index 000000000..0b579f852
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.s
@@ -0,0 +1,29 @@
+//go:build arm64
+
+#include "funcdata.h"
+#include "textflag.h"
+
+// See the comments on EmitGoEntryPreamble for what this function is supposed to do.
+TEXT ·entrypoint(SB), NOSPLIT|NOFRAME, $0-48
+ MOVD preambleExecutable+0(FP), R27
+ MOVD functionExecutable+8(FP), R24
+ MOVD executionContextPtr+16(FP), R0
+ MOVD moduleContextPtr+24(FP), R1
+ MOVD paramResultSlicePtr+32(FP), R19
+ MOVD goAllocatedStackSlicePtr+40(FP), R26
+ JMP (R27)
+
+TEXT ·afterGoFunctionCallEntrypoint(SB), NOSPLIT|NOFRAME, $0-32
+ MOVD goCallReturnAddress+0(FP), R20
+ MOVD executionContextPtr+8(FP), R0
+ MOVD stackPointer+16(FP), R19
+
+ // Save the current FP(R29), SP and LR(R30) into the wazevo.executionContext (stored in R0).
+ MOVD R29, 16(R0) // Store FP(R29) into [R0, #ExecutionContextOffsets.OriginalFramePointer]
+ MOVD RSP, R27 // Move SP to R27 (temporary register) since SP cannot be stored directly in str instructions.
+ MOVD R27, 24(R0) // Store R27 (the original SP) into [R0, #ExecutionContextOffsets.OriginalStackPointer]
+ MOVD R30, 32(R0) // Store R30 into [R0, #ExecutionContextOffsets.GoReturnAddress]
+
+ // Load the new stack pointer (which sits somewhere in Go-allocated stack) into SP.
+ MOVD R19, RSP
+ JMP (R20)
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go
new file mode 100644
index 000000000..7a9cceb33
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go
@@ -0,0 +1,230 @@
+package arm64
+
+import (
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
+)
+
+// CompileEntryPreamble implements backend.Machine. This assumes `entrypoint` function (in abi_entry_arm64.s) passes:
+//
+// 1. The first argument (execution context ptr) and the second argument are already passed in x0 and x1.
+// 2. param/result slice ptr in x19; the pointer to []uint64{} which is used to pass arguments and accept return values.
+// 3. Go-allocated stack slice ptr in x26.
+// 4. Function executable in x24.
+//
+// also SP and FP are correct Go-runtime-based values, and LR is the return address to the Go-side caller.
+func (m *machine) CompileEntryPreamble(signature *ssa.Signature) []byte {
+ root := m.constructEntryPreamble(signature)
+ m.encode(root)
+ return m.compiler.Buf()
+}
+
+var (
+ executionContextPtrReg = x0VReg
+ // callee-saved regs so that they can be used in the prologue and epilogue.
+ paramResultSlicePtr = x19VReg
+ savedExecutionContextPtr = x20VReg
+ // goAllocatedStackPtr is not used in the epilogue.
+ goAllocatedStackPtr = x26VReg
+ // paramResultSliceCopied is not used in the epilogue.
+ paramResultSliceCopied = x25VReg
+ // functionExecutable is not used in the epilogue.
+ functionExecutable = x24VReg
+)
+
+func (m *machine) goEntryPreamblePassArg(cur *instruction, paramSlicePtr regalloc.VReg, arg *backend.ABIArg, argStartOffsetFromSP int64) *instruction {
+ typ := arg.Type
+ bits := typ.Bits()
+ isStackArg := arg.Kind == backend.ABIArgKindStack
+
+ var loadTargetReg operand
+ if !isStackArg {
+ loadTargetReg = operandNR(arg.Reg)
+ } else {
+ switch typ {
+ case ssa.TypeI32, ssa.TypeI64:
+ loadTargetReg = operandNR(x15VReg)
+ case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
+ loadTargetReg = operandNR(v15VReg)
+ default:
+ panic("TODO?")
+ }
+ }
+
+ var postIndexImm int64
+ if typ == ssa.TypeV128 {
+ postIndexImm = 16 // v128 is represented as 2x64-bit in Go slice.
+ } else {
+ postIndexImm = 8
+ }
+ loadMode := addressMode{kind: addressModeKindPostIndex, rn: paramSlicePtr, imm: postIndexImm}
+
+ instr := m.allocateInstr()
+ switch typ {
+ case ssa.TypeI32:
+ instr.asULoad(loadTargetReg, loadMode, 32)
+ case ssa.TypeI64:
+ instr.asULoad(loadTargetReg, loadMode, 64)
+ case ssa.TypeF32:
+ instr.asFpuLoad(loadTargetReg, loadMode, 32)
+ case ssa.TypeF64:
+ instr.asFpuLoad(loadTargetReg, loadMode, 64)
+ case ssa.TypeV128:
+ instr.asFpuLoad(loadTargetReg, loadMode, 128)
+ }
+ cur = linkInstr(cur, instr)
+
+ if isStackArg {
+ var storeMode addressMode
+ cur, storeMode = m.resolveAddressModeForOffsetAndInsert(cur, argStartOffsetFromSP+arg.Offset, bits, spVReg, true)
+ toStack := m.allocateInstr()
+ toStack.asStore(loadTargetReg, storeMode, bits)
+ cur = linkInstr(cur, toStack)
+ }
+ return cur
+}
+
+func (m *machine) goEntryPreamblePassResult(cur *instruction, resultSlicePtr regalloc.VReg, result *backend.ABIArg, resultStartOffsetFromSP int64) *instruction {
+ isStackArg := result.Kind == backend.ABIArgKindStack
+ typ := result.Type
+ bits := typ.Bits()
+
+ var storeTargetReg operand
+ if !isStackArg {
+ storeTargetReg = operandNR(result.Reg)
+ } else {
+ switch typ {
+ case ssa.TypeI32, ssa.TypeI64:
+ storeTargetReg = operandNR(x15VReg)
+ case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
+ storeTargetReg = operandNR(v15VReg)
+ default:
+ panic("TODO?")
+ }
+ }
+
+ var postIndexImm int64
+ if typ == ssa.TypeV128 {
+ postIndexImm = 16 // v128 is represented as 2x64-bit in Go slice.
+ } else {
+ postIndexImm = 8
+ }
+
+ if isStackArg {
+ var loadMode addressMode
+ cur, loadMode = m.resolveAddressModeForOffsetAndInsert(cur, resultStartOffsetFromSP+result.Offset, bits, spVReg, true)
+ toReg := m.allocateInstr()
+ switch typ {
+ case ssa.TypeI32, ssa.TypeI64:
+ toReg.asULoad(storeTargetReg, loadMode, bits)
+ case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
+ toReg.asFpuLoad(storeTargetReg, loadMode, bits)
+ default:
+ panic("TODO?")
+ }
+ cur = linkInstr(cur, toReg)
+ }
+
+ mode := addressMode{kind: addressModeKindPostIndex, rn: resultSlicePtr, imm: postIndexImm}
+ instr := m.allocateInstr()
+ instr.asStore(storeTargetReg, mode, bits)
+ cur = linkInstr(cur, instr)
+ return cur
+}
+
+func (m *machine) constructEntryPreamble(sig *ssa.Signature) (root *instruction) {
+ abi := backend.FunctionABI{}
+ abi.Init(sig, intParamResultRegs, floatParamResultRegs)
+
+ root = m.allocateNop()
+
+ //// ----------------------------------- prologue ----------------------------------- ////
+
+ // First, we save executionContextPtrReg into a callee-saved register so that it can be used in epilogue as well.
+ // mov savedExecutionContextPtr, x0
+ cur := m.move64(savedExecutionContextPtr, executionContextPtrReg, root)
+
+ // Next, save the current FP, SP and LR into the wazevo.executionContext:
+ // str fp, [savedExecutionContextPtr, #OriginalFramePointer]
+ // mov tmp, sp ;; sp cannot be str'ed directly.
+ // str tmp, [savedExecutionContextPtr, #OriginalStackPointer]
+ // str lr, [savedExecutionContextPtr, #GoReturnAddress]
+ cur = m.loadOrStoreAtExecutionContext(fpVReg, wazevoapi.ExecutionContextOffsetOriginalFramePointer, true, cur)
+ cur = m.move64(tmpRegVReg, spVReg, cur)
+ cur = m.loadOrStoreAtExecutionContext(tmpRegVReg, wazevoapi.ExecutionContextOffsetOriginalStackPointer, true, cur)
+ cur = m.loadOrStoreAtExecutionContext(lrVReg, wazevoapi.ExecutionContextOffsetGoReturnAddress, true, cur)
+
+ // Then, move the Go-allocated stack pointer to SP:
+ // mov sp, goAllocatedStackPtr
+ cur = m.move64(spVReg, goAllocatedStackPtr, cur)
+
+ prReg := paramResultSlicePtr
+ if len(abi.Args) > 2 && len(abi.Rets) > 0 {
+ // paramResultSlicePtr is modified during the execution of goEntryPreamblePassArg,
+ // so copy it to another reg.
+ cur = m.move64(paramResultSliceCopied, paramResultSlicePtr, cur)
+ prReg = paramResultSliceCopied
+ }
+
+ stackSlotSize := int64(abi.AlignedArgResultStackSlotSize())
+ for i := range abi.Args {
+ if i < 2 {
+ // execution context ptr and module context ptr are passed in x0 and x1 by the Go assembly function.
+ continue
+ }
+ arg := &abi.Args[i]
+ cur = m.goEntryPreamblePassArg(cur, prReg, arg, -stackSlotSize)
+ }
+
+ // Call the real function.
+ bl := m.allocateInstr()
+ bl.asCallIndirect(functionExecutable, &abi)
+ cur = linkInstr(cur, bl)
+
+ ///// ----------------------------------- epilogue ----------------------------------- /////
+
+ // Store the register results into paramResultSlicePtr.
+ for i := range abi.Rets {
+ cur = m.goEntryPreamblePassResult(cur, paramResultSlicePtr, &abi.Rets[i], abi.ArgStackSize-stackSlotSize)
+ }
+
+ // Finally, restore the FP, SP and LR, and return to the Go code.
+ // ldr fp, [savedExecutionContextPtr, #OriginalFramePointer]
+ // ldr tmp, [savedExecutionContextPtr, #OriginalStackPointer]
+ // mov sp, tmp ;; sp cannot be ldr'ed directly.
+ // ldr lr, [savedExecutionContextPtr, #GoReturnAddress]
+ // ret ;; --> return to the Go code
+ cur = m.loadOrStoreAtExecutionContext(fpVReg, wazevoapi.ExecutionContextOffsetOriginalFramePointer, false, cur)
+ cur = m.loadOrStoreAtExecutionContext(tmpRegVReg, wazevoapi.ExecutionContextOffsetOriginalStackPointer, false, cur)
+ cur = m.move64(spVReg, tmpRegVReg, cur)
+ cur = m.loadOrStoreAtExecutionContext(lrVReg, wazevoapi.ExecutionContextOffsetGoReturnAddress, false, cur)
+ retInst := m.allocateInstr()
+ retInst.asRet()
+ linkInstr(cur, retInst)
+ return
+}
+
+func (m *machine) move64(dst, src regalloc.VReg, prev *instruction) *instruction {
+ instr := m.allocateInstr()
+ instr.asMove64(dst, src)
+ return linkInstr(prev, instr)
+}
+
+func (m *machine) loadOrStoreAtExecutionContext(d regalloc.VReg, offset wazevoapi.Offset, store bool, prev *instruction) *instruction {
+ instr := m.allocateInstr()
+ mode := addressMode{kind: addressModeKindRegUnsignedImm12, rn: savedExecutionContextPtr, imm: offset.I64()}
+ if store {
+ instr.asStore(operandNR(d), mode, 64)
+ } else {
+ instr.asULoad(operandNR(d), mode, 64)
+ }
+ return linkInstr(prev, instr)
+}
+
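+// linkInstr appends next after prev in the doubly-linked instruction list and returns next so that calls can be chained.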
+func linkInstr(prev, next *instruction) *instruction {
+ prev.next = next
+ next.prev = prev
+ return next
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go
new file mode 100644
index 000000000..466b1f960
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go
@@ -0,0 +1,428 @@
+package arm64
+
+import (
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
+)
+
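+// calleeSavedRegistersSorted is the fixed, ordered list of registers preserved around a Go function call;
+// the order must match between saveRegistersInExecutionContext and restoreRegistersInExecutionContext so
+// that each register maps to the same 16-byte slot.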
+var calleeSavedRegistersSorted = []regalloc.VReg{
+ x19VReg, x20VReg, x21VReg, x22VReg, x23VReg, x24VReg, x25VReg, x26VReg, x28VReg,
+ v18VReg, v19VReg, v20VReg, v21VReg, v22VReg, v23VReg, v24VReg, v25VReg, v26VReg, v27VReg, v28VReg, v29VReg, v30VReg, v31VReg,
+}
+
+// CompileGoFunctionTrampoline implements backend.Machine.
+func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte {
+ exct := m.executableContext
+ argBegin := 1 // Skips exec context by default.
+ if needModuleContextPtr {
+ argBegin++
+ }
+
+ abi := &backend.FunctionABI{}
+ abi.Init(sig, intParamResultRegs, floatParamResultRegs)
+ m.currentABI = abi
+
+ cur := m.allocateInstr()
+ cur.asNop0()
+ exct.RootInstr = cur
+
+ // Execution context is always the first argument.
+ execCtrPtr := x0VReg
+
+ // In the following, we create the following stack layout:
+ //
+ // (high address)
+ // SP ------> +-----------------+ <----+
+ // | ....... | |
+ // | ret Y | |
+ // | ....... | |
+ // | ret 0 | |
+ // | arg X | | size_of_arg_ret
+ // | ....... | |
+ // | arg 1 | |
+ // | arg 0 | <----+ <-------- originalArg0Reg
+ // | size_of_arg_ret |
+ // | ReturnAddress |
+ // +-----------------+ <----+
+ // | xxxx | | ;; might be padded to make it 16-byte aligned.
+ // +--->| arg[N]/ret[M] | |
+ // sliceSize| | ............ | | goCallStackSize
+ // | | arg[1]/ret[1] | |
+ // +--->| arg[0]/ret[0] | <----+ <-------- arg0ret0AddrReg
+ // | sliceSize |
+ // | frame_size |
+ // +-----------------+
+ // (low address)
+ //
+ // where the region of "arg[0]/ret[0] ... arg[N]/ret[M]" is the stack used by the Go functions,
+ // therefore will be accessed as the usual []uint64. So that's where we need to pass/receive
+ // the arguments/return values.
+
+ // First of all, update the SP to create the "ReturnAddress + size_of_arg_ret" slot.
+ cur = m.createReturnAddrAndSizeOfArgRetSlot(cur)
+
+ const frameInfoSize = 16 // == frame_size + sliceSize.
+
+ // Next, we should allocate the stack for the Go function call if necessary.
+ goCallStackSize, sliceSizeInBytes := backend.GoFunctionCallRequiredStackSize(sig, argBegin)
+ cur = m.insertStackBoundsCheck(goCallStackSize+frameInfoSize, cur)
+
+ originalArg0Reg := x17VReg // Caller save, so we can use it for whatever we want.
+ if m.currentABI.AlignedArgResultStackSlotSize() > 0 {
+ // At this point, SP points to `ReturnAddress`, so add 16 to get the original arg 0 slot.
+ cur = m.addsAddOrSubStackPointer(cur, originalArg0Reg, frameInfoSize, true)
+ }
+
+ // Save the callee saved registers.
+ cur = m.saveRegistersInExecutionContext(cur, calleeSavedRegistersSorted)
+
+ if needModuleContextPtr {
+ offset := wazevoapi.ExecutionContextOffsetGoFunctionCallCalleeModuleContextOpaque.I64()
+ if !offsetFitsInAddressModeKindRegUnsignedImm12(64, offset) {
+ panic("BUG: too large or un-aligned offset for goFunctionCallCalleeModuleContextOpaque in execution context")
+ }
+
+ // Module context is always the second argument.
+ moduleCtrPtr := x1VReg
+ store := m.allocateInstr()
+ amode := addressMode{kind: addressModeKindRegUnsignedImm12, rn: execCtrPtr, imm: offset}
+ store.asStore(operandNR(moduleCtrPtr), amode, 64)
+ cur = linkInstr(cur, store)
+ }
+
+ // Advance the stack pointer downwards to reserve the Go-call stack region.
+ cur = m.addsAddOrSubStackPointer(cur, spVReg, goCallStackSize, false)
+
+ // Copy the pointer to x15VReg.
+ arg0ret0AddrReg := x15VReg // Caller save, so we can use it for whatever we want.
+ copySp := m.allocateInstr()
+ copySp.asMove64(arg0ret0AddrReg, spVReg)
+ cur = linkInstr(cur, copySp)
+
+ // Next, we need to store all the arguments to the stack in the typical Wasm stack style.
+ for i := range abi.Args[argBegin:] {
+ arg := &abi.Args[argBegin+i]
+ store := m.allocateInstr()
+ var v regalloc.VReg
+ if arg.Kind == backend.ABIArgKindReg {
+ v = arg.Reg
+ } else {
+ cur, v = m.goFunctionCallLoadStackArg(cur, originalArg0Reg, arg,
+ // Caller save, so we can use it for whatever we want.
+ x11VReg, v11VReg)
+ }
+
+ var sizeInBits byte
+ if arg.Type == ssa.TypeV128 {
+ sizeInBits = 128
+ } else {
+ sizeInBits = 64
+ }
+ store.asStore(operandNR(v),
+ addressMode{
+ kind: addressModeKindPostIndex,
+ rn: arg0ret0AddrReg, imm: int64(sizeInBits / 8),
+ }, sizeInBits)
+ cur = linkInstr(cur, store)
+ }
+
+ // Finally, now that we've advanced SP to arg[0]/ret[0], we push the `frame_size` and `sliceSize` values below it.
+ var frameSizeReg, sliceSizeReg regalloc.VReg
+ if goCallStackSize > 0 {
+ cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, goCallStackSize)
+ frameSizeReg = tmpRegVReg
+ cur = m.lowerConstantI64AndInsert(cur, x16VReg, sliceSizeInBytes/8)
+ sliceSizeReg = x16VReg
+ } else {
+ frameSizeReg = xzrVReg
+ sliceSizeReg = xzrVReg
+ }
+ _amode := addressModePreOrPostIndex(spVReg, -16, true)
+ storeP := m.allocateInstr()
+ storeP.asStorePair64(frameSizeReg, sliceSizeReg, _amode)
+ cur = linkInstr(cur, storeP)
+
+ // Set the exit status on the execution context.
+ cur = m.setExitCode(cur, x0VReg, exitCode)
+
+ // Save the current stack pointer.
+ cur = m.saveCurrentStackPointer(cur, x0VReg)
+
+ // Exit the execution.
+ cur = m.storeReturnAddressAndExit(cur)
+
+ // After the call, we need to restore the callee saved registers.
+ cur = m.restoreRegistersInExecutionContext(cur, calleeSavedRegistersSorted)
+
+ // Get the pointer to the arg[0]/ret[0]: We need to skip `frame_size + sliceSize`.
+ if len(abi.Rets) > 0 {
+ cur = m.addsAddOrSubStackPointer(cur, arg0ret0AddrReg, frameInfoSize, true)
+ }
+
+ // Advances the SP so that it points to `ReturnAddress`.
+ cur = m.addsAddOrSubStackPointer(cur, spVReg, frameInfoSize+goCallStackSize, true)
+ ldr := m.allocateInstr()
+ // And load the return address.
+ ldr.asULoad(operandNR(lrVReg),
+ addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64)
+ cur = linkInstr(cur, ldr)
+
+ originalRet0Reg := x17VReg // Caller save, so we can use it for whatever we want.
+ if m.currentABI.RetStackSize > 0 {
+ cur = m.addsAddOrSubStackPointer(cur, originalRet0Reg, m.currentABI.ArgStackSize, true)
+ }
+
+ // Make the SP point to the original address (above the result slot).
+ if s := int64(m.currentABI.AlignedArgResultStackSlotSize()); s > 0 {
+ cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true)
+ }
+
+ for i := range abi.Rets {
+ r := &abi.Rets[i]
+ if r.Kind == backend.ABIArgKindReg {
+ loadIntoReg := m.allocateInstr()
+ mode := addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg}
+ switch r.Type {
+ case ssa.TypeI32:
+ mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
+ loadIntoReg.asULoad(operandNR(r.Reg), mode, 32)
+ case ssa.TypeI64:
+ mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
+ loadIntoReg.asULoad(operandNR(r.Reg), mode, 64)
+ case ssa.TypeF32:
+ mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
+ loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 32)
+ case ssa.TypeF64:
+ mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
+ loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 64)
+ case ssa.TypeV128:
+ mode.imm = 16
+ loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 128)
+ default:
+ panic("TODO")
+ }
+ cur = linkInstr(cur, loadIntoReg)
+ } else {
+ // First we need to load the value into a temporary, just like the register case above.
+ intTmp, floatTmp := x11VReg, v11VReg
+ loadIntoTmpReg := m.allocateInstr()
+ mode := addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg}
+ var resultReg regalloc.VReg
+ switch r.Type {
+ case ssa.TypeI32:
+ mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
+ loadIntoTmpReg.asULoad(operandNR(intTmp), mode, 32)
+ resultReg = intTmp
+ case ssa.TypeI64:
+ mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
+ loadIntoTmpReg.asULoad(operandNR(intTmp), mode, 64)
+ resultReg = intTmp
+ case ssa.TypeF32:
+ mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
+ loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 32)
+ resultReg = floatTmp
+ case ssa.TypeF64:
+ mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
+ loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 64)
+ resultReg = floatTmp
+ case ssa.TypeV128:
+ mode.imm = 16
+ loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 128)
+ resultReg = floatTmp
+ default:
+ panic("TODO")
+ }
+ cur = linkInstr(cur, loadIntoTmpReg)
+ cur = m.goFunctionCallStoreStackResult(cur, originalRet0Reg, r, resultReg)
+ }
+ }
+
+ ret := m.allocateInstr()
+ ret.asRet()
+ linkInstr(cur, ret)
+
+ m.encode(m.executableContext.RootInstr)
+ return m.compiler.Buf()
+}
+
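+// saveRegistersInExecutionContext stores each register in regs into the execution context's saved-register
+// area, advancing by 16 bytes per register so that both 64-bit integer and 128-bit vector registers fit.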
+func (m *machine) saveRegistersInExecutionContext(cur *instruction, regs []regalloc.VReg) *instruction {
+ offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64()
+ for _, v := range regs {
+ store := m.allocateInstr()
+ var sizeInBits byte
+ switch v.RegType() {
+ case regalloc.RegTypeInt:
+ sizeInBits = 64
+ case regalloc.RegTypeFloat:
+ sizeInBits = 128
+ }
+ store.asStore(operandNR(v),
+ addressMode{
+ kind: addressModeKindRegUnsignedImm12,
+ // Execution context is always the first argument.
+ rn: x0VReg, imm: offset,
+ }, sizeInBits)
+ store.prev = cur
+ cur.next = store
+ cur = store
+ offset += 16 // The imm12 offset must be 16-byte aligned for vector registers, so every register is stored at a multiple-of-16 offset.
+ }
+ return cur
+}
+
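+// restoreRegistersInExecutionContext reloads the registers saved by saveRegistersInExecutionContext,
+// walking the same 16-byte-strided offsets.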
+func (m *machine) restoreRegistersInExecutionContext(cur *instruction, regs []regalloc.VReg) *instruction {
+ offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64()
+ for _, v := range regs {
+ load := m.allocateInstr()
+ var as func(dst operand, amode addressMode, sizeInBits byte)
+ var sizeInBits byte
+ switch v.RegType() {
+ case regalloc.RegTypeInt:
+ as = load.asULoad
+ sizeInBits = 64
+ case regalloc.RegTypeFloat:
+ as = load.asFpuLoad
+ sizeInBits = 128
+ }
+ as(operandNR(v),
+ addressMode{
+ kind: addressModeKindRegUnsignedImm12,
+ // Execution context is always the first argument.
+ rn: x0VReg, imm: offset,
+ }, sizeInBits)
+ cur = linkInstr(cur, load)
+ offset += 16 // The imm12 offset must be 16-byte aligned for vector registers, so every register is loaded from a multiple-of-16 offset.
+ }
+ return cur
+}
+
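+// lowerConstantI64AndInsert materializes the 64-bit constant v into dst and links the generated
+// instructions after cur, reusing the executable context's pending-instruction buffer; the I32
+// variant below works the same way.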
+func (m *machine) lowerConstantI64AndInsert(cur *instruction, dst regalloc.VReg, v int64) *instruction {
+ exct := m.executableContext
+ exct.PendingInstructions = exct.PendingInstructions[:0]
+ m.lowerConstantI64(dst, v)
+ for _, instr := range exct.PendingInstructions {
+ cur = linkInstr(cur, instr)
+ }
+ return cur
+}
+
+func (m *machine) lowerConstantI32AndInsert(cur *instruction, dst regalloc.VReg, v int32) *instruction {
+ exct := m.executableContext
+ exct.PendingInstructions = exct.PendingInstructions[:0]
+ m.lowerConstantI32(dst, v)
+ for _, instr := range exct.PendingInstructions {
+ cur = linkInstr(cur, instr)
+ }
+ return cur
+}
+
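+// setExitCode stores the 32-bit exit code into the execution context pointed to by execCtr.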
+func (m *machine) setExitCode(cur *instruction, execCtr regalloc.VReg, exitCode wazevoapi.ExitCode) *instruction {
+ constReg := x17VReg // caller-saved, so we can use it.
+ cur = m.lowerConstantI32AndInsert(cur, constReg, int32(exitCode))
+
+ // Set the exit status on the execution context.
+ setExitStatus := m.allocateInstr()
+ setExitStatus.asStore(operandNR(constReg),
+ addressMode{
+ kind: addressModeKindRegUnsignedImm12,
+ rn: execCtr, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(),
+ }, 32)
+ cur = linkInstr(cur, setExitStatus)
+ return cur
+}
+
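+// storeReturnAddressAndExit records the address at which execution should resume after the Go call
+// into the execution context, and then emits the exit sequence that transfers control back to Go.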
+func (m *machine) storeReturnAddressAndExit(cur *instruction) *instruction {
+ // Read the return address into tmp, and store it in the execution context.
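+ // The adr below computes the address exitSequenceSize+8 bytes ahead of itself, i.e. the first
+ // instruction after the exit sequence, which is where execution resumes once control returns.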
+ adr := m.allocateInstr()
+ adr.asAdr(tmpRegVReg, exitSequenceSize+8)
+ cur = linkInstr(cur, adr)
+
+ storeReturnAddr := m.allocateInstr()
+ storeReturnAddr.asStore(operandNR(tmpRegVReg),
+ addressMode{
+ kind: addressModeKindRegUnsignedImm12,
+ // Execution context is always the first argument.
+ rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(),
+ }, 64)
+ cur = linkInstr(cur, storeReturnAddr)
+
+ // Exit the execution.
+ trapSeq := m.allocateInstr()
+ trapSeq.asExitSequence(x0VReg)
+ cur = linkInstr(cur, trapSeq)
+ return cur
+}
+
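+// saveCurrentStackPointer copies the machine SP (via tmp, since SP cannot be stored directly) into
+// the execution context's stackPointerBeforeGoCall field.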
+func (m *machine) saveCurrentStackPointer(cur *instruction, execCtr regalloc.VReg) *instruction {
+ // Save the current stack pointer:
+ // mov tmp, sp,
+ // str tmp, [exec_ctx, #stackPointerBeforeGoCall]
+ movSp := m.allocateInstr()
+ movSp.asMove64(tmpRegVReg, spVReg)
+ cur = linkInstr(cur, movSp)
+
+ strSp := m.allocateInstr()
+ strSp.asStore(operandNR(tmpRegVReg),
+ addressMode{
+ kind: addressModeKindRegUnsignedImm12,
+ rn: execCtr, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(),
+ }, 64)
+ cur = linkInstr(cur, strSp)
+ return cur
+}
+
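+// goFunctionCallLoadStackArg loads a stack-passed argument from the slot pointed to by originalArg0Reg
+// into intVReg or floatVReg (depending on the type), post-incrementing originalArg0Reg by the slot size,
+// and returns the register now holding the value.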
+func (m *machine) goFunctionCallLoadStackArg(cur *instruction, originalArg0Reg regalloc.VReg, arg *backend.ABIArg, intVReg, floatVReg regalloc.VReg) (*instruction, regalloc.VReg) {
+ load := m.allocateInstr()
+ var result regalloc.VReg
+ mode := addressMode{kind: addressModeKindPostIndex, rn: originalArg0Reg}
+ switch arg.Type {
+ case ssa.TypeI32:
+ mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
+ load.asULoad(operandNR(intVReg), mode, 32)
+ result = intVReg
+ case ssa.TypeI64:
+ mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
+ load.asULoad(operandNR(intVReg), mode, 64)
+ result = intVReg
+ case ssa.TypeF32:
+ mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
+ load.asFpuLoad(operandNR(floatVReg), mode, 32)
+ result = floatVReg
+ case ssa.TypeF64:
+ mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
+ load.asFpuLoad(operandNR(floatVReg), mode, 64)
+ result = floatVReg
+ case ssa.TypeV128:
+ mode.imm = 16
+ load.asFpuLoad(operandNR(floatVReg), mode, 128)
+ result = floatVReg
+ default:
+ panic("TODO")
+ }
+
+ cur = linkInstr(cur, load)
+ return cur, result
+}
+
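+// goFunctionCallStoreStackResult stores a stack-returned result from resultVReg into the slot pointed to
+// by originalRet0Reg, post-incrementing it by the slot size.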
+func (m *machine) goFunctionCallStoreStackResult(cur *instruction, originalRet0Reg regalloc.VReg, result *backend.ABIArg, resultVReg regalloc.VReg) *instruction {
+ store := m.allocateInstr()
+ mode := addressMode{kind: addressModeKindPostIndex, rn: originalRet0Reg}
+ var sizeInBits byte
+ switch result.Type {
+ case ssa.TypeI32, ssa.TypeF32:
+ mode.imm = 8
+ sizeInBits = 32
+ case ssa.TypeI64, ssa.TypeF64:
+ mode.imm = 8
+ sizeInBits = 64
+ case ssa.TypeV128:
+ mode.imm = 16
+ sizeInBits = 128
+ default:
+ panic("TODO")
+ }
+ store.asStore(operandNR(resultVReg), mode, sizeInBits)
+ return linkInstr(cur, store)
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/cond.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/cond.go
new file mode 100644
index 000000000..6f6cdd1b2
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/cond.go
@@ -0,0 +1,215 @@
+package arm64
+
+import (
+ "strconv"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+)
+
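+// cond is a compact encoding of a branch condition: the low two bits hold the condKind, and the
+// remaining bits hold either the register (for CBZ/CBNZ-style conditions) or the condFlag (for B.cond).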
+type (
+ cond uint64
+ condKind byte
+)
+
+const (
+ // condKindRegisterZero represents a condition which checks if the register is zero.
+ // This indicates that the instruction must be encoded as CBZ:
+ // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/CBZ--Compare-and-Branch-on-Zero-
+ condKindRegisterZero condKind = iota
+ // condKindRegisterNotZero indicates that the instruction must be encoded as CBNZ:
+ // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/CBNZ--Compare-and-Branch-on-Nonzero-
+ condKindRegisterNotZero
+ // condKindCondFlagSet indicates that the instruction must be encoded as B.cond:
+ // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/B-cond--Branch-conditionally-
+ condKindCondFlagSet
+)
+
+// kind returns the kind of condition which is stored in the first two bits.
+func (c cond) kind() condKind {
+ return condKind(c & 0b11)
+}
+
+func (c cond) asUint64() uint64 {
+ return uint64(c)
+}
+
+// register returns the register for register conditions.
+// This panics if the condition is not a register condition (condKindRegisterZero or condKindRegisterNotZero).
+func (c cond) register() regalloc.VReg {
+ if c.kind() != condKindRegisterZero && c.kind() != condKindRegisterNotZero {
+ panic("condition is not a register")
+ }
+ return regalloc.VReg(c >> 2)
+}
+
+func registerAsRegZeroCond(r regalloc.VReg) cond {
+ return cond(r)<<2 | cond(condKindRegisterZero)
+}
+
+func registerAsRegNotZeroCond(r regalloc.VReg) cond {
+ return cond(r)<<2 | cond(condKindRegisterNotZero)
+}
+
+func (c cond) flag() condFlag {
+ if c.kind() != condKindCondFlagSet {
+ panic("condition is not a flag")
+ }
+ return condFlag(c >> 2)
+}
+
+func (c condFlag) asCond() cond {
+ return cond(c)<<2 | cond(condKindCondFlagSet)
+}
+
+// condFlag represents a condition flag for conditional branches.
+// The value matches the encoding of condition flags in the ARM64 instruction set.
+// https://developer.arm.com/documentation/den0024/a/The-A64-instruction-set/Data-processing-instructions/Conditional-instructions
+type condFlag uint8
+
+const (
+ eq condFlag = iota // eq represents "equal"
+ ne // ne represents "not equal"
+ hs // hs represents "higher or same"
+ lo // lo represents "lower"
+ mi // mi represents "minus or negative result"
+ pl // pl represents "plus or positive result"
+ vs // vs represents "overflow set"
+ vc // vc represents "overflow clear"
+ hi // hi represents "higher"
+ ls // ls represents "lower or same"
+ ge // ge represents "greater or equal"
+ lt // lt represents "less than"
+ gt // gt represents "greater than"
+ le // le represents "less than or equal"
+ al // al represents "always"
+ nv // nv represents "never"
+)
+
+// invert returns the inverted condition.
+func (c condFlag) invert() condFlag {
+ switch c {
+ case eq:
+ return ne
+ case ne:
+ return eq
+ case hs:
+ return lo
+ case lo:
+ return hs
+ case mi:
+ return pl
+ case pl:
+ return mi
+ case vs:
+ return vc
+ case vc:
+ return vs
+ case hi:
+ return ls
+ case ls:
+ return hi
+ case ge:
+ return lt
+ case lt:
+ return ge
+ case gt:
+ return le
+ case le:
+ return gt
+ case al:
+ return nv
+ case nv:
+ return al
+ default:
+ panic(c)
+ }
+}
+
+// String implements fmt.Stringer.
+func (c condFlag) String() string {
+ switch c {
+ case eq:
+ return "eq"
+ case ne:
+ return "ne"
+ case hs:
+ return "hs"
+ case lo:
+ return "lo"
+ case mi:
+ return "mi"
+ case pl:
+ return "pl"
+ case vs:
+ return "vs"
+ case vc:
+ return "vc"
+ case hi:
+ return "hi"
+ case ls:
+ return "ls"
+ case ge:
+ return "ge"
+ case lt:
+ return "lt"
+ case gt:
+ return "gt"
+ case le:
+ return "le"
+ case al:
+ return "al"
+ case nv:
+ return "nv"
+ default:
+ panic(strconv.Itoa(int(c)))
+ }
+}
+
+// condFlagFromSSAIntegerCmpCond returns the condition flag for the given ssa.IntegerCmpCond.
+func condFlagFromSSAIntegerCmpCond(c ssa.IntegerCmpCond) condFlag {
+ switch c {
+ case ssa.IntegerCmpCondEqual:
+ return eq
+ case ssa.IntegerCmpCondNotEqual:
+ return ne
+ case ssa.IntegerCmpCondSignedLessThan:
+ return lt
+ case ssa.IntegerCmpCondSignedGreaterThanOrEqual:
+ return ge
+ case ssa.IntegerCmpCondSignedGreaterThan:
+ return gt
+ case ssa.IntegerCmpCondSignedLessThanOrEqual:
+ return le
+ case ssa.IntegerCmpCondUnsignedLessThan:
+ return lo
+ case ssa.IntegerCmpCondUnsignedGreaterThanOrEqual:
+ return hs
+ case ssa.IntegerCmpCondUnsignedGreaterThan:
+ return hi
+ case ssa.IntegerCmpCondUnsignedLessThanOrEqual:
+ return ls
+ default:
+ panic(c)
+ }
+}
+
+// condFlagFromSSAFloatCmpCond returns the condition flag for the given ssa.FloatCmpCond.
+func condFlagFromSSAFloatCmpCond(c ssa.FloatCmpCond) condFlag {
+ switch c {
+ case ssa.FloatCmpCondEqual:
+ return eq
+ case ssa.FloatCmpCondNotEqual:
+ return ne
+ case ssa.FloatCmpCondLessThan:
+ return mi
+ case ssa.FloatCmpCondLessThanOrEqual:
+ return ls
+ case ssa.FloatCmpCondGreaterThan:
+ return gt
+ case ssa.FloatCmpCondGreaterThanOrEqual:
+ return ge
+ default:
+ panic(c)
+ }
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go
new file mode 100644
index 000000000..8aabc5997
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go
@@ -0,0 +1,2545 @@
+package arm64
+
+import (
+ "fmt"
+ "math"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+)
+
+type (
+ // instruction represents either a real instruction in arm64, or the meta instructions
+ // that are convenient for code generation. For example, inline constants are also treated
+ // as instructions.
+ //
+ // Each instruction knows how to encode itself into machine code; hence, the final output of compilation
+ // can be considered equivalent to the sequence of such instructions.
+ //
+ // Each field is interpreted depending on the kind.
+ //
+ // TODO: optimize the layout later once the impl settles.
+ instruction struct {
+ prev, next *instruction
+ u1, u2, u3 uint64
+ rd, rm, rn, ra operand
+ amode addressMode
+ kind instructionKind
+ addedBeforeRegAlloc bool
+ }
+
+ // instructionKind represents the kind of instruction.
+ // This controls how the instruction struct is interpreted.
+ instructionKind byte
+)
+
+func asNop0(i *instruction) {
+ i.kind = nop0
+}
+
+func setNext(i, next *instruction) {
+ i.next = next
+}
+
+func setPrev(i, prev *instruction) {
+ i.prev = prev
+}
+
+// IsCall implements regalloc.Instr IsCall.
+func (i *instruction) IsCall() bool {
+ return i.kind == call
+}
+
+// IsIndirectCall implements regalloc.Instr IsIndirectCall.
+func (i *instruction) IsIndirectCall() bool {
+ return i.kind == callInd
+}
+
+// IsReturn implements regalloc.Instr IsReturn.
+func (i *instruction) IsReturn() bool {
+ return i.kind == ret
+}
+
+// Next implements regalloc.Instr Next.
+func (i *instruction) Next() regalloc.Instr {
+ return i.next
+}
+
+// Prev implements regalloc.Instr Prev.
+func (i *instruction) Prev() regalloc.Instr {
+ return i.prev
+}
+
+// AddedBeforeRegAlloc implements regalloc.Instr AddedBeforeRegAlloc.
+func (i *instruction) AddedBeforeRegAlloc() bool {
+ return i.addedBeforeRegAlloc
+}
+
+type defKind byte
+
+const (
+ defKindNone defKind = iota + 1
+ defKindRD
+ defKindCall
+)
+
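+// defKinds maps each instructionKind to the way the instruction defines (writes) registers;
+// Defs and AssignDef below consult this table.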
+var defKinds = [numInstructionKinds]defKind{
+ adr: defKindRD,
+ aluRRR: defKindRD,
+ aluRRRR: defKindRD,
+ aluRRImm12: defKindRD,
+ aluRRBitmaskImm: defKindRD,
+ aluRRRShift: defKindRD,
+ aluRRImmShift: defKindRD,
+ aluRRRExtend: defKindRD,
+ bitRR: defKindRD,
+ movZ: defKindRD,
+ movK: defKindRD,
+ movN: defKindRD,
+ mov32: defKindRD,
+ mov64: defKindRD,
+ fpuMov64: defKindRD,
+ fpuMov128: defKindRD,
+ fpuRR: defKindRD,
+ fpuRRR: defKindRD,
+ nop0: defKindNone,
+ call: defKindCall,
+ callInd: defKindCall,
+ ret: defKindNone,
+ store8: defKindNone,
+ store16: defKindNone,
+ store32: defKindNone,
+ store64: defKindNone,
+ exitSequence: defKindNone,
+ condBr: defKindNone,
+ br: defKindNone,
+ brTableSequence: defKindNone,
+ cSet: defKindRD,
+ extend: defKindRD,
+ fpuCmp: defKindNone,
+ uLoad8: defKindRD,
+ uLoad16: defKindRD,
+ uLoad32: defKindRD,
+ sLoad8: defKindRD,
+ sLoad16: defKindRD,
+ sLoad32: defKindRD,
+ uLoad64: defKindRD,
+ fpuLoad32: defKindRD,
+ fpuLoad64: defKindRD,
+ fpuLoad128: defKindRD,
+ vecLoad1R: defKindRD,
+ loadFpuConst32: defKindRD,
+ loadFpuConst64: defKindRD,
+ loadFpuConst128: defKindRD,
+ fpuStore32: defKindNone,
+ fpuStore64: defKindNone,
+ fpuStore128: defKindNone,
+ udf: defKindNone,
+ cSel: defKindRD,
+ fpuCSel: defKindRD,
+ movToVec: defKindRD,
+ movFromVec: defKindRD,
+ movFromVecSigned: defKindRD,
+ vecDup: defKindRD,
+ vecDupElement: defKindRD,
+ vecExtract: defKindRD,
+ vecMisc: defKindRD,
+ vecMovElement: defKindRD,
+ vecLanes: defKindRD,
+ vecShiftImm: defKindRD,
+ vecTbl: defKindRD,
+ vecTbl2: defKindRD,
+ vecPermute: defKindRD,
+ vecRRR: defKindRD,
+ vecRRRRewrite: defKindNone,
+ fpuToInt: defKindRD,
+ intToFpu: defKindRD,
+ cCmpImm: defKindNone,
+ movToFPSR: defKindNone,
+ movFromFPSR: defKindRD,
+ emitSourceOffsetInfo: defKindNone,
+ atomicRmw: defKindRD,
+ atomicCas: defKindNone,
+ atomicLoad: defKindRD,
+ atomicStore: defKindNone,
+ dmb: defKindNone,
+ loadConstBlockArg: defKindRD,
+}
+
+// Defs returns the list of regalloc.VReg that are defined by the instruction.
+// In order to reduce the number of allocations, the caller can pass the slice to be used.
+func (i *instruction) Defs(regs *[]regalloc.VReg) []regalloc.VReg {
+ *regs = (*regs)[:0]
+ switch defKinds[i.kind] {
+ case defKindNone:
+ case defKindRD:
+ *regs = append(*regs, i.rd.nr())
+ case defKindCall:
+ _, _, retIntRealRegs, retFloatRealRegs, _ := backend.ABIInfoFromUint64(i.u2)
+ for i := byte(0); i < retIntRealRegs; i++ {
+ *regs = append(*regs, regInfo.RealRegToVReg[intParamResultRegs[i]])
+ }
+ for i := byte(0); i < retFloatRealRegs; i++ {
+ *regs = append(*regs, regInfo.RealRegToVReg[floatParamResultRegs[i]])
+ }
+ default:
+ panic(fmt.Sprintf("defKind for %v not defined", i))
+ }
+ return *regs
+}
+
+// AssignDef implements regalloc.Instr AssignDef.
+func (i *instruction) AssignDef(reg regalloc.VReg) {
+ switch defKinds[i.kind] {
+ case defKindNone:
+ case defKindRD:
+ i.rd = i.rd.assignReg(reg)
+ case defKindCall:
+ panic("BUG: call instructions shouldn't be assigned")
+ default:
+ panic(fmt.Sprintf("defKind for %v not defined", i))
+ }
+}
+
+type useKind byte
+
+const (
+ useKindNone useKind = iota + 1
+ useKindRN
+ useKindRNRM
+ useKindRNRMRA
+ useKindRNRN1RM
+ useKindCall
+ useKindCallInd
+ useKindAMode
+ useKindRNAMode
+ useKindCond
+ // useKindRDRewrite indicates an instruction where RD is used both as a source and a destination.
+ // A temporary register must be allocated for RD explicitly: the source is copied into it before
+ // the instruction executes, and its value is then copied out to the instruction's return register.
+ useKindRDRewrite
+)
+
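+// useKinds maps each instructionKind to the way the instruction uses (reads) registers;
+// Uses and AssignUse below consult this table.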
+var useKinds = [numInstructionKinds]useKind{
+ udf: useKindNone,
+ aluRRR: useKindRNRM,
+ aluRRRR: useKindRNRMRA,
+ aluRRImm12: useKindRN,
+ aluRRBitmaskImm: useKindRN,
+ aluRRRShift: useKindRNRM,
+ aluRRImmShift: useKindRN,
+ aluRRRExtend: useKindRNRM,
+ bitRR: useKindRN,
+ movZ: useKindNone,
+ movK: useKindNone,
+ movN: useKindNone,
+ mov32: useKindRN,
+ mov64: useKindRN,
+ fpuMov64: useKindRN,
+ fpuMov128: useKindRN,
+ fpuRR: useKindRN,
+ fpuRRR: useKindRNRM,
+ nop0: useKindNone,
+ call: useKindCall,
+ callInd: useKindCallInd,
+ ret: useKindNone,
+ store8: useKindRNAMode,
+ store16: useKindRNAMode,
+ store32: useKindRNAMode,
+ store64: useKindRNAMode,
+ exitSequence: useKindRN,
+ condBr: useKindCond,
+ br: useKindNone,
+ brTableSequence: useKindRN,
+ cSet: useKindNone,
+ extend: useKindRN,
+ fpuCmp: useKindRNRM,
+ uLoad8: useKindAMode,
+ uLoad16: useKindAMode,
+ uLoad32: useKindAMode,
+ sLoad8: useKindAMode,
+ sLoad16: useKindAMode,
+ sLoad32: useKindAMode,
+ uLoad64: useKindAMode,
+ fpuLoad32: useKindAMode,
+ fpuLoad64: useKindAMode,
+ fpuLoad128: useKindAMode,
+ fpuStore32: useKindRNAMode,
+ fpuStore64: useKindRNAMode,
+ fpuStore128: useKindRNAMode,
+ loadFpuConst32: useKindNone,
+ loadFpuConst64: useKindNone,
+ loadFpuConst128: useKindNone,
+ vecLoad1R: useKindRN,
+ cSel: useKindRNRM,
+ fpuCSel: useKindRNRM,
+ movToVec: useKindRN,
+ movFromVec: useKindRN,
+ movFromVecSigned: useKindRN,
+ vecDup: useKindRN,
+ vecDupElement: useKindRN,
+ vecExtract: useKindRNRM,
+ cCmpImm: useKindRN,
+ vecMisc: useKindRN,
+ vecMovElement: useKindRN,
+ vecLanes: useKindRN,
+ vecShiftImm: useKindRN,
+ vecTbl: useKindRNRM,
+ vecTbl2: useKindRNRN1RM,
+ vecRRR: useKindRNRM,
+ vecRRRRewrite: useKindRDRewrite,
+ vecPermute: useKindRNRM,
+ fpuToInt: useKindRN,
+ intToFpu: useKindRN,
+ movToFPSR: useKindRN,
+ movFromFPSR: useKindNone,
+ adr: useKindNone,
+ emitSourceOffsetInfo: useKindNone,
+ atomicRmw: useKindRNRM,
+ atomicCas: useKindRDRewrite,
+ atomicLoad: useKindRN,
+ atomicStore: useKindRNRM,
+ loadConstBlockArg: useKindNone,
+ dmb: useKindNone,
+}
+
+// Uses returns the list of regalloc.VReg that are used by the instruction.
+// In order to reduce the number of allocations, the caller can pass the slice to be used.
+func (i *instruction) Uses(regs *[]regalloc.VReg) []regalloc.VReg {
+ *regs = (*regs)[:0]
+ switch useKinds[i.kind] {
+ case useKindNone:
+ case useKindRN:
+ if rn := i.rn.reg(); rn.Valid() {
+ *regs = append(*regs, rn)
+ }
+ case useKindRNRM:
+ if rn := i.rn.reg(); rn.Valid() {
+ *regs = append(*regs, rn)
+ }
+ if rm := i.rm.reg(); rm.Valid() {
+ *regs = append(*regs, rm)
+ }
+ case useKindRNRMRA:
+ if rn := i.rn.reg(); rn.Valid() {
+ *regs = append(*regs, rn)
+ }
+ if rm := i.rm.reg(); rm.Valid() {
+ *regs = append(*regs, rm)
+ }
+ if ra := i.ra.reg(); ra.Valid() {
+ *regs = append(*regs, ra)
+ }
+ case useKindRNRN1RM:
+ if rn := i.rn.reg(); rn.Valid() && rn.IsRealReg() {
+ rn1 := regalloc.FromRealReg(rn.RealReg()+1, rn.RegType())
+ *regs = append(*regs, rn, rn1)
+ }
+ if rm := i.rm.reg(); rm.Valid() {
+ *regs = append(*regs, rm)
+ }
+ case useKindAMode:
+ if amodeRN := i.amode.rn; amodeRN.Valid() {
+ *regs = append(*regs, amodeRN)
+ }
+ if amodeRM := i.amode.rm; amodeRM.Valid() {
+ *regs = append(*regs, amodeRM)
+ }
+ case useKindRNAMode:
+ *regs = append(*regs, i.rn.reg())
+ if amodeRN := i.amode.rn; amodeRN.Valid() {
+ *regs = append(*regs, amodeRN)
+ }
+ if amodeRM := i.amode.rm; amodeRM.Valid() {
+ *regs = append(*regs, amodeRM)
+ }
+ case useKindCond:
+ cnd := cond(i.u1)
+ if cnd.kind() != condKindCondFlagSet {
+ *regs = append(*regs, cnd.register())
+ }
+ case useKindCallInd:
+ *regs = append(*regs, i.rn.nr())
+ fallthrough
+ case useKindCall:
+ argIntRealRegs, argFloatRealRegs, _, _, _ := backend.ABIInfoFromUint64(i.u2)
+ for i := byte(0); i < argIntRealRegs; i++ {
+ *regs = append(*regs, regInfo.RealRegToVReg[intParamResultRegs[i]])
+ }
+ for i := byte(0); i < argFloatRealRegs; i++ {
+ *regs = append(*regs, regInfo.RealRegToVReg[floatParamResultRegs[i]])
+ }
+ case useKindRDRewrite:
+ *regs = append(*regs, i.rn.reg())
+ *regs = append(*regs, i.rm.reg())
+ *regs = append(*regs, i.rd.reg())
+ default:
+ panic(fmt.Sprintf("useKind for %v not defined", i))
+ }
+ return *regs
+}
+
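+// AssignUse implements regalloc.Instr AssignUse.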
+func (i *instruction) AssignUse(index int, reg regalloc.VReg) {
+ switch useKinds[i.kind] {
+ case useKindNone:
+ case useKindRN:
+ if rn := i.rn.reg(); rn.Valid() {
+ i.rn = i.rn.assignReg(reg)
+ }
+ case useKindRNRM:
+ if index == 0 {
+ if rn := i.rn.reg(); rn.Valid() {
+ i.rn = i.rn.assignReg(reg)
+ }
+ } else {
+ if rm := i.rm.reg(); rm.Valid() {
+ i.rm = i.rm.assignReg(reg)
+ }
+ }
+ case useKindRDRewrite:
+ if index == 0 {
+ if rn := i.rn.reg(); rn.Valid() {
+ i.rn = i.rn.assignReg(reg)
+ }
+ } else if index == 1 {
+ if rm := i.rm.reg(); rm.Valid() {
+ i.rm = i.rm.assignReg(reg)
+ }
+ } else {
+ if rd := i.rd.reg(); rd.Valid() {
+ i.rd = i.rd.assignReg(reg)
+ }
+ }
+ case useKindRNRN1RM:
+ if index == 0 {
+ if rn := i.rn.reg(); rn.Valid() {
+ i.rn = i.rn.assignReg(reg)
+ }
+ if rn1 := i.rn.reg() + 1; rn1.Valid() {
+ i.rm = i.rm.assignReg(reg + 1)
+ }
+ } else {
+ if rm := i.rm.reg(); rm.Valid() {
+ i.rm = i.rm.assignReg(reg)
+ }
+ }
+ case useKindRNRMRA:
+ if index == 0 {
+ if rn := i.rn.reg(); rn.Valid() {
+ i.rn = i.rn.assignReg(reg)
+ }
+ } else if index == 1 {
+ if rm := i.rm.reg(); rm.Valid() {
+ i.rm = i.rm.assignReg(reg)
+ }
+ } else {
+ if ra := i.ra.reg(); ra.Valid() {
+ i.ra = i.ra.assignReg(reg)
+ }
+ }
+ case useKindAMode:
+ if index == 0 {
+ if amodeRN := i.amode.rn; amodeRN.Valid() {
+ i.amode.rn = reg
+ }
+ } else {
+ if amodeRM := i.amode.rm; amodeRM.Valid() {
+ i.amode.rm = reg
+ }
+ }
+ case useKindRNAMode:
+ if index == 0 {
+ i.rn = i.rn.assignReg(reg)
+ } else if index == 1 {
+ if amodeRN := i.amode.rn; amodeRN.Valid() {
+ i.amode.rn = reg
+ } else {
+ panic("BUG")
+ }
+ } else {
+ if amodeRM := i.amode.rm; amodeRM.Valid() {
+ i.amode.rm = reg
+ } else {
+ panic("BUG")
+ }
+ }
+ case useKindCond:
+ c := cond(i.u1)
+ switch c.kind() {
+ case condKindRegisterZero:
+ i.u1 = uint64(registerAsRegZeroCond(reg))
+ case condKindRegisterNotZero:
+ i.u1 = uint64(registerAsRegNotZeroCond(reg))
+ }
+ case useKindCall:
+ panic("BUG: call instructions shouldn't be assigned")
+ case useKindCallInd:
+ i.rn = i.rn.assignReg(reg)
+ default:
+ panic(fmt.Sprintf("useKind for %v not defined", i))
+ }
+}
+
+func (i *instruction) asCall(ref ssa.FuncRef, abi *backend.FunctionABI) {
+ i.kind = call
+ i.u1 = uint64(ref)
+ if abi != nil {
+ i.u2 = abi.ABIInfoAsUint64()
+ }
+}
+
+func (i *instruction) asCallIndirect(ptr regalloc.VReg, abi *backend.FunctionABI) {
+ i.kind = callInd
+ i.rn = operandNR(ptr)
+ if abi != nil {
+ i.u2 = abi.ABIInfoAsUint64()
+ }
+}
+
+func (i *instruction) callFuncRef() ssa.FuncRef {
+ return ssa.FuncRef(i.u1)
+}
+
+// shift is the bit-shift amount divided by 16 and must be in the range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false)
+func (i *instruction) asMOVZ(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) {
+ i.kind = movZ
+ i.rd = operandNR(dst)
+ i.u1 = imm
+ i.u2 = shift
+ if dst64bit {
+ i.u3 = 1
+ }
+}
+
+// shift is the bit-shift amount divided by 16 and must be in the range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false)
+func (i *instruction) asMOVK(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) {
+ i.kind = movK
+ i.rd = operandNR(dst)
+ i.u1 = imm
+ i.u2 = shift
+ if dst64bit {
+ i.u3 = 1
+ }
+}
+
+// shift is the bit-shift amount divided by 16 and must be in the range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false)
+func (i *instruction) asMOVN(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) {
+ i.kind = movN
+ i.rd = operandNR(dst)
+ i.u1 = imm
+ i.u2 = shift
+ if dst64bit {
+ i.u3 = 1
+ }
+}
+
+func (i *instruction) asNop0() *instruction {
+ i.kind = nop0
+ return i
+}
+
+func (i *instruction) asNop0WithLabel(l label) {
+ i.kind = nop0
+ i.u1 = uint64(l)
+}
+
+func (i *instruction) nop0Label() label {
+ return label(i.u1)
+}
+
+func (i *instruction) asRet() {
+ i.kind = ret
+}
+
+func (i *instruction) asStorePair64(src1, src2 regalloc.VReg, amode addressMode) {
+ i.kind = storeP64
+ i.rn = operandNR(src1)
+ i.rm = operandNR(src2)
+ i.amode = amode
+}
+
+func (i *instruction) asLoadPair64(src1, src2 regalloc.VReg, amode addressMode) {
+ i.kind = loadP64
+ i.rn = operandNR(src1)
+ i.rm = operandNR(src2)
+ i.amode = amode
+}
+
+func (i *instruction) asStore(src operand, amode addressMode, sizeInBits byte) {
+ switch sizeInBits {
+ case 8:
+ i.kind = store8
+ case 16:
+ i.kind = store16
+ case 32:
+ if src.reg().RegType() == regalloc.RegTypeInt {
+ i.kind = store32
+ } else {
+ i.kind = fpuStore32
+ }
+ case 64:
+ if src.reg().RegType() == regalloc.RegTypeInt {
+ i.kind = store64
+ } else {
+ i.kind = fpuStore64
+ }
+ case 128:
+ i.kind = fpuStore128
+ }
+ i.rn = src
+ i.amode = amode
+}
+
+func (i *instruction) asSLoad(dst operand, amode addressMode, sizeInBits byte) {
+ switch sizeInBits {
+ case 8:
+ i.kind = sLoad8
+ case 16:
+ i.kind = sLoad16
+ case 32:
+ i.kind = sLoad32
+ default:
+ panic("BUG")
+ }
+ i.rd = dst
+ i.amode = amode
+}
+
+func (i *instruction) asULoad(dst operand, amode addressMode, sizeInBits byte) {
+ switch sizeInBits {
+ case 8:
+ i.kind = uLoad8
+ case 16:
+ i.kind = uLoad16
+ case 32:
+ i.kind = uLoad32
+ case 64:
+ i.kind = uLoad64
+ }
+ i.rd = dst
+ i.amode = amode
+}
+
+func (i *instruction) asFpuLoad(dst operand, amode addressMode, sizeInBits byte) {
+ switch sizeInBits {
+ case 32:
+ i.kind = fpuLoad32
+ case 64:
+ i.kind = fpuLoad64
+ case 128:
+ i.kind = fpuLoad128
+ }
+ i.rd = dst
+ i.amode = amode
+}
+
+func (i *instruction) asVecLoad1R(rd, rn operand, arr vecArrangement) {
+ // NOTE: currently this only supports loads without an offset; it is doubtful that we will
+ // ever need offset loads, which are only available with post-index addressing.
+ i.kind = vecLoad1R
+ i.rd = rd
+ i.rn = rn
+ i.u1 = uint64(arr)
+}
+
+func (i *instruction) asCSet(rd regalloc.VReg, mask bool, c condFlag) {
+ i.kind = cSet
+ i.rd = operandNR(rd)
+ i.u1 = uint64(c)
+ if mask {
+ i.u2 = 1
+ }
+}
+
+func (i *instruction) asCSel(rd, rn, rm operand, c condFlag, _64bit bool) {
+ i.kind = cSel
+ i.rd = rd
+ i.rn = rn
+ i.rm = rm
+ i.u1 = uint64(c)
+ if _64bit {
+ i.u3 = 1
+ }
+}
+
+func (i *instruction) asFpuCSel(rd, rn, rm operand, c condFlag, _64bit bool) {
+ i.kind = fpuCSel
+ i.rd = rd
+ i.rn = rn
+ i.rm = rm
+ i.u1 = uint64(c)
+ if _64bit {
+ i.u3 = 1
+ }
+}
+
+func (i *instruction) asBr(target label) {
+ if target == labelReturn {
+ panic("BUG: call site should special case for returnLabel")
+ }
+ i.kind = br
+ i.u1 = uint64(target)
+}
+
+func (i *instruction) asBrTableSequence(indexReg regalloc.VReg, targetIndex, targetCounts int) {
+ i.kind = brTableSequence
+ i.rn = operandNR(indexReg)
+ i.u1 = uint64(targetIndex)
+ i.u2 = uint64(targetCounts)
+}
+
+func (i *instruction) brTableSequenceOffsetsResolved() {
+ i.u3 = 1 // indicate that the offsets are resolved, for debugging.
+}
+
+func (i *instruction) brLabel() label {
+ return label(i.u1)
+}
+
+// brOffsetResolve is called when the target label is resolved.
+func (i *instruction) brOffsetResolve(offset int64) {
+ i.u2 = uint64(offset)
+ i.u3 = 1 // indicate that the offset is resolved, for debugging.
+}
+
+func (i *instruction) brOffset() int64 {
+ return int64(i.u2)
+}
+
+// asCondBr encodes a conditional branch instruction. is64bit is only needed when cond is not flag.
+func (i *instruction) asCondBr(c cond, target label, is64bit bool) {
+ i.kind = condBr
+ i.u1 = c.asUint64()
+ i.u2 = uint64(target)
+ if is64bit {
+ i.u3 = 1
+ }
+}
+
+func (i *instruction) setCondBrTargets(target label) {
+ i.u2 = uint64(target)
+}
+
+func (i *instruction) condBrLabel() label {
+ return label(i.u2)
+}
+
+// condBrOffsetResolve is called when the target label is resolved.
+func (i *instruction) condBrOffsetResolve(offset int64) {
+ i.rd.data = uint64(offset)
+ i.rd.data2 = 1 // indicate that the offset is resolved, for debugging.
+}
+
+// condBrOffsetResolved returns true if condBrOffsetResolve is already called.
+func (i *instruction) condBrOffsetResolved() bool {
+ return i.rd.data2 == 1
+}
+
+func (i *instruction) condBrOffset() int64 {
+ return int64(i.rd.data)
+}
+
+func (i *instruction) condBrCond() cond {
+ return cond(i.u1)
+}
+
+func (i *instruction) condBr64bit() bool {
+ return i.u3 == 1
+}
+
+func (i *instruction) asLoadFpuConst32(rd regalloc.VReg, raw uint64) {
+ i.kind = loadFpuConst32
+ i.u1 = raw
+ i.rd = operandNR(rd)
+}
+
+func (i *instruction) asLoadFpuConst64(rd regalloc.VReg, raw uint64) {
+ i.kind = loadFpuConst64
+ i.u1 = raw
+ i.rd = operandNR(rd)
+}
+
+func (i *instruction) asLoadFpuConst128(rd regalloc.VReg, lo, hi uint64) {
+ i.kind = loadFpuConst128
+ i.u1 = lo
+ i.u2 = hi
+ i.rd = operandNR(rd)
+}
+
+func (i *instruction) asFpuCmp(rn, rm operand, is64bit bool) {
+ i.kind = fpuCmp
+ i.rn, i.rm = rn, rm
+ if is64bit {
+ i.u3 = 1
+ }
+}
+
+func (i *instruction) asCCmpImm(rn operand, imm uint64, c condFlag, flag byte, is64bit bool) {
+ i.kind = cCmpImm
+ i.rn = rn
+ i.rm.data = imm
+ i.u1 = uint64(c)
+ i.u2 = uint64(flag)
+ if is64bit {
+ i.u3 = 1
+ }
+}
+
+// asALU sets up a basic ALU instruction.
+func (i *instruction) asALU(aluOp aluOp, rd, rn, rm operand, dst64bit bool) {
+ switch rm.kind {
+ case operandKindNR:
+ i.kind = aluRRR
+ case operandKindSR:
+ i.kind = aluRRRShift
+ case operandKindER:
+ i.kind = aluRRRExtend
+ case operandKindImm12:
+ i.kind = aluRRImm12
+ default:
+ panic("BUG")
+ }
+ i.u1 = uint64(aluOp)
+ i.rd, i.rn, i.rm = rd, rn, rm
+ if dst64bit {
+ i.u3 = 1
+ }
+}
+
+// asALURRRR sets up an ALU instruction with four register operands.
+func (i *instruction) asALURRRR(aluOp aluOp, rd, rn, rm, ra operand, dst64bit bool) {
+ i.kind = aluRRRR
+ i.u1 = uint64(aluOp)
+ i.rd, i.rn, i.rm, i.ra = rd, rn, rm, ra
+ if dst64bit {
+ i.u3 = 1
+ }
+}
+
+// asALUShift sets up a shift-based ALU instruction.
+func (i *instruction) asALUShift(aluOp aluOp, rd, rn, rm operand, dst64bit bool) {
+ switch rm.kind {
+ case operandKindNR:
+ i.kind = aluRRR // If the shift amount op is a register, then the instruction is encoded as a normal ALU instruction with two register operands.
+ case operandKindShiftImm:
+ i.kind = aluRRImmShift
+ default:
+ panic("BUG")
+ }
+ i.u1 = uint64(aluOp)
+ i.rd, i.rn, i.rm = rd, rn, rm
+ if dst64bit {
+ i.u3 = 1
+ }
+}
+
+func (i *instruction) asALUBitmaskImm(aluOp aluOp, rd, rn regalloc.VReg, imm uint64, dst64bit bool) {
+ i.kind = aluRRBitmaskImm
+ i.u1 = uint64(aluOp)
+ i.rn, i.rd = operandNR(rn), operandNR(rd)
+ i.u2 = imm
+ if dst64bit {
+ i.u3 = 1
+ }
+}
+
+func (i *instruction) asMovToFPSR(rn regalloc.VReg) {
+ i.kind = movToFPSR
+ i.rn = operandNR(rn)
+}
+
+func (i *instruction) asMovFromFPSR(rd regalloc.VReg) {
+ i.kind = movFromFPSR
+ i.rd = operandNR(rd)
+}
+
+func (i *instruction) asBitRR(bitOp bitOp, rd, rn regalloc.VReg, is64bit bool) {
+ i.kind = bitRR
+ i.rn, i.rd = operandNR(rn), operandNR(rd)
+ i.u1 = uint64(bitOp)
+ if is64bit {
+ i.u2 = 1
+ }
+}
+
+func (i *instruction) asFpuRRR(op fpuBinOp, rd, rn, rm operand, dst64bit bool) {
+ i.kind = fpuRRR
+ i.u1 = uint64(op)
+ i.rd, i.rn, i.rm = rd, rn, rm
+ if dst64bit {
+ i.u3 = 1
+ }
+}
+
+func (i *instruction) asFpuRR(op fpuUniOp, rd, rn operand, dst64bit bool) {
+ i.kind = fpuRR
+ i.u1 = uint64(op)
+ i.rd, i.rn = rd, rn
+ if dst64bit {
+ i.u3 = 1
+ }
+}
+
+func (i *instruction) asExtend(rd, rn regalloc.VReg, fromBits, toBits byte, signed bool) {
+ i.kind = extend
+ i.rn, i.rd = operandNR(rn), operandNR(rd)
+ i.u1 = uint64(fromBits)
+ i.u2 = uint64(toBits)
+ if signed {
+ i.u3 = 1
+ }
+}
+
+func (i *instruction) asMove32(rd, rn regalloc.VReg) {
+ i.kind = mov32
+ i.rn, i.rd = operandNR(rn), operandNR(rd)
+}
+
+func (i *instruction) asMove64(rd, rn regalloc.VReg) *instruction {
+ i.kind = mov64
+ i.rn, i.rd = operandNR(rn), operandNR(rd)
+ return i
+}
+
+func (i *instruction) asFpuMov64(rd, rn regalloc.VReg) {
+ i.kind = fpuMov64
+ i.rn, i.rd = operandNR(rn), operandNR(rd)
+}
+
+func (i *instruction) asFpuMov128(rd, rn regalloc.VReg) *instruction {
+ i.kind = fpuMov128
+ i.rn, i.rd = operandNR(rn), operandNR(rd)
+ return i
+}
+
+func (i *instruction) asMovToVec(rd, rn operand, arr vecArrangement, index vecIndex) {
+ i.kind = movToVec
+ i.rd = rd
+ i.rn = rn
+ i.u1, i.u2 = uint64(arr), uint64(index)
+}
+
+func (i *instruction) asMovFromVec(rd, rn operand, arr vecArrangement, index vecIndex, signed bool) {
+ if signed {
+ i.kind = movFromVecSigned
+ } else {
+ i.kind = movFromVec
+ }
+ i.rd = rd
+ i.rn = rn
+ i.u1, i.u2 = uint64(arr), uint64(index)
+}
+
+func (i *instruction) asVecDup(rd, rn operand, arr vecArrangement) {
+ i.kind = vecDup
+ i.u1 = uint64(arr)
+ i.rn, i.rd = rn, rd
+}
+
+func (i *instruction) asVecDupElement(rd, rn operand, arr vecArrangement, index vecIndex) {
+ i.kind = vecDupElement
+ i.u1 = uint64(arr)
+ i.rn, i.rd = rn, rd
+ i.u2 = uint64(index)
+}
+
+func (i *instruction) asVecExtract(rd, rn, rm operand, arr vecArrangement, index uint32) {
+ i.kind = vecExtract
+ i.u1 = uint64(arr)
+ i.rn, i.rm, i.rd = rn, rm, rd
+ i.u2 = uint64(index)
+}
+
+func (i *instruction) asVecMovElement(rd, rn operand, arr vecArrangement, rdIndex, rnIndex vecIndex) {
+ i.kind = vecMovElement
+ i.u1 = uint64(arr)
+ i.u2, i.u3 = uint64(rdIndex), uint64(rnIndex)
+ i.rn, i.rd = rn, rd
+}
+
+func (i *instruction) asVecMisc(op vecOp, rd, rn operand, arr vecArrangement) {
+ i.kind = vecMisc
+ i.u1 = uint64(op)
+ i.rn, i.rd = rn, rd
+ i.u2 = uint64(arr)
+}
+
+func (i *instruction) asVecLanes(op vecOp, rd, rn operand, arr vecArrangement) {
+ i.kind = vecLanes
+ i.u1 = uint64(op)
+ i.rn, i.rd = rn, rd
+ i.u2 = uint64(arr)
+}
+
+func (i *instruction) asVecShiftImm(op vecOp, rd, rn, rm operand, arr vecArrangement) *instruction {
+ i.kind = vecShiftImm
+ i.u1 = uint64(op)
+ i.rn, i.rm, i.rd = rn, rm, rd
+ i.u2 = uint64(arr)
+ return i
+}
+
+func (i *instruction) asVecTbl(nregs byte, rd, rn, rm operand, arr vecArrangement) {
+ switch nregs {
+ case 0, 1:
+ i.kind = vecTbl
+ case 2:
+ i.kind = vecTbl2
+ if !rn.reg().IsRealReg() {
+ panic("rn is not a RealReg")
+ }
+ if rn.realReg() == v31 {
+ panic("rn cannot be v31")
+ }
+ default:
+ panic(fmt.Sprintf("unsupported number of registers %d", nregs))
+ }
+ i.rn, i.rm, i.rd = rn, rm, rd
+ i.u2 = uint64(arr)
+}
+
+func (i *instruction) asVecPermute(op vecOp, rd, rn, rm operand, arr vecArrangement) {
+ i.kind = vecPermute
+ i.u1 = uint64(op)
+ i.rn, i.rm, i.rd = rn, rm, rd
+ i.u2 = uint64(arr)
+}
+
+func (i *instruction) asVecRRR(op vecOp, rd, rn, rm operand, arr vecArrangement) *instruction {
+ i.kind = vecRRR
+ i.u1 = uint64(op)
+ i.rn, i.rd, i.rm = rn, rd, rm
+ i.u2 = uint64(arr)
+ return i
+}
+
+// asVecRRRRewrite encodes a vector instruction that rewrites the destination register.
+// IMPORTANT: the destination register must be already defined before this instruction.
+func (i *instruction) asVecRRRRewrite(op vecOp, rd, rn, rm operand, arr vecArrangement) {
+ i.kind = vecRRRRewrite
+ i.u1 = uint64(op)
+ i.rn, i.rd, i.rm = rn, rd, rm
+ i.u2 = uint64(arr)
+}
+
+func (i *instruction) IsCopy() bool {
+ op := i.kind
+ // We do not include mov32 because it is not a true copy: it does not preserve the upper 32 bits
+ // and is only used in the translation of IReduce, not as an actual copy.
+ return op == mov64 || op == fpuMov64 || op == fpuMov128
+}
+
+// String implements fmt.Stringer.
+func (i *instruction) String() (str string) {
+ is64SizeBitToSize := func(u3 uint64) byte {
+ if u3 == 0 {
+ return 32
+ }
+ return 64
+ }
+
+ switch i.kind {
+ case nop0:
+ if i.u1 != 0 {
+ l := label(i.u1)
+ str = fmt.Sprintf("%s:", l)
+ } else {
+ str = "nop0"
+ }
+ case aluRRR:
+ size := is64SizeBitToSize(i.u3)
+ str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(),
+ formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size),
+ i.rm.format(size))
+ case aluRRRR:
+ size := is64SizeBitToSize(i.u3)
+ str = fmt.Sprintf("%s %s, %s, %s, %s", aluOp(i.u1).String(),
+ formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size), formatVRegSized(i.ra.nr(), size))
+ case aluRRImm12:
+ size := is64SizeBitToSize(i.u3)
+ str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(),
+ formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), i.rm.format(size))
+ case aluRRBitmaskImm:
+ size := is64SizeBitToSize(i.u3)
+ rd, rn := formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size)
+ if size == 32 {
+ str = fmt.Sprintf("%s %s, %s, #%#x", aluOp(i.u1).String(), rd, rn, uint32(i.u2))
+ } else {
+ str = fmt.Sprintf("%s %s, %s, #%#x", aluOp(i.u1).String(), rd, rn, i.u2)
+ }
+ case aluRRImmShift:
+ size := is64SizeBitToSize(i.u3)
+ str = fmt.Sprintf("%s %s, %s, %#x",
+ aluOp(i.u1).String(),
+ formatVRegSized(i.rd.nr(), size),
+ formatVRegSized(i.rn.nr(), size),
+ i.rm.shiftImm(),
+ )
+ case aluRRRShift:
+ size := is64SizeBitToSize(i.u3)
+ str = fmt.Sprintf("%s %s, %s, %s",
+ aluOp(i.u1).String(),
+ formatVRegSized(i.rd.nr(), size),
+ formatVRegSized(i.rn.nr(), size),
+ i.rm.format(size),
+ )
+ case aluRRRExtend:
+ size := is64SizeBitToSize(i.u3)
+ str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(),
+ formatVRegSized(i.rd.nr(), size),
+ formatVRegSized(i.rn.nr(), size),
+ // Regardless of the source size, the register is formatted in 32-bit.
+ i.rm.format(32),
+ )
+ case bitRR:
+ size := is64SizeBitToSize(i.u2)
+ str = fmt.Sprintf("%s %s, %s",
+ bitOp(i.u1),
+ formatVRegSized(i.rd.nr(), size),
+ formatVRegSized(i.rn.nr(), size),
+ )
+ case uLoad8:
+ str = fmt.Sprintf("ldrb %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
+ case sLoad8:
+ str = fmt.Sprintf("ldrsb %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
+ case uLoad16:
+ str = fmt.Sprintf("ldrh %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
+ case sLoad16:
+ str = fmt.Sprintf("ldrsh %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
+ case uLoad32:
+ str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
+ case sLoad32:
+ str = fmt.Sprintf("ldrs %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
+ case uLoad64:
+ str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 64), i.amode.format(64))
+ case store8:
+ str = fmt.Sprintf("strb %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(8))
+ case store16:
+ str = fmt.Sprintf("strh %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(16))
+ case store32:
+ str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(32))
+ case store64:
+ str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.amode.format(64))
+ case storeP64:
+ str = fmt.Sprintf("stp %s, %s, %s",
+ formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.amode.format(64))
+ case loadP64:
+ str = fmt.Sprintf("ldp %s, %s, %s",
+ formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.amode.format(64))
+ case mov64:
+ str = fmt.Sprintf("mov %s, %s",
+ formatVRegSized(i.rd.nr(), 64),
+ formatVRegSized(i.rn.nr(), 64))
+ case mov32:
+ str = fmt.Sprintf("mov %s, %s", formatVRegSized(i.rd.nr(), 32), formatVRegSized(i.rn.nr(), 32))
+ case movZ:
+ size := is64SizeBitToSize(i.u3)
+ str = fmt.Sprintf("movz %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16)
+ case movN:
+ size := is64SizeBitToSize(i.u3)
+ str = fmt.Sprintf("movn %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16)
+ case movK:
+ size := is64SizeBitToSize(i.u3)
+ str = fmt.Sprintf("movk %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16)
+ case extend:
+ fromBits, toBits := byte(i.u1), byte(i.u2)
+
+ var signedStr string
+ if i.u3 == 1 {
+ signedStr = "s"
+ } else {
+ signedStr = "u"
+ }
+ var fromStr string
+ switch fromBits {
+ case 8:
+ fromStr = "b"
+ case 16:
+ fromStr = "h"
+ case 32:
+ fromStr = "w"
+ }
+ str = fmt.Sprintf("%sxt%s %s, %s", signedStr, fromStr, formatVRegSized(i.rd.nr(), toBits), formatVRegSized(i.rn.nr(), 32))
+ case cSel:
+ size := is64SizeBitToSize(i.u3)
+ str = fmt.Sprintf("csel %s, %s, %s, %s",
+ formatVRegSized(i.rd.nr(), size),
+ formatVRegSized(i.rn.nr(), size),
+ formatVRegSized(i.rm.nr(), size),
+ condFlag(i.u1),
+ )
+ case cSet:
+ if i.u2 != 0 {
+ str = fmt.Sprintf("csetm %s, %s", formatVRegSized(i.rd.nr(), 64), condFlag(i.u1))
+ } else {
+ str = fmt.Sprintf("cset %s, %s", formatVRegSized(i.rd.nr(), 64), condFlag(i.u1))
+ }
+ case cCmpImm:
+ size := is64SizeBitToSize(i.u3)
+ str = fmt.Sprintf("ccmp %s, #%#x, #%#x, %s",
+ formatVRegSized(i.rn.nr(), size), i.rm.data,
+ i.u2&0b1111,
+ condFlag(i.u1))
+ case fpuMov64:
+ str = fmt.Sprintf("mov %s, %s",
+ formatVRegVec(i.rd.nr(), vecArrangement8B, vecIndexNone),
+ formatVRegVec(i.rn.nr(), vecArrangement8B, vecIndexNone))
+ case fpuMov128:
+ str = fmt.Sprintf("mov %s, %s",
+ formatVRegVec(i.rd.nr(), vecArrangement16B, vecIndexNone),
+ formatVRegVec(i.rn.nr(), vecArrangement16B, vecIndexNone))
+ case fpuMovFromVec:
+ panic("TODO")
+ case fpuRR:
+ dstSz := is64SizeBitToSize(i.u3)
+ srcSz := dstSz
+ op := fpuUniOp(i.u1)
+ switch op {
+ case fpuUniOpCvt32To64:
+ srcSz = 32
+ case fpuUniOpCvt64To32:
+ srcSz = 64
+ }
+ str = fmt.Sprintf("%s %s, %s", op.String(),
+ formatVRegSized(i.rd.nr(), dstSz), formatVRegSized(i.rn.nr(), srcSz))
+ case fpuRRR:
+ size := is64SizeBitToSize(i.u3)
+ str = fmt.Sprintf("%s %s, %s, %s", fpuBinOp(i.u1).String(),
+ formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size))
+ case fpuRRI:
+ panic("TODO")
+ case fpuRRRR:
+ panic("TODO")
+ case fpuCmp:
+ size := is64SizeBitToSize(i.u3)
+ str = fmt.Sprintf("fcmp %s, %s",
+ formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size))
+ case fpuLoad32:
+ str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
+ case fpuStore32:
+ str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(64))
+ case fpuLoad64:
+ str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 64), i.amode.format(64))
+ case fpuStore64:
+ str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.amode.format(64))
+ case fpuLoad128:
+ str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 128), i.amode.format(64))
+ case fpuStore128:
+ str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 128), i.amode.format(64))
+ case loadFpuConst32:
+ str = fmt.Sprintf("ldr %s, #8; b 8; data.f32 %f", formatVRegSized(i.rd.nr(), 32), math.Float32frombits(uint32(i.u1)))
+ case loadFpuConst64:
+ str = fmt.Sprintf("ldr %s, #8; b 16; data.f64 %f", formatVRegSized(i.rd.nr(), 64), math.Float64frombits(i.u1))
+ case loadFpuConst128:
+ str = fmt.Sprintf("ldr %s, #8; b 32; data.v128 %016x %016x",
+ formatVRegSized(i.rd.nr(), 128), i.u1, i.u2)
+ case fpuToInt:
+ var op, src, dst string
+ if signed := i.u1 == 1; signed {
+ op = "fcvtzs"
+ } else {
+ op = "fcvtzu"
+ }
+ if src64 := i.u2 == 1; src64 {
+ src = formatVRegWidthVec(i.rn.nr(), vecArrangementD)
+ } else {
+ src = formatVRegWidthVec(i.rn.nr(), vecArrangementS)
+ }
+ if dst64 := i.u3 == 1; dst64 {
+ dst = formatVRegSized(i.rd.nr(), 64)
+ } else {
+ dst = formatVRegSized(i.rd.nr(), 32)
+ }
+ str = fmt.Sprintf("%s %s, %s", op, dst, src)
+
+ case intToFpu:
+ var op, src, dst string
+ if signed := i.u1 == 1; signed {
+ op = "scvtf"
+ } else {
+ op = "ucvtf"
+ }
+ if src64 := i.u2 == 1; src64 {
+ src = formatVRegSized(i.rn.nr(), 64)
+ } else {
+ src = formatVRegSized(i.rn.nr(), 32)
+ }
+ if dst64 := i.u3 == 1; dst64 {
+ dst = formatVRegWidthVec(i.rd.nr(), vecArrangementD)
+ } else {
+ dst = formatVRegWidthVec(i.rd.nr(), vecArrangementS)
+ }
+ str = fmt.Sprintf("%s %s, %s", op, dst, src)
+ case fpuCSel:
+ size := is64SizeBitToSize(i.u3)
+ str = fmt.Sprintf("fcsel %s, %s, %s, %s",
+ formatVRegSized(i.rd.nr(), size),
+ formatVRegSized(i.rn.nr(), size),
+ formatVRegSized(i.rm.nr(), size),
+ condFlag(i.u1),
+ )
+ case movToVec:
+ var size byte
+ arr := vecArrangement(i.u1)
+ switch arr {
+ case vecArrangementB, vecArrangementH, vecArrangementS:
+ size = 32
+ case vecArrangementD:
+ size = 64
+ default:
+ panic("unsupported arrangement " + arr.String())
+ }
+ str = fmt.Sprintf("ins %s, %s", formatVRegVec(i.rd.nr(), arr, vecIndex(i.u2)), formatVRegSized(i.rn.nr(), size))
+ case movFromVec, movFromVecSigned:
+ var size byte
+ var opcode string
+ arr := vecArrangement(i.u1)
+ signed := i.kind == movFromVecSigned
+ switch arr {
+ case vecArrangementB, vecArrangementH, vecArrangementS:
+ size = 32
+ if signed {
+ opcode = "smov"
+ } else {
+ opcode = "umov"
+ }
+ case vecArrangementD:
+ size = 64
+ if signed {
+ opcode = "smov"
+ } else {
+ opcode = "mov"
+ }
+ default:
+ panic("unsupported arrangement " + arr.String())
+ }
+ str = fmt.Sprintf("%s %s, %s", opcode, formatVRegSized(i.rd.nr(), size), formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2)))
+ case vecDup:
+ str = fmt.Sprintf("dup %s, %s",
+ formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone),
+ formatVRegSized(i.rn.nr(), 64),
+ )
+ case vecDupElement:
+ arr := vecArrangement(i.u1)
+ str = fmt.Sprintf("dup %s, %s",
+ formatVRegVec(i.rd.nr(), arr, vecIndexNone),
+ formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2)),
+ )
+ case vecDupFromFpu:
+ panic("TODO")
+ case vecExtract:
+ str = fmt.Sprintf("ext %s, %s, %s, #%d",
+ formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone),
+ formatVRegVec(i.rn.nr(), vecArrangement(i.u1), vecIndexNone),
+ formatVRegVec(i.rm.nr(), vecArrangement(i.u1), vecIndexNone),
+ uint32(i.u2),
+ )
+ case vecExtend:
+ panic("TODO")
+ case vecMovElement:
+ str = fmt.Sprintf("mov %s, %s",
+ formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndex(i.u2)),
+ formatVRegVec(i.rn.nr(), vecArrangement(i.u1), vecIndex(i.u3)),
+ )
+ case vecMiscNarrow:
+ panic("TODO")
+ case vecRRR, vecRRRRewrite:
+ str = fmt.Sprintf("%s %s, %s, %s",
+ vecOp(i.u1),
+ formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone),
+ formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone),
+ formatVRegVec(i.rm.nr(), vecArrangement(i.u2), vecIndexNone),
+ )
+ case vecMisc:
+ vop := vecOp(i.u1)
+ if vop == vecOpCmeq0 {
+ str = fmt.Sprintf("cmeq %s, %s, #0",
+ formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone),
+ formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone))
+ } else {
+ str = fmt.Sprintf("%s %s, %s",
+ vop,
+ formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone),
+ formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone))
+ }
+ case vecLanes:
+ arr := vecArrangement(i.u2)
+ var destArr vecArrangement
+ switch arr {
+ case vecArrangement8B, vecArrangement16B:
+ destArr = vecArrangementH
+ case vecArrangement4H, vecArrangement8H:
+ destArr = vecArrangementS
+ case vecArrangement4S:
+ destArr = vecArrangementD
+ default:
+ panic("invalid arrangement " + arr.String())
+ }
+ str = fmt.Sprintf("%s %s, %s",
+ vecOp(i.u1),
+ formatVRegWidthVec(i.rd.nr(), destArr),
+ formatVRegVec(i.rn.nr(), arr, vecIndexNone))
+ case vecShiftImm:
+ arr := vecArrangement(i.u2)
+ str = fmt.Sprintf("%s %s, %s, #%d",
+ vecOp(i.u1),
+ formatVRegVec(i.rd.nr(), arr, vecIndexNone),
+ formatVRegVec(i.rn.nr(), arr, vecIndexNone),
+ i.rm.shiftImm())
+ case vecTbl:
+ arr := vecArrangement(i.u2)
+ str = fmt.Sprintf("tbl %s, { %s }, %s",
+ formatVRegVec(i.rd.nr(), arr, vecIndexNone),
+ formatVRegVec(i.rn.nr(), vecArrangement16B, vecIndexNone),
+ formatVRegVec(i.rm.nr(), arr, vecIndexNone))
+ case vecTbl2:
+ arr := vecArrangement(i.u2)
+ rd, rn, rm := i.rd.nr(), i.rn.nr(), i.rm.nr()
+ rn1 := regalloc.FromRealReg(rn.RealReg()+1, rn.RegType())
+ str = fmt.Sprintf("tbl %s, { %s, %s }, %s",
+ formatVRegVec(rd, arr, vecIndexNone),
+ formatVRegVec(rn, vecArrangement16B, vecIndexNone),
+ formatVRegVec(rn1, vecArrangement16B, vecIndexNone),
+ formatVRegVec(rm, arr, vecIndexNone))
+ case vecPermute:
+ arr := vecArrangement(i.u2)
+ str = fmt.Sprintf("%s %s, %s, %s",
+ vecOp(i.u1),
+ formatVRegVec(i.rd.nr(), arr, vecIndexNone),
+ formatVRegVec(i.rn.nr(), arr, vecIndexNone),
+ formatVRegVec(i.rm.nr(), arr, vecIndexNone))
+ case movToFPSR:
+ str = fmt.Sprintf("msr fpsr, %s", formatVRegSized(i.rn.nr(), 64))
+ case movFromFPSR:
+		str = fmt.Sprintf("mrs %s, fpsr", formatVRegSized(i.rd.nr(), 64))
+ case call:
+ str = fmt.Sprintf("bl %s", ssa.FuncRef(i.u1))
+ case callInd:
+ str = fmt.Sprintf("bl %s", formatVRegSized(i.rn.nr(), 64))
+ case ret:
+ str = "ret"
+ case br:
+ target := label(i.u1)
+ if i.u3 != 0 {
+ str = fmt.Sprintf("b #%#x (%s)", i.brOffset(), target.String())
+ } else {
+ str = fmt.Sprintf("b %s", target.String())
+ }
+ case condBr:
+ size := is64SizeBitToSize(i.u3)
+ c := cond(i.u1)
+ target := label(i.u2)
+ switch c.kind() {
+ case condKindRegisterZero:
+ if !i.condBrOffsetResolved() {
+ str = fmt.Sprintf("cbz %s, (%s)", formatVRegSized(c.register(), size), target.String())
+ } else {
+ str = fmt.Sprintf("cbz %s, #%#x %s", formatVRegSized(c.register(), size), i.condBrOffset(), target.String())
+ }
+ case condKindRegisterNotZero:
+ if offset := i.condBrOffset(); offset != 0 {
+ str = fmt.Sprintf("cbnz %s, #%#x (%s)", formatVRegSized(c.register(), size), offset, target.String())
+ } else {
+ str = fmt.Sprintf("cbnz %s, %s", formatVRegSized(c.register(), size), target.String())
+ }
+ case condKindCondFlagSet:
+ if offset := i.condBrOffset(); offset != 0 {
+ if target == labelInvalid {
+ str = fmt.Sprintf("b.%s #%#x", c.flag(), offset)
+ } else {
+ str = fmt.Sprintf("b.%s #%#x, (%s)", c.flag(), offset, target.String())
+ }
+ } else {
+ str = fmt.Sprintf("b.%s %s", c.flag(), target.String())
+ }
+ }
+ case adr:
+ str = fmt.Sprintf("adr %s, #%#x", formatVRegSized(i.rd.nr(), 64), int64(i.u1))
+ case brTableSequence:
+ targetIndex := i.u1
+ str = fmt.Sprintf("br_table_sequence %s, table_index=%d", formatVRegSized(i.rn.nr(), 64), targetIndex)
+ case exitSequence:
+ str = fmt.Sprintf("exit_sequence %s", formatVRegSized(i.rn.nr(), 64))
+ case atomicRmw:
+ m := atomicRmwOp(i.u1).String()
+ size := byte(32)
+ switch i.u2 {
+ case 8:
+ size = 64
+ case 2:
+ m = m + "h"
+ case 1:
+ m = m + "b"
+ }
+ str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), 64))
+ case atomicCas:
+ m := "casal"
+ size := byte(32)
+ switch i.u2 {
+ case 8:
+ size = 64
+ case 2:
+ m = m + "h"
+ case 1:
+ m = m + "b"
+ }
+ str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rn.nr(), 64))
+ case atomicLoad:
+ m := "ldar"
+ size := byte(32)
+ switch i.u2 {
+ case 8:
+ size = 64
+ case 2:
+ m = m + "h"
+ case 1:
+ m = m + "b"
+ }
+ str = fmt.Sprintf("%s %s, %s", m, formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), 64))
+ case atomicStore:
+ m := "stlr"
+ size := byte(32)
+ switch i.u2 {
+ case 8:
+ size = 64
+ case 2:
+ m = m + "h"
+ case 1:
+ m = m + "b"
+ }
+ str = fmt.Sprintf("%s %s, %s", m, formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rn.nr(), 64))
+ case dmb:
+ str = "dmb"
+ case udf:
+ str = "udf"
+ case emitSourceOffsetInfo:
+ str = fmt.Sprintf("source_offset_info %d", ssa.SourceOffset(i.u1))
+ case vecLoad1R:
+ str = fmt.Sprintf("ld1r {%s}, [%s]", formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone), formatVRegSized(i.rn.nr(), 64))
+ case loadConstBlockArg:
+ str = fmt.Sprintf("load_const_block_arg %s, %#x", formatVRegSized(i.rd.nr(), 64), i.u1)
+ default:
+ panic(i.kind)
+ }
+ return
+}
+
+func (i *instruction) asAdr(rd regalloc.VReg, offset int64) {
+ i.kind = adr
+ i.rd = operandNR(rd)
+ i.u1 = uint64(offset)
+}
+
+func (i *instruction) asAtomicRmw(op atomicRmwOp, rn, rs, rt operand, size uint64) {
+ i.kind = atomicRmw
+ i.rd, i.rn, i.rm = rt, rn, rs
+ i.u1 = uint64(op)
+ i.u2 = size
+}
+
+func (i *instruction) asAtomicCas(rn, rs, rt operand, size uint64) {
+ i.kind = atomicCas
+ i.rm, i.rn, i.rd = rt, rn, rs
+ i.u2 = size
+}
+
+func (i *instruction) asAtomicLoad(rn, rt operand, size uint64) {
+ i.kind = atomicLoad
+ i.rn, i.rd = rn, rt
+ i.u2 = size
+}
+
+func (i *instruction) asAtomicStore(rn, rt operand, size uint64) {
+ i.kind = atomicStore
+ i.rn, i.rm = rn, rt
+ i.u2 = size
+}
+
+func (i *instruction) asDMB() {
+ i.kind = dmb
+}
+
+// TODO: delete unnecessary things.
+const (
+ // nop0 represents a no-op of zero size.
+ nop0 instructionKind = iota + 1
+ // aluRRR represents an ALU operation with two register sources and a register destination.
+ aluRRR
+ // aluRRRR represents an ALU operation with three register sources and a register destination.
+ aluRRRR
+ // aluRRImm12 represents an ALU operation with a register source and an immediate-12 source, with a register destination.
+ aluRRImm12
+ // aluRRBitmaskImm represents an ALU operation with a register source and a bitmask immediate, with a register destination.
+ aluRRBitmaskImm
+ // aluRRImmShift represents an ALU operation with a register source and an immediate-shifted source, with a register destination.
+ aluRRImmShift
+ // aluRRRShift represents an ALU operation with two register sources, one of which can be shifted, with a register destination.
+ aluRRRShift
+ // aluRRRExtend represents an ALU operation with two register sources, one of which can be extended, with a register destination.
+ aluRRRExtend
+ // bitRR represents a bit op instruction with a single register source.
+ bitRR
+ // uLoad8 represents an unsigned 8-bit load.
+ uLoad8
+ // sLoad8 represents a signed 8-bit load into 64-bit register.
+ sLoad8
+ // uLoad16 represents an unsigned 16-bit load into 64-bit register.
+ uLoad16
+ // sLoad16 represents a signed 16-bit load into 64-bit register.
+ sLoad16
+ // uLoad32 represents an unsigned 32-bit load into 64-bit register.
+ uLoad32
+ // sLoad32 represents a signed 32-bit load into 64-bit register.
+ sLoad32
+ // uLoad64 represents a 64-bit load.
+ uLoad64
+ // store8 represents an 8-bit store.
+ store8
+ // store16 represents a 16-bit store.
+ store16
+ // store32 represents a 32-bit store.
+ store32
+ // store64 represents a 64-bit store.
+ store64
+ // storeP64 represents a store of a pair of registers.
+ storeP64
+ // loadP64 represents a load of a pair of registers.
+ loadP64
+ // mov64 represents a MOV instruction. These are encoded as ORR's but we keep them separate for better handling.
+ mov64
+ // mov32 represents a 32-bit MOV. This zeroes the top 32 bits of the destination.
+ mov32
+ // movZ represents a MOVZ with a 16-bit immediate.
+ movZ
+ // movN represents a MOVN with a 16-bit immediate.
+ movN
+ // movK represents a MOVK with a 16-bit immediate.
+ movK
+ // extend represents a sign- or zero-extend operation.
+ extend
+ // cSel represents a conditional-select operation.
+ cSel
+ // cSet represents a conditional-set operation.
+ cSet
+ // cCmpImm represents a conditional comparison with an immediate.
+ cCmpImm
+	// fpuMov64 represents an FPU move. Distinct from a vector-register move; moving just 64 bits appears to be significantly faster.
+ fpuMov64
+ // fpuMov128 represents a vector register move.
+ fpuMov128
+ // fpuMovFromVec represents a move to scalar from a vector element.
+ fpuMovFromVec
+ // fpuRR represents a 1-op FPU instruction.
+ fpuRR
+ // fpuRRR represents a 2-op FPU instruction.
+ fpuRRR
+ // fpuRRI represents a 2-op FPU instruction with immediate value.
+ fpuRRI
+ // fpuRRRR represents a 3-op FPU instruction.
+ fpuRRRR
+ // fpuCmp represents a FPU comparison, either 32 or 64 bit.
+ fpuCmp
+ // fpuLoad32 represents a floating-point load, single-precision (32 bit).
+ fpuLoad32
+ // fpuStore32 represents a floating-point store, single-precision (32 bit).
+ fpuStore32
+ // fpuLoad64 represents a floating-point load, double-precision (64 bit).
+ fpuLoad64
+ // fpuStore64 represents a floating-point store, double-precision (64 bit).
+ fpuStore64
+ // fpuLoad128 represents a floating-point/vector load, 128 bit.
+ fpuLoad128
+ // fpuStore128 represents a floating-point/vector store, 128 bit.
+ fpuStore128
+ // loadFpuConst32 represents a load of a 32-bit floating-point constant.
+ loadFpuConst32
+ // loadFpuConst64 represents a load of a 64-bit floating-point constant.
+ loadFpuConst64
+ // loadFpuConst128 represents a load of a 128-bit floating-point constant.
+ loadFpuConst128
+	// vecLoad1R represents a load of a single-element structure that is replicated to all lanes of a vector.
+ vecLoad1R
+ // fpuToInt represents a conversion from FP to integer.
+ fpuToInt
+ // intToFpu represents a conversion from integer to FP.
+ intToFpu
+ // fpuCSel represents a 32/64-bit FP conditional select.
+ fpuCSel
+ // movToVec represents a move to a vector element from a GPR.
+ movToVec
+ // movFromVec represents an unsigned move from a vector element to a GPR.
+ movFromVec
+ // movFromVecSigned represents a signed move from a vector element to a GPR.
+ movFromVecSigned
+ // vecDup represents a duplication of general-purpose register to vector.
+ vecDup
+ // vecDupElement represents a duplication of a vector element to vector or scalar.
+ vecDupElement
+ // vecDupFromFpu represents a duplication of scalar to vector.
+ vecDupFromFpu
+ // vecExtract represents a vector extraction operation.
+ vecExtract
+ // vecExtend represents a vector extension operation.
+ vecExtend
+	// vecMovElement represents a move of one vector element to another vector element.
+ vecMovElement
+ // vecMiscNarrow represents a vector narrowing operation.
+ vecMiscNarrow
+ // vecRRR represents a vector ALU operation.
+ vecRRR
+	// vecRRRRewrite is exactly the same as vecRRR except that it rewrites the destination register.
+	// For example, the BSL instruction rewrites the destination register, so its existing value influences the result.
+	// Therefore, the "destination" register in vecRRRRewrite is treated as a "use", which makes the register outlive
+	// the instruction, while the instruction has no "def" in the context of register allocation.
+ vecRRRRewrite
+ // vecMisc represents a vector two register miscellaneous instruction.
+ vecMisc
+ // vecLanes represents a vector instruction across lanes.
+ vecLanes
+ // vecShiftImm represents a SIMD scalar shift by immediate instruction.
+ vecShiftImm
+ // vecTbl represents a table vector lookup - single register table.
+ vecTbl
+ // vecTbl2 represents a table vector lookup - two register table.
+ vecTbl2
+ // vecPermute represents a vector permute instruction.
+ vecPermute
+	// movToFPSR represents a move to the FPSR.
+	movToFPSR
+	// movFromFPSR represents a move from the FPSR.
+	movFromFPSR
+ // call represents a machine call instruction.
+ call
+ // callInd represents a machine indirect-call instruction.
+ callInd
+ // ret represents a machine return instruction.
+ ret
+ // br represents an unconditional branch.
+ br
+ // condBr represents a conditional branch.
+ condBr
+	// adr represents computing the address (using a PC-relative offset) of a memory location.
+ adr
+ // brTableSequence represents a jump-table sequence.
+ brTableSequence
+ // exitSequence consists of multiple instructions, and exits the execution immediately.
+ // See encodeExitSequence.
+ exitSequence
+ // atomicRmw represents an atomic read-modify-write operation with two register sources and a register destination.
+ atomicRmw
+	// atomicCas represents an atomic compare-and-swap operation with three register sources. The loaded value is
+	// written back to the source register that held the comparison value.
+ atomicCas
+ // atomicLoad represents an atomic load with one source register and a register destination.
+ atomicLoad
+ // atomicStore represents an atomic store with two source registers and no destination.
+ atomicStore
+ // dmb represents the data memory barrier instruction in inner-shareable (ish) mode.
+ dmb
+ // UDF is the undefined instruction. For debugging only.
+ udf
+ // loadConstBlockArg represents a load of a constant block argument.
+ loadConstBlockArg
+
+ // emitSourceOffsetInfo is a dummy instruction to emit source offset info.
+ // The existence of this instruction does not affect the execution.
+ emitSourceOffsetInfo
+
+ // ------------------- do not define below this line -------------------
+ numInstructionKinds
+)
+
+func (i *instruction) asLoadConstBlockArg(v uint64, typ ssa.Type, dst regalloc.VReg) *instruction {
+ i.kind = loadConstBlockArg
+ i.u1 = v
+ i.u2 = uint64(typ)
+ i.rd = operandNR(dst)
+ return i
+}
+
+func (i *instruction) loadConstBlockArgData() (v uint64, typ ssa.Type, dst regalloc.VReg) {
+ return i.u1, ssa.Type(i.u2), i.rd.nr()
+}
+
+func (i *instruction) asEmitSourceOffsetInfo(l ssa.SourceOffset) *instruction {
+ i.kind = emitSourceOffsetInfo
+ i.u1 = uint64(l)
+ return i
+}
+
+func (i *instruction) sourceOffsetInfo() ssa.SourceOffset {
+ return ssa.SourceOffset(i.u1)
+}
+
+func (i *instruction) asUDF() *instruction {
+ i.kind = udf
+ return i
+}
+
+func (i *instruction) asFpuToInt(rd, rn operand, rdSigned, src64bit, dst64bit bool) {
+ i.kind = fpuToInt
+ i.rn = rn
+ i.rd = rd
+ if rdSigned {
+ i.u1 = 1
+ }
+ if src64bit {
+ i.u2 = 1
+ }
+ if dst64bit {
+ i.u3 = 1
+ }
+}
+
+func (i *instruction) asIntToFpu(rd, rn operand, rnSigned, src64bit, dst64bit bool) {
+ i.kind = intToFpu
+ i.rn = rn
+ i.rd = rd
+ if rnSigned {
+ i.u1 = 1
+ }
+ if src64bit {
+ i.u2 = 1
+ }
+ if dst64bit {
+ i.u3 = 1
+ }
+}
+
+func (i *instruction) asExitSequence(ctx regalloc.VReg) *instruction {
+ i.kind = exitSequence
+ i.rn = operandNR(ctx)
+ return i
+}
+
+// aluOp determines the type of ALU operation. Instructions whose kind is one of
+// aluRRR, aluRRRR, aluRRImm12, aluRRBitmaskImm, aluRRImmShift, aluRRRShift and aluRRRExtend
+// use this type.
+type aluOp int
+
+func (a aluOp) String() string {
+ switch a {
+ case aluOpAdd:
+ return "add"
+ case aluOpSub:
+ return "sub"
+ case aluOpOrr:
+ return "orr"
+ case aluOpOrn:
+ return "orn"
+ case aluOpAnd:
+ return "and"
+ case aluOpAnds:
+ return "ands"
+ case aluOpBic:
+ return "bic"
+ case aluOpEor:
+ return "eor"
+ case aluOpAddS:
+ return "adds"
+ case aluOpSubS:
+ return "subs"
+ case aluOpSMulH:
+ return "sMulH"
+ case aluOpUMulH:
+ return "uMulH"
+ case aluOpSDiv:
+ return "sdiv"
+ case aluOpUDiv:
+ return "udiv"
+ case aluOpRotR:
+ return "ror"
+ case aluOpLsr:
+ return "lsr"
+ case aluOpAsr:
+ return "asr"
+ case aluOpLsl:
+ return "lsl"
+ case aluOpMAdd:
+ return "madd"
+ case aluOpMSub:
+ return "msub"
+ }
+ panic(int(a))
+}
+
+const (
+ // 32/64-bit Add.
+ aluOpAdd aluOp = iota
+ // 32/64-bit Subtract.
+ aluOpSub
+ // 32/64-bit Bitwise OR.
+ aluOpOrr
+ // 32/64-bit Bitwise OR NOT.
+ aluOpOrn
+ // 32/64-bit Bitwise AND.
+ aluOpAnd
+ // 32/64-bit Bitwise ANDS.
+ aluOpAnds
+ // 32/64-bit Bitwise AND NOT.
+ aluOpBic
+ // 32/64-bit Bitwise XOR (Exclusive OR).
+ aluOpEor
+ // 32/64-bit Add setting flags.
+ aluOpAddS
+ // 32/64-bit Subtract setting flags.
+ aluOpSubS
+ // Signed multiply, high-word result.
+ aluOpSMulH
+ // Unsigned multiply, high-word result.
+ aluOpUMulH
+ // 64-bit Signed divide.
+ aluOpSDiv
+ // 64-bit Unsigned divide.
+ aluOpUDiv
+ // 32/64-bit Rotate right.
+ aluOpRotR
+ // 32/64-bit Logical shift right.
+ aluOpLsr
+ // 32/64-bit Arithmetic shift right.
+ aluOpAsr
+ // 32/64-bit Logical shift left.
+	aluOpLsl
+
+	// Multiply-add and multiply-subtract; MAdd and MSub are only applicable for aluRRRR.
+ aluOpMAdd
+ aluOpMSub
+)
+
+// vecOp determines the type of vector operation. Instructions whose kind is one of
+// vecRRR, vecRRRRewrite, vecMisc, vecLanes, vecShiftImm and vecPermute use this type.
+type vecOp int
+
+// String implements fmt.Stringer.
+func (b vecOp) String() string {
+ switch b {
+ case vecOpCnt:
+ return "cnt"
+ case vecOpCmeq:
+ return "cmeq"
+ case vecOpCmgt:
+ return "cmgt"
+ case vecOpCmhi:
+ return "cmhi"
+ case vecOpCmge:
+ return "cmge"
+ case vecOpCmhs:
+ return "cmhs"
+ case vecOpFcmeq:
+ return "fcmeq"
+ case vecOpFcmgt:
+ return "fcmgt"
+ case vecOpFcmge:
+ return "fcmge"
+ case vecOpCmeq0:
+ return "cmeq0"
+ case vecOpUaddlv:
+ return "uaddlv"
+ case vecOpBit:
+ return "bit"
+ case vecOpBic:
+ return "bic"
+ case vecOpBsl:
+ return "bsl"
+ case vecOpNot:
+ return "not"
+ case vecOpAnd:
+ return "and"
+ case vecOpOrr:
+ return "orr"
+ case vecOpEOR:
+ return "eor"
+ case vecOpFadd:
+ return "fadd"
+ case vecOpAdd:
+ return "add"
+ case vecOpAddp:
+ return "addp"
+ case vecOpAddv:
+ return "addv"
+ case vecOpSub:
+ return "sub"
+ case vecOpFsub:
+ return "fsub"
+ case vecOpSmin:
+ return "smin"
+ case vecOpUmin:
+ return "umin"
+ case vecOpUminv:
+ return "uminv"
+ case vecOpSmax:
+ return "smax"
+ case vecOpUmax:
+ return "umax"
+ case vecOpUmaxp:
+ return "umaxp"
+ case vecOpUrhadd:
+ return "urhadd"
+ case vecOpFmul:
+ return "fmul"
+ case vecOpSqrdmulh:
+ return "sqrdmulh"
+ case vecOpMul:
+ return "mul"
+ case vecOpUmlal:
+ return "umlal"
+ case vecOpFdiv:
+ return "fdiv"
+ case vecOpFsqrt:
+ return "fsqrt"
+ case vecOpAbs:
+ return "abs"
+ case vecOpFabs:
+ return "fabs"
+ case vecOpNeg:
+ return "neg"
+ case vecOpFneg:
+ return "fneg"
+ case vecOpFrintp:
+ return "frintp"
+ case vecOpFrintm:
+ return "frintm"
+ case vecOpFrintn:
+ return "frintn"
+ case vecOpFrintz:
+ return "frintz"
+ case vecOpFcvtl:
+ return "fcvtl"
+ case vecOpFcvtn:
+ return "fcvtn"
+ case vecOpFcvtzu:
+ return "fcvtzu"
+ case vecOpFcvtzs:
+ return "fcvtzs"
+ case vecOpScvtf:
+ return "scvtf"
+ case vecOpUcvtf:
+ return "ucvtf"
+ case vecOpSqxtn:
+ return "sqxtn"
+ case vecOpUqxtn:
+ return "uqxtn"
+ case vecOpSqxtun:
+ return "sqxtun"
+ case vecOpRev64:
+ return "rev64"
+ case vecOpXtn:
+ return "xtn"
+ case vecOpShll:
+ return "shll"
+ case vecOpSshl:
+ return "sshl"
+ case vecOpSshll:
+ return "sshll"
+ case vecOpUshl:
+ return "ushl"
+ case vecOpUshll:
+ return "ushll"
+ case vecOpSshr:
+ return "sshr"
+ case vecOpZip1:
+ return "zip1"
+ case vecOpFmin:
+ return "fmin"
+ case vecOpFmax:
+ return "fmax"
+ case vecOpSmull:
+ return "smull"
+ case vecOpSmull2:
+ return "smull2"
+ }
+ panic(int(b))
+}
+
+const (
+ vecOpCnt vecOp = iota
+ vecOpCmeq0
+ vecOpCmeq
+ vecOpCmgt
+ vecOpCmhi
+ vecOpCmge
+ vecOpCmhs
+ vecOpFcmeq
+ vecOpFcmgt
+ vecOpFcmge
+ vecOpUaddlv
+ vecOpBit
+ vecOpBic
+ vecOpBsl
+ vecOpNot
+ vecOpAnd
+ vecOpOrr
+ vecOpEOR
+ vecOpAdd
+ vecOpFadd
+ vecOpAddv
+ vecOpSqadd
+ vecOpUqadd
+ vecOpAddp
+ vecOpSub
+ vecOpFsub
+ vecOpSqsub
+ vecOpUqsub
+ vecOpSmin
+ vecOpUmin
+ vecOpUminv
+ vecOpFmin
+ vecOpSmax
+ vecOpUmax
+ vecOpUmaxp
+ vecOpFmax
+ vecOpUrhadd
+ vecOpMul
+ vecOpFmul
+ vecOpSqrdmulh
+ vecOpUmlal
+ vecOpFdiv
+ vecOpFsqrt
+ vecOpAbs
+ vecOpFabs
+ vecOpNeg
+ vecOpFneg
+ vecOpFrintm
+ vecOpFrintn
+ vecOpFrintp
+ vecOpFrintz
+ vecOpFcvtl
+ vecOpFcvtn
+ vecOpFcvtzs
+ vecOpFcvtzu
+ vecOpScvtf
+ vecOpUcvtf
+ vecOpSqxtn
+ vecOpSqxtun
+ vecOpUqxtn
+ vecOpRev64
+ vecOpXtn
+ vecOpShll
+ vecOpSshl
+ vecOpSshll
+ vecOpUshl
+ vecOpUshll
+ vecOpSshr
+ vecOpZip1
+ vecOpSmull
+ vecOpSmull2
+)
+
+// bitOp determines the type of bitwise operation. Instructions whose kind is one of
+// bitOpRbit and bitOpClz would use this type.
+type bitOp int
+
+// String implements fmt.Stringer.
+func (b bitOp) String() string {
+ switch b {
+ case bitOpRbit:
+ return "rbit"
+ case bitOpClz:
+ return "clz"
+ }
+ panic(int(b))
+}
+
+const (
+ // 32/64-bit Rbit.
+ bitOpRbit bitOp = iota
+ // 32/64-bit Clz.
+ bitOpClz
+)
+
+// fpuUniOp represents a unary floating-point unit (FPU) operation.
+type fpuUniOp byte
+
+const (
+ fpuUniOpNeg fpuUniOp = iota
+ fpuUniOpCvt32To64
+ fpuUniOpCvt64To32
+ fpuUniOpSqrt
+ fpuUniOpRoundPlus
+ fpuUniOpRoundMinus
+ fpuUniOpRoundZero
+ fpuUniOpRoundNearest
+ fpuUniOpAbs
+)
+
+// String implements fmt.Stringer.
+func (f fpuUniOp) String() string {
+ switch f {
+ case fpuUniOpNeg:
+ return "fneg"
+ case fpuUniOpCvt32To64:
+ return "fcvt"
+ case fpuUniOpCvt64To32:
+ return "fcvt"
+ case fpuUniOpSqrt:
+ return "fsqrt"
+ case fpuUniOpRoundPlus:
+ return "frintp"
+ case fpuUniOpRoundMinus:
+ return "frintm"
+ case fpuUniOpRoundZero:
+ return "frintz"
+ case fpuUniOpRoundNearest:
+ return "frintn"
+ case fpuUniOpAbs:
+ return "fabs"
+ }
+ panic(int(f))
+}
+
+// fpuBinOp represents a binary floating-point unit (FPU) operation.
+type fpuBinOp byte
+
+const (
+	fpuBinOpAdd fpuBinOp = iota
+ fpuBinOpSub
+ fpuBinOpMul
+ fpuBinOpDiv
+ fpuBinOpMax
+ fpuBinOpMin
+)
+
+// String implements fmt.Stringer.
+func (f fpuBinOp) String() string {
+ switch f {
+ case fpuBinOpAdd:
+ return "fadd"
+ case fpuBinOpSub:
+ return "fsub"
+ case fpuBinOpMul:
+ return "fmul"
+ case fpuBinOpDiv:
+ return "fdiv"
+ case fpuBinOpMax:
+ return "fmax"
+ case fpuBinOpMin:
+ return "fmin"
+ }
+ panic(int(f))
+}
+
+// extMode represents the mode of a register operand extension.
+// For example, aluRRRExtend instructions need this info to determine the extensions.
+type extMode byte
+
+const (
+ extModeNone extMode = iota
+	// extModeZeroExtend32 suggests a zero-extension to 32 bits if the original bit size is less than 32.
+	extModeZeroExtend32
+	// extModeSignExtend32 stands for a sign-extension to 32 bits if the original bit size is less than 32.
+	extModeSignExtend32
+ // extModeZeroExtend64 suggests a zero-extension to 64 bits if the original bit size is less than 64.
+ extModeZeroExtend64
+ // extModeSignExtend64 stands for a sign-extension to 64 bits if the original bit size is less than 64.
+ extModeSignExtend64
+)
+
+func (e extMode) bits() byte {
+ switch e {
+ case extModeZeroExtend32, extModeSignExtend32:
+ return 32
+ case extModeZeroExtend64, extModeSignExtend64:
+ return 64
+ default:
+ return 0
+ }
+}
+
+func (e extMode) signed() bool {
+ switch e {
+ case extModeSignExtend32, extModeSignExtend64:
+ return true
+ default:
+ return false
+ }
+}
+
+func extModeOf(t ssa.Type, signed bool) extMode {
+ switch t.Bits() {
+ case 32:
+ if signed {
+ return extModeSignExtend32
+ }
+ return extModeZeroExtend32
+ case 64:
+ if signed {
+ return extModeSignExtend64
+ }
+ return extModeZeroExtend64
+ default:
+ panic("TODO? do we need narrower than 32 bits?")
+ }
+}
+
+type extendOp byte
+
+const (
+ extendOpUXTB extendOp = 0b000
+ extendOpUXTH extendOp = 0b001
+ extendOpUXTW extendOp = 0b010
+	// extendOpUXTX does nothing, but is a convenient symbol that officially exists. See:
+	// https://stackoverflow.com/questions/72041372/what-do-the-uxtx-and-sxtx-extensions-mean-for-32-bit-aarch64-adds-instruct
+ extendOpUXTX extendOp = 0b011
+ extendOpSXTB extendOp = 0b100
+ extendOpSXTH extendOp = 0b101
+ extendOpSXTW extendOp = 0b110
+	// extendOpSXTX does nothing, but is a convenient symbol that officially exists. See:
+	// https://stackoverflow.com/questions/72041372/what-do-the-uxtx-and-sxtx-extensions-mean-for-32-bit-aarch64-adds-instruct
+ extendOpSXTX extendOp = 0b111
+ extendOpNone extendOp = 0xff
+)
+
+func (e extendOp) srcBits() byte {
+ switch e {
+ case extendOpUXTB, extendOpSXTB:
+ return 8
+ case extendOpUXTH, extendOpSXTH:
+ return 16
+ case extendOpUXTW, extendOpSXTW:
+ return 32
+ case extendOpUXTX, extendOpSXTX:
+ return 64
+ }
+ panic(int(e))
+}
+
+func (e extendOp) String() string {
+ switch e {
+ case extendOpUXTB:
+ return "UXTB"
+ case extendOpUXTH:
+ return "UXTH"
+ case extendOpUXTW:
+ return "UXTW"
+ case extendOpUXTX:
+ return "UXTX"
+ case extendOpSXTB:
+ return "SXTB"
+ case extendOpSXTH:
+ return "SXTH"
+ case extendOpSXTW:
+ return "SXTW"
+ case extendOpSXTX:
+ return "SXTX"
+ }
+ panic(int(e))
+}
+
+func extendOpFrom(signed bool, from byte) extendOp {
+ switch from {
+ case 8:
+ if signed {
+ return extendOpSXTB
+ }
+ return extendOpUXTB
+ case 16:
+ if signed {
+ return extendOpSXTH
+ }
+ return extendOpUXTH
+ case 32:
+ if signed {
+ return extendOpSXTW
+ }
+ return extendOpUXTW
+ case 64:
+ if signed {
+ return extendOpSXTX
+ }
+ return extendOpUXTX
+ }
+ panic("invalid extendOpFrom")
+}
+
+type shiftOp byte
+
+const (
+ shiftOpLSL shiftOp = 0b00
+ shiftOpLSR shiftOp = 0b01
+ shiftOpASR shiftOp = 0b10
+ shiftOpROR shiftOp = 0b11
+)
+
+func (s shiftOp) String() string {
+ switch s {
+ case shiftOpLSL:
+ return "lsl"
+ case shiftOpLSR:
+ return "lsr"
+ case shiftOpASR:
+ return "asr"
+ case shiftOpROR:
+ return "ror"
+ }
+ panic(int(s))
+}
+
+const exitSequenceSize = 6 * 4 // 6 instructions as in encodeExitSequence.
+
+// size returns the size of the instruction in encoded bytes.
+func (i *instruction) size() int64 {
+ switch i.kind {
+ case exitSequence:
+		return exitSequenceSize // 6 instructions as in encodeExitSequence.
+ case nop0, loadConstBlockArg:
+ return 0
+ case emitSourceOffsetInfo:
+ return 0
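+	// Non-zero FP constants are emitted as an ldr (literal), a branch over the inlined literal, and the
+	// literal data itself (see the String output for loadFpuConst*), hence the 4 + 4 + {4,8,16} byte totals below.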
+ case loadFpuConst32:
+ if i.u1 == 0 {
+ return 4 // zero loading can be encoded as a single instruction.
+ }
+ return 4 + 4 + 4
+ case loadFpuConst64:
+ if i.u1 == 0 {
+ return 4 // zero loading can be encoded as a single instruction.
+ }
+ return 4 + 4 + 8
+ case loadFpuConst128:
+ if i.u1 == 0 && i.u2 == 0 {
+ return 4 // zero loading can be encoded as a single instruction.
+ }
+ return 4 + 4 + 16
+ case brTableSequence:
+ return 4*4 + int64(i.u2)*4
+ default:
+ return 4
+ }
+}
+
+// vecArrangement is the arrangement of data within a vector register.
+type vecArrangement byte
+
+const (
+ // vecArrangementNone is an arrangement indicating no data is stored.
+ vecArrangementNone vecArrangement = iota
+ // vecArrangement8B is an arrangement of 8 bytes (64-bit vector)
+ vecArrangement8B
+ // vecArrangement16B is an arrangement of 16 bytes (128-bit vector)
+ vecArrangement16B
+ // vecArrangement4H is an arrangement of 4 half precisions (64-bit vector)
+ vecArrangement4H
+ // vecArrangement8H is an arrangement of 8 half precisions (128-bit vector)
+ vecArrangement8H
+ // vecArrangement2S is an arrangement of 2 single precisions (64-bit vector)
+ vecArrangement2S
+ // vecArrangement4S is an arrangement of 4 single precisions (128-bit vector)
+ vecArrangement4S
+ // vecArrangement1D is an arrangement of 1 double precision (64-bit vector)
+ vecArrangement1D
+ // vecArrangement2D is an arrangement of 2 double precisions (128-bit vector)
+ vecArrangement2D
+
+ // Assign each vector size specifier to a vector arrangement ID.
+ // Instructions can only have an arrangement or a size specifier, but not both, so it
+ // simplifies the internal representation of vector instructions by being able to
+ // store either into the same field.
+
+ // vecArrangementB is a size specifier of byte
+ vecArrangementB
+	// vecArrangementH is a size specifier of halfword (16-bit)
+	vecArrangementH
+	// vecArrangementS is a size specifier of word (32-bit)
+	vecArrangementS
+	// vecArrangementD is a size specifier of doubleword (64-bit)
+ vecArrangementD
+ // vecArrangementQ is a size specifier of the entire vector (128-bit)
+ vecArrangementQ
+)
+
+// String implements fmt.Stringer
+func (v vecArrangement) String() (ret string) {
+ switch v {
+ case vecArrangement8B:
+ ret = "8B"
+ case vecArrangement16B:
+ ret = "16B"
+ case vecArrangement4H:
+ ret = "4H"
+ case vecArrangement8H:
+ ret = "8H"
+ case vecArrangement2S:
+ ret = "2S"
+ case vecArrangement4S:
+ ret = "4S"
+ case vecArrangement1D:
+ ret = "1D"
+ case vecArrangement2D:
+ ret = "2D"
+ case vecArrangementB:
+ ret = "B"
+ case vecArrangementH:
+ ret = "H"
+ case vecArrangementS:
+ ret = "S"
+ case vecArrangementD:
+ ret = "D"
+ case vecArrangementQ:
+ ret = "Q"
+ case vecArrangementNone:
+ ret = "none"
+ default:
+ panic(v)
+ }
+ return
+}
+
+// vecIndex is the index of an element of a vector register
+type vecIndex byte
+
+// vecIndexNone indicates no vector index specified.
+const vecIndexNone = ^vecIndex(0)
+
+func ssaLaneToArrangement(lane ssa.VecLane) vecArrangement {
+ switch lane {
+ case ssa.VecLaneI8x16:
+ return vecArrangement16B
+ case ssa.VecLaneI16x8:
+ return vecArrangement8H
+ case ssa.VecLaneI32x4:
+ return vecArrangement4S
+ case ssa.VecLaneI64x2:
+ return vecArrangement2D
+ case ssa.VecLaneF32x4:
+ return vecArrangement4S
+ case ssa.VecLaneF64x2:
+ return vecArrangement2D
+ default:
+ panic(lane)
+ }
+}
+
+// atomicRmwOp is the type of atomic read-modify-write operation.
+type atomicRmwOp byte
+
+const (
+ // atomicRmwOpAdd is an atomic add operation.
+ atomicRmwOpAdd atomicRmwOp = iota
+ // atomicRmwOpClr is an atomic clear operation, i.e. AND NOT.
+ atomicRmwOpClr
+ // atomicRmwOpSet is an atomic set operation, i.e. OR.
+ atomicRmwOpSet
+ // atomicRmwOpEor is an atomic exclusive OR operation.
+ atomicRmwOpEor
+ // atomicRmwOpSwp is an atomic swap operation.
+ atomicRmwOpSwp
+)
+
+// String implements fmt.Stringer
+func (a atomicRmwOp) String() string {
+ switch a {
+ case atomicRmwOpAdd:
+ return "ldaddal"
+ case atomicRmwOpClr:
+ return "ldclral"
+ case atomicRmwOpSet:
+ return "ldsetal"
+ case atomicRmwOpEor:
+ return "ldeoral"
+ case atomicRmwOpSwp:
+ return "swpal"
+ }
+ panic(fmt.Sprintf("unknown atomicRmwOp: %d", a))
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go
new file mode 100644
index 000000000..227a96474
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go
@@ -0,0 +1,2351 @@
+package arm64
+
+import (
+ "context"
+ "fmt"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
+)
+
+// Encode implements backend.Machine Encode.
+func (m *machine) Encode(ctx context.Context) error {
+ m.resolveRelativeAddresses(ctx)
+ m.encode(m.executableContext.RootInstr)
+ if l := len(m.compiler.Buf()); l > maxFunctionExecutableSize {
+ return fmt.Errorf("function size exceeds the limit: %d > %d", l, maxFunctionExecutableSize)
+ }
+ return nil
+}
+
+func (m *machine) encode(root *instruction) {
+ for cur := root; cur != nil; cur = cur.next {
+ cur.encode(m)
+ }
+}
+
+func (i *instruction) encode(m *machine) {
+ c := m.compiler
+ switch kind := i.kind; kind {
+ case nop0, emitSourceOffsetInfo, loadConstBlockArg:
+ case exitSequence:
+ encodeExitSequence(c, i.rn.reg())
+ case ret:
+ // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/RET--Return-from-subroutine-?lang=en
+ c.Emit4Bytes(encodeRet())
+ case br:
+ imm := i.brOffset()
+ c.Emit4Bytes(encodeUnconditionalBranch(false, imm))
+ case call:
+ // We still don't know the exact address of the function to call, so we emit a placeholder.
+ c.AddRelocationInfo(i.callFuncRef())
+ c.Emit4Bytes(encodeUnconditionalBranch(true, 0)) // 0 = placeholder
+ case callInd:
+ c.Emit4Bytes(encodeUnconditionalBranchReg(regNumberInEncoding[i.rn.realReg()], true))
+ case store8, store16, store32, store64, fpuStore32, fpuStore64, fpuStore128:
+ c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rn.realReg()], i.amode))
+ case uLoad8, uLoad16, uLoad32, uLoad64, sLoad8, sLoad16, sLoad32, fpuLoad32, fpuLoad64, fpuLoad128:
+ c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rd.realReg()], i.amode))
+ case vecLoad1R:
+ c.Emit4Bytes(encodeVecLoad1R(
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ vecArrangement(i.u1)))
+ case condBr:
+ imm19 := i.condBrOffset()
+ if imm19%4 != 0 {
+			panic("imm19 for conditional branch must be a multiple of 4")
+ }
+
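+		// The byte offset is encoded as a signed word (4-byte) offset, truncated to 19 bits (imm19).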
+ imm19U32 := uint32(imm19/4) & 0b111_11111111_11111111
+ brCond := i.condBrCond()
+ switch brCond.kind() {
+ case condKindRegisterZero:
+ rt := regNumberInEncoding[brCond.register().RealReg()]
+ c.Emit4Bytes(encodeCBZCBNZ(rt, false, imm19U32, i.condBr64bit()))
+ case condKindRegisterNotZero:
+ rt := regNumberInEncoding[brCond.register().RealReg()]
+ c.Emit4Bytes(encodeCBZCBNZ(rt, true, imm19U32, i.condBr64bit()))
+ case condKindCondFlagSet:
+ // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/B-cond--Branch-conditionally-
+ fl := brCond.flag()
+ c.Emit4Bytes(0b01010100<<24 | (imm19U32 << 5) | uint32(fl))
+ default:
+ panic("BUG")
+ }
+ case movN:
+ c.Emit4Bytes(encodeMoveWideImmediate(0b00, regNumberInEncoding[i.rd.realReg()], i.u1, i.u2, i.u3))
+ case movZ:
+ c.Emit4Bytes(encodeMoveWideImmediate(0b10, regNumberInEncoding[i.rd.realReg()], i.u1, i.u2, i.u3))
+ case movK:
+ c.Emit4Bytes(encodeMoveWideImmediate(0b11, regNumberInEncoding[i.rd.realReg()], i.u1, i.u2, i.u3))
+ case mov32:
+ to, from := i.rd.realReg(), i.rn.realReg()
+ c.Emit4Bytes(encodeAsMov32(regNumberInEncoding[from], regNumberInEncoding[to]))
+ case mov64:
+ to, from := i.rd.realReg(), i.rn.realReg()
+ toIsSp := to == sp
+ fromIsSp := from == sp
+ c.Emit4Bytes(encodeMov64(regNumberInEncoding[to], regNumberInEncoding[from], toIsSp, fromIsSp))
+ case loadP64, storeP64:
+ rt, rt2 := regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()]
+ amode := i.amode
+ rn := regNumberInEncoding[amode.rn.RealReg()]
+ var pre bool
+ switch amode.kind {
+ case addressModeKindPostIndex:
+ case addressModeKindPreIndex:
+ pre = true
+ default:
+ panic("BUG")
+ }
+ c.Emit4Bytes(encodePreOrPostIndexLoadStorePair64(pre, kind == loadP64, rn, rt, rt2, amode.imm))
+ case loadFpuConst32:
+ rd := regNumberInEncoding[i.rd.realReg()]
+ if i.u1 == 0 {
+ c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement8B))
+ } else {
+ encodeLoadFpuConst32(c, rd, i.u1)
+ }
+ case loadFpuConst64:
+ rd := regNumberInEncoding[i.rd.realReg()]
+ if i.u1 == 0 {
+ c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement8B))
+ } else {
+ encodeLoadFpuConst64(c, regNumberInEncoding[i.rd.realReg()], i.u1)
+ }
+ case loadFpuConst128:
+ rd := regNumberInEncoding[i.rd.realReg()]
+ lo, hi := i.u1, i.u2
+ if lo == 0 && hi == 0 {
+ c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement16B))
+ } else {
+ encodeLoadFpuConst128(c, rd, lo, hi)
+ }
+ case aluRRRR:
+ c.Emit4Bytes(encodeAluRRRR(
+ aluOp(i.u1),
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ regNumberInEncoding[i.rm.realReg()],
+ regNumberInEncoding[i.ra.realReg()],
+ uint32(i.u3),
+ ))
+ case aluRRImmShift:
+ c.Emit4Bytes(encodeAluRRImm(
+ aluOp(i.u1),
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ uint32(i.rm.shiftImm()),
+ uint32(i.u3),
+ ))
+ case aluRRR:
+ rn := i.rn.realReg()
+ c.Emit4Bytes(encodeAluRRR(
+ aluOp(i.u1),
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[rn],
+ regNumberInEncoding[i.rm.realReg()],
+ i.u3 == 1,
+ rn == sp,
+ ))
+ case aluRRRExtend:
+ rm, exo, to := i.rm.er()
+ c.Emit4Bytes(encodeAluRRRExtend(
+ aluOp(i.u1),
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ regNumberInEncoding[rm.RealReg()],
+ exo,
+ to,
+ ))
+ case aluRRRShift:
+ r, amt, sop := i.rm.sr()
+ c.Emit4Bytes(encodeAluRRRShift(
+ aluOp(i.u1),
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ regNumberInEncoding[r.RealReg()],
+ uint32(amt),
+ sop,
+ i.u3 == 1,
+ ))
+ case aluRRBitmaskImm:
+ c.Emit4Bytes(encodeAluBitmaskImmediate(
+ aluOp(i.u1),
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ i.u2,
+ i.u3 == 1,
+ ))
+ case bitRR:
+ c.Emit4Bytes(encodeBitRR(
+ bitOp(i.u1),
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ uint32(i.u2)),
+ )
+ case aluRRImm12:
+ imm12, shift := i.rm.imm12()
+ c.Emit4Bytes(encodeAluRRImm12(
+ aluOp(i.u1),
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ imm12, shift,
+ i.u3 == 1,
+ ))
+ case fpuRRR:
+ c.Emit4Bytes(encodeFpuRRR(
+ fpuBinOp(i.u1),
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ regNumberInEncoding[i.rm.realReg()],
+ i.u3 == 1,
+ ))
+ case fpuMov64, fpuMov128:
+ // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/MOV--vector---Move-vector--an-alias-of-ORR--vector--register--
+ rd := regNumberInEncoding[i.rd.realReg()]
+ rn := regNumberInEncoding[i.rn.realReg()]
+ var q uint32
+ if kind == fpuMov128 {
+ q = 0b1
+ }
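+		// MOV (vector) is an alias of ORR Vd, Vn, Vn, so rn fills both the Rm (bits 20:16) and Rn (bits 9:5) fields.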
+ c.Emit4Bytes(q<<30 | 0b1110101<<21 | rn<<16 | 0b000111<<10 | rn<<5 | rd)
+ case cSet:
+ rd := regNumberInEncoding[i.rd.realReg()]
+ cf := condFlag(i.u1)
+ if i.u2 == 1 {
+ // https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/CSETM--Conditional-Set-Mask--an-alias-of-CSINV-
+ // Note that we set 64bit version here.
+ c.Emit4Bytes(0b1101101010011111<<16 | uint32(cf.invert())<<12 | 0b011111<<5 | rd)
+ } else {
+ // https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/CSET--Conditional-Set--an-alias-of-CSINC-
+ // Note that we set 64bit version here.
+ c.Emit4Bytes(0b1001101010011111<<16 | uint32(cf.invert())<<12 | 0b111111<<5 | rd)
+ }
+ case extend:
+ c.Emit4Bytes(encodeExtend(i.u3 == 1, byte(i.u1), byte(i.u2), regNumberInEncoding[i.rd.realReg()], regNumberInEncoding[i.rn.realReg()]))
+ case fpuCmp:
+ // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/FCMP--Floating-point-quiet-Compare--scalar--?lang=en
+ rn, rm := regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()]
+ var ftype uint32
+ if i.u3 == 1 {
+ ftype = 0b01 // double precision.
+ }
+ c.Emit4Bytes(0b1111<<25 | ftype<<22 | 1<<21 | rm<<16 | 0b1<<13 | rn<<5)
+ case udf:
+ // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/UDF--Permanently-Undefined-?lang=en
+ if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable {
+ c.Emit4Bytes(dummyInstruction)
+ } else {
+ c.Emit4Bytes(0)
+ }
+ case adr:
+ c.Emit4Bytes(encodeAdr(regNumberInEncoding[i.rd.realReg()], uint32(i.u1)))
+ case cSel:
+ c.Emit4Bytes(encodeConditionalSelect(
+ kind,
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ regNumberInEncoding[i.rm.realReg()],
+ condFlag(i.u1),
+ i.u3 == 1,
+ ))
+ case fpuCSel:
+ c.Emit4Bytes(encodeFpuCSel(
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ regNumberInEncoding[i.rm.realReg()],
+ condFlag(i.u1),
+ i.u3 == 1,
+ ))
+ case movToVec:
+ c.Emit4Bytes(encodeMoveToVec(
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ vecArrangement(byte(i.u1)),
+ vecIndex(i.u2),
+ ))
+ case movFromVec, movFromVecSigned:
+ c.Emit4Bytes(encodeMoveFromVec(
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ vecArrangement(byte(i.u1)),
+ vecIndex(i.u2),
+ i.kind == movFromVecSigned,
+ ))
+ case vecDup:
+ c.Emit4Bytes(encodeVecDup(
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ vecArrangement(byte(i.u1))))
+ case vecDupElement:
+ c.Emit4Bytes(encodeVecDupElement(
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ vecArrangement(byte(i.u1)),
+ vecIndex(i.u2)))
+ case vecExtract:
+ c.Emit4Bytes(encodeVecExtract(
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ regNumberInEncoding[i.rm.realReg()],
+ vecArrangement(byte(i.u1)),
+ uint32(i.u2)))
+ case vecPermute:
+ c.Emit4Bytes(encodeVecPermute(
+ vecOp(i.u1),
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ regNumberInEncoding[i.rm.realReg()],
+ vecArrangement(byte(i.u2))))
+ case vecMovElement:
+ c.Emit4Bytes(encodeVecMovElement(
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ vecArrangement(i.u1),
+ uint32(i.u2), uint32(i.u3),
+ ))
+ case vecMisc:
+ c.Emit4Bytes(encodeAdvancedSIMDTwoMisc(
+ vecOp(i.u1),
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ vecArrangement(i.u2),
+ ))
+ case vecLanes:
+ c.Emit4Bytes(encodeVecLanes(
+ vecOp(i.u1),
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ vecArrangement(i.u2),
+ ))
+ case vecShiftImm:
+ c.Emit4Bytes(encodeVecShiftImm(
+ vecOp(i.u1),
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ uint32(i.rm.shiftImm()),
+ vecArrangement(i.u2),
+ ))
+ case vecTbl:
+ c.Emit4Bytes(encodeVecTbl(
+ 1,
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ regNumberInEncoding[i.rm.realReg()],
+ vecArrangement(i.u2)),
+ )
+ case vecTbl2:
+ c.Emit4Bytes(encodeVecTbl(
+ 2,
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ regNumberInEncoding[i.rm.realReg()],
+ vecArrangement(i.u2)),
+ )
+ case brTableSequence:
+ targets := m.jmpTableTargets[i.u1]
+ encodeBrTableSequence(c, i.rn.reg(), targets)
+ case fpuToInt, intToFpu:
+ c.Emit4Bytes(encodeCnvBetweenFloatInt(i))
+ case fpuRR:
+ c.Emit4Bytes(encodeFloatDataOneSource(
+ fpuUniOp(i.u1),
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ i.u3 == 1,
+ ))
+ case vecRRR:
+ if op := vecOp(i.u1); op == vecOpBsl || op == vecOpBit || op == vecOpUmlal {
+ panic(fmt.Sprintf("vecOp %s must use vecRRRRewrite instead of vecRRR", op.String()))
+ }
+ fallthrough
+ case vecRRRRewrite:
+ c.Emit4Bytes(encodeVecRRR(
+ vecOp(i.u1),
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ regNumberInEncoding[i.rm.realReg()],
+ vecArrangement(i.u2),
+ ))
+ case cCmpImm:
+ // Conditional compare (immediate) in https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en
+ sf := uint32(i.u3 & 0b1)
+ nzcv := uint32(i.u2 & 0b1111)
+ cond := uint32(condFlag(i.u1))
+ imm := uint32(i.rm.data & 0b11111)
+ rn := regNumberInEncoding[i.rn.realReg()]
+ c.Emit4Bytes(
+ sf<<31 | 0b111101001<<22 | imm<<16 | cond<<12 | 0b1<<11 | rn<<5 | nzcv,
+ )
+ case movFromFPSR:
+ rt := regNumberInEncoding[i.rd.realReg()]
+ c.Emit4Bytes(encodeSystemRegisterMove(rt, true))
+ case movToFPSR:
+ rt := regNumberInEncoding[i.rn.realReg()]
+ c.Emit4Bytes(encodeSystemRegisterMove(rt, false))
+ case atomicRmw:
+ c.Emit4Bytes(encodeAtomicRmw(
+ atomicRmwOp(i.u1),
+ regNumberInEncoding[i.rm.realReg()],
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ uint32(i.u2),
+ ))
+ case atomicCas:
+ c.Emit4Bytes(encodeAtomicCas(
+ regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rm.realReg()],
+ regNumberInEncoding[i.rn.realReg()],
+ uint32(i.u2),
+ ))
+ case atomicLoad:
+ c.Emit4Bytes(encodeAtomicLoadStore(
+ regNumberInEncoding[i.rn.realReg()],
+ regNumberInEncoding[i.rd.realReg()],
+ uint32(i.u2),
+ 1,
+ ))
+ case atomicStore:
+ c.Emit4Bytes(encodeAtomicLoadStore(
+ regNumberInEncoding[i.rn.realReg()],
+ regNumberInEncoding[i.rm.realReg()],
+ uint32(i.u2),
+ 0,
+ ))
+ case dmb:
+ c.Emit4Bytes(encodeDMB())
+ default:
+ panic(i.String())
+ }
+}
+
+func encodeMov64(rd, rn uint32, toIsSp, fromIsSp bool) uint32 {
+ if toIsSp || fromIsSp {
+ // This is an alias of ADD (immediate):
+ // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/MOV--to-from-SP---Move-between-register-and-stack-pointer--an-alias-of-ADD--immediate--
+ return encodeAddSubtractImmediate(0b100, 0, 0, rn, rd)
+ } else {
+ // This is an alias of ORR (shifted register):
+ // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/MOV--register---Move--register---an-alias-of-ORR--shifted-register--
+ return encodeLogicalShiftedRegister(0b101, 0, rn, 0, regNumberInEncoding[xzr], rd)
+ }
+}
+
+// encodeSystemRegisterMove encodes as "System register move" in
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en
+//
+// Note that currently we only support read/write of the FPSR.
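+// FPSR corresponds to the system-register operands op0=3, op1=3, CRn=4, CRm=4, op2=1.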
+func encodeSystemRegisterMove(rt uint32, fromSystem bool) uint32 {
+ ret := 0b11010101<<24 | 0b11011<<16 | 0b01000100<<8 | 0b001<<5 | rt
+ if fromSystem {
+ ret |= 0b1 << 21
+ }
+ return ret
+}
+
+// encodeVecRRR encodes as one of the "Advanced SIMD three *" (same/different) groups in
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
+func encodeVecRRR(op vecOp, rd, rn, rm uint32, arr vecArrangement) uint32 {
+ switch op {
+ case vecOpBit:
+ _, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, 0b10 /* always has size 0b10 */, 0b1, q)
+ case vecOpBic:
+ if arr > vecArrangement16B {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ _, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, 0b01 /* always has size 0b01 */, 0b0, q)
+ case vecOpBsl:
+ if arr > vecArrangement16B {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ _, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, 0b01 /* always has size 0b01 */, 0b1, q)
+ case vecOpAnd:
+ if arr > vecArrangement16B {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ _, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, 0b00 /* always has size 0b00 */, 0b0, q)
+ case vecOpOrr:
+ _, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, 0b10 /* always has size 0b10 */, 0b0, q)
+ case vecOpEOR:
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, size, 0b1, q)
+ case vecOpCmeq:
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10001, size, 0b1, q)
+ case vecOpCmgt:
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00110, size, 0b0, q)
+ case vecOpCmhi:
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00110, size, 0b1, q)
+ case vecOpCmge:
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00111, size, 0b0, q)
+ case vecOpCmhs:
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00111, size, 0b1, q)
+ case vecOpFcmeq:
+ var size, q uint32
+ switch arr {
+ case vecArrangement4S:
+ size, q = 0b00, 0b1
+ case vecArrangement2S:
+ size, q = 0b00, 0b0
+ case vecArrangement2D:
+ size, q = 0b01, 0b1
+ default:
+ panic("unsupported arrangement: " + arr.String())
+ }
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11100, size, 0b0, q)
+ case vecOpFcmgt:
+ if arr < vecArrangement2S || arr == vecArrangement1D {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11100, size, 0b1, q)
+ case vecOpFcmge:
+ var size, q uint32
+ switch arr {
+ case vecArrangement4S:
+ size, q = 0b00, 0b1
+ case vecArrangement2S:
+ size, q = 0b00, 0b0
+ case vecArrangement2D:
+ size, q = 0b01, 0b1
+ default:
+ panic("unsupported arrangement: " + arr.String())
+ }
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11100, size, 0b1, q)
+ case vecOpAdd:
+ if arr == vecArrangement1D {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10000, size, 0b0, q)
+ case vecOpSqadd:
+ if arr == vecArrangement1D {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00001, size, 0b0, q)
+ case vecOpUqadd:
+ if arr == vecArrangement1D {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00001, size, 0b1, q)
+ case vecOpAddp:
+ if arr == vecArrangement1D {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10111, size, 0b0, q)
+ case vecOpSqsub:
+ if arr == vecArrangement1D {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00101, size, 0b0, q)
+ case vecOpUqsub:
+ if arr == vecArrangement1D {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00101, size, 0b1, q)
+ case vecOpSub:
+ if arr == vecArrangement1D {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10000, size, 0b1, q)
+ case vecOpFmin:
+ if arr < vecArrangement2S || arr == vecArrangement1D {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11110, size, 0b0, q)
+ case vecOpSmin:
+ if arr > vecArrangement4S {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01101, size, 0b0, q)
+ case vecOpUmin:
+ if arr > vecArrangement4S {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01101, size, 0b1, q)
+ case vecOpFmax:
+ var size, q uint32
+ switch arr {
+ case vecArrangement4S:
+ size, q = 0b00, 0b1
+ case vecArrangement2S:
+ size, q = 0b00, 0b0
+ case vecArrangement2D:
+ size, q = 0b01, 0b1
+ default:
+ panic("unsupported arrangement: " + arr.String())
+ }
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11110, size, 0b0, q)
+ case vecOpFadd:
+ var size, q uint32
+ switch arr {
+ case vecArrangement4S:
+ size, q = 0b00, 0b1
+ case vecArrangement2S:
+ size, q = 0b00, 0b0
+ case vecArrangement2D:
+ size, q = 0b01, 0b1
+ default:
+ panic("unsupported arrangement: " + arr.String())
+ }
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11010, size, 0b0, q)
+ case vecOpFsub:
+ if arr < vecArrangement2S || arr == vecArrangement1D {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11010, size, 0b0, q)
+ case vecOpFmul:
+ var size, q uint32
+ switch arr {
+ case vecArrangement4S:
+ size, q = 0b00, 0b1
+ case vecArrangement2S:
+ size, q = 0b00, 0b0
+ case vecArrangement2D:
+ size, q = 0b01, 0b1
+ default:
+ panic("unsupported arrangement: " + arr.String())
+ }
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11011, size, 0b1, q)
+ case vecOpSqrdmulh:
+ if arr < vecArrangement4H || arr > vecArrangement4S {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10110, size, 0b1, q)
+ case vecOpFdiv:
+ var size, q uint32
+ switch arr {
+ case vecArrangement4S:
+ size, q = 0b00, 0b1
+ case vecArrangement2S:
+ size, q = 0b00, 0b0
+ case vecArrangement2D:
+ size, q = 0b01, 0b1
+ default:
+ panic("unsupported arrangement: " + arr.String())
+ }
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11111, size, 0b1, q)
+ case vecOpSmax:
+ if arr > vecArrangement4S {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01100, size, 0b0, q)
+ case vecOpUmax:
+ if arr > vecArrangement4S {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01100, size, 0b1, q)
+ case vecOpUmaxp:
+ if arr > vecArrangement4S {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10100, size, 0b1, q)
+ case vecOpUrhadd:
+ if arr > vecArrangement4S {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00010, size, 0b1, q)
+ case vecOpMul:
+ if arr > vecArrangement4S {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10011, size, 0b0, q)
+ case vecOpUmlal:
+ if arr > vecArrangement4S {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeDifferent(rd, rn, rm, 0b1000, size, 0b1, q)
+ case vecOpSshl:
+ if arr == vecArrangement1D {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01000, size, 0b0, q)
+ case vecOpUshl:
+ if arr == vecArrangement1D {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01000, size, 0b1, q)
+
+ case vecOpSmull:
+ if arr > vecArrangement4S {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, _ := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeDifferent(rd, rn, rm, 0b1100, size, 0b0, 0b0)
+
+ case vecOpSmull2:
+ if arr > vecArrangement4S {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, _ := arrToSizeQEncoded(arr)
+ return encodeAdvancedSIMDThreeDifferent(rd, rn, rm, 0b1100, size, 0b0, 0b1)
+
+ default:
+ panic("TODO: " + op.String())
+ }
+}
+
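+// arrToSizeQEncoded maps a vector arrangement to the size and Q fields used by the Advanced SIMD
+// encodings: 8B/16B=>00, 4H/8H=>01, 2S/4S=>10, 1D/2D=>11, with Q=1 for the 128-bit arrangements.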
+func arrToSizeQEncoded(arr vecArrangement) (size, q uint32) {
+ switch arr {
+ case vecArrangement16B:
+ q = 0b1
+ fallthrough
+ case vecArrangement8B:
+ size = 0b00
+ case vecArrangement8H:
+ q = 0b1
+ fallthrough
+ case vecArrangement4H:
+ size = 0b01
+ case vecArrangement4S:
+ q = 0b1
+ fallthrough
+ case vecArrangement2S:
+ size = 0b10
+ case vecArrangement2D:
+ q = 0b1
+ fallthrough
+ case vecArrangement1D:
+ size = 0b11
+ default:
+ panic("BUG")
+ }
+ return
+}
+
+// encodeAdvancedSIMDThreeSame encodes as "Advanced SIMD three same" in
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
+func encodeAdvancedSIMDThreeSame(rd, rn, rm, opcode, size, U, Q uint32) uint32 {
+ return Q<<30 | U<<29 | 0b111<<25 | size<<22 | 0b1<<21 | rm<<16 | opcode<<11 | 0b1<<10 | rn<<5 | rd
+}
+
+// encodeAdvancedSIMDThreeDifferent encodes as "Advanced SIMD three different" in
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
+func encodeAdvancedSIMDThreeDifferent(rd, rn, rm, opcode, size, U, Q uint32) uint32 {
+ return Q<<30 | U<<29 | 0b111<<25 | size<<22 | 0b1<<21 | rm<<16 | opcode<<12 | rn<<5 | rd
+}
+
+// encodeFloatDataOneSource encodes as "Floating-point data-processing (1 source)" in
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp
+func encodeFloatDataOneSource(op fpuUniOp, rd, rn uint32, dst64bit bool) uint32 {
+ var opcode, ptype uint32
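+	// ptype selects the precision: 0b00 (the zero value) is single precision, 0b01 is double precision.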
+ switch op {
+ case fpuUniOpCvt32To64:
+ opcode = 0b000101
+ case fpuUniOpCvt64To32:
+ opcode = 0b000100
+ ptype = 0b01
+ case fpuUniOpNeg:
+ opcode = 0b000010
+ if dst64bit {
+ ptype = 0b01
+ }
+ case fpuUniOpSqrt:
+ opcode = 0b000011
+ if dst64bit {
+ ptype = 0b01
+ }
+ case fpuUniOpRoundPlus:
+ opcode = 0b001001
+ if dst64bit {
+ ptype = 0b01
+ }
+ case fpuUniOpRoundMinus:
+ opcode = 0b001010
+ if dst64bit {
+ ptype = 0b01
+ }
+ case fpuUniOpRoundZero:
+ opcode = 0b001011
+ if dst64bit {
+ ptype = 0b01
+ }
+ case fpuUniOpRoundNearest:
+ opcode = 0b001000
+ if dst64bit {
+ ptype = 0b01
+ }
+ case fpuUniOpAbs:
+ opcode = 0b000001
+ if dst64bit {
+ ptype = 0b01
+ }
+ default:
+ panic("BUG")
+ }
+ return 0b1111<<25 | ptype<<22 | 0b1<<21 | opcode<<15 | 0b1<<14 | rn<<5 | rd
+}
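+
+// exampleFsqrtDouble is a hypothetical helper (not part of the encoder itself): with ptype=0b01 (double
+// precision) and opcode=0b000011 as selected above, "fsqrt d0, d1" should encode to 0x1E61C020.
+func exampleFsqrtDouble() uint32 {
+	return encodeFloatDataOneSource(fpuUniOpSqrt, 0, 1, true) // rd=d0, rn=d1, 64-bit
+}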
+
+// encodeCnvBetweenFloatInt encodes as "Conversion between floating-point and integer" in
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
+func encodeCnvBetweenFloatInt(i *instruction) uint32 {
+ rd := regNumberInEncoding[i.rd.realReg()]
+ rn := regNumberInEncoding[i.rn.realReg()]
+
+ var opcode uint32
+ var rmode uint32
+ var ptype uint32
+ var sf uint32
+ switch i.kind {
+ case intToFpu: // Either UCVTF or SCVTF.
+ rmode = 0b00
+
+ signed := i.u1 == 1
+ src64bit := i.u2 == 1
+ dst64bit := i.u3 == 1
+ if signed {
+ opcode = 0b010
+ } else {
+ opcode = 0b011
+ }
+ if src64bit {
+ sf = 0b1
+ }
+ if dst64bit {
+ ptype = 0b01
+ } else {
+ ptype = 0b00
+ }
+ case fpuToInt: // Either FCVTZU or FCVTZS.
+ rmode = 0b11
+
+ signed := i.u1 == 1
+ src64bit := i.u2 == 1
+ dst64bit := i.u3 == 1
+
+ if signed {
+ opcode = 0b000
+ } else {
+ opcode = 0b001
+ }
+ if dst64bit {
+ sf = 0b1
+ }
+ if src64bit {
+ ptype = 0b01
+ } else {
+ ptype = 0b00
+ }
+ }
+ return sf<<31 | 0b1111<<25 | ptype<<22 | 0b1<<21 | rmode<<19 | opcode<<16 | rn<<5 | rd
+}
+
+// encodeAdr encodes a PC-relative ADR instruction.
+// https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/ADR--Form-PC-relative-address-
+func encodeAdr(rd uint32, offset uint32) uint32 {
+ if offset >= 1<<20 {
+ panic("BUG: too large adr instruction")
+ }
+ return offset&0b11<<29 | 0b1<<28 | offset&0b1111111111_1111111100<<3 | rd
+}
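+
+// exampleAdr is a hypothetical helper (not part of the encoder itself): for "adr x1, #16" the offset is
+// split into immlo=0 (bits 30:29) and immhi=4 (bits 23:5), so the result should be 0x10000081.
+func exampleAdr() uint32 {
+	return encodeAdr(1, 16)
+}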
+
+// encodeFpuCSel encodes as "Floating-point conditional select" in
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
+func encodeFpuCSel(rd, rn, rm uint32, c condFlag, _64bit bool) uint32 {
+ var ftype uint32
+ if _64bit {
+ ftype = 0b01 // double precision.
+ }
+ return 0b1111<<25 | ftype<<22 | 0b1<<21 | rm<<16 | uint32(c)<<12 | 0b11<<10 | rn<<5 | rd
+}
+
+// encodeMoveToVec encodes as "Move general-purpose register to a vector element" (represented as `ins`) in
+// https://developer.arm.com/documentation/dui0801/g/A64-SIMD-Vector-Instructions/MOV--vector--from-general-
+// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/MOV--from-general---Move-general-purpose-register-to-a-vector-element--an-alias-of-INS--general--?lang=en
+func encodeMoveToVec(rd, rn uint32, arr vecArrangement, index vecIndex) uint32 {
+ var imm5 uint32
+ switch arr {
+ case vecArrangementB:
+ imm5 |= 0b1
+ imm5 |= uint32(index) << 1
+ if index > 0b1111 {
+ panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 15", index))
+ }
+ case vecArrangementH:
+ imm5 |= 0b10
+ imm5 |= uint32(index) << 2
+ if index > 0b111 {
+ panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 7", index))
+ }
+ case vecArrangementS:
+ imm5 |= 0b100
+ imm5 |= uint32(index) << 3
+ if index > 0b11 {
+ panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 3", index))
+ }
+ case vecArrangementD:
+ imm5 |= 0b1000
+ imm5 |= uint32(index) << 4
+ if index > 0b1 {
+ panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 1", index))
+ }
+ default:
+ panic("Unsupported arrangement " + arr.String())
+ }
+
+ return 0b01001110000<<21 | imm5<<16 | 0b000111<<10 | rn<<5 | rd
+}
+
+// encodeVecMovElement encodes as "Move vector element to another vector element, mov (element)" (represented as `ins`) in
+// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/MOV--element---Move-vector-element-to-another-vector-element--an-alias-of-INS--element--?lang=en
+// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/INS--element---Insert-vector-element-from-another-vector-element-?lang=en
+func encodeVecMovElement(rd, rn uint32, arr vecArrangement, srcIndex, dstIndex uint32) uint32 {
+ var imm4, imm5 uint32
+ switch arr {
+ case vecArrangementB:
+ imm5 |= 0b1
+ imm5 |= srcIndex << 1
+ imm4 = dstIndex
+ if srcIndex > 0b1111 {
+ panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 15", srcIndex))
+ }
+ case vecArrangementH:
+ imm5 |= 0b10
+ imm5 |= srcIndex << 2
+ imm4 = dstIndex << 1
+ if srcIndex > 0b111 {
+ panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 7", srcIndex))
+ }
+ case vecArrangementS:
+ imm5 |= 0b100
+ imm5 |= srcIndex << 3
+ imm4 = dstIndex << 2
+ if srcIndex > 0b11 {
+ panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 3", srcIndex))
+ }
+ case vecArrangementD:
+ imm5 |= 0b1000
+ imm5 |= srcIndex << 4
+ imm4 = dstIndex << 3
+ if srcIndex > 0b1 {
+ panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 1", srcIndex))
+ }
+ default:
+ panic("Unsupported arrangement " + arr.String())
+ }
+
+ return 0b01101110000<<21 | imm5<<16 | imm4<<11 | 0b1<<10 | rn<<5 | rd
+}
+
+// encodeUnconditionalBranchReg encodes as "Unconditional branch (register)" in:
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en
+func encodeUnconditionalBranchReg(rn uint32, link bool) uint32 {
+ var opc uint32
+ if link {
+ opc = 0b0001
+ }
+ return 0b1101011<<25 | opc<<21 | 0b11111<<16 | rn<<5
+}
+
+// encodeMoveFromVec encodes as "Move vector element to a general-purpose register"
+// (represented as `umov`, or `smov` when signed) in
+// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/UMOV--Unsigned-Move-vector-element-to-general-purpose-register-?lang=en
+// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/MOV--to-general---Move-vector-element-to-general-purpose-register--an-alias-of-UMOV-?lang=en
+func encodeMoveFromVec(rd, rn uint32, arr vecArrangement, index vecIndex, signed bool) uint32 {
+ var op, imm4, q, imm5 uint32
+ switch {
+ case arr == vecArrangementB:
+ imm5 |= 0b1
+ imm5 |= uint32(index) << 1
+ if index > 0b1111 {
+ panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 15", index))
+ }
+ case arr == vecArrangementH:
+ imm5 |= 0b10
+ imm5 |= uint32(index) << 2
+ if index > 0b111 {
+ panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 7", index))
+ }
+ case arr == vecArrangementS && signed:
+ q = 0b1
+ fallthrough
+ case arr == vecArrangementS:
+ imm5 |= 0b100
+ imm5 |= uint32(index) << 3
+ if index > 0b11 {
+ panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 3", index))
+ }
+ case arr == vecArrangementD && !signed:
+ imm5 |= 0b1000
+ imm5 |= uint32(index) << 4
+ q = 0b1
+ if index > 0b1 {
+ panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 1", index))
+ }
+ default:
+ panic("Unsupported arrangement " + arr.String())
+ }
+ if signed {
+ op, imm4 = 0, 0b0101
+ } else {
+ op, imm4 = 0, 0b0111
+ }
+ return op<<29 | 0b01110000<<21 | q<<30 | imm5<<16 | imm4<<11 | 1<<10 | rn<<5 | rd
+}
+
+// encodeVecDup encodes as "Duplicate general-purpose register to vector" DUP (general)
+// (represented as `dup`)
+// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/DUP--general---Duplicate-general-purpose-register-to-vector-?lang=en
+func encodeVecDup(rd, rn uint32, arr vecArrangement) uint32 {
+ var q, imm5 uint32
+ switch arr {
+ case vecArrangement8B:
+ q, imm5 = 0b0, 0b1
+ case vecArrangement16B:
+ q, imm5 = 0b1, 0b1
+ case vecArrangement4H:
+ q, imm5 = 0b0, 0b10
+ case vecArrangement8H:
+ q, imm5 = 0b1, 0b10
+ case vecArrangement2S:
+ q, imm5 = 0b0, 0b100
+ case vecArrangement4S:
+ q, imm5 = 0b1, 0b100
+ case vecArrangement2D:
+ q, imm5 = 0b1, 0b1000
+ default:
+ panic("Unsupported arrangement " + arr.String())
+ }
+ return q<<30 | 0b001110000<<21 | imm5<<16 | 0b000011<<10 | rn<<5 | rd
+}
+
+// encodeVecDupElement encodes as "Duplicate vector element to vector or scalar" DUP (element).
+// (represented as `dup`)
+// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/DUP--element---Duplicate-vector-element-to-vector-or-scalar-
+func encodeVecDupElement(rd, rn uint32, arr vecArrangement, srcIndex vecIndex) uint32 {
+ var q, imm5 uint32
+ q = 0b1
+ switch arr {
+ case vecArrangementB:
+ imm5 |= 0b1
+ imm5 |= uint32(srcIndex) << 1
+ case vecArrangementH:
+ imm5 |= 0b10
+ imm5 |= uint32(srcIndex) << 2
+ case vecArrangementS:
+ imm5 |= 0b100
+ imm5 |= uint32(srcIndex) << 3
+ case vecArrangementD:
+ imm5 |= 0b1000
+ imm5 |= uint32(srcIndex) << 4
+ default:
+		panic("unsupported arrangement: " + arr.String())
+ }
+
+ return q<<30 | 0b001110000<<21 | imm5<<16 | 0b1<<10 | rn<<5 | rd
+}
+
+// encodeVecExtract encodes as "Advanced SIMD extract."
+// Currently only `ext` is defined.
+// https://developer.arm.com/documentation/ddi0602/2023-06/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp
+// https://developer.arm.com/documentation/ddi0602/2023-06/SIMD-FP-Instructions/EXT--Extract-vector-from-pair-of-vectors-?lang=en
+func encodeVecExtract(rd, rn, rm uint32, arr vecArrangement, index uint32) uint32 {
+ var q, imm4 uint32
+ switch arr {
+ case vecArrangement8B:
+ q, imm4 = 0, 0b0111&uint32(index)
+ case vecArrangement16B:
+ q, imm4 = 1, 0b1111&uint32(index)
+ default:
+ panic("Unsupported arrangement " + arr.String())
+ }
+ return q<<30 | 0b101110000<<21 | rm<<16 | imm4<<11 | rn<<5 | rd
+}
+
+// encodeVecPermute encodes as "Advanced SIMD permute."
+// https://developer.arm.com/documentation/ddi0602/2023-06/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp
+func encodeVecPermute(op vecOp, rd, rn, rm uint32, arr vecArrangement) uint32 {
+ var q, size, opcode uint32
+ switch op {
+ case vecOpZip1:
+ opcode = 0b011
+ if arr == vecArrangement1D {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q = arrToSizeQEncoded(arr)
+ default:
+ panic("TODO: " + op.String())
+ }
+ return q<<30 | 0b001110<<24 | size<<22 | rm<<16 | opcode<<12 | 0b10<<10 | rn<<5 | rd
+}
+
+// encodeConditionalSelect encodes as "Conditional select" in
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en#condsel
+func encodeConditionalSelect(kind instructionKind, rd, rn, rm uint32, c condFlag, _64bit bool) uint32 {
+ if kind != cSel {
+ panic("TODO: support other conditional select")
+ }
+
+ ret := 0b110101<<23 | rm<<16 | uint32(c)<<12 | rn<<5 | rd
+ if _64bit {
+ ret |= 0b1 << 31
+ }
+ return ret
+}
+
+const dummyInstruction uint32 = 0x14000000 // "b 0"
+
+// encodeLoadFpuConst32 encodes the following three instructions:
+//
+// ldr s8, #8 ;; literal load of data.f32
+// b 8 ;; skip the data
+// data.f32 xxxxxxx
+func encodeLoadFpuConst32(c backend.Compiler, rd uint32, rawF32 uint64) {
+ c.Emit4Bytes(
+ // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/LDR--literal--SIMD-FP---Load-SIMD-FP-Register--PC-relative-literal--?lang=en
+ 0b111<<26 | (0x8/4)<<5 | rd,
+ )
+ c.Emit4Bytes(encodeUnconditionalBranch(false, 8)) // b 8
+ if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable {
+ // Inlined data.f32 cannot be disassembled, so we add a dummy instruction here.
+ c.Emit4Bytes(dummyInstruction)
+ } else {
+ c.Emit4Bytes(uint32(rawF32)) // data.f32 xxxxxxx
+ }
+}
+
+// encodeLoadFpuConst64 encodes the following three instructions:
+//
+// ldr d8, #8 ;; literal load of data.f64
+// b 12 ;; skip the data
+// data.f64 xxxxxxx
+func encodeLoadFpuConst64(c backend.Compiler, rd uint32, rawF64 uint64) {
+ c.Emit4Bytes(
+ // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/LDR--literal--SIMD-FP---Load-SIMD-FP-Register--PC-relative-literal--?lang=en
+ 0b1<<30 | 0b111<<26 | (0x8/4)<<5 | rd,
+ )
+ c.Emit4Bytes(encodeUnconditionalBranch(false, 12)) // b 12
+ if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable {
+ // Inlined data.f64 cannot be disassembled, so we add dummy instructions here.
+ c.Emit4Bytes(dummyInstruction)
+ c.Emit4Bytes(dummyInstruction)
+ } else {
+ // data.f64 xxxxxxx
+ c.Emit4Bytes(uint32(rawF64))
+ c.Emit4Bytes(uint32(rawF64 >> 32))
+ }
+}
+
+// encodeLoadFpuConst128 encodes the following three instructions:
+//
+//	ldr q8, #8 ;; literal load of data.v128
+// b 20 ;; skip the data
+// data.v128 xxxxxxx
+func encodeLoadFpuConst128(c backend.Compiler, rd uint32, lo, hi uint64) {
+ c.Emit4Bytes(
+ // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/LDR--literal--SIMD-FP---Load-SIMD-FP-Register--PC-relative-literal--?lang=en
+ 0b1<<31 | 0b111<<26 | (0x8/4)<<5 | rd,
+ )
+ c.Emit4Bytes(encodeUnconditionalBranch(false, 20)) // b 20
+ if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable {
+ // Inlined data.v128 cannot be disassembled, so we add dummy instructions here.
+ c.Emit4Bytes(dummyInstruction)
+ c.Emit4Bytes(dummyInstruction)
+ c.Emit4Bytes(dummyInstruction)
+ c.Emit4Bytes(dummyInstruction)
+ } else {
+ // data.v128 xxxxxxx
+ c.Emit4Bytes(uint32(lo))
+ c.Emit4Bytes(uint32(lo >> 32))
+ c.Emit4Bytes(uint32(hi))
+ c.Emit4Bytes(uint32(hi >> 32))
+ }
+}
+
+// encodeAluRRRR encodes as Data-processing (3 source) in
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en
+func encodeAluRRRR(op aluOp, rd, rn, rm, ra, _64bit uint32) uint32 {
+ var oO, op31 uint32
+ switch op {
+ case aluOpMAdd:
+ op31, oO = 0b000, 0b0
+ case aluOpMSub:
+ op31, oO = 0b000, 0b1
+ default:
+ panic("TODO/BUG")
+ }
+ return _64bit<<31 | 0b11011<<24 | op31<<21 | rm<<16 | oO<<15 | ra<<10 | rn<<5 | rd
+}
+
+// encodeBitRR encodes as Data-processing (1 source) in
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en
+func encodeBitRR(op bitOp, rd, rn, _64bit uint32) uint32 {
+ var opcode2, opcode uint32
+ switch op {
+ case bitOpRbit:
+ opcode2, opcode = 0b00000, 0b000000
+ case bitOpClz:
+ opcode2, opcode = 0b00000, 0b000100
+ default:
+ panic("TODO/BUG")
+ }
+ return _64bit<<31 | 0b1_0_11010110<<21 | opcode2<<15 | opcode<<10 | rn<<5 | rd
+}
+
+func encodeAsMov32(rn, rd uint32) uint32 {
+ // This is an alias of ORR (shifted register):
+ // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/MOV--register---Move--register---an-alias-of-ORR--shifted-register--
+ return encodeLogicalShiftedRegister(0b001, 0, rn, 0, regNumberInEncoding[xzr], rd)
+}
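+
+// exampleMov32 is a hypothetical helper (not part of the encoder itself): assuming regNumberInEncoding[xzr] == 31,
+// "mov w0, w1" (an alias of "orr w0, wzr, w1") should encode to 0x2A0103E0.
+func exampleMov32() uint32 {
+	return encodeAsMov32(1, 0) // rn=w1 (source), rd=w0 (destination)
+}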
+
+// encodeExtend encodes extension instructions.
+func encodeExtend(signed bool, from, to byte, rd, rn uint32) uint32 {
+	// UXTB: https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/UXTB--Unsigned-Extend-Byte--an-alias-of-UBFM-?lang=en
+	// UXTH: https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/UXTH--Unsigned-Extend-Halfword--an-alias-of-UBFM-?lang=en
+	// SXTB: https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/SXTB--Signed-Extend-Byte--an-alias-of-SBFM-
+	// SXTH: https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/SXTH--Sign-Extend-Halfword--an-alias-of-SBFM-
+	// SXTW: https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/SXTW--Sign-Extend-Word--an-alias-of-SBFM-
+ var _31to10 uint32
+ switch {
+ case !signed && from == 8 && to == 32:
+ // 32-bit UXTB
+ _31to10 = 0b0101001100000000000111
+ case !signed && from == 16 && to == 32:
+ // 32-bit UXTH
+ _31to10 = 0b0101001100000000001111
+ case !signed && from == 8 && to == 64:
+ // 64-bit UXTB
+ _31to10 = 0b0101001100000000000111
+ case !signed && from == 16 && to == 64:
+ // 64-bit UXTH
+ _31to10 = 0b0101001100000000001111
+ case !signed && from == 32 && to == 64:
+ return encodeAsMov32(rn, rd)
+ case signed && from == 8 && to == 32:
+ // 32-bit SXTB
+ _31to10 = 0b0001001100000000000111
+ case signed && from == 16 && to == 32:
+ // 32-bit SXTH
+ _31to10 = 0b0001001100000000001111
+ case signed && from == 8 && to == 64:
+ // 64-bit SXTB
+ _31to10 = 0b1001001101000000000111
+ case signed && from == 16 && to == 64:
+ // 64-bit SXTH
+ _31to10 = 0b1001001101000000001111
+ case signed && from == 32 && to == 64:
+ // SXTW
+ _31to10 = 0b1001001101000000011111
+ default:
+ panic("BUG")
+ }
+ return _31to10<<10 | rn<<5 | rd
+}
+
+func encodeLoadOrStore(kind instructionKind, rt uint32, amode addressMode) uint32 {
+ var _22to31 uint32
+ var bits int64
+ switch kind {
+ case uLoad8:
+ _22to31 = 0b0011100001
+ bits = 8
+ case sLoad8:
+ _22to31 = 0b0011100010
+ bits = 8
+ case uLoad16:
+ _22to31 = 0b0111100001
+ bits = 16
+ case sLoad16:
+ _22to31 = 0b0111100010
+ bits = 16
+ case uLoad32:
+ _22to31 = 0b1011100001
+ bits = 32
+ case sLoad32:
+ _22to31 = 0b1011100010
+ bits = 32
+ case uLoad64:
+ _22to31 = 0b1111100001
+ bits = 64
+ case fpuLoad32:
+ _22to31 = 0b1011110001
+ bits = 32
+ case fpuLoad64:
+ _22to31 = 0b1111110001
+ bits = 64
+ case fpuLoad128:
+ _22to31 = 0b0011110011
+ bits = 128
+ case store8:
+ _22to31 = 0b0011100000
+ bits = 8
+ case store16:
+ _22to31 = 0b0111100000
+ bits = 16
+ case store32:
+ _22to31 = 0b1011100000
+ bits = 32
+ case store64:
+ _22to31 = 0b1111100000
+ bits = 64
+ case fpuStore32:
+ _22to31 = 0b1011110000
+ bits = 32
+ case fpuStore64:
+ _22to31 = 0b1111110000
+ bits = 64
+ case fpuStore128:
+ _22to31 = 0b0011110010
+ bits = 128
+ default:
+ panic("BUG")
+ }
+
+ switch amode.kind {
+ case addressModeKindRegScaledExtended:
+ return encodeLoadOrStoreExtended(_22to31,
+ regNumberInEncoding[amode.rn.RealReg()],
+ regNumberInEncoding[amode.rm.RealReg()],
+ rt, true, amode.extOp)
+ case addressModeKindRegScaled:
+ return encodeLoadOrStoreExtended(_22to31,
+ regNumberInEncoding[amode.rn.RealReg()], regNumberInEncoding[amode.rm.RealReg()],
+ rt, true, extendOpNone)
+ case addressModeKindRegExtended:
+ return encodeLoadOrStoreExtended(_22to31,
+ regNumberInEncoding[amode.rn.RealReg()], regNumberInEncoding[amode.rm.RealReg()],
+ rt, false, amode.extOp)
+ case addressModeKindRegReg:
+ return encodeLoadOrStoreExtended(_22to31,
+ regNumberInEncoding[amode.rn.RealReg()], regNumberInEncoding[amode.rm.RealReg()],
+ rt, false, extendOpNone)
+ case addressModeKindRegSignedImm9:
+ // e.g. https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDUR--Load-Register--unscaled--
+ return encodeLoadOrStoreSIMM9(_22to31, 0b00 /* unscaled */, regNumberInEncoding[amode.rn.RealReg()], rt, amode.imm)
+ case addressModeKindPostIndex:
+ return encodeLoadOrStoreSIMM9(_22to31, 0b01 /* post index */, regNumberInEncoding[amode.rn.RealReg()], rt, amode.imm)
+ case addressModeKindPreIndex:
+ return encodeLoadOrStoreSIMM9(_22to31, 0b11 /* pre index */, regNumberInEncoding[amode.rn.RealReg()], rt, amode.imm)
+ case addressModeKindRegUnsignedImm12:
+ // "unsigned immediate" in https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Loads-and-Stores?lang=en
+ rn := regNumberInEncoding[amode.rn.RealReg()]
+ imm := amode.imm
+ div := bits / 8
+ if imm != 0 && !offsetFitsInAddressModeKindRegUnsignedImm12(byte(bits), imm) {
+ panic("BUG")
+ }
+ imm /= div
+ return _22to31<<22 | 0b1<<24 | uint32(imm&0b111111111111)<<10 | rn<<5 | rt
+ default:
+ panic("BUG")
+ }
+}
+
+// encodeVecLoad1R encodes as Load one single-element structure and Replicate to all lanes (of one register) in
+// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/LD1R--Load-one-single-element-structure-and-Replicate-to-all-lanes--of-one-register--?lang=en#sa_imm
+func encodeVecLoad1R(rt, rn uint32, arr vecArrangement) uint32 {
+ size, q := arrToSizeQEncoded(arr)
+ return q<<30 | 0b001101010000001100<<12 | size<<10 | rn<<5 | rt
+}
+
+// encodeAluBitmaskImmediate encodes as Logical (immediate) in
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en
+func encodeAluBitmaskImmediate(op aluOp, rd, rn uint32, imm uint64, _64bit bool) uint32 {
+ var _31to23 uint32
+ switch op {
+ case aluOpAnd:
+ _31to23 = 0b00_100100
+ case aluOpOrr:
+ _31to23 = 0b01_100100
+ case aluOpEor:
+ _31to23 = 0b10_100100
+ case aluOpAnds:
+ _31to23 = 0b11_100100
+ default:
+ panic("BUG")
+ }
+ if _64bit {
+ _31to23 |= 0b1 << 8
+ }
+ immr, imms, N := bitmaskImmediate(imm, _64bit)
+ return _31to23<<23 | uint32(N)<<22 | uint32(immr)<<16 | uint32(imms)<<10 | rn<<5 | rd
+}
+
+func bitmaskImmediate(c uint64, is64bit bool) (immr, imms, N byte) {
+ var size uint32
+ switch {
+ case c != c>>32|c<<32:
+ size = 64
+ case c != c>>16|c<<48:
+ size = 32
+ c = uint64(int32(c))
+ case c != c>>8|c<<56:
+ size = 16
+ c = uint64(int16(c))
+ case c != c>>4|c<<60:
+ size = 8
+ c = uint64(int8(c))
+ case c != c>>2|c<<62:
+ size = 4
+ c = uint64(int64(c<<60) >> 60)
+ default:
+ size = 2
+ c = uint64(int64(c<<62) >> 62)
+ }
+
+ neg := false
+ if int64(c) < 0 {
+ c = ^c
+ neg = true
+ }
+
+ onesSize, nonZeroPos := getOnesSequenceSize(c)
+ if neg {
+ nonZeroPos = onesSize + nonZeroPos
+ onesSize = size - onesSize
+ }
+
+ var mode byte = 32
+ if is64bit && size == 64 {
+ N, mode = 0b1, 64
+ }
+
+ immr = byte((size - nonZeroPos) & (size - 1) & uint32(mode-1))
+ imms = byte((onesSize - 1) | 63&^(size<<1-1))
+ return
+}
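+
+// exampleAndBitmaskImm is a hypothetical helper (not part of the encoder itself): for "and x0, x1, #0xff",
+// bitmaskImmediate(0xff, true) yields N=1, immr=0, imms=0b000111, so the full word should be 0x92401C20.
+func exampleAndBitmaskImm() uint32 {
+	return encodeAluBitmaskImmediate(aluOpAnd, 0, 1, 0xff, true)
+}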
+
+func getOnesSequenceSize(x uint64) (size, nonZeroPos uint32) {
+ // Take 0b00111000 for example:
+	y := getLowestBit(x)               // = 0b00001000
+	nonZeroPos = setBitPos(y)          // = 3
+	size = setBitPos(x+y) - nonZeroPos // = setBitPos(0b01000000) - 3 = 6 - 3 = 3
+ return
+}
+
+func setBitPos(x uint64) (ret uint32) {
+ for ; ; ret++ {
+ if x == 0b1 {
+ break
+ }
+ x = x >> 1
+ }
+ return
+}
+
+// encodeLoadOrStoreExtended encodes store/load instruction as "extended register offset" in Load/store register (register offset):
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Loads-and-Stores?lang=en
+func encodeLoadOrStoreExtended(_22to32 uint32, rn, rm, rt uint32, scaled bool, extOp extendOp) uint32 {
+ var option uint32
+ switch extOp {
+ case extendOpUXTW:
+ option = 0b010
+ case extendOpSXTW:
+ option = 0b110
+ case extendOpNone:
+ option = 0b111
+ default:
+ panic("BUG")
+ }
+ var s uint32
+ if scaled {
+ s = 0b1
+ }
+ return _22to32<<22 | 0b1<<21 | rm<<16 | option<<13 | s<<12 | 0b10<<10 | rn<<5 | rt
+}
+
+// encodeLoadOrStoreSIMM9 encodes store/load instruction as one of post-index, pre-index or unscaled immediate as in
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Loads-and-Stores?lang=en
+func encodeLoadOrStoreSIMM9(_22to32, _1011 uint32, rn, rt uint32, imm9 int64) uint32 {
+ return _22to32<<22 | (uint32(imm9)&0b111111111)<<12 | _1011<<10 | rn<<5 | rt
+}
+
+// encodeFpuRRR encodes as single or double precision (depending on `_64bit`) of Floating-point data-processing (2 source) in
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
+func encodeFpuRRR(op fpuBinOp, rd, rn, rm uint32, _64bit bool) (ret uint32) {
+ // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADD--vector--Add-vectors--scalar--floating-point-and-integer-
+ var opcode uint32
+ switch op {
+ case fpuBinOpAdd:
+ opcode = 0b0010
+ case fpuBinOpSub:
+ opcode = 0b0011
+ case fpuBinOpMul:
+ opcode = 0b0000
+ case fpuBinOpDiv:
+ opcode = 0b0001
+ case fpuBinOpMax:
+ opcode = 0b0100
+ case fpuBinOpMin:
+ opcode = 0b0101
+ default:
+ panic("BUG")
+ }
+ var ptype uint32
+ if _64bit {
+ ptype = 0b01
+ }
+ return 0b1111<<25 | ptype<<22 | 0b1<<21 | rm<<16 | opcode<<12 | 0b1<<11 | rn<<5 | rd
+}
+
+// encodeAluRRImm12 encodes as Add/subtract (immediate) in
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en
+func encodeAluRRImm12(op aluOp, rd, rn uint32, imm12 uint16, shiftBit byte, _64bit bool) uint32 {
+ var _31to24 uint32
+ switch op {
+ case aluOpAdd:
+ _31to24 = 0b00_10001
+ case aluOpAddS:
+ _31to24 = 0b01_10001
+ case aluOpSub:
+ _31to24 = 0b10_10001
+ case aluOpSubS:
+ _31to24 = 0b11_10001
+ default:
+ panic("BUG")
+ }
+ if _64bit {
+ _31to24 |= 0b1 << 7
+ }
+ return _31to24<<24 | uint32(shiftBit)<<22 | uint32(imm12&0b111111111111)<<10 | rn<<5 | rd
+}
+
+// encodeAluRRRShift encodes as Data Processing (shifted register), depending on aluOp.
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_shift
+func encodeAluRRRShift(op aluOp, rd, rn, rm, amount uint32, shiftOp shiftOp, _64bit bool) uint32 {
+ var _31to24 uint32
+ var opc, n uint32
+ switch op {
+ case aluOpAdd:
+ _31to24 = 0b00001011
+ case aluOpAddS:
+ _31to24 = 0b00101011
+ case aluOpSub:
+ _31to24 = 0b01001011
+ case aluOpSubS:
+ _31to24 = 0b01101011
+ case aluOpAnd, aluOpOrr, aluOpEor, aluOpAnds:
+ // "Logical (shifted register)".
+ switch op {
+ case aluOpAnd:
+ // all zeros
+ case aluOpOrr:
+ opc = 0b01
+ case aluOpEor:
+ opc = 0b10
+ case aluOpAnds:
+ opc = 0b11
+ }
+ _31to24 = 0b000_01010
+ default:
+ panic(op.String())
+ }
+
+ if _64bit {
+ _31to24 |= 0b1 << 7
+ }
+
+ var shift uint32
+ switch shiftOp {
+ case shiftOpLSL:
+ shift = 0b00
+ case shiftOpLSR:
+ shift = 0b01
+ case shiftOpASR:
+ shift = 0b10
+ default:
+ panic(shiftOp.String())
+ }
+ return opc<<29 | n<<21 | _31to24<<24 | shift<<22 | rm<<16 | (amount << 10) | (rn << 5) | rd
+}
+
+// "Add/subtract (extended register)" in
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_ext
+func encodeAluRRRExtend(ao aluOp, rd, rn, rm uint32, extOp extendOp, to byte) uint32 {
+ var s, op uint32
+ switch ao {
+ case aluOpAdd:
+ op = 0b0
+ case aluOpAddS:
+ op, s = 0b0, 0b1
+ case aluOpSub:
+ op = 0b1
+ case aluOpSubS:
+ op, s = 0b1, 0b1
+ default:
+ panic("BUG: extended register operand can be used only for add/sub")
+ }
+
+ var sf uint32
+ if to == 64 {
+ sf = 0b1
+ }
+
+ var option uint32
+ switch extOp {
+ case extendOpUXTB:
+ option = 0b000
+ case extendOpUXTH:
+ option = 0b001
+ case extendOpUXTW:
+ option = 0b010
+ case extendOpSXTB:
+ option = 0b100
+ case extendOpSXTH:
+ option = 0b101
+ case extendOpSXTW:
+ option = 0b110
+ case extendOpSXTX, extendOpUXTX:
+ panic(fmt.Sprintf("%s is essentially noop, and should be handled much earlier than encoding", extOp.String()))
+ }
+ return sf<<31 | op<<30 | s<<29 | 0b1011001<<21 | rm<<16 | option<<13 | rn<<5 | rd
+}
+
+// encodeAluRRR encodes as Data Processing (register), depending on aluOp.
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en
+func encodeAluRRR(op aluOp, rd, rn, rm uint32, _64bit, isRnSp bool) uint32 {
+ var _31to21, _15to10 uint32
+ switch op {
+ case aluOpAdd:
+ if isRnSp {
+ // "Extended register" with UXTW.
+ _31to21 = 0b00001011_001
+ _15to10 = 0b011000
+ } else {
+ // "Shifted register" with shift = 0
+ _31to21 = 0b00001011_000
+ }
+ case aluOpAddS:
+ if isRnSp {
+ panic("TODO")
+ }
+ // "Shifted register" with shift = 0
+ _31to21 = 0b00101011_000
+ case aluOpSub:
+ if isRnSp {
+ // "Extended register" with UXTW.
+ _31to21 = 0b01001011_001
+ _15to10 = 0b011000
+ } else {
+ // "Shifted register" with shift = 0
+ _31to21 = 0b01001011_000
+ }
+ case aluOpSubS:
+ if isRnSp {
+ panic("TODO")
+ }
+ // "Shifted register" with shift = 0
+ _31to21 = 0b01101011_000
+ case aluOpAnd, aluOpOrr, aluOpOrn, aluOpEor, aluOpAnds:
+ // "Logical (shifted register)".
+ var opc, n uint32
+ switch op {
+ case aluOpAnd:
+ // all zeros
+ case aluOpOrr:
+ opc = 0b01
+ case aluOpOrn:
+ opc = 0b01
+ n = 1
+ case aluOpEor:
+ opc = 0b10
+ case aluOpAnds:
+ opc = 0b11
+ }
+ _31to21 = 0b000_01010_000 | opc<<8 | n
+ case aluOpLsl, aluOpAsr, aluOpLsr, aluOpRotR:
+ // "Data-processing (2 source)".
+ _31to21 = 0b00011010_110
+ switch op {
+ case aluOpLsl:
+ _15to10 = 0b001000
+ case aluOpLsr:
+ _15to10 = 0b001001
+ case aluOpAsr:
+ _15to10 = 0b001010
+ case aluOpRotR:
+ _15to10 = 0b001011
+ }
+ case aluOpSDiv:
+ // "Data-processing (2 source)".
+ _31to21 = 0b11010110
+ _15to10 = 0b000011
+ case aluOpUDiv:
+ // "Data-processing (2 source)".
+ _31to21 = 0b11010110
+ _15to10 = 0b000010
+ default:
+ panic(op.String())
+ }
+ if _64bit {
+ _31to21 |= 0b1 << 10
+ }
+ return _31to21<<21 | rm<<16 | (_15to10 << 10) | (rn << 5) | rd
+}
+
+// encodeLogicalShiftedRegister encodes as Logical (shifted register) in
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en
+func encodeLogicalShiftedRegister(sf_opc uint32, shift_N uint32, rm uint32, imm6 uint32, rn, rd uint32) (ret uint32) {
+ ret = sf_opc << 29
+ ret |= 0b01010 << 24
+ ret |= shift_N << 21
+ ret |= rm << 16
+ ret |= imm6 << 10
+ ret |= rn << 5
+ ret |= rd
+ return
+}
+
+// encodeAddSubtractImmediate encodes as Add/subtract (immediate) in
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en
+func encodeAddSubtractImmediate(sf_op_s uint32, sh uint32, imm12 uint32, rn, rd uint32) (ret uint32) {
+ ret = sf_op_s << 29
+ ret |= 0b100010 << 23
+ ret |= sh << 22
+ ret |= imm12 << 10
+ ret |= rn << 5
+ ret |= rd
+ return
+}
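+
+// exampleAddImm is a hypothetical helper (not part of the encoder itself): "add x0, x1, #4" uses
+// sf_op_s=0b100 (64-bit ADD), sh=0 and imm12=4, which should yield 0x91001020.
+func exampleAddImm() uint32 {
+	return encodeAddSubtractImmediate(0b100, 0, 4, 1, 0)
+}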
+
+// encodePreOrPostIndexLoadStorePair64 encodes as Load/store pair (pre/post-indexed) in
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers-
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STP--Store-Pair-of-Registers-
+func encodePreOrPostIndexLoadStorePair64(pre bool, load bool, rn, rt, rt2 uint32, imm7 int64) (ret uint32) {
+ if imm7%8 != 0 {
+ panic("imm7 for pair load/store must be a multiple of 8")
+ }
+ imm7 /= 8
+ ret = rt
+ ret |= rn << 5
+ ret |= rt2 << 10
+ ret |= (uint32(imm7) & 0b1111111) << 15
+ if load {
+ ret |= 0b1 << 22
+ }
+ ret |= 0b101010001 << 23
+ if pre {
+ ret |= 0b1 << 24
+ }
+ return
+}
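+
+// exampleStpPreIndex is a hypothetical helper (not part of the encoder itself): assuming the usual
+// register numbers fp=29, lr=30, sp=31, the common prologue "stp x29, x30, [sp, #-16]!" (pre-indexed
+// store pair) should encode to 0xA9BF7BFD.
+func exampleStpPreIndex() uint32 {
+	return encodePreOrPostIndexLoadStorePair64(true, false, 31, 29, 30, -16)
+}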
+
+// encodeUnconditionalBranch encodes as B or BL instructions:
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/B--Branch-
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/BL--Branch-with-Link-
+func encodeUnconditionalBranch(link bool, imm26 int64) (ret uint32) {
+ if imm26%4 != 0 {
+ panic("imm26 for branch must be a multiple of 4")
+ }
+ imm26 /= 4
+ ret = uint32(imm26 & 0b11_11111111_11111111_11111111)
+ ret |= 0b101 << 26
+ if link {
+ ret |= 0b1 << 31
+ }
+ return
+}
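+
+// exampleBranch is a hypothetical helper (not part of the encoder itself): "b #8" has imm26 = 8/4 = 2,
+// so the word should be 0x14000002, consistent with dummyInstruction ("b 0") being 0x14000000.
+func exampleBranch() uint32 {
+	return encodeUnconditionalBranch(false, 8)
+}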
+
+// encodeCBZCBNZ encodes as either CBZ or CBNZ:
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CBZ--Compare-and-Branch-on-Zero-
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CBNZ--Compare-and-Branch-on-Nonzero-
+func encodeCBZCBNZ(rt uint32, nz bool, imm19 uint32, _64bit bool) (ret uint32) {
+ ret = rt
+ ret |= imm19 << 5
+ if nz {
+ ret |= 1 << 24
+ }
+ ret |= 0b11010 << 25
+ if _64bit {
+ ret |= 1 << 31
+ }
+ return
+}
+
+// encodeMoveWideImmediate encodes as either MOVZ, MOVN or MOVK, as Move wide (immediate) in
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en
+//
+// "shift" must have been divided by 16 at this point.
+func encodeMoveWideImmediate(opc uint32, rd uint32, imm, shift, _64bit uint64) (ret uint32) {
+ ret = rd
+ ret |= uint32(imm&0xffff) << 5
+ ret |= (uint32(shift)) << 21
+ ret |= 0b100101 << 23
+ ret |= opc << 29
+ ret |= uint32(_64bit) << 31
+ return
+}
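+
+// exampleMovz is a hypothetical helper (not part of the encoder itself): assuming opc=0b10 selects MOVZ,
+// "movz x0, #1" (i.e. "mov x0, #1") should encode to 0xD2800020.
+func exampleMovz() uint32 {
+	return encodeMoveWideImmediate(0b10, 0, 1, 0, 1)
+}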
+
+// encodeAluRRImm encodes as "Bitfield" in
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en#log_imm
+func encodeAluRRImm(op aluOp, rd, rn, amount, _64bit uint32) uint32 {
+ var opc uint32
+ var immr, imms uint32
+ switch op {
+ case aluOpLsl:
+ // LSL (immediate) is an alias for UBFM.
+ // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/UBFM--Unsigned-Bitfield-Move-?lang=en
+ opc = 0b10
+ if amount == 0 {
+			// amount == 0 could be encoded as a plain move (effectively a NOP), but for consistency we emit lsr xn, xm, #0.
+ immr = 0
+ if _64bit == 1 {
+ imms = 0b111111
+ } else {
+ imms = 0b11111
+ }
+ } else {
+ if _64bit == 1 {
+ immr = 64 - amount
+ } else {
+ immr = (32 - amount) & 0b11111
+ }
+ imms = immr - 1
+ }
+ case aluOpLsr:
+ // LSR (immediate) is an alias for UBFM.
+ // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LSR--immediate---Logical-Shift-Right--immediate---an-alias-of-UBFM-?lang=en
+ opc = 0b10
+ imms, immr = 0b011111|_64bit<<5, amount
+ case aluOpAsr:
+ // ASR (immediate) is an alias for SBFM.
+ // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/SBFM--Signed-Bitfield-Move-?lang=en
+ opc = 0b00
+ imms, immr = 0b011111|_64bit<<5, amount
+ default:
+ panic(op.String())
+ }
+ return _64bit<<31 | opc<<29 | 0b100110<<23 | _64bit<<22 | immr<<16 | imms<<10 | rn<<5 | rd
+}
+
+// encodeVecLanes encodes as Data Processing (Advanced SIMD across lanes) depending on vecOp in
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
+func encodeVecLanes(op vecOp, rd uint32, rn uint32, arr vecArrangement) uint32 {
+ var u, q, size, opcode uint32
+ switch arr {
+ case vecArrangement8B:
+ q, size = 0b0, 0b00
+ case vecArrangement16B:
+ q, size = 0b1, 0b00
+ case vecArrangement4H:
+ q, size = 0, 0b01
+ case vecArrangement8H:
+ q, size = 1, 0b01
+ case vecArrangement4S:
+ q, size = 1, 0b10
+ default:
+ panic("unsupported arrangement: " + arr.String())
+ }
+ switch op {
+ case vecOpUaddlv:
+ u, opcode = 1, 0b00011
+ case vecOpUminv:
+ u, opcode = 1, 0b11010
+ case vecOpAddv:
+ u, opcode = 0, 0b11011
+ default:
+ panic("unsupported or illegal vecOp: " + op.String())
+ }
+ return q<<30 | u<<29 | 0b1110<<24 | size<<22 | 0b11000<<17 | opcode<<12 | 0b10<<10 | rn<<5 | rd
+}
+
+// encodeVecShiftImm encodes as Data Processing (Advanced SIMD shift by immediate) depending on vecOp in
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
+func encodeVecShiftImm(op vecOp, rd uint32, rn, amount uint32, arr vecArrangement) uint32 {
+ var u, q, immh, immb, opcode uint32
+ switch op {
+ case vecOpSshll:
+ u, opcode = 0b0, 0b10100
+ case vecOpUshll:
+ u, opcode = 0b1, 0b10100
+ case vecOpSshr:
+ u, opcode = 0, 0b00000
+ default:
+ panic("unsupported or illegal vecOp: " + op.String())
+ }
+ switch arr {
+ case vecArrangement16B:
+ q = 0b1
+ fallthrough
+ case vecArrangement8B:
+ immh = 0b0001
+ immb = 8 - uint32(amount&0b111)
+ case vecArrangement8H:
+ q = 0b1
+ fallthrough
+ case vecArrangement4H:
+ v := 16 - uint32(amount&0b1111)
+ immb = v & 0b111
+ immh = 0b0010 | (v >> 3)
+ case vecArrangement4S:
+ q = 0b1
+ fallthrough
+ case vecArrangement2S:
+ v := 32 - uint32(amount&0b11111)
+ immb = v & 0b111
+ immh = 0b0100 | (v >> 3)
+ case vecArrangement2D:
+ q = 0b1
+ v := 64 - uint32(amount&0b111111)
+ immb = v & 0b111
+ immh = 0b1000 | (v >> 3)
+ default:
+ panic("unsupported arrangement: " + arr.String())
+ }
+ return q<<30 | u<<29 | 0b011110<<23 | immh<<19 | immb<<16 | 0b000001<<10 | opcode<<11 | 0b1<<10 | rn<<5 | rd
+}
+
+// encodeVecTbl encodes as Data Processing (Advanced SIMD table lookup) in
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp
+//
+// Note: the table form is selected via nregs (1 for tbl, 2 for tbl2); other table sizes are not supported yet.
+func encodeVecTbl(nregs, rd, rn, rm uint32, arr vecArrangement) uint32 {
+ var q, op2, len, op uint32
+
+ switch nregs {
+ case 1:
+ // tbl: single-register
+ len = 0b00
+ case 2:
+ // tbl2: 2-register table
+ len = 0b01
+ default:
+		panic(fmt.Sprintf("unsupported number of registers %d", nregs))
+ }
+ switch arr {
+ case vecArrangement8B:
+ q = 0b0
+ case vecArrangement16B:
+ q = 0b1
+ default:
+ panic("unsupported arrangement: " + arr.String())
+ }
+
+ return q<<30 | 0b001110<<24 | op2<<22 | rm<<16 | len<<13 | op<<12 | rn<<5 | rd
+}
+
+// encodeAdvancedSIMDTwoMisc encodes as Data Processing (Advanced SIMD two-register miscellaneous) depending on vecOp in
+// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp
+func encodeAdvancedSIMDTwoMisc(op vecOp, rd, rn uint32, arr vecArrangement) uint32 {
+ var q, u, size, opcode uint32
+ switch op {
+ case vecOpCnt:
+ opcode = 0b00101
+ switch arr {
+ case vecArrangement8B:
+ q, size = 0b0, 0b00
+ case vecArrangement16B:
+ q, size = 0b1, 0b00
+ default:
+ panic("unsupported arrangement: " + arr.String())
+ }
+ case vecOpCmeq0:
+ if arr == vecArrangement1D {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ opcode = 0b01001
+ size, q = arrToSizeQEncoded(arr)
+ case vecOpNot:
+ u = 1
+ opcode = 0b00101
+ switch arr {
+ case vecArrangement8B:
+ q, size = 0b0, 0b00
+ case vecArrangement16B:
+ q, size = 0b1, 0b00
+ default:
+ panic("unsupported arrangement: " + arr.String())
+ }
+ case vecOpAbs:
+ if arr == vecArrangement1D {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ opcode = 0b01011
+ u = 0b0
+ size, q = arrToSizeQEncoded(arr)
+ case vecOpNeg:
+ if arr == vecArrangement1D {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ opcode = 0b01011
+ u = 0b1
+ size, q = arrToSizeQEncoded(arr)
+ case vecOpFabs:
+ if arr < vecArrangement2S || arr == vecArrangement1D {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ opcode = 0b01111
+ u = 0b0
+ size, q = arrToSizeQEncoded(arr)
+ case vecOpFneg:
+ if arr < vecArrangement2S || arr == vecArrangement1D {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ opcode = 0b01111
+ u = 0b1
+ size, q = arrToSizeQEncoded(arr)
+ case vecOpFrintm:
+ u = 0b0
+ opcode = 0b11001
+ switch arr {
+ case vecArrangement2S:
+ q, size = 0b0, 0b00
+ case vecArrangement4S:
+ q, size = 0b1, 0b00
+ case vecArrangement2D:
+ q, size = 0b1, 0b01
+ default:
+ panic("unsupported arrangement: " + arr.String())
+ }
+ case vecOpFrintn:
+ u = 0b0
+ opcode = 0b11000
+ switch arr {
+ case vecArrangement2S:
+ q, size = 0b0, 0b00
+ case vecArrangement4S:
+ q, size = 0b1, 0b00
+ case vecArrangement2D:
+ q, size = 0b1, 0b01
+ default:
+ panic("unsupported arrangement: " + arr.String())
+ }
+ case vecOpFrintp:
+ u = 0b0
+ opcode = 0b11000
+ if arr < vecArrangement2S || arr == vecArrangement1D {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q = arrToSizeQEncoded(arr)
+ case vecOpFrintz:
+ u = 0b0
+ opcode = 0b11001
+ if arr < vecArrangement2S || arr == vecArrangement1D {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q = arrToSizeQEncoded(arr)
+ case vecOpFsqrt:
+ if arr < vecArrangement2S || arr == vecArrangement1D {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ opcode = 0b11111
+ u = 0b1
+ size, q = arrToSizeQEncoded(arr)
+ case vecOpFcvtl:
+ opcode = 0b10111
+ u = 0b0
+ switch arr {
+ case vecArrangement2S:
+ size, q = 0b01, 0b0
+ case vecArrangement4H:
+ size, q = 0b00, 0b0
+ default:
+ panic("unsupported arrangement: " + arr.String())
+ }
+ case vecOpFcvtn:
+ opcode = 0b10110
+ u = 0b0
+ switch arr {
+ case vecArrangement2S:
+ size, q = 0b01, 0b0
+ case vecArrangement4H:
+ size, q = 0b00, 0b0
+ default:
+ panic("unsupported arrangement: " + arr.String())
+ }
+ case vecOpFcvtzs:
+ opcode = 0b11011
+ u = 0b0
+ switch arr {
+ case vecArrangement2S:
+ q, size = 0b0, 0b10
+ case vecArrangement4S:
+ q, size = 0b1, 0b10
+ case vecArrangement2D:
+ q, size = 0b1, 0b11
+ default:
+ panic("unsupported arrangement: " + arr.String())
+ }
+ case vecOpFcvtzu:
+ opcode = 0b11011
+ u = 0b1
+ switch arr {
+ case vecArrangement2S:
+ q, size = 0b0, 0b10
+ case vecArrangement4S:
+ q, size = 0b1, 0b10
+ case vecArrangement2D:
+ q, size = 0b1, 0b11
+ default:
+ panic("unsupported arrangement: " + arr.String())
+ }
+ case vecOpScvtf:
+ opcode = 0b11101
+ u = 0b0
+ switch arr {
+ case vecArrangement4S:
+ q, size = 0b1, 0b00
+ case vecArrangement2S:
+ q, size = 0b0, 0b00
+ case vecArrangement2D:
+ q, size = 0b1, 0b01
+ default:
+ panic("unsupported arrangement: " + arr.String())
+ }
+ case vecOpUcvtf:
+ opcode = 0b11101
+ u = 0b1
+ switch arr {
+ case vecArrangement4S:
+ q, size = 0b1, 0b00
+ case vecArrangement2S:
+ q, size = 0b0, 0b00
+ case vecArrangement2D:
+ q, size = 0b1, 0b01
+ default:
+ panic("unsupported arrangement: " + arr.String())
+ }
+ case vecOpSqxtn:
+ // When q == 1 it encodes sqxtn2 (operates on upper 64 bits).
+ opcode = 0b10100
+ u = 0b0
+ if arr > vecArrangement4S {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q = arrToSizeQEncoded(arr)
+ case vecOpUqxtn:
+ // When q == 1 it encodes uqxtn2 (operates on upper 64 bits).
+ opcode = 0b10100
+ u = 0b1
+ if arr > vecArrangement4S {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q = arrToSizeQEncoded(arr)
+ case vecOpSqxtun:
+ // When q == 1 it encodes sqxtun2 (operates on upper 64 bits).
+ opcode = 0b10010 // 0b10100
+ u = 0b1
+ if arr > vecArrangement4S {
+ panic("unsupported arrangement: " + arr.String())
+ }
+ size, q = arrToSizeQEncoded(arr)
+ case vecOpRev64:
+ opcode = 0b00000
+ size, q = arrToSizeQEncoded(arr)
+ case vecOpXtn:
+ u = 0b0
+ opcode = 0b10010
+ size, q = arrToSizeQEncoded(arr)
+ case vecOpShll:
+ u = 0b1
+ opcode = 0b10011
+ switch arr {
+ case vecArrangement8B:
+ q, size = 0b0, 0b00
+ case vecArrangement4H:
+ q, size = 0b0, 0b01
+ case vecArrangement2S:
+ q, size = 0b0, 0b10
+ default:
+ panic("unsupported arrangement: " + arr.String())
+ }
+ default:
+ panic("unsupported or illegal vecOp: " + op.String())
+ }
+ return q<<30 | u<<29 | 0b01110<<24 | size<<22 | 0b10000<<17 | opcode<<12 | 0b10<<10 | rn<<5 | rd
+}
+
+// brTableSequenceOffsetTableBegin is the byte offset inside the brTableSequence at which the offset table begins, i.e. after the four instructions (4 * 4 bytes).
+const brTableSequenceOffsetTableBegin = 16
+
+func encodeBrTableSequence(c backend.Compiler, index regalloc.VReg, targets []uint32) {
+ tmpRegNumber := regNumberInEncoding[tmp]
+ indexNumber := regNumberInEncoding[index.RealReg()]
+
+ // adr tmpReg, PC+16 (PC+16 is the address of the first label offset)
+	// ldrsw index, [tmpReg, index, UXTW 2] ;; index = int64(*(tmpReg + index*4))
+ // add tmpReg, tmpReg, index
+ // br tmpReg
+ // [offset_to_l1, offset_to_l2, ..., offset_to_lN]
+ c.Emit4Bytes(encodeAdr(tmpRegNumber, 16))
+ c.Emit4Bytes(encodeLoadOrStore(sLoad32, indexNumber,
+ addressMode{kind: addressModeKindRegScaledExtended, rn: tmpRegVReg, rm: index, extOp: extendOpUXTW},
+ ))
+ c.Emit4Bytes(encodeAluRRR(aluOpAdd, tmpRegNumber, tmpRegNumber, indexNumber, true, false))
+ c.Emit4Bytes(encodeUnconditionalBranchReg(tmpRegNumber, false))
+
+ // Offsets are resolved in ResolveRelativeAddress phase.
+ for _, offset := range targets {
+ if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable {
+			// Inlined offset tables cannot be disassembled properly, so we pad with dummy instructions to make debugging easier.
+ c.Emit4Bytes(dummyInstruction)
+ } else {
+ c.Emit4Bytes(offset)
+ }
+ }
+}
+
+// encodeExitSequence matches the implementation detail of functionABI.emitGoEntryPreamble.
+func encodeExitSequence(c backend.Compiler, ctxReg regalloc.VReg) {
+ // Restore the FP, SP and LR, and return to the Go code:
+ // ldr lr, [ctxReg, #GoReturnAddress]
+ // ldr fp, [ctxReg, #OriginalFramePointer]
+ // ldr tmp, [ctxReg, #OriginalStackPointer]
+ // mov sp, tmp ;; sp cannot be str'ed directly.
+ // ret ;; --> return to the Go code
+
+ var ctxEvicted bool
+ if ctx := ctxReg.RealReg(); ctx == fp || ctx == lr {
+		// ctxReg would be clobbered by the restores below, so move it to tmp first.
+ c.Emit4Bytes(encodeMov64(regNumberInEncoding[tmp], regNumberInEncoding[ctx], false, false))
+ ctxReg = tmpRegVReg
+ ctxEvicted = true
+ }
+
+ restoreLr := encodeLoadOrStore(
+ uLoad64,
+ regNumberInEncoding[lr],
+ addressMode{
+ kind: addressModeKindRegUnsignedImm12,
+ rn: ctxReg,
+ imm: wazevoapi.ExecutionContextOffsetGoReturnAddress.I64(),
+ },
+ )
+
+ restoreFp := encodeLoadOrStore(
+ uLoad64,
+ regNumberInEncoding[fp],
+ addressMode{
+ kind: addressModeKindRegUnsignedImm12,
+ rn: ctxReg,
+ imm: wazevoapi.ExecutionContextOffsetOriginalFramePointer.I64(),
+ },
+ )
+
+ restoreSpToTmp := encodeLoadOrStore(
+ uLoad64,
+ regNumberInEncoding[tmp],
+ addressMode{
+ kind: addressModeKindRegUnsignedImm12,
+ rn: ctxReg,
+ imm: wazevoapi.ExecutionContextOffsetOriginalStackPointer.I64(),
+ },
+ )
+
+ movTmpToSp := encodeAddSubtractImmediate(0b100, 0, 0,
+ regNumberInEncoding[tmp], regNumberInEncoding[sp])
+
+ c.Emit4Bytes(restoreFp)
+ c.Emit4Bytes(restoreLr)
+ c.Emit4Bytes(restoreSpToTmp)
+ c.Emit4Bytes(movTmpToSp)
+ c.Emit4Bytes(encodeRet())
+ if !ctxEvicted {
+		// In order to have a fixed-length exit sequence, we need to pad the binary.
+ // Since this will never be reached, we insert a dummy instruction.
+ c.Emit4Bytes(dummyInstruction)
+ }
+}
+
+func encodeRet() uint32 {
+ // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/RET--Return-from-subroutine-?lang=en
+ return 0b1101011001011111<<16 | regNumberInEncoding[lr]<<5
+}
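+
+// exampleRet is a hypothetical helper (not part of the encoder itself): assuming regNumberInEncoding[lr] == 30 (x30),
+// 0b1101011001011111<<16 == 0xD65F0000 and 30<<5 == 0x3C0, so this should return 0xD65F03C0,
+// the canonical encoding of "ret".
+func exampleRet() uint32 {
+	return encodeRet()
+}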
+
+func encodeAtomicRmw(op atomicRmwOp, rs, rt, rn uint32, size uint32) uint32 {
+ var _31to21, _15to10, sz uint32
+
+ switch size {
+ case 8:
+ sz = 0b11
+ case 4:
+ sz = 0b10
+ case 2:
+ sz = 0b01
+ case 1:
+ sz = 0b00
+ }
+
+ _31to21 = 0b00111000_111 | sz<<9
+
+ switch op {
+ case atomicRmwOpAdd:
+ _15to10 = 0b000000
+ case atomicRmwOpClr:
+ _15to10 = 0b000100
+ case atomicRmwOpSet:
+ _15to10 = 0b001100
+ case atomicRmwOpEor:
+ _15to10 = 0b001000
+ case atomicRmwOpSwp:
+ _15to10 = 0b100000
+ }
+
+ return _31to21<<21 | rs<<16 | _15to10<<10 | rn<<5 | rt
+}
+
+func encodeAtomicCas(rs, rt, rn uint32, size uint32) uint32 {
+ var _31to21, _15to10, sz uint32
+
+ switch size {
+ case 8:
+ sz = 0b11
+ case 4:
+ sz = 0b10
+ case 2:
+ sz = 0b01
+ case 1:
+ sz = 0b00
+ }
+
+ _31to21 = 0b00001000_111 | sz<<9
+ _15to10 = 0b111111
+
+ return _31to21<<21 | rs<<16 | _15to10<<10 | rn<<5 | rt
+}
+
+func encodeAtomicLoadStore(rn, rt, size, l uint32) uint32 {
+ var _31to21, _20to16, _15to10, sz uint32
+
+ switch size {
+ case 8:
+ sz = 0b11
+ case 4:
+ sz = 0b10
+ case 2:
+ sz = 0b01
+ case 1:
+ sz = 0b00
+ }
+
+ _31to21 = 0b00001000_100 | sz<<9 | l<<1
+ _20to16 = 0b11111
+ _15to10 = 0b111111
+
+ return _31to21<<21 | _20to16<<16 | _15to10<<10 | rn<<5 | rt
+}
+
+func encodeDMB() uint32 {
+ return 0b11010101000000110011101110111111
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go
new file mode 100644
index 000000000..698b382d4
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go
@@ -0,0 +1,301 @@
+package arm64
+
+import (
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+)
+
+// lowerConstant allocates a new VReg and inserts the instruction to load the constant value.
+func (m *machine) lowerConstant(instr *ssa.Instruction) (vr regalloc.VReg) {
+ val := instr.Return()
+ valType := val.Type()
+
+ vr = m.compiler.AllocateVReg(valType)
+ v := instr.ConstantVal()
+ m.insertLoadConstant(v, valType, vr)
+ return
+}
+
+// InsertLoadConstantBlockArg implements backend.Machine.
+func (m *machine) InsertLoadConstantBlockArg(instr *ssa.Instruction, vr regalloc.VReg) {
+ val := instr.Return()
+ valType := val.Type()
+ v := instr.ConstantVal()
+ load := m.allocateInstr()
+ load.asLoadConstBlockArg(v, valType, vr)
+ m.insert(load)
+}
+
+func (m *machine) lowerLoadConstantBlockArgAfterRegAlloc(i *instruction) {
+ v, typ, dst := i.loadConstBlockArgData()
+ m.insertLoadConstant(v, typ, dst)
+}
+
+func (m *machine) insertLoadConstant(v uint64, valType ssa.Type, vr regalloc.VReg) {
+ if valType.Bits() < 64 { // Clear the redundant bits just in case it's unexpectedly sign-extended, etc.
+ v = v & ((1 << valType.Bits()) - 1)
+ }
+
+ switch valType {
+ case ssa.TypeF32:
+ loadF := m.allocateInstr()
+ loadF.asLoadFpuConst32(vr, v)
+ m.insert(loadF)
+ case ssa.TypeF64:
+ loadF := m.allocateInstr()
+ loadF.asLoadFpuConst64(vr, v)
+ m.insert(loadF)
+ case ssa.TypeI32:
+ if v == 0 {
+ m.InsertMove(vr, xzrVReg, ssa.TypeI32)
+ } else {
+ m.lowerConstantI32(vr, int32(v))
+ }
+ case ssa.TypeI64:
+ if v == 0 {
+ m.InsertMove(vr, xzrVReg, ssa.TypeI64)
+ } else {
+ m.lowerConstantI64(vr, int64(v))
+ }
+ default:
+ panic("TODO")
+ }
+}
+
+// The following logic is based on the old asm/arm64 package.
+// https://github.com/tetratelabs/wazero/blob/39f2ff23a6d609e10c82b9cc0b981f6de5b87a9c/internal/asm/arm64/impl.go
+
+func (m *machine) lowerConstantI32(dst regalloc.VReg, c int32) {
+ // Following the logic here:
+ // https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1637
+ ic := int64(uint32(c))
+ if ic >= 0 && (ic <= 0xfff || (ic&0xfff) == 0 && (uint64(ic>>12) <= 0xfff)) {
+ if isBitMaskImmediate(uint64(c), false) {
+ m.lowerConstViaBitMaskImmediate(uint64(uint32(c)), dst, false)
+ return
+ }
+ }
+
+ if t := const16bitAligned(int64(uint32(c))); t >= 0 {
+		// If the const is a single 16-bit chunk at a 16-bit-aligned position (e.g. 0xffff, 0xffff_0000, or 0xffff_0000_0000_0000),
+		// we can load it with a single MOVZ.
+ m.insertMOVZ(dst, uint64(uint32(c)>>(16*t)), t, false)
+ } else if t := const16bitAligned(int64(^c)); t >= 0 {
+		// Likewise, if the bitwise inverse of the const is such a chunk, a single MOVN suffices.
+ m.insertMOVN(dst, uint64(^c>>(16*t)), t, false)
+ } else if isBitMaskImmediate(uint64(uint32(c)), false) {
+ m.lowerConstViaBitMaskImmediate(uint64(c), dst, false)
+ } else {
+ // Otherwise, we use MOVZ and MOVK to load it.
+ c16 := uint16(c)
+ m.insertMOVZ(dst, uint64(c16), 0, false)
+ c16 = uint16(uint32(c) >> 16)
+ m.insertMOVK(dst, uint64(c16), 1, false)
+ }
+}
+
+func (m *machine) lowerConstantI64(dst regalloc.VReg, c int64) {
+ // Following the logic here:
+ // https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1798-L1852
+ if c >= 0 && (c <= 0xfff || (c&0xfff) == 0 && (uint64(c>>12) <= 0xfff)) {
+ if isBitMaskImmediate(uint64(c), true) {
+ m.lowerConstViaBitMaskImmediate(uint64(c), dst, true)
+ return
+ }
+ }
+
+ if t := const16bitAligned(c); t >= 0 {
+		// If the const is a single 16-bit chunk at a 16-bit-aligned position (e.g. 0xffff, 0xffff_0000, or 0xffff_0000_0000_0000),
+		// we can load it with a single MOVZ.
+ m.insertMOVZ(dst, uint64(c)>>(16*t), t, true)
+ } else if t := const16bitAligned(^c); t >= 0 {
+		// Likewise, if the bitwise inverse of the const is such a chunk, a single MOVN suffices.
+ m.insertMOVN(dst, uint64(^c)>>(16*t), t, true)
+ } else if isBitMaskImmediate(uint64(c), true) {
+ m.lowerConstViaBitMaskImmediate(uint64(c), dst, true)
+ } else {
+ m.load64bitConst(c, dst)
+ }
+}
+
+func (m *machine) lowerConstViaBitMaskImmediate(c uint64, dst regalloc.VReg, b64 bool) {
+ instr := m.allocateInstr()
+ instr.asALUBitmaskImm(aluOpOrr, dst, xzrVReg, c, b64)
+ m.insert(instr)
+}
+
+// isBitMaskImmediate determines if the value can be encoded as "bitmask immediate".
+//
+// Such an immediate is a 32-bit or 64-bit pattern viewed as a vector of identical elements of size e = 2, 4, 8, 16, 32, or 64 bits.
+// Each element contains the same sub-pattern: a single run of 1 to e-1 non-zero bits, rotated by 0 to e-1 bits.
+//
+// See https://developer.arm.com/documentation/dui0802/b/A64-General-Instructions/MOV--bitmask-immediate-
+func isBitMaskImmediate(x uint64, _64 bool) bool {
+	// All-zeros and all-ones values are not "bitmask immediates" by definition.
+ if x == 0 || (_64 && x == 0xffff_ffff_ffff_ffff) || (!_64 && x == 0xffff_ffff) {
+ return false
+ }
+
+ switch {
+ case x != x>>32|x<<32:
+ // e = 64
+ case x != x>>16|x<<48:
+ // e = 32 (x == x>>32|x<<32).
+ // e.g. 0x00ff_ff00_00ff_ff00
+ x = uint64(int32(x))
+ case x != x>>8|x<<56:
+ // e = 16 (x == x>>16|x<<48).
+ // e.g. 0x00ff_00ff_00ff_00ff
+ x = uint64(int16(x))
+ case x != x>>4|x<<60:
+ // e = 8 (x == x>>8|x<<56).
+ // e.g. 0x0f0f_0f0f_0f0f_0f0f
+ x = uint64(int8(x))
+ default:
+ // e = 4 or 2.
+ return true
+ }
+ return sequenceOfSetbits(x) || sequenceOfSetbits(^x)
+}
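+
+// exampleIsBitMaskImmediate is a hypothetical helper (not part of the lowering logic) showing how values
+// are classified: 0x0000_ffff_0000_ffff repeats a 32-bit element whose set bits form one contiguous run,
+// so it is accepted, while an all-ones value is rejected up front.
+func exampleIsBitMaskImmediate() (bool, bool) {
+	return isBitMaskImmediate(0x0000_ffff_0000_ffff, true), // true
+		isBitMaskImmediate(0xffff_ffff_ffff_ffff, true) // false
+}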
+
+// sequenceOfSetbits returns true if the number's binary representation is a single contiguous run of set bits (1s).
+// For example: 0b1110 -> true, 0b1010 -> false
+func sequenceOfSetbits(x uint64) bool {
+ y := getLowestBit(x)
+	// If x is a single contiguous run of set bits, this results in a number
+	// with only one set bit (i.e. a power of two).
+ y += x
+ return (y-1)&y == 0
+}
+
+func getLowestBit(x uint64) uint64 {
+ return x & (^x + 1)
+}
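+
+// exampleSequenceOfSetbits is a hypothetical walkthrough (not part of the lowering logic) of the two helpers
+// above: for x = 0b1110, getLowestBit(x) = 0b0010 and 0b1110 + 0b0010 = 0b10000 is a power of two, so the
+// result is true; for x = 0b1010 the sum is 0b1100, which is not, so the result is false.
+func exampleSequenceOfSetbits() (bool, bool) {
+	return sequenceOfSetbits(0b1110), sequenceOfSetbits(0b1010)
+}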
+
+// const16bitAligned checks whether the value fits in a single 16-bit chunk at a 16-bit-aligned position.
+// If so, it returns the shift amount divided by 16; otherwise it returns -1.
+func const16bitAligned(v int64) (ret int) {
+ ret = -1
+ for s := 0; s < 64; s += 16 {
+ if (uint64(v) &^ (uint64(0xffff) << uint(s))) == 0 {
+ ret = s / 16
+ break
+ }
+ }
+ return
+}
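+
+// exampleConst16bitAligned is a hypothetical helper (not part of the lowering logic): 0xffff_0000 occupies
+// only the second 16-bit chunk, so the result is 1 (i.e. shift 16), while 0x1_0001 spans two chunks and
+// therefore yields -1.
+func exampleConst16bitAligned() (int, int) {
+	return const16bitAligned(0xffff_0000), const16bitAligned(0x1_0001)
+}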
+
+// load64bitConst loads a 64-bit constant into the register, deciding how to split it into MOVZ/MOVN/MOVK
+// sequences in the same way as the Go assembler.
+//
+// See https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L6632-L6759
+func (m *machine) load64bitConst(c int64, dst regalloc.VReg) {
+ var bits [4]uint64
+ var zeros, negs int
+ for i := 0; i < 4; i++ {
+ bits[i] = uint64(c) >> uint(i*16) & 0xffff
+ if v := bits[i]; v == 0 {
+ zeros++
+ } else if v == 0xffff {
+ negs++
+ }
+ }
+
+ if zeros == 3 {
+ // one MOVZ instruction.
+ for i, v := range bits {
+ if v != 0 {
+ m.insertMOVZ(dst, v, i, true)
+ }
+ }
+ } else if negs == 3 {
+ // one MOVN instruction.
+ for i, v := range bits {
+ if v != 0xffff {
+ v = ^v
+ m.insertMOVN(dst, v, i, true)
+ }
+ }
+ } else if zeros == 2 {
+		// one MOVZ then one MOVK.
+ var movz bool
+ for i, v := range bits {
+ if !movz && v != 0 { // MOVZ.
+ m.insertMOVZ(dst, v, i, true)
+ movz = true
+ } else if v != 0 {
+ m.insertMOVK(dst, v, i, true)
+ }
+ }
+
+ } else if negs == 2 {
+		// one MOVN then one MOVK.
+ var movn bool
+ for i, v := range bits { // Emit MOVN.
+ if !movn && v != 0xffff {
+ v = ^v
+ // https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN
+ m.insertMOVN(dst, v, i, true)
+ movn = true
+ } else if v != 0xffff {
+ m.insertMOVK(dst, v, i, true)
+ }
+ }
+
+ } else if zeros == 1 {
+ // one MOVZ then two MOVK.
+ var movz bool
+ for i, v := range bits {
+ if !movz && v != 0 { // MOVZ.
+ m.insertMOVZ(dst, v, i, true)
+ movz = true
+ } else if v != 0 {
+ m.insertMOVK(dst, v, i, true)
+ }
+ }
+
+ } else if negs == 1 {
+ // one MOVN then two MOVK.
+ var movn bool
+ for i, v := range bits { // Emit MOVN.
+ if !movn && v != 0xffff {
+ v = ^v
+ // https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN
+ m.insertMOVN(dst, v, i, true)
+ movn = true
+ } else if v != 0xffff {
+ m.insertMOVK(dst, v, i, true)
+ }
+ }
+
+ } else {
+ // one MOVZ then up to three MOVK.
+ var movz bool
+ for i, v := range bits {
+ if !movz && v != 0 { // MOVZ.
+ m.insertMOVZ(dst, v, i, true)
+ movz = true
+ } else if v != 0 {
+ m.insertMOVK(dst, v, i, true)
+ }
+ }
+ }
+}
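+
+// As a hypothetical walkthrough of the decomposition above (not part of the lowering logic):
+// c = 0x0000_1234_0000_5678 splits into bits = [0x5678, 0x0000, 0x1234, 0x0000], so zeros == 2 and the
+// function emits "movz dst, #0x5678, lsl #0" followed by "movk dst, #0x1234, lsl #32".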
+
+func (m *machine) insertMOVZ(dst regalloc.VReg, v uint64, shift int, dst64 bool) {
+ instr := m.allocateInstr()
+ instr.asMOVZ(dst, v, uint64(shift), dst64)
+ m.insert(instr)
+}
+
+func (m *machine) insertMOVK(dst regalloc.VReg, v uint64, shift int, dst64 bool) {
+ instr := m.allocateInstr()
+ instr.asMOVK(dst, v, uint64(shift), dst64)
+ m.insert(instr)
+}
+
+func (m *machine) insertMOVN(dst regalloc.VReg, v uint64, shift int, dst64 bool) {
+ instr := m.allocateInstr()
+ instr.asMOVN(dst, v, uint64(shift), dst64)
+ m.insert(instr)
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go
new file mode 100644
index 000000000..2bb234e8c
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go
@@ -0,0 +1,2221 @@
+package arm64
+
+// Files prefixed with lower_instr* implement instruction selection: lowering SSA-level instructions
+// into machine-specific instructions.
+//
+// Importantly, the lower* functions also perform tree matching: they find patterns in the given instruction tree
+// and merge multiple SSA instructions where possible. This can be thought of as "N:1" instruction selection.
+
+import (
+ "fmt"
+ "math"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
+)
+
+// LowerSingleBranch implements backend.Machine.
+func (m *machine) LowerSingleBranch(br *ssa.Instruction) {
+ ectx := m.executableContext
+ switch br.Opcode() {
+ case ssa.OpcodeJump:
+ _, _, targetBlk := br.BranchData()
+ if br.IsFallthroughJump() {
+ return
+ }
+ b := m.allocateInstr()
+ target := ectx.GetOrAllocateSSABlockLabel(targetBlk)
+ if target == labelReturn {
+ b.asRet()
+ } else {
+ b.asBr(target)
+ }
+ m.insert(b)
+ case ssa.OpcodeBrTable:
+ m.lowerBrTable(br)
+ default:
+ panic("BUG: unexpected branch opcode " + br.Opcode().String())
+ }
+}
+
+func (m *machine) lowerBrTable(i *ssa.Instruction) {
+ index, targets := i.BrTableData()
+ indexOperand := m.getOperand_NR(m.compiler.ValueDefinition(index), extModeNone)
+
+ // First, we have to bounds-check the index and clamp it to the default target
+ // (sitting at the end of the list) if it is out of bounds.
+
+ // mov maxIndexReg #maximum_index
+ // subs wzr, index, maxIndexReg
+ // csel adjustedIndex, maxIndexReg, index, hs ;; if index is higher than or equal to maxIndexReg.
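+ //
+ // For example (illustrative), a table with three explicit targets plus the default target (four entries in
+ // total) sets maxIndexReg to 3, so any index >= 3 is clamped to 3 and selects the default target.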
+ maxIndexReg := m.compiler.AllocateVReg(ssa.TypeI32)
+ m.lowerConstantI32(maxIndexReg, int32(len(targets)-1))
+ subs := m.allocateInstr()
+ subs.asALU(aluOpSubS, operandNR(xzrVReg), indexOperand, operandNR(maxIndexReg), false)
+ m.insert(subs)
+ csel := m.allocateInstr()
+ adjustedIndex := m.compiler.AllocateVReg(ssa.TypeI32)
+ csel.asCSel(operandNR(adjustedIndex), operandNR(maxIndexReg), indexOperand, hs, false)
+ m.insert(csel)
+
+ brSequence := m.allocateInstr()
+
+ tableIndex := m.addJmpTableTarget(targets)
+ brSequence.asBrTableSequence(adjustedIndex, tableIndex, len(targets))
+ m.insert(brSequence)
+}
+
+// LowerConditionalBranch implements backend.Machine.
+func (m *machine) LowerConditionalBranch(b *ssa.Instruction) {
+ exctx := m.executableContext
+ cval, args, targetBlk := b.BranchData()
+ if len(args) > 0 {
+ panic(fmt.Sprintf(
+ "conditional branch shouldn't have args; likely a bug in critical edge splitting: from %s to %s",
+ exctx.CurrentSSABlk,
+ targetBlk,
+ ))
+ }
+
+ target := exctx.GetOrAllocateSSABlockLabel(targetBlk)
+ cvalDef := m.compiler.ValueDefinition(cval)
+
+ switch {
+ case m.compiler.MatchInstr(cvalDef, ssa.OpcodeIcmp): // In this case, we can use the ALU flags set by the SUBS instruction.
+ cvalInstr := cvalDef.Instr
+ x, y, c := cvalInstr.IcmpData()
+ cc, signed := condFlagFromSSAIntegerCmpCond(c), c.Signed()
+ if b.Opcode() == ssa.OpcodeBrz {
+ cc = cc.invert()
+ }
+
+ if !m.tryLowerBandToFlag(x, y) {
+ m.lowerIcmpToFlag(x, y, signed)
+ }
+ cbr := m.allocateInstr()
+ cbr.asCondBr(cc.asCond(), target, false /* ignored */)
+ m.insert(cbr)
+ cvalDef.Instr.MarkLowered()
+ case m.compiler.MatchInstr(cvalDef, ssa.OpcodeFcmp): // In this case, we can use the FPU flags directly.
+ cvalInstr := cvalDef.Instr
+ x, y, c := cvalInstr.FcmpData()
+ cc := condFlagFromSSAFloatCmpCond(c)
+ if b.Opcode() == ssa.OpcodeBrz {
+ cc = cc.invert()
+ }
+ m.lowerFcmpToFlag(x, y)
+ cbr := m.allocateInstr()
+ cbr.asCondBr(cc.asCond(), target, false /* ignored */)
+ m.insert(cbr)
+ cvalDef.Instr.MarkLowered()
+ default:
+ rn := m.getOperand_NR(cvalDef, extModeNone)
+ var c cond
+ if b.Opcode() == ssa.OpcodeBrz {
+ c = registerAsRegZeroCond(rn.nr())
+ } else {
+ c = registerAsRegNotZeroCond(rn.nr())
+ }
+ cbr := m.allocateInstr()
+ cbr.asCondBr(c, target, false)
+ m.insert(cbr)
+ }
+}
+
+func (m *machine) tryLowerBandToFlag(x, y ssa.Value) (ok bool) {
+ xx := m.compiler.ValueDefinition(x)
+ yy := m.compiler.ValueDefinition(y)
+ if xx.IsFromInstr() && xx.Instr.Constant() && xx.Instr.ConstantVal() == 0 {
+ if m.compiler.MatchInstr(yy, ssa.OpcodeBand) {
+ bandInstr := yy.Instr
+ m.lowerBitwiseAluOp(bandInstr, aluOpAnds, true)
+ ok = true
+ bandInstr.MarkLowered()
+ return
+ }
+ }
+
+ if yy.IsFromInstr() && yy.Instr.Constant() && yy.Instr.ConstantVal() == 0 {
+ if m.compiler.MatchInstr(xx, ssa.OpcodeBand) {
+ bandInstr := xx.Instr
+ m.lowerBitwiseAluOp(bandInstr, aluOpAnds, true)
+ ok = true
+ bandInstr.MarkLowered()
+ return
+ }
+ }
+ return
+}
+
+// LowerInstr implements backend.Machine.
+func (m *machine) LowerInstr(instr *ssa.Instruction) {
+ if l := instr.SourceOffset(); l.Valid() {
+ info := m.allocateInstr().asEmitSourceOffsetInfo(l)
+ m.insert(info)
+ }
+
+ switch op := instr.Opcode(); op {
+ case ssa.OpcodeBrz, ssa.OpcodeBrnz, ssa.OpcodeJump, ssa.OpcodeBrTable:
+ panic("BUG: branching instructions are handled by LowerBranches")
+ case ssa.OpcodeReturn:
+ panic("BUG: return must be handled by backend.Compiler")
+ case ssa.OpcodeIadd, ssa.OpcodeIsub:
+ m.lowerSubOrAdd(instr, op == ssa.OpcodeIadd)
+ case ssa.OpcodeFadd, ssa.OpcodeFsub, ssa.OpcodeFmul, ssa.OpcodeFdiv, ssa.OpcodeFmax, ssa.OpcodeFmin:
+ m.lowerFpuBinOp(instr)
+ case ssa.OpcodeIconst, ssa.OpcodeF32const, ssa.OpcodeF64const: // Constant instructions are inlined.
+ case ssa.OpcodeExitWithCode:
+ execCtx, code := instr.ExitWithCodeData()
+ m.lowerExitWithCode(m.compiler.VRegOf(execCtx), code)
+ case ssa.OpcodeExitIfTrueWithCode:
+ execCtx, c, code := instr.ExitIfTrueWithCodeData()
+ m.lowerExitIfTrueWithCode(m.compiler.VRegOf(execCtx), c, code)
+ case ssa.OpcodeStore, ssa.OpcodeIstore8, ssa.OpcodeIstore16, ssa.OpcodeIstore32:
+ m.lowerStore(instr)
+ case ssa.OpcodeLoad:
+ dst := instr.Return()
+ ptr, offset, typ := instr.LoadData()
+ m.lowerLoad(ptr, offset, typ, dst)
+ case ssa.OpcodeVZeroExtLoad:
+ dst := instr.Return()
+ ptr, offset, typ := instr.VZeroExtLoadData()
+ m.lowerLoad(ptr, offset, typ, dst)
+ case ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32, ssa.OpcodeSload8, ssa.OpcodeSload16, ssa.OpcodeSload32:
+ ptr, offset, _ := instr.LoadData()
+ ret := m.compiler.VRegOf(instr.Return())
+ m.lowerExtLoad(op, ptr, offset, ret)
+ case ssa.OpcodeCall, ssa.OpcodeCallIndirect:
+ m.lowerCall(instr)
+ case ssa.OpcodeIcmp:
+ m.lowerIcmp(instr)
+ case ssa.OpcodeVIcmp:
+ m.lowerVIcmp(instr)
+ case ssa.OpcodeVFcmp:
+ m.lowerVFcmp(instr)
+ case ssa.OpcodeVCeil:
+ m.lowerVecMisc(vecOpFrintp, instr)
+ case ssa.OpcodeVFloor:
+ m.lowerVecMisc(vecOpFrintm, instr)
+ case ssa.OpcodeVTrunc:
+ m.lowerVecMisc(vecOpFrintz, instr)
+ case ssa.OpcodeVNearest:
+ m.lowerVecMisc(vecOpFrintn, instr)
+ case ssa.OpcodeVMaxPseudo:
+ m.lowerVMinMaxPseudo(instr, true)
+ case ssa.OpcodeVMinPseudo:
+ m.lowerVMinMaxPseudo(instr, false)
+ case ssa.OpcodeBand:
+ m.lowerBitwiseAluOp(instr, aluOpAnd, false)
+ case ssa.OpcodeBor:
+ m.lowerBitwiseAluOp(instr, aluOpOrr, false)
+ case ssa.OpcodeBxor:
+ m.lowerBitwiseAluOp(instr, aluOpEor, false)
+ case ssa.OpcodeIshl:
+ m.lowerShifts(instr, extModeNone, aluOpLsl)
+ case ssa.OpcodeSshr:
+ if instr.Return().Type().Bits() == 64 {
+ m.lowerShifts(instr, extModeSignExtend64, aluOpAsr)
+ } else {
+ m.lowerShifts(instr, extModeSignExtend32, aluOpAsr)
+ }
+ case ssa.OpcodeUshr:
+ if instr.Return().Type().Bits() == 64 {
+ m.lowerShifts(instr, extModeZeroExtend64, aluOpLsr)
+ } else {
+ m.lowerShifts(instr, extModeZeroExtend32, aluOpLsr)
+ }
+ case ssa.OpcodeRotl:
+ m.lowerRotl(instr)
+ case ssa.OpcodeRotr:
+ m.lowerRotr(instr)
+ case ssa.OpcodeSExtend, ssa.OpcodeUExtend:
+ from, to, signed := instr.ExtendData()
+ m.lowerExtend(instr.Arg(), instr.Return(), from, to, signed)
+ case ssa.OpcodeFcmp:
+ x, y, c := instr.FcmpData()
+ m.lowerFcmp(x, y, instr.Return(), c)
+ case ssa.OpcodeImul:
+ x, y := instr.Arg2()
+ result := instr.Return()
+ m.lowerImul(x, y, result)
+ case ssa.OpcodeUndefined:
+ undef := m.allocateInstr()
+ undef.asUDF()
+ m.insert(undef)
+ case ssa.OpcodeSelect:
+ c, x, y := instr.SelectData()
+ if x.Type() == ssa.TypeV128 {
+ rc := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone)
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ m.lowerSelectVec(rc, rn, rm, rd)
+ } else {
+ m.lowerSelect(c, x, y, instr.Return())
+ }
+ case ssa.OpcodeClz:
+ x := instr.Arg()
+ result := instr.Return()
+ m.lowerClz(x, result)
+ case ssa.OpcodeCtz:
+ x := instr.Arg()
+ result := instr.Return()
+ m.lowerCtz(x, result)
+ case ssa.OpcodePopcnt:
+ x := instr.Arg()
+ result := instr.Return()
+ m.lowerPopcnt(x, result)
+ case ssa.OpcodeFcvtToSint, ssa.OpcodeFcvtToSintSat:
+ x, ctx := instr.Arg2()
+ result := instr.Return()
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(result))
+ ctxVReg := m.compiler.VRegOf(ctx)
+ m.lowerFpuToInt(rd, rn, ctxVReg, true, x.Type() == ssa.TypeF64,
+ result.Type().Bits() == 64, op == ssa.OpcodeFcvtToSintSat)
+ case ssa.OpcodeFcvtToUint, ssa.OpcodeFcvtToUintSat:
+ x, ctx := instr.Arg2()
+ result := instr.Return()
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(result))
+ ctxVReg := m.compiler.VRegOf(ctx)
+ m.lowerFpuToInt(rd, rn, ctxVReg, false, x.Type() == ssa.TypeF64,
+ result.Type().Bits() == 64, op == ssa.OpcodeFcvtToUintSat)
+ case ssa.OpcodeFcvtFromSint:
+ x := instr.Arg()
+ result := instr.Return()
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(result))
+ m.lowerIntToFpu(rd, rn, true, x.Type() == ssa.TypeI64, result.Type().Bits() == 64)
+ case ssa.OpcodeFcvtFromUint:
+ x := instr.Arg()
+ result := instr.Return()
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(result))
+ m.lowerIntToFpu(rd, rn, false, x.Type() == ssa.TypeI64, result.Type().Bits() == 64)
+ case ssa.OpcodeFdemote:
+ v := instr.Arg()
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ cnt := m.allocateInstr()
+ cnt.asFpuRR(fpuUniOpCvt64To32, rd, rn, false)
+ m.insert(cnt)
+ case ssa.OpcodeFpromote:
+ v := instr.Arg()
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ cnt := m.allocateInstr()
+ cnt.asFpuRR(fpuUniOpCvt32To64, rd, rn, true)
+ m.insert(cnt)
+ case ssa.OpcodeIreduce:
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(instr.Arg()), extModeNone)
+ retVal := instr.Return()
+ rd := m.compiler.VRegOf(retVal)
+
+ if retVal.Type() != ssa.TypeI32 {
+ panic("TODO?: Ireduce to non-i32")
+ }
+ mov := m.allocateInstr()
+ mov.asMove32(rd, rn.reg())
+ m.insert(mov)
+ case ssa.OpcodeFneg:
+ m.lowerFpuUniOp(fpuUniOpNeg, instr.Arg(), instr.Return())
+ case ssa.OpcodeSqrt:
+ m.lowerFpuUniOp(fpuUniOpSqrt, instr.Arg(), instr.Return())
+ case ssa.OpcodeCeil:
+ m.lowerFpuUniOp(fpuUniOpRoundPlus, instr.Arg(), instr.Return())
+ case ssa.OpcodeFloor:
+ m.lowerFpuUniOp(fpuUniOpRoundMinus, instr.Arg(), instr.Return())
+ case ssa.OpcodeTrunc:
+ m.lowerFpuUniOp(fpuUniOpRoundZero, instr.Arg(), instr.Return())
+ case ssa.OpcodeNearest:
+ m.lowerFpuUniOp(fpuUniOpRoundNearest, instr.Arg(), instr.Return())
+ case ssa.OpcodeFabs:
+ m.lowerFpuUniOp(fpuUniOpAbs, instr.Arg(), instr.Return())
+ case ssa.OpcodeBitcast:
+ m.lowerBitcast(instr)
+ case ssa.OpcodeFcopysign:
+ x, y := instr.Arg2()
+ m.lowerFcopysign(x, y, instr.Return())
+ case ssa.OpcodeSdiv, ssa.OpcodeUdiv:
+ x, y, ctx := instr.Arg3()
+ ctxVReg := m.compiler.VRegOf(ctx)
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ m.lowerIDiv(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSdiv)
+ case ssa.OpcodeSrem, ssa.OpcodeUrem:
+ x, y, ctx := instr.Arg3()
+ ctxVReg := m.compiler.VRegOf(ctx)
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ m.lowerIRem(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSrem)
+ case ssa.OpcodeVconst:
+ result := m.compiler.VRegOf(instr.Return())
+ lo, hi := instr.VconstData()
+ v := m.allocateInstr()
+ v.asLoadFpuConst128(result, lo, hi)
+ m.insert(v)
+ case ssa.OpcodeVbnot:
+ x := instr.Arg()
+ ins := m.allocateInstr()
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ ins.asVecMisc(vecOpNot, rd, rn, vecArrangement16B)
+ m.insert(ins)
+ case ssa.OpcodeVbxor:
+ x, y := instr.Arg2()
+ m.lowerVecRRR(vecOpEOR, x, y, instr.Return(), vecArrangement16B)
+ case ssa.OpcodeVbor:
+ x, y := instr.Arg2()
+ m.lowerVecRRR(vecOpOrr, x, y, instr.Return(), vecArrangement16B)
+ case ssa.OpcodeVband:
+ x, y := instr.Arg2()
+ m.lowerVecRRR(vecOpAnd, x, y, instr.Return(), vecArrangement16B)
+ case ssa.OpcodeVbandnot:
+ x, y := instr.Arg2()
+ m.lowerVecRRR(vecOpBic, x, y, instr.Return(), vecArrangement16B)
+ case ssa.OpcodeVbitselect:
+ c, x, y := instr.SelectData()
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
+ creg := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone)
+ tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+
+ // creg is overwritten by BSL, so we first copy it into a temporary register
+ // in case it is used somewhere else.
+ mov := m.allocateInstr()
+ mov.asFpuMov128(tmp.nr(), creg.nr())
+ m.insert(mov)
+
+ ins := m.allocateInstr()
+ ins.asVecRRRRewrite(vecOpBsl, tmp, rn, rm, vecArrangement16B)
+ m.insert(ins)
+
+ mov2 := m.allocateInstr()
+ rd := m.compiler.VRegOf(instr.Return())
+ mov2.asFpuMov128(rd, tmp.nr())
+ m.insert(mov2)
+ case ssa.OpcodeVanyTrue, ssa.OpcodeVallTrue:
+ x, lane := instr.ArgWithLane()
+ var arr vecArrangement
+ if op == ssa.OpcodeVallTrue {
+ arr = ssaLaneToArrangement(lane)
+ }
+ rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ m.lowerVcheckTrue(op, rm, rd, arr)
+ case ssa.OpcodeVhighBits:
+ x, lane := instr.ArgWithLane()
+ rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ arr := ssaLaneToArrangement(lane)
+ m.lowerVhighBits(rm, rd, arr)
+ case ssa.OpcodeVIadd:
+ x, y, lane := instr.Arg2WithLane()
+ arr := ssaLaneToArrangement(lane)
+ m.lowerVecRRR(vecOpAdd, x, y, instr.Return(), arr)
+ case ssa.OpcodeExtIaddPairwise:
+ v, lane, signed := instr.ExtIaddPairwiseData()
+ vv := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone)
+
+ tmpLo, tmpHi := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)), operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+ var widen vecOp
+ if signed {
+ widen = vecOpSshll
+ } else {
+ widen = vecOpUshll
+ }
+
+ var loArr, hiArr, dstArr vecArrangement
+ switch lane {
+ case ssa.VecLaneI8x16:
+ loArr, hiArr, dstArr = vecArrangement8B, vecArrangement16B, vecArrangement8H
+ case ssa.VecLaneI16x8:
+ loArr, hiArr, dstArr = vecArrangement4H, vecArrangement8H, vecArrangement4S
+ case ssa.VecLaneI32x4:
+ loArr, hiArr, dstArr = vecArrangement2S, vecArrangement4S, vecArrangement2D
+ default:
+ panic("unsupported lane " + lane.String())
+ }
+
+ widenLo := m.allocateInstr().asVecShiftImm(widen, tmpLo, vv, operandShiftImm(0), loArr)
+ widenHi := m.allocateInstr().asVecShiftImm(widen, tmpHi, vv, operandShiftImm(0), hiArr)
+ addp := m.allocateInstr().asVecRRR(vecOpAddp, operandNR(m.compiler.VRegOf(instr.Return())), tmpLo, tmpHi, dstArr)
+ m.insert(widenLo)
+ m.insert(widenHi)
+ m.insert(addp)
+
+ case ssa.OpcodeVSaddSat:
+ x, y, lane := instr.Arg2WithLane()
+ arr := ssaLaneToArrangement(lane)
+ m.lowerVecRRR(vecOpSqadd, x, y, instr.Return(), arr)
+ case ssa.OpcodeVUaddSat:
+ x, y, lane := instr.Arg2WithLane()
+ arr := ssaLaneToArrangement(lane)
+ m.lowerVecRRR(vecOpUqadd, x, y, instr.Return(), arr)
+ case ssa.OpcodeVIsub:
+ x, y, lane := instr.Arg2WithLane()
+ arr := ssaLaneToArrangement(lane)
+ m.lowerVecRRR(vecOpSub, x, y, instr.Return(), arr)
+ case ssa.OpcodeVSsubSat:
+ x, y, lane := instr.Arg2WithLane()
+ arr := ssaLaneToArrangement(lane)
+ m.lowerVecRRR(vecOpSqsub, x, y, instr.Return(), arr)
+ case ssa.OpcodeVUsubSat:
+ x, y, lane := instr.Arg2WithLane()
+ arr := ssaLaneToArrangement(lane)
+ m.lowerVecRRR(vecOpUqsub, x, y, instr.Return(), arr)
+ case ssa.OpcodeVImin:
+ x, y, lane := instr.Arg2WithLane()
+ arr := ssaLaneToArrangement(lane)
+ m.lowerVecRRR(vecOpSmin, x, y, instr.Return(), arr)
+ case ssa.OpcodeVUmin:
+ x, y, lane := instr.Arg2WithLane()
+ arr := ssaLaneToArrangement(lane)
+ m.lowerVecRRR(vecOpUmin, x, y, instr.Return(), arr)
+ case ssa.OpcodeVImax:
+ x, y, lane := instr.Arg2WithLane()
+ arr := ssaLaneToArrangement(lane)
+ m.lowerVecRRR(vecOpSmax, x, y, instr.Return(), arr)
+ case ssa.OpcodeVUmax:
+ x, y, lane := instr.Arg2WithLane()
+ arr := ssaLaneToArrangement(lane)
+ m.lowerVecRRR(vecOpUmax, x, y, instr.Return(), arr)
+ case ssa.OpcodeVAvgRound:
+ x, y, lane := instr.Arg2WithLane()
+ arr := ssaLaneToArrangement(lane)
+ m.lowerVecRRR(vecOpUrhadd, x, y, instr.Return(), arr)
+ case ssa.OpcodeVImul:
+ x, y, lane := instr.Arg2WithLane()
+ arr := ssaLaneToArrangement(lane)
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ m.lowerVIMul(rd, rn, rm, arr)
+ case ssa.OpcodeVIabs:
+ m.lowerVecMisc(vecOpAbs, instr)
+ case ssa.OpcodeVIneg:
+ m.lowerVecMisc(vecOpNeg, instr)
+ case ssa.OpcodeVIpopcnt:
+ m.lowerVecMisc(vecOpCnt, instr)
+ case ssa.OpcodeVIshl,
+ ssa.OpcodeVSshr, ssa.OpcodeVUshr:
+ x, y, lane := instr.Arg2WithLane()
+ arr := ssaLaneToArrangement(lane)
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ m.lowerVShift(op, rd, rn, rm, arr)
+ case ssa.OpcodeVSqrt:
+ m.lowerVecMisc(vecOpFsqrt, instr)
+ case ssa.OpcodeVFabs:
+ m.lowerVecMisc(vecOpFabs, instr)
+ case ssa.OpcodeVFneg:
+ m.lowerVecMisc(vecOpFneg, instr)
+ case ssa.OpcodeVFmin:
+ x, y, lane := instr.Arg2WithLane()
+ arr := ssaLaneToArrangement(lane)
+ m.lowerVecRRR(vecOpFmin, x, y, instr.Return(), arr)
+ case ssa.OpcodeVFmax:
+ x, y, lane := instr.Arg2WithLane()
+ arr := ssaLaneToArrangement(lane)
+ m.lowerVecRRR(vecOpFmax, x, y, instr.Return(), arr)
+ case ssa.OpcodeVFadd:
+ x, y, lane := instr.Arg2WithLane()
+ arr := ssaLaneToArrangement(lane)
+ m.lowerVecRRR(vecOpFadd, x, y, instr.Return(), arr)
+ case ssa.OpcodeVFsub:
+ x, y, lane := instr.Arg2WithLane()
+ arr := ssaLaneToArrangement(lane)
+ m.lowerVecRRR(vecOpFsub, x, y, instr.Return(), arr)
+ case ssa.OpcodeVFmul:
+ x, y, lane := instr.Arg2WithLane()
+ arr := ssaLaneToArrangement(lane)
+ m.lowerVecRRR(vecOpFmul, x, y, instr.Return(), arr)
+ case ssa.OpcodeSqmulRoundSat:
+ x, y, lane := instr.Arg2WithLane()
+ arr := ssaLaneToArrangement(lane)
+ m.lowerVecRRR(vecOpSqrdmulh, x, y, instr.Return(), arr)
+ case ssa.OpcodeVFdiv:
+ x, y, lane := instr.Arg2WithLane()
+ arr := ssaLaneToArrangement(lane)
+ m.lowerVecRRR(vecOpFdiv, x, y, instr.Return(), arr)
+ case ssa.OpcodeVFcvtToSintSat, ssa.OpcodeVFcvtToUintSat:
+ x, lane := instr.ArgWithLane()
+ arr := ssaLaneToArrangement(lane)
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ m.lowerVfpuToInt(rd, rn, arr, op == ssa.OpcodeVFcvtToSintSat)
+ case ssa.OpcodeVFcvtFromSint, ssa.OpcodeVFcvtFromUint:
+ x, lane := instr.ArgWithLane()
+ arr := ssaLaneToArrangement(lane)
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ m.lowerVfpuFromInt(rd, rn, arr, op == ssa.OpcodeVFcvtFromSint)
+ case ssa.OpcodeSwidenLow, ssa.OpcodeUwidenLow:
+ x, lane := instr.ArgWithLane()
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(instr.Return()))
+
+ var arr vecArrangement
+ switch lane {
+ case ssa.VecLaneI8x16:
+ arr = vecArrangement8B
+ case ssa.VecLaneI16x8:
+ arr = vecArrangement4H
+ case ssa.VecLaneI32x4:
+ arr = vecArrangement2S
+ }
+
+ shll := m.allocateInstr()
+ if signed := op == ssa.OpcodeSwidenLow; signed {
+ shll.asVecShiftImm(vecOpSshll, rd, rn, operandShiftImm(0), arr)
+ } else {
+ shll.asVecShiftImm(vecOpUshll, rd, rn, operandShiftImm(0), arr)
+ }
+ m.insert(shll)
+ case ssa.OpcodeSwidenHigh, ssa.OpcodeUwidenHigh:
+ x, lane := instr.ArgWithLane()
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(instr.Return()))
+
+ arr := ssaLaneToArrangement(lane)
+
+ shll := m.allocateInstr()
+ if signed := op == ssa.OpcodeSwidenHigh; signed {
+ shll.asVecShiftImm(vecOpSshll, rd, rn, operandShiftImm(0), arr)
+ } else {
+ shll.asVecShiftImm(vecOpUshll, rd, rn, operandShiftImm(0), arr)
+ }
+ m.insert(shll)
+
+ case ssa.OpcodeSnarrow, ssa.OpcodeUnarrow:
+ x, y, lane := instr.Arg2WithLane()
+ var arr, arr2 vecArrangement
+ switch lane {
+ case ssa.VecLaneI16x8: // I16x8
+ arr = vecArrangement8B
+ arr2 = vecArrangement16B // Implies sqxtn2.
+ case ssa.VecLaneI32x4:
+ arr = vecArrangement4H
+ arr2 = vecArrangement8H // Implies sqxtn2.
+ default:
+ panic("unsupported lane " + lane.String())
+ }
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(instr.Return()))
+
+ tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+
+ loQxtn := m.allocateInstr()
+ hiQxtn := m.allocateInstr()
+ if signed := op == ssa.OpcodeSnarrow; signed {
+ // Narrow lanes of rn and write them into the lower half of tmp.
+ loQxtn.asVecMisc(vecOpSqxtn, tmp, rn, arr) // low
+ // Narrow lanes of rm and write them into the higher half of tmp.
+ hiQxtn.asVecMisc(vecOpSqxtn, tmp, rm, arr2) // high (sqxtn2)
+ } else {
+ // Narrow lanes of rn and write them into the lower half of tmp.
+ loQxtn.asVecMisc(vecOpSqxtun, tmp, rn, arr) // low
+ // Narrow lanes of rm and write them into the higher half of tmp.
+ hiQxtn.asVecMisc(vecOpSqxtun, tmp, rm, arr2) // high (sqxtun2)
+ }
+ m.insert(loQxtn)
+ m.insert(hiQxtn)
+
+ mov := m.allocateInstr()
+ mov.asFpuMov128(rd.nr(), tmp.nr())
+ m.insert(mov)
+ case ssa.OpcodeFvpromoteLow:
+ x, lane := instr.ArgWithLane()
+ if lane != ssa.VecLaneF32x4 {
+ panic("unsupported lane type " + lane.String())
+ }
+ ins := m.allocateInstr()
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ ins.asVecMisc(vecOpFcvtl, rd, rn, vecArrangement2S)
+ m.insert(ins)
+ case ssa.OpcodeFvdemote:
+ x, lane := instr.ArgWithLane()
+ if lane != ssa.VecLaneF64x2 {
+ panic("unsupported lane type " + lane.String())
+ }
+ ins := m.allocateInstr()
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ ins.asVecMisc(vecOpFcvtn, rd, rn, vecArrangement2S)
+ m.insert(ins)
+ case ssa.OpcodeExtractlane:
+ x, index, signed, lane := instr.ExtractlaneData()
+
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(instr.Return()))
+
+ mov := m.allocateInstr()
+ switch lane {
+ case ssa.VecLaneI8x16:
+ mov.asMovFromVec(rd, rn, vecArrangementB, vecIndex(index), signed)
+ case ssa.VecLaneI16x8:
+ mov.asMovFromVec(rd, rn, vecArrangementH, vecIndex(index), signed)
+ case ssa.VecLaneI32x4:
+ mov.asMovFromVec(rd, rn, vecArrangementS, vecIndex(index), signed)
+ case ssa.VecLaneI64x2:
+ mov.asMovFromVec(rd, rn, vecArrangementD, vecIndex(index), signed)
+ case ssa.VecLaneF32x4:
+ mov.asVecMovElement(rd, rn, vecArrangementS, vecIndex(0), vecIndex(index))
+ case ssa.VecLaneF64x2:
+ mov.asVecMovElement(rd, rn, vecArrangementD, vecIndex(0), vecIndex(index))
+ default:
+ panic("unsupported lane: " + lane.String())
+ }
+
+ m.insert(mov)
+
+ case ssa.OpcodeInsertlane:
+ x, y, index, lane := instr.InsertlaneData()
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+
+ // Initially mov rn to tmp.
+ mov1 := m.allocateInstr()
+ mov1.asFpuMov128(tmpReg.nr(), rn.nr())
+ m.insert(mov1)
+
+ // movToVec and vecMovElement do not clear the remaining bits to zero,
+ // so we can insert rm into tmp in place.
+ mov2 := m.allocateInstr()
+ switch lane {
+ case ssa.VecLaneI8x16:
+ mov2.asMovToVec(tmpReg, rm, vecArrangementB, vecIndex(index))
+ case ssa.VecLaneI16x8:
+ mov2.asMovToVec(tmpReg, rm, vecArrangementH, vecIndex(index))
+ case ssa.VecLaneI32x4:
+ mov2.asMovToVec(tmpReg, rm, vecArrangementS, vecIndex(index))
+ case ssa.VecLaneI64x2:
+ mov2.asMovToVec(tmpReg, rm, vecArrangementD, vecIndex(index))
+ case ssa.VecLaneF32x4:
+ mov2.asVecMovElement(tmpReg, rm, vecArrangementS, vecIndex(index), vecIndex(0))
+ case ssa.VecLaneF64x2:
+ mov2.asVecMovElement(tmpReg, rm, vecArrangementD, vecIndex(index), vecIndex(0))
+ }
+ m.insert(mov2)
+
+ // Finally mov tmp to rd.
+ mov3 := m.allocateInstr()
+ mov3.asFpuMov128(rd.nr(), tmpReg.nr())
+ m.insert(mov3)
+
+ case ssa.OpcodeSwizzle:
+ x, y, lane := instr.Arg2WithLane()
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(instr.Return()))
+
+ arr := ssaLaneToArrangement(lane)
+
+ // tbl <rd>.<arr>, { <rn>.<arr> }, <rm>.<arr>
+ tbl1 := m.allocateInstr()
+ tbl1.asVecTbl(1, rd, rn, rm, arr)
+ m.insert(tbl1)
+
+ case ssa.OpcodeShuffle:
+ x, y, lane1, lane2 := instr.ShuffleData()
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(instr.Return()))
+
+ m.lowerShuffle(rd, rn, rm, lane1, lane2)
+
+ case ssa.OpcodeSplat:
+ x, lane := instr.ArgWithLane()
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(instr.Return()))
+
+ dup := m.allocateInstr()
+ switch lane {
+ case ssa.VecLaneI8x16:
+ dup.asVecDup(rd, rn, vecArrangement16B)
+ case ssa.VecLaneI16x8:
+ dup.asVecDup(rd, rn, vecArrangement8H)
+ case ssa.VecLaneI32x4:
+ dup.asVecDup(rd, rn, vecArrangement4S)
+ case ssa.VecLaneI64x2:
+ dup.asVecDup(rd, rn, vecArrangement2D)
+ case ssa.VecLaneF32x4:
+ dup.asVecDupElement(rd, rn, vecArrangementS, vecIndex(0))
+ case ssa.VecLaneF64x2:
+ dup.asVecDupElement(rd, rn, vecArrangementD, vecIndex(0))
+ }
+ m.insert(dup)
+
+ case ssa.OpcodeWideningPairwiseDotProductS:
+ x, y := instr.Arg2()
+ xx, yy := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone),
+ m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
+ tmp, tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)), operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+ m.insert(m.allocateInstr().asVecRRR(vecOpSmull, tmp, xx, yy, vecArrangement8H))
+ m.insert(m.allocateInstr().asVecRRR(vecOpSmull2, tmp2, xx, yy, vecArrangement8H))
+ m.insert(m.allocateInstr().asVecRRR(vecOpAddp, tmp, tmp, tmp2, vecArrangement4S))
+
+ rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ m.insert(m.allocateInstr().asFpuMov128(rd.nr(), tmp.nr()))
+
+ case ssa.OpcodeLoadSplat:
+ ptr, offset, lane := instr.LoadSplatData()
+ m.lowerLoadSplat(ptr, offset, lane, instr.Return())
+
+ case ssa.OpcodeAtomicRmw:
+ m.lowerAtomicRmw(instr)
+
+ case ssa.OpcodeAtomicCas:
+ m.lowerAtomicCas(instr)
+
+ case ssa.OpcodeAtomicLoad:
+ m.lowerAtomicLoad(instr)
+
+ case ssa.OpcodeAtomicStore:
+ m.lowerAtomicStore(instr)
+
+ case ssa.OpcodeFence:
+ instr := m.allocateInstr()
+ instr.asDMB()
+ m.insert(instr)
+
+ default:
+ panic("TODO: lowering " + op.String())
+ }
+ m.executableContext.FlushPendingInstructions()
+}
+
+func (m *machine) lowerShuffle(rd, rn, rm operand, lane1, lane2 uint64) {
+ // `tbl2` requires 2 consecutive registers, so we arbitrarily pick v29, v30.
+ vReg, wReg := v29VReg, v30VReg
+
+ // Initialize v29, v30 to rn, rm.
+ movv := m.allocateInstr()
+ movv.asFpuMov128(vReg, rn.nr())
+ m.insert(movv)
+
+ movw := m.allocateInstr()
+ movw.asFpuMov128(wReg, rm.nr())
+ m.insert(movw)
+
+ // `lane1`, `lane2` are already encoded as two u64s with the right layout:
+ // lane1 := lane[7]<<56 | ... | lane[1]<<8 | lane[0]
+ // lane2 := lane[15]<<56 | ... | lane[9]<<8 | lane[8]
+ // Thus, we can use loadFpuConst128.
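+ //
+ // For example (illustrative), the identity shuffle lane[i] = i is encoded as
+ // lane1 = 0x0706050403020100 and lane2 = 0x0f0e0d0c0b0a0908.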
+ tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+ lfc := m.allocateInstr()
+ lfc.asLoadFpuConst128(tmp.nr(), lane1, lane2)
+ m.insert(lfc)
+
+ // tbl <rd>.16b, { <vReg>.16b, <wReg>.16b }, <tmp>.16b
+ tbl2 := m.allocateInstr()
+ tbl2.asVecTbl(2, rd, operandNR(vReg), tmp, vecArrangement16B)
+ m.insert(tbl2)
+}
+
+func (m *machine) lowerVShift(op ssa.Opcode, rd, rn, rm operand, arr vecArrangement) {
+ var modulo byte
+ switch arr {
+ case vecArrangement16B:
+ modulo = 0x7 // Modulo 8.
+ case vecArrangement8H:
+ modulo = 0xf // Modulo 16.
+ case vecArrangement4S:
+ modulo = 0x1f // Modulo 32.
+ case vecArrangement2D:
+ modulo = 0x3f // Modulo 64.
+ default:
+ panic("unsupported arrangement " + arr.String())
+ }
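+
+ // For example (illustrative), with the 4S arrangement the shift amount is taken modulo 32,
+ // so an amount of 35 behaves as 35 & 0x1f = 3.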
+
+ rtmp := operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
+ vtmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+
+ and := m.allocateInstr()
+ and.asALUBitmaskImm(aluOpAnd, rtmp.nr(), rm.nr(), uint64(modulo), true)
+ m.insert(and)
+
+ if op != ssa.OpcodeVIshl {
+ // Negate the amount to turn this into a right shift.
+ neg := m.allocateInstr()
+ neg.asALU(aluOpSub, rtmp, operandNR(xzrVReg), rtmp, true)
+ m.insert(neg)
+ }
+
+ // Copy the shift amount into a vector register, since sshl/ushl take the amount from a vector register.
+ dup := m.allocateInstr()
+ dup.asVecDup(vtmp, rtmp, arr)
+ m.insert(dup)
+
+ if op == ssa.OpcodeVIshl || op == ssa.OpcodeVSshr {
+ sshl := m.allocateInstr()
+ sshl.asVecRRR(vecOpSshl, rd, rn, vtmp, arr)
+ m.insert(sshl)
+ } else {
+ ushl := m.allocateInstr()
+ ushl.asVecRRR(vecOpUshl, rd, rn, vtmp, arr)
+ m.insert(ushl)
+ }
+}
+
+func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangement) {
+ tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+
+ // Special case VallTrue for i64x2.
+ if op == ssa.OpcodeVallTrue && arr == vecArrangement2D {
+ // cmeq v3?.2d, v2?.2d, #0
+ // addp v3?.2d, v3?.2d, v3?.2d
+ // fcmp v3?, v3?
+ // cset dst, eq
+
+ ins := m.allocateInstr()
+ ins.asVecMisc(vecOpCmeq0, tmp, rm, vecArrangement2D)
+ m.insert(ins)
+
+ addp := m.allocateInstr()
+ addp.asVecRRR(vecOpAddp, tmp, tmp, tmp, vecArrangement2D)
+ m.insert(addp)
+
+ fcmp := m.allocateInstr()
+ fcmp.asFpuCmp(tmp, tmp, true)
+ m.insert(fcmp)
+
+ cset := m.allocateInstr()
+ cset.asCSet(rd.nr(), false, eq)
+ m.insert(cset)
+
+ return
+ }
+
+ // Create a scalar value with umaxp or uminv, then compare it against zero.
+ ins := m.allocateInstr()
+ if op == ssa.OpcodeVanyTrue {
+ // umaxp v4?.16b, v2?.16b, v2?.16b
+ ins.asVecRRR(vecOpUmaxp, tmp, rm, rm, vecArrangement16B)
+ } else {
+ // uminv d4?, v2?.4s
+ ins.asVecLanes(vecOpUminv, tmp, rm, arr)
+ }
+ m.insert(ins)
+
+ // mov x3?, v4?.d[0]
+ // ccmp x3?, #0x0, #0x0, al
+ // cset x3?, ne
+ // mov x0, x3?
+
+ movv := m.allocateInstr()
+ movv.asMovFromVec(rd, tmp, vecArrangementD, vecIndex(0), false)
+ m.insert(movv)
+
+ fc := m.allocateInstr()
+ fc.asCCmpImm(rd, uint64(0), al, 0, true)
+ m.insert(fc)
+
+ cset := m.allocateInstr()
+ cset.asCSet(rd.nr(), false, ne)
+ m.insert(cset)
+}
+
+func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) {
+ r0 := operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
+ v0 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+ v1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+
+ switch arr {
+ case vecArrangement16B:
+ // sshr v6?.16b, v2?.16b, #7
+ // movz x4?, #0x201, lsl 0
+ // movk x4?, #0x804, lsl 16
+ // movk x4?, #0x2010, lsl 32
+ // movk x4?, #0x8040, lsl 48
+ // dup v5?.2d, x4?
+ // and v6?.16b, v6?.16b, v5?.16b
+ // ext v5?.16b, v6?.16b, v6?.16b, #8
+ // zip1 v5?.16b, v6?.16b, v5?.16b
+ // addv s5?, v5?.8h
+ // umov s3?, v5?.h[0]
+
+ // Right arithmetic shift on the original vector and store the result into v1. So we have:
+ // v1[i] = 0xff if vi<0, 0 otherwise.
+ sshr := m.allocateInstr()
+ sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(7), vecArrangement16B)
+ m.insert(sshr)
+
+ // Load the bit mask into r0.
+ m.insertMOVZ(r0.nr(), 0x0201, 0, true)
+ m.insertMOVK(r0.nr(), 0x0804, 1, true)
+ m.insertMOVK(r0.nr(), 0x2010, 2, true)
+ m.insertMOVK(r0.nr(), 0x8040, 3, true)
+
+ // dup r0 to v0.
+ dup := m.allocateInstr()
+ dup.asVecDup(v0, r0, vecArrangement2D)
+ m.insert(dup)
+
+ // Lane-wise logical AND with the bit mask, meaning that we have
+ // v[i] = (1 << i) if vi<0, 0 otherwise.
+ //
+ // Below, we use the following notation:
+ // wi := (1 << i) if vi<0, 0 otherwise.
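+ //
+ // For example (illustrative), if only lanes 0 and 3 are negative, then w0=1, w3=8 and all other wi are 0,
+ // so the horizontal addition below produces the bitmask 0b1001 = 9.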
+ and := m.allocateInstr()
+ and.asVecRRR(vecOpAnd, v1, v1, v0, vecArrangement16B)
+ m.insert(and)
+
+ // Swap the lower and higher 8 byte elements, and write it into v0, meaning that we have
+ // v0[i] = w(i+8) if i < 8, w(i-8) otherwise.
+ ext := m.allocateInstr()
+ ext.asVecExtract(v0, v1, v1, vecArrangement16B, uint32(8))
+ m.insert(ext)
+
+ // v = [w0, w8, ..., w7, w15]
+ zip1 := m.allocateInstr()
+ zip1.asVecPermute(vecOpZip1, v0, v1, v0, vecArrangement16B)
+ m.insert(zip1)
+
+ // v.h[0] = w0 + ... + w15
+ addv := m.allocateInstr()
+ addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H)
+ m.insert(addv)
+
+ // Extract the v.h[0] as the result.
+ movfv := m.allocateInstr()
+ movfv.asMovFromVec(rd, v0, vecArrangementH, vecIndex(0), false)
+ m.insert(movfv)
+ case vecArrangement8H:
+ // sshr v6?.8h, v2?.8h, #15
+ // movz x4?, #0x1, lsl 0
+ // movk x4?, #0x2, lsl 16
+ // movk x4?, #0x4, lsl 32
+ // movk x4?, #0x8, lsl 48
+ // dup v5?.2d, x4?
+ // lsl x4?, x4?, 0x4
+ // ins v5?.d[1], x4?
+ // and v5?.16b, v6?.16b, v5?.16b
+ // addv s5?, v5?.8h
+ // umov s3?, v5?.h[0]
+
+ // Right arithmetic shift on the original vector and store the result into v1. So we have:
+ // v[i] = 0xffff if vi<0, 0 otherwise.
+ sshr := m.allocateInstr()
+ sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(15), vecArrangement8H)
+ m.insert(sshr)
+
+ // Load the bit mask into r0.
+ m.lowerConstantI64(r0.nr(), 0x0008000400020001)
+
+ // dup r0 to vector v0.
+ dup := m.allocateInstr()
+ dup.asVecDup(v0, r0, vecArrangement2D)
+ m.insert(dup)
+
+ lsl := m.allocateInstr()
+ lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(4), true)
+ m.insert(lsl)
+
+ movv := m.allocateInstr()
+ movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1))
+ m.insert(movv)
+
+ // Lane-wise logical AND with the bitmask, meaning that we have
+ // v[i] = (1 << i) if vi<0, 0 otherwise, for i=0..7.
+ and := m.allocateInstr()
+ and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B)
+ m.insert(and)
+
+ addv := m.allocateInstr()
+ addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H)
+ m.insert(addv)
+
+ movfv := m.allocateInstr()
+ movfv.asMovFromVec(rd, v0, vecArrangementH, vecIndex(0), false)
+ m.insert(movfv)
+ case vecArrangement4S:
+ // sshr v6?.4s, v2?.4s, #31
+ // movz x4?, #0x1, lsl 0
+ // movk x4?, #0x2, lsl 32
+ // dup v5?.2d, x4?
+ // lsl x4?, x4?, 0x2
+ // ins v5?.d[1], x4?
+ // and v5?.16b, v6?.16b, v5?.16b
+ // addv s5?, v5?.4s
+ // umov s3?, v5?.s[0]
+
+ // Right arithmetic shift on the original vector and store the result into v1. So we have:
+ // v[i] = 0xffffffff if vi<0, 0 otherwise.
+ sshr := m.allocateInstr()
+ sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(31), vecArrangement4S)
+ m.insert(sshr)
+
+ // Load the bit mask into r0.
+ m.lowerConstantI64(r0.nr(), 0x0000000200000001)
+
+ // dup r0 to vector v0.
+ dup := m.allocateInstr()
+ dup.asVecDup(v0, r0, vecArrangement2D)
+ m.insert(dup)
+
+ lsl := m.allocateInstr()
+ lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(2), true)
+ m.insert(lsl)
+
+ movv := m.allocateInstr()
+ movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1))
+ m.insert(movv)
+
+ // Lane-wise logical AND with the bitmask, meaning that we have
+ // v[i] = (1 << i) if vi<0, 0 otherwise, for i=0..3.
+ and := m.allocateInstr()
+ and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B)
+ m.insert(and)
+
+ addv := m.allocateInstr()
+ addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement4S)
+ m.insert(addv)
+
+ movfv := m.allocateInstr()
+ movfv.asMovFromVec(rd, v0, vecArrangementS, vecIndex(0), false)
+ m.insert(movfv)
+ case vecArrangement2D:
+ // mov d3?, v2?.d[0]
+ // mov x4?, v2?.d[1]
+ // lsr x4?, x4?, 0x3f
+ // lsr d3?, d3?, 0x3f
+ // add s3?, s3?, w4?, lsl #1
+
+ // Move the lower 64-bit int into result.
+ movv0 := m.allocateInstr()
+ movv0.asMovFromVec(rd, rm, vecArrangementD, vecIndex(0), false)
+ m.insert(movv0)
+
+ // Move the higher 64-bit int into r0.
+ movv1 := m.allocateInstr()
+ movv1.asMovFromVec(r0, rm, vecArrangementD, vecIndex(1), false)
+ m.insert(movv1)
+
+ // Move the sign bit into the least significant bit.
+ lsr1 := m.allocateInstr()
+ lsr1.asALUShift(aluOpLsr, r0, r0, operandShiftImm(63), true)
+ m.insert(lsr1)
+
+ lsr2 := m.allocateInstr()
+ lsr2.asALUShift(aluOpLsr, rd, rd, operandShiftImm(63), true)
+ m.insert(lsr2)
+
+ // rd = (r0<<1) | rd
+ lsl := m.allocateInstr()
+ lsl.asALU(aluOpAdd, rd, rd, operandSR(r0.nr(), 1, shiftOpLSL), false)
+ m.insert(lsl)
+ default:
+ panic("Unsupported " + arr.String())
+ }
+}
+
+func (m *machine) lowerVecMisc(op vecOp, instr *ssa.Instruction) {
+ x, lane := instr.ArgWithLane()
+ arr := ssaLaneToArrangement(lane)
+ ins := m.allocateInstr()
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ ins.asVecMisc(op, rd, rn, arr)
+ m.insert(ins)
+}
+
+func (m *machine) lowerVecRRR(op vecOp, x, y, ret ssa.Value, arr vecArrangement) {
+ ins := m.allocateInstr()
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(ret))
+ ins.asVecRRR(op, rd, rn, rm, arr)
+ m.insert(ins)
+}
+
+func (m *machine) lowerVIMul(rd, rn, rm operand, arr vecArrangement) {
+ if arr != vecArrangement2D {
+ mul := m.allocateInstr()
+ mul.asVecRRR(vecOpMul, rd, rn, rm, arr)
+ m.insert(mul)
+ } else {
+ tmp1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+ tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+ tmp3 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+
+ tmpRes := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+
+ // Following the algorithm in https://chromium-review.googlesource.com/c/v8/v8/+/1781696
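+ //
+ // In short (illustrative): for each 64-bit lane, writing a = aHi<<32 | aLo and b = bHi<<32 | bLo,
+ // a*b mod 2^64 == aLo*bLo + ((aLo*bHi + aHi*bLo) << 32).
+ // rev64/mul/addp compute the sum of the cross terms, shll shifts it into the high half,
+ // and xtn/umlal add the low-by-low product.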
+ rev64 := m.allocateInstr()
+ rev64.asVecMisc(vecOpRev64, tmp2, rm, vecArrangement4S)
+ m.insert(rev64)
+
+ mul := m.allocateInstr()
+ mul.asVecRRR(vecOpMul, tmp2, tmp2, rn, vecArrangement4S)
+ m.insert(mul)
+
+ xtn1 := m.allocateInstr()
+ xtn1.asVecMisc(vecOpXtn, tmp1, rn, vecArrangement2S)
+ m.insert(xtn1)
+
+ addp := m.allocateInstr()
+ addp.asVecRRR(vecOpAddp, tmp2, tmp2, tmp2, vecArrangement4S)
+ m.insert(addp)
+
+ xtn2 := m.allocateInstr()
+ xtn2.asVecMisc(vecOpXtn, tmp3, rm, vecArrangement2S)
+ m.insert(xtn2)
+
+ // Note: do not write directly into the result register yet, for the same reason as with BSL:
+ // UMLAL's destination register is also a source register, so the value already in it is significant.
+ shll := m.allocateInstr()
+ shll.asVecMisc(vecOpShll, tmpRes, tmp2, vecArrangement2S)
+ m.insert(shll)
+
+ umlal := m.allocateInstr()
+ umlal.asVecRRRRewrite(vecOpUmlal, tmpRes, tmp3, tmp1, vecArrangement2S)
+ m.insert(umlal)
+
+ mov := m.allocateInstr()
+ mov.asFpuMov128(rd.nr(), tmpRes.nr())
+ m.insert(mov)
+ }
+}
+
+func (m *machine) lowerVMinMaxPseudo(instr *ssa.Instruction, max bool) {
+ x, y, lane := instr.Arg2WithLane()
+ arr := ssaLaneToArrangement(lane)
+
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
+
+ // Note: this usage of tmp is important.
+ // BSL modifies the destination register, so we need to use a temporary register so that
+ // the actual definition of the destination register happens *after* the BSL instruction.
+ // That way, we can force the spill instruction to be inserted after the BSL instruction.
+ tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+
+ fcmgt := m.allocateInstr()
+ if max {
+ fcmgt.asVecRRR(vecOpFcmgt, tmp, rm, rn, arr)
+ } else {
+ // If min, swap the args.
+ fcmgt.asVecRRR(vecOpFcmgt, tmp, rn, rm, arr)
+ }
+ m.insert(fcmgt)
+
+ bsl := m.allocateInstr()
+ bsl.asVecRRRRewrite(vecOpBsl, tmp, rm, rn, vecArrangement16B)
+ m.insert(bsl)
+
+ res := operandNR(m.compiler.VRegOf(instr.Return()))
+ mov2 := m.allocateInstr()
+ mov2.asFpuMov128(res.nr(), tmp.nr())
+ m.insert(mov2)
+}
+
+func (m *machine) lowerIRem(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) {
+ div := m.allocateInstr()
+
+ if signed {
+ div.asALU(aluOpSDiv, rd, rn, rm, _64bit)
+ } else {
+ div.asALU(aluOpUDiv, rd, rn, rm, _64bit)
+ }
+ m.insert(div)
+
+ // Check if rm is zero:
+ m.exitIfNot(execCtxVReg, registerAsRegNotZeroCond(rm.nr()), _64bit, wazevoapi.ExitCodeIntegerDivisionByZero)
+
+ // rd = rn-rd*rm by MSUB instruction.
+ msub := m.allocateInstr()
+ msub.asALURRRR(aluOpMSub, rd, rd, rm, rn, _64bit)
+ m.insert(msub)
+}
+
+func (m *machine) lowerIDiv(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) {
+ div := m.allocateInstr()
+
+ if signed {
+ div.asALU(aluOpSDiv, rd, rn, rm, _64bit)
+ } else {
+ div.asALU(aluOpUDiv, rd, rn, rm, _64bit)
+ }
+ m.insert(div)
+
+ // Check if rm is zero:
+ m.exitIfNot(execCtxVReg, registerAsRegNotZeroCond(rm.nr()), _64bit, wazevoapi.ExitCodeIntegerDivisionByZero)
+
+ if signed {
+ // We need to check for signed overflow, which happens iff the operation is math.MinInt{32,64} / -1.
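+ // For example (32-bit, illustrative): 0x80000000 / -1 would be +2^31, which is not representable in int32.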
+ minusOneCheck := m.allocateInstr()
+ // Sets eq condition if rm == -1.
+ minusOneCheck.asALU(aluOpAddS, operandNR(xzrVReg), rm, operandImm12(1, 0), _64bit)
+ m.insert(minusOneCheck)
+
+ ccmp := m.allocateInstr()
+ // If the eq flag is set (i.e. rm == -1), set the flags from the result of "rn - 1"; otherwise clear the flags.
+ ccmp.asCCmpImm(rn, 1, eq, 0, _64bit)
+ m.insert(ccmp)
+
+ // Check the overflow flag.
+ m.exitIfNot(execCtxVReg, vs.invert().asCond(), false, wazevoapi.ExitCodeIntegerOverflow)
+ }
+}
+
+// exitIfNot emits a conditional branch to exit if the condition is not met.
+// If `c` (cond type) is a register, `cond64bit` must be chosen to indicate whether the register is 32-bit or 64-bit.
+// Otherwise, `cond64bit` is ignored.
+func (m *machine) exitIfNot(execCtxVReg regalloc.VReg, c cond, cond64bit bool, code wazevoapi.ExitCode) {
+ execCtxTmp := m.copyToTmp(execCtxVReg)
+
+ cbr := m.allocateInstr()
+ m.insert(cbr)
+ m.lowerExitWithCode(execCtxTmp, code)
+ // Conditional branch target is after exit.
+ l := m.insertBrTargetLabel()
+ cbr.asCondBr(c, l, cond64bit)
+}
+
+func (m *machine) lowerFcopysign(x, y, ret ssa.Value) {
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
+ var tmpI, tmpF operand
+ _64 := x.Type() == ssa.TypeF64
+ if _64 {
+ tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
+ tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
+ } else {
+ tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF32))
+ tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI32))
+ }
+ rd := m.compiler.VRegOf(ret)
+ m.lowerFcopysignImpl(operandNR(rd), rn, rm, tmpI, tmpF, _64)
+}
+
+func (m *machine) lowerFcopysignImpl(rd, rn, rm, tmpI, tmpF operand, _64bit bool) {
+ // This is exactly the same code emitted by GCC for "__builtin_copysign":
+ //
+ // mov x0, -9223372036854775808
+ // fmov d2, x0
+ // vbit v0.8b, v1.8b, v2.8b
+ //
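+ //
+ // Below, tmpF is loaded with a mask that has only the sign bit set; the BIT instruction then copies
+ // only that bit from rm into the destination, leaving the magnitude bits of rn intact.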
+
+ setMSB := m.allocateInstr()
+ if _64bit {
+ m.lowerConstantI64(tmpI.nr(), math.MinInt64)
+ setMSB.asMovToVec(tmpF, tmpI, vecArrangementD, vecIndex(0))
+ } else {
+ m.lowerConstantI32(tmpI.nr(), math.MinInt32)
+ setMSB.asMovToVec(tmpF, tmpI, vecArrangementS, vecIndex(0))
+ }
+ m.insert(setMSB)
+
+ tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
+
+ mov := m.allocateInstr()
+ mov.asFpuMov64(tmpReg.nr(), rn.nr())
+ m.insert(mov)
+
+ vbit := m.allocateInstr()
+ vbit.asVecRRRRewrite(vecOpBit, tmpReg, rm, tmpF, vecArrangement8B)
+ m.insert(vbit)
+
+ movDst := m.allocateInstr()
+ movDst.asFpuMov64(rd.nr(), tmpReg.nr())
+ m.insert(movDst)
+}
+
+func (m *machine) lowerBitcast(instr *ssa.Instruction) {
+ v, dstType := instr.BitcastData()
+ srcType := v.Type()
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ srcInt := srcType.IsInt()
+ dstInt := dstType.IsInt()
+ switch {
+ case srcInt && !dstInt: // Int to Float:
+ mov := m.allocateInstr()
+ var arr vecArrangement
+ if srcType.Bits() == 64 {
+ arr = vecArrangementD
+ } else {
+ arr = vecArrangementS
+ }
+ mov.asMovToVec(rd, rn, arr, vecIndex(0))
+ m.insert(mov)
+ case !srcInt && dstInt: // Float to Int:
+ mov := m.allocateInstr()
+ var arr vecArrangement
+ if dstType.Bits() == 64 {
+ arr = vecArrangementD
+ } else {
+ arr = vecArrangementS
+ }
+ mov.asMovFromVec(rd, rn, arr, vecIndex(0), false)
+ m.insert(mov)
+ default:
+ panic("TODO?BUG?")
+ }
+}
+
+func (m *machine) lowerFpuUniOp(op fpuUniOp, in, out ssa.Value) {
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(in), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(out))
+
+ neg := m.allocateInstr()
+ neg.asFpuRR(op, rd, rn, in.Type().Bits() == 64)
+ m.insert(neg)
+}
+
+func (m *machine) lowerFpuToInt(rd, rn operand, ctx regalloc.VReg, signed, src64bit, dst64bit, nonTrapping bool) {
+ if !nonTrapping {
+ // First of all, we have to clear the FPU flags.
+ flagClear := m.allocateInstr()
+ flagClear.asMovToFPSR(xzrVReg)
+ m.insert(flagClear)
+ }
+
+ // Then, do the conversion which doesn't trap inherently.
+ cvt := m.allocateInstr()
+ cvt.asFpuToInt(rd, rn, signed, src64bit, dst64bit)
+ m.insert(cvt)
+
+ if !nonTrapping {
+ tmpReg := m.compiler.AllocateVReg(ssa.TypeI64)
+
+ // After the conversion, check the FPU flags.
+ getFlag := m.allocateInstr()
+ getFlag.asMovFromFPSR(tmpReg)
+ m.insert(getFlag)
+
+ execCtx := m.copyToTmp(ctx)
+ _rn := operandNR(m.copyToTmp(rn.nr()))
+
+ // Check if the conversion was undefined by comparing the status with 1.
+ // See https://developer.arm.com/documentation/ddi0595/2020-12/AArch64-Registers/FPSR--Floating-point-Status-Register
+ alu := m.allocateInstr()
+ alu.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpReg), operandImm12(1, 0), true)
+ m.insert(alu)
+
+ // If it is not undefined, we can return the result.
+ ok := m.allocateInstr()
+ m.insert(ok)
+
+ // Otherwise, we have to choose the exit status depending on whether it was an overflow or a NaN conversion.
+
+ // Compare the input against itself to check whether it is a NaN.
+ fpuCmp := m.allocateInstr()
+ fpuCmp.asFpuCmp(_rn, _rn, src64bit)
+ m.insert(fpuCmp)
+ // If the V flag is set (i.e. the vc condition does not hold, so the comparison was unordered), it is a NaN.
+ m.exitIfNot(execCtx, vc.asCond(), false, wazevoapi.ExitCodeInvalidConversionToInteger)
+ // Otherwise, it is an overflow.
+ m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow)
+
+ // Conditional branch target is after exit.
+ l := m.insertBrTargetLabel()
+ ok.asCondBr(ne.asCond(), l, false /* ignored */)
+ }
+}
+
+func (m *machine) lowerIntToFpu(rd, rn operand, signed, src64bit, dst64bit bool) {
+ cvt := m.allocateInstr()
+ cvt.asIntToFpu(rd, rn, signed, src64bit, dst64bit)
+ m.insert(cvt)
+}
+
+func (m *machine) lowerFpuBinOp(si *ssa.Instruction) {
+ instr := m.allocateInstr()
+ var op fpuBinOp
+ switch si.Opcode() {
+ case ssa.OpcodeFadd:
+ op = fpuBinOpAdd
+ case ssa.OpcodeFsub:
+ op = fpuBinOpSub
+ case ssa.OpcodeFmul:
+ op = fpuBinOpMul
+ case ssa.OpcodeFdiv:
+ op = fpuBinOpDiv
+ case ssa.OpcodeFmax:
+ op = fpuBinOpMax
+ case ssa.OpcodeFmin:
+ op = fpuBinOpMin
+ }
+ x, y := si.Arg2()
+ xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y)
+ rn := m.getOperand_NR(xDef, extModeNone)
+ rm := m.getOperand_NR(yDef, extModeNone)
+ rd := operandNR(m.compiler.VRegOf(si.Return()))
+ instr.asFpuRRR(op, rd, rn, rm, x.Type().Bits() == 64)
+ m.insert(instr)
+}
+
+func (m *machine) lowerSubOrAdd(si *ssa.Instruction, add bool) {
+ x, y := si.Arg2()
+ if !x.Type().IsInt() {
+ panic("BUG?")
+ }
+
+ xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y)
+ rn := m.getOperand_NR(xDef, extModeNone)
+ rm, yNegated := m.getOperand_MaybeNegatedImm12_ER_SR_NR(yDef, extModeNone)
+
+ var aop aluOp
+ switch {
+ case add && !yNegated: // rn+rm = x+y
+ aop = aluOpAdd
+ case add && yNegated: // rn-rm = x-(-y) = x+y
+ aop = aluOpSub
+ case !add && !yNegated: // rn-rm = x-y
+ aop = aluOpSub
+ case !add && yNegated: // rn+rm = x-(-y) = x-y
+ aop = aluOpAdd
+ }
+ rd := operandNR(m.compiler.VRegOf(si.Return()))
+ alu := m.allocateInstr()
+ alu.asALU(aop, rd, rn, rm, x.Type().Bits() == 64)
+ m.insert(alu)
+}
+
+// InsertMove implements backend.Machine.
+func (m *machine) InsertMove(dst, src regalloc.VReg, typ ssa.Type) {
+ instr := m.allocateInstr()
+ switch typ {
+ case ssa.TypeI32, ssa.TypeI64:
+ instr.asMove64(dst, src)
+ case ssa.TypeF32, ssa.TypeF64:
+ instr.asFpuMov64(dst, src)
+ case ssa.TypeV128:
+ instr.asFpuMov128(dst, src)
+ default:
+ panic("TODO")
+ }
+ m.insert(instr)
+}
+
+func (m *machine) lowerIcmp(si *ssa.Instruction) {
+ x, y, c := si.IcmpData()
+ flag := condFlagFromSSAIntegerCmpCond(c)
+
+ in64bit := x.Type().Bits() == 64
+ var ext extMode
+ if in64bit {
+ if c.Signed() {
+ ext = extModeSignExtend64
+ } else {
+ ext = extModeZeroExtend64
+ }
+ } else {
+ if c.Signed() {
+ ext = extModeSignExtend32
+ } else {
+ ext = extModeZeroExtend32
+ }
+ }
+
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext)
+ rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), ext)
+ alu := m.allocateInstr()
+ alu.asALU(aluOpSubS, operandNR(xzrVReg), rn, rm, in64bit)
+ m.insert(alu)
+
+ cset := m.allocateInstr()
+ cset.asCSet(m.compiler.VRegOf(si.Return()), false, flag)
+ m.insert(cset)
+}
+
+func (m *machine) lowerVIcmp(si *ssa.Instruction) {
+ x, y, c, lane := si.VIcmpData()
+ flag := condFlagFromSSAIntegerCmpCond(c)
+ arr := ssaLaneToArrangement(lane)
+
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(si.Return()))
+
+ switch flag {
+ case eq:
+ cmp := m.allocateInstr()
+ cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr)
+ m.insert(cmp)
+ case ne:
+ cmp := m.allocateInstr()
+ cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr)
+ m.insert(cmp)
+ not := m.allocateInstr()
+ not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B)
+ m.insert(not)
+ case ge:
+ cmp := m.allocateInstr()
+ cmp.asVecRRR(vecOpCmge, rd, rn, rm, arr)
+ m.insert(cmp)
+ case gt:
+ cmp := m.allocateInstr()
+ cmp.asVecRRR(vecOpCmgt, rd, rn, rm, arr)
+ m.insert(cmp)
+ case le:
+ cmp := m.allocateInstr()
+ cmp.asVecRRR(vecOpCmge, rd, rm, rn, arr) // rm, rn are swapped
+ m.insert(cmp)
+ case lt:
+ cmp := m.allocateInstr()
+ cmp.asVecRRR(vecOpCmgt, rd, rm, rn, arr) // rm, rn are swapped
+ m.insert(cmp)
+ case hs:
+ cmp := m.allocateInstr()
+ cmp.asVecRRR(vecOpCmhs, rd, rn, rm, arr)
+ m.insert(cmp)
+ case hi:
+ cmp := m.allocateInstr()
+ cmp.asVecRRR(vecOpCmhi, rd, rn, rm, arr)
+ m.insert(cmp)
+ case ls:
+ cmp := m.allocateInstr()
+ cmp.asVecRRR(vecOpCmhs, rd, rm, rn, arr) // rm, rn are swapped
+ m.insert(cmp)
+ case lo:
+ cmp := m.allocateInstr()
+ cmp.asVecRRR(vecOpCmhi, rd, rm, rn, arr) // rm, rn are swapped
+ m.insert(cmp)
+ }
+}
+
+func (m *machine) lowerVFcmp(si *ssa.Instruction) {
+ x, y, c, lane := si.VFcmpData()
+ flag := condFlagFromSSAFloatCmpCond(c)
+ arr := ssaLaneToArrangement(lane)
+
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
+ rd := operandNR(m.compiler.VRegOf(si.Return()))
+
+ switch flag {
+ case eq:
+ cmp := m.allocateInstr()
+ cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr)
+ m.insert(cmp)
+ case ne:
+ cmp := m.allocateInstr()
+ cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr)
+ m.insert(cmp)
+ not := m.allocateInstr()
+ not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B)
+ m.insert(not)
+ case ge:
+ cmp := m.allocateInstr()
+ cmp.asVecRRR(vecOpFcmge, rd, rn, rm, arr)
+ m.insert(cmp)
+ case gt:
+ cmp := m.allocateInstr()
+ cmp.asVecRRR(vecOpFcmgt, rd, rn, rm, arr)
+ m.insert(cmp)
+ case mi:
+ cmp := m.allocateInstr()
+ cmp.asVecRRR(vecOpFcmgt, rd, rm, rn, arr) // rm, rn are swapped
+ m.insert(cmp)
+ case ls:
+ cmp := m.allocateInstr()
+ cmp.asVecRRR(vecOpFcmge, rd, rm, rn, arr) // rm, rn are swapped
+ m.insert(cmp)
+ }
+}
+
+func (m *machine) lowerVfpuToInt(rd, rn operand, arr vecArrangement, signed bool) {
+ cvt := m.allocateInstr()
+ if signed {
+ cvt.asVecMisc(vecOpFcvtzs, rd, rn, arr)
+ } else {
+ cvt.asVecMisc(vecOpFcvtzu, rd, rn, arr)
+ }
+ m.insert(cvt)
+
+ if arr == vecArrangement2D {
+ narrow := m.allocateInstr()
+ if signed {
+ narrow.asVecMisc(vecOpSqxtn, rd, rd, vecArrangement2S)
+ } else {
+ narrow.asVecMisc(vecOpUqxtn, rd, rd, vecArrangement2S)
+ }
+ m.insert(narrow)
+ }
+}
+
+func (m *machine) lowerVfpuFromInt(rd, rn operand, arr vecArrangement, signed bool) {
+ cvt := m.allocateInstr()
+ if signed {
+ cvt.asVecMisc(vecOpScvtf, rd, rn, arr)
+ } else {
+ cvt.asVecMisc(vecOpUcvtf, rd, rn, arr)
+ }
+ m.insert(cvt)
+}
+
+func (m *machine) lowerShifts(si *ssa.Instruction, ext extMode, aluOp aluOp) {
+ x, amount := si.Arg2()
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext)
+ rm := m.getOperand_ShiftImm_NR(m.compiler.ValueDefinition(amount), ext, x.Type().Bits())
+ rd := operandNR(m.compiler.VRegOf(si.Return()))
+
+ alu := m.allocateInstr()
+ alu.asALUShift(aluOp, rd, rn, rm, x.Type().Bits() == 64)
+ m.insert(alu)
+}
+
+func (m *machine) lowerBitwiseAluOp(si *ssa.Instruction, op aluOp, ignoreResult bool) {
+ x, y := si.Arg2()
+
+ xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y)
+ rn := m.getOperand_NR(xDef, extModeNone)
+
+ var rd operand
+ if ignoreResult {
+ rd = operandNR(xzrVReg)
+ } else {
+ rd = operandNR(m.compiler.VRegOf(si.Return()))
+ }
+
+ _64 := x.Type().Bits() == 64
+ alu := m.allocateInstr()
+ if instr := yDef.Instr; instr != nil && instr.Constant() {
+ c := instr.ConstantVal()
+ if isBitMaskImmediate(c, _64) {
+ // Constant bit wise operations can be lowered to a single instruction.
+ alu.asALUBitmaskImm(op, rd.nr(), rn.nr(), c, _64)
+ m.insert(alu)
+ return
+ }
+ }
+
+ rm := m.getOperand_SR_NR(yDef, extModeNone)
+ alu.asALU(op, rd, rn, rm, _64)
+ m.insert(alu)
+}
+
+func (m *machine) lowerRotl(si *ssa.Instruction) {
+ x, y := si.Arg2()
+ r := si.Return()
+ _64 := r.Type().Bits() == 64
+
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
+ var tmp operand
+ if _64 {
+ tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
+ } else {
+ tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI32))
+ }
+ rd := operandNR(m.compiler.VRegOf(r))
+
+ // Encode rotl as neg + rotr: neg is a sub against the zero-reg.
+ m.lowerRotlImpl(rd, rn, rm, tmp, _64)
+}
+
+func (m *machine) lowerRotlImpl(rd, rn, rm, tmp operand, is64bit bool) {
+ // Encode rotl as neg + rotr: neg is a sub against the zero-reg.
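+ // This works because rotl(x, n) == rotr(x, -n), and the rotate-right instruction only uses the amount
+ // modulo the operand width, so negating the amount is sufficient.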
+ neg := m.allocateInstr()
+ neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), rm, is64bit)
+ m.insert(neg)
+ alu := m.allocateInstr()
+ alu.asALU(aluOpRotR, rd, rn, tmp, is64bit)
+ m.insert(alu)
+}
+
+func (m *machine) lowerRotr(si *ssa.Instruction) {
+ x, y := si.Arg2()
+
+ xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y)
+ rn := m.getOperand_NR(xDef, extModeNone)
+ rm := m.getOperand_NR(yDef, extModeNone)
+ rd := operandNR(m.compiler.VRegOf(si.Return()))
+
+ alu := m.allocateInstr()
+ alu.asALU(aluOpRotR, rd, rn, rm, si.Return().Type().Bits() == 64)
+ m.insert(alu)
+}
+
+func (m *machine) lowerExtend(arg, ret ssa.Value, from, to byte, signed bool) {
+ rd := m.compiler.VRegOf(ret)
+ def := m.compiler.ValueDefinition(arg)
+
+ if instr := def.Instr; !signed && from == 32 && instr != nil {
+ // We can optimize out the unsigned extend because:
+ // Writes to the W register set bits [63:32] of the X register to zero
+ // https://developer.arm.com/documentation/den0024/a/An-Introduction-to-the-ARMv8-Instruction-Sets/The-ARMv8-instruction-sets/Distinguishing-between-32-bit-and-64-bit-A64-instructions
+ switch instr.Opcode() {
+ case
+ ssa.OpcodeIadd, ssa.OpcodeIsub, ssa.OpcodeLoad,
+ ssa.OpcodeBand, ssa.OpcodeBor, ssa.OpcodeBnot,
+ ssa.OpcodeIshl, ssa.OpcodeUshr, ssa.OpcodeSshr,
+ ssa.OpcodeRotl, ssa.OpcodeRotr,
+ ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32:
+ // So, if the argument is the result of a 32-bit operation, we can just copy the register.
+ // It is highly likely that this copy will be optimized out after register allocation.
+ rn := m.compiler.VRegOf(arg)
+ mov := m.allocateInstr()
+			// Note: do not use move32, as it would be lowered to a 32-bit move, which is not a plain copy (that is actually how UExtend itself is implemented).
+ mov.asMove64(rd, rn)
+ m.insert(mov)
+ return
+ default:
+ }
+ }
+ rn := m.getOperand_NR(def, extModeNone)
+
+ ext := m.allocateInstr()
+ ext.asExtend(rd, rn.nr(), from, to, signed)
+ m.insert(ext)
+}
+
+func (m *machine) lowerFcmp(x, y, result ssa.Value, c ssa.FloatCmpCond) {
+ rn, rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone), m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
+
+ fc := m.allocateInstr()
+ fc.asFpuCmp(rn, rm, x.Type().Bits() == 64)
+ m.insert(fc)
+
+ cset := m.allocateInstr()
+ cset.asCSet(m.compiler.VRegOf(result), false, condFlagFromSSAFloatCmpCond(c))
+ m.insert(cset)
+}
+
+func (m *machine) lowerImul(x, y, result ssa.Value) {
+ rd := m.compiler.VRegOf(result)
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
+
+ // TODO: if this comes before Add/Sub, we could merge it by putting it into the place of xzrVReg.
+
+ mul := m.allocateInstr()
+ mul.asALURRRR(aluOpMAdd, operandNR(rd), rn, rm, operandNR(xzrVReg), x.Type().Bits() == 64)
+ m.insert(mul)
+}
+
+func (m *machine) lowerClz(x, result ssa.Value) {
+ rd := m.compiler.VRegOf(result)
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ clz := m.allocateInstr()
+ clz.asBitRR(bitOpClz, rd, rn.nr(), x.Type().Bits() == 64)
+ m.insert(clz)
+}
+
+func (m *machine) lowerCtz(x, result ssa.Value) {
+ rd := m.compiler.VRegOf(result)
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rbit := m.allocateInstr()
+ _64 := x.Type().Bits() == 64
+ var tmpReg regalloc.VReg
+ if _64 {
+ tmpReg = m.compiler.AllocateVReg(ssa.TypeI64)
+ } else {
+ tmpReg = m.compiler.AllocateVReg(ssa.TypeI32)
+ }
+ rbit.asBitRR(bitOpRbit, tmpReg, rn.nr(), _64)
+ m.insert(rbit)
+
+ clz := m.allocateInstr()
+ clz.asBitRR(bitOpClz, rd, tmpReg, _64)
+ m.insert(clz)
+}
+
+func (m *machine) lowerPopcnt(x, result ssa.Value) {
+	// arm64 doesn't have an instruction for population count on scalar registers,
+	// so we use the vector instruction `cnt`.
+	// This is exactly how the official Go compiler implements bits.OnesCount.
+	// For example, "func() int { return bits.OnesCount(10) }" is compiled as
+ //
+ // MOVD $10, R0 ;; Load 10.
+ // FMOVD R0, F0
+ // VCNT V0.B8, V0.B8
+ // UADDLV V0.B8, V0
+ //
+ // In aarch64 asm, FMOVD is encoded as `ins`, VCNT is `cnt`,
+ // and the registers may use different names. In our encoding we use the following
+ // instructions:
+ //
+ // ins v0.d[0], x0 ;; mov from GPR to vec (FMOV above) is encoded as INS
+ // cnt v0.16b, v0.16b ;; we use vec arrangement 16b
+ // uaddlv h0, v0.8b ;; h0 is still v0 with the dest width specifier 'H', implied when src arrangement is 8b
+ // mov x5, v0.d[0] ;; finally we mov the result back to a GPR
+ //
+
+ rd := operandNR(m.compiler.VRegOf(result))
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+
+ rf1 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
+ ins := m.allocateInstr()
+ ins.asMovToVec(rf1, rn, vecArrangementD, vecIndex(0))
+ m.insert(ins)
+
+ rf2 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
+ cnt := m.allocateInstr()
+ cnt.asVecMisc(vecOpCnt, rf2, rf1, vecArrangement16B)
+ m.insert(cnt)
+
+ rf3 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
+ uaddlv := m.allocateInstr()
+ uaddlv.asVecLanes(vecOpUaddlv, rf3, rf2, vecArrangement8B)
+ m.insert(uaddlv)
+
+ mov := m.allocateInstr()
+ mov.asMovFromVec(rd, rf3, vecArrangementD, vecIndex(0), false)
+ m.insert(mov)
+}
+
+// lowerExitWithCode lowers an exit-with-code sequence, taking the execution context pointer (execCtxVReg) as an argument.
+func (m *machine) lowerExitWithCode(execCtxVReg regalloc.VReg, code wazevoapi.ExitCode) {
+ tmpReg1 := m.compiler.AllocateVReg(ssa.TypeI32)
+ loadExitCodeConst := m.allocateInstr()
+ loadExitCodeConst.asMOVZ(tmpReg1, uint64(code), 0, true)
+
+ setExitCode := m.allocateInstr()
+ setExitCode.asStore(operandNR(tmpReg1),
+ addressMode{
+ kind: addressModeKindRegUnsignedImm12,
+ rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(),
+ }, 32)
+
+ // In order to unwind the stack, we also need to push the current stack pointer:
+ tmp2 := m.compiler.AllocateVReg(ssa.TypeI64)
+ movSpToTmp := m.allocateInstr()
+ movSpToTmp.asMove64(tmp2, spVReg)
+ strSpToExecCtx := m.allocateInstr()
+ strSpToExecCtx.asStore(operandNR(tmp2),
+ addressMode{
+ kind: addressModeKindRegUnsignedImm12,
+ rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(),
+ }, 64)
+ // Also the address of this exit.
+ tmp3 := m.compiler.AllocateVReg(ssa.TypeI64)
+ currentAddrToTmp := m.allocateInstr()
+ currentAddrToTmp.asAdr(tmp3, 0)
+ storeCurrentAddrToExecCtx := m.allocateInstr()
+ storeCurrentAddrToExecCtx.asStore(operandNR(tmp3),
+ addressMode{
+ kind: addressModeKindRegUnsignedImm12,
+ rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(),
+ }, 64)
+
+ exitSeq := m.allocateInstr()
+ exitSeq.asExitSequence(execCtxVReg)
+
+ m.insert(loadExitCodeConst)
+ m.insert(setExitCode)
+ m.insert(movSpToTmp)
+ m.insert(strSpToExecCtx)
+ m.insert(currentAddrToTmp)
+ m.insert(storeCurrentAddrToExecCtx)
+ m.insert(exitSeq)
+}
+
+func (m *machine) lowerIcmpToFlag(x, y ssa.Value, signed bool) {
+ if x.Type() != y.Type() {
+ panic(
+ fmt.Sprintf("TODO(maybe): support icmp with different types: v%d=%s != v%d=%s",
+ x.ID(), x.Type(), y.ID(), y.Type()))
+ }
+
+ extMod := extModeOf(x.Type(), signed)
+
+ // First operand must be in pure register form.
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extMod)
+ // Second operand can be in any of Imm12, ER, SR, or NR form supported by the SUBS instructions.
+ rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), extMod)
+
+ alu := m.allocateInstr()
+ // subs zr, rn, rm
+ alu.asALU(
+ aluOpSubS,
+ // We don't need the result, just need to set flags.
+ operandNR(xzrVReg),
+ rn,
+ rm,
+ x.Type().Bits() == 64,
+ )
+ m.insert(alu)
+}
+
+func (m *machine) lowerFcmpToFlag(x, y ssa.Value) {
+ if x.Type() != y.Type() {
+ panic("TODO(maybe): support icmp with different types")
+ }
+
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
+ cmp := m.allocateInstr()
+ cmp.asFpuCmp(rn, rm, x.Type().Bits() == 64)
+ m.insert(cmp)
+}
+
+func (m *machine) lowerExitIfTrueWithCode(execCtxVReg regalloc.VReg, cond ssa.Value, code wazevoapi.ExitCode) {
+ condDef := m.compiler.ValueDefinition(cond)
+ if !m.compiler.MatchInstr(condDef, ssa.OpcodeIcmp) {
+ panic("TODO: OpcodeExitIfTrueWithCode must come after Icmp at the moment: " + condDef.Instr.Opcode().String())
+ }
+ condDef.Instr.MarkLowered()
+
+ cvalInstr := condDef.Instr
+ x, y, c := cvalInstr.IcmpData()
+ signed := c.Signed()
+
+ if !m.tryLowerBandToFlag(x, y) {
+ m.lowerIcmpToFlag(x, y, signed)
+ }
+
+ // We need to copy the execution context to a temp register, because if it's spilled,
+ // it might end up being reloaded inside the exiting branch.
+ execCtxTmp := m.copyToTmp(execCtxVReg)
+
+ // We have to skip the entire exit sequence if the condition is false.
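+	// For example (illustrative), for an `icmp eq x, y` condition this ends up as roughly:
+	// subs; b.ne <after_exit>; <exit sequence>; after_exit: ...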
+ cbr := m.allocateInstr()
+ m.insert(cbr)
+ m.lowerExitWithCode(execCtxTmp, code)
+ // conditional branch target is after exit.
+ l := m.insertBrTargetLabel()
+ cbr.asCondBr(condFlagFromSSAIntegerCmpCond(c).invert().asCond(), l, false /* ignored */)
+}
+
+func (m *machine) lowerSelect(c, x, y, result ssa.Value) {
+ cvalDef := m.compiler.ValueDefinition(c)
+
+ var cc condFlag
+ switch {
+ case m.compiler.MatchInstr(cvalDef, ssa.OpcodeIcmp): // This case, we can use the ALU flag set by SUBS instruction.
+ cvalInstr := cvalDef.Instr
+ x, y, c := cvalInstr.IcmpData()
+ cc = condFlagFromSSAIntegerCmpCond(c)
+ m.lowerIcmpToFlag(x, y, c.Signed())
+ cvalDef.Instr.MarkLowered()
+ case m.compiler.MatchInstr(cvalDef, ssa.OpcodeFcmp): // This case we can use the Fpu flag directly.
+ cvalInstr := cvalDef.Instr
+ x, y, c := cvalInstr.FcmpData()
+ cc = condFlagFromSSAFloatCmpCond(c)
+ m.lowerFcmpToFlag(x, y)
+ cvalDef.Instr.MarkLowered()
+ default:
+ rn := m.getOperand_NR(cvalDef, extModeNone)
+ if c.Type() != ssa.TypeI32 && c.Type() != ssa.TypeI64 {
+ panic("TODO?BUG?: support select with non-integer condition")
+ }
+ alu := m.allocateInstr()
+ // subs zr, rn, zr
+ alu.asALU(
+ aluOpSubS,
+ // We don't need the result, just need to set flags.
+ operandNR(xzrVReg),
+ rn,
+ operandNR(xzrVReg),
+ c.Type().Bits() == 64,
+ )
+ m.insert(alu)
+ cc = ne
+ }
+
+ rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+ rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
+
+ rd := operandNR(m.compiler.VRegOf(result))
+ switch x.Type() {
+ case ssa.TypeI32, ssa.TypeI64:
+ // csel rd, rn, rm, cc
+ csel := m.allocateInstr()
+ csel.asCSel(rd, rn, rm, cc, x.Type().Bits() == 64)
+ m.insert(csel)
+ case ssa.TypeF32, ssa.TypeF64:
+ // fcsel rd, rn, rm, cc
+ fcsel := m.allocateInstr()
+ fcsel.asFpuCSel(rd, rn, rm, cc, x.Type().Bits() == 64)
+ m.insert(fcsel)
+ default:
+ panic("BUG")
+ }
+}
+
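+// lowerSelectVec lowers a vector select: the scalar condition `rc` is turned into an all-ones or
+// all-zeros 128-bit mask, which then picks between `rn` and `rm` via BSL.
+//
+// Illustrative sketch of the emitted sequence (register names are hypothetical):
+//
+//	subs  wzr, w_rc, wzr        ;; set flags from rc
+//	csetm x_tmp, ne             ;; all ones if rc != 0, zero otherwise
+//	dup   v_tmp.2d, x_tmp       ;; broadcast the mask to 128 bits
+//	bsl   v_tmp.16b, v_rn.16b, v_rm.16b
+//	mov   v_rd, v_tmp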
+func (m *machine) lowerSelectVec(rc, rn, rm, rd operand) {
+ // First check if `rc` is zero or not.
+ checkZero := m.allocateInstr()
+ checkZero.asALU(aluOpSubS, operandNR(xzrVReg), rc, operandNR(xzrVReg), false)
+ m.insert(checkZero)
+
+	// Then use CSETM to set all bits to one if `rc` is non-zero (condition `ne`), and to zero otherwise.
+ allOnesOrZero := m.compiler.AllocateVReg(ssa.TypeI64)
+ cset := m.allocateInstr()
+ cset.asCSet(allOnesOrZero, true, ne)
+ m.insert(cset)
+
+ // Then move the bits to the result vector register.
+ tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+ dup := m.allocateInstr()
+ dup.asVecDup(tmp2, operandNR(allOnesOrZero), vecArrangement2D)
+ m.insert(dup)
+
+ // Now that `tmp2` has either all bits one or zero depending on `rc`,
+ // we can use bsl to select between `rn` and `rm`.
+ ins := m.allocateInstr()
+ ins.asVecRRRRewrite(vecOpBsl, tmp2, rn, rm, vecArrangement16B)
+ m.insert(ins)
+
+ // Finally, move the result to the destination register.
+ mov2 := m.allocateInstr()
+ mov2.asFpuMov128(rd.nr(), tmp2.nr())
+ m.insert(mov2)
+}
+
+func (m *machine) lowerAtomicRmw(si *ssa.Instruction) {
+ ssaOp, size := si.AtomicRmwData()
+
+ var op atomicRmwOp
+ var negateArg bool
+ var flipArg bool
+ switch ssaOp {
+ case ssa.AtomicRmwOpAdd:
+ op = atomicRmwOpAdd
+ case ssa.AtomicRmwOpSub:
+ op = atomicRmwOpAdd
+ negateArg = true
+ case ssa.AtomicRmwOpAnd:
+ op = atomicRmwOpClr
+ flipArg = true
+ case ssa.AtomicRmwOpOr:
+ op = atomicRmwOpSet
+ case ssa.AtomicRmwOpXor:
+ op = atomicRmwOpEor
+ case ssa.AtomicRmwOpXchg:
+ op = atomicRmwOpSwp
+ default:
+ panic(fmt.Sprintf("unknown ssa atomic rmw op: %s", ssaOp))
+ }
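+	// For example (illustrative): an atomic Sub is emitted as the add form applied to the negated value, and an
+	// atomic And as the clear form applied to the bit-flipped value (the clear instruction clears the bits that
+	// are set in its operand, so clearing with ^val leaves mem & val).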
+
+ addr, val := si.Arg2()
+ addrDef, valDef := m.compiler.ValueDefinition(addr), m.compiler.ValueDefinition(val)
+ rn := m.getOperand_NR(addrDef, extModeNone)
+ rt := operandNR(m.compiler.VRegOf(si.Return()))
+ rs := m.getOperand_NR(valDef, extModeNone)
+
+ _64 := si.Return().Type().Bits() == 64
+ var tmp operand
+ if _64 {
+ tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
+ } else {
+ tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI32))
+ }
+ m.lowerAtomicRmwImpl(op, rn, rs, rt, tmp, size, negateArg, flipArg, _64)
+}
+
+func (m *machine) lowerAtomicRmwImpl(op atomicRmwOp, rn, rs, rt, tmp operand, size uint64, negateArg, flipArg, dst64bit bool) {
+ switch {
+ case negateArg:
+ neg := m.allocateInstr()
+ neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), rs, dst64bit)
+ m.insert(neg)
+ case flipArg:
+ flip := m.allocateInstr()
+ flip.asALU(aluOpOrn, tmp, operandNR(xzrVReg), rs, dst64bit)
+ m.insert(flip)
+ default:
+ tmp = rs
+ }
+
+ rmw := m.allocateInstr()
+ rmw.asAtomicRmw(op, rn, tmp, rt, size)
+ m.insert(rmw)
+}
+
+func (m *machine) lowerAtomicCas(si *ssa.Instruction) {
+ addr, exp, repl := si.Arg3()
+ size := si.AtomicTargetSize()
+
+ addrDef, expDef, replDef := m.compiler.ValueDefinition(addr), m.compiler.ValueDefinition(exp), m.compiler.ValueDefinition(repl)
+ rn := m.getOperand_NR(addrDef, extModeNone)
+ rt := m.getOperand_NR(replDef, extModeNone)
+ rs := m.getOperand_NR(expDef, extModeNone)
+ tmp := operandNR(m.compiler.AllocateVReg(si.Return().Type()))
+
+ _64 := si.Return().Type().Bits() == 64
+	// rs is overwritten by CAS, so we copy it into a temporary register before the instruction,
+	// in case it is used somewhere else.
+ mov := m.allocateInstr()
+ if _64 {
+ mov.asMove64(tmp.nr(), rs.nr())
+ } else {
+ mov.asMove32(tmp.nr(), rs.nr())
+ }
+ m.insert(mov)
+
+ m.lowerAtomicCasImpl(rn, tmp, rt, size)
+
+ mov2 := m.allocateInstr()
+ rd := m.compiler.VRegOf(si.Return())
+ if _64 {
+ mov2.asMove64(rd, tmp.nr())
+ } else {
+ mov2.asMove32(rd, tmp.nr())
+ }
+ m.insert(mov2)
+}
+
+func (m *machine) lowerAtomicCasImpl(rn, rs, rt operand, size uint64) {
+ cas := m.allocateInstr()
+ cas.asAtomicCas(rn, rs, rt, size)
+ m.insert(cas)
+}
+
+func (m *machine) lowerAtomicLoad(si *ssa.Instruction) {
+ addr := si.Arg()
+ size := si.AtomicTargetSize()
+
+ addrDef := m.compiler.ValueDefinition(addr)
+ rn := m.getOperand_NR(addrDef, extModeNone)
+ rt := operandNR(m.compiler.VRegOf(si.Return()))
+
+ m.lowerAtomicLoadImpl(rn, rt, size)
+}
+
+func (m *machine) lowerAtomicLoadImpl(rn, rt operand, size uint64) {
+ ld := m.allocateInstr()
+ ld.asAtomicLoad(rn, rt, size)
+ m.insert(ld)
+}
+
+func (m *machine) lowerAtomicStore(si *ssa.Instruction) {
+ addr, val := si.Arg2()
+ size := si.AtomicTargetSize()
+
+ addrDef := m.compiler.ValueDefinition(addr)
+ valDef := m.compiler.ValueDefinition(val)
+ rn := m.getOperand_NR(addrDef, extModeNone)
+ rt := m.getOperand_NR(valDef, extModeNone)
+
+ m.lowerAtomicStoreImpl(rn, rt, size)
+}
+
+func (m *machine) lowerAtomicStoreImpl(rn, rt operand, size uint64) {
+ ld := m.allocateInstr()
+ ld.asAtomicStore(rn, rt, size)
+ m.insert(ld)
+}
+
+// copyToTmp copies the given regalloc.VReg into a temporary register. This is called before cbr to avoid
+// register-allocation issues, e.g. a reload happening in the middle of the exit sequence, which is not on the
+// path that normal execution takes.
+func (m *machine) copyToTmp(v regalloc.VReg) regalloc.VReg {
+ typ := m.compiler.TypeOf(v)
+ mov := m.allocateInstr()
+ tmp := m.compiler.AllocateVReg(typ)
+ if typ.IsInt() {
+ mov.asMove64(tmp, v)
+ } else {
+ mov.asFpuMov128(tmp, v)
+ }
+ m.insert(mov)
+ return tmp
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr_operands.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr_operands.go
new file mode 100644
index 000000000..d9fbf1789
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr_operands.go
@@ -0,0 +1,350 @@
+package arm64
+
+// This file contains the logic to "find and determine operands" for instructions.
+// In order to finalize the form of an operand, we might end up merging/eliminating
+// the source instructions into an operand whenever possible.
+
+import (
+ "fmt"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+)
+
+type (
+ // operand represents an operand of an instruction whose type is determined by the kind.
+ operand struct {
+ kind operandKind
+ data, data2 uint64
+ }
+ operandKind byte
+)
+
+// Here's the list of operand kinds. We use the abbreviation of the kind name not only for these consts,
+// but also names of functions which return the operand of the kind.
+const (
+ // operandKindNR represents "NormalRegister" (NR). This is literally the register without any special operation unlike others.
+ operandKindNR operandKind = iota
+ // operandKindSR represents "Shifted Register" (SR). This is a register which is shifted by a constant.
+ // Some of the arm64 instructions can take this kind of operand.
+ operandKindSR
+	// operandKindER represents "Extended Register" (ER). This is a register which is sign/zero-extended to a larger size.
+ // Some of the arm64 instructions can take this kind of operand.
+ operandKindER
+ // operandKindImm12 represents "Immediate 12" (Imm12). This is a 12-bit immediate value which can be either shifted by 12 or not.
+ // See asImm12 function for detail.
+ operandKindImm12
+ // operandKindShiftImm represents "Shifted Immediate" (ShiftImm) used by shift operations.
+ operandKindShiftImm
+)
+
+// format returns a human-readable representation of the operand for debugging.
+func (o operand) format(size byte) string {
+ switch o.kind {
+ case operandKindNR:
+ return formatVRegSized(o.nr(), size)
+ case operandKindSR:
+ r, amt, sop := o.sr()
+ return fmt.Sprintf("%s, %s #%d", formatVRegSized(r, size), sop, amt)
+ case operandKindER:
+ r, eop, _ := o.er()
+ return fmt.Sprintf("%s %s", formatVRegSized(r, size), eop)
+ case operandKindImm12:
+ imm12, shiftBit := o.imm12()
+ if shiftBit == 1 {
+ return fmt.Sprintf("#%#x", uint64(imm12)<<12)
+ } else {
+ return fmt.Sprintf("#%#x", imm12)
+ }
+ default:
+ panic(fmt.Sprintf("unknown operand kind: %d", o.kind))
+ }
+}
+
+// operandNR encodes the given VReg as an operand of operandKindNR.
+func operandNR(r regalloc.VReg) operand {
+ return operand{kind: operandKindNR, data: uint64(r)}
+}
+
+// nr decodes the underlying VReg assuming the operand is of operandKindNR.
+func (o operand) nr() regalloc.VReg {
+ return regalloc.VReg(o.data)
+}
+
+// operandER encodes the given VReg as an operand of operandKindER.
+func operandER(r regalloc.VReg, eop extendOp, to byte) operand {
+ if to < 32 {
+ panic("TODO?BUG?: when we need to extend to less than 32 bits?")
+ }
+ return operand{kind: operandKindER, data: uint64(r), data2: uint64(eop)<<32 | uint64(to)}
+}
+
+// er decodes the underlying VReg, extend operation, and the target size assuming the operand is of operandKindER.
+func (o operand) er() (r regalloc.VReg, eop extendOp, to byte) {
+ return regalloc.VReg(o.data), extendOp(o.data2>>32) & 0xff, byte(o.data2 & 0xff)
+}
+
+// operandSR encodes the given VReg as an operand of operandKindSR.
+func operandSR(r regalloc.VReg, amt byte, sop shiftOp) operand {
+ return operand{kind: operandKindSR, data: uint64(r), data2: uint64(amt)<<32 | uint64(sop)}
+}
+
+// sr decodes the underlying VReg, shift amount, and shift operation assuming the operand is of operandKindSR.
+func (o operand) sr() (r regalloc.VReg, amt byte, sop shiftOp) {
+ return regalloc.VReg(o.data), byte(o.data2>>32) & 0xff, shiftOp(o.data2) & 0xff
+}
+
+// operandImm12 encodes the given imm12 as an operand of operandKindImm12.
+func operandImm12(imm12 uint16, shiftBit byte) operand {
+ return operand{kind: operandKindImm12, data: uint64(imm12) | uint64(shiftBit)<<32}
+}
+
+// imm12 decodes the underlying imm12 data assuming the operand is of operandKindImm12.
+func (o operand) imm12() (v uint16, shiftBit byte) {
+ return uint16(o.data), byte(o.data >> 32)
+}
+
+// operandShiftImm encodes the given amount as an operand of operandKindShiftImm.
+func operandShiftImm(amount byte) operand {
+ return operand{kind: operandKindShiftImm, data: uint64(amount)}
+}
+
+// shiftImm decodes the underlying shift amount data assuming the operand is of operandKindShiftImm.
+func (o operand) shiftImm() byte {
+ return byte(o.data)
+}
+
+// reg returns the register of the operand if applicable.
+func (o operand) reg() regalloc.VReg {
+ switch o.kind {
+ case operandKindNR:
+ return o.nr()
+ case operandKindSR:
+ r, _, _ := o.sr()
+ return r
+ case operandKindER:
+ r, _, _ := o.er()
+ return r
+ case operandKindImm12:
+ // Does not have a register.
+ case operandKindShiftImm:
+ // Does not have a register.
+ default:
+ panic(o.kind)
+ }
+ return regalloc.VRegInvalid
+}
+
+func (o operand) realReg() regalloc.RealReg {
+ return o.nr().RealReg()
+}
+
+func (o operand) assignReg(v regalloc.VReg) operand {
+ switch o.kind {
+ case operandKindNR:
+ return operandNR(v)
+ case operandKindSR:
+ _, amt, sop := o.sr()
+ return operandSR(v, amt, sop)
+ case operandKindER:
+ _, eop, to := o.er()
+ return operandER(v, eop, to)
+ case operandKindImm12:
+ // Does not have a register.
+ case operandKindShiftImm:
+ // Does not have a register.
+ }
+ panic(o.kind)
+}
+
+// getOperand_Imm12_ER_SR_NR returns an operand of either operandKindImm12, operandKindER, operandKindSR, or operandKindNR from the given value (defined by `def`).
+//
+// `mode` is used to extend the operand if the bit length is smaller than mode.bits().
+// If the operand can be expressed as operandKindImm12, `mode` is ignored.
+func (m *machine) getOperand_Imm12_ER_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) {
+ if def.IsFromBlockParam() {
+ return operandNR(def.BlkParamVReg)
+ }
+
+ instr := def.Instr
+ if instr.Opcode() == ssa.OpcodeIconst {
+ if imm12Op, ok := asImm12Operand(instr.ConstantVal()); ok {
+ instr.MarkLowered()
+ return imm12Op
+ }
+ }
+ return m.getOperand_ER_SR_NR(def, mode)
+}
+
+// getOperand_MaybeNegatedImm12_ER_SR_NR is almost the same as getOperand_Imm12_ER_SR_NR, but this might negate the immediate value.
+// If the immediate value is negated, the second return value is true, otherwise always false.
+func (m *machine) getOperand_MaybeNegatedImm12_ER_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand, negatedImm12 bool) {
+ if def.IsFromBlockParam() {
+ return operandNR(def.BlkParamVReg), false
+ }
+
+ instr := def.Instr
+ if instr.Opcode() == ssa.OpcodeIconst {
+ c := instr.ConstantVal()
+ if imm12Op, ok := asImm12Operand(c); ok {
+ instr.MarkLowered()
+ return imm12Op, false
+ }
+
+ signExtended := int64(c)
+ if def.SSAValue().Type().Bits() == 32 {
+ signExtended = (signExtended << 32) >> 32
+ }
+ negatedWithoutSign := -signExtended
+ if imm12Op, ok := asImm12Operand(uint64(negatedWithoutSign)); ok {
+ instr.MarkLowered()
+ return imm12Op, true
+ }
+ }
+ return m.getOperand_ER_SR_NR(def, mode), false
+}
+
+// getOperand_ER_SR_NR returns an operand of either operandKindER, operandKindSR, or operandKindNR from the given value (defined by `def`).
+//
+// `mode` is used to extend the operand if the bit length is smaller than mode.bits().
+func (m *machine) getOperand_ER_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) {
+ if def.IsFromBlockParam() {
+ return operandNR(def.BlkParamVReg)
+ }
+
+ if m.compiler.MatchInstr(def, ssa.OpcodeSExtend) || m.compiler.MatchInstr(def, ssa.OpcodeUExtend) {
+ extInstr := def.Instr
+
+ signed := extInstr.Opcode() == ssa.OpcodeSExtend
+ innerExtFromBits, innerExtToBits := extInstr.ExtendFromToBits()
+ modeBits, modeSigned := mode.bits(), mode.signed()
+ if mode == extModeNone || innerExtToBits == modeBits {
+ eop := extendOpFrom(signed, innerExtFromBits)
+ extArg := m.getOperand_NR(m.compiler.ValueDefinition(extInstr.Arg()), extModeNone)
+ op = operandER(extArg.nr(), eop, innerExtToBits)
+ extInstr.MarkLowered()
+ return
+ }
+
+ if innerExtToBits > modeBits {
+ panic("BUG?TODO?: need the results of inner extension to be larger than the mode")
+ }
+
+ switch {
+ case (!signed && !modeSigned) || (signed && modeSigned):
+ // Two sign/zero extensions are equivalent to one sign/zero extension for the larger size.
+ eop := extendOpFrom(modeSigned, innerExtFromBits)
+ op = operandER(m.compiler.VRegOf(extInstr.Arg()), eop, modeBits)
+ extInstr.MarkLowered()
+ case (signed && !modeSigned) || (!signed && modeSigned):
+ // We need to {sign, zero}-extend the result of the {zero,sign} extension.
+ eop := extendOpFrom(modeSigned, innerExtToBits)
+ op = operandER(m.compiler.VRegOf(extInstr.Return()), eop, modeBits)
+			// Note that we fail to merge the inner extension instruction in this case.
+ }
+ return
+ }
+ return m.getOperand_SR_NR(def, mode)
+}
+
+// getOperand_SR_NR returns an operand of either operandKindSR or operandKindNR from the given value (defined by `def`).
+//
+// `mode` is used to extend the operand if the bit length is smaller than mode.bits().
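+//
+// For example (illustrative): when the value is defined by `ishl y, 3` with a constant shift amount,
+// the shift is folded into a shifted-register operand, so a user such as an integer add can be selected
+// as a single `add xd, xn, xy, lsl #3`.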
+func (m *machine) getOperand_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) {
+ if def.IsFromBlockParam() {
+ return operandNR(def.BlkParamVReg)
+ }
+
+ if m.compiler.MatchInstr(def, ssa.OpcodeIshl) {
+		// Check if the shift amount is defined by a constant instruction.
+ targetVal, amountVal := def.Instr.Arg2()
+ targetVReg := m.getOperand_NR(m.compiler.ValueDefinition(targetVal), extModeNone).nr()
+ amountDef := m.compiler.ValueDefinition(amountVal)
+ if amountDef.IsFromInstr() && amountDef.Instr.Constant() {
+ // If that is the case, we can use the shifted register operand (SR).
+ c := byte(amountDef.Instr.ConstantVal()) & (targetVal.Type().Bits() - 1) // Clears the unnecessary bits.
+ def.Instr.MarkLowered()
+ amountDef.Instr.MarkLowered()
+ return operandSR(targetVReg, c, shiftOpLSL)
+ }
+ }
+ return m.getOperand_NR(def, mode)
+}
+
+// getOperand_ShiftImm_NR returns an operand of either operandKindShiftImm or operandKindNR from the given value (defined by `def`).
+func (m *machine) getOperand_ShiftImm_NR(def *backend.SSAValueDefinition, mode extMode, shiftBitWidth byte) (op operand) {
+ if def.IsFromBlockParam() {
+ return operandNR(def.BlkParamVReg)
+ }
+
+ instr := def.Instr
+ if instr.Constant() {
+ amount := byte(instr.ConstantVal()) & (shiftBitWidth - 1) // Clears the unnecessary bits.
+ return operandShiftImm(amount)
+ }
+ return m.getOperand_NR(def, mode)
+}
+
+// getOperand_NR returns an operand of operandKindNR from the given value (defined by `def`).
+//
+// `mode` is used to extend the operand if the bit length is smaller than mode.bits().
+func (m *machine) getOperand_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) {
+ var v regalloc.VReg
+ if def.IsFromBlockParam() {
+ v = def.BlkParamVReg
+ } else {
+ instr := def.Instr
+ if instr.Constant() {
+			// We inline all the constant instructions so that we can reduce register usage.
+ v = m.lowerConstant(instr)
+ instr.MarkLowered()
+ } else {
+ if n := def.N; n == 0 {
+ v = m.compiler.VRegOf(instr.Return())
+ } else {
+ _, rs := instr.Returns()
+ v = m.compiler.VRegOf(rs[n-1])
+ }
+ }
+ }
+
+ r := v
+ switch inBits := def.SSAValue().Type().Bits(); {
+ case mode == extModeNone:
+ case inBits == 32 && (mode == extModeZeroExtend32 || mode == extModeSignExtend32):
+ case inBits == 32 && mode == extModeZeroExtend64:
+ extended := m.compiler.AllocateVReg(ssa.TypeI64)
+ ext := m.allocateInstr()
+ ext.asExtend(extended, v, 32, 64, false)
+ m.insert(ext)
+ r = extended
+ case inBits == 32 && mode == extModeSignExtend64:
+ extended := m.compiler.AllocateVReg(ssa.TypeI64)
+ ext := m.allocateInstr()
+ ext.asExtend(extended, v, 32, 64, true)
+ m.insert(ext)
+ r = extended
+ case inBits == 64 && (mode == extModeZeroExtend64 || mode == extModeSignExtend64):
+ }
+ return operandNR(r)
+}
+
+func asImm12Operand(val uint64) (op operand, ok bool) {
+ v, shiftBit, ok := asImm12(val)
+ if !ok {
+ return operand{}, false
+ }
+ return operandImm12(v, shiftBit), true
+}
+
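+// asImm12 reports whether val can be encoded as an arm64 12-bit immediate, optionally shifted left by 12.
+// Illustrative examples:
+//
+//	asImm12(0xfff)    // => (0xfff, 0, true)
+//	asImm12(0xabc000) // => (0xabc, 1, true)
+//	asImm12(0x1001)   // => (0, 0, false): the value needs bits from both the low and the shifted range.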
+func asImm12(val uint64) (v uint16, shiftBit byte, ok bool) {
+ const mask1, mask2 uint64 = 0xfff, 0xfff_000
+ if val&^mask1 == 0 {
+ return uint16(val), 0, true
+ } else if val&^mask2 == 0 {
+ return uint16(val >> 12), 1, true
+ } else {
+ return 0, 0, false
+ }
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go
new file mode 100644
index 000000000..4842eaa38
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go
@@ -0,0 +1,440 @@
+package arm64
+
+import (
+ "fmt"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
+)
+
+type (
+ // addressMode represents an ARM64 addressing mode.
+ //
+ // https://developer.arm.com/documentation/102374/0101/Loads-and-stores---addressing
+ // TODO: use the bit-packed layout like operand struct.
+ addressMode struct {
+ kind addressModeKind
+ rn, rm regalloc.VReg
+ extOp extendOp
+ imm int64
+ }
+
+ // addressModeKind represents the kind of ARM64 addressing mode.
+ addressModeKind byte
+)
+
+const (
+ // addressModeKindRegExtended takes a base register and an index register. The index register is sign/zero-extended,
+ // and then scaled by bits(type)/8.
+ //
+ // e.g.
+ // - ldrh w1, [x2, w3, SXTW #1] ;; sign-extended and scaled by 2 (== LSL #1)
+ // - strh w1, [x2, w3, UXTW #1] ;; zero-extended and scaled by 2 (== LSL #1)
+ // - ldr w1, [x2, w3, SXTW #2] ;; sign-extended and scaled by 4 (== LSL #2)
+ // - str x1, [x2, w3, UXTW #3] ;; zero-extended and scaled by 8 (== LSL #3)
+ //
+ // See the following pages:
+ // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--register---Load-Register-Halfword--register--
+ // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--register---Load-Register--register--
+ addressModeKindRegScaledExtended addressModeKind = iota
+
+ // addressModeKindRegScaled is the same as addressModeKindRegScaledExtended, but without extension factor.
+ addressModeKindRegScaled
+
+	// addressModeKindRegExtended is the same as addressModeKindRegScaledExtended, but without the scale factor.
+ addressModeKindRegExtended
+
+	// addressModeKindRegReg takes a base register and an index register. The index register is neither scaled nor extended.
+ addressModeKindRegReg
+
+ // addressModeKindRegSignedImm9 takes a base register and a 9-bit "signed" immediate offset (-256 to 255).
+ // The immediate will be sign-extended, and be added to the base register.
+ // This is a.k.a. "unscaled" since the immediate is not scaled.
+ // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDUR--Load-Register--unscaled--
+ addressModeKindRegSignedImm9
+
+ // addressModeKindRegUnsignedImm12 takes a base register and a 12-bit "unsigned" immediate offset. scaled by
+ // the size of the type. In other words, the actual offset will be imm12 * bits(type)/8.
+ // See "Unsigned offset" in the following pages:
+ // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate--
+ // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate--
+ // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate--
+ addressModeKindRegUnsignedImm12
+
+	// addressModeKindPostIndex takes a base register and a 9-bit "signed" immediate offset.
+ // After the load/store, the base register will be updated by the offset.
+ //
+ // Note that when this is used for pair load/store, the offset will be 7-bit "signed" immediate offset.
+ //
+ // See "Post-index" in the following pages for examples:
+ // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate--
+ // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate--
+ // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate--
+ // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers-
+ addressModeKindPostIndex
+
+	// addressModeKindPreIndex takes a base register and a 9-bit "signed" immediate offset.
+ // Before the load/store, the base register will be updated by the offset.
+ //
+ // Note that when this is used for pair load/store, the offset will be 7-bit "signed" immediate offset.
+ //
+ // See "Pre-index" in the following pages for examples:
+ // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate--
+ // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate--
+ // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate--
+ // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers-
+ addressModeKindPreIndex
+
+	// addressModeKindArgStackSpace is used to resolve the address of the argument stack space
+	// existing right above the stack pointer. Since we don't know the exact stack space needed for a function
+	// at this phase of compilation, this is used as a placeholder and later lowered to a real addressing mode like the ones above.
+ addressModeKindArgStackSpace
+
+	// addressModeKindResultStackSpace is used to resolve the address of the result stack space
+	// existing right above the stack pointer. Since we don't know the exact stack space needed for a function
+	// at this phase of compilation, this is used as a placeholder and later lowered to a real addressing mode like the ones above.
+ addressModeKindResultStackSpace
+)
+
+func (a addressMode) format(dstSizeBits byte) (ret string) {
+ base := formatVRegSized(a.rn, 64)
+ if rn := a.rn; rn.RegType() != regalloc.RegTypeInt {
+ panic("invalid base register type: " + a.rn.RegType().String())
+ } else if rn.IsRealReg() && v0 <= a.rn.RealReg() && a.rn.RealReg() <= v30 {
+ panic("BUG: likely a bug in reg alloc or reset behavior")
+ }
+
+ switch a.kind {
+ case addressModeKindRegScaledExtended:
+ amount := a.sizeInBitsToShiftAmount(dstSizeBits)
+ ret = fmt.Sprintf("[%s, %s, %s #%#x]", base, formatVRegSized(a.rm, a.indexRegBits()), a.extOp, amount)
+ case addressModeKindRegScaled:
+ amount := a.sizeInBitsToShiftAmount(dstSizeBits)
+ ret = fmt.Sprintf("[%s, %s, lsl #%#x]", base, formatVRegSized(a.rm, a.indexRegBits()), amount)
+ case addressModeKindRegExtended:
+ ret = fmt.Sprintf("[%s, %s, %s]", base, formatVRegSized(a.rm, a.indexRegBits()), a.extOp)
+ case addressModeKindRegReg:
+ ret = fmt.Sprintf("[%s, %s]", base, formatVRegSized(a.rm, a.indexRegBits()))
+ case addressModeKindRegSignedImm9:
+ if a.imm != 0 {
+ ret = fmt.Sprintf("[%s, #%#x]", base, a.imm)
+ } else {
+ ret = fmt.Sprintf("[%s]", base)
+ }
+ case addressModeKindRegUnsignedImm12:
+ if a.imm != 0 {
+ ret = fmt.Sprintf("[%s, #%#x]", base, a.imm)
+ } else {
+ ret = fmt.Sprintf("[%s]", base)
+ }
+ case addressModeKindPostIndex:
+ ret = fmt.Sprintf("[%s], #%#x", base, a.imm)
+ case addressModeKindPreIndex:
+ ret = fmt.Sprintf("[%s, #%#x]!", base, a.imm)
+ case addressModeKindArgStackSpace:
+ ret = fmt.Sprintf("[#arg_space, #%#x]", a.imm)
+ case addressModeKindResultStackSpace:
+ ret = fmt.Sprintf("[#ret_space, #%#x]", a.imm)
+ }
+ return
+}
+
+func addressModePreOrPostIndex(rn regalloc.VReg, imm int64, preIndex bool) addressMode {
+ if !offsetFitsInAddressModeKindRegSignedImm9(imm) {
+ panic(fmt.Sprintf("BUG: offset %#x does not fit in addressModeKindRegSignedImm9", imm))
+ }
+ if preIndex {
+ return addressMode{kind: addressModeKindPreIndex, rn: rn, imm: imm}
+ } else {
+ return addressMode{kind: addressModeKindPostIndex, rn: rn, imm: imm}
+ }
+}
+
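+// offsetFitsInAddressModeKindRegUnsignedImm12 reports whether offset can be encoded as the scaled,
+// unsigned 12-bit immediate for an access of dstSizeInBits. For a 64-bit access, for example, the
+// encodable offsets are the positive multiples of 8 up to 8*4095 = 32760.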
+func offsetFitsInAddressModeKindRegUnsignedImm12(dstSizeInBits byte, offset int64) bool {
+ divisor := int64(dstSizeInBits) / 8
+ return 0 < offset && offset%divisor == 0 && offset/divisor < 4096
+}
+
+func offsetFitsInAddressModeKindRegSignedImm9(offset int64) bool {
+ return -256 <= offset && offset <= 255
+}
+
+func (a addressMode) indexRegBits() byte {
+ bits := a.extOp.srcBits()
+ if bits != 32 && bits != 64 {
+ panic("invalid index register for address mode. it must be either 32 or 64 bits")
+ }
+ return bits
+}
+
+func (a addressMode) sizeInBitsToShiftAmount(sizeInBits byte) (lsl byte) {
+ switch sizeInBits {
+ case 8:
+ lsl = 0
+ case 16:
+ lsl = 1
+ case 32:
+ lsl = 2
+ case 64:
+ lsl = 3
+ }
+ return
+}
+
+func extLoadSignSize(op ssa.Opcode) (size byte, signed bool) {
+ switch op {
+ case ssa.OpcodeUload8:
+ size, signed = 8, false
+ case ssa.OpcodeUload16:
+ size, signed = 16, false
+ case ssa.OpcodeUload32:
+ size, signed = 32, false
+ case ssa.OpcodeSload8:
+ size, signed = 8, true
+ case ssa.OpcodeSload16:
+ size, signed = 16, true
+ case ssa.OpcodeSload32:
+ size, signed = 32, true
+ default:
+ panic("BUG")
+ }
+ return
+}
+
+func (m *machine) lowerExtLoad(op ssa.Opcode, ptr ssa.Value, offset uint32, ret regalloc.VReg) {
+ size, signed := extLoadSignSize(op)
+ amode := m.lowerToAddressMode(ptr, offset, size)
+ load := m.allocateInstr()
+ if signed {
+ load.asSLoad(operandNR(ret), amode, size)
+ } else {
+ load.asULoad(operandNR(ret), amode, size)
+ }
+ m.insert(load)
+}
+
+func (m *machine) lowerLoad(ptr ssa.Value, offset uint32, typ ssa.Type, ret ssa.Value) {
+ amode := m.lowerToAddressMode(ptr, offset, typ.Bits())
+
+ dst := m.compiler.VRegOf(ret)
+ load := m.allocateInstr()
+ switch typ {
+ case ssa.TypeI32, ssa.TypeI64:
+ load.asULoad(operandNR(dst), amode, typ.Bits())
+ case ssa.TypeF32, ssa.TypeF64:
+ load.asFpuLoad(operandNR(dst), amode, typ.Bits())
+ case ssa.TypeV128:
+ load.asFpuLoad(operandNR(dst), amode, 128)
+ default:
+ panic("TODO")
+ }
+ m.insert(load)
+}
+
+func (m *machine) lowerLoadSplat(ptr ssa.Value, offset uint32, lane ssa.VecLane, ret ssa.Value) {
+	// vecLoad1R supports a base+imm addressing mode only as post-index, so we simply add the offset to the base register up front.
+ base := m.getOperand_NR(m.compiler.ValueDefinition(ptr), extModeNone).nr()
+ offsetReg := m.compiler.AllocateVReg(ssa.TypeI64)
+ m.lowerConstantI64(offsetReg, int64(offset))
+ addedBase := m.addReg64ToReg64(base, offsetReg)
+
+ rd := operandNR(m.compiler.VRegOf(ret))
+
+ ld1r := m.allocateInstr()
+ ld1r.asVecLoad1R(rd, operandNR(addedBase), ssaLaneToArrangement(lane))
+ m.insert(ld1r)
+}
+
+func (m *machine) lowerStore(si *ssa.Instruction) {
+ // TODO: merge consecutive stores into a single pair store instruction.
+ value, ptr, offset, storeSizeInBits := si.StoreData()
+ amode := m.lowerToAddressMode(ptr, offset, storeSizeInBits)
+
+ valueOp := m.getOperand_NR(m.compiler.ValueDefinition(value), extModeNone)
+ store := m.allocateInstr()
+ store.asStore(valueOp, amode, storeSizeInBits)
+ m.insert(store)
+}
+
+// lowerToAddressMode converts a pointer to an addressMode that can be used as an operand for load/store instructions.
+func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32, size byte) (amode addressMode) {
+ // TODO: currently the instruction selection logic doesn't support addressModeKindRegScaledExtended and
+ // addressModeKindRegScaled since collectAddends doesn't take ssa.OpcodeIshl into account. This should be fixed
+ // to support more efficient address resolution.
+
+ a32s, a64s, offset := m.collectAddends(ptr)
+ offset += int64(offsetBase)
+ return m.lowerToAddressModeFromAddends(a32s, a64s, size, offset)
+}
+
+// lowerToAddressModeFromAddends creates an addressMode from a list of addends collected by collectAddends.
+// During the construction, this might emit additional instructions.
+//
+// Extracted as a separate function for easy testing.
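+//
+// For example (illustrative): a single 64-bit addend with an offset of 16 for an 8-byte access yields the
+// unsigned-imm12 form `[x_base, #0x10]`; a 64-bit addend plus a 32-bit addend yields the extended-register
+// form `[x_base, w_idx, uxtw]` (or sxtw); any remaining addends and leftover offset are folded into the
+// base register with explicit adds.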
+func (m *machine) lowerToAddressModeFromAddends(a32s *wazevoapi.Queue[addend32], a64s *wazevoapi.Queue[regalloc.VReg], size byte, offset int64) (amode addressMode) {
+ switch a64sExist, a32sExist := !a64s.Empty(), !a32s.Empty(); {
+ case a64sExist && a32sExist:
+ var base regalloc.VReg
+ base = a64s.Dequeue()
+ var a32 addend32
+ a32 = a32s.Dequeue()
+ amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: a32.r, extOp: a32.ext}
+ case a64sExist && offsetFitsInAddressModeKindRegUnsignedImm12(size, offset):
+ var base regalloc.VReg
+ base = a64s.Dequeue()
+ amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: offset}
+ offset = 0
+ case a64sExist && offsetFitsInAddressModeKindRegSignedImm9(offset):
+ var base regalloc.VReg
+ base = a64s.Dequeue()
+ amode = addressMode{kind: addressModeKindRegSignedImm9, rn: base, imm: offset}
+ offset = 0
+ case a64sExist:
+ var base regalloc.VReg
+ base = a64s.Dequeue()
+ if !a64s.Empty() {
+ index := a64s.Dequeue()
+ amode = addressMode{kind: addressModeKindRegReg, rn: base, rm: index, extOp: extendOpUXTX /* indicates index reg is 64-bit */}
+ } else {
+ amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0}
+ }
+ case a32sExist:
+ base32 := a32s.Dequeue()
+
+ // First we need 64-bit base.
+ base := m.compiler.AllocateVReg(ssa.TypeI64)
+ baseExt := m.allocateInstr()
+ var signed bool
+ if base32.ext == extendOpSXTW {
+ signed = true
+ }
+ baseExt.asExtend(base, base32.r, 32, 64, signed)
+ m.insert(baseExt)
+
+ if !a32s.Empty() {
+ index := a32s.Dequeue()
+ amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: index.r, extOp: index.ext}
+ } else {
+ amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0}
+ }
+ default: // Only static offsets.
+ tmpReg := m.compiler.AllocateVReg(ssa.TypeI64)
+ m.lowerConstantI64(tmpReg, offset)
+ amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpReg, imm: 0}
+ offset = 0
+ }
+
+ baseReg := amode.rn
+ if offset > 0 {
+ baseReg = m.addConstToReg64(baseReg, offset) // baseReg += offset
+ }
+
+ for !a64s.Empty() {
+ a64 := a64s.Dequeue()
+ baseReg = m.addReg64ToReg64(baseReg, a64) // baseReg += a64
+ }
+
+ for !a32s.Empty() {
+ a32 := a32s.Dequeue()
+ baseReg = m.addRegToReg64Ext(baseReg, a32.r, a32.ext) // baseReg += (a32 extended to 64-bit)
+ }
+ amode.rn = baseReg
+ return
+}
+
+var addendsMatchOpcodes = [4]ssa.Opcode{ssa.OpcodeUExtend, ssa.OpcodeSExtend, ssa.OpcodeIadd, ssa.OpcodeIconst}
+
+func (m *machine) collectAddends(ptr ssa.Value) (addends32 *wazevoapi.Queue[addend32], addends64 *wazevoapi.Queue[regalloc.VReg], offset int64) {
+ m.addendsWorkQueue.Reset()
+ m.addends32.Reset()
+ m.addends64.Reset()
+ m.addendsWorkQueue.Enqueue(ptr)
+
+ for !m.addendsWorkQueue.Empty() {
+ v := m.addendsWorkQueue.Dequeue()
+
+ def := m.compiler.ValueDefinition(v)
+ switch op := m.compiler.MatchInstrOneOf(def, addendsMatchOpcodes[:]); op {
+ case ssa.OpcodeIadd:
+ // If the addend is an add, we recursively collect its operands.
+ x, y := def.Instr.Arg2()
+ m.addendsWorkQueue.Enqueue(x)
+ m.addendsWorkQueue.Enqueue(y)
+ def.Instr.MarkLowered()
+ case ssa.OpcodeIconst:
+ // If the addend is constant, we just statically merge it into the offset.
+ ic := def.Instr
+ u64 := ic.ConstantVal()
+ if ic.Return().Type().Bits() == 32 {
+ offset += int64(int32(u64)) // sign-extend.
+ } else {
+ offset += int64(u64)
+ }
+ def.Instr.MarkLowered()
+ case ssa.OpcodeUExtend, ssa.OpcodeSExtend:
+ input := def.Instr.Arg()
+ if input.Type().Bits() != 32 {
+ panic("illegal size: " + input.Type().String())
+ }
+
+ var ext extendOp
+ if op == ssa.OpcodeUExtend {
+ ext = extendOpUXTW
+ } else {
+ ext = extendOpSXTW
+ }
+
+ inputDef := m.compiler.ValueDefinition(input)
+ constInst := inputDef.IsFromInstr() && inputDef.Instr.Constant()
+ switch {
+ case constInst && ext == extendOpUXTW:
+ // Zero-extension of a 32-bit constant can be merged into the offset.
+ offset += int64(uint32(inputDef.Instr.ConstantVal()))
+ case constInst && ext == extendOpSXTW:
+ // Sign-extension of a 32-bit constant can be merged into the offset.
+ offset += int64(int32(inputDef.Instr.ConstantVal())) // sign-extend!
+ default:
+ m.addends32.Enqueue(addend32{r: m.getOperand_NR(inputDef, extModeNone).nr(), ext: ext})
+ }
+ def.Instr.MarkLowered()
+ continue
+ default:
+ // If the addend is not one of them, we simply use it as-is (without merging!), optionally zero-extending it.
+ m.addends64.Enqueue(m.getOperand_NR(def, extModeZeroExtend64 /* optional zero ext */).nr())
+ }
+ }
+ return &m.addends32, &m.addends64, offset
+}
+
+func (m *machine) addConstToReg64(r regalloc.VReg, c int64) (rd regalloc.VReg) {
+ rd = m.compiler.AllocateVReg(ssa.TypeI64)
+ alu := m.allocateInstr()
+ if imm12Op, ok := asImm12Operand(uint64(c)); ok {
+ alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), imm12Op, true)
+ } else if imm12Op, ok = asImm12Operand(uint64(-c)); ok {
+ alu.asALU(aluOpSub, operandNR(rd), operandNR(r), imm12Op, true)
+ } else {
+ tmp := m.compiler.AllocateVReg(ssa.TypeI64)
+ m.load64bitConst(c, tmp)
+ alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), operandNR(tmp), true)
+ }
+ m.insert(alu)
+ return
+}
+
+func (m *machine) addReg64ToReg64(rn, rm regalloc.VReg) (rd regalloc.VReg) {
+ rd = m.compiler.AllocateVReg(ssa.TypeI64)
+ alu := m.allocateInstr()
+ alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandNR(rm), true)
+ m.insert(alu)
+ return
+}
+
+func (m *machine) addRegToReg64Ext(rn, rm regalloc.VReg, ext extendOp) (rd regalloc.VReg) {
+ rd = m.compiler.AllocateVReg(ssa.TypeI64)
+ alu := m.allocateInstr()
+ alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandER(rm, ext, 64), true)
+ m.insert(alu)
+ return
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go
new file mode 100644
index 000000000..b435d9ba9
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go
@@ -0,0 +1,515 @@
+package arm64
+
+import (
+ "context"
+ "fmt"
+ "strings"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
+)
+
+type (
+ // machine implements backend.Machine.
+ machine struct {
+ compiler backend.Compiler
+ executableContext *backend.ExecutableContextT[instruction]
+ currentABI *backend.FunctionABI
+
+ regAlloc regalloc.Allocator
+ regAllocFn *backend.RegAllocFunction[*instruction, *machine]
+
+ // addendsWorkQueue is used during address lowering, defined here for reuse.
+ addendsWorkQueue wazevoapi.Queue[ssa.Value]
+ addends32 wazevoapi.Queue[addend32]
+ // addends64 is used during address lowering, defined here for reuse.
+ addends64 wazevoapi.Queue[regalloc.VReg]
+ unresolvedAddressModes []*instruction
+
+ // condBrRelocs holds the conditional branches which need offset relocation.
+ condBrRelocs []condBrReloc
+
+ // jmpTableTargets holds the labels of the jump table targets.
+ jmpTableTargets [][]uint32
+
+ // spillSlotSize is the size of the stack slot in bytes used for spilling registers.
+ // During the execution of the function, the stack looks like:
+ //
+ //
+ // (high address)
+ // +-----------------+
+ // | ....... |
+ // | ret Y |
+ // | ....... |
+ // | ret 0 |
+ // | arg X |
+ // | ....... |
+ // | arg 1 |
+ // | arg 0 |
+ // | xxxxx |
+ // | ReturnAddress |
+ // +-----------------+ <<-|
+ // | ........... | |
+ // | spill slot M | | <--- spillSlotSize
+ // | ............ | |
+ // | spill slot 2 | |
+ // | spill slot 1 | <<-+
+ // | clobbered N |
+ // | ........... |
+ // | clobbered 1 |
+ // | clobbered 0 |
+ // SP---> +-----------------+
+ // (low address)
+ //
+ // and it represents the size of the space between FP and the first spilled slot. This must be a multiple of 16.
+ // Also note that this is only known after register allocation.
+ spillSlotSize int64
+ spillSlots map[regalloc.VRegID]int64 // regalloc.VRegID to offset.
+ // clobberedRegs holds real-register backed VRegs saved at the function prologue, and restored at the epilogue.
+ clobberedRegs []regalloc.VReg
+
+ maxRequiredStackSizeForCalls int64
+ stackBoundsCheckDisabled bool
+
+ regAllocStarted bool
+ }
+
+ addend32 struct {
+ r regalloc.VReg
+ ext extendOp
+ }
+
+ condBrReloc struct {
+ cbr *instruction
+ // currentLabelPos is the labelPosition within which condBr is defined.
+ currentLabelPos *labelPosition
+ // Next block's labelPosition.
+ nextLabel label
+ offset int64
+ }
+
+ labelPosition = backend.LabelPosition[instruction]
+ label = backend.Label
+)
+
+const (
+ labelReturn = backend.LabelReturn
+ labelInvalid = backend.LabelInvalid
+)
+
+// NewBackend returns a new backend for arm64.
+func NewBackend() backend.Machine {
+ m := &machine{
+ spillSlots: make(map[regalloc.VRegID]int64),
+ executableContext: newExecutableContext(),
+ regAlloc: regalloc.NewAllocator(regInfo),
+ }
+ return m
+}
+
+func newExecutableContext() *backend.ExecutableContextT[instruction] {
+ return backend.NewExecutableContextT[instruction](resetInstruction, setNext, setPrev, asNop0)
+}
+
+// ExecutableContext implements backend.Machine.
+func (m *machine) ExecutableContext() backend.ExecutableContext {
+ return m.executableContext
+}
+
+// RegAlloc implements backend.Machine Function.
+func (m *machine) RegAlloc() {
+ rf := m.regAllocFn
+ for _, pos := range m.executableContext.OrderedBlockLabels {
+ rf.AddBlock(pos.SB, pos.L, pos.Begin, pos.End)
+ }
+
+ m.regAllocStarted = true
+ m.regAlloc.DoAllocation(rf)
+ // Now that we know the final spill slot size, we must align spillSlotSize to 16 bytes.
+ m.spillSlotSize = (m.spillSlotSize + 15) &^ 15
+}
+
+// Reset implements backend.Machine.
+func (m *machine) Reset() {
+ m.clobberedRegs = m.clobberedRegs[:0]
+ for key := range m.spillSlots {
+ m.clobberedRegs = append(m.clobberedRegs, regalloc.VReg(key))
+ }
+ for _, key := range m.clobberedRegs {
+ delete(m.spillSlots, regalloc.VRegID(key))
+ }
+ m.clobberedRegs = m.clobberedRegs[:0]
+ m.regAllocStarted = false
+ m.regAlloc.Reset()
+ m.regAllocFn.Reset()
+ m.spillSlotSize = 0
+ m.unresolvedAddressModes = m.unresolvedAddressModes[:0]
+ m.maxRequiredStackSizeForCalls = 0
+ m.executableContext.Reset()
+ m.jmpTableTargets = m.jmpTableTargets[:0]
+}
+
+// SetCurrentABI implements backend.Machine SetCurrentABI.
+func (m *machine) SetCurrentABI(abi *backend.FunctionABI) {
+ m.currentABI = abi
+}
+
+// DisableStackCheck implements backend.Machine DisableStackCheck.
+func (m *machine) DisableStackCheck() {
+ m.stackBoundsCheckDisabled = true
+}
+
+// SetCompiler implements backend.Machine.
+func (m *machine) SetCompiler(ctx backend.Compiler) {
+ m.compiler = ctx
+ m.regAllocFn = backend.NewRegAllocFunction[*instruction, *machine](m, ctx.SSABuilder(), ctx)
+}
+
+func (m *machine) insert(i *instruction) {
+ ectx := m.executableContext
+ ectx.PendingInstructions = append(ectx.PendingInstructions, i)
+}
+
+func (m *machine) insertBrTargetLabel() label {
+ nop, l := m.allocateBrTarget()
+ m.insert(nop)
+ return l
+}
+
+func (m *machine) allocateBrTarget() (nop *instruction, l label) {
+ ectx := m.executableContext
+ l = ectx.AllocateLabel()
+ nop = m.allocateInstr()
+ nop.asNop0WithLabel(l)
+ pos := ectx.AllocateLabelPosition(l)
+ pos.Begin, pos.End = nop, nop
+ ectx.LabelPositions[l] = pos
+ return
+}
+
+// allocateInstr allocates an instruction.
+func (m *machine) allocateInstr() *instruction {
+ instr := m.executableContext.InstructionPool.Allocate()
+ if !m.regAllocStarted {
+ instr.addedBeforeRegAlloc = true
+ }
+ return instr
+}
+
+func resetInstruction(i *instruction) {
+ *i = instruction{}
+}
+
+func (m *machine) allocateNop() *instruction {
+ instr := m.allocateInstr()
+ instr.asNop0()
+ return instr
+}
+
+func (m *machine) resolveAddressingMode(arg0offset, ret0offset int64, i *instruction) {
+ amode := &i.amode
+ switch amode.kind {
+ case addressModeKindResultStackSpace:
+ amode.imm += ret0offset
+ case addressModeKindArgStackSpace:
+ amode.imm += arg0offset
+ default:
+ panic("BUG")
+ }
+
+ var sizeInBits byte
+ switch i.kind {
+ case store8, uLoad8:
+ sizeInBits = 8
+ case store16, uLoad16:
+ sizeInBits = 16
+ case store32, fpuStore32, uLoad32, fpuLoad32:
+ sizeInBits = 32
+ case store64, fpuStore64, uLoad64, fpuLoad64:
+ sizeInBits = 64
+ case fpuStore128, fpuLoad128:
+ sizeInBits = 128
+ default:
+ panic("BUG")
+ }
+
+ if offsetFitsInAddressModeKindRegUnsignedImm12(sizeInBits, amode.imm) {
+ amode.kind = addressModeKindRegUnsignedImm12
+ } else {
+		// In this case, we load the offset into a temporary register
+		// and then use it as the index register.
+ newPrev := m.lowerConstantI64AndInsert(i.prev, tmpRegVReg, amode.imm)
+ linkInstr(newPrev, i)
+ *amode = addressMode{kind: addressModeKindRegReg, rn: amode.rn, rm: tmpRegVReg, extOp: extendOpUXTX /* indicates rm reg is 64-bit */}
+ }
+}
+
+// resolveRelativeAddresses resolves the relative addresses before encoding.
+func (m *machine) resolveRelativeAddresses(ctx context.Context) {
+ ectx := m.executableContext
+ for {
+ if len(m.unresolvedAddressModes) > 0 {
+ arg0offset, ret0offset := m.arg0OffsetFromSP(), m.ret0OffsetFromSP()
+ for _, i := range m.unresolvedAddressModes {
+ m.resolveAddressingMode(arg0offset, ret0offset, i)
+ }
+ }
+
+ // Reuse the slice to gather the unresolved conditional branches.
+ m.condBrRelocs = m.condBrRelocs[:0]
+
+ var fn string
+ var fnIndex int
+ var labelToSSABlockID map[label]ssa.BasicBlockID
+ if wazevoapi.PerfMapEnabled {
+ fn = wazevoapi.GetCurrentFunctionName(ctx)
+ labelToSSABlockID = make(map[label]ssa.BasicBlockID)
+ for i, l := range ectx.SsaBlockIDToLabels {
+ labelToSSABlockID[l] = ssa.BasicBlockID(i)
+ }
+ fnIndex = wazevoapi.GetCurrentFunctionIndex(ctx)
+ }
+
+ // Next, in order to determine the offsets of relative jumps, we have to calculate the size of each label.
+ var offset int64
+ for i, pos := range ectx.OrderedBlockLabels {
+ pos.BinaryOffset = offset
+ var size int64
+ for cur := pos.Begin; ; cur = cur.next {
+ switch cur.kind {
+ case nop0:
+ l := cur.nop0Label()
+ if pos, ok := ectx.LabelPositions[l]; ok {
+ pos.BinaryOffset = offset + size
+ }
+ case condBr:
+ if !cur.condBrOffsetResolved() {
+ var nextLabel label
+ if i < len(ectx.OrderedBlockLabels)-1 {
+						// Note: this is only used when the block ends with a fallthrough,
+						// so it can safely be assumed that the next block exists when it's needed.
+ nextLabel = ectx.OrderedBlockLabels[i+1].L
+ }
+ m.condBrRelocs = append(m.condBrRelocs, condBrReloc{
+ cbr: cur, currentLabelPos: pos, offset: offset + size,
+ nextLabel: nextLabel,
+ })
+ }
+ }
+ size += cur.size()
+ if cur == pos.End {
+ break
+ }
+ }
+
+ if wazevoapi.PerfMapEnabled {
+ if size > 0 {
+ l := pos.L
+ var labelStr string
+ if blkID, ok := labelToSSABlockID[l]; ok {
+ labelStr = fmt.Sprintf("%s::SSA_Block[%s]", l, blkID)
+ } else {
+ labelStr = l.String()
+ }
+ wazevoapi.PerfMap.AddModuleEntry(fnIndex, offset, uint64(size), fmt.Sprintf("%s:::::%s", fn, labelStr))
+ }
+ }
+ offset += size
+ }
+
+ // Before resolving any offsets, we need to check if all the conditional branches can be resolved.
+ var needRerun bool
+ for i := range m.condBrRelocs {
+ reloc := &m.condBrRelocs[i]
+ cbr := reloc.cbr
+ offset := reloc.offset
+
+ target := cbr.condBrLabel()
+ offsetOfTarget := ectx.LabelPositions[target].BinaryOffset
+ diff := offsetOfTarget - offset
+ if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 {
+			// In this case, the conditional branch target is too far away. We place trampoline instructions at the end of
+			// the current block and jump to the target through them.
+ m.insertConditionalJumpTrampoline(cbr, reloc.currentLabelPos, reloc.nextLabel)
+ // Then, we need to recall this function to fix up the label offsets
+ // as they have changed after the trampoline is inserted.
+ needRerun = true
+ }
+ }
+ if needRerun {
+ if wazevoapi.PerfMapEnabled {
+ wazevoapi.PerfMap.Clear()
+ }
+ } else {
+ break
+ }
+ }
+
+ var currentOffset int64
+ for cur := ectx.RootInstr; cur != nil; cur = cur.next {
+ switch cur.kind {
+ case br:
+ target := cur.brLabel()
+ offsetOfTarget := ectx.LabelPositions[target].BinaryOffset
+ diff := offsetOfTarget - currentOffset
+ divided := diff >> 2
+ if divided < minSignedInt26 || divided > maxSignedInt26 {
+ // This means the currently compiled single function is extremely large.
+ panic("too large function that requires branch relocation of large unconditional branch larger than 26-bit range")
+ }
+ cur.brOffsetResolve(diff)
+ case condBr:
+ if !cur.condBrOffsetResolved() {
+ target := cur.condBrLabel()
+ offsetOfTarget := ectx.LabelPositions[target].BinaryOffset
+ diff := offsetOfTarget - currentOffset
+ if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 {
+ panic("BUG: branch relocation for large conditional branch larger than 19-bit range must be handled properly")
+ }
+ cur.condBrOffsetResolve(diff)
+ }
+ case brTableSequence:
+ tableIndex := cur.u1
+ targets := m.jmpTableTargets[tableIndex]
+ for i := range targets {
+ l := label(targets[i])
+ offsetOfTarget := ectx.LabelPositions[l].BinaryOffset
+ diff := offsetOfTarget - (currentOffset + brTableSequenceOffsetTableBegin)
+ targets[i] = uint32(diff)
+ }
+ cur.brTableSequenceOffsetsResolved()
+ case emitSourceOffsetInfo:
+ m.compiler.AddSourceOffsetInfo(currentOffset, cur.sourceOffsetInfo())
+ }
+ currentOffset += cur.size()
+ }
+}
+
+const (
+ maxSignedInt26 = 1<<25 - 1
+ minSignedInt26 = -(1 << 25)
+
+ maxSignedInt19 = 1<<18 - 1
+ minSignedInt19 = -(1 << 18)
+)
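+
+// Since branch immediates are scaled by 4 bytes, these limits correspond to a reach of roughly ±128MiB for
+// unconditional branches (26-bit) and roughly ±1MiB for conditional branches (19-bit).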
+
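+// insertConditionalJumpTrampoline inserts an unconditional branch to cbr's original target at the end of currentBlk
+// and retargets cbr at it, so that the conditional branch itself only needs a short (19-bit) offset.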
+func (m *machine) insertConditionalJumpTrampoline(cbr *instruction, currentBlk *labelPosition, nextLabel label) {
+ cur := currentBlk.End
+ originalTarget := cbr.condBrLabel()
+ endNext := cur.next
+
+ if cur.kind != br {
+		// If the current block ends with an unconditional branch, we could simply insert the trampoline right after it.
+		// Otherwise, we need to insert a "skip" branch over the trampoline so that fallthrough execution does not run it.
+ skip := m.allocateInstr()
+ skip.asBr(nextLabel)
+ cur = linkInstr(cur, skip)
+ }
+
+ cbrNewTargetInstr, cbrNewTargetLabel := m.allocateBrTarget()
+ cbr.setCondBrTargets(cbrNewTargetLabel)
+ cur = linkInstr(cur, cbrNewTargetInstr)
+
+	// Then insert the unconditional branch to the original target. This should always be encodable,
+	// since the 26-bit offset range is large enough for any practical function.
+ br := m.allocateInstr()
+ br.asBr(originalTarget)
+ cur = linkInstr(cur, br)
+
+ // Update the end of the current block.
+ currentBlk.End = cur
+
+ linkInstr(cur, endNext)
+}
+
+// Format implements backend.Machine.
+func (m *machine) Format() string {
+ ectx := m.executableContext
+ begins := map[*instruction]label{}
+ for l, pos := range ectx.LabelPositions {
+ begins[pos.Begin] = l
+ }
+
+ irBlocks := map[label]ssa.BasicBlockID{}
+ for i, l := range ectx.SsaBlockIDToLabels {
+ irBlocks[l] = ssa.BasicBlockID(i)
+ }
+
+ var lines []string
+ for cur := ectx.RootInstr; cur != nil; cur = cur.next {
+ if l, ok := begins[cur]; ok {
+ var labelStr string
+ if blkID, ok := irBlocks[l]; ok {
+ labelStr = fmt.Sprintf("%s (SSA Block: %s):", l, blkID)
+ } else {
+ labelStr = fmt.Sprintf("%s:", l)
+ }
+ lines = append(lines, labelStr)
+ }
+ if cur.kind == nop0 {
+ continue
+ }
+ lines = append(lines, "\t"+cur.String())
+ }
+ return "\n" + strings.Join(lines, "\n") + "\n"
+}
+
+// InsertReturn implements backend.Machine.
+func (m *machine) InsertReturn() {
+ i := m.allocateInstr()
+ i.asRet()
+ m.insert(i)
+}
+
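+// getVRegSpillSlotOffsetFromSP returns the offset from SP of the spill slot for the given virtual register,
+// allocating a new slot of `size` bytes if one does not exist yet.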
+func (m *machine) getVRegSpillSlotOffsetFromSP(id regalloc.VRegID, size byte) int64 {
+ offset, ok := m.spillSlots[id]
+ if !ok {
+ offset = m.spillSlotSize
+ // TODO: this should be aligned depending on the `size` to use Imm12 offset load/store as much as possible.
+ m.spillSlots[id] = offset
+ m.spillSlotSize += int64(size)
+ }
+	return offset + 16 // spill slots sit above the 16-byte frame_size slot at the bottom of the frame.
+}
+
+func (m *machine) clobberedRegSlotSize() int64 {
+ return int64(len(m.clobberedRegs) * 16)
+}
+
+func (m *machine) arg0OffsetFromSP() int64 {
+ return m.frameSize() +
+ 16 + // 16-byte aligned return address
+ 16 // frame size saved below the clobbered registers.
+}
+
+func (m *machine) ret0OffsetFromSP() int64 {
+ return m.arg0OffsetFromSP() + m.currentABI.ArgStackSize
+}
+
+func (m *machine) requiredStackSize() int64 {
+ return m.maxRequiredStackSizeForCalls +
+ m.frameSize() +
+ 16 + // 16-byte aligned return address.
+ 16 // frame size saved below the clobbered registers.
+}
+
+func (m *machine) frameSize() int64 {
+ s := m.clobberedRegSlotSize() + m.spillSlotSize
+ if s&0xf != 0 {
+ panic(fmt.Errorf("BUG: frame size %d is not 16-byte aligned", s))
+ }
+ return s
+}
+
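+// addJmpTableTarget registers the labels of the given jump table targets and returns the table's index,
+// which is later resolved into relative offsets for the brTableSequence instruction.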
+func (m *machine) addJmpTableTarget(targets []ssa.BasicBlock) (index int) {
+ // TODO: reuse the slice!
+ labels := make([]uint32, len(targets))
+ for j, target := range targets {
+ labels[j] = uint32(m.executableContext.GetOrAllocateSSABlockLabel(target))
+ }
+ index = len(m.jmpTableTargets)
+ m.jmpTableTargets = append(m.jmpTableTargets, labels)
+ return
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go
new file mode 100644
index 000000000..466fac464
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go
@@ -0,0 +1,469 @@
+package arm64
+
+import (
+ "fmt"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
+)
+
+// PostRegAlloc implements backend.Machine.
+func (m *machine) PostRegAlloc() {
+ m.setupPrologue()
+ m.postRegAlloc()
+}
+
+// setupPrologue initializes the prologue of the function.
+func (m *machine) setupPrologue() {
+ ectx := m.executableContext
+
+ cur := ectx.RootInstr
+ prevInitInst := cur.next
+
+ //
+ // (high address) (high address)
+ // SP----> +-----------------+ +------------------+ <----+
+ // | ....... | | ....... | |
+ // | ret Y | | ret Y | |
+ // | ....... | | ....... | |
+ // | ret 0 | | ret 0 | |
+ // | arg X | | arg X | | size_of_arg_ret.
+ // | ....... | ====> | ....... | |
+ // | arg 1 | | arg 1 | |
+ // | arg 0 | | arg 0 | <----+
+ // |-----------------| | size_of_arg_ret |
+ // | return address |
+ // +------------------+ <---- SP
+ // (low address) (low address)
+
+ // Saves the return address (lr) and the size_of_arg_ret below the SP.
+ // size_of_arg_ret is used for stack unwinding.
+ cur = m.createReturnAddrAndSizeOfArgRetSlot(cur)
+
+ if !m.stackBoundsCheckDisabled {
+ cur = m.insertStackBoundsCheck(m.requiredStackSize(), cur)
+ }
+
+	// Sanity check: if there are spill slots, spillSlotSize must be non-zero; SP is decremented by it further below.
+ if m.spillSlotSize == 0 && len(m.spillSlots) != 0 {
+ panic(fmt.Sprintf("BUG: spillSlotSize=%d, spillSlots=%v\n", m.spillSlotSize, m.spillSlots))
+ }
+
+ if regs := m.clobberedRegs; len(regs) > 0 {
+ //
+ // (high address) (high address)
+ // +-----------------+ +-----------------+
+ // | ....... | | ....... |
+ // | ret Y | | ret Y |
+ // | ....... | | ....... |
+ // | ret 0 | | ret 0 |
+ // | arg X | | arg X |
+ // | ....... | | ....... |
+ // | arg 1 | | arg 1 |
+ // | arg 0 | | arg 0 |
+ // | size_of_arg_ret | | size_of_arg_ret |
+ // | ReturnAddress | | ReturnAddress |
+ // SP----> +-----------------+ ====> +-----------------+
+ // (low address) | clobbered M |
+ // | ............ |
+ // | clobbered 0 |
+ // +-----------------+ <----- SP
+ // (low address)
+ //
+ _amode := addressModePreOrPostIndex(spVReg,
+ -16, // stack pointer must be 16-byte aligned.
+ true, // Decrement before store.
+ )
+ for _, vr := range regs {
+ // TODO: pair stores to reduce the number of instructions.
+ store := m.allocateInstr()
+ store.asStore(operandNR(vr), _amode, regTypeToRegisterSizeInBits(vr.RegType()))
+ cur = linkInstr(cur, store)
+ }
+ }
+
+ if size := m.spillSlotSize; size > 0 {
+ // Check if size is 16-byte aligned.
+ if size&0xf != 0 {
+ panic(fmt.Errorf("BUG: spill slot size %d is not 16-byte aligned", size))
+ }
+
+ cur = m.addsAddOrSubStackPointer(cur, spVReg, size, false)
+
+ // At this point, the stack looks like:
+ //
+ // (high address)
+ // +------------------+
+ // | ....... |
+ // | ret Y |
+ // | ....... |
+ // | ret 0 |
+ // | arg X |
+ // | ....... |
+ // | arg 1 |
+ // | arg 0 |
+ // | size_of_arg_ret |
+ // | ReturnAddress |
+ // +------------------+
+ // | clobbered M |
+ // | ............ |
+ // | clobbered 0 |
+ // | spill slot N |
+ // | ............ |
+ // | spill slot 2 |
+ // | spill slot 0 |
+ // SP----> +------------------+
+ // (low address)
+ }
+
+ // We push the frame size into the stack to make it possible to unwind stack:
+ //
+ //
+ // (high address) (high address)
+ // +-----------------+ +-----------------+
+ // | ....... | | ....... |
+ // | ret Y | | ret Y |
+ // | ....... | | ....... |
+ // | ret 0 | | ret 0 |
+ // | arg X | | arg X |
+ // | ....... | | ....... |
+ // | arg 1 | | arg 1 |
+ // | arg 0 | | arg 0 |
+ // | size_of_arg_ret | | size_of_arg_ret |
+ // | ReturnAddress | | ReturnAddress |
+ // +-----------------+ ==> +-----------------+ <----+
+ // | clobbered M | | clobbered M | |
+ // | ............ | | ............ | |
+ // | clobbered 2 | | clobbered 2 | |
+ // | clobbered 1 | | clobbered 1 | | frame size
+ // | clobbered 0 | | clobbered 0 | |
+ // | spill slot N | | spill slot N | |
+ // | ............ | | ............ | |
+ // | spill slot 0 | | spill slot 0 | <----+
+ // SP---> +-----------------+ | xxxxxx | ;; unused space to make it 16-byte aligned.
+ // | frame_size |
+ // +-----------------+ <---- SP
+ // (low address)
+ //
+ cur = m.createFrameSizeSlot(cur, m.frameSize())
+
+ linkInstr(cur, prevInitInst)
+}
+
+func (m *machine) createReturnAddrAndSizeOfArgRetSlot(cur *instruction) *instruction {
+	// First we decrement the stack pointer to point to the arg0 slot.
+ var sizeOfArgRetReg regalloc.VReg
+ s := int64(m.currentABI.AlignedArgResultStackSlotSize())
+ if s > 0 {
+ cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s)
+ sizeOfArgRetReg = tmpRegVReg
+
+ subSp := m.allocateInstr()
+ subSp.asALU(aluOpSub, operandNR(spVReg), operandNR(spVReg), operandNR(sizeOfArgRetReg), true)
+ cur = linkInstr(cur, subSp)
+ } else {
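+		// No stack space is needed for args/results, so the zero register is stored and size_of_arg_ret reads as zero.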
+ sizeOfArgRetReg = xzrVReg
+ }
+
+ // Saves the return address (lr) and the size_of_arg_ret below the SP.
+ // size_of_arg_ret is used for stack unwinding.
+ pstr := m.allocateInstr()
+ amode := addressModePreOrPostIndex(spVReg, -16, true /* decrement before store */)
+ pstr.asStorePair64(lrVReg, sizeOfArgRetReg, amode)
+ cur = linkInstr(cur, pstr)
+ return cur
+}
+
+func (m *machine) createFrameSizeSlot(cur *instruction, s int64) *instruction {
+ var frameSizeReg regalloc.VReg
+ if s > 0 {
+ cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s)
+ frameSizeReg = tmpRegVReg
+ } else {
+ frameSizeReg = xzrVReg
+ }
+ _amode := addressModePreOrPostIndex(spVReg,
+ -16, // stack pointer must be 16-byte aligned.
+ true, // Decrement before store.
+ )
+ store := m.allocateInstr()
+ store.asStore(operandNR(frameSizeReg), _amode, 64)
+ cur = linkInstr(cur, store)
+ return cur
+}
+
+// postRegAlloc does multiple things while walking through the instructions:
+// 1. Removes the redundant copy instruction.
+// 2. Inserts the epilogue.
+func (m *machine) postRegAlloc() {
+ ectx := m.executableContext
+ for cur := ectx.RootInstr; cur != nil; cur = cur.next {
+ switch cur.kind {
+ case ret:
+ m.setupEpilogueAfter(cur.prev)
+ case loadConstBlockArg:
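+			// Re-lower the constant now that register allocation has finalized the destination register,
+			// and splice the newly generated instructions into the list.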
+ lc := cur
+ next := lc.next
+ m.executableContext.PendingInstructions = m.executableContext.PendingInstructions[:0]
+ m.lowerLoadConstantBlockArgAfterRegAlloc(lc)
+ for _, instr := range m.executableContext.PendingInstructions {
+ cur = linkInstr(cur, instr)
+ }
+ linkInstr(cur, next)
+ m.executableContext.PendingInstructions = m.executableContext.PendingInstructions[:0]
+ default:
+ // Removes the redundant copy instruction.
+ if cur.IsCopy() && cur.rn.realReg() == cur.rd.realReg() {
+ prev, next := cur.prev, cur.next
+ // Remove the copy instruction.
+ prev.next = next
+ if next != nil {
+ next.prev = prev
+ }
+ }
+ }
+ }
+}
+
+func (m *machine) setupEpilogueAfter(cur *instruction) {
+ prevNext := cur.next
+
+ // We've stored the frame size in the prologue, and now that we are about to return from this function, we won't need it anymore.
+ cur = m.addsAddOrSubStackPointer(cur, spVReg, 16, true)
+
+ if s := m.spillSlotSize; s > 0 {
+ // Adjust SP to the original value:
+ //
+ // (high address) (high address)
+ // +-----------------+ +-----------------+
+ // | ....... | | ....... |
+ // | ret Y | | ret Y |
+ // | ....... | | ....... |
+ // | ret 0 | | ret 0 |
+ // | arg X | | arg X |
+ // | ....... | | ....... |
+ // | arg 1 | | arg 1 |
+ // | arg 0 | | arg 0 |
+ // | xxxxx | | xxxxx |
+ // | ReturnAddress | | ReturnAddress |
+ // +-----------------+ ====> +-----------------+
+ // | clobbered M | | clobbered M |
+ // | ............ | | ............ |
+ // | clobbered 1 | | clobbered 1 |
+ // | clobbered 0 | | clobbered 0 |
+ // | spill slot N | +-----------------+ <---- SP
+ // | ............ |
+ // | spill slot 0 |
+ // SP---> +-----------------+
+ // (low address)
+ //
+ cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true)
+ }
+
+ // First we need to restore the clobbered registers.
+ if len(m.clobberedRegs) > 0 {
+ // (high address)
+ // +-----------------+ +-----------------+
+ // | ....... | | ....... |
+ // | ret Y | | ret Y |
+ // | ....... | | ....... |
+ // | ret 0 | | ret 0 |
+ // | arg X | | arg X |
+ // | ....... | | ....... |
+ // | arg 1 | | arg 1 |
+ // | arg 0 | | arg 0 |
+ // | xxxxx | | xxxxx |
+ // | ReturnAddress | | ReturnAddress |
+ // +-----------------+ ========> +-----------------+ <---- SP
+ // | clobbered M |
+ // | ........... |
+ // | clobbered 1 |
+ // | clobbered 0 |
+ // SP---> +-----------------+
+ // (low address)
+
+ l := len(m.clobberedRegs) - 1
+ for i := range m.clobberedRegs {
+ vr := m.clobberedRegs[l-i] // reverse order to restore.
+ load := m.allocateInstr()
+ amode := addressModePreOrPostIndex(spVReg,
+ 16, // stack pointer must be 16-byte aligned.
+				false, // Increment after load.
+ )
+ // TODO: pair loads to reduce the number of instructions.
+ switch regTypeToRegisterSizeInBits(vr.RegType()) {
+			case 64: // restore int reg.
+ load.asULoad(operandNR(vr), amode, 64)
+			case 128: // restore vector reg.
+ load.asFpuLoad(operandNR(vr), amode, 128)
+ }
+ cur = linkInstr(cur, load)
+ }
+ }
+
+ // Reload the return address (lr).
+ //
+ // +-----------------+ +-----------------+
+ // | ....... | | ....... |
+ // | ret Y | | ret Y |
+ // | ....... | | ....... |
+ // | ret 0 | | ret 0 |
+ // | arg X | | arg X |
+ // | ....... | ===> | ....... |
+ // | arg 1 | | arg 1 |
+ // | arg 0 | | arg 0 |
+ // | xxxxx | +-----------------+ <---- SP
+ // | ReturnAddress |
+ // SP----> +-----------------+
+
+ ldr := m.allocateInstr()
+ ldr.asULoad(operandNR(lrVReg),
+ addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64)
+ cur = linkInstr(cur, ldr)
+
+ if s := int64(m.currentABI.AlignedArgResultStackSlotSize()); s > 0 {
+ cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true)
+ }
+
+ linkInstr(cur, prevNext)
+}
+
+// saveRequiredRegs is the set of registers that must be saved/restored while growing the stack when there's insufficient
+// stack space left. Basically this is the combination of CalleeSavedRegisters plus argument registers except for x0,
+// which always points to the execution context whenever the native code is entered from Go.
+var saveRequiredRegs = []regalloc.VReg{
+ x1VReg, x2VReg, x3VReg, x4VReg, x5VReg, x6VReg, x7VReg,
+ x19VReg, x20VReg, x21VReg, x22VReg, x23VReg, x24VReg, x25VReg, x26VReg, x28VReg, lrVReg,
+ v0VReg, v1VReg, v2VReg, v3VReg, v4VReg, v5VReg, v6VReg, v7VReg,
+ v18VReg, v19VReg, v20VReg, v21VReg, v22VReg, v23VReg, v24VReg, v25VReg, v26VReg, v27VReg, v28VReg, v29VReg, v30VReg, v31VReg,
+}
+
+// insertStackBoundsCheck inserts the instructions after `cur` to check the
+// stack bounds; if there is not enough stack space left for the function,
+// it exits the execution and tries to grow the stack in the Go world.
+//
+// TODO: we should be able to share the instructions across all the functions to reduce the size of compiled executable.
+func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instruction) *instruction {
+ if requiredStackSize%16 != 0 {
+ panic("BUG")
+ }
+
+ if immm12op, ok := asImm12Operand(uint64(requiredStackSize)); ok {
+ // sub tmp, sp, #requiredStackSize
+ sub := m.allocateInstr()
+ sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), immm12op, true)
+ cur = linkInstr(cur, sub)
+ } else {
+		// In this case, we first load the requiredStackSize into the temporary register,
+ cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize)
+ // Then subtract it.
+ sub := m.allocateInstr()
+ sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), operandNR(tmpRegVReg), true)
+ cur = linkInstr(cur, sub)
+ }
+
+	tmp2 := x11VReg // Caller-saved, so it is safe to use here in the prologue.
+
+ // ldr tmp2, [executionContext #StackBottomPtr]
+ ldr := m.allocateInstr()
+ ldr.asULoad(operandNR(tmp2), addressMode{
+ kind: addressModeKindRegUnsignedImm12,
+ rn: x0VReg, // execution context is always the first argument.
+ imm: wazevoapi.ExecutionContextOffsetStackBottomPtr.I64(),
+ }, 64)
+ cur = linkInstr(cur, ldr)
+
+ // subs xzr, tmp, tmp2
+ subs := m.allocateInstr()
+ subs.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpRegVReg), operandNR(tmp2), true)
+ cur = linkInstr(cur, subs)
+
+ // b.ge #imm
+ cbr := m.allocateInstr()
+ cbr.asCondBr(ge.asCond(), labelInvalid, false /* ignored */)
+ cur = linkInstr(cur, cbr)
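+	// The branch target is intentionally left unresolved (labelInvalid) here; its offset is fixed up at the end of
+	// this function once the size of the skipped call sequence is known.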
+
+ // Set the required stack size and set it to the exec context.
+ {
+ // First load the requiredStackSize into the temporary register,
+ cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize)
+ setRequiredStackSize := m.allocateInstr()
+ setRequiredStackSize.asStore(operandNR(tmpRegVReg),
+ addressMode{
+ kind: addressModeKindRegUnsignedImm12,
+ // Execution context is always the first argument.
+ rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.I64(),
+ }, 64)
+
+ cur = linkInstr(cur, setRequiredStackSize)
+ }
+
+ ldrAddress := m.allocateInstr()
+ ldrAddress.asULoad(operandNR(tmpRegVReg), addressMode{
+ kind: addressModeKindRegUnsignedImm12,
+ rn: x0VReg, // execution context is always the first argument
+ imm: wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.I64(),
+ }, 64)
+ cur = linkInstr(cur, ldrAddress)
+
+	// Then jump to the stack grow call sequence's address, i.e. transfer
+	// control to the code compiled by CompileStackGrowCallSequence.
+ bl := m.allocateInstr()
+ bl.asCallIndirect(tmpRegVReg, nil)
+ cur = linkInstr(cur, bl)
+
+ // Now that we know the entire code, we can finalize how many bytes
+ // we have to skip when the stack size is sufficient.
+ var cbrOffset int64
+ for _cur := cbr; ; _cur = _cur.next {
+ cbrOffset += _cur.size()
+ if _cur == cur {
+ break
+ }
+ }
+ cbr.condBrOffsetResolve(cbrOffset)
+ return cur
+}
+
+// CompileStackGrowCallSequence implements backend.Machine.
+func (m *machine) CompileStackGrowCallSequence() []byte {
+ ectx := m.executableContext
+
+ cur := m.allocateInstr()
+ cur.asNop0()
+ ectx.RootInstr = cur
+
+ // Save the callee saved and argument registers.
+ cur = m.saveRegistersInExecutionContext(cur, saveRequiredRegs)
+
+ // Save the current stack pointer.
+ cur = m.saveCurrentStackPointer(cur, x0VReg)
+
+ // Set the exit status on the execution context.
+ cur = m.setExitCode(cur, x0VReg, wazevoapi.ExitCodeGrowStack)
+
+ // Exit the execution.
+ cur = m.storeReturnAddressAndExit(cur)
+
+ // After the exit, restore the saved registers.
+ cur = m.restoreRegistersInExecutionContext(cur, saveRequiredRegs)
+
+	// Then return to the original call site of this stack grow call.
+ ret := m.allocateInstr()
+ ret.asRet()
+ linkInstr(cur, ret)
+
+ m.encode(ectx.RootInstr)
+ return m.compiler.Buf()
+}
+
+func (m *machine) addsAddOrSubStackPointer(cur *instruction, rd regalloc.VReg, diff int64, add bool) *instruction {
+ ectx := m.executableContext
+
+ ectx.PendingInstructions = ectx.PendingInstructions[:0]
+ m.insertAddOrSubStackPointer(rd, diff, add)
+ for _, inserted := range ectx.PendingInstructions {
+ cur = linkInstr(cur, inserted)
+ }
+ return cur
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go
new file mode 100644
index 000000000..1c8793b73
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go
@@ -0,0 +1,152 @@
+package arm64
+
+// This file implements the interfaces required for register allocations. See backend.RegAllocFunctionMachine.
+
+import (
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+)
+
+// ClobberedRegisters implements backend.RegAllocFunctionMachine.
+func (m *machine) ClobberedRegisters(regs []regalloc.VReg) {
+ m.clobberedRegs = append(m.clobberedRegs[:0], regs...)
+}
+
+// Swap implements backend.RegAllocFunctionMachine.
+func (m *machine) Swap(cur *instruction, x1, x2, tmp regalloc.VReg) {
+ prevNext := cur.next
+ var mov1, mov2, mov3 *instruction
+ if x1.RegType() == regalloc.RegTypeInt {
+ if !tmp.Valid() {
+ tmp = tmpRegVReg
+ }
+ mov1 = m.allocateInstr().asMove64(tmp, x1)
+ mov2 = m.allocateInstr().asMove64(x1, x2)
+ mov3 = m.allocateInstr().asMove64(x2, tmp)
+ cur = linkInstr(cur, mov1)
+ cur = linkInstr(cur, mov2)
+ cur = linkInstr(cur, mov3)
+ linkInstr(cur, prevNext)
+ } else {
+ if !tmp.Valid() {
+ r2 := x2.RealReg()
+ // Temporarily spill x1 to stack.
+ cur = m.InsertStoreRegisterAt(x1, cur, true).prev
+ // Then move x2 to x1.
+ cur = linkInstr(cur, m.allocateInstr().asFpuMov128(x1, x2))
+ linkInstr(cur, prevNext)
+			// Then reload the original value of x1 from the stack into r2.
+ m.InsertReloadRegisterAt(x1.SetRealReg(r2), cur, true)
+ } else {
+ mov1 = m.allocateInstr().asFpuMov128(tmp, x1)
+ mov2 = m.allocateInstr().asFpuMov128(x1, x2)
+ mov3 = m.allocateInstr().asFpuMov128(x2, tmp)
+ cur = linkInstr(cur, mov1)
+ cur = linkInstr(cur, mov2)
+ cur = linkInstr(cur, mov3)
+ linkInstr(cur, prevNext)
+ }
+ }
+}
+
+// InsertMoveBefore implements backend.RegAllocFunctionMachine.
+func (m *machine) InsertMoveBefore(dst, src regalloc.VReg, instr *instruction) {
+ typ := src.RegType()
+ if typ != dst.RegType() {
+ panic("BUG: src and dst must have the same type")
+ }
+
+ mov := m.allocateInstr()
+ if typ == regalloc.RegTypeInt {
+ mov.asMove64(dst, src)
+ } else {
+ mov.asFpuMov128(dst, src)
+ }
+
+ cur := instr.prev
+ prevNext := cur.next
+ cur = linkInstr(cur, mov)
+ linkInstr(cur, prevNext)
+}
+
+// SSABlockLabel implements backend.RegAllocFunctionMachine.
+func (m *machine) SSABlockLabel(id ssa.BasicBlockID) backend.Label {
+ return m.executableContext.SsaBlockIDToLabels[id]
+}
+
+// InsertStoreRegisterAt implements backend.RegAllocFunctionMachine.
+func (m *machine) InsertStoreRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction {
+ if !v.IsRealReg() {
+ panic("BUG: VReg must be backed by real reg to be stored")
+ }
+
+ typ := m.compiler.TypeOf(v)
+
+ var prevNext, cur *instruction
+ if after {
+ cur, prevNext = instr, instr.next
+ } else {
+ cur, prevNext = instr.prev, instr
+ }
+
+ offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size())
+ var amode addressMode
+ cur, amode = m.resolveAddressModeForOffsetAndInsert(cur, offsetFromSP, typ.Bits(), spVReg, true)
+ store := m.allocateInstr()
+ store.asStore(operandNR(v), amode, typ.Bits())
+
+ cur = linkInstr(cur, store)
+ return linkInstr(cur, prevNext)
+}
+
+// InsertReloadRegisterAt implements backend.RegAllocFunctionMachine.
+func (m *machine) InsertReloadRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction {
+ if !v.IsRealReg() {
+		panic("BUG: VReg must be backed by real reg to be reloaded")
+ }
+
+ typ := m.compiler.TypeOf(v)
+
+ var prevNext, cur *instruction
+ if after {
+ cur, prevNext = instr, instr.next
+ } else {
+ cur, prevNext = instr.prev, instr
+ }
+
+ offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size())
+ var amode addressMode
+ cur, amode = m.resolveAddressModeForOffsetAndInsert(cur, offsetFromSP, typ.Bits(), spVReg, true)
+ load := m.allocateInstr()
+ switch typ {
+ case ssa.TypeI32, ssa.TypeI64:
+ load.asULoad(operandNR(v), amode, typ.Bits())
+ case ssa.TypeF32, ssa.TypeF64:
+ load.asFpuLoad(operandNR(v), amode, typ.Bits())
+ case ssa.TypeV128:
+ load.asFpuLoad(operandNR(v), amode, 128)
+ default:
+ panic("TODO")
+ }
+
+ cur = linkInstr(cur, load)
+ return linkInstr(cur, prevNext)
+}
+
+// LastInstrForInsertion implements backend.RegAllocFunctionMachine.
+func (m *machine) LastInstrForInsertion(begin, end *instruction) *instruction {
+ cur := end
+ for cur.kind == nop0 {
+ cur = cur.prev
+ if cur == begin {
+ return end
+ }
+ }
+ switch cur.kind {
+ case br:
+ return cur
+ default:
+ return end
+ }
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_relocation.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_relocation.go
new file mode 100644
index 000000000..83902d927
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_relocation.go
@@ -0,0 +1,117 @@
+package arm64
+
+import (
+ "encoding/binary"
+ "fmt"
+ "math"
+ "sort"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
+)
+
+const (
+ // trampolineCallSize is the size of the trampoline instruction sequence for each function in an island.
+ trampolineCallSize = 4*4 + 4 // Four instructions + 32-bit immediate.
+
+ // Unconditional branch offset is encoded as divided by 4 in imm26.
+ // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/BL--Branch-with-Link-?lang=en
+
+ maxUnconditionalBranchOffset = maxSignedInt26 * 4
+ minUnconditionalBranchOffset = minSignedInt26 * 4
+
+ // trampolineIslandInterval is the range of the trampoline island.
+ // Half of the range is used for the trampoline island, and the other half is used for the function.
+ trampolineIslandInterval = maxUnconditionalBranchOffset / 2
+
+ // maxNumFunctions explicitly specifies the maximum number of functions that can be allowed in a single executable.
+ maxNumFunctions = trampolineIslandInterval >> 6
+
+ // maxFunctionExecutableSize is the maximum size of a function that can exist in a trampoline island.
+ // Conservatively set to 1/4 of the trampoline island interval.
+ maxFunctionExecutableSize = trampolineIslandInterval >> 2
+)
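+
+// With the ±128MiB unconditional branch reach above, these work out to an island interval of roughly 64MiB,
+// roughly one million functions per executable, and roughly 16MiB per function.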
+
+// CallTrampolineIslandInfo implements backend.Machine CallTrampolineIslandInfo.
+func (m *machine) CallTrampolineIslandInfo(numFunctions int) (interval, size int, err error) {
+ if numFunctions > maxNumFunctions {
+ return 0, 0, fmt.Errorf("too many functions: %d > %d", numFunctions, maxNumFunctions)
+ }
+ return trampolineIslandInterval, trampolineCallSize * numFunctions, nil
+}
+
+// ResolveRelocations implements backend.Machine ResolveRelocations.
+func (m *machine) ResolveRelocations(
+ refToBinaryOffset []int,
+ executable []byte,
+ relocations []backend.RelocationInfo,
+ callTrampolineIslandOffsets []int,
+) {
+ for _, islandOffset := range callTrampolineIslandOffsets {
+ encodeCallTrampolineIsland(refToBinaryOffset, islandOffset, executable)
+ }
+
+ for _, r := range relocations {
+ instrOffset := r.Offset
+ calleeFnOffset := refToBinaryOffset[r.FuncRef]
+ diff := int64(calleeFnOffset) - (instrOffset)
+ // Check if the diff is within the range of the branch instruction.
+ if diff < minUnconditionalBranchOffset || diff > maxUnconditionalBranchOffset {
+			// Find the nearest trampoline island from callTrampolineIslandOffsets.
+ islandOffset := searchTrampolineIsland(callTrampolineIslandOffsets, int(instrOffset))
+ islandTargetOffset := islandOffset + trampolineCallSize*int(r.FuncRef)
+ diff = int64(islandTargetOffset) - (instrOffset)
+ if diff < minUnconditionalBranchOffset || diff > maxUnconditionalBranchOffset {
+ panic("BUG in trampoline placement")
+ }
+ }
+ binary.LittleEndian.PutUint32(executable[instrOffset:instrOffset+4], encodeUnconditionalBranch(true, diff))
+ }
+}
+
+// encodeCallTrampolineIsland encodes a trampoline island for the given functions.
+// Each island consists of a trampoline instruction sequence for each function.
+// Each trampoline instruction sequence consists of 4 instructions + 32-bit immediate.
+func encodeCallTrampolineIsland(refToBinaryOffset []int, islandOffset int, executable []byte) {
+ for i := 0; i < len(refToBinaryOffset); i++ {
+ trampolineOffset := islandOffset + trampolineCallSize*i
+
+ fnOffset := refToBinaryOffset[i]
+ diff := fnOffset - (trampolineOffset + 16)
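+		// Note that the displacement is computed relative to the address of the #diff immediate itself
+		// (trampolineOffset+16), which is exactly what the adr instruction below materializes.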
+ if diff > math.MaxInt32 || diff < math.MinInt32 {
+			// Not even amd64 could handle this; the displacement exceeds the 32-bit range.
+ panic("too big binary")
+ }
+
+		// tmpReg and tmpReg2 are safe to overwrite (in fact any caller-saved register is safe to use).
+ tmpReg, tmpReg2 := regNumberInEncoding[tmpRegVReg.RealReg()], regNumberInEncoding[x11]
+
+ // adr tmpReg, PC+16: load the address of #diff into tmpReg.
+ binary.LittleEndian.PutUint32(executable[trampolineOffset:], encodeAdr(tmpReg, 16))
+ // ldrsw tmpReg2, [tmpReg]: Load #diff into tmpReg2.
+ binary.LittleEndian.PutUint32(executable[trampolineOffset+4:],
+ encodeLoadOrStore(sLoad32, tmpReg2, addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpRegVReg}))
+ // add tmpReg, tmpReg2, tmpReg: add #diff to the address of #diff, getting the absolute address of the function.
+ binary.LittleEndian.PutUint32(executable[trampolineOffset+8:],
+ encodeAluRRR(aluOpAdd, tmpReg, tmpReg, tmpReg2, true, false))
+ // br tmpReg: branch to the function without overwriting the link register.
+ binary.LittleEndian.PutUint32(executable[trampolineOffset+12:], encodeUnconditionalBranchReg(tmpReg, false))
+ // #diff
+ binary.LittleEndian.PutUint32(executable[trampolineOffset+16:], uint32(diff))
+ }
+}
+
+// searchTrampolineIsland finds the nearest trampoline island from callTrampolineIslandOffsets.
+// Note that even if the offset is in the middle of two islands, it returns the latter one.
+// That is ok because the island is always placed in the middle of the range.
+//
+// precondition: callTrampolineIslandOffsets is sorted in ascending order.
+func searchTrampolineIsland(callTrampolineIslandOffsets []int, offset int) int {
+ l := len(callTrampolineIslandOffsets)
+ n := sort.Search(l, func(i int) bool {
+ return callTrampolineIslandOffsets[i] >= offset
+ })
+ if n == l {
+ n = l - 1
+ }
+ return callTrampolineIslandOffsets[n]
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/reg.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/reg.go
new file mode 100644
index 000000000..45737516d
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/reg.go
@@ -0,0 +1,397 @@
+package arm64
+
+import (
+ "fmt"
+ "strconv"
+ "strings"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+)
+
+// Arm64-specific registers.
+//
+// See https://developer.arm.com/documentation/dui0801/a/Overview-of-AArch64-state/Predeclared-core-register-names-in-AArch64-state
+
+const (
+	// General purpose registers. Note that we do not distinguish between wn and xn registers
+	// because they are the same from the perspective of the register allocator, and
+	// the size can be determined by the type of the instruction.
+
+ x0 = regalloc.RealRegInvalid + 1 + iota
+ x1
+ x2
+ x3
+ x4
+ x5
+ x6
+ x7
+ x8
+ x9
+ x10
+ x11
+ x12
+ x13
+ x14
+ x15
+ x16
+ x17
+ x18
+ x19
+ x20
+ x21
+ x22
+ x23
+ x24
+ x25
+ x26
+ x27
+ x28
+ x29
+ x30
+
+	// Vector registers. Note that we do not distinguish between vn, dn, ... registers
+	// because they are the same from the perspective of the register allocator, and
+	// the size can be determined by the type of the instruction.
+
+ v0
+ v1
+ v2
+ v3
+ v4
+ v5
+ v6
+ v7
+ v8
+ v9
+ v10
+ v11
+ v12
+ v13
+ v14
+ v15
+ v16
+ v17
+ v18
+ v19
+ v20
+ v21
+ v22
+ v23
+ v24
+ v25
+ v26
+ v27
+ v28
+ v29
+ v30
+ v31
+
+ // Special registers
+
+ xzr
+ sp
+ lr = x30
+ fp = x29
+ tmp = x27
+)
+
+var (
+ x0VReg = regalloc.FromRealReg(x0, regalloc.RegTypeInt)
+ x1VReg = regalloc.FromRealReg(x1, regalloc.RegTypeInt)
+ x2VReg = regalloc.FromRealReg(x2, regalloc.RegTypeInt)
+ x3VReg = regalloc.FromRealReg(x3, regalloc.RegTypeInt)
+ x4VReg = regalloc.FromRealReg(x4, regalloc.RegTypeInt)
+ x5VReg = regalloc.FromRealReg(x5, regalloc.RegTypeInt)
+ x6VReg = regalloc.FromRealReg(x6, regalloc.RegTypeInt)
+ x7VReg = regalloc.FromRealReg(x7, regalloc.RegTypeInt)
+ x8VReg = regalloc.FromRealReg(x8, regalloc.RegTypeInt)
+ x9VReg = regalloc.FromRealReg(x9, regalloc.RegTypeInt)
+ x10VReg = regalloc.FromRealReg(x10, regalloc.RegTypeInt)
+ x11VReg = regalloc.FromRealReg(x11, regalloc.RegTypeInt)
+ x12VReg = regalloc.FromRealReg(x12, regalloc.RegTypeInt)
+ x13VReg = regalloc.FromRealReg(x13, regalloc.RegTypeInt)
+ x14VReg = regalloc.FromRealReg(x14, regalloc.RegTypeInt)
+ x15VReg = regalloc.FromRealReg(x15, regalloc.RegTypeInt)
+ x16VReg = regalloc.FromRealReg(x16, regalloc.RegTypeInt)
+ x17VReg = regalloc.FromRealReg(x17, regalloc.RegTypeInt)
+ x18VReg = regalloc.FromRealReg(x18, regalloc.RegTypeInt)
+ x19VReg = regalloc.FromRealReg(x19, regalloc.RegTypeInt)
+ x20VReg = regalloc.FromRealReg(x20, regalloc.RegTypeInt)
+ x21VReg = regalloc.FromRealReg(x21, regalloc.RegTypeInt)
+ x22VReg = regalloc.FromRealReg(x22, regalloc.RegTypeInt)
+ x23VReg = regalloc.FromRealReg(x23, regalloc.RegTypeInt)
+ x24VReg = regalloc.FromRealReg(x24, regalloc.RegTypeInt)
+ x25VReg = regalloc.FromRealReg(x25, regalloc.RegTypeInt)
+ x26VReg = regalloc.FromRealReg(x26, regalloc.RegTypeInt)
+ x27VReg = regalloc.FromRealReg(x27, regalloc.RegTypeInt)
+ x28VReg = regalloc.FromRealReg(x28, regalloc.RegTypeInt)
+ x29VReg = regalloc.FromRealReg(x29, regalloc.RegTypeInt)
+ x30VReg = regalloc.FromRealReg(x30, regalloc.RegTypeInt)
+ v0VReg = regalloc.FromRealReg(v0, regalloc.RegTypeFloat)
+ v1VReg = regalloc.FromRealReg(v1, regalloc.RegTypeFloat)
+ v2VReg = regalloc.FromRealReg(v2, regalloc.RegTypeFloat)
+ v3VReg = regalloc.FromRealReg(v3, regalloc.RegTypeFloat)
+ v4VReg = regalloc.FromRealReg(v4, regalloc.RegTypeFloat)
+ v5VReg = regalloc.FromRealReg(v5, regalloc.RegTypeFloat)
+ v6VReg = regalloc.FromRealReg(v6, regalloc.RegTypeFloat)
+ v7VReg = regalloc.FromRealReg(v7, regalloc.RegTypeFloat)
+ v8VReg = regalloc.FromRealReg(v8, regalloc.RegTypeFloat)
+ v9VReg = regalloc.FromRealReg(v9, regalloc.RegTypeFloat)
+ v10VReg = regalloc.FromRealReg(v10, regalloc.RegTypeFloat)
+ v11VReg = regalloc.FromRealReg(v11, regalloc.RegTypeFloat)
+ v12VReg = regalloc.FromRealReg(v12, regalloc.RegTypeFloat)
+ v13VReg = regalloc.FromRealReg(v13, regalloc.RegTypeFloat)
+ v14VReg = regalloc.FromRealReg(v14, regalloc.RegTypeFloat)
+ v15VReg = regalloc.FromRealReg(v15, regalloc.RegTypeFloat)
+ v16VReg = regalloc.FromRealReg(v16, regalloc.RegTypeFloat)
+ v17VReg = regalloc.FromRealReg(v17, regalloc.RegTypeFloat)
+ v18VReg = regalloc.FromRealReg(v18, regalloc.RegTypeFloat)
+ v19VReg = regalloc.FromRealReg(v19, regalloc.RegTypeFloat)
+ v20VReg = regalloc.FromRealReg(v20, regalloc.RegTypeFloat)
+ v21VReg = regalloc.FromRealReg(v21, regalloc.RegTypeFloat)
+ v22VReg = regalloc.FromRealReg(v22, regalloc.RegTypeFloat)
+ v23VReg = regalloc.FromRealReg(v23, regalloc.RegTypeFloat)
+ v24VReg = regalloc.FromRealReg(v24, regalloc.RegTypeFloat)
+ v25VReg = regalloc.FromRealReg(v25, regalloc.RegTypeFloat)
+ v26VReg = regalloc.FromRealReg(v26, regalloc.RegTypeFloat)
+ v27VReg = regalloc.FromRealReg(v27, regalloc.RegTypeFloat)
+ // lr (link register) holds the return address at the function entry.
+ lrVReg = x30VReg
+	// tmpReg is used to perform spill/load on large stack offsets, and to load large constants.
+	// Therefore, be cautious about using this register in the middle of compilation, especially before register allocation.
+ // This is the same as golang/go, but it's only described in the source code:
+ // https://github.com/golang/go/blob/18e17e2cb12837ea2c8582ecdb0cc780f49a1aac/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go#L59
+ // https://github.com/golang/go/blob/18e17e2cb12837ea2c8582ecdb0cc780f49a1aac/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go#L13-L15
+ tmpRegVReg = regalloc.FromRealReg(tmp, regalloc.RegTypeInt)
+ v28VReg = regalloc.FromRealReg(v28, regalloc.RegTypeFloat)
+ v29VReg = regalloc.FromRealReg(v29, regalloc.RegTypeFloat)
+ v30VReg = regalloc.FromRealReg(v30, regalloc.RegTypeFloat)
+ v31VReg = regalloc.FromRealReg(v31, regalloc.RegTypeFloat)
+ xzrVReg = regalloc.FromRealReg(xzr, regalloc.RegTypeInt)
+ spVReg = regalloc.FromRealReg(sp, regalloc.RegTypeInt)
+ fpVReg = regalloc.FromRealReg(fp, regalloc.RegTypeInt)
+)
+
+var regNames = [...]string{
+ x0: "x0",
+ x1: "x1",
+ x2: "x2",
+ x3: "x3",
+ x4: "x4",
+ x5: "x5",
+ x6: "x6",
+ x7: "x7",
+ x8: "x8",
+ x9: "x9",
+ x10: "x10",
+ x11: "x11",
+ x12: "x12",
+ x13: "x13",
+ x14: "x14",
+ x15: "x15",
+ x16: "x16",
+ x17: "x17",
+ x18: "x18",
+ x19: "x19",
+ x20: "x20",
+ x21: "x21",
+ x22: "x22",
+ x23: "x23",
+ x24: "x24",
+ x25: "x25",
+ x26: "x26",
+ x27: "x27",
+ x28: "x28",
+ x29: "x29",
+ x30: "x30",
+ xzr: "xzr",
+ sp: "sp",
+ v0: "v0",
+ v1: "v1",
+ v2: "v2",
+ v3: "v3",
+ v4: "v4",
+ v5: "v5",
+ v6: "v6",
+ v7: "v7",
+ v8: "v8",
+ v9: "v9",
+ v10: "v10",
+ v11: "v11",
+ v12: "v12",
+ v13: "v13",
+ v14: "v14",
+ v15: "v15",
+ v16: "v16",
+ v17: "v17",
+ v18: "v18",
+ v19: "v19",
+ v20: "v20",
+ v21: "v21",
+ v22: "v22",
+ v23: "v23",
+ v24: "v24",
+ v25: "v25",
+ v26: "v26",
+ v27: "v27",
+ v28: "v28",
+ v29: "v29",
+ v30: "v30",
+ v31: "v31",
+}
+
+func formatVRegSized(r regalloc.VReg, size byte) (ret string) {
+ if r.IsRealReg() {
+ ret = regNames[r.RealReg()]
+ switch ret[0] {
+ case 'x':
+ switch size {
+ case 32:
+ ret = strings.Replace(ret, "x", "w", 1)
+ case 64:
+ default:
+ panic("BUG: invalid register size: " + strconv.Itoa(int(size)))
+ }
+ case 'v':
+ switch size {
+ case 32:
+ ret = strings.Replace(ret, "v", "s", 1)
+ case 64:
+ ret = strings.Replace(ret, "v", "d", 1)
+ case 128:
+ ret = strings.Replace(ret, "v", "q", 1)
+ default:
+ panic("BUG: invalid register size")
+ }
+ }
+ } else {
+ switch r.RegType() {
+ case regalloc.RegTypeInt:
+ switch size {
+ case 32:
+ ret = fmt.Sprintf("w%d?", r.ID())
+ case 64:
+ ret = fmt.Sprintf("x%d?", r.ID())
+ default:
+ panic("BUG: invalid register size: " + strconv.Itoa(int(size)))
+ }
+ case regalloc.RegTypeFloat:
+ switch size {
+ case 32:
+ ret = fmt.Sprintf("s%d?", r.ID())
+ case 64:
+ ret = fmt.Sprintf("d%d?", r.ID())
+ case 128:
+ ret = fmt.Sprintf("q%d?", r.ID())
+ default:
+ panic("BUG: invalid register size")
+ }
+ default:
+ panic(fmt.Sprintf("BUG: invalid register type: %d for %s", r.RegType(), r))
+ }
+ }
+ return
+}
+
+func formatVRegWidthVec(r regalloc.VReg, width vecArrangement) (ret string) {
+ var id string
+ wspec := strings.ToLower(width.String())
+ if r.IsRealReg() {
+ id = regNames[r.RealReg()][1:]
+ } else {
+ id = fmt.Sprintf("%d?", r.ID())
+ }
+ ret = fmt.Sprintf("%s%s", wspec, id)
+ return
+}
+
+func formatVRegVec(r regalloc.VReg, arr vecArrangement, index vecIndex) (ret string) {
+ id := fmt.Sprintf("v%d?", r.ID())
+ if r.IsRealReg() {
+ id = regNames[r.RealReg()]
+ }
+ ret = fmt.Sprintf("%s.%s", id, strings.ToLower(arr.String()))
+ if index != vecIndexNone {
+ ret += fmt.Sprintf("[%d]", index)
+ }
+ return
+}
+
+func regTypeToRegisterSizeInBits(r regalloc.RegType) byte {
+ switch r {
+ case regalloc.RegTypeInt:
+ return 64
+ case regalloc.RegTypeFloat:
+ return 128
+ default:
+ panic("BUG: invalid register type")
+ }
+}
+
+var regNumberInEncoding = [...]uint32{
+ x0: 0,
+ x1: 1,
+ x2: 2,
+ x3: 3,
+ x4: 4,
+ x5: 5,
+ x6: 6,
+ x7: 7,
+ x8: 8,
+ x9: 9,
+ x10: 10,
+ x11: 11,
+ x12: 12,
+ x13: 13,
+ x14: 14,
+ x15: 15,
+ x16: 16,
+ x17: 17,
+ x18: 18,
+ x19: 19,
+ x20: 20,
+ x21: 21,
+ x22: 22,
+ x23: 23,
+ x24: 24,
+ x25: 25,
+ x26: 26,
+ x27: 27,
+ x28: 28,
+ x29: 29,
+ x30: 30,
+ xzr: 31,
+ sp: 31,
+ v0: 0,
+ v1: 1,
+ v2: 2,
+ v3: 3,
+ v4: 4,
+ v5: 5,
+ v6: 6,
+ v7: 7,
+ v8: 8,
+ v9: 9,
+ v10: 10,
+ v11: 11,
+ v12: 12,
+ v13: 13,
+ v14: 14,
+ v15: 15,
+ v16: 16,
+ v17: 17,
+ v18: 18,
+ v19: 19,
+ v20: 20,
+ v21: 21,
+ v22: 22,
+ v23: 23,
+ v24: 24,
+ v25: 25,
+ v26: 26,
+ v27: 27,
+ v28: 28,
+ v29: 29,
+ v30: 30,
+ v31: 31,
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/unwind_stack.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/unwind_stack.go
new file mode 100644
index 000000000..edb0e36e3
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/unwind_stack.go
@@ -0,0 +1,90 @@
+package arm64
+
+import (
+ "encoding/binary"
+ "reflect"
+ "unsafe"
+
+ "github.com/tetratelabs/wazero/internal/wasmdebug"
+)
+
+// UnwindStack implements wazevo.unwindStack.
+func UnwindStack(sp, _, top uintptr, returnAddresses []uintptr) []uintptr {
+ l := int(top - sp)
+
+ var stackBuf []byte
+ {
+ // TODO: use unsafe.Slice after floor version is set to Go 1.20.
+ hdr := (*reflect.SliceHeader)(unsafe.Pointer(&stackBuf))
+ hdr.Data = sp
+ hdr.Len = l
+ hdr.Cap = l
+ }
+
+ for i := uint64(0); i < uint64(l); {
+ // (high address)
+ // +-----------------+
+ // | ....... |
+ // | ret Y | <----+
+ // | ....... | |
+ // | ret 0 | |
+ // | arg X | | size_of_arg_ret
+ // | ....... | |
+ // | arg 1 | |
+ // | arg 0 | <----+
+ // | size_of_arg_ret |
+ // | ReturnAddress |
+ // +-----------------+ <----+
+ // | ........... | |
+ // | spill slot M | |
+ // | ............ | |
+ // | spill slot 2 | |
+ // | spill slot 1 | | frame size
+		//  | spill slot 0 |      |
+ // | clobbered N | |
+ // | ............ | |
+ // | clobbered 0 | <----+
+ // | xxxxxx | ;; unused space to make it 16-byte aligned.
+ // | frame_size |
+ // +-----------------+ <---- SP
+ // (low address)
+
+ frameSize := binary.LittleEndian.Uint64(stackBuf[i:])
+ i += frameSize +
+ 16 // frame size + aligned space.
+ retAddr := binary.LittleEndian.Uint64(stackBuf[i:])
+ i += 8 // ret addr.
+ sizeOfArgRet := binary.LittleEndian.Uint64(stackBuf[i:])
+ i += 8 + sizeOfArgRet
+ returnAddresses = append(returnAddresses, uintptr(retAddr))
+ if len(returnAddresses) == wasmdebug.MaxFrames {
+ break
+ }
+ }
+ return returnAddresses
+}
+
+// GoCallStackView implements wazevo.goCallStackView.
+func GoCallStackView(stackPointerBeforeGoCall *uint64) []uint64 {
+ // (high address)
+ // +-----------------+ <----+
+ // | xxxxxxxxxxx | | ;; optional unused space to make it 16-byte aligned.
+ // ^ | arg[N]/ret[M] | |
+ // sliceSize | | ............ | | sliceSize
+ // | | arg[1]/ret[1] | |
+ // v | arg[0]/ret[0] | <----+
+ // | sliceSize |
+ // | frame_size |
+ // +-----------------+ <---- stackPointerBeforeGoCall
+ // (low address)
+ ptr := unsafe.Pointer(stackPointerBeforeGoCall)
+ size := *(*uint64)(unsafe.Add(ptr, 8))
+ var view []uint64
+ {
+ sh := (*reflect.SliceHeader)(unsafe.Pointer(&view))
+ sh.Data = uintptr(unsafe.Add(ptr, 16)) // skips the (frame_size, sliceSize).
+ sh.Len = int(size)
+ sh.Cap = int(size)
+ }
+ return view
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/machine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/machine.go
new file mode 100644
index 000000000..54ce89e46
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/machine.go
@@ -0,0 +1,100 @@
+package backend
+
+import (
+ "context"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
+)
+
+type (
+ // Machine is a backend for a specific ISA machine.
+ Machine interface {
+ ExecutableContext() ExecutableContext
+
+ // DisableStackCheck disables the stack check for the current compilation for debugging/testing.
+ DisableStackCheck()
+
+ // SetCurrentABI initializes the FunctionABI for the given signature.
+ SetCurrentABI(abi *FunctionABI)
+
+ // SetCompiler sets the compilation context used for the lifetime of Machine.
+ // This is only called once per Machine, i.e. before the first compilation.
+ SetCompiler(Compiler)
+
+ // LowerSingleBranch is called when the compilation of the given single branch is started.
+ LowerSingleBranch(b *ssa.Instruction)
+
+ // LowerConditionalBranch is called when the compilation of the given conditional branch is started.
+ LowerConditionalBranch(b *ssa.Instruction)
+
+ // LowerInstr is called for each instruction in the given block except for the ones marked as already lowered
+ // via Compiler.MarkLowered. The order is reverse, i.e. from the last instruction to the first one.
+ //
+ // Note: this can lower multiple instructions (which produce the inputs) at once whenever it's possible
+ // for optimization.
+ LowerInstr(*ssa.Instruction)
+
+ // Reset resets the machine state for the next compilation.
+ Reset()
+
+ // InsertMove inserts a move instruction from src to dst whose type is typ.
+ InsertMove(dst, src regalloc.VReg, typ ssa.Type)
+
+ // InsertReturn inserts the return instruction to return from the current function.
+ InsertReturn()
+
+ // InsertLoadConstantBlockArg inserts the instruction(s) to load the constant value into the given regalloc.VReg.
+ InsertLoadConstantBlockArg(instr *ssa.Instruction, vr regalloc.VReg)
+
+ // Format returns the string representation of the currently compiled machine code.
+		// This is only for testing purposes.
+ Format() string
+
+ // RegAlloc does the register allocation after lowering.
+ RegAlloc()
+
+ // PostRegAlloc does the post register allocation, e.g. setting up prologue/epilogue, redundant move elimination, etc.
+ PostRegAlloc()
+
+ // ResolveRelocations resolves the relocations after emitting machine code.
+ // * refToBinaryOffset: the map from the function reference (ssa.FuncRef) to the executable offset.
+ // * executable: the binary to resolve the relocations.
+ // * relocations: the relocations to resolve.
+ // * callTrampolineIslandOffsets: the offsets of the trampoline islands in the executable.
+ ResolveRelocations(
+ refToBinaryOffset []int,
+ executable []byte,
+ relocations []RelocationInfo,
+ callTrampolineIslandOffsets []int,
+ )
+
+ // Encode encodes the machine instructions to the Compiler.
+ Encode(ctx context.Context) error
+
+ // CompileGoFunctionTrampoline compiles the trampoline function to call a Go function of the given exit code and signature.
+ CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte
+
+ // CompileStackGrowCallSequence returns the sequence of instructions shared by all functions to
+ // call the stack grow builtin function.
+ CompileStackGrowCallSequence() []byte
+
+ // CompileEntryPreamble returns the sequence of instructions shared by multiple functions to
+ // enter the function from Go.
+ CompileEntryPreamble(signature *ssa.Signature) []byte
+
+ // LowerParams lowers the given parameters.
+ LowerParams(params []ssa.Value)
+
+ // LowerReturns lowers the given returns.
+ LowerReturns(returns []ssa.Value)
+
+ // ArgsResultsRegs returns the registers used for arguments and return values.
+ ArgsResultsRegs() (argResultInts, argResultFloats []regalloc.RealReg)
+
+ // CallTrampolineIslandInfo returns the interval of the offset where the trampoline island is placed, and
+ // the size of the trampoline island. If islandSize is zero, the trampoline island is not used on this machine.
+ CallTrampolineIslandInfo(numFunctions int) (interval, islandSize int, err error)
+ }
+)
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go
new file mode 100644
index 000000000..3f36c84e5
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go
@@ -0,0 +1,319 @@
+package backend
+
+import (
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+)
+
+// RegAllocFunctionMachine is the interface for the machine specific logic that will be used in RegAllocFunction.
+type RegAllocFunctionMachine[I regalloc.InstrConstraint] interface {
+ // InsertMoveBefore inserts the move instruction from src to dst before the given instruction.
+ InsertMoveBefore(dst, src regalloc.VReg, instr I)
+ // InsertStoreRegisterAt inserts the instruction(s) to store the given virtual register at the given instruction.
+ // If after is true, the instruction(s) will be inserted after the given instruction, otherwise before.
+ InsertStoreRegisterAt(v regalloc.VReg, instr I, after bool) I
+ // InsertReloadRegisterAt inserts the instruction(s) to reload the given virtual register at the given instruction.
+ // If after is true, the instruction(s) will be inserted after the given instruction, otherwise before.
+ InsertReloadRegisterAt(v regalloc.VReg, instr I, after bool) I
+ // ClobberedRegisters is called when the register allocation is done and the clobbered registers are known.
+ ClobberedRegisters(regs []regalloc.VReg)
+ // Swap swaps the two virtual registers after the given instruction.
+ Swap(cur I, x1, x2, tmp regalloc.VReg)
+ // LastInstrForInsertion implements LastInstrForInsertion of regalloc.Function. See its comment for details.
+ LastInstrForInsertion(begin, end I) I
+ // SSABlockLabel returns the label of the given ssa.BasicBlockID.
+ SSABlockLabel(id ssa.BasicBlockID) Label
+}
+
+type (
+ // RegAllocFunction implements regalloc.Function.
+ RegAllocFunction[I regalloc.InstrConstraint, m RegAllocFunctionMachine[I]] struct {
+ m m
+ ssb ssa.Builder
+ c Compiler
+ // iter is the iterator for reversePostOrderBlocks
+ iter int
+ reversePostOrderBlocks []RegAllocBlock[I, m]
+ // labelToRegAllocBlockIndex maps label to the index of reversePostOrderBlocks.
+ labelToRegAllocBlockIndex map[Label]int
+ loopNestingForestRoots []ssa.BasicBlock
+ }
+
+ // RegAllocBlock implements regalloc.Block.
+ RegAllocBlock[I regalloc.InstrConstraint, m RegAllocFunctionMachine[I]] struct {
+		// f is the function this block belongs to. Used to reuse the regAllocFunctionImpl.predsSlice slice for Defs() and Uses().
+ f *RegAllocFunction[I, m]
+ sb ssa.BasicBlock
+ l Label
+ begin, end I
+ loopNestingForestChildren []ssa.BasicBlock
+ cur I
+ id int
+ cachedLastInstrForInsertion I
+ }
+)
+
+// NewRegAllocFunction returns a new RegAllocFunction.
+func NewRegAllocFunction[I regalloc.InstrConstraint, M RegAllocFunctionMachine[I]](m M, ssb ssa.Builder, c Compiler) *RegAllocFunction[I, M] {
+ return &RegAllocFunction[I, M]{
+ m: m,
+ ssb: ssb,
+ c: c,
+ labelToRegAllocBlockIndex: make(map[Label]int),
+ }
+}
+
+// AddBlock adds a new block to the function.
+func (f *RegAllocFunction[I, M]) AddBlock(sb ssa.BasicBlock, l Label, begin, end I) {
+ i := len(f.reversePostOrderBlocks)
+ f.reversePostOrderBlocks = append(f.reversePostOrderBlocks, RegAllocBlock[I, M]{
+ f: f,
+ sb: sb,
+ l: l,
+ begin: begin,
+ end: end,
+ id: int(sb.ID()),
+ })
+ f.labelToRegAllocBlockIndex[l] = i
+}
+
+// Reset resets the function for the next compilation.
+func (f *RegAllocFunction[I, M]) Reset() {
+ f.reversePostOrderBlocks = f.reversePostOrderBlocks[:0]
+ f.iter = 0
+}
+
+// StoreRegisterAfter implements regalloc.Function StoreRegisterAfter.
+func (f *RegAllocFunction[I, M]) StoreRegisterAfter(v regalloc.VReg, instr regalloc.Instr) {
+ m := f.m
+ m.InsertStoreRegisterAt(v, instr.(I), true)
+}
+
+// ReloadRegisterBefore implements regalloc.Function ReloadRegisterBefore.
+func (f *RegAllocFunction[I, M]) ReloadRegisterBefore(v regalloc.VReg, instr regalloc.Instr) {
+ m := f.m
+ m.InsertReloadRegisterAt(v, instr.(I), false)
+}
+
+// ReloadRegisterAfter implements regalloc.Function ReloadRegisterAfter.
+func (f *RegAllocFunction[I, M]) ReloadRegisterAfter(v regalloc.VReg, instr regalloc.Instr) {
+ m := f.m
+ m.InsertReloadRegisterAt(v, instr.(I), true)
+}
+
+// StoreRegisterBefore implements regalloc.Function StoreRegisterBefore.
+func (f *RegAllocFunction[I, M]) StoreRegisterBefore(v regalloc.VReg, instr regalloc.Instr) {
+ m := f.m
+ m.InsertStoreRegisterAt(v, instr.(I), false)
+}
+
+// ClobberedRegisters implements regalloc.Function ClobberedRegisters.
+func (f *RegAllocFunction[I, M]) ClobberedRegisters(regs []regalloc.VReg) {
+ f.m.ClobberedRegisters(regs)
+}
+
+// SwapBefore implements regalloc.Function SwapBefore.
+func (f *RegAllocFunction[I, M]) SwapBefore(x1, x2, tmp regalloc.VReg, instr regalloc.Instr) {
+ f.m.Swap(instr.Prev().(I), x1, x2, tmp)
+}
+
+// PostOrderBlockIteratorBegin implements regalloc.Function PostOrderBlockIteratorBegin.
+func (f *RegAllocFunction[I, M]) PostOrderBlockIteratorBegin() regalloc.Block {
+ f.iter = len(f.reversePostOrderBlocks) - 1
+ return f.PostOrderBlockIteratorNext()
+}
+
+// PostOrderBlockIteratorNext implements regalloc.Function PostOrderBlockIteratorNext.
+func (f *RegAllocFunction[I, M]) PostOrderBlockIteratorNext() regalloc.Block {
+ if f.iter < 0 {
+ return nil
+ }
+ b := &f.reversePostOrderBlocks[f.iter]
+ f.iter--
+ return b
+}
+
+// ReversePostOrderBlockIteratorBegin implements regalloc.Function ReversePostOrderBlockIteratorBegin.
+func (f *RegAllocFunction[I, M]) ReversePostOrderBlockIteratorBegin() regalloc.Block {
+ f.iter = 0
+ return f.ReversePostOrderBlockIteratorNext()
+}
+
+// ReversePostOrderBlockIteratorNext implements regalloc.Function ReversePostOrderBlockIteratorNext.
+func (f *RegAllocFunction[I, M]) ReversePostOrderBlockIteratorNext() regalloc.Block {
+ if f.iter >= len(f.reversePostOrderBlocks) {
+ return nil
+ }
+ b := &f.reversePostOrderBlocks[f.iter]
+ f.iter++
+ return b
+}
+
+// LoopNestingForestRoots implements regalloc.Function LoopNestingForestRoots.
+func (f *RegAllocFunction[I, M]) LoopNestingForestRoots() int {
+ f.loopNestingForestRoots = f.ssb.LoopNestingForestRoots()
+ return len(f.loopNestingForestRoots)
+}
+
+// LoopNestingForestRoot implements regalloc.Function LoopNestingForestRoot.
+func (f *RegAllocFunction[I, M]) LoopNestingForestRoot(i int) regalloc.Block {
+ blk := f.loopNestingForestRoots[i]
+ l := f.m.SSABlockLabel(blk.ID())
+ index := f.labelToRegAllocBlockIndex[l]
+ return &f.reversePostOrderBlocks[index]
+}
+
+// InsertMoveBefore implements regalloc.Function InsertMoveBefore.
+func (f *RegAllocFunction[I, M]) InsertMoveBefore(dst, src regalloc.VReg, instr regalloc.Instr) {
+ f.m.InsertMoveBefore(dst, src, instr.(I))
+}
+
+// LowestCommonAncestor implements regalloc.Function LowestCommonAncestor.
+func (f *RegAllocFunction[I, M]) LowestCommonAncestor(blk1, blk2 regalloc.Block) regalloc.Block {
+ ret := f.ssb.LowestCommonAncestor(blk1.(*RegAllocBlock[I, M]).sb, blk2.(*RegAllocBlock[I, M]).sb)
+ l := f.m.SSABlockLabel(ret.ID())
+ index := f.labelToRegAllocBlockIndex[l]
+ return &f.reversePostOrderBlocks[index]
+}
+
+// Idom implements regalloc.Function Idom.
+func (f *RegAllocFunction[I, M]) Idom(blk regalloc.Block) regalloc.Block {
+ builder := f.ssb
+ idom := builder.Idom(blk.(*RegAllocBlock[I, M]).sb)
+ if idom == nil {
+ panic("BUG: idom must not be nil")
+ }
+ l := f.m.SSABlockLabel(idom.ID())
+ index := f.labelToRegAllocBlockIndex[l]
+ return &f.reversePostOrderBlocks[index]
+}
+
+// ID implements regalloc.Block.
+func (r *RegAllocBlock[I, m]) ID() int32 { return int32(r.id) }
+
+// BlockParams implements regalloc.Block.
+func (r *RegAllocBlock[I, m]) BlockParams(regs *[]regalloc.VReg) []regalloc.VReg {
+ c := r.f.c
+ *regs = (*regs)[:0]
+ for i := 0; i < r.sb.Params(); i++ {
+ v := c.VRegOf(r.sb.Param(i))
+ *regs = append(*regs, v)
+ }
+ return *regs
+}
+
+// InstrIteratorBegin implements regalloc.Block.
+func (r *RegAllocBlock[I, m]) InstrIteratorBegin() regalloc.Instr {
+ r.cur = r.begin
+ return r.cur
+}
+
+// InstrIteratorNext implements regalloc.Block.
+func (r *RegAllocBlock[I, m]) InstrIteratorNext() regalloc.Instr {
+ for {
+ if r.cur == r.end {
+ return nil
+ }
+ instr := r.cur.Next()
+ r.cur = instr.(I)
+ if instr == nil {
+ return nil
+ } else if instr.AddedBeforeRegAlloc() {
+			// Only concerned with instructions added before regalloc.
+ return instr
+ }
+ }
+}
+
+// InstrRevIteratorBegin implements regalloc.Block.
+func (r *RegAllocBlock[I, m]) InstrRevIteratorBegin() regalloc.Instr {
+ r.cur = r.end
+ return r.cur
+}
+
+// InstrRevIteratorNext implements regalloc.Block.
+func (r *RegAllocBlock[I, m]) InstrRevIteratorNext() regalloc.Instr {
+ for {
+ if r.cur == r.begin {
+ return nil
+ }
+ instr := r.cur.Prev()
+ r.cur = instr.(I)
+ if instr == nil {
+ return nil
+ } else if instr.AddedBeforeRegAlloc() {
+			// Only concerned with instructions added before regalloc.
+ return instr
+ }
+ }
+}
+
+// FirstInstr implements regalloc.Block.
+func (r *RegAllocBlock[I, m]) FirstInstr() regalloc.Instr {
+ return r.begin
+}
+
+// EndInstr implements regalloc.Block.
+func (r *RegAllocBlock[I, m]) EndInstr() regalloc.Instr {
+ return r.end
+}
+
+// LastInstrForInsertion implements regalloc.Block.
+func (r *RegAllocBlock[I, m]) LastInstrForInsertion() regalloc.Instr {
+ var nil I
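+	// The predeclared nil is shadowed here with the zero value of I, since comparing a type parameter value
+	// against untyped nil directly would not compile.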
+ if r.cachedLastInstrForInsertion == nil {
+ r.cachedLastInstrForInsertion = r.f.m.LastInstrForInsertion(r.begin, r.end)
+ }
+ return r.cachedLastInstrForInsertion
+}
+
+// Preds implements regalloc.Block.
+func (r *RegAllocBlock[I, m]) Preds() int { return r.sb.Preds() }
+
+// Pred implements regalloc.Block.
+func (r *RegAllocBlock[I, m]) Pred(i int) regalloc.Block {
+ sb := r.sb
+ pred := sb.Pred(i)
+ l := r.f.m.SSABlockLabel(pred.ID())
+ index := r.f.labelToRegAllocBlockIndex[l]
+ return &r.f.reversePostOrderBlocks[index]
+}
+
+// Entry implements regalloc.Block.
+func (r *RegAllocBlock[I, m]) Entry() bool { return r.sb.EntryBlock() }
+
+// Succs implements regalloc.Block.
+func (r *RegAllocBlock[I, m]) Succs() int {
+ return r.sb.Succs()
+}
+
+// Succ implements regalloc.Block.
+func (r *RegAllocBlock[I, m]) Succ(i int) regalloc.Block {
+ sb := r.sb
+ succ := sb.Succ(i)
+ if succ.ReturnBlock() {
+ return nil
+ }
+ l := r.f.m.SSABlockLabel(succ.ID())
+ index := r.f.labelToRegAllocBlockIndex[l]
+ return &r.f.reversePostOrderBlocks[index]
+}
+
+// LoopHeader implements regalloc.Block.
+func (r *RegAllocBlock[I, m]) LoopHeader() bool {
+ return r.sb.LoopHeader()
+}
+
+// LoopNestingForestChildren implements regalloc.Block.
+func (r *RegAllocBlock[I, m]) LoopNestingForestChildren() int {
+ r.loopNestingForestChildren = r.sb.LoopNestingForestChildren()
+ return len(r.loopNestingForestChildren)
+}
+
+// LoopNestingForestChild implements regalloc.Block.
+func (r *RegAllocBlock[I, m]) LoopNestingForestChild(i int) regalloc.Block {
+ blk := r.loopNestingForestChildren[i]
+ l := r.f.m.SSABlockLabel(blk.ID())
+ index := r.f.labelToRegAllocBlockIndex[l]
+ return &r.f.reversePostOrderBlocks[index]
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/api.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/api.go
new file mode 100644
index 000000000..23157b478
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/api.go
@@ -0,0 +1,136 @@
+package regalloc
+
+import "fmt"
+
+// These interfaces are implemented by ISA-specific backends to abstract away the details, and allow the register
+// allocators to work on any ISA.
+//
+// TODO: the interfaces are not stabilized yet; especially x64 will need some changes. E.g. x64 has an addressing mode
+// where the index can be in memory. That kind of info would be useful to reduce register pressure, and should be leveraged
+// by the register allocators, like https://docs.rs/regalloc2/latest/regalloc2/enum.OperandConstraint.html
+
+type (
+ // Function is the top-level interface to do register allocation, which corresponds to a CFG containing
+ // Blocks(s).
+ Function interface {
+ // PostOrderBlockIteratorBegin returns the first block in the post-order traversal of the CFG.
+ // In other words, the last blocks in the CFG will be returned first.
+ PostOrderBlockIteratorBegin() Block
+ // PostOrderBlockIteratorNext returns the next block in the post-order traversal of the CFG.
+ PostOrderBlockIteratorNext() Block
+ // ReversePostOrderBlockIteratorBegin returns the first block in the reverse post-order traversal of the CFG.
+ // In other words, the first blocks in the CFG will be returned first.
+ ReversePostOrderBlockIteratorBegin() Block
+ // ReversePostOrderBlockIteratorNext returns the next block in the reverse post-order traversal of the CFG.
+ ReversePostOrderBlockIteratorNext() Block
+ // ClobberedRegisters reports the registers clobbered by this function to the backend.
+ ClobberedRegisters([]VReg)
+ // LoopNestingForestRoots returns the number of roots of the loop nesting forest in a function.
+ LoopNestingForestRoots() int
+ // LoopNestingForestRoot returns the i-th root of the loop nesting forest in a function.
+ LoopNestingForestRoot(i int) Block
+ // LowestCommonAncestor returns the lowest common ancestor of two blocks in the dominator tree.
+ LowestCommonAncestor(blk1, blk2 Block) Block
+ // Idom returns the immediate dominator of the given block.
+ Idom(blk Block) Block
+
+ // The following methods are for rewriting the function.
+
+ // SwapBefore swaps the values of the two given virtual registers before the given instruction,
+ // using tmp as a temporary register if needed.
+ SwapBefore(x1, x2, tmp VReg, instr Instr)
+ // StoreRegisterBefore inserts store instruction(s) before the given instruction for the given virtual register.
+ StoreRegisterBefore(v VReg, instr Instr)
+ // StoreRegisterAfter inserts store instruction(s) after the given instruction for the given virtual register.
+ StoreRegisterAfter(v VReg, instr Instr)
+ // ReloadRegisterBefore inserts reload instruction(s) before the given instruction for the given virtual register.
+ ReloadRegisterBefore(v VReg, instr Instr)
+ // ReloadRegisterAfter inserts reload instruction(s) after the given instruction for the given virtual register.
+ ReloadRegisterAfter(v VReg, instr Instr)
+ // InsertMoveBefore inserts move instruction(s) before the given instruction for the given virtual registers.
+ InsertMoveBefore(dst, src VReg, instr Instr)
+ }
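+
+ // A minimal sketch of how an allocator is expected to drive the block iterators above
+ // (illustrative only; it mirrors the loops used by Allocator in regalloc.go):
+ //
+ //	for blk := f.PostOrderBlockIteratorBegin(); blk != nil; blk = f.PostOrderBlockIteratorNext() {
+ //		// e.g. liveness analysis; order doesn't matter here.
+ //	}
+ //	for blk := f.ReversePostOrderBlockIteratorBegin(); blk != nil; blk = f.ReversePostOrderBlockIteratorNext() {
+ //		// e.g. the actual allocation in reverse post-order.
+ //	}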
+
+ // Block is a basic block in the CFG of a function; it consists of multiple instructions and has predecessor Block(s).
+ Block interface {
+ // ID returns the unique identifier of this block which is ordered in the reverse post-order traversal of the CFG.
+ ID() int32
+ // BlockParams returns the virtual registers used as the parameters of this block.
+ BlockParams(*[]VReg) []VReg
+ // InstrIteratorBegin returns the first instruction in this block. Instructions added after lowering must be skipped.
+ // Note: multiple Instr(s) will not be held at the same time, so it's safe to use the same impl for the return Instr.
+ InstrIteratorBegin() Instr
+ // InstrIteratorNext returns the next instruction in this block. Instructions added after lowering must be skipped.
+ // Note: multiple Instr(s) will not be held at the same time, so it's safe to use the same impl for the return Instr.
+ InstrIteratorNext() Instr
+ // InstrRevIteratorBegin is the same as InstrIteratorBegin, but in the reverse order.
+ InstrRevIteratorBegin() Instr
+ // InstrRevIteratorNext is the same as InstrIteratorNext, but in the reverse order.
+ InstrRevIteratorNext() Instr
+ // FirstInstr returns the first instruction in this block, after which instructions may be inserted.
+ FirstInstr() Instr
+ // EndInstr returns the end instruction in this block.
+ EndInstr() Instr
+ // LastInstrForInsertion returns the last instruction in this block, before which instructions may be inserted.
+ // Such insertions only happen when we need to insert spill/reload instructions to adjust the merge edges.
+ // At the time of register allocation, all the critical edges are already split, so there is no need
+ // to worry about the case where branching instruction has multiple successors.
+ // Therefore this is usually the nop instruction, but if the block ends with an unconditional branch, it returns
+ // that branch instead of the nop. In other words, it is either the nop or the unconditional branch.
+ LastInstrForInsertion() Instr
+ // Preds returns the number of predecessors of this block in the CFG.
+ Preds() int
+ // Pred returns the i-th predecessor of this block in the CFG.
+ Pred(i int) Block
+ // Entry returns true if the block is for the entry block.
+ Entry() bool
+ // Succs returns the number of successors of this block in the CFG.
+ Succs() int
+ // Succ returns the i-th successor of this block in the CFG.
+ Succ(i int) Block
+ // LoopHeader returns true if this block is a loop header.
+ LoopHeader() bool
+ // LoopNestingForestChildren returns the number of children of this block in the loop nesting forest.
+ LoopNestingForestChildren() int
+ // LoopNestingForestChild returns the i-th child of this block in the loop nesting forest.
+ LoopNestingForestChild(i int) Block
+ }
+
+ // Instr is an instruction in a block, abstracting away the underlying ISA.
+ Instr interface {
+ fmt.Stringer
+ // Next returns the next instruction in the same block.
+ Next() Instr
+ // Prev returns the previous instruction in the same block.
+ Prev() Instr
+ // Defs returns the virtual registers defined by this instruction.
+ Defs(*[]VReg) []VReg
+ // Uses returns the virtual registers used by this instruction.
+ // Note: multiple returned []VReg will not be held at the same time, so it's safe to use the same slice for this.
+ Uses(*[]VReg) []VReg
+ // AssignUse assigns the RealReg-allocated virtual register used by this instruction at the given index.
+ AssignUse(index int, v VReg)
+ // AssignDef assigns a RealReg-allocated virtual register defined by this instruction.
+ // This accepts only one register because we don't allocate registers for multi-def instructions (e.g. call instructions).
+ AssignDef(VReg)
+ // IsCopy returns true if this instruction is a move instruction between two registers.
+ // If true, the instruction is of the form of dst = src, and if the src and dst do not interfere with each other,
+ // we could coalesce them, and hence the copy can be eliminated from the final code.
+ IsCopy() bool
+ // IsCall returns true if this instruction is a call instruction. The result is used to insert
+ // caller saved register spills and restores.
+ IsCall() bool
+ // IsIndirectCall returns true if this instruction is an indirect call instruction which calls a function pointer.
+ // The result is used to insert caller saved register spills and restores.
+ IsIndirectCall() bool
+ // IsReturn returns true if this instruction is a return instruction.
+ IsReturn() bool
+ // AddedBeforeRegAlloc returns true if this instruction is added before register allocation.
+ AddedBeforeRegAlloc() bool
+ }
+
+ // InstrConstraint is an interface for arch-specific instruction constraints.
+ InstrConstraint interface {
+ comparable
+ Instr
+ }
+)
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/reg.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/reg.go
new file mode 100644
index 000000000..46df807e6
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/reg.go
@@ -0,0 +1,123 @@
+package regalloc
+
+import (
+ "fmt"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+)
+
+// VReg represents a register assigned to an SSA value, i.e. a register as seen by the backend.
+// A VReg may or may not be backed by a physical register; the physical register info, if any, can be obtained via RealReg.
+type VReg uint64
+
+// VRegID is the lower 32 bits of a VReg: the pure identifier of the VReg without any RealReg info.
+type VRegID uint32
+
+// RealReg returns the RealReg of this VReg.
+func (v VReg) RealReg() RealReg {
+ return RealReg(v >> 32)
+}
+
+// IsRealReg returns true if this VReg is backed by a physical register.
+func (v VReg) IsRealReg() bool {
+ return v.RealReg() != RealRegInvalid
+}
+
+// FromRealReg returns a VReg from the given RealReg and RegType.
+// This is used to represent a specific pre-colored register in the backend.
+func FromRealReg(r RealReg, typ RegType) VReg {
+ rid := VRegID(r)
+ if rid > vRegIDReservedForRealNum {
+ panic(fmt.Sprintf("invalid real reg %d", r))
+ }
+ return VReg(r).SetRealReg(r).SetRegType(typ)
+}
+
+// SetRealReg sets the RealReg of this VReg and returns the updated VReg.
+func (v VReg) SetRealReg(r RealReg) VReg {
+ return VReg(r)<<32 | (v & 0xff_00_ffffffff)
+}
+
+// RegType returns the RegType of this VReg.
+func (v VReg) RegType() RegType {
+ return RegType(v >> 40)
+}
+
+// SetRegType sets the RegType of this VReg and returns the updated VReg.
+func (v VReg) SetRegType(t RegType) VReg {
+ return VReg(t)<<40 | (v & 0x00_ff_ffffffff)
+}
+
+// ID returns the VRegID of this VReg.
+func (v VReg) ID() VRegID {
+ return VRegID(v & 0xffffffff)
+}
+
+// Valid returns true if this VReg is Valid.
+func (v VReg) Valid() bool {
+ return v.ID() != vRegIDInvalid && v.RegType() != RegTypeInvalid
+}
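+
+// Bit layout of a VReg, as a sketch derived from the accessors above (not an official spec):
+//
+//	bits  0..31: VRegID (the pure identifier)
+//	bits 32..39: RealReg (RealRegInvalid, i.e. 0, while unassigned)
+//	bits 40..47: RegType
+//
+// For example:
+//
+//	v := VReg(10).SetRegType(RegTypeInt) // virtual register v10 of int type, no RealReg yet
+//	v = v.SetRealReg(RealReg(3))         // v10 is now backed by the physical register r3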
+
+// RealReg represents a physical register.
+type RealReg byte
+
+const RealRegInvalid RealReg = 0
+
+const (
+ vRegIDInvalid VRegID = 1 << 31
+ VRegIDNonReservedBegin = vRegIDReservedForRealNum
+ vRegIDReservedForRealNum VRegID = 128
+ VRegInvalid = VReg(vRegIDInvalid)
+)
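+
+// The first vRegIDReservedForRealNum (128) IDs appear to be reserved so that FromRealReg can use the
+// RealReg value itself as the VRegID of a pre-colored register; IDs for ordinary virtual registers are
+// presumably handed out starting at VRegIDNonReservedBegin.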
+
+// String implements fmt.Stringer.
+func (r RealReg) String() string {
+ switch r {
+ case RealRegInvalid:
+ return "invalid"
+ default:
+ return fmt.Sprintf("r%d", r)
+ }
+}
+
+// String implements fmt.Stringer.
+func (v VReg) String() string {
+ if v.IsRealReg() {
+ return fmt.Sprintf("r%d", v.ID())
+ }
+ return fmt.Sprintf("v%d?", v.ID())
+}
+
+// RegType represents the type of a register.
+type RegType byte
+
+const (
+ RegTypeInvalid RegType = iota
+ RegTypeInt
+ RegTypeFloat
+ NumRegType
+)
+
+// String implements fmt.Stringer.
+func (r RegType) String() string {
+ switch r {
+ case RegTypeInt:
+ return "int"
+ case RegTypeFloat:
+ return "float"
+ default:
+ return "invalid"
+ }
+}
+
+// RegTypeOf returns the RegType of the given ssa.Type.
+func RegTypeOf(p ssa.Type) RegType {
+ switch p {
+ case ssa.TypeI32, ssa.TypeI64:
+ return RegTypeInt
+ case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
+ return RegTypeFloat
+ default:
+ panic("invalid type")
+ }
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go
new file mode 100644
index 000000000..b4450d56f
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go
@@ -0,0 +1,1212 @@
+// Package regalloc performs register allocation. The algorithm can work on any ISA by implementing the interfaces in
+// api.go.
+//
+// References:
+// - https://web.stanford.edu/class/archive/cs/cs143/cs143.1128/lectures/17/Slides17.pdf
+// - https://en.wikipedia.org/wiki/Chaitin%27s_algorithm
+// - https://llvm.org/ProjectsWithLLVM/2004-Fall-CS426-LS.pdf
+// - https://pfalcon.github.io/ssabook/latest/book-full.pdf: Chapter 9. for liveness analysis.
+// - https://github.com/golang/go/blob/release-branch.go1.21/src/cmd/compile/internal/ssa/regalloc.go
+package regalloc
+
+import (
+ "fmt"
+ "math"
+ "strings"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
+)
+
+// NewAllocator returns a new Allocator.
+func NewAllocator(allocatableRegs *RegisterInfo) Allocator {
+ a := Allocator{
+ regInfo: allocatableRegs,
+ phiDefInstListPool: wazevoapi.NewPool[phiDefInstList](resetPhiDefInstList),
+ blockStates: wazevoapi.NewIDedPool[blockState](resetBlockState),
+ }
+ a.state.vrStates = wazevoapi.NewIDedPool[vrState](resetVrState)
+ a.state.reset()
+ for _, regs := range allocatableRegs.AllocatableRegisters {
+ for _, r := range regs {
+ a.allocatableSet = a.allocatableSet.add(r)
+ }
+ }
+ return a
+}
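+
+// A minimal sketch of the intended call sequence (illustrative; the concrete RegisterInfo and the
+// Function implementation are provided by each ISA backend):
+//
+//	a := NewAllocator(&regInfo) // regInfo: the backend's RegisterInfo
+//	a.DoAllocation(f)           // f: the backend's regalloc.Function implementation
+//	a.Reset()                   // before reusing the allocator for the next function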
+
+type (
+ // RegisterInfo holds the statically-known ISA-specific register information.
+ RegisterInfo struct {
+ // AllocatableRegisters is a 2D array of allocatable RealReg, indexed by regTypeNum and regNum.
+ // The order matters: the first element is the most preferred one when allocating.
+ AllocatableRegisters [NumRegType][]RealReg
+ CalleeSavedRegisters RegSet
+ CallerSavedRegisters RegSet
+ RealRegToVReg []VReg
+ // RealRegName returns the name of the given RealReg for debugging.
+ RealRegName func(r RealReg) string
+ RealRegType func(r RealReg) RegType
+ }
+
+ // Allocator is a register allocator.
+ Allocator struct {
+ // regInfo is static per ABI/ISA, and is initialized by the machine during Machine.PrepareRegisterAllocator.
+ regInfo *RegisterInfo
+ // allocatableSet is a set of allocatable RealReg derived from regInfo. Static per ABI/ISA.
+ allocatableSet RegSet
+ allocatedCalleeSavedRegs []VReg
+ vs []VReg
+ vs2 []VRegID
+ phiDefInstListPool wazevoapi.Pool[phiDefInstList]
+
+ // The following fields are reused in various places.
+ blks []Block
+ reals []RealReg
+ currentOccupants regInUseSet
+
+ // The following two fields are updated while iterating the blocks in reverse post-order.
+ state state
+ blockStates wazevoapi.IDedPool[blockState]
+ }
+
+ // programCounter represents an opaque index into the program, used to represent the live interval of a VReg.
+ programCounter int32
+
+ state struct {
+ argRealRegs []VReg
+ regsInUse regInUseSet
+ vrStates wazevoapi.IDedPool[vrState]
+
+ currentBlockID int32
+
+ // allocatedRegSet is a set of RealReg that are allocated during the allocation phase. This is reset per function.
+ allocatedRegSet RegSet
+ }
+
+ blockState struct {
+ // liveIns is a list of VReg that are live at the beginning of the block.
+ liveIns []VRegID
+ // seen is true if the block is visited during the liveness analysis.
+ seen bool
+ // visited is true if the block is visited during the allocation phase.
+ visited bool
+ startFromPredIndex int
+ // startRegs is a list of RealReg that are used at the beginning of the block. This is used to fix the merge edges.
+ startRegs regInUseSet
+ // endRegs is a list of RealReg that are used at the end of the block. This is used to fix the merge edges.
+ endRegs regInUseSet
+ }
+
+ vrState struct {
+ v VReg
+ r RealReg
+ // defInstr is the instruction that defines this value. If this is the phi value and not the entry block, this is nil.
+ defInstr Instr
+ // defBlk is the block that defines this value. If this is the phi value, this is the block whose arguments contain this value.
+ defBlk Block
+ // lca is the lowest common ancestor of all the blocks that reload this value.
+ // This is used to determine the spill location. Only valid if spilled=true.
+ lca Block
+ // lastUse is the program counter of the last use of this value. This changes while iterating the block, and
+ // should not be used across the blocks as it becomes invalid. To check the validity, use lastUseUpdatedAtBlockID.
+ lastUse programCounter
+ lastUseUpdatedAtBlockID int32
+ // spilled is true if this value is spilled, i.e. the value is reloaded from the stack somewhere in the program.
+ //
+ // Note that during liveness analysis this field is reused for a different purpose: it flags whether the
+ // value is live-in.
+ spilled bool
+ // isPhi is true if this is a phi value.
+ isPhi bool
+ desiredLoc desiredLoc
+ // phiDefInstList is a linked list of the instructions that define this phi value.
+ // This is used to determine the spill location, and only valid if isPhi=true.
+ *phiDefInstList
+ }
+
+ // phiDefInstList is a linked list of instructions that defines a phi value.
+ phiDefInstList struct {
+ instr Instr
+ v VReg
+ next *phiDefInstList
+ }
+
+ // desiredLoc represents a desired location for a VReg.
+ desiredLoc uint16
+ // desiredLocKind is a kind of desired location for a VReg.
+ desiredLocKind uint16
+)
+
+const (
+ // desiredLocKindUnspecified is a kind of desired location for a VReg that is not specified.
+ desiredLocKindUnspecified desiredLocKind = iota
+ // desiredLocKindStack is a kind of desired location for a VReg that is on the stack, only used for the phi values.
+ desiredLocKindStack
+ // desiredLocKindReg is a kind of desired location for a VReg that is in a register.
+ desiredLocKindReg
+ desiredLocUnspecified = desiredLoc(desiredLocKindUnspecified)
+ desiredLocStack = desiredLoc(desiredLocKindStack)
+)
+
+func newDesiredLocReg(r RealReg) desiredLoc {
+ return desiredLoc(desiredLocKindReg) | desiredLoc(r<<2)
+}
+
+func (d desiredLoc) realReg() RealReg {
+ return RealReg(d >> 2)
+}
+
+func (d desiredLoc) stack() bool {
+ return d&3 == desiredLoc(desiredLocKindStack)
+}
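+
+// In other words, a desiredLoc packs its kind into the low two bits and, for desiredLocKindReg, the
+// RealReg into the remaining bits (a reading of the helpers above, not an official spec):
+//
+//	d := newDesiredLocReg(RealReg(5)) // == desiredLoc(desiredLocKindReg) | desiredLoc(5<<2)
+//	d.realReg()                       // RealReg(5)
+//	desiredLocStack.stack()           // true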
+
+func resetPhiDefInstList(l *phiDefInstList) {
+ l.instr = nil
+ l.next = nil
+ l.v = VRegInvalid
+}
+
+func (s *state) dump(info *RegisterInfo) { //nolint:unused
+ fmt.Println("\t\tstate:")
+ fmt.Println("\t\t\targRealRegs:", s.argRealRegs)
+ fmt.Println("\t\t\tregsInUse", s.regsInUse.format(info))
+ fmt.Println("\t\t\tallocatedRegSet:", s.allocatedRegSet.format(info))
+ fmt.Println("\t\t\tused:", s.regsInUse.format(info))
+ var strs []string
+ for i := 0; i <= s.vrStates.MaxIDEncountered(); i++ {
+ vs := s.vrStates.Get(i)
+ if vs == nil {
+ continue
+ }
+ if vs.r != RealRegInvalid {
+ strs = append(strs, fmt.Sprintf("(v%d: %s)", vs.v.ID(), info.RealRegName(vs.r)))
+ }
+ }
+ fmt.Println("\t\t\tvrStates:", strings.Join(strs, ", "))
+}
+
+func (s *state) reset() {
+ s.argRealRegs = s.argRealRegs[:0]
+ s.vrStates.Reset()
+ s.allocatedRegSet = RegSet(0)
+ s.regsInUse.reset()
+ s.currentBlockID = -1
+}
+
+func (s *state) setVRegState(v VReg, r RealReg) {
+ id := int(v.ID())
+ st := s.vrStates.GetOrAllocate(id)
+ st.r = r
+ st.v = v
+}
+
+func resetVrState(vs *vrState) {
+ vs.v = VRegInvalid
+ vs.r = RealRegInvalid
+ vs.defInstr = nil
+ vs.defBlk = nil
+ vs.spilled = false
+ vs.lastUse = -1
+ vs.lastUseUpdatedAtBlockID = -1
+ vs.lca = nil
+ vs.isPhi = false
+ vs.phiDefInstList = nil
+ vs.desiredLoc = desiredLocUnspecified
+}
+
+func (s *state) getVRegState(v VRegID) *vrState {
+ return s.vrStates.GetOrAllocate(int(v))
+}
+
+func (s *state) useRealReg(r RealReg, v VReg) {
+ if s.regsInUse.has(r) {
+ panic("BUG: useRealReg: the given real register is already used")
+ }
+ s.regsInUse.add(r, v)
+ s.setVRegState(v, r)
+ s.allocatedRegSet = s.allocatedRegSet.add(r)
+}
+
+func (s *state) releaseRealReg(r RealReg) {
+ current := s.regsInUse.get(r)
+ if current.Valid() {
+ s.regsInUse.remove(r)
+ s.setVRegState(current, RealRegInvalid)
+ }
+}
+
+// recordReload records that the given VReg is reloaded in the given block.
+// This is used to determine the spill location by tracking the lowest common ancestor of all the blocks that reloads the value.
+func (vs *vrState) recordReload(f Function, blk Block) {
+ vs.spilled = true
+ if vs.lca == nil {
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Printf("\t\tv%d is reloaded in blk%d,\n", vs.v.ID(), blk.ID())
+ }
+ vs.lca = blk
+ } else {
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Printf("\t\tv%d is reloaded in blk%d, lca=%d\n", vs.v.ID(), blk.ID(), vs.lca.ID())
+ }
+ vs.lca = f.LowestCommonAncestor(vs.lca, blk)
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Printf("updated lca=%d\n", vs.lca.ID())
+ }
+ }
+}
+
+func (s *state) findOrSpillAllocatable(a *Allocator, allocatable []RealReg, forbiddenMask RegSet, preferred RealReg) (r RealReg) {
+ r = RealRegInvalid
+ // First, check if the preferred register is usable (allocatable, not forbidden, and not in use).
+ if preferred != RealRegInvalid && !forbiddenMask.has(preferred) && !s.regsInUse.has(preferred) {
+ for _, candidateReal := range allocatable {
+ // TODO: we should ensure the preferred register is in the allocatable set in the first place,
+ // but right now, just in case, we check it here.
+ if candidateReal == preferred {
+ return preferred
+ }
+ }
+ }
+
+ var lastUseAt programCounter
+ var spillVReg VReg
+ for _, candidateReal := range allocatable {
+ if forbiddenMask.has(candidateReal) {
+ continue
+ }
+
+ using := s.regsInUse.get(candidateReal)
+ if using == VRegInvalid {
+ // This is not used at this point.
+ return candidateReal
+ }
+
+ // Real registers in use should not be spilled, so we skip them.
+ // For example, if the register is used as an argument register, it might be
+ // spilled and never reloaded when it ends up being used as a temporary to pass
+ // a stack-based argument.
+ if using.IsRealReg() {
+ continue
+ }
+
+ isPreferred := candidateReal == preferred
+
+ // last == -1 means the value won't be used anymore.
+ if last := s.getVRegState(using.ID()).lastUse; r == RealRegInvalid || isPreferred || last == -1 || (lastUseAt != -1 && last > lastUseAt) {
+ lastUseAt = last
+ r = candidateReal
+ spillVReg = using
+ if isPreferred {
+ break
+ }
+ }
+ }
+
+ if r == RealRegInvalid {
+ panic("no allocatable register found")
+ }
+
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Printf("\tspilling v%d when lastUseAt=%d and regsInUse=%s\n", spillVReg.ID(), lastUseAt, s.regsInUse.format(a.regInfo))
+ }
+ s.releaseRealReg(r)
+ return r
+}
+
+func (s *state) findAllocatable(allocatable []RealReg, forbiddenMask RegSet) RealReg {
+ for _, r := range allocatable {
+ if !s.regsInUse.has(r) && !forbiddenMask.has(r) {
+ return r
+ }
+ }
+ return RealRegInvalid
+}
+
+func (s *state) resetAt(bs *blockState) {
+ s.regsInUse.range_(func(_ RealReg, vr VReg) {
+ s.setVRegState(vr, RealRegInvalid)
+ })
+ s.regsInUse.reset()
+ bs.endRegs.range_(func(r RealReg, v VReg) {
+ id := int(v.ID())
+ st := s.vrStates.GetOrAllocate(id)
+ if st.lastUseUpdatedAtBlockID == s.currentBlockID && st.lastUse == programCounterLiveIn {
+ s.regsInUse.add(r, v)
+ s.setVRegState(v, r)
+ }
+ })
+}
+
+func resetBlockState(b *blockState) {
+ b.seen = false
+ b.visited = false
+ b.endRegs.reset()
+ b.startRegs.reset()
+ b.startFromPredIndex = -1
+ b.liveIns = b.liveIns[:0]
+}
+
+func (b *blockState) dump(a *RegisterInfo) {
+ fmt.Println("\t\tblockState:")
+ fmt.Println("\t\t\tstartRegs:", b.startRegs.format(a))
+ fmt.Println("\t\t\tendRegs:", b.endRegs.format(a))
+ fmt.Println("\t\t\tstartFromPredIndex:", b.startFromPredIndex)
+ fmt.Println("\t\t\tvisited:", b.visited)
+}
+
+// DoAllocation performs register allocation on the given Function.
+func (a *Allocator) DoAllocation(f Function) {
+ a.livenessAnalysis(f)
+ a.alloc(f)
+ a.determineCalleeSavedRealRegs(f)
+}
+
+func (a *Allocator) determineCalleeSavedRealRegs(f Function) {
+ a.allocatedCalleeSavedRegs = a.allocatedCalleeSavedRegs[:0]
+ a.state.allocatedRegSet.Range(func(allocatedRealReg RealReg) {
+ if a.regInfo.CalleeSavedRegisters.has(allocatedRealReg) {
+ a.allocatedCalleeSavedRegs = append(a.allocatedCalleeSavedRegs, a.regInfo.RealRegToVReg[allocatedRealReg])
+ }
+ })
+ f.ClobberedRegisters(a.allocatedCalleeSavedRegs)
+}
+
+func (a *Allocator) getOrAllocateBlockState(blockID int32) *blockState {
+ return a.blockStates.GetOrAllocate(int(blockID))
+}
+
+// phiBlk returns the block that defines the given phi value, nil otherwise.
+func (s *state) phiBlk(v VRegID) Block {
+ vs := s.getVRegState(v)
+ if vs.isPhi {
+ return vs.defBlk
+ }
+ return nil
+}
+
+const (
+ programCounterLiveIn = math.MinInt32
+ programCounterLiveOut = math.MaxInt32
+)
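+
+// These sentinels sort before and after any real instruction index: a live-in value behaves as if it were
+// last used before the first instruction of a block, and a live-out value as if it were used after the last
+// one, so ordering comparisons against ordinary pc values work without extra bookkeeping.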
+
+// livenessAnalysis computes the live-in sets of each block (stored in blockState.liveIns).
+// The algorithm here is described in https://pfalcon.github.io/ssabook/latest/book-full.pdf Chapter 9.2.
+func (a *Allocator) livenessAnalysis(f Function) {
+ s := &a.state
+ for blk := f.PostOrderBlockIteratorBegin(); blk != nil; blk = f.PostOrderBlockIteratorNext() { // Order doesn't matter.
+
+ // We should gather phi value data.
+ for _, p := range blk.BlockParams(&a.vs) {
+ vs := s.getVRegState(p.ID())
+ vs.isPhi = true
+ vs.defBlk = blk
+ }
+ }
+
+ for blk := f.PostOrderBlockIteratorBegin(); blk != nil; blk = f.PostOrderBlockIteratorNext() {
+ blkID := blk.ID()
+ info := a.getOrAllocateBlockState(blkID)
+
+ a.vs2 = a.vs2[:0]
+ const (
+ flagDeleted = false
+ flagLive = true
+ )
+ ns := blk.Succs()
+ for i := 0; i < ns; i++ {
+ succ := blk.Succ(i)
+ if succ == nil {
+ continue
+ }
+
+ succID := succ.ID()
+ succInfo := a.getOrAllocateBlockState(succID)
+ if !succInfo.seen { // This means the back edge.
+ continue
+ }
+
+ for _, v := range succInfo.liveIns {
+ if s.phiBlk(v) != succ {
+ st := s.getVRegState(v)
+ // We use .spilled field to store the flag.
+ st.spilled = flagLive
+ a.vs2 = append(a.vs2, v)
+ }
+ }
+ }
+
+ for instr := blk.InstrRevIteratorBegin(); instr != nil; instr = blk.InstrRevIteratorNext() {
+
+ var use, def VReg
+ for _, def = range instr.Defs(&a.vs) {
+ if !def.IsRealReg() {
+ id := def.ID()
+ st := s.getVRegState(id)
+ // We use .spilled field to store the flag.
+ st.spilled = flagDeleted
+ a.vs2 = append(a.vs2, id)
+ }
+ }
+ for _, use = range instr.Uses(&a.vs) {
+ if !use.IsRealReg() {
+ id := use.ID()
+ st := s.getVRegState(id)
+ // We use .spilled field to store the flag.
+ st.spilled = flagLive
+ a.vs2 = append(a.vs2, id)
+ }
+ }
+
+ if def.Valid() && s.phiBlk(def.ID()) != nil {
+ if use.Valid() && use.IsRealReg() {
+ // If the destination is a phi value and the source is a real register, this must be the argument copy at the beginning of the function.
+ a.state.argRealRegs = append(a.state.argRealRegs, use)
+ }
+ }
+ }
+
+ for _, v := range a.vs2 {
+ st := s.getVRegState(v)
+ // We use .spilled field to store the flag.
+ if st.spilled == flagLive { //nolint:gosimple
+ info.liveIns = append(info.liveIns, v)
+ st.spilled = false
+ }
+ }
+
+ info.seen = true
+ }
+
+ nrs := f.LoopNestingForestRoots()
+ for i := 0; i < nrs; i++ {
+ root := f.LoopNestingForestRoot(i)
+ a.loopTreeDFS(root)
+ }
+}
+
+// loopTreeDFS implements Algorithm 9.3 of the SSA book (referenced above) in an iterative way.
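+// It propagates the live-in set of a loop header (minus the phis defined by the loop itself) down to the
+// blocks in the loop body, i.e. its children in the loop nesting forest, so that values live around a loop
+// are also treated as live inside it.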
+func (a *Allocator) loopTreeDFS(entry Block) {
+ a.blks = a.blks[:0]
+ a.blks = append(a.blks, entry)
+
+ s := &a.state
+ for len(a.blks) > 0 {
+ tail := len(a.blks) - 1
+ loop := a.blks[tail]
+ a.blks = a.blks[:tail]
+ a.vs2 = a.vs2[:0]
+ const (
+ flagDone = false
+ flagPending = true
+ )
+ info := a.getOrAllocateBlockState(loop.ID())
+ for _, v := range info.liveIns {
+ if s.phiBlk(v) != loop {
+ a.vs2 = append(a.vs2, v)
+ st := s.getVRegState(v)
+ // We use .spilled field to store the flag.
+ st.spilled = flagPending
+ }
+ }
+
+ var siblingAddedView []VRegID
+ cn := loop.LoopNestingForestChildren()
+ for i := 0; i < cn; i++ {
+ child := loop.LoopNestingForestChild(i)
+ childID := child.ID()
+ childInfo := a.getOrAllocateBlockState(childID)
+
+ if i == 0 {
+ begin := len(childInfo.liveIns)
+ for _, v := range a.vs2 {
+ st := s.getVRegState(v)
+ // We use .spilled field to store the flag.
+ if st.spilled == flagPending { //nolint:gosimple
+ st.spilled = flagDone
+ // TODO: deduplicate, though I don't think it has much impact.
+ childInfo.liveIns = append(childInfo.liveIns, v)
+ }
+ }
+ siblingAddedView = childInfo.liveIns[begin:]
+ } else {
+ // TODO: deduplicate, though I don't think it has much impact.
+ childInfo.liveIns = append(childInfo.liveIns, siblingAddedView...)
+ }
+
+ if child.LoopHeader() {
+ a.blks = append(a.blks, child)
+ }
+ }
+
+ if cn == 0 {
+ // If there's no forest child, we haven't cleared the .spilled field at this point.
+ for _, v := range a.vs2 {
+ st := s.getVRegState(v)
+ st.spilled = false
+ }
+ }
+ }
+}
+
+// alloc allocates registers for the given function by iterating the blocks in the reverse postorder.
+// The algorithm here is derived from the Go compiler's allocator https://github.com/golang/go/blob/release-branch.go1.21/src/cmd/compile/internal/ssa/regalloc.go
+// In short, this is a simple linear-scan register allocation where each block inherits the register allocation state from
+// one of its predecessors and starts allocation from there.
+// If there's a discrepancy between the end states of the predecessors, adjustments are made after allocation to ensure consistency (which we call "fixing the merge state").
+// The spill instructions (stores into the dedicated slots) are inserted after all the allocations and merge-state fixes. That is because
+// at that point we know where all the reloads happen, and therefore we can choose the best place to spill each value. More precisely,
+// the spill happens in the block that is the lowest common ancestor of all the blocks that reload the value.
+//
+// All of this logic closely follows the Go compiler's allocator, which has a detailed description in the source file linked above.
+func (a *Allocator) alloc(f Function) {
+ // First we allocate each block in the reverse postorder (at least one predecessor should be allocated for each block).
+ for blk := f.ReversePostOrderBlockIteratorBegin(); blk != nil; blk = f.ReversePostOrderBlockIteratorNext() {
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Printf("========== allocating blk%d ========\n", blk.ID())
+ }
+ if blk.Entry() {
+ a.finalizeStartReg(blk)
+ }
+ a.allocBlock(f, blk)
+ }
+ // After the allocation, we all know the start and end state of each block. So we can fix the merge states.
+ for blk := f.ReversePostOrderBlockIteratorBegin(); blk != nil; blk = f.ReversePostOrderBlockIteratorNext() {
+ a.fixMergeState(f, blk)
+ }
+ // Finally, we insert the spill instructions as we know all the places where the reloads happen.
+ a.scheduleSpills(f)
+}
+
+func (a *Allocator) updateLiveInVRState(liveness *blockState) {
+ currentBlockID := a.state.currentBlockID
+ for _, v := range liveness.liveIns {
+ vs := a.state.getVRegState(v)
+ vs.lastUse = programCounterLiveIn
+ vs.lastUseUpdatedAtBlockID = currentBlockID
+ }
+}
+
+func (a *Allocator) finalizeStartReg(blk Block) {
+ bID := blk.ID()
+ liveness := a.getOrAllocateBlockState(bID)
+ s := &a.state
+ currentBlkState := a.getOrAllocateBlockState(bID)
+ if currentBlkState.startFromPredIndex > -1 {
+ return
+ }
+
+ s.currentBlockID = bID
+ a.updateLiveInVRState(liveness)
+
+ preds := blk.Preds()
+ var predState *blockState
+ switch preds {
+ case 0: // This is the entry block.
+ case 1:
+ predID := blk.Pred(0).ID()
+ predState = a.getOrAllocateBlockState(predID)
+ currentBlkState.startFromPredIndex = 0
+ default:
+ // TODO: there should be some better heuristic to choose the predecessor.
+ for i := 0; i < preds; i++ {
+ predID := blk.Pred(i).ID()
+ if _predState := a.getOrAllocateBlockState(predID); _predState.visited {
+ predState = _predState
+ currentBlkState.startFromPredIndex = i
+ break
+ }
+ }
+ }
+ if predState == nil {
+ if !blk.Entry() {
+ panic(fmt.Sprintf("BUG: at least one predecessor should be visited for blk%d", blk.ID()))
+ }
+ for _, u := range s.argRealRegs {
+ s.useRealReg(u.RealReg(), u)
+ }
+ currentBlkState.startFromPredIndex = 0
+ } else {
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Printf("allocating blk%d starting from blk%d (on index=%d) \n",
+ bID, blk.Pred(currentBlkState.startFromPredIndex).ID(), currentBlkState.startFromPredIndex)
+ }
+ s.resetAt(predState)
+ }
+
+ s.regsInUse.range_(func(allocated RealReg, v VReg) {
+ currentBlkState.startRegs.add(allocated, v)
+ })
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Printf("finalized start reg for blk%d: %s\n", blk.ID(), currentBlkState.startRegs.format(a.regInfo))
+ }
+}
+
+func (a *Allocator) allocBlock(f Function, blk Block) {
+ bID := blk.ID()
+ s := &a.state
+ currentBlkState := a.getOrAllocateBlockState(bID)
+ s.currentBlockID = bID
+
+ if currentBlkState.startFromPredIndex < 0 {
+ panic("BUG: startFromPredIndex should be set in finalizeStartReg prior to allocBlock")
+ }
+
+ // Clears the previous state.
+ s.regsInUse.range_(func(allocatedRealReg RealReg, vr VReg) {
+ s.setVRegState(vr, RealRegInvalid)
+ })
+ s.regsInUse.reset()
+ // Then set the start state.
+ currentBlkState.startRegs.range_(func(allocatedRealReg RealReg, vr VReg) {
+ s.useRealReg(allocatedRealReg, vr)
+ })
+
+ desiredUpdated := a.vs2[:0]
+
+ // Update the last use of each VReg.
+ var pc programCounter
+ for instr := blk.InstrIteratorBegin(); instr != nil; instr = blk.InstrIteratorNext() {
+ var use, def VReg
+ for _, use = range instr.Uses(&a.vs) {
+ if !use.IsRealReg() {
+ s.getVRegState(use.ID()).lastUse = pc
+ }
+ }
+
+ if instr.IsCopy() {
+ def = instr.Defs(&a.vs)[0]
+ r := def.RealReg()
+ if r != RealRegInvalid {
+ useID := use.ID()
+ vs := s.getVRegState(useID)
+ if !vs.isPhi { // TODO: it is unclear why this check is needed.
+ vs.desiredLoc = newDesiredLocReg(r)
+ desiredUpdated = append(desiredUpdated, useID)
+ }
+ }
+ }
+ pc++
+ }
+
+ // Mark all live-out values by checking live-in of the successors.
+ // While doing so, we also update the desired register values.
+ var succ Block
+ for i, ns := 0, blk.Succs(); i < ns; i++ {
+ succ = blk.Succ(i)
+ if succ == nil {
+ continue
+ }
+
+ succID := succ.ID()
+ succState := a.getOrAllocateBlockState(succID)
+ for _, v := range succState.liveIns {
+ if s.phiBlk(v) != succ {
+ st := s.getVRegState(v)
+ st.lastUse = programCounterLiveOut
+ }
+ }
+
+ if succState.startFromPredIndex > -1 {
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Printf("blk%d -> blk%d: start_regs: %s\n", bID, succID, succState.startRegs.format(a.regInfo))
+ }
+ succState.startRegs.range_(func(allocatedRealReg RealReg, vr VReg) {
+ vs := s.getVRegState(vr.ID())
+ vs.desiredLoc = newDesiredLocReg(allocatedRealReg)
+ desiredUpdated = append(desiredUpdated, vr.ID())
+ })
+ for _, p := range succ.BlockParams(&a.vs) {
+ vs := s.getVRegState(p.ID())
+ if vs.desiredLoc.realReg() == RealRegInvalid {
+ vs.desiredLoc = desiredLocStack
+ desiredUpdated = append(desiredUpdated, p.ID())
+ }
+ }
+ }
+ }
+
+ // Propagate the desired register values from the end of the block to the beginning.
+ for instr := blk.InstrRevIteratorBegin(); instr != nil; instr = blk.InstrRevIteratorNext() {
+ if instr.IsCopy() {
+ def := instr.Defs(&a.vs)[0]
+ defState := s.getVRegState(def.ID())
+ desired := defState.desiredLoc.realReg()
+ if desired == RealRegInvalid {
+ continue
+ }
+
+ use := instr.Uses(&a.vs)[0]
+ useID := use.ID()
+ useState := s.getVRegState(useID)
+ if s.phiBlk(useID) != succ && useState.desiredLoc == desiredLocUnspecified {
+ useState.desiredLoc = newDesiredLocReg(desired)
+ desiredUpdated = append(desiredUpdated, useID)
+ }
+ }
+ }
+
+ pc = 0
+ for instr := blk.InstrIteratorBegin(); instr != nil; instr = blk.InstrIteratorNext() {
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Println(instr)
+ }
+
+ var currentUsedSet RegSet
+ killSet := a.reals[:0]
+
+ // Gather the set of registers that will be used in the current instruction.
+ for _, use := range instr.Uses(&a.vs) {
+ if use.IsRealReg() {
+ r := use.RealReg()
+ currentUsedSet = currentUsedSet.add(r)
+ if a.allocatableSet.has(r) {
+ killSet = append(killSet, r)
+ }
+ } else {
+ vs := s.getVRegState(use.ID())
+ if r := vs.r; r != RealRegInvalid {
+ currentUsedSet = currentUsedSet.add(r)
+ }
+ }
+ }
+
+ for i, use := range instr.Uses(&a.vs) {
+ if !use.IsRealReg() {
+ vs := s.getVRegState(use.ID())
+ killed := vs.lastUse == pc
+ r := vs.r
+
+ if r == RealRegInvalid {
+ r = s.findOrSpillAllocatable(a, a.regInfo.AllocatableRegisters[use.RegType()], currentUsedSet,
+ // Prefer the desired register if it's available.
+ vs.desiredLoc.realReg())
+ vs.recordReload(f, blk)
+ f.ReloadRegisterBefore(use.SetRealReg(r), instr)
+ s.useRealReg(r, use)
+ }
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Printf("\ttrying to use v%v on %s\n", use.ID(), a.regInfo.RealRegName(r))
+ }
+ instr.AssignUse(i, use.SetRealReg(r))
+ currentUsedSet = currentUsedSet.add(r)
+ if killed {
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Printf("\tkill v%d with %s\n", use.ID(), a.regInfo.RealRegName(r))
+ }
+ killSet = append(killSet, r)
+ }
+ }
+ }
+
+ isIndirect := instr.IsIndirectCall()
+ call := instr.IsCall() || isIndirect
+ if call {
+ addr := RealRegInvalid
+ if instr.IsIndirectCall() {
+ addr = a.vs[0].RealReg()
+ }
+ a.releaseCallerSavedRegs(addr)
+ }
+
+ for _, r := range killSet {
+ s.releaseRealReg(r)
+ }
+ a.reals = killSet
+
+ defs := instr.Defs(&a.vs)
+ switch {
+ case len(defs) > 1:
+ // Some instructions define multiple values on real registers.
+ // E.g. call instructions (following calling convention) / div instruction on x64 that defines both rax and rdx.
+ //
+ // Note that we currently assume such instructions define only pre-colored real registers, not VRegs
+ // that require allocation. If we ever need to support that case, the handling must be added here,
+ // though it is unclear whether any such instruction exists.
+ for _, def := range defs {
+ if !def.IsRealReg() {
+ panic("BUG: multiple defs should be on real registers")
+ }
+ r := def.RealReg()
+ if s.regsInUse.has(r) {
+ s.releaseRealReg(r)
+ }
+ s.useRealReg(r, def)
+ }
+ case len(defs) == 1:
+ def := defs[0]
+ if def.IsRealReg() {
+ r := def.RealReg()
+ if a.allocatableSet.has(r) {
+ if s.regsInUse.has(r) {
+ s.releaseRealReg(r)
+ }
+ s.useRealReg(r, def)
+ }
+ } else {
+ vState := s.getVRegState(def.ID())
+ r := vState.r
+
+ if desired := vState.desiredLoc.realReg(); desired != RealRegInvalid {
+ if r != desired {
+ if (vState.isPhi && vState.defBlk == succ) ||
+ // If this is not a phi and it's already assigned a real reg,
+ // this value has multiple definitions, hence we cannot assign the desired register.
+ (!s.regsInUse.has(desired) && r == RealRegInvalid) {
+ // If the phi value is passed via a real register, we force the value to be in the desired register.
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Printf("\t\tv%d is phi and desiredReg=%s\n", def.ID(), a.regInfo.RealRegName(desired))
+ }
+ if r != RealRegInvalid {
+ // If the value is already in a different real register, we release it to change the state.
+ // Otherwise, multiple registers might have the same values at the end, which results in
+ // messing up the merge state reconciliation.
+ s.releaseRealReg(r)
+ }
+ r = desired
+ s.releaseRealReg(r)
+ s.useRealReg(r, def)
+ }
+ }
+ }
+
+ // Allocate a new real register if `def` is not currently assigned one.
+ // It can happen when multiple instructions define the same VReg (e.g. const loads).
+ if r == RealRegInvalid {
+ if instr.IsCopy() {
+ copySrc := instr.Uses(&a.vs)[0].RealReg()
+ if a.allocatableSet.has(copySrc) && !s.regsInUse.has(copySrc) {
+ r = copySrc
+ }
+ }
+ if r == RealRegInvalid {
+ typ := def.RegType()
+ r = s.findOrSpillAllocatable(a, a.regInfo.AllocatableRegisters[typ], RegSet(0), RealRegInvalid)
+ }
+ s.useRealReg(r, def)
+ }
+ dr := def.SetRealReg(r)
+ instr.AssignDef(dr)
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Printf("\tdefining v%d with %s\n", def.ID(), a.regInfo.RealRegName(r))
+ }
+ if vState.isPhi {
+ if vState.desiredLoc.stack() { // Stack based phi value.
+ f.StoreRegisterAfter(dr, instr)
+ // Release the real register as it's not used anymore.
+ s.releaseRealReg(r)
+ } else {
+ // Only the register-based phis need to track their defining instructions,
+ // since the stack-based phis already have their stores inserted above.
+ n := a.phiDefInstListPool.Allocate()
+ n.instr = instr
+ n.next = vState.phiDefInstList
+ n.v = dr
+ vState.phiDefInstList = n
+ }
+ } else {
+ vState.defInstr = instr
+ vState.defBlk = blk
+ }
+ }
+ }
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Println(instr)
+ }
+ pc++
+ }
+
+ s.regsInUse.range_(func(allocated RealReg, v VReg) {
+ currentBlkState.endRegs.add(allocated, v)
+ })
+
+ currentBlkState.visited = true
+ if wazevoapi.RegAllocLoggingEnabled {
+ currentBlkState.dump(a.regInfo)
+ }
+
+ // Reset the desired end location.
+ for _, v := range desiredUpdated {
+ vs := s.getVRegState(v)
+ vs.desiredLoc = desiredLocUnspecified
+ }
+ a.vs2 = desiredUpdated[:0]
+
+ for i := 0; i < blk.Succs(); i++ {
+ succ := blk.Succ(i)
+ if succ == nil {
+ continue
+ }
+ // If the successor is not visited yet, finalize the start state.
+ a.finalizeStartReg(succ)
+ }
+}
+
+func (a *Allocator) releaseCallerSavedRegs(addrReg RealReg) {
+ s := &a.state
+
+ for i := 0; i < 64; i++ {
+ allocated := RealReg(i)
+ if allocated == addrReg { // If this is the call indirect, we should not touch the addr register.
+ continue
+ }
+ if v := s.regsInUse.get(allocated); v.Valid() {
+ if v.IsRealReg() {
+ continue // This register holds a VReg backed by the corresponding RealReg (e.g. an argument register), so leave it as is.
+ }
+ if !a.regInfo.CallerSavedRegisters.has(allocated) {
+ // If this is not a caller-saved register, it is safe to keep it across the call.
+ continue
+ }
+ s.releaseRealReg(allocated)
+ }
+ }
+}
+
+func (a *Allocator) fixMergeState(f Function, blk Block) {
+ preds := blk.Preds()
+ if preds <= 1 {
+ return
+ }
+
+ s := &a.state
+
+ // Restores the state at the beginning of the block.
+ bID := blk.ID()
+ blkSt := a.getOrAllocateBlockState(bID)
+ desiredOccupants := &blkSt.startRegs
+ aliveOnRegVRegs := make(map[VReg]RealReg)
+ for i := 0; i < 64; i++ {
+ r := RealReg(i)
+ if v := blkSt.startRegs.get(r); v.Valid() {
+ aliveOnRegVRegs[v] = r
+ }
+ }
+
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Println("fixMergeState", blk.ID(), ":", desiredOccupants.format(a.regInfo))
+ }
+
+ s.currentBlockID = bID
+ a.updateLiveInVRState(a.getOrAllocateBlockState(bID))
+
+ currentOccupants := &a.currentOccupants
+ for i := 0; i < preds; i++ {
+ currentOccupants.reset()
+ if i == blkSt.startFromPredIndex {
+ continue
+ }
+
+ currentOccupantsRev := make(map[VReg]RealReg)
+ pred := blk.Pred(i)
+ predSt := a.getOrAllocateBlockState(pred.ID())
+ for ii := 0; ii < 64; ii++ {
+ r := RealReg(ii)
+ if v := predSt.endRegs.get(r); v.Valid() {
+ if _, ok := aliveOnRegVRegs[v]; !ok {
+ continue
+ }
+ currentOccupants.add(r, v)
+ currentOccupantsRev[v] = r
+ }
+ }
+
+ s.resetAt(predSt)
+
+ // Finds the free registers if any.
+ intTmp, floatTmp := VRegInvalid, VRegInvalid
+ if intFree := s.findAllocatable(
+ a.regInfo.AllocatableRegisters[RegTypeInt], desiredOccupants.set,
+ ); intFree != RealRegInvalid {
+ intTmp = FromRealReg(intFree, RegTypeInt)
+ }
+ if floatFree := s.findAllocatable(
+ a.regInfo.AllocatableRegisters[RegTypeFloat], desiredOccupants.set,
+ ); floatFree != RealRegInvalid {
+ floatTmp = FromRealReg(floatFree, RegTypeFloat)
+ }
+
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Println("\t", pred.ID(), ":", currentOccupants.format(a.regInfo))
+ }
+
+ for ii := 0; ii < 64; ii++ {
+ r := RealReg(ii)
+ desiredVReg := desiredOccupants.get(r)
+ if !desiredVReg.Valid() {
+ continue
+ }
+
+ currentVReg := currentOccupants.get(r)
+ if desiredVReg.ID() == currentVReg.ID() {
+ continue
+ }
+
+ typ := desiredVReg.RegType()
+ var tmpRealReg VReg
+ if typ == RegTypeInt {
+ tmpRealReg = intTmp
+ } else {
+ tmpRealReg = floatTmp
+ }
+ a.reconcileEdge(f, r, pred, currentOccupants, currentOccupantsRev, currentVReg, desiredVReg, tmpRealReg, typ)
+ }
+ }
+}
+
+func (a *Allocator) reconcileEdge(f Function,
+ r RealReg,
+ pred Block,
+ currentOccupants *regInUseSet,
+ currentOccupantsRev map[VReg]RealReg,
+ currentVReg, desiredVReg VReg,
+ freeReg VReg,
+ typ RegType,
+) {
+ s := &a.state
+ if currentVReg.Valid() {
+ // Both are on reg.
+ er, ok := currentOccupantsRev[desiredVReg]
+ if !ok {
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Printf("\t\tv%d is desired to be on %s, but currently on the stack\n",
+ desiredVReg.ID(), a.regInfo.RealRegName(r),
+ )
+ }
+ // This case is that the desired value is on the stack, but currentVReg is on the target register.
+ // We need to move the current value to the stack, and reload the desired value.
+ // TODO: we can do better here.
+ f.StoreRegisterBefore(currentVReg.SetRealReg(r), pred.LastInstrForInsertion())
+ delete(currentOccupantsRev, currentVReg)
+
+ s.getVRegState(desiredVReg.ID()).recordReload(f, pred)
+ f.ReloadRegisterBefore(desiredVReg.SetRealReg(r), pred.LastInstrForInsertion())
+ currentOccupants.add(r, desiredVReg)
+ currentOccupantsRev[desiredVReg] = r
+ return
+ }
+
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Printf("\t\tv%d is desired to be on %s, but currently on %s\n",
+ desiredVReg.ID(), a.regInfo.RealRegName(r), a.regInfo.RealRegName(er),
+ )
+ }
+ f.SwapBefore(
+ currentVReg.SetRealReg(r),
+ desiredVReg.SetRealReg(er),
+ freeReg,
+ pred.LastInstrForInsertion(),
+ )
+ s.allocatedRegSet = s.allocatedRegSet.add(freeReg.RealReg())
+ currentOccupantsRev[desiredVReg] = r
+ currentOccupantsRev[currentVReg] = er
+ currentOccupants.add(r, desiredVReg)
+ currentOccupants.add(er, currentVReg)
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Printf("\t\tv%d previously on %s moved to %s\n", currentVReg.ID(), a.regInfo.RealRegName(r), a.regInfo.RealRegName(er))
+ }
+ } else {
+ // Desired is on reg, but currently the target register is not used.
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Printf("\t\tv%d is desired to be on %s, current not used\n",
+ desiredVReg.ID(), a.regInfo.RealRegName(r),
+ )
+ }
+ if currentReg, ok := currentOccupantsRev[desiredVReg]; ok {
+ f.InsertMoveBefore(
+ FromRealReg(r, typ),
+ desiredVReg.SetRealReg(currentReg),
+ pred.LastInstrForInsertion(),
+ )
+ currentOccupants.remove(currentReg)
+ } else {
+ s.getVRegState(desiredVReg.ID()).recordReload(f, pred)
+ f.ReloadRegisterBefore(desiredVReg.SetRealReg(r), pred.LastInstrForInsertion())
+ }
+ currentOccupantsRev[desiredVReg] = r
+ currentOccupants.add(r, desiredVReg)
+ }
+
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Println("\t", pred.ID(), ":", currentOccupants.format(a.regInfo))
+ }
+}
+
+func (a *Allocator) scheduleSpills(f Function) {
+ states := a.state.vrStates
+ for i := 0; i <= states.MaxIDEncountered(); i++ {
+ vs := states.Get(i)
+ if vs == nil {
+ continue
+ }
+ if vs.spilled {
+ a.scheduleSpill(f, vs)
+ }
+ }
+}
+
+func (a *Allocator) scheduleSpill(f Function, vs *vrState) {
+ v := vs.v
+ // If the value is the phi value, we need to insert a spill after each phi definition.
+ if vs.isPhi {
+ for defInstr := vs.phiDefInstList; defInstr != nil; defInstr = defInstr.next {
+ f.StoreRegisterAfter(defInstr.v, defInstr.instr)
+ }
+ return
+ }
+
+ pos := vs.lca
+ definingBlk := vs.defBlk
+ r := RealRegInvalid
+ if definingBlk == nil {
+ panic(fmt.Sprintf("BUG: definingBlk should not be nil for %s. This is likely a bug in backend lowering logic", vs.v.String()))
+ }
+ if pos == nil {
+ panic(fmt.Sprintf("BUG: pos should not be nil for %s. This is likely a bug in backend lowering logic", vs.v.String()))
+ }
+
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Printf("v%d is spilled in blk%d, lca=blk%d\n", v.ID(), definingBlk.ID(), pos.ID())
+ }
+ for pos != definingBlk {
+ st := a.getOrAllocateBlockState(pos.ID())
+ for ii := 0; ii < 64; ii++ {
+ rr := RealReg(ii)
+ if st.startRegs.get(rr) == v {
+ r = rr
+ // Already in the register, so we can place the spill at the beginning of the block.
+ break
+ }
+ }
+
+ if r != RealRegInvalid {
+ break
+ }
+
+ pos = f.Idom(pos)
+ }
+
+ if pos == definingBlk {
+ defInstr := vs.defInstr
+ defInstr.Defs(&a.vs)
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Printf("schedule spill v%d after %v\n", v.ID(), defInstr)
+ }
+ f.StoreRegisterAfter(a.vs[0], defInstr)
+ } else {
+ // Found an ancestor block that already holds the value in a register at its beginning,
+ // so we insert the spill right at the start of that block (after its first instruction).
+ first := pos.FirstInstr()
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Printf("schedule spill v%d before %v\n", v.ID(), first)
+ }
+ f.StoreRegisterAfter(v.SetRealReg(r), first)
+ }
+}
+
+// Reset resets the allocator's internal state so that it can be reused.
+func (a *Allocator) Reset() {
+ a.state.reset()
+ a.blockStates.Reset()
+ a.phiDefInstListPool.Reset()
+ a.vs = a.vs[:0]
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go
new file mode 100644
index 000000000..e9bf60661
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go
@@ -0,0 +1,108 @@
+package regalloc
+
+import (
+ "fmt"
+ "strings"
+)
+
+// NewRegSet returns a new RegSet with the given registers.
+func NewRegSet(regs ...RealReg) RegSet {
+ var ret RegSet
+ for _, r := range regs {
+ ret = ret.add(r)
+ }
+ return ret
+}
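+
+// For example (illustrative only):
+//
+//	rs := NewRegSet(RealReg(0), RealReg(3))
+//	_ = rs.has(RealReg(3)) // true
+//	rs = rs.add(RealReg(7))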
+
+// RegSet represents a set of registers.
+type RegSet uint64
+
+func (rs RegSet) format(info *RegisterInfo) string { //nolint:unused
+ var ret []string
+ for i := 0; i < 64; i++ {
+ if rs&(1<<uint(i)) != 0 {
+ ret = append(ret, info.RealRegName(RealReg(i)))
+ }
+ }
+ return strings.Join(ret, ", ")
+}
+
+func (rs RegSet) has(r RealReg) bool {
+ return rs&(1<<uint(r)) != 0
+}
+
+func (rs RegSet) add(r RealReg) RegSet {
+ if r >= 64 {
+ return rs
+ }
+ return rs | 1<<uint(r)
+}
+
+func (rs RegSet) Range(f func(allocatedRealReg RealReg)) {
+ for i := 0; i < 64; i++ {
+ if rs&(1<<uint(i)) != 0 {
+ f(RealReg(i))
+ }
+ }
+}
+
+type regInUseSet struct {
+ set RegSet
+ vrs [64]VReg
+}
+
+func (rs *regInUseSet) reset() {
+ rs.set = 0
+ for i := range rs.vrs {
+ rs.vrs[i] = VRegInvalid
+ }
+}
+
+func (rs *regInUseSet) format(info *RegisterInfo) string { //nolint:unused
+ var ret []string
+ for i := 0; i < 64; i++ {
+ if rs.set&(1<<uint(i)) != 0 {
+ vr := rs.vrs[i]
+ ret = append(ret, fmt.Sprintf("(%s->v%d)", info.RealRegName(RealReg(i)), vr.ID()))
+ }
+ }
+ return strings.Join(ret, ", ")
+}
+
+func (rs *regInUseSet) has(r RealReg) bool {
+ if r >= 64 {
+ return false
+ }
+ return rs.set&(1<<uint(r)) != 0
+}
+
+func (rs *regInUseSet) get(r RealReg) VReg {
+ if r >= 64 {
+ return VRegInvalid
+ }
+ return rs.vrs[r]
+}
+
+func (rs *regInUseSet) remove(r RealReg) {
+ if r >= 64 {
+ return
+ }
+ rs.set &= ^(1 << uint(r))
+ rs.vrs[r] = VRegInvalid
+}
+
+func (rs *regInUseSet) add(r RealReg, vr VReg) {
+ if r >= 64 {
+ return
+ }
+ rs.set |= 1 << uint(r)
+ rs.vrs[r] = vr
+}
+
+func (rs *regInUseSet) range_(f func(allocatedRealReg RealReg, vr VReg)) {
+ for i := 0; i < 64; i++ {
+ if rs.set&(1<<uint(i)) != 0 {
+ f(RealReg(i), rs.vrs[i])
+ }
+ }
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/vdef.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/vdef.go
new file mode 100644
index 000000000..edfa962b5
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/vdef.go
@@ -0,0 +1,43 @@
+package backend
+
+import (
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+)
+
+// SSAValueDefinition represents a definition of an SSA value.
+type SSAValueDefinition struct {
+ // BlockParamValue is valid if Instr == nil
+ BlockParamValue ssa.Value
+
+ // BlkParamVReg is valid if Instr == nil
+ BlkParamVReg regalloc.VReg
+
+ // Instr is not nil if this is a definition from an instruction.
+ Instr *ssa.Instruction
+ // N is the index of the return value in the instr's return values list.
+ N int
+ // RefCount is the number of references to the result.
+ RefCount int
+}
+
+func (d *SSAValueDefinition) IsFromInstr() bool {
+ return d.Instr != nil
+}
+
+func (d *SSAValueDefinition) IsFromBlockParam() bool {
+ return d.Instr == nil
+}
+
+func (d *SSAValueDefinition) SSAValue() ssa.Value {
+ if d.IsFromBlockParam() {
+ return d.BlockParamValue
+ } else {
+ r, rs := d.Instr.Returns()
+ if d.N == 0 {
+ return r
+ } else {
+ return rs[d.N-1]
+ }
+ }
+}