Diffstat (limited to 'vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend')
47 files changed, 0 insertions, 25529 deletions
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/abi.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/abi.go deleted file mode 100644 index cf91c6b7a..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/abi.go +++ /dev/null @@ -1,170 +0,0 @@ -package backend - -import ( - "fmt" - - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" -) - -type ( - // FunctionABI represents the ABI information for a function which corresponds to a ssa.Signature. - FunctionABI struct { - Initialized bool - - Args, Rets []ABIArg - ArgStackSize, RetStackSize int64 - - ArgIntRealRegs byte - ArgFloatRealRegs byte - RetIntRealRegs byte - RetFloatRealRegs byte - } - - // ABIArg represents either argument or return value's location. - ABIArg struct { - // Index is the index of the argument. - Index int - // Kind is the kind of the argument. - Kind ABIArgKind - // Reg is valid if Kind == ABIArgKindReg. - // This VReg must be based on RealReg. - Reg regalloc.VReg - // Offset is valid if Kind == ABIArgKindStack. - // This is the offset from the beginning of either arg or ret stack slot. - Offset int64 - // Type is the type of the argument. - Type ssa.Type - } - - // ABIArgKind is the kind of ABI argument. - ABIArgKind byte -) - -const ( - // ABIArgKindReg represents an argument passed in a register. - ABIArgKindReg = iota - // ABIArgKindStack represents an argument passed in the stack. - ABIArgKindStack -) - -// String implements fmt.Stringer. -func (a *ABIArg) String() string { - return fmt.Sprintf("args[%d]: %s", a.Index, a.Kind) -} - -// String implements fmt.Stringer. -func (a ABIArgKind) String() string { - switch a { - case ABIArgKindReg: - return "reg" - case ABIArgKindStack: - return "stack" - default: - panic("BUG") - } -} - -// Init initializes the abiImpl for the given signature. -func (a *FunctionABI) Init(sig *ssa.Signature, argResultInts, argResultFloats []regalloc.RealReg) { - if len(a.Rets) < len(sig.Results) { - a.Rets = make([]ABIArg, len(sig.Results)) - } - a.Rets = a.Rets[:len(sig.Results)] - a.RetStackSize = a.setABIArgs(a.Rets, sig.Results, argResultInts, argResultFloats) - if argsNum := len(sig.Params); len(a.Args) < argsNum { - a.Args = make([]ABIArg, argsNum) - } - a.Args = a.Args[:len(sig.Params)] - a.ArgStackSize = a.setABIArgs(a.Args, sig.Params, argResultInts, argResultFloats) - - // Gather the real registers usages in arg/return. - a.ArgIntRealRegs, a.ArgFloatRealRegs = 0, 0 - a.RetIntRealRegs, a.RetFloatRealRegs = 0, 0 - for i := range a.Rets { - r := &a.Rets[i] - if r.Kind == ABIArgKindReg { - if r.Type.IsInt() { - a.RetIntRealRegs++ - } else { - a.RetFloatRealRegs++ - } - } - } - for i := range a.Args { - arg := &a.Args[i] - if arg.Kind == ABIArgKindReg { - if arg.Type.IsInt() { - a.ArgIntRealRegs++ - } else { - a.ArgFloatRealRegs++ - } - } - } - - a.Initialized = true -} - -// setABIArgs sets the ABI arguments in the given slice. This assumes that len(s) >= len(types) -// where if len(s) > len(types), the last elements of s is for the multi-return slot. 
-func (a *FunctionABI) setABIArgs(s []ABIArg, types []ssa.Type, ints, floats []regalloc.RealReg) (stackSize int64) { - il, fl := len(ints), len(floats) - - var stackOffset int64 - intParamIndex, floatParamIndex := 0, 0 - for i, typ := range types { - arg := &s[i] - arg.Index = i - arg.Type = typ - if typ.IsInt() { - if intParamIndex >= il { - arg.Kind = ABIArgKindStack - const slotSize = 8 // Align 8 bytes. - arg.Offset = stackOffset - stackOffset += slotSize - } else { - arg.Kind = ABIArgKindReg - arg.Reg = regalloc.FromRealReg(ints[intParamIndex], regalloc.RegTypeInt) - intParamIndex++ - } - } else { - if floatParamIndex >= fl { - arg.Kind = ABIArgKindStack - slotSize := int64(8) // Align at least 8 bytes. - if typ.Bits() == 128 { // Vector. - slotSize = 16 - } - arg.Offset = stackOffset - stackOffset += slotSize - } else { - arg.Kind = ABIArgKindReg - arg.Reg = regalloc.FromRealReg(floats[floatParamIndex], regalloc.RegTypeFloat) - floatParamIndex++ - } - } - } - return stackOffset -} - -func (a *FunctionABI) AlignedArgResultStackSlotSize() uint32 { - stackSlotSize := a.RetStackSize + a.ArgStackSize - // Align stackSlotSize to 16 bytes. - stackSlotSize = (stackSlotSize + 15) &^ 15 - // Check overflow 32-bit. - if stackSlotSize > 0xFFFFFFFF { - panic("ABI stack slot size overflow") - } - return uint32(stackSlotSize) -} - -func (a *FunctionABI) ABIInfoAsUint64() uint64 { - return uint64(a.ArgIntRealRegs)<<56 | - uint64(a.ArgFloatRealRegs)<<48 | - uint64(a.RetIntRealRegs)<<40 | - uint64(a.RetFloatRealRegs)<<32 | - uint64(a.AlignedArgResultStackSlotSize()) -} - -func ABIInfoFromUint64(info uint64) (argIntRealRegs, argFloatRealRegs, retIntRealRegs, retFloatRealRegs byte, stackSlotSize uint32) { - return byte(info >> 56), byte(info >> 48), byte(info >> 40), byte(info >> 32), uint32(info) -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/backend.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/backend.go deleted file mode 100644 index dd67da43e..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/backend.go +++ /dev/null @@ -1,3 +0,0 @@ -// Package backend must be free of Wasm-specific concept. In other words, -// this package must not import internal/wasm package. -package backend diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler.go deleted file mode 100644 index 62d365015..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler.go +++ /dev/null @@ -1,399 +0,0 @@ -package backend - -import ( - "context" - "fmt" - - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" - "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" -) - -// NewCompiler returns a new Compiler that can generate a machine code. 
-func NewCompiler(ctx context.Context, mach Machine, builder ssa.Builder) Compiler { - return newCompiler(ctx, mach, builder) -} - -func newCompiler(_ context.Context, mach Machine, builder ssa.Builder) *compiler { - argResultInts, argResultFloats := mach.ArgsResultsRegs() - c := &compiler{ - mach: mach, ssaBuilder: builder, - nextVRegID: regalloc.VRegIDNonReservedBegin, - argResultInts: argResultInts, - argResultFloats: argResultFloats, - } - mach.SetCompiler(c) - return c -} - -// Compiler is the backend of wazevo which takes ssa.Builder and Machine, -// use the information there to emit the final machine code. -type Compiler interface { - // SSABuilder returns the ssa.Builder used by this compiler. - SSABuilder() ssa.Builder - - // Compile executes the following steps: - // 1. Lower() - // 2. RegAlloc() - // 3. Finalize() - // 4. Encode() - // - // Each step can be called individually for testing purpose, therefore they are exposed in this interface too. - // - // The returned byte slices are the machine code and the relocation information for the machine code. - // The caller is responsible for copying them immediately since the compiler may reuse the buffer. - Compile(ctx context.Context) (_ []byte, _ []RelocationInfo, _ error) - - // Lower lowers the given ssa.Instruction to the machine-specific instructions. - Lower() - - // RegAlloc performs the register allocation after Lower is called. - RegAlloc() - - // Finalize performs the finalization of the compilation, including machine code emission. - // This must be called after RegAlloc. - Finalize(ctx context.Context) error - - // Buf returns the buffer of the encoded machine code. This is only used for testing purpose. - Buf() []byte - - BufPtr() *[]byte - - // Format returns the debug string of the current state of the compiler. - Format() string - - // Init initializes the internal state of the compiler for the next compilation. - Init() - - // AllocateVReg allocates a new virtual register of the given type. - AllocateVReg(typ ssa.Type) regalloc.VReg - - // ValueDefinition returns the definition of the given value. - ValueDefinition(ssa.Value) SSAValueDefinition - - // VRegOf returns the virtual register of the given ssa.Value. - VRegOf(value ssa.Value) regalloc.VReg - - // TypeOf returns the ssa.Type of the given virtual register. - TypeOf(regalloc.VReg) ssa.Type - - // MatchInstr returns true if the given definition is from an instruction with the given opcode, the current group ID, - // and a refcount of 1. That means, the instruction can be merged/swapped within the current instruction group. - MatchInstr(def SSAValueDefinition, opcode ssa.Opcode) bool - - // MatchInstrOneOf is the same as MatchInstr but for multiple opcodes. If it matches one of ssa.Opcode, - // this returns the opcode. Otherwise, this returns ssa.OpcodeInvalid. - // - // Note: caller should be careful to avoid excessive allocation on opcodes slice. - MatchInstrOneOf(def SSAValueDefinition, opcodes []ssa.Opcode) ssa.Opcode - - // AddRelocationInfo appends the relocation information for the function reference at the current buffer offset. - AddRelocationInfo(funcRef ssa.FuncRef) - - // AddSourceOffsetInfo appends the source offset information for the given offset. - AddSourceOffsetInfo(executableOffset int64, sourceOffset ssa.SourceOffset) - - // SourceOffsetInfo returns the source offset information for the current buffer offset. - SourceOffsetInfo() []SourceOffsetInfo - - // EmitByte appends a byte to the buffer. Used during the code emission. 
- EmitByte(b byte) - - // Emit4Bytes appends 4 bytes to the buffer. Used during the code emission. - Emit4Bytes(b uint32) - - // Emit8Bytes appends 8 bytes to the buffer. Used during the code emission. - Emit8Bytes(b uint64) - - // GetFunctionABI returns the ABI information for the given signature. - GetFunctionABI(sig *ssa.Signature) *FunctionABI -} - -// RelocationInfo represents the relocation information for a call instruction. -type RelocationInfo struct { - // Offset represents the offset from the beginning of the machine code of either a function or the entire module. - Offset int64 - // Target is the target function of the call instruction. - FuncRef ssa.FuncRef -} - -// compiler implements Compiler. -type compiler struct { - mach Machine - currentGID ssa.InstructionGroupID - ssaBuilder ssa.Builder - // nextVRegID is the next virtual register ID to be allocated. - nextVRegID regalloc.VRegID - // ssaValueToVRegs maps ssa.ValueID to regalloc.VReg. - ssaValueToVRegs [] /* VRegID to */ regalloc.VReg - ssaValuesInfo []ssa.ValueInfo - // returnVRegs is the list of virtual registers that store the return values. - returnVRegs []regalloc.VReg - varEdges [][2]regalloc.VReg - varEdgeTypes []ssa.Type - constEdges []struct { - cInst *ssa.Instruction - dst regalloc.VReg - } - vRegSet []bool - vRegIDs []regalloc.VRegID - tempRegs []regalloc.VReg - tmpVals []ssa.Value - ssaTypeOfVRegID [] /* VRegID to */ ssa.Type - buf []byte - relocations []RelocationInfo - sourceOffsets []SourceOffsetInfo - // abis maps ssa.SignatureID to the ABI implementation. - abis []FunctionABI - argResultInts, argResultFloats []regalloc.RealReg -} - -// SourceOffsetInfo is a data to associate the source offset with the executable offset. -type SourceOffsetInfo struct { - // SourceOffset is the source offset in the original source code. - SourceOffset ssa.SourceOffset - // ExecutableOffset is the offset in the compiled executable. - ExecutableOffset int64 -} - -// Compile implements Compiler.Compile. -func (c *compiler) Compile(ctx context.Context) ([]byte, []RelocationInfo, error) { - c.Lower() - if wazevoapi.PrintSSAToBackendIRLowering && wazevoapi.PrintEnabledIndex(ctx) { - fmt.Printf("[[[after lowering for %s ]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), c.Format()) - } - if wazevoapi.DeterministicCompilationVerifierEnabled { - wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "After lowering to ISA specific IR", c.Format()) - } - c.RegAlloc() - if wazevoapi.PrintRegisterAllocated && wazevoapi.PrintEnabledIndex(ctx) { - fmt.Printf("[[[after regalloc for %s]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), c.Format()) - } - if wazevoapi.DeterministicCompilationVerifierEnabled { - wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "After Register Allocation", c.Format()) - } - if err := c.Finalize(ctx); err != nil { - return nil, nil, err - } - if wazevoapi.PrintFinalizedMachineCode && wazevoapi.PrintEnabledIndex(ctx) { - fmt.Printf("[[[after finalize for %s]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), c.Format()) - } - if wazevoapi.DeterministicCompilationVerifierEnabled { - wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "After Finalization", c.Format()) - } - return c.buf, c.relocations, nil -} - -// RegAlloc implements Compiler.RegAlloc. -func (c *compiler) RegAlloc() { - c.mach.RegAlloc() -} - -// Finalize implements Compiler.Finalize. 
-func (c *compiler) Finalize(ctx context.Context) error { - c.mach.PostRegAlloc() - return c.mach.Encode(ctx) -} - -// setCurrentGroupID sets the current instruction group ID. -func (c *compiler) setCurrentGroupID(gid ssa.InstructionGroupID) { - c.currentGID = gid -} - -// assignVirtualRegisters assigns a virtual register to each ssa.ValueID Valid in the ssa.Builder. -func (c *compiler) assignVirtualRegisters() { - builder := c.ssaBuilder - c.ssaValuesInfo = builder.ValuesInfo() - - if diff := len(c.ssaValuesInfo) - len(c.ssaValueToVRegs); diff > 0 { - c.ssaValueToVRegs = append(c.ssaValueToVRegs, make([]regalloc.VReg, diff+1)...) - } - - for blk := builder.BlockIteratorReversePostOrderBegin(); blk != nil; blk = builder.BlockIteratorReversePostOrderNext() { - // First we assign a virtual register to each parameter. - for i := 0; i < blk.Params(); i++ { - p := blk.Param(i) - pid := p.ID() - typ := p.Type() - vreg := c.AllocateVReg(typ) - c.ssaValueToVRegs[pid] = vreg - c.ssaTypeOfVRegID[vreg.ID()] = p.Type() - } - - // Assigns each value to a virtual register produced by instructions. - for cur := blk.Root(); cur != nil; cur = cur.Next() { - r, rs := cur.Returns() - if r.Valid() { - id := r.ID() - ssaTyp := r.Type() - typ := r.Type() - vReg := c.AllocateVReg(typ) - c.ssaValueToVRegs[id] = vReg - c.ssaTypeOfVRegID[vReg.ID()] = ssaTyp - } - for _, r := range rs { - id := r.ID() - ssaTyp := r.Type() - vReg := c.AllocateVReg(ssaTyp) - c.ssaValueToVRegs[id] = vReg - c.ssaTypeOfVRegID[vReg.ID()] = ssaTyp - } - } - } - - for i, retBlk := 0, builder.ReturnBlock(); i < retBlk.Params(); i++ { - typ := retBlk.Param(i).Type() - vReg := c.AllocateVReg(typ) - c.returnVRegs = append(c.returnVRegs, vReg) - c.ssaTypeOfVRegID[vReg.ID()] = typ - } -} - -// AllocateVReg implements Compiler.AllocateVReg. -func (c *compiler) AllocateVReg(typ ssa.Type) regalloc.VReg { - regType := regalloc.RegTypeOf(typ) - r := regalloc.VReg(c.nextVRegID).SetRegType(regType) - - id := r.ID() - if int(id) >= len(c.ssaTypeOfVRegID) { - c.ssaTypeOfVRegID = append(c.ssaTypeOfVRegID, make([]ssa.Type, id+1)...) - } - c.ssaTypeOfVRegID[id] = typ - c.nextVRegID++ - return r -} - -// Init implements Compiler.Init. -func (c *compiler) Init() { - c.currentGID = 0 - c.nextVRegID = regalloc.VRegIDNonReservedBegin - c.returnVRegs = c.returnVRegs[:0] - c.mach.Reset() - c.varEdges = c.varEdges[:0] - c.constEdges = c.constEdges[:0] - c.buf = c.buf[:0] - c.sourceOffsets = c.sourceOffsets[:0] - c.relocations = c.relocations[:0] -} - -// ValueDefinition implements Compiler.ValueDefinition. -func (c *compiler) ValueDefinition(value ssa.Value) SSAValueDefinition { - return SSAValueDefinition{ - V: value, - Instr: c.ssaBuilder.InstructionOfValue(value), - RefCount: c.ssaValuesInfo[value.ID()].RefCount, - } -} - -// VRegOf implements Compiler.VRegOf. -func (c *compiler) VRegOf(value ssa.Value) regalloc.VReg { - return c.ssaValueToVRegs[value.ID()] -} - -// Format implements Compiler.Format. -func (c *compiler) Format() string { - return c.mach.Format() -} - -// TypeOf implements Compiler.Format. -func (c *compiler) TypeOf(v regalloc.VReg) ssa.Type { - return c.ssaTypeOfVRegID[v.ID()] -} - -// MatchInstr implements Compiler.MatchInstr. -func (c *compiler) MatchInstr(def SSAValueDefinition, opcode ssa.Opcode) bool { - instr := def.Instr - return def.IsFromInstr() && - instr.Opcode() == opcode && - instr.GroupID() == c.currentGID && - def.RefCount < 2 -} - -// MatchInstrOneOf implements Compiler.MatchInstrOneOf. 
-func (c *compiler) MatchInstrOneOf(def SSAValueDefinition, opcodes []ssa.Opcode) ssa.Opcode { - instr := def.Instr - if !def.IsFromInstr() { - return ssa.OpcodeInvalid - } - - if instr.GroupID() != c.currentGID { - return ssa.OpcodeInvalid - } - - if def.RefCount >= 2 { - return ssa.OpcodeInvalid - } - - opcode := instr.Opcode() - for _, op := range opcodes { - if opcode == op { - return opcode - } - } - return ssa.OpcodeInvalid -} - -// SSABuilder implements Compiler .SSABuilder. -func (c *compiler) SSABuilder() ssa.Builder { - return c.ssaBuilder -} - -// AddSourceOffsetInfo implements Compiler.AddSourceOffsetInfo. -func (c *compiler) AddSourceOffsetInfo(executableOffset int64, sourceOffset ssa.SourceOffset) { - c.sourceOffsets = append(c.sourceOffsets, SourceOffsetInfo{ - SourceOffset: sourceOffset, - ExecutableOffset: executableOffset, - }) -} - -// SourceOffsetInfo implements Compiler.SourceOffsetInfo. -func (c *compiler) SourceOffsetInfo() []SourceOffsetInfo { - return c.sourceOffsets -} - -// AddRelocationInfo implements Compiler.AddRelocationInfo. -func (c *compiler) AddRelocationInfo(funcRef ssa.FuncRef) { - c.relocations = append(c.relocations, RelocationInfo{ - Offset: int64(len(c.buf)), - FuncRef: funcRef, - }) -} - -// Emit8Bytes implements Compiler.Emit8Bytes. -func (c *compiler) Emit8Bytes(b uint64) { - c.buf = append(c.buf, byte(b), byte(b>>8), byte(b>>16), byte(b>>24), byte(b>>32), byte(b>>40), byte(b>>48), byte(b>>56)) -} - -// Emit4Bytes implements Compiler.Emit4Bytes. -func (c *compiler) Emit4Bytes(b uint32) { - c.buf = append(c.buf, byte(b), byte(b>>8), byte(b>>16), byte(b>>24)) -} - -// EmitByte implements Compiler.EmitByte. -func (c *compiler) EmitByte(b byte) { - c.buf = append(c.buf, b) -} - -// Buf implements Compiler.Buf. -func (c *compiler) Buf() []byte { - return c.buf -} - -// BufPtr implements Compiler.BufPtr. -func (c *compiler) BufPtr() *[]byte { - return &c.buf -} - -func (c *compiler) GetFunctionABI(sig *ssa.Signature) *FunctionABI { - if int(sig.ID) >= len(c.abis) { - c.abis = append(c.abis, make([]FunctionABI, int(sig.ID)+1)...) - } - - abi := &c.abis[sig.ID] - if abi.Initialized { - return abi - } - - abi.Init(sig, c.argResultInts, c.argResultFloats) - return abi -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler_lower.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler_lower.go deleted file mode 100644 index 735cfa3d3..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler_lower.go +++ /dev/null @@ -1,226 +0,0 @@ -package backend - -import ( - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" -) - -// Lower implements Compiler.Lower. -func (c *compiler) Lower() { - c.assignVirtualRegisters() - c.mach.SetCurrentABI(c.GetFunctionABI(c.ssaBuilder.Signature())) - c.mach.StartLoweringFunction(c.ssaBuilder.BlockIDMax()) - c.lowerBlocks() -} - -// lowerBlocks lowers each block in the ssa.Builder. -func (c *compiler) lowerBlocks() { - builder := c.ssaBuilder - for blk := builder.BlockIteratorReversePostOrderBegin(); blk != nil; blk = builder.BlockIteratorReversePostOrderNext() { - c.lowerBlock(blk) - } - - // After lowering all blocks, we need to link adjacent blocks to layout one single instruction list. 
- var prev ssa.BasicBlock - for next := builder.BlockIteratorReversePostOrderBegin(); next != nil; next = builder.BlockIteratorReversePostOrderNext() { - if prev != nil { - c.mach.LinkAdjacentBlocks(prev, next) - } - prev = next - } -} - -func (c *compiler) lowerBlock(blk ssa.BasicBlock) { - mach := c.mach - mach.StartBlock(blk) - - // We traverse the instructions in reverse order because we might want to lower multiple - // instructions together. - cur := blk.Tail() - - // First gather the branching instructions at the end of the blocks. - var br0, br1 *ssa.Instruction - if cur.IsBranching() { - br0 = cur - cur = cur.Prev() - if cur != nil && cur.IsBranching() { - br1 = cur - cur = cur.Prev() - } - } - - if br0 != nil { - c.lowerBranches(br0, br1) - } - - if br1 != nil && br0 == nil { - panic("BUG? when a block has conditional branch but doesn't end with an unconditional branch?") - } - - // Now start lowering the non-branching instructions. - for ; cur != nil; cur = cur.Prev() { - c.setCurrentGroupID(cur.GroupID()) - if cur.Lowered() { - continue - } - - switch cur.Opcode() { - case ssa.OpcodeReturn: - rets := cur.ReturnVals() - if len(rets) > 0 { - c.mach.LowerReturns(rets) - } - c.mach.InsertReturn() - default: - mach.LowerInstr(cur) - } - mach.FlushPendingInstructions() - } - - // Finally, if this is the entry block, we have to insert copies of arguments from the real location to the VReg. - if blk.EntryBlock() { - c.lowerFunctionArguments(blk) - } - - mach.EndBlock() -} - -// lowerBranches is called right after StartBlock and before any LowerInstr call if -// there are branches to the given block. br0 is the very end of the block and b1 is the before the br0 if it exists. -// At least br0 is not nil, but br1 can be nil if there's no branching before br0. -// -// See ssa.Instruction IsBranching, and the comment on ssa.BasicBlock. -func (c *compiler) lowerBranches(br0, br1 *ssa.Instruction) { - mach := c.mach - - c.setCurrentGroupID(br0.GroupID()) - c.mach.LowerSingleBranch(br0) - mach.FlushPendingInstructions() - if br1 != nil { - c.setCurrentGroupID(br1.GroupID()) - c.mach.LowerConditionalBranch(br1) - mach.FlushPendingInstructions() - } - - if br0.Opcode() == ssa.OpcodeJump { - _, args, targetBlockID := br0.BranchData() - argExists := len(args) != 0 - if argExists && br1 != nil { - panic("BUG: critical edge split failed") - } - target := c.ssaBuilder.BasicBlock(targetBlockID) - if argExists && target.ReturnBlock() { - if len(args) > 0 { - c.mach.LowerReturns(args) - } - } else if argExists { - c.lowerBlockArguments(args, target) - } - } - mach.FlushPendingInstructions() -} - -func (c *compiler) lowerFunctionArguments(entry ssa.BasicBlock) { - mach := c.mach - - c.tmpVals = c.tmpVals[:0] - data := c.ssaBuilder.ValuesInfo() - for i := 0; i < entry.Params(); i++ { - p := entry.Param(i) - if data[p.ID()].RefCount > 0 { - c.tmpVals = append(c.tmpVals, p) - } else { - // If the argument is not used, we can just pass an invalid value. - c.tmpVals = append(c.tmpVals, ssa.ValueInvalid) - } - } - mach.LowerParams(c.tmpVals) - mach.FlushPendingInstructions() -} - -// lowerBlockArguments lowers how to pass arguments to the given successor block. 
-func (c *compiler) lowerBlockArguments(args []ssa.Value, succ ssa.BasicBlock) { - if len(args) != succ.Params() { - panic("BUG: mismatched number of arguments") - } - - c.varEdges = c.varEdges[:0] - c.varEdgeTypes = c.varEdgeTypes[:0] - c.constEdges = c.constEdges[:0] - for i := 0; i < len(args); i++ { - dst := succ.Param(i) - src := args[i] - - dstReg := c.VRegOf(dst) - srcInstr := c.ssaBuilder.InstructionOfValue(src) - if srcInstr != nil && srcInstr.Constant() { - c.constEdges = append(c.constEdges, struct { - cInst *ssa.Instruction - dst regalloc.VReg - }{cInst: srcInstr, dst: dstReg}) - } else { - srcReg := c.VRegOf(src) - // Even when the src=dst, insert the move so that we can keep such registers keep-alive. - c.varEdges = append(c.varEdges, [2]regalloc.VReg{srcReg, dstReg}) - c.varEdgeTypes = append(c.varEdgeTypes, src.Type()) - } - } - - // Check if there's an overlap among the dsts and srcs in varEdges. - c.vRegIDs = c.vRegIDs[:0] - for _, edge := range c.varEdges { - src := edge[0].ID() - if int(src) >= len(c.vRegSet) { - c.vRegSet = append(c.vRegSet, make([]bool, src+1)...) - } - c.vRegSet[src] = true - c.vRegIDs = append(c.vRegIDs, src) - } - separated := true - for _, edge := range c.varEdges { - dst := edge[1].ID() - if int(dst) >= len(c.vRegSet) { - c.vRegSet = append(c.vRegSet, make([]bool, dst+1)...) - } else { - if c.vRegSet[dst] { - separated = false - break - } - } - } - for _, id := range c.vRegIDs { - c.vRegSet[id] = false // reset for the next use. - } - - if separated { - // If there's no overlap, we can simply move the source to destination. - for i, edge := range c.varEdges { - src, dst := edge[0], edge[1] - c.mach.InsertMove(dst, src, c.varEdgeTypes[i]) - } - } else { - // Otherwise, we allocate a temporary registers and move the source to the temporary register, - // - // First move all of them to temporary registers. - c.tempRegs = c.tempRegs[:0] - for i, edge := range c.varEdges { - src := edge[0] - typ := c.varEdgeTypes[i] - temp := c.AllocateVReg(typ) - c.tempRegs = append(c.tempRegs, temp) - c.mach.InsertMove(temp, src, typ) - } - // Then move the temporary registers to the destination. - for i, edge := range c.varEdges { - temp := c.tempRegs[i] - dst := edge[1] - c.mach.InsertMove(dst, temp, c.varEdgeTypes[i]) - } - } - - // Finally, move the constants. - for _, edge := range c.constEdges { - cInst, dst := edge.cInst, edge.dst - c.mach.InsertLoadConstantBlockArg(cInst, dst) - } -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/go_call.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/go_call.go deleted file mode 100644 index 6fe6d7b3c..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/go_call.go +++ /dev/null @@ -1,33 +0,0 @@ -package backend - -import "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" - -// GoFunctionCallRequiredStackSize returns the size of the stack required for the Go function call. -// argBegin is the index of the first argument in the signature which is not either execution context or module context. -func GoFunctionCallRequiredStackSize(sig *ssa.Signature, argBegin int) (ret, retUnaligned int64) { - var paramNeededInBytes, resultNeededInBytes int64 - for _, p := range sig.Params[argBegin:] { - s := int64(p.Size()) - if s < 8 { - s = 8 // We use uint64 for all basic types, except SIMD v128. 
- } - paramNeededInBytes += s - } - for _, r := range sig.Results { - s := int64(r.Size()) - if s < 8 { - s = 8 // We use uint64 for all basic types, except SIMD v128. - } - resultNeededInBytes += s - } - - if paramNeededInBytes > resultNeededInBytes { - ret = paramNeededInBytes - } else { - ret = resultNeededInBytes - } - retUnaligned = ret - // Align to 16 bytes. - ret = (ret + 15) &^ 15 - return -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi.go deleted file mode 100644 index 130f8c621..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi.go +++ /dev/null @@ -1,186 +0,0 @@ -package amd64 - -import ( - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" -) - -// For the details of the ABI, see: -// https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/cmd/compile/abi-internal.md#amd64-architecture - -var ( - intArgResultRegs = []regalloc.RealReg{rax, rbx, rcx, rdi, rsi, r8, r9, r10, r11} - floatArgResultRegs = []regalloc.RealReg{xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7} -) - -var regInfo = ®alloc.RegisterInfo{ - AllocatableRegisters: [regalloc.NumRegType][]regalloc.RealReg{ - regalloc.RegTypeInt: { - rax, rcx, rdx, rbx, rsi, rdi, r8, r9, r10, r11, r12, r13, r14, r15, - }, - regalloc.RegTypeFloat: { - xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, - }, - }, - CalleeSavedRegisters: regalloc.NewRegSet( - rdx, r12, r13, r14, r15, - xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, - ), - CallerSavedRegisters: regalloc.NewRegSet( - rax, rcx, rbx, rsi, rdi, r8, r9, r10, r11, - xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, - ), - RealRegToVReg: []regalloc.VReg{ - rax: raxVReg, rcx: rcxVReg, rdx: rdxVReg, rbx: rbxVReg, rsp: rspVReg, rbp: rbpVReg, rsi: rsiVReg, rdi: rdiVReg, - r8: r8VReg, r9: r9VReg, r10: r10VReg, r11: r11VReg, r12: r12VReg, r13: r13VReg, r14: r14VReg, r15: r15VReg, - xmm0: xmm0VReg, xmm1: xmm1VReg, xmm2: xmm2VReg, xmm3: xmm3VReg, xmm4: xmm4VReg, xmm5: xmm5VReg, xmm6: xmm6VReg, - xmm7: xmm7VReg, xmm8: xmm8VReg, xmm9: xmm9VReg, xmm10: xmm10VReg, xmm11: xmm11VReg, xmm12: xmm12VReg, - xmm13: xmm13VReg, xmm14: xmm14VReg, xmm15: xmm15VReg, - }, - RealRegName: func(r regalloc.RealReg) string { return regNames[r] }, - RealRegType: func(r regalloc.RealReg) regalloc.RegType { - if r < xmm0 { - return regalloc.RegTypeInt - } - return regalloc.RegTypeFloat - }, -} - -// ArgsResultsRegs implements backend.Machine. -func (m *machine) ArgsResultsRegs() (argResultInts, argResultFloats []regalloc.RealReg) { - return intArgResultRegs, floatArgResultRegs -} - -// LowerParams implements backend.Machine. -func (m *machine) LowerParams(args []ssa.Value) { - a := m.currentABI - - for i, ssaArg := range args { - if !ssaArg.Valid() { - continue - } - reg := m.c.VRegOf(ssaArg) - arg := &a.Args[i] - if arg.Kind == backend.ABIArgKindReg { - m.InsertMove(reg, arg.Reg, arg.Type) - } else { - // - // (high address) - // +-----------------+ - // | ....... | - // | ret Y | - // | ....... | - // | ret 0 | - // | arg X | - // | ....... | - // | arg 1 | - // | arg 0 | - // | ReturnAddress | - // | Caller_RBP | - // +-----------------+ <-- RBP - // | ........... 
| - // | clobbered M | - // | ............ | - // | clobbered 0 | - // | spill slot N | - // | ........... | - // | spill slot 0 | - // RSP--> +-----------------+ - // (low address) - - // Load the value from the arg stack slot above the current RBP. - load := m.allocateInstr() - mem := newOperandMem(m.newAmodeImmRBPReg(uint32(arg.Offset + 16))) - switch arg.Type { - case ssa.TypeI32: - load.asMovzxRmR(extModeLQ, mem, reg) - case ssa.TypeI64: - load.asMov64MR(mem, reg) - case ssa.TypeF32: - load.asXmmUnaryRmR(sseOpcodeMovss, mem, reg) - case ssa.TypeF64: - load.asXmmUnaryRmR(sseOpcodeMovsd, mem, reg) - case ssa.TypeV128: - load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, reg) - default: - panic("BUG") - } - m.insert(load) - } - } -} - -// LowerReturns implements backend.Machine. -func (m *machine) LowerReturns(rets []ssa.Value) { - // Load the XMM registers first as it might need a temporary register to inline - // constant return. - a := m.currentABI - for i, ret := range rets { - r := &a.Rets[i] - if !r.Type.IsInt() { - m.LowerReturn(ret, r) - } - } - // Then load the GPR registers. - for i, ret := range rets { - r := &a.Rets[i] - if r.Type.IsInt() { - m.LowerReturn(ret, r) - } - } -} - -func (m *machine) LowerReturn(ret ssa.Value, r *backend.ABIArg) { - reg := m.c.VRegOf(ret) - if def := m.c.ValueDefinition(ret); def.IsFromInstr() { - // Constant instructions are inlined. - if inst := def.Instr; inst.Constant() { - m.insertLoadConstant(inst, reg) - } - } - if r.Kind == backend.ABIArgKindReg { - m.InsertMove(r.Reg, reg, ret.Type()) - } else { - // - // (high address) - // +-----------------+ - // | ....... | - // | ret Y | - // | ....... | - // | ret 0 | - // | arg X | - // | ....... | - // | arg 1 | - // | arg 0 | - // | ReturnAddress | - // | Caller_RBP | - // +-----------------+ <-- RBP - // | ........... | - // | clobbered M | - // | ............ | - // | clobbered 0 | - // | spill slot N | - // | ........... | - // | spill slot 0 | - // RSP--> +-----------------+ - // (low address) - - // Store the value to the return stack slot above the current RBP. - store := m.allocateInstr() - mem := newOperandMem(m.newAmodeImmRBPReg(uint32(m.currentABI.ArgStackSize + 16 + r.Offset))) - switch r.Type { - case ssa.TypeI32: - store.asMovRM(reg, mem, 4) - case ssa.TypeI64: - store.asMovRM(reg, mem, 8) - case ssa.TypeF32: - store.asXmmMovRM(sseOpcodeMovss, reg, mem) - case ssa.TypeF64: - store.asXmmMovRM(sseOpcodeMovsd, reg, mem) - case ssa.TypeV128: - store.asXmmMovRM(sseOpcodeMovdqu, reg, mem) - } - m.insert(store) - } -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.go deleted file mode 100644 index cbf1cfdc5..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.go +++ /dev/null @@ -1,9 +0,0 @@ -package amd64 - -// entrypoint enters the machine code generated by this backend which begins with the preamble generated by functionABI.EmitGoEntryPreamble below. -// This implements wazevo.entrypoint, and see the comments there for detail. -func entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultPtr *uint64, goAllocatedStackSlicePtr uintptr) - -// afterGoFunctionCallEntrypoint enters the machine code after growing the stack. -// This implements wazevo.afterGoFunctionCallEntrypoint, and see the comments there for detail. 
-func afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.s b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.s deleted file mode 100644 index e9cb131d1..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.s +++ /dev/null @@ -1,29 +0,0 @@ -#include "funcdata.h" -#include "textflag.h" - -// entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultPtr *uint64, goAllocatedStackSlicePtr uintptr -TEXT ·entrypoint(SB), NOSPLIT|NOFRAME, $0-48 - MOVQ preambleExecutable+0(FP), R11 - MOVQ functionExectuable+8(FP), R14 - MOVQ executionContextPtr+16(FP), AX // First argument is passed in AX. - MOVQ moduleContextPtr+24(FP), BX // Second argument is passed in BX. - MOVQ paramResultSlicePtr+32(FP), R12 - MOVQ goAllocatedStackSlicePtr+40(FP), R13 - JMP R11 - -// afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr) -TEXT ·afterGoFunctionCallEntrypoint(SB), NOSPLIT|NOFRAME, $0-32 - MOVQ executable+0(FP), CX - MOVQ executionContextPtr+8(FP), AX // First argument is passed in AX. - - // Save the stack pointer and frame pointer. - MOVQ BP, 16(AX) // 16 == ExecutionContextOffsetOriginalFramePointer - MOVQ SP, 24(AX) // 24 == ExecutionContextOffsetOriginalStackPointer - - // Then set the stack pointer and frame pointer to the values we got from the Go runtime. - MOVQ framePointer+24(FP), BP - - // WARNING: do not update SP before BP, because the Go translates (FP) as (SP) + 8. - MOVQ stackPointer+16(FP), SP - - JMP CX diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_preamble.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_preamble.go deleted file mode 100644 index 882d06c06..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_preamble.go +++ /dev/null @@ -1,248 +0,0 @@ -package amd64 - -import ( - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" - "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" -) - -var ( - executionContextPtrReg = raxVReg - - // Followings are callee saved registers. They can be used freely in the entry preamble - // since the preamble is called via Go assembly function which has stack-based ABI. - - // savedExecutionContextPtr also must be a callee-saved reg so that they can be used in the prologue and epilogue. - savedExecutionContextPtr = rdxVReg - // paramResultSlicePtr must match with entrypoint function in abi_entry_amd64.s. - paramResultSlicePtr = r12VReg - // goAllocatedStackPtr must match with entrypoint function in abi_entry_amd64.s. - goAllocatedStackPtr = r13VReg - // functionExecutable must match with entrypoint function in abi_entry_amd64.s. - functionExecutable = r14VReg - tmpIntReg = r15VReg - tmpXmmReg = xmm15VReg -) - -// CompileEntryPreamble implements backend.Machine. 
-func (m *machine) CompileEntryPreamble(sig *ssa.Signature) []byte { - root := m.compileEntryPreamble(sig) - m.encodeWithoutSSA(root) - buf := m.c.Buf() - return buf -} - -func (m *machine) compileEntryPreamble(sig *ssa.Signature) *instruction { - abi := backend.FunctionABI{} - abi.Init(sig, intArgResultRegs, floatArgResultRegs) - - root := m.allocateNop() - - //// ----------------------------------- prologue ----------------------------------- //// - - // First, we save executionContextPtrReg into a callee-saved register so that it can be used in epilogue as well. - // mov %executionContextPtrReg, %savedExecutionContextPtr - cur := m.move64(executionContextPtrReg, savedExecutionContextPtr, root) - - // Next is to save the original RBP and RSP into the execution context. - cur = m.saveOriginalRSPRBP(cur) - - // Now set the RSP to the Go-allocated stack pointer. - // mov %goAllocatedStackPtr, %rsp - cur = m.move64(goAllocatedStackPtr, rspVReg, cur) - - if stackSlotSize := abi.AlignedArgResultStackSlotSize(); stackSlotSize > 0 { - // Allocate stack slots for the arguments and return values. - // sub $stackSlotSize, %rsp - spDec := m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(uint32(stackSlotSize)), rspVReg, true) - cur = linkInstr(cur, spDec) - } - - var offset uint32 - for i := range abi.Args { - if i < 2 { - // module context ptr and execution context ptr are passed in rax and rbx by the Go assembly function. - continue - } - arg := &abi.Args[i] - cur = m.goEntryPreamblePassArg(cur, paramResultSlicePtr, offset, arg) - if arg.Type == ssa.TypeV128 { - offset += 16 - } else { - offset += 8 - } - } - - // Zero out RBP so that the unwind/stack growth code can correctly detect the end of the stack. - zerosRbp := m.allocateInstr().asAluRmiR(aluRmiROpcodeXor, newOperandReg(rbpVReg), rbpVReg, true) - cur = linkInstr(cur, zerosRbp) - - // Now ready to call the real function. Note that at this point stack pointer is already set to the Go-allocated, - // which is aligned to 16 bytes. - call := m.allocateInstr().asCallIndirect(newOperandReg(functionExecutable), &abi) - cur = linkInstr(cur, call) - - //// ----------------------------------- epilogue ----------------------------------- //// - - // Read the results from regs and the stack, and set them correctly into the paramResultSlicePtr. - offset = 0 - for i := range abi.Rets { - r := &abi.Rets[i] - cur = m.goEntryPreamblePassResult(cur, paramResultSlicePtr, offset, r, uint32(abi.ArgStackSize)) - if r.Type == ssa.TypeV128 { - offset += 16 - } else { - offset += 8 - } - } - - // Finally, restore the original RBP and RSP. - cur = m.restoreOriginalRSPRBP(cur) - - ret := m.allocateInstr().asRet() - linkInstr(cur, ret) - return root -} - -// saveOriginalRSPRBP saves the original RSP and RBP into the execution context. -func (m *machine) saveOriginalRSPRBP(cur *instruction) *instruction { - // mov %rbp, wazevoapi.ExecutionContextOffsetOriginalFramePointer(%executionContextPtrReg) - // mov %rsp, wazevoapi.ExecutionContextOffsetOriginalStackPointer(%executionContextPtrReg) - cur = m.loadOrStore64AtExecutionCtx(executionContextPtrReg, wazevoapi.ExecutionContextOffsetOriginalFramePointer, rbpVReg, true, cur) - cur = m.loadOrStore64AtExecutionCtx(executionContextPtrReg, wazevoapi.ExecutionContextOffsetOriginalStackPointer, rspVReg, true, cur) - return cur -} - -// restoreOriginalRSPRBP restores the original RSP and RBP from the execution context. 
-func (m *machine) restoreOriginalRSPRBP(cur *instruction) *instruction { - // mov wazevoapi.ExecutionContextOffsetOriginalFramePointer(%executionContextPtrReg), %rbp - // mov wazevoapi.ExecutionContextOffsetOriginalStackPointer(%executionContextPtrReg), %rsp - cur = m.loadOrStore64AtExecutionCtx(savedExecutionContextPtr, wazevoapi.ExecutionContextOffsetOriginalFramePointer, rbpVReg, false, cur) - cur = m.loadOrStore64AtExecutionCtx(savedExecutionContextPtr, wazevoapi.ExecutionContextOffsetOriginalStackPointer, rspVReg, false, cur) - return cur -} - -func (m *machine) move64(src, dst regalloc.VReg, prev *instruction) *instruction { - mov := m.allocateInstr().asMovRR(src, dst, true) - return linkInstr(prev, mov) -} - -func (m *machine) loadOrStore64AtExecutionCtx(execCtx regalloc.VReg, offset wazevoapi.Offset, r regalloc.VReg, store bool, prev *instruction) *instruction { - mem := newOperandMem(m.newAmodeImmReg(offset.U32(), execCtx)) - instr := m.allocateInstr() - if store { - instr.asMovRM(r, mem, 8) - } else { - instr.asMov64MR(mem, r) - } - return linkInstr(prev, instr) -} - -// This is for debugging. -func (m *machine) linkUD2(cur *instruction) *instruction { //nolint - return linkInstr(cur, m.allocateInstr().asUD2()) -} - -func (m *machine) goEntryPreamblePassArg(cur *instruction, paramSlicePtr regalloc.VReg, offsetInParamSlice uint32, arg *backend.ABIArg) *instruction { - var dst regalloc.VReg - argTyp := arg.Type - if arg.Kind == backend.ABIArgKindStack { - // Caller saved registers ca - switch argTyp { - case ssa.TypeI32, ssa.TypeI64: - dst = tmpIntReg - case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: - dst = tmpXmmReg - default: - panic("BUG") - } - } else { - dst = arg.Reg - } - - load := m.allocateInstr() - a := newOperandMem(m.newAmodeImmReg(offsetInParamSlice, paramSlicePtr)) - switch arg.Type { - case ssa.TypeI32: - load.asMovzxRmR(extModeLQ, a, dst) - case ssa.TypeI64: - load.asMov64MR(a, dst) - case ssa.TypeF32: - load.asXmmUnaryRmR(sseOpcodeMovss, a, dst) - case ssa.TypeF64: - load.asXmmUnaryRmR(sseOpcodeMovsd, a, dst) - case ssa.TypeV128: - load.asXmmUnaryRmR(sseOpcodeMovdqu, a, dst) - } - - cur = linkInstr(cur, load) - if arg.Kind == backend.ABIArgKindStack { - // Store back to the stack. - store := m.allocateInstr() - a := newOperandMem(m.newAmodeImmReg(uint32(arg.Offset), rspVReg)) - switch arg.Type { - case ssa.TypeI32: - store.asMovRM(dst, a, 4) - case ssa.TypeI64: - store.asMovRM(dst, a, 8) - case ssa.TypeF32: - store.asXmmMovRM(sseOpcodeMovss, dst, a) - case ssa.TypeF64: - store.asXmmMovRM(sseOpcodeMovsd, dst, a) - case ssa.TypeV128: - store.asXmmMovRM(sseOpcodeMovdqu, dst, a) - } - cur = linkInstr(cur, store) - } - return cur -} - -func (m *machine) goEntryPreamblePassResult(cur *instruction, resultSlicePtr regalloc.VReg, offsetInResultSlice uint32, result *backend.ABIArg, resultStackSlotBeginOffset uint32) *instruction { - var r regalloc.VReg - if result.Kind == backend.ABIArgKindStack { - // Load the value to the temporary. 
- load := m.allocateInstr() - offset := resultStackSlotBeginOffset + uint32(result.Offset) - a := newOperandMem(m.newAmodeImmReg(offset, rspVReg)) - switch result.Type { - case ssa.TypeI32: - r = tmpIntReg - load.asMovzxRmR(extModeLQ, a, r) - case ssa.TypeI64: - r = tmpIntReg - load.asMov64MR(a, r) - case ssa.TypeF32: - r = tmpXmmReg - load.asXmmUnaryRmR(sseOpcodeMovss, a, r) - case ssa.TypeF64: - r = tmpXmmReg - load.asXmmUnaryRmR(sseOpcodeMovsd, a, r) - case ssa.TypeV128: - r = tmpXmmReg - load.asXmmUnaryRmR(sseOpcodeMovdqu, a, r) - default: - panic("BUG") - } - cur = linkInstr(cur, load) - } else { - r = result.Reg - } - - store := m.allocateInstr() - a := newOperandMem(m.newAmodeImmReg(offsetInResultSlice, resultSlicePtr)) - switch result.Type { - case ssa.TypeI32: - store.asMovRM(r, a, 4) - case ssa.TypeI64: - store.asMovRM(r, a, 8) - case ssa.TypeF32: - store.asXmmMovRM(sseOpcodeMovss, r, a) - case ssa.TypeF64: - store.asXmmMovRM(sseOpcodeMovsd, r, a) - case ssa.TypeV128: - store.asXmmMovRM(sseOpcodeMovdqu, r, a) - } - - return linkInstr(cur, store) -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_go_call.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_go_call.go deleted file mode 100644 index 96f035e58..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_go_call.go +++ /dev/null @@ -1,440 +0,0 @@ -package amd64 - -import ( - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" - "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" -) - -var calleeSavedVRegs = []regalloc.VReg{ - rdxVReg, r12VReg, r13VReg, r14VReg, r15VReg, - xmm8VReg, xmm9VReg, xmm10VReg, xmm11VReg, xmm12VReg, xmm13VReg, xmm14VReg, xmm15VReg, -} - -// CompileGoFunctionTrampoline implements backend.Machine. -func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte { - argBegin := 1 // Skips exec context by default. - if needModuleContextPtr { - argBegin++ - } - - abi := &backend.FunctionABI{} - abi.Init(sig, intArgResultRegs, floatArgResultRegs) - m.currentABI = abi - - cur := m.allocateNop() - m.rootInstr = cur - - // Execution context is always the first argument. - execCtrPtr := raxVReg - - // First we update RBP and RSP just like the normal prologue. - // - // (high address) (high address) - // RBP ----> +-----------------+ +-----------------+ - // | ....... | | ....... | - // | ret Y | | ret Y | - // | ....... | | ....... | - // | ret 0 | | ret 0 | - // | arg X | | arg X | - // | ....... | ====> | ....... | - // | arg 1 | | arg 1 | - // | arg 0 | | arg 0 | - // | Return Addr | | Return Addr | - // RSP ----> +-----------------+ | Caller_RBP | - // (low address) +-----------------+ <----- RSP, RBP - // - cur = m.setupRBPRSP(cur) - - goSliceSizeAligned, goSliceSizeAlignedUnaligned := backend.GoFunctionCallRequiredStackSize(sig, argBegin) - cur = m.insertStackBoundsCheck(goSliceSizeAligned+8 /* size of the Go slice */, cur) - - // Save the callee saved registers. - cur = m.saveRegistersInExecutionContext(cur, execCtrPtr, calleeSavedVRegs) - - if needModuleContextPtr { - moduleCtrPtr := rbxVReg // Module context is always the second argument. 
- mem := m.newAmodeImmReg( - wazevoapi.ExecutionContextOffsetGoFunctionCallCalleeModuleContextOpaque.U32(), - execCtrPtr) - store := m.allocateInstr().asMovRM(moduleCtrPtr, newOperandMem(mem), 8) - cur = linkInstr(cur, store) - } - - // Now let's advance the RSP to the stack slot for the arguments. - // - // (high address) (high address) - // +-----------------+ +-----------------+ - // | ....... | | ....... | - // | ret Y | | ret Y | - // | ....... | | ....... | - // | ret 0 | | ret 0 | - // | arg X | | arg X | - // | ....... | =======> | ....... | - // | arg 1 | | arg 1 | - // | arg 0 | | arg 0 | - // | Return Addr | | Return Addr | - // | Caller_RBP | | Caller_RBP | - // RBP,RSP --> +-----------------+ +-----------------+ <----- RBP - // (low address) | arg[N]/ret[M] | - // | .......... | - // | arg[1]/ret[1] | - // | arg[0]/ret[0] | - // +-----------------+ <----- RSP - // (low address) - // - // where the region of "arg[0]/ret[0] ... arg[N]/ret[M]" is the stack used by the Go functions, - // therefore will be accessed as the usual []uint64. So that's where we need to pass/receive - // the arguments/return values to/from Go function. - cur = m.addRSP(-int32(goSliceSizeAligned), cur) - - // Next, we need to store all the arguments to the stack in the typical Wasm stack style. - var offsetInGoSlice int32 - for i := range abi.Args[argBegin:] { - arg := &abi.Args[argBegin+i] - var v regalloc.VReg - if arg.Kind == backend.ABIArgKindReg { - v = arg.Reg - } else { - // We have saved callee saved registers, so we can use them. - if arg.Type.IsInt() { - v = r15VReg - } else { - v = xmm15VReg - } - mem := newOperandMem(m.newAmodeImmReg(uint32(arg.Offset+16 /* to skip caller_rbp and ret_addr */), rbpVReg)) - load := m.allocateInstr() - switch arg.Type { - case ssa.TypeI32: - load.asMovzxRmR(extModeLQ, mem, v) - case ssa.TypeI64: - load.asMov64MR(mem, v) - case ssa.TypeF32: - load.asXmmUnaryRmR(sseOpcodeMovss, mem, v) - case ssa.TypeF64: - load.asXmmUnaryRmR(sseOpcodeMovsd, mem, v) - case ssa.TypeV128: - load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v) - default: - panic("BUG") - } - cur = linkInstr(cur, load) - } - - store := m.allocateInstr() - mem := newOperandMem(m.newAmodeImmReg(uint32(offsetInGoSlice), rspVReg)) - switch arg.Type { - case ssa.TypeI32: - store.asMovRM(v, mem, 4) - offsetInGoSlice += 8 // always uint64 rep. - case ssa.TypeI64: - store.asMovRM(v, mem, 8) - offsetInGoSlice += 8 - case ssa.TypeF32: - store.asXmmMovRM(sseOpcodeMovss, v, mem) - offsetInGoSlice += 8 // always uint64 rep. - case ssa.TypeF64: - store.asXmmMovRM(sseOpcodeMovsd, v, mem) - offsetInGoSlice += 8 - case ssa.TypeV128: - store.asXmmMovRM(sseOpcodeMovdqu, v, mem) - offsetInGoSlice += 16 - default: - panic("BUG") - } - cur = linkInstr(cur, store) - } - - // Finally we push the size of the slice to the stack so the stack looks like: - // - // (high address) - // +-----------------+ - // | ....... | - // | ret Y | - // | ....... | - // | ret 0 | - // | arg X | - // | ....... | - // | arg 1 | - // | arg 0 | - // | Return Addr | - // | Caller_RBP | - // +-----------------+ <----- RBP - // | arg[N]/ret[M] | - // | .......... | - // | arg[1]/ret[1] | - // | arg[0]/ret[0] | - // | slice size | - // +-----------------+ <----- RSP - // (low address) - // - // push $sliceSize - cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandImm32(uint32(goSliceSizeAlignedUnaligned)))) - - // Load the exitCode to the register. - exitCodeReg := r12VReg // Callee saved which is already saved. 
- cur = linkInstr(cur, m.allocateInstr().asImm(exitCodeReg, uint64(exitCode), false)) - - saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtrPtr, exitCodeReg) - cur = linkInstr(cur, setExitCode) - cur = linkInstr(cur, saveRsp) - cur = linkInstr(cur, saveRbp) - - // Ready to exit the execution. - cur = m.storeReturnAddressAndExit(cur, execCtrPtr) - - // We don't need the slice size anymore, so pop it. - cur = m.addRSP(8, cur) - - // Ready to set up the results. - offsetInGoSlice = 0 - // To avoid overwriting with the execution context pointer by the result, we need to track the offset, - // and defer the restoration of the result to the end of this function. - var argOverlapWithExecCtxOffset int32 = -1 - for i := range abi.Rets { - r := &abi.Rets[i] - var v regalloc.VReg - isRegResult := r.Kind == backend.ABIArgKindReg - if isRegResult { - v = r.Reg - if v.RealReg() == execCtrPtr.RealReg() { - argOverlapWithExecCtxOffset = offsetInGoSlice - offsetInGoSlice += 8 // always uint64 rep. - continue - } - } else { - if r.Type.IsInt() { - v = r15VReg - } else { - v = xmm15VReg - } - } - - load := m.allocateInstr() - mem := newOperandMem(m.newAmodeImmReg(uint32(offsetInGoSlice), rspVReg)) - switch r.Type { - case ssa.TypeI32: - load.asMovzxRmR(extModeLQ, mem, v) - offsetInGoSlice += 8 // always uint64 rep. - case ssa.TypeI64: - load.asMov64MR(mem, v) - offsetInGoSlice += 8 - case ssa.TypeF32: - load.asXmmUnaryRmR(sseOpcodeMovss, mem, v) - offsetInGoSlice += 8 // always uint64 rep. - case ssa.TypeF64: - load.asXmmUnaryRmR(sseOpcodeMovsd, mem, v) - offsetInGoSlice += 8 - case ssa.TypeV128: - load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v) - offsetInGoSlice += 16 - default: - panic("BUG") - } - cur = linkInstr(cur, load) - - if !isRegResult { - // We need to store it back to the result slot above rbp. - store := m.allocateInstr() - mem := newOperandMem(m.newAmodeImmReg(uint32(abi.ArgStackSize+r.Offset+16 /* to skip caller_rbp and ret_addr */), rbpVReg)) - switch r.Type { - case ssa.TypeI32: - store.asMovRM(v, mem, 4) - case ssa.TypeI64: - store.asMovRM(v, mem, 8) - case ssa.TypeF32: - store.asXmmMovRM(sseOpcodeMovss, v, mem) - case ssa.TypeF64: - store.asXmmMovRM(sseOpcodeMovsd, v, mem) - case ssa.TypeV128: - store.asXmmMovRM(sseOpcodeMovdqu, v, mem) - default: - panic("BUG") - } - cur = linkInstr(cur, store) - } - } - - // Before return, we need to restore the callee saved registers. - cur = m.restoreRegistersInExecutionContext(cur, execCtrPtr, calleeSavedVRegs) - - if argOverlapWithExecCtxOffset >= 0 { - // At this point execCtt is not used anymore, so we can finally store the - // result to the register which overlaps with the execution context pointer. - mem := newOperandMem(m.newAmodeImmReg(uint32(argOverlapWithExecCtxOffset), rspVReg)) - load := m.allocateInstr().asMov64MR(mem, execCtrPtr) - cur = linkInstr(cur, load) - } - - // Finally ready to return. 
- cur = m.revertRBPRSP(cur) - linkInstr(cur, m.allocateInstr().asRet()) - - m.encodeWithoutSSA(m.rootInstr) - return m.c.Buf() -} - -func (m *machine) saveRegistersInExecutionContext(cur *instruction, execCtx regalloc.VReg, regs []regalloc.VReg) *instruction { - offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64() - for _, v := range regs { - store := m.allocateInstr() - mem := newOperandMem(m.newAmodeImmReg(uint32(offset), execCtx)) - switch v.RegType() { - case regalloc.RegTypeInt: - store.asMovRM(v, mem, 8) - case regalloc.RegTypeFloat: - store.asXmmMovRM(sseOpcodeMovdqu, v, mem) - default: - panic("BUG") - } - cur = linkInstr(cur, store) - offset += 16 // See execution context struct. Each register is 16 bytes-aligned unconditionally. - } - return cur -} - -func (m *machine) restoreRegistersInExecutionContext(cur *instruction, execCtx regalloc.VReg, regs []regalloc.VReg) *instruction { - offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64() - for _, v := range regs { - load := m.allocateInstr() - mem := newOperandMem(m.newAmodeImmReg(uint32(offset), execCtx)) - switch v.RegType() { - case regalloc.RegTypeInt: - load.asMov64MR(mem, v) - case regalloc.RegTypeFloat: - load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v) - default: - panic("BUG") - } - cur = linkInstr(cur, load) - offset += 16 // See execution context struct. Each register is 16 bytes-aligned unconditionally. - } - return cur -} - -func (m *machine) storeReturnAddressAndExit(cur *instruction, execCtx regalloc.VReg) *instruction { - readRip := m.allocateInstr() - cur = linkInstr(cur, readRip) - - ripReg := r12VReg // Callee saved which is already saved. - saveRip := m.allocateInstr().asMovRM( - ripReg, - newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetGoCallReturnAddress.U32(), execCtx)), - 8, - ) - cur = linkInstr(cur, saveRip) - - exit := m.allocateExitSeq(execCtx) - cur = linkInstr(cur, exit) - - nop, l := m.allocateBrTarget() - cur = linkInstr(cur, nop) - readRip.asLEA(newOperandLabel(l), ripReg) - return cur -} - -// saveRequiredRegs is the set of registers that must be saved/restored during growing stack when there's insufficient -// stack space left. Basically this is the all allocatable registers except for RSP and RBP, and RAX which contains the -// execution context pointer. ExecCtx pointer is always the first argument so we don't need to save it. -var stackGrowSaveVRegs = []regalloc.VReg{ - rdxVReg, r12VReg, r13VReg, r14VReg, r15VReg, - rcxVReg, rbxVReg, rsiVReg, rdiVReg, r8VReg, r9VReg, r10VReg, r11VReg, - xmm8VReg, xmm9VReg, xmm10VReg, xmm11VReg, xmm12VReg, xmm13VReg, xmm14VReg, xmm15VReg, - xmm0VReg, xmm1VReg, xmm2VReg, xmm3VReg, xmm4VReg, xmm5VReg, xmm6VReg, xmm7VReg, -} - -// CompileStackGrowCallSequence implements backend.Machine. -func (m *machine) CompileStackGrowCallSequence() []byte { - cur := m.allocateNop() - m.rootInstr = cur - - cur = m.setupRBPRSP(cur) - - // Execution context is always the first argument. - execCtrPtr := raxVReg - - // Save the callee saved and argument registers. - cur = m.saveRegistersInExecutionContext(cur, execCtrPtr, stackGrowSaveVRegs) - - // Load the exitCode to the register. - exitCodeReg := r12VReg // Already saved. 
- cur = linkInstr(cur, m.allocateInstr().asImm(exitCodeReg, uint64(wazevoapi.ExitCodeGrowStack), false)) - - saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtrPtr, exitCodeReg) - cur = linkInstr(cur, setExitCode) - cur = linkInstr(cur, saveRsp) - cur = linkInstr(cur, saveRbp) - - // Ready to exit the execution. - cur = m.storeReturnAddressAndExit(cur, execCtrPtr) - - // After the exit, restore the saved registers. - cur = m.restoreRegistersInExecutionContext(cur, execCtrPtr, stackGrowSaveVRegs) - - // Finally ready to return. - cur = m.revertRBPRSP(cur) - linkInstr(cur, m.allocateInstr().asRet()) - - m.encodeWithoutSSA(m.rootInstr) - return m.c.Buf() -} - -// insertStackBoundsCheck will insert the instructions after `cur` to check the -// stack bounds, and if there's no sufficient spaces required for the function, -// exit the execution and try growing it in Go world. -func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instruction) *instruction { - // add $requiredStackSize, %rsp ;; Temporarily update the sp. - // cmp ExecutionContextOffsetStackBottomPtr(%rax), %rsp ;; Compare the stack bottom and the sp. - // ja .ok - // sub $requiredStackSize, %rsp ;; Reverse the temporary update. - // pushq r15 ;; save the temporary. - // mov $requiredStackSize, %r15 - // mov %15, ExecutionContextOffsetStackGrowRequiredSize(%rax) ;; Set the required size in the execution context. - // popq r15 ;; restore the temporary. - // callq *ExecutionContextOffsetStackGrowCallTrampolineAddress(%rax) ;; Call the Go function to grow the stack. - // jmp .cont - // .ok: - // sub $requiredStackSize, %rsp ;; Reverse the temporary update. - // .cont: - cur = m.addRSP(-int32(requiredStackSize), cur) - cur = linkInstr(cur, m.allocateInstr().asCmpRmiR(true, - newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackBottomPtr.U32(), raxVReg)), - rspVReg, true)) - - ja := m.allocateInstr() - cur = linkInstr(cur, ja) - - cur = m.addRSP(int32(requiredStackSize), cur) - - // Save the temporary. - - cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(r15VReg))) - // Load the required size to the temporary. - cur = linkInstr(cur, m.allocateInstr().asImm(r15VReg, uint64(requiredStackSize), true)) - // Set the required size in the execution context. - cur = linkInstr(cur, m.allocateInstr().asMovRM(r15VReg, - newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.U32(), raxVReg)), 8)) - // Restore the temporary. - cur = linkInstr(cur, m.allocateInstr().asPop64(r15VReg)) - // Call the Go function to grow the stack. - cur = linkInstr(cur, m.allocateInstr().asCallIndirect(newOperandMem(m.newAmodeImmReg( - wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.U32(), raxVReg)), nil)) - // Jump to the continuation. - jmpToCont := m.allocateInstr() - cur = linkInstr(cur, jmpToCont) - - // .ok: - okInstr, ok := m.allocateBrTarget() - cur = linkInstr(cur, okInstr) - ja.asJmpIf(condNBE, newOperandLabel(ok)) - // On the ok path, we only need to reverse the temporary update. 
- cur = m.addRSP(int32(requiredStackSize), cur) - - // .cont: - contInstr, cont := m.allocateBrTarget() - cur = linkInstr(cur, contInstr) - jmpToCont.asJmp(newOperandLabel(cont)) - - return cur -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/cond.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/cond.go deleted file mode 100644 index 75cbeab75..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/cond.go +++ /dev/null @@ -1,168 +0,0 @@ -package amd64 - -import ( - "fmt" - - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" -) - -type cond byte - -const ( - // condO represents (overflow) condition. - condO cond = iota - // condNO represents (no overflow) condition. - condNO - // condB represents (< unsigned) condition. - condB - // condNB represents (>= unsigned) condition. - condNB - // condZ represents (zero) condition. - condZ - // condNZ represents (not-zero) condition. - condNZ - // condBE represents (<= unsigned) condition. - condBE - // condNBE represents (> unsigned) condition. - condNBE - // condS represents (negative) condition. - condS - // condNS represents (not-negative) condition. - condNS - // condP represents (parity) condition. - condP - // condNP represents (not parity) condition. - condNP - // condL represents (< signed) condition. - condL - // condNL represents (>= signed) condition. - condNL - // condLE represents (<= signed) condition. - condLE - // condNLE represents (> signed) condition. - condNLE - - condInvalid -) - -func (c cond) String() string { - switch c { - case condO: - return "o" - case condNO: - return "no" - case condB: - return "b" - case condNB: - return "nb" - case condZ: - return "z" - case condNZ: - return "nz" - case condBE: - return "be" - case condNBE: - return "nbe" - case condS: - return "s" - case condNS: - return "ns" - case condL: - return "l" - case condNL: - return "nl" - case condLE: - return "le" - case condNLE: - return "nle" - case condP: - return "p" - case condNP: - return "np" - default: - panic("unreachable") - } -} - -func condFromSSAIntCmpCond(origin ssa.IntegerCmpCond) cond { - switch origin { - case ssa.IntegerCmpCondEqual: - return condZ - case ssa.IntegerCmpCondNotEqual: - return condNZ - case ssa.IntegerCmpCondSignedLessThan: - return condL - case ssa.IntegerCmpCondSignedGreaterThanOrEqual: - return condNL - case ssa.IntegerCmpCondSignedGreaterThan: - return condNLE - case ssa.IntegerCmpCondSignedLessThanOrEqual: - return condLE - case ssa.IntegerCmpCondUnsignedLessThan: - return condB - case ssa.IntegerCmpCondUnsignedGreaterThanOrEqual: - return condNB - case ssa.IntegerCmpCondUnsignedGreaterThan: - return condNBE - case ssa.IntegerCmpCondUnsignedLessThanOrEqual: - return condBE - default: - panic("unreachable") - } -} - -func condFromSSAFloatCmpCond(origin ssa.FloatCmpCond) cond { - switch origin { - case ssa.FloatCmpCondGreaterThanOrEqual: - return condNB - case ssa.FloatCmpCondGreaterThan: - return condNBE - case ssa.FloatCmpCondEqual, ssa.FloatCmpCondNotEqual, ssa.FloatCmpCondLessThan, ssa.FloatCmpCondLessThanOrEqual: - panic(fmt.Sprintf("cond %s must be treated as a special case", origin)) - default: - panic("unreachable") - } -} - -func (c cond) encoding() byte { - return byte(c) -} - -func (c cond) invert() cond { - switch c { - case condO: - return condNO - case condNO: - return condO - case condB: - return condNB - case condNB: - return condB - case condZ: - return condNZ - case condNZ: - 
return condZ - case condBE: - return condNBE - case condNBE: - return condBE - case condS: - return condNS - case condNS: - return condS - case condP: - return condNP - case condNP: - return condP - case condL: - return condNL - case condNL: - return condL - case condLE: - return condNLE - case condNLE: - return condLE - default: - panic("unreachable") - } -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/ext.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/ext.go deleted file mode 100644 index 5e731e822..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/ext.go +++ /dev/null @@ -1,35 +0,0 @@ -package amd64 - -// extMode represents the mode of extension in movzx/movsx. -type extMode byte - -const ( - // extModeBL represents Byte -> Longword. - extModeBL extMode = iota - // extModeBQ represents Byte -> Quadword. - extModeBQ - // extModeWL represents Word -> Longword. - extModeWL - // extModeWQ represents Word -> Quadword. - extModeWQ - // extModeLQ represents Longword -> Quadword. - extModeLQ -) - -// String implements fmt.Stringer. -func (e extMode) String() string { - switch e { - case extModeBL: - return "bl" - case extModeBQ: - return "bq" - case extModeWL: - return "wl" - case extModeWQ: - return "wq" - case extModeLQ: - return "lq" - default: - panic("BUG: invalid ext mode") - } -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr.go deleted file mode 100644 index 6a3e58f51..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr.go +++ /dev/null @@ -1,2447 +0,0 @@ -package amd64 - -import ( - "fmt" - - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" -) - -type instruction struct { - prev, next *instruction - op1, op2 operand - u1, u2 uint64 - b1 bool - addedBeforeRegAlloc bool - kind instructionKind -} - -// IsCall implements regalloc.Instr. -func (i *instruction) IsCall() bool { return i.kind == call } - -// IsIndirectCall implements regalloc.Instr. -func (i *instruction) IsIndirectCall() bool { return i.kind == callIndirect } - -// IsReturn implements regalloc.Instr. -func (i *instruction) IsReturn() bool { return i.kind == ret } - -// String implements regalloc.Instr. 
-func (i *instruction) String() string { - switch i.kind { - case nop0: - return "nop" - case sourceOffsetInfo: - return fmt.Sprintf("source_offset_info %d", i.u1) - case ret: - return "ret" - case imm: - if i.b1 { - return fmt.Sprintf("movabsq $%d, %s", int64(i.u1), i.op2.format(true)) - } else { - return fmt.Sprintf("movl $%d, %s", int32(i.u1), i.op2.format(false)) - } - case aluRmiR: - return fmt.Sprintf("%s %s, %s", aluRmiROpcode(i.u1), i.op1.format(i.b1), i.op2.format(i.b1)) - case movRR: - if i.b1 { - return fmt.Sprintf("movq %s, %s", i.op1.format(true), i.op2.format(true)) - } else { - return fmt.Sprintf("movl %s, %s", i.op1.format(false), i.op2.format(false)) - } - case xmmRmR: - return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(false), i.op2.format(false)) - case gprToXmm: - return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(i.b1), i.op2.format(i.b1)) - case xmmUnaryRmR: - return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(false), i.op2.format(false)) - case xmmUnaryRmRImm: - return fmt.Sprintf("%s $%d, %s, %s", sseOpcode(i.u1), roundingMode(i.u2), i.op1.format(false), i.op2.format(false)) - case unaryRmR: - var suffix string - if i.b1 { - suffix = "q" - } else { - suffix = "l" - } - return fmt.Sprintf("%s%s %s, %s", unaryRmROpcode(i.u1), suffix, i.op1.format(i.b1), i.op2.format(i.b1)) - case not: - var op string - if i.b1 { - op = "notq" - } else { - op = "notl" - } - return fmt.Sprintf("%s %s", op, i.op1.format(i.b1)) - case neg: - var op string - if i.b1 { - op = "negq" - } else { - op = "negl" - } - return fmt.Sprintf("%s %s", op, i.op1.format(i.b1)) - case div: - var prefix string - var op string - if i.b1 { - op = "divq" - } else { - op = "divl" - } - if i.u1 != 0 { - prefix = "i" - } - return fmt.Sprintf("%s%s %s", prefix, op, i.op1.format(i.b1)) - case mulHi: - signed, _64 := i.u1 != 0, i.b1 - var op string - switch { - case signed && _64: - op = "imulq" - case !signed && _64: - op = "mulq" - case signed && !_64: - op = "imull" - case !signed && !_64: - op = "mull" - } - return fmt.Sprintf("%s %s", op, i.op1.format(i.b1)) - case signExtendData: - var op string - if i.b1 { - op = "cqo" - } else { - op = "cdq" - } - return op - case movzxRmR: - return fmt.Sprintf("movzx.%s %s, %s", extMode(i.u1), i.op1.format(true), i.op2.format(true)) - case mov64MR: - return fmt.Sprintf("movq %s, %s", i.op1.format(true), i.op2.format(true)) - case lea: - return fmt.Sprintf("lea %s, %s", i.op1.format(true), i.op2.format(true)) - case movsxRmR: - return fmt.Sprintf("movsx.%s %s, %s", extMode(i.u1), i.op1.format(true), i.op2.format(true)) - case movRM: - var suffix string - switch i.u1 { - case 1: - suffix = "b" - case 2: - suffix = "w" - case 4: - suffix = "l" - case 8: - suffix = "q" - } - return fmt.Sprintf("mov.%s %s, %s", suffix, i.op1.format(true), i.op2.format(true)) - case shiftR: - var suffix string - if i.b1 { - suffix = "q" - } else { - suffix = "l" - } - return fmt.Sprintf("%s%s %s, %s", shiftROp(i.u1), suffix, i.op1.format(false), i.op2.format(i.b1)) - case xmmRmiReg: - return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(true), i.op2.format(true)) - case cmpRmiR: - var op, suffix string - if i.u1 != 0 { - op = "cmp" - } else { - op = "test" - } - if i.b1 { - suffix = "q" - } else { - suffix = "l" - } - if op == "test" && i.op1.kind == operandKindMem { - // Print consistently with AT&T syntax. 
- return fmt.Sprintf("%s%s %s, %s", op, suffix, i.op2.format(i.b1), i.op1.format(i.b1)) - } - return fmt.Sprintf("%s%s %s, %s", op, suffix, i.op1.format(i.b1), i.op2.format(i.b1)) - case setcc: - return fmt.Sprintf("set%s %s", cond(i.u1), i.op2.format(true)) - case cmove: - var suffix string - if i.b1 { - suffix = "q" - } else { - suffix = "l" - } - return fmt.Sprintf("cmov%s%s %s, %s", cond(i.u1), suffix, i.op1.format(i.b1), i.op2.format(i.b1)) - case push64: - return fmt.Sprintf("pushq %s", i.op1.format(true)) - case pop64: - return fmt.Sprintf("popq %s", i.op1.format(true)) - case xmmMovRM: - return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(true), i.op2.format(true)) - case xmmLoadConst: - panic("TODO") - case xmmToGpr: - return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(i.b1), i.op2.format(i.b1)) - case cvtUint64ToFloatSeq: - panic("TODO") - case cvtFloatToSintSeq: - panic("TODO") - case cvtFloatToUintSeq: - panic("TODO") - case xmmMinMaxSeq: - panic("TODO") - case xmmCmpRmR: - return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(false), i.op2.format(false)) - case xmmRmRImm: - op := sseOpcode(i.u1) - r1, r2 := i.op1.format(op == sseOpcodePextrq || op == sseOpcodePinsrq), - i.op2.format(op == sseOpcodePextrq || op == sseOpcodePinsrq) - return fmt.Sprintf("%s $%d, %s, %s", op, i.u2, r1, r2) - case jmp: - return fmt.Sprintf("jmp %s", i.op1.format(true)) - case jmpIf: - return fmt.Sprintf("j%s %s", cond(i.u1), i.op1.format(true)) - case jmpTableIsland: - return fmt.Sprintf("jump_table_island: jmp_table_index=%d", i.u1) - case exitSequence: - return fmt.Sprintf("exit_sequence %s", i.op1.format(true)) - case ud2: - return "ud2" - case call: - return fmt.Sprintf("call %s", ssa.FuncRef(i.u1)) - case callIndirect: - return fmt.Sprintf("callq *%s", i.op1.format(true)) - case xchg: - var suffix string - switch i.u1 { - case 1: - suffix = "b" - case 2: - suffix = "w" - case 4: - suffix = "l" - case 8: - suffix = "q" - } - return fmt.Sprintf("xchg.%s %s, %s", suffix, i.op1.format(true), i.op2.format(true)) - case zeros: - return fmt.Sprintf("xor %s, %s", i.op2.format(true), i.op2.format(true)) - case fcvtToSintSequence: - execCtx, src, tmpGp, tmpGp2, tmpXmm, src64, dst64, sat := i.fcvtToSintSequenceData() - return fmt.Sprintf( - "fcvtToSintSequence execCtx=%s, src=%s, tmpGp=%s, tmpGp2=%s, tmpXmm=%s, src64=%v, dst64=%v, sat=%v", - formatVRegSized(execCtx, true), - formatVRegSized(src, true), - formatVRegSized(tmpGp, true), - formatVRegSized(tmpGp2, true), - formatVRegSized(tmpXmm, true), src64, dst64, sat) - case fcvtToUintSequence: - execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2, src64, dst64, sat := i.fcvtToUintSequenceData() - return fmt.Sprintf( - "fcvtToUintSequence execCtx=%s, src=%s, tmpGp=%s, tmpGp2=%s, tmpXmm=%s, tmpXmm2=%s, src64=%v, dst64=%v, sat=%v", - formatVRegSized(execCtx, true), - formatVRegSized(src, true), - formatVRegSized(tmpGp, true), - formatVRegSized(tmpGp2, true), - formatVRegSized(tmpXmm, true), - formatVRegSized(tmpXmm2, true), src64, dst64, sat) - case idivRemSequence: - execCtx, divisor, tmpGp, isDiv, signed, _64 := i.idivRemSequenceData() - return fmt.Sprintf("idivRemSequence execCtx=%s, divisor=%s, tmpGp=%s, isDiv=%v, signed=%v, _64=%v", - formatVRegSized(execCtx, true), formatVRegSized(divisor, _64), formatVRegSized(tmpGp, _64), isDiv, signed, _64) - case defineUninitializedReg: - return fmt.Sprintf("defineUninitializedReg %s", i.op2.format(true)) - case xmmCMov: - return fmt.Sprintf("xmmcmov%s %s, %s", cond(i.u1), i.op1.format(true), 
i.op2.format(true)) - case blendvpd: - return fmt.Sprintf("blendvpd %s, %s, %%xmm0", i.op1.format(false), i.op2.format(false)) - case mfence: - return "mfence" - case lockcmpxchg: - var suffix string - switch i.u1 { - case 1: - suffix = "b" - case 2: - suffix = "w" - case 4: - suffix = "l" - case 8: - suffix = "q" - } - return fmt.Sprintf("lock cmpxchg.%s %s, %s", suffix, i.op1.format(true), i.op2.format(true)) - case lockxadd: - var suffix string - switch i.u1 { - case 1: - suffix = "b" - case 2: - suffix = "w" - case 4: - suffix = "l" - case 8: - suffix = "q" - } - return fmt.Sprintf("lock xadd.%s %s, %s", suffix, i.op1.format(true), i.op2.format(true)) - - case nopUseReg: - return fmt.Sprintf("nop_use_reg %s", i.op1.format(true)) - - default: - panic(fmt.Sprintf("BUG: %d", int(i.kind))) - } -} - -// Defs implements regalloc.Instr. -func (i *instruction) Defs(regs *[]regalloc.VReg) []regalloc.VReg { - *regs = (*regs)[:0] - switch dk := defKinds[i.kind]; dk { - case defKindNone: - case defKindOp2: - *regs = append(*regs, i.op2.reg()) - case defKindCall: - _, _, retIntRealRegs, retFloatRealRegs, _ := backend.ABIInfoFromUint64(i.u2) - for i := byte(0); i < retIntRealRegs; i++ { - *regs = append(*regs, regInfo.RealRegToVReg[intArgResultRegs[i]]) - } - for i := byte(0); i < retFloatRealRegs; i++ { - *regs = append(*regs, regInfo.RealRegToVReg[floatArgResultRegs[i]]) - } - case defKindDivRem: - _, _, _, isDiv, _, _ := i.idivRemSequenceData() - if isDiv { - *regs = append(*regs, raxVReg) - } else { - *regs = append(*regs, rdxVReg) - } - default: - panic(fmt.Sprintf("BUG: invalid defKind \"%s\" for %s", dk, i)) - } - return *regs -} - -// Uses implements regalloc.Instr. -func (i *instruction) Uses(regs *[]regalloc.VReg) []regalloc.VReg { - *regs = (*regs)[:0] - switch uk := useKinds[i.kind]; uk { - case useKindNone: - case useKindOp1Op2Reg, useKindOp1RegOp2: - opAny, opReg := &i.op1, &i.op2 - if uk == useKindOp1RegOp2 { - opAny, opReg = opReg, opAny - } - // The destination operand (op2) can be only reg, - // the source operand (op1) can be imm32, reg or mem. 
- switch opAny.kind { - case operandKindReg: - *regs = append(*regs, opAny.reg()) - case operandKindMem: - opAny.addressMode().uses(regs) - case operandKindImm32: - default: - panic(fmt.Sprintf("BUG: invalid operand: %s", i)) - } - if opReg.kind != operandKindReg { - panic(fmt.Sprintf("BUG: invalid operand: %s", i)) - } - *regs = append(*regs, opReg.reg()) - case useKindOp1: - op := i.op1 - switch op.kind { - case operandKindReg: - *regs = append(*regs, op.reg()) - case operandKindMem: - op.addressMode().uses(regs) - case operandKindImm32, operandKindLabel: - default: - panic(fmt.Sprintf("BUG: invalid operand: %s", i)) - } - case useKindCallInd: - op := i.op1 - switch op.kind { - case operandKindReg: - *regs = append(*regs, op.reg()) - case operandKindMem: - op.addressMode().uses(regs) - default: - panic(fmt.Sprintf("BUG: invalid operand: %s", i)) - } - fallthrough - case useKindCall: - argIntRealRegs, argFloatRealRegs, _, _, _ := backend.ABIInfoFromUint64(i.u2) - for i := byte(0); i < argIntRealRegs; i++ { - *regs = append(*regs, regInfo.RealRegToVReg[intArgResultRegs[i]]) - } - for i := byte(0); i < argFloatRealRegs; i++ { - *regs = append(*regs, regInfo.RealRegToVReg[floatArgResultRegs[i]]) - } - case useKindFcvtToSintSequence: - execCtx, src, tmpGp, tmpGp2, tmpXmm, _, _, _ := i.fcvtToSintSequenceData() - *regs = append(*regs, execCtx, src, tmpGp, tmpGp2, tmpXmm) - case useKindFcvtToUintSequence: - execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2, _, _, _ := i.fcvtToUintSequenceData() - *regs = append(*regs, execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2) - case useKindDivRem: - execCtx, divisor, tmpGp, _, _, _ := i.idivRemSequenceData() - // idiv uses rax and rdx as implicit operands. - *regs = append(*regs, raxVReg, rdxVReg, execCtx, divisor, tmpGp) - case useKindBlendvpd: - *regs = append(*regs, xmm0VReg) - - opAny, opReg := &i.op1, &i.op2 - switch opAny.kind { - case operandKindReg: - *regs = append(*regs, opAny.reg()) - case operandKindMem: - opAny.addressMode().uses(regs) - default: - panic(fmt.Sprintf("BUG: invalid operand: %s", i)) - } - if opReg.kind != operandKindReg { - panic(fmt.Sprintf("BUG: invalid operand: %s", i)) - } - *regs = append(*regs, opReg.reg()) - - case useKindRaxOp1RegOp2: - opReg, opAny := &i.op1, &i.op2 - *regs = append(*regs, raxVReg, opReg.reg()) - switch opAny.kind { - case operandKindReg: - *regs = append(*regs, opAny.reg()) - case operandKindMem: - opAny.addressMode().uses(regs) - default: - panic(fmt.Sprintf("BUG: invalid operand: %s", i)) - } - if opReg.kind != operandKindReg { - panic(fmt.Sprintf("BUG: invalid operand: %s", i)) - } - - default: - panic(fmt.Sprintf("BUG: invalid useKind %s for %s", uk, i)) - } - return *regs -} - -// AssignUse implements regalloc.Instr. 
-func (i *instruction) AssignUse(index int, v regalloc.VReg) { - switch uk := useKinds[i.kind]; uk { - case useKindNone: - case useKindCallInd: - if index != 0 { - panic("BUG") - } - op := &i.op1 - switch op.kind { - case operandKindReg: - op.setReg(v) - case operandKindMem: - op.addressMode().assignUses(index, v) - default: - panic("BUG") - } - case useKindOp1Op2Reg, useKindOp1RegOp2: - op, opMustBeReg := &i.op1, &i.op2 - if uk == useKindOp1RegOp2 { - op, opMustBeReg = opMustBeReg, op - } - switch op.kind { - case operandKindReg: - if index == 0 { - op.setReg(v) - } else if index == 1 { - opMustBeReg.setReg(v) - } else { - panic("BUG") - } - case operandKindMem: - nregs := op.addressMode().nregs() - if index < nregs { - op.addressMode().assignUses(index, v) - } else if index == nregs { - opMustBeReg.setReg(v) - } else { - panic("BUG") - } - case operandKindImm32: - if index == 0 { - opMustBeReg.setReg(v) - } else { - panic("BUG") - } - default: - panic(fmt.Sprintf("BUG: invalid operand pair: %s", i)) - } - case useKindOp1: - op := &i.op1 - switch op.kind { - case operandKindReg: - if index != 0 { - panic("BUG") - } - op.setReg(v) - case operandKindMem: - op.addressMode().assignUses(index, v) - default: - panic(fmt.Sprintf("BUG: invalid operand: %s", i)) - } - case useKindFcvtToSintSequence: - switch index { - case 0: - i.op1.addressMode().base = v - case 1: - i.op1.addressMode().index = v - case 2: - i.op2.addressMode().base = v - case 3: - i.op2.addressMode().index = v - case 4: - i.u1 = uint64(v) - default: - panic("BUG") - } - case useKindFcvtToUintSequence: - switch index { - case 0: - i.op1.addressMode().base = v - case 1: - i.op1.addressMode().index = v - case 2: - i.op2.addressMode().base = v - case 3: - i.op2.addressMode().index = v - case 4: - i.u1 = uint64(v) - case 5: - i.u2 = uint64(v) - default: - panic("BUG") - } - case useKindDivRem: - switch index { - case 0: - if v != raxVReg { - panic("BUG") - } - case 1: - if v != rdxVReg { - panic("BUG") - } - case 2: - i.op1.setReg(v) - case 3: - i.op2.setReg(v) - case 4: - i.u1 = uint64(v) - default: - panic("BUG") - } - case useKindBlendvpd: - op, opMustBeReg := &i.op1, &i.op2 - if index == 0 { - if v.RealReg() != xmm0 { - panic("BUG") - } - } else { - switch op.kind { - case operandKindReg: - switch index { - case 1: - op.setReg(v) - case 2: - opMustBeReg.setReg(v) - default: - panic("BUG") - } - case operandKindMem: - nregs := op.addressMode().nregs() - index-- - if index < nregs { - op.addressMode().assignUses(index, v) - } else if index == nregs { - opMustBeReg.setReg(v) - } else { - panic("BUG") - } - default: - panic(fmt.Sprintf("BUG: invalid operand pair: %s", i)) - } - } - - case useKindRaxOp1RegOp2: - switch index { - case 0: - if v.RealReg() != rax { - panic("BUG") - } - case 1: - i.op1.setReg(v) - default: - op := &i.op2 - switch op.kind { - case operandKindReg: - switch index { - case 1: - op.setReg(v) - case 2: - op.setReg(v) - default: - panic("BUG") - } - case operandKindMem: - nregs := op.addressMode().nregs() - index -= 2 - if index < nregs { - op.addressMode().assignUses(index, v) - } else if index == nregs { - op.setReg(v) - } else { - panic("BUG") - } - default: - panic(fmt.Sprintf("BUG: invalid operand pair: %s", i)) - } - } - default: - panic(fmt.Sprintf("BUG: invalid useKind %s for %s", uk, i)) - } -} - -// AssignDef implements regalloc.Instr. 
-func (i *instruction) AssignDef(reg regalloc.VReg) { - switch dk := defKinds[i.kind]; dk { - case defKindNone: - case defKindOp2: - i.op2.setReg(reg) - default: - panic(fmt.Sprintf("BUG: invalid defKind \"%s\" for %s", dk, i)) - } -} - -// IsCopy implements regalloc.Instr. -func (i *instruction) IsCopy() bool { - k := i.kind - if k == movRR { - return true - } - if k == xmmUnaryRmR { - if i.op1.kind == operandKindReg { - sse := sseOpcode(i.u1) - return sse == sseOpcodeMovss || sse == sseOpcodeMovsd || sse == sseOpcodeMovdqu - } - } - return false -} - -func resetInstruction(i *instruction) { - *i = instruction{} -} - -func (i *instruction) asNop0WithLabel(label label) *instruction { //nolint - i.kind = nop0 - i.u1 = uint64(label) - return i -} - -func (i *instruction) nop0Label() label { - return label(i.u1) -} - -type instructionKind byte - -const ( - nop0 instructionKind = iota + 1 - - // Integer arithmetic/bit-twiddling: (add sub and or xor mul, etc.) (32 64) (reg addr imm) reg - aluRmiR - - // Instructions on GPR that only read src and defines dst (dst is not modified): bsr, etc. - unaryRmR - - // Bitwise not - not - - // Integer negation - neg - - // Integer quotient and remainder: (div idiv) $rax $rdx (reg addr) - div - - // The high bits (RDX) of a (un)signed multiply: RDX:RAX := RAX * rhs. - mulHi - - // Do a sign-extend based on the sign of the value in rax into rdx: (cwd cdq cqo) - // or al into ah: (cbw) - signExtendData - - // Constant materialization: (imm32 imm64) reg. - // Either: movl $imm32, %reg32 or movabsq $imm64, %reg64. - imm - - // GPR to GPR move: mov (64 32) reg reg. - movRR - - // movzxRmR is zero-extended loads or move (R to R), except for 64 bits: movz (bl bq wl wq lq) addr reg. - // Note that the lq variant doesn't really exist since the default zero-extend rule makes it - // unnecessary. For that case we emit the equivalent "movl AM, reg32". - movzxRmR - - // mov64MR is a plain 64-bit integer load, since movzxRmR can't represent that. - mov64MR - - // Loads the memory address of addr into dst. - lea - - // Sign-extended loads and moves: movs (bl bq wl wq lq) addr reg. - movsxRmR - - // Integer stores: mov (b w l q) reg addr. - movRM - - // Arithmetic shifts: (shl shr sar) (b w l q) imm reg. - shiftR - - // Arithmetic SIMD shifts. - xmmRmiReg - - // Integer comparisons/tests: cmp or test (b w l q) (reg addr imm) reg. - cmpRmiR - - // Materializes the requested condition code in the destination reg. - setcc - - // Integer conditional move. - // Overwrites the destination register. - cmove - - // pushq (reg addr imm) - push64 - - // popq reg - pop64 - - // XMM (scalar or vector) binary op: (add sub and or xor mul adc? sbb?) (32 64) (reg addr) reg - xmmRmR - - // XMM (scalar or vector) unary op: mov between XMM registers (32 64) (reg addr) reg. - // - // This differs from xmmRmR in that the dst register of xmmUnaryRmR is not used in the - // computation of the instruction dst value and so does not have to be a previously valid - // value. This is characteristic of mov instructions. - xmmUnaryRmR - - // XMM (scalar or vector) unary op with immediate: roundss, roundsd, etc. - // - // This differs from XMM_RM_R_IMM in that the dst register of - // XmmUnaryRmRImm is not used in the computation of the instruction dst - // value and so does not have to be a previously valid value. 
- xmmUnaryRmRImm - - // XMM (scalar or vector) unary op (from xmm to mem): stores, movd, movq - xmmMovRM - - // XMM (vector) unary op (to move a constant value into an xmm register): movups - xmmLoadConst - - // XMM (scalar) unary op (from xmm to integer reg): movd, movq, cvtts{s,d}2si - xmmToGpr - - // XMM (scalar) unary op (from integer to float reg): movd, movq, cvtsi2s{s,d} - gprToXmm - - // Converts an unsigned int64 to a float32/float64. - cvtUint64ToFloatSeq - - // Converts a scalar xmm to a signed int32/int64. - cvtFloatToSintSeq - - // Converts a scalar xmm to an unsigned int32/int64. - cvtFloatToUintSeq - - // A sequence to compute min/max with the proper NaN semantics for xmm registers. - xmmMinMaxSeq - - // Float comparisons/tests: cmp (b w l q) (reg addr imm) reg. - xmmCmpRmR - - // A binary XMM instruction with an 8-bit immediate: e.g. cmp (ps pd) imm (reg addr) reg - xmmRmRImm - - // Direct call: call simm32. - // Note that the offset is the relative to the *current RIP*, which points to the first byte of the next instruction. - call - - // Indirect call: callq (reg mem). - callIndirect - - // Return. - ret - - // Jump: jmp (reg, mem, imm32 or label) - jmp - - // Jump conditionally: jcond cond label. - jmpIf - - // jmpTableIsland is to emit the jump table. - jmpTableIsland - - // exitSequence exits the execution and go back to the Go world. - exitSequence - - // An instruction that will always trigger the illegal instruction exception. - ud2 - - // xchg is described in https://www.felixcloutier.com/x86/xchg. - // This instruction uses two operands, where one of them can be a memory address, and swaps their values. - // If the dst is a memory address, the execution is atomic. - xchg - - // lockcmpxchg is the cmpxchg instruction https://www.felixcloutier.com/x86/cmpxchg with a lock prefix. - lockcmpxchg - - // zeros puts zeros into the destination register. This is implemented as xor reg, reg for - // either integer or XMM registers. The reason why we have this instruction instead of using aluRmiR - // is that it requires the already-defined registers. From reg alloc's perspective, this defines - // the destination register and takes no inputs. - zeros - - // sourceOffsetInfo is a dummy instruction to emit source offset info. - // The existence of this instruction does not affect the execution. - sourceOffsetInfo - - // defineUninitializedReg is a no-op instruction that defines a register without a defining instruction. - defineUninitializedReg - - // fcvtToSintSequence is a sequence of instructions to convert a float to a signed integer. - fcvtToSintSequence - - // fcvtToUintSequence is a sequence of instructions to convert a float to an unsigned integer. - fcvtToUintSequence - - // xmmCMov is a conditional move instruction for XMM registers. Lowered after register allocation. - xmmCMov - - // idivRemSequence is a sequence of instructions to compute both the quotient and remainder of a division. - idivRemSequence - - // blendvpd is https://www.felixcloutier.com/x86/blendvpd. - blendvpd - - // mfence is https://www.felixcloutier.com/x86/mfence - mfence - - // lockxadd is xadd https://www.felixcloutier.com/x86/xadd with a lock prefix. - lockxadd - - // nopUseReg is a meta instruction that uses one register and does nothing. 
- nopUseReg - - instrMax -) - -func (i *instruction) asMFence() *instruction { - i.kind = mfence - return i -} - -func (i *instruction) asNopUseReg(r regalloc.VReg) *instruction { - i.kind = nopUseReg - i.op1 = newOperandReg(r) - return i -} - -func (i *instruction) asIdivRemSequence(execCtx, divisor, tmpGp regalloc.VReg, isDiv, signed, _64 bool) *instruction { - i.kind = idivRemSequence - i.op1 = newOperandReg(execCtx) - i.op2 = newOperandReg(divisor) - i.u1 = uint64(tmpGp) - if isDiv { - i.u2 |= 1 - } - if signed { - i.u2 |= 2 - } - if _64 { - i.u2 |= 4 - } - return i -} - -func (i *instruction) idivRemSequenceData() ( - execCtx, divisor, tmpGp regalloc.VReg, isDiv, signed, _64 bool, -) { - if i.kind != idivRemSequence { - panic("BUG") - } - return i.op1.reg(), i.op2.reg(), regalloc.VReg(i.u1), i.u2&1 != 0, i.u2&2 != 0, i.u2&4 != 0 -} - -func (i *instruction) asXmmCMov(cc cond, x operand, rd regalloc.VReg, size byte) *instruction { - i.kind = xmmCMov - i.op1 = x - i.op2 = newOperandReg(rd) - i.u1 = uint64(cc) - i.u2 = uint64(size) - return i -} - -func (i *instruction) asDefineUninitializedReg(r regalloc.VReg) *instruction { - i.kind = defineUninitializedReg - i.op2 = newOperandReg(r) - return i -} - -func (m *machine) allocateFcvtToUintSequence( - execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2 regalloc.VReg, - src64, dst64, sat bool, -) *instruction { - i := m.allocateInstr() - i.kind = fcvtToUintSequence - op1a := m.amodePool.Allocate() - op2a := m.amodePool.Allocate() - i.op1 = newOperandMem(op1a) - i.op2 = newOperandMem(op2a) - if src64 { - op1a.imm32 = 1 - } else { - op1a.imm32 = 0 - } - if dst64 { - op1a.imm32 |= 2 - } - if sat { - op1a.imm32 |= 4 - } - - op1a.base = execCtx - op1a.index = src - op2a.base = tmpGp - op2a.index = tmpGp2 - i.u1 = uint64(tmpXmm) - i.u2 = uint64(tmpXmm2) - return i -} - -func (i *instruction) fcvtToUintSequenceData() ( - execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2 regalloc.VReg, src64, dst64, sat bool, -) { - if i.kind != fcvtToUintSequence { - panic("BUG") - } - op1a := i.op1.addressMode() - op2a := i.op2.addressMode() - return op1a.base, op1a.index, op2a.base, op2a.index, regalloc.VReg(i.u1), regalloc.VReg(i.u2), - op1a.imm32&1 != 0, op1a.imm32&2 != 0, op1a.imm32&4 != 0 -} - -func (m *machine) allocateFcvtToSintSequence( - execCtx, src, tmpGp, tmpGp2, tmpXmm regalloc.VReg, - src64, dst64, sat bool, -) *instruction { - i := m.allocateInstr() - i.kind = fcvtToSintSequence - op1a := m.amodePool.Allocate() - op2a := m.amodePool.Allocate() - i.op1 = newOperandMem(op1a) - i.op2 = newOperandMem(op2a) - op1a.base = execCtx - op1a.index = src - op2a.base = tmpGp - op2a.index = tmpGp2 - i.u1 = uint64(tmpXmm) - if src64 { - i.u2 = 1 - } else { - i.u2 = 0 - } - if dst64 { - i.u2 |= 2 - } - if sat { - i.u2 |= 4 - } - return i -} - -func (i *instruction) fcvtToSintSequenceData() ( - execCtx, src, tmpGp, tmpGp2, tmpXmm regalloc.VReg, src64, dst64, sat bool, -) { - if i.kind != fcvtToSintSequence { - panic("BUG") - } - op1a := i.op1.addressMode() - op2a := i.op2.addressMode() - return op1a.base, op1a.index, op2a.base, op2a.index, regalloc.VReg(i.u1), - i.u2&1 != 0, i.u2&2 != 0, i.u2&4 != 0 -} - -func (k instructionKind) String() string { - switch k { - case nop0: - return "nop" - case ret: - return "ret" - case imm: - return "imm" - case aluRmiR: - return "aluRmiR" - case movRR: - return "movRR" - case xmmRmR: - return "xmmRmR" - case gprToXmm: - return "gprToXmm" - case xmmUnaryRmR: - return "xmmUnaryRmR" - case xmmUnaryRmRImm: - return "xmmUnaryRmRImm" - case 
unaryRmR: - return "unaryRmR" - case not: - return "not" - case neg: - return "neg" - case div: - return "div" - case mulHi: - return "mulHi" - case signExtendData: - return "signExtendData" - case movzxRmR: - return "movzxRmR" - case mov64MR: - return "mov64MR" - case lea: - return "lea" - case movsxRmR: - return "movsxRmR" - case movRM: - return "movRM" - case shiftR: - return "shiftR" - case xmmRmiReg: - return "xmmRmiReg" - case cmpRmiR: - return "cmpRmiR" - case setcc: - return "setcc" - case cmove: - return "cmove" - case push64: - return "push64" - case pop64: - return "pop64" - case xmmMovRM: - return "xmmMovRM" - case xmmLoadConst: - return "xmmLoadConst" - case xmmToGpr: - return "xmmToGpr" - case cvtUint64ToFloatSeq: - return "cvtUint64ToFloatSeq" - case cvtFloatToSintSeq: - return "cvtFloatToSintSeq" - case cvtFloatToUintSeq: - return "cvtFloatToUintSeq" - case xmmMinMaxSeq: - return "xmmMinMaxSeq" - case xmmCmpRmR: - return "xmmCmpRmR" - case xmmRmRImm: - return "xmmRmRImm" - case jmpIf: - return "jmpIf" - case jmp: - return "jmp" - case jmpTableIsland: - return "jmpTableIsland" - case exitSequence: - return "exit_sequence" - case ud2: - return "ud2" - case xchg: - return "xchg" - case zeros: - return "zeros" - case fcvtToSintSequence: - return "fcvtToSintSequence" - case fcvtToUintSequence: - return "fcvtToUintSequence" - case xmmCMov: - return "xmmCMov" - case idivRemSequence: - return "idivRemSequence" - case mfence: - return "mfence" - case lockcmpxchg: - return "lockcmpxchg" - case lockxadd: - return "lockxadd" - default: - panic("BUG") - } -} - -type aluRmiROpcode byte - -const ( - aluRmiROpcodeAdd aluRmiROpcode = iota + 1 - aluRmiROpcodeSub - aluRmiROpcodeAnd - aluRmiROpcodeOr - aluRmiROpcodeXor - aluRmiROpcodeMul -) - -func (a aluRmiROpcode) String() string { - switch a { - case aluRmiROpcodeAdd: - return "add" - case aluRmiROpcodeSub: - return "sub" - case aluRmiROpcodeAnd: - return "and" - case aluRmiROpcodeOr: - return "or" - case aluRmiROpcodeXor: - return "xor" - case aluRmiROpcodeMul: - return "imul" - default: - panic("BUG") - } -} - -func (i *instruction) asJmpIf(cond cond, target operand) *instruction { - i.kind = jmpIf - i.u1 = uint64(cond) - i.op1 = target - return i -} - -// asJmpTableSequence is used to emit the jump table. -// targetSliceIndex is the index of the target slice in machine.jmpTableTargets. 
-func (i *instruction) asJmpTableSequence(targetSliceIndex int, targetCount int) *instruction { - i.kind = jmpTableIsland - i.u1 = uint64(targetSliceIndex) - i.u2 = uint64(targetCount) - return i -} - -func (i *instruction) asJmp(target operand) *instruction { - i.kind = jmp - i.op1 = target - return i -} - -func (i *instruction) jmpLabel() label { - switch i.kind { - case jmp, jmpIf, lea, xmmUnaryRmR: - return i.op1.label() - default: - panic("BUG") - } -} - -func (i *instruction) asLEA(target operand, rd regalloc.VReg) *instruction { - i.kind = lea - i.op1 = target - i.op2 = newOperandReg(rd) - return i -} - -func (i *instruction) asCall(ref ssa.FuncRef, abi *backend.FunctionABI) *instruction { - i.kind = call - i.u1 = uint64(ref) - if abi != nil { - i.u2 = abi.ABIInfoAsUint64() - } - return i -} - -func (i *instruction) asCallIndirect(ptr operand, abi *backend.FunctionABI) *instruction { - if ptr.kind != operandKindReg && ptr.kind != operandKindMem { - panic("BUG") - } - i.kind = callIndirect - i.op1 = ptr - if abi != nil { - i.u2 = abi.ABIInfoAsUint64() - } - return i -} - -func (i *instruction) asRet() *instruction { - i.kind = ret - return i -} - -func (i *instruction) asImm(dst regalloc.VReg, value uint64, _64 bool) *instruction { - i.kind = imm - i.op2 = newOperandReg(dst) - i.u1 = value - i.b1 = _64 - return i -} - -func (i *instruction) asAluRmiR(op aluRmiROpcode, rm operand, rd regalloc.VReg, _64 bool) *instruction { - if rm.kind != operandKindReg && rm.kind != operandKindMem && rm.kind != operandKindImm32 { - panic("BUG") - } - i.kind = aluRmiR - i.op1 = rm - i.op2 = newOperandReg(rd) - i.u1 = uint64(op) - i.b1 = _64 - return i -} - -func (i *instruction) asZeros(dst regalloc.VReg) *instruction { - i.kind = zeros - i.op2 = newOperandReg(dst) - return i -} - -func (i *instruction) asBlendvpd(rm operand, rd regalloc.VReg) *instruction { - if rm.kind != operandKindReg && rm.kind != operandKindMem { - panic("BUG") - } - i.kind = blendvpd - i.op1 = rm - i.op2 = newOperandReg(rd) - return i -} - -func (i *instruction) asXmmRmR(op sseOpcode, rm operand, rd regalloc.VReg) *instruction { - if rm.kind != operandKindReg && rm.kind != operandKindMem { - panic("BUG") - } - i.kind = xmmRmR - i.op1 = rm - i.op2 = newOperandReg(rd) - i.u1 = uint64(op) - return i -} - -func (i *instruction) asXmmRmRImm(op sseOpcode, imm uint8, rm operand, rd regalloc.VReg) *instruction { - if rm.kind != operandKindReg && rm.kind != operandKindMem { - panic("BUG") - } - i.kind = xmmRmRImm - i.op1 = rm - i.op2 = newOperandReg(rd) - i.u1 = uint64(op) - i.u2 = uint64(imm) - return i -} - -func (i *instruction) asGprToXmm(op sseOpcode, rm operand, rd regalloc.VReg, _64 bool) *instruction { - if rm.kind != operandKindReg && rm.kind != operandKindMem { - panic("BUG") - } - i.kind = gprToXmm - i.op1 = rm - i.op2 = newOperandReg(rd) - i.u1 = uint64(op) - i.b1 = _64 - return i -} - -func (i *instruction) asEmitSourceOffsetInfo(l ssa.SourceOffset) *instruction { - i.kind = sourceOffsetInfo - i.u1 = uint64(l) - return i -} - -func (i *instruction) sourceOffsetInfo() ssa.SourceOffset { - return ssa.SourceOffset(i.u1) -} - -func (i *instruction) asXmmToGpr(op sseOpcode, rm, rd regalloc.VReg, _64 bool) *instruction { - i.kind = xmmToGpr - i.op1 = newOperandReg(rm) - i.op2 = newOperandReg(rd) - i.u1 = uint64(op) - i.b1 = _64 - return i -} - -func (i *instruction) asMovRM(rm regalloc.VReg, rd operand, size byte) *instruction { - if rd.kind != operandKindMem { - panic("BUG") - } - i.kind = movRM - i.op1 = newOperandReg(rm) - 
i.op2 = rd - i.u1 = uint64(size) - return i -} - -func (i *instruction) asMovsxRmR(ext extMode, src operand, rd regalloc.VReg) *instruction { - if src.kind != operandKindReg && src.kind != operandKindMem { - panic("BUG") - } - i.kind = movsxRmR - i.op1 = src - i.op2 = newOperandReg(rd) - i.u1 = uint64(ext) - return i -} - -func (i *instruction) asMovzxRmR(ext extMode, src operand, rd regalloc.VReg) *instruction { - if src.kind != operandKindReg && src.kind != operandKindMem { - panic("BUG") - } - i.kind = movzxRmR - i.op1 = src - i.op2 = newOperandReg(rd) - i.u1 = uint64(ext) - return i -} - -func (i *instruction) asSignExtendData(_64 bool) *instruction { - i.kind = signExtendData - i.b1 = _64 - return i -} - -func (i *instruction) asUD2() *instruction { - i.kind = ud2 - return i -} - -func (i *instruction) asDiv(rn operand, signed bool, _64 bool) *instruction { - i.kind = div - i.op1 = rn - i.b1 = _64 - if signed { - i.u1 = 1 - } - return i -} - -func (i *instruction) asMov64MR(rm operand, rd regalloc.VReg) *instruction { - if rm.kind != operandKindMem { - panic("BUG") - } - i.kind = mov64MR - i.op1 = rm - i.op2 = newOperandReg(rd) - return i -} - -func (i *instruction) asMovRR(rm, rd regalloc.VReg, _64 bool) *instruction { - i.kind = movRR - i.op1 = newOperandReg(rm) - i.op2 = newOperandReg(rd) - i.b1 = _64 - return i -} - -func (i *instruction) asNot(rm operand, _64 bool) *instruction { - if rm.kind != operandKindReg && rm.kind != operandKindMem { - panic("BUG") - } - i.kind = not - i.op1 = rm - i.b1 = _64 - return i -} - -func (i *instruction) asNeg(rm operand, _64 bool) *instruction { - if rm.kind != operandKindReg && rm.kind != operandKindMem { - panic("BUG") - } - i.kind = neg - i.op1 = rm - i.b1 = _64 - return i -} - -func (i *instruction) asMulHi(rm operand, signed, _64 bool) *instruction { - if rm.kind != operandKindReg && (rm.kind != operandKindMem) { - panic("BUG") - } - i.kind = mulHi - i.op1 = rm - i.b1 = _64 - if signed { - i.u1 = 1 - } - return i -} - -func (i *instruction) asUnaryRmR(op unaryRmROpcode, rm operand, rd regalloc.VReg, _64 bool) *instruction { - if rm.kind != operandKindReg && rm.kind != operandKindMem { - panic("BUG") - } - i.kind = unaryRmR - i.op1 = rm - i.op2 = newOperandReg(rd) - i.u1 = uint64(op) - i.b1 = _64 - return i -} - -func (i *instruction) asShiftR(op shiftROp, amount operand, rd regalloc.VReg, _64 bool) *instruction { - if amount.kind != operandKindReg && amount.kind != operandKindImm32 { - panic("BUG") - } - i.kind = shiftR - i.op1 = amount - i.op2 = newOperandReg(rd) - i.u1 = uint64(op) - i.b1 = _64 - return i -} - -func (i *instruction) asXmmRmiReg(op sseOpcode, rm operand, rd regalloc.VReg) *instruction { - if rm.kind != operandKindReg && rm.kind != operandKindImm32 && rm.kind != operandKindMem { - panic("BUG") - } - i.kind = xmmRmiReg - i.op1 = rm - i.op2 = newOperandReg(rd) - i.u1 = uint64(op) - return i -} - -func (i *instruction) asCmpRmiR(cmp bool, rm operand, rn regalloc.VReg, _64 bool) *instruction { - if rm.kind != operandKindReg && rm.kind != operandKindImm32 && rm.kind != operandKindMem { - panic("BUG") - } - i.kind = cmpRmiR - i.op1 = rm - i.op2 = newOperandReg(rn) - if cmp { - i.u1 = 1 - } - i.b1 = _64 - return i -} - -func (i *instruction) asSetcc(c cond, rd regalloc.VReg) *instruction { - i.kind = setcc - i.op2 = newOperandReg(rd) - i.u1 = uint64(c) - return i -} - -func (i *instruction) asCmove(c cond, rm operand, rd regalloc.VReg, _64 bool) *instruction { - i.kind = cmove - i.op1 = rm - i.op2 = newOperandReg(rd) - i.u1 = 
uint64(c) - i.b1 = _64 - return i -} - -func (m *machine) allocateExitSeq(execCtx regalloc.VReg) *instruction { - i := m.allocateInstr() - i.kind = exitSequence - i.op1 = newOperandReg(execCtx) - // Allocate the address mode that will be used in encoding the exit sequence. - i.op2 = newOperandMem(m.amodePool.Allocate()) - return i -} - -func (i *instruction) asXmmUnaryRmR(op sseOpcode, rm operand, rd regalloc.VReg) *instruction { - if rm.kind != operandKindReg && rm.kind != operandKindMem { - panic("BUG") - } - i.kind = xmmUnaryRmR - i.op1 = rm - i.op2 = newOperandReg(rd) - i.u1 = uint64(op) - return i -} - -func (i *instruction) asXmmUnaryRmRImm(op sseOpcode, imm byte, rm operand, rd regalloc.VReg) *instruction { - if rm.kind != operandKindReg && rm.kind != operandKindMem { - panic("BUG") - } - i.kind = xmmUnaryRmRImm - i.op1 = rm - i.op2 = newOperandReg(rd) - i.u1 = uint64(op) - i.u2 = uint64(imm) - return i -} - -func (i *instruction) asXmmCmpRmR(op sseOpcode, rm operand, rd regalloc.VReg) *instruction { - if rm.kind != operandKindReg && rm.kind != operandKindMem { - panic("BUG") - } - i.kind = xmmCmpRmR - i.op1 = rm - i.op2 = newOperandReg(rd) - i.u1 = uint64(op) - return i -} - -func (i *instruction) asXmmMovRM(op sseOpcode, rm regalloc.VReg, rd operand) *instruction { - if rd.kind != operandKindMem { - panic("BUG") - } - i.kind = xmmMovRM - i.op1 = newOperandReg(rm) - i.op2 = rd - i.u1 = uint64(op) - return i -} - -func (i *instruction) asPop64(rm regalloc.VReg) *instruction { - i.kind = pop64 - i.op1 = newOperandReg(rm) - return i -} - -func (i *instruction) asPush64(op operand) *instruction { - if op.kind != operandKindReg && op.kind != operandKindMem && op.kind != operandKindImm32 { - panic("BUG") - } - i.kind = push64 - i.op1 = op - return i -} - -func (i *instruction) asXCHG(rm regalloc.VReg, rd operand, size byte) *instruction { - i.kind = xchg - i.op1 = newOperandReg(rm) - i.op2 = rd - i.u1 = uint64(size) - return i -} - -func (i *instruction) asLockCmpXCHG(rm regalloc.VReg, rd *amode, size byte) *instruction { - i.kind = lockcmpxchg - i.op1 = newOperandReg(rm) - i.op2 = newOperandMem(rd) - i.u1 = uint64(size) - return i -} - -func (i *instruction) asLockXAdd(rm regalloc.VReg, rd *amode, size byte) *instruction { - i.kind = lockxadd - i.op1 = newOperandReg(rm) - i.op2 = newOperandMem(rd) - i.u1 = uint64(size) - return i -} - -type unaryRmROpcode byte - -const ( - unaryRmROpcodeBsr unaryRmROpcode = iota - unaryRmROpcodeBsf - unaryRmROpcodeLzcnt - unaryRmROpcodeTzcnt - unaryRmROpcodePopcnt -) - -func (u unaryRmROpcode) String() string { - switch u { - case unaryRmROpcodeBsr: - return "bsr" - case unaryRmROpcodeBsf: - return "bsf" - case unaryRmROpcodeLzcnt: - return "lzcnt" - case unaryRmROpcodeTzcnt: - return "tzcnt" - case unaryRmROpcodePopcnt: - return "popcnt" - default: - panic("BUG") - } -} - -type shiftROp byte - -const ( - shiftROpRotateLeft shiftROp = 0 - shiftROpRotateRight shiftROp = 1 - shiftROpShiftLeft shiftROp = 4 - shiftROpShiftRightLogical shiftROp = 5 - shiftROpShiftRightArithmetic shiftROp = 7 -) - -func (s shiftROp) String() string { - switch s { - case shiftROpRotateLeft: - return "rol" - case shiftROpRotateRight: - return "ror" - case shiftROpShiftLeft: - return "shl" - case shiftROpShiftRightLogical: - return "shr" - case shiftROpShiftRightArithmetic: - return "sar" - default: - panic("BUG") - } -} - -type sseOpcode byte - -const ( - sseOpcodeInvalid sseOpcode = iota - sseOpcodeAddps - sseOpcodeAddpd - sseOpcodeAddss - sseOpcodeAddsd - sseOpcodeAndps - 
sseOpcodeAndpd - sseOpcodeAndnps - sseOpcodeAndnpd - sseOpcodeBlendvps - sseOpcodeBlendvpd - sseOpcodeComiss - sseOpcodeComisd - sseOpcodeCmpps - sseOpcodeCmppd - sseOpcodeCmpss - sseOpcodeCmpsd - sseOpcodeCvtdq2ps - sseOpcodeCvtdq2pd - sseOpcodeCvtsd2ss - sseOpcodeCvtsd2si - sseOpcodeCvtsi2ss - sseOpcodeCvtsi2sd - sseOpcodeCvtss2si - sseOpcodeCvtss2sd - sseOpcodeCvttps2dq - sseOpcodeCvttss2si - sseOpcodeCvttsd2si - sseOpcodeDivps - sseOpcodeDivpd - sseOpcodeDivss - sseOpcodeDivsd - sseOpcodeInsertps - sseOpcodeMaxps - sseOpcodeMaxpd - sseOpcodeMaxss - sseOpcodeMaxsd - sseOpcodeMinps - sseOpcodeMinpd - sseOpcodeMinss - sseOpcodeMinsd - sseOpcodeMovaps - sseOpcodeMovapd - sseOpcodeMovd - sseOpcodeMovdqa - sseOpcodeMovdqu - sseOpcodeMovlhps - sseOpcodeMovmskps - sseOpcodeMovmskpd - sseOpcodeMovq - sseOpcodeMovss - sseOpcodeMovsd - sseOpcodeMovups - sseOpcodeMovupd - sseOpcodeMulps - sseOpcodeMulpd - sseOpcodeMulss - sseOpcodeMulsd - sseOpcodeOrps - sseOpcodeOrpd - sseOpcodePabsb - sseOpcodePabsw - sseOpcodePabsd - sseOpcodePackssdw - sseOpcodePacksswb - sseOpcodePackusdw - sseOpcodePackuswb - sseOpcodePaddb - sseOpcodePaddd - sseOpcodePaddq - sseOpcodePaddw - sseOpcodePaddsb - sseOpcodePaddsw - sseOpcodePaddusb - sseOpcodePaddusw - sseOpcodePalignr - sseOpcodePand - sseOpcodePandn - sseOpcodePavgb - sseOpcodePavgw - sseOpcodePcmpeqb - sseOpcodePcmpeqw - sseOpcodePcmpeqd - sseOpcodePcmpeqq - sseOpcodePcmpgtb - sseOpcodePcmpgtw - sseOpcodePcmpgtd - sseOpcodePcmpgtq - sseOpcodePextrb - sseOpcodePextrw - sseOpcodePextrd - sseOpcodePextrq - sseOpcodePinsrb - sseOpcodePinsrw - sseOpcodePinsrd - sseOpcodePinsrq - sseOpcodePmaddwd - sseOpcodePmaxsb - sseOpcodePmaxsw - sseOpcodePmaxsd - sseOpcodePmaxub - sseOpcodePmaxuw - sseOpcodePmaxud - sseOpcodePminsb - sseOpcodePminsw - sseOpcodePminsd - sseOpcodePminub - sseOpcodePminuw - sseOpcodePminud - sseOpcodePmovmskb - sseOpcodePmovsxbd - sseOpcodePmovsxbw - sseOpcodePmovsxbq - sseOpcodePmovsxwd - sseOpcodePmovsxwq - sseOpcodePmovsxdq - sseOpcodePmovzxbd - sseOpcodePmovzxbw - sseOpcodePmovzxbq - sseOpcodePmovzxwd - sseOpcodePmovzxwq - sseOpcodePmovzxdq - sseOpcodePmulld - sseOpcodePmullw - sseOpcodePmuludq - sseOpcodePor - sseOpcodePshufb - sseOpcodePshufd - sseOpcodePsllw - sseOpcodePslld - sseOpcodePsllq - sseOpcodePsraw - sseOpcodePsrad - sseOpcodePsrlw - sseOpcodePsrld - sseOpcodePsrlq - sseOpcodePsubb - sseOpcodePsubd - sseOpcodePsubq - sseOpcodePsubw - sseOpcodePsubsb - sseOpcodePsubsw - sseOpcodePsubusb - sseOpcodePsubusw - sseOpcodePtest - sseOpcodePunpckhbw - sseOpcodePunpcklbw - sseOpcodePxor - sseOpcodeRcpss - sseOpcodeRoundps - sseOpcodeRoundpd - sseOpcodeRoundss - sseOpcodeRoundsd - sseOpcodeRsqrtss - sseOpcodeSqrtps - sseOpcodeSqrtpd - sseOpcodeSqrtss - sseOpcodeSqrtsd - sseOpcodeSubps - sseOpcodeSubpd - sseOpcodeSubss - sseOpcodeSubsd - sseOpcodeUcomiss - sseOpcodeUcomisd - sseOpcodeXorps - sseOpcodeXorpd - sseOpcodePmulhrsw - sseOpcodeUnpcklps - sseOpcodeCvtps2pd - sseOpcodeCvtpd2ps - sseOpcodeCvttpd2dq - sseOpcodeShufps - sseOpcodePmaddubsw -) - -func (s sseOpcode) String() string { - switch s { - case sseOpcodeInvalid: - return "invalid" - case sseOpcodeAddps: - return "addps" - case sseOpcodeAddpd: - return "addpd" - case sseOpcodeAddss: - return "addss" - case sseOpcodeAddsd: - return "addsd" - case sseOpcodeAndps: - return "andps" - case sseOpcodeAndpd: - return "andpd" - case sseOpcodeAndnps: - return "andnps" - case sseOpcodeAndnpd: - return "andnpd" - case sseOpcodeBlendvps: - return "blendvps" - case sseOpcodeBlendvpd: - return 
"blendvpd" - case sseOpcodeComiss: - return "comiss" - case sseOpcodeComisd: - return "comisd" - case sseOpcodeCmpps: - return "cmpps" - case sseOpcodeCmppd: - return "cmppd" - case sseOpcodeCmpss: - return "cmpss" - case sseOpcodeCmpsd: - return "cmpsd" - case sseOpcodeCvtdq2ps: - return "cvtdq2ps" - case sseOpcodeCvtdq2pd: - return "cvtdq2pd" - case sseOpcodeCvtsd2ss: - return "cvtsd2ss" - case sseOpcodeCvtsd2si: - return "cvtsd2si" - case sseOpcodeCvtsi2ss: - return "cvtsi2ss" - case sseOpcodeCvtsi2sd: - return "cvtsi2sd" - case sseOpcodeCvtss2si: - return "cvtss2si" - case sseOpcodeCvtss2sd: - return "cvtss2sd" - case sseOpcodeCvttps2dq: - return "cvttps2dq" - case sseOpcodeCvttss2si: - return "cvttss2si" - case sseOpcodeCvttsd2si: - return "cvttsd2si" - case sseOpcodeDivps: - return "divps" - case sseOpcodeDivpd: - return "divpd" - case sseOpcodeDivss: - return "divss" - case sseOpcodeDivsd: - return "divsd" - case sseOpcodeInsertps: - return "insertps" - case sseOpcodeMaxps: - return "maxps" - case sseOpcodeMaxpd: - return "maxpd" - case sseOpcodeMaxss: - return "maxss" - case sseOpcodeMaxsd: - return "maxsd" - case sseOpcodeMinps: - return "minps" - case sseOpcodeMinpd: - return "minpd" - case sseOpcodeMinss: - return "minss" - case sseOpcodeMinsd: - return "minsd" - case sseOpcodeMovaps: - return "movaps" - case sseOpcodeMovapd: - return "movapd" - case sseOpcodeMovd: - return "movd" - case sseOpcodeMovdqa: - return "movdqa" - case sseOpcodeMovdqu: - return "movdqu" - case sseOpcodeMovlhps: - return "movlhps" - case sseOpcodeMovmskps: - return "movmskps" - case sseOpcodeMovmskpd: - return "movmskpd" - case sseOpcodeMovq: - return "movq" - case sseOpcodeMovss: - return "movss" - case sseOpcodeMovsd: - return "movsd" - case sseOpcodeMovups: - return "movups" - case sseOpcodeMovupd: - return "movupd" - case sseOpcodeMulps: - return "mulps" - case sseOpcodeMulpd: - return "mulpd" - case sseOpcodeMulss: - return "mulss" - case sseOpcodeMulsd: - return "mulsd" - case sseOpcodeOrps: - return "orps" - case sseOpcodeOrpd: - return "orpd" - case sseOpcodePabsb: - return "pabsb" - case sseOpcodePabsw: - return "pabsw" - case sseOpcodePabsd: - return "pabsd" - case sseOpcodePackssdw: - return "packssdw" - case sseOpcodePacksswb: - return "packsswb" - case sseOpcodePackusdw: - return "packusdw" - case sseOpcodePackuswb: - return "packuswb" - case sseOpcodePaddb: - return "paddb" - case sseOpcodePaddd: - return "paddd" - case sseOpcodePaddq: - return "paddq" - case sseOpcodePaddw: - return "paddw" - case sseOpcodePaddsb: - return "paddsb" - case sseOpcodePaddsw: - return "paddsw" - case sseOpcodePaddusb: - return "paddusb" - case sseOpcodePaddusw: - return "paddusw" - case sseOpcodePalignr: - return "palignr" - case sseOpcodePand: - return "pand" - case sseOpcodePandn: - return "pandn" - case sseOpcodePavgb: - return "pavgb" - case sseOpcodePavgw: - return "pavgw" - case sseOpcodePcmpeqb: - return "pcmpeqb" - case sseOpcodePcmpeqw: - return "pcmpeqw" - case sseOpcodePcmpeqd: - return "pcmpeqd" - case sseOpcodePcmpeqq: - return "pcmpeqq" - case sseOpcodePcmpgtb: - return "pcmpgtb" - case sseOpcodePcmpgtw: - return "pcmpgtw" - case sseOpcodePcmpgtd: - return "pcmpgtd" - case sseOpcodePcmpgtq: - return "pcmpgtq" - case sseOpcodePextrb: - return "pextrb" - case sseOpcodePextrw: - return "pextrw" - case sseOpcodePextrd: - return "pextrd" - case sseOpcodePextrq: - return "pextrq" - case sseOpcodePinsrb: - return "pinsrb" - case sseOpcodePinsrw: - return "pinsrw" - case sseOpcodePinsrd: - return 
"pinsrd" - case sseOpcodePinsrq: - return "pinsrq" - case sseOpcodePmaddwd: - return "pmaddwd" - case sseOpcodePmaxsb: - return "pmaxsb" - case sseOpcodePmaxsw: - return "pmaxsw" - case sseOpcodePmaxsd: - return "pmaxsd" - case sseOpcodePmaxub: - return "pmaxub" - case sseOpcodePmaxuw: - return "pmaxuw" - case sseOpcodePmaxud: - return "pmaxud" - case sseOpcodePminsb: - return "pminsb" - case sseOpcodePminsw: - return "pminsw" - case sseOpcodePminsd: - return "pminsd" - case sseOpcodePminub: - return "pminub" - case sseOpcodePminuw: - return "pminuw" - case sseOpcodePminud: - return "pminud" - case sseOpcodePmovmskb: - return "pmovmskb" - case sseOpcodePmovsxbd: - return "pmovsxbd" - case sseOpcodePmovsxbw: - return "pmovsxbw" - case sseOpcodePmovsxbq: - return "pmovsxbq" - case sseOpcodePmovsxwd: - return "pmovsxwd" - case sseOpcodePmovsxwq: - return "pmovsxwq" - case sseOpcodePmovsxdq: - return "pmovsxdq" - case sseOpcodePmovzxbd: - return "pmovzxbd" - case sseOpcodePmovzxbw: - return "pmovzxbw" - case sseOpcodePmovzxbq: - return "pmovzxbq" - case sseOpcodePmovzxwd: - return "pmovzxwd" - case sseOpcodePmovzxwq: - return "pmovzxwq" - case sseOpcodePmovzxdq: - return "pmovzxdq" - case sseOpcodePmulld: - return "pmulld" - case sseOpcodePmullw: - return "pmullw" - case sseOpcodePmuludq: - return "pmuludq" - case sseOpcodePor: - return "por" - case sseOpcodePshufb: - return "pshufb" - case sseOpcodePshufd: - return "pshufd" - case sseOpcodePsllw: - return "psllw" - case sseOpcodePslld: - return "pslld" - case sseOpcodePsllq: - return "psllq" - case sseOpcodePsraw: - return "psraw" - case sseOpcodePsrad: - return "psrad" - case sseOpcodePsrlw: - return "psrlw" - case sseOpcodePsrld: - return "psrld" - case sseOpcodePsrlq: - return "psrlq" - case sseOpcodePsubb: - return "psubb" - case sseOpcodePsubd: - return "psubd" - case sseOpcodePsubq: - return "psubq" - case sseOpcodePsubw: - return "psubw" - case sseOpcodePsubsb: - return "psubsb" - case sseOpcodePsubsw: - return "psubsw" - case sseOpcodePsubusb: - return "psubusb" - case sseOpcodePsubusw: - return "psubusw" - case sseOpcodePtest: - return "ptest" - case sseOpcodePunpckhbw: - return "punpckhbw" - case sseOpcodePunpcklbw: - return "punpcklbw" - case sseOpcodePxor: - return "pxor" - case sseOpcodeRcpss: - return "rcpss" - case sseOpcodeRoundps: - return "roundps" - case sseOpcodeRoundpd: - return "roundpd" - case sseOpcodeRoundss: - return "roundss" - case sseOpcodeRoundsd: - return "roundsd" - case sseOpcodeRsqrtss: - return "rsqrtss" - case sseOpcodeSqrtps: - return "sqrtps" - case sseOpcodeSqrtpd: - return "sqrtpd" - case sseOpcodeSqrtss: - return "sqrtss" - case sseOpcodeSqrtsd: - return "sqrtsd" - case sseOpcodeSubps: - return "subps" - case sseOpcodeSubpd: - return "subpd" - case sseOpcodeSubss: - return "subss" - case sseOpcodeSubsd: - return "subsd" - case sseOpcodeUcomiss: - return "ucomiss" - case sseOpcodeUcomisd: - return "ucomisd" - case sseOpcodeXorps: - return "xorps" - case sseOpcodeXorpd: - return "xorpd" - case sseOpcodePmulhrsw: - return "pmulhrsw" - case sseOpcodeUnpcklps: - return "unpcklps" - case sseOpcodeCvtps2pd: - return "cvtps2pd" - case sseOpcodeCvtpd2ps: - return "cvtpd2ps" - case sseOpcodeCvttpd2dq: - return "cvttpd2dq" - case sseOpcodeShufps: - return "shufps" - case sseOpcodePmaddubsw: - return "pmaddubsw" - default: - panic("BUG") - } -} - -type roundingMode uint8 - -const ( - roundingModeNearest roundingMode = iota - roundingModeDown - roundingModeUp - roundingModeZero -) - -func (r roundingMode) String() 
string { - switch r { - case roundingModeNearest: - return "nearest" - case roundingModeDown: - return "down" - case roundingModeUp: - return "up" - case roundingModeZero: - return "zero" - default: - panic("BUG") - } -} - -// cmpPred is the immediate value for a comparison operation in xmmRmRImm. -type cmpPred uint8 - -const ( - // cmpPredEQ_OQ is Equal (ordered, non-signaling) - cmpPredEQ_OQ cmpPred = iota - // cmpPredLT_OS is Less-than (ordered, signaling) - cmpPredLT_OS - // cmpPredLE_OS is Less-than-or-equal (ordered, signaling) - cmpPredLE_OS - // cmpPredUNORD_Q is Unordered (non-signaling) - cmpPredUNORD_Q - // cmpPredNEQ_UQ is Not-equal (unordered, non-signaling) - cmpPredNEQ_UQ - // cmpPredNLT_US is Not-less-than (unordered, signaling) - cmpPredNLT_US - // cmpPredNLE_US is Not-less-than-or-equal (unordered, signaling) - cmpPredNLE_US - // cmpPredORD_Q is Ordered (non-signaling) - cmpPredORD_Q - // cmpPredEQ_UQ is Equal (unordered, non-signaling) - cmpPredEQ_UQ - // cmpPredNGE_US is Not-greater-than-or-equal (unordered, signaling) - cmpPredNGE_US - // cmpPredNGT_US is Not-greater-than (unordered, signaling) - cmpPredNGT_US - // cmpPredFALSE_OQ is False (ordered, non-signaling) - cmpPredFALSE_OQ - // cmpPredNEQ_OQ is Not-equal (ordered, non-signaling) - cmpPredNEQ_OQ - // cmpPredGE_OS is Greater-than-or-equal (ordered, signaling) - cmpPredGE_OS - // cmpPredGT_OS is Greater-than (ordered, signaling) - cmpPredGT_OS - // cmpPredTRUE_UQ is True (unordered, non-signaling) - cmpPredTRUE_UQ - // Equal (ordered, signaling) - cmpPredEQ_OS - // Less-than (ordered, nonsignaling) - cmpPredLT_OQ - // Less-than-or-equal (ordered, nonsignaling) - cmpPredLE_OQ - // Unordered (signaling) - cmpPredUNORD_S - // Not-equal (unordered, signaling) - cmpPredNEQ_US - // Not-less-than (unordered, nonsignaling) - cmpPredNLT_UQ - // Not-less-than-or-equal (unordered, nonsignaling) - cmpPredNLE_UQ - // Ordered (signaling) - cmpPredORD_S - // Equal (unordered, signaling) - cmpPredEQ_US - // Not-greater-than-or-equal (unordered, non-signaling) - cmpPredNGE_UQ - // Not-greater-than (unordered, nonsignaling) - cmpPredNGT_UQ - // False (ordered, signaling) - cmpPredFALSE_OS - // Not-equal (ordered, signaling) - cmpPredNEQ_OS - // Greater-than-or-equal (ordered, nonsignaling) - cmpPredGE_OQ - // Greater-than (ordered, nonsignaling) - cmpPredGT_OQ - // True (unordered, signaling) - cmpPredTRUE_US -) - -func (r cmpPred) String() string { - switch r { - case cmpPredEQ_OQ: - return "eq_oq" - case cmpPredLT_OS: - return "lt_os" - case cmpPredLE_OS: - return "le_os" - case cmpPredUNORD_Q: - return "unord_q" - case cmpPredNEQ_UQ: - return "neq_uq" - case cmpPredNLT_US: - return "nlt_us" - case cmpPredNLE_US: - return "nle_us" - case cmpPredORD_Q: - return "ord_q" - case cmpPredEQ_UQ: - return "eq_uq" - case cmpPredNGE_US: - return "nge_us" - case cmpPredNGT_US: - return "ngt_us" - case cmpPredFALSE_OQ: - return "false_oq" - case cmpPredNEQ_OQ: - return "neq_oq" - case cmpPredGE_OS: - return "ge_os" - case cmpPredGT_OS: - return "gt_os" - case cmpPredTRUE_UQ: - return "true_uq" - case cmpPredEQ_OS: - return "eq_os" - case cmpPredLT_OQ: - return "lt_oq" - case cmpPredLE_OQ: - return "le_oq" - case cmpPredUNORD_S: - return "unord_s" - case cmpPredNEQ_US: - return "neq_us" - case cmpPredNLT_UQ: - return "nlt_uq" - case cmpPredNLE_UQ: - return "nle_uq" - case cmpPredORD_S: - return "ord_s" - case cmpPredEQ_US: - return "eq_us" - case cmpPredNGE_UQ: - return "nge_uq" - case cmpPredNGT_UQ: - return "ngt_uq" - case cmpPredFALSE_OS: 
- return "false_os" - case cmpPredNEQ_OS: - return "neq_os" - case cmpPredGE_OQ: - return "ge_oq" - case cmpPredGT_OQ: - return "gt_oq" - case cmpPredTRUE_US: - return "true_us" - default: - panic("BUG") - } -} - -func linkInstr(prev, next *instruction) *instruction { - prev.next = next - next.prev = prev - return next -} - -type defKind byte - -const ( - defKindNone defKind = iota + 1 - defKindOp2 - defKindCall - defKindDivRem -) - -var defKinds = [instrMax]defKind{ - nop0: defKindNone, - ret: defKindNone, - movRR: defKindOp2, - movRM: defKindNone, - xmmMovRM: defKindNone, - aluRmiR: defKindNone, - shiftR: defKindNone, - imm: defKindOp2, - unaryRmR: defKindOp2, - xmmRmiReg: defKindNone, - xmmUnaryRmR: defKindOp2, - xmmUnaryRmRImm: defKindOp2, - xmmCmpRmR: defKindNone, - xmmRmR: defKindNone, - xmmRmRImm: defKindNone, - mov64MR: defKindOp2, - movsxRmR: defKindOp2, - movzxRmR: defKindOp2, - gprToXmm: defKindOp2, - xmmToGpr: defKindOp2, - cmove: defKindNone, - call: defKindCall, - callIndirect: defKindCall, - ud2: defKindNone, - jmp: defKindNone, - jmpIf: defKindNone, - jmpTableIsland: defKindNone, - cmpRmiR: defKindNone, - exitSequence: defKindNone, - lea: defKindOp2, - setcc: defKindOp2, - zeros: defKindOp2, - sourceOffsetInfo: defKindNone, - fcvtToSintSequence: defKindNone, - defineUninitializedReg: defKindOp2, - fcvtToUintSequence: defKindNone, - xmmCMov: defKindOp2, - idivRemSequence: defKindDivRem, - blendvpd: defKindNone, - mfence: defKindNone, - xchg: defKindNone, - lockcmpxchg: defKindNone, - lockxadd: defKindNone, - neg: defKindNone, - nopUseReg: defKindNone, -} - -// String implements fmt.Stringer. -func (d defKind) String() string { - switch d { - case defKindNone: - return "none" - case defKindOp2: - return "op2" - case defKindCall: - return "call" - case defKindDivRem: - return "divrem" - default: - return "invalid" - } -} - -type useKind byte - -const ( - useKindNone useKind = iota + 1 - useKindOp1 - // useKindOp1Op2Reg is Op1 can be any operand, Op2 must be a register. - useKindOp1Op2Reg - // useKindOp1RegOp2 is Op1 must be a register, Op2 can be any operand. - useKindOp1RegOp2 - // useKindRaxOp1RegOp2 is Op1 must be a register, Op2 can be any operand, and RAX is used. 
- useKindRaxOp1RegOp2 - useKindDivRem - useKindBlendvpd - useKindCall - useKindCallInd - useKindFcvtToSintSequence - useKindFcvtToUintSequence -) - -var useKinds = [instrMax]useKind{ - nop0: useKindNone, - ret: useKindNone, - movRR: useKindOp1, - movRM: useKindOp1RegOp2, - xmmMovRM: useKindOp1RegOp2, - cmove: useKindOp1Op2Reg, - aluRmiR: useKindOp1Op2Reg, - shiftR: useKindOp1Op2Reg, - imm: useKindNone, - unaryRmR: useKindOp1, - xmmRmiReg: useKindOp1Op2Reg, - xmmUnaryRmR: useKindOp1, - xmmUnaryRmRImm: useKindOp1, - xmmCmpRmR: useKindOp1Op2Reg, - xmmRmR: useKindOp1Op2Reg, - xmmRmRImm: useKindOp1Op2Reg, - mov64MR: useKindOp1, - movzxRmR: useKindOp1, - movsxRmR: useKindOp1, - gprToXmm: useKindOp1, - xmmToGpr: useKindOp1, - call: useKindCall, - callIndirect: useKindCallInd, - ud2: useKindNone, - jmpIf: useKindOp1, - jmp: useKindOp1, - cmpRmiR: useKindOp1Op2Reg, - exitSequence: useKindOp1, - lea: useKindOp1, - jmpTableIsland: useKindNone, - setcc: useKindNone, - zeros: useKindNone, - sourceOffsetInfo: useKindNone, - fcvtToSintSequence: useKindFcvtToSintSequence, - defineUninitializedReg: useKindNone, - fcvtToUintSequence: useKindFcvtToUintSequence, - xmmCMov: useKindOp1, - idivRemSequence: useKindDivRem, - blendvpd: useKindBlendvpd, - mfence: useKindNone, - xchg: useKindOp1RegOp2, - lockcmpxchg: useKindRaxOp1RegOp2, - lockxadd: useKindOp1RegOp2, - neg: useKindOp1, - nopUseReg: useKindOp1, -} - -func (u useKind) String() string { - switch u { - case useKindNone: - return "none" - case useKindOp1: - return "op1" - case useKindOp1Op2Reg: - return "op1op2Reg" - case useKindOp1RegOp2: - return "op1RegOp2" - case useKindCall: - return "call" - case useKindCallInd: - return "callInd" - default: - return "invalid" - } -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr_encoding.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr_encoding.go deleted file mode 100644 index 6637b428c..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr_encoding.go +++ /dev/null @@ -1,1683 +0,0 @@ -package amd64 - -import ( - "fmt" - - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" - "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" -) - -func (i *instruction) encode(c backend.Compiler) (needsLabelResolution bool) { - switch kind := i.kind; kind { - case nop0, sourceOffsetInfo, defineUninitializedReg, fcvtToSintSequence, fcvtToUintSequence, nopUseReg: - case ret: - encodeRet(c) - case imm: - dst := regEncodings[i.op2.reg().RealReg()] - con := i.u1 - if i.b1 { // 64 bit. - if lower32willSignExtendTo64(con) { - // Sign extend mov(imm32). 
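- // A 64-bit constant whose upper 32 bits are all copies of bit 31 fits the shorter REX.W + C7 /0 + imm32 form (7 bytes), e.g. mov rax, -1 => 48 C7 C0 FF FF FF FF.
- // Anything else takes the 10-byte movabs form below (REX.W + B8+rd + imm64), e.g. mov rax, 0x1_0000_0000 => 48 B8 00 00 00 00 01 00 00 00.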
- encodeRegReg(c, - legacyPrefixesNone, - 0xc7, 1, - 0, - dst, - rexInfo(0).setW(), - ) - c.Emit4Bytes(uint32(con)) - } else { - c.EmitByte(rexEncodingW | dst.rexBit()) - c.EmitByte(0xb8 | dst.encoding()) - c.Emit8Bytes(con) - } - } else { - if dst.rexBit() > 0 { - c.EmitByte(rexEncodingDefault | 0x1) - } - c.EmitByte(0xb8 | dst.encoding()) - c.Emit4Bytes(uint32(con)) - } - - case aluRmiR: - var rex rexInfo - if i.b1 { - rex = rex.setW() - } else { - rex = rex.clearW() - } - - dst := regEncodings[i.op2.reg().RealReg()] - - aluOp := aluRmiROpcode(i.u1) - if aluOp == aluRmiROpcodeMul { - op1 := i.op1 - const regMemOpc, regMemOpcNum = 0x0FAF, 2 - switch op1.kind { - case operandKindReg: - src := regEncodings[op1.reg().RealReg()] - encodeRegReg(c, legacyPrefixesNone, regMemOpc, regMemOpcNum, dst, src, rex) - case operandKindMem: - m := i.op1.addressMode() - encodeRegMem(c, legacyPrefixesNone, regMemOpc, regMemOpcNum, dst, m, rex) - case operandKindImm32: - imm8 := lower8willSignExtendTo32(op1.imm32()) - var opc uint32 - if imm8 { - opc = 0x6b - } else { - opc = 0x69 - } - encodeRegReg(c, legacyPrefixesNone, opc, 1, dst, dst, rex) - if imm8 { - c.EmitByte(byte(op1.imm32())) - } else { - c.Emit4Bytes(op1.imm32()) - } - default: - panic("BUG: invalid operand kind") - } - } else { - const opcodeNum = 1 - var opcR, opcM, subOpcImm uint32 - switch aluOp { - case aluRmiROpcodeAdd: - opcR, opcM, subOpcImm = 0x01, 0x03, 0x0 - case aluRmiROpcodeSub: - opcR, opcM, subOpcImm = 0x29, 0x2b, 0x5 - case aluRmiROpcodeAnd: - opcR, opcM, subOpcImm = 0x21, 0x23, 0x4 - case aluRmiROpcodeOr: - opcR, opcM, subOpcImm = 0x09, 0x0b, 0x1 - case aluRmiROpcodeXor: - opcR, opcM, subOpcImm = 0x31, 0x33, 0x6 - default: - panic("BUG: invalid aluRmiROpcode") - } - - op1 := i.op1 - switch op1.kind { - case operandKindReg: - src := regEncodings[op1.reg().RealReg()] - encodeRegReg(c, legacyPrefixesNone, opcR, opcodeNum, src, dst, rex) - case operandKindMem: - m := i.op1.addressMode() - encodeRegMem(c, legacyPrefixesNone, opcM, opcodeNum, dst, m, rex) - case operandKindImm32: - imm8 := lower8willSignExtendTo32(op1.imm32()) - var opc uint32 - if imm8 { - opc = 0x83 - } else { - opc = 0x81 - } - encodeRegReg(c, legacyPrefixesNone, opc, opcodeNum, regEnc(subOpcImm), dst, rex) - if imm8 { - c.EmitByte(byte(op1.imm32())) - } else { - c.Emit4Bytes(op1.imm32()) - } - default: - panic("BUG: invalid operand kind") - } - } - - case movRR: - src := regEncodings[i.op1.reg().RealReg()] - dst := regEncodings[i.op2.reg().RealReg()] - var rex rexInfo - if i.b1 { - rex = rex.setW() - } else { - rex = rex.clearW() - } - encodeRegReg(c, legacyPrefixesNone, 0x89, 1, src, dst, rex) - - case xmmRmR, blendvpd: - op := sseOpcode(i.u1) - var legPrex legacyPrefixes - var opcode uint32 - var opcodeNum uint32 - switch op { - case sseOpcodeAddps: - legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F58, 2 - case sseOpcodeAddpd: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F58, 2 - case sseOpcodeAddss: - legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F58, 2 - case sseOpcodeAddsd: - legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F58, 2 - case sseOpcodeAndps: - legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F54, 2 - case sseOpcodeAndpd: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F54, 2 - case sseOpcodeAndnps: - legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F55, 2 - case sseOpcodeAndnpd: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F55, 2 - case sseOpcodeBlendvps: - legPrex, opcode, opcodeNum = 
legacyPrefixes0x66, 0x0F3814, 3 - case sseOpcodeBlendvpd: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3815, 3 - case sseOpcodeDivps: - legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F5E, 2 - case sseOpcodeDivpd: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F5E, 2 - case sseOpcodeDivss: - legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5E, 2 - case sseOpcodeDivsd: - legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F5E, 2 - case sseOpcodeMaxps: - legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F5F, 2 - case sseOpcodeMaxpd: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F5F, 2 - case sseOpcodeMaxss: - legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5F, 2 - case sseOpcodeMaxsd: - legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F5F, 2 - case sseOpcodeMinps: - legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F5D, 2 - case sseOpcodeMinpd: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F5D, 2 - case sseOpcodeMinss: - legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5D, 2 - case sseOpcodeMinsd: - legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F5D, 2 - case sseOpcodeMovlhps: - legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F16, 2 - case sseOpcodeMovsd: - legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F10, 2 - case sseOpcodeMulps: - legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F59, 2 - case sseOpcodeMulpd: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F59, 2 - case sseOpcodeMulss: - legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F59, 2 - case sseOpcodeMulsd: - legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F59, 2 - case sseOpcodeOrpd: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F56, 2 - case sseOpcodeOrps: - legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F56, 2 - case sseOpcodePackssdw: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F6B, 2 - case sseOpcodePacksswb: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F63, 2 - case sseOpcodePackusdw: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F382B, 3 - case sseOpcodePackuswb: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F67, 2 - case sseOpcodePaddb: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FFC, 2 - case sseOpcodePaddd: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FFE, 2 - case sseOpcodePaddq: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FD4, 2 - case sseOpcodePaddw: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FFD, 2 - case sseOpcodePaddsb: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FEC, 2 - case sseOpcodePaddsw: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FED, 2 - case sseOpcodePaddusb: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDC, 2 - case sseOpcodePaddusw: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDD, 2 - case sseOpcodePand: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDB, 2 - case sseOpcodePandn: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDF, 2 - case sseOpcodePavgb: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FE0, 2 - case sseOpcodePavgw: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FE3, 2 - case sseOpcodePcmpeqb: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F74, 2 - case sseOpcodePcmpeqw: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F75, 2 - case sseOpcodePcmpeqd: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F76, 2 - case sseOpcodePcmpeqq: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3829, 3 - case sseOpcodePcmpgtb: - legPrex, opcode, 
opcodeNum = legacyPrefixes0x66, 0x0F64, 2 - case sseOpcodePcmpgtw: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F65, 2 - case sseOpcodePcmpgtd: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F66, 2 - case sseOpcodePcmpgtq: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3837, 3 - case sseOpcodePmaddwd: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FF5, 2 - case sseOpcodePmaxsb: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383C, 3 - case sseOpcodePmaxsw: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FEE, 2 - case sseOpcodePmaxsd: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383D, 3 - case sseOpcodePmaxub: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDE, 2 - case sseOpcodePmaxuw: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383E, 3 - case sseOpcodePmaxud: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383F, 3 - case sseOpcodePminsb: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3838, 3 - case sseOpcodePminsw: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FEA, 2 - case sseOpcodePminsd: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3839, 3 - case sseOpcodePminub: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDA, 2 - case sseOpcodePminuw: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383A, 3 - case sseOpcodePminud: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383B, 3 - case sseOpcodePmulld: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3840, 3 - case sseOpcodePmullw: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FD5, 2 - case sseOpcodePmuludq: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FF4, 2 - case sseOpcodePor: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FEB, 2 - case sseOpcodePshufb: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3800, 3 - case sseOpcodePsubb: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FF8, 2 - case sseOpcodePsubd: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FFA, 2 - case sseOpcodePsubq: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FFB, 2 - case sseOpcodePsubw: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FF9, 2 - case sseOpcodePsubsb: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FE8, 2 - case sseOpcodePsubsw: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FE9, 2 - case sseOpcodePsubusb: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FD8, 2 - case sseOpcodePsubusw: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FD9, 2 - case sseOpcodePunpckhbw: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F68, 2 - case sseOpcodePunpcklbw: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F60, 2 - case sseOpcodePxor: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FEF, 2 - case sseOpcodeSubps: - legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F5C, 2 - case sseOpcodeSubpd: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F5C, 2 - case sseOpcodeSubss: - legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5C, 2 - case sseOpcodeSubsd: - legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F5C, 2 - case sseOpcodeXorps: - legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F57, 2 - case sseOpcodeXorpd: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F57, 2 - case sseOpcodePmulhrsw: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F380B, 3 - case sseOpcodeUnpcklps: - legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F14, 2 - case sseOpcodePmaddubsw: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 
0x0F3804, 3 - default: - if kind == blendvpd { - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3815, 3 - } else { - panic(fmt.Sprintf("Unsupported sseOpcode: %s", op)) - } - } - - dst := regEncodings[i.op2.reg().RealReg()] - - rex := rexInfo(0).clearW() - op1 := i.op1 - if op1.kind == operandKindReg { - src := regEncodings[op1.reg().RealReg()] - encodeRegReg(c, legPrex, opcode, opcodeNum, dst, src, rex) - } else if i.op1.kind == operandKindMem { - m := i.op1.addressMode() - encodeRegMem(c, legPrex, opcode, opcodeNum, dst, m, rex) - } else { - panic("BUG: invalid operand kind") - } - - case gprToXmm: - var legPrefix legacyPrefixes - var opcode uint32 - const opcodeNum = 2 - switch sseOpcode(i.u1) { - case sseOpcodeMovd, sseOpcodeMovq: - legPrefix, opcode = legacyPrefixes0x66, 0x0f6e - case sseOpcodeCvtsi2ss: - legPrefix, opcode = legacyPrefixes0xF3, 0x0f2a - case sseOpcodeCvtsi2sd: - legPrefix, opcode = legacyPrefixes0xF2, 0x0f2a - default: - panic(fmt.Sprintf("Unsupported sseOpcode: %s", sseOpcode(i.u1))) - } - - var rex rexInfo - if i.b1 { - rex = rex.setW() - } else { - rex = rex.clearW() - } - dst := regEncodings[i.op2.reg().RealReg()] - - op1 := i.op1 - if op1.kind == operandKindReg { - src := regEncodings[op1.reg().RealReg()] - encodeRegReg(c, legPrefix, opcode, opcodeNum, dst, src, rex) - } else if i.op1.kind == operandKindMem { - m := i.op1.addressMode() - encodeRegMem(c, legPrefix, opcode, opcodeNum, dst, m, rex) - } else { - panic("BUG: invalid operand kind") - } - - case xmmUnaryRmR: - var prefix legacyPrefixes - var opcode uint32 - var opcodeNum uint32 - op := sseOpcode(i.u1) - switch op { - case sseOpcodeCvtss2sd: - prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5A, 2 - case sseOpcodeCvtsd2ss: - prefix, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F5A, 2 - case sseOpcodeMovaps: - prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F28, 2 - case sseOpcodeMovapd: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F28, 2 - case sseOpcodeMovdqa: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F6F, 2 - case sseOpcodeMovdqu: - prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F6F, 2 - case sseOpcodeMovsd: - prefix, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F10, 2 - case sseOpcodeMovss: - prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F10, 2 - case sseOpcodeMovups: - prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F10, 2 - case sseOpcodeMovupd: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F10, 2 - case sseOpcodePabsb: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F381C, 3 - case sseOpcodePabsw: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F381D, 3 - case sseOpcodePabsd: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F381E, 3 - case sseOpcodePmovsxbd: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3821, 3 - case sseOpcodePmovsxbw: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3820, 3 - case sseOpcodePmovsxbq: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3822, 3 - case sseOpcodePmovsxwd: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3823, 3 - case sseOpcodePmovsxwq: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3824, 3 - case sseOpcodePmovsxdq: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3825, 3 - case sseOpcodePmovzxbd: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3831, 3 - case sseOpcodePmovzxbw: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3830, 3 - case sseOpcodePmovzxbq: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3832, 3 - case sseOpcodePmovzxwd: - 
prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3833, 3 - case sseOpcodePmovzxwq: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3834, 3 - case sseOpcodePmovzxdq: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3835, 3 - case sseOpcodeSqrtps: - prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F51, 2 - case sseOpcodeSqrtpd: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F51, 2 - case sseOpcodeSqrtss: - prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F51, 2 - case sseOpcodeSqrtsd: - prefix, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F51, 2 - case sseOpcodeXorps: - prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F57, 2 - case sseOpcodeXorpd: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F57, 2 - case sseOpcodeCvtdq2ps: - prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F5B, 2 - case sseOpcodeCvtdq2pd: - prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0FE6, 2 - case sseOpcodeCvtps2pd: - prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F5A, 2 - case sseOpcodeCvtpd2ps: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F5A, 2 - case sseOpcodeCvttps2dq: - prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5B, 2 - case sseOpcodeCvttpd2dq: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0FE6, 2 - default: - panic(fmt.Sprintf("Unsupported sseOpcode: %s", op)) - } - - dst := regEncodings[i.op2.reg().RealReg()] - - rex := rexInfo(0).clearW() - op1 := i.op1 - if op1.kind == operandKindReg { - src := regEncodings[op1.reg().RealReg()] - encodeRegReg(c, prefix, opcode, opcodeNum, dst, src, rex) - } else if i.op1.kind == operandKindMem { - m := i.op1.addressMode() - needsLabelResolution = encodeRegMem(c, prefix, opcode, opcodeNum, dst, m, rex) - } else { - panic("BUG: invalid operand kind") - } - - case xmmUnaryRmRImm: - var prefix legacyPrefixes - var opcode uint32 - var opcodeNum uint32 - op := sseOpcode(i.u1) - switch op { - case sseOpcodeRoundps: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f3a08, 3 - case sseOpcodeRoundss: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f3a0a, 3 - case sseOpcodeRoundpd: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f3a09, 3 - case sseOpcodeRoundsd: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f3a0b, 3 - } - rex := rexInfo(0).clearW() - dst := regEncodings[i.op2.reg().RealReg()] - op1 := i.op1 - if op1.kind == operandKindReg { - src := regEncodings[op1.reg().RealReg()] - encodeRegReg(c, prefix, opcode, opcodeNum, dst, src, rex) - } else if i.op1.kind == operandKindMem { - m := i.op1.addressMode() - encodeRegMem(c, prefix, opcode, opcodeNum, dst, m, rex) - } else { - panic("BUG: invalid operand kind") - } - - c.EmitByte(byte(i.u2)) - - case unaryRmR: - var prefix legacyPrefixes - var opcode uint32 - var opcodeNum uint32 - op := unaryRmROpcode(i.u1) - // We assume size is either 32 or 64. - switch op { - case unaryRmROpcodeBsr: - prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0fbd, 2 - case unaryRmROpcodeBsf: - prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0fbc, 2 - case unaryRmROpcodeLzcnt: - prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0fbd, 2 - case unaryRmROpcodeTzcnt: - prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0fbc, 2 - case unaryRmROpcodePopcnt: - prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0fb8, 2 - default: - panic(fmt.Sprintf("Unsupported unaryRmROpcode: %s", op)) - } - - dst := regEncodings[i.op2.reg().RealReg()] - - rex := rexInfo(0) - if i.b1 { // 64 bit. 
- rex = rexInfo(0).setW() - } else { - rex = rexInfo(0).clearW() - } - op1 := i.op1 - if op1.kind == operandKindReg { - src := regEncodings[op1.reg().RealReg()] - encodeRegReg(c, prefix, opcode, opcodeNum, dst, src, rex) - } else if i.op1.kind == operandKindMem { - m := i.op1.addressMode() - encodeRegMem(c, prefix, opcode, opcodeNum, dst, m, rex) - } else { - panic("BUG: invalid operand kind") - } - - case not: - var prefix legacyPrefixes - src := regEncodings[i.op1.reg().RealReg()] - rex := rexInfo(0) - if i.b1 { // 64 bit. - rex = rexInfo(0).setW() - } else { - rex = rexInfo(0).clearW() - } - subopcode := uint8(2) - encodeEncEnc(c, prefix, 0xf7, 1, subopcode, uint8(src), rex) - - case neg: - var prefix legacyPrefixes - src := regEncodings[i.op1.reg().RealReg()] - rex := rexInfo(0) - if i.b1 { // 64 bit. - rex = rexInfo(0).setW() - } else { - rex = rexInfo(0).clearW() - } - subopcode := uint8(3) - encodeEncEnc(c, prefix, 0xf7, 1, subopcode, uint8(src), rex) - - case div: - rex := rexInfo(0) - if i.b1 { // 64 bit. - rex = rexInfo(0).setW() - } else { - rex = rexInfo(0).clearW() - } - var subopcode uint8 - if i.u1 != 0 { // Signed. - subopcode = 7 - } else { - subopcode = 6 - } - - divisor := i.op1 - if divisor.kind == operandKindReg { - src := regEncodings[divisor.reg().RealReg()] - encodeEncEnc(c, legacyPrefixesNone, 0xf7, 1, subopcode, uint8(src), rex) - } else if divisor.kind == operandKindMem { - m := divisor.addressMode() - encodeEncMem(c, legacyPrefixesNone, 0xf7, 1, subopcode, m, rex) - } else { - panic("BUG: invalid operand kind") - } - - case mulHi: - var prefix legacyPrefixes - rex := rexInfo(0) - if i.b1 { // 64 bit. - rex = rexInfo(0).setW() - } else { - rex = rexInfo(0).clearW() - } - - signed := i.u1 != 0 - var subopcode uint8 - if signed { - subopcode = 5 - } else { - subopcode = 4 - } - - // src1 is implicitly rax, - // dst_lo is implicitly rax, - // dst_hi is implicitly rdx. - src2 := i.op1 - if src2.kind == operandKindReg { - src := regEncodings[src2.reg().RealReg()] - encodeEncEnc(c, prefix, 0xf7, 1, subopcode, uint8(src), rex) - } else if src2.kind == operandKindMem { - m := src2.addressMode() - encodeEncMem(c, prefix, 0xf7, 1, subopcode, m, rex) - } else { - panic("BUG: invalid operand kind") - } - - case signExtendData: - if i.b1 { // 64 bit. 
- c.EmitByte(0x48) - c.EmitByte(0x99) - } else { - c.EmitByte(0x99) - } - case movzxRmR, movsxRmR: - signed := i.kind == movsxRmR - - ext := extMode(i.u1) - var opcode uint32 - var opcodeNum uint32 - var rex rexInfo - switch ext { - case extModeBL: - if signed { - opcode, opcodeNum, rex = 0x0fbe, 2, rex.clearW() - } else { - opcode, opcodeNum, rex = 0x0fb6, 2, rex.clearW() - } - case extModeBQ: - if signed { - opcode, opcodeNum, rex = 0x0fbe, 2, rex.setW() - } else { - opcode, opcodeNum, rex = 0x0fb6, 2, rex.setW() - } - case extModeWL: - if signed { - opcode, opcodeNum, rex = 0x0fbf, 2, rex.clearW() - } else { - opcode, opcodeNum, rex = 0x0fb7, 2, rex.clearW() - } - case extModeWQ: - if signed { - opcode, opcodeNum, rex = 0x0fbf, 2, rex.setW() - } else { - opcode, opcodeNum, rex = 0x0fb7, 2, rex.setW() - } - case extModeLQ: - if signed { - opcode, opcodeNum, rex = 0x63, 1, rex.setW() - } else { - opcode, opcodeNum, rex = 0x8b, 1, rex.clearW() - } - default: - panic("BUG: invalid extMode") - } - - op := i.op1 - dst := regEncodings[i.op2.reg().RealReg()] - switch op.kind { - case operandKindReg: - src := regEncodings[op.reg().RealReg()] - if ext == extModeBL || ext == extModeBQ { - // Some destinations must be encoded with REX.R = 1. - if e := src.encoding(); e >= 4 && e <= 7 { - rex = rex.always() - } - } - encodeRegReg(c, legacyPrefixesNone, opcode, opcodeNum, dst, src, rex) - case operandKindMem: - m := op.addressMode() - encodeRegMem(c, legacyPrefixesNone, opcode, opcodeNum, dst, m, rex) - default: - panic("BUG: invalid operand kind") - } - - case mov64MR: - m := i.op1.addressMode() - encodeLoad64(c, m, i.op2.reg().RealReg()) - - case lea: - needsLabelResolution = true - dst := regEncodings[i.op2.reg().RealReg()] - rex := rexInfo(0).setW() - const opcode, opcodeNum = 0x8d, 1 - switch i.op1.kind { - case operandKindMem: - a := i.op1.addressMode() - encodeRegMem(c, legacyPrefixesNone, opcode, opcodeNum, dst, a, rex) - case operandKindLabel: - rex.encode(c, regRexBit(byte(dst)), 0) - c.EmitByte(byte((opcode) & 0xff)) - - // Indicate "LEAQ [RIP + 32bit displacement]. - // https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing - c.EmitByte(encodeModRM(0b00, dst.encoding(), 0b101)) - - // This will be resolved later, so we just emit a placeholder (0xffffffff for testing). - c.Emit4Bytes(0xffffffff) - default: - panic("BUG: invalid operand kind") - } - - case movRM: - m := i.op2.addressMode() - src := regEncodings[i.op1.reg().RealReg()] - - var rex rexInfo - switch i.u1 { - case 1: - if e := src.encoding(); e >= 4 && e <= 7 { - rex = rex.always() - } - encodeRegMem(c, legacyPrefixesNone, 0x88, 1, src, m, rex.clearW()) - case 2: - encodeRegMem(c, legacyPrefixes0x66, 0x89, 1, src, m, rex.clearW()) - case 4: - encodeRegMem(c, legacyPrefixesNone, 0x89, 1, src, m, rex.clearW()) - case 8: - encodeRegMem(c, legacyPrefixesNone, 0x89, 1, src, m, rex.setW()) - default: - panic(fmt.Sprintf("BUG: invalid size %d: %s", i.u1, i.String())) - } - - case shiftR: - src := regEncodings[i.op2.reg().RealReg()] - amount := i.op1 - - var opcode uint32 - var prefix legacyPrefixes - rex := rexInfo(0) - if i.b1 { // 64 bit. 
- rex = rexInfo(0).setW() - } else { - rex = rexInfo(0).clearW() - } - - switch amount.kind { - case operandKindReg: - if amount.reg() != rcxVReg { - panic("BUG: invalid reg operand: must be rcx") - } - opcode, prefix = 0xd3, legacyPrefixesNone - encodeEncEnc(c, prefix, opcode, 1, uint8(i.u1), uint8(src), rex) - case operandKindImm32: - opcode, prefix = 0xc1, legacyPrefixesNone - encodeEncEnc(c, prefix, opcode, 1, uint8(i.u1), uint8(src), rex) - c.EmitByte(byte(amount.imm32())) - default: - panic("BUG: invalid operand kind") - } - case xmmRmiReg: - const legPrefix = legacyPrefixes0x66 - rex := rexInfo(0).clearW() - dst := regEncodings[i.op2.reg().RealReg()] - - var opcode uint32 - var regDigit uint8 - - op := sseOpcode(i.u1) - op1 := i.op1 - if i.op1.kind == operandKindImm32 { - switch op { - case sseOpcodePsllw: - opcode, regDigit = 0x0f71, 6 - case sseOpcodePslld: - opcode, regDigit = 0x0f72, 6 - case sseOpcodePsllq: - opcode, regDigit = 0x0f73, 6 - case sseOpcodePsraw: - opcode, regDigit = 0x0f71, 4 - case sseOpcodePsrad: - opcode, regDigit = 0x0f72, 4 - case sseOpcodePsrlw: - opcode, regDigit = 0x0f71, 2 - case sseOpcodePsrld: - opcode, regDigit = 0x0f72, 2 - case sseOpcodePsrlq: - opcode, regDigit = 0x0f73, 2 - default: - panic("invalid opcode") - } - - encodeEncEnc(c, legPrefix, opcode, 2, regDigit, uint8(dst), rex) - imm32 := op1.imm32() - if imm32 > 0xff&imm32 { - panic("immediate value does not fit 1 byte") - } - c.EmitByte(uint8(imm32)) - } else { - switch op { - case sseOpcodePsllw: - opcode = 0x0ff1 - case sseOpcodePslld: - opcode = 0x0ff2 - case sseOpcodePsllq: - opcode = 0x0ff3 - case sseOpcodePsraw: - opcode = 0x0fe1 - case sseOpcodePsrad: - opcode = 0x0fe2 - case sseOpcodePsrlw: - opcode = 0x0fd1 - case sseOpcodePsrld: - opcode = 0x0fd2 - case sseOpcodePsrlq: - opcode = 0x0fd3 - default: - panic("invalid opcode") - } - - if op1.kind == operandKindReg { - reg := regEncodings[op1.reg().RealReg()] - encodeRegReg(c, legPrefix, opcode, 2, dst, reg, rex) - } else if op1.kind == operandKindMem { - m := op1.addressMode() - encodeRegMem(c, legPrefix, opcode, 2, dst, m, rex) - } else { - panic("BUG: invalid operand kind") - } - } - - case cmpRmiR: - var opcode uint32 - isCmp := i.u1 != 0 - rex := rexInfo(0) - _64 := i.b1 - if _64 { // 64 bit. - rex = rex.setW() - } else { - rex = rex.clearW() - } - dst := regEncodings[i.op2.reg().RealReg()] - op1 := i.op1 - switch op1.kind { - case operandKindReg: - reg := regEncodings[op1.reg().RealReg()] - if isCmp { - opcode = 0x39 - } else { - opcode = 0x85 - } - // Here we swap the encoding of the operands for CMP to be consistent with the output of LLVM/GCC. 
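- // With opcode 0x39 (CMP r/m, r), op1's register goes into ModRM.reg and dst (op2) into ModRM.rm, so e.g. cmp rax, rcx is emitted as 48 39 C8 rather than the equivalent 48 3B C1 form.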
- encodeRegReg(c, legacyPrefixesNone, opcode, 1, reg, dst, rex) - - case operandKindMem: - if isCmp { - opcode = 0x3b - } else { - opcode = 0x85 - } - m := op1.addressMode() - encodeRegMem(c, legacyPrefixesNone, opcode, 1, dst, m, rex) - - case operandKindImm32: - imm32 := op1.imm32() - useImm8 := isCmp && lower8willSignExtendTo32(imm32) - var subopcode uint8 - - switch { - case isCmp && useImm8: - opcode, subopcode = 0x83, 7 - case isCmp && !useImm8: - opcode, subopcode = 0x81, 7 - default: - opcode, subopcode = 0xf7, 0 - } - encodeEncEnc(c, legacyPrefixesNone, opcode, 1, subopcode, uint8(dst), rex) - if useImm8 { - c.EmitByte(uint8(imm32)) - } else { - c.Emit4Bytes(imm32) - } - - default: - panic("BUG: invalid operand kind") - } - case setcc: - cc := cond(i.u1) - dst := regEncodings[i.op2.reg().RealReg()] - rex := rexInfo(0).clearW().always() - opcode := uint32(0x0f90) + uint32(cc) - encodeEncEnc(c, legacyPrefixesNone, opcode, 2, 0, uint8(dst), rex) - case cmove: - cc := cond(i.u1) - dst := regEncodings[i.op2.reg().RealReg()] - rex := rexInfo(0) - if i.b1 { // 64 bit. - rex = rex.setW() - } else { - rex = rex.clearW() - } - opcode := uint32(0x0f40) + uint32(cc) - src := i.op1 - switch src.kind { - case operandKindReg: - srcReg := regEncodings[src.reg().RealReg()] - encodeRegReg(c, legacyPrefixesNone, opcode, 2, dst, srcReg, rex) - case operandKindMem: - m := src.addressMode() - encodeRegMem(c, legacyPrefixesNone, opcode, 2, dst, m, rex) - default: - panic("BUG: invalid operand kind") - } - case push64: - op := i.op1 - - switch op.kind { - case operandKindReg: - dst := regEncodings[op.reg().RealReg()] - if dst.rexBit() > 0 { - c.EmitByte(rexEncodingDefault | 0x1) - } - c.EmitByte(0x50 | dst.encoding()) - case operandKindMem: - m := op.addressMode() - encodeRegMem( - c, legacyPrefixesNone, 0xff, 1, regEnc(6), m, rexInfo(0).clearW(), - ) - case operandKindImm32: - c.EmitByte(0x68) - c.Emit4Bytes(op.imm32()) - default: - panic("BUG: invalid operand kind") - } - - case pop64: - dst := regEncodings[i.op1.reg().RealReg()] - if dst.rexBit() > 0 { - c.EmitByte(rexEncodingDefault | 0x1) - } - c.EmitByte(0x58 | dst.encoding()) - - case xmmMovRM: - var legPrefix legacyPrefixes - var opcode uint32 - const opcodeNum = 2 - switch sseOpcode(i.u1) { - case sseOpcodeMovaps: - legPrefix, opcode = legacyPrefixesNone, 0x0f29 - case sseOpcodeMovapd: - legPrefix, opcode = legacyPrefixes0x66, 0x0f29 - case sseOpcodeMovdqa: - legPrefix, opcode = legacyPrefixes0x66, 0x0f7f - case sseOpcodeMovdqu: - legPrefix, opcode = legacyPrefixes0xF3, 0x0f7f - case sseOpcodeMovss: - legPrefix, opcode = legacyPrefixes0xF3, 0x0f11 - case sseOpcodeMovsd: - legPrefix, opcode = legacyPrefixes0xF2, 0x0f11 - case sseOpcodeMovups: - legPrefix, opcode = legacyPrefixesNone, 0x0f11 - case sseOpcodeMovupd: - legPrefix, opcode = legacyPrefixes0x66, 0x0f11 - default: - panic(fmt.Sprintf("Unsupported sseOpcode: %s", sseOpcode(i.u1))) - } - - dst := regEncodings[i.op1.reg().RealReg()] - encodeRegMem(c, legPrefix, opcode, opcodeNum, dst, i.op2.addressMode(), rexInfo(0).clearW()) - case xmmLoadConst: - panic("TODO") - case xmmToGpr: - var legPrefix legacyPrefixes - var opcode uint32 - var argSwap bool - const opcodeNum = 2 - switch sseOpcode(i.u1) { - case sseOpcodeMovd, sseOpcodeMovq: - legPrefix, opcode, argSwap = legacyPrefixes0x66, 0x0f7e, false - case sseOpcodeMovmskps: - legPrefix, opcode, argSwap = legacyPrefixesNone, 0x0f50, true - case sseOpcodeMovmskpd: - legPrefix, opcode, argSwap = legacyPrefixes0x66, 0x0f50, true - case 
sseOpcodePmovmskb: - legPrefix, opcode, argSwap = legacyPrefixes0x66, 0x0fd7, true - case sseOpcodeCvttss2si: - legPrefix, opcode, argSwap = legacyPrefixes0xF3, 0x0f2c, true - case sseOpcodeCvttsd2si: - legPrefix, opcode, argSwap = legacyPrefixes0xF2, 0x0f2c, true - default: - panic(fmt.Sprintf("Unsupported sseOpcode: %s", sseOpcode(i.u1))) - } - - var rex rexInfo - if i.b1 { - rex = rex.setW() - } else { - rex = rex.clearW() - } - src := regEncodings[i.op1.reg().RealReg()] - dst := regEncodings[i.op2.reg().RealReg()] - if argSwap { - src, dst = dst, src - } - encodeRegReg(c, legPrefix, opcode, opcodeNum, src, dst, rex) - - case cvtUint64ToFloatSeq: - panic("TODO") - case cvtFloatToSintSeq: - panic("TODO") - case cvtFloatToUintSeq: - panic("TODO") - case xmmMinMaxSeq: - panic("TODO") - case xmmCmpRmR: - var prefix legacyPrefixes - var opcode uint32 - var opcodeNum uint32 - rex := rexInfo(0) - _64 := i.b1 - if _64 { // 64 bit. - rex = rex.setW() - } else { - rex = rex.clearW() - } - - op := sseOpcode(i.u1) - switch op { - case sseOpcodePtest: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f3817, 3 - case sseOpcodeUcomisd: - prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f2e, 2 - case sseOpcodeUcomiss: - prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0f2e, 2 - default: - panic(fmt.Sprintf("Unsupported sseOpcode: %s", op)) - } - - dst := regEncodings[i.op2.reg().RealReg()] - op1 := i.op1 - switch op1.kind { - case operandKindReg: - reg := regEncodings[op1.reg().RealReg()] - encodeRegReg(c, prefix, opcode, opcodeNum, dst, reg, rex) - - case operandKindMem: - m := op1.addressMode() - encodeRegMem(c, prefix, opcode, opcodeNum, dst, m, rex) - - default: - panic("BUG: invalid operand kind") - } - case xmmRmRImm: - op := sseOpcode(i.u1) - var legPrex legacyPrefixes - var opcode uint32 - var opcodeNum uint32 - var swap bool - switch op { - case sseOpcodeCmpps: - legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0FC2, 2 - case sseOpcodeCmppd: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FC2, 2 - case sseOpcodeCmpss: - legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0FC2, 2 - case sseOpcodeCmpsd: - legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0FC2, 2 - case sseOpcodeInsertps: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A21, 3 - case sseOpcodePalignr: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A0F, 3 - case sseOpcodePinsrb: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A20, 3 - case sseOpcodePinsrw: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FC4, 2 - case sseOpcodePinsrd, sseOpcodePinsrq: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A22, 3 - case sseOpcodePextrb: - swap = true - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A14, 3 - case sseOpcodePextrw: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FC5, 2 - case sseOpcodePextrd, sseOpcodePextrq: - swap = true - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A16, 3 - case sseOpcodePshufd: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F70, 2 - case sseOpcodeRoundps: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A08, 3 - case sseOpcodeRoundpd: - legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A09, 3 - case sseOpcodeShufps: - legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0FC6, 2 - default: - panic(fmt.Sprintf("Unsupported sseOpcode: %s", op)) - } - - dst := regEncodings[i.op2.reg().RealReg()] - - var rex rexInfo - if op == sseOpcodePextrq || op == sseOpcodePinsrq { - rex = rexInfo(0).setW() - } else { - rex = 
rexInfo(0).clearW() - } - op1 := i.op1 - if op1.kind == operandKindReg { - src := regEncodings[op1.reg().RealReg()] - if swap { - src, dst = dst, src - } - encodeRegReg(c, legPrex, opcode, opcodeNum, dst, src, rex) - } else if i.op1.kind == operandKindMem { - if swap { - panic("BUG: this is not possible to encode") - } - m := i.op1.addressMode() - encodeRegMem(c, legPrex, opcode, opcodeNum, dst, m, rex) - } else { - panic("BUG: invalid operand kind") - } - - c.EmitByte(byte(i.u2)) - - case jmp: - const ( - regMemOpcode = 0xff - regMemOpcodeNum = 1 - regMemSubOpcode = 4 - ) - op := i.op1 - switch op.kind { - case operandKindLabel: - needsLabelResolution = true - fallthrough - case operandKindImm32: - c.EmitByte(0xe9) - c.Emit4Bytes(op.imm32()) - case operandKindMem: - m := op.addressMode() - encodeRegMem(c, - legacyPrefixesNone, - regMemOpcode, regMemOpcodeNum, - regMemSubOpcode, m, rexInfo(0).clearW(), - ) - case operandKindReg: - r := op.reg().RealReg() - encodeRegReg( - c, - legacyPrefixesNone, - regMemOpcode, regMemOpcodeNum, - regMemSubOpcode, - regEncodings[r], rexInfo(0).clearW(), - ) - default: - panic("BUG: invalid operand kind") - } - - case jmpIf: - op := i.op1 - switch op.kind { - case operandKindLabel: - needsLabelResolution = true - fallthrough - case operandKindImm32: - c.EmitByte(0x0f) - c.EmitByte(0x80 | cond(i.u1).encoding()) - c.Emit4Bytes(op.imm32()) - default: - panic("BUG: invalid operand kind") - } - - case jmpTableIsland: - needsLabelResolution = true - for tc := uint64(0); tc < i.u2; tc++ { - c.Emit8Bytes(0) - } - - case exitSequence: - execCtx := i.op1.reg() - allocatedAmode := i.op2.addressMode() - - // Restore the RBP, RSP, and return to the Go code: - *allocatedAmode = amode{ - kindWithShift: uint32(amodeImmReg), base: execCtx, - imm32: wazevoapi.ExecutionContextOffsetOriginalFramePointer.U32(), - } - encodeLoad64(c, allocatedAmode, rbp) - allocatedAmode.imm32 = wazevoapi.ExecutionContextOffsetOriginalStackPointer.U32() - encodeLoad64(c, allocatedAmode, rsp) - encodeRet(c) - - case ud2: - c.EmitByte(0x0f) - c.EmitByte(0x0b) - - case call: - c.EmitByte(0xe8) - // Meaning that the call target is a function value, and requires relocation. - c.AddRelocationInfo(ssa.FuncRef(i.u1)) - // Note that this is zero as a placeholder for the call target if it's a function value. - c.Emit4Bytes(uint32(i.u2)) - - case callIndirect: - op := i.op1 - - const opcodeNum = 1 - const opcode = 0xff - rex := rexInfo(0).clearW() - switch op.kind { - case operandKindReg: - dst := regEncodings[op.reg().RealReg()] - encodeRegReg(c, - legacyPrefixesNone, - opcode, opcodeNum, - regEnc(2), - dst, - rex, - ) - case operandKindMem: - m := op.addressMode() - encodeRegMem(c, - legacyPrefixesNone, - opcode, opcodeNum, - regEnc(2), - m, - rex, - ) - default: - panic("BUG: invalid operand kind") - } - - case xchg: - src, dst := regEncodings[i.op1.reg().RealReg()], i.op2 - size := i.u1 - - var rex rexInfo - var opcode uint32 - lp := legacyPrefixesNone - switch size { - case 8: - opcode = 0x87 - rex = rexInfo(0).setW() - case 4: - opcode = 0x87 - rex = rexInfo(0).clearW() - case 2: - lp = legacyPrefixes0x66 - opcode = 0x87 - rex = rexInfo(0).clearW() - case 1: - opcode = 0x86 - if i.op2.kind == operandKindReg { - panic("TODO?: xchg on two 1-byte registers") - } - // Some destinations must be encoded with REX.R = 1. 
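- // Without any REX prefix, byte-register encodings 4-7 select AH/CH/DH/BH; emitting a REX prefix (even a bare 0x40) makes them select SPL/BPL/SIL/DIL instead, which is what rexInfo(0).always() guarantees below.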
- if e := src.encoding(); e >= 4 && e <= 7 { - rex = rexInfo(0).always() - } - default: - panic(fmt.Sprintf("BUG: invalid size %d: %s", size, i.String())) - } - - switch dst.kind { - case operandKindMem: - m := dst.addressMode() - encodeRegMem(c, lp, opcode, 1, src, m, rex) - case operandKindReg: - r := dst.reg().RealReg() - encodeRegReg(c, lp, opcode, 1, src, regEncodings[r], rex) - default: - panic("BUG: invalid operand kind") - } - - case lockcmpxchg: - src, dst := regEncodings[i.op1.reg().RealReg()], i.op2 - size := i.u1 - - var rex rexInfo - var opcode uint32 - lp := legacyPrefixes0xF0 // Lock prefix. - switch size { - case 8: - opcode = 0x0FB1 - rex = rexInfo(0).setW() - case 4: - opcode = 0x0FB1 - rex = rexInfo(0).clearW() - case 2: - lp = legacyPrefixes0x660xF0 // Legacy prefix + Lock prefix. - opcode = 0x0FB1 - rex = rexInfo(0).clearW() - case 1: - opcode = 0x0FB0 - // Some destinations must be encoded with REX.R = 1. - if e := src.encoding(); e >= 4 && e <= 7 { - rex = rexInfo(0).always() - } - default: - panic(fmt.Sprintf("BUG: invalid size %d: %s", size, i.String())) - } - - switch dst.kind { - case operandKindMem: - m := dst.addressMode() - encodeRegMem(c, lp, opcode, 2, src, m, rex) - default: - panic("BUG: invalid operand kind") - } - - case lockxadd: - src, dst := regEncodings[i.op1.reg().RealReg()], i.op2 - size := i.u1 - - var rex rexInfo - var opcode uint32 - lp := legacyPrefixes0xF0 // Lock prefix. - switch size { - case 8: - opcode = 0x0FC1 - rex = rexInfo(0).setW() - case 4: - opcode = 0x0FC1 - rex = rexInfo(0).clearW() - case 2: - lp = legacyPrefixes0x660xF0 // Legacy prefix + Lock prefix. - opcode = 0x0FC1 - rex = rexInfo(0).clearW() - case 1: - opcode = 0x0FC0 - // Some destinations must be encoded with REX.R = 1. - if e := src.encoding(); e >= 4 && e <= 7 { - rex = rexInfo(0).always() - } - default: - panic(fmt.Sprintf("BUG: invalid size %d: %s", size, i.String())) - } - - switch dst.kind { - case operandKindMem: - m := dst.addressMode() - encodeRegMem(c, lp, opcode, 2, src, m, rex) - default: - panic("BUG: invalid operand kind") - } - - case zeros: - r := i.op2.reg() - if r.RegType() == regalloc.RegTypeInt { - i.asAluRmiR(aluRmiROpcodeXor, newOperandReg(r), r, true) - } else { - i.asXmmRmR(sseOpcodePxor, newOperandReg(r), r) - } - i.encode(c) - - case mfence: - // https://www.felixcloutier.com/x86/mfence - c.EmitByte(0x0f) - c.EmitByte(0xae) - c.EmitByte(0xf0) - - default: - panic(fmt.Sprintf("TODO: %v", i.kind)) - } - return -} - -func encodeLoad64(c backend.Compiler, m *amode, rd regalloc.RealReg) { - dst := regEncodings[rd] - encodeRegMem(c, legacyPrefixesNone, 0x8b, 1, dst, m, rexInfo(0).setW()) -} - -func encodeRet(c backend.Compiler) { - c.EmitByte(0xc3) -} - -func encodeEncEnc( - c backend.Compiler, - legPrefixes legacyPrefixes, - opcodes uint32, - opcodeNum uint32, - r uint8, - rm uint8, - rex rexInfo, -) { - legPrefixes.encode(c) - rex.encode(c, r>>3, rm>>3) - - for opcodeNum > 0 { - opcodeNum-- - c.EmitByte(byte((opcodes >> (opcodeNum << 3)) & 0xff)) - } - c.EmitByte(encodeModRM(3, r&7, rm&7)) -} - -func encodeRegReg( - c backend.Compiler, - legPrefixes legacyPrefixes, - opcodes uint32, - opcodeNum uint32, - r regEnc, - rm regEnc, - rex rexInfo, -) { - encodeEncEnc(c, legPrefixes, opcodes, opcodeNum, uint8(r), uint8(rm), rex) -} - -func encodeModRM(mod byte, reg byte, rm byte) byte { - return mod<<6 | reg<<3 | rm -} - -func encodeSIB(shift byte, encIndex byte, encBase byte) byte { - return shift<<6 | encIndex<<3 | encBase -} - -func encodeRegMem( - c 
backend.Compiler, legPrefixes legacyPrefixes, opcodes uint32, opcodeNum uint32, r regEnc, m *amode, rex rexInfo, -) (needsLabelResolution bool) { - needsLabelResolution = encodeEncMem(c, legPrefixes, opcodes, opcodeNum, uint8(r), m, rex) - return -} - -func encodeEncMem( - c backend.Compiler, legPrefixes legacyPrefixes, opcodes uint32, opcodeNum uint32, r uint8, m *amode, rex rexInfo, -) (needsLabelResolution bool) { - legPrefixes.encode(c) - - const ( - modNoDisplacement = 0b00 - modShortDisplacement = 0b01 - modLongDisplacement = 0b10 - - useSBI = 4 // the encoding of rsp or r12 register. - ) - - switch m.kind() { - case amodeImmReg, amodeImmRBP: - base := m.base.RealReg() - baseEnc := regEncodings[base] - - rex.encode(c, regRexBit(r), baseEnc.rexBit()) - - for opcodeNum > 0 { - opcodeNum-- - c.EmitByte(byte((opcodes >> (opcodeNum << 3)) & 0xff)) - } - - // SIB byte is the last byte of the memory encoding before the displacement - const sibByte = 0x24 // == encodeSIB(0, 4, 4) - - immZero, baseRbp, baseR13 := m.imm32 == 0, base == rbp, base == r13 - short := lower8willSignExtendTo32(m.imm32) - rspOrR12 := base == rsp || base == r12 - - if immZero && !baseRbp && !baseR13 { // rbp or r13 can't be used as base for without displacement encoding. - c.EmitByte(encodeModRM(modNoDisplacement, regEncoding(r), baseEnc.encoding())) - if rspOrR12 { - c.EmitByte(sibByte) - } - } else if short { // Note: this includes the case where m.imm32 == 0 && base == rbp || base == r13. - c.EmitByte(encodeModRM(modShortDisplacement, regEncoding(r), baseEnc.encoding())) - if rspOrR12 { - c.EmitByte(sibByte) - } - c.EmitByte(byte(m.imm32)) - } else { - c.EmitByte(encodeModRM(modLongDisplacement, regEncoding(r), baseEnc.encoding())) - if rspOrR12 { - c.EmitByte(sibByte) - } - c.Emit4Bytes(m.imm32) - } - - case amodeRegRegShift: - base := m.base.RealReg() - baseEnc := regEncodings[base] - index := m.index.RealReg() - indexEnc := regEncodings[index] - - if index == rsp { - panic("BUG: rsp can't be used as index of addressing mode") - } - - rex.encodeForIndex(c, regEnc(r), indexEnc, baseEnc) - - for opcodeNum > 0 { - opcodeNum-- - c.EmitByte(byte((opcodes >> (opcodeNum << 3)) & 0xff)) - } - - immZero, baseRbp, baseR13 := m.imm32 == 0, base == rbp, base == r13 - if immZero && !baseRbp && !baseR13 { // rbp or r13 can't be used as base for without displacement encoding. (curious why? because it's interpreted as RIP relative addressing). - c.EmitByte(encodeModRM(modNoDisplacement, regEncoding(r), useSBI)) - c.EmitByte(encodeSIB(m.shift(), indexEnc.encoding(), baseEnc.encoding())) - } else if lower8willSignExtendTo32(m.imm32) { - c.EmitByte(encodeModRM(modShortDisplacement, regEncoding(r), useSBI)) - c.EmitByte(encodeSIB(m.shift(), indexEnc.encoding(), baseEnc.encoding())) - c.EmitByte(byte(m.imm32)) - } else { - c.EmitByte(encodeModRM(modLongDisplacement, regEncoding(r), useSBI)) - c.EmitByte(encodeSIB(m.shift(), indexEnc.encoding(), baseEnc.encoding())) - c.Emit4Bytes(m.imm32) - } - - case amodeRipRel: - rex.encode(c, regRexBit(r), 0) - for opcodeNum > 0 { - opcodeNum-- - c.EmitByte(byte((opcodes >> (opcodeNum << 3)) & 0xff)) - } - - // Indicate "LEAQ [RIP + 32bit displacement]. - // https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing - c.EmitByte(encodeModRM(0b00, regEncoding(r), 0b101)) - - // This will be resolved later, so we just emit a placeholder. 
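- // ModRM with mod=00 and rm=0b101 selects RIP-relative addressing in 64-bit mode; the 4 bytes emitted below are the disp32 that label resolution patches later, e.g. lea rax, [rip+disp32] is 48 8D 05 followed by that displacement.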
- needsLabelResolution = true - c.Emit4Bytes(0) - - default: - panic("BUG: invalid addressing mode") - } - return -} - -const ( - rexEncodingDefault byte = 0x40 - rexEncodingW = rexEncodingDefault | 0x08 -) - -// rexInfo is a bit set to indicate: -// -// 0x01: W bit must be set (64-bit operand size). -// 0x02: REX prefix must be emitted. -type rexInfo byte - -func (ri rexInfo) setW() rexInfo { - return ri | 0x01 -} - -func (ri rexInfo) clearW() rexInfo { - return ri & 0x02 -} - -func (ri rexInfo) always() rexInfo { - return ri | 0x02 -} - -func (ri rexInfo) notAlways() rexInfo { //nolint - return ri & 0x01 -} - -func (ri rexInfo) encode(c backend.Compiler, r uint8, b uint8) { - var w byte = 0 - if ri&0x01 != 0 { - w = 0x01 - } - rex := rexEncodingDefault | w<<3 | r<<2 | b - if rex != rexEncodingDefault || ri&0x02 != 0 { - c.EmitByte(rex) - } -} - -func (ri rexInfo) encodeForIndex(c backend.Compiler, encR regEnc, encIndex regEnc, encBase regEnc) { - var w byte = 0 - if ri&0x01 != 0 { - w = 0x01 - } - r := encR.rexBit() - x := encIndex.rexBit() - b := encBase.rexBit() - rex := byte(0x40) | w<<3 | r<<2 | x<<1 | b - if rex != 0x40 || ri&0x02 != 0 { - c.EmitByte(rex) - } -} - -type regEnc byte - -func (r regEnc) rexBit() byte { - return regRexBit(byte(r)) -} - -func (r regEnc) encoding() byte { - return regEncoding(byte(r)) -} - -func regRexBit(r byte) byte { - return r >> 3 -} - -func regEncoding(r byte) byte { - return r & 0x07 -} - -var regEncodings = [...]regEnc{ - rax: 0b000, - rcx: 0b001, - rdx: 0b010, - rbx: 0b011, - rsp: 0b100, - rbp: 0b101, - rsi: 0b110, - rdi: 0b111, - r8: 0b1000, - r9: 0b1001, - r10: 0b1010, - r11: 0b1011, - r12: 0b1100, - r13: 0b1101, - r14: 0b1110, - r15: 0b1111, - xmm0: 0b000, - xmm1: 0b001, - xmm2: 0b010, - xmm3: 0b011, - xmm4: 0b100, - xmm5: 0b101, - xmm6: 0b110, - xmm7: 0b111, - xmm8: 0b1000, - xmm9: 0b1001, - xmm10: 0b1010, - xmm11: 0b1011, - xmm12: 0b1100, - xmm13: 0b1101, - xmm14: 0b1110, - xmm15: 0b1111, -} - -type legacyPrefixes byte - -const ( - legacyPrefixesNone legacyPrefixes = iota - legacyPrefixes0x66 - legacyPrefixes0xF0 - legacyPrefixes0x660xF0 - legacyPrefixes0xF2 - legacyPrefixes0xF3 -) - -func (p legacyPrefixes) encode(c backend.Compiler) { - switch p { - case legacyPrefixesNone: - case legacyPrefixes0x66: - c.EmitByte(0x66) - case legacyPrefixes0xF0: - c.EmitByte(0xf0) - case legacyPrefixes0x660xF0: - c.EmitByte(0x66) - c.EmitByte(0xf0) - case legacyPrefixes0xF2: - c.EmitByte(0xf2) - case legacyPrefixes0xF3: - c.EmitByte(0xf3) - default: - panic("BUG: invalid legacy prefix") - } -} - -func lower32willSignExtendTo64(x uint64) bool { - xs := int64(x) - return xs == int64(uint64(int32(xs))) -} - -func lower8willSignExtendTo32(x uint32) bool { - xs := int32(x) - return xs == ((xs << 24) >> 24) -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_constant.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_constant.go deleted file mode 100644 index 55d05ef63..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_constant.go +++ /dev/null @@ -1,71 +0,0 @@ -package amd64 - -import ( - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" -) - -// lowerConstant allocates a new VReg and inserts the instruction to load the constant value.
-func (m *machine) lowerConstant(instr *ssa.Instruction) (vr regalloc.VReg) { - val := instr.Return() - valType := val.Type() - - vr = m.c.AllocateVReg(valType) - m.insertLoadConstant(instr, vr) - return -} - -// InsertLoadConstantBlockArg implements backend.Machine. -func (m *machine) InsertLoadConstantBlockArg(instr *ssa.Instruction, vr regalloc.VReg) { - m.insertLoadConstant(instr, vr) -} - -func (m *machine) insertLoadConstant(instr *ssa.Instruction, vr regalloc.VReg) { - val := instr.Return() - valType := val.Type() - v := instr.ConstantVal() - - bits := valType.Bits() - if bits < 64 { // Clear the redundant bits just in case it's unexpectedly sign-extended, etc. - v = v & ((1 << valType.Bits()) - 1) - } - - switch valType { - case ssa.TypeF32, ssa.TypeF64: - m.lowerFconst(vr, v, bits == 64) - case ssa.TypeI32, ssa.TypeI64: - m.lowerIconst(vr, v, bits == 64) - default: - panic("BUG") - } -} - -func (m *machine) lowerFconst(dst regalloc.VReg, c uint64, _64 bool) { - if c == 0 { - xor := m.allocateInstr().asZeros(dst) - m.insert(xor) - } else { - var tmpType ssa.Type - if _64 { - tmpType = ssa.TypeI64 - } else { - tmpType = ssa.TypeI32 - } - tmpInt := m.c.AllocateVReg(tmpType) - loadToGP := m.allocateInstr().asImm(tmpInt, c, _64) - m.insert(loadToGP) - - movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpInt), dst, _64) - m.insert(movToXmm) - } -} - -func (m *machine) lowerIconst(dst regalloc.VReg, c uint64, _64 bool) { - i := m.allocateInstr() - if c == 0 { - i.asZeros(dst) - } else { - i.asImm(dst, c, _64) - } - m.insert(i) -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_mem.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_mem.go deleted file mode 100644 index befe8c643..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_mem.go +++ /dev/null @@ -1,187 +0,0 @@ -package amd64 - -import ( - "fmt" - - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" -) - -var addendsMatchOpcodes = [...]ssa.Opcode{ssa.OpcodeUExtend, ssa.OpcodeSExtend, ssa.OpcodeIadd, ssa.OpcodeIconst, ssa.OpcodeIshl} - -type addend struct { - r regalloc.VReg - off int64 - shift byte -} - -func (a addend) String() string { - return fmt.Sprintf("addend{r=%s, off=%d, shift=%d}", a.r, a.off, a.shift) -} - -// lowerToAddressMode converts a pointer to an addressMode that can be used as an operand for load/store instructions. -func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32) (am *amode) { - def := m.c.ValueDefinition(ptr) - - if offsetBase&0x80000000 != 0 { - // Special casing the huge base offset whose MSB is set. In x64, the immediate is always - // sign-extended, but our IR semantics requires the offset base is always unsigned. - // Note that this should be extremely rare or even this shouldn't hit in the real application, - // therefore we don't need to optimize this case in my opinion. 
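- // x64 sign-extends a 32-bit displacement, so an unsigned offsetBase with the MSB set (e.g. 0x8000_0000) cannot be folded into the disp32 field; the combined offset is instead materialized into a temporary register and used as the base of the address mode below.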
- - a := m.lowerAddend(def) - off64 := a.off + int64(offsetBase) - offsetBaseReg := m.c.AllocateVReg(ssa.TypeI64) - m.lowerIconst(offsetBaseReg, uint64(off64), true) - if a.r != regalloc.VRegInvalid { - return m.newAmodeRegRegShift(0, offsetBaseReg, a.r, a.shift) - } else { - return m.newAmodeImmReg(0, offsetBaseReg) - } - } - - if op := m.c.MatchInstrOneOf(def, addendsMatchOpcodes[:]); op == ssa.OpcodeIadd { - add := def.Instr - x, y := add.Arg2() - xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) - ax := m.lowerAddend(xDef) - ay := m.lowerAddend(yDef) - add.MarkLowered() - return m.lowerAddendsToAmode(ax, ay, offsetBase) - } else { - // If it is not an Iadd, then we lower the one addend. - a := m.lowerAddend(def) - // off is always 0 if r is valid. - if a.r != regalloc.VRegInvalid { - if a.shift != 0 { - tmpReg := m.c.AllocateVReg(ssa.TypeI64) - m.lowerIconst(tmpReg, 0, true) - return m.newAmodeRegRegShift(offsetBase, tmpReg, a.r, a.shift) - } - return m.newAmodeImmReg(offsetBase, a.r) - } else { - off64 := a.off + int64(offsetBase) - tmpReg := m.c.AllocateVReg(ssa.TypeI64) - m.lowerIconst(tmpReg, uint64(off64), true) - return m.newAmodeImmReg(0, tmpReg) - } - } -} - -func (m *machine) lowerAddendsToAmode(x, y addend, offBase uint32) *amode { - if x.r != regalloc.VRegInvalid && x.off != 0 || y.r != regalloc.VRegInvalid && y.off != 0 { - panic("invalid input") - } - - u64 := uint64(x.off+y.off) + uint64(offBase) - if u64 != 0 { - if _, ok := asImm32(u64, false); !ok { - tmpReg := m.c.AllocateVReg(ssa.TypeI64) - m.lowerIconst(tmpReg, u64, true) - // Blank u64 as it has been already lowered. - u64 = 0 - - if x.r == regalloc.VRegInvalid { - x.r = tmpReg - } else if y.r == regalloc.VRegInvalid { - y.r = tmpReg - } else { - // We already know that either rx or ry is invalid, - // so we overwrite it with the temporary register. - panic("BUG") - } - } - } - - u32 := uint32(u64) - switch { - // We assume rx, ry are valid iff offx, offy are 0. - case x.r != regalloc.VRegInvalid && y.r != regalloc.VRegInvalid: - switch { - case x.shift != 0 && y.shift != 0: - // Cannot absorb two shifted registers, must lower one to a shift instruction. - shifted := m.allocateInstr() - shifted.asShiftR(shiftROpShiftLeft, newOperandImm32(uint32(x.shift)), x.r, true) - m.insert(shifted) - - return m.newAmodeRegRegShift(u32, x.r, y.r, y.shift) - case x.shift != 0 && y.shift == 0: - // Swap base and index. - x, y = y, x - fallthrough - default: - return m.newAmodeRegRegShift(u32, x.r, y.r, y.shift) - } - case x.r == regalloc.VRegInvalid && y.r != regalloc.VRegInvalid: - x, y = y, x - fallthrough - case x.r != regalloc.VRegInvalid && y.r == regalloc.VRegInvalid: - if x.shift != 0 { - zero := m.c.AllocateVReg(ssa.TypeI64) - m.lowerIconst(zero, 0, true) - return m.newAmodeRegRegShift(u32, zero, x.r, x.shift) - } - return m.newAmodeImmReg(u32, x.r) - default: // Both are invalid: use the offset. - tmpReg := m.c.AllocateVReg(ssa.TypeI64) - m.lowerIconst(tmpReg, u64, true) - return m.newAmodeImmReg(0, tmpReg) - } -} - -func (m *machine) lowerAddend(x backend.SSAValueDefinition) addend { - if !x.IsFromInstr() { - return addend{m.c.VRegOf(x.V), 0, 0} - } - // Ensure the addend is not referenced in multiple places; we will discard nested Iadds. 
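- // If the defining instruction is one of addendsMatchOpcodes (other than a nested Iadd) and the match succeeds (it fails for multi-referenced definitions, per the comment above), it is folded into the address mode via lowerAddendFromInstr; otherwise the value is simply materialized into a register via getOperand_Reg below.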
- op := m.c.MatchInstrOneOf(x, addendsMatchOpcodes[:]) - if op != ssa.OpcodeInvalid && op != ssa.OpcodeIadd { - return m.lowerAddendFromInstr(x.Instr) - } - p := m.getOperand_Reg(x) - return addend{p.reg(), 0, 0} -} - -// lowerAddendFromInstr takes an instruction and returns a VReg and an offset that can be used in an address mode. -// The VReg is regalloc.VRegInvalid if the addend cannot be lowered to a register. -// The offset is 0 if the addend can be lowered to a register. -func (m *machine) lowerAddendFromInstr(instr *ssa.Instruction) addend { - instr.MarkLowered() - switch op := instr.Opcode(); op { - case ssa.OpcodeIconst: - u64 := instr.ConstantVal() - if instr.Return().Type().Bits() == 32 { - return addend{regalloc.VRegInvalid, int64(int32(u64)), 0} // sign-extend. - } else { - return addend{regalloc.VRegInvalid, int64(u64), 0} - } - case ssa.OpcodeUExtend, ssa.OpcodeSExtend: - input := instr.Arg() - inputDef := m.c.ValueDefinition(input) - if input.Type().Bits() != 32 { - panic("BUG: invalid input type " + input.Type().String()) - } - constInst := inputDef.IsFromInstr() && inputDef.Instr.Constant() - switch { - case constInst && op == ssa.OpcodeSExtend: - return addend{regalloc.VRegInvalid, int64(uint32(inputDef.Instr.ConstantVal())), 0} - case constInst && op == ssa.OpcodeUExtend: - return addend{regalloc.VRegInvalid, int64(int32(inputDef.Instr.ConstantVal())), 0} // sign-extend! - default: - r := m.getOperand_Reg(inputDef) - return addend{r.reg(), 0, 0} - } - case ssa.OpcodeIshl: - // If the addend is a shift, we can only handle it if the shift amount is a constant. - x, amount := instr.Arg2() - amountDef := m.c.ValueDefinition(amount) - if amountDef.IsFromInstr() && amountDef.Instr.Constant() && amountDef.Instr.ConstantVal() <= 3 { - r := m.getOperand_Reg(m.c.ValueDefinition(x)) - return addend{r.reg(), 0, uint8(amountDef.Instr.ConstantVal())} - } - r := m.getOperand_Reg(m.c.ValueDefinition(x)) - return addend{r.reg(), 0, 0} - } - panic("BUG: invalid opcode") -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go deleted file mode 100644 index 7c27c92af..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go +++ /dev/null @@ -1,3729 +0,0 @@ -package amd64 - -import ( - "context" - "encoding/binary" - "fmt" - "math" - "strings" - - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" - "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" - "github.com/tetratelabs/wazero/internal/platform" -) - -// NewBackend returns a new backend for amd64.
-func NewBackend() backend.Machine { - m := &machine{ - cpuFeatures: platform.CpuFeatures, - regAlloc: regalloc.NewAllocator[*instruction, *labelPosition, *regAllocFn](regInfo), - spillSlots: map[regalloc.VRegID]int64{}, - amodePool: wazevoapi.NewPool[amode](nil), - labelPositionPool: wazevoapi.NewIDedPool[labelPosition](resetLabelPosition), - instrPool: wazevoapi.NewPool[instruction](resetInstruction), - constSwizzleMaskConstIndex: -1, - constSqmulRoundSatIndex: -1, - constI8x16SHLMaskTableIndex: -1, - constI8x16LogicalSHRMaskTableIndex: -1, - constF64x2CvtFromIMaskIndex: -1, - constTwop52Index: -1, - constI32sMaxOnF64x2Index: -1, - constI32uMaxOnF64x2Index: -1, - constAllOnesI8x16Index: -1, - constAllOnesI16x8Index: -1, - constExtAddPairwiseI16x8uMask1Index: -1, - constExtAddPairwiseI16x8uMask2Index: -1, - } - m.regAllocFn.m = m - return m -} - -type ( - // machine implements backend.Machine for amd64. - machine struct { - c backend.Compiler - stackBoundsCheckDisabled bool - - instrPool wazevoapi.Pool[instruction] - amodePool wazevoapi.Pool[amode] - - cpuFeatures platform.CpuFeatureFlags - - regAlloc regalloc.Allocator[*instruction, *labelPosition, *regAllocFn] - regAllocFn regAllocFn - regAllocStarted bool - - // labelPositionPool is the pool of labelPosition. The id is the label where - // if the label is less than the maxSSABlockID, it's the ssa.BasicBlockID. - labelPositionPool wazevoapi.IDedPool[labelPosition] - // nextLabel is the next label to be allocated. The first free label comes after maxSSABlockID - // so that we can have an identical label for the SSA block ID, which is useful for debugging. - nextLabel label - // rootInstr is the first instruction of the function. - rootInstr *instruction - // currentLabelPos is the currently-compiled ssa.BasicBlock's labelPosition. - currentLabelPos *labelPosition - // orderedSSABlockLabelPos is the ordered list of labelPosition in the generated code for each ssa.BasicBlock. - orderedSSABlockLabelPos []*labelPosition - // returnLabelPos is the labelPosition for the return block. - returnLabelPos labelPosition - // perBlockHead and perBlockEnd are the head and tail of the instruction list per currently-compiled ssa.BasicBlock. - perBlockHead, perBlockEnd *instruction - // pendingInstructions are the instructions which are not yet emitted into the instruction list. - pendingInstructions []*instruction - // maxSSABlockID is the maximum ssa.BasicBlockID in the current function. - maxSSABlockID label - - spillSlotSize int64 - spillSlots map[regalloc.VRegID]int64 - currentABI *backend.FunctionABI - clobberedRegs []regalloc.VReg - - maxRequiredStackSizeForCalls int64 - - labelResolutionPends []labelResolutionPend - - // jmpTableTargets holds the labels of the jump table targets. - jmpTableTargets [][]uint32 - // jmpTableTargetNext is the index to the jmpTableTargets slice to be used for the next jump table. - jmpTableTargetsNext int - consts []_const - - constSwizzleMaskConstIndex, constSqmulRoundSatIndex, - constI8x16SHLMaskTableIndex, constI8x16LogicalSHRMaskTableIndex, - constF64x2CvtFromIMaskIndex, constTwop52Index, - constI32sMaxOnF64x2Index, constI32uMaxOnF64x2Index, - constAllOnesI8x16Index, constAllOnesI16x8Index, - constExtAddPairwiseI16x8uMask1Index, constExtAddPairwiseI16x8uMask2Index int - } - - _const struct { - lo, hi uint64 - _var []byte - label label - labelPos *labelPosition - } - - labelResolutionPend struct { - instr *instruction - instrOffset int64 - // imm32Offset is the offset of the last 4 bytes of the instruction. 
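// Illustrative note (not from the original file): a near jump emitted as `E9 xx xx xx xx`
// (JMP rel32) leaves its last four bytes as a placeholder; imm32Offset records where those
// bytes start so the resolver can patch in rel32 = targetOffset - (imm32Offset + 4), the +4
// accounting for RIP pointing just past the immediate.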
- imm32Offset int64 - } -) - -type ( - // label represents a position in the generated code which is either - // a real instruction or the constant InstructionPool (e.g. jump tables). - // - // This is exactly the same as the traditional "label" in assembly code. - label uint32 - - // labelPosition represents the regions of the generated code which the label represents. - // This implements regalloc.Block. - labelPosition struct { - // sb is not nil if this corresponds to a ssa.BasicBlock. - sb ssa.BasicBlock - // cur is used to walk through the instructions in the block during the register allocation. - cur, - // begin and end are the first and last instructions of the block. - begin, end *instruction - // binaryOffset is the offset in the binary where the label is located. - binaryOffset int64 - } -) - -// String implements backend.Machine. -func (l label) String() string { - return fmt.Sprintf("L%d", l) -} - -func resetLabelPosition(l *labelPosition) { - *l = labelPosition{} -} - -const labelReturn = math.MaxUint32 - -func ssaBlockLabel(sb ssa.BasicBlock) label { - if sb.ReturnBlock() { - return labelReturn - } - return label(sb.ID()) -} - -// getOrAllocateSSABlockLabelPosition returns the labelPosition for the given basic block. -func (m *machine) getOrAllocateSSABlockLabelPosition(sb ssa.BasicBlock) *labelPosition { - if sb.ReturnBlock() { - m.returnLabelPos.sb = sb - return &m.returnLabelPos - } - - l := ssaBlockLabel(sb) - pos := m.labelPositionPool.GetOrAllocate(int(l)) - pos.sb = sb - return pos -} - -func (m *machine) getOrAllocateConstLabel(i *int, _var []byte) label { - index := *i - if index == -1 { - l, pos := m.allocateLabel() - index = len(m.consts) - m.consts = append(m.consts, _const{ - _var: _var, - label: l, - labelPos: pos, - }) - *i = index - } - return m.consts[index].label -} - -// Reset implements backend.Machine. -func (m *machine) Reset() { - m.consts = m.consts[:0] - m.clobberedRegs = m.clobberedRegs[:0] - for key := range m.spillSlots { - m.clobberedRegs = append(m.clobberedRegs, regalloc.VReg(key)) - } - for _, key := range m.clobberedRegs { - delete(m.spillSlots, regalloc.VRegID(key)) - } - - m.stackBoundsCheckDisabled = false - m.regAlloc.Reset() - m.labelPositionPool.Reset() - m.instrPool.Reset() - m.regAllocStarted = false - m.clobberedRegs = m.clobberedRegs[:0] - - m.spillSlotSize = 0 - m.maxRequiredStackSizeForCalls = 0 - m.perBlockHead, m.perBlockEnd, m.rootInstr = nil, nil, nil - m.pendingInstructions = m.pendingInstructions[:0] - m.orderedSSABlockLabelPos = m.orderedSSABlockLabelPos[:0] - - m.amodePool.Reset() - m.jmpTableTargetsNext = 0 - m.constSwizzleMaskConstIndex = -1 - m.constSqmulRoundSatIndex = -1 - m.constI8x16SHLMaskTableIndex = -1 - m.constI8x16LogicalSHRMaskTableIndex = -1 - m.constF64x2CvtFromIMaskIndex = -1 - m.constTwop52Index = -1 - m.constI32sMaxOnF64x2Index = -1 - m.constI32uMaxOnF64x2Index = -1 - m.constAllOnesI8x16Index = -1 - m.constAllOnesI16x8Index = -1 - m.constExtAddPairwiseI16x8uMask1Index = -1 - m.constExtAddPairwiseI16x8uMask2Index = -1 -} - -// StartLoweringFunction implements backend.Machine StartLoweringFunction. -func (m *machine) StartLoweringFunction(maxBlockID ssa.BasicBlockID) { - m.maxSSABlockID = label(maxBlockID) - m.nextLabel = label(maxBlockID) + 1 -} - -// LinkAdjacentBlocks implements backend.Machine. 
-func (m *machine) LinkAdjacentBlocks(prev, next ssa.BasicBlock) { - prevPos, nextPos := m.getOrAllocateSSABlockLabelPosition(prev), m.getOrAllocateSSABlockLabelPosition(next) - prevPos.end.next = nextPos.begin -} - -// StartBlock implements backend.Machine. -func (m *machine) StartBlock(blk ssa.BasicBlock) { - m.currentLabelPos = m.getOrAllocateSSABlockLabelPosition(blk) - labelPos := m.currentLabelPos - end := m.allocateNop() - m.perBlockHead, m.perBlockEnd = end, end - labelPos.begin, labelPos.end = end, end - m.orderedSSABlockLabelPos = append(m.orderedSSABlockLabelPos, labelPos) -} - -// EndBlock implements ExecutableContext. -func (m *machine) EndBlock() { - // Insert nop0 as the head of the block for convenience to simplify the logic of inserting instructions. - m.insertAtPerBlockHead(m.allocateNop()) - - m.currentLabelPos.begin = m.perBlockHead - - if m.currentLabelPos.sb.EntryBlock() { - m.rootInstr = m.perBlockHead - } -} - -func (m *machine) insertAtPerBlockHead(i *instruction) { - if m.perBlockHead == nil { - m.perBlockHead = i - m.perBlockEnd = i - return - } - - i.next = m.perBlockHead - m.perBlockHead.prev = i - m.perBlockHead = i -} - -// FlushPendingInstructions implements backend.Machine. -func (m *machine) FlushPendingInstructions() { - l := len(m.pendingInstructions) - if l == 0 { - return - } - for i := l - 1; i >= 0; i-- { // reverse because we lower instructions in reverse order. - m.insertAtPerBlockHead(m.pendingInstructions[i]) - } - m.pendingInstructions = m.pendingInstructions[:0] -} - -// DisableStackCheck implements backend.Machine. -func (m *machine) DisableStackCheck() { m.stackBoundsCheckDisabled = true } - -// SetCompiler implements backend.Machine. -func (m *machine) SetCompiler(c backend.Compiler) { - m.c = c - m.regAllocFn.ssaB = c.SSABuilder() -} - -// SetCurrentABI implements backend.Machine. -func (m *machine) SetCurrentABI(abi *backend.FunctionABI) { m.currentABI = abi } - -// RegAlloc implements backend.Machine. -func (m *machine) RegAlloc() { - rf := m.regAllocFn - m.regAllocStarted = true - m.regAlloc.DoAllocation(&rf) - // Now that we know the final spill slot size, we must align spillSlotSize to 16 bytes. - m.spillSlotSize = (m.spillSlotSize + 15) &^ 15 -} - -// InsertReturn implements backend.Machine. -func (m *machine) InsertReturn() { - i := m.allocateInstr().asRet() - m.insert(i) -} - -// LowerSingleBranch implements backend.Machine. 
-func (m *machine) LowerSingleBranch(b *ssa.Instruction) { - switch b.Opcode() { - case ssa.OpcodeJump: - _, _, targetBlkID := b.BranchData() - if b.IsFallthroughJump() { - return - } - jmp := m.allocateInstr() - target := ssaBlockLabel(m.c.SSABuilder().BasicBlock(targetBlkID)) - if target == labelReturn { - jmp.asRet() - } else { - jmp.asJmp(newOperandLabel(target)) - } - m.insert(jmp) - case ssa.OpcodeBrTable: - index, targetBlkIDs := b.BrTableData() - m.lowerBrTable(index, targetBlkIDs) - default: - panic("BUG: unexpected branch opcode" + b.Opcode().String()) - } -} - -func (m *machine) addJmpTableTarget(targets ssa.Values) (index int) { - if m.jmpTableTargetsNext == len(m.jmpTableTargets) { - m.jmpTableTargets = append(m.jmpTableTargets, make([]uint32, 0, len(targets.View()))) - } - - index = m.jmpTableTargetsNext - m.jmpTableTargetsNext++ - m.jmpTableTargets[index] = m.jmpTableTargets[index][:0] - for _, targetBlockID := range targets.View() { - target := m.c.SSABuilder().BasicBlock(ssa.BasicBlockID(targetBlockID)) - m.jmpTableTargets[index] = append(m.jmpTableTargets[index], uint32(ssaBlockLabel(target))) - } - return -} - -var condBranchMatches = [...]ssa.Opcode{ssa.OpcodeIcmp, ssa.OpcodeFcmp} - -func (m *machine) lowerBrTable(index ssa.Value, targets ssa.Values) { - _v := m.getOperand_Reg(m.c.ValueDefinition(index)) - v := m.copyToTmp(_v.reg()) - - targetCount := len(targets.View()) - - // First, we need to do the bounds check. - maxIndex := m.c.AllocateVReg(ssa.TypeI32) - m.lowerIconst(maxIndex, uint64(targetCount-1), false) - cmp := m.allocateInstr().asCmpRmiR(true, newOperandReg(maxIndex), v, false) - m.insert(cmp) - - // Then do the conditional move maxIndex to v if v > maxIndex. - cmov := m.allocateInstr().asCmove(condNB, newOperandReg(maxIndex), v, false) - m.insert(cmov) - - // Now that v has the correct index. Load the address of the jump table into the addr. - addr := m.c.AllocateVReg(ssa.TypeI64) - leaJmpTableAddr := m.allocateInstr() - m.insert(leaJmpTableAddr) - - // Then add the target's offset into jmpTableAddr. - loadTargetOffsetFromJmpTable := m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, - // Shift by 3 because each entry is 8 bytes. - newOperandMem(m.newAmodeRegRegShift(0, addr, v, 3)), addr, true) - m.insert(loadTargetOffsetFromJmpTable) - - // Now ready to jump. - jmp := m.allocateInstr().asJmp(newOperandReg(addr)) - m.insert(jmp) - - jmpTableBegin, jmpTableBeginLabel := m.allocateBrTarget() - m.insert(jmpTableBegin) - leaJmpTableAddr.asLEA(newOperandLabel(jmpTableBeginLabel), addr) - - jmpTable := m.allocateInstr() - targetSliceIndex := m.addJmpTableTarget(targets) - jmpTable.asJmpTableSequence(targetSliceIndex, targetCount) - m.insert(jmpTable) -} - -// LowerConditionalBranch implements backend.Machine. -func (m *machine) LowerConditionalBranch(b *ssa.Instruction) { - cval, args, targetBlkID := b.BranchData() - if len(args) > 0 { - panic(fmt.Sprintf( - "conditional branch shouldn't have args; likely a bug in critical edge splitting: from %s to %s", - m.currentLabelPos.sb, - targetBlkID, - )) - } - - target := ssaBlockLabel(m.c.SSABuilder().BasicBlock(targetBlkID)) - cvalDef := m.c.ValueDefinition(cval) - - switch m.c.MatchInstrOneOf(cvalDef, condBranchMatches[:]) { - case ssa.OpcodeIcmp: - cvalInstr := cvalDef.Instr - x, y, c := cvalInstr.IcmpData() - - cc := condFromSSAIntCmpCond(c) - if b.Opcode() == ssa.OpcodeBrz { - cc = cc.invert() - } - - // First, perform the comparison and set the flag. 
- xd, yd := m.c.ValueDefinition(x), m.c.ValueDefinition(y) - if !m.tryLowerBandToFlag(xd, yd) { - m.lowerIcmpToFlag(xd, yd, x.Type() == ssa.TypeI64) - } - - // Then perform the conditional branch. - m.insert(m.allocateInstr().asJmpIf(cc, newOperandLabel(target))) - cvalDef.Instr.MarkLowered() - case ssa.OpcodeFcmp: - cvalInstr := cvalDef.Instr - - f1, f2, and := m.lowerFcmpToFlags(cvalInstr) - isBrz := b.Opcode() == ssa.OpcodeBrz - if isBrz { - f1 = f1.invert() - } - if f2 == condInvalid { - m.insert(m.allocateInstr().asJmpIf(f1, newOperandLabel(target))) - } else { - if isBrz { - f2 = f2.invert() - and = !and - } - jmp1, jmp2 := m.allocateInstr(), m.allocateInstr() - m.insert(jmp1) - m.insert(jmp2) - notTaken, notTakenLabel := m.allocateBrTarget() - m.insert(notTaken) - if and { - jmp1.asJmpIf(f1.invert(), newOperandLabel(notTakenLabel)) - jmp2.asJmpIf(f2, newOperandLabel(target)) - } else { - jmp1.asJmpIf(f1, newOperandLabel(target)) - jmp2.asJmpIf(f2, newOperandLabel(target)) - } - } - - cvalDef.Instr.MarkLowered() - default: - v := m.getOperand_Reg(cvalDef) - - var cc cond - if b.Opcode() == ssa.OpcodeBrz { - cc = condZ - } else { - cc = condNZ - } - - // Perform test %v, %v to set the flag. - cmp := m.allocateInstr().asCmpRmiR(false, v, v.reg(), false) - m.insert(cmp) - m.insert(m.allocateInstr().asJmpIf(cc, newOperandLabel(target))) - } -} - -// LowerInstr implements backend.Machine. -func (m *machine) LowerInstr(instr *ssa.Instruction) { - if l := instr.SourceOffset(); l.Valid() { - info := m.allocateInstr().asEmitSourceOffsetInfo(l) - m.insert(info) - } - - switch op := instr.Opcode(); op { - case ssa.OpcodeBrz, ssa.OpcodeBrnz, ssa.OpcodeJump, ssa.OpcodeBrTable: - panic("BUG: branching instructions are handled by LowerBranches") - case ssa.OpcodeReturn: - panic("BUG: return must be handled by backend.Compiler") - case ssa.OpcodeIconst, ssa.OpcodeF32const, ssa.OpcodeF64const: // Constant instructions are inlined. 
- case ssa.OpcodeCall, ssa.OpcodeCallIndirect: - m.lowerCall(instr) - case ssa.OpcodeStore, ssa.OpcodeIstore8, ssa.OpcodeIstore16, ssa.OpcodeIstore32: - m.lowerStore(instr) - case ssa.OpcodeIadd: - m.lowerAluRmiROp(instr, aluRmiROpcodeAdd) - case ssa.OpcodeIsub: - m.lowerAluRmiROp(instr, aluRmiROpcodeSub) - case ssa.OpcodeImul: - m.lowerAluRmiROp(instr, aluRmiROpcodeMul) - case ssa.OpcodeSdiv, ssa.OpcodeUdiv, ssa.OpcodeSrem, ssa.OpcodeUrem: - isDiv := op == ssa.OpcodeSdiv || op == ssa.OpcodeUdiv - isSigned := op == ssa.OpcodeSdiv || op == ssa.OpcodeSrem - m.lowerIDivRem(instr, isDiv, isSigned) - case ssa.OpcodeBand: - m.lowerAluRmiROp(instr, aluRmiROpcodeAnd) - case ssa.OpcodeBor: - m.lowerAluRmiROp(instr, aluRmiROpcodeOr) - case ssa.OpcodeBxor: - m.lowerAluRmiROp(instr, aluRmiROpcodeXor) - case ssa.OpcodeIshl: - m.lowerShiftR(instr, shiftROpShiftLeft) - case ssa.OpcodeSshr: - m.lowerShiftR(instr, shiftROpShiftRightArithmetic) - case ssa.OpcodeUshr: - m.lowerShiftR(instr, shiftROpShiftRightLogical) - case ssa.OpcodeRotl: - m.lowerShiftR(instr, shiftROpRotateLeft) - case ssa.OpcodeRotr: - m.lowerShiftR(instr, shiftROpRotateRight) - case ssa.OpcodeClz: - m.lowerClz(instr) - case ssa.OpcodeCtz: - m.lowerCtz(instr) - case ssa.OpcodePopcnt: - m.lowerUnaryRmR(instr, unaryRmROpcodePopcnt) - case ssa.OpcodeFadd, ssa.OpcodeFsub, ssa.OpcodeFmul, ssa.OpcodeFdiv: - m.lowerXmmRmR(instr) - case ssa.OpcodeFabs: - m.lowerFabsFneg(instr) - case ssa.OpcodeFneg: - m.lowerFabsFneg(instr) - case ssa.OpcodeCeil: - m.lowerRound(instr, roundingModeUp) - case ssa.OpcodeFloor: - m.lowerRound(instr, roundingModeDown) - case ssa.OpcodeTrunc: - m.lowerRound(instr, roundingModeZero) - case ssa.OpcodeNearest: - m.lowerRound(instr, roundingModeNearest) - case ssa.OpcodeFmin, ssa.OpcodeFmax: - m.lowerFminFmax(instr) - case ssa.OpcodeFcopysign: - m.lowerFcopysign(instr) - case ssa.OpcodeBitcast: - m.lowerBitcast(instr) - case ssa.OpcodeSqrt: - m.lowerSqrt(instr) - case ssa.OpcodeFpromote: - v := instr.Arg() - rn := m.getOperand_Reg(m.c.ValueDefinition(v)) - rd := m.c.VRegOf(instr.Return()) - cnt := m.allocateInstr() - cnt.asXmmUnaryRmR(sseOpcodeCvtss2sd, rn, rd) - m.insert(cnt) - case ssa.OpcodeFdemote: - v := instr.Arg() - rn := m.getOperand_Reg(m.c.ValueDefinition(v)) - rd := m.c.VRegOf(instr.Return()) - cnt := m.allocateInstr() - cnt.asXmmUnaryRmR(sseOpcodeCvtsd2ss, rn, rd) - m.insert(cnt) - case ssa.OpcodeFcvtToSint, ssa.OpcodeFcvtToSintSat: - x, ctx := instr.Arg2() - rn := m.getOperand_Reg(m.c.ValueDefinition(x)) - rd := m.c.VRegOf(instr.Return()) - ctxVReg := m.c.VRegOf(ctx) - m.lowerFcvtToSint(ctxVReg, rn.reg(), rd, x.Type() == ssa.TypeF64, - instr.Return().Type().Bits() == 64, op == ssa.OpcodeFcvtToSintSat) - case ssa.OpcodeFcvtToUint, ssa.OpcodeFcvtToUintSat: - x, ctx := instr.Arg2() - rn := m.getOperand_Reg(m.c.ValueDefinition(x)) - rd := m.c.VRegOf(instr.Return()) - ctxVReg := m.c.VRegOf(ctx) - m.lowerFcvtToUint(ctxVReg, rn.reg(), rd, x.Type() == ssa.TypeF64, - instr.Return().Type().Bits() == 64, op == ssa.OpcodeFcvtToUintSat) - case ssa.OpcodeFcvtFromSint: - x := instr.Arg() - rn := m.getOperand_Reg(m.c.ValueDefinition(x)) - rd := newOperandReg(m.c.VRegOf(instr.Return())) - m.lowerFcvtFromSint(rn, rd, - x.Type() == ssa.TypeI64, instr.Return().Type().Bits() == 64) - case ssa.OpcodeFcvtFromUint: - x := instr.Arg() - rn := m.getOperand_Reg(m.c.ValueDefinition(x)) - rd := newOperandReg(m.c.VRegOf(instr.Return())) - m.lowerFcvtFromUint(rn, rd, x.Type() == ssa.TypeI64, - instr.Return().Type().Bits() == 64) - case 
ssa.OpcodeVanyTrue: - m.lowerVanyTrue(instr) - case ssa.OpcodeVallTrue: - m.lowerVallTrue(instr) - case ssa.OpcodeVhighBits: - m.lowerVhighBits(instr) - case ssa.OpcodeVbnot: - m.lowerVbnot(instr) - case ssa.OpcodeVband: - x, y := instr.Arg2() - m.lowerVbBinOp(sseOpcodePand, x, y, instr.Return()) - case ssa.OpcodeVbor: - x, y := instr.Arg2() - m.lowerVbBinOp(sseOpcodePor, x, y, instr.Return()) - case ssa.OpcodeVbxor: - x, y := instr.Arg2() - m.lowerVbBinOp(sseOpcodePxor, x, y, instr.Return()) - case ssa.OpcodeVbandnot: - m.lowerVbandnot(instr, sseOpcodePandn) - case ssa.OpcodeVbitselect: - m.lowerVbitselect(instr) - case ssa.OpcodeVIadd: - x, y, lane := instr.Arg2WithLane() - var vecOp sseOpcode - switch lane { - case ssa.VecLaneI8x16: - vecOp = sseOpcodePaddb - case ssa.VecLaneI16x8: - vecOp = sseOpcodePaddw - case ssa.VecLaneI32x4: - vecOp = sseOpcodePaddd - case ssa.VecLaneI64x2: - vecOp = sseOpcodePaddq - } - m.lowerVbBinOp(vecOp, x, y, instr.Return()) - - case ssa.OpcodeVSaddSat: - x, y, lane := instr.Arg2WithLane() - var vecOp sseOpcode - switch lane { - case ssa.VecLaneI8x16: - vecOp = sseOpcodePaddsb - case ssa.VecLaneI16x8: - vecOp = sseOpcodePaddsw - } - m.lowerVbBinOp(vecOp, x, y, instr.Return()) - - case ssa.OpcodeVUaddSat: - x, y, lane := instr.Arg2WithLane() - var vecOp sseOpcode - switch lane { - case ssa.VecLaneI8x16: - vecOp = sseOpcodePaddusb - case ssa.VecLaneI16x8: - vecOp = sseOpcodePaddusw - } - m.lowerVbBinOp(vecOp, x, y, instr.Return()) - - case ssa.OpcodeVIsub: - x, y, lane := instr.Arg2WithLane() - var vecOp sseOpcode - switch lane { - case ssa.VecLaneI8x16: - vecOp = sseOpcodePsubb - case ssa.VecLaneI16x8: - vecOp = sseOpcodePsubw - case ssa.VecLaneI32x4: - vecOp = sseOpcodePsubd - case ssa.VecLaneI64x2: - vecOp = sseOpcodePsubq - } - m.lowerVbBinOp(vecOp, x, y, instr.Return()) - - case ssa.OpcodeVSsubSat: - x, y, lane := instr.Arg2WithLane() - var vecOp sseOpcode - switch lane { - case ssa.VecLaneI8x16: - vecOp = sseOpcodePsubsb - case ssa.VecLaneI16x8: - vecOp = sseOpcodePsubsw - } - m.lowerVbBinOp(vecOp, x, y, instr.Return()) - - case ssa.OpcodeVUsubSat: - x, y, lane := instr.Arg2WithLane() - var vecOp sseOpcode - switch lane { - case ssa.VecLaneI8x16: - vecOp = sseOpcodePsubusb - case ssa.VecLaneI16x8: - vecOp = sseOpcodePsubusw - } - m.lowerVbBinOp(vecOp, x, y, instr.Return()) - - case ssa.OpcodeVImul: - m.lowerVImul(instr) - case ssa.OpcodeVIneg: - x, lane := instr.ArgWithLane() - rn := m.getOperand_Reg(m.c.ValueDefinition(x)) - rd := m.c.VRegOf(instr.Return()) - var vecOp sseOpcode - switch lane { - case ssa.VecLaneI8x16: - vecOp = sseOpcodePsubb - case ssa.VecLaneI16x8: - vecOp = sseOpcodePsubw - case ssa.VecLaneI32x4: - vecOp = sseOpcodePsubd - case ssa.VecLaneI64x2: - vecOp = sseOpcodePsubq - default: - panic("BUG") - } - - tmp := m.c.AllocateVReg(ssa.TypeV128) - m.insert(m.allocateInstr().asZeros(tmp)) - - i := m.allocateInstr() - i.asXmmRmR(vecOp, rn, tmp) - m.insert(i) - - m.copyTo(tmp, rd) - case ssa.OpcodeVFadd: - x, y, lane := instr.Arg2WithLane() - var vecOp sseOpcode - switch lane { - case ssa.VecLaneF32x4: - vecOp = sseOpcodeAddps - case ssa.VecLaneF64x2: - vecOp = sseOpcodeAddpd - } - m.lowerVbBinOp(vecOp, x, y, instr.Return()) - - case ssa.OpcodeVFsub: - x, y, lane := instr.Arg2WithLane() - var vecOp sseOpcode - switch lane { - case ssa.VecLaneF32x4: - vecOp = sseOpcodeSubps - case ssa.VecLaneF64x2: - vecOp = sseOpcodeSubpd - } - m.lowerVbBinOp(vecOp, x, y, instr.Return()) - - case ssa.OpcodeVFdiv: - x, y, lane := instr.Arg2WithLane() - var 
vecOp sseOpcode - switch lane { - case ssa.VecLaneF32x4: - vecOp = sseOpcodeDivps - case ssa.VecLaneF64x2: - vecOp = sseOpcodeDivpd - } - m.lowerVbBinOp(vecOp, x, y, instr.Return()) - - case ssa.OpcodeVFmul: - x, y, lane := instr.Arg2WithLane() - var vecOp sseOpcode - switch lane { - case ssa.VecLaneF32x4: - vecOp = sseOpcodeMulps - case ssa.VecLaneF64x2: - vecOp = sseOpcodeMulpd - } - m.lowerVbBinOp(vecOp, x, y, instr.Return()) - - case ssa.OpcodeVFneg: - x, lane := instr.ArgWithLane() - rn := m.getOperand_Reg(m.c.ValueDefinition(x)) - rd := m.c.VRegOf(instr.Return()) - - tmp := m.c.AllocateVReg(ssa.TypeV128) - - var shiftOp, xorOp sseOpcode - var shiftAmt uint32 - switch lane { - case ssa.VecLaneF32x4: - shiftOp, shiftAmt, xorOp = sseOpcodePslld, 31, sseOpcodeXorps - case ssa.VecLaneF64x2: - shiftOp, shiftAmt, xorOp = sseOpcodePsllq, 63, sseOpcodeXorpd - } - - zero := m.allocateInstr() - zero.asZeros(tmp) - m.insert(zero) - - // Set all bits on tmp by CMPPD with arg=0 (== pseudo CMPEQPD instruction). - // See https://www.felixcloutier.com/x86/cmpps - // - // Note: if we do not clear all the bits ^ with XORPS, this might end up not setting ones on some lane - // if the lane is NaN. - cmp := m.allocateInstr() - cmp.asXmmRmRImm(sseOpcodeCmppd, uint8(cmpPredEQ_UQ), newOperandReg(tmp), tmp) - m.insert(cmp) - - // Do the left shift on each lane to set only the most significant bit in each. - i := m.allocateInstr() - i.asXmmRmiReg(shiftOp, newOperandImm32(shiftAmt), tmp) - m.insert(i) - - // Get the negated result by XOR on each lane with tmp. - i = m.allocateInstr() - i.asXmmRmR(xorOp, rn, tmp) - m.insert(i) - - m.copyTo(tmp, rd) - - case ssa.OpcodeVSqrt: - x, lane := instr.ArgWithLane() - rn := m.getOperand_Reg(m.c.ValueDefinition(x)) - rd := m.c.VRegOf(instr.Return()) - - var vecOp sseOpcode - switch lane { - case ssa.VecLaneF32x4: - vecOp = sseOpcodeSqrtps - case ssa.VecLaneF64x2: - vecOp = sseOpcodeSqrtpd - } - i := m.allocateInstr() - i.asXmmUnaryRmR(vecOp, rn, rd) - m.insert(i) - - case ssa.OpcodeVImin: - x, y, lane := instr.Arg2WithLane() - var vecOp sseOpcode - switch lane { - case ssa.VecLaneI8x16: - vecOp = sseOpcodePminsb - case ssa.VecLaneI16x8: - vecOp = sseOpcodePminsw - case ssa.VecLaneI32x4: - vecOp = sseOpcodePminsd - } - m.lowerVbBinOp(vecOp, x, y, instr.Return()) - - case ssa.OpcodeVUmin: - x, y, lane := instr.Arg2WithLane() - var vecOp sseOpcode - switch lane { - case ssa.VecLaneI8x16: - vecOp = sseOpcodePminub - case ssa.VecLaneI16x8: - vecOp = sseOpcodePminuw - case ssa.VecLaneI32x4: - vecOp = sseOpcodePminud - } - m.lowerVbBinOp(vecOp, x, y, instr.Return()) - - case ssa.OpcodeVImax: - x, y, lane := instr.Arg2WithLane() - var vecOp sseOpcode - switch lane { - case ssa.VecLaneI8x16: - vecOp = sseOpcodePmaxsb - case ssa.VecLaneI16x8: - vecOp = sseOpcodePmaxsw - case ssa.VecLaneI32x4: - vecOp = sseOpcodePmaxsd - } - m.lowerVbBinOp(vecOp, x, y, instr.Return()) - - case ssa.OpcodeVUmax: - x, y, lane := instr.Arg2WithLane() - var vecOp sseOpcode - switch lane { - case ssa.VecLaneI8x16: - vecOp = sseOpcodePmaxub - case ssa.VecLaneI16x8: - vecOp = sseOpcodePmaxuw - case ssa.VecLaneI32x4: - vecOp = sseOpcodePmaxud - } - m.lowerVbBinOp(vecOp, x, y, instr.Return()) - - case ssa.OpcodeVAvgRound: - x, y, lane := instr.Arg2WithLane() - var vecOp sseOpcode - switch lane { - case ssa.VecLaneI8x16: - vecOp = sseOpcodePavgb - case ssa.VecLaneI16x8: - vecOp = sseOpcodePavgw - } - m.lowerVbBinOp(vecOp, x, y, instr.Return()) - - case ssa.OpcodeVIcmp: - x, y, c, lane := instr.VIcmpData() - 
m.lowerVIcmp(x, y, c, instr.Return(), lane) - - case ssa.OpcodeVFcmp: - x, y, c, lane := instr.VFcmpData() - m.lowerVFcmp(x, y, c, instr.Return(), lane) - - case ssa.OpcodeExtractlane: - x, index, signed, lane := instr.ExtractlaneData() - m.lowerExtractLane(x, index, signed, instr.Return(), lane) - - case ssa.OpcodeInsertlane: - x, y, index, lane := instr.InsertlaneData() - m.lowerInsertLane(x, y, index, instr.Return(), lane) - - case ssa.OpcodeSwizzle: - x, y, _ := instr.Arg2WithLane() - m.lowerSwizzle(x, y, instr.Return()) - - case ssa.OpcodeShuffle: - x, y, lo, hi := instr.ShuffleData() - m.lowerShuffle(x, y, lo, hi, instr.Return()) - - case ssa.OpcodeSplat: - x, lane := instr.ArgWithLane() - m.lowerSplat(x, instr.Return(), lane) - - case ssa.OpcodeSqmulRoundSat: - x, y := instr.Arg2() - m.lowerSqmulRoundSat(x, y, instr.Return()) - - case ssa.OpcodeVZeroExtLoad: - ptr, offset, typ := instr.VZeroExtLoadData() - var sseOp sseOpcode - // Both movss and movsd clears the higher bits of the destination register upt 128 bits. - // https://www.felixcloutier.com/x86/movss - // https://www.felixcloutier.com/x86/movsd - if typ == ssa.TypeF32 { - sseOp = sseOpcodeMovss - } else { - sseOp = sseOpcodeMovsd - } - mem := m.lowerToAddressMode(ptr, offset) - dst := m.c.VRegOf(instr.Return()) - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, newOperandMem(mem), dst)) - - case ssa.OpcodeVMinPseudo: - x, y, lane := instr.Arg2WithLane() - var vecOp sseOpcode - switch lane { - case ssa.VecLaneF32x4: - vecOp = sseOpcodeMinps - case ssa.VecLaneF64x2: - vecOp = sseOpcodeMinpd - default: - panic("BUG: unexpected lane type") - } - m.lowerVbBinOpUnaligned(vecOp, y, x, instr.Return()) - - case ssa.OpcodeVMaxPseudo: - x, y, lane := instr.Arg2WithLane() - var vecOp sseOpcode - switch lane { - case ssa.VecLaneF32x4: - vecOp = sseOpcodeMaxps - case ssa.VecLaneF64x2: - vecOp = sseOpcodeMaxpd - default: - panic("BUG: unexpected lane type") - } - m.lowerVbBinOpUnaligned(vecOp, y, x, instr.Return()) - - case ssa.OpcodeVIshl: - x, y, lane := instr.Arg2WithLane() - m.lowerVIshl(x, y, instr.Return(), lane) - - case ssa.OpcodeVSshr: - x, y, lane := instr.Arg2WithLane() - m.lowerVSshr(x, y, instr.Return(), lane) - - case ssa.OpcodeVUshr: - x, y, lane := instr.Arg2WithLane() - m.lowerVUshr(x, y, instr.Return(), lane) - - case ssa.OpcodeVCeil: - x, lane := instr.ArgWithLane() - m.lowerVRound(x, instr.Return(), 0x2, lane == ssa.VecLaneF64x2) - - case ssa.OpcodeVFloor: - x, lane := instr.ArgWithLane() - m.lowerVRound(x, instr.Return(), 0x1, lane == ssa.VecLaneF64x2) - - case ssa.OpcodeVTrunc: - x, lane := instr.ArgWithLane() - m.lowerVRound(x, instr.Return(), 0x3, lane == ssa.VecLaneF64x2) - - case ssa.OpcodeVNearest: - x, lane := instr.ArgWithLane() - m.lowerVRound(x, instr.Return(), 0x0, lane == ssa.VecLaneF64x2) - - case ssa.OpcodeExtIaddPairwise: - x, lane, signed := instr.ExtIaddPairwiseData() - m.lowerExtIaddPairwise(x, instr.Return(), lane, signed) - - case ssa.OpcodeUwidenLow, ssa.OpcodeSwidenLow: - x, lane := instr.ArgWithLane() - m.lowerWidenLow(x, instr.Return(), lane, op == ssa.OpcodeSwidenLow) - - case ssa.OpcodeUwidenHigh, ssa.OpcodeSwidenHigh: - x, lane := instr.ArgWithLane() - m.lowerWidenHigh(x, instr.Return(), lane, op == ssa.OpcodeSwidenHigh) - - case ssa.OpcodeLoadSplat: - ptr, offset, lane := instr.LoadSplatData() - m.lowerLoadSplat(ptr, offset, instr.Return(), lane) - - case ssa.OpcodeVFcvtFromUint, ssa.OpcodeVFcvtFromSint: - x, lane := instr.ArgWithLane() - m.lowerVFcvtFromInt(x, instr.Return(), lane, op == 
ssa.OpcodeVFcvtFromSint) - - case ssa.OpcodeVFcvtToSintSat, ssa.OpcodeVFcvtToUintSat: - x, lane := instr.ArgWithLane() - m.lowerVFcvtToIntSat(x, instr.Return(), lane, op == ssa.OpcodeVFcvtToSintSat) - - case ssa.OpcodeSnarrow, ssa.OpcodeUnarrow: - x, y, lane := instr.Arg2WithLane() - m.lowerNarrow(x, y, instr.Return(), lane, op == ssa.OpcodeSnarrow) - - case ssa.OpcodeFvpromoteLow: - x := instr.Arg() - src := m.getOperand_Reg(m.c.ValueDefinition(x)) - dst := m.c.VRegOf(instr.Return()) - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtps2pd, src, dst)) - - case ssa.OpcodeFvdemote: - x := instr.Arg() - src := m.getOperand_Reg(m.c.ValueDefinition(x)) - dst := m.c.VRegOf(instr.Return()) - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtpd2ps, src, dst)) - - case ssa.OpcodeWideningPairwiseDotProductS: - x, y := instr.Arg2() - m.lowerWideningPairwiseDotProductS(x, y, instr.Return()) - - case ssa.OpcodeVIabs: - m.lowerVIabs(instr) - case ssa.OpcodeVIpopcnt: - m.lowerVIpopcnt(instr) - case ssa.OpcodeVFmin: - m.lowerVFmin(instr) - case ssa.OpcodeVFmax: - m.lowerVFmax(instr) - case ssa.OpcodeVFabs: - m.lowerVFabs(instr) - case ssa.OpcodeUndefined: - m.insert(m.allocateInstr().asUD2()) - case ssa.OpcodeExitWithCode: - execCtx, code := instr.ExitWithCodeData() - m.lowerExitWithCode(m.c.VRegOf(execCtx), code) - case ssa.OpcodeExitIfTrueWithCode: - execCtx, c, code := instr.ExitIfTrueWithCodeData() - m.lowerExitIfTrueWithCode(m.c.VRegOf(execCtx), c, code) - case ssa.OpcodeLoad: - ptr, offset, typ := instr.LoadData() - dst := m.c.VRegOf(instr.Return()) - m.lowerLoad(ptr, offset, typ, dst) - case ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32, ssa.OpcodeSload8, ssa.OpcodeSload16, ssa.OpcodeSload32: - ptr, offset, _ := instr.LoadData() - ret := m.c.VRegOf(instr.Return()) - m.lowerExtLoad(op, ptr, offset, ret) - case ssa.OpcodeVconst: - result := m.c.VRegOf(instr.Return()) - lo, hi := instr.VconstData() - m.lowerVconst(result, lo, hi) - case ssa.OpcodeSExtend, ssa.OpcodeUExtend: - from, to, signed := instr.ExtendData() - m.lowerExtend(instr.Arg(), instr.Return(), from, to, signed) - case ssa.OpcodeIcmp: - m.lowerIcmp(instr) - case ssa.OpcodeFcmp: - m.lowerFcmp(instr) - case ssa.OpcodeSelect: - cval, x, y := instr.SelectData() - m.lowerSelect(x, y, cval, instr.Return()) - case ssa.OpcodeIreduce: - rn := m.getOperand_Mem_Reg(m.c.ValueDefinition(instr.Arg())) - retVal := instr.Return() - rd := m.c.VRegOf(retVal) - - if retVal.Type() != ssa.TypeI32 { - panic("TODO?: Ireduce to non-i32") - } - m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, rn, rd)) - - case ssa.OpcodeAtomicLoad: - ptr := instr.Arg() - size := instr.AtomicTargetSize() - dst := m.c.VRegOf(instr.Return()) - - // At this point, the ptr is ensured to be aligned, so using a normal load is atomic. 
- // https://github.com/golang/go/blob/adead1a93f472affa97c494ef19f2f492ee6f34a/src/runtime/internal/atomic/atomic_amd64.go#L30 - mem := newOperandMem(m.lowerToAddressMode(ptr, 0)) - load := m.allocateInstr() - switch size { - case 8: - load.asMov64MR(mem, dst) - case 4: - load.asMovzxRmR(extModeLQ, mem, dst) - case 2: - load.asMovzxRmR(extModeWQ, mem, dst) - case 1: - load.asMovzxRmR(extModeBQ, mem, dst) - default: - panic("BUG") - } - m.insert(load) - - case ssa.OpcodeFence: - m.insert(m.allocateInstr().asMFence()) - - case ssa.OpcodeAtomicStore: - ptr, _val := instr.Arg2() - size := instr.AtomicTargetSize() - - val := m.getOperand_Reg(m.c.ValueDefinition(_val)) - // The content on the val register will be overwritten by xchg, so we need to copy it to a temporary register. - copied := m.copyToTmp(val.reg()) - - mem := newOperandMem(m.lowerToAddressMode(ptr, 0)) - store := m.allocateInstr().asXCHG(copied, mem, byte(size)) - m.insert(store) - - case ssa.OpcodeAtomicCas: - addr, exp, repl := instr.Arg3() - size := instr.AtomicTargetSize() - m.lowerAtomicCas(addr, exp, repl, size, instr.Return()) - - case ssa.OpcodeAtomicRmw: - addr, val := instr.Arg2() - atomicOp, size := instr.AtomicRmwData() - m.lowerAtomicRmw(atomicOp, addr, val, size, instr.Return()) - - default: - panic("TODO: lowering " + op.String()) - } -} - -func (m *machine) lowerAtomicRmw(op ssa.AtomicRmwOp, addr, val ssa.Value, size uint64, ret ssa.Value) { - mem := m.lowerToAddressMode(addr, 0) - _val := m.getOperand_Reg(m.c.ValueDefinition(val)) - - switch op { - case ssa.AtomicRmwOpAdd, ssa.AtomicRmwOpSub: - valCopied := m.copyToTmp(_val.reg()) - if op == ssa.AtomicRmwOpSub { - // Negate the value. - m.insert(m.allocateInstr().asNeg(newOperandReg(valCopied), true)) - } - m.insert(m.allocateInstr().asLockXAdd(valCopied, mem, byte(size))) - m.clearHigherBitsForAtomic(valCopied, size, ret.Type()) - m.copyTo(valCopied, m.c.VRegOf(ret)) - - case ssa.AtomicRmwOpAnd, ssa.AtomicRmwOpOr, ssa.AtomicRmwOpXor: - accumulator := raxVReg - // Reserve rax for the accumulator to make regalloc happy. - // Note: do this initialization before defining valCopied, because it might be the same register and - // if that happens, the unnecessary load/store will be performed inside the loop. - // This can be mitigated in any way once the register allocator is clever enough. - m.insert(m.allocateInstr().asDefineUninitializedReg(accumulator)) - - // Copy the value to a temporary register. - valCopied := m.copyToTmp(_val.reg()) - m.clearHigherBitsForAtomic(valCopied, size, ret.Type()) - - memOp := newOperandMem(mem) - tmp := m.c.AllocateVReg(ssa.TypeI64) - beginLoop, beginLoopLabel := m.allocateBrTarget() - { - m.insert(beginLoop) - // Reset the value on tmp by the original value. - m.copyTo(valCopied, tmp) - // Load the current value at the memory location into accumulator. - switch size { - case 1: - m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, memOp, accumulator)) - case 2: - m.insert(m.allocateInstr().asMovzxRmR(extModeWQ, memOp, accumulator)) - case 4: - m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, memOp, accumulator)) - case 8: - m.insert(m.allocateInstr().asMov64MR(memOp, accumulator)) - default: - panic("BUG") - } - // Then perform the logical operation on the accumulator and the value on tmp. 
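// Illustrative sketch (not from the original file) of the retry loop being built here:
//
//   loop:
//     tmp  = val                 ; fresh copy of the operand
//     rax  = [mem]               ; current memory value
//     tmp  = rax AND/OR/XOR tmp  ; the operation selected below
//     LOCK CMPXCHG [mem], tmp    ; store tmp only if [mem] still equals rax
//     JNZ  loop                  ; ZF clear => another writer raced us, retry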
- switch op { - case ssa.AtomicRmwOpAnd: - m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, newOperandReg(accumulator), tmp, true)) - case ssa.AtomicRmwOpOr: - m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeOr, newOperandReg(accumulator), tmp, true)) - case ssa.AtomicRmwOpXor: - m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeXor, newOperandReg(accumulator), tmp, true)) - default: - panic("BUG") - } - // Finally, try compare-exchange the value at the memory location with the tmp. - m.insert(m.allocateInstr().asLockCmpXCHG(tmp, memOp.addressMode(), byte(size))) - // If it succeeds, ZF will be set, and we can break the loop. - m.insert(m.allocateInstr().asJmpIf(condNZ, newOperandLabel(beginLoopLabel))) - } - - // valCopied must be alive at the end of the loop. - m.insert(m.allocateInstr().asNopUseReg(valCopied)) - - // At this point, accumulator contains the result. - m.clearHigherBitsForAtomic(accumulator, size, ret.Type()) - m.copyTo(accumulator, m.c.VRegOf(ret)) - - case ssa.AtomicRmwOpXchg: - valCopied := m.copyToTmp(_val.reg()) - - m.insert(m.allocateInstr().asXCHG(valCopied, newOperandMem(mem), byte(size))) - m.clearHigherBitsForAtomic(valCopied, size, ret.Type()) - m.copyTo(valCopied, m.c.VRegOf(ret)) - - default: - panic("BUG") - } -} - -func (m *machine) lowerAtomicCas(addr, exp, repl ssa.Value, size uint64, ret ssa.Value) { - mem := m.lowerToAddressMode(addr, 0) - expOp := m.getOperand_Reg(m.c.ValueDefinition(exp)) - replOp := m.getOperand_Reg(m.c.ValueDefinition(repl)) - - accumulator := raxVReg - m.copyTo(expOp.reg(), accumulator) - m.insert(m.allocateInstr().asLockCmpXCHG(replOp.reg(), mem, byte(size))) - m.clearHigherBitsForAtomic(accumulator, size, ret.Type()) - m.copyTo(accumulator, m.c.VRegOf(ret)) -} - -func (m *machine) clearHigherBitsForAtomic(r regalloc.VReg, valSize uint64, resultType ssa.Type) { - switch resultType { - case ssa.TypeI32: - switch valSize { - case 1: - m.insert(m.allocateInstr().asMovzxRmR(extModeBL, newOperandReg(r), r)) - case 2: - m.insert(m.allocateInstr().asMovzxRmR(extModeWL, newOperandReg(r), r)) - } - case ssa.TypeI64: - switch valSize { - case 1: - m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(r), r)) - case 2: - m.insert(m.allocateInstr().asMovzxRmR(extModeWQ, newOperandReg(r), r)) - case 4: - m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, newOperandReg(r), r)) - } - } -} - -func (m *machine) lowerFcmp(instr *ssa.Instruction) { - f1, f2, and := m.lowerFcmpToFlags(instr) - rd := m.c.VRegOf(instr.Return()) - if f2 == condInvalid { - tmp := m.c.AllocateVReg(ssa.TypeI32) - m.insert(m.allocateInstr().asSetcc(f1, tmp)) - // On amd64, setcc only sets the first byte of the register, so we need to zero extend it to match - // the semantics of Icmp that sets either 0 or 1. 
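// Illustrative note (not from the original file): e.g. `SETNZ %al` writes only the low byte and
// leaves bits 8-63 untouched, so the byte-to-quad MOVZX below is what turns the flag into a
// clean 0 or 1 in the full-width register.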
- m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(tmp), rd)) - } else { - tmp1, tmp2 := m.c.AllocateVReg(ssa.TypeI32), m.c.AllocateVReg(ssa.TypeI32) - m.insert(m.allocateInstr().asSetcc(f1, tmp1)) - m.insert(m.allocateInstr().asSetcc(f2, tmp2)) - var op aluRmiROpcode - if and { - op = aluRmiROpcodeAnd - } else { - op = aluRmiROpcodeOr - } - m.insert(m.allocateInstr().asAluRmiR(op, newOperandReg(tmp1), tmp2, false)) - m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(tmp2), rd)) - } -} - -func (m *machine) lowerIcmp(instr *ssa.Instruction) { - x, y, c := instr.IcmpData() - m.lowerIcmpToFlag(m.c.ValueDefinition(x), m.c.ValueDefinition(y), x.Type() == ssa.TypeI64) - rd := m.c.VRegOf(instr.Return()) - tmp := m.c.AllocateVReg(ssa.TypeI32) - m.insert(m.allocateInstr().asSetcc(condFromSSAIntCmpCond(c), tmp)) - // On amd64, setcc only sets the first byte of the register, so we need to zero extend it to match - // the semantics of Icmp that sets either 0 or 1. - m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(tmp), rd)) -} - -func (m *machine) lowerSelect(x, y, cval, ret ssa.Value) { - xo, yo := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)), m.getOperand_Reg(m.c.ValueDefinition(y)) - rd := m.c.VRegOf(ret) - - var cond cond - cvalDef := m.c.ValueDefinition(cval) - switch m.c.MatchInstrOneOf(cvalDef, condBranchMatches[:]) { - case ssa.OpcodeIcmp: - icmp := cvalDef.Instr - xc, yc, cc := icmp.IcmpData() - m.lowerIcmpToFlag(m.c.ValueDefinition(xc), m.c.ValueDefinition(yc), xc.Type() == ssa.TypeI64) - cond = condFromSSAIntCmpCond(cc) - icmp.Lowered() - default: // TODO: match ssa.OpcodeFcmp for optimization, but seems a bit complex. - cv := m.getOperand_Reg(cvalDef) - test := m.allocateInstr().asCmpRmiR(false, cv, cv.reg(), false) - m.insert(test) - cond = condNZ - } - - if typ := x.Type(); typ.IsInt() { - _64 := typ.Bits() == 64 - mov := m.allocateInstr() - tmp := m.c.AllocateVReg(typ) - switch yo.kind { - case operandKindReg: - mov.asMovRR(yo.reg(), tmp, _64) - case operandKindMem: - if _64 { - mov.asMov64MR(yo, tmp) - } else { - mov.asMovzxRmR(extModeLQ, yo, tmp) - } - default: - panic("BUG") - } - m.insert(mov) - cmov := m.allocateInstr().asCmove(cond, xo, tmp, _64) - m.insert(cmov) - m.insert(m.allocateInstr().asMovRR(tmp, rd, _64)) - } else { - mov := m.allocateInstr() - tmp := m.c.AllocateVReg(typ) - switch typ { - case ssa.TypeF32: - mov.asXmmUnaryRmR(sseOpcodeMovss, yo, tmp) - case ssa.TypeF64: - mov.asXmmUnaryRmR(sseOpcodeMovsd, yo, tmp) - case ssa.TypeV128: - mov.asXmmUnaryRmR(sseOpcodeMovdqu, yo, tmp) - default: - panic("BUG") - } - m.insert(mov) - - cmov := m.allocateInstr().asXmmCMov(cond, xo, tmp, typ.Size()) - m.insert(cmov) - - m.copyTo(tmp, rd) - } -} - -func (m *machine) lowerXmmCmovAfterRegAlloc(i *instruction) { - x := i.op1 - rd := i.op2.reg() - cond := cond(i.u1) - - jcc := m.allocateInstr() - m.insert(jcc) - - mov := m.allocateInstr() - switch i.u2 { - case 4: - mov.asXmmUnaryRmR(sseOpcodeMovss, x, rd) - case 8: - mov.asXmmUnaryRmR(sseOpcodeMovsd, x, rd) - case 16: - mov.asXmmUnaryRmR(sseOpcodeMovdqu, x, rd) - default: - panic("BUG") - } - m.insert(mov) - - nop, end := m.allocateBrTarget() - m.insert(nop) - jcc.asJmpIf(cond.invert(), newOperandLabel(end)) -} - -func (m *machine) lowerExtend(_arg, ret ssa.Value, from, to byte, signed bool) { - rd0 := m.c.VRegOf(ret) - arg := m.getOperand_Mem_Reg(m.c.ValueDefinition(_arg)) - - rd := m.c.AllocateVReg(ret.Type()) - - ext := m.allocateInstr() - switch { - case from == 8 && to == 16 && signed: - 
ext.asMovsxRmR(extModeBQ, arg, rd) - case from == 8 && to == 16 && !signed: - ext.asMovzxRmR(extModeBL, arg, rd) - case from == 8 && to == 32 && signed: - ext.asMovsxRmR(extModeBL, arg, rd) - case from == 8 && to == 32 && !signed: - ext.asMovzxRmR(extModeBQ, arg, rd) - case from == 8 && to == 64 && signed: - ext.asMovsxRmR(extModeBQ, arg, rd) - case from == 8 && to == 64 && !signed: - ext.asMovzxRmR(extModeBQ, arg, rd) - case from == 16 && to == 32 && signed: - ext.asMovsxRmR(extModeWL, arg, rd) - case from == 16 && to == 32 && !signed: - ext.asMovzxRmR(extModeWL, arg, rd) - case from == 16 && to == 64 && signed: - ext.asMovsxRmR(extModeWQ, arg, rd) - case from == 16 && to == 64 && !signed: - ext.asMovzxRmR(extModeWQ, arg, rd) - case from == 32 && to == 64 && signed: - ext.asMovsxRmR(extModeLQ, arg, rd) - case from == 32 && to == 64 && !signed: - ext.asMovzxRmR(extModeLQ, arg, rd) - default: - panic(fmt.Sprintf("BUG: unhandled extend: from=%d, to=%d, signed=%t", from, to, signed)) - } - m.insert(ext) - - m.copyTo(rd, rd0) -} - -func (m *machine) lowerVconst(dst regalloc.VReg, lo, hi uint64) { - if lo == 0 && hi == 0 { - m.insert(m.allocateInstr().asZeros(dst)) - return - } - - load := m.allocateInstr() - l, pos := m.allocateLabel() - m.consts = append(m.consts, _const{label: l, labelPos: pos, lo: lo, hi: hi}) - load.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(l)), dst) - m.insert(load) -} - -func (m *machine) lowerCtz(instr *ssa.Instruction) { - if m.cpuFeatures.HasExtra(platform.CpuExtraFeatureAmd64ABM) { - m.lowerUnaryRmR(instr, unaryRmROpcodeTzcnt) - } else { - // On processors that do not support TZCNT, the BSF instruction is - // executed instead. The key difference between TZCNT and BSF - // instruction is that if source operand is zero, the content of - // destination operand is undefined. - // https://www.felixcloutier.com/x86/tzcnt.html - - x := instr.Arg() - if !x.Type().IsInt() { - panic("BUG?") - } - _64 := x.Type().Bits() == 64 - - xDef := m.c.ValueDefinition(x) - tmp := m.c.AllocateVReg(x.Type()) - rm := m.getOperand_Reg(xDef) - - // First, we have to check if the target is non-zero. - test := m.allocateInstr() - test.asCmpRmiR(false, rm, rm.reg(), _64) - m.insert(test) - - jmpNz := m.allocateInstr() - m.insert(jmpNz) - - // If the value is zero, we just push the const value. - m.lowerIconst(tmp, uint64(x.Type().Bits()), _64) - - // Now jump right after the non-zero case. - jmpAtEnd := m.allocateInstr() - m.insert(jmpAtEnd) - - // jmpNz target label is set here. - nop, nz := m.allocateBrTarget() - jmpNz.asJmpIf(condNZ, newOperandLabel(nz)) - m.insert(nop) - - // Emit the non-zero case. - bsr := m.allocateInstr() - bsr.asUnaryRmR(unaryRmROpcodeBsf, rm, tmp, _64) - m.insert(bsr) - - // jmpAtEnd target label is set here. - nopEnd, end := m.allocateBrTarget() - jmpAtEnd.asJmp(newOperandLabel(end)) - m.insert(nopEnd) - - m.copyTo(tmp, m.c.VRegOf(instr.Return())) - } -} - -func (m *machine) lowerClz(instr *ssa.Instruction) { - if m.cpuFeatures.HasExtra(platform.CpuExtraFeatureAmd64ABM) { - m.lowerUnaryRmR(instr, unaryRmROpcodeLzcnt) - } else { - // On processors that do not support LZCNT, we combine BSR (calculating - // most significant set bit) with XOR. This logic is described in - // "Replace Raw Assembly Code with Builtin Intrinsics" section in: - // https://developer.apple.com/documentation/apple-silicon/addressing-architectural-differences-in-your-macos-code. 
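// Illustrative note (not from the original file): the identity used is, for a non-zero n-bit x,
// clz(x) = (n-1) - bsr(x); since bsr(x) <= n-1 and n-1 is all ones (31 or 63), the subtraction
// equals XOR with n-1. E.g. for 32-bit x = 1: bsr(x) = 0, so clz(x) = 31 ^ 0 = 31, which is what
// the BSR followed by XOR below computes.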
- - x := instr.Arg() - if !x.Type().IsInt() { - panic("BUG?") - } - _64 := x.Type().Bits() == 64 - - xDef := m.c.ValueDefinition(x) - rm := m.getOperand_Reg(xDef) - tmp := m.c.AllocateVReg(x.Type()) - - // First, we have to check if the rm is non-zero as BSR is undefined - // on zero. See https://www.felixcloutier.com/x86/bsr. - test := m.allocateInstr() - test.asCmpRmiR(false, rm, rm.reg(), _64) - m.insert(test) - - jmpNz := m.allocateInstr() - m.insert(jmpNz) - - // If the value is zero, we just push the const value. - m.lowerIconst(tmp, uint64(x.Type().Bits()), _64) - - // Now jump right after the non-zero case. - jmpAtEnd := m.allocateInstr() - m.insert(jmpAtEnd) - - // jmpNz target label is set here. - nop, nz := m.allocateBrTarget() - jmpNz.asJmpIf(condNZ, newOperandLabel(nz)) - m.insert(nop) - - // Emit the non-zero case. - bsr := m.allocateInstr() - bsr.asUnaryRmR(unaryRmROpcodeBsr, rm, tmp, _64) - m.insert(bsr) - - // Now we XOR the value with the bit length minus one. - xor := m.allocateInstr() - xor.asAluRmiR(aluRmiROpcodeXor, newOperandImm32(uint32(x.Type().Bits()-1)), tmp, _64) - m.insert(xor) - - // jmpAtEnd target label is set here. - nopEnd, end := m.allocateBrTarget() - jmpAtEnd.asJmp(newOperandLabel(end)) - m.insert(nopEnd) - - m.copyTo(tmp, m.c.VRegOf(instr.Return())) - } -} - -func (m *machine) lowerUnaryRmR(si *ssa.Instruction, op unaryRmROpcode) { - x := si.Arg() - if !x.Type().IsInt() { - panic("BUG?") - } - _64 := x.Type().Bits() == 64 - - xDef := m.c.ValueDefinition(x) - rm := m.getOperand_Mem_Reg(xDef) - rd := m.c.VRegOf(si.Return()) - - instr := m.allocateInstr() - instr.asUnaryRmR(op, rm, rd, _64) - m.insert(instr) -} - -func (m *machine) lowerLoad(ptr ssa.Value, offset uint32, typ ssa.Type, dst regalloc.VReg) { - mem := newOperandMem(m.lowerToAddressMode(ptr, offset)) - load := m.allocateInstr() - switch typ { - case ssa.TypeI32: - load.asMovzxRmR(extModeLQ, mem, dst) - case ssa.TypeI64: - load.asMov64MR(mem, dst) - case ssa.TypeF32: - load.asXmmUnaryRmR(sseOpcodeMovss, mem, dst) - case ssa.TypeF64: - load.asXmmUnaryRmR(sseOpcodeMovsd, mem, dst) - case ssa.TypeV128: - load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, dst) - default: - panic("BUG") - } - m.insert(load) -} - -func (m *machine) lowerExtLoad(op ssa.Opcode, ptr ssa.Value, offset uint32, dst regalloc.VReg) { - mem := newOperandMem(m.lowerToAddressMode(ptr, offset)) - load := m.allocateInstr() - switch op { - case ssa.OpcodeUload8: - load.asMovzxRmR(extModeBQ, mem, dst) - case ssa.OpcodeUload16: - load.asMovzxRmR(extModeWQ, mem, dst) - case ssa.OpcodeUload32: - load.asMovzxRmR(extModeLQ, mem, dst) - case ssa.OpcodeSload8: - load.asMovsxRmR(extModeBQ, mem, dst) - case ssa.OpcodeSload16: - load.asMovsxRmR(extModeWQ, mem, dst) - case ssa.OpcodeSload32: - load.asMovsxRmR(extModeLQ, mem, dst) - default: - panic("BUG") - } - m.insert(load) -} - -func (m *machine) lowerExitIfTrueWithCode(execCtx regalloc.VReg, cond ssa.Value, code wazevoapi.ExitCode) { - condDef := m.c.ValueDefinition(cond) - if !m.c.MatchInstr(condDef, ssa.OpcodeIcmp) { - panic("TODO: ExitIfTrue must come after Icmp at the moment: " + condDef.Instr.Opcode().String()) - } - cvalInstr := condDef.Instr - cvalInstr.MarkLowered() - - // We need to copy the execution context to a temp register, because if it's spilled, - // it might end up being reloaded inside the exiting branch. 
- execCtxTmp := m.copyToTmp(execCtx) - - x, y, c := cvalInstr.IcmpData() - xx, yy := m.c.ValueDefinition(x), m.c.ValueDefinition(y) - if !m.tryLowerBandToFlag(xx, yy) { - m.lowerIcmpToFlag(xx, yy, x.Type() == ssa.TypeI64) - } - - jmpIf := m.allocateInstr() - m.insert(jmpIf) - l := m.lowerExitWithCode(execCtxTmp, code) - jmpIf.asJmpIf(condFromSSAIntCmpCond(c).invert(), newOperandLabel(l)) -} - -func (m *machine) tryLowerBandToFlag(x, y backend.SSAValueDefinition) (ok bool) { - var target backend.SSAValueDefinition - var got bool - if x.IsFromInstr() && x.Instr.Constant() && x.Instr.ConstantVal() == 0 { - if m.c.MatchInstr(y, ssa.OpcodeBand) { - target = y - got = true - } - } - - if y.IsFromInstr() && y.Instr.Constant() && y.Instr.ConstantVal() == 0 { - if m.c.MatchInstr(x, ssa.OpcodeBand) { - target = x - got = true - } - } - - if !got { - return false - } - - bandInstr := target.Instr - bandX, bandY := bandInstr.Arg2() - - xx := m.getOperand_Reg(m.c.ValueDefinition(bandX)) - yy := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(bandY)) - test := m.allocateInstr().asCmpRmiR(false, yy, xx.reg(), bandX.Type() == ssa.TypeI64) - m.insert(test) - bandInstr.MarkLowered() - return true -} - -func (m *machine) allocateExitInstructions(execCtx, exitCodeReg regalloc.VReg) (saveRsp, saveRbp, setExitCode *instruction) { - saveRsp = m.allocateInstr().asMovRM( - rspVReg, - newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.U32(), execCtx)), - 8, - ) - - saveRbp = m.allocateInstr().asMovRM( - rbpVReg, - newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetFramePointerBeforeGoCall.U32(), execCtx)), - 8, - ) - setExitCode = m.allocateInstr().asMovRM( - exitCodeReg, - newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetExitCodeOffset.U32(), execCtx)), - 4, - ) - return -} - -func (m *machine) lowerExitWithCode(execCtx regalloc.VReg, code wazevoapi.ExitCode) (afterLabel label) { - exitCodeReg := rbpVReg - saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtx, exitCodeReg) - - // Set save RSP, RBP, and write exit code. - m.insert(saveRsp) - m.insert(saveRbp) - m.lowerIconst(exitCodeReg, uint64(code), false) - m.insert(setExitCode) - - ripReg := rbpVReg - - // Next is to save the current address for stack unwinding. - nop, currentAddrLabel := m.allocateBrTarget() - m.insert(nop) - readRip := m.allocateInstr().asLEA(newOperandLabel(currentAddrLabel), ripReg) - m.insert(readRip) - saveRip := m.allocateInstr().asMovRM( - ripReg, - newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetGoCallReturnAddress.U32(), execCtx)), - 8, - ) - m.insert(saveRip) - - // Finally exit. - exitSq := m.allocateExitSeq(execCtx) - m.insert(exitSq) - - // Return the label for continuation. - continuation, afterLabel := m.allocateBrTarget() - m.insert(continuation) - return afterLabel -} - -func (m *machine) lowerAluRmiROp(si *ssa.Instruction, op aluRmiROpcode) { - x, y := si.Arg2() - if !x.Type().IsInt() { - panic("BUG?") - } - - _64 := x.Type().Bits() == 64 - - xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) - - // TODO: commutative args can be swapped if one of them is an immediate. - rn := m.getOperand_Reg(xDef) - rm := m.getOperand_Mem_Imm32_Reg(yDef) - rd := m.c.VRegOf(si.Return()) - - // rn is being overwritten, so we first copy its value to a temp register, - // in case it is referenced again later. 
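// Illustrative note (not from the original file): integer ALU instructions on x86-64 are
// two-operand and destructive, e.g. `ADD dst, src` computes dst += src, so the operation is
// applied to a fresh copy (tmp) to avoid clobbering rn's register while its SSA value may
// still have other uses.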
- tmp := m.copyToTmp(rn.reg()) - - alu := m.allocateInstr() - alu.asAluRmiR(op, rm, tmp, _64) - m.insert(alu) - - // tmp now contains the result, we copy it to the dest register. - m.copyTo(tmp, rd) -} - -func (m *machine) lowerShiftR(si *ssa.Instruction, op shiftROp) { - x, amt := si.Arg2() - if !x.Type().IsInt() { - panic("BUG?") - } - _64 := x.Type().Bits() == 64 - - xDef, amtDef := m.c.ValueDefinition(x), m.c.ValueDefinition(amt) - - opAmt := m.getOperand_Imm32_Reg(amtDef) - rx := m.getOperand_Reg(xDef) - rd := m.c.VRegOf(si.Return()) - - // rx is being overwritten, so we first copy its value to a temp register, - // in case it is referenced again later. - tmpDst := m.copyToTmp(rx.reg()) - - if opAmt.kind == operandKindReg { - // If opAmt is a register we must copy its value to rcx, - // because shiftR encoding mandates that the shift amount is in rcx. - m.copyTo(opAmt.reg(), rcxVReg) - - alu := m.allocateInstr() - alu.asShiftR(op, newOperandReg(rcxVReg), tmpDst, _64) - m.insert(alu) - - } else { - alu := m.allocateInstr() - alu.asShiftR(op, opAmt, tmpDst, _64) - m.insert(alu) - } - - // tmp now contains the result, we copy it to the dest register. - m.copyTo(tmpDst, rd) -} - -func (m *machine) lowerXmmRmR(instr *ssa.Instruction) { - x, y := instr.Arg2() - if !x.Type().IsFloat() { - panic("BUG?") - } - _64 := x.Type().Bits() == 64 - - var op sseOpcode - if _64 { - switch instr.Opcode() { - case ssa.OpcodeFadd: - op = sseOpcodeAddsd - case ssa.OpcodeFsub: - op = sseOpcodeSubsd - case ssa.OpcodeFmul: - op = sseOpcodeMulsd - case ssa.OpcodeFdiv: - op = sseOpcodeDivsd - default: - panic("BUG") - } - } else { - switch instr.Opcode() { - case ssa.OpcodeFadd: - op = sseOpcodeAddss - case ssa.OpcodeFsub: - op = sseOpcodeSubss - case ssa.OpcodeFmul: - op = sseOpcodeMulss - case ssa.OpcodeFdiv: - op = sseOpcodeDivss - default: - panic("BUG") - } - } - - xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) - rn := m.getOperand_Reg(yDef) - rm := m.getOperand_Reg(xDef) - rd := m.c.VRegOf(instr.Return()) - - // rm is being overwritten, so we first copy its value to a temp register, - // in case it is referenced again later. 
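// Illustrative note (not from the original file): the same destructive-destination rule holds for
// the scalar SSE ops, e.g. `SUBSS dst, src` computes dst -= src; copying x (held in rm) into tmp
// first preserves the original register and places x in the destination slot, which matters for
// the non-commutative Fsub/Fdiv cases.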
- tmp := m.copyToTmp(rm.reg()) - - xmm := m.allocateInstr().asXmmRmR(op, rn, tmp) - m.insert(xmm) - - m.copyTo(tmp, rd) -} - -func (m *machine) lowerSqrt(instr *ssa.Instruction) { - x := instr.Arg() - if !x.Type().IsFloat() { - panic("BUG") - } - _64 := x.Type().Bits() == 64 - var op sseOpcode - if _64 { - op = sseOpcodeSqrtsd - } else { - op = sseOpcodeSqrtss - } - - xDef := m.c.ValueDefinition(x) - rm := m.getOperand_Mem_Reg(xDef) - rd := m.c.VRegOf(instr.Return()) - - xmm := m.allocateInstr().asXmmUnaryRmR(op, rm, rd) - m.insert(xmm) -} - -func (m *machine) lowerFabsFneg(instr *ssa.Instruction) { - x := instr.Arg() - if !x.Type().IsFloat() { - panic("BUG") - } - _64 := x.Type().Bits() == 64 - var op sseOpcode - var mask uint64 - if _64 { - switch instr.Opcode() { - case ssa.OpcodeFabs: - mask, op = 0x7fffffffffffffff, sseOpcodeAndpd - case ssa.OpcodeFneg: - mask, op = 0x8000000000000000, sseOpcodeXorpd - } - } else { - switch instr.Opcode() { - case ssa.OpcodeFabs: - mask, op = 0x7fffffff, sseOpcodeAndps - case ssa.OpcodeFneg: - mask, op = 0x80000000, sseOpcodeXorps - } - } - - tmp := m.c.AllocateVReg(x.Type()) - - xDef := m.c.ValueDefinition(x) - rm := m.getOperand_Reg(xDef) - rd := m.c.VRegOf(instr.Return()) - - m.lowerFconst(tmp, mask, _64) - - xmm := m.allocateInstr().asXmmRmR(op, rm, tmp) - m.insert(xmm) - - m.copyTo(tmp, rd) -} - -func (m *machine) lowerStore(si *ssa.Instruction) { - value, ptr, offset, storeSizeInBits := si.StoreData() - rm := m.getOperand_Reg(m.c.ValueDefinition(value)) - mem := newOperandMem(m.lowerToAddressMode(ptr, offset)) - - store := m.allocateInstr() - switch value.Type() { - case ssa.TypeI32: - store.asMovRM(rm.reg(), mem, storeSizeInBits/8) - case ssa.TypeI64: - store.asMovRM(rm.reg(), mem, storeSizeInBits/8) - case ssa.TypeF32: - store.asXmmMovRM(sseOpcodeMovss, rm.reg(), mem) - case ssa.TypeF64: - store.asXmmMovRM(sseOpcodeMovsd, rm.reg(), mem) - case ssa.TypeV128: - store.asXmmMovRM(sseOpcodeMovdqu, rm.reg(), mem) - default: - panic("BUG") - } - m.insert(store) -} - -func (m *machine) lowerCall(si *ssa.Instruction) { - isDirectCall := si.Opcode() == ssa.OpcodeCall - var indirectCalleePtr ssa.Value - var directCallee ssa.FuncRef - var sigID ssa.SignatureID - var args []ssa.Value - var isMemmove bool - if isDirectCall { - directCallee, sigID, args = si.CallData() - } else { - indirectCalleePtr, sigID, args, isMemmove = si.CallIndirectData() - } - calleeABI := m.c.GetFunctionABI(m.c.SSABuilder().ResolveSignature(sigID)) - - stackSlotSize := int64(calleeABI.AlignedArgResultStackSlotSize()) - if m.maxRequiredStackSizeForCalls < stackSlotSize+16 { - m.maxRequiredStackSizeForCalls = stackSlotSize + 16 // 16 == return address + RBP. - } - - // Note: See machine.SetupPrologue for the stack layout. - // The stack pointer decrease/increase will be inserted later in the compilation. - - for i, arg := range args { - reg := m.c.VRegOf(arg) - def := m.c.ValueDefinition(arg) - m.callerGenVRegToFunctionArg(calleeABI, i, reg, def, stackSlotSize) - } - - if isMemmove { - // Go's memmove *might* use all xmm0-xmm15, so we need to release them. 
- // https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/cmd/compile/abi-internal.md#architecture-specifics - // https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/runtime/memmove_amd64.s#L271-L286 - for i := regalloc.RealReg(0); i < 16; i++ { - m.insert(m.allocateInstr().asDefineUninitializedReg(regInfo.RealRegToVReg[xmm0+i])) - } - } - - if isDirectCall { - call := m.allocateInstr().asCall(directCallee, calleeABI) - m.insert(call) - } else { - ptrOp := m.getOperand_Mem_Reg(m.c.ValueDefinition(indirectCalleePtr)) - callInd := m.allocateInstr().asCallIndirect(ptrOp, calleeABI) - m.insert(callInd) - } - - if isMemmove { - for i := regalloc.RealReg(0); i < 16; i++ { - m.insert(m.allocateInstr().asNopUseReg(regInfo.RealRegToVReg[xmm0+i])) - } - } - - var index int - r1, rs := si.Returns() - if r1.Valid() { - m.callerGenFunctionReturnVReg(calleeABI, 0, m.c.VRegOf(r1), stackSlotSize) - index++ - } - - for _, r := range rs { - m.callerGenFunctionReturnVReg(calleeABI, index, m.c.VRegOf(r), stackSlotSize) - index++ - } -} - -// callerGenVRegToFunctionArg is the opposite of GenFunctionArgToVReg, which is used to generate the -// caller side of the function call. -func (m *machine) callerGenVRegToFunctionArg(a *backend.FunctionABI, argIndex int, reg regalloc.VReg, def backend.SSAValueDefinition, stackSlotSize int64) { - arg := &a.Args[argIndex] - if def.IsFromInstr() { - // Constant instructions are inlined. - if inst := def.Instr; inst.Constant() { - m.insertLoadConstant(inst, reg) - } - } - if arg.Kind == backend.ABIArgKindReg { - m.InsertMove(arg.Reg, reg, arg.Type) - } else { - store := m.allocateInstr() - mem := newOperandMem(m.newAmodeImmReg( - // -stackSlotSize because the stack pointer is not yet decreased. - uint32(arg.Offset-stackSlotSize), rspVReg)) - switch arg.Type { - case ssa.TypeI32: - store.asMovRM(reg, mem, 4) - case ssa.TypeI64: - store.asMovRM(reg, mem, 8) - case ssa.TypeF32: - store.asXmmMovRM(sseOpcodeMovss, reg, mem) - case ssa.TypeF64: - store.asXmmMovRM(sseOpcodeMovsd, reg, mem) - case ssa.TypeV128: - store.asXmmMovRM(sseOpcodeMovdqu, reg, mem) - default: - panic("BUG") - } - m.insert(store) - } -} - -func (m *machine) callerGenFunctionReturnVReg(a *backend.FunctionABI, retIndex int, reg regalloc.VReg, stackSlotSize int64) { - r := &a.Rets[retIndex] - if r.Kind == backend.ABIArgKindReg { - m.InsertMove(reg, r.Reg, r.Type) - } else { - load := m.allocateInstr() - mem := newOperandMem(m.newAmodeImmReg( - // -stackSlotSize because the stack pointer is not yet decreased. - uint32(a.ArgStackSize+r.Offset-stackSlotSize), rspVReg)) - switch r.Type { - case ssa.TypeI32: - load.asMovzxRmR(extModeLQ, mem, reg) - case ssa.TypeI64: - load.asMov64MR(mem, reg) - case ssa.TypeF32: - load.asXmmUnaryRmR(sseOpcodeMovss, mem, reg) - case ssa.TypeF64: - load.asXmmUnaryRmR(sseOpcodeMovsd, mem, reg) - case ssa.TypeV128: - load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, reg) - default: - panic("BUG") - } - m.insert(load) - } -} - -// InsertMove implements backend.Machine. 
-func (m *machine) InsertMove(dst, src regalloc.VReg, typ ssa.Type) { - switch typ { - case ssa.TypeI32, ssa.TypeI64: - i := m.allocateInstr().asMovRR(src, dst, typ.Bits() == 64) - m.insert(i) - case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: - var op sseOpcode - switch typ { - case ssa.TypeF32: - op = sseOpcodeMovss - case ssa.TypeF64: - op = sseOpcodeMovsd - case ssa.TypeV128: - op = sseOpcodeMovdqa - } - i := m.allocateInstr().asXmmUnaryRmR(op, newOperandReg(src), dst) - m.insert(i) - default: - panic("BUG") - } -} - -// Format implements backend.Machine. -func (m *machine) Format() string { - begins := map[*instruction]label{} - for l := label(0); l < m.nextLabel; l++ { - pos := m.labelPositionPool.Get(int(l)) - if pos != nil { - begins[pos.begin] = l - } - } - - var lines []string - for cur := m.rootInstr; cur != nil; cur = cur.next { - if l, ok := begins[cur]; ok { - var labelStr string - if l <= m.maxSSABlockID { - labelStr = fmt.Sprintf("%s (SSA Block: blk%d):", l, l) - } else { - labelStr = fmt.Sprintf("%s:", l) - } - lines = append(lines, labelStr) - } - if cur.kind == nop0 { - continue - } - lines = append(lines, "\t"+cur.String()) - } - for _, vc := range m.consts { - if vc._var == nil { - lines = append(lines, fmt.Sprintf("%s: const [%d %d]", vc.label, vc.lo, vc.hi)) - } else { - lines = append(lines, fmt.Sprintf("%s: const %#x", vc.label, vc._var)) - } - } - return "\n" + strings.Join(lines, "\n") + "\n" -} - -func (m *machine) encodeWithoutSSA(root *instruction) { - m.labelResolutionPends = m.labelResolutionPends[:0] - bufPtr := m.c.BufPtr() - for cur := root; cur != nil; cur = cur.next { - offset := int64(len(*bufPtr)) - if cur.kind == nop0 { - l := cur.nop0Label() - pos := m.labelPositionPool.Get(int(l)) - if pos != nil { - pos.binaryOffset = offset - } - } - - needLabelResolution := cur.encode(m.c) - if needLabelResolution { - m.labelResolutionPends = append(m.labelResolutionPends, - labelResolutionPend{instr: cur, imm32Offset: int64(len(*bufPtr)) - 4}, - ) - } - } - - for i := range m.labelResolutionPends { - p := &m.labelResolutionPends[i] - switch p.instr.kind { - case jmp, jmpIf, lea: - target := p.instr.jmpLabel() - targetOffset := m.labelPositionPool.Get(int(target)).binaryOffset - imm32Offset := p.imm32Offset - jmpOffset := int32(targetOffset - (p.imm32Offset + 4)) // +4 because RIP points to the next instruction. - binary.LittleEndian.PutUint32((*bufPtr)[imm32Offset:], uint32(jmpOffset)) - default: - panic("BUG") - } - } -} - -// Encode implements backend.Machine Encode. 
-func (m *machine) Encode(ctx context.Context) (err error) { - bufPtr := m.c.BufPtr() - - var fn string - var fnIndex int - var labelPosToLabel map[*labelPosition]label - if wazevoapi.PerfMapEnabled { - fn = wazevoapi.GetCurrentFunctionName(ctx) - labelPosToLabel = make(map[*labelPosition]label) - for i := 0; i <= m.labelPositionPool.MaxIDEncountered(); i++ { - pos := m.labelPositionPool.Get(i) - labelPosToLabel[pos] = label(i) - } - fnIndex = wazevoapi.GetCurrentFunctionIndex(ctx) - } - - m.labelResolutionPends = m.labelResolutionPends[:0] - for _, pos := range m.orderedSSABlockLabelPos { - offset := int64(len(*bufPtr)) - pos.binaryOffset = offset - for cur := pos.begin; cur != pos.end.next; cur = cur.next { - offset := int64(len(*bufPtr)) - - switch cur.kind { - case nop0: - l := cur.nop0Label() - if pos := m.labelPositionPool.Get(int(l)); pos != nil { - pos.binaryOffset = offset - } - case sourceOffsetInfo: - m.c.AddSourceOffsetInfo(offset, cur.sourceOffsetInfo()) - } - - needLabelResolution := cur.encode(m.c) - if needLabelResolution { - m.labelResolutionPends = append(m.labelResolutionPends, - labelResolutionPend{instr: cur, instrOffset: offset, imm32Offset: int64(len(*bufPtr)) - 4}, - ) - } - } - - if wazevoapi.PerfMapEnabled { - l := labelPosToLabel[pos] - size := int64(len(*bufPtr)) - offset - wazevoapi.PerfMap.AddModuleEntry(fnIndex, offset, uint64(size), fmt.Sprintf("%s:::::%s", fn, l)) - } - } - - for i := range m.consts { - offset := int64(len(*bufPtr)) - vc := &m.consts[i] - vc.labelPos.binaryOffset = offset - if vc._var == nil { - lo, hi := vc.lo, vc.hi - m.c.Emit8Bytes(lo) - m.c.Emit8Bytes(hi) - } else { - for _, b := range vc._var { - m.c.EmitByte(b) - } - } - } - - buf := *bufPtr - for i := range m.labelResolutionPends { - p := &m.labelResolutionPends[i] - switch p.instr.kind { - case jmp, jmpIf, lea, xmmUnaryRmR: - target := p.instr.jmpLabel() - targetOffset := m.labelPositionPool.Get(int(target)).binaryOffset - imm32Offset := p.imm32Offset - jmpOffset := int32(targetOffset - (p.imm32Offset + 4)) // +4 because RIP points to the next instruction. - binary.LittleEndian.PutUint32(buf[imm32Offset:], uint32(jmpOffset)) - case jmpTableIsland: - tableBegin := p.instrOffset - // Each entry is the offset from the beginning of the jmpTableIsland instruction in 8 bytes. - targets := m.jmpTableTargets[p.instr.u1] - for i, l := range targets { - targetOffset := m.labelPositionPool.Get(int(l)).binaryOffset - jmpOffset := targetOffset - tableBegin - binary.LittleEndian.PutUint64(buf[tableBegin+int64(i)*8:], uint64(jmpOffset)) - } - default: - panic("BUG") - } - } - return -} - -// ResolveRelocations implements backend.Machine. -func (m *machine) ResolveRelocations(refToBinaryOffset []int, _ int, binary []byte, relocations []backend.RelocationInfo, _ []int) { - for _, r := range relocations { - offset := r.Offset - calleeFnOffset := refToBinaryOffset[r.FuncRef] - // offset is the offset of the last 4 bytes of the call instruction. - callInstrOffsetBytes := binary[offset : offset+4] - diff := int64(calleeFnOffset) - (offset + 4) // +4 because we want the offset of the next instruction (In x64, RIP always points to the next instruction). - callInstrOffsetBytes[0] = byte(diff) - callInstrOffsetBytes[1] = byte(diff >> 8) - callInstrOffsetBytes[2] = byte(diff >> 16) - callInstrOffsetBytes[3] = byte(diff >> 24) - } -} - -// CallTrampolineIslandInfo implements backend.Machine CallTrampolineIslandInfo. 
-func (m *machine) CallTrampolineIslandInfo(_ int) (_, _ int, _ error) { return } - -func (m *machine) lowerIcmpToFlag(xd, yd backend.SSAValueDefinition, _64 bool) { - x := m.getOperand_Reg(xd) - y := m.getOperand_Mem_Imm32_Reg(yd) - cmp := m.allocateInstr().asCmpRmiR(true, y, x.reg(), _64) - m.insert(cmp) -} - -func (m *machine) lowerFcmpToFlags(instr *ssa.Instruction) (f1, f2 cond, and bool) { - x, y, c := instr.FcmpData() - switch c { - case ssa.FloatCmpCondEqual: - f1, f2 = condNP, condZ - and = true - case ssa.FloatCmpCondNotEqual: - f1, f2 = condP, condNZ - case ssa.FloatCmpCondLessThan: - f1 = condFromSSAFloatCmpCond(ssa.FloatCmpCondGreaterThan) - f2 = condInvalid - x, y = y, x - case ssa.FloatCmpCondLessThanOrEqual: - f1 = condFromSSAFloatCmpCond(ssa.FloatCmpCondGreaterThanOrEqual) - f2 = condInvalid - x, y = y, x - default: - f1 = condFromSSAFloatCmpCond(c) - f2 = condInvalid - } - - var opc sseOpcode - if x.Type() == ssa.TypeF32 { - opc = sseOpcodeUcomiss - } else { - opc = sseOpcodeUcomisd - } - - xr := m.getOperand_Reg(m.c.ValueDefinition(x)) - yr := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) - m.insert(m.allocateInstr().asXmmCmpRmR(opc, yr, xr.reg())) - return -} - -// allocateInstr allocates an instruction. -func (m *machine) allocateInstr() *instruction { - instr := m.instrPool.Allocate() - if !m.regAllocStarted { - instr.addedBeforeRegAlloc = true - } - return instr -} - -func (m *machine) allocateNop() *instruction { - instr := m.allocateInstr() - instr.kind = nop0 - return instr -} - -func (m *machine) insert(i *instruction) { - m.pendingInstructions = append(m.pendingInstructions, i) -} - -func (m *machine) allocateBrTarget() (nop *instruction, l label) { //nolint - l, pos := m.allocateLabel() - nop = m.allocateInstr() - nop.asNop0WithLabel(l) - pos.begin, pos.end = nop, nop - return -} - -func (m *machine) allocateLabel() (label, *labelPosition) { - l := m.nextLabel - pos := m.labelPositionPool.GetOrAllocate(int(l)) - m.nextLabel++ - return l, pos -} - -func (m *machine) getVRegSpillSlotOffsetFromSP(id regalloc.VRegID, size byte) int64 { - offset, ok := m.spillSlots[id] - if !ok { - offset = m.spillSlotSize - m.spillSlots[id] = offset - m.spillSlotSize += int64(size) - } - return offset -} - -func (m *machine) copyTo(src regalloc.VReg, dst regalloc.VReg) { - mov := m.allocateInstr() - if src.RegType() == regalloc.RegTypeInt { - mov.asMovRR(src, dst, true) - } else { - mov.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(src), dst) - } - m.insert(mov) -} - -func (m *machine) copyToTmp(v regalloc.VReg) regalloc.VReg { - typ := m.c.TypeOf(v) - tmp := m.c.AllocateVReg(typ) - m.copyTo(v, tmp) - return tmp -} - -func (m *machine) requiredStackSize() int64 { - return m.maxRequiredStackSizeForCalls + - m.frameSize() + - 16 + // Need for stack checking. - 16 // return address and the caller RBP. 
-} - -func (m *machine) frameSize() int64 { - s := m.clobberedRegSlotSize() + m.spillSlotSize - if s&0xf != 0 { - panic(fmt.Errorf("BUG: frame size %d is not 16-byte aligned", s)) - } - return s -} - -func (m *machine) clobberedRegSlotSize() int64 { - return int64(len(m.clobberedRegs) * 16) -} - -func (m *machine) lowerIDivRem(si *ssa.Instruction, isDiv bool, signed bool) { - x, y, execCtx := si.Arg3() - - dividend := m.getOperand_Reg(m.c.ValueDefinition(x)) - divisor := m.getOperand_Reg(m.c.ValueDefinition(y)) - ctxVReg := m.c.VRegOf(execCtx) - tmpGp := m.c.AllocateVReg(si.Return().Type()) - - m.copyTo(dividend.reg(), raxVReg) - m.insert(m.allocateInstr().asDefineUninitializedReg(rdxVReg)) - m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp)) - seq := m.allocateInstr().asIdivRemSequence(ctxVReg, divisor.reg(), tmpGp, isDiv, signed, x.Type().Bits() == 64) - m.insert(seq) - rd := m.c.VRegOf(si.Return()) - if isDiv { - m.copyTo(raxVReg, rd) - } else { - m.copyTo(rdxVReg, rd) - } -} - -func (m *machine) lowerIDivRemSequenceAfterRegAlloc(i *instruction) { - execCtx, divisor, tmpGp, isDiv, signed, _64 := i.idivRemSequenceData() - - dividend := raxVReg - - // Ensure yr is not zero. - test := m.allocateInstr() - test.asCmpRmiR(false, newOperandReg(divisor), divisor, _64) - m.insert(test) - - jnz := m.allocateInstr() - m.insert(jnz) - - nz := m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerDivisionByZero) - - // If not zero, we can proceed with the division. - jnz.asJmpIf(condNZ, newOperandLabel(nz)) - - var ifRemNeg1 *instruction - if signed { - var neg1 uint64 - if _64 { - neg1 = 0xffffffffffffffff - } else { - neg1 = 0xffffffff - } - m.lowerIconst(tmpGp, neg1, _64) - - if isDiv { - // For signed division, we have to have branches for "math.MinInt{32,64} / -1" - // case which results in the floating point exception via division error as - // the resulting value exceeds the maximum of signed int. - - // First, we check if the divisor is -1. - cmp := m.allocateInstr() - cmp.asCmpRmiR(true, newOperandReg(tmpGp), divisor, _64) - m.insert(cmp) - - ifNotNeg1 := m.allocateInstr() - m.insert(ifNotNeg1) - - var minInt uint64 - if _64 { - minInt = 0x8000000000000000 - } else { - minInt = 0x80000000 - } - m.lowerIconst(tmpGp, minInt, _64) - - // Next we check if the quotient is the most negative value for the signed integer, i.e. - // if we are trying to do (math.MinInt32 / -1) or (math.MinInt64 / -1) respectively. - cmp2 := m.allocateInstr() - cmp2.asCmpRmiR(true, newOperandReg(tmpGp), dividend, _64) - m.insert(cmp2) - - ifNotMinInt := m.allocateInstr() - m.insert(ifNotMinInt) - - // Trap if we are trying to do (math.MinInt32 / -1) or (math.MinInt64 / -1), - // as that is the overflow in division as the result becomes 2^31 which is larger than - // the maximum of signed 32-bit int (2^31-1). - end := m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) - ifNotNeg1.asJmpIf(condNZ, newOperandLabel(end)) - ifNotMinInt.asJmpIf(condNZ, newOperandLabel(end)) - } else { - // If it is remainder, zeros DX register and compare the divisor to -1. - xor := m.allocateInstr().asZeros(rdxVReg) - m.insert(xor) - - // We check if the divisor is -1. - cmp := m.allocateInstr() - cmp.asCmpRmiR(true, newOperandReg(tmpGp), divisor, _64) - m.insert(cmp) - - ifRemNeg1 = m.allocateInstr() - m.insert(ifRemNeg1) - } - - // Sign-extend DX register to have 2*x.Type().Bits() dividend over DX and AX registers. 
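	// Concretely this is CDQ (32-bit) / CQO (64-bit): it broadcasts the sign bit of
	// EAX/RAX into EDX/RDX, producing the widened EDX:EAX (or RDX:RAX) dividend that
	// the following IDIV expects.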
- sed := m.allocateInstr() - sed.asSignExtendData(_64) - m.insert(sed) - } else { - // Zeros DX register to have 2*x.Type().Bits() dividend over DX and AX registers. - zeros := m.allocateInstr().asZeros(rdxVReg) - m.insert(zeros) - } - - div := m.allocateInstr() - div.asDiv(newOperandReg(divisor), signed, _64) - m.insert(div) - - nop, end := m.allocateBrTarget() - m.insert(nop) - // If we are compiling a Rem instruction, when the divisor is -1 we land at the end of the function. - if ifRemNeg1 != nil { - ifRemNeg1.asJmpIf(condZ, newOperandLabel(end)) - } -} - -func (m *machine) lowerRound(instr *ssa.Instruction, imm roundingMode) { - x := instr.Arg() - if !x.Type().IsFloat() { - panic("BUG?") - } - var op sseOpcode - if x.Type().Bits() == 64 { - op = sseOpcodeRoundsd - } else { - op = sseOpcodeRoundss - } - - xDef := m.c.ValueDefinition(x) - rm := m.getOperand_Mem_Reg(xDef) - rd := m.c.VRegOf(instr.Return()) - - xmm := m.allocateInstr().asXmmUnaryRmRImm(op, uint8(imm), rm, rd) - m.insert(xmm) -} - -func (m *machine) lowerFminFmax(instr *ssa.Instruction) { - x, y := instr.Arg2() - if !x.Type().IsFloat() { - panic("BUG?") - } - - _64 := x.Type().Bits() == 64 - isMin := instr.Opcode() == ssa.OpcodeFmin - var minMaxOp sseOpcode - - switch { - case _64 && isMin: - minMaxOp = sseOpcodeMinpd - case _64 && !isMin: - minMaxOp = sseOpcodeMaxpd - case !_64 && isMin: - minMaxOp = sseOpcodeMinps - case !_64 && !isMin: - minMaxOp = sseOpcodeMaxps - } - - xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) - rm := m.getOperand_Reg(xDef) - // We cannot ensure that y is aligned to 16 bytes, so we have to use it on reg. - rn := m.getOperand_Reg(yDef) - rd := m.c.VRegOf(instr.Return()) - - tmp := m.copyToTmp(rm.reg()) - - // Check if this is (either x1 or x2 is NaN) or (x1 equals x2) case. - cmp := m.allocateInstr() - if _64 { - cmp.asXmmCmpRmR(sseOpcodeUcomisd, rn, tmp) - } else { - cmp.asXmmCmpRmR(sseOpcodeUcomiss, rn, tmp) - } - m.insert(cmp) - - // At this point, we have the three cases of conditional flags below - // (See https://www.felixcloutier.com/x86/ucomiss#operation for detail.) - // - // 1) Two values are NaN-free and different: All flags are cleared. - // 2) Two values are NaN-free and equal: Only ZF flags is set. - // 3) One of Two values is NaN: ZF, PF and CF flags are set. - - // Jump instruction to handle 1) case by checking the ZF flag - // as ZF is only set for 2) and 3) cases. - nanFreeOrDiffJump := m.allocateInstr() - m.insert(nanFreeOrDiffJump) - - // Start handling 2) and 3). - - // Jump if one of two values is NaN by checking the parity flag (PF). - ifIsNan := m.allocateInstr() - m.insert(ifIsNan) - - // Start handling 2) NaN-free and equal. - - // Before we exit this case, we have to ensure that positive zero (or negative zero for min instruction) is - // returned if two values are positive and negative zeros. - var op sseOpcode - switch { - case !_64 && isMin: - op = sseOpcodeOrps - case _64 && isMin: - op = sseOpcodeOrpd - case !_64 && !isMin: - op = sseOpcodeAndps - case _64 && !isMin: - op = sseOpcodeAndpd - } - orAnd := m.allocateInstr() - orAnd.asXmmRmR(op, rn, tmp) - m.insert(orAnd) - - // Done, jump to end. - sameExitJump := m.allocateInstr() - m.insert(sameExitJump) - - // Start handling 3) either is NaN. - isNanTarget, isNan := m.allocateBrTarget() - m.insert(isNanTarget) - ifIsNan.asJmpIf(condP, newOperandLabel(isNan)) - - // We emit the ADD instruction to produce the NaN in tmp. 
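	// Per IEEE 754, ADDSS/ADDSD propagates a quiet NaN whenever either input is NaN,
	// which is exactly the case this branch handles, so the addition below is a cheap
	// way to materialize the required NaN result.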
- add := m.allocateInstr() - if _64 { - add.asXmmRmR(sseOpcodeAddsd, rn, tmp) - } else { - add.asXmmRmR(sseOpcodeAddss, rn, tmp) - } - m.insert(add) - - // Exit from the NaN case branch. - nanExitJmp := m.allocateInstr() - m.insert(nanExitJmp) - - // Start handling 1). - doMinMaxTarget, doMinMax := m.allocateBrTarget() - m.insert(doMinMaxTarget) - nanFreeOrDiffJump.asJmpIf(condNZ, newOperandLabel(doMinMax)) - - // Now handle the NaN-free and different values case. - minMax := m.allocateInstr() - minMax.asXmmRmR(minMaxOp, rn, tmp) - m.insert(minMax) - - endNop, end := m.allocateBrTarget() - m.insert(endNop) - nanExitJmp.asJmp(newOperandLabel(end)) - sameExitJump.asJmp(newOperandLabel(end)) - - m.copyTo(tmp, rd) -} - -func (m *machine) lowerFcopysign(instr *ssa.Instruction) { - x, y := instr.Arg2() - if !x.Type().IsFloat() { - panic("BUG") - } - - _64 := x.Type().Bits() == 64 - - xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) - rm := m.getOperand_Reg(xDef) - rn := m.getOperand_Reg(yDef) - rd := m.c.VRegOf(instr.Return()) - - // Clear the non-sign bits of src via AND with the mask. - var opAnd, opOr sseOpcode - var signMask uint64 - if _64 { - signMask, opAnd, opOr = 0x8000000000000000, sseOpcodeAndpd, sseOpcodeOrpd - } else { - signMask, opAnd, opOr = 0x80000000, sseOpcodeAndps, sseOpcodeOrps - } - - signBitReg := m.c.AllocateVReg(x.Type()) - m.lowerFconst(signBitReg, signMask, _64) - nonSignBitReg := m.c.AllocateVReg(x.Type()) - m.lowerFconst(nonSignBitReg, ^signMask, _64) - - // Extract the sign bits of rn. - and := m.allocateInstr().asXmmRmR(opAnd, rn, signBitReg) - m.insert(and) - - // Clear the sign bit of dst via AND with the non-sign bit mask. - xor := m.allocateInstr().asXmmRmR(opAnd, rm, nonSignBitReg) - m.insert(xor) - - // Copy the sign bits of src to dst via OR. 
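	// At this point signBitReg = y & signMask and nonSignBitReg = x & ^signMask, so
	// the OR below yields x's magnitude combined with y's sign, i.e. copysign(x, y).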
- or := m.allocateInstr().asXmmRmR(opOr, newOperandReg(signBitReg), nonSignBitReg) - m.insert(or) - - m.copyTo(nonSignBitReg, rd) -} - -func (m *machine) lowerBitcast(instr *ssa.Instruction) { - x, dstTyp := instr.BitcastData() - srcTyp := x.Type() - rn := m.getOperand_Reg(m.c.ValueDefinition(x)) - rd := m.c.VRegOf(instr.Return()) - switch { - case srcTyp == ssa.TypeF32 && dstTyp == ssa.TypeI32: - cvt := m.allocateInstr().asXmmToGpr(sseOpcodeMovd, rn.reg(), rd, false) - m.insert(cvt) - case srcTyp == ssa.TypeI32 && dstTyp == ssa.TypeF32: - cvt := m.allocateInstr().asGprToXmm(sseOpcodeMovd, rn, rd, false) - m.insert(cvt) - case srcTyp == ssa.TypeF64 && dstTyp == ssa.TypeI64: - cvt := m.allocateInstr().asXmmToGpr(sseOpcodeMovq, rn.reg(), rd, true) - m.insert(cvt) - case srcTyp == ssa.TypeI64 && dstTyp == ssa.TypeF64: - cvt := m.allocateInstr().asGprToXmm(sseOpcodeMovq, rn, rd, true) - m.insert(cvt) - default: - panic(fmt.Sprintf("invalid bitcast from %s to %s", srcTyp, dstTyp)) - } -} - -func (m *machine) lowerFcvtToSint(ctxVReg, rn, rd regalloc.VReg, src64, dst64, sat bool) { - var tmpXmm regalloc.VReg - if dst64 { - tmpXmm = m.c.AllocateVReg(ssa.TypeF64) - } else { - tmpXmm = m.c.AllocateVReg(ssa.TypeF32) - } - - m.insert(m.allocateInstr().asDefineUninitializedReg(tmpXmm)) - tmpGp, tmpGp2 := m.c.AllocateVReg(ssa.TypeI64), m.c.AllocateVReg(ssa.TypeI64) - m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp)) - m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp2)) - - m.insert(m.allocateFcvtToSintSequence(ctxVReg, rn, tmpGp, tmpGp2, tmpXmm, src64, dst64, sat)) - m.copyTo(tmpGp, rd) -} - -func (m *machine) lowerFcvtToSintSequenceAfterRegalloc(i *instruction) { - execCtx, src, tmpGp, tmpGp2, tmpXmm, src64, dst64, sat := i.fcvtToSintSequenceData() - var cmpOp, truncOp sseOpcode - if src64 { - cmpOp, truncOp = sseOpcodeUcomisd, sseOpcodeCvttsd2si - } else { - cmpOp, truncOp = sseOpcodeUcomiss, sseOpcodeCvttss2si - } - - trunc := m.allocateInstr() - trunc.asXmmToGpr(truncOp, src, tmpGp, dst64) - m.insert(trunc) - - // Check if the dst operand was INT_MIN, by checking it against 1. - cmp1 := m.allocateInstr() - cmp1.asCmpRmiR(true, newOperandImm32(1), tmpGp, dst64) - m.insert(cmp1) - - // If no overflow, then we are done. - doneTarget, done := m.allocateBrTarget() - ifNoOverflow := m.allocateInstr() - ifNoOverflow.asJmpIf(condNO, newOperandLabel(done)) - m.insert(ifNoOverflow) - - // Now, check for NaN. - cmpNan := m.allocateInstr() - cmpNan.asXmmCmpRmR(cmpOp, newOperandReg(src), src) - m.insert(cmpNan) - - // We allocate the "non-nan target" here, but we will insert it later. - notNanTarget, notNaN := m.allocateBrTarget() - ifNotNan := m.allocateInstr() - ifNotNan.asJmpIf(condNP, newOperandLabel(notNaN)) - m.insert(ifNotNan) - - if sat { - // If NaN and saturating, return 0. - zeroDst := m.allocateInstr().asZeros(tmpGp) - m.insert(zeroDst) - - jmpEnd := m.allocateInstr() - jmpEnd.asJmp(newOperandLabel(done)) - m.insert(jmpEnd) - - // Otherwise: - m.insert(notNanTarget) - - // Zero-out the tmp register. - zero := m.allocateInstr().asZeros(tmpXmm) - m.insert(zero) - - cmpXmm := m.allocateInstr().asXmmCmpRmR(cmpOp, newOperandReg(tmpXmm), src) - m.insert(cmpXmm) - - // if >= jump to end. - jmpEnd2 := m.allocateInstr() - jmpEnd2.asJmpIf(condB, newOperandLabel(done)) - m.insert(jmpEnd2) - - // Otherwise, saturate to INT_MAX. - if dst64 { - m.lowerIconst(tmpGp, math.MaxInt64, dst64) - } else { - m.lowerIconst(tmpGp, math.MaxInt32, dst64) - } - - } else { - - // If non-sat, NaN, trap. 
- m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeInvalidConversionToInteger) - - // Otherwise, we will jump here. - m.insert(notNanTarget) - - // jump over trap if src larger than threshold - condAboveThreshold := condNB - - // The magic constants are various combination of minInt for int[32|64] represented as float[32|64]. - var minInt uint64 - switch { - case src64 && dst64: - minInt = 0xc3e0000000000000 - case src64 && !dst64: - condAboveThreshold = condNBE - minInt = 0xC1E0_0000_0020_0000 - case !src64 && dst64: - minInt = 0xDF00_0000 - case !src64 && !dst64: - minInt = 0xCF00_0000 - } - - loadToGP := m.allocateInstr().asImm(tmpGp2, minInt, src64) - m.insert(loadToGP) - - movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp2), tmpXmm, src64) - m.insert(movToXmm) - - cmpXmm := m.allocateInstr().asXmmCmpRmR(cmpOp, newOperandReg(tmpXmm), src) - m.insert(cmpXmm) - - jmpIfLarger := m.allocateInstr() - checkPositiveTarget, checkPositive := m.allocateBrTarget() - jmpIfLarger.asJmpIf(condAboveThreshold, newOperandLabel(checkPositive)) - m.insert(jmpIfLarger) - - m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) - - // If positive, it was a real overflow. - m.insert(checkPositiveTarget) - - // Zero out the temp register. - xorpd := m.allocateInstr() - xorpd.asXmmRmR(sseOpcodeXorpd, newOperandReg(tmpXmm), tmpXmm) - m.insert(xorpd) - - pos := m.allocateInstr() - pos.asXmmCmpRmR(cmpOp, newOperandReg(src), tmpXmm) - m.insert(pos) - - // If >= jump to end. - jmp := m.allocateInstr().asJmpIf(condNB, newOperandLabel(done)) - m.insert(jmp) - m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) - } - - m.insert(doneTarget) -} - -func (m *machine) lowerFcvtToUint(ctxVReg, rn, rd regalloc.VReg, src64, dst64, sat bool) { - tmpXmm, tmpXmm2 := m.c.AllocateVReg(ssa.TypeF64), m.c.AllocateVReg(ssa.TypeF64) - m.insert(m.allocateInstr().asDefineUninitializedReg(tmpXmm)) - m.insert(m.allocateInstr().asDefineUninitializedReg(tmpXmm2)) - tmpGp, tmpGp2 := m.c.AllocateVReg(ssa.TypeI64), m.c.AllocateVReg(ssa.TypeI64) - m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp)) - m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp2)) - - m.insert(m.allocateFcvtToUintSequence( - ctxVReg, rn, tmpGp, tmpGp2, tmpXmm, tmpXmm2, src64, dst64, sat, - )) - m.copyTo(tmpGp, rd) -} - -func (m *machine) lowerFcvtToUintSequenceAfterRegalloc(i *instruction) { - execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2, src64, dst64, sat := i.fcvtToUintSequenceData() - - var subOp, cmpOp, truncOp sseOpcode - if src64 { - subOp, cmpOp, truncOp = sseOpcodeSubsd, sseOpcodeUcomisd, sseOpcodeCvttsd2si - } else { - subOp, cmpOp, truncOp = sseOpcodeSubss, sseOpcodeUcomiss, sseOpcodeCvttss2si - } - - doneTarget, done := m.allocateBrTarget() - - switch { - case src64 && dst64: - loadToGP := m.allocateInstr().asImm(tmpGp, 0x43e0000000000000, true) - m.insert(loadToGP) - movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, true) - m.insert(movToXmm) - case src64 && !dst64: - loadToGP := m.allocateInstr().asImm(tmpGp, 0x41e0000000000000, true) - m.insert(loadToGP) - movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, true) - m.insert(movToXmm) - case !src64 && dst64: - loadToGP := m.allocateInstr().asImm(tmpGp, 0x5f000000, false) - m.insert(loadToGP) - movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, false) - m.insert(movToXmm) - case !src64 && !dst64: - loadToGP := m.allocateInstr().asImm(tmpGp, 
0x4f000000, false) - m.insert(loadToGP) - movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, false) - m.insert(movToXmm) - } - - cmp := m.allocateInstr() - cmp.asXmmCmpRmR(cmpOp, newOperandReg(tmpXmm), src) - m.insert(cmp) - - // If above `tmp` ("large threshold"), jump to `ifAboveThreshold` - ifAboveThresholdTarget, ifAboveThreshold := m.allocateBrTarget() - jmpIfAboveThreshold := m.allocateInstr() - jmpIfAboveThreshold.asJmpIf(condNB, newOperandLabel(ifAboveThreshold)) - m.insert(jmpIfAboveThreshold) - - ifNotNaNTarget, ifNotNaN := m.allocateBrTarget() - jmpIfNotNaN := m.allocateInstr() - jmpIfNotNaN.asJmpIf(condNP, newOperandLabel(ifNotNaN)) - m.insert(jmpIfNotNaN) - - // If NaN, handle the error condition. - if sat { - // On NaN, saturating, we just return 0. - zeros := m.allocateInstr().asZeros(tmpGp) - m.insert(zeros) - - jmpEnd := m.allocateInstr() - jmpEnd.asJmp(newOperandLabel(done)) - m.insert(jmpEnd) - } else { - // On NaN, non-saturating, we trap. - m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeInvalidConversionToInteger) - } - - // If not NaN, land here. - m.insert(ifNotNaNTarget) - - // Truncation happens here. - - trunc := m.allocateInstr() - trunc.asXmmToGpr(truncOp, src, tmpGp, dst64) - m.insert(trunc) - - // Check if the result is negative. - cmpNeg := m.allocateInstr() - cmpNeg.asCmpRmiR(true, newOperandImm32(0), tmpGp, dst64) - m.insert(cmpNeg) - - // If non-neg, jump to end. - jmpIfNonNeg := m.allocateInstr() - jmpIfNonNeg.asJmpIf(condNL, newOperandLabel(done)) - m.insert(jmpIfNonNeg) - - if sat { - // If the input was "small" (< 2**(width -1)), the only way to get an integer - // overflow is because the input was too small: saturate to the min value, i.e. 0. - zeros := m.allocateInstr().asZeros(tmpGp) - m.insert(zeros) - - jmpEnd := m.allocateInstr() - jmpEnd.asJmp(newOperandLabel(done)) - m.insert(jmpEnd) - } else { - // If not saturating, trap. - m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) - } - - // If above the threshold, land here. - m.insert(ifAboveThresholdTarget) - - // tmpDiff := threshold - rn. - copySrc := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(src), tmpXmm2) - m.insert(copySrc) - - sub := m.allocateInstr() - sub.asXmmRmR(subOp, newOperandReg(tmpXmm), tmpXmm2) // must be -0x8000000000000000 - m.insert(sub) - - trunc2 := m.allocateInstr() - trunc2.asXmmToGpr(truncOp, tmpXmm2, tmpGp, dst64) - m.insert(trunc2) - - // Check if the result is negative. - cmpNeg2 := m.allocateInstr().asCmpRmiR(true, newOperandImm32(0), tmpGp, dst64) - m.insert(cmpNeg2) - - ifNextLargeTarget, ifNextLarge := m.allocateBrTarget() - jmpIfNextLarge := m.allocateInstr() - jmpIfNextLarge.asJmpIf(condNL, newOperandLabel(ifNextLarge)) - m.insert(jmpIfNextLarge) - - if sat { - // The input was "large" (>= maxInt), so the only way to get an integer - // overflow is because the input was too large: saturate to the max value. - var maxInt uint64 - if dst64 { - maxInt = math.MaxUint64 - } else { - maxInt = math.MaxUint32 - } - m.lowerIconst(tmpGp, maxInt, dst64) - - jmpToEnd := m.allocateInstr() - jmpToEnd.asJmp(newOperandLabel(done)) - m.insert(jmpToEnd) - } else { - // If not saturating, trap. 
- m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) - } - - m.insert(ifNextLargeTarget) - - var op operand - if dst64 { - m.lowerIconst(tmpGp2, 0x8000000000000000, true) - op = newOperandReg(tmpGp2) - } else { - op = newOperandImm32(0x80000000) - } - - add := m.allocateInstr() - add.asAluRmiR(aluRmiROpcodeAdd, op, tmpGp, dst64) - m.insert(add) - - m.insert(doneTarget) -} - -func (m *machine) lowerFcvtFromSint(rn, rd operand, src64, dst64 bool) { - var op sseOpcode - if dst64 { - op = sseOpcodeCvtsi2sd - } else { - op = sseOpcodeCvtsi2ss - } - - trunc := m.allocateInstr() - trunc.asGprToXmm(op, rn, rd.reg(), src64) - m.insert(trunc) -} - -func (m *machine) lowerFcvtFromUint(rn, rd operand, src64, dst64 bool) { - var op sseOpcode - if dst64 { - op = sseOpcodeCvtsi2sd - } else { - op = sseOpcodeCvtsi2ss - } - - // Src is 32 bit, then we just perform the conversion with 64 bit width. - // - // See the following link for why we use 64bit conversion for unsigned 32bit integer sources: - // https://stackoverflow.com/questions/41495498/fpu-operations-generated-by-gcc-during-casting-integer-to-float. - // - // Here's the summary: - // >> CVTSI2SS is indeed designed for converting a signed integer to a scalar single-precision float, - // >> not an unsigned integer like you have here. So what gives? Well, a 64-bit processor has 64-bit wide - // >> registers available, so the unsigned 32-bit input values can be stored as signed 64-bit intermediate values, - // >> which allows CVTSI2SS to be used after all. - // - if !src64 { - // Before we convert, we have to clear the higher 32-bits of the 64-bit register - // to get the correct result. - tmp := m.c.AllocateVReg(ssa.TypeI32) - m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, rn, tmp)) - m.insert(m.allocateInstr().asGprToXmm(op, newOperandReg(tmp), rd.reg(), true)) - return - } - - // If uint64, we have to do a bit more work. - endTarget, end := m.allocateBrTarget() - - var tmpXmm regalloc.VReg - if dst64 { - tmpXmm = m.c.AllocateVReg(ssa.TypeF64) - } else { - tmpXmm = m.c.AllocateVReg(ssa.TypeF32) - } - - // Check if the most significant bit (sign bit) is set. - test := m.allocateInstr() - test.asCmpRmiR(false, rn, rn.reg(), src64) - m.insert(test) - - // Jump if the sign bit is set. - ifSignTarget, ifSign := m.allocateBrTarget() - jmpIfNeg := m.allocateInstr() - jmpIfNeg.asJmpIf(condS, newOperandLabel(ifSign)) - m.insert(jmpIfNeg) - - // If the sign bit is not set, we could fit the unsigned int into float32/float64. - // So, we convert it to float and emit jump instruction to exit from this branch. - cvt := m.allocateInstr() - cvt.asGprToXmm(op, rn, tmpXmm, src64) - m.insert(cvt) - - // We are done, jump to end. - jmpEnd := m.allocateInstr() - jmpEnd.asJmp(newOperandLabel(end)) - m.insert(jmpEnd) - - // Now handling the case where sign-bit is set. 
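	// The idea: a uint64 whose top bit is set does not fit in a signed 64-bit
	// integer, so we halve it first (logical shift right by 1), OR the dropped low
	// bit back in so that the final doubling rounds the same way a direct conversion
	// would, convert the now-positive value as signed, and then double the float
	// (x + x) to restore the original magnitude.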
- // We emit the following sequences: - // mov %rn, %tmp - // shr 1, %tmp - // mov %rn, %tmp2 - // and 1, %tmp2 - // or %tmp2, %tmp - // cvtsi2ss %tmp, %xmm0 - // addsd %xmm0, %xmm0 - m.insert(ifSignTarget) - - tmp := m.copyToTmp(rn.reg()) - shr := m.allocateInstr() - shr.asShiftR(shiftROpShiftRightLogical, newOperandImm32(1), tmp, src64) - m.insert(shr) - - tmp2 := m.copyToTmp(rn.reg()) - and := m.allocateInstr() - and.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), tmp2, src64) - m.insert(and) - - or := m.allocateInstr() - or.asAluRmiR(aluRmiROpcodeOr, newOperandReg(tmp2), tmp, src64) - m.insert(or) - - cvt2 := m.allocateInstr() - cvt2.asGprToXmm(op, newOperandReg(tmp), tmpXmm, src64) - m.insert(cvt2) - - addsd := m.allocateInstr() - if dst64 { - addsd.asXmmRmR(sseOpcodeAddsd, newOperandReg(tmpXmm), tmpXmm) - } else { - addsd.asXmmRmR(sseOpcodeAddss, newOperandReg(tmpXmm), tmpXmm) - } - m.insert(addsd) - - m.insert(endTarget) - m.copyTo(tmpXmm, rd.reg()) -} - -func (m *machine) lowerVanyTrue(instr *ssa.Instruction) { - x := instr.Arg() - rm := m.getOperand_Reg(m.c.ValueDefinition(x)) - rd := m.c.VRegOf(instr.Return()) - - tmp := m.c.AllocateVReg(ssa.TypeI32) - - cmp := m.allocateInstr() - cmp.asXmmCmpRmR(sseOpcodePtest, rm, rm.reg()) - m.insert(cmp) - - setcc := m.allocateInstr() - setcc.asSetcc(condNZ, tmp) - m.insert(setcc) - - // Clear the irrelevant bits. - and := m.allocateInstr() - and.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), tmp, false) - m.insert(and) - - m.copyTo(tmp, rd) -} - -func (m *machine) lowerVallTrue(instr *ssa.Instruction) { - x, lane := instr.ArgWithLane() - var op sseOpcode - switch lane { - case ssa.VecLaneI8x16: - op = sseOpcodePcmpeqb - case ssa.VecLaneI16x8: - op = sseOpcodePcmpeqw - case ssa.VecLaneI32x4: - op = sseOpcodePcmpeqd - case ssa.VecLaneI64x2: - op = sseOpcodePcmpeqq - } - rm := m.getOperand_Reg(m.c.ValueDefinition(x)) - rd := m.c.VRegOf(instr.Return()) - - tmp := m.c.AllocateVReg(ssa.TypeV128) - - zeros := m.allocateInstr() - zeros.asZeros(tmp) - m.insert(zeros) - - pcmp := m.allocateInstr() - pcmp.asXmmRmR(op, rm, tmp) - m.insert(pcmp) - - test := m.allocateInstr() - test.asXmmCmpRmR(sseOpcodePtest, newOperandReg(tmp), tmp) - m.insert(test) - - tmp2 := m.c.AllocateVReg(ssa.TypeI32) - - setcc := m.allocateInstr() - setcc.asSetcc(condZ, tmp2) - m.insert(setcc) - - // Clear the irrelevant bits. 
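	// SETcc writes only the low 8 bits of its destination and leaves the upper bits
	// of the 32-bit register untouched, so masking with 1 normalizes the boolean
	// result to exactly 0 or 1.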
- and := m.allocateInstr() - and.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), tmp2, false) - m.insert(and) - - m.copyTo(tmp2, rd) -} - -func (m *machine) lowerVhighBits(instr *ssa.Instruction) { - x, lane := instr.ArgWithLane() - rm := m.getOperand_Reg(m.c.ValueDefinition(x)) - rd := m.c.VRegOf(instr.Return()) - switch lane { - case ssa.VecLaneI8x16: - mov := m.allocateInstr() - mov.asXmmToGpr(sseOpcodePmovmskb, rm.reg(), rd, false) - m.insert(mov) - - case ssa.VecLaneI16x8: - // When we have: - // R1 = [R1(w1), R1(w2), R1(w3), R1(w4), R1(w5), R1(w6), R1(w7), R1(v8)] - // R2 = [R2(w1), R2(w2), R2(w3), R2(v4), R2(w5), R2(w6), R2(w7), R2(v8)] - // where RX(wn) is n-th signed word (16-bit) of RX register, - // - // "PACKSSWB R1, R2" produces - // R1 = [ - // byte_sat(R1(w1)), byte_sat(R1(w2)), byte_sat(R1(w3)), byte_sat(R1(w4)), - // byte_sat(R1(w5)), byte_sat(R1(w6)), byte_sat(R1(w7)), byte_sat(R1(w8)), - // byte_sat(R2(w1)), byte_sat(R2(w2)), byte_sat(R2(w3)), byte_sat(R2(w4)), - // byte_sat(R2(w5)), byte_sat(R2(w6)), byte_sat(R2(w7)), byte_sat(R2(w8)), - // ] - // where R1 is the destination register, and - // byte_sat(w) = int8(w) if w fits as signed 8-bit, - // 0x80 if w is less than 0x80 - // 0x7F if w is greater than 0x7f - // - // See https://www.felixcloutier.com/x86/packsswb:packssdw for detail. - // - // Therefore, v.register ends up having i-th and (i+8)-th bit set if i-th lane is negative (for i in 0..8). - tmp := m.copyToTmp(rm.reg()) - res := m.c.AllocateVReg(ssa.TypeI32) - - pak := m.allocateInstr() - pak.asXmmRmR(sseOpcodePacksswb, rm, tmp) - m.insert(pak) - - mov := m.allocateInstr() - mov.asXmmToGpr(sseOpcodePmovmskb, tmp, res, false) - m.insert(mov) - - // Clear the higher bits than 8. - shr := m.allocateInstr() - shr.asShiftR(shiftROpShiftRightLogical, newOperandImm32(8), res, false) - m.insert(shr) - - m.copyTo(res, rd) - - case ssa.VecLaneI32x4: - mov := m.allocateInstr() - mov.asXmmToGpr(sseOpcodeMovmskps, rm.reg(), rd, true) - m.insert(mov) - - case ssa.VecLaneI64x2: - mov := m.allocateInstr() - mov.asXmmToGpr(sseOpcodeMovmskpd, rm.reg(), rd, true) - m.insert(mov) - } -} - -func (m *machine) lowerVbnot(instr *ssa.Instruction) { - x := instr.Arg() - xDef := m.c.ValueDefinition(x) - rm := m.getOperand_Reg(xDef) - rd := m.c.VRegOf(instr.Return()) - - tmp := m.copyToTmp(rm.reg()) - tmp2 := m.c.AllocateVReg(ssa.TypeV128) - - // Ensure tmp2 is considered defined by regalloc. - m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2)) - - // Set all bits on tmp register. - pak := m.allocateInstr() - pak.asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp2), tmp2) - m.insert(pak) - - // Then XOR with tmp to reverse all bits on v.register. 
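	// SSE has no vector NOT instruction, so the negation is computed as tmp XOR
	// all-ones; the all-ones pattern comes from the PCMPEQD tmp2, tmp2 above
	// (comparing a register with itself sets every lane to all 1s).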
- xor := m.allocateInstr() - xor.asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp) - m.insert(xor) - - m.copyTo(tmp, rd) -} - -func (m *machine) lowerSplat(x, ret ssa.Value, lane ssa.VecLane) { - tmpDst := m.c.AllocateVReg(ssa.TypeV128) - m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst)) - - switch lane { - case ssa.VecLaneI8x16: - tmp := m.c.AllocateVReg(ssa.TypeV128) - m.insert(m.allocateInstr().asDefineUninitializedReg(tmp)) - xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, 0, xx, tmpDst)) - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp)) - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpDst)) - case ssa.VecLaneI16x8: - xx := m.getOperand_Reg(m.c.ValueDefinition(x)) - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 0, xx, tmpDst)) - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 1, xx, tmpDst)) - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst)) - case ssa.VecLaneI32x4: - xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, 0, xx, tmpDst)) - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst)) - case ssa.VecLaneI64x2: - xx := m.getOperand_Reg(m.c.ValueDefinition(x)) - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, xx, tmpDst)) - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, xx, tmpDst)) - case ssa.VecLaneF32x4: - xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeInsertps, 0, xx, tmpDst)) - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst)) - case ssa.VecLaneF64x2: - xx := m.getOperand_Reg(m.c.ValueDefinition(x)) - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, xx, tmpDst)) - m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMovlhps, xx, tmpDst)) - default: - panic(fmt.Sprintf("invalid lane type: %s", lane)) - } - - m.copyTo(tmpDst, m.c.VRegOf(ret)) -} - -func (m *machine) lowerShuffle(x, y ssa.Value, lo, hi uint64, ret ssa.Value) { - var xMask, yMask [2]uint64 - for i := 0; i < 8; i++ { - loLane := byte(lo >> (i * 8)) - if loLane < 16 { - xMask[0] |= uint64(loLane) << (i * 8) - yMask[0] |= uint64(0x80) << (i * 8) - } else { - xMask[0] |= uint64(0x80) << (i * 8) - yMask[0] |= uint64(loLane-16) << (i * 8) - } - hiLane := byte(hi >> (i * 8)) - if hiLane < 16 { - xMask[1] |= uint64(hiLane) << (i * 8) - yMask[1] |= uint64(0x80) << (i * 8) - } else { - xMask[1] |= uint64(0x80) << (i * 8) - yMask[1] |= uint64(hiLane-16) << (i * 8) - } - } - - xl, xmaskPos := m.allocateLabel() - m.consts = append(m.consts, _const{lo: xMask[0], hi: xMask[1], label: xl, labelPos: xmaskPos}) - yl, ymaskPos := m.allocateLabel() - m.consts = append(m.consts, _const{lo: yMask[0], hi: yMask[1], label: yl, labelPos: ymaskPos}) - - xx, yy := m.getOperand_Reg(m.c.ValueDefinition(x)), m.getOperand_Reg(m.c.ValueDefinition(y)) - tmpX, tmpY := m.copyToTmp(xx.reg()), m.copyToTmp(yy.reg()) - - // Apply mask to X. - tmp := m.c.AllocateVReg(ssa.TypeV128) - loadMaskLo := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(xl)), tmp) - m.insert(loadMaskLo) - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpX)) - - // Apply mask to Y. 
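	// PSHUFB selects each destination byte by the low 4 bits of the corresponding
	// mask byte and zeroes any lane whose mask byte has its top bit (0x80) set.
	// The X mask therefore keeps the lanes taken from x (indices < 16) and blanks the
	// rest, the Y mask keeps the lanes taken from y (indices rebased by -16), and the
	// OR below assembles the two shuffled halves into the final 16-byte result.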
- loadMaskHi := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(yl)), tmp) - m.insert(loadMaskHi) - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpY)) - - // Combine the results. - m.insert(m.allocateInstr().asXmmRmR(sseOpcodeOrps, newOperandReg(tmpX), tmpY)) - - m.copyTo(tmpY, m.c.VRegOf(ret)) -} - -func (m *machine) lowerVbBinOpUnaligned(op sseOpcode, x, y, ret ssa.Value) { - rn := m.getOperand_Reg(m.c.ValueDefinition(x)) - rm := m.getOperand_Reg(m.c.ValueDefinition(y)) - rd := m.c.VRegOf(ret) - - tmp := m.copyToTmp(rn.reg()) - - binOp := m.allocateInstr() - binOp.asXmmRmR(op, rm, tmp) - m.insert(binOp) - - m.copyTo(tmp, rd) -} - -func (m *machine) lowerVbBinOp(op sseOpcode, x, y, ret ssa.Value) { - rn := m.getOperand_Reg(m.c.ValueDefinition(x)) - rm := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) - rd := m.c.VRegOf(ret) - - tmp := m.copyToTmp(rn.reg()) - - binOp := m.allocateInstr() - binOp.asXmmRmR(op, rm, tmp) - m.insert(binOp) - - m.copyTo(tmp, rd) -} - -func (m *machine) lowerVFcmp(x, y ssa.Value, c ssa.FloatCmpCond, ret ssa.Value, lane ssa.VecLane) { - var cmpOp sseOpcode - switch lane { - case ssa.VecLaneF32x4: - cmpOp = sseOpcodeCmpps - case ssa.VecLaneF64x2: - cmpOp = sseOpcodeCmppd - default: - panic(fmt.Sprintf("invalid lane type: %s", lane)) - } - - xx, yy := m.c.ValueDefinition(x), m.c.ValueDefinition(y) - var cmpImm cmpPred - switch c { - case ssa.FloatCmpCondGreaterThan: - yy, xx = xx, yy - cmpImm = cmpPredLT_OS - case ssa.FloatCmpCondGreaterThanOrEqual: - yy, xx = xx, yy - cmpImm = cmpPredLE_OS - case ssa.FloatCmpCondEqual: - cmpImm = cmpPredEQ_OQ - case ssa.FloatCmpCondNotEqual: - cmpImm = cmpPredNEQ_UQ - case ssa.FloatCmpCondLessThan: - cmpImm = cmpPredLT_OS - case ssa.FloatCmpCondLessThanOrEqual: - cmpImm = cmpPredLE_OS - default: - panic(fmt.Sprintf("invalid float comparison condition: %s", c)) - } - - tmp := m.c.AllocateVReg(ssa.TypeV128) - xxx := m.getOperand_Mem_Reg(xx) - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, xxx, tmp)) - - rm := m.getOperand_Mem_Reg(yy) - m.insert(m.allocateInstr().asXmmRmRImm(cmpOp, byte(cmpImm), rm, tmp)) - - m.copyTo(tmp, m.c.VRegOf(ret)) -} - -func (m *machine) lowerVIcmp(x, y ssa.Value, c ssa.IntegerCmpCond, ret ssa.Value, lane ssa.VecLane) { - var eq, gt, maxu, minu, mins sseOpcode - switch lane { - case ssa.VecLaneI8x16: - eq, gt, maxu, minu, mins = sseOpcodePcmpeqb, sseOpcodePcmpgtb, sseOpcodePmaxub, sseOpcodePminub, sseOpcodePminsb - case ssa.VecLaneI16x8: - eq, gt, maxu, minu, mins = sseOpcodePcmpeqw, sseOpcodePcmpgtw, sseOpcodePmaxuw, sseOpcodePminuw, sseOpcodePminsw - case ssa.VecLaneI32x4: - eq, gt, maxu, minu, mins = sseOpcodePcmpeqd, sseOpcodePcmpgtd, sseOpcodePmaxud, sseOpcodePminud, sseOpcodePminsd - case ssa.VecLaneI64x2: - eq, gt = sseOpcodePcmpeqq, sseOpcodePcmpgtq - default: - panic(fmt.Sprintf("invalid lane type: %s", lane)) - } - - tmp := m.c.AllocateVReg(ssa.TypeV128) - var op operand - switch c { - case ssa.IntegerCmpCondSignedLessThanOrEqual: - if lane == ssa.VecLaneI64x2 { - x := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) - // Copy x to tmp. - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, x, tmp)) - op = m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) - } else { - y := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) - // Copy y to tmp. 
- m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, y, tmp)) - op = m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) - } - case ssa.IntegerCmpCondSignedGreaterThanOrEqual: - if lane == ssa.VecLaneI64x2 { - y := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) - // Copy y to tmp. - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, y, tmp)) - op = m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) - } else { - x := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) - // Copy x to tmp. - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, x, tmp)) - op = m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) - } - case ssa.IntegerCmpCondSignedLessThan, ssa.IntegerCmpCondUnsignedLessThan, ssa.IntegerCmpCondUnsignedLessThanOrEqual: - y := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) - // Copy y to tmp. - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, y, tmp)) - op = m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) - default: - x := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) - // Copy x to tmp. - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, x, tmp)) - op = m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) - } - - switch c { - case ssa.IntegerCmpCondEqual: - m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) - case ssa.IntegerCmpCondNotEqual: - // First we compare for equality. - m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) - // Then flip the bits. To do so, we set all bits on tmp2. - tmp2 := m.c.AllocateVReg(ssa.TypeV128) - m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2)) - m.insert(m.allocateInstr().asXmmRmR(eq, newOperandReg(tmp2), tmp2)) - // And then xor with tmp. - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp)) - case ssa.IntegerCmpCondSignedGreaterThan, ssa.IntegerCmpCondSignedLessThan: - m.insert(m.allocateInstr().asXmmRmR(gt, op, tmp)) - case ssa.IntegerCmpCondSignedGreaterThanOrEqual, ssa.IntegerCmpCondSignedLessThanOrEqual: - if lane == ssa.VecLaneI64x2 { - m.insert(m.allocateInstr().asXmmRmR(gt, op, tmp)) - // Then flip the bits. To do so, we set all bits on tmp2. - tmp2 := m.c.AllocateVReg(ssa.TypeV128) - m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2)) - m.insert(m.allocateInstr().asXmmRmR(eq, newOperandReg(tmp2), tmp2)) - // And then xor with tmp. - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp)) - } else { - // First take min of x and y. - m.insert(m.allocateInstr().asXmmRmR(mins, op, tmp)) - // Then compare for equality. - m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) - } - case ssa.IntegerCmpCondUnsignedGreaterThan, ssa.IntegerCmpCondUnsignedLessThan: - // First maxu of x and y. - m.insert(m.allocateInstr().asXmmRmR(maxu, op, tmp)) - // Then compare for equality. - m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) - // Then flip the bits. To do so, we set all bits on tmp2. - tmp2 := m.c.AllocateVReg(ssa.TypeV128) - m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2)) - m.insert(m.allocateInstr().asXmmRmR(eq, newOperandReg(tmp2), tmp2)) - // And then xor with tmp. 
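	// Net effect: x >u y is computed as NOT(max_u(x, y) == y), and x <u y as
	// NOT(max_u(x, y) == x), since SSE offers PCMPGT only for signed comparisons.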
- m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp)) - case ssa.IntegerCmpCondUnsignedGreaterThanOrEqual, ssa.IntegerCmpCondUnsignedLessThanOrEqual: - m.insert(m.allocateInstr().asXmmRmR(minu, op, tmp)) - m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) - default: - panic("BUG") - } - - m.copyTo(tmp, m.c.VRegOf(ret)) -} - -func (m *machine) lowerVbandnot(instr *ssa.Instruction, op sseOpcode) { - x, y := instr.Arg2() - xDef := m.c.ValueDefinition(x) - yDef := m.c.ValueDefinition(y) - rm, rn := m.getOperand_Reg(xDef), m.getOperand_Reg(yDef) - rd := m.c.VRegOf(instr.Return()) - - tmp := m.copyToTmp(rn.reg()) - - // pandn between rn, rm. - pand := m.allocateInstr() - pand.asXmmRmR(sseOpcodePandn, rm, tmp) - m.insert(pand) - - m.copyTo(tmp, rd) -} - -func (m *machine) lowerVbitselect(instr *ssa.Instruction) { - c, x, y := instr.SelectData() - xDef := m.c.ValueDefinition(x) - yDef := m.c.ValueDefinition(y) - rm, rn := m.getOperand_Reg(xDef), m.getOperand_Reg(yDef) - creg := m.getOperand_Reg(m.c.ValueDefinition(c)) - rd := m.c.VRegOf(instr.Return()) - - tmpC := m.copyToTmp(creg.reg()) - tmpX := m.copyToTmp(rm.reg()) - - // And between c, x (overwrites x). - pand := m.allocateInstr() - pand.asXmmRmR(sseOpcodePand, creg, tmpX) - m.insert(pand) - - // Andn between y, c (overwrites c). - pandn := m.allocateInstr() - pandn.asXmmRmR(sseOpcodePandn, rn, tmpC) - m.insert(pandn) - - por := m.allocateInstr() - por.asXmmRmR(sseOpcodePor, newOperandReg(tmpC), tmpX) - m.insert(por) - - m.copyTo(tmpX, rd) -} - -func (m *machine) lowerVFmin(instr *ssa.Instruction) { - x, y, lane := instr.Arg2WithLane() - rn := m.getOperand_Reg(m.c.ValueDefinition(x)) - rm := m.getOperand_Reg(m.c.ValueDefinition(y)) - rd := m.c.VRegOf(instr.Return()) - - var min, cmp, andn, or, srl /* shift right logical */ sseOpcode - var shiftNumToInverseNaN uint32 - if lane == ssa.VecLaneF32x4 { - min, cmp, andn, or, srl, shiftNumToInverseNaN = sseOpcodeMinps, sseOpcodeCmpps, sseOpcodeAndnps, sseOpcodeOrps, sseOpcodePsrld, 0xa - } else { - min, cmp, andn, or, srl, shiftNumToInverseNaN = sseOpcodeMinpd, sseOpcodeCmppd, sseOpcodeAndnpd, sseOpcodeOrpd, sseOpcodePsrlq, 0xd - } - - tmp1 := m.copyToTmp(rn.reg()) - tmp2 := m.copyToTmp(rm.reg()) - - // tmp1=min(rn, rm) - minIns1 := m.allocateInstr() - minIns1.asXmmRmR(min, rn, tmp2) - m.insert(minIns1) - - // tmp2=min(rm, rn) - minIns2 := m.allocateInstr() - minIns2.asXmmRmR(min, rm, tmp1) - m.insert(minIns2) - - // tmp3:=tmp1=min(rn, rm) - tmp3 := m.copyToTmp(tmp1) - - // tmp1 = -0 if (rn == -0 || rm == -0) && rn != NaN && rm !=NaN - // NaN if rn == NaN || rm == NaN - // min(rm, rm) otherwise - orIns := m.allocateInstr() - orIns.asXmmRmR(or, newOperandReg(tmp2), tmp1) - m.insert(orIns) - - // tmp3 is originally min(rn,rm). 
- // tmp3 = 0^ (set all bits) if rn == NaN || rm == NaN - // 0 otherwise - cmpIns := m.allocateInstr() - cmpIns.asXmmRmRImm(cmp, uint8(cmpPredUNORD_Q), newOperandReg(tmp2), tmp3) - m.insert(cmpIns) - - // tmp1 = -0 if (rn == -0 || rm == -0) && rn != NaN && rm !=NaN - // ^0 if rn == NaN || rm == NaN - // min(v1, v2) otherwise - orIns2 := m.allocateInstr() - orIns2.asXmmRmR(or, newOperandReg(tmp3), tmp1) - m.insert(orIns2) - - // tmp3 = set all bits on the mantissa bits - // 0 otherwise - shift := m.allocateInstr() - shift.asXmmRmiReg(srl, newOperandImm32(shiftNumToInverseNaN), tmp3) - m.insert(shift) - - // tmp3 = tmp1 and !tmp3 - // = -0 if (rn == -0 || rm == -0) && rn != NaN && rm !=NaN - // set all bits on exponential and sign bit (== NaN) if rn == NaN || rm == NaN - // min(rn, rm) otherwise - andnIns := m.allocateInstr() - andnIns.asXmmRmR(andn, newOperandReg(tmp1), tmp3) - m.insert(andnIns) - - m.copyTo(tmp3, rd) -} - -func (m *machine) lowerVFmax(instr *ssa.Instruction) { - x, y, lane := instr.Arg2WithLane() - rn := m.getOperand_Reg(m.c.ValueDefinition(x)) - rm := m.getOperand_Reg(m.c.ValueDefinition(y)) - rd := m.c.VRegOf(instr.Return()) - - var max, cmp, andn, or, xor, sub, srl /* shift right logical */ sseOpcode - var shiftNumToInverseNaN uint32 - if lane == ssa.VecLaneF32x4 { - max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = sseOpcodeMaxps, sseOpcodeCmpps, sseOpcodeAndnps, sseOpcodeOrps, sseOpcodeXorps, sseOpcodeSubps, sseOpcodePsrld, 0xa - } else { - max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = sseOpcodeMaxpd, sseOpcodeCmppd, sseOpcodeAndnpd, sseOpcodeOrpd, sseOpcodeXorpd, sseOpcodeSubpd, sseOpcodePsrlq, 0xd - } - - tmp0 := m.copyToTmp(rm.reg()) - tmp1 := m.copyToTmp(rn.reg()) - - // tmp0=max(rn, rm) - maxIns1 := m.allocateInstr() - maxIns1.asXmmRmR(max, rn, tmp0) - m.insert(maxIns1) - - // tmp1=max(rm, rn) - maxIns2 := m.allocateInstr() - maxIns2.asXmmRmR(max, rm, tmp1) - m.insert(maxIns2) - - // tmp2=max(rm, rn) - tmp2 := m.copyToTmp(tmp1) - - // tmp2 = -0 if (rn == -0 && rm == 0) || (rn == 0 && rm == -0) - // 0 if (rn == 0 && rm == 0) - // -0 if (rn == -0 && rm == -0) - // v1^v2 if rn == NaN || rm == NaN - // 0 otherwise - xorInstr := m.allocateInstr() - xorInstr.asXmmRmR(xor, newOperandReg(tmp0), tmp2) - m.insert(xorInstr) - // tmp1 = -0 if (rn == -0 && rm == 0) || (rn == 0 && rm == -0) - // 0 if (rn == 0 && rm == 0) - // -0 if (rn == -0 && rm == -0) - // NaN if rn == NaN || rm == NaN - // max(v1, v2) otherwise - orInstr := m.allocateInstr() - orInstr.asXmmRmR(or, newOperandReg(tmp2), tmp1) - m.insert(orInstr) - - tmp3 := m.copyToTmp(tmp1) - - // tmp3 = 0 if (rn == -0 && rm == 0) || (rn == 0 && rm == -0) || (rn == 0 && rm == 0) - // -0 if (rn == -0 && rm == -0) - // NaN if rn == NaN || rm == NaN - // max(v1, v2) otherwise - // - // Note: -0 - (-0) = 0 (!= -0) in floating point operation. 
- subIns := m.allocateInstr() - subIns.asXmmRmR(sub, newOperandReg(tmp2), tmp3) - m.insert(subIns) - - // tmp1 = 0^ if rn == NaN || rm == NaN - cmpIns := m.allocateInstr() - cmpIns.asXmmRmRImm(cmp, uint8(cmpPredUNORD_Q), newOperandReg(tmp1), tmp1) - m.insert(cmpIns) - - // tmp1 = set all bits on the mantissa bits - // 0 otherwise - shift := m.allocateInstr() - shift.asXmmRmiReg(srl, newOperandImm32(shiftNumToInverseNaN), tmp1) - m.insert(shift) - - andnIns := m.allocateInstr() - andnIns.asXmmRmR(andn, newOperandReg(tmp3), tmp1) - m.insert(andnIns) - - m.copyTo(tmp1, rd) -} - -func (m *machine) lowerVFabs(instr *ssa.Instruction) { - x, lane := instr.ArgWithLane() - rm := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) - rd := m.c.VRegOf(instr.Return()) - - tmp := m.c.AllocateVReg(ssa.TypeV128) - - def := m.allocateInstr() - def.asDefineUninitializedReg(tmp) - m.insert(def) - - // Set all bits on tmp. - pcmp := m.allocateInstr() - pcmp.asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp), tmp) - m.insert(pcmp) - - switch lane { - case ssa.VecLaneF32x4: - // Shift right packed single floats by 1 to clear the sign bits. - shift := m.allocateInstr() - shift.asXmmRmiReg(sseOpcodePsrld, newOperandImm32(1), tmp) - m.insert(shift) - // Clear the sign bit of rm. - andp := m.allocateInstr() - andp.asXmmRmR(sseOpcodeAndpd, rm, tmp) - m.insert(andp) - case ssa.VecLaneF64x2: - // Shift right packed single floats by 1 to clear the sign bits. - shift := m.allocateInstr() - shift.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(1), tmp) - m.insert(shift) - // Clear the sign bit of rm. - andp := m.allocateInstr() - andp.asXmmRmR(sseOpcodeAndps, rm, tmp) - m.insert(andp) - } - - m.copyTo(tmp, rd) -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_pro_epi_logue.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_pro_epi_logue.go deleted file mode 100644 index e53729860..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_pro_epi_logue.go +++ /dev/null @@ -1,303 +0,0 @@ -package amd64 - -import ( - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" -) - -// PostRegAlloc implements backend.Machine. -func (m *machine) PostRegAlloc() { - m.setupPrologue() - m.postRegAlloc() -} - -func (m *machine) setupPrologue() { - cur := m.rootInstr - prevInitInst := cur.next - - // At this point, we have the stack layout as follows: - // - // (high address) - // +-----------------+ <----- RBP (somewhere in the middle of the stack) - // | ....... | - // | ret Y | - // | ....... | - // | ret 0 | - // | arg X | - // | ....... | - // | arg 1 | - // | arg 0 | - // | Return Addr | - // RSP ----> +-----------------+ - // (low address) - - // First, we push the RBP, and update the RBP to the current RSP. - // - // (high address) (high address) - // RBP ----> +-----------------+ +-----------------+ - // | ....... | | ....... | - // | ret Y | | ret Y | - // | ....... | | ....... | - // | ret 0 | | ret 0 | - // | arg X | | arg X | - // | ....... | ====> | ....... 
| - // | arg 1 | | arg 1 | - // | arg 0 | | arg 0 | - // | Return Addr | | Return Addr | - // RSP ----> +-----------------+ | Caller_RBP | - // (low address) +-----------------+ <----- RSP, RBP - // - cur = m.setupRBPRSP(cur) - - if !m.stackBoundsCheckDisabled { - cur = m.insertStackBoundsCheck(m.requiredStackSize(), cur) - } - - // - // (high address) - // +-----------------+ +-----------------+ - // | ....... | | ....... | - // | ret Y | | ret Y | - // | ....... | | ....... | - // | ret 0 | | ret 0 | - // | arg X | | arg X | - // | ....... | | ....... | - // | arg 1 | | arg 1 | - // | arg 0 | | arg 0 | - // | xxxxx | | xxxxx | - // | Return Addr | | Return Addr | - // | Caller_RBP | ====> | Caller_RBP | - // RBP,RSP->+-----------------+ +-----------------+ <----- RBP - // (low address) | clobbered M | - // | clobbered 1 | - // | ........... | - // | clobbered 0 | - // +-----------------+ <----- RSP - // - if regs := m.clobberedRegs; len(regs) > 0 { - for i := range regs { - r := regs[len(regs)-1-i] // Reverse order. - if r.RegType() == regalloc.RegTypeInt { - cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(r))) - } else { - // Push the XMM register is not supported by the PUSH instruction. - cur = m.addRSP(-16, cur) - push := m.allocateInstr().asXmmMovRM( - sseOpcodeMovdqu, r, newOperandMem(m.newAmodeImmReg(0, rspVReg)), - ) - cur = linkInstr(cur, push) - } - } - } - - if size := m.spillSlotSize; size > 0 { - // Simply decrease the RSP to allocate the spill slots. - // sub $size, %rsp - cur = linkInstr(cur, m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(uint32(size)), rspVReg, true)) - - // At this point, we have the stack layout as follows: - // - // (high address) - // +-----------------+ - // | ....... | - // | ret Y | - // | ....... | - // | ret 0 | - // | arg X | - // | ....... | - // | arg 1 | - // | arg 0 | - // | ReturnAddress | - // | Caller_RBP | - // +-----------------+ <--- RBP - // | clobbered M | - // | ............ | - // | clobbered 1 | - // | clobbered 0 | - // | spill slot N | - // | ............ | - // | spill slot 0 | - // +-----------------+ <--- RSP - // (low address) - } - - linkInstr(cur, prevInitInst) -} - -// postRegAlloc does multiple things while walking through the instructions: -// 1. Inserts the epilogue code. -// 2. Removes the redundant copy instruction. -// 3. Inserts the dec/inc RSP instruction right before/after the call instruction. -// 4. Lowering that is supposed to be done after regalloc. 
-func (m *machine) postRegAlloc() { - for cur := m.rootInstr; cur != nil; cur = cur.next { - switch k := cur.kind; k { - case ret: - m.setupEpilogueAfter(cur.prev) - continue - case fcvtToSintSequence, fcvtToUintSequence: - m.pendingInstructions = m.pendingInstructions[:0] - if k == fcvtToSintSequence { - m.lowerFcvtToSintSequenceAfterRegalloc(cur) - } else { - m.lowerFcvtToUintSequenceAfterRegalloc(cur) - } - prev := cur.prev - next := cur.next - cur := prev - for _, instr := range m.pendingInstructions { - cur = linkInstr(cur, instr) - } - linkInstr(cur, next) - continue - case xmmCMov: - m.pendingInstructions = m.pendingInstructions[:0] - m.lowerXmmCmovAfterRegAlloc(cur) - prev := cur.prev - next := cur.next - cur := prev - for _, instr := range m.pendingInstructions { - cur = linkInstr(cur, instr) - } - linkInstr(cur, next) - continue - case idivRemSequence: - m.pendingInstructions = m.pendingInstructions[:0] - m.lowerIDivRemSequenceAfterRegAlloc(cur) - prev := cur.prev - next := cur.next - cur := prev - for _, instr := range m.pendingInstructions { - cur = linkInstr(cur, instr) - } - linkInstr(cur, next) - continue - case call, callIndirect: - // At this point, reg alloc is done, therefore we can safely insert dec/inc RPS instruction - // right before/after the call instruction. If this is done before reg alloc, the stack slot - // can point to the wrong location and therefore results in a wrong value. - call := cur - next := call.next - _, _, _, _, size := backend.ABIInfoFromUint64(call.u2) - if size > 0 { - dec := m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(size), rspVReg, true) - linkInstr(call.prev, dec) - linkInstr(dec, call) - inc := m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(size), rspVReg, true) - linkInstr(call, inc) - linkInstr(inc, next) - } - continue - } - - // Removes the redundant copy instruction. - if cur.IsCopy() && cur.op1.reg().RealReg() == cur.op2.reg().RealReg() { - prev, next := cur.prev, cur.next - // Remove the copy instruction. - prev.next = next - if next != nil { - next.prev = prev - } - } - } -} - -func (m *machine) setupEpilogueAfter(cur *instruction) { - prevNext := cur.next - - // At this point, we have the stack layout as follows: - // - // (high address) - // +-----------------+ - // | ....... | - // | ret Y | - // | ....... | - // | ret 0 | - // | arg X | - // | ....... | - // | arg 1 | - // | arg 0 | - // | ReturnAddress | - // | Caller_RBP | - // +-----------------+ <--- RBP - // | clobbered M | - // | ............ | - // | clobbered 1 | - // | clobbered 0 | - // | spill slot N | - // | ............ | - // | spill slot 0 | - // +-----------------+ <--- RSP - // (low address) - - if size := m.spillSlotSize; size > 0 { - // Simply increase the RSP to free the spill slots. - // add $size, %rsp - cur = linkInstr(cur, m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(uint32(size)), rspVReg, true)) - } - - // - // (high address) - // +-----------------+ +-----------------+ - // | ....... | | ....... | - // | ret Y | | ret Y | - // | ....... | | ....... | - // | ret 0 | | ret 0 | - // | arg X | | arg X | - // | ....... | | ....... | - // | arg 1 | | arg 1 | - // | arg 0 | | arg 0 | - // | ReturnAddress | | ReturnAddress | - // | Caller_RBP | | Caller_RBP | - // RBP ---> +-----------------+ ========> +-----------------+ <---- RSP, RBP - // | clobbered M | - // | ............ 
| - // | clobbered 1 | - // | clobbered 0 | - // RSP ---> +-----------------+ - // (low address) - // - if regs := m.clobberedRegs; len(regs) > 0 { - for _, r := range regs { - if r.RegType() == regalloc.RegTypeInt { - cur = linkInstr(cur, m.allocateInstr().asPop64(r)) - } else { - // Pop the XMM register is not supported by the POP instruction. - pop := m.allocateInstr().asXmmUnaryRmR( - sseOpcodeMovdqu, newOperandMem(m.newAmodeImmReg(0, rspVReg)), r, - ) - cur = linkInstr(cur, pop) - cur = m.addRSP(16, cur) - } - } - } - - // Now roll back the RSP to RBP, and pop the caller's RBP. - cur = m.revertRBPRSP(cur) - - linkInstr(cur, prevNext) -} - -func (m *machine) addRSP(offset int32, cur *instruction) *instruction { - if offset == 0 { - return cur - } - opcode := aluRmiROpcodeAdd - if offset < 0 { - opcode = aluRmiROpcodeSub - offset = -offset - } - return linkInstr(cur, m.allocateInstr().asAluRmiR(opcode, newOperandImm32(uint32(offset)), rspVReg, true)) -} - -func (m *machine) setupRBPRSP(cur *instruction) *instruction { - cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(rbpVReg))) - cur = linkInstr(cur, m.allocateInstr().asMovRR(rspVReg, rbpVReg, true)) - return cur -} - -func (m *machine) revertRBPRSP(cur *instruction) *instruction { - cur = linkInstr(cur, m.allocateInstr().asMovRR(rbpVReg, rspVReg, true)) - cur = linkInstr(cur, m.allocateInstr().asPop64(rbpVReg)) - return cur -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_regalloc.go deleted file mode 100644 index de9dcc944..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_regalloc.go +++ /dev/null @@ -1,352 +0,0 @@ -package amd64 - -import ( - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" -) - -// regAllocFn implements regalloc.Function. -type regAllocFn struct { - ssaB ssa.Builder - m *machine - loopNestingForestRoots []ssa.BasicBlock - blockIter int -} - -// PostOrderBlockIteratorBegin implements regalloc.Function. -func (f *regAllocFn) PostOrderBlockIteratorBegin() *labelPosition { - f.blockIter = len(f.m.orderedSSABlockLabelPos) - 1 - return f.PostOrderBlockIteratorNext() -} - -// PostOrderBlockIteratorNext implements regalloc.Function. -func (f *regAllocFn) PostOrderBlockIteratorNext() *labelPosition { - if f.blockIter < 0 { - return nil - } - b := f.m.orderedSSABlockLabelPos[f.blockIter] - f.blockIter-- - return b -} - -// ReversePostOrderBlockIteratorBegin implements regalloc.Function. -func (f *regAllocFn) ReversePostOrderBlockIteratorBegin() *labelPosition { - f.blockIter = 0 - return f.ReversePostOrderBlockIteratorNext() -} - -// ReversePostOrderBlockIteratorNext implements regalloc.Function. -func (f *regAllocFn) ReversePostOrderBlockIteratorNext() *labelPosition { - if f.blockIter >= len(f.m.orderedSSABlockLabelPos) { - return nil - } - b := f.m.orderedSSABlockLabelPos[f.blockIter] - f.blockIter++ - return b -} - -// ClobberedRegisters implements regalloc.Function. -func (f *regAllocFn) ClobberedRegisters(regs []regalloc.VReg) { - f.m.clobberedRegs = append(f.m.clobberedRegs[:0], regs...) -} - -// LoopNestingForestRoots implements regalloc.Function. 
-func (f *regAllocFn) LoopNestingForestRoots() int { - f.loopNestingForestRoots = f.ssaB.LoopNestingForestRoots() - return len(f.loopNestingForestRoots) -} - -// LoopNestingForestRoot implements regalloc.Function. -func (f *regAllocFn) LoopNestingForestRoot(i int) *labelPosition { - root := f.loopNestingForestRoots[i] - pos := f.m.getOrAllocateSSABlockLabelPosition(root) - return pos -} - -// LowestCommonAncestor implements regalloc.Function. -func (f *regAllocFn) LowestCommonAncestor(blk1, blk2 *labelPosition) *labelPosition { - sb := f.ssaB.LowestCommonAncestor(blk1.sb, blk2.sb) - pos := f.m.getOrAllocateSSABlockLabelPosition(sb) - return pos -} - -// Idom implements regalloc.Function. -func (f *regAllocFn) Idom(blk *labelPosition) *labelPosition { - sb := f.ssaB.Idom(blk.sb) - pos := f.m.getOrAllocateSSABlockLabelPosition(sb) - return pos -} - -// SwapBefore implements regalloc.Function. -func (f *regAllocFn) SwapBefore(x1, x2, tmp regalloc.VReg, instr *instruction) { - f.m.swap(instr.prev, x1, x2, tmp) -} - -// StoreRegisterBefore implements regalloc.Function. -func (f *regAllocFn) StoreRegisterBefore(v regalloc.VReg, instr *instruction) { - m := f.m - m.insertStoreRegisterAt(v, instr, false) -} - -// StoreRegisterAfter implements regalloc.Function. -func (f *regAllocFn) StoreRegisterAfter(v regalloc.VReg, instr *instruction) { - m := f.m - m.insertStoreRegisterAt(v, instr, true) -} - -// ReloadRegisterBefore implements regalloc.Function. -func (f *regAllocFn) ReloadRegisterBefore(v regalloc.VReg, instr *instruction) { - m := f.m - m.insertReloadRegisterAt(v, instr, false) -} - -// ReloadRegisterAfter implements regalloc.Function. -func (f *regAllocFn) ReloadRegisterAfter(v regalloc.VReg, instr *instruction) { - m := f.m - m.insertReloadRegisterAt(v, instr, true) -} - -// InsertMoveBefore implements regalloc.Function. -func (f *regAllocFn) InsertMoveBefore(dst, src regalloc.VReg, instr *instruction) { - f.m.insertMoveBefore(dst, src, instr) -} - -// LoopNestingForestChild implements regalloc.Function. -func (f *regAllocFn) LoopNestingForestChild(pos *labelPosition, i int) *labelPosition { - childSB := pos.sb.LoopNestingForestChildren()[i] - return f.m.getOrAllocateSSABlockLabelPosition(childSB) -} - -// Succ implements regalloc.Block. -func (f *regAllocFn) Succ(pos *labelPosition, i int) *labelPosition { - succSB := pos.sb.Succ(i) - if succSB.ReturnBlock() { - return nil - } - return f.m.getOrAllocateSSABlockLabelPosition(succSB) -} - -// Pred implements regalloc.Block. -func (f *regAllocFn) Pred(pos *labelPosition, i int) *labelPosition { - predSB := pos.sb.Pred(i) - return f.m.getOrAllocateSSABlockLabelPosition(predSB) -} - -// BlockParams implements regalloc.Function. -func (f *regAllocFn) BlockParams(pos *labelPosition, regs *[]regalloc.VReg) []regalloc.VReg { - c := f.m.c - *regs = (*regs)[:0] - for i := 0; i < pos.sb.Params(); i++ { - v := c.VRegOf(pos.sb.Param(i)) - *regs = append(*regs, v) - } - return *regs -} - -// ID implements regalloc.Block. -func (pos *labelPosition) ID() int32 { - return int32(pos.sb.ID()) -} - -// InstrIteratorBegin implements regalloc.Block. -func (pos *labelPosition) InstrIteratorBegin() *instruction { - ret := pos.begin - pos.cur = ret - return ret -} - -// InstrIteratorNext implements regalloc.Block. 
-func (pos *labelPosition) InstrIteratorNext() *instruction { - for { - if pos.cur == pos.end { - return nil - } - instr := pos.cur.next - pos.cur = instr - if instr == nil { - return nil - } else if instr.addedBeforeRegAlloc { - // Only concerned about the instruction added before regalloc. - return instr - } - } -} - -// InstrRevIteratorBegin implements regalloc.Block. -func (pos *labelPosition) InstrRevIteratorBegin() *instruction { - pos.cur = pos.end - return pos.cur -} - -// InstrRevIteratorNext implements regalloc.Block. -func (pos *labelPosition) InstrRevIteratorNext() *instruction { - for { - if pos.cur == pos.begin { - return nil - } - instr := pos.cur.prev - pos.cur = instr - if instr == nil { - return nil - } else if instr.addedBeforeRegAlloc { - // Only concerned about the instruction added before regalloc. - return instr - } - } -} - -// FirstInstr implements regalloc.Block. -func (pos *labelPosition) FirstInstr() *instruction { return pos.begin } - -// LastInstrForInsertion implements regalloc.Block. -func (pos *labelPosition) LastInstrForInsertion() *instruction { - return lastInstrForInsertion(pos.begin, pos.end) -} - -// Preds implements regalloc.Block. -func (pos *labelPosition) Preds() int { return pos.sb.Preds() } - -// Entry implements regalloc.Block. -func (pos *labelPosition) Entry() bool { return pos.sb.EntryBlock() } - -// Succs implements regalloc.Block. -func (pos *labelPosition) Succs() int { return pos.sb.Succs() } - -// LoopHeader implements regalloc.Block. -func (pos *labelPosition) LoopHeader() bool { return pos.sb.LoopHeader() } - -// LoopNestingForestChildren implements regalloc.Block. -func (pos *labelPosition) LoopNestingForestChildren() int { - return len(pos.sb.LoopNestingForestChildren()) -} - -func (m *machine) insertMoveBefore(dst, src regalloc.VReg, instr *instruction) { - typ := src.RegType() - if typ != dst.RegType() { - panic("BUG: src and dst must have the same type") - } - - mov := m.allocateInstr() - if typ == regalloc.RegTypeInt { - mov.asMovRR(src, dst, true) - } else { - mov.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(src), dst) - } - - cur := instr.prev - prevNext := cur.next - cur = linkInstr(cur, mov) - linkInstr(cur, prevNext) -} - -func (m *machine) insertStoreRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction { - if !v.IsRealReg() { - panic("BUG: VReg must be backed by real reg to be stored") - } - - typ := m.c.TypeOf(v) - - var prevNext, cur *instruction - if after { - cur, prevNext = instr, instr.next - } else { - cur, prevNext = instr.prev, instr - } - - offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size()) - store := m.allocateInstr() - mem := newOperandMem(m.newAmodeImmReg(uint32(offsetFromSP), rspVReg)) - switch typ { - case ssa.TypeI32: - store.asMovRM(v, mem, 4) - case ssa.TypeI64: - store.asMovRM(v, mem, 8) - case ssa.TypeF32: - store.asXmmMovRM(sseOpcodeMovss, v, mem) - case ssa.TypeF64: - store.asXmmMovRM(sseOpcodeMovsd, v, mem) - case ssa.TypeV128: - store.asXmmMovRM(sseOpcodeMovdqu, v, mem) - } - - cur = linkInstr(cur, store) - return linkInstr(cur, prevNext) -} - -func (m *machine) insertReloadRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction { - if !v.IsRealReg() { - panic("BUG: VReg must be backed by real reg to be stored") - } - - typ := m.c.TypeOf(v) - var prevNext, cur *instruction - if after { - cur, prevNext = instr, instr.next - } else { - cur, prevNext = instr.prev, instr - } - - // Load the value to the temporary. 
- load := m.allocateInstr() - offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size()) - a := newOperandMem(m.newAmodeImmReg(uint32(offsetFromSP), rspVReg)) - switch typ { - case ssa.TypeI32: - load.asMovzxRmR(extModeLQ, a, v) - case ssa.TypeI64: - load.asMov64MR(a, v) - case ssa.TypeF32: - load.asXmmUnaryRmR(sseOpcodeMovss, a, v) - case ssa.TypeF64: - load.asXmmUnaryRmR(sseOpcodeMovsd, a, v) - case ssa.TypeV128: - load.asXmmUnaryRmR(sseOpcodeMovdqu, a, v) - default: - panic("BUG") - } - - cur = linkInstr(cur, load) - return linkInstr(cur, prevNext) -} - -func (m *machine) swap(cur *instruction, x1, x2, tmp regalloc.VReg) { - if x1.RegType() == regalloc.RegTypeInt { - prevNext := cur.next - xc := m.allocateInstr().asXCHG(x1, newOperandReg(x2), 8) - cur = linkInstr(cur, xc) - linkInstr(cur, prevNext) - } else { - if tmp.Valid() { - prevNext := cur.next - m.insertMoveBefore(tmp, x1, prevNext) - m.insertMoveBefore(x1, x2, prevNext) - m.insertMoveBefore(x2, tmp, prevNext) - } else { - prevNext := cur.next - r2 := x2.RealReg() - // Temporarily spill x1 to stack. - cur = m.insertStoreRegisterAt(x1, cur, true).prev - // Then move x2 to x1. - cur = linkInstr(cur, m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqa, newOperandReg(x2), x1)) - linkInstr(cur, prevNext) - // Then reload the original value on x1 from stack to r2. - m.insertReloadRegisterAt(x1.SetRealReg(r2), cur, true) - } - } -} - -func lastInstrForInsertion(begin, end *instruction) *instruction { - cur := end - for cur.kind == nop0 { - cur = cur.prev - if cur == begin { - return end - } - } - switch cur.kind { - case jmp: - return cur - default: - return end - } -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_vec.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_vec.go deleted file mode 100644 index 8d514d857..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_vec.go +++ /dev/null @@ -1,992 +0,0 @@ -package amd64 - -import ( - "fmt" - - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" -) - -var swizzleMask = [16]byte{ - 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, - 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, -} - -func (m *machine) lowerSwizzle(x, y ssa.Value, ret ssa.Value) { - masklabel := m.getOrAllocateConstLabel(&m.constSwizzleMaskConstIndex, swizzleMask[:]) - - // Load mask to maskReg. - maskReg := m.c.AllocateVReg(ssa.TypeV128) - loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(masklabel)), maskReg) - m.insert(loadMask) - - // Copy x and y to tmp registers. - xx := m.getOperand_Reg(m.c.ValueDefinition(x)) - tmpDst := m.copyToTmp(xx.reg()) - yy := m.getOperand_Reg(m.c.ValueDefinition(y)) - tmpX := m.copyToTmp(yy.reg()) - - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddusb, newOperandReg(maskReg), tmpX)) - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmpX), tmpDst)) - - // Copy the result to the destination register. - m.copyTo(tmpDst, m.c.VRegOf(ret)) -} - -func (m *machine) lowerInsertLane(x, y ssa.Value, index byte, ret ssa.Value, lane ssa.VecLane) { - // Copy x to tmp. 
- tmpDst := m.c.AllocateVReg(ssa.TypeV128) - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, m.getOperand_Mem_Reg(m.c.ValueDefinition(x)), tmpDst)) - - yy := m.getOperand_Reg(m.c.ValueDefinition(y)) - switch lane { - case ssa.VecLaneI8x16: - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, index, yy, tmpDst)) - case ssa.VecLaneI16x8: - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, index, yy, tmpDst)) - case ssa.VecLaneI32x4: - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, index, yy, tmpDst)) - case ssa.VecLaneI64x2: - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, index, yy, tmpDst)) - case ssa.VecLaneF32x4: - // In INSERTPS instruction, the destination index is encoded at 4 and 5 bits of the argument. - // See https://www.felixcloutier.com/x86/insertps - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeInsertps, index<<4, yy, tmpDst)) - case ssa.VecLaneF64x2: - if index == 0 { - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, yy, tmpDst)) - } else { - m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMovlhps, yy, tmpDst)) - } - default: - panic(fmt.Sprintf("invalid lane type: %s", lane)) - } - - m.copyTo(tmpDst, m.c.VRegOf(ret)) -} - -func (m *machine) lowerExtractLane(x ssa.Value, index byte, signed bool, ret ssa.Value, lane ssa.VecLane) { - // Pextr variants are used to extract a lane from a vector register. - xx := m.getOperand_Reg(m.c.ValueDefinition(x)) - - tmpDst := m.c.AllocateVReg(ret.Type()) - m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst)) - switch lane { - case ssa.VecLaneI8x16: - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrb, index, xx, tmpDst)) - if signed { - m.insert(m.allocateInstr().asMovsxRmR(extModeBL, newOperandReg(tmpDst), tmpDst)) - } else { - m.insert(m.allocateInstr().asMovzxRmR(extModeBL, newOperandReg(tmpDst), tmpDst)) - } - case ssa.VecLaneI16x8: - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrw, index, xx, tmpDst)) - if signed { - m.insert(m.allocateInstr().asMovsxRmR(extModeWL, newOperandReg(tmpDst), tmpDst)) - } else { - m.insert(m.allocateInstr().asMovzxRmR(extModeWL, newOperandReg(tmpDst), tmpDst)) - } - case ssa.VecLaneI32x4: - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrd, index, xx, tmpDst)) - case ssa.VecLaneI64x2: - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, index, xx, tmpDst)) - case ssa.VecLaneF32x4: - if index == 0 { - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovss, xx, tmpDst)) - } else { - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, index, xx, tmpDst)) - } - case ssa.VecLaneF64x2: - if index == 0 { - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, xx, tmpDst)) - } else { - m.copyTo(xx.reg(), tmpDst) - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0b00_00_11_10, newOperandReg(tmpDst), tmpDst)) - } - default: - panic(fmt.Sprintf("invalid lane type: %s", lane)) - } - - m.copyTo(tmpDst, m.c.VRegOf(ret)) -} - -var sqmulRoundSat = [16]byte{ - 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, - 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, -} - -func (m *machine) lowerSqmulRoundSat(x, y, ret ssa.Value) { - // See https://github.com/WebAssembly/simd/pull/365 for the following logic. 
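// The PMULHRSW + PCMPEQW + PXOR sequence below implements i16x8.q15mulr_sat_s.
// A scalar reference of what each lane computes (illustrative only;
// q15MulRoundSat is not a helper in this package):
func q15MulRoundSat(a, b int16) int16 {
	// Fixed-point Q15 multiply with rounding: (a*b + 2^14) >> 15.
	p := (int32(a)*int32(b) + 0x4000) >> 15
	// Only a == b == -32768 overflows, yielding 0x8000, which must saturate to
	// 0x7fff. PMULHRSW produces 0x8000 in exactly that case, hence the
	// PCMPEQW/PXOR fix-up against the 0x8000 constant loaded below.
	if p > 0x7fff {
		p = 0x7fff
	}
	return int16(p)
}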
- maskLabel := m.getOrAllocateConstLabel(&m.constSqmulRoundSatIndex, sqmulRoundSat[:]) - - tmp := m.c.AllocateVReg(ssa.TypeV128) - loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), tmp) - m.insert(loadMask) - - xx, yy := m.getOperand_Reg(m.c.ValueDefinition(x)), m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) - tmpX := m.copyToTmp(xx.reg()) - - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmulhrsw, yy, tmpX)) - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePcmpeqw, newOperandReg(tmpX), tmp)) - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmpX)) - - m.copyTo(tmpX, m.c.VRegOf(ret)) -} - -func (m *machine) lowerVUshr(x, y, ret ssa.Value, lane ssa.VecLane) { - switch lane { - case ssa.VecLaneI8x16: - m.lowerVUshri8x16(x, y, ret) - case ssa.VecLaneI16x8, ssa.VecLaneI32x4, ssa.VecLaneI64x2: - m.lowerShr(x, y, ret, lane, false) - default: - panic(fmt.Sprintf("invalid lane type: %s", lane)) - } -} - -// i8x16LogicalSHRMaskTable is necessary for emulating non-existent packed bytes logical right shifts on amd64. -// The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits. -var i8x16LogicalSHRMaskTable = [8 * 16]byte{ // (the number of possible shift amount 0, 1, ..., 7.) * 16 bytes. - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // for 1 shift - 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, // for 2 shift - 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, // for 3 shift - 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, // for 4 shift - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, // for 5 shift - 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, // for 6 shift - 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // for 7 shift -} - -func (m *machine) lowerVUshri8x16(x, y, ret ssa.Value) { - tmpGpReg := m.c.AllocateVReg(ssa.TypeI32) - // Load the modulo 8 mask to tmpReg. - m.lowerIconst(tmpGpReg, 0x7, false) - // Take the modulo 8 of the shift amount. - shiftAmt := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)) - m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, shiftAmt, tmpGpReg, false)) - - _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) - xx := m.copyToTmp(_xx.reg()) - - vecTmp := m.c.AllocateVReg(ssa.TypeV128) - m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), vecTmp, false)) - m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrlw, newOperandReg(vecTmp), xx)) - - maskTableLabel := m.getOrAllocateConstLabel(&m.constI8x16LogicalSHRMaskTableIndex, i8x16LogicalSHRMaskTable[:]) - base := m.c.AllocateVReg(ssa.TypeI64) - lea := m.allocateInstr().asLEA(newOperandLabel(maskTableLabel), base) - m.insert(lea) - - // Shift tmpGpReg by 4 to multiply the shift amount by 16. 
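// Each row of i8x16LogicalSHRMaskTable is 16 bytes, so the row for a shift
// amount s starts at byte offset s*16, i.e. s<<4, which is what the following
// SHL computes. Per byte, the emulation relies on the identity sketched below
// (illustrative scalar Go; lowByteOfWordShift is not a helper in this package):
// PSRLW lets bits of the neighboring high byte leak into this byte, and row s
// of the mask table (0xff >> s) clears exactly those leaked bits.
func lowByteOfWordShift(hi, lo byte, s uint) byte {
	w := uint16(hi)<<8 | uint16(lo)
	shifted := byte(w >> s)      // What PSRLW leaves in the low byte of the word.
	return shifted & (0xff >> s) // Masking leaves lo >> s, the desired result.
}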
- m.insert(m.allocateInstr().asShiftR(shiftROpShiftLeft, newOperandImm32(4), tmpGpReg, false)) - - mem := m.newAmodeRegRegShift(0, base, tmpGpReg, 0) - loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(mem), vecTmp) - m.insert(loadMask) - - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePand, newOperandReg(vecTmp), xx)) - m.copyTo(xx, m.c.VRegOf(ret)) -} - -func (m *machine) lowerVSshr(x, y, ret ssa.Value, lane ssa.VecLane) { - switch lane { - case ssa.VecLaneI8x16: - m.lowerVSshri8x16(x, y, ret) - case ssa.VecLaneI16x8, ssa.VecLaneI32x4: - m.lowerShr(x, y, ret, lane, true) - case ssa.VecLaneI64x2: - m.lowerVSshri64x2(x, y, ret) - default: - panic(fmt.Sprintf("invalid lane type: %s", lane)) - } -} - -func (m *machine) lowerVSshri8x16(x, y, ret ssa.Value) { - shiftAmtReg := m.c.AllocateVReg(ssa.TypeI32) - // Load the modulo 8 mask to tmpReg. - m.lowerIconst(shiftAmtReg, 0x7, false) - // Take the modulo 8 of the shift amount. - shiftAmt := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)) - m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, shiftAmt, shiftAmtReg, false)) - - // Copy the x value to two temporary registers. - _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) - xx := m.copyToTmp(_xx.reg()) - vecTmp := m.c.AllocateVReg(ssa.TypeV128) - m.copyTo(xx, vecTmp) - - // Assuming that we have - // xx = [b1, ..., b16] - // vecTmp = [b1, ..., b16] - // at this point, then we use PUNPCKLBW and PUNPCKHBW to produce: - // xx = [b1, b1, b2, b2, ..., b8, b8] - // vecTmp = [b9, b9, b10, b10, ..., b16, b16] - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePunpcklbw, newOperandReg(xx), xx)) - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePunpckhbw, newOperandReg(vecTmp), vecTmp)) - - // Adding 8 to the shift amount, and then move the amount to vecTmp2. - vecTmp2 := m.c.AllocateVReg(ssa.TypeV128) - m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(8), shiftAmtReg, false)) - m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(shiftAmtReg), vecTmp2, false)) - - // Perform the word packed arithmetic right shifts on vreg and vecTmp. - // This changes these two registers as: - // xx = [xxx, b1 >> s, xxx, b2 >> s, ..., xxx, b8 >> s] - // vecTmp = [xxx, b9 >> s, xxx, b10 >> s, ..., xxx, b16 >> s] - // where xxx is 1 or 0 depending on each byte's sign, and ">>" is the arithmetic shift on a byte. - m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsraw, newOperandReg(vecTmp2), xx)) - m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsraw, newOperandReg(vecTmp2), vecTmp)) - - // Finally, we can get the result by packing these two word vectors. - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePacksswb, newOperandReg(vecTmp), xx)) - - m.copyTo(xx, m.c.VRegOf(ret)) -} - -func (m *machine) lowerVSshri64x2(x, y, ret ssa.Value) { - // Load the shift amount to RCX. 
- shiftAmt := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) - m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, shiftAmt, rcxVReg)) - - tmpGp := m.c.AllocateVReg(ssa.TypeI64) - - _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) - xxReg := m.copyToTmp(_xx.reg()) - - m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp)) - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, 0, newOperandReg(xxReg), tmpGp)) - m.insert(m.allocateInstr().asShiftR(shiftROpShiftRightArithmetic, newOperandReg(rcxVReg), tmpGp, true)) - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, newOperandReg(tmpGp), xxReg)) - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, 1, newOperandReg(xxReg), tmpGp)) - m.insert(m.allocateInstr().asShiftR(shiftROpShiftRightArithmetic, newOperandReg(rcxVReg), tmpGp, true)) - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, newOperandReg(tmpGp), xxReg)) - - m.copyTo(xxReg, m.c.VRegOf(ret)) -} - -func (m *machine) lowerShr(x, y, ret ssa.Value, lane ssa.VecLane, signed bool) { - var modulo uint64 - var shiftOp sseOpcode - switch lane { - case ssa.VecLaneI16x8: - modulo = 0xf - if signed { - shiftOp = sseOpcodePsraw - } else { - shiftOp = sseOpcodePsrlw - } - case ssa.VecLaneI32x4: - modulo = 0x1f - if signed { - shiftOp = sseOpcodePsrad - } else { - shiftOp = sseOpcodePsrld - } - case ssa.VecLaneI64x2: - modulo = 0x3f - if signed { - panic("BUG") - } - shiftOp = sseOpcodePsrlq - default: - panic(fmt.Sprintf("invalid lane type: %s", lane)) - } - - _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) - xx := m.copyToTmp(_xx.reg()) - - tmpGpReg := m.c.AllocateVReg(ssa.TypeI32) - // Load the modulo 8 mask to tmpReg. - m.lowerIconst(tmpGpReg, modulo, false) - // Take the modulo 8 of the shift amount. - m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, - m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)), tmpGpReg, false)) - // And move it to a xmm register. - tmpVec := m.c.AllocateVReg(ssa.TypeV128) - m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), tmpVec, false)) - - // Then do the actual shift. - m.insert(m.allocateInstr().asXmmRmiReg(shiftOp, newOperandReg(tmpVec), xx)) - - m.copyTo(xx, m.c.VRegOf(ret)) -} - -func (m *machine) lowerVIshl(x, y, ret ssa.Value, lane ssa.VecLane) { - var modulo uint64 - var shiftOp sseOpcode - var isI8x16 bool - switch lane { - case ssa.VecLaneI8x16: - isI8x16 = true - modulo = 0x7 - shiftOp = sseOpcodePsllw - case ssa.VecLaneI16x8: - modulo = 0xf - shiftOp = sseOpcodePsllw - case ssa.VecLaneI32x4: - modulo = 0x1f - shiftOp = sseOpcodePslld - case ssa.VecLaneI64x2: - modulo = 0x3f - shiftOp = sseOpcodePsllq - default: - panic(fmt.Sprintf("invalid lane type: %s", lane)) - } - - _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) - xx := m.copyToTmp(_xx.reg()) - - tmpGpReg := m.c.AllocateVReg(ssa.TypeI32) - // Load the modulo 8 mask to tmpReg. - m.lowerIconst(tmpGpReg, modulo, false) - // Take the modulo 8 of the shift amount. - m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, - m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)), tmpGpReg, false)) - // And move it to a xmm register. - tmpVec := m.c.AllocateVReg(ssa.TypeV128) - m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), tmpVec, false)) - - // Then do the actual shift. 
- m.insert(m.allocateInstr().asXmmRmiReg(shiftOp, newOperandReg(tmpVec), xx)) - - if isI8x16 { - maskTableLabel := m.getOrAllocateConstLabel(&m.constI8x16SHLMaskTableIndex, i8x16SHLMaskTable[:]) - base := m.c.AllocateVReg(ssa.TypeI64) - lea := m.allocateInstr().asLEA(newOperandLabel(maskTableLabel), base) - m.insert(lea) - - // Shift tmpGpReg by 4 to multiply the shift amount by 16. - m.insert(m.allocateInstr().asShiftR(shiftROpShiftLeft, newOperandImm32(4), tmpGpReg, false)) - - mem := m.newAmodeRegRegShift(0, base, tmpGpReg, 0) - loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(mem), tmpVec) - m.insert(loadMask) - - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePand, newOperandReg(tmpVec), xx)) - } - - m.copyTo(xx, m.c.VRegOf(ret)) -} - -// i8x16SHLMaskTable is necessary for emulating non-existent packed bytes left shifts on amd64. -// The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits. -var i8x16SHLMaskTable = [8 * 16]byte{ // (the number of possible shift amount 0, 1, ..., 7.) * 16 bytes. - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift - 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, // for 1 shift - 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, // for 2 shift - 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, // for 3 shift - 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // for 4 shift - 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, // for 5 shift - 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, // for 6 shift - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, // for 7 shift -} - -func (m *machine) lowerVRound(x, ret ssa.Value, imm byte, _64 bool) { - xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) - var round sseOpcode - if _64 { - round = sseOpcodeRoundpd - } else { - round = sseOpcodeRoundps - } - m.insert(m.allocateInstr().asXmmUnaryRmRImm(round, imm, xx, m.c.VRegOf(ret))) -} - -var ( - allOnesI8x16 = [16]byte{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1} - allOnesI16x8 = [16]byte{0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0} - extAddPairwiseI16x8uMask1 = [16]byte{0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80} - extAddPairwiseI16x8uMask2 = [16]byte{0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00} -) - -func (m *machine) lowerExtIaddPairwise(x, ret ssa.Value, srcLane ssa.VecLane, signed bool) { - _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) - xx := m.copyToTmp(_xx.reg()) - switch srcLane { - case ssa.VecLaneI8x16: - allOneReg := m.c.AllocateVReg(ssa.TypeV128) - mask := m.getOrAllocateConstLabel(&m.constAllOnesI8x16Index, allOnesI8x16[:]) - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), allOneReg)) - - var resultReg regalloc.VReg - if signed { - resultReg = allOneReg - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddubsw, newOperandReg(xx), resultReg)) - } else { - // Interpreter tmp (all ones) as signed byte meaning that all the multiply-add is unsigned. 
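// PMADDUBSW multiplies unsigned bytes of one operand by the signed bytes of the
// other and adds adjacent pairs into i16 lanes, so multiplying by a vector of
// signed 1s yields exactly the pairwise extending add. A scalar reference of
// what extadd_pairwise_i8x16_u computes (illustrative only; extAddPairwiseI8x16U
// is not a helper in this package):
func extAddPairwiseI8x16U(x [16]uint8) (r [8]uint16) {
	for i := range r {
		r[i] = uint16(x[2*i]) + uint16(x[2*i+1])
	}
	return
}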
- resultReg = xx - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddubsw, newOperandReg(allOneReg), resultReg)) - } - m.copyTo(resultReg, m.c.VRegOf(ret)) - - case ssa.VecLaneI16x8: - if signed { - allOnesReg := m.c.AllocateVReg(ssa.TypeV128) - mask := m.getOrAllocateConstLabel(&m.constAllOnesI16x8Index, allOnesI16x8[:]) - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), allOnesReg)) - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, newOperandReg(allOnesReg), xx)) - m.copyTo(xx, m.c.VRegOf(ret)) - } else { - maskReg := m.c.AllocateVReg(ssa.TypeV128) - mask := m.getOrAllocateConstLabel(&m.constExtAddPairwiseI16x8uMask1Index, extAddPairwiseI16x8uMask1[:]) - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg)) - - // Flip the sign bits on xx. - // - // Assuming that xx = [w1, ..., w8], now we have, - // xx[i] = int8(-w1) for i = 0...8 - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(maskReg), xx)) - - mask = m.getOrAllocateConstLabel(&m.constAllOnesI16x8Index, allOnesI16x8[:]) - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg)) - - // For i = 0,..4 (as this results in i32x4 lanes), now we have - // xx[i] = int32(-wn + -w(n+1)) = int32(-(wn + w(n+1))) - // c.assembler.CompileRegisterToRegister(amd64.PMADDWD, tmp, vr) - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, newOperandReg(maskReg), xx)) - - mask = m.getOrAllocateConstLabel(&m.constExtAddPairwiseI16x8uMask2Index, extAddPairwiseI16x8uMask2[:]) - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg)) - - // vr[i] = int32(-(wn + w(n+1))) + int32(math.MaxInt16+1) = int32((wn + w(n+1))) = uint32(wn + w(n+1)). 
- // c.assembler.CompileRegisterToRegister(amd64.PADDD, tmp, vr) - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddd, newOperandReg(maskReg), xx)) - - m.copyTo(xx, m.c.VRegOf(ret)) - } - default: - panic(fmt.Sprintf("invalid lane type: %s", srcLane)) - } -} - -func (m *machine) lowerWidenLow(x, ret ssa.Value, lane ssa.VecLane, signed bool) { - var sseOp sseOpcode - switch lane { - case ssa.VecLaneI8x16: - if signed { - sseOp = sseOpcodePmovsxbw - } else { - sseOp = sseOpcodePmovzxbw - } - case ssa.VecLaneI16x8: - if signed { - sseOp = sseOpcodePmovsxwd - } else { - sseOp = sseOpcodePmovzxwd - } - case ssa.VecLaneI32x4: - if signed { - sseOp = sseOpcodePmovsxdq - } else { - sseOp = sseOpcodePmovzxdq - } - default: - panic(fmt.Sprintf("invalid lane type: %s", lane)) - } - - xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, xx, m.c.VRegOf(ret))) -} - -func (m *machine) lowerWidenHigh(x, ret ssa.Value, lane ssa.VecLane, signed bool) { - tmp := m.c.AllocateVReg(ssa.TypeV128) - xx := m.getOperand_Reg(m.c.ValueDefinition(x)) - m.copyTo(xx.reg(), tmp) - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePalignr, 8, newOperandReg(tmp), tmp)) - - var sseOp sseOpcode - switch lane { - case ssa.VecLaneI8x16: - if signed { - sseOp = sseOpcodePmovsxbw - } else { - sseOp = sseOpcodePmovzxbw - } - case ssa.VecLaneI16x8: - if signed { - sseOp = sseOpcodePmovsxwd - } else { - sseOp = sseOpcodePmovzxwd - } - case ssa.VecLaneI32x4: - if signed { - sseOp = sseOpcodePmovsxdq - } else { - sseOp = sseOpcodePmovzxdq - } - default: - panic(fmt.Sprintf("invalid lane type: %s", lane)) - } - - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, newOperandReg(tmp), m.c.VRegOf(ret))) -} - -func (m *machine) lowerLoadSplat(ptr ssa.Value, offset uint32, ret ssa.Value, lane ssa.VecLane) { - tmpDst, tmpGp := m.c.AllocateVReg(ssa.TypeV128), m.c.AllocateVReg(ssa.TypeI64) - am := newOperandMem(m.lowerToAddressMode(ptr, offset)) - - m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst)) - switch lane { - case ssa.VecLaneI8x16: - m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, am, tmpGp)) - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, 0, newOperandReg(tmpGp), tmpDst)) - tmpZeroVec := m.c.AllocateVReg(ssa.TypeV128) - m.insert(m.allocateInstr().asZeros(tmpZeroVec)) - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmpZeroVec), tmpDst)) - case ssa.VecLaneI16x8: - m.insert(m.allocateInstr().asMovzxRmR(extModeWQ, am, tmpGp)) - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 0, newOperandReg(tmpGp), tmpDst)) - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 1, newOperandReg(tmpGp), tmpDst)) - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst)) - case ssa.VecLaneI32x4: - m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, am, tmpGp)) - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, 0, newOperandReg(tmpGp), tmpDst)) - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst)) - case ssa.VecLaneI64x2: - m.insert(m.allocateInstr().asMov64MR(am, tmpGp)) - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, newOperandReg(tmpGp), tmpDst)) - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, newOperandReg(tmpGp), tmpDst)) - default: - panic(fmt.Sprintf("invalid lane type: %s", lane)) - } - - m.copyTo(tmpDst, m.c.VRegOf(ret)) -} - -var f64x2CvtFromIMask = [16]byte{ - 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, -} - -func (m *machine) lowerVFcvtFromInt(x, ret ssa.Value, lane ssa.VecLane, signed bool) { - switch lane { - case ssa.VecLaneF32x4: - if signed { - xx := m.getOperand_Reg(m.c.ValueDefinition(x)) - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, xx, m.c.VRegOf(ret))) - } else { - xx := m.getOperand_Reg(m.c.ValueDefinition(x)) - // Copy the value to two temporary registers. - tmp := m.copyToTmp(xx.reg()) - tmp2 := m.copyToTmp(xx.reg()) - - // Clear the higher 16 bits of each 32-bit element. - m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePslld, newOperandImm32(0xa), tmp)) - m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(0xa), tmp)) - - // Subtract the higher 16-bits from tmp2: clear the lower 16-bits of tmp2. - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePsubd, newOperandReg(tmp), tmp2)) - - // Convert the lower 16-bits in tmp. - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp), tmp)) - - // Left shift by one and convert tmp2, meaning that halved conversion result of higher 16-bits in tmp2. - m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(1), tmp2)) - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp2), tmp2)) - - // Double the converted halved higher 16bits. - m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddps, newOperandReg(tmp2), tmp2)) - - // Get the conversion result by add tmp (holding lower 16-bit conversion) into tmp2. - m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddps, newOperandReg(tmp), tmp2)) - - m.copyTo(tmp2, m.c.VRegOf(ret)) - } - case ssa.VecLaneF64x2: - if signed { - xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2pd, xx, m.c.VRegOf(ret))) - } else { - maskReg := m.c.AllocateVReg(ssa.TypeV128) - maskLabel := m.getOrAllocateConstLabel(&m.constF64x2CvtFromIMaskIndex, f64x2CvtFromIMask[:]) - // maskReg = [0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00] - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), maskReg)) - - _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) - xx := m.copyToTmp(_xx.reg()) - - // Given that we have xx = [d1, d2, d3, d4], this results in - // xx = [d1, [0x00, 0x00, 0x30, 0x43], d2, [0x00, 0x00, 0x30, 0x43]] - // = [float64(uint32(d1)) + 0x1.0p52, float64(uint32(d2)) + 0x1.0p52] - // ^See https://stackoverflow.com/questions/13269523/can-all-32-bit-ints-be-exactly-represented-as-a-double - m.insert(m.allocateInstr().asXmmRmR(sseOpcodeUnpcklps, newOperandReg(maskReg), xx)) - - // maskReg = [float64(0x1.0p52), float64(0x1.0p52)] - maskLabel = m.getOrAllocateConstLabel(&m.constTwop52Index, twop52[:]) - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), maskReg)) - - // Now, we get the result as - // xx = [float64(uint32(d1)), float64(uint32(d2))] - // because the following equality always satisfies: - // float64(0x1.0p52 + float64(uint32(x))) - float64(0x1.0p52 + float64(uint32(y))) = float64(uint32(x)) - float64(uint32(y)) - m.insert(m.allocateInstr().asXmmRmR(sseOpcodeSubpd, newOperandReg(maskReg), xx)) - - m.copyTo(xx, m.c.VRegOf(ret)) - } - default: - panic(fmt.Sprintf("invalid lane type: %s", lane)) - } -} - -var ( - // i32sMaxOnF64x2 holds math.MaxInt32(=2147483647.0) on two f64 lanes. 
- i32sMaxOnF64x2 = [16]byte{ - 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0) - 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0) - } - - // i32sMaxOnF64x2 holds math.MaxUint32(=4294967295.0) on two f64 lanes. - i32uMaxOnF64x2 = [16]byte{ - 0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0) - 0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0) - } - - // twop52 holds two float64(0x1.0p52) on two f64 lanes. 0x1.0p52 is special in the sense that - // with this exponent, the mantissa represents a corresponding uint32 number, and arithmetics, - // like addition or subtraction, the resulted floating point holds exactly the same - // bit representations in 32-bit integer on its mantissa. - // - // Note: the name twop52 is common across various compiler ecosystem. - // E.g. https://github.com/llvm/llvm-project/blob/92ab024f81e5b64e258b7c3baaf213c7c26fcf40/compiler-rt/lib/builtins/floatdidf.c#L28 - // E.g. https://opensource.apple.com/source/clang/clang-425.0.24/src/projects/compiler-rt/lib/floatdidf.c.auto.html - twop52 = [16]byte{ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52) - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52) - } -) - -func (m *machine) lowerVFcvtToIntSat(x, ret ssa.Value, lane ssa.VecLane, signed bool) { - _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) - xx := m.copyToTmp(_xx.reg()) - - switch lane { - case ssa.VecLaneF32x4: - if signed { - tmp := m.copyToTmp(xx) - - // Assuming we have xx = [v1, v2, v3, v4]. - // - // Set all bits if lane is not NaN on tmp. - // tmp[i] = 0xffffffff if vi != NaN - // = 0 if vi == NaN - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmpps, uint8(cmpPredEQ_OQ), newOperandReg(tmp), tmp)) - - // Clear NaN lanes on xx, meaning that - // xx[i] = vi if vi != NaN - // 0 if vi == NaN - m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(tmp), xx)) - - // tmp[i] = ^vi if vi != NaN - // = 0xffffffff if vi == NaN - // which means that tmp[i] & 0x80000000 != 0 if and only if vi is negative. - m.insert(m.allocateInstr().asXmmRmR(sseOpcodeXorps, newOperandReg(xx), tmp)) - - // xx[i] = int32(vi) if vi != NaN and xx is not overflowing. - // = 0x80000000 if vi != NaN and xx is overflowing (See https://www.felixcloutier.com/x86/cvttps2dq) - // = 0 if vi == NaN - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(xx), xx)) - - // Below, we have to convert 0x80000000 into 0x7FFFFFFF for positive overflowing lane. - // - // tmp[i] = 0x80000000 if vi is positive - // = any satisfying any&0x80000000 = 0 if vi is negative or zero. - m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(xx), tmp)) - - // Arithmetic right shifting tmp by 31, meaning that we have - // tmp[i] = 0xffffffff if vi is positive, 0 otherwise. - m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrad, newOperandImm32(0x1f), tmp)) - - // Flipping 0x80000000 if vi is positive, otherwise keep intact. 
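// Taken together, this branch implements i32x4.trunc_sat_f32x4_s. CVTTPS2DQ
// yields 0x80000000 for NaN and out-of-range inputs, so the masking above turns
// NaN lanes into 0 and the final PXOR turns positive overflow into 0x7fffffff.
// Scalar reference of the per-lane semantics (illustrative only; truncSatF32ToI32
// is not a helper in this package):
func truncSatF32ToI32(v float32) int32 {
	switch {
	case v != v: // NaN.
		return 0
	case v >= 2147483648.0: // Positive overflow saturates to math.MaxInt32.
		return 2147483647
	case v < -2147483648.0: // Negative overflow saturates to math.MinInt32.
		return -2147483648
	default:
		return int32(v) // Truncate toward zero.
	}
}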
- m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), xx)) - } else { - tmp := m.c.AllocateVReg(ssa.TypeV128) - m.insert(m.allocateInstr().asZeros(tmp)) - m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMaxps, newOperandReg(tmp), xx)) - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp), tmp)) - m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(0x1), tmp)) - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp), tmp)) - tmp2 := m.copyToTmp(xx) - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(xx), xx)) - m.insert(m.allocateInstr().asXmmRmR(sseOpcodeSubps, newOperandReg(tmp), tmp2)) - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmpps, uint8(cmpPredLE_OS), newOperandReg(tmp2), tmp)) - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(tmp2), tmp2)) - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp2)) - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp)) - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaxsd, newOperandReg(tmp), tmp2)) - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddd, newOperandReg(tmp2), xx)) - } - - case ssa.VecLaneF64x2: - tmp2 := m.c.AllocateVReg(ssa.TypeV128) - if signed { - tmp := m.copyToTmp(xx) - - // Set all bits for non-NaN lanes, zeros otherwise. - // I.e. tmp[i] = 0xffffffff_ffffffff if vi != NaN, 0 otherwise. - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmppd, uint8(cmpPredEQ_OQ), newOperandReg(tmp), tmp)) - - maskLabel := m.getOrAllocateConstLabel(&m.constI32sMaxOnF64x2Index, i32sMaxOnF64x2[:]) - // Load the 2147483647 into tmp2's each lane. - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), tmp2)) - - // tmp[i] = 2147483647 if vi != NaN, 0 otherwise. - m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(tmp2), tmp)) - - // MINPD returns the source register's value as-is, so we have - // xx[i] = vi if vi != NaN - // = 0 if vi == NaN - m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMinpd, newOperandReg(tmp), xx)) - - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttpd2dq, newOperandReg(xx), xx)) - } else { - tmp := m.c.AllocateVReg(ssa.TypeV128) - m.insert(m.allocateInstr().asZeros(tmp)) - - // xx[i] = vi if vi != NaN && vi > 0 - // = 0 if vi == NaN || vi <= 0 - m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMaxpd, newOperandReg(tmp), xx)) - - // tmp2[i] = float64(math.MaxUint32) = math.MaxUint32 - maskIndex := m.getOrAllocateConstLabel(&m.constI32uMaxOnF64x2Index, i32uMaxOnF64x2[:]) - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskIndex)), tmp2)) - - // xx[i] = vi if vi != NaN && vi > 0 && vi <= math.MaxUint32 - // = 0 otherwise - m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMinpd, newOperandReg(tmp2), xx)) - - // Round the floating points into integer. - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeRoundpd, 0x3, newOperandReg(xx), xx)) - - // tmp2[i] = float64(0x1.0p52) - maskIndex = m.getOrAllocateConstLabel(&m.constTwop52Index, twop52[:]) - m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskIndex)), tmp2)) - - // xx[i] = float64(0x1.0p52) + float64(uint32(vi)) if vi != NaN && vi > 0 && vi <= math.MaxUint32 - // = 0 otherwise - // - // This means that xx[i] holds exactly the same bit of uint32(vi) in its lower 32-bits. 
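// The 0x1.0p52 trick relies on the property sketched below: for any n in
// [0, 2^32), float64(n) + 0x1p52 is exact, and the low 32 bits of the result's
// IEEE 754 bit pattern are n itself. Illustrative scalar Go only (assumes the
// standard "math" package; low32BitsAfterTwop52 is not a helper in this package):
func low32BitsAfterTwop52(n uint32) uint32 {
	d := float64(n) + 0x1p52
	return uint32(math.Float64bits(d)) // Always equals n.
}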
- m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddpd, newOperandReg(tmp2), xx)) - - // At this point, we have - // xx = [uint32(v0), float64(0x1.0p52), uint32(v1), float64(0x1.0p52)] - // tmp = [0, 0, 0, 0] - // as 32x4 lanes. Therefore, SHUFPS with 0b00_00_10_00 results in - // xx = [xx[00], xx[10], tmp[00], tmp[00]] = [xx[00], xx[10], 0, 0] - // meaning that for i = 0 and 1, we have - // xx[i] = uint32(vi) if vi != NaN && vi > 0 && vi <= math.MaxUint32 - // = 0 otherwise. - m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeShufps, 0b00_00_10_00, newOperandReg(tmp), xx)) - } - default: - panic(fmt.Sprintf("invalid lane type: %s", lane)) - } - - m.copyTo(xx, m.c.VRegOf(ret)) -} - -func (m *machine) lowerNarrow(x, y, ret ssa.Value, lane ssa.VecLane, signed bool) { - _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) - xx := m.copyToTmp(_xx.reg()) - yy := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) - - var sseOp sseOpcode - switch lane { - case ssa.VecLaneI16x8: - if signed { - sseOp = sseOpcodePacksswb - } else { - sseOp = sseOpcodePackuswb - } - case ssa.VecLaneI32x4: - if signed { - sseOp = sseOpcodePackssdw - } else { - sseOp = sseOpcodePackusdw - } - default: - panic(fmt.Sprintf("invalid lane type: %s", lane)) - } - m.insert(m.allocateInstr().asXmmRmR(sseOp, yy, xx)) - m.copyTo(xx, m.c.VRegOf(ret)) -} - -func (m *machine) lowerWideningPairwiseDotProductS(x, y, ret ssa.Value) { - _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) - xx := m.copyToTmp(_xx.reg()) - yy := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, yy, xx)) - m.copyTo(xx, m.c.VRegOf(ret)) -} - -func (m *machine) lowerVIabs(instr *ssa.Instruction) { - x, lane := instr.ArgWithLane() - rd := m.c.VRegOf(instr.Return()) - - if lane == ssa.VecLaneI64x2 { - _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) - - blendReg := xmm0VReg - m.insert(m.allocateInstr().asDefineUninitializedReg(blendReg)) - - tmp := m.copyToTmp(_xx.reg()) - xx := m.copyToTmp(_xx.reg()) - - // Clear all bits on blendReg. - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(blendReg), blendReg)) - // Subtract xx from blendMaskReg. - m.insert(m.allocateInstr().asXmmRmR(sseOpcodePsubq, newOperandReg(xx), blendReg)) - // Copy the subtracted value ^^ back into tmp. - m.copyTo(blendReg, xx) - - m.insert(m.allocateInstr().asBlendvpd(newOperandReg(tmp), xx)) - - m.copyTo(xx, rd) - } else { - var vecOp sseOpcode - switch lane { - case ssa.VecLaneI8x16: - vecOp = sseOpcodePabsb - case ssa.VecLaneI16x8: - vecOp = sseOpcodePabsw - case ssa.VecLaneI32x4: - vecOp = sseOpcodePabsd - } - rn := m.getOperand_Reg(m.c.ValueDefinition(x)) - - i := m.allocateInstr() - i.asXmmUnaryRmR(vecOp, rn, rd) - m.insert(i) - } -} - -func (m *machine) lowerVIpopcnt(instr *ssa.Instruction) { - x := instr.Arg() - rn := m.getOperand_Reg(m.c.ValueDefinition(x)) - rd := m.c.VRegOf(instr.Return()) - - tmp1 := m.c.AllocateVReg(ssa.TypeV128) - m.lowerVconst(tmp1, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f) - - // Copy input into tmp2. - tmp2 := m.copyToTmp(rn.reg()) - - // Given that we have: - // rm = [b1, ..., b16] where bn = hn:ln and hn and ln are higher and lower 4-bits of bn. - // - // Take PAND on tmp1 and tmp2, so that we mask out all the higher bits. - // tmp2 = [l1, ..., l16]. - pand := m.allocateInstr() - pand.asXmmRmR(sseOpcodePand, newOperandReg(tmp1), tmp2) - m.insert(pand) - - // Do logical (packed word) right shift by 4 on rm and PAND against the mask (tmp1); meaning that we have - // tmp3 = [h1, ...., h16]. 
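// The per-byte algorithm is the classic nibble-table population count: PSHUFB
// acts as a 16-entry table lookup on the low and high nibbles, and PADDB sums
// the two lookups. Scalar reference (illustrative only; popcnt8 is not a helper
// in this package):
func popcnt8(b byte) byte {
	table := [16]byte{0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}
	return table[b&0x0f] + table[b>>4]
}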
- tmp3 := m.copyToTmp(rn.reg()) - psrlw := m.allocateInstr() - psrlw.asXmmRmiReg(sseOpcodePsrlw, newOperandImm32(4), tmp3) - m.insert(psrlw) - - pand2 := m.allocateInstr() - pand2.asXmmRmR(sseOpcodePand, newOperandReg(tmp1), tmp3) - m.insert(pand2) - - // Read the popcntTable into tmp4, and we have - // tmp4 = [0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04] - tmp4 := m.c.AllocateVReg(ssa.TypeV128) - m.lowerVconst(tmp4, 0x03_02_02_01_02_01_01_00, 0x04_03_03_02_03_02_02_01) - - // Make a copy for later. - tmp5 := m.copyToTmp(tmp4) - - // tmp4 = [popcnt(l1), ..., popcnt(l16)]. - pshufb := m.allocateInstr() - pshufb.asXmmRmR(sseOpcodePshufb, newOperandReg(tmp2), tmp4) - m.insert(pshufb) - - pshufb2 := m.allocateInstr() - pshufb2.asXmmRmR(sseOpcodePshufb, newOperandReg(tmp3), tmp5) - m.insert(pshufb2) - - // tmp4 + tmp5 is the result. - paddb := m.allocateInstr() - paddb.asXmmRmR(sseOpcodePaddb, newOperandReg(tmp4), tmp5) - m.insert(paddb) - - m.copyTo(tmp5, rd) -} - -func (m *machine) lowerVImul(instr *ssa.Instruction) { - x, y, lane := instr.Arg2WithLane() - rd := m.c.VRegOf(instr.Return()) - if lane == ssa.VecLaneI64x2 { - rn := m.getOperand_Reg(m.c.ValueDefinition(x)) - rm := m.getOperand_Reg(m.c.ValueDefinition(y)) - // Assuming that we have - // rm = [p1, p2] = [p1_lo, p1_hi, p2_lo, p2_high] - // rn = [q1, q2] = [q1_lo, q1_hi, q2_lo, q2_high] - // where pN and qN are 64-bit (quad word) lane, and pN_lo, pN_hi, qN_lo and qN_hi are 32-bit (double word) lane. - - // Copy rn into tmp1. - tmp1 := m.copyToTmp(rn.reg()) - - // And do the logical right shift by 32-bit on tmp1, which makes tmp1 = [0, p1_high, 0, p2_high] - shift := m.allocateInstr() - shift.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(32), tmp1) - m.insert(shift) - - // Execute "pmuludq rm,tmp1", which makes tmp1 = [p1_high*q1_lo, p2_high*q2_lo] where each lane is 64-bit. - mul := m.allocateInstr() - mul.asXmmRmR(sseOpcodePmuludq, rm, tmp1) - m.insert(mul) - - // Copy rm value into tmp2. - tmp2 := m.copyToTmp(rm.reg()) - - // And do the logical right shift by 32-bit on tmp2, which makes tmp2 = [0, q1_high, 0, q2_high] - shift2 := m.allocateInstr() - shift2.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(32), tmp2) - m.insert(shift2) - - // Execute "pmuludq rm,tmp2", which makes tmp2 = [p1_lo*q1_high, p2_lo*q2_high] where each lane is 64-bit. - mul2 := m.allocateInstr() - mul2.asXmmRmR(sseOpcodePmuludq, rn, tmp2) - m.insert(mul2) - - // Adds tmp1 and tmp2 and do the logical left shift by 32-bit, - // which makes tmp1 = [(p1_lo*q1_high+p1_high*q1_lo)<<32, (p2_lo*q2_high+p2_high*q2_lo)<<32] - add := m.allocateInstr() - add.asXmmRmR(sseOpcodePaddq, newOperandReg(tmp2), tmp1) - m.insert(add) - - shift3 := m.allocateInstr() - shift3.asXmmRmiReg(sseOpcodePsllq, newOperandImm32(32), tmp1) - m.insert(shift3) - - // Copy rm value into tmp3. - tmp3 := m.copyToTmp(rm.reg()) - - // "pmuludq rm,tmp3" makes tmp3 = [p1_lo*q1_lo, p2_lo*q2_lo] where each lane is 64-bit. 
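// Per 64-bit lane, the PSRLQ/PMULUDQ/PADDQ/PSLLQ sequence computes the product
// modulo 2^64 from 32-bit halves; the pHi*qHi term vanishes because it would be
// shifted left by 64. Scalar reference (illustrative only; mul64FromHalves is
// not a helper in this package):
func mul64FromHalves(p, q uint64) uint64 {
	pLo, pHi := p&0xffffffff, p>>32
	qLo, qHi := q&0xffffffff, q>>32
	return pLo*qLo + (pLo*qHi+pHi*qLo)<<32 // Wraps mod 2^64, matching i64x2.mul.
}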
- mul3 := m.allocateInstr() - mul3.asXmmRmR(sseOpcodePmuludq, rn, tmp3) - m.insert(mul3) - - // Finally, we get the result by computing tmp1 + tmp3, - // which makes tmp1 = [(p1_lo*q1_high+p1_high*q1_lo)<<32+p1_lo*q1_lo, (p2_lo*q2_high+p2_high*q2_lo)<<32+p2_lo*q2_lo] - add2 := m.allocateInstr() - add2.asXmmRmR(sseOpcodePaddq, newOperandReg(tmp3), tmp1) - m.insert(add2) - - m.copyTo(tmp1, rd) - - } else { - var vecOp sseOpcode - switch lane { - case ssa.VecLaneI16x8: - vecOp = sseOpcodePmullw - case ssa.VecLaneI32x4: - vecOp = sseOpcodePmulld - default: - panic("unsupported: " + lane.String()) - } - m.lowerVbBinOp(vecOp, x, y, instr.Return()) - } -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/operands.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/operands.go deleted file mode 100644 index 787975683..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/operands.go +++ /dev/null @@ -1,336 +0,0 @@ -package amd64 - -import ( - "fmt" - "unsafe" - - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" - "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" -) - -type operand struct { - kind operandKind - data uint64 -} - -type operandKind byte - -const ( - // operandKindReg is an operand which is an integer Register. - operandKindReg operandKind = iota + 1 - - // operandKindMem is a value in Memory. - // 32, 64, or 128 bit value. - operandKindMem - - // operandKindImm32 is a signed-32-bit integer immediate value. - operandKindImm32 - - // operandKindLabel is a label. - operandKindLabel -) - -// String implements fmt.Stringer. -func (o operandKind) String() string { - switch o { - case operandKindReg: - return "reg" - case operandKindMem: - return "mem" - case operandKindImm32: - return "imm32" - case operandKindLabel: - return "label" - default: - panic("BUG: invalid operand kind") - } -} - -// format returns the string representation of the operand. -// _64 is only for the case where the operand is a register, and it's integer. 
-func (o *operand) format(_64 bool) string { - switch o.kind { - case operandKindReg: - return formatVRegSized(o.reg(), _64) - case operandKindMem: - return o.addressMode().String() - case operandKindImm32: - return fmt.Sprintf("$%d", int32(o.imm32())) - case operandKindLabel: - return label(o.imm32()).String() - default: - panic(fmt.Sprintf("BUG: invalid operand: %s", o.kind)) - } -} - -//go:inline -func (o *operand) reg() regalloc.VReg { - return regalloc.VReg(o.data) -} - -//go:inline -func (o *operand) setReg(r regalloc.VReg) { - o.data = uint64(r) -} - -//go:inline -func (o *operand) addressMode() *amode { - return wazevoapi.PtrFromUintptr[amode](uintptr(o.data)) -} - -//go:inline -func (o *operand) imm32() uint32 { - return uint32(o.data) -} - -func (o *operand) label() label { - switch o.kind { - case operandKindLabel: - return label(o.data) - case operandKindMem: - mem := o.addressMode() - if mem.kind() != amodeRipRel { - panic("BUG: invalid label") - } - return label(mem.imm32) - default: - panic("BUG: invalid operand kind") - } -} - -func newOperandLabel(label label) operand { - return operand{kind: operandKindLabel, data: uint64(label)} -} - -func newOperandReg(r regalloc.VReg) operand { - return operand{kind: operandKindReg, data: uint64(r)} -} - -func newOperandImm32(imm32 uint32) operand { - return operand{kind: operandKindImm32, data: uint64(imm32)} -} - -func newOperandMem(amode *amode) operand { - return operand{kind: operandKindMem, data: uint64(uintptr(unsafe.Pointer(amode)))} -} - -// amode is a memory operand (addressing mode). -type amode struct { - kindWithShift uint32 - imm32 uint32 - base regalloc.VReg - - // For amodeRegRegShift: - index regalloc.VReg -} - -type amodeKind byte - -const ( - // amodeRegRegShift calculates sign-extend-32-to-64(Immediate) + base - amodeImmReg amodeKind = iota + 1 - - // amodeImmRBP is the same as amodeImmReg, but the base register is fixed to RBP. - // The only differece is that it doesn't tell the register allocator to use RBP which is distracting for the - // register allocator. - amodeImmRBP - - // amodeRegRegShift calculates sign-extend-32-to-64(Immediate) + base + (Register2 << Shift) - amodeRegRegShift - - // amodeRipRel is a RIP-relative addressing mode specified by the label. - amodeRipRel - - // TODO: there are other addressing modes such as the one without base register. 
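// The amode kind and, for amodeRegRegShift, the index scale are packed into the single
// kindWithShift field: the kind occupies the low 8 bits and the shift the next 8 bits.
// A sketch of the packing, mirroring newAmodeRegRegShift, kind() and shift() below:
//
//	shift := byte(3) // scale by 8, as in (%rax,%rcx,8)
//	packed := uint32(amodeRegRegShift) | uint32(shift)<<8
//	_ = amodeKind(packed & 0xff) // recovers the kind
//	_ = packed >> 8              // recovers the shift; the effective scale is 1 << shift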
-) - -func (a *amode) kind() amodeKind { - return amodeKind(a.kindWithShift & 0xff) -} - -func (a *amode) shift() byte { - return byte(a.kindWithShift >> 8) -} - -func (a *amode) uses(rs *[]regalloc.VReg) { - switch a.kind() { - case amodeImmReg: - *rs = append(*rs, a.base) - case amodeRegRegShift: - *rs = append(*rs, a.base, a.index) - case amodeImmRBP, amodeRipRel: - default: - panic("BUG: invalid amode kind") - } -} - -func (a *amode) nregs() int { - switch a.kind() { - case amodeImmReg: - return 1 - case amodeRegRegShift: - return 2 - case amodeImmRBP, amodeRipRel: - return 0 - default: - panic("BUG: invalid amode kind") - } -} - -func (a *amode) assignUses(i int, reg regalloc.VReg) { - switch a.kind() { - case amodeImmReg: - if i == 0 { - a.base = reg - } else { - panic("BUG: invalid amode assignment") - } - case amodeRegRegShift: - if i == 0 { - a.base = reg - } else if i == 1 { - a.index = reg - } else { - panic("BUG: invalid amode assignment") - } - default: - panic("BUG: invalid amode assignment") - } -} - -func (m *machine) newAmodeImmReg(imm32 uint32, base regalloc.VReg) *amode { - ret := m.amodePool.Allocate() - *ret = amode{kindWithShift: uint32(amodeImmReg), imm32: imm32, base: base} - return ret -} - -func (m *machine) newAmodeImmRBPReg(imm32 uint32) *amode { - ret := m.amodePool.Allocate() - *ret = amode{kindWithShift: uint32(amodeImmRBP), imm32: imm32, base: rbpVReg} - return ret -} - -func (m *machine) newAmodeRegRegShift(imm32 uint32, base, index regalloc.VReg, shift byte) *amode { - if shift > 3 { - panic(fmt.Sprintf("BUG: invalid shift (must be 3>=): %d", shift)) - } - ret := m.amodePool.Allocate() - *ret = amode{kindWithShift: uint32(amodeRegRegShift) | uint32(shift)<<8, imm32: imm32, base: base, index: index} - return ret -} - -func (m *machine) newAmodeRipRel(label label) *amode { - ret := m.amodePool.Allocate() - *ret = amode{kindWithShift: uint32(amodeRipRel), imm32: uint32(label)} - return ret -} - -// String implements fmt.Stringer. -func (a *amode) String() string { - switch a.kind() { - case amodeImmReg, amodeImmRBP: - if a.imm32 == 0 { - return fmt.Sprintf("(%s)", formatVRegSized(a.base, true)) - } - return fmt.Sprintf("%d(%s)", int32(a.imm32), formatVRegSized(a.base, true)) - case amodeRegRegShift: - shift := 1 << a.shift() - if a.imm32 == 0 { - return fmt.Sprintf( - "(%s,%s,%d)", - formatVRegSized(a.base, true), formatVRegSized(a.index, true), shift) - } - return fmt.Sprintf( - "%d(%s,%s,%d)", - int32(a.imm32), formatVRegSized(a.base, true), formatVRegSized(a.index, true), shift) - case amodeRipRel: - return fmt.Sprintf("%s(%%rip)", label(a.imm32)) - default: - panic("BUG: invalid amode kind") - } -} - -func (m *machine) getOperand_Mem_Reg(def backend.SSAValueDefinition) (op operand) { - if !def.IsFromInstr() { - return newOperandReg(m.c.VRegOf(def.V)) - } - - if def.V.Type() == ssa.TypeV128 { - // SIMD instructions require strict memory alignment, so we don't support the memory operand for V128 at the moment. 
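// (Non-V128 values instead take the ssa.OpcodeLoad-folding path below, which can turn
// a load that feeds, say, an add into a single instruction with a memory operand
// rather than a separate load plus add; this describes the intent only, the exact
// fusion depends on the caller's instruction selection.)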
- return m.getOperand_Reg(def) - } - - if m.c.MatchInstr(def, ssa.OpcodeLoad) { - instr := def.Instr - ptr, offset, _ := instr.LoadData() - op = newOperandMem(m.lowerToAddressMode(ptr, offset)) - instr.MarkLowered() - return op - } - return m.getOperand_Reg(def) -} - -func (m *machine) getOperand_Mem_Imm32_Reg(def backend.SSAValueDefinition) (op operand) { - if !def.IsFromInstr() { - return newOperandReg(m.c.VRegOf(def.V)) - } - - if m.c.MatchInstr(def, ssa.OpcodeLoad) { - instr := def.Instr - ptr, offset, _ := instr.LoadData() - op = newOperandMem(m.lowerToAddressMode(ptr, offset)) - instr.MarkLowered() - return op - } - return m.getOperand_Imm32_Reg(def) -} - -func (m *machine) getOperand_Imm32_Reg(def backend.SSAValueDefinition) (op operand) { - if !def.IsFromInstr() { - return newOperandReg(m.c.VRegOf(def.V)) - } - - instr := def.Instr - if instr.Constant() { - // If the operation is 64-bit, x64 sign-extends the 32-bit immediate value. - // Therefore, we need to check if the immediate value is within the 32-bit range and if the sign bit is set, - // we should not use the immediate value. - if op, ok := asImm32Operand(instr.ConstantVal(), instr.Return().Type() == ssa.TypeI32); ok { - instr.MarkLowered() - return op - } - } - return m.getOperand_Reg(def) -} - -func asImm32Operand(val uint64, allowSignExt bool) (operand, bool) { - if imm32, ok := asImm32(val, allowSignExt); ok { - return newOperandImm32(imm32), true - } - return operand{}, false -} - -func asImm32(val uint64, allowSignExt bool) (uint32, bool) { - u32val := uint32(val) - if uint64(u32val) != val { - return 0, false - } - if !allowSignExt && u32val&0x80000000 != 0 { - return 0, false - } - return u32val, true -} - -func (m *machine) getOperand_Reg(def backend.SSAValueDefinition) (op operand) { - var v regalloc.VReg - if instr := def.Instr; instr != nil && instr.Constant() { - // We inline all the constant instructions so that we could reduce the register usage. - v = m.lowerConstant(instr) - instr.MarkLowered() - } else { - v = m.c.VRegOf(def.V) - } - return newOperandReg(v) -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reg.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reg.go deleted file mode 100644 index 4aec856fa..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reg.go +++ /dev/null @@ -1,181 +0,0 @@ -package amd64 - -import ( - "fmt" - - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" -) - -// Amd64-specific registers. -const ( - // rax is a gp register. - rax = regalloc.RealRegInvalid + 1 + iota - // rcx is a gp register. - rcx - // rdx is a gp register. - rdx - // rbx is a gp register. - rbx - // rsp is a gp register. - rsp - // rbp is a gp register. - rbp - // rsi is a gp register. - rsi - // rdi is a gp register. - rdi - // r8 is a gp register. - r8 - // r9 is a gp register. - r9 - // r10 is a gp register. - r10 - // r11 is a gp register. - r11 - // r12 is a gp register. - r12 - // r13 is a gp register. - r13 - // r14 is a gp register. - r14 - // r15 is a gp register. - r15 - - // xmm0 is a vector register. - xmm0 - // xmm1 is a vector register. - xmm1 - // xmm2 is a vector register. - xmm2 - // xmm3 is a vector register. - xmm3 - // xmm4 is a vector register. - xmm4 - // xmm5 is a vector register. - xmm5 - // xmm6 is a vector register. - xmm6 - // xmm7 is a vector register. - xmm7 - // xmm8 is a vector register. - xmm8 - // xmm9 is a vector register. 
- xmm9 - // xmm10 is a vector register. - xmm10 - // xmm11 is a vector register. - xmm11 - // xmm12 is a vector register. - xmm12 - // xmm13 is a vector register. - xmm13 - // xmm14 is a vector register. - xmm14 - // xmm15 is a vector register. - xmm15 -) - -var ( - raxVReg = regalloc.FromRealReg(rax, regalloc.RegTypeInt) - rcxVReg = regalloc.FromRealReg(rcx, regalloc.RegTypeInt) - rdxVReg = regalloc.FromRealReg(rdx, regalloc.RegTypeInt) - rbxVReg = regalloc.FromRealReg(rbx, regalloc.RegTypeInt) - rspVReg = regalloc.FromRealReg(rsp, regalloc.RegTypeInt) - rbpVReg = regalloc.FromRealReg(rbp, regalloc.RegTypeInt) - rsiVReg = regalloc.FromRealReg(rsi, regalloc.RegTypeInt) - rdiVReg = regalloc.FromRealReg(rdi, regalloc.RegTypeInt) - r8VReg = regalloc.FromRealReg(r8, regalloc.RegTypeInt) - r9VReg = regalloc.FromRealReg(r9, regalloc.RegTypeInt) - r10VReg = regalloc.FromRealReg(r10, regalloc.RegTypeInt) - r11VReg = regalloc.FromRealReg(r11, regalloc.RegTypeInt) - r12VReg = regalloc.FromRealReg(r12, regalloc.RegTypeInt) - r13VReg = regalloc.FromRealReg(r13, regalloc.RegTypeInt) - r14VReg = regalloc.FromRealReg(r14, regalloc.RegTypeInt) - r15VReg = regalloc.FromRealReg(r15, regalloc.RegTypeInt) - - xmm0VReg = regalloc.FromRealReg(xmm0, regalloc.RegTypeFloat) - xmm1VReg = regalloc.FromRealReg(xmm1, regalloc.RegTypeFloat) - xmm2VReg = regalloc.FromRealReg(xmm2, regalloc.RegTypeFloat) - xmm3VReg = regalloc.FromRealReg(xmm3, regalloc.RegTypeFloat) - xmm4VReg = regalloc.FromRealReg(xmm4, regalloc.RegTypeFloat) - xmm5VReg = regalloc.FromRealReg(xmm5, regalloc.RegTypeFloat) - xmm6VReg = regalloc.FromRealReg(xmm6, regalloc.RegTypeFloat) - xmm7VReg = regalloc.FromRealReg(xmm7, regalloc.RegTypeFloat) - xmm8VReg = regalloc.FromRealReg(xmm8, regalloc.RegTypeFloat) - xmm9VReg = regalloc.FromRealReg(xmm9, regalloc.RegTypeFloat) - xmm10VReg = regalloc.FromRealReg(xmm10, regalloc.RegTypeFloat) - xmm11VReg = regalloc.FromRealReg(xmm11, regalloc.RegTypeFloat) - xmm12VReg = regalloc.FromRealReg(xmm12, regalloc.RegTypeFloat) - xmm13VReg = regalloc.FromRealReg(xmm13, regalloc.RegTypeFloat) - xmm14VReg = regalloc.FromRealReg(xmm14, regalloc.RegTypeFloat) - xmm15VReg = regalloc.FromRealReg(xmm15, regalloc.RegTypeFloat) -) - -var regNames = [...]string{ - rax: "rax", - rcx: "rcx", - rdx: "rdx", - rbx: "rbx", - rsp: "rsp", - rbp: "rbp", - rsi: "rsi", - rdi: "rdi", - r8: "r8", - r9: "r9", - r10: "r10", - r11: "r11", - r12: "r12", - r13: "r13", - r14: "r14", - r15: "r15", - xmm0: "xmm0", - xmm1: "xmm1", - xmm2: "xmm2", - xmm3: "xmm3", - xmm4: "xmm4", - xmm5: "xmm5", - xmm6: "xmm6", - xmm7: "xmm7", - xmm8: "xmm8", - xmm9: "xmm9", - xmm10: "xmm10", - xmm11: "xmm11", - xmm12: "xmm12", - xmm13: "xmm13", - xmm14: "xmm14", - xmm15: "xmm15", -} - -func formatVRegSized(r regalloc.VReg, _64 bool) string { - if r.IsRealReg() { - if r.RegType() == regalloc.RegTypeInt { - rr := r.RealReg() - orig := regNames[rr] - if rr <= rdi { - if _64 { - return "%" + orig - } else { - return "%e" + orig[1:] - } - } else { - if _64 { - return "%" + orig - } else { - return "%" + orig + "d" - } - } - } else { - return "%" + regNames[r.RealReg()] - } - } else { - if r.RegType() == regalloc.RegTypeInt { - if _64 { - return fmt.Sprintf("%%r%d?", r.ID()) - } else { - return fmt.Sprintf("%%r%dd?", r.ID()) - } - } else { - return fmt.Sprintf("%%xmm%d?", r.ID()) - } - } -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/stack.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/stack.go 
deleted file mode 100644 index ef823bdbd..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/stack.go +++ /dev/null @@ -1,130 +0,0 @@ -package amd64 - -import ( - "encoding/binary" - "reflect" - "unsafe" - - "github.com/tetratelabs/wazero/internal/wasmdebug" -) - -func stackView(rbp, top uintptr) []byte { - l := int(top - rbp) - var stackBuf []byte - { - //nolint:staticcheck - hdr := (*reflect.SliceHeader)(unsafe.Pointer(&stackBuf)) - hdr.Data = rbp - hdr.Len = l - hdr.Cap = l - } - return stackBuf -} - -// UnwindStack implements wazevo.unwindStack. -func UnwindStack(_, rbp, top uintptr, returnAddresses []uintptr) []uintptr { - stackBuf := stackView(rbp, top) - - for i := uint64(0); i < uint64(len(stackBuf)); { - // (high address) - // +-----------------+ - // | ....... | - // | ret Y | - // | ....... | - // | ret 0 | - // | arg X | - // | ....... | - // | arg 1 | - // | arg 0 | - // | ReturnAddress | - // | Caller_RBP | - // +-----------------+ <---- Caller_RBP - // | ........... | - // | clobbered M | - // | ............ | - // | clobbered 0 | - // | spill slot N | - // | ............ | - // | spill slot 0 | - // | ReturnAddress | - // | Caller_RBP | - // +-----------------+ <---- RBP - // (low address) - - callerRBP := binary.LittleEndian.Uint64(stackBuf[i:]) - retAddr := binary.LittleEndian.Uint64(stackBuf[i+8:]) - returnAddresses = append(returnAddresses, uintptr(retAddr)) - i = callerRBP - uint64(rbp) - if len(returnAddresses) == wasmdebug.MaxFrames { - break - } - } - return returnAddresses -} - -// GoCallStackView implements wazevo.goCallStackView. -func GoCallStackView(stackPointerBeforeGoCall *uint64) []uint64 { - // (high address) - // +-----------------+ <----+ - // | xxxxxxxxxxx | | ;; optional unused space to make it 16-byte aligned. - // ^ | arg[N]/ret[M] | | - // sliceSize | | ............ | | SizeInBytes/8 - // | | arg[1]/ret[1] | | - // v | arg[0]/ret[0] | <----+ - // | SizeInBytes | - // +-----------------+ <---- stackPointerBeforeGoCall - // (low address) - data := unsafe.Add(unsafe.Pointer(stackPointerBeforeGoCall), 8) - size := *stackPointerBeforeGoCall / 8 - return unsafe.Slice((*uint64)(data), size) -} - -func AdjustClonedStack(oldRsp, oldTop, rsp, rbp, top uintptr) { - diff := uint64(rsp - oldRsp) - - newBuf := stackView(rbp, top) - for i := uint64(0); i < uint64(len(newBuf)); { - // (high address) - // +-----------------+ - // | ....... | - // | ret Y | - // | ....... | - // | ret 0 | - // | arg X | - // | ....... | - // | arg 1 | - // | arg 0 | - // | ReturnAddress | - // | Caller_RBP | - // +-----------------+ <---- Caller_RBP - // | ........... | - // | clobbered M | - // | ............ | - // | clobbered 0 | - // | spill slot N | - // | ............ | - // | spill slot 0 | - // | ReturnAddress | - // | Caller_RBP | - // +-----------------+ <---- RBP - // (low address) - - callerRBP := binary.LittleEndian.Uint64(newBuf[i:]) - if callerRBP == 0 { - // End of stack. 
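// (UnwindStack above and this loop both walk the frame-pointer chain: each frame
// stores [Caller_RBP, ReturnAddress] at its base, and following Caller_RBP hops to
// the caller's frame. A scalar sketch over a plain word slice, with walkChain and
// base as illustrative names only:
//
//	func walkChain(words []uint64, base uint64) (rets []uint64) {
//		for i := uint64(0); i+1 < uint64(len(words)); {
//			callerRBP, ret := words[i], words[i+1]
//			if callerRBP == 0 {
//				break // sentinel: end of stack
//			}
//			rets = append(rets, ret)
//			i = (callerRBP - base) / 8 // the caller's frame, as a word index
//		}
//		return rets
//	}
//
// where base is the numeric RBP value at which words begins.)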
- break - } - if i64 := int64(callerRBP); i64 < int64(oldRsp) || i64 >= int64(oldTop) { - panic("BUG: callerRBP is out of range") - } - if int(callerRBP) < 0 { - panic("BUG: callerRBP is negative") - } - adjustedCallerRBP := callerRBP + diff - if int(adjustedCallerRBP) < 0 { - panic("BUG: adjustedCallerRBP is negative") - } - binary.LittleEndian.PutUint64(newBuf[i:], adjustedCallerRBP) - i = adjustedCallerRBP - uint64(rbp) - } -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go deleted file mode 100644 index d1eaa7cd4..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go +++ /dev/null @@ -1,333 +0,0 @@ -package arm64 - -import ( - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" -) - -// References: -// * https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/cmd/compile/abi-internal.md#arm64-architecture -// * https://developer.arm.com/documentation/102374/0101/Procedure-Call-Standard - -var ( - intParamResultRegs = []regalloc.RealReg{x0, x1, x2, x3, x4, x5, x6, x7} - floatParamResultRegs = []regalloc.RealReg{v0, v1, v2, v3, v4, v5, v6, v7} -) - -var regInfo = ®alloc.RegisterInfo{ - AllocatableRegisters: [regalloc.NumRegType][]regalloc.RealReg{ - // We don't allocate: - // - x18: Reserved by the macOS: https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms#Respect-the-purpose-of-specific-CPU-registers - // - x28: Reserved by Go runtime. - // - x27(=tmpReg): because of the reason described on tmpReg. - regalloc.RegTypeInt: { - x8, x9, x10, x11, x12, x13, x14, x15, - x16, x17, x19, x20, x21, x22, x23, x24, x25, - x26, x29, x30, - // These are the argument/return registers. Less preferred in the allocation. - x7, x6, x5, x4, x3, x2, x1, x0, - }, - regalloc.RegTypeFloat: { - v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, - // These are the argument/return registers. Less preferred in the allocation. 
- v7, v6, v5, v4, v3, v2, v1, v0, - }, - }, - CalleeSavedRegisters: regalloc.NewRegSet( - x19, x20, x21, x22, x23, x24, x25, x26, x28, - v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, - ), - CallerSavedRegisters: regalloc.NewRegSet( - x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x29, x30, - v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, - ), - RealRegToVReg: []regalloc.VReg{ - x0: x0VReg, x1: x1VReg, x2: x2VReg, x3: x3VReg, x4: x4VReg, x5: x5VReg, x6: x6VReg, x7: x7VReg, x8: x8VReg, x9: x9VReg, x10: x10VReg, x11: x11VReg, x12: x12VReg, x13: x13VReg, x14: x14VReg, x15: x15VReg, x16: x16VReg, x17: x17VReg, x18: x18VReg, x19: x19VReg, x20: x20VReg, x21: x21VReg, x22: x22VReg, x23: x23VReg, x24: x24VReg, x25: x25VReg, x26: x26VReg, x27: x27VReg, x28: x28VReg, x29: x29VReg, x30: x30VReg, - v0: v0VReg, v1: v1VReg, v2: v2VReg, v3: v3VReg, v4: v4VReg, v5: v5VReg, v6: v6VReg, v7: v7VReg, v8: v8VReg, v9: v9VReg, v10: v10VReg, v11: v11VReg, v12: v12VReg, v13: v13VReg, v14: v14VReg, v15: v15VReg, v16: v16VReg, v17: v17VReg, v18: v18VReg, v19: v19VReg, v20: v20VReg, v21: v21VReg, v22: v22VReg, v23: v23VReg, v24: v24VReg, v25: v25VReg, v26: v26VReg, v27: v27VReg, v28: v28VReg, v29: v29VReg, v30: v30VReg, v31: v31VReg, - }, - RealRegName: func(r regalloc.RealReg) string { return regNames[r] }, - RealRegType: func(r regalloc.RealReg) regalloc.RegType { - if r < v0 { - return regalloc.RegTypeInt - } - return regalloc.RegTypeFloat - }, -} - -// ArgsResultsRegs implements backend.Machine. -func (m *machine) ArgsResultsRegs() (argResultInts, argResultFloats []regalloc.RealReg) { - return intParamResultRegs, floatParamResultRegs -} - -// LowerParams implements backend.FunctionABI. -func (m *machine) LowerParams(args []ssa.Value) { - a := m.currentABI - - for i, ssaArg := range args { - if !ssaArg.Valid() { - continue - } - reg := m.compiler.VRegOf(ssaArg) - arg := &a.Args[i] - if arg.Kind == backend.ABIArgKindReg { - m.InsertMove(reg, arg.Reg, arg.Type) - } else { - // TODO: we could use pair load if there's consecutive loads for the same type. - // - // (high address) - // +-----------------+ - // | ....... | - // | ret Y | - // | ....... | - // | ret 0 | - // | arg X | - // | ....... | - // | arg 1 | - // | arg 0 | <-| - // | ReturnAddress | | - // +-----------------+ | - // | ........... | | - // | clobbered M | | argStackOffset: is unknown at this point of compilation. - // | ............ | | - // | clobbered 0 | | - // | spill slot N | | - // | ........... | | - // | spill slot 0 | | - // SP---> +-----------------+ <-+ - // (low address) - - bits := arg.Type.Bits() - // At this point of compilation, we don't yet know how much space exist below the return address. - // So we instruct the address mode to add the `argStackOffset` to the offset at the later phase of compilation. - amode := m.amodePool.Allocate() - *amode = addressMode{imm: arg.Offset, rn: spVReg, kind: addressModeKindArgStackSpace} - load := m.allocateInstr() - switch arg.Type { - case ssa.TypeI32, ssa.TypeI64: - load.asULoad(reg, amode, bits) - case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: - load.asFpuLoad(reg, amode, bits) - default: - panic("BUG") - } - m.insert(load) - m.unresolvedAddressModes = append(m.unresolvedAddressModes, load) - } - } -} - -// LowerReturns lowers the given returns. 
-func (m *machine) LowerReturns(rets []ssa.Value) { - a := m.currentABI - - l := len(rets) - 1 - for i := range rets { - // Reverse order in order to avoid overwriting the stack returns existing in the return registers. - ret := rets[l-i] - r := &a.Rets[l-i] - reg := m.compiler.VRegOf(ret) - if def := m.compiler.ValueDefinition(ret); def.IsFromInstr() { - // Constant instructions are inlined. - if inst := def.Instr; inst.Constant() { - val := inst.Return() - valType := val.Type() - v := inst.ConstantVal() - m.insertLoadConstant(v, valType, reg) - } - } - if r.Kind == backend.ABIArgKindReg { - m.InsertMove(r.Reg, reg, ret.Type()) - } else { - // TODO: we could use pair store if there's consecutive stores for the same type. - // - // (high address) - // +-----------------+ - // | ....... | - // | ret Y | - // | ....... | - // | ret 0 | <-+ - // | arg X | | - // | ....... | | - // | arg 1 | | - // | arg 0 | | - // | ReturnAddress | | - // +-----------------+ | - // | ........... | | - // | spill slot M | | retStackOffset: is unknown at this point of compilation. - // | ............ | | - // | spill slot 2 | | - // | spill slot 1 | | - // | clobbered 0 | | - // | clobbered 1 | | - // | ........... | | - // | clobbered N | | - // SP---> +-----------------+ <-+ - // (low address) - - bits := r.Type.Bits() - - // At this point of compilation, we don't yet know how much space exist below the return address. - // So we instruct the address mode to add the `retStackOffset` to the offset at the later phase of compilation. - amode := m.amodePool.Allocate() - *amode = addressMode{imm: r.Offset, rn: spVReg, kind: addressModeKindResultStackSpace} - store := m.allocateInstr() - store.asStore(operandNR(reg), amode, bits) - m.insert(store) - m.unresolvedAddressModes = append(m.unresolvedAddressModes, store) - } - } -} - -// callerGenVRegToFunctionArg is the opposite of GenFunctionArgToVReg, which is used to generate the -// caller side of the function call. -func (m *machine) callerGenVRegToFunctionArg(a *backend.FunctionABI, argIndex int, reg regalloc.VReg, def backend.SSAValueDefinition, slotBegin int64) { - arg := &a.Args[argIndex] - if def.IsFromInstr() { - // Constant instructions are inlined. - if inst := def.Instr; inst.Constant() { - val := inst.Return() - valType := val.Type() - v := inst.ConstantVal() - m.insertLoadConstant(v, valType, reg) - } - } - if arg.Kind == backend.ABIArgKindReg { - m.InsertMove(arg.Reg, reg, arg.Type) - } else { - // TODO: we could use pair store if there's consecutive stores for the same type. - // - // Note that at this point, stack pointer is already adjusted. - bits := arg.Type.Bits() - amode := m.resolveAddressModeForOffset(arg.Offset-slotBegin, bits, spVReg, false) - store := m.allocateInstr() - store.asStore(operandNR(reg), amode, bits) - m.insert(store) - } -} - -func (m *machine) callerGenFunctionReturnVReg(a *backend.FunctionABI, retIndex int, reg regalloc.VReg, slotBegin int64) { - r := &a.Rets[retIndex] - if r.Kind == backend.ABIArgKindReg { - m.InsertMove(reg, r.Reg, r.Type) - } else { - // TODO: we could use pair load if there's consecutive loads for the same type. 
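// (Within the arg/result slot area, stack-passed arguments come first and stack-passed
// results follow, so a result's offset from the base of that area is
// ArgStackSize + r.Offset; that is what the address computed below encodes, with
// slotBegin accounting for where the area sits relative to the current SP.)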
- amode := m.resolveAddressModeForOffset(a.ArgStackSize+r.Offset-slotBegin, r.Type.Bits(), spVReg, false) - ldr := m.allocateInstr() - switch r.Type { - case ssa.TypeI32, ssa.TypeI64: - ldr.asULoad(reg, amode, r.Type.Bits()) - case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: - ldr.asFpuLoad(reg, amode, r.Type.Bits()) - default: - panic("BUG") - } - m.insert(ldr) - } -} - -func (m *machine) resolveAddressModeForOffsetAndInsert(cur *instruction, offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) (*instruction, *addressMode) { - m.pendingInstructions = m.pendingInstructions[:0] - mode := m.resolveAddressModeForOffset(offset, dstBits, rn, allowTmpRegUse) - for _, instr := range m.pendingInstructions { - cur = linkInstr(cur, instr) - } - return cur, mode -} - -func (m *machine) resolveAddressModeForOffset(offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) *addressMode { - if rn.RegType() != regalloc.RegTypeInt { - panic("BUG: rn should be a pointer: " + formatVRegSized(rn, 64)) - } - amode := m.amodePool.Allocate() - if offsetFitsInAddressModeKindRegUnsignedImm12(dstBits, offset) { - *amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: rn, imm: offset} - } else if offsetFitsInAddressModeKindRegSignedImm9(offset) { - *amode = addressMode{kind: addressModeKindRegSignedImm9, rn: rn, imm: offset} - } else { - var indexReg regalloc.VReg - if allowTmpRegUse { - m.lowerConstantI64(tmpRegVReg, offset) - indexReg = tmpRegVReg - } else { - indexReg = m.compiler.AllocateVReg(ssa.TypeI64) - m.lowerConstantI64(indexReg, offset) - } - *amode = addressMode{kind: addressModeKindRegReg, rn: rn, rm: indexReg, extOp: extendOpUXTX /* indicates index rm is 64-bit */} - } - return amode -} - -func (m *machine) lowerCall(si *ssa.Instruction) { - isDirectCall := si.Opcode() == ssa.OpcodeCall - var indirectCalleePtr ssa.Value - var directCallee ssa.FuncRef - var sigID ssa.SignatureID - var args []ssa.Value - if isDirectCall { - directCallee, sigID, args = si.CallData() - } else { - indirectCalleePtr, sigID, args, _ /* on arm64, the calling convention is compatible with the Go runtime */ = si.CallIndirectData() - } - calleeABI := m.compiler.GetFunctionABI(m.compiler.SSABuilder().ResolveSignature(sigID)) - - stackSlotSize := int64(calleeABI.AlignedArgResultStackSlotSize()) - if m.maxRequiredStackSizeForCalls < stackSlotSize+16 { - m.maxRequiredStackSizeForCalls = stackSlotSize + 16 // return address frame. 
- } - - for i, arg := range args { - reg := m.compiler.VRegOf(arg) - def := m.compiler.ValueDefinition(arg) - m.callerGenVRegToFunctionArg(calleeABI, i, reg, def, stackSlotSize) - } - - if isDirectCall { - call := m.allocateInstr() - call.asCall(directCallee, calleeABI) - m.insert(call) - } else { - ptr := m.compiler.VRegOf(indirectCalleePtr) - callInd := m.allocateInstr() - callInd.asCallIndirect(ptr, calleeABI) - m.insert(callInd) - } - - var index int - r1, rs := si.Returns() - if r1.Valid() { - m.callerGenFunctionReturnVReg(calleeABI, 0, m.compiler.VRegOf(r1), stackSlotSize) - index++ - } - - for _, r := range rs { - m.callerGenFunctionReturnVReg(calleeABI, index, m.compiler.VRegOf(r), stackSlotSize) - index++ - } -} - -func (m *machine) insertAddOrSubStackPointer(rd regalloc.VReg, diff int64, add bool) { - if imm12Operand, ok := asImm12Operand(uint64(diff)); ok { - alu := m.allocateInstr() - var ao aluOp - if add { - ao = aluOpAdd - } else { - ao = aluOpSub - } - alu.asALU(ao, rd, operandNR(spVReg), imm12Operand, true) - m.insert(alu) - } else { - m.lowerConstantI64(tmpRegVReg, diff) - alu := m.allocateInstr() - var ao aluOp - if add { - ao = aluOpAdd - } else { - ao = aluOpSub - } - alu.asALU(ao, rd, operandNR(spVReg), operandNR(tmpRegVReg), true) - m.insert(alu) - } -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.go deleted file mode 100644 index 5f0c613df..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.go +++ /dev/null @@ -1,9 +0,0 @@ -package arm64 - -// entrypoint enters the machine code generated by this backend which begins with the preamble generated by functionABI.EmitGoEntryPreamble below. -// This implements wazevo.entrypoint, and see the comments there for detail. -func entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultPtr *uint64, goAllocatedStackSlicePtr uintptr) - -// afterGoFunctionCallEntrypoint enters the machine code after growing the stack. -// This implements wazevo.afterGoFunctionCallEntrypoint, and see the comments there for detail. -func afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.s b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.s deleted file mode 100644 index 0b579f852..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.s +++ /dev/null @@ -1,29 +0,0 @@ -//go:build arm64 - -#include "funcdata.h" -#include "textflag.h" - -// See the comments on EmitGoEntryPreamble for what this function is supposed to do. -TEXT ·entrypoint(SB), NOSPLIT|NOFRAME, $0-48 - MOVD preambleExecutable+0(FP), R27 - MOVD functionExectuable+8(FP), R24 - MOVD executionContextPtr+16(FP), R0 - MOVD moduleContextPtr+24(FP), R1 - MOVD paramResultSlicePtr+32(FP), R19 - MOVD goAllocatedStackSlicePtr+40(FP), R26 - JMP (R27) - -TEXT ·afterGoFunctionCallEntrypoint(SB), NOSPLIT|NOFRAME, $0-32 - MOVD goCallReturnAddress+0(FP), R20 - MOVD executionContextPtr+8(FP), R0 - MOVD stackPointer+16(FP), R19 - - // Save the current FP(R29), SP and LR(R30) into the wazevo.executionContext (stored in R0). 
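// (Offsets assumed here, matching how the Go-side preamble uses
//  wazevoapi.ExecutionContextOffsets: 16(R0) = OriginalFramePointer,
//  24(R0) = OriginalStackPointer, 32(R0) = GoReturnAddress.)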
- MOVD R29, 16(R0) // Store FP(R29) into [RO, #ExecutionContextOffsets.OriginalFramePointer] - MOVD RSP, R27 // Move SP to R27 (temporary register) since SP cannot be stored directly in str instructions. - MOVD R27, 24(R0) // Store R27 into [RO, #ExecutionContextOffsets.OriginalFramePointer] - MOVD R30, 32(R0) // Store R30 into [R0, #ExecutionContextOffsets.GoReturnAddress] - - // Load the new stack pointer (which sits somewhere in Go-allocated stack) into SP. - MOVD R19, RSP - JMP (R20) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go deleted file mode 100644 index f8b5d97ac..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go +++ /dev/null @@ -1,233 +0,0 @@ -package arm64 - -import ( - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" - "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" -) - -// CompileEntryPreamble implements backend.Machine. This assumes `entrypoint` function (in abi_go_entry_arm64.s) passes: -// -// 1. First (execution context ptr) and Second arguments are already passed in x0, and x1. -// 2. param/result slice ptr in x19; the pointer to []uint64{} which is used to pass arguments and accept return values. -// 3. Go-allocated stack slice ptr in x26. -// 4. Function executable in x24. -// -// also SP and FP are correct Go-runtime-based values, and LR is the return address to the Go-side caller. -func (m *machine) CompileEntryPreamble(signature *ssa.Signature) []byte { - root := m.constructEntryPreamble(signature) - m.encode(root) - return m.compiler.Buf() -} - -var ( - executionContextPtrReg = x0VReg - // callee-saved regs so that they can be used in the prologue and epilogue. - paramResultSlicePtr = x19VReg - savedExecutionContextPtr = x20VReg - // goAllocatedStackPtr is not used in the epilogue. - goAllocatedStackPtr = x26VReg - // paramResultSliceCopied is not used in the epilogue. - paramResultSliceCopied = x25VReg - // tmpRegVReg is not used in the epilogue. - functionExecutable = x24VReg -) - -func (m *machine) goEntryPreamblePassArg(cur *instruction, paramSlicePtr regalloc.VReg, arg *backend.ABIArg, argStartOffsetFromSP int64) *instruction { - typ := arg.Type - bits := typ.Bits() - isStackArg := arg.Kind == backend.ABIArgKindStack - - var loadTargetReg operand - if !isStackArg { - loadTargetReg = operandNR(arg.Reg) - } else { - switch typ { - case ssa.TypeI32, ssa.TypeI64: - loadTargetReg = operandNR(x15VReg) - case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: - loadTargetReg = operandNR(v15VReg) - default: - panic("TODO?") - } - } - - var postIndexImm int64 - if typ == ssa.TypeV128 { - postIndexImm = 16 // v128 is represented as 2x64-bit in Go slice. 
- } else { - postIndexImm = 8 - } - loadMode := m.amodePool.Allocate() - *loadMode = addressMode{kind: addressModeKindPostIndex, rn: paramSlicePtr, imm: postIndexImm} - - instr := m.allocateInstr() - switch typ { - case ssa.TypeI32: - instr.asULoad(loadTargetReg.reg(), loadMode, 32) - case ssa.TypeI64: - instr.asULoad(loadTargetReg.reg(), loadMode, 64) - case ssa.TypeF32: - instr.asFpuLoad(loadTargetReg.reg(), loadMode, 32) - case ssa.TypeF64: - instr.asFpuLoad(loadTargetReg.reg(), loadMode, 64) - case ssa.TypeV128: - instr.asFpuLoad(loadTargetReg.reg(), loadMode, 128) - } - cur = linkInstr(cur, instr) - - if isStackArg { - var storeMode *addressMode - cur, storeMode = m.resolveAddressModeForOffsetAndInsert(cur, argStartOffsetFromSP+arg.Offset, bits, spVReg, true) - toStack := m.allocateInstr() - toStack.asStore(loadTargetReg, storeMode, bits) - cur = linkInstr(cur, toStack) - } - return cur -} - -func (m *machine) goEntryPreamblePassResult(cur *instruction, resultSlicePtr regalloc.VReg, result *backend.ABIArg, resultStartOffsetFromSP int64) *instruction { - isStackArg := result.Kind == backend.ABIArgKindStack - typ := result.Type - bits := typ.Bits() - - var storeTargetReg operand - if !isStackArg { - storeTargetReg = operandNR(result.Reg) - } else { - switch typ { - case ssa.TypeI32, ssa.TypeI64: - storeTargetReg = operandNR(x15VReg) - case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: - storeTargetReg = operandNR(v15VReg) - default: - panic("TODO?") - } - } - - var postIndexImm int64 - if typ == ssa.TypeV128 { - postIndexImm = 16 // v128 is represented as 2x64-bit in Go slice. - } else { - postIndexImm = 8 - } - - if isStackArg { - var loadMode *addressMode - cur, loadMode = m.resolveAddressModeForOffsetAndInsert(cur, resultStartOffsetFromSP+result.Offset, bits, spVReg, true) - toReg := m.allocateInstr() - switch typ { - case ssa.TypeI32, ssa.TypeI64: - toReg.asULoad(storeTargetReg.reg(), loadMode, bits) - case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: - toReg.asFpuLoad(storeTargetReg.reg(), loadMode, bits) - default: - panic("TODO?") - } - cur = linkInstr(cur, toReg) - } - - mode := m.amodePool.Allocate() - *mode = addressMode{kind: addressModeKindPostIndex, rn: resultSlicePtr, imm: postIndexImm} - instr := m.allocateInstr() - instr.asStore(storeTargetReg, mode, bits) - cur = linkInstr(cur, instr) - return cur -} - -func (m *machine) constructEntryPreamble(sig *ssa.Signature) (root *instruction) { - abi := backend.FunctionABI{} - abi.Init(sig, intParamResultRegs, floatParamResultRegs) - - root = m.allocateNop() - - //// ----------------------------------- prologue ----------------------------------- //// - - // First, we save executionContextPtrReg into a callee-saved register so that it can be used in epilogue as well. - // mov savedExecutionContextPtr, x0 - cur := m.move64(savedExecutionContextPtr, executionContextPtrReg, root) - - // Next, save the current FP, SP and LR into the wazevo.executionContext: - // str fp, [savedExecutionContextPtr, #OriginalFramePointer] - // mov tmp, sp ;; sp cannot be str'ed directly. 
- // str sp, [savedExecutionContextPtr, #OriginalStackPointer] - // str lr, [savedExecutionContextPtr, #GoReturnAddress] - cur = m.loadOrStoreAtExecutionContext(fpVReg, wazevoapi.ExecutionContextOffsetOriginalFramePointer, true, cur) - cur = m.move64(tmpRegVReg, spVReg, cur) - cur = m.loadOrStoreAtExecutionContext(tmpRegVReg, wazevoapi.ExecutionContextOffsetOriginalStackPointer, true, cur) - cur = m.loadOrStoreAtExecutionContext(lrVReg, wazevoapi.ExecutionContextOffsetGoReturnAddress, true, cur) - - // Then, move the Go-allocated stack pointer to SP: - // mov sp, goAllocatedStackPtr - cur = m.move64(spVReg, goAllocatedStackPtr, cur) - - prReg := paramResultSlicePtr - if len(abi.Args) > 2 && len(abi.Rets) > 0 { - // paramResultSlicePtr is modified during the execution of goEntryPreamblePassArg, - // so copy it to another reg. - cur = m.move64(paramResultSliceCopied, paramResultSlicePtr, cur) - prReg = paramResultSliceCopied - } - - stackSlotSize := int64(abi.AlignedArgResultStackSlotSize()) - for i := range abi.Args { - if i < 2 { - // module context ptr and execution context ptr are passed in x0 and x1 by the Go assembly function. - continue - } - arg := &abi.Args[i] - cur = m.goEntryPreamblePassArg(cur, prReg, arg, -stackSlotSize) - } - - // Call the real function. - bl := m.allocateInstr() - bl.asCallIndirect(functionExecutable, &abi) - cur = linkInstr(cur, bl) - - ///// ----------------------------------- epilogue ----------------------------------- ///// - - // Store the register results into paramResultSlicePtr. - for i := range abi.Rets { - cur = m.goEntryPreamblePassResult(cur, paramResultSlicePtr, &abi.Rets[i], abi.ArgStackSize-stackSlotSize) - } - - // Finally, restore the FP, SP and LR, and return to the Go code. - // ldr fp, [savedExecutionContextPtr, #OriginalFramePointer] - // ldr tmp, [savedExecutionContextPtr, #OriginalStackPointer] - // mov sp, tmp ;; sp cannot be str'ed directly. 
- // ldr lr, [savedExecutionContextPtr, #GoReturnAddress] - // ret ;; --> return to the Go code - cur = m.loadOrStoreAtExecutionContext(fpVReg, wazevoapi.ExecutionContextOffsetOriginalFramePointer, false, cur) - cur = m.loadOrStoreAtExecutionContext(tmpRegVReg, wazevoapi.ExecutionContextOffsetOriginalStackPointer, false, cur) - cur = m.move64(spVReg, tmpRegVReg, cur) - cur = m.loadOrStoreAtExecutionContext(lrVReg, wazevoapi.ExecutionContextOffsetGoReturnAddress, false, cur) - retInst := m.allocateInstr() - retInst.asRet() - linkInstr(cur, retInst) - return -} - -func (m *machine) move64(dst, src regalloc.VReg, prev *instruction) *instruction { - instr := m.allocateInstr() - instr.asMove64(dst, src) - return linkInstr(prev, instr) -} - -func (m *machine) loadOrStoreAtExecutionContext(d regalloc.VReg, offset wazevoapi.Offset, store bool, prev *instruction) *instruction { - instr := m.allocateInstr() - mode := m.amodePool.Allocate() - *mode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: savedExecutionContextPtr, imm: offset.I64()} - if store { - instr.asStore(operandNR(d), mode, 64) - } else { - instr.asULoad(d, mode, 64) - } - return linkInstr(prev, instr) -} - -func linkInstr(prev, next *instruction) *instruction { - prev.next = next - next.prev = prev - return next -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go deleted file mode 100644 index 06f8a4a05..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go +++ /dev/null @@ -1,430 +0,0 @@ -package arm64 - -import ( - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" - "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" -) - -var calleeSavedRegistersSorted = []regalloc.VReg{ - x19VReg, x20VReg, x21VReg, x22VReg, x23VReg, x24VReg, x25VReg, x26VReg, x28VReg, - v18VReg, v19VReg, v20VReg, v21VReg, v22VReg, v23VReg, v24VReg, v25VReg, v26VReg, v27VReg, v28VReg, v29VReg, v30VReg, v31VReg, -} - -// CompileGoFunctionTrampoline implements backend.Machine. -func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte { - argBegin := 1 // Skips exec context by default. - if needModuleContextPtr { - argBegin++ - } - - abi := &backend.FunctionABI{} - abi.Init(sig, intParamResultRegs, floatParamResultRegs) - m.currentABI = abi - - cur := m.allocateInstr() - cur.asNop0() - m.rootInstr = cur - - // Execution context is always the first argument. - execCtrPtr := x0VReg - - // In the following, we create the following stack layout: - // - // (high address) - // SP ------> +-----------------+ <----+ - // | ....... | | - // | ret Y | | - // | ....... | | - // | ret 0 | | - // | arg X | | size_of_arg_ret - // | ....... | | - // | arg 1 | | - // | arg 0 | <----+ <-------- originalArg0Reg - // | size_of_arg_ret | - // | ReturnAddress | - // +-----------------+ <----+ - // | xxxx | | ;; might be padded to make it 16-byte aligned. - // +--->| arg[N]/ret[M] | | - // sliceSize| | ............ | | goCallStackSize - // | | arg[1]/ret[1] | | - // +--->| arg[0]/ret[0] | <----+ <-------- arg0ret0AddrReg - // | sliceSize | - // | frame_size | - // +-----------------+ - // (low address) - // - // where the region of "arg[0]/ret[0] ... 
arg[N]/ret[M]" is the stack used by the Go functions, - // therefore will be accessed as the usual []uint64. So that's where we need to pass/receive - // the arguments/return values. - - // First of all, to update the SP, and create "ReturnAddress + size_of_arg_ret". - cur = m.createReturnAddrAndSizeOfArgRetSlot(cur) - - const frameInfoSize = 16 // == frame_size + sliceSize. - - // Next, we should allocate the stack for the Go function call if necessary. - goCallStackSize, sliceSizeInBytes := backend.GoFunctionCallRequiredStackSize(sig, argBegin) - cur = m.insertStackBoundsCheck(goCallStackSize+frameInfoSize, cur) - - originalArg0Reg := x17VReg // Caller save, so we can use it for whatever we want. - if m.currentABI.AlignedArgResultStackSlotSize() > 0 { - // At this point, SP points to `ReturnAddress`, so add 16 to get the original arg 0 slot. - cur = m.addsAddOrSubStackPointer(cur, originalArg0Reg, frameInfoSize, true) - } - - // Save the callee saved registers. - cur = m.saveRegistersInExecutionContext(cur, calleeSavedRegistersSorted) - - if needModuleContextPtr { - offset := wazevoapi.ExecutionContextOffsetGoFunctionCallCalleeModuleContextOpaque.I64() - if !offsetFitsInAddressModeKindRegUnsignedImm12(64, offset) { - panic("BUG: too large or un-aligned offset for goFunctionCallCalleeModuleContextOpaque in execution context") - } - - // Module context is always the second argument. - moduleCtrPtr := x1VReg - store := m.allocateInstr() - amode := m.amodePool.Allocate() - *amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: execCtrPtr, imm: offset} - store.asStore(operandNR(moduleCtrPtr), amode, 64) - cur = linkInstr(cur, store) - } - - // Advances the stack pointer. - cur = m.addsAddOrSubStackPointer(cur, spVReg, goCallStackSize, false) - - // Copy the pointer to x15VReg. - arg0ret0AddrReg := x15VReg // Caller save, so we can use it for whatever we want. - copySp := m.allocateInstr() - copySp.asMove64(arg0ret0AddrReg, spVReg) - cur = linkInstr(cur, copySp) - - // Next, we need to store all the arguments to the stack in the typical Wasm stack style. - for i := range abi.Args[argBegin:] { - arg := &abi.Args[argBegin+i] - store := m.allocateInstr() - var v regalloc.VReg - if arg.Kind == backend.ABIArgKindReg { - v = arg.Reg - } else { - cur, v = m.goFunctionCallLoadStackArg(cur, originalArg0Reg, arg, - // Caller save, so we can use it for whatever we want. - x11VReg, v11VReg) - } - - var sizeInBits byte - if arg.Type == ssa.TypeV128 { - sizeInBits = 128 - } else { - sizeInBits = 64 - } - amode := m.amodePool.Allocate() - *amode = addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg, imm: int64(sizeInBits / 8)} - store.asStore(operandNR(v), amode, sizeInBits) - cur = linkInstr(cur, store) - } - - // Finally, now that we've advanced SP to arg[0]/ret[0], we allocate `frame_size + sliceSize`. - var frameSizeReg, sliceSizeReg regalloc.VReg - if goCallStackSize > 0 { - cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, goCallStackSize) - frameSizeReg = tmpRegVReg - cur = m.lowerConstantI64AndInsert(cur, x16VReg, sliceSizeInBytes/8) - sliceSizeReg = x16VReg - } else { - frameSizeReg = xzrVReg - sliceSizeReg = xzrVReg - } - _amode := addressModePreOrPostIndex(m, spVReg, -16, true) - storeP := m.allocateInstr() - storeP.asStorePair64(frameSizeReg, sliceSizeReg, _amode) - cur = linkInstr(cur, storeP) - - // Set the exit status on the execution context. - cur = m.setExitCode(cur, x0VReg, exitCode) - - // Save the current stack pointer. 
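// (The saved value lets the host locate the arg/ret region laid out above while the
// Go function runs, via ExecutionContextOffsetStackPointerBeforeGoCall; see
// goCallStackView for how that region is read back as a []uint64.)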
- cur = m.saveCurrentStackPointer(cur, x0VReg) - - // Exit the execution. - cur = m.storeReturnAddressAndExit(cur) - - // After the call, we need to restore the callee saved registers. - cur = m.restoreRegistersInExecutionContext(cur, calleeSavedRegistersSorted) - - // Get the pointer to the arg[0]/ret[0]: We need to skip `frame_size + sliceSize`. - if len(abi.Rets) > 0 { - cur = m.addsAddOrSubStackPointer(cur, arg0ret0AddrReg, frameInfoSize, true) - } - - // Advances the SP so that it points to `ReturnAddress`. - cur = m.addsAddOrSubStackPointer(cur, spVReg, frameInfoSize+goCallStackSize, true) - ldr := m.allocateInstr() - // And load the return address. - amode := addressModePreOrPostIndex(m, spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */) - ldr.asULoad(lrVReg, amode, 64) - cur = linkInstr(cur, ldr) - - originalRet0Reg := x17VReg // Caller save, so we can use it for whatever we want. - if m.currentABI.RetStackSize > 0 { - cur = m.addsAddOrSubStackPointer(cur, originalRet0Reg, m.currentABI.ArgStackSize, true) - } - - // Make the SP point to the original address (above the result slot). - if s := int64(m.currentABI.AlignedArgResultStackSlotSize()); s > 0 { - cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true) - } - - for i := range abi.Rets { - r := &abi.Rets[i] - if r.Kind == backend.ABIArgKindReg { - loadIntoReg := m.allocateInstr() - mode := m.amodePool.Allocate() - *mode = addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg} - switch r.Type { - case ssa.TypeI32: - mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - loadIntoReg.asULoad(r.Reg, mode, 32) - case ssa.TypeI64: - mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - loadIntoReg.asULoad(r.Reg, mode, 64) - case ssa.TypeF32: - mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - loadIntoReg.asFpuLoad(r.Reg, mode, 32) - case ssa.TypeF64: - mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - loadIntoReg.asFpuLoad(r.Reg, mode, 64) - case ssa.TypeV128: - mode.imm = 16 - loadIntoReg.asFpuLoad(r.Reg, mode, 128) - default: - panic("TODO") - } - cur = linkInstr(cur, loadIntoReg) - } else { - // First we need to load the value to a temporary just like ^^. - intTmp, floatTmp := x11VReg, v11VReg - loadIntoTmpReg := m.allocateInstr() - mode := m.amodePool.Allocate() - *mode = addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg} - var resultReg regalloc.VReg - switch r.Type { - case ssa.TypeI32: - mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - loadIntoTmpReg.asULoad(intTmp, mode, 32) - resultReg = intTmp - case ssa.TypeI64: - mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - loadIntoTmpReg.asULoad(intTmp, mode, 64) - resultReg = intTmp - case ssa.TypeF32: - mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - loadIntoTmpReg.asFpuLoad(floatTmp, mode, 32) - resultReg = floatTmp - case ssa.TypeF64: - mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. 
- loadIntoTmpReg.asFpuLoad(floatTmp, mode, 64) - resultReg = floatTmp - case ssa.TypeV128: - mode.imm = 16 - loadIntoTmpReg.asFpuLoad(floatTmp, mode, 128) - resultReg = floatTmp - default: - panic("TODO") - } - cur = linkInstr(cur, loadIntoTmpReg) - cur = m.goFunctionCallStoreStackResult(cur, originalRet0Reg, r, resultReg) - } - } - - ret := m.allocateInstr() - ret.asRet() - linkInstr(cur, ret) - - m.encode(m.rootInstr) - return m.compiler.Buf() -} - -func (m *machine) saveRegistersInExecutionContext(cur *instruction, regs []regalloc.VReg) *instruction { - offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64() - for _, v := range regs { - store := m.allocateInstr() - var sizeInBits byte - switch v.RegType() { - case regalloc.RegTypeInt: - sizeInBits = 64 - case regalloc.RegTypeFloat: - sizeInBits = 128 - } - mode := m.amodePool.Allocate() - *mode = addressMode{ - kind: addressModeKindRegUnsignedImm12, - // Execution context is always the first argument. - rn: x0VReg, imm: offset, - } - store.asStore(operandNR(v), mode, sizeInBits) - store.prev = cur - cur.next = store - cur = store - offset += 16 // Imm12 must be aligned 16 for vector regs, so we unconditionally store regs at the offset of multiple of 16. - } - return cur -} - -func (m *machine) restoreRegistersInExecutionContext(cur *instruction, regs []regalloc.VReg) *instruction { - offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64() - for _, v := range regs { - load := m.allocateInstr() - var as func(dst regalloc.VReg, amode *addressMode, sizeInBits byte) - var sizeInBits byte - switch v.RegType() { - case regalloc.RegTypeInt: - as = load.asULoad - sizeInBits = 64 - case regalloc.RegTypeFloat: - as = load.asFpuLoad - sizeInBits = 128 - } - mode := m.amodePool.Allocate() - *mode = addressMode{ - kind: addressModeKindRegUnsignedImm12, - // Execution context is always the first argument. - rn: x0VReg, imm: offset, - } - as(v, mode, sizeInBits) - cur = linkInstr(cur, load) - offset += 16 // Imm12 must be aligned 16 for vector regs, so we unconditionally load regs at the offset of multiple of 16. - } - return cur -} - -func (m *machine) lowerConstantI64AndInsert(cur *instruction, dst regalloc.VReg, v int64) *instruction { - m.pendingInstructions = m.pendingInstructions[:0] - m.lowerConstantI64(dst, v) - for _, instr := range m.pendingInstructions { - cur = linkInstr(cur, instr) - } - return cur -} - -func (m *machine) lowerConstantI32AndInsert(cur *instruction, dst regalloc.VReg, v int32) *instruction { - m.pendingInstructions = m.pendingInstructions[:0] - m.lowerConstantI32(dst, v) - for _, instr := range m.pendingInstructions { - cur = linkInstr(cur, instr) - } - return cur -} - -func (m *machine) setExitCode(cur *instruction, execCtr regalloc.VReg, exitCode wazevoapi.ExitCode) *instruction { - constReg := x17VReg // caller-saved, so we can use it. - cur = m.lowerConstantI32AndInsert(cur, constReg, int32(exitCode)) - - // Set the exit status on the execution context. - setExistStatus := m.allocateInstr() - mode := m.amodePool.Allocate() - *mode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: execCtr, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64()} - setExistStatus.asStore(operandNR(constReg), mode, 32) - cur = linkInstr(cur, setExistStatus) - return cur -} - -func (m *machine) storeReturnAddressAndExit(cur *instruction) *instruction { - // Read the return address into tmp, and store it in the execution context. 
- adr := m.allocateInstr() - adr.asAdr(tmpRegVReg, exitSequenceSize+8) - cur = linkInstr(cur, adr) - - storeReturnAddr := m.allocateInstr() - mode := m.amodePool.Allocate() - *mode = addressMode{ - kind: addressModeKindRegUnsignedImm12, - // Execution context is always the first argument. - rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(), - } - storeReturnAddr.asStore(operandNR(tmpRegVReg), mode, 64) - cur = linkInstr(cur, storeReturnAddr) - - // Exit the execution. - trapSeq := m.allocateInstr() - trapSeq.asExitSequence(x0VReg) - cur = linkInstr(cur, trapSeq) - return cur -} - -func (m *machine) saveCurrentStackPointer(cur *instruction, execCtr regalloc.VReg) *instruction { - // Save the current stack pointer: - // mov tmp, sp, - // str tmp, [exec_ctx, #stackPointerBeforeGoCall] - movSp := m.allocateInstr() - movSp.asMove64(tmpRegVReg, spVReg) - cur = linkInstr(cur, movSp) - - strSp := m.allocateInstr() - mode := m.amodePool.Allocate() - *mode = addressMode{ - kind: addressModeKindRegUnsignedImm12, - rn: execCtr, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(), - } - strSp.asStore(operandNR(tmpRegVReg), mode, 64) - cur = linkInstr(cur, strSp) - return cur -} - -func (m *machine) goFunctionCallLoadStackArg(cur *instruction, originalArg0Reg regalloc.VReg, arg *backend.ABIArg, intVReg, floatVReg regalloc.VReg) (*instruction, regalloc.VReg) { - load := m.allocateInstr() - var result regalloc.VReg - mode := m.amodePool.Allocate() - *mode = addressMode{kind: addressModeKindPostIndex, rn: originalArg0Reg} - switch arg.Type { - case ssa.TypeI32: - mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - load.asULoad(intVReg, mode, 32) - result = intVReg - case ssa.TypeI64: - mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - load.asULoad(intVReg, mode, 64) - result = intVReg - case ssa.TypeF32: - mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - load.asFpuLoad(floatVReg, mode, 32) - result = floatVReg - case ssa.TypeF64: - mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. 
- load.asFpuLoad(floatVReg, mode, 64) - result = floatVReg - case ssa.TypeV128: - mode.imm = 16 - load.asFpuLoad(floatVReg, mode, 128) - result = floatVReg - default: - panic("TODO") - } - - cur = linkInstr(cur, load) - return cur, result -} - -func (m *machine) goFunctionCallStoreStackResult(cur *instruction, originalRet0Reg regalloc.VReg, result *backend.ABIArg, resultVReg regalloc.VReg) *instruction { - store := m.allocateInstr() - mode := m.amodePool.Allocate() - *mode = addressMode{kind: addressModeKindPostIndex, rn: originalRet0Reg} - var sizeInBits byte - switch result.Type { - case ssa.TypeI32, ssa.TypeF32: - mode.imm = 8 - sizeInBits = 32 - case ssa.TypeI64, ssa.TypeF64: - mode.imm = 8 - sizeInBits = 64 - case ssa.TypeV128: - mode.imm = 16 - sizeInBits = 128 - default: - panic("TODO") - } - store.asStore(operandNR(resultVReg), mode, sizeInBits) - return linkInstr(cur, store) -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/cond.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/cond.go deleted file mode 100644 index 6f6cdd1b2..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/cond.go +++ /dev/null @@ -1,215 +0,0 @@ -package arm64 - -import ( - "strconv" - - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" -) - -type ( - cond uint64 - condKind byte -) - -const ( - // condKindRegisterZero represents a condition which checks if the register is zero. - // This indicates that the instruction must be encoded as CBZ: - // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/CBZ--Compare-and-Branch-on-Zero- - condKindRegisterZero condKind = iota - // condKindRegisterNotZero indicates that the instruction must be encoded as CBNZ: - // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/CBNZ--Compare-and-Branch-on-Nonzero- - condKindRegisterNotZero - // condKindCondFlagSet indicates that the instruction must be encoded as B.cond: - // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/B-cond--Branch-conditionally- - condKindCondFlagSet -) - -// kind returns the kind of condition which is stored in the first two bits. -func (c cond) kind() condKind { - return condKind(c & 0b11) -} - -func (c cond) asUint64() uint64 { - return uint64(c) -} - -// register returns the register for register conditions. -// This panics if the condition is not a register condition (condKindRegisterZero or condKindRegisterNotZero). -func (c cond) register() regalloc.VReg { - if c.kind() != condKindRegisterZero && c.kind() != condKindRegisterNotZero { - panic("condition is not a register") - } - return regalloc.VReg(c >> 2) -} - -func registerAsRegZeroCond(r regalloc.VReg) cond { - return cond(r)<<2 | cond(condKindRegisterZero) -} - -func registerAsRegNotZeroCond(r regalloc.VReg) cond { - return cond(r)<<2 | cond(condKindRegisterNotZero) -} - -func (c cond) flag() condFlag { - if c.kind() != condKindCondFlagSet { - panic("condition is not a flag") - } - return condFlag(c >> 2) -} - -func (c condFlag) asCond() cond { - return cond(c)<<2 | cond(condKindCondFlagSet) -} - -// condFlag represents a condition flag for conditional branches. -// The value matches the encoding of condition flags in the ARM64 instruction set. 
-// https://developer.arm.com/documentation/den0024/a/The-A64-instruction-set/Data-processing-instructions/Conditional-instructions -type condFlag uint8 - -const ( - eq condFlag = iota // eq represents "equal" - ne // ne represents "not equal" - hs // hs represents "higher or same" - lo // lo represents "lower" - mi // mi represents "minus or negative result" - pl // pl represents "plus or positive result" - vs // vs represents "overflow set" - vc // vc represents "overflow clear" - hi // hi represents "higher" - ls // ls represents "lower or same" - ge // ge represents "greater or equal" - lt // lt represents "less than" - gt // gt represents "greater than" - le // le represents "less than or equal" - al // al represents "always" - nv // nv represents "never" -) - -// invert returns the inverted condition. -func (c condFlag) invert() condFlag { - switch c { - case eq: - return ne - case ne: - return eq - case hs: - return lo - case lo: - return hs - case mi: - return pl - case pl: - return mi - case vs: - return vc - case vc: - return vs - case hi: - return ls - case ls: - return hi - case ge: - return lt - case lt: - return ge - case gt: - return le - case le: - return gt - case al: - return nv - case nv: - return al - default: - panic(c) - } -} - -// String implements fmt.Stringer. -func (c condFlag) String() string { - switch c { - case eq: - return "eq" - case ne: - return "ne" - case hs: - return "hs" - case lo: - return "lo" - case mi: - return "mi" - case pl: - return "pl" - case vs: - return "vs" - case vc: - return "vc" - case hi: - return "hi" - case ls: - return "ls" - case ge: - return "ge" - case lt: - return "lt" - case gt: - return "gt" - case le: - return "le" - case al: - return "al" - case nv: - return "nv" - default: - panic(strconv.Itoa(int(c))) - } -} - -// condFlagFromSSAIntegerCmpCond returns the condition flag for the given ssa.IntegerCmpCond. -func condFlagFromSSAIntegerCmpCond(c ssa.IntegerCmpCond) condFlag { - switch c { - case ssa.IntegerCmpCondEqual: - return eq - case ssa.IntegerCmpCondNotEqual: - return ne - case ssa.IntegerCmpCondSignedLessThan: - return lt - case ssa.IntegerCmpCondSignedGreaterThanOrEqual: - return ge - case ssa.IntegerCmpCondSignedGreaterThan: - return gt - case ssa.IntegerCmpCondSignedLessThanOrEqual: - return le - case ssa.IntegerCmpCondUnsignedLessThan: - return lo - case ssa.IntegerCmpCondUnsignedGreaterThanOrEqual: - return hs - case ssa.IntegerCmpCondUnsignedGreaterThan: - return hi - case ssa.IntegerCmpCondUnsignedLessThanOrEqual: - return ls - default: - panic(c) - } -} - -// condFlagFromSSAFloatCmpCond returns the condition flag for the given ssa.FloatCmpCond. 
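The cond value above is a small tagged encoding: the low two bits carry the condition kind, and the payload (a register for CBZ/CBNZ conditions, a condFlag for B.cond) sits in the bits above them, as registerAsRegZeroCond, register, asCond and flag show. A standalone sketch of that packing using plain uint64 values; all names here are illustrative and the real types are unexported.

package main

import "fmt"

const (
	kindRegisterZero    = 0 // encoded as CBZ
	kindRegisterNotZero = 1 // encoded as CBNZ
	kindCondFlagSet     = 2 // encoded as B.cond
)

// pack stores the kind in the low two bits and the payload above them.
func pack(kind, payload uint64) uint64 { return payload<<2 | kind }

// unpack reverses pack.
func unpack(c uint64) (kind, payload uint64) { return c & 0b11, c >> 2 }

func main() {
	const vregID = 42 // stand-in for a virtual register ID
	c := pack(kindRegisterZero, vregID)
	kind, payload := unpack(c)
	fmt.Println(kind == kindRegisterZero, payload == vregID) // true true

	const ne = 1 // stand-in for the condFlag "ne"
	c = pack(kindCondFlagSet, ne)
	kind, payload = unpack(c)
	fmt.Println(kind == kindCondFlagSet, payload == ne) // true true
}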
-func condFlagFromSSAFloatCmpCond(c ssa.FloatCmpCond) condFlag { - switch c { - case ssa.FloatCmpCondEqual: - return eq - case ssa.FloatCmpCondNotEqual: - return ne - case ssa.FloatCmpCondLessThan: - return mi - case ssa.FloatCmpCondLessThanOrEqual: - return ls - case ssa.FloatCmpCondGreaterThan: - return gt - case ssa.FloatCmpCondGreaterThanOrEqual: - return ge - default: - panic(c) - } -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go deleted file mode 100644 index 1f563428a..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go +++ /dev/null @@ -1,2534 +0,0 @@ -package arm64 - -import ( - "fmt" - "math" - "unsafe" - - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" - "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" -) - -type ( - // instruction represents either a real instruction in arm64, or the meta instructions - // that are convenient for code generation. For example, inline constants are also treated - // as instructions. - // - // Basically, each instruction knows how to get encoded in binaries. Hence, the final output of compilation - // can be considered equivalent to the sequence of such instructions. - // - // Each field is interpreted depending on the kind. - // - // TODO: optimize the layout later once the impl settles. - instruction struct { - prev, next *instruction - u1, u2 uint64 - rd regalloc.VReg - rm, rn operand - kind instructionKind - addedBeforeRegAlloc bool - } - - // instructionKind represents the kind of instruction. - // This controls how the instruction struct is interpreted. - instructionKind byte -) - -// IsCall implements regalloc.Instr IsCall. -func (i *instruction) IsCall() bool { - return i.kind == call -} - -// IsIndirectCall implements regalloc.Instr IsIndirectCall. -func (i *instruction) IsIndirectCall() bool { - return i.kind == callInd -} - -// IsReturn implements regalloc.Instr IsReturn. 
-func (i *instruction) IsReturn() bool { - return i.kind == ret -} - -type defKind byte - -const ( - defKindNone defKind = iota + 1 - defKindRD - defKindCall -) - -var defKinds = [numInstructionKinds]defKind{ - adr: defKindRD, - aluRRR: defKindRD, - aluRRRR: defKindRD, - aluRRImm12: defKindRD, - aluRRBitmaskImm: defKindRD, - aluRRRShift: defKindRD, - aluRRImmShift: defKindRD, - aluRRRExtend: defKindRD, - bitRR: defKindRD, - movZ: defKindRD, - movK: defKindRD, - movN: defKindRD, - mov32: defKindRD, - mov64: defKindRD, - fpuMov64: defKindRD, - fpuMov128: defKindRD, - fpuRR: defKindRD, - fpuRRR: defKindRD, - nop0: defKindNone, - call: defKindCall, - callInd: defKindCall, - ret: defKindNone, - store8: defKindNone, - store16: defKindNone, - store32: defKindNone, - store64: defKindNone, - exitSequence: defKindNone, - condBr: defKindNone, - br: defKindNone, - brTableSequence: defKindNone, - cSet: defKindRD, - extend: defKindRD, - fpuCmp: defKindNone, - uLoad8: defKindRD, - uLoad16: defKindRD, - uLoad32: defKindRD, - sLoad8: defKindRD, - sLoad16: defKindRD, - sLoad32: defKindRD, - uLoad64: defKindRD, - fpuLoad32: defKindRD, - fpuLoad64: defKindRD, - fpuLoad128: defKindRD, - vecLoad1R: defKindRD, - loadFpuConst32: defKindRD, - loadFpuConst64: defKindRD, - loadFpuConst128: defKindRD, - fpuStore32: defKindNone, - fpuStore64: defKindNone, - fpuStore128: defKindNone, - udf: defKindNone, - cSel: defKindRD, - fpuCSel: defKindRD, - movToVec: defKindRD, - movFromVec: defKindRD, - movFromVecSigned: defKindRD, - vecDup: defKindRD, - vecDupElement: defKindRD, - vecExtract: defKindRD, - vecMisc: defKindRD, - vecMovElement: defKindRD, - vecLanes: defKindRD, - vecShiftImm: defKindRD, - vecTbl: defKindRD, - vecTbl2: defKindRD, - vecPermute: defKindRD, - vecRRR: defKindRD, - vecRRRRewrite: defKindNone, - fpuToInt: defKindRD, - intToFpu: defKindRD, - cCmpImm: defKindNone, - movToFPSR: defKindNone, - movFromFPSR: defKindRD, - emitSourceOffsetInfo: defKindNone, - atomicRmw: defKindRD, - atomicCas: defKindNone, - atomicLoad: defKindRD, - atomicStore: defKindNone, - dmb: defKindNone, - loadConstBlockArg: defKindRD, -} - -// Defs returns the list of regalloc.VReg that are defined by the instruction. -// In order to reduce the number of allocations, the caller can pass the slice to be used. -func (i *instruction) Defs(regs *[]regalloc.VReg) []regalloc.VReg { - *regs = (*regs)[:0] - switch defKinds[i.kind] { - case defKindNone: - case defKindRD: - *regs = append(*regs, i.rd) - case defKindCall: - _, _, retIntRealRegs, retFloatRealRegs, _ := backend.ABIInfoFromUint64(i.u2) - for i := byte(0); i < retIntRealRegs; i++ { - *regs = append(*regs, regInfo.RealRegToVReg[intParamResultRegs[i]]) - } - for i := byte(0); i < retFloatRealRegs; i++ { - *regs = append(*regs, regInfo.RealRegToVReg[floatParamResultRegs[i]]) - } - default: - panic(fmt.Sprintf("defKind for %v not defined", i)) - } - return *regs -} - -// AssignDef implements regalloc.Instr AssignDef. -func (i *instruction) AssignDef(reg regalloc.VReg) { - switch defKinds[i.kind] { - case defKindNone: - case defKindRD: - i.rd = reg - case defKindCall: - panic("BUG: call instructions shouldn't be assigned") - default: - panic(fmt.Sprintf("defKind for %v not defined", i)) - } -} - -type useKind byte - -const ( - useKindNone useKind = iota + 1 - useKindRN - useKindRNRM - useKindRNRMRA - useKindRNRN1RM - useKindCall - useKindCallInd - useKindAMode - useKindRNAMode - useKindCond - // useKindRDRewrite indicates an instruction where RD is used both as a source and destination. 
- // A temporary register for RD must be allocated explicitly with the source copied to this - // register before the instruction and the value copied from this register to the instruction - // return register. - useKindRDRewrite -) - -var useKinds = [numInstructionKinds]useKind{ - udf: useKindNone, - aluRRR: useKindRNRM, - aluRRRR: useKindRNRMRA, - aluRRImm12: useKindRN, - aluRRBitmaskImm: useKindRN, - aluRRRShift: useKindRNRM, - aluRRImmShift: useKindRN, - aluRRRExtend: useKindRNRM, - bitRR: useKindRN, - movZ: useKindNone, - movK: useKindNone, - movN: useKindNone, - mov32: useKindRN, - mov64: useKindRN, - fpuMov64: useKindRN, - fpuMov128: useKindRN, - fpuRR: useKindRN, - fpuRRR: useKindRNRM, - nop0: useKindNone, - call: useKindCall, - callInd: useKindCallInd, - ret: useKindNone, - store8: useKindRNAMode, - store16: useKindRNAMode, - store32: useKindRNAMode, - store64: useKindRNAMode, - exitSequence: useKindRN, - condBr: useKindCond, - br: useKindNone, - brTableSequence: useKindRN, - cSet: useKindNone, - extend: useKindRN, - fpuCmp: useKindRNRM, - uLoad8: useKindAMode, - uLoad16: useKindAMode, - uLoad32: useKindAMode, - sLoad8: useKindAMode, - sLoad16: useKindAMode, - sLoad32: useKindAMode, - uLoad64: useKindAMode, - fpuLoad32: useKindAMode, - fpuLoad64: useKindAMode, - fpuLoad128: useKindAMode, - fpuStore32: useKindRNAMode, - fpuStore64: useKindRNAMode, - fpuStore128: useKindRNAMode, - loadFpuConst32: useKindNone, - loadFpuConst64: useKindNone, - loadFpuConst128: useKindNone, - vecLoad1R: useKindRN, - cSel: useKindRNRM, - fpuCSel: useKindRNRM, - movToVec: useKindRN, - movFromVec: useKindRN, - movFromVecSigned: useKindRN, - vecDup: useKindRN, - vecDupElement: useKindRN, - vecExtract: useKindRNRM, - cCmpImm: useKindRN, - vecMisc: useKindRN, - vecMovElement: useKindRN, - vecLanes: useKindRN, - vecShiftImm: useKindRN, - vecTbl: useKindRNRM, - vecTbl2: useKindRNRN1RM, - vecRRR: useKindRNRM, - vecRRRRewrite: useKindRDRewrite, - vecPermute: useKindRNRM, - fpuToInt: useKindRN, - intToFpu: useKindRN, - movToFPSR: useKindRN, - movFromFPSR: useKindNone, - adr: useKindNone, - emitSourceOffsetInfo: useKindNone, - atomicRmw: useKindRNRM, - atomicCas: useKindRDRewrite, - atomicLoad: useKindRN, - atomicStore: useKindRNRM, - loadConstBlockArg: useKindNone, - dmb: useKindNone, -} - -// Uses returns the list of regalloc.VReg that are used by the instruction. -// In order to reduce the number of allocations, the caller can pass the slice to be used. 
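As the comment above notes, Defs and Uses reset the caller-supplied slice to length zero and append into it, so a single scratch slice can be reused across every query without allocating. A standalone sketch of that caller-side pattern with a toy instruction type; the real types are internal to this package.

package main

import "fmt"

type instr struct{ srcs []int }

// uses mirrors the reset-then-append shape of (*instruction).Uses.
func (i *instr) uses(regs *[]int) []int {
	*regs = (*regs)[:0]
	*regs = append(*regs, i.srcs...)
	return *regs
}

func main() {
	insns := []instr{{srcs: []int{1, 2}}, {srcs: []int{3}}, {srcs: []int{4, 5, 6}}}

	var scratch []int // reused across the whole loop, so the backing array is shared
	for _, in := range insns {
		for _, r := range in.uses(&scratch) {
			fmt.Println("use:", r)
		}
	}
}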
-func (i *instruction) Uses(regs *[]regalloc.VReg) []regalloc.VReg { - *regs = (*regs)[:0] - switch useKinds[i.kind] { - case useKindNone: - case useKindRN: - if rn := i.rn.reg(); rn.Valid() { - *regs = append(*regs, rn) - } - case useKindRNRM: - if rn := i.rn.reg(); rn.Valid() { - *regs = append(*regs, rn) - } - if rm := i.rm.reg(); rm.Valid() { - *regs = append(*regs, rm) - } - case useKindRNRMRA: - if rn := i.rn.reg(); rn.Valid() { - *regs = append(*regs, rn) - } - if rm := i.rm.reg(); rm.Valid() { - *regs = append(*regs, rm) - } - if ra := regalloc.VReg(i.u2); ra.Valid() { - *regs = append(*regs, ra) - } - case useKindRNRN1RM: - if rn := i.rn.reg(); rn.Valid() && rn.IsRealReg() { - rn1 := regalloc.FromRealReg(rn.RealReg()+1, rn.RegType()) - *regs = append(*regs, rn, rn1) - } - if rm := i.rm.reg(); rm.Valid() { - *regs = append(*regs, rm) - } - case useKindAMode: - amode := i.getAmode() - if amodeRN := amode.rn; amodeRN.Valid() { - *regs = append(*regs, amodeRN) - } - if amodeRM := amode.rm; amodeRM.Valid() { - *regs = append(*regs, amodeRM) - } - case useKindRNAMode: - *regs = append(*regs, i.rn.reg()) - amode := i.getAmode() - if amodeRN := amode.rn; amodeRN.Valid() { - *regs = append(*regs, amodeRN) - } - if amodeRM := amode.rm; amodeRM.Valid() { - *regs = append(*regs, amodeRM) - } - case useKindCond: - cnd := cond(i.u1) - if cnd.kind() != condKindCondFlagSet { - *regs = append(*regs, cnd.register()) - } - case useKindCallInd: - *regs = append(*regs, i.rn.nr()) - fallthrough - case useKindCall: - argIntRealRegs, argFloatRealRegs, _, _, _ := backend.ABIInfoFromUint64(i.u2) - for i := byte(0); i < argIntRealRegs; i++ { - *regs = append(*regs, regInfo.RealRegToVReg[intParamResultRegs[i]]) - } - for i := byte(0); i < argFloatRealRegs; i++ { - *regs = append(*regs, regInfo.RealRegToVReg[floatParamResultRegs[i]]) - } - case useKindRDRewrite: - *regs = append(*regs, i.rn.reg()) - *regs = append(*regs, i.rm.reg()) - *regs = append(*regs, i.rd) - default: - panic(fmt.Sprintf("useKind for %v not defined", i)) - } - return *regs -} - -func (i *instruction) AssignUse(index int, reg regalloc.VReg) { - switch useKinds[i.kind] { - case useKindNone: - case useKindRN: - if rn := i.rn.reg(); rn.Valid() { - i.rn = i.rn.assignReg(reg) - } - case useKindRNRM: - if index == 0 { - if rn := i.rn.reg(); rn.Valid() { - i.rn = i.rn.assignReg(reg) - } - } else { - if rm := i.rm.reg(); rm.Valid() { - i.rm = i.rm.assignReg(reg) - } - } - case useKindRDRewrite: - if index == 0 { - if rn := i.rn.reg(); rn.Valid() { - i.rn = i.rn.assignReg(reg) - } - } else if index == 1 { - if rm := i.rm.reg(); rm.Valid() { - i.rm = i.rm.assignReg(reg) - } - } else { - if rd := i.rd; rd.Valid() { - i.rd = reg - } - } - case useKindRNRN1RM: - if index == 0 { - if rn := i.rn.reg(); rn.Valid() { - i.rn = i.rn.assignReg(reg) - } - if rn1 := i.rn.reg() + 1; rn1.Valid() { - i.rm = i.rm.assignReg(reg + 1) - } - } else { - if rm := i.rm.reg(); rm.Valid() { - i.rm = i.rm.assignReg(reg) - } - } - case useKindRNRMRA: - if index == 0 { - if rn := i.rn.reg(); rn.Valid() { - i.rn = i.rn.assignReg(reg) - } - } else if index == 1 { - if rm := i.rm.reg(); rm.Valid() { - i.rm = i.rm.assignReg(reg) - } - } else { - if ra := regalloc.VReg(i.u2); ra.Valid() { - i.u2 = uint64(reg) - } - } - case useKindAMode: - if index == 0 { - amode := i.getAmode() - if amodeRN := amode.rn; amodeRN.Valid() { - amode.rn = reg - } - } else { - amode := i.getAmode() - if amodeRM := amode.rm; amodeRM.Valid() { - amode.rm = reg - } - } - case useKindRNAMode: - if index == 
0 { - i.rn = i.rn.assignReg(reg) - } else if index == 1 { - amode := i.getAmode() - if amodeRN := amode.rn; amodeRN.Valid() { - amode.rn = reg - } else { - panic("BUG") - } - } else { - amode := i.getAmode() - if amodeRM := amode.rm; amodeRM.Valid() { - amode.rm = reg - } else { - panic("BUG") - } - } - case useKindCond: - c := cond(i.u1) - switch c.kind() { - case condKindRegisterZero: - i.u1 = uint64(registerAsRegZeroCond(reg)) - case condKindRegisterNotZero: - i.u1 = uint64(registerAsRegNotZeroCond(reg)) - } - case useKindCall: - panic("BUG: call instructions shouldn't be assigned") - case useKindCallInd: - i.rn = i.rn.assignReg(reg) - default: - panic(fmt.Sprintf("useKind for %v not defined", i)) - } -} - -func (i *instruction) asCall(ref ssa.FuncRef, abi *backend.FunctionABI) { - i.kind = call - i.u1 = uint64(ref) - if abi != nil { - i.u2 = abi.ABIInfoAsUint64() - } -} - -func (i *instruction) asCallIndirect(ptr regalloc.VReg, abi *backend.FunctionABI) { - i.kind = callInd - i.rn = operandNR(ptr) - if abi != nil { - i.u2 = abi.ABIInfoAsUint64() - } -} - -func (i *instruction) callFuncRef() ssa.FuncRef { - return ssa.FuncRef(i.u1) -} - -// shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false) -func (i *instruction) asMOVZ(dst regalloc.VReg, imm uint64, shift uint32, dst64bit bool) { - i.kind = movZ - i.rd = dst - i.u1 = imm - i.u2 = uint64(shift) - if dst64bit { - i.u2 |= 1 << 32 - } -} - -// shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false) -func (i *instruction) asMOVK(dst regalloc.VReg, imm uint64, shift uint32, dst64bit bool) { - i.kind = movK - i.rd = dst - i.u1 = imm - i.u2 = uint64(shift) - if dst64bit { - i.u2 |= 1 << 32 - } -} - -// shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false) -func (i *instruction) asMOVN(dst regalloc.VReg, imm uint64, shift uint32, dst64bit bool) { - i.kind = movN - i.rd = dst - i.u1 = imm - i.u2 = uint64(shift) - if dst64bit { - i.u2 |= 1 << 32 - } -} - -func (i *instruction) asNop0() *instruction { - i.kind = nop0 - return i -} - -func (i *instruction) asNop0WithLabel(l label) { - i.kind = nop0 - i.u1 = uint64(l) -} - -func (i *instruction) nop0Label() label { - return label(i.u1) -} - -func (i *instruction) asRet() { - i.kind = ret -} - -func (i *instruction) asStorePair64(src1, src2 regalloc.VReg, amode *addressMode) { - i.kind = storeP64 - i.rn = operandNR(src1) - i.rm = operandNR(src2) - i.setAmode(amode) -} - -func (i *instruction) asLoadPair64(src1, src2 regalloc.VReg, amode *addressMode) { - i.kind = loadP64 - i.rn = operandNR(src1) - i.rm = operandNR(src2) - i.setAmode(amode) -} - -func (i *instruction) asStore(src operand, amode *addressMode, sizeInBits byte) { - switch sizeInBits { - case 8: - i.kind = store8 - case 16: - i.kind = store16 - case 32: - if src.reg().RegType() == regalloc.RegTypeInt { - i.kind = store32 - } else { - i.kind = fpuStore32 - } - case 64: - if src.reg().RegType() == regalloc.RegTypeInt { - i.kind = store64 - } else { - i.kind = fpuStore64 - } - case 128: - i.kind = fpuStore128 - } - i.rn = src - i.setAmode(amode) -} - -func (i *instruction) asSLoad(dst regalloc.VReg, amode *addressMode, sizeInBits byte) { - switch sizeInBits { - case 8: - i.kind = sLoad8 - case 16: - i.kind = sLoad16 - case 32: - i.kind = sLoad32 - default: - panic("BUG") - } - i.rd = dst - i.setAmode(amode) -} - -func (i *instruction) asULoad(dst regalloc.VReg, amode *addressMode, 
sizeInBits byte) { - switch sizeInBits { - case 8: - i.kind = uLoad8 - case 16: - i.kind = uLoad16 - case 32: - i.kind = uLoad32 - case 64: - i.kind = uLoad64 - } - i.rd = dst - i.setAmode(amode) -} - -func (i *instruction) asFpuLoad(dst regalloc.VReg, amode *addressMode, sizeInBits byte) { - switch sizeInBits { - case 32: - i.kind = fpuLoad32 - case 64: - i.kind = fpuLoad64 - case 128: - i.kind = fpuLoad128 - } - i.rd = dst - i.setAmode(amode) -} - -func (i *instruction) getAmode() *addressMode { - return wazevoapi.PtrFromUintptr[addressMode](uintptr(i.u1)) -} - -func (i *instruction) setAmode(a *addressMode) { - i.u1 = uint64(uintptr(unsafe.Pointer(a))) -} - -func (i *instruction) asVecLoad1R(rd regalloc.VReg, rn operand, arr vecArrangement) { - // NOTE: currently only has support for no-offset loads, though it is suspicious that - // we would need to support offset load (that is only available for post-index). - i.kind = vecLoad1R - i.rd = rd - i.rn = rn - i.u1 = uint64(arr) -} - -func (i *instruction) asCSet(rd regalloc.VReg, mask bool, c condFlag) { - i.kind = cSet - i.rd = rd - i.u1 = uint64(c) - if mask { - i.u2 = 1 - } -} - -func (i *instruction) asCSel(rd regalloc.VReg, rn, rm operand, c condFlag, _64bit bool) { - i.kind = cSel - i.rd = rd - i.rn = rn - i.rm = rm - i.u1 = uint64(c) - if _64bit { - i.u2 = 1 - } -} - -func (i *instruction) asFpuCSel(rd regalloc.VReg, rn, rm operand, c condFlag, _64bit bool) { - i.kind = fpuCSel - i.rd = rd - i.rn = rn - i.rm = rm - i.u1 = uint64(c) - if _64bit { - i.u2 = 1 - } -} - -func (i *instruction) asBr(target label) { - if target == labelReturn { - panic("BUG: call site should special case for returnLabel") - } - i.kind = br - i.u1 = uint64(target) -} - -func (i *instruction) asBrTableSequence(indexReg regalloc.VReg, targetIndex, targetCounts int) { - i.kind = brTableSequence - i.rn = operandNR(indexReg) - i.u1 = uint64(targetIndex) - i.u2 = uint64(targetCounts) -} - -func (i *instruction) brTableSequenceOffsetsResolved() { - i.rm.data = 1 // indicate that the offsets are resolved, for debugging. -} - -func (i *instruction) brLabel() label { - return label(i.u1) -} - -// brOffsetResolved is called when the target label is resolved. -func (i *instruction) brOffsetResolve(offset int64) { - i.u2 = uint64(offset) - i.rm.data = 1 // indicate that the offset is resolved, for debugging. -} - -func (i *instruction) brOffset() int64 { - return int64(i.u2) -} - -// asCondBr encodes a conditional branch instruction. is64bit is only needed when cond is not flag. -func (i *instruction) asCondBr(c cond, target label, is64bit bool) { - i.kind = condBr - i.u1 = c.asUint64() - i.u2 = uint64(target) - if is64bit { - i.u2 |= 1 << 32 - } -} - -func (i *instruction) setCondBrTargets(target label) { - i.u2 = uint64(target) -} - -func (i *instruction) condBrLabel() label { - return label(i.u2) -} - -// condBrOffsetResolve is called when the target label is resolved. -func (i *instruction) condBrOffsetResolve(offset int64) { - i.rn.data = uint64(offset) - i.rn.data2 = 1 // indicate that the offset is resolved, for debugging. -} - -// condBrOffsetResolved returns true if condBrOffsetResolve is already called. 
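The asMOVZ/asMOVK/asMOVN helpers earlier in this file take the shift already divided by 16, i.e. a chunk index of 0-3 for 64-bit destinations, and the String method prints it back as lsl shift*16. A standalone sketch of splitting a 64-bit constant into 16-bit chunks in that form; how the backend chooses the minimal movz/movk sequence is not shown in this file, so this only illustrates the shift contract.

package main

import "fmt"

func main() {
	const c uint64 = 0x1234_5678_9abc_def0

	for shift := uint32(0); shift < 4; shift++ {
		chunk := uint16(c >> (16 * shift)) // 16-bit slice at bit offset shift*16
		op := "movk"
		if shift == 0 {
			op = "movz" // the first chunk establishes the value, the rest patch it
		}
		// Matches the textual form used by (*instruction).String: lsl is shift*16.
		fmt.Printf("%s x0, #%#x, lsl %d\n", op, chunk, shift*16)
	}
}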
-func (i *instruction) condBrOffsetResolved() bool { - return i.rn.data2 == 1 -} - -func (i *instruction) condBrOffset() int64 { - return int64(i.rn.data) -} - -func (i *instruction) condBrCond() cond { - return cond(i.u1) -} - -func (i *instruction) condBr64bit() bool { - return i.u2&(1<<32) != 0 -} - -func (i *instruction) asLoadFpuConst32(rd regalloc.VReg, raw uint64) { - i.kind = loadFpuConst32 - i.u1 = raw - i.rd = rd -} - -func (i *instruction) asLoadFpuConst64(rd regalloc.VReg, raw uint64) { - i.kind = loadFpuConst64 - i.u1 = raw - i.rd = rd -} - -func (i *instruction) asLoadFpuConst128(rd regalloc.VReg, lo, hi uint64) { - i.kind = loadFpuConst128 - i.u1 = lo - i.u2 = hi - i.rd = rd -} - -func (i *instruction) asFpuCmp(rn, rm operand, is64bit bool) { - i.kind = fpuCmp - i.rn, i.rm = rn, rm - if is64bit { - i.u1 = 1 - } -} - -func (i *instruction) asCCmpImm(rn operand, imm uint64, c condFlag, flag byte, is64bit bool) { - i.kind = cCmpImm - i.rn = rn - i.rm.data = imm - i.u1 = uint64(c) - i.u2 = uint64(flag) - if is64bit { - i.u2 |= 1 << 32 - } -} - -// asALU setups a basic ALU instruction. -func (i *instruction) asALU(aluOp aluOp, rd regalloc.VReg, rn, rm operand, dst64bit bool) { - switch rm.kind { - case operandKindNR: - i.kind = aluRRR - case operandKindSR: - i.kind = aluRRRShift - case operandKindER: - i.kind = aluRRRExtend - case operandKindImm12: - i.kind = aluRRImm12 - default: - panic("BUG") - } - i.u1 = uint64(aluOp) - i.rd, i.rn, i.rm = rd, rn, rm - if dst64bit { - i.u2 |= 1 << 32 - } -} - -// asALU setups a basic ALU instruction. -func (i *instruction) asALURRRR(aluOp aluOp, rd regalloc.VReg, rn, rm operand, ra regalloc.VReg, dst64bit bool) { - i.kind = aluRRRR - i.u1 = uint64(aluOp) - i.rd, i.rn, i.rm, i.u2 = rd, rn, rm, uint64(ra) - if dst64bit { - i.u1 |= 1 << 32 - } -} - -// asALUShift setups a shift based ALU instruction. -func (i *instruction) asALUShift(aluOp aluOp, rd regalloc.VReg, rn, rm operand, dst64bit bool) { - switch rm.kind { - case operandKindNR: - i.kind = aluRRR // If the shift amount op is a register, then the instruction is encoded as a normal ALU instruction with two register operands. 
- case operandKindShiftImm: - i.kind = aluRRImmShift - default: - panic("BUG") - } - i.u1 = uint64(aluOp) - i.rd, i.rn, i.rm = rd, rn, rm - if dst64bit { - i.u2 |= 1 << 32 - } -} - -func (i *instruction) asALUBitmaskImm(aluOp aluOp, rd, rn regalloc.VReg, imm uint64, dst64bit bool) { - i.kind = aluRRBitmaskImm - i.u1 = uint64(aluOp) - i.rn, i.rd = operandNR(rn), rd - i.u2 = imm - if dst64bit { - i.u1 |= 1 << 32 - } -} - -func (i *instruction) asMovToFPSR(rn regalloc.VReg) { - i.kind = movToFPSR - i.rn = operandNR(rn) -} - -func (i *instruction) asMovFromFPSR(rd regalloc.VReg) { - i.kind = movFromFPSR - i.rd = rd -} - -func (i *instruction) asBitRR(bitOp bitOp, rd, rn regalloc.VReg, is64bit bool) { - i.kind = bitRR - i.rn, i.rd = operandNR(rn), rd - i.u1 = uint64(bitOp) - if is64bit { - i.u2 = 1 - } -} - -func (i *instruction) asFpuRRR(op fpuBinOp, rd regalloc.VReg, rn, rm operand, dst64bit bool) { - i.kind = fpuRRR - i.u1 = uint64(op) - i.rd, i.rn, i.rm = rd, rn, rm - if dst64bit { - i.u2 = 1 - } -} - -func (i *instruction) asFpuRR(op fpuUniOp, rd regalloc.VReg, rn operand, dst64bit bool) { - i.kind = fpuRR - i.u1 = uint64(op) - i.rd, i.rn = rd, rn - if dst64bit { - i.u2 = 1 - } -} - -func (i *instruction) asExtend(rd, rn regalloc.VReg, fromBits, toBits byte, signed bool) { - i.kind = extend - i.rn, i.rd = operandNR(rn), rd - i.u1 = uint64(fromBits) - i.u2 = uint64(toBits) - if signed { - i.u2 |= 1 << 32 - } -} - -func (i *instruction) asMove32(rd, rn regalloc.VReg) { - i.kind = mov32 - i.rn, i.rd = operandNR(rn), rd -} - -func (i *instruction) asMove64(rd, rn regalloc.VReg) *instruction { - i.kind = mov64 - i.rn, i.rd = operandNR(rn), rd - return i -} - -func (i *instruction) asFpuMov64(rd, rn regalloc.VReg) { - i.kind = fpuMov64 - i.rn, i.rd = operandNR(rn), rd -} - -func (i *instruction) asFpuMov128(rd, rn regalloc.VReg) *instruction { - i.kind = fpuMov128 - i.rn, i.rd = operandNR(rn), rd - return i -} - -func (i *instruction) asMovToVec(rd regalloc.VReg, rn operand, arr vecArrangement, index vecIndex) { - i.kind = movToVec - i.rd = rd - i.rn = rn - i.u1, i.u2 = uint64(arr), uint64(index) -} - -func (i *instruction) asMovFromVec(rd regalloc.VReg, rn operand, arr vecArrangement, index vecIndex, signed bool) { - if signed { - i.kind = movFromVecSigned - } else { - i.kind = movFromVec - } - i.rd = rd - i.rn = rn - i.u1, i.u2 = uint64(arr), uint64(index) -} - -func (i *instruction) asVecDup(rd regalloc.VReg, rn operand, arr vecArrangement) { - i.kind = vecDup - i.u1 = uint64(arr) - i.rn, i.rd = rn, rd -} - -func (i *instruction) asVecDupElement(rd regalloc.VReg, rn operand, arr vecArrangement, index vecIndex) { - i.kind = vecDupElement - i.u1 = uint64(arr) - i.rn, i.rd = rn, rd - i.u2 = uint64(index) -} - -func (i *instruction) asVecExtract(rd regalloc.VReg, rn, rm operand, arr vecArrangement, index uint32) { - i.kind = vecExtract - i.u1 = uint64(arr) - i.rn, i.rm, i.rd = rn, rm, rd - i.u2 = uint64(index) -} - -func (i *instruction) asVecMovElement(rd regalloc.VReg, rn operand, arr vecArrangement, rdIndex, rnIndex vecIndex) { - i.kind = vecMovElement - i.u1 = uint64(arr) - i.u2 = uint64(rdIndex) | uint64(rnIndex)<<32 - i.rn, i.rd = rn, rd -} - -func (i *instruction) asVecMisc(op vecOp, rd regalloc.VReg, rn operand, arr vecArrangement) { - i.kind = vecMisc - i.u1 = uint64(op) - i.rn, i.rd = rn, rd - i.u2 = uint64(arr) -} - -func (i *instruction) asVecLanes(op vecOp, rd regalloc.VReg, rn operand, arr vecArrangement) { - i.kind = vecLanes - i.u1 = uint64(op) - i.rn, i.rd = rn, rd - i.u2 = 
uint64(arr) -} - -func (i *instruction) asVecShiftImm(op vecOp, rd regalloc.VReg, rn, rm operand, arr vecArrangement) *instruction { - i.kind = vecShiftImm - i.u1 = uint64(op) - i.rn, i.rm, i.rd = rn, rm, rd - i.u2 = uint64(arr) - return i -} - -func (i *instruction) asVecTbl(nregs byte, rd regalloc.VReg, rn, rm operand, arr vecArrangement) { - switch nregs { - case 0, 1: - i.kind = vecTbl - case 2: - i.kind = vecTbl2 - if !rn.reg().IsRealReg() { - panic("rn is not a RealReg") - } - if rn.realReg() == v31 { - panic("rn cannot be v31") - } - default: - panic(fmt.Sprintf("unsupported number of registers %d", nregs)) - } - i.rn, i.rm, i.rd = rn, rm, rd - i.u2 = uint64(arr) -} - -func (i *instruction) asVecPermute(op vecOp, rd regalloc.VReg, rn, rm operand, arr vecArrangement) { - i.kind = vecPermute - i.u1 = uint64(op) - i.rn, i.rm, i.rd = rn, rm, rd - i.u2 = uint64(arr) -} - -func (i *instruction) asVecRRR(op vecOp, rd regalloc.VReg, rn, rm operand, arr vecArrangement) *instruction { - i.kind = vecRRR - i.u1 = uint64(op) - i.rn, i.rd, i.rm = rn, rd, rm - i.u2 = uint64(arr) - return i -} - -// asVecRRRRewrite encodes a vector instruction that rewrites the destination register. -// IMPORTANT: the destination register must be already defined before this instruction. -func (i *instruction) asVecRRRRewrite(op vecOp, rd regalloc.VReg, rn, rm operand, arr vecArrangement) { - i.kind = vecRRRRewrite - i.u1 = uint64(op) - i.rn, i.rd, i.rm = rn, rd, rm - i.u2 = uint64(arr) -} - -func (i *instruction) IsCopy() bool { - op := i.kind - // We do not include mov32 as it is not a copy instruction in the sense that it does not preserve the upper 32 bits, - // and it is only used in the translation of IReduce, not the actual copy indeed. - return op == mov64 || op == fpuMov64 || op == fpuMov128 -} - -// String implements fmt.Stringer. 
-func (i *instruction) String() (str string) { - is64SizeBitToSize := func(v uint64) byte { - if v == 0 { - return 32 - } - return 64 - } - - switch i.kind { - case nop0: - if i.u1 != 0 { - l := label(i.u1) - str = fmt.Sprintf("%s:", l) - } else { - str = "nop0" - } - case aluRRR: - size := is64SizeBitToSize(i.u2 >> 32) - str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(), - formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), - i.rm.format(size)) - case aluRRRR: - size := is64SizeBitToSize(i.u1 >> 32) - str = fmt.Sprintf("%s %s, %s, %s, %s", aluOp(i.u1).String(), - formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size), formatVRegSized(regalloc.VReg(i.u2), size)) - case aluRRImm12: - size := is64SizeBitToSize(i.u2 >> 32) - str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(), - formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), i.rm.format(size)) - case aluRRBitmaskImm: - size := is64SizeBitToSize(i.u1 >> 32) - rd, rn := formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size) - if size == 32 { - str = fmt.Sprintf("%s %s, %s, #%#x", aluOp(i.u1).String(), rd, rn, uint32(i.u2)) - } else { - str = fmt.Sprintf("%s %s, %s, #%#x", aluOp(i.u1).String(), rd, rn, i.u2) - } - case aluRRImmShift: - size := is64SizeBitToSize(i.u2 >> 32) - str = fmt.Sprintf("%s %s, %s, %#x", - aluOp(i.u1).String(), - formatVRegSized(i.rd, size), - formatVRegSized(i.rn.nr(), size), - i.rm.shiftImm(), - ) - case aluRRRShift: - size := is64SizeBitToSize(i.u2 >> 32) - str = fmt.Sprintf("%s %s, %s, %s", - aluOp(i.u1).String(), - formatVRegSized(i.rd, size), - formatVRegSized(i.rn.nr(), size), - i.rm.format(size), - ) - case aluRRRExtend: - size := is64SizeBitToSize(i.u2 >> 32) - str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(), - formatVRegSized(i.rd, size), - formatVRegSized(i.rn.nr(), size), - // Regardless of the source size, the register is formatted in 32-bit. 
- i.rm.format(32), - ) - case bitRR: - size := is64SizeBitToSize(i.u2) - str = fmt.Sprintf("%s %s, %s", - bitOp(i.u1), - formatVRegSized(i.rd, size), - formatVRegSized(i.rn.nr(), size), - ) - case uLoad8: - str = fmt.Sprintf("ldrb %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32)) - case sLoad8: - str = fmt.Sprintf("ldrsb %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32)) - case uLoad16: - str = fmt.Sprintf("ldrh %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32)) - case sLoad16: - str = fmt.Sprintf("ldrsh %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32)) - case uLoad32: - str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32)) - case sLoad32: - str = fmt.Sprintf("ldrs %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32)) - case uLoad64: - str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd, 64), i.getAmode().format(64)) - case store8: - str = fmt.Sprintf("strb %s, %s", formatVRegSized(i.rn.nr(), 32), i.getAmode().format(8)) - case store16: - str = fmt.Sprintf("strh %s, %s", formatVRegSized(i.rn.nr(), 32), i.getAmode().format(16)) - case store32: - str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.getAmode().format(32)) - case store64: - str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.getAmode().format(64)) - case storeP64: - str = fmt.Sprintf("stp %s, %s, %s", - formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.getAmode().format(64)) - case loadP64: - str = fmt.Sprintf("ldp %s, %s, %s", - formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.getAmode().format(64)) - case mov64: - str = fmt.Sprintf("mov %s, %s", - formatVRegSized(i.rd, 64), - formatVRegSized(i.rn.nr(), 64)) - case mov32: - str = fmt.Sprintf("mov %s, %s", formatVRegSized(i.rd, 32), formatVRegSized(i.rn.nr(), 32)) - case movZ: - size := is64SizeBitToSize(i.u2 >> 32) - str = fmt.Sprintf("movz %s, #%#x, lsl %d", formatVRegSized(i.rd, size), uint16(i.u1), uint32(i.u2)*16) - case movN: - size := is64SizeBitToSize(i.u2 >> 32) - str = fmt.Sprintf("movn %s, #%#x, lsl %d", formatVRegSized(i.rd, size), uint16(i.u1), uint32(i.u2)*16) - case movK: - size := is64SizeBitToSize(i.u2 >> 32) - str = fmt.Sprintf("movk %s, #%#x, lsl %d", formatVRegSized(i.rd, size), uint16(i.u1), uint32(i.u2)*16) - case extend: - fromBits, toBits := byte(i.u1), byte(i.u2) - - var signedStr string - if i.u2>>32 == 1 { - signedStr = "s" - } else { - signedStr = "u" - } - var fromStr string - switch fromBits { - case 8: - fromStr = "b" - case 16: - fromStr = "h" - case 32: - fromStr = "w" - } - str = fmt.Sprintf("%sxt%s %s, %s", signedStr, fromStr, formatVRegSized(i.rd, toBits), formatVRegSized(i.rn.nr(), 32)) - case cSel: - size := is64SizeBitToSize(i.u2) - str = fmt.Sprintf("csel %s, %s, %s, %s", - formatVRegSized(i.rd, size), - formatVRegSized(i.rn.nr(), size), - formatVRegSized(i.rm.nr(), size), - condFlag(i.u1), - ) - case cSet: - if i.u2 != 0 { - str = fmt.Sprintf("csetm %s, %s", formatVRegSized(i.rd, 64), condFlag(i.u1)) - } else { - str = fmt.Sprintf("cset %s, %s", formatVRegSized(i.rd, 64), condFlag(i.u1)) - } - case cCmpImm: - size := is64SizeBitToSize(i.u2 >> 32) - str = fmt.Sprintf("ccmp %s, #%#x, #%#x, %s", - formatVRegSized(i.rn.nr(), size), i.rm.data, - i.u2&0b1111, - condFlag(i.u1)) - case fpuMov64: - str = fmt.Sprintf("mov %s, %s", - formatVRegVec(i.rd, vecArrangement8B, vecIndexNone), - formatVRegVec(i.rn.nr(), vecArrangement8B, vecIndexNone)) - case fpuMov128: - str = fmt.Sprintf("mov %s, %s", - 
formatVRegVec(i.rd, vecArrangement16B, vecIndexNone), - formatVRegVec(i.rn.nr(), vecArrangement16B, vecIndexNone)) - case fpuMovFromVec: - panic("TODO") - case fpuRR: - dstSz := is64SizeBitToSize(i.u2) - srcSz := dstSz - op := fpuUniOp(i.u1) - switch op { - case fpuUniOpCvt32To64: - srcSz = 32 - case fpuUniOpCvt64To32: - srcSz = 64 - } - str = fmt.Sprintf("%s %s, %s", op.String(), - formatVRegSized(i.rd, dstSz), formatVRegSized(i.rn.nr(), srcSz)) - case fpuRRR: - size := is64SizeBitToSize(i.u2) - str = fmt.Sprintf("%s %s, %s, %s", fpuBinOp(i.u1).String(), - formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size)) - case fpuRRI: - panic("TODO") - case fpuRRRR: - panic("TODO") - case fpuCmp: - size := is64SizeBitToSize(i.u1) - str = fmt.Sprintf("fcmp %s, %s", - formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size)) - case fpuLoad32: - str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32)) - case fpuStore32: - str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.getAmode().format(64)) - case fpuLoad64: - str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd, 64), i.getAmode().format(64)) - case fpuStore64: - str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.getAmode().format(64)) - case fpuLoad128: - str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd, 128), i.getAmode().format(64)) - case fpuStore128: - str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 128), i.getAmode().format(64)) - case loadFpuConst32: - str = fmt.Sprintf("ldr %s, #8; b 8; data.f32 %f", formatVRegSized(i.rd, 32), math.Float32frombits(uint32(i.u1))) - case loadFpuConst64: - str = fmt.Sprintf("ldr %s, #8; b 16; data.f64 %f", formatVRegSized(i.rd, 64), math.Float64frombits(i.u1)) - case loadFpuConst128: - str = fmt.Sprintf("ldr %s, #8; b 32; data.v128 %016x %016x", - formatVRegSized(i.rd, 128), i.u1, i.u2) - case fpuToInt: - var op, src, dst string - if signed := i.u1 == 1; signed { - op = "fcvtzs" - } else { - op = "fcvtzu" - } - if src64 := i.u2&1 != 0; src64 { - src = formatVRegWidthVec(i.rn.nr(), vecArrangementD) - } else { - src = formatVRegWidthVec(i.rn.nr(), vecArrangementS) - } - if dst64 := i.u2&2 != 0; dst64 { - dst = formatVRegSized(i.rd, 64) - } else { - dst = formatVRegSized(i.rd, 32) - } - str = fmt.Sprintf("%s %s, %s", op, dst, src) - - case intToFpu: - var op, src, dst string - if signed := i.u1 == 1; signed { - op = "scvtf" - } else { - op = "ucvtf" - } - if src64 := i.u2&1 != 0; src64 { - src = formatVRegSized(i.rn.nr(), 64) - } else { - src = formatVRegSized(i.rn.nr(), 32) - } - if dst64 := i.u2&2 != 0; dst64 { - dst = formatVRegWidthVec(i.rd, vecArrangementD) - } else { - dst = formatVRegWidthVec(i.rd, vecArrangementS) - } - str = fmt.Sprintf("%s %s, %s", op, dst, src) - case fpuCSel: - size := is64SizeBitToSize(i.u2) - str = fmt.Sprintf("fcsel %s, %s, %s, %s", - formatVRegSized(i.rd, size), - formatVRegSized(i.rn.nr(), size), - formatVRegSized(i.rm.nr(), size), - condFlag(i.u1), - ) - case movToVec: - var size byte - arr := vecArrangement(i.u1) - switch arr { - case vecArrangementB, vecArrangementH, vecArrangementS: - size = 32 - case vecArrangementD: - size = 64 - default: - panic("unsupported arrangement " + arr.String()) - } - str = fmt.Sprintf("ins %s, %s", formatVRegVec(i.rd, arr, vecIndex(i.u2)), formatVRegSized(i.rn.nr(), size)) - case movFromVec, movFromVecSigned: - var size byte - var opcode string - arr := vecArrangement(i.u1) - signed := i.kind == movFromVecSigned - switch 
arr { - case vecArrangementB, vecArrangementH, vecArrangementS: - size = 32 - if signed { - opcode = "smov" - } else { - opcode = "umov" - } - case vecArrangementD: - size = 64 - if signed { - opcode = "smov" - } else { - opcode = "mov" - } - default: - panic("unsupported arrangement " + arr.String()) - } - str = fmt.Sprintf("%s %s, %s", opcode, formatVRegSized(i.rd, size), formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2))) - case vecDup: - str = fmt.Sprintf("dup %s, %s", - formatVRegVec(i.rd, vecArrangement(i.u1), vecIndexNone), - formatVRegSized(i.rn.nr(), 64), - ) - case vecDupElement: - arr := vecArrangement(i.u1) - str = fmt.Sprintf("dup %s, %s", - formatVRegVec(i.rd, arr, vecIndexNone), - formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2)), - ) - case vecDupFromFpu: - panic("TODO") - case vecExtract: - str = fmt.Sprintf("ext %s, %s, %s, #%d", - formatVRegVec(i.rd, vecArrangement(i.u1), vecIndexNone), - formatVRegVec(i.rn.nr(), vecArrangement(i.u1), vecIndexNone), - formatVRegVec(i.rm.nr(), vecArrangement(i.u1), vecIndexNone), - uint32(i.u2), - ) - case vecExtend: - panic("TODO") - case vecMovElement: - str = fmt.Sprintf("mov %s, %s", - formatVRegVec(i.rd, vecArrangement(i.u1), vecIndex(i.u2&0xffffffff)), - formatVRegVec(i.rn.nr(), vecArrangement(i.u1), vecIndex(i.u2>>32)), - ) - case vecMiscNarrow: - panic("TODO") - case vecRRR, vecRRRRewrite: - str = fmt.Sprintf("%s %s, %s, %s", - vecOp(i.u1), - formatVRegVec(i.rd, vecArrangement(i.u2), vecIndexNone), - formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone), - formatVRegVec(i.rm.nr(), vecArrangement(i.u2), vecIndexNone), - ) - case vecMisc: - vop := vecOp(i.u1) - if vop == vecOpCmeq0 { - str = fmt.Sprintf("cmeq %s, %s, #0", - formatVRegVec(i.rd, vecArrangement(i.u2), vecIndexNone), - formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone)) - } else { - str = fmt.Sprintf("%s %s, %s", - vop, - formatVRegVec(i.rd, vecArrangement(i.u2), vecIndexNone), - formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone)) - } - case vecLanes: - arr := vecArrangement(i.u2) - var destArr vecArrangement - switch arr { - case vecArrangement8B, vecArrangement16B: - destArr = vecArrangementH - case vecArrangement4H, vecArrangement8H: - destArr = vecArrangementS - case vecArrangement4S: - destArr = vecArrangementD - default: - panic("invalid arrangement " + arr.String()) - } - str = fmt.Sprintf("%s %s, %s", - vecOp(i.u1), - formatVRegWidthVec(i.rd, destArr), - formatVRegVec(i.rn.nr(), arr, vecIndexNone)) - case vecShiftImm: - arr := vecArrangement(i.u2) - str = fmt.Sprintf("%s %s, %s, #%d", - vecOp(i.u1), - formatVRegVec(i.rd, arr, vecIndexNone), - formatVRegVec(i.rn.nr(), arr, vecIndexNone), - i.rm.shiftImm()) - case vecTbl: - arr := vecArrangement(i.u2) - str = fmt.Sprintf("tbl %s, { %s }, %s", - formatVRegVec(i.rd, arr, vecIndexNone), - formatVRegVec(i.rn.nr(), vecArrangement16B, vecIndexNone), - formatVRegVec(i.rm.nr(), arr, vecIndexNone)) - case vecTbl2: - arr := vecArrangement(i.u2) - rd, rn, rm := i.rd, i.rn.nr(), i.rm.nr() - rn1 := regalloc.FromRealReg(rn.RealReg()+1, rn.RegType()) - str = fmt.Sprintf("tbl %s, { %s, %s }, %s", - formatVRegVec(rd, arr, vecIndexNone), - formatVRegVec(rn, vecArrangement16B, vecIndexNone), - formatVRegVec(rn1, vecArrangement16B, vecIndexNone), - formatVRegVec(rm, arr, vecIndexNone)) - case vecPermute: - arr := vecArrangement(i.u2) - str = fmt.Sprintf("%s %s, %s, %s", - vecOp(i.u1), - formatVRegVec(i.rd, arr, vecIndexNone), - formatVRegVec(i.rn.nr(), arr, vecIndexNone), - formatVRegVec(i.rm.nr(), arr, 
vecIndexNone)) - case movToFPSR: - str = fmt.Sprintf("msr fpsr, %s", formatVRegSized(i.rn.nr(), 64)) - case movFromFPSR: - str = fmt.Sprintf("mrs %s fpsr", formatVRegSized(i.rd, 64)) - case call: - str = fmt.Sprintf("bl %s", ssa.FuncRef(i.u1)) - case callInd: - str = fmt.Sprintf("bl %s", formatVRegSized(i.rn.nr(), 64)) - case ret: - str = "ret" - case br: - target := label(i.u1) - if i.rm.data != 0 { - str = fmt.Sprintf("b #%#x (%s)", i.brOffset(), target.String()) - } else { - str = fmt.Sprintf("b %s", target.String()) - } - case condBr: - size := is64SizeBitToSize(i.u2 >> 32) - c := cond(i.u1) - target := label(i.u2 & 0xffffffff) - switch c.kind() { - case condKindRegisterZero: - if !i.condBrOffsetResolved() { - str = fmt.Sprintf("cbz %s, (%s)", formatVRegSized(c.register(), size), target.String()) - } else { - str = fmt.Sprintf("cbz %s, #%#x %s", formatVRegSized(c.register(), size), i.condBrOffset(), target.String()) - } - case condKindRegisterNotZero: - if offset := i.condBrOffset(); offset != 0 { - str = fmt.Sprintf("cbnz %s, #%#x (%s)", formatVRegSized(c.register(), size), offset, target.String()) - } else { - str = fmt.Sprintf("cbnz %s, %s", formatVRegSized(c.register(), size), target.String()) - } - case condKindCondFlagSet: - if offset := i.condBrOffset(); offset != 0 { - if target == labelInvalid { - str = fmt.Sprintf("b.%s #%#x", c.flag(), offset) - } else { - str = fmt.Sprintf("b.%s #%#x, (%s)", c.flag(), offset, target.String()) - } - } else { - str = fmt.Sprintf("b.%s %s", c.flag(), target.String()) - } - } - case adr: - str = fmt.Sprintf("adr %s, #%#x", formatVRegSized(i.rd, 64), int64(i.u1)) - case brTableSequence: - targetIndex := i.u1 - str = fmt.Sprintf("br_table_sequence %s, table_index=%d", formatVRegSized(i.rn.nr(), 64), targetIndex) - case exitSequence: - str = fmt.Sprintf("exit_sequence %s", formatVRegSized(i.rn.nr(), 64)) - case atomicRmw: - m := atomicRmwOp(i.u1).String() - size := byte(32) - switch i.u2 { - case 8: - size = 64 - case 2: - m = m + "h" - case 1: - m = m + "b" - } - str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), 64)) - case atomicCas: - m := "casal" - size := byte(32) - switch i.u2 { - case 8: - size = 64 - case 2: - m = m + "h" - case 1: - m = m + "b" - } - str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rd, size), formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rn.nr(), 64)) - case atomicLoad: - m := "ldar" - size := byte(32) - switch i.u2 { - case 8: - size = 64 - case 2: - m = m + "h" - case 1: - m = m + "b" - } - str = fmt.Sprintf("%s %s, %s", m, formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), 64)) - case atomicStore: - m := "stlr" - size := byte(32) - switch i.u2 { - case 8: - size = 64 - case 2: - m = m + "h" - case 1: - m = m + "b" - } - str = fmt.Sprintf("%s %s, %s", m, formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rn.nr(), 64)) - case dmb: - str = "dmb" - case udf: - str = "udf" - case emitSourceOffsetInfo: - str = fmt.Sprintf("source_offset_info %d", ssa.SourceOffset(i.u1)) - case vecLoad1R: - str = fmt.Sprintf("ld1r {%s}, [%s]", formatVRegVec(i.rd, vecArrangement(i.u1), vecIndexNone), formatVRegSized(i.rn.nr(), 64)) - case loadConstBlockArg: - str = fmt.Sprintf("load_const_block_arg %s, %#x", formatVRegSized(i.rd, 64), i.u1) - default: - panic(i.kind) - } - return -} - -func (i *instruction) asAdr(rd regalloc.VReg, offset int64) { - i.kind = adr - i.rd = rd - i.u1 = uint64(offset) -} - -func (i *instruction) asAtomicRmw(op 
atomicRmwOp, rn, rs, rt regalloc.VReg, size uint64) { - i.kind = atomicRmw - i.rd, i.rn, i.rm = rt, operandNR(rn), operandNR(rs) - i.u1 = uint64(op) - i.u2 = size -} - -func (i *instruction) asAtomicCas(rn, rs, rt regalloc.VReg, size uint64) { - i.kind = atomicCas - i.rm, i.rn, i.rd = operandNR(rt), operandNR(rn), rs - i.u2 = size -} - -func (i *instruction) asAtomicLoad(rn, rt regalloc.VReg, size uint64) { - i.kind = atomicLoad - i.rn, i.rd = operandNR(rn), rt - i.u2 = size -} - -func (i *instruction) asAtomicStore(rn, rt operand, size uint64) { - i.kind = atomicStore - i.rn, i.rm = rn, rt - i.u2 = size -} - -func (i *instruction) asDMB() { - i.kind = dmb -} - -// TODO: delete unnecessary things. -const ( - // nop0 represents a no-op of zero size. - nop0 instructionKind = iota + 1 - // aluRRR represents an ALU operation with two register sources and a register destination. - aluRRR - // aluRRRR represents an ALU operation with three register sources and a register destination. - aluRRRR - // aluRRImm12 represents an ALU operation with a register source and an immediate-12 source, with a register destination. - aluRRImm12 - // aluRRBitmaskImm represents an ALU operation with a register source and a bitmask immediate, with a register destination. - aluRRBitmaskImm - // aluRRImmShift represents an ALU operation with a register source and an immediate-shifted source, with a register destination. - aluRRImmShift - // aluRRRShift represents an ALU operation with two register sources, one of which can be shifted, with a register destination. - aluRRRShift - // aluRRRExtend represents an ALU operation with two register sources, one of which can be extended, with a register destination. - aluRRRExtend - // bitRR represents a bit op instruction with a single register source. - bitRR - // uLoad8 represents an unsigned 8-bit load. - uLoad8 - // sLoad8 represents a signed 8-bit load into 64-bit register. - sLoad8 - // uLoad16 represents an unsigned 16-bit load into 64-bit register. - uLoad16 - // sLoad16 represents a signed 16-bit load into 64-bit register. - sLoad16 - // uLoad32 represents an unsigned 32-bit load into 64-bit register. - uLoad32 - // sLoad32 represents a signed 32-bit load into 64-bit register. - sLoad32 - // uLoad64 represents a 64-bit load. - uLoad64 - // store8 represents an 8-bit store. - store8 - // store16 represents a 16-bit store. - store16 - // store32 represents a 32-bit store. - store32 - // store64 represents a 64-bit store. - store64 - // storeP64 represents a store of a pair of registers. - storeP64 - // loadP64 represents a load of a pair of registers. - loadP64 - // mov64 represents a MOV instruction. These are encoded as ORR's but we keep them separate for better handling. - mov64 - // mov32 represents a 32-bit MOV. This zeroes the top 32 bits of the destination. - mov32 - // movZ represents a MOVZ with a 16-bit immediate. - movZ - // movN represents a MOVN with a 16-bit immediate. - movN - // movK represents a MOVK with a 16-bit immediate. - movK - // extend represents a sign- or zero-extend operation. - extend - // cSel represents a conditional-select operation. - cSel - // cSet represents a conditional-set operation. - cSet - // cCmpImm represents a conditional comparison with an immediate. - cCmpImm - // fpuMov64 represents a FPU move. Distinct from a vector-register move; moving just 64 bits appears to be significantly faster. - fpuMov64 - // fpuMov128 represents a vector register move. 
- fpuMov128 - // fpuMovFromVec represents a move to scalar from a vector element. - fpuMovFromVec - // fpuRR represents a 1-op FPU instruction. - fpuRR - // fpuRRR represents a 2-op FPU instruction. - fpuRRR - // fpuRRI represents a 2-op FPU instruction with immediate value. - fpuRRI - // fpuRRRR represents a 3-op FPU instruction. - fpuRRRR - // fpuCmp represents a FPU comparison, either 32 or 64 bit. - fpuCmp - // fpuLoad32 represents a floating-point load, single-precision (32 bit). - fpuLoad32 - // fpuStore32 represents a floating-point store, single-precision (32 bit). - fpuStore32 - // fpuLoad64 represents a floating-point load, double-precision (64 bit). - fpuLoad64 - // fpuStore64 represents a floating-point store, double-precision (64 bit). - fpuStore64 - // fpuLoad128 represents a floating-point/vector load, 128 bit. - fpuLoad128 - // fpuStore128 represents a floating-point/vector store, 128 bit. - fpuStore128 - // loadFpuConst32 represents a load of a 32-bit floating-point constant. - loadFpuConst32 - // loadFpuConst64 represents a load of a 64-bit floating-point constant. - loadFpuConst64 - // loadFpuConst128 represents a load of a 128-bit floating-point constant. - loadFpuConst128 - // vecLoad1R represents a load of a one single-element structure that replicates to all lanes of a vector. - vecLoad1R - // fpuToInt represents a conversion from FP to integer. - fpuToInt - // intToFpu represents a conversion from integer to FP. - intToFpu - // fpuCSel represents a 32/64-bit FP conditional select. - fpuCSel - // movToVec represents a move to a vector element from a GPR. - movToVec - // movFromVec represents an unsigned move from a vector element to a GPR. - movFromVec - // movFromVecSigned represents a signed move from a vector element to a GPR. - movFromVecSigned - // vecDup represents a duplication of general-purpose register to vector. - vecDup - // vecDupElement represents a duplication of a vector element to vector or scalar. - vecDupElement - // vecDupFromFpu represents a duplication of scalar to vector. - vecDupFromFpu - // vecExtract represents a vector extraction operation. - vecExtract - // vecExtend represents a vector extension operation. - vecExtend - // vecMovElement represents a move vector element to another vector element operation. - vecMovElement - // vecMiscNarrow represents a vector narrowing operation. - vecMiscNarrow - // vecRRR represents a vector ALU operation. - vecRRR - // vecRRRRewrite is exactly the same as vecRRR except that this rewrites the destination register. - // For example, BSL instruction rewrites the destination register, and the existing value influences the result. - // Therefore, the "destination" register in vecRRRRewrite will be treated as "use" which makes the register outlive - // the instruction while this instruction doesn't have "def" in the context of register allocation. - vecRRRRewrite - // vecMisc represents a vector two register miscellaneous instruction. - vecMisc - // vecLanes represents a vector instruction across lanes. - vecLanes - // vecShiftImm represents a SIMD scalar shift by immediate instruction. - vecShiftImm - // vecTbl represents a table vector lookup - single register table. - vecTbl - // vecTbl2 represents a table vector lookup - two register table. - vecTbl2 - // vecPermute represents a vector permute instruction. - vecPermute - // movToNZCV represents a move to the FPSR. - movToFPSR - // movFromNZCV represents a move from the FPSR. - movFromFPSR - // call represents a machine call instruction. 
- call - // callInd represents a machine indirect-call instruction. - callInd - // ret represents a machine return instruction. - ret - // br represents an unconditional branch. - br - // condBr represents a conditional branch. - condBr - // adr represents a compute the address (using a PC-relative offset) of a memory location. - adr - // brTableSequence represents a jump-table sequence. - brTableSequence - // exitSequence consists of multiple instructions, and exits the execution immediately. - // See encodeExitSequence. - exitSequence - // atomicRmw represents an atomic read-modify-write operation with two register sources and a register destination. - atomicRmw - // atomicCas represents an atomic compare-and-swap operation with three register sources. The value is loaded to - // the source register containing the comparison value. - atomicCas - // atomicLoad represents an atomic load with one source register and a register destination. - atomicLoad - // atomicStore represents an atomic store with two source registers and no destination. - atomicStore - // dmb represents the data memory barrier instruction in inner-shareable (ish) mode. - dmb - // UDF is the undefined instruction. For debugging only. - udf - // loadConstBlockArg represents a load of a constant block argument. - loadConstBlockArg - - // emitSourceOffsetInfo is a dummy instruction to emit source offset info. - // The existence of this instruction does not affect the execution. - emitSourceOffsetInfo - - // ------------------- do not define below this line ------------------- - numInstructionKinds -) - -func (i *instruction) asLoadConstBlockArg(v uint64, typ ssa.Type, dst regalloc.VReg) *instruction { - i.kind = loadConstBlockArg - i.u1 = v - i.u2 = uint64(typ) - i.rd = dst - return i -} - -func (i *instruction) loadConstBlockArgData() (v uint64, typ ssa.Type, dst regalloc.VReg) { - return i.u1, ssa.Type(i.u2), i.rd -} - -func (i *instruction) asEmitSourceOffsetInfo(l ssa.SourceOffset) *instruction { - i.kind = emitSourceOffsetInfo - i.u1 = uint64(l) - return i -} - -func (i *instruction) sourceOffsetInfo() ssa.SourceOffset { - return ssa.SourceOffset(i.u1) -} - -func (i *instruction) asUDF() *instruction { - i.kind = udf - return i -} - -func (i *instruction) asFpuToInt(rd regalloc.VReg, rn operand, rdSigned, src64bit, dst64bit bool) { - i.kind = fpuToInt - i.rn = rn - i.rd = rd - if rdSigned { - i.u1 = 1 - } - if src64bit { - i.u2 = 1 - } - if dst64bit { - i.u2 |= 2 - } -} - -func (i *instruction) asIntToFpu(rd regalloc.VReg, rn operand, rnSigned, src64bit, dst64bit bool) { - i.kind = intToFpu - i.rn = rn - i.rd = rd - if rnSigned { - i.u1 = 1 - } - if src64bit { - i.u2 = 1 - } - if dst64bit { - i.u2 |= 2 - } -} - -func (i *instruction) asExitSequence(ctx regalloc.VReg) *instruction { - i.kind = exitSequence - i.rn = operandNR(ctx) - return i -} - -// aluOp determines the type of ALU operation. Instructions whose kind is one of -// aluRRR, aluRRRR, aluRRImm12, aluRRBitmaskImm, aluRRImmShift, aluRRRShift and aluRRRExtend -// would use this type. 
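asFpuToInt and asIntToFpu above pack three booleans into the generic fields: u1 records signedness, and bits 0 and 1 of u2 record whether the source and destination are 64-bit. A standalone sketch that decodes those flags the same way the fpuToInt branch of String does earlier in this file; the register names are placeholders.

package main

import "fmt"

func main() {
	// asFpuToInt(rd, rn, rdSigned=true, src64bit=false, dst64bit=true)
	// stores u1=1 and u2=0b10.
	var u1, u2 uint64 = 1, 0b10

	op := "fcvtzu"
	if u1 == 1 {
		op = "fcvtzs" // signed conversion
	}
	src := "s1" // 32-bit float source
	if u2&1 != 0 {
		src = "d1"
	}
	dst := "w0" // 32-bit integer destination
	if u2&2 != 0 {
		dst = "x0"
	}
	fmt.Printf("%s %s, %s\n", op, dst, src) // fcvtzs x0, s1
}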
-type aluOp uint32 - -func (a aluOp) String() string { - switch a { - case aluOpAdd: - return "add" - case aluOpSub: - return "sub" - case aluOpOrr: - return "orr" - case aluOpOrn: - return "orn" - case aluOpAnd: - return "and" - case aluOpAnds: - return "ands" - case aluOpBic: - return "bic" - case aluOpEor: - return "eor" - case aluOpAddS: - return "adds" - case aluOpSubS: - return "subs" - case aluOpSMulH: - return "sMulH" - case aluOpUMulH: - return "uMulH" - case aluOpSDiv: - return "sdiv" - case aluOpUDiv: - return "udiv" - case aluOpRotR: - return "ror" - case aluOpLsr: - return "lsr" - case aluOpAsr: - return "asr" - case aluOpLsl: - return "lsl" - case aluOpMAdd: - return "madd" - case aluOpMSub: - return "msub" - } - panic(int(a)) -} - -const ( - // 32/64-bit Add. - aluOpAdd aluOp = iota - // 32/64-bit Subtract. - aluOpSub - // 32/64-bit Bitwise OR. - aluOpOrr - // 32/64-bit Bitwise OR NOT. - aluOpOrn - // 32/64-bit Bitwise AND. - aluOpAnd - // 32/64-bit Bitwise ANDS. - aluOpAnds - // 32/64-bit Bitwise AND NOT. - aluOpBic - // 32/64-bit Bitwise XOR (Exclusive OR). - aluOpEor - // 32/64-bit Add setting flags. - aluOpAddS - // 32/64-bit Subtract setting flags. - aluOpSubS - // Signed multiply, high-word result. - aluOpSMulH - // Unsigned multiply, high-word result. - aluOpUMulH - // 64-bit Signed divide. - aluOpSDiv - // 64-bit Unsigned divide. - aluOpUDiv - // 32/64-bit Rotate right. - aluOpRotR - // 32/64-bit Logical shift right. - aluOpLsr - // 32/64-bit Arithmetic shift right. - aluOpAsr - // 32/64-bit Logical shift left. - aluOpLsl /// Multiply-add - - // MAdd and MSub are only applicable for aluRRRR. - aluOpMAdd - aluOpMSub -) - -// vecOp determines the type of vector operation. Instructions whose kind is one of -// vecOpCnt would use this type. -type vecOp int - -// String implements fmt.Stringer. 
-func (b vecOp) String() string { - switch b { - case vecOpCnt: - return "cnt" - case vecOpCmeq: - return "cmeq" - case vecOpCmgt: - return "cmgt" - case vecOpCmhi: - return "cmhi" - case vecOpCmge: - return "cmge" - case vecOpCmhs: - return "cmhs" - case vecOpFcmeq: - return "fcmeq" - case vecOpFcmgt: - return "fcmgt" - case vecOpFcmge: - return "fcmge" - case vecOpCmeq0: - return "cmeq0" - case vecOpUaddlv: - return "uaddlv" - case vecOpBit: - return "bit" - case vecOpBic: - return "bic" - case vecOpBsl: - return "bsl" - case vecOpNot: - return "not" - case vecOpAnd: - return "and" - case vecOpOrr: - return "orr" - case vecOpEOR: - return "eor" - case vecOpFadd: - return "fadd" - case vecOpAdd: - return "add" - case vecOpAddp: - return "addp" - case vecOpAddv: - return "addv" - case vecOpSub: - return "sub" - case vecOpFsub: - return "fsub" - case vecOpSmin: - return "smin" - case vecOpUmin: - return "umin" - case vecOpUminv: - return "uminv" - case vecOpSmax: - return "smax" - case vecOpUmax: - return "umax" - case vecOpUmaxp: - return "umaxp" - case vecOpUrhadd: - return "urhadd" - case vecOpFmul: - return "fmul" - case vecOpSqrdmulh: - return "sqrdmulh" - case vecOpMul: - return "mul" - case vecOpUmlal: - return "umlal" - case vecOpFdiv: - return "fdiv" - case vecOpFsqrt: - return "fsqrt" - case vecOpAbs: - return "abs" - case vecOpFabs: - return "fabs" - case vecOpNeg: - return "neg" - case vecOpFneg: - return "fneg" - case vecOpFrintp: - return "frintp" - case vecOpFrintm: - return "frintm" - case vecOpFrintn: - return "frintn" - case vecOpFrintz: - return "frintz" - case vecOpFcvtl: - return "fcvtl" - case vecOpFcvtn: - return "fcvtn" - case vecOpFcvtzu: - return "fcvtzu" - case vecOpFcvtzs: - return "fcvtzs" - case vecOpScvtf: - return "scvtf" - case vecOpUcvtf: - return "ucvtf" - case vecOpSqxtn: - return "sqxtn" - case vecOpUqxtn: - return "uqxtn" - case vecOpSqxtun: - return "sqxtun" - case vecOpRev64: - return "rev64" - case vecOpXtn: - return "xtn" - case vecOpShll: - return "shll" - case vecOpSshl: - return "sshl" - case vecOpSshll: - return "sshll" - case vecOpUshl: - return "ushl" - case vecOpUshll: - return "ushll" - case vecOpSshr: - return "sshr" - case vecOpZip1: - return "zip1" - case vecOpFmin: - return "fmin" - case vecOpFmax: - return "fmax" - case vecOpSmull: - return "smull" - case vecOpSmull2: - return "smull2" - } - panic(int(b)) -} - -const ( - vecOpCnt vecOp = iota - vecOpCmeq0 - vecOpCmeq - vecOpCmgt - vecOpCmhi - vecOpCmge - vecOpCmhs - vecOpFcmeq - vecOpFcmgt - vecOpFcmge - vecOpUaddlv - vecOpBit - vecOpBic - vecOpBsl - vecOpNot - vecOpAnd - vecOpOrr - vecOpEOR - vecOpAdd - vecOpFadd - vecOpAddv - vecOpSqadd - vecOpUqadd - vecOpAddp - vecOpSub - vecOpFsub - vecOpSqsub - vecOpUqsub - vecOpSmin - vecOpUmin - vecOpUminv - vecOpFmin - vecOpSmax - vecOpUmax - vecOpUmaxp - vecOpFmax - vecOpUrhadd - vecOpMul - vecOpFmul - vecOpSqrdmulh - vecOpUmlal - vecOpFdiv - vecOpFsqrt - vecOpAbs - vecOpFabs - vecOpNeg - vecOpFneg - vecOpFrintm - vecOpFrintn - vecOpFrintp - vecOpFrintz - vecOpFcvtl - vecOpFcvtn - vecOpFcvtzs - vecOpFcvtzu - vecOpScvtf - vecOpUcvtf - vecOpSqxtn - vecOpSqxtun - vecOpUqxtn - vecOpRev64 - vecOpXtn - vecOpShll - vecOpSshl - vecOpSshll - vecOpUshl - vecOpUshll - vecOpSshr - vecOpZip1 - vecOpSmull - vecOpSmull2 -) - -// bitOp determines the type of bitwise operation. Instructions whose kind is one of -// bitOpRbit and bitOpClz would use this type. -type bitOp int - -// String implements fmt.Stringer. 
-func (b bitOp) String() string { - switch b { - case bitOpRbit: - return "rbit" - case bitOpClz: - return "clz" - } - panic(int(b)) -} - -const ( - // 32/64-bit Rbit. - bitOpRbit bitOp = iota - // 32/64-bit Clz. - bitOpClz -) - -// fpuUniOp represents a unary floating-point unit (FPU) operation. -type fpuUniOp byte - -const ( - fpuUniOpNeg fpuUniOp = iota - fpuUniOpCvt32To64 - fpuUniOpCvt64To32 - fpuUniOpSqrt - fpuUniOpRoundPlus - fpuUniOpRoundMinus - fpuUniOpRoundZero - fpuUniOpRoundNearest - fpuUniOpAbs -) - -// String implements fmt.Stringer. -func (f fpuUniOp) String() string { - switch f { - case fpuUniOpNeg: - return "fneg" - case fpuUniOpCvt32To64: - return "fcvt" - case fpuUniOpCvt64To32: - return "fcvt" - case fpuUniOpSqrt: - return "fsqrt" - case fpuUniOpRoundPlus: - return "frintp" - case fpuUniOpRoundMinus: - return "frintm" - case fpuUniOpRoundZero: - return "frintz" - case fpuUniOpRoundNearest: - return "frintn" - case fpuUniOpAbs: - return "fabs" - } - panic(int(f)) -} - -// fpuBinOp represents a binary floating-point unit (FPU) operation. -type fpuBinOp byte - -const ( - fpuBinOpAdd = iota - fpuBinOpSub - fpuBinOpMul - fpuBinOpDiv - fpuBinOpMax - fpuBinOpMin -) - -// String implements fmt.Stringer. -func (f fpuBinOp) String() string { - switch f { - case fpuBinOpAdd: - return "fadd" - case fpuBinOpSub: - return "fsub" - case fpuBinOpMul: - return "fmul" - case fpuBinOpDiv: - return "fdiv" - case fpuBinOpMax: - return "fmax" - case fpuBinOpMin: - return "fmin" - } - panic(int(f)) -} - -// extMode represents the mode of a register operand extension. -// For example, aluRRRExtend instructions need this info to determine the extensions. -type extMode byte - -const ( - extModeNone extMode = iota - // extModeZeroExtend32 suggests a zero-extension to 32 bits if the original bit size is less than 32. - extModeZeroExtend32 - // extModeSignExtend32 stands for a sign-extension to 32 bits if the original bit size is less than 32. - extModeSignExtend32 - // extModeZeroExtend64 suggests a zero-extension to 64 bits if the original bit size is less than 64. - extModeZeroExtend64 - // extModeSignExtend64 stands for a sign-extension to 64 bits if the original bit size is less than 64. - extModeSignExtend64 -) - -func (e extMode) bits() byte { - switch e { - case extModeZeroExtend32, extModeSignExtend32: - return 32 - case extModeZeroExtend64, extModeSignExtend64: - return 64 - default: - return 0 - } -} - -func (e extMode) signed() bool { - switch e { - case extModeSignExtend32, extModeSignExtend64: - return true - default: - return false - } -} - -func extModeOf(t ssa.Type, signed bool) extMode { - switch t.Bits() { - case 32: - if signed { - return extModeSignExtend32 - } - return extModeZeroExtend32 - case 64: - if signed { - return extModeSignExtend64 - } - return extModeZeroExtend64 - default: - panic("TODO? do we need narrower than 32 bits?") - } -} - -type extendOp byte - -const ( - extendOpUXTB extendOp = 0b000 - extendOpUXTH extendOp = 0b001 - extendOpUXTW extendOp = 0b010 - // extendOpUXTX does nothing, but is a convenient symbol that officially exists. See: - // https://stackoverflow.com/questions/72041372/what-do-the-uxtx-and-sxtx-extensions-mean-for-32-bit-aarch64-adds-instruct - extendOpUXTX extendOp = 0b011 - extendOpSXTB extendOp = 0b100 - extendOpSXTH extendOp = 0b101 - extendOpSXTW extendOp = 0b110 - // extendOpSXTX does nothing, but is a convenient symbol that officially exists. 
See: - // https://stackoverflow.com/questions/72041372/what-do-the-uxtx-and-sxtx-extensions-mean-for-32-bit-aarch64-adds-instruct - extendOpSXTX extendOp = 0b111 - extendOpNone extendOp = 0xff -) - -func (e extendOp) srcBits() byte { - switch e { - case extendOpUXTB, extendOpSXTB: - return 8 - case extendOpUXTH, extendOpSXTH: - return 16 - case extendOpUXTW, extendOpSXTW: - return 32 - case extendOpUXTX, extendOpSXTX: - return 64 - } - panic(int(e)) -} - -func (e extendOp) String() string { - switch e { - case extendOpUXTB: - return "UXTB" - case extendOpUXTH: - return "UXTH" - case extendOpUXTW: - return "UXTW" - case extendOpUXTX: - return "UXTX" - case extendOpSXTB: - return "SXTB" - case extendOpSXTH: - return "SXTH" - case extendOpSXTW: - return "SXTW" - case extendOpSXTX: - return "SXTX" - } - panic(int(e)) -} - -func extendOpFrom(signed bool, from byte) extendOp { - switch from { - case 8: - if signed { - return extendOpSXTB - } - return extendOpUXTB - case 16: - if signed { - return extendOpSXTH - } - return extendOpUXTH - case 32: - if signed { - return extendOpSXTW - } - return extendOpUXTW - case 64: - if signed { - return extendOpSXTX - } - return extendOpUXTX - } - panic("invalid extendOpFrom") -} - -type shiftOp byte - -const ( - shiftOpLSL shiftOp = 0b00 - shiftOpLSR shiftOp = 0b01 - shiftOpASR shiftOp = 0b10 - shiftOpROR shiftOp = 0b11 -) - -func (s shiftOp) String() string { - switch s { - case shiftOpLSL: - return "lsl" - case shiftOpLSR: - return "lsr" - case shiftOpASR: - return "asr" - case shiftOpROR: - return "ror" - } - panic(int(s)) -} - -const exitSequenceSize = 6 * 4 // 6 instructions as in encodeExitSequence. - -// size returns the size of the instruction in encoded bytes. -func (i *instruction) size() int64 { - switch i.kind { - case exitSequence: - return exitSequenceSize // 6 instructions as in encodeExitSequence. - case nop0, loadConstBlockArg: - return 0 - case emitSourceOffsetInfo: - return 0 - case loadFpuConst32: - if i.u1 == 0 { - return 4 // zero loading can be encoded as a single instruction. - } - return 4 + 4 + 4 - case loadFpuConst64: - if i.u1 == 0 { - return 4 // zero loading can be encoded as a single instruction. - } - return 4 + 4 + 8 - case loadFpuConst128: - if i.u1 == 0 && i.u2 == 0 { - return 4 // zero loading can be encoded as a single instruction. - } - return 4 + 4 + 16 - case brTableSequence: - return 4*4 + int64(i.u2)*4 - default: - return 4 - } -} - -// vecArrangement is the arrangement of data within a vector register. -type vecArrangement byte - -const ( - // vecArrangementNone is an arrangement indicating no data is stored. - vecArrangementNone vecArrangement = iota - // vecArrangement8B is an arrangement of 8 bytes (64-bit vector) - vecArrangement8B - // vecArrangement16B is an arrangement of 16 bytes (128-bit vector) - vecArrangement16B - // vecArrangement4H is an arrangement of 4 half precisions (64-bit vector) - vecArrangement4H - // vecArrangement8H is an arrangement of 8 half precisions (128-bit vector) - vecArrangement8H - // vecArrangement2S is an arrangement of 2 single precisions (64-bit vector) - vecArrangement2S - // vecArrangement4S is an arrangement of 4 single precisions (128-bit vector) - vecArrangement4S - // vecArrangement1D is an arrangement of 1 double precision (64-bit vector) - vecArrangement1D - // vecArrangement2D is an arrangement of 2 double precisions (128-bit vector) - vecArrangement2D - - // Assign each vector size specifier to a vector arrangement ID. 
- // Instructions can only have an arrangement or a size specifier, but not both, so it - // simplifies the internal representation of vector instructions by being able to - // store either into the same field. - - // vecArrangementB is a size specifier of byte - vecArrangementB - // vecArrangementH is a size specifier of word (16-bit) - vecArrangementH - // vecArrangementS is a size specifier of double word (32-bit) - vecArrangementS - // vecArrangementD is a size specifier of quad word (64-bit) - vecArrangementD - // vecArrangementQ is a size specifier of the entire vector (128-bit) - vecArrangementQ -) - -// String implements fmt.Stringer -func (v vecArrangement) String() (ret string) { - switch v { - case vecArrangement8B: - ret = "8B" - case vecArrangement16B: - ret = "16B" - case vecArrangement4H: - ret = "4H" - case vecArrangement8H: - ret = "8H" - case vecArrangement2S: - ret = "2S" - case vecArrangement4S: - ret = "4S" - case vecArrangement1D: - ret = "1D" - case vecArrangement2D: - ret = "2D" - case vecArrangementB: - ret = "B" - case vecArrangementH: - ret = "H" - case vecArrangementS: - ret = "S" - case vecArrangementD: - ret = "D" - case vecArrangementQ: - ret = "Q" - case vecArrangementNone: - ret = "none" - default: - panic(v) - } - return -} - -// vecIndex is the index of an element of a vector register -type vecIndex byte - -// vecIndexNone indicates no vector index specified. -const vecIndexNone = ^vecIndex(0) - -func ssaLaneToArrangement(lane ssa.VecLane) vecArrangement { - switch lane { - case ssa.VecLaneI8x16: - return vecArrangement16B - case ssa.VecLaneI16x8: - return vecArrangement8H - case ssa.VecLaneI32x4: - return vecArrangement4S - case ssa.VecLaneI64x2: - return vecArrangement2D - case ssa.VecLaneF32x4: - return vecArrangement4S - case ssa.VecLaneF64x2: - return vecArrangement2D - default: - panic(lane) - } -} - -// atomicRmwOp is the type of atomic read-modify-write operation. -type atomicRmwOp byte - -const ( - // atomicRmwOpAdd is an atomic add operation. - atomicRmwOpAdd atomicRmwOp = iota - // atomicRmwOpClr is an atomic clear operation, i.e. AND NOT. - atomicRmwOpClr - // atomicRmwOpSet is an atomic set operation, i.e. OR. - atomicRmwOpSet - // atomicRmwOpEor is an atomic exclusive OR operation. - atomicRmwOpEor - // atomicRmwOpSwp is an atomic swap operation. - atomicRmwOpSwp -) - -// String implements fmt.Stringer -func (a atomicRmwOp) String() string { - switch a { - case atomicRmwOpAdd: - return "ldaddal" - case atomicRmwOpClr: - return "ldclral" - case atomicRmwOpSet: - return "ldsetal" - case atomicRmwOpEor: - return "ldeoral" - case atomicRmwOpSwp: - return "swpal" - } - panic(fmt.Sprintf("unknown atomicRmwOp: %d", a)) -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go deleted file mode 100644 index 21be9b71e..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go +++ /dev/null @@ -1,2351 +0,0 @@ -package arm64 - -import ( - "context" - "fmt" - - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" -) - -// Encode implements backend.Machine Encode. 
-func (m *machine) Encode(ctx context.Context) error { - m.resolveRelativeAddresses(ctx) - m.encode(m.rootInstr) - if l := len(m.compiler.Buf()); l > maxFunctionExecutableSize { - return fmt.Errorf("function size exceeds the limit: %d > %d", l, maxFunctionExecutableSize) - } - return nil -} - -func (m *machine) encode(root *instruction) { - for cur := root; cur != nil; cur = cur.next { - cur.encode(m) - } -} - -func (i *instruction) encode(m *machine) { - c := m.compiler - switch kind := i.kind; kind { - case nop0, emitSourceOffsetInfo, loadConstBlockArg: - case exitSequence: - encodeExitSequence(c, i.rn.reg()) - case ret: - // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/RET--Return-from-subroutine-?lang=en - c.Emit4Bytes(encodeRet()) - case br: - imm := i.brOffset() - c.Emit4Bytes(encodeUnconditionalBranch(false, imm)) - case call: - // We still don't know the exact address of the function to call, so we emit a placeholder. - c.AddRelocationInfo(i.callFuncRef()) - c.Emit4Bytes(encodeUnconditionalBranch(true, 0)) // 0 = placeholder - case callInd: - c.Emit4Bytes(encodeUnconditionalBranchReg(regNumberInEncoding[i.rn.realReg()], true)) - case store8, store16, store32, store64, fpuStore32, fpuStore64, fpuStore128: - c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rn.realReg()], *i.getAmode())) - case uLoad8, uLoad16, uLoad32, uLoad64, sLoad8, sLoad16, sLoad32, fpuLoad32, fpuLoad64, fpuLoad128: - c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rd.RealReg()], *i.getAmode())) - case vecLoad1R: - c.Emit4Bytes(encodeVecLoad1R( - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - vecArrangement(i.u1))) - case condBr: - imm19 := i.condBrOffset() - if imm19%4 != 0 { - panic("imm26 for branch must be a multiple of 4") - } - - imm19U32 := uint32(imm19/4) & 0b111_11111111_11111111 - brCond := i.condBrCond() - switch brCond.kind() { - case condKindRegisterZero: - rt := regNumberInEncoding[brCond.register().RealReg()] - c.Emit4Bytes(encodeCBZCBNZ(rt, false, imm19U32, i.condBr64bit())) - case condKindRegisterNotZero: - rt := regNumberInEncoding[brCond.register().RealReg()] - c.Emit4Bytes(encodeCBZCBNZ(rt, true, imm19U32, i.condBr64bit())) - case condKindCondFlagSet: - // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/B-cond--Branch-conditionally- - fl := brCond.flag() - c.Emit4Bytes(0b01010100<<24 | (imm19U32 << 5) | uint32(fl)) - default: - panic("BUG") - } - case movN: - c.Emit4Bytes(encodeMoveWideImmediate(0b00, regNumberInEncoding[i.rd.RealReg()], i.u1, uint32(i.u2), uint32(i.u2>>32))) - case movZ: - c.Emit4Bytes(encodeMoveWideImmediate(0b10, regNumberInEncoding[i.rd.RealReg()], i.u1, uint32(i.u2), uint32(i.u2>>32))) - case movK: - c.Emit4Bytes(encodeMoveWideImmediate(0b11, regNumberInEncoding[i.rd.RealReg()], i.u1, uint32(i.u2), uint32(i.u2>>32))) - case mov32: - to, from := i.rd.RealReg(), i.rn.realReg() - c.Emit4Bytes(encodeAsMov32(regNumberInEncoding[from], regNumberInEncoding[to])) - case mov64: - to, from := i.rd.RealReg(), i.rn.realReg() - toIsSp := to == sp - fromIsSp := from == sp - c.Emit4Bytes(encodeMov64(regNumberInEncoding[to], regNumberInEncoding[from], toIsSp, fromIsSp)) - case loadP64, storeP64: - rt, rt2 := regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()] - amode := i.getAmode() - rn := regNumberInEncoding[amode.rn.RealReg()] - var pre bool - switch amode.kind { - case addressModeKindPostIndex: - case addressModeKindPreIndex: - pre = true - default: - 
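// Pair loads/stores (LDP/STP with writeback) are emitted only with pre- or post-index addressing here, so any other address mode reaching this switch indicates a lowering bug.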
panic("BUG") - } - c.Emit4Bytes(encodePreOrPostIndexLoadStorePair64(pre, kind == loadP64, rn, rt, rt2, amode.imm)) - case loadFpuConst32: - rd := regNumberInEncoding[i.rd.RealReg()] - if i.u1 == 0 { - c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement8B)) - } else { - encodeLoadFpuConst32(c, rd, i.u1) - } - case loadFpuConst64: - rd := regNumberInEncoding[i.rd.RealReg()] - if i.u1 == 0 { - c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement8B)) - } else { - encodeLoadFpuConst64(c, regNumberInEncoding[i.rd.RealReg()], i.u1) - } - case loadFpuConst128: - rd := regNumberInEncoding[i.rd.RealReg()] - lo, hi := i.u1, i.u2 - if lo == 0 && hi == 0 { - c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement16B)) - } else { - encodeLoadFpuConst128(c, rd, lo, hi) - } - case aluRRRR: - c.Emit4Bytes(encodeAluRRRR( - aluOp(i.u1), - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - regNumberInEncoding[i.rm.realReg()], - regNumberInEncoding[regalloc.VReg(i.u2).RealReg()], - uint32(i.u1>>32), - )) - case aluRRImmShift: - c.Emit4Bytes(encodeAluRRImm( - aluOp(i.u1), - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - uint32(i.rm.shiftImm()), - uint32(i.u2>>32), - )) - case aluRRR: - rn := i.rn.realReg() - c.Emit4Bytes(encodeAluRRR( - aluOp(i.u1), - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[rn], - regNumberInEncoding[i.rm.realReg()], - i.u2>>32 == 1, - rn == sp, - )) - case aluRRRExtend: - rm, exo, to := i.rm.er() - c.Emit4Bytes(encodeAluRRRExtend( - aluOp(i.u1), - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - regNumberInEncoding[rm.RealReg()], - exo, - to, - )) - case aluRRRShift: - r, amt, sop := i.rm.sr() - c.Emit4Bytes(encodeAluRRRShift( - aluOp(i.u1), - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - regNumberInEncoding[r.RealReg()], - uint32(amt), - sop, - i.u2>>32 == 1, - )) - case aluRRBitmaskImm: - c.Emit4Bytes(encodeAluBitmaskImmediate( - aluOp(i.u1), - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - i.u2, - i.u1>>32 == 1, - )) - case bitRR: - c.Emit4Bytes(encodeBitRR( - bitOp(i.u1), - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - uint32(i.u2)), - ) - case aluRRImm12: - imm12, shift := i.rm.imm12() - c.Emit4Bytes(encodeAluRRImm12( - aluOp(i.u1), - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - imm12, shift, - i.u2>>32 == 1, - )) - case fpuRRR: - c.Emit4Bytes(encodeFpuRRR( - fpuBinOp(i.u1), - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - regNumberInEncoding[i.rm.realReg()], - i.u2 == 1, - )) - case fpuMov64, fpuMov128: - // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/MOV--vector---Move-vector--an-alias-of-ORR--vector--register-- - rd := regNumberInEncoding[i.rd.RealReg()] - rn := regNumberInEncoding[i.rn.realReg()] - var q uint32 - if kind == fpuMov128 { - q = 0b1 - } - c.Emit4Bytes(q<<30 | 0b1110101<<21 | rn<<16 | 0b000111<<10 | rn<<5 | rd) - case cSet: - rd := regNumberInEncoding[i.rd.RealReg()] - cf := condFlag(i.u1) - if i.u2 == 1 { - // https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/CSETM--Conditional-Set-Mask--an-alias-of-CSINV- - // Note that we set 64bit version here. 
- c.Emit4Bytes(0b1101101010011111<<16 | uint32(cf.invert())<<12 | 0b011111<<5 | rd) - } else { - // https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/CSET--Conditional-Set--an-alias-of-CSINC- - // Note that we set 64bit version here. - c.Emit4Bytes(0b1001101010011111<<16 | uint32(cf.invert())<<12 | 0b111111<<5 | rd) - } - case extend: - c.Emit4Bytes(encodeExtend((i.u2>>32) == 1, byte(i.u1), byte(i.u2), regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()])) - case fpuCmp: - // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/FCMP--Floating-point-quiet-Compare--scalar--?lang=en - rn, rm := regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()] - var ftype uint32 - if i.u1 == 1 { - ftype = 0b01 // double precision. - } - c.Emit4Bytes(0b1111<<25 | ftype<<22 | 1<<21 | rm<<16 | 0b1<<13 | rn<<5) - case udf: - // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/UDF--Permanently-Undefined-?lang=en - if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable { - c.Emit4Bytes(dummyInstruction) - } else { - c.Emit4Bytes(0) - } - case adr: - c.Emit4Bytes(encodeAdr(regNumberInEncoding[i.rd.RealReg()], uint32(i.u1))) - case cSel: - c.Emit4Bytes(encodeConditionalSelect( - kind, - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - regNumberInEncoding[i.rm.realReg()], - condFlag(i.u1), - i.u2 == 1, - )) - case fpuCSel: - c.Emit4Bytes(encodeFpuCSel( - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - regNumberInEncoding[i.rm.realReg()], - condFlag(i.u1), - i.u2 == 1, - )) - case movToVec: - c.Emit4Bytes(encodeMoveToVec( - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - vecArrangement(byte(i.u1)), - vecIndex(i.u2), - )) - case movFromVec, movFromVecSigned: - c.Emit4Bytes(encodeMoveFromVec( - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - vecArrangement(byte(i.u1)), - vecIndex(i.u2), - i.kind == movFromVecSigned, - )) - case vecDup: - c.Emit4Bytes(encodeVecDup( - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - vecArrangement(byte(i.u1)))) - case vecDupElement: - c.Emit4Bytes(encodeVecDupElement( - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - vecArrangement(byte(i.u1)), - vecIndex(i.u2))) - case vecExtract: - c.Emit4Bytes(encodeVecExtract( - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - regNumberInEncoding[i.rm.realReg()], - vecArrangement(byte(i.u1)), - uint32(i.u2))) - case vecPermute: - c.Emit4Bytes(encodeVecPermute( - vecOp(i.u1), - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - regNumberInEncoding[i.rm.realReg()], - vecArrangement(byte(i.u2)))) - case vecMovElement: - c.Emit4Bytes(encodeVecMovElement( - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - vecArrangement(i.u1), - uint32(i.u2), uint32(i.u2>>32), - )) - case vecMisc: - c.Emit4Bytes(encodeAdvancedSIMDTwoMisc( - vecOp(i.u1), - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - vecArrangement(i.u2), - )) - case vecLanes: - c.Emit4Bytes(encodeVecLanes( - vecOp(i.u1), - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - vecArrangement(i.u2), - )) - case vecShiftImm: - c.Emit4Bytes(encodeVecShiftImm( - vecOp(i.u1), - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - uint32(i.rm.shiftImm()), - 
vecArrangement(i.u2), - )) - case vecTbl: - c.Emit4Bytes(encodeVecTbl( - 1, - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - regNumberInEncoding[i.rm.realReg()], - vecArrangement(i.u2)), - ) - case vecTbl2: - c.Emit4Bytes(encodeVecTbl( - 2, - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - regNumberInEncoding[i.rm.realReg()], - vecArrangement(i.u2)), - ) - case brTableSequence: - targets := m.jmpTableTargets[i.u1] - encodeBrTableSequence(c, i.rn.reg(), targets) - case fpuToInt, intToFpu: - c.Emit4Bytes(encodeCnvBetweenFloatInt(i)) - case fpuRR: - c.Emit4Bytes(encodeFloatDataOneSource( - fpuUniOp(i.u1), - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - i.u2 == 1, - )) - case vecRRR: - if op := vecOp(i.u1); op == vecOpBsl || op == vecOpBit || op == vecOpUmlal { - panic(fmt.Sprintf("vecOp %s must use vecRRRRewrite instead of vecRRR", op.String())) - } - fallthrough - case vecRRRRewrite: - c.Emit4Bytes(encodeVecRRR( - vecOp(i.u1), - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - regNumberInEncoding[i.rm.realReg()], - vecArrangement(i.u2), - )) - case cCmpImm: - // Conditional compare (immediate) in https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en - sf := uint32((i.u2 >> 32) & 0b1) - nzcv := uint32(i.u2 & 0b1111) - cond := uint32(condFlag(i.u1)) - imm := uint32(i.rm.data & 0b11111) - rn := regNumberInEncoding[i.rn.realReg()] - c.Emit4Bytes( - sf<<31 | 0b111101001<<22 | imm<<16 | cond<<12 | 0b1<<11 | rn<<5 | nzcv, - ) - case movFromFPSR: - rt := regNumberInEncoding[i.rd.RealReg()] - c.Emit4Bytes(encodeSystemRegisterMove(rt, true)) - case movToFPSR: - rt := regNumberInEncoding[i.rn.realReg()] - c.Emit4Bytes(encodeSystemRegisterMove(rt, false)) - case atomicRmw: - c.Emit4Bytes(encodeAtomicRmw( - atomicRmwOp(i.u1), - regNumberInEncoding[i.rm.realReg()], - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rn.realReg()], - uint32(i.u2), - )) - case atomicCas: - c.Emit4Bytes(encodeAtomicCas( - regNumberInEncoding[i.rd.RealReg()], - regNumberInEncoding[i.rm.realReg()], - regNumberInEncoding[i.rn.realReg()], - uint32(i.u2), - )) - case atomicLoad: - c.Emit4Bytes(encodeAtomicLoadStore( - regNumberInEncoding[i.rn.realReg()], - regNumberInEncoding[i.rd.RealReg()], - uint32(i.u2), - 1, - )) - case atomicStore: - c.Emit4Bytes(encodeAtomicLoadStore( - regNumberInEncoding[i.rn.realReg()], - regNumberInEncoding[i.rm.realReg()], - uint32(i.u2), - 0, - )) - case dmb: - c.Emit4Bytes(encodeDMB()) - default: - panic(i.String()) - } -} - -func encodeMov64(rd, rn uint32, toIsSp, fromIsSp bool) uint32 { - if toIsSp || fromIsSp { - // This is an alias of ADD (immediate): - // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/MOV--to-from-SP---Move-between-register-and-stack-pointer--an-alias-of-ADD--immediate-- - return encodeAddSubtractImmediate(0b100, 0, 0, rn, rd) - } else { - // This is an alias of ORR (shifted register): - // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/MOV--register---Move--register---an-alias-of-ORR--shifted-register-- - return encodeLogicalShiftedRegister(0b101, 0, rn, 0, regNumberInEncoding[xzr], rd) - } -} - -// encodeSystemRegisterMove encodes as "System register move" in -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en -// -// Note that currently we only 
supports read/write of FPSR. -func encodeSystemRegisterMove(rt uint32, fromSystem bool) uint32 { - ret := 0b11010101<<24 | 0b11011<<16 | 0b01000100<<8 | 0b001<<5 | rt - if fromSystem { - ret |= 0b1 << 21 - } - return ret -} - -// encodeVecRRR encodes as either "Advanced SIMD three *" in -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en -func encodeVecRRR(op vecOp, rd, rn, rm uint32, arr vecArrangement) uint32 { - switch op { - case vecOpBit: - _, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, 0b10 /* always has size 0b10 */, 0b1, q) - case vecOpBic: - if arr > vecArrangement16B { - panic("unsupported arrangement: " + arr.String()) - } - _, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, 0b01 /* always has size 0b01 */, 0b0, q) - case vecOpBsl: - if arr > vecArrangement16B { - panic("unsupported arrangement: " + arr.String()) - } - _, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, 0b01 /* always has size 0b01 */, 0b1, q) - case vecOpAnd: - if arr > vecArrangement16B { - panic("unsupported arrangement: " + arr.String()) - } - _, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, 0b00 /* always has size 0b00 */, 0b0, q) - case vecOpOrr: - _, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, 0b10 /* always has size 0b10 */, 0b0, q) - case vecOpEOR: - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, size, 0b1, q) - case vecOpCmeq: - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10001, size, 0b1, q) - case vecOpCmgt: - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00110, size, 0b0, q) - case vecOpCmhi: - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00110, size, 0b1, q) - case vecOpCmge: - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00111, size, 0b0, q) - case vecOpCmhs: - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00111, size, 0b1, q) - case vecOpFcmeq: - var size, q uint32 - switch arr { - case vecArrangement4S: - size, q = 0b00, 0b1 - case vecArrangement2S: - size, q = 0b00, 0b0 - case vecArrangement2D: - size, q = 0b01, 0b1 - default: - panic("unsupported arrangement: " + arr.String()) - } - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11100, size, 0b0, q) - case vecOpFcmgt: - if arr < vecArrangement2S || arr == vecArrangement1D { - panic("unsupported arrangement: " + arr.String()) - } - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11100, size, 0b1, q) - case vecOpFcmge: - var size, q uint32 - switch arr { - case vecArrangement4S: - size, q = 0b00, 0b1 - case vecArrangement2S: - size, q = 0b00, 0b0 - case vecArrangement2D: - size, q = 0b01, 0b1 - default: - panic("unsupported arrangement: " + arr.String()) - } - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11100, size, 0b1, q) - case vecOpAdd: - if arr == vecArrangement1D { - panic("unsupported arrangement: " + arr.String()) - } - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10000, size, 0b0, q) - case vecOpSqadd: - if arr == vecArrangement1D { - panic("unsupported arrangement: " + arr.String()) - } - size, q := arrToSizeQEncoded(arr) - 
return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00001, size, 0b0, q) - case vecOpUqadd: - if arr == vecArrangement1D { - panic("unsupported arrangement: " + arr.String()) - } - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00001, size, 0b1, q) - case vecOpAddp: - if arr == vecArrangement1D { - panic("unsupported arrangement: " + arr.String()) - } - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10111, size, 0b0, q) - case vecOpSqsub: - if arr == vecArrangement1D { - panic("unsupported arrangement: " + arr.String()) - } - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00101, size, 0b0, q) - case vecOpUqsub: - if arr == vecArrangement1D { - panic("unsupported arrangement: " + arr.String()) - } - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00101, size, 0b1, q) - case vecOpSub: - if arr == vecArrangement1D { - panic("unsupported arrangement: " + arr.String()) - } - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10000, size, 0b1, q) - case vecOpFmin: - if arr < vecArrangement2S || arr == vecArrangement1D { - panic("unsupported arrangement: " + arr.String()) - } - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11110, size, 0b0, q) - case vecOpSmin: - if arr > vecArrangement4S { - panic("unsupported arrangement: " + arr.String()) - } - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01101, size, 0b0, q) - case vecOpUmin: - if arr > vecArrangement4S { - panic("unsupported arrangement: " + arr.String()) - } - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01101, size, 0b1, q) - case vecOpFmax: - var size, q uint32 - switch arr { - case vecArrangement4S: - size, q = 0b00, 0b1 - case vecArrangement2S: - size, q = 0b00, 0b0 - case vecArrangement2D: - size, q = 0b01, 0b1 - default: - panic("unsupported arrangement: " + arr.String()) - } - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11110, size, 0b0, q) - case vecOpFadd: - var size, q uint32 - switch arr { - case vecArrangement4S: - size, q = 0b00, 0b1 - case vecArrangement2S: - size, q = 0b00, 0b0 - case vecArrangement2D: - size, q = 0b01, 0b1 - default: - panic("unsupported arrangement: " + arr.String()) - } - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11010, size, 0b0, q) - case vecOpFsub: - if arr < vecArrangement2S || arr == vecArrangement1D { - panic("unsupported arrangement: " + arr.String()) - } - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11010, size, 0b0, q) - case vecOpFmul: - var size, q uint32 - switch arr { - case vecArrangement4S: - size, q = 0b00, 0b1 - case vecArrangement2S: - size, q = 0b00, 0b0 - case vecArrangement2D: - size, q = 0b01, 0b1 - default: - panic("unsupported arrangement: " + arr.String()) - } - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11011, size, 0b1, q) - case vecOpSqrdmulh: - if arr < vecArrangement4H || arr > vecArrangement4S { - panic("unsupported arrangement: " + arr.String()) - } - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10110, size, 0b1, q) - case vecOpFdiv: - var size, q uint32 - switch arr { - case vecArrangement4S: - size, q = 0b00, 0b1 - case vecArrangement2S: - size, q = 0b00, 0b0 - case vecArrangement2D: - size, q = 0b01, 0b1 - default: - panic("unsupported arrangement: " + arr.String()) - } - return 
encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11111, size, 0b1, q) - case vecOpSmax: - if arr > vecArrangement4S { - panic("unsupported arrangement: " + arr.String()) - } - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01100, size, 0b0, q) - case vecOpUmax: - if arr > vecArrangement4S { - panic("unsupported arrangement: " + arr.String()) - } - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01100, size, 0b1, q) - case vecOpUmaxp: - if arr > vecArrangement4S { - panic("unsupported arrangement: " + arr.String()) - } - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10100, size, 0b1, q) - case vecOpUrhadd: - if arr > vecArrangement4S { - panic("unsupported arrangement: " + arr.String()) - } - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00010, size, 0b1, q) - case vecOpMul: - if arr > vecArrangement4S { - panic("unsupported arrangement: " + arr.String()) - } - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10011, size, 0b0, q) - case vecOpUmlal: - if arr > vecArrangement4S { - panic("unsupported arrangement: " + arr.String()) - } - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeDifferent(rd, rn, rm, 0b1000, size, 0b1, q) - case vecOpSshl: - if arr == vecArrangement1D { - panic("unsupported arrangement: " + arr.String()) - } - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01000, size, 0b0, q) - case vecOpUshl: - if arr == vecArrangement1D { - panic("unsupported arrangement: " + arr.String()) - } - size, q := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01000, size, 0b1, q) - - case vecOpSmull: - if arr > vecArrangement4S { - panic("unsupported arrangement: " + arr.String()) - } - size, _ := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeDifferent(rd, rn, rm, 0b1100, size, 0b0, 0b0) - - case vecOpSmull2: - if arr > vecArrangement4S { - panic("unsupported arrangement: " + arr.String()) - } - size, _ := arrToSizeQEncoded(arr) - return encodeAdvancedSIMDThreeDifferent(rd, rn, rm, 0b1100, size, 0b0, 0b1) - - default: - panic("TODO: " + op.String()) - } -} - -func arrToSizeQEncoded(arr vecArrangement) (size, q uint32) { - switch arr { - case vecArrangement16B: - q = 0b1 - fallthrough - case vecArrangement8B: - size = 0b00 - case vecArrangement8H: - q = 0b1 - fallthrough - case vecArrangement4H: - size = 0b01 - case vecArrangement4S: - q = 0b1 - fallthrough - case vecArrangement2S: - size = 0b10 - case vecArrangement2D: - q = 0b1 - fallthrough - case vecArrangement1D: - size = 0b11 - default: - panic("BUG") - } - return -} - -// encodeAdvancedSIMDThreeSame encodes as "Advanced SIMD three same" in -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en -func encodeAdvancedSIMDThreeSame(rd, rn, rm, opcode, size, U, Q uint32) uint32 { - return Q<<30 | U<<29 | 0b111<<25 | size<<22 | 0b1<<21 | rm<<16 | opcode<<11 | 0b1<<10 | rn<<5 | rd -} - -// encodeAdvancedSIMDThreeDifferent encodes as "Advanced SIMD three different" in -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en -func encodeAdvancedSIMDThreeDifferent(rd, rn, rm, opcode, size, U, Q uint32) uint32 { - return Q<<30 | U<<29 | 0b111<<25 | size<<22 | 0b1<<21 | rm<<16 | opcode<<12 | rn<<5 | rd -} 
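// As a quick sanity check of the "three same" layout above, this standalone
// sketch (the helper name and register numbers are illustrative, not part of the
// original file) recomputes the word for `add v0.4s, v1.4s, v2.4s`, i.e.
// Q=1, U=0, size=0b10, opcode=0b10000, and compares it against the
// architecturally documented encoding 0x4EA28420.
func exampleAdvancedSIMDThreeSameAdd() bool {
	const rd, rn, rm = 0, 1, 2                     // v0, v1, v2
	const q, u, size, opcode = 1, 0, 0b10, 0b10000 // .4S arrangement, ADD (vector)
	word := uint32(q)<<30 | uint32(u)<<29 | 0b111<<25 | uint32(size)<<22 | 0b1<<21 |
		uint32(rm)<<16 | uint32(opcode)<<11 | 0b1<<10 | uint32(rn)<<5 | uint32(rd)
	return word == 0x4EA28420 // disassembles as ADD V0.4S, V1.4S, V2.4S
}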
- -// encodeFloatDataOneSource encodes as "Floating-point data-processing (1 source)" in -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp -func encodeFloatDataOneSource(op fpuUniOp, rd, rn uint32, dst64bit bool) uint32 { - var opcode, ptype uint32 - switch op { - case fpuUniOpCvt32To64: - opcode = 0b000101 - case fpuUniOpCvt64To32: - opcode = 0b000100 - ptype = 0b01 - case fpuUniOpNeg: - opcode = 0b000010 - if dst64bit { - ptype = 0b01 - } - case fpuUniOpSqrt: - opcode = 0b000011 - if dst64bit { - ptype = 0b01 - } - case fpuUniOpRoundPlus: - opcode = 0b001001 - if dst64bit { - ptype = 0b01 - } - case fpuUniOpRoundMinus: - opcode = 0b001010 - if dst64bit { - ptype = 0b01 - } - case fpuUniOpRoundZero: - opcode = 0b001011 - if dst64bit { - ptype = 0b01 - } - case fpuUniOpRoundNearest: - opcode = 0b001000 - if dst64bit { - ptype = 0b01 - } - case fpuUniOpAbs: - opcode = 0b000001 - if dst64bit { - ptype = 0b01 - } - default: - panic("BUG") - } - return 0b1111<<25 | ptype<<22 | 0b1<<21 | opcode<<15 | 0b1<<14 | rn<<5 | rd -} - -// encodeCnvBetweenFloatInt encodes as "Conversion between floating-point and integer" in -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en -func encodeCnvBetweenFloatInt(i *instruction) uint32 { - rd := regNumberInEncoding[i.rd.RealReg()] - rn := regNumberInEncoding[i.rn.realReg()] - - var opcode uint32 - var rmode uint32 - var ptype uint32 - var sf uint32 - switch i.kind { - case intToFpu: // Either UCVTF or SCVTF. - rmode = 0b00 - - signed := i.u1 == 1 - src64bit := i.u2&1 != 0 - dst64bit := i.u2&2 != 0 - if signed { - opcode = 0b010 - } else { - opcode = 0b011 - } - if src64bit { - sf = 0b1 - } - if dst64bit { - ptype = 0b01 - } else { - ptype = 0b00 - } - case fpuToInt: // Either FCVTZU or FCVTZS. - rmode = 0b11 - - signed := i.u1 == 1 - src64bit := i.u2&1 != 0 - dst64bit := i.u2&2 != 0 - - if signed { - opcode = 0b000 - } else { - opcode = 0b001 - } - if dst64bit { - sf = 0b1 - } - if src64bit { - ptype = 0b01 - } else { - ptype = 0b00 - } - } - return sf<<31 | 0b1111<<25 | ptype<<22 | 0b1<<21 | rmode<<19 | opcode<<16 | rn<<5 | rd -} - -// encodeAdr encodes a PC-relative ADR instruction. -// https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/ADR--Form-PC-relative-address- -func encodeAdr(rd uint32, offset uint32) uint32 { - if offset >= 1<<20 { - panic("BUG: too large adr instruction") - } - return offset&0b11<<29 | 0b1<<28 | offset&0b1111111111_1111111100<<3 | rd -} - -// encodeFpuCSel encodes as "Floating-point conditional select" in -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en -func encodeFpuCSel(rd, rn, rm uint32, c condFlag, _64bit bool) uint32 { - var ftype uint32 - if _64bit { - ftype = 0b01 // double precision. 
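// Single precision keeps ftype at its zero value (0b00), so only the double-precision case is set explicitly.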
- } - return 0b1111<<25 | ftype<<22 | 0b1<<21 | rm<<16 | uint32(c)<<12 | 0b11<<10 | rn<<5 | rd -} - -// encodeMoveToVec encodes as "Move general-purpose register to a vector element" (represented as `ins`) in -// https://developer.arm.com/documentation/dui0801/g/A64-SIMD-Vector-Instructions/MOV--vector--from-general- -// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/MOV--from-general---Move-general-purpose-register-to-a-vector-element--an-alias-of-INS--general--?lang=en -func encodeMoveToVec(rd, rn uint32, arr vecArrangement, index vecIndex) uint32 { - var imm5 uint32 - switch arr { - case vecArrangementB: - imm5 |= 0b1 - imm5 |= uint32(index) << 1 - if index > 0b1111 { - panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 15", index)) - } - case vecArrangementH: - imm5 |= 0b10 - imm5 |= uint32(index) << 2 - if index > 0b111 { - panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 7", index)) - } - case vecArrangementS: - imm5 |= 0b100 - imm5 |= uint32(index) << 3 - if index > 0b11 { - panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 3", index)) - } - case vecArrangementD: - imm5 |= 0b1000 - imm5 |= uint32(index) << 4 - if index > 0b1 { - panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 1", index)) - } - default: - panic("Unsupported arrangement " + arr.String()) - } - - return 0b01001110000<<21 | imm5<<16 | 0b000111<<10 | rn<<5 | rd -} - -// encodeMoveToVec encodes as "Move vector element to another vector element, mov (element)" (represented as `ins`) in -// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/MOV--element---Move-vector-element-to-another-vector-element--an-alias-of-INS--element--?lang=en -// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/INS--element---Insert-vector-element-from-another-vector-element-?lang=en -func encodeVecMovElement(rd, rn uint32, arr vecArrangement, srcIndex, dstIndex uint32) uint32 { - var imm4, imm5 uint32 - switch arr { - case vecArrangementB: - imm5 |= 0b1 - imm5 |= srcIndex << 1 - imm4 = dstIndex - if srcIndex > 0b1111 { - panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 15", srcIndex)) - } - case vecArrangementH: - imm5 |= 0b10 - imm5 |= srcIndex << 2 - imm4 = dstIndex << 1 - if srcIndex > 0b111 { - panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 7", srcIndex)) - } - case vecArrangementS: - imm5 |= 0b100 - imm5 |= srcIndex << 3 - imm4 = dstIndex << 2 - if srcIndex > 0b11 { - panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 3", srcIndex)) - } - case vecArrangementD: - imm5 |= 0b1000 - imm5 |= srcIndex << 4 - imm4 = dstIndex << 3 - if srcIndex > 0b1 { - panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 1", srcIndex)) - } - default: - panic("Unsupported arrangement " + arr.String()) - } - - return 0b01101110000<<21 | imm5<<16 | imm4<<11 | 0b1<<10 | rn<<5 | rd -} - -// encodeUnconditionalBranchReg encodes as "Unconditional branch (register)" in: -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en -func encodeUnconditionalBranchReg(rn uint32, link bool) uint32 { - var opc uint32 - if link { - opc = 0b0001 - } - return 0b1101011<<25 | opc<<21 | 0b11111<<16 | rn<<5 -} - -// encodeMoveFromVec encodes as "Move vector element to a general-purpose register" -// (represented as `umov` when dest is 32-bit, `umov` 
otherwise) in -// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/UMOV--Unsigned-Move-vector-element-to-general-purpose-register-?lang=en -// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/MOV--to-general---Move-vector-element-to-general-purpose-register--an-alias-of-UMOV-?lang=en -func encodeMoveFromVec(rd, rn uint32, arr vecArrangement, index vecIndex, signed bool) uint32 { - var op, imm4, q, imm5 uint32 - switch { - case arr == vecArrangementB: - imm5 |= 0b1 - imm5 |= uint32(index) << 1 - if index > 0b1111 { - panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 15", index)) - } - case arr == vecArrangementH: - imm5 |= 0b10 - imm5 |= uint32(index) << 2 - if index > 0b111 { - panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 7", index)) - } - case arr == vecArrangementS && signed: - q = 0b1 - fallthrough - case arr == vecArrangementS: - imm5 |= 0b100 - imm5 |= uint32(index) << 3 - if index > 0b11 { - panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 3", index)) - } - case arr == vecArrangementD && !signed: - imm5 |= 0b1000 - imm5 |= uint32(index) << 4 - q = 0b1 - if index > 0b1 { - panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 1", index)) - } - default: - panic("Unsupported arrangement " + arr.String()) - } - if signed { - op, imm4 = 0, 0b0101 - } else { - op, imm4 = 0, 0b0111 - } - return op<<29 | 0b01110000<<21 | q<<30 | imm5<<16 | imm4<<11 | 1<<10 | rn<<5 | rd -} - -// encodeVecDup encodes as "Duplicate general-purpose register to vector" DUP (general) -// (represented as `dup`) -// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/DUP--general---Duplicate-general-purpose-register-to-vector-?lang=en -func encodeVecDup(rd, rn uint32, arr vecArrangement) uint32 { - var q, imm5 uint32 - switch arr { - case vecArrangement8B: - q, imm5 = 0b0, 0b1 - case vecArrangement16B: - q, imm5 = 0b1, 0b1 - case vecArrangement4H: - q, imm5 = 0b0, 0b10 - case vecArrangement8H: - q, imm5 = 0b1, 0b10 - case vecArrangement2S: - q, imm5 = 0b0, 0b100 - case vecArrangement4S: - q, imm5 = 0b1, 0b100 - case vecArrangement2D: - q, imm5 = 0b1, 0b1000 - default: - panic("Unsupported arrangement " + arr.String()) - } - return q<<30 | 0b001110000<<21 | imm5<<16 | 0b000011<<10 | rn<<5 | rd -} - -// encodeVecDup encodes as "Duplicate vector element to vector or scalar" DUP (element). -// (represented as `dup`) -// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/DUP--element---Duplicate-vector-element-to-vector-or-scalar- -func encodeVecDupElement(rd, rn uint32, arr vecArrangement, srcIndex vecIndex) uint32 { - var q, imm5 uint32 - q = 0b1 - switch arr { - case vecArrangementB: - imm5 |= 0b1 - imm5 |= uint32(srcIndex) << 1 - case vecArrangementH: - imm5 |= 0b10 - imm5 |= uint32(srcIndex) << 2 - case vecArrangementS: - imm5 |= 0b100 - imm5 |= uint32(srcIndex) << 3 - case vecArrangementD: - imm5 |= 0b1000 - imm5 |= uint32(srcIndex) << 4 - default: - panic("unsupported arrangement" + arr.String()) - } - - return q<<30 | 0b001110000<<21 | imm5<<16 | 0b1<<10 | rn<<5 | rd -} - -// encodeVecExtract encodes as "Advanced SIMD extract." -// Currently only `ext` is defined. 
-// https://developer.arm.com/documentation/ddi0602/2023-06/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp -// https://developer.arm.com/documentation/ddi0602/2023-06/SIMD-FP-Instructions/EXT--Extract-vector-from-pair-of-vectors-?lang=en -func encodeVecExtract(rd, rn, rm uint32, arr vecArrangement, index uint32) uint32 { - var q, imm4 uint32 - switch arr { - case vecArrangement8B: - q, imm4 = 0, 0b0111&uint32(index) - case vecArrangement16B: - q, imm4 = 1, 0b1111&uint32(index) - default: - panic("Unsupported arrangement " + arr.String()) - } - return q<<30 | 0b101110000<<21 | rm<<16 | imm4<<11 | rn<<5 | rd -} - -// encodeVecPermute encodes as "Advanced SIMD permute." -// https://developer.arm.com/documentation/ddi0602/2023-06/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp -func encodeVecPermute(op vecOp, rd, rn, rm uint32, arr vecArrangement) uint32 { - var q, size, opcode uint32 - switch op { - case vecOpZip1: - opcode = 0b011 - if arr == vecArrangement1D { - panic("unsupported arrangement: " + arr.String()) - } - size, q = arrToSizeQEncoded(arr) - default: - panic("TODO: " + op.String()) - } - return q<<30 | 0b001110<<24 | size<<22 | rm<<16 | opcode<<12 | 0b10<<10 | rn<<5 | rd -} - -// encodeConditionalSelect encodes as "Conditional select" in -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en#condsel -func encodeConditionalSelect(kind instructionKind, rd, rn, rm uint32, c condFlag, _64bit bool) uint32 { - if kind != cSel { - panic("TODO: support other conditional select") - } - - ret := 0b110101<<23 | rm<<16 | uint32(c)<<12 | rn<<5 | rd - if _64bit { - ret |= 0b1 << 31 - } - return ret -} - -const dummyInstruction uint32 = 0x14000000 // "b 0" - -// encodeLoadFpuConst32 encodes the following three instructions: -// -// ldr s8, #8 ;; literal load of data.f32 -// b 8 ;; skip the data -// data.f32 xxxxxxx -func encodeLoadFpuConst32(c backend.Compiler, rd uint32, rawF32 uint64) { - c.Emit4Bytes( - // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/LDR--literal--SIMD-FP---Load-SIMD-FP-Register--PC-relative-literal--?lang=en - 0b111<<26 | (0x8/4)<<5 | rd, - ) - c.Emit4Bytes(encodeUnconditionalBranch(false, 8)) // b 8 - if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable { - // Inlined data.f32 cannot be disassembled, so we add a dummy instruction here. - c.Emit4Bytes(dummyInstruction) - } else { - c.Emit4Bytes(uint32(rawF32)) // data.f32 xxxxxxx - } -} - -// encodeLoadFpuConst64 encodes the following three instructions: -// -// ldr d8, #8 ;; literal load of data.f64 -// b 12 ;; skip the data -// data.f64 xxxxxxx -func encodeLoadFpuConst64(c backend.Compiler, rd uint32, rawF64 uint64) { - c.Emit4Bytes( - // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/LDR--literal--SIMD-FP---Load-SIMD-FP-Register--PC-relative-literal--?lang=en - 0b1<<30 | 0b111<<26 | (0x8/4)<<5 | rd, - ) - c.Emit4Bytes(encodeUnconditionalBranch(false, 12)) // b 12 - if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable { - // Inlined data.f64 cannot be disassembled, so we add dummy instructions here. 
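// The 8-byte constant occupies two 4-byte words, so two placeholders keep the code size identical to the regular data.f64 path.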
- c.Emit4Bytes(dummyInstruction) - c.Emit4Bytes(dummyInstruction) - } else { - // data.f64 xxxxxxx - c.Emit4Bytes(uint32(rawF64)) - c.Emit4Bytes(uint32(rawF64 >> 32)) - } -} - -// encodeLoadFpuConst128 encodes the following three instructions: -// -// ldr v8, #8 ;; literal load of data.f64 -// b 20 ;; skip the data -// data.v128 xxxxxxx -func encodeLoadFpuConst128(c backend.Compiler, rd uint32, lo, hi uint64) { - c.Emit4Bytes( - // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/LDR--literal--SIMD-FP---Load-SIMD-FP-Register--PC-relative-literal--?lang=en - 0b1<<31 | 0b111<<26 | (0x8/4)<<5 | rd, - ) - c.Emit4Bytes(encodeUnconditionalBranch(false, 20)) // b 20 - if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable { - // Inlined data.v128 cannot be disassembled, so we add dummy instructions here. - c.Emit4Bytes(dummyInstruction) - c.Emit4Bytes(dummyInstruction) - c.Emit4Bytes(dummyInstruction) - c.Emit4Bytes(dummyInstruction) - } else { - // data.v128 xxxxxxx - c.Emit4Bytes(uint32(lo)) - c.Emit4Bytes(uint32(lo >> 32)) - c.Emit4Bytes(uint32(hi)) - c.Emit4Bytes(uint32(hi >> 32)) - } -} - -// encodeAluRRRR encodes as Data-processing (3 source) in -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en -func encodeAluRRRR(op aluOp, rd, rn, rm, ra, _64bit uint32) uint32 { - var oO, op31 uint32 - switch op { - case aluOpMAdd: - op31, oO = 0b000, 0b0 - case aluOpMSub: - op31, oO = 0b000, 0b1 - default: - panic("TODO/BUG") - } - return _64bit<<31 | 0b11011<<24 | op31<<21 | rm<<16 | oO<<15 | ra<<10 | rn<<5 | rd -} - -// encodeBitRR encodes as Data-processing (1 source) in -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en -func encodeBitRR(op bitOp, rd, rn, _64bit uint32) uint32 { - var opcode2, opcode uint32 - switch op { - case bitOpRbit: - opcode2, opcode = 0b00000, 0b000000 - case bitOpClz: - opcode2, opcode = 0b00000, 0b000100 - default: - panic("TODO/BUG") - } - return _64bit<<31 | 0b1_0_11010110<<21 | opcode2<<15 | opcode<<10 | rn<<5 | rd -} - -func encodeAsMov32(rn, rd uint32) uint32 { - // This is an alias of ORR (shifted register): - // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/MOV--register---Move--register---an-alias-of-ORR--shifted-register-- - return encodeLogicalShiftedRegister(0b001, 0, rn, 0, regNumberInEncoding[xzr], rd) -} - -// encodeExtend encodes extension instructions. 
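// Note that an unsigned 32-to-64-bit extension needs no UBFM: it is emitted as a plain 32-bit register move, because writing a W register already zeroes the upper 32 bits.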
-func encodeExtend(signed bool, from, to byte, rd, rn uint32) uint32 { - // UTXB: https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/UXTB--Unsigned-Extend-Byte--an-alias-of-UBFM-?lang=en - // UTXH: https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/UXTH--Unsigned-Extend-Halfword--an-alias-of-UBFM-?lang=en - // STXB: https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/SXTB--Signed-Extend-Byte--an-alias-of-SBFM- - // STXH: https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/SXTH--Sign-Extend-Halfword--an-alias-of-SBFM- - // STXW: https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/SXTW--Sign-Extend-Word--an-alias-of-SBFM- - var _31to10 uint32 - switch { - case !signed && from == 8 && to == 32: - // 32-bit UXTB - _31to10 = 0b0101001100000000000111 - case !signed && from == 16 && to == 32: - // 32-bit UXTH - _31to10 = 0b0101001100000000001111 - case !signed && from == 8 && to == 64: - // 64-bit UXTB - _31to10 = 0b0101001100000000000111 - case !signed && from == 16 && to == 64: - // 64-bit UXTH - _31to10 = 0b0101001100000000001111 - case !signed && from == 32 && to == 64: - return encodeAsMov32(rn, rd) - case signed && from == 8 && to == 32: - // 32-bit SXTB - _31to10 = 0b0001001100000000000111 - case signed && from == 16 && to == 32: - // 32-bit SXTH - _31to10 = 0b0001001100000000001111 - case signed && from == 8 && to == 64: - // 64-bit SXTB - _31to10 = 0b1001001101000000000111 - case signed && from == 16 && to == 64: - // 64-bit SXTH - _31to10 = 0b1001001101000000001111 - case signed && from == 32 && to == 64: - // SXTW - _31to10 = 0b1001001101000000011111 - default: - panic("BUG") - } - return _31to10<<10 | rn<<5 | rd -} - -func encodeLoadOrStore(kind instructionKind, rt uint32, amode addressMode) uint32 { - var _22to31 uint32 - var bits int64 - switch kind { - case uLoad8: - _22to31 = 0b0011100001 - bits = 8 - case sLoad8: - _22to31 = 0b0011100010 - bits = 8 - case uLoad16: - _22to31 = 0b0111100001 - bits = 16 - case sLoad16: - _22to31 = 0b0111100010 - bits = 16 - case uLoad32: - _22to31 = 0b1011100001 - bits = 32 - case sLoad32: - _22to31 = 0b1011100010 - bits = 32 - case uLoad64: - _22to31 = 0b1111100001 - bits = 64 - case fpuLoad32: - _22to31 = 0b1011110001 - bits = 32 - case fpuLoad64: - _22to31 = 0b1111110001 - bits = 64 - case fpuLoad128: - _22to31 = 0b0011110011 - bits = 128 - case store8: - _22to31 = 0b0011100000 - bits = 8 - case store16: - _22to31 = 0b0111100000 - bits = 16 - case store32: - _22to31 = 0b1011100000 - bits = 32 - case store64: - _22to31 = 0b1111100000 - bits = 64 - case fpuStore32: - _22to31 = 0b1011110000 - bits = 32 - case fpuStore64: - _22to31 = 0b1111110000 - bits = 64 - case fpuStore128: - _22to31 = 0b0011110010 - bits = 128 - default: - panic("BUG") - } - - switch amode.kind { - case addressModeKindRegScaledExtended: - return encodeLoadOrStoreExtended(_22to31, - regNumberInEncoding[amode.rn.RealReg()], - regNumberInEncoding[amode.rm.RealReg()], - rt, true, amode.extOp) - case addressModeKindRegScaled: - return encodeLoadOrStoreExtended(_22to31, - regNumberInEncoding[amode.rn.RealReg()], regNumberInEncoding[amode.rm.RealReg()], - rt, true, extendOpNone) - case addressModeKindRegExtended: - return encodeLoadOrStoreExtended(_22to31, - regNumberInEncoding[amode.rn.RealReg()], regNumberInEncoding[amode.rm.RealReg()], - rt, false, amode.extOp) - case addressModeKindRegReg: - return encodeLoadOrStoreExtended(_22to31, - 
regNumberInEncoding[amode.rn.RealReg()], regNumberInEncoding[amode.rm.RealReg()], - rt, false, extendOpNone) - case addressModeKindRegSignedImm9: - // e.g. https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDUR--Load-Register--unscaled-- - return encodeLoadOrStoreSIMM9(_22to31, 0b00 /* unscaled */, regNumberInEncoding[amode.rn.RealReg()], rt, amode.imm) - case addressModeKindPostIndex: - return encodeLoadOrStoreSIMM9(_22to31, 0b01 /* post index */, regNumberInEncoding[amode.rn.RealReg()], rt, amode.imm) - case addressModeKindPreIndex: - return encodeLoadOrStoreSIMM9(_22to31, 0b11 /* pre index */, regNumberInEncoding[amode.rn.RealReg()], rt, amode.imm) - case addressModeKindRegUnsignedImm12: - // "unsigned immediate" in https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Loads-and-Stores?lang=en - rn := regNumberInEncoding[amode.rn.RealReg()] - imm := amode.imm - div := bits / 8 - if imm != 0 && !offsetFitsInAddressModeKindRegUnsignedImm12(byte(bits), imm) { - panic("BUG") - } - imm /= div - return _22to31<<22 | 0b1<<24 | uint32(imm&0b111111111111)<<10 | rn<<5 | rt - default: - panic("BUG") - } -} - -// encodeVecLoad1R encodes as Load one single-element structure and Replicate to all lanes (of one register) in -// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/LD1R--Load-one-single-element-structure-and-Replicate-to-all-lanes--of-one-register--?lang=en#sa_imm -func encodeVecLoad1R(rt, rn uint32, arr vecArrangement) uint32 { - size, q := arrToSizeQEncoded(arr) - return q<<30 | 0b001101010000001100<<12 | size<<10 | rn<<5 | rt -} - -// encodeAluBitmaskImmediate encodes as Logical (immediate) in -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en -func encodeAluBitmaskImmediate(op aluOp, rd, rn uint32, imm uint64, _64bit bool) uint32 { - var _31to23 uint32 - switch op { - case aluOpAnd: - _31to23 = 0b00_100100 - case aluOpOrr: - _31to23 = 0b01_100100 - case aluOpEor: - _31to23 = 0b10_100100 - case aluOpAnds: - _31to23 = 0b11_100100 - default: - panic("BUG") - } - if _64bit { - _31to23 |= 0b1 << 8 - } - immr, imms, N := bitmaskImmediate(imm, _64bit) - return _31to23<<23 | uint32(N)<<22 | uint32(immr)<<16 | uint32(imms)<<10 | rn<<5 | rd -} - -func bitmaskImmediate(c uint64, is64bit bool) (immr, imms, N byte) { - var size uint32 - switch { - case c != c>>32|c<<32: - size = 64 - case c != c>>16|c<<48: - size = 32 - c = uint64(int32(c)) - case c != c>>8|c<<56: - size = 16 - c = uint64(int16(c)) - case c != c>>4|c<<60: - size = 8 - c = uint64(int8(c)) - case c != c>>2|c<<62: - size = 4 - c = uint64(int64(c<<60) >> 60) - default: - size = 2 - c = uint64(int64(c<<62) >> 62) - } - - neg := false - if int64(c) < 0 { - c = ^c - neg = true - } - - onesSize, nonZeroPos := getOnesSequenceSize(c) - if neg { - nonZeroPos = onesSize + nonZeroPos - onesSize = size - onesSize - } - - var mode byte = 32 - if is64bit && size == 64 { - N, mode = 0b1, 64 - } - - immr = byte((size - nonZeroPos) & (size - 1) & uint32(mode-1)) - imms = byte((onesSize - 1) | 63&^(size<<1-1)) - return -} - -func getOnesSequenceSize(x uint64) (size, nonZeroPos uint32) { - // Take 0b00111000 for example: - y := getLowestBit(x) // = 0b0000100 - nonZeroPos = setBitPos(y) // = 2 - size = setBitPos(x+y) - nonZeroPos // = setBitPos(0b0100000) - 2 = 5 - 2 = 3 - return -} - -func setBitPos(x uint64) (ret uint32) { - for ; ; ret++ { - if x == 0b1 { - break - } - x = x >> 1 - } - return -} - -// 
encodeLoadOrStoreExtended encodes store/load instruction as "extended register offset" in Load/store register (register offset): -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Loads-and-Stores?lang=en -func encodeLoadOrStoreExtended(_22to32 uint32, rn, rm, rt uint32, scaled bool, extOp extendOp) uint32 { - var option uint32 - switch extOp { - case extendOpUXTW: - option = 0b010 - case extendOpSXTW: - option = 0b110 - case extendOpNone: - option = 0b111 - default: - panic("BUG") - } - var s uint32 - if scaled { - s = 0b1 - } - return _22to32<<22 | 0b1<<21 | rm<<16 | option<<13 | s<<12 | 0b10<<10 | rn<<5 | rt -} - -// encodeLoadOrStoreSIMM9 encodes store/load instruction as one of post-index, pre-index or unscaled immediate as in -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Loads-and-Stores?lang=en -func encodeLoadOrStoreSIMM9(_22to32, _1011 uint32, rn, rt uint32, imm9 int64) uint32 { - return _22to32<<22 | (uint32(imm9)&0b111111111)<<12 | _1011<<10 | rn<<5 | rt -} - -// encodeFpuRRR encodes as single or double precision (depending on `_64bit`) of Floating-point data-processing (2 source) in -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en -func encodeFpuRRR(op fpuBinOp, rd, rn, rm uint32, _64bit bool) (ret uint32) { - // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADD--vector--Add-vectors--scalar--floating-point-and-integer- - var opcode uint32 - switch op { - case fpuBinOpAdd: - opcode = 0b0010 - case fpuBinOpSub: - opcode = 0b0011 - case fpuBinOpMul: - opcode = 0b0000 - case fpuBinOpDiv: - opcode = 0b0001 - case fpuBinOpMax: - opcode = 0b0100 - case fpuBinOpMin: - opcode = 0b0101 - default: - panic("BUG") - } - var ptype uint32 - if _64bit { - ptype = 0b01 - } - return 0b1111<<25 | ptype<<22 | 0b1<<21 | rm<<16 | opcode<<12 | 0b1<<11 | rn<<5 | rd -} - -// encodeAluRRImm12 encodes as Add/subtract (immediate) in -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en -func encodeAluRRImm12(op aluOp, rd, rn uint32, imm12 uint16, shiftBit byte, _64bit bool) uint32 { - var _31to24 uint32 - switch op { - case aluOpAdd: - _31to24 = 0b00_10001 - case aluOpAddS: - _31to24 = 0b01_10001 - case aluOpSub: - _31to24 = 0b10_10001 - case aluOpSubS: - _31to24 = 0b11_10001 - default: - panic("BUG") - } - if _64bit { - _31to24 |= 0b1 << 7 - } - return _31to24<<24 | uint32(shiftBit)<<22 | uint32(imm12&0b111111111111)<<10 | rn<<5 | rd -} - -// encodeAluRRR encodes as Data Processing (shifted register), depending on aluOp. -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_shift -func encodeAluRRRShift(op aluOp, rd, rn, rm, amount uint32, shiftOp shiftOp, _64bit bool) uint32 { - var _31to24 uint32 - var opc, n uint32 - switch op { - case aluOpAdd: - _31to24 = 0b00001011 - case aluOpAddS: - _31to24 = 0b00101011 - case aluOpSub: - _31to24 = 0b01001011 - case aluOpSubS: - _31to24 = 0b01101011 - case aluOpAnd, aluOpOrr, aluOpEor, aluOpAnds: - // "Logical (shifted register)". 
- switch op { - case aluOpAnd: - // all zeros - case aluOpOrr: - opc = 0b01 - case aluOpEor: - opc = 0b10 - case aluOpAnds: - opc = 0b11 - } - _31to24 = 0b000_01010 - default: - panic(op.String()) - } - - if _64bit { - _31to24 |= 0b1 << 7 - } - - var shift uint32 - switch shiftOp { - case shiftOpLSL: - shift = 0b00 - case shiftOpLSR: - shift = 0b01 - case shiftOpASR: - shift = 0b10 - default: - panic(shiftOp.String()) - } - return opc<<29 | n<<21 | _31to24<<24 | shift<<22 | rm<<16 | (amount << 10) | (rn << 5) | rd -} - -// "Add/subtract (extended register)" in -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_ext -func encodeAluRRRExtend(ao aluOp, rd, rn, rm uint32, extOp extendOp, to byte) uint32 { - var s, op uint32 - switch ao { - case aluOpAdd: - op = 0b0 - case aluOpAddS: - op, s = 0b0, 0b1 - case aluOpSub: - op = 0b1 - case aluOpSubS: - op, s = 0b1, 0b1 - default: - panic("BUG: extended register operand can be used only for add/sub") - } - - var sf uint32 - if to == 64 { - sf = 0b1 - } - - var option uint32 - switch extOp { - case extendOpUXTB: - option = 0b000 - case extendOpUXTH: - option = 0b001 - case extendOpUXTW: - option = 0b010 - case extendOpSXTB: - option = 0b100 - case extendOpSXTH: - option = 0b101 - case extendOpSXTW: - option = 0b110 - case extendOpSXTX, extendOpUXTX: - panic(fmt.Sprintf("%s is essentially noop, and should be handled much earlier than encoding", extOp.String())) - } - return sf<<31 | op<<30 | s<<29 | 0b1011001<<21 | rm<<16 | option<<13 | rn<<5 | rd -} - -// encodeAluRRR encodes as Data Processing (register), depending on aluOp. -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en -func encodeAluRRR(op aluOp, rd, rn, rm uint32, _64bit, isRnSp bool) uint32 { - var _31to21, _15to10 uint32 - switch op { - case aluOpAdd: - if isRnSp { - // "Extended register" with UXTW. - _31to21 = 0b00001011_001 - _15to10 = 0b011000 - } else { - // "Shifted register" with shift = 0 - _31to21 = 0b00001011_000 - } - case aluOpAddS: - if isRnSp { - panic("TODO") - } - // "Shifted register" with shift = 0 - _31to21 = 0b00101011_000 - case aluOpSub: - if isRnSp { - // "Extended register" with UXTW. - _31to21 = 0b01001011_001 - _15to10 = 0b011000 - } else { - // "Shifted register" with shift = 0 - _31to21 = 0b01001011_000 - } - case aluOpSubS: - if isRnSp { - panic("TODO") - } - // "Shifted register" with shift = 0 - _31to21 = 0b01101011_000 - case aluOpAnd, aluOpOrr, aluOpOrn, aluOpEor, aluOpAnds: - // "Logical (shifted register)". - var opc, n uint32 - switch op { - case aluOpAnd: - // all zeros - case aluOpOrr: - opc = 0b01 - case aluOpOrn: - opc = 0b01 - n = 1 - case aluOpEor: - opc = 0b10 - case aluOpAnds: - opc = 0b11 - } - _31to21 = 0b000_01010_000 | opc<<8 | n - case aluOpLsl, aluOpAsr, aluOpLsr, aluOpRotR: - // "Data-processing (2 source)". - _31to21 = 0b00011010_110 - switch op { - case aluOpLsl: - _15to10 = 0b001000 - case aluOpLsr: - _15to10 = 0b001001 - case aluOpAsr: - _15to10 = 0b001010 - case aluOpRotR: - _15to10 = 0b001011 - } - case aluOpSDiv: - // "Data-processing (2 source)". - _31to21 = 0b11010110 - _15to10 = 0b000011 - case aluOpUDiv: - // "Data-processing (2 source)". 
- _31to21 = 0b11010110 - _15to10 = 0b000010 - default: - panic(op.String()) - } - if _64bit { - _31to21 |= 0b1 << 10 - } - return _31to21<<21 | rm<<16 | (_15to10 << 10) | (rn << 5) | rd -} - -// encodeLogicalShiftedRegister encodes as Logical (shifted register) in -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en -func encodeLogicalShiftedRegister(sf_opc uint32, shift_N uint32, rm uint32, imm6 uint32, rn, rd uint32) (ret uint32) { - ret = sf_opc << 29 - ret |= 0b01010 << 24 - ret |= shift_N << 21 - ret |= rm << 16 - ret |= imm6 << 10 - ret |= rn << 5 - ret |= rd - return -} - -// encodeAddSubtractImmediate encodes as Add/subtract (immediate) in -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en -func encodeAddSubtractImmediate(sf_op_s uint32, sh uint32, imm12 uint32, rn, rd uint32) (ret uint32) { - ret = sf_op_s << 29 - ret |= 0b100010 << 23 - ret |= sh << 22 - ret |= imm12 << 10 - ret |= rn << 5 - ret |= rd - return -} - -// encodePreOrPostIndexLoadStorePair64 encodes as Load/store pair (pre/post-indexed) in -// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers- -// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STP--Store-Pair-of-Registers- -func encodePreOrPostIndexLoadStorePair64(pre bool, load bool, rn, rt, rt2 uint32, imm7 int64) (ret uint32) { - if imm7%8 != 0 { - panic("imm7 for pair load/store must be a multiple of 8") - } - imm7 /= 8 - ret = rt - ret |= rn << 5 - ret |= rt2 << 10 - ret |= (uint32(imm7) & 0b1111111) << 15 - if load { - ret |= 0b1 << 22 - } - ret |= 0b101010001 << 23 - if pre { - ret |= 0b1 << 24 - } - return -} - -// encodeUnconditionalBranch encodes as B or BL instructions: -// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/B--Branch- -// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/BL--Branch-with-Link- -func encodeUnconditionalBranch(link bool, imm26 int64) (ret uint32) { - if imm26%4 != 0 { - panic("imm26 for branch must be a multiple of 4") - } - imm26 /= 4 - ret = uint32(imm26 & 0b11_11111111_11111111_11111111) - ret |= 0b101 << 26 - if link { - ret |= 0b1 << 31 - } - return -} - -// encodeCBZCBNZ encodes as either CBZ or CBNZ: -// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CBZ--Compare-and-Branch-on-Zero- -// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CBNZ--Compare-and-Branch-on-Nonzero- -func encodeCBZCBNZ(rt uint32, nz bool, imm19 uint32, _64bit bool) (ret uint32) { - ret = rt - ret |= imm19 << 5 - if nz { - ret |= 1 << 24 - } - ret |= 0b11010 << 25 - if _64bit { - ret |= 1 << 31 - } - return -} - -// encodeMoveWideImmediate encodes as either MOVZ, MOVN or MOVK, as Move wide (immediate) in -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en -// -// "shift" must have been divided by 16 at this point. 
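// As a rough, illustrative example (values chosen for this note, not taken from the
// original source): MOVZ X0, #0x1234, LSL #16 uses the architectural opc of 0b10 for
// MOVZ, rd = 0, imm = 0x1234, shift = 1 (16/16) and _64bit = 1, so
//
//	encodeMoveWideImmediate(0b10, 0, 0x1234, 1, 1)
//
// should yield the word 0xD2A24680.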
-func encodeMoveWideImmediate(opc uint32, rd uint32, imm uint64, shift, _64bit uint32) (ret uint32) { - ret = rd - ret |= uint32(imm&0xffff) << 5 - ret |= (shift) << 21 - ret |= 0b100101 << 23 - ret |= opc << 29 - ret |= _64bit << 31 - return -} - -// encodeAluRRImm encodes as "Bitfield" in -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en#log_imm -func encodeAluRRImm(op aluOp, rd, rn, amount, _64bit uint32) uint32 { - var opc uint32 - var immr, imms uint32 - switch op { - case aluOpLsl: - // LSL (immediate) is an alias for UBFM. - // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/UBFM--Unsigned-Bitfield-Move-?lang=en - opc = 0b10 - if amount == 0 { - // This can be encoded as NOP, but we don't do it for consistency: lsr xn, xm, #0 - immr = 0 - if _64bit == 1 { - imms = 0b111111 - } else { - imms = 0b11111 - } - } else { - if _64bit == 1 { - immr = 64 - amount - } else { - immr = (32 - amount) & 0b11111 - } - imms = immr - 1 - } - case aluOpLsr: - // LSR (immediate) is an alias for UBFM. - // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LSR--immediate---Logical-Shift-Right--immediate---an-alias-of-UBFM-?lang=en - opc = 0b10 - imms, immr = 0b011111|_64bit<<5, amount - case aluOpAsr: - // ASR (immediate) is an alias for SBFM. - // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/SBFM--Signed-Bitfield-Move-?lang=en - opc = 0b00 - imms, immr = 0b011111|_64bit<<5, amount - default: - panic(op.String()) - } - return _64bit<<31 | opc<<29 | 0b100110<<23 | _64bit<<22 | immr<<16 | imms<<10 | rn<<5 | rd -} - -// encodeVecLanes encodes as Data Processing (Advanced SIMD across lanes) depending on vecOp in -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en -func encodeVecLanes(op vecOp, rd uint32, rn uint32, arr vecArrangement) uint32 { - var u, q, size, opcode uint32 - switch arr { - case vecArrangement8B: - q, size = 0b0, 0b00 - case vecArrangement16B: - q, size = 0b1, 0b00 - case vecArrangement4H: - q, size = 0, 0b01 - case vecArrangement8H: - q, size = 1, 0b01 - case vecArrangement4S: - q, size = 1, 0b10 - default: - panic("unsupported arrangement: " + arr.String()) - } - switch op { - case vecOpUaddlv: - u, opcode = 1, 0b00011 - case vecOpUminv: - u, opcode = 1, 0b11010 - case vecOpAddv: - u, opcode = 0, 0b11011 - default: - panic("unsupported or illegal vecOp: " + op.String()) - } - return q<<30 | u<<29 | 0b1110<<24 | size<<22 | 0b11000<<17 | opcode<<12 | 0b10<<10 | rn<<5 | rd -} - -// encodeVecLanes encodes as Data Processing (Advanced SIMD scalar shift by immediate) depending on vecOp in -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en -func encodeVecShiftImm(op vecOp, rd uint32, rn, amount uint32, arr vecArrangement) uint32 { - var u, q, immh, immb, opcode uint32 - switch op { - case vecOpSshll: - u, opcode = 0b0, 0b10100 - case vecOpUshll: - u, opcode = 0b1, 0b10100 - case vecOpSshr: - u, opcode = 0, 0b00000 - default: - panic("unsupported or illegal vecOp: " + op.String()) - } - switch arr { - case vecArrangement16B: - q = 0b1 - fallthrough - case vecArrangement8B: - immh = 0b0001 - immb = 8 - uint32(amount&0b111) - case vecArrangement8H: - q = 0b1 - fallthrough - case vecArrangement4H: - v := 16 - uint32(amount&0b1111) - immb = v & 0b111 - immh = 0b0010 | (v >> 
3) - case vecArrangement4S: - q = 0b1 - fallthrough - case vecArrangement2S: - v := 32 - uint32(amount&0b11111) - immb = v & 0b111 - immh = 0b0100 | (v >> 3) - case vecArrangement2D: - q = 0b1 - v := 64 - uint32(amount&0b111111) - immb = v & 0b111 - immh = 0b1000 | (v >> 3) - default: - panic("unsupported arrangement: " + arr.String()) - } - return q<<30 | u<<29 | 0b011110<<23 | immh<<19 | immb<<16 | 0b000001<<10 | opcode<<11 | 0b1<<10 | rn<<5 | rd -} - -// encodeVecTbl encodes as Data Processing (Advanced SIMD table lookup) in -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp -// -// Note: tblOp may encode tbl1, tbl2... in the future. Currently, it is ignored. -func encodeVecTbl(nregs, rd, rn, rm uint32, arr vecArrangement) uint32 { - var q, op2, len, op uint32 - - switch nregs { - case 1: - // tbl: single-register - len = 0b00 - case 2: - // tbl2: 2-register table - len = 0b01 - default: - panic(fmt.Sprintf("unsupported number or registers %d", nregs)) - } - switch arr { - case vecArrangement8B: - q = 0b0 - case vecArrangement16B: - q = 0b1 - default: - panic("unsupported arrangement: " + arr.String()) - } - - return q<<30 | 0b001110<<24 | op2<<22 | rm<<16 | len<<13 | op<<12 | rn<<5 | rd -} - -// encodeVecMisc encodes as Data Processing (Advanced SIMD two-register miscellaneous) depending on vecOp in -// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp -func encodeAdvancedSIMDTwoMisc(op vecOp, rd, rn uint32, arr vecArrangement) uint32 { - var q, u, size, opcode uint32 - switch op { - case vecOpCnt: - opcode = 0b00101 - switch arr { - case vecArrangement8B: - q, size = 0b0, 0b00 - case vecArrangement16B: - q, size = 0b1, 0b00 - default: - panic("unsupported arrangement: " + arr.String()) - } - case vecOpCmeq0: - if arr == vecArrangement1D { - panic("unsupported arrangement: " + arr.String()) - } - opcode = 0b01001 - size, q = arrToSizeQEncoded(arr) - case vecOpNot: - u = 1 - opcode = 0b00101 - switch arr { - case vecArrangement8B: - q, size = 0b0, 0b00 - case vecArrangement16B: - q, size = 0b1, 0b00 - default: - panic("unsupported arrangement: " + arr.String()) - } - case vecOpAbs: - if arr == vecArrangement1D { - panic("unsupported arrangement: " + arr.String()) - } - opcode = 0b01011 - u = 0b0 - size, q = arrToSizeQEncoded(arr) - case vecOpNeg: - if arr == vecArrangement1D { - panic("unsupported arrangement: " + arr.String()) - } - opcode = 0b01011 - u = 0b1 - size, q = arrToSizeQEncoded(arr) - case vecOpFabs: - if arr < vecArrangement2S || arr == vecArrangement1D { - panic("unsupported arrangement: " + arr.String()) - } - opcode = 0b01111 - u = 0b0 - size, q = arrToSizeQEncoded(arr) - case vecOpFneg: - if arr < vecArrangement2S || arr == vecArrangement1D { - panic("unsupported arrangement: " + arr.String()) - } - opcode = 0b01111 - u = 0b1 - size, q = arrToSizeQEncoded(arr) - case vecOpFrintm: - u = 0b0 - opcode = 0b11001 - switch arr { - case vecArrangement2S: - q, size = 0b0, 0b00 - case vecArrangement4S: - q, size = 0b1, 0b00 - case vecArrangement2D: - q, size = 0b1, 0b01 - default: - panic("unsupported arrangement: " + arr.String()) - } - case vecOpFrintn: - u = 0b0 - opcode = 0b11000 - switch arr { - case vecArrangement2S: - q, size = 0b0, 0b00 - case vecArrangement4S: - q, size = 0b1, 0b00 - case vecArrangement2D: - q, size = 0b1, 0b01 - default: - panic("unsupported arrangement: 
" + arr.String()) - } - case vecOpFrintp: - u = 0b0 - opcode = 0b11000 - if arr < vecArrangement2S || arr == vecArrangement1D { - panic("unsupported arrangement: " + arr.String()) - } - size, q = arrToSizeQEncoded(arr) - case vecOpFrintz: - u = 0b0 - opcode = 0b11001 - if arr < vecArrangement2S || arr == vecArrangement1D { - panic("unsupported arrangement: " + arr.String()) - } - size, q = arrToSizeQEncoded(arr) - case vecOpFsqrt: - if arr < vecArrangement2S || arr == vecArrangement1D { - panic("unsupported arrangement: " + arr.String()) - } - opcode = 0b11111 - u = 0b1 - size, q = arrToSizeQEncoded(arr) - case vecOpFcvtl: - opcode = 0b10111 - u = 0b0 - switch arr { - case vecArrangement2S: - size, q = 0b01, 0b0 - case vecArrangement4H: - size, q = 0b00, 0b0 - default: - panic("unsupported arrangement: " + arr.String()) - } - case vecOpFcvtn: - opcode = 0b10110 - u = 0b0 - switch arr { - case vecArrangement2S: - size, q = 0b01, 0b0 - case vecArrangement4H: - size, q = 0b00, 0b0 - default: - panic("unsupported arrangement: " + arr.String()) - } - case vecOpFcvtzs: - opcode = 0b11011 - u = 0b0 - switch arr { - case vecArrangement2S: - q, size = 0b0, 0b10 - case vecArrangement4S: - q, size = 0b1, 0b10 - case vecArrangement2D: - q, size = 0b1, 0b11 - default: - panic("unsupported arrangement: " + arr.String()) - } - case vecOpFcvtzu: - opcode = 0b11011 - u = 0b1 - switch arr { - case vecArrangement2S: - q, size = 0b0, 0b10 - case vecArrangement4S: - q, size = 0b1, 0b10 - case vecArrangement2D: - q, size = 0b1, 0b11 - default: - panic("unsupported arrangement: " + arr.String()) - } - case vecOpScvtf: - opcode = 0b11101 - u = 0b0 - switch arr { - case vecArrangement4S: - q, size = 0b1, 0b00 - case vecArrangement2S: - q, size = 0b0, 0b00 - case vecArrangement2D: - q, size = 0b1, 0b01 - default: - panic("unsupported arrangement: " + arr.String()) - } - case vecOpUcvtf: - opcode = 0b11101 - u = 0b1 - switch arr { - case vecArrangement4S: - q, size = 0b1, 0b00 - case vecArrangement2S: - q, size = 0b0, 0b00 - case vecArrangement2D: - q, size = 0b1, 0b01 - default: - panic("unsupported arrangement: " + arr.String()) - } - case vecOpSqxtn: - // When q == 1 it encodes sqxtn2 (operates on upper 64 bits). - opcode = 0b10100 - u = 0b0 - if arr > vecArrangement4S { - panic("unsupported arrangement: " + arr.String()) - } - size, q = arrToSizeQEncoded(arr) - case vecOpUqxtn: - // When q == 1 it encodes uqxtn2 (operates on upper 64 bits). - opcode = 0b10100 - u = 0b1 - if arr > vecArrangement4S { - panic("unsupported arrangement: " + arr.String()) - } - size, q = arrToSizeQEncoded(arr) - case vecOpSqxtun: - // When q == 1 it encodes sqxtun2 (operates on upper 64 bits). 
- opcode = 0b10010 // 0b10100 - u = 0b1 - if arr > vecArrangement4S { - panic("unsupported arrangement: " + arr.String()) - } - size, q = arrToSizeQEncoded(arr) - case vecOpRev64: - opcode = 0b00000 - size, q = arrToSizeQEncoded(arr) - case vecOpXtn: - u = 0b0 - opcode = 0b10010 - size, q = arrToSizeQEncoded(arr) - case vecOpShll: - u = 0b1 - opcode = 0b10011 - switch arr { - case vecArrangement8B: - q, size = 0b0, 0b00 - case vecArrangement4H: - q, size = 0b0, 0b01 - case vecArrangement2S: - q, size = 0b0, 0b10 - default: - panic("unsupported arrangement: " + arr.String()) - } - default: - panic("unsupported or illegal vecOp: " + op.String()) - } - return q<<30 | u<<29 | 0b01110<<24 | size<<22 | 0b10000<<17 | opcode<<12 | 0b10<<10 | rn<<5 | rd -} - -// brTableSequenceOffsetTableBegin is the offset inside the brTableSequence where the table begins after 4 instructions -const brTableSequenceOffsetTableBegin = 16 - -func encodeBrTableSequence(c backend.Compiler, index regalloc.VReg, targets []uint32) { - tmpRegNumber := regNumberInEncoding[tmp] - indexNumber := regNumberInEncoding[index.RealReg()] - - // adr tmpReg, PC+16 (PC+16 is the address of the first label offset) - // ldrsw index, [tmpReg, index, UXTW 2] ;; index = int64(*(tmpReg + index*8)) - // add tmpReg, tmpReg, index - // br tmpReg - // [offset_to_l1, offset_to_l2, ..., offset_to_lN] - c.Emit4Bytes(encodeAdr(tmpRegNumber, 16)) - c.Emit4Bytes(encodeLoadOrStore(sLoad32, indexNumber, - addressMode{kind: addressModeKindRegScaledExtended, rn: tmpRegVReg, rm: index, extOp: extendOpUXTW}, - )) - c.Emit4Bytes(encodeAluRRR(aluOpAdd, tmpRegNumber, tmpRegNumber, indexNumber, true, false)) - c.Emit4Bytes(encodeUnconditionalBranchReg(tmpRegNumber, false)) - - // Offsets are resolved in ResolveRelativeAddress phase. - for _, offset := range targets { - if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable { - // Inlined offset tables cannot be disassembled properly, so pad dummy instructions to make the debugging easier. - c.Emit4Bytes(dummyInstruction) - } else { - c.Emit4Bytes(offset) - } - } -} - -// encodeExitSequence matches the implementation detail of functionABI.emitGoEntryPreamble. -func encodeExitSequence(c backend.Compiler, ctxReg regalloc.VReg) { - // Restore the FP, SP and LR, and return to the Go code: - // ldr lr, [ctxReg, #GoReturnAddress] - // ldr fp, [ctxReg, #OriginalFramePointer] - // ldr tmp, [ctxReg, #OriginalStackPointer] - // mov sp, tmp ;; sp cannot be str'ed directly. - // ret ;; --> return to the Go code - - var ctxEvicted bool - if ctx := ctxReg.RealReg(); ctx == fp || ctx == lr { - // In order to avoid overwriting the context register, we move ctxReg to tmp. 
- c.Emit4Bytes(encodeMov64(regNumberInEncoding[tmp], regNumberInEncoding[ctx], false, false)) - ctxReg = tmpRegVReg - ctxEvicted = true - } - - restoreLr := encodeLoadOrStore( - uLoad64, - regNumberInEncoding[lr], - addressMode{ - kind: addressModeKindRegUnsignedImm12, - rn: ctxReg, - imm: wazevoapi.ExecutionContextOffsetGoReturnAddress.I64(), - }, - ) - - restoreFp := encodeLoadOrStore( - uLoad64, - regNumberInEncoding[fp], - addressMode{ - kind: addressModeKindRegUnsignedImm12, - rn: ctxReg, - imm: wazevoapi.ExecutionContextOffsetOriginalFramePointer.I64(), - }, - ) - - restoreSpToTmp := encodeLoadOrStore( - uLoad64, - regNumberInEncoding[tmp], - addressMode{ - kind: addressModeKindRegUnsignedImm12, - rn: ctxReg, - imm: wazevoapi.ExecutionContextOffsetOriginalStackPointer.I64(), - }, - ) - - movTmpToSp := encodeAddSubtractImmediate(0b100, 0, 0, - regNumberInEncoding[tmp], regNumberInEncoding[sp]) - - c.Emit4Bytes(restoreFp) - c.Emit4Bytes(restoreLr) - c.Emit4Bytes(restoreSpToTmp) - c.Emit4Bytes(movTmpToSp) - c.Emit4Bytes(encodeRet()) - if !ctxEvicted { - // In order to have the fixed-length exit sequence, we need to padd the binary. - // Since this will never be reached, we insert a dummy instruction. - c.Emit4Bytes(dummyInstruction) - } -} - -func encodeRet() uint32 { - // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/RET--Return-from-subroutine-?lang=en - return 0b1101011001011111<<16 | regNumberInEncoding[lr]<<5 -} - -func encodeAtomicRmw(op atomicRmwOp, rs, rt, rn uint32, size uint32) uint32 { - var _31to21, _15to10, sz uint32 - - switch size { - case 8: - sz = 0b11 - case 4: - sz = 0b10 - case 2: - sz = 0b01 - case 1: - sz = 0b00 - } - - _31to21 = 0b00111000_111 | sz<<9 - - switch op { - case atomicRmwOpAdd: - _15to10 = 0b000000 - case atomicRmwOpClr: - _15to10 = 0b000100 - case atomicRmwOpSet: - _15to10 = 0b001100 - case atomicRmwOpEor: - _15to10 = 0b001000 - case atomicRmwOpSwp: - _15to10 = 0b100000 - } - - return _31to21<<21 | rs<<16 | _15to10<<10 | rn<<5 | rt -} - -func encodeAtomicCas(rs, rt, rn uint32, size uint32) uint32 { - var _31to21, _15to10, sz uint32 - - switch size { - case 8: - sz = 0b11 - case 4: - sz = 0b10 - case 2: - sz = 0b01 - case 1: - sz = 0b00 - } - - _31to21 = 0b00001000_111 | sz<<9 - _15to10 = 0b111111 - - return _31to21<<21 | rs<<16 | _15to10<<10 | rn<<5 | rt -} - -func encodeAtomicLoadStore(rn, rt, size, l uint32) uint32 { - var _31to21, _20to16, _15to10, sz uint32 - - switch size { - case 8: - sz = 0b11 - case 4: - sz = 0b10 - case 2: - sz = 0b01 - case 1: - sz = 0b00 - } - - _31to21 = 0b00001000_100 | sz<<9 | l<<1 - _20to16 = 0b11111 - _15to10 = 0b111111 - - return _31to21<<21 | _20to16<<16 | _15to10<<10 | rn<<5 | rt -} - -func encodeDMB() uint32 { - return 0b11010101000000110011101110111111 -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go deleted file mode 100644 index 6c6824fb0..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go +++ /dev/null @@ -1,301 +0,0 @@ -package arm64 - -import ( - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" -) - -// lowerConstant allocates a new VReg and inserts the instruction to load the constant value. 
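// As a sketch of the strategies used in this file (the example constants are
// illustrative, not from the original source): the zero integer is just a move from
// xzr; a value such as 0x0000_ffff_ffff_0000 is a valid ARM64 "bitmask immediate" and
// can be materialized with a single ORR against xzr; a value such as 0x12345678 is
// not, so it falls back to a MOVZ of the low half-word followed by a MOVK of the next:
//
//	movz w0, #0x5678
//	movk w0, #0x1234, lsl #16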
-func (m *machine) lowerConstant(instr *ssa.Instruction) (vr regalloc.VReg) { - val := instr.Return() - valType := val.Type() - - vr = m.compiler.AllocateVReg(valType) - v := instr.ConstantVal() - m.insertLoadConstant(v, valType, vr) - return -} - -// InsertLoadConstantBlockArg implements backend.Machine. -func (m *machine) InsertLoadConstantBlockArg(instr *ssa.Instruction, vr regalloc.VReg) { - val := instr.Return() - valType := val.Type() - v := instr.ConstantVal() - load := m.allocateInstr() - load.asLoadConstBlockArg(v, valType, vr) - m.insert(load) -} - -func (m *machine) lowerLoadConstantBlockArgAfterRegAlloc(i *instruction) { - v, typ, dst := i.loadConstBlockArgData() - m.insertLoadConstant(v, typ, dst) -} - -func (m *machine) insertLoadConstant(v uint64, valType ssa.Type, vr regalloc.VReg) { - if valType.Bits() < 64 { // Clear the redundant bits just in case it's unexpectedly sign-extended, etc. - v = v & ((1 << valType.Bits()) - 1) - } - - switch valType { - case ssa.TypeF32: - loadF := m.allocateInstr() - loadF.asLoadFpuConst32(vr, v) - m.insert(loadF) - case ssa.TypeF64: - loadF := m.allocateInstr() - loadF.asLoadFpuConst64(vr, v) - m.insert(loadF) - case ssa.TypeI32: - if v == 0 { - m.InsertMove(vr, xzrVReg, ssa.TypeI32) - } else { - m.lowerConstantI32(vr, int32(v)) - } - case ssa.TypeI64: - if v == 0 { - m.InsertMove(vr, xzrVReg, ssa.TypeI64) - } else { - m.lowerConstantI64(vr, int64(v)) - } - default: - panic("TODO") - } -} - -// The following logics are based on the old asm/arm64 package. -// https://github.com/tetratelabs/wazero/blob/39f2ff23a6d609e10c82b9cc0b981f6de5b87a9c/internal/asm/arm64/impl.go - -func (m *machine) lowerConstantI32(dst regalloc.VReg, c int32) { - // Following the logic here: - // https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1637 - ic := int64(uint32(c)) - if ic >= 0 && (ic <= 0xfff || (ic&0xfff) == 0 && (uint64(ic>>12) <= 0xfff)) { - if isBitMaskImmediate(uint64(c), false) { - m.lowerConstViaBitMaskImmediate(uint64(uint32(c)), dst, false) - return - } - } - - if t := const16bitAligned(int64(uint32(c))); t >= 0 { - // If the const can fit within 16-bit alignment, for example, 0xffff, 0xffff_0000 or 0xffff_0000_0000_0000 - // We could load it into temporary with movk. - m.insertMOVZ(dst, uint64(uint32(c)>>(16*t)), t, false) - } else if t := const16bitAligned(int64(^c)); t >= 0 { - // Also, if the inverse of the const can fit within 16-bit range, do the same ^^. - m.insertMOVN(dst, uint64(^c>>(16*t)), t, false) - } else if isBitMaskImmediate(uint64(uint32(c)), false) { - m.lowerConstViaBitMaskImmediate(uint64(c), dst, false) - } else { - // Otherwise, we use MOVZ and MOVK to load it. - c16 := uint16(c) - m.insertMOVZ(dst, uint64(c16), 0, false) - c16 = uint16(uint32(c) >> 16) - m.insertMOVK(dst, uint64(c16), 1, false) - } -} - -func (m *machine) lowerConstantI64(dst regalloc.VReg, c int64) { - // Following the logic here: - // https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1798-L1852 - if c >= 0 && (c <= 0xfff || (c&0xfff) == 0 && (uint64(c>>12) <= 0xfff)) { - if isBitMaskImmediate(uint64(c), true) { - m.lowerConstViaBitMaskImmediate(uint64(c), dst, true) - return - } - } - - if t := const16bitAligned(c); t >= 0 { - // If the const can fit within 16-bit alignment, for example, 0xffff, 0xffff_0000 or 0xffff_0000_0000_0000 - // We could load it into temporary with movk. 
- m.insertMOVZ(dst, uint64(c)>>(16*t), t, true) - } else if t := const16bitAligned(^c); t >= 0 { - // Also, if the reverse of the const can fit within 16-bit range, do the same ^^. - m.insertMOVN(dst, uint64(^c)>>(16*t), t, true) - } else if isBitMaskImmediate(uint64(c), true) { - m.lowerConstViaBitMaskImmediate(uint64(c), dst, true) - } else { - m.load64bitConst(c, dst) - } -} - -func (m *machine) lowerConstViaBitMaskImmediate(c uint64, dst regalloc.VReg, b64 bool) { - instr := m.allocateInstr() - instr.asALUBitmaskImm(aluOpOrr, dst, xzrVReg, c, b64) - m.insert(instr) -} - -// isBitMaskImmediate determines if the value can be encoded as "bitmask immediate". -// -// Such an immediate is a 32-bit or 64-bit pattern viewed as a vector of identical elements of size e = 2, 4, 8, 16, 32, or 64 bits. -// Each element contains the same sub-pattern: a single run of 1 to e-1 non-zero bits, rotated by 0 to e-1 bits. -// -// See https://developer.arm.com/documentation/dui0802/b/A64-General-Instructions/MOV--bitmask-immediate- -func isBitMaskImmediate(x uint64, _64 bool) bool { - // All zeros and ones are not "bitmask immediate" by definition. - if x == 0 || (_64 && x == 0xffff_ffff_ffff_ffff) || (!_64 && x == 0xffff_ffff) { - return false - } - - switch { - case x != x>>32|x<<32: - // e = 64 - case x != x>>16|x<<48: - // e = 32 (x == x>>32|x<<32). - // e.g. 0x00ff_ff00_00ff_ff00 - x = uint64(int32(x)) - case x != x>>8|x<<56: - // e = 16 (x == x>>16|x<<48). - // e.g. 0x00ff_00ff_00ff_00ff - x = uint64(int16(x)) - case x != x>>4|x<<60: - // e = 8 (x == x>>8|x<<56). - // e.g. 0x0f0f_0f0f_0f0f_0f0f - x = uint64(int8(x)) - default: - // e = 4 or 2. - return true - } - return sequenceOfSetbits(x) || sequenceOfSetbits(^x) -} - -// sequenceOfSetbits returns true if the number's binary representation is the sequence set bit (1). -// For example: 0b1110 -> true, 0b1010 -> false -func sequenceOfSetbits(x uint64) bool { - y := getLowestBit(x) - // If x is a sequence of set bit, this should results in the number - // with only one set bit (i.e. power of two). - y += x - return (y-1)&y == 0 -} - -func getLowestBit(x uint64) uint64 { - return x & (^x + 1) -} - -// const16bitAligned check if the value is on the 16-bit alignment. -// If so, returns the shift num divided by 16, and otherwise -1. -func const16bitAligned(v int64) (ret int) { - ret = -1 - for s := 0; s < 64; s += 16 { - if (uint64(v) &^ (uint64(0xffff) << uint(s))) == 0 { - ret = s / 16 - break - } - } - return -} - -// load64bitConst loads a 64-bit constant into the register, following the same logic to decide how to load large 64-bit -// consts as in the Go assembler. -// -// See https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L6632-L6759 -func (m *machine) load64bitConst(c int64, dst regalloc.VReg) { - var bits [4]uint64 - var zeros, negs int - for i := 0; i < 4; i++ { - bits[i] = uint64(c) >> uint(i*16) & 0xffff - if v := bits[i]; v == 0 { - zeros++ - } else if v == 0xffff { - negs++ - } - } - - if zeros == 3 { - // one MOVZ instruction. - for i, v := range bits { - if v != 0 { - m.insertMOVZ(dst, v, i, true) - } - } - } else if negs == 3 { - // one MOVN instruction. - for i, v := range bits { - if v != 0xffff { - v = ^v - m.insertMOVN(dst, v, i, true) - } - } - } else if zeros == 2 { - // one MOVZ then one OVK. - var movz bool - for i, v := range bits { - if !movz && v != 0 { // MOVZ. 
- m.insertMOVZ(dst, v, i, true) - movz = true - } else if v != 0 { - m.insertMOVK(dst, v, i, true) - } - } - - } else if negs == 2 { - // one MOVN then one or two MOVK. - var movn bool - for i, v := range bits { // Emit MOVN. - if !movn && v != 0xffff { - v = ^v - // https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN - m.insertMOVN(dst, v, i, true) - movn = true - } else if v != 0xffff { - m.insertMOVK(dst, v, i, true) - } - } - - } else if zeros == 1 { - // one MOVZ then two MOVK. - var movz bool - for i, v := range bits { - if !movz && v != 0 { // MOVZ. - m.insertMOVZ(dst, v, i, true) - movz = true - } else if v != 0 { - m.insertMOVK(dst, v, i, true) - } - } - - } else if negs == 1 { - // one MOVN then two MOVK. - var movn bool - for i, v := range bits { // Emit MOVN. - if !movn && v != 0xffff { - v = ^v - // https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN - m.insertMOVN(dst, v, i, true) - movn = true - } else if v != 0xffff { - m.insertMOVK(dst, v, i, true) - } - } - - } else { - // one MOVZ then up to three MOVK. - var movz bool - for i, v := range bits { - if !movz && v != 0 { // MOVZ. - m.insertMOVZ(dst, v, i, true) - movz = true - } else if v != 0 { - m.insertMOVK(dst, v, i, true) - } - } - } -} - -func (m *machine) insertMOVZ(dst regalloc.VReg, v uint64, shift int, dst64 bool) { - instr := m.allocateInstr() - instr.asMOVZ(dst, v, uint32(shift), dst64) - m.insert(instr) -} - -func (m *machine) insertMOVK(dst regalloc.VReg, v uint64, shift int, dst64 bool) { - instr := m.allocateInstr() - instr.asMOVK(dst, v, uint32(shift), dst64) - m.insert(instr) -} - -func (m *machine) insertMOVN(dst regalloc.VReg, v uint64, shift int, dst64 bool) { - instr := m.allocateInstr() - instr.asMOVN(dst, v, uint32(shift), dst64) - m.insert(instr) -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go deleted file mode 100644 index f9df356c0..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go +++ /dev/null @@ -1,2224 +0,0 @@ -package arm64 - -// Files prefixed as lower_instr** do the instruction selection, meaning that lowering SSA level instructions -// into machine specific instructions. -// -// Importantly, what the lower** functions does includes tree-matching; find the pattern from the given instruction tree, -// and merge the multiple instructions if possible. It can be considered as "N:1" instruction selection. - -import ( - "fmt" - "math" - - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" - "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" -) - -// LowerSingleBranch implements backend.Machine. 
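// A fallthrough jump emits nothing, a jump to the return block becomes RET, and any
// other jump becomes a single B to the target label. As an illustration of the "N:1"
// tree-matching mentioned in the header above (pseudo-SSA, made up for this note):
//
//	v2 = Icmp slt, v0, v1
//	Brnz v2, blk2
//
// is lowered by LowerConditionalBranch below into a flag-setting SUBS followed by a
// B.LT, without ever materializing v2 in a register.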
-func (m *machine) LowerSingleBranch(br *ssa.Instruction) { - switch br.Opcode() { - case ssa.OpcodeJump: - _, _, targetBlkID := br.BranchData() - if br.IsFallthroughJump() { - return - } - b := m.allocateInstr() - targetBlk := m.compiler.SSABuilder().BasicBlock(targetBlkID) - if targetBlk.ReturnBlock() { - b.asRet() - } else { - b.asBr(ssaBlockLabel(targetBlk)) - } - m.insert(b) - case ssa.OpcodeBrTable: - m.lowerBrTable(br) - default: - panic("BUG: unexpected branch opcode" + br.Opcode().String()) - } -} - -func (m *machine) lowerBrTable(i *ssa.Instruction) { - index, targetBlockIDs := i.BrTableData() - targetBlockCount := len(targetBlockIDs.View()) - indexOperand := m.getOperand_NR(m.compiler.ValueDefinition(index), extModeNone) - - // Firstly, we have to do the bounds check of the index, and - // set it to the default target (sitting at the end of the list) if it's out of bounds. - - // mov maxIndexReg #maximum_index - // subs wzr, index, maxIndexReg - // csel adjustedIndex, maxIndexReg, index, hs ;; if index is higher or equal than maxIndexReg. - maxIndexReg := m.compiler.AllocateVReg(ssa.TypeI32) - m.lowerConstantI32(maxIndexReg, int32(targetBlockCount-1)) - subs := m.allocateInstr() - subs.asALU(aluOpSubS, xzrVReg, indexOperand, operandNR(maxIndexReg), false) - m.insert(subs) - csel := m.allocateInstr() - adjustedIndex := m.compiler.AllocateVReg(ssa.TypeI32) - csel.asCSel(adjustedIndex, operandNR(maxIndexReg), indexOperand, hs, false) - m.insert(csel) - - brSequence := m.allocateInstr() - - tableIndex := m.addJmpTableTarget(targetBlockIDs) - brSequence.asBrTableSequence(adjustedIndex, tableIndex, targetBlockCount) - m.insert(brSequence) -} - -// LowerConditionalBranch implements backend.Machine. -func (m *machine) LowerConditionalBranch(b *ssa.Instruction) { - cval, args, targetBlkID := b.BranchData() - if len(args) > 0 { - panic(fmt.Sprintf( - "conditional branch shouldn't have args; likely a bug in critical edge splitting: from %s to %s", - m.currentLabelPos.sb, - targetBlkID, - )) - } - - targetBlk := m.compiler.SSABuilder().BasicBlock(targetBlkID) - target := ssaBlockLabel(targetBlk) - cvalDef := m.compiler.ValueDefinition(cval) - - switch { - case m.compiler.MatchInstr(cvalDef, ssa.OpcodeIcmp): // This case, we can use the ALU flag set by SUBS instruction. - cvalInstr := cvalDef.Instr - x, y, c := cvalInstr.IcmpData() - cc, signed := condFlagFromSSAIntegerCmpCond(c), c.Signed() - if b.Opcode() == ssa.OpcodeBrz { - cc = cc.invert() - } - - if !m.tryLowerBandToFlag(x, y) { - m.lowerIcmpToFlag(x, y, signed) - } - cbr := m.allocateInstr() - cbr.asCondBr(cc.asCond(), target, false /* ignored */) - m.insert(cbr) - cvalDef.Instr.MarkLowered() - case m.compiler.MatchInstr(cvalDef, ssa.OpcodeFcmp): // This case we can use the Fpu flag directly. 
- cvalInstr := cvalDef.Instr - x, y, c := cvalInstr.FcmpData() - cc := condFlagFromSSAFloatCmpCond(c) - if b.Opcode() == ssa.OpcodeBrz { - cc = cc.invert() - } - m.lowerFcmpToFlag(x, y) - cbr := m.allocateInstr() - cbr.asCondBr(cc.asCond(), target, false /* ignored */) - m.insert(cbr) - cvalDef.Instr.MarkLowered() - default: - rn := m.getOperand_NR(cvalDef, extModeNone) - var c cond - if b.Opcode() == ssa.OpcodeBrz { - c = registerAsRegZeroCond(rn.nr()) - } else { - c = registerAsRegNotZeroCond(rn.nr()) - } - cbr := m.allocateInstr() - cbr.asCondBr(c, target, false) - m.insert(cbr) - } -} - -func (m *machine) tryLowerBandToFlag(x, y ssa.Value) (ok bool) { - xx := m.compiler.ValueDefinition(x) - yy := m.compiler.ValueDefinition(y) - if xx.IsFromInstr() && xx.Instr.Constant() && xx.Instr.ConstantVal() == 0 { - if m.compiler.MatchInstr(yy, ssa.OpcodeBand) { - bandInstr := yy.Instr - m.lowerBitwiseAluOp(bandInstr, aluOpAnds, true) - ok = true - bandInstr.MarkLowered() - return - } - } - - if yy.IsFromInstr() && yy.Instr.Constant() && yy.Instr.ConstantVal() == 0 { - if m.compiler.MatchInstr(xx, ssa.OpcodeBand) { - bandInstr := xx.Instr - m.lowerBitwiseAluOp(bandInstr, aluOpAnds, true) - ok = true - bandInstr.MarkLowered() - return - } - } - return -} - -// LowerInstr implements backend.Machine. -func (m *machine) LowerInstr(instr *ssa.Instruction) { - if l := instr.SourceOffset(); l.Valid() { - info := m.allocateInstr().asEmitSourceOffsetInfo(l) - m.insert(info) - } - - switch op := instr.Opcode(); op { - case ssa.OpcodeBrz, ssa.OpcodeBrnz, ssa.OpcodeJump, ssa.OpcodeBrTable: - panic("BUG: branching instructions are handled by LowerBranches") - case ssa.OpcodeReturn: - panic("BUG: return must be handled by backend.Compiler") - case ssa.OpcodeIadd, ssa.OpcodeIsub: - m.lowerSubOrAdd(instr, op == ssa.OpcodeIadd) - case ssa.OpcodeFadd, ssa.OpcodeFsub, ssa.OpcodeFmul, ssa.OpcodeFdiv, ssa.OpcodeFmax, ssa.OpcodeFmin: - m.lowerFpuBinOp(instr) - case ssa.OpcodeIconst, ssa.OpcodeF32const, ssa.OpcodeF64const: // Constant instructions are inlined. 
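// (No code is emitted for these: constant definitions are materialized on demand at
// their use sites, via lowerConstant/insertLoadConstant in lower_constant.go above,
// rather than at the definition site.)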
- case ssa.OpcodeExitWithCode: - execCtx, code := instr.ExitWithCodeData() - m.lowerExitWithCode(m.compiler.VRegOf(execCtx), code) - case ssa.OpcodeExitIfTrueWithCode: - execCtx, c, code := instr.ExitIfTrueWithCodeData() - m.lowerExitIfTrueWithCode(m.compiler.VRegOf(execCtx), c, code) - case ssa.OpcodeStore, ssa.OpcodeIstore8, ssa.OpcodeIstore16, ssa.OpcodeIstore32: - m.lowerStore(instr) - case ssa.OpcodeLoad: - dst := instr.Return() - ptr, offset, typ := instr.LoadData() - m.lowerLoad(ptr, offset, typ, dst) - case ssa.OpcodeVZeroExtLoad: - dst := instr.Return() - ptr, offset, typ := instr.VZeroExtLoadData() - m.lowerLoad(ptr, offset, typ, dst) - case ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32, ssa.OpcodeSload8, ssa.OpcodeSload16, ssa.OpcodeSload32: - ptr, offset, _ := instr.LoadData() - ret := m.compiler.VRegOf(instr.Return()) - m.lowerExtLoad(op, ptr, offset, ret) - case ssa.OpcodeCall, ssa.OpcodeCallIndirect: - m.lowerCall(instr) - case ssa.OpcodeIcmp: - m.lowerIcmp(instr) - case ssa.OpcodeVIcmp: - m.lowerVIcmp(instr) - case ssa.OpcodeVFcmp: - m.lowerVFcmp(instr) - case ssa.OpcodeVCeil: - m.lowerVecMisc(vecOpFrintp, instr) - case ssa.OpcodeVFloor: - m.lowerVecMisc(vecOpFrintm, instr) - case ssa.OpcodeVTrunc: - m.lowerVecMisc(vecOpFrintz, instr) - case ssa.OpcodeVNearest: - m.lowerVecMisc(vecOpFrintn, instr) - case ssa.OpcodeVMaxPseudo: - m.lowerVMinMaxPseudo(instr, true) - case ssa.OpcodeVMinPseudo: - m.lowerVMinMaxPseudo(instr, false) - case ssa.OpcodeBand: - m.lowerBitwiseAluOp(instr, aluOpAnd, false) - case ssa.OpcodeBor: - m.lowerBitwiseAluOp(instr, aluOpOrr, false) - case ssa.OpcodeBxor: - m.lowerBitwiseAluOp(instr, aluOpEor, false) - case ssa.OpcodeIshl: - m.lowerShifts(instr, extModeNone, aluOpLsl) - case ssa.OpcodeSshr: - if instr.Return().Type().Bits() == 64 { - m.lowerShifts(instr, extModeSignExtend64, aluOpAsr) - } else { - m.lowerShifts(instr, extModeSignExtend32, aluOpAsr) - } - case ssa.OpcodeUshr: - if instr.Return().Type().Bits() == 64 { - m.lowerShifts(instr, extModeZeroExtend64, aluOpLsr) - } else { - m.lowerShifts(instr, extModeZeroExtend32, aluOpLsr) - } - case ssa.OpcodeRotl: - m.lowerRotl(instr) - case ssa.OpcodeRotr: - m.lowerRotr(instr) - case ssa.OpcodeSExtend, ssa.OpcodeUExtend: - from, to, signed := instr.ExtendData() - m.lowerExtend(instr.Arg(), instr.Return(), from, to, signed) - case ssa.OpcodeFcmp: - x, y, c := instr.FcmpData() - m.lowerFcmp(x, y, instr.Return(), c) - case ssa.OpcodeImul: - x, y := instr.Arg2() - result := instr.Return() - m.lowerImul(x, y, result) - case ssa.OpcodeUndefined: - undef := m.allocateInstr() - undef.asUDF() - m.insert(undef) - case ssa.OpcodeSelect: - c, x, y := instr.SelectData() - if x.Type() == ssa.TypeV128 { - rc := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone) - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := m.compiler.VRegOf(instr.Return()) - m.lowerSelectVec(rc, rn, rm, rd) - } else { - m.lowerSelect(c, x, y, instr.Return()) - } - case ssa.OpcodeClz: - x := instr.Arg() - result := instr.Return() - m.lowerClz(x, result) - case ssa.OpcodeCtz: - x := instr.Arg() - result := instr.Return() - m.lowerCtz(x, result) - case ssa.OpcodePopcnt: - x := instr.Arg() - result := instr.Return() - m.lowerPopcnt(x, result) - case ssa.OpcodeFcvtToSint, ssa.OpcodeFcvtToSintSat: - x, ctx := instr.Arg2() - result := instr.Return() - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := 
m.compiler.VRegOf(result) - ctxVReg := m.compiler.VRegOf(ctx) - m.lowerFpuToInt(rd, rn, ctxVReg, true, x.Type() == ssa.TypeF64, - result.Type().Bits() == 64, op == ssa.OpcodeFcvtToSintSat) - case ssa.OpcodeFcvtToUint, ssa.OpcodeFcvtToUintSat: - x, ctx := instr.Arg2() - result := instr.Return() - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := m.compiler.VRegOf(result) - ctxVReg := m.compiler.VRegOf(ctx) - m.lowerFpuToInt(rd, rn, ctxVReg, false, x.Type() == ssa.TypeF64, - result.Type().Bits() == 64, op == ssa.OpcodeFcvtToUintSat) - case ssa.OpcodeFcvtFromSint: - x := instr.Arg() - result := instr.Return() - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := m.compiler.VRegOf(result) - m.lowerIntToFpu(rd, rn, true, x.Type() == ssa.TypeI64, result.Type().Bits() == 64) - case ssa.OpcodeFcvtFromUint: - x := instr.Arg() - result := instr.Return() - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := m.compiler.VRegOf(result) - m.lowerIntToFpu(rd, rn, false, x.Type() == ssa.TypeI64, result.Type().Bits() == 64) - case ssa.OpcodeFdemote: - v := instr.Arg() - rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) - rd := m.compiler.VRegOf(instr.Return()) - cnt := m.allocateInstr() - cnt.asFpuRR(fpuUniOpCvt64To32, rd, rn, false) - m.insert(cnt) - case ssa.OpcodeFpromote: - v := instr.Arg() - rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) - rd := m.compiler.VRegOf(instr.Return()) - cnt := m.allocateInstr() - cnt.asFpuRR(fpuUniOpCvt32To64, rd, rn, true) - m.insert(cnt) - case ssa.OpcodeIreduce: - rn := m.getOperand_NR(m.compiler.ValueDefinition(instr.Arg()), extModeNone) - retVal := instr.Return() - rd := m.compiler.VRegOf(retVal) - - if retVal.Type() != ssa.TypeI32 { - panic("TODO?: Ireduce to non-i32") - } - mov := m.allocateInstr() - mov.asMove32(rd, rn.reg()) - m.insert(mov) - case ssa.OpcodeFneg: - m.lowerFpuUniOp(fpuUniOpNeg, instr.Arg(), instr.Return()) - case ssa.OpcodeSqrt: - m.lowerFpuUniOp(fpuUniOpSqrt, instr.Arg(), instr.Return()) - case ssa.OpcodeCeil: - m.lowerFpuUniOp(fpuUniOpRoundPlus, instr.Arg(), instr.Return()) - case ssa.OpcodeFloor: - m.lowerFpuUniOp(fpuUniOpRoundMinus, instr.Arg(), instr.Return()) - case ssa.OpcodeTrunc: - m.lowerFpuUniOp(fpuUniOpRoundZero, instr.Arg(), instr.Return()) - case ssa.OpcodeNearest: - m.lowerFpuUniOp(fpuUniOpRoundNearest, instr.Arg(), instr.Return()) - case ssa.OpcodeFabs: - m.lowerFpuUniOp(fpuUniOpAbs, instr.Arg(), instr.Return()) - case ssa.OpcodeBitcast: - m.lowerBitcast(instr) - case ssa.OpcodeFcopysign: - x, y := instr.Arg2() - m.lowerFcopysign(x, y, instr.Return()) - case ssa.OpcodeSdiv, ssa.OpcodeUdiv: - x, y, ctx := instr.Arg3() - ctxVReg := m.compiler.VRegOf(ctx) - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := m.compiler.VRegOf(instr.Return()) - m.lowerIDiv(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSdiv) - case ssa.OpcodeSrem, ssa.OpcodeUrem: - x, y, ctx := instr.Arg3() - ctxVReg := m.compiler.VRegOf(ctx) - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := m.compiler.VRegOf(instr.Return()) - m.lowerIRem(ctxVReg, rd, rn.nr(), rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSrem) - case ssa.OpcodeVconst: - result := m.compiler.VRegOf(instr.Return()) - lo, hi := instr.VconstData() - v := m.allocateInstr() - v.asLoadFpuConst128(result, lo, hi) - 
m.insert(v) - case ssa.OpcodeVbnot: - x := instr.Arg() - ins := m.allocateInstr() - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := m.compiler.VRegOf(instr.Return()) - ins.asVecMisc(vecOpNot, rd, rn, vecArrangement16B) - m.insert(ins) - case ssa.OpcodeVbxor: - x, y := instr.Arg2() - m.lowerVecRRR(vecOpEOR, x, y, instr.Return(), vecArrangement16B) - case ssa.OpcodeVbor: - x, y := instr.Arg2() - m.lowerVecRRR(vecOpOrr, x, y, instr.Return(), vecArrangement16B) - case ssa.OpcodeVband: - x, y := instr.Arg2() - m.lowerVecRRR(vecOpAnd, x, y, instr.Return(), vecArrangement16B) - case ssa.OpcodeVbandnot: - x, y := instr.Arg2() - m.lowerVecRRR(vecOpBic, x, y, instr.Return(), vecArrangement16B) - case ssa.OpcodeVbitselect: - c, x, y := instr.SelectData() - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - creg := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone) - tmp := m.compiler.AllocateVReg(ssa.TypeV128) - - // creg is overwritten by BSL, so we need to move it to the result register before the instruction - // in case when it is used somewhere else. - mov := m.allocateInstr() - mov.asFpuMov128(tmp, creg.nr()) - m.insert(mov) - - ins := m.allocateInstr() - ins.asVecRRRRewrite(vecOpBsl, tmp, rn, rm, vecArrangement16B) - m.insert(ins) - - mov2 := m.allocateInstr() - rd := m.compiler.VRegOf(instr.Return()) - mov2.asFpuMov128(rd, tmp) - m.insert(mov2) - case ssa.OpcodeVanyTrue, ssa.OpcodeVallTrue: - x, lane := instr.ArgWithLane() - var arr vecArrangement - if op == ssa.OpcodeVallTrue { - arr = ssaLaneToArrangement(lane) - } - rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := m.compiler.VRegOf(instr.Return()) - m.lowerVcheckTrue(op, rm, rd, arr) - case ssa.OpcodeVhighBits: - x, lane := instr.ArgWithLane() - rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := m.compiler.VRegOf(instr.Return()) - arr := ssaLaneToArrangement(lane) - m.lowerVhighBits(rm, rd, arr) - case ssa.OpcodeVIadd: - x, y, lane := instr.Arg2WithLane() - arr := ssaLaneToArrangement(lane) - m.lowerVecRRR(vecOpAdd, x, y, instr.Return(), arr) - case ssa.OpcodeExtIaddPairwise: - v, lane, signed := instr.ExtIaddPairwiseData() - vv := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) - - tmpLo, tmpHi := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)), operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) - var widen vecOp - if signed { - widen = vecOpSshll - } else { - widen = vecOpUshll - } - - var loArr, hiArr, dstArr vecArrangement - switch lane { - case ssa.VecLaneI8x16: - loArr, hiArr, dstArr = vecArrangement8B, vecArrangement16B, vecArrangement8H - case ssa.VecLaneI16x8: - loArr, hiArr, dstArr = vecArrangement4H, vecArrangement8H, vecArrangement4S - case ssa.VecLaneI32x4: - loArr, hiArr, dstArr = vecArrangement2S, vecArrangement4S, vecArrangement2D - default: - panic("unsupported lane " + lane.String()) - } - - widenLo := m.allocateInstr().asVecShiftImm(widen, tmpLo.nr(), vv, operandShiftImm(0), loArr) - widenHi := m.allocateInstr().asVecShiftImm(widen, tmpHi.nr(), vv, operandShiftImm(0), hiArr) - addp := m.allocateInstr().asVecRRR(vecOpAddp, m.compiler.VRegOf(instr.Return()), tmpLo, tmpHi, dstArr) - m.insert(widenLo) - m.insert(widenHi) - m.insert(addp) - - case ssa.OpcodeVSaddSat: - x, y, lane := instr.Arg2WithLane() - arr := ssaLaneToArrangement(lane) - m.lowerVecRRR(vecOpSqadd, x, y, instr.Return(), arr) - case ssa.OpcodeVUaddSat: - x, y, lane := 
instr.Arg2WithLane() - arr := ssaLaneToArrangement(lane) - m.lowerVecRRR(vecOpUqadd, x, y, instr.Return(), arr) - case ssa.OpcodeVIsub: - x, y, lane := instr.Arg2WithLane() - arr := ssaLaneToArrangement(lane) - m.lowerVecRRR(vecOpSub, x, y, instr.Return(), arr) - case ssa.OpcodeVSsubSat: - x, y, lane := instr.Arg2WithLane() - arr := ssaLaneToArrangement(lane) - m.lowerVecRRR(vecOpSqsub, x, y, instr.Return(), arr) - case ssa.OpcodeVUsubSat: - x, y, lane := instr.Arg2WithLane() - arr := ssaLaneToArrangement(lane) - m.lowerVecRRR(vecOpUqsub, x, y, instr.Return(), arr) - case ssa.OpcodeVImin: - x, y, lane := instr.Arg2WithLane() - arr := ssaLaneToArrangement(lane) - m.lowerVecRRR(vecOpSmin, x, y, instr.Return(), arr) - case ssa.OpcodeVUmin: - x, y, lane := instr.Arg2WithLane() - arr := ssaLaneToArrangement(lane) - m.lowerVecRRR(vecOpUmin, x, y, instr.Return(), arr) - case ssa.OpcodeVImax: - x, y, lane := instr.Arg2WithLane() - arr := ssaLaneToArrangement(lane) - m.lowerVecRRR(vecOpSmax, x, y, instr.Return(), arr) - case ssa.OpcodeVUmax: - x, y, lane := instr.Arg2WithLane() - arr := ssaLaneToArrangement(lane) - m.lowerVecRRR(vecOpUmax, x, y, instr.Return(), arr) - case ssa.OpcodeVAvgRound: - x, y, lane := instr.Arg2WithLane() - arr := ssaLaneToArrangement(lane) - m.lowerVecRRR(vecOpUrhadd, x, y, instr.Return(), arr) - case ssa.OpcodeVImul: - x, y, lane := instr.Arg2WithLane() - arr := ssaLaneToArrangement(lane) - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := m.compiler.VRegOf(instr.Return()) - m.lowerVIMul(rd, rn, rm, arr) - case ssa.OpcodeVIabs: - m.lowerVecMisc(vecOpAbs, instr) - case ssa.OpcodeVIneg: - m.lowerVecMisc(vecOpNeg, instr) - case ssa.OpcodeVIpopcnt: - m.lowerVecMisc(vecOpCnt, instr) - case ssa.OpcodeVIshl, - ssa.OpcodeVSshr, ssa.OpcodeVUshr: - x, y, lane := instr.Arg2WithLane() - arr := ssaLaneToArrangement(lane) - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := m.compiler.VRegOf(instr.Return()) - m.lowerVShift(op, rd, rn, rm, arr) - case ssa.OpcodeVSqrt: - m.lowerVecMisc(vecOpFsqrt, instr) - case ssa.OpcodeVFabs: - m.lowerVecMisc(vecOpFabs, instr) - case ssa.OpcodeVFneg: - m.lowerVecMisc(vecOpFneg, instr) - case ssa.OpcodeVFmin: - x, y, lane := instr.Arg2WithLane() - arr := ssaLaneToArrangement(lane) - m.lowerVecRRR(vecOpFmin, x, y, instr.Return(), arr) - case ssa.OpcodeVFmax: - x, y, lane := instr.Arg2WithLane() - arr := ssaLaneToArrangement(lane) - m.lowerVecRRR(vecOpFmax, x, y, instr.Return(), arr) - case ssa.OpcodeVFadd: - x, y, lane := instr.Arg2WithLane() - arr := ssaLaneToArrangement(lane) - m.lowerVecRRR(vecOpFadd, x, y, instr.Return(), arr) - case ssa.OpcodeVFsub: - x, y, lane := instr.Arg2WithLane() - arr := ssaLaneToArrangement(lane) - m.lowerVecRRR(vecOpFsub, x, y, instr.Return(), arr) - case ssa.OpcodeVFmul: - x, y, lane := instr.Arg2WithLane() - arr := ssaLaneToArrangement(lane) - m.lowerVecRRR(vecOpFmul, x, y, instr.Return(), arr) - case ssa.OpcodeSqmulRoundSat: - x, y, lane := instr.Arg2WithLane() - arr := ssaLaneToArrangement(lane) - m.lowerVecRRR(vecOpSqrdmulh, x, y, instr.Return(), arr) - case ssa.OpcodeVFdiv: - x, y, lane := instr.Arg2WithLane() - arr := ssaLaneToArrangement(lane) - m.lowerVecRRR(vecOpFdiv, x, y, instr.Return(), arr) - case ssa.OpcodeVFcvtToSintSat, ssa.OpcodeVFcvtToUintSat: - x, lane := instr.ArgWithLane() - arr := ssaLaneToArrangement(lane) - rn := 
m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := m.compiler.VRegOf(instr.Return()) - m.lowerVfpuToInt(rd, rn, arr, op == ssa.OpcodeVFcvtToSintSat) - case ssa.OpcodeVFcvtFromSint, ssa.OpcodeVFcvtFromUint: - x, lane := instr.ArgWithLane() - arr := ssaLaneToArrangement(lane) - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := m.compiler.VRegOf(instr.Return()) - m.lowerVfpuFromInt(rd, rn, arr, op == ssa.OpcodeVFcvtFromSint) - case ssa.OpcodeSwidenLow, ssa.OpcodeUwidenLow: - x, lane := instr.ArgWithLane() - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := m.compiler.VRegOf(instr.Return()) - - var arr vecArrangement - switch lane { - case ssa.VecLaneI8x16: - arr = vecArrangement8B - case ssa.VecLaneI16x8: - arr = vecArrangement4H - case ssa.VecLaneI32x4: - arr = vecArrangement2S - } - - shll := m.allocateInstr() - if signed := op == ssa.OpcodeSwidenLow; signed { - shll.asVecShiftImm(vecOpSshll, rd, rn, operandShiftImm(0), arr) - } else { - shll.asVecShiftImm(vecOpUshll, rd, rn, operandShiftImm(0), arr) - } - m.insert(shll) - case ssa.OpcodeSwidenHigh, ssa.OpcodeUwidenHigh: - x, lane := instr.ArgWithLane() - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := m.compiler.VRegOf(instr.Return()) - - arr := ssaLaneToArrangement(lane) - - shll := m.allocateInstr() - if signed := op == ssa.OpcodeSwidenHigh; signed { - shll.asVecShiftImm(vecOpSshll, rd, rn, operandShiftImm(0), arr) - } else { - shll.asVecShiftImm(vecOpUshll, rd, rn, operandShiftImm(0), arr) - } - m.insert(shll) - - case ssa.OpcodeSnarrow, ssa.OpcodeUnarrow: - x, y, lane := instr.Arg2WithLane() - var arr, arr2 vecArrangement - switch lane { - case ssa.VecLaneI16x8: // I16x8 - arr = vecArrangement8B - arr2 = vecArrangement16B // Implies sqxtn2. - case ssa.VecLaneI32x4: - arr = vecArrangement4H - arr2 = vecArrangement8H // Implies sqxtn2. - default: - panic("unsupported lane " + lane.String()) - } - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := m.compiler.VRegOf(instr.Return()) - - tmp := m.compiler.AllocateVReg(ssa.TypeV128) - - loQxtn := m.allocateInstr() - hiQxtn := m.allocateInstr() - if signed := op == ssa.OpcodeSnarrow; signed { - // Narrow lanes on rn and write them into lower-half of rd. - loQxtn.asVecMisc(vecOpSqxtn, tmp, rn, arr) // low - // Narrow lanes on rm and write them into higher-half of rd. - hiQxtn.asVecMisc(vecOpSqxtn, tmp, rm, arr2) // high (sqxtn2) - } else { - // Narrow lanes on rn and write them into lower-half of rd. - loQxtn.asVecMisc(vecOpSqxtun, tmp, rn, arr) // low - // Narrow lanes on rm and write them into higher-half of rd. 
- hiQxtn.asVecMisc(vecOpSqxtun, tmp, rm, arr2) // high (sqxtn2) - } - m.insert(loQxtn) - m.insert(hiQxtn) - - mov := m.allocateInstr() - mov.asFpuMov128(rd, tmp) - m.insert(mov) - case ssa.OpcodeFvpromoteLow: - x, lane := instr.ArgWithLane() - if lane != ssa.VecLaneF32x4 { - panic("unsupported lane type " + lane.String()) - } - ins := m.allocateInstr() - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := m.compiler.VRegOf(instr.Return()) - ins.asVecMisc(vecOpFcvtl, rd, rn, vecArrangement2S) - m.insert(ins) - case ssa.OpcodeFvdemote: - x, lane := instr.ArgWithLane() - if lane != ssa.VecLaneF64x2 { - panic("unsupported lane type " + lane.String()) - } - ins := m.allocateInstr() - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := m.compiler.VRegOf(instr.Return()) - ins.asVecMisc(vecOpFcvtn, rd, rn, vecArrangement2S) - m.insert(ins) - case ssa.OpcodeExtractlane: - x, index, signed, lane := instr.ExtractlaneData() - - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := m.compiler.VRegOf(instr.Return()) - - mov := m.allocateInstr() - switch lane { - case ssa.VecLaneI8x16: - mov.asMovFromVec(rd, rn, vecArrangementB, vecIndex(index), signed) - case ssa.VecLaneI16x8: - mov.asMovFromVec(rd, rn, vecArrangementH, vecIndex(index), signed) - case ssa.VecLaneI32x4: - mov.asMovFromVec(rd, rn, vecArrangementS, vecIndex(index), signed) - case ssa.VecLaneI64x2: - mov.asMovFromVec(rd, rn, vecArrangementD, vecIndex(index), signed) - case ssa.VecLaneF32x4: - mov.asVecMovElement(rd, rn, vecArrangementS, vecIndex(0), vecIndex(index)) - case ssa.VecLaneF64x2: - mov.asVecMovElement(rd, rn, vecArrangementD, vecIndex(0), vecIndex(index)) - default: - panic("unsupported lane: " + lane.String()) - } - - m.insert(mov) - - case ssa.OpcodeInsertlane: - x, y, index, lane := instr.InsertlaneData() - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := m.compiler.VRegOf(instr.Return()) - tmpReg := m.compiler.AllocateVReg(ssa.TypeV128) - - // Initially mov rn to tmp. - mov1 := m.allocateInstr() - mov1.asFpuMov128(tmpReg, rn.nr()) - m.insert(mov1) - - // movToVec and vecMovElement do not clear the remaining bits to zero, - // thus, we can mov rm in-place to tmp. - mov2 := m.allocateInstr() - switch lane { - case ssa.VecLaneI8x16: - mov2.asMovToVec(tmpReg, rm, vecArrangementB, vecIndex(index)) - case ssa.VecLaneI16x8: - mov2.asMovToVec(tmpReg, rm, vecArrangementH, vecIndex(index)) - case ssa.VecLaneI32x4: - mov2.asMovToVec(tmpReg, rm, vecArrangementS, vecIndex(index)) - case ssa.VecLaneI64x2: - mov2.asMovToVec(tmpReg, rm, vecArrangementD, vecIndex(index)) - case ssa.VecLaneF32x4: - mov2.asVecMovElement(tmpReg, rm, vecArrangementS, vecIndex(index), vecIndex(0)) - case ssa.VecLaneF64x2: - mov2.asVecMovElement(tmpReg, rm, vecArrangementD, vecIndex(index), vecIndex(0)) - } - m.insert(mov2) - - // Finally mov tmp to rd. 
- mov3 := m.allocateInstr() - mov3.asFpuMov128(rd, tmpReg) - m.insert(mov3) - - case ssa.OpcodeSwizzle: - x, y, lane := instr.Arg2WithLane() - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := m.compiler.VRegOf(instr.Return()) - - arr := ssaLaneToArrangement(lane) - - // tbl <rd>.<arr>, { <rn>.<arr> }, <rm>.<arr> - tbl1 := m.allocateInstr() - tbl1.asVecTbl(1, rd, rn, rm, arr) - m.insert(tbl1) - - case ssa.OpcodeShuffle: - x, y, lane1, lane2 := instr.ShuffleData() - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := m.compiler.VRegOf(instr.Return()) - - m.lowerShuffle(rd, rn, rm, lane1, lane2) - - case ssa.OpcodeSplat: - x, lane := instr.ArgWithLane() - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := m.compiler.VRegOf(instr.Return()) - - dup := m.allocateInstr() - switch lane { - case ssa.VecLaneI8x16: - dup.asVecDup(rd, rn, vecArrangement16B) - case ssa.VecLaneI16x8: - dup.asVecDup(rd, rn, vecArrangement8H) - case ssa.VecLaneI32x4: - dup.asVecDup(rd, rn, vecArrangement4S) - case ssa.VecLaneI64x2: - dup.asVecDup(rd, rn, vecArrangement2D) - case ssa.VecLaneF32x4: - dup.asVecDupElement(rd, rn, vecArrangementS, vecIndex(0)) - case ssa.VecLaneF64x2: - dup.asVecDupElement(rd, rn, vecArrangementD, vecIndex(0)) - } - m.insert(dup) - - case ssa.OpcodeWideningPairwiseDotProductS: - x, y := instr.Arg2() - xx, yy := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone), - m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - tmp, tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)), operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) - m.insert(m.allocateInstr().asVecRRR(vecOpSmull, tmp.nr(), xx, yy, vecArrangement8H)) - m.insert(m.allocateInstr().asVecRRR(vecOpSmull2, tmp2.nr(), xx, yy, vecArrangement8H)) - m.insert(m.allocateInstr().asVecRRR(vecOpAddp, tmp.nr(), tmp, tmp2, vecArrangement4S)) - - rd := m.compiler.VRegOf(instr.Return()) - m.insert(m.allocateInstr().asFpuMov128(rd, tmp.nr())) - - case ssa.OpcodeLoadSplat: - ptr, offset, lane := instr.LoadSplatData() - m.lowerLoadSplat(ptr, offset, lane, instr.Return()) - - case ssa.OpcodeAtomicRmw: - m.lowerAtomicRmw(instr) - - case ssa.OpcodeAtomicCas: - m.lowerAtomicCas(instr) - - case ssa.OpcodeAtomicLoad: - m.lowerAtomicLoad(instr) - - case ssa.OpcodeAtomicStore: - m.lowerAtomicStore(instr) - - case ssa.OpcodeFence: - instr := m.allocateInstr() - instr.asDMB() - m.insert(instr) - - default: - panic("TODO: lowering " + op.String()) - } - m.FlushPendingInstructions() -} - -func (m *machine) lowerShuffle(rd regalloc.VReg, rn, rm operand, lane1, lane2 uint64) { - // `tbl2` requires 2 consecutive registers, so we arbitrarily pick v29, v30. - vReg, wReg := v29VReg, v30VReg - - // Initialize v29, v30 to rn, rm. - movv := m.allocateInstr() - movv.asFpuMov128(vReg, rn.nr()) - m.insert(movv) - - movw := m.allocateInstr() - movw.asFpuMov128(wReg, rm.nr()) - m.insert(movw) - - // `lane1`, `lane2` are already encoded as two u64s with the right layout: - // lane1 := lane[7]<<56 | ... | lane[1]<<8 | lane[0] - // lane2 := lane[15]<<56 | ... | lane[9]<<8 | lane[8] - // Thus, we can use loadFpuConst128. 
- tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) - lfc := m.allocateInstr() - lfc.asLoadFpuConst128(tmp.nr(), lane1, lane2) - m.insert(lfc) - - // tbl <rd>.16b, { <vReg>.16B, <wReg>.16b }, <tmp>.16b - tbl2 := m.allocateInstr() - tbl2.asVecTbl(2, rd, operandNR(vReg), tmp, vecArrangement16B) - m.insert(tbl2) -} - -func (m *machine) lowerVShift(op ssa.Opcode, rd regalloc.VReg, rn, rm operand, arr vecArrangement) { - var modulo byte - switch arr { - case vecArrangement16B: - modulo = 0x7 // Modulo 8. - case vecArrangement8H: - modulo = 0xf // Modulo 16. - case vecArrangement4S: - modulo = 0x1f // Modulo 32. - case vecArrangement2D: - modulo = 0x3f // Modulo 64. - default: - panic("unsupported arrangment " + arr.String()) - } - - rtmp := operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) - vtmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) - - and := m.allocateInstr() - and.asALUBitmaskImm(aluOpAnd, rtmp.nr(), rm.nr(), uint64(modulo), true) - m.insert(and) - - if op != ssa.OpcodeVIshl { - // Negate the amount to make this as right shift. - neg := m.allocateInstr() - neg.asALU(aluOpSub, rtmp.nr(), operandNR(xzrVReg), rtmp, true) - m.insert(neg) - } - - // Copy the shift amount into a vector register as sshl/ushl requires it to be there. - dup := m.allocateInstr() - dup.asVecDup(vtmp.nr(), rtmp, arr) - m.insert(dup) - - if op == ssa.OpcodeVIshl || op == ssa.OpcodeVSshr { - sshl := m.allocateInstr() - sshl.asVecRRR(vecOpSshl, rd, rn, vtmp, arr) - m.insert(sshl) - } else { - ushl := m.allocateInstr() - ushl.asVecRRR(vecOpUshl, rd, rn, vtmp, arr) - m.insert(ushl) - } -} - -func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm operand, rd regalloc.VReg, arr vecArrangement) { - tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) - - // Special case VallTrue for i64x2. - if op == ssa.OpcodeVallTrue && arr == vecArrangement2D { - // cmeq v3?.2d, v2?.2d, #0 - // addp v3?.2d, v3?.2d, v3?.2d - // fcmp v3?, v3? - // cset dst, eq - - ins := m.allocateInstr() - ins.asVecMisc(vecOpCmeq0, tmp.nr(), rm, vecArrangement2D) - m.insert(ins) - - addp := m.allocateInstr() - addp.asVecRRR(vecOpAddp, tmp.nr(), tmp, tmp, vecArrangement2D) - m.insert(addp) - - fcmp := m.allocateInstr() - fcmp.asFpuCmp(tmp, tmp, true) - m.insert(fcmp) - - cset := m.allocateInstr() - cset.asCSet(rd, false, eq) - m.insert(cset) - - return - } - - // Create a scalar value with umaxp or uminv, then compare it against zero. - ins := m.allocateInstr() - if op == ssa.OpcodeVanyTrue { - // umaxp v4?.16b, v2?.16b, v2?.16b - ins.asVecRRR(vecOpUmaxp, tmp.nr(), rm, rm, vecArrangement16B) - } else { - // uminv d4?, v2?.4s - ins.asVecLanes(vecOpUminv, tmp.nr(), rm, arr) - } - m.insert(ins) - - // mov x3?, v4?.d[0] - // ccmp x3?, #0x0, #0x0, al - // cset x3?, ne - // mov x0, x3? - - movv := m.allocateInstr() - movv.asMovFromVec(rd, tmp, vecArrangementD, vecIndex(0), false) - m.insert(movv) - - fc := m.allocateInstr() - fc.asCCmpImm(operandNR(rd), uint64(0), al, 0, true) - m.insert(fc) - - cset := m.allocateInstr() - cset.asCSet(rd, false, ne) - m.insert(cset) -} - -func (m *machine) lowerVhighBits(rm operand, rd regalloc.VReg, arr vecArrangement) { - r0 := operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) - v0 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) - v1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) - - switch arr { - case vecArrangement16B: - // sshr v6?.16b, v2?.16b, #7 - // movz x4?, #0x201, lsl 0 - // movk x4?, #0x804, lsl 16 - // movk x4?, #0x2010, lsl 32 - // movk x4?, #0x8040, lsl 48 - // dup v5?.2d, x4? 
- // and v6?.16b, v6?.16b, v5?.16b - // ext v5?.16b, v6?.16b, v6?.16b, #8 - // zip1 v5?.16b, v6?.16b, v5?.16b - // addv s5?, v5?.8h - // umov s3?, v5?.h[0] - - // Right arithmetic shift on the original vector and store the result into v1. So we have: - // v1[i] = 0xff if vi<0, 0 otherwise. - sshr := m.allocateInstr() - sshr.asVecShiftImm(vecOpSshr, v1.nr(), rm, operandShiftImm(7), vecArrangement16B) - m.insert(sshr) - - // Load the bit mask into r0. - m.insertMOVZ(r0.nr(), 0x0201, 0, true) - m.insertMOVK(r0.nr(), 0x0804, 1, true) - m.insertMOVK(r0.nr(), 0x2010, 2, true) - m.insertMOVK(r0.nr(), 0x8040, 3, true) - - // dup r0 to v0. - dup := m.allocateInstr() - dup.asVecDup(v0.nr(), r0, vecArrangement2D) - m.insert(dup) - - // Lane-wise logical AND with the bit mask, meaning that we have - // v[i] = (1 << i) if vi<0, 0 otherwise. - // - // Below, we use the following notation: - // wi := (1 << i) if vi<0, 0 otherwise. - and := m.allocateInstr() - and.asVecRRR(vecOpAnd, v1.nr(), v1, v0, vecArrangement16B) - m.insert(and) - - // Swap the lower and higher 8 byte elements, and write it into v0, meaning that we have - // v0[i] = w(i+8) if i < 8, w(i-8) otherwise. - ext := m.allocateInstr() - ext.asVecExtract(v0.nr(), v1, v1, vecArrangement16B, uint32(8)) - m.insert(ext) - - // v = [w0, w8, ..., w7, w15] - zip1 := m.allocateInstr() - zip1.asVecPermute(vecOpZip1, v0.nr(), v1, v0, vecArrangement16B) - m.insert(zip1) - - // v.h[0] = w0 + ... + w15 - addv := m.allocateInstr() - addv.asVecLanes(vecOpAddv, v0.nr(), v0, vecArrangement8H) - m.insert(addv) - - // Extract the v.h[0] as the result. - movfv := m.allocateInstr() - movfv.asMovFromVec(rd, v0, vecArrangementH, vecIndex(0), false) - m.insert(movfv) - case vecArrangement8H: - // sshr v6?.8h, v2?.8h, #15 - // movz x4?, #0x1, lsl 0 - // movk x4?, #0x2, lsl 16 - // movk x4?, #0x4, lsl 32 - // movk x4?, #0x8, lsl 48 - // dup v5?.2d, x4? - // lsl x4?, x4?, 0x4 - // ins v5?.d[1], x4? - // and v5?.16b, v6?.16b, v5?.16b - // addv s5?, v5?.8h - // umov s3?, v5?.h[0] - - // Right arithmetic shift on the original vector and store the result into v1. So we have: - // v[i] = 0xffff if vi<0, 0 otherwise. - sshr := m.allocateInstr() - sshr.asVecShiftImm(vecOpSshr, v1.nr(), rm, operandShiftImm(15), vecArrangement8H) - m.insert(sshr) - - // Load the bit mask into r0. - m.lowerConstantI64(r0.nr(), 0x0008000400020001) - - // dup r0 to vector v0. - dup := m.allocateInstr() - dup.asVecDup(v0.nr(), r0, vecArrangement2D) - m.insert(dup) - - lsl := m.allocateInstr() - lsl.asALUShift(aluOpLsl, r0.nr(), r0, operandShiftImm(4), true) - m.insert(lsl) - - movv := m.allocateInstr() - movv.asMovToVec(v0.nr(), r0, vecArrangementD, vecIndex(1)) - m.insert(movv) - - // Lane-wise logical AND with the bitmask, meaning that we have - // v[i] = (1 << i) if vi<0, 0 otherwise for i=0..3 - // = (1 << (i+4)) if vi<0, 0 otherwise for i=3..7 - and := m.allocateInstr() - and.asVecRRR(vecOpAnd, v0.nr(), v1, v0, vecArrangement16B) - m.insert(and) - - addv := m.allocateInstr() - addv.asVecLanes(vecOpAddv, v0.nr(), v0, vecArrangement8H) - m.insert(addv) - - movfv := m.allocateInstr() - movfv.asMovFromVec(rd, v0, vecArrangementH, vecIndex(0), false) - m.insert(movfv) - case vecArrangement4S: - // sshr v6?.8h, v2?.8h, #15 - // movz x4?, #0x1, lsl 0 - // movk x4?, #0x2, lsl 16 - // movk x4?, #0x4, lsl 32 - // movk x4?, #0x8, lsl 48 - // dup v5?.2d, x4? - // lsl x4?, x4?, 0x4 - // ins v5?.d[1], x4? 
- // and v5?.16b, v6?.16b, v5?.16b - // addv s5?, v5?.8h - // umov s3?, v5?.h[0] - - // Right arithmetic shift on the original vector and store the result into v1. So we have: - // v[i] = 0xffffffff if vi<0, 0 otherwise. - sshr := m.allocateInstr() - sshr.asVecShiftImm(vecOpSshr, v1.nr(), rm, operandShiftImm(31), vecArrangement4S) - m.insert(sshr) - - // Load the bit mask into r0. - m.lowerConstantI64(r0.nr(), 0x0000000200000001) - - // dup r0 to vector v0. - dup := m.allocateInstr() - dup.asVecDup(v0.nr(), r0, vecArrangement2D) - m.insert(dup) - - lsl := m.allocateInstr() - lsl.asALUShift(aluOpLsl, r0.nr(), r0, operandShiftImm(2), true) - m.insert(lsl) - - movv := m.allocateInstr() - movv.asMovToVec(v0.nr(), r0, vecArrangementD, vecIndex(1)) - m.insert(movv) - - // Lane-wise logical AND with the bitmask, meaning that we have - // v[i] = (1 << i) if vi<0, 0 otherwise for i in [0, 1] - // = (1 << (i+4)) if vi<0, 0 otherwise for i in [2, 3] - and := m.allocateInstr() - and.asVecRRR(vecOpAnd, v0.nr(), v1, v0, vecArrangement16B) - m.insert(and) - - addv := m.allocateInstr() - addv.asVecLanes(vecOpAddv, v0.nr(), v0, vecArrangement4S) - m.insert(addv) - - movfv := m.allocateInstr() - movfv.asMovFromVec(rd, v0, vecArrangementS, vecIndex(0), false) - m.insert(movfv) - case vecArrangement2D: - // mov d3?, v2?.d[0] - // mov x4?, v2?.d[1] - // lsr x4?, x4?, 0x3f - // lsr d3?, d3?, 0x3f - // add s3?, s3?, w4?, lsl #1 - - // Move the lower 64-bit int into result. - movv0 := m.allocateInstr() - movv0.asMovFromVec(rd, rm, vecArrangementD, vecIndex(0), false) - m.insert(movv0) - - // Move the higher 64-bit int into r0. - movv1 := m.allocateInstr() - movv1.asMovFromVec(r0.nr(), rm, vecArrangementD, vecIndex(1), false) - m.insert(movv1) - - // Move the sign bit into the least significant bit. 
- lsr1 := m.allocateInstr() - lsr1.asALUShift(aluOpLsr, r0.nr(), r0, operandShiftImm(63), true) - m.insert(lsr1) - - lsr2 := m.allocateInstr() - lsr2.asALUShift(aluOpLsr, rd, operandNR(rd), operandShiftImm(63), true) - m.insert(lsr2) - - // rd = (r0<<1) | rd - lsl := m.allocateInstr() - lsl.asALU(aluOpAdd, rd, operandNR(rd), operandSR(r0.nr(), 1, shiftOpLSL), false) - m.insert(lsl) - default: - panic("Unsupported " + arr.String()) - } -} - -func (m *machine) lowerVecMisc(op vecOp, instr *ssa.Instruction) { - x, lane := instr.ArgWithLane() - arr := ssaLaneToArrangement(lane) - ins := m.allocateInstr() - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := m.compiler.VRegOf(instr.Return()) - ins.asVecMisc(op, rd, rn, arr) - m.insert(ins) -} - -func (m *machine) lowerVecRRR(op vecOp, x, y, ret ssa.Value, arr vecArrangement) { - ins := m.allocateInstr() - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := m.compiler.VRegOf(ret) - ins.asVecRRR(op, rd, rn, rm, arr) - m.insert(ins) -} - -func (m *machine) lowerVIMul(rd regalloc.VReg, rn, rm operand, arr vecArrangement) { - if arr != vecArrangement2D { - mul := m.allocateInstr() - mul.asVecRRR(vecOpMul, rd, rn, rm, arr) - m.insert(mul) - } else { - tmp1 := m.compiler.AllocateVReg(ssa.TypeV128) - tmp2 := m.compiler.AllocateVReg(ssa.TypeV128) - tmp3 := m.compiler.AllocateVReg(ssa.TypeV128) - - tmpRes := m.compiler.AllocateVReg(ssa.TypeV128) - - // Following the algorithm in https://chromium-review.googlesource.com/c/v8/v8/+/1781696 - rev64 := m.allocateInstr() - rev64.asVecMisc(vecOpRev64, tmp2, rm, vecArrangement4S) - m.insert(rev64) - - mul := m.allocateInstr() - mul.asVecRRR(vecOpMul, tmp2, operandNR(tmp2), rn, vecArrangement4S) - m.insert(mul) - - xtn1 := m.allocateInstr() - xtn1.asVecMisc(vecOpXtn, tmp1, rn, vecArrangement2S) - m.insert(xtn1) - - addp := m.allocateInstr() - addp.asVecRRR(vecOpAddp, tmp2, operandNR(tmp2), operandNR(tmp2), vecArrangement4S) - m.insert(addp) - - xtn2 := m.allocateInstr() - xtn2.asVecMisc(vecOpXtn, tmp3, rm, vecArrangement2S) - m.insert(xtn2) - - // Note: do not write the result directly into result yet. This is the same reason as in bsl. - // In short, in UMLAL instruction, the result register is also one of the source register, and - // the value on the result register is significant. - shll := m.allocateInstr() - shll.asVecMisc(vecOpShll, tmpRes, operandNR(tmp2), vecArrangement2S) - m.insert(shll) - - umlal := m.allocateInstr() - umlal.asVecRRRRewrite(vecOpUmlal, tmpRes, operandNR(tmp3), operandNR(tmp1), vecArrangement2S) - m.insert(umlal) - - mov := m.allocateInstr() - mov.asFpuMov128(rd, tmpRes) - m.insert(mov) - } -} - -func (m *machine) lowerVMinMaxPseudo(instr *ssa.Instruction, max bool) { - x, y, lane := instr.Arg2WithLane() - arr := ssaLaneToArrangement(lane) - - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - - // Note: this usage of tmp is important. - // BSL modifies the destination register, so we need to use a temporary register so that - // the actual definition of the destination register happens *after* the BSL instruction. - // That way, we can force the spill instruction to be inserted after the BSL instruction. - tmp := m.compiler.AllocateVReg(ssa.TypeV128) - - fcmgt := m.allocateInstr() - if max { - fcmgt.asVecRRR(vecOpFcmgt, tmp, rm, rn, arr) - } else { - // If min, swap the args. 
- fcmgt.asVecRRR(vecOpFcmgt, tmp, rn, rm, arr) - } - m.insert(fcmgt) - - bsl := m.allocateInstr() - bsl.asVecRRRRewrite(vecOpBsl, tmp, rm, rn, vecArrangement16B) - m.insert(bsl) - - res := operandNR(m.compiler.VRegOf(instr.Return())) - mov2 := m.allocateInstr() - mov2.asFpuMov128(res.nr(), tmp) - m.insert(mov2) -} - -func (m *machine) lowerIRem(execCtxVReg regalloc.VReg, rd, rn regalloc.VReg, rm operand, _64bit, signed bool) { - div := m.allocateInstr() - - if signed { - div.asALU(aluOpSDiv, rd, operandNR(rn), rm, _64bit) - } else { - div.asALU(aluOpUDiv, rd, operandNR(rn), rm, _64bit) - } - m.insert(div) - - // Check if rm is zero: - m.exitIfNot(execCtxVReg, registerAsRegNotZeroCond(rm.nr()), _64bit, wazevoapi.ExitCodeIntegerDivisionByZero) - - // rd = rn-rd*rm by MSUB instruction. - msub := m.allocateInstr() - msub.asALURRRR(aluOpMSub, rd, operandNR(rd), rm, rn, _64bit) - m.insert(msub) -} - -func (m *machine) lowerIDiv(execCtxVReg, rd regalloc.VReg, rn, rm operand, _64bit, signed bool) { - div := m.allocateInstr() - - if signed { - div.asALU(aluOpSDiv, rd, rn, rm, _64bit) - } else { - div.asALU(aluOpUDiv, rd, rn, rm, _64bit) - } - m.insert(div) - - // Check if rm is zero: - m.exitIfNot(execCtxVReg, registerAsRegNotZeroCond(rm.nr()), _64bit, wazevoapi.ExitCodeIntegerDivisionByZero) - - if signed { - // We need to check the signed overflow which happens iff "math.MinInt{32,64} / -1" - minusOneCheck := m.allocateInstr() - // Sets eq condition if rm == -1. - minusOneCheck.asALU(aluOpAddS, xzrVReg, rm, operandImm12(1, 0), _64bit) - m.insert(minusOneCheck) - - ccmp := m.allocateInstr() - // If eq condition is set, sets the flag by the result based on "rn - 1", otherwise clears the flag. - ccmp.asCCmpImm(rn, 1, eq, 0, _64bit) - m.insert(ccmp) - - // Check the overflow flag. - m.exitIfNot(execCtxVReg, vs.invert().asCond(), false, wazevoapi.ExitCodeIntegerOverflow) - } -} - -// exitIfNot emits a conditional branch to exit if the condition is not met. -// If `c` (cond type) is a register, `cond64bit` must be chosen to indicate whether the register is 32-bit or 64-bit. -// Otherwise, `cond64bit` is ignored. -func (m *machine) exitIfNot(execCtxVReg regalloc.VReg, c cond, cond64bit bool, code wazevoapi.ExitCode) { - execCtxTmp := m.copyToTmp(execCtxVReg) - - cbr := m.allocateInstr() - m.insert(cbr) - m.lowerExitWithCode(execCtxTmp, code) - // Conditional branch target is after exit. 
- l := m.insertBrTargetLabel() - cbr.asCondBr(c, l, cond64bit) -} - -func (m *machine) lowerFcopysign(x, y, ret ssa.Value) { - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - var tmpI, tmpF regalloc.VReg - _64 := x.Type() == ssa.TypeF64 - if _64 { - tmpF = m.compiler.AllocateVReg(ssa.TypeF64) - tmpI = m.compiler.AllocateVReg(ssa.TypeI64) - } else { - tmpF = m.compiler.AllocateVReg(ssa.TypeF32) - tmpI = m.compiler.AllocateVReg(ssa.TypeI32) - } - rd := m.compiler.VRegOf(ret) - m.lowerFcopysignImpl(rd, rn, rm, tmpI, tmpF, _64) -} - -func (m *machine) lowerFcopysignImpl(rd regalloc.VReg, rn, rm operand, tmpI, tmpF regalloc.VReg, _64bit bool) { - // This is exactly the same code emitted by GCC for "__builtin_copysign": - // - // mov x0, -9223372036854775808 - // fmov d2, x0 - // vbit v0.8b, v1.8b, v2.8b - // - - setMSB := m.allocateInstr() - if _64bit { - m.lowerConstantI64(tmpI, math.MinInt64) - setMSB.asMovToVec(tmpF, operandNR(tmpI), vecArrangementD, vecIndex(0)) - } else { - m.lowerConstantI32(tmpI, math.MinInt32) - setMSB.asMovToVec(tmpF, operandNR(tmpI), vecArrangementS, vecIndex(0)) - } - m.insert(setMSB) - - tmpReg := m.compiler.AllocateVReg(ssa.TypeF64) - - mov := m.allocateInstr() - mov.asFpuMov64(tmpReg, rn.nr()) - m.insert(mov) - - vbit := m.allocateInstr() - vbit.asVecRRRRewrite(vecOpBit, tmpReg, rm, operandNR(tmpF), vecArrangement8B) - m.insert(vbit) - - movDst := m.allocateInstr() - movDst.asFpuMov64(rd, tmpReg) - m.insert(movDst) -} - -func (m *machine) lowerBitcast(instr *ssa.Instruction) { - v, dstType := instr.BitcastData() - srcType := v.Type() - rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) - rd := m.compiler.VRegOf(instr.Return()) - srcInt := srcType.IsInt() - dstInt := dstType.IsInt() - switch { - case srcInt && !dstInt: // Int to Float: - mov := m.allocateInstr() - var arr vecArrangement - if srcType.Bits() == 64 { - arr = vecArrangementD - } else { - arr = vecArrangementS - } - mov.asMovToVec(rd, rn, arr, vecIndex(0)) - m.insert(mov) - case !srcInt && dstInt: // Float to Int: - mov := m.allocateInstr() - var arr vecArrangement - if dstType.Bits() == 64 { - arr = vecArrangementD - } else { - arr = vecArrangementS - } - mov.asMovFromVec(rd, rn, arr, vecIndex(0), false) - m.insert(mov) - default: - panic("TODO?BUG?") - } -} - -func (m *machine) lowerFpuUniOp(op fpuUniOp, in, out ssa.Value) { - rn := m.getOperand_NR(m.compiler.ValueDefinition(in), extModeNone) - rd := m.compiler.VRegOf(out) - - neg := m.allocateInstr() - neg.asFpuRR(op, rd, rn, in.Type().Bits() == 64) - m.insert(neg) -} - -func (m *machine) lowerFpuToInt(rd regalloc.VReg, rn operand, ctx regalloc.VReg, signed, src64bit, dst64bit, nonTrapping bool) { - if !nonTrapping { - // First of all, we have to clear the FPU flags. - flagClear := m.allocateInstr() - flagClear.asMovToFPSR(xzrVReg) - m.insert(flagClear) - } - - // Then, do the conversion which doesn't trap inherently. - cvt := m.allocateInstr() - cvt.asFpuToInt(rd, rn, signed, src64bit, dst64bit) - m.insert(cvt) - - if !nonTrapping { - tmpReg := m.compiler.AllocateVReg(ssa.TypeI64) - - // After the conversion, check the FPU flags. - getFlag := m.allocateInstr() - getFlag.asMovFromFPSR(tmpReg) - m.insert(getFlag) - - execCtx := m.copyToTmp(ctx) - _rn := operandNR(m.copyToTmp(rn.nr())) - - // Check if the conversion was undefined by comparing the status with 1. 
- // See https://developer.arm.com/documentation/ddi0595/2020-12/AArch64-Registers/FPSR--Floating-point-Status-Register - alu := m.allocateInstr() - alu.asALU(aluOpSubS, xzrVReg, operandNR(tmpReg), operandImm12(1, 0), true) - m.insert(alu) - - // If it is not undefined, we can return the result. - ok := m.allocateInstr() - m.insert(ok) - - // Otherwise, we have to choose the status depending on it is overflow or NaN conversion. - - // Comparing itself to check if it is a NaN. - fpuCmp := m.allocateInstr() - fpuCmp.asFpuCmp(_rn, _rn, src64bit) - m.insert(fpuCmp) - // If the VC flag is not set (== VS flag is set), it is a NaN. - m.exitIfNot(execCtx, vc.asCond(), false, wazevoapi.ExitCodeInvalidConversionToInteger) - // Otherwise, it is an overflow. - m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) - - // Conditional branch target is after exit. - l := m.insertBrTargetLabel() - ok.asCondBr(ne.asCond(), l, false /* ignored */) - } -} - -func (m *machine) lowerIntToFpu(rd regalloc.VReg, rn operand, signed, src64bit, dst64bit bool) { - cvt := m.allocateInstr() - cvt.asIntToFpu(rd, rn, signed, src64bit, dst64bit) - m.insert(cvt) -} - -func (m *machine) lowerFpuBinOp(si *ssa.Instruction) { - instr := m.allocateInstr() - var op fpuBinOp - switch si.Opcode() { - case ssa.OpcodeFadd: - op = fpuBinOpAdd - case ssa.OpcodeFsub: - op = fpuBinOpSub - case ssa.OpcodeFmul: - op = fpuBinOpMul - case ssa.OpcodeFdiv: - op = fpuBinOpDiv - case ssa.OpcodeFmax: - op = fpuBinOpMax - case ssa.OpcodeFmin: - op = fpuBinOpMin - } - x, y := si.Arg2() - xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) - rn := m.getOperand_NR(xDef, extModeNone) - rm := m.getOperand_NR(yDef, extModeNone) - rd := m.compiler.VRegOf(si.Return()) - instr.asFpuRRR(op, rd, rn, rm, x.Type().Bits() == 64) - m.insert(instr) -} - -func (m *machine) lowerSubOrAdd(si *ssa.Instruction, add bool) { - x, y := si.Arg2() - if !x.Type().IsInt() { - panic("BUG?") - } - - xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) - rn := m.getOperand_NR(xDef, extModeNone) - rm, yNegated := m.getOperand_MaybeNegatedImm12_ER_SR_NR(yDef, extModeNone) - - var aop aluOp - switch { - case add && !yNegated: // rn+rm = x+y - aop = aluOpAdd - case add && yNegated: // rn-rm = x-(-y) = x+y - aop = aluOpSub - case !add && !yNegated: // rn-rm = x-y - aop = aluOpSub - case !add && yNegated: // rn+rm = x-(-y) = x-y - aop = aluOpAdd - } - rd := m.compiler.VRegOf(si.Return()) - alu := m.allocateInstr() - alu.asALU(aop, rd, rn, rm, x.Type().Bits() == 64) - m.insert(alu) -} - -// InsertMove implements backend.Machine. 
-func (m *machine) InsertMove(dst, src regalloc.VReg, typ ssa.Type) { - instr := m.allocateInstr() - switch typ { - case ssa.TypeI32, ssa.TypeI64: - instr.asMove64(dst, src) - case ssa.TypeF32, ssa.TypeF64: - instr.asFpuMov64(dst, src) - case ssa.TypeV128: - instr.asFpuMov128(dst, src) - default: - panic("TODO") - } - m.insert(instr) -} - -func (m *machine) lowerIcmp(si *ssa.Instruction) { - x, y, c := si.IcmpData() - flag := condFlagFromSSAIntegerCmpCond(c) - - in64bit := x.Type().Bits() == 64 - var ext extMode - if in64bit { - if c.Signed() { - ext = extModeSignExtend64 - } else { - ext = extModeZeroExtend64 - } - } else { - if c.Signed() { - ext = extModeSignExtend32 - } else { - ext = extModeZeroExtend32 - } - } - - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext) - rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), ext) - alu := m.allocateInstr() - alu.asALU(aluOpSubS, xzrVReg, rn, rm, in64bit) - m.insert(alu) - - cset := m.allocateInstr() - cset.asCSet(m.compiler.VRegOf(si.Return()), false, flag) - m.insert(cset) -} - -func (m *machine) lowerVIcmp(si *ssa.Instruction) { - x, y, c, lane := si.VIcmpData() - flag := condFlagFromSSAIntegerCmpCond(c) - arr := ssaLaneToArrangement(lane) - - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := m.compiler.VRegOf(si.Return()) - - switch flag { - case eq: - cmp := m.allocateInstr() - cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr) - m.insert(cmp) - case ne: - cmp := m.allocateInstr() - cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr) - m.insert(cmp) - not := m.allocateInstr() - not.asVecMisc(vecOpNot, rd, operandNR(rd), vecArrangement16B) - m.insert(not) - case ge: - cmp := m.allocateInstr() - cmp.asVecRRR(vecOpCmge, rd, rn, rm, arr) - m.insert(cmp) - case gt: - cmp := m.allocateInstr() - cmp.asVecRRR(vecOpCmgt, rd, rn, rm, arr) - m.insert(cmp) - case le: - cmp := m.allocateInstr() - cmp.asVecRRR(vecOpCmge, rd, rm, rn, arr) // rm, rn are swapped - m.insert(cmp) - case lt: - cmp := m.allocateInstr() - cmp.asVecRRR(vecOpCmgt, rd, rm, rn, arr) // rm, rn are swapped - m.insert(cmp) - case hs: - cmp := m.allocateInstr() - cmp.asVecRRR(vecOpCmhs, rd, rn, rm, arr) - m.insert(cmp) - case hi: - cmp := m.allocateInstr() - cmp.asVecRRR(vecOpCmhi, rd, rn, rm, arr) - m.insert(cmp) - case ls: - cmp := m.allocateInstr() - cmp.asVecRRR(vecOpCmhs, rd, rm, rn, arr) // rm, rn are swapped - m.insert(cmp) - case lo: - cmp := m.allocateInstr() - cmp.asVecRRR(vecOpCmhi, rd, rm, rn, arr) // rm, rn are swapped - m.insert(cmp) - } -} - -func (m *machine) lowerVFcmp(si *ssa.Instruction) { - x, y, c, lane := si.VFcmpData() - flag := condFlagFromSSAFloatCmpCond(c) - arr := ssaLaneToArrangement(lane) - - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := m.compiler.VRegOf(si.Return()) - - switch flag { - case eq: - cmp := m.allocateInstr() - cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr) - m.insert(cmp) - case ne: - cmp := m.allocateInstr() - cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr) - m.insert(cmp) - not := m.allocateInstr() - not.asVecMisc(vecOpNot, rd, operandNR(rd), vecArrangement16B) - m.insert(not) - case ge: - cmp := m.allocateInstr() - cmp.asVecRRR(vecOpFcmge, rd, rn, rm, arr) - m.insert(cmp) - case gt: - cmp := m.allocateInstr() - cmp.asVecRRR(vecOpFcmgt, rd, rn, rm, arr) - m.insert(cmp) - case mi: - cmp := m.allocateInstr() - cmp.asVecRRR(vecOpFcmgt, rd, rm, rn, arr) // rm, rn are 
swapped - m.insert(cmp) - case ls: - cmp := m.allocateInstr() - cmp.asVecRRR(vecOpFcmge, rd, rm, rn, arr) // rm, rn are swapped - m.insert(cmp) - } -} - -func (m *machine) lowerVfpuToInt(rd regalloc.VReg, rn operand, arr vecArrangement, signed bool) { - cvt := m.allocateInstr() - if signed { - cvt.asVecMisc(vecOpFcvtzs, rd, rn, arr) - } else { - cvt.asVecMisc(vecOpFcvtzu, rd, rn, arr) - } - m.insert(cvt) - - if arr == vecArrangement2D { - narrow := m.allocateInstr() - if signed { - narrow.asVecMisc(vecOpSqxtn, rd, operandNR(rd), vecArrangement2S) - } else { - narrow.asVecMisc(vecOpUqxtn, rd, operandNR(rd), vecArrangement2S) - } - m.insert(narrow) - } -} - -func (m *machine) lowerVfpuFromInt(rd regalloc.VReg, rn operand, arr vecArrangement, signed bool) { - cvt := m.allocateInstr() - if signed { - cvt.asVecMisc(vecOpScvtf, rd, rn, arr) - } else { - cvt.asVecMisc(vecOpUcvtf, rd, rn, arr) - } - m.insert(cvt) -} - -func (m *machine) lowerShifts(si *ssa.Instruction, ext extMode, aluOp aluOp) { - x, amount := si.Arg2() - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext) - rm := m.getOperand_ShiftImm_NR(m.compiler.ValueDefinition(amount), ext, x.Type().Bits()) - rd := m.compiler.VRegOf(si.Return()) - - alu := m.allocateInstr() - alu.asALUShift(aluOp, rd, rn, rm, x.Type().Bits() == 64) - m.insert(alu) -} - -func (m *machine) lowerBitwiseAluOp(si *ssa.Instruction, op aluOp, ignoreResult bool) { - x, y := si.Arg2() - - xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) - rn := m.getOperand_NR(xDef, extModeNone) - - var rd regalloc.VReg - if ignoreResult { - rd = xzrVReg - } else { - rd = m.compiler.VRegOf(si.Return()) - } - - _64 := x.Type().Bits() == 64 - alu := m.allocateInstr() - if instr := yDef.Instr; instr != nil && instr.Constant() { - c := instr.ConstantVal() - if isBitMaskImmediate(c, _64) { - // Constant bit wise operations can be lowered to a single instruction. - alu.asALUBitmaskImm(op, rd, rn.nr(), c, _64) - m.insert(alu) - return - } - } - - rm := m.getOperand_SR_NR(yDef, extModeNone) - alu.asALU(op, rd, rn, rm, _64) - m.insert(alu) -} - -func (m *machine) lowerRotl(si *ssa.Instruction) { - x, y := si.Arg2() - r := si.Return() - _64 := r.Type().Bits() == 64 - - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - var tmp regalloc.VReg - if _64 { - tmp = m.compiler.AllocateVReg(ssa.TypeI64) - } else { - tmp = m.compiler.AllocateVReg(ssa.TypeI32) - } - rd := m.compiler.VRegOf(r) - - // Encode rotl as neg + rotr: neg is a sub against the zero-reg. - m.lowerRotlImpl(rd, rn, rm, tmp, _64) -} - -func (m *machine) lowerRotlImpl(rd regalloc.VReg, rn, rm operand, tmp regalloc.VReg, is64bit bool) { - // Encode rotl as neg + rotr: neg is a sub against the zero-reg. 
- neg := m.allocateInstr() - neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), rm, is64bit) - m.insert(neg) - alu := m.allocateInstr() - alu.asALU(aluOpRotR, rd, rn, operandNR(tmp), is64bit) - m.insert(alu) -} - -func (m *machine) lowerRotr(si *ssa.Instruction) { - x, y := si.Arg2() - - xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) - rn := m.getOperand_NR(xDef, extModeNone) - rm := m.getOperand_NR(yDef, extModeNone) - rd := m.compiler.VRegOf(si.Return()) - - alu := m.allocateInstr() - alu.asALU(aluOpRotR, rd, rn, rm, si.Return().Type().Bits() == 64) - m.insert(alu) -} - -func (m *machine) lowerExtend(arg, ret ssa.Value, from, to byte, signed bool) { - rd := m.compiler.VRegOf(ret) - def := m.compiler.ValueDefinition(arg) - - if instr := def.Instr; !signed && from == 32 && instr != nil { - // We can optimize out the unsigned extend because: - // Writes to the W register set bits [63:32] of the X register to zero - // https://developer.arm.com/documentation/den0024/a/An-Introduction-to-the-ARMv8-Instruction-Sets/The-ARMv8-instruction-sets/Distinguishing-between-32-bit-and-64-bit-A64-instructions - switch instr.Opcode() { - case - ssa.OpcodeIadd, ssa.OpcodeIsub, ssa.OpcodeLoad, - ssa.OpcodeBand, ssa.OpcodeBor, ssa.OpcodeBnot, - ssa.OpcodeIshl, ssa.OpcodeUshr, ssa.OpcodeSshr, - ssa.OpcodeRotl, ssa.OpcodeRotr, - ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32: - // So, if the argument is the result of a 32-bit operation, we can just copy the register. - // It is highly likely that this copy will be optimized out after register allocation. - rn := m.compiler.VRegOf(arg) - mov := m.allocateInstr() - // Note: do not use move32 as it will be lowered to a 32-bit move, which is not copy (that is actually the impl of UExtend). - mov.asMove64(rd, rn) - m.insert(mov) - return - default: - } - } - rn := m.getOperand_NR(def, extModeNone) - - ext := m.allocateInstr() - ext.asExtend(rd, rn.nr(), from, to, signed) - m.insert(ext) -} - -func (m *machine) lowerFcmp(x, y, result ssa.Value, c ssa.FloatCmpCond) { - rn, rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone), m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - - fc := m.allocateInstr() - fc.asFpuCmp(rn, rm, x.Type().Bits() == 64) - m.insert(fc) - - cset := m.allocateInstr() - cset.asCSet(m.compiler.VRegOf(result), false, condFlagFromSSAFloatCmpCond(c)) - m.insert(cset) -} - -func (m *machine) lowerImul(x, y, result ssa.Value) { - rd := m.compiler.VRegOf(result) - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - - // TODO: if this comes before Add/Sub, we could merge it by putting it into the place of xzrVReg. 
-
-	mul := m.allocateInstr()
-	mul.asALURRRR(aluOpMAdd, rd, rn, rm, xzrVReg, x.Type().Bits() == 64)
-	m.insert(mul)
-}
-
-func (m *machine) lowerClz(x, result ssa.Value) {
-	rd := m.compiler.VRegOf(result)
-	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
-	clz := m.allocateInstr()
-	clz.asBitRR(bitOpClz, rd, rn.nr(), x.Type().Bits() == 64)
-	m.insert(clz)
-}
-
-func (m *machine) lowerCtz(x, result ssa.Value) {
-	rd := m.compiler.VRegOf(result)
-	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
-	rbit := m.allocateInstr()
-	_64 := x.Type().Bits() == 64
-	var tmpReg regalloc.VReg
-	if _64 {
-		tmpReg = m.compiler.AllocateVReg(ssa.TypeI64)
-	} else {
-		tmpReg = m.compiler.AllocateVReg(ssa.TypeI32)
-	}
-	rbit.asBitRR(bitOpRbit, tmpReg, rn.nr(), _64)
-	m.insert(rbit)
-
-	clz := m.allocateInstr()
-	clz.asBitRR(bitOpClz, rd, tmpReg, _64)
-	m.insert(clz)
-}
-
-func (m *machine) lowerPopcnt(x, result ssa.Value) {
-	// arm64 doesn't have an instruction for population count on a scalar register,
-	// so we use the vector instruction `cnt`.
-	// This is exactly how the official Go compiler implements bits.OnesCount.
-	// For example, "func() int { return bits.OnesCount(10) }" is compiled as
-	//
-	// MOVD $10, R0 ;; Load 10.
-	// FMOVD R0, F0
-	// VCNT V0.B8, V0.B8
-	// UADDLV V0.B8, V0
-	//
-	// In aarch64 asm, FMOVD is encoded as `ins`, VCNT is `cnt`,
-	// and the registers may use different names. In our encoding we use the following
-	// instructions:
-	//
-	// ins v0.d[0], x0 ;; mov from GPR to vec (FMOV above) is encoded as INS
-	// cnt v0.16b, v0.16b ;; we use vec arrangement 16b
-	// uaddlv h0, v0.8b ;; h0 is still v0 with the dest width specifier 'H', implied when src arrangement is 8b
-	// mov x5, v0.d[0] ;; finally we mov the result back to a GPR
-	//
-
-	rd := m.compiler.VRegOf(result)
-	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
-
-	rf1 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
-	ins := m.allocateInstr()
-	ins.asMovToVec(rf1.nr(), rn, vecArrangementD, vecIndex(0))
-	m.insert(ins)
-
-	rf2 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
-	cnt := m.allocateInstr()
-	cnt.asVecMisc(vecOpCnt, rf2.nr(), rf1, vecArrangement16B)
-	m.insert(cnt)
-
-	rf3 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
-	uaddlv := m.allocateInstr()
-	uaddlv.asVecLanes(vecOpUaddlv, rf3.nr(), rf2, vecArrangement8B)
-	m.insert(uaddlv)
-
-	mov := m.allocateInstr()
-	mov.asMovFromVec(rd, rf3, vecArrangementD, vecIndex(0), false)
-	m.insert(mov)
-}
-
-// lowerExitWithCode emits the sequence to exit the execution with the given code. It takes the execution context pointer as an argument.
-func (m *machine) lowerExitWithCode(execCtxVReg regalloc.VReg, code wazevoapi.ExitCode) { - tmpReg1 := m.compiler.AllocateVReg(ssa.TypeI32) - loadExitCodeConst := m.allocateInstr() - loadExitCodeConst.asMOVZ(tmpReg1, uint64(code), 0, true) - - setExitCode := m.allocateInstr() - mode := m.amodePool.Allocate() - *mode = addressMode{ - kind: addressModeKindRegUnsignedImm12, - rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(), - } - setExitCode.asStore(operandNR(tmpReg1), mode, 32) - - // In order to unwind the stack, we also need to push the current stack pointer: - tmp2 := m.compiler.AllocateVReg(ssa.TypeI64) - movSpToTmp := m.allocateInstr() - movSpToTmp.asMove64(tmp2, spVReg) - strSpToExecCtx := m.allocateInstr() - mode2 := m.amodePool.Allocate() - *mode2 = addressMode{ - kind: addressModeKindRegUnsignedImm12, - rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(), - } - strSpToExecCtx.asStore(operandNR(tmp2), mode2, 64) - // Also the address of this exit. - tmp3 := m.compiler.AllocateVReg(ssa.TypeI64) - currentAddrToTmp := m.allocateInstr() - currentAddrToTmp.asAdr(tmp3, 0) - storeCurrentAddrToExecCtx := m.allocateInstr() - mode3 := m.amodePool.Allocate() - *mode3 = addressMode{ - kind: addressModeKindRegUnsignedImm12, - rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(), - } - storeCurrentAddrToExecCtx.asStore(operandNR(tmp3), mode3, 64) - - exitSeq := m.allocateInstr() - exitSeq.asExitSequence(execCtxVReg) - - m.insert(loadExitCodeConst) - m.insert(setExitCode) - m.insert(movSpToTmp) - m.insert(strSpToExecCtx) - m.insert(currentAddrToTmp) - m.insert(storeCurrentAddrToExecCtx) - m.insert(exitSeq) -} - -func (m *machine) lowerIcmpToFlag(x, y ssa.Value, signed bool) { - if x.Type() != y.Type() { - panic( - fmt.Sprintf("TODO(maybe): support icmp with different types: v%d=%s != v%d=%s", - x.ID(), x.Type(), y.ID(), y.Type())) - } - - extMod := extModeOf(x.Type(), signed) - - // First operand must be in pure register form. - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extMod) - // Second operand can be in any of Imm12, ER, SR, or NR form supported by the SUBS instructions. - rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), extMod) - - alu := m.allocateInstr() - // subs zr, rn, rm - alu.asALU( - aluOpSubS, - // We don't need the result, just need to set flags. - xzrVReg, - rn, - rm, - x.Type().Bits() == 64, - ) - m.insert(alu) -} - -func (m *machine) lowerFcmpToFlag(x, y ssa.Value) { - if x.Type() != y.Type() { - panic("TODO(maybe): support icmp with different types") - } - - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - cmp := m.allocateInstr() - cmp.asFpuCmp(rn, rm, x.Type().Bits() == 64) - m.insert(cmp) -} - -func (m *machine) lowerExitIfTrueWithCode(execCtxVReg regalloc.VReg, cond ssa.Value, code wazevoapi.ExitCode) { - condDef := m.compiler.ValueDefinition(cond) - if !m.compiler.MatchInstr(condDef, ssa.OpcodeIcmp) { - panic("TODO: OpcodeExitIfTrueWithCode must come after Icmp at the moment: " + condDef.Instr.Opcode().String()) - } - condDef.Instr.MarkLowered() - - cvalInstr := condDef.Instr - x, y, c := cvalInstr.IcmpData() - signed := c.Signed() - - if !m.tryLowerBandToFlag(x, y) { - m.lowerIcmpToFlag(x, y, signed) - } - - // We need to copy the execution context to a temp register, because if it's spilled, - // it might end up being reloaded inside the exiting branch. 
- execCtxTmp := m.copyToTmp(execCtxVReg) - - // We have to skip the entire exit sequence if the condition is false. - cbr := m.allocateInstr() - m.insert(cbr) - m.lowerExitWithCode(execCtxTmp, code) - // conditional branch target is after exit. - l := m.insertBrTargetLabel() - cbr.asCondBr(condFlagFromSSAIntegerCmpCond(c).invert().asCond(), l, false /* ignored */) -} - -func (m *machine) lowerSelect(c, x, y, result ssa.Value) { - cvalDef := m.compiler.ValueDefinition(c) - - var cc condFlag - switch { - case m.compiler.MatchInstr(cvalDef, ssa.OpcodeIcmp): // This case, we can use the ALU flag set by SUBS instruction. - cvalInstr := cvalDef.Instr - x, y, c := cvalInstr.IcmpData() - cc = condFlagFromSSAIntegerCmpCond(c) - m.lowerIcmpToFlag(x, y, c.Signed()) - cvalDef.Instr.MarkLowered() - case m.compiler.MatchInstr(cvalDef, ssa.OpcodeFcmp): // This case we can use the Fpu flag directly. - cvalInstr := cvalDef.Instr - x, y, c := cvalInstr.FcmpData() - cc = condFlagFromSSAFloatCmpCond(c) - m.lowerFcmpToFlag(x, y) - cvalDef.Instr.MarkLowered() - default: - rn := m.getOperand_NR(cvalDef, extModeNone) - if c.Type() != ssa.TypeI32 && c.Type() != ssa.TypeI64 { - panic("TODO?BUG?: support select with non-integer condition") - } - alu := m.allocateInstr() - // subs zr, rn, zr - alu.asALU( - aluOpSubS, - // We don't need the result, just need to set flags. - xzrVReg, - rn, - operandNR(xzrVReg), - c.Type().Bits() == 64, - ) - m.insert(alu) - cc = ne - } - - rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - - rd := m.compiler.VRegOf(result) - switch x.Type() { - case ssa.TypeI32, ssa.TypeI64: - // csel rd, rn, rm, cc - csel := m.allocateInstr() - csel.asCSel(rd, rn, rm, cc, x.Type().Bits() == 64) - m.insert(csel) - case ssa.TypeF32, ssa.TypeF64: - // fcsel rd, rn, rm, cc - fcsel := m.allocateInstr() - fcsel.asFpuCSel(rd, rn, rm, cc, x.Type().Bits() == 64) - m.insert(fcsel) - default: - panic("BUG") - } -} - -func (m *machine) lowerSelectVec(rc, rn, rm operand, rd regalloc.VReg) { - // First check if `rc` is zero or not. - checkZero := m.allocateInstr() - checkZero.asALU(aluOpSubS, xzrVReg, rc, operandNR(xzrVReg), false) - m.insert(checkZero) - - // Then use CSETM to set all bits to one if `rc` is zero. - allOnesOrZero := m.compiler.AllocateVReg(ssa.TypeI64) - cset := m.allocateInstr() - cset.asCSet(allOnesOrZero, true, ne) - m.insert(cset) - - // Then move the bits to the result vector register. - tmp2 := m.compiler.AllocateVReg(ssa.TypeV128) - dup := m.allocateInstr() - dup.asVecDup(tmp2, operandNR(allOnesOrZero), vecArrangement2D) - m.insert(dup) - - // Now that `tmp2` has either all bits one or zero depending on `rc`, - // we can use bsl to select between `rn` and `rm`. - ins := m.allocateInstr() - ins.asVecRRRRewrite(vecOpBsl, tmp2, rn, rm, vecArrangement16B) - m.insert(ins) - - // Finally, move the result to the destination register. 
- mov2 := m.allocateInstr() - mov2.asFpuMov128(rd, tmp2) - m.insert(mov2) -} - -func (m *machine) lowerAtomicRmw(si *ssa.Instruction) { - ssaOp, size := si.AtomicRmwData() - - var op atomicRmwOp - var negateArg bool - var flipArg bool - switch ssaOp { - case ssa.AtomicRmwOpAdd: - op = atomicRmwOpAdd - case ssa.AtomicRmwOpSub: - op = atomicRmwOpAdd - negateArg = true - case ssa.AtomicRmwOpAnd: - op = atomicRmwOpClr - flipArg = true - case ssa.AtomicRmwOpOr: - op = atomicRmwOpSet - case ssa.AtomicRmwOpXor: - op = atomicRmwOpEor - case ssa.AtomicRmwOpXchg: - op = atomicRmwOpSwp - default: - panic(fmt.Sprintf("unknown ssa atomic rmw op: %s", ssaOp)) - } - - addr, val := si.Arg2() - addrDef, valDef := m.compiler.ValueDefinition(addr), m.compiler.ValueDefinition(val) - rn := m.getOperand_NR(addrDef, extModeNone) - rt := m.compiler.VRegOf(si.Return()) - rs := m.getOperand_NR(valDef, extModeNone) - - _64 := si.Return().Type().Bits() == 64 - var tmp regalloc.VReg - if _64 { - tmp = m.compiler.AllocateVReg(ssa.TypeI64) - } else { - tmp = m.compiler.AllocateVReg(ssa.TypeI32) - } - m.lowerAtomicRmwImpl(op, rn.nr(), rs.nr(), rt, tmp, size, negateArg, flipArg, _64) -} - -func (m *machine) lowerAtomicRmwImpl(op atomicRmwOp, rn, rs, rt, tmp regalloc.VReg, size uint64, negateArg, flipArg, dst64bit bool) { - switch { - case negateArg: - neg := m.allocateInstr() - neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), operandNR(rs), dst64bit) - m.insert(neg) - case flipArg: - flip := m.allocateInstr() - flip.asALU(aluOpOrn, tmp, operandNR(xzrVReg), operandNR(rs), dst64bit) - m.insert(flip) - default: - tmp = rs - } - - rmw := m.allocateInstr() - rmw.asAtomicRmw(op, rn, tmp, rt, size) - m.insert(rmw) -} - -func (m *machine) lowerAtomicCas(si *ssa.Instruction) { - addr, exp, repl := si.Arg3() - size := si.AtomicTargetSize() - - addrDef, expDef, replDef := m.compiler.ValueDefinition(addr), m.compiler.ValueDefinition(exp), m.compiler.ValueDefinition(repl) - rn := m.getOperand_NR(addrDef, extModeNone) - rt := m.getOperand_NR(replDef, extModeNone) - rs := m.getOperand_NR(expDef, extModeNone) - tmp := m.compiler.AllocateVReg(si.Return().Type()) - - _64 := si.Return().Type().Bits() == 64 - // rs is overwritten by CAS, so we need to move it to the result register before the instruction - // in case when it is used somewhere else. 
- mov := m.allocateInstr() - if _64 { - mov.asMove64(tmp, rs.nr()) - } else { - mov.asMove32(tmp, rs.nr()) - } - m.insert(mov) - - m.lowerAtomicCasImpl(rn.nr(), tmp, rt.nr(), size) - - mov2 := m.allocateInstr() - rd := m.compiler.VRegOf(si.Return()) - if _64 { - mov2.asMove64(rd, tmp) - } else { - mov2.asMove32(rd, tmp) - } - m.insert(mov2) -} - -func (m *machine) lowerAtomicCasImpl(rn, rs, rt regalloc.VReg, size uint64) { - cas := m.allocateInstr() - cas.asAtomicCas(rn, rs, rt, size) - m.insert(cas) -} - -func (m *machine) lowerAtomicLoad(si *ssa.Instruction) { - addr := si.Arg() - size := si.AtomicTargetSize() - - addrDef := m.compiler.ValueDefinition(addr) - rn := m.getOperand_NR(addrDef, extModeNone) - rt := m.compiler.VRegOf(si.Return()) - - m.lowerAtomicLoadImpl(rn.nr(), rt, size) -} - -func (m *machine) lowerAtomicLoadImpl(rn, rt regalloc.VReg, size uint64) { - ld := m.allocateInstr() - ld.asAtomicLoad(rn, rt, size) - m.insert(ld) -} - -func (m *machine) lowerAtomicStore(si *ssa.Instruction) { - addr, val := si.Arg2() - size := si.AtomicTargetSize() - - addrDef := m.compiler.ValueDefinition(addr) - valDef := m.compiler.ValueDefinition(val) - rn := m.getOperand_NR(addrDef, extModeNone) - rt := m.getOperand_NR(valDef, extModeNone) - - m.lowerAtomicStoreImpl(rn, rt, size) -} - -func (m *machine) lowerAtomicStoreImpl(rn, rt operand, size uint64) { - ld := m.allocateInstr() - ld.asAtomicStore(rn, rt, size) - m.insert(ld) -} - -// copyToTmp copies the given regalloc.VReg to a temporary register. This is called before cbr to avoid the regalloc issue -// e.g. reload happening in the middle of the exit sequence which is not the path the normal path executes -func (m *machine) copyToTmp(v regalloc.VReg) regalloc.VReg { - typ := m.compiler.TypeOf(v) - mov := m.allocateInstr() - tmp := m.compiler.AllocateVReg(typ) - if typ.IsInt() { - mov.asMove64(tmp, v) - } else { - mov.asFpuMov128(tmp, v) - } - m.insert(mov) - return tmp -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr_operands.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr_operands.go deleted file mode 100644 index 7a398c3d0..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr_operands.go +++ /dev/null @@ -1,340 +0,0 @@ -package arm64 - -// This file contains the logic to "find and determine operands" for instructions. -// In order to finalize the form of an operand, we might end up merging/eliminating -// the source instructions into an operand whenever possible. - -import ( - "fmt" - - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" -) - -type ( - // operand represents an operand of an instruction whose type is determined by the kind. - operand struct { - kind operandKind - data, data2 uint64 - } - operandKind byte -) - -// Here's the list of operand kinds. We use the abbreviation of the kind name not only for these consts, -// but also names of functions which return the operand of the kind. -const ( - // operandKindNR represents "NormalRegister" (NR). This is literally the register without any special operation unlike others. - operandKindNR operandKind = iota - // operandKindSR represents "Shifted Register" (SR). This is a register which is shifted by a constant. - // Some of the arm64 instructions can take this kind of operand. 
-	operandKindSR
-	// operandKindER represents "Extended Register" (ER). This is a register which is sign/zero-extended to a larger size.
-	// Some of the arm64 instructions can take this kind of operand.
-	operandKindER
-	// operandKindImm12 represents "Immediate 12" (Imm12). This is a 12-bit immediate value which can be either shifted by 12 or not.
-	// See the asImm12 function for details.
-	operandKindImm12
-	// operandKindShiftImm represents "Shifted Immediate" (ShiftImm) used by shift operations.
-	operandKindShiftImm
-)
-
-// format returns the string representation of the operand for debugging.
-func (o operand) format(size byte) string {
-	switch o.kind {
-	case operandKindNR:
-		return formatVRegSized(o.nr(), size)
-	case operandKindSR:
-		r, amt, sop := o.sr()
-		return fmt.Sprintf("%s, %s #%d", formatVRegSized(r, size), sop, amt)
-	case operandKindER:
-		r, eop, _ := o.er()
-		return fmt.Sprintf("%s %s", formatVRegSized(r, size), eop)
-	case operandKindImm12:
-		imm12, shiftBit := o.imm12()
-		if shiftBit == 1 {
-			return fmt.Sprintf("#%#x", uint64(imm12)<<12)
-		} else {
-			return fmt.Sprintf("#%#x", imm12)
-		}
-	default:
-		panic(fmt.Sprintf("unknown operand kind: %d", o.kind))
-	}
-}
-
-// operandNR encodes the given VReg as an operand of operandKindNR.
-func operandNR(r regalloc.VReg) operand {
-	return operand{kind: operandKindNR, data: uint64(r)}
-}
-
-// nr decodes the underlying VReg assuming the operand is of operandKindNR.
-func (o operand) nr() regalloc.VReg {
-	return regalloc.VReg(o.data)
-}
-
-// operandER encodes the given VReg as an operand of operandKindER.
-func operandER(r regalloc.VReg, eop extendOp, to byte) operand {
-	if to < 32 {
-		panic("TODO?BUG?: when we need to extend to less than 32 bits?")
-	}
-	return operand{kind: operandKindER, data: uint64(r), data2: uint64(eop)<<32 | uint64(to)}
-}
-
-// er decodes the underlying VReg, extend operation, and the target size assuming the operand is of operandKindER.
-func (o operand) er() (r regalloc.VReg, eop extendOp, to byte) {
-	return regalloc.VReg(o.data), extendOp(o.data2>>32) & 0xff, byte(o.data2 & 0xff)
-}
-
-// operandSR encodes the given VReg as an operand of operandKindSR.
-func operandSR(r regalloc.VReg, amt byte, sop shiftOp) operand {
-	return operand{kind: operandKindSR, data: uint64(r), data2: uint64(amt)<<32 | uint64(sop)}
-}
-
-// sr decodes the underlying VReg, shift amount, and shift operation assuming the operand is of operandKindSR.
-func (o operand) sr() (r regalloc.VReg, amt byte, sop shiftOp) {
-	return regalloc.VReg(o.data), byte(o.data2>>32) & 0xff, shiftOp(o.data2) & 0xff
-}
-
-// operandImm12 encodes the given imm12 as an operand of operandKindImm12.
-func operandImm12(imm12 uint16, shiftBit byte) operand {
-	return operand{kind: operandKindImm12, data: uint64(imm12) | uint64(shiftBit)<<32}
-}
-
-// imm12 decodes the underlying imm12 data assuming the operand is of operandKindImm12.
-func (o operand) imm12() (v uint16, shiftBit byte) {
-	return uint16(o.data), byte(o.data >> 32)
-}
-
-// operandShiftImm encodes the given amount as an operand of operandKindShiftImm.
-func operandShiftImm(amount byte) operand {
-	return operand{kind: operandKindShiftImm, data: uint64(amount)}
-}
-
-// shiftImm decodes the underlying shift amount data assuming the operand is of operandKindShiftImm.
-func (o operand) shiftImm() byte {
-	return byte(o.data)
-}
-
-// reg returns the register of the operand if applicable.
-func (o operand) reg() regalloc.VReg {
-	switch o.kind {
-	case operandKindNR:
-		return o.nr()
-	case operandKindSR:
-		r, _, _ := o.sr()
-		return r
-	case operandKindER:
-		r, _, _ := o.er()
-		return r
-	case operandKindImm12:
-		// Does not have a register.
-	case operandKindShiftImm:
-		// Does not have a register.
-	default:
-		panic(o.kind)
-	}
-	return regalloc.VRegInvalid
-}
-
-func (o operand) realReg() regalloc.RealReg {
-	return o.nr().RealReg()
-}
-
-func (o operand) assignReg(v regalloc.VReg) operand {
-	switch o.kind {
-	case operandKindNR:
-		return operandNR(v)
-	case operandKindSR:
-		_, amt, sop := o.sr()
-		return operandSR(v, amt, sop)
-	case operandKindER:
-		_, eop, to := o.er()
-		return operandER(v, eop, to)
-	case operandKindImm12:
-		// Does not have a register.
-	case operandKindShiftImm:
-		// Does not have a register.
-	}
-	panic(o.kind)
-}
-
-// getOperand_Imm12_ER_SR_NR returns an operand of operandKindImm12, operandKindER, operandKindSR, or operandKindNR from the given value (defined by `def`).
-//
-// `mode` is used to extend the operand if the bit length is smaller than mode.bits().
-// If the operand can be expressed as operandKindImm12, `mode` is ignored.
-func (m *machine) getOperand_Imm12_ER_SR_NR(def backend.SSAValueDefinition, mode extMode) (op operand) {
-	if !def.IsFromInstr() {
-		return operandNR(m.compiler.VRegOf(def.V))
-	}
-
-	instr := def.Instr
-	if instr.Opcode() == ssa.OpcodeIconst {
-		if imm12Op, ok := asImm12Operand(instr.ConstantVal()); ok {
-			instr.MarkLowered()
-			return imm12Op
-		}
-	}
-	return m.getOperand_ER_SR_NR(def, mode)
-}
-
-// getOperand_MaybeNegatedImm12_ER_SR_NR is almost the same as getOperand_Imm12_ER_SR_NR, but this might negate the immediate value.
-// If the immediate value is negated, the second return value is true, otherwise always false.
-func (m *machine) getOperand_MaybeNegatedImm12_ER_SR_NR(def backend.SSAValueDefinition, mode extMode) (op operand, negatedImm12 bool) {
-	if !def.IsFromInstr() {
-		return operandNR(m.compiler.VRegOf(def.V)), false
-	}
-
-	instr := def.Instr
-	if instr.Opcode() == ssa.OpcodeIconst {
-		c := instr.ConstantVal()
-		if imm12Op, ok := asImm12Operand(c); ok {
-			instr.MarkLowered()
-			return imm12Op, false
-		}
-
-		signExtended := int64(c)
-		if def.V.Type().Bits() == 32 {
-			signExtended = (signExtended << 32) >> 32
-		}
-		negatedWithoutSign := -signExtended
-		if imm12Op, ok := asImm12Operand(uint64(negatedWithoutSign)); ok {
-			instr.MarkLowered()
-			return imm12Op, true
-		}
-	}
-	return m.getOperand_ER_SR_NR(def, mode), false
-}
-
-// getOperand_ER_SR_NR returns an operand of either operandKindER, operandKindSR, or operandKindNR from the given value (defined by `def`).
-//
-// `mode` is used to extend the operand if the bit length is smaller than mode.bits().
-func (m *machine) getOperand_ER_SR_NR(def backend.SSAValueDefinition, mode extMode) (op operand) { - if !def.IsFromInstr() { - return operandNR(m.compiler.VRegOf(def.V)) - } - - if m.compiler.MatchInstr(def, ssa.OpcodeSExtend) || m.compiler.MatchInstr(def, ssa.OpcodeUExtend) { - extInstr := def.Instr - - signed := extInstr.Opcode() == ssa.OpcodeSExtend - innerExtFromBits, innerExtToBits := extInstr.ExtendFromToBits() - modeBits, modeSigned := mode.bits(), mode.signed() - if mode == extModeNone || innerExtToBits == modeBits { - eop := extendOpFrom(signed, innerExtFromBits) - extArg := m.getOperand_NR(m.compiler.ValueDefinition(extInstr.Arg()), extModeNone) - op = operandER(extArg.nr(), eop, innerExtToBits) - extInstr.MarkLowered() - return - } - - if innerExtToBits > modeBits { - panic("BUG?TODO?: need the results of inner extension to be larger than the mode") - } - - switch { - case (!signed && !modeSigned) || (signed && modeSigned): - // Two sign/zero extensions are equivalent to one sign/zero extension for the larger size. - eop := extendOpFrom(modeSigned, innerExtFromBits) - op = operandER(m.compiler.VRegOf(extInstr.Arg()), eop, modeBits) - extInstr.MarkLowered() - case (signed && !modeSigned) || (!signed && modeSigned): - // We need to {sign, zero}-extend the result of the {zero,sign} extension. - eop := extendOpFrom(modeSigned, innerExtToBits) - op = operandER(m.compiler.VRegOf(extInstr.Return()), eop, modeBits) - // Note that we failed to merge the inner extension instruction this case. - } - return - } - return m.getOperand_SR_NR(def, mode) -} - -// ensureValueNR returns an operand of either operandKindSR or operandKindNR from the given value (defined by `def). -// -// `mode` is used to extend the operand if the bit length is smaller than mode.bits(). -func (m *machine) getOperand_SR_NR(def backend.SSAValueDefinition, mode extMode) (op operand) { - if !def.IsFromInstr() { - return operandNR(m.compiler.VRegOf(def.V)) - } - - if m.compiler.MatchInstr(def, ssa.OpcodeIshl) { - // Check if the shift amount is constant instruction. - targetVal, amountVal := def.Instr.Arg2() - targetVReg := m.getOperand_NR(m.compiler.ValueDefinition(targetVal), extModeNone).nr() - amountDef := m.compiler.ValueDefinition(amountVal) - if amountDef.IsFromInstr() && amountDef.Instr.Constant() { - // If that is the case, we can use the shifted register operand (SR). - c := byte(amountDef.Instr.ConstantVal()) & (targetVal.Type().Bits() - 1) // Clears the unnecessary bits. - def.Instr.MarkLowered() - amountDef.Instr.MarkLowered() - return operandSR(targetVReg, c, shiftOpLSL) - } - } - return m.getOperand_NR(def, mode) -} - -// getOperand_ShiftImm_NR returns an operand of either operandKindShiftImm or operandKindNR from the given value (defined by `def). -func (m *machine) getOperand_ShiftImm_NR(def backend.SSAValueDefinition, mode extMode, shiftBitWidth byte) (op operand) { - if !def.IsFromInstr() { - return operandNR(m.compiler.VRegOf(def.V)) - } - - instr := def.Instr - if instr.Constant() { - amount := byte(instr.ConstantVal()) & (shiftBitWidth - 1) // Clears the unnecessary bits. - return operandShiftImm(amount) - } - return m.getOperand_NR(def, mode) -} - -// ensureValueNR returns an operand of operandKindNR from the given value (defined by `def). -// -// `mode` is used to extend the operand if the bit length is smaller than mode.bits(). 
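-//
-// Constant definitions are inlined here: the constant is materialized into a fresh register at the point of
-// use, and a 32-bit value is zero- or sign-extended into a new 64-bit register when the mode requires it.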
-func (m *machine) getOperand_NR(def backend.SSAValueDefinition, mode extMode) (op operand) { - var v regalloc.VReg - if def.IsFromInstr() && def.Instr.Constant() { - // We inline all the constant instructions so that we could reduce the register usage. - v = m.lowerConstant(def.Instr) - def.Instr.MarkLowered() - } else { - v = m.compiler.VRegOf(def.V) - } - - r := v - switch inBits := def.V.Type().Bits(); { - case mode == extModeNone: - case inBits == 32 && (mode == extModeZeroExtend32 || mode == extModeSignExtend32): - case inBits == 32 && mode == extModeZeroExtend64: - extended := m.compiler.AllocateVReg(ssa.TypeI64) - ext := m.allocateInstr() - ext.asExtend(extended, v, 32, 64, false) - m.insert(ext) - r = extended - case inBits == 32 && mode == extModeSignExtend64: - extended := m.compiler.AllocateVReg(ssa.TypeI64) - ext := m.allocateInstr() - ext.asExtend(extended, v, 32, 64, true) - m.insert(ext) - r = extended - case inBits == 64 && (mode == extModeZeroExtend64 || mode == extModeSignExtend64): - } - return operandNR(r) -} - -func asImm12Operand(val uint64) (op operand, ok bool) { - v, shiftBit, ok := asImm12(val) - if !ok { - return operand{}, false - } - return operandImm12(v, shiftBit), true -} - -func asImm12(val uint64) (v uint16, shiftBit byte, ok bool) { - const mask1, mask2 uint64 = 0xfff, 0xfff_000 - if val&^mask1 == 0 { - return uint16(val), 0, true - } else if val&^mask2 == 0 { - return uint16(val >> 12), 1, true - } else { - return 0, 0, false - } -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go deleted file mode 100644 index fd0760d72..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go +++ /dev/null @@ -1,451 +0,0 @@ -package arm64 - -import ( - "fmt" - - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" - "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" -) - -type ( - // addressMode represents an ARM64 addressing mode. - // - // https://developer.arm.com/documentation/102374/0101/Loads-and-stores---addressing - // TODO: use the bit-packed layout like operand struct. - addressMode struct { - kind addressModeKind - rn, rm regalloc.VReg - extOp extendOp - imm int64 - } - - // addressModeKind represents the kind of ARM64 addressing mode. - addressModeKind byte -) - -func resetAddressMode(a *addressMode) { - a.kind = 0 - a.rn = 0 - a.rm = 0 - a.extOp = 0 - a.imm = 0 -} - -const ( - // addressModeKindRegExtended takes a base register and an index register. The index register is sign/zero-extended, - // and then scaled by bits(type)/8. - // - // e.g. 
- // - ldrh w1, [x2, w3, SXTW #1] ;; sign-extended and scaled by 2 (== LSL #1) - // - strh w1, [x2, w3, UXTW #1] ;; zero-extended and scaled by 2 (== LSL #1) - // - ldr w1, [x2, w3, SXTW #2] ;; sign-extended and scaled by 4 (== LSL #2) - // - str x1, [x2, w3, UXTW #3] ;; zero-extended and scaled by 8 (== LSL #3) - // - // See the following pages: - // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--register---Load-Register-Halfword--register-- - // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--register---Load-Register--register-- - addressModeKindRegScaledExtended addressModeKind = iota - - // addressModeKindRegScaled is the same as addressModeKindRegScaledExtended, but without extension factor. - addressModeKindRegScaled - - // addressModeKindRegScaled is the same as addressModeKindRegScaledExtended, but without scale factor. - addressModeKindRegExtended - - // addressModeKindRegReg takes a base register and an index register. The index register is not either scaled or extended. - addressModeKindRegReg - - // addressModeKindRegSignedImm9 takes a base register and a 9-bit "signed" immediate offset (-256 to 255). - // The immediate will be sign-extended, and be added to the base register. - // This is a.k.a. "unscaled" since the immediate is not scaled. - // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDUR--Load-Register--unscaled-- - addressModeKindRegSignedImm9 - - // addressModeKindRegUnsignedImm12 takes a base register and a 12-bit "unsigned" immediate offset. scaled by - // the size of the type. In other words, the actual offset will be imm12 * bits(type)/8. - // See "Unsigned offset" in the following pages: - // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate-- - // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate-- - // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate-- - addressModeKindRegUnsignedImm12 - - // addressModePostIndex takes a base register and a 9-bit "signed" immediate offset. - // After the load/store, the base register will be updated by the offset. - // - // Note that when this is used for pair load/store, the offset will be 7-bit "signed" immediate offset. - // - // See "Post-index" in the following pages for examples: - // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate-- - // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate-- - // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate-- - // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers- - addressModeKindPostIndex - - // addressModePostIndex takes a base register and a 9-bit "signed" immediate offset. - // Before the load/store, the base register will be updated by the offset. - // - // Note that when this is used for pair load/store, the offset will be 7-bit "signed" immediate offset. 
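- //
- // e.g. str x1, [sp, #-16]! ;; pre-index: decrement sp by 16, then store x1 at the new sp.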
- // - // See "Pre-index" in the following pages for examples: - // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate-- - // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate-- - // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate-- - // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers- - addressModeKindPreIndex - - // addressModeKindArgStackSpace is used to resolve the address of the argument stack space - // exiting right above the stack pointer. Since we don't know the exact stack space needed for a function - // at a compilation phase, this is used as a placeholder and further lowered to a real addressing mode like above. - addressModeKindArgStackSpace - - // addressModeKindResultStackSpace is used to resolve the address of the result stack space - // exiting right above the stack pointer. Since we don't know the exact stack space needed for a function - // at a compilation phase, this is used as a placeholder and further lowered to a real addressing mode like above. - addressModeKindResultStackSpace -) - -func (a addressMode) format(dstSizeBits byte) (ret string) { - base := formatVRegSized(a.rn, 64) - if rn := a.rn; rn.RegType() != regalloc.RegTypeInt { - panic("invalid base register type: " + a.rn.RegType().String()) - } else if rn.IsRealReg() && v0 <= a.rn.RealReg() && a.rn.RealReg() <= v30 { - panic("BUG: likely a bug in reg alloc or reset behavior") - } - - switch a.kind { - case addressModeKindRegScaledExtended: - amount := a.sizeInBitsToShiftAmount(dstSizeBits) - ret = fmt.Sprintf("[%s, %s, %s #%#x]", base, formatVRegSized(a.rm, a.indexRegBits()), a.extOp, amount) - case addressModeKindRegScaled: - amount := a.sizeInBitsToShiftAmount(dstSizeBits) - ret = fmt.Sprintf("[%s, %s, lsl #%#x]", base, formatVRegSized(a.rm, a.indexRegBits()), amount) - case addressModeKindRegExtended: - ret = fmt.Sprintf("[%s, %s, %s]", base, formatVRegSized(a.rm, a.indexRegBits()), a.extOp) - case addressModeKindRegReg: - ret = fmt.Sprintf("[%s, %s]", base, formatVRegSized(a.rm, a.indexRegBits())) - case addressModeKindRegSignedImm9: - if a.imm != 0 { - ret = fmt.Sprintf("[%s, #%#x]", base, a.imm) - } else { - ret = fmt.Sprintf("[%s]", base) - } - case addressModeKindRegUnsignedImm12: - if a.imm != 0 { - ret = fmt.Sprintf("[%s, #%#x]", base, a.imm) - } else { - ret = fmt.Sprintf("[%s]", base) - } - case addressModeKindPostIndex: - ret = fmt.Sprintf("[%s], #%#x", base, a.imm) - case addressModeKindPreIndex: - ret = fmt.Sprintf("[%s, #%#x]!", base, a.imm) - case addressModeKindArgStackSpace: - ret = fmt.Sprintf("[#arg_space, #%#x]", a.imm) - case addressModeKindResultStackSpace: - ret = fmt.Sprintf("[#ret_space, #%#x]", a.imm) - } - return -} - -func addressModePreOrPostIndex(m *machine, rn regalloc.VReg, imm int64, preIndex bool) *addressMode { - if !offsetFitsInAddressModeKindRegSignedImm9(imm) { - panic(fmt.Sprintf("BUG: offset %#x does not fit in addressModeKindRegSignedImm9", imm)) - } - mode := m.amodePool.Allocate() - if preIndex { - *mode = addressMode{kind: addressModeKindPreIndex, rn: rn, imm: imm} - } else { - *mode = addressMode{kind: addressModeKindPostIndex, rn: rn, imm: imm} - } - return mode -} - -func offsetFitsInAddressModeKindRegUnsignedImm12(dstSizeInBits byte, offset int64) bool { - divisor := int64(dstSizeInBits) / 8 - 
return 0 < offset && offset%divisor == 0 && offset/divisor < 4096 -} - -func offsetFitsInAddressModeKindRegSignedImm9(offset int64) bool { - return -256 <= offset && offset <= 255 -} - -func (a addressMode) indexRegBits() byte { - bits := a.extOp.srcBits() - if bits != 32 && bits != 64 { - panic("invalid index register for address mode. it must be either 32 or 64 bits") - } - return bits -} - -func (a addressMode) sizeInBitsToShiftAmount(sizeInBits byte) (lsl byte) { - switch sizeInBits { - case 8: - lsl = 0 - case 16: - lsl = 1 - case 32: - lsl = 2 - case 64: - lsl = 3 - } - return -} - -func extLoadSignSize(op ssa.Opcode) (size byte, signed bool) { - switch op { - case ssa.OpcodeUload8: - size, signed = 8, false - case ssa.OpcodeUload16: - size, signed = 16, false - case ssa.OpcodeUload32: - size, signed = 32, false - case ssa.OpcodeSload8: - size, signed = 8, true - case ssa.OpcodeSload16: - size, signed = 16, true - case ssa.OpcodeSload32: - size, signed = 32, true - default: - panic("BUG") - } - return -} - -func (m *machine) lowerExtLoad(op ssa.Opcode, ptr ssa.Value, offset uint32, ret regalloc.VReg) { - size, signed := extLoadSignSize(op) - amode := m.lowerToAddressMode(ptr, offset, size) - load := m.allocateInstr() - if signed { - load.asSLoad(ret, amode, size) - } else { - load.asULoad(ret, amode, size) - } - m.insert(load) -} - -func (m *machine) lowerLoad(ptr ssa.Value, offset uint32, typ ssa.Type, ret ssa.Value) { - amode := m.lowerToAddressMode(ptr, offset, typ.Bits()) - - dst := m.compiler.VRegOf(ret) - load := m.allocateInstr() - switch typ { - case ssa.TypeI32, ssa.TypeI64: - load.asULoad(dst, amode, typ.Bits()) - case ssa.TypeF32, ssa.TypeF64: - load.asFpuLoad(dst, amode, typ.Bits()) - case ssa.TypeV128: - load.asFpuLoad(dst, amode, 128) - default: - panic("TODO") - } - m.insert(load) -} - -func (m *machine) lowerLoadSplat(ptr ssa.Value, offset uint32, lane ssa.VecLane, ret ssa.Value) { - // vecLoad1R has offset address mode (base+imm) only for post index, so we simply add the offset to the base. - base := m.getOperand_NR(m.compiler.ValueDefinition(ptr), extModeNone).nr() - offsetReg := m.compiler.AllocateVReg(ssa.TypeI64) - m.lowerConstantI64(offsetReg, int64(offset)) - addedBase := m.addReg64ToReg64(base, offsetReg) - - rd := m.compiler.VRegOf(ret) - - ld1r := m.allocateInstr() - ld1r.asVecLoad1R(rd, operandNR(addedBase), ssaLaneToArrangement(lane)) - m.insert(ld1r) -} - -func (m *machine) lowerStore(si *ssa.Instruction) { - // TODO: merge consecutive stores into a single pair store instruction. - value, ptr, offset, storeSizeInBits := si.StoreData() - amode := m.lowerToAddressMode(ptr, offset, storeSizeInBits) - - valueOp := m.getOperand_NR(m.compiler.ValueDefinition(value), extModeNone) - store := m.allocateInstr() - store.asStore(valueOp, amode, storeSizeInBits) - m.insert(store) -} - -// lowerToAddressMode converts a pointer to an addressMode that can be used as an operand for load/store instructions. -func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32, size byte) (amode *addressMode) { - // TODO: currently the instruction selection logic doesn't support addressModeKindRegScaledExtended and - // addressModeKindRegScaled since collectAddends doesn't take ssa.OpcodeIshl into account. This should be fixed - // to support more efficient address resolution. 
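- //
- // The pointer is decomposed into 32-bit addends, 64-bit addends, and a static offset, and the most compact
- // addressing mode covering them is chosen; leftovers are materialized with explicit add instructions.
- // For example (illustrative, not from the original sources): p = Iadd(base64, UExtend(idx32)) with
- // offsetBase=8 lowers to the register-extended mode [base64+8, idx32, UXTW], where base64+8 is computed by
- // an extra add.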
- - a32s, a64s, offset := m.collectAddends(ptr) - offset += int64(offsetBase) - return m.lowerToAddressModeFromAddends(a32s, a64s, size, offset) -} - -// lowerToAddressModeFromAddends creates an addressMode from a list of addends collected by collectAddends. -// During the construction, this might emit additional instructions. -// -// Extracted as a separate function for easy testing. -func (m *machine) lowerToAddressModeFromAddends(a32s *wazevoapi.Queue[addend32], a64s *wazevoapi.Queue[regalloc.VReg], size byte, offset int64) (amode *addressMode) { - amode = m.amodePool.Allocate() - switch a64sExist, a32sExist := !a64s.Empty(), !a32s.Empty(); { - case a64sExist && a32sExist: - var base regalloc.VReg - base = a64s.Dequeue() - var a32 addend32 - a32 = a32s.Dequeue() - *amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: a32.r, extOp: a32.ext} - case a64sExist && offsetFitsInAddressModeKindRegUnsignedImm12(size, offset): - var base regalloc.VReg - base = a64s.Dequeue() - *amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: offset} - offset = 0 - case a64sExist && offsetFitsInAddressModeKindRegSignedImm9(offset): - var base regalloc.VReg - base = a64s.Dequeue() - *amode = addressMode{kind: addressModeKindRegSignedImm9, rn: base, imm: offset} - offset = 0 - case a64sExist: - var base regalloc.VReg - base = a64s.Dequeue() - if !a64s.Empty() { - index := a64s.Dequeue() - *amode = addressMode{kind: addressModeKindRegReg, rn: base, rm: index, extOp: extendOpUXTX /* indicates index reg is 64-bit */} - } else { - *amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0} - } - case a32sExist: - base32 := a32s.Dequeue() - - // First we need 64-bit base. - base := m.compiler.AllocateVReg(ssa.TypeI64) - baseExt := m.allocateInstr() - var signed bool - if base32.ext == extendOpSXTW { - signed = true - } - baseExt.asExtend(base, base32.r, 32, 64, signed) - m.insert(baseExt) - - if !a32s.Empty() { - index := a32s.Dequeue() - *amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: index.r, extOp: index.ext} - } else { - *amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0} - } - default: // Only static offsets. - tmpReg := m.compiler.AllocateVReg(ssa.TypeI64) - m.lowerConstantI64(tmpReg, offset) - *amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpReg, imm: 0} - offset = 0 - } - - baseReg := amode.rn - if offset > 0 { - baseReg = m.addConstToReg64(baseReg, offset) // baseReg += offset - } - - for !a64s.Empty() { - a64 := a64s.Dequeue() - baseReg = m.addReg64ToReg64(baseReg, a64) // baseReg += a64 - } - - for !a32s.Empty() { - a32 := a32s.Dequeue() - baseReg = m.addRegToReg64Ext(baseReg, a32.r, a32.ext) // baseReg += (a32 extended to 64-bit) - } - amode.rn = baseReg - return -} - -var addendsMatchOpcodes = [4]ssa.Opcode{ssa.OpcodeUExtend, ssa.OpcodeSExtend, ssa.OpcodeIadd, ssa.OpcodeIconst} - -func (m *machine) collectAddends(ptr ssa.Value) (addends32 *wazevoapi.Queue[addend32], addends64 *wazevoapi.Queue[regalloc.VReg], offset int64) { - m.addendsWorkQueue.Reset() - m.addends32.Reset() - m.addends64.Reset() - m.addendsWorkQueue.Enqueue(ptr) - - for !m.addendsWorkQueue.Empty() { - v := m.addendsWorkQueue.Dequeue() - - def := m.compiler.ValueDefinition(v) - switch op := m.compiler.MatchInstrOneOf(def, addendsMatchOpcodes[:]); op { - case ssa.OpcodeIadd: - // If the addend is an add, we recursively collect its operands. 
- x, y := def.Instr.Arg2() - m.addendsWorkQueue.Enqueue(x) - m.addendsWorkQueue.Enqueue(y) - def.Instr.MarkLowered() - case ssa.OpcodeIconst: - // If the addend is constant, we just statically merge it into the offset. - ic := def.Instr - u64 := ic.ConstantVal() - if ic.Return().Type().Bits() == 32 { - offset += int64(int32(u64)) // sign-extend. - } else { - offset += int64(u64) - } - def.Instr.MarkLowered() - case ssa.OpcodeUExtend, ssa.OpcodeSExtend: - input := def.Instr.Arg() - if input.Type().Bits() != 32 { - panic("illegal size: " + input.Type().String()) - } - - var ext extendOp - if op == ssa.OpcodeUExtend { - ext = extendOpUXTW - } else { - ext = extendOpSXTW - } - - inputDef := m.compiler.ValueDefinition(input) - constInst := inputDef.IsFromInstr() && inputDef.Instr.Constant() - switch { - case constInst && ext == extendOpUXTW: - // Zero-extension of a 32-bit constant can be merged into the offset. - offset += int64(uint32(inputDef.Instr.ConstantVal())) - case constInst && ext == extendOpSXTW: - // Sign-extension of a 32-bit constant can be merged into the offset. - offset += int64(int32(inputDef.Instr.ConstantVal())) // sign-extend! - default: - m.addends32.Enqueue(addend32{r: m.getOperand_NR(inputDef, extModeNone).nr(), ext: ext}) - } - def.Instr.MarkLowered() - continue - default: - // If the addend is not one of them, we simply use it as-is (without merging!), optionally zero-extending it. - m.addends64.Enqueue(m.getOperand_NR(def, extModeZeroExtend64 /* optional zero ext */).nr()) - } - } - return &m.addends32, &m.addends64, offset -} - -func (m *machine) addConstToReg64(r regalloc.VReg, c int64) (rd regalloc.VReg) { - rd = m.compiler.AllocateVReg(ssa.TypeI64) - alu := m.allocateInstr() - if imm12Op, ok := asImm12Operand(uint64(c)); ok { - alu.asALU(aluOpAdd, rd, operandNR(r), imm12Op, true) - } else if imm12Op, ok = asImm12Operand(uint64(-c)); ok { - alu.asALU(aluOpSub, rd, operandNR(r), imm12Op, true) - } else { - tmp := m.compiler.AllocateVReg(ssa.TypeI64) - m.load64bitConst(c, tmp) - alu.asALU(aluOpAdd, rd, operandNR(r), operandNR(tmp), true) - } - m.insert(alu) - return -} - -func (m *machine) addReg64ToReg64(rn, rm regalloc.VReg) (rd regalloc.VReg) { - rd = m.compiler.AllocateVReg(ssa.TypeI64) - alu := m.allocateInstr() - alu.asALU(aluOpAdd, rd, operandNR(rn), operandNR(rm), true) - m.insert(alu) - return -} - -func (m *machine) addRegToReg64Ext(rn, rm regalloc.VReg, ext extendOp) (rd regalloc.VReg) { - rd = m.compiler.AllocateVReg(ssa.TypeI64) - alu := m.allocateInstr() - alu.asALU(aluOpAdd, rd, operandNR(rn), operandER(rm, ext, 64), true) - m.insert(alu) - return -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go deleted file mode 100644 index 00e6b238f..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go +++ /dev/null @@ -1,631 +0,0 @@ -package arm64 - -import ( - "context" - "fmt" - "math" - "strings" - - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" - "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" -) - -type ( - // machine implements backend.Machine. 
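- // A single machine instance is reused across function compilations; Reset clears all per-function state.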
- machine struct { - compiler backend.Compiler - currentABI *backend.FunctionABI - instrPool wazevoapi.Pool[instruction] - // labelPositionPool is the pool of labelPosition. The id is the label where - // if the label is less than the maxSSABlockID, it's the ssa.BasicBlockID. - labelPositionPool wazevoapi.IDedPool[labelPosition] - - // nextLabel is the next label to be allocated. The first free label comes after maxSSABlockID - // so that we can have an identical label for the SSA block ID, which is useful for debugging. - nextLabel label - // rootInstr is the first instruction of the function. - rootInstr *instruction - // currentLabelPos is the currently-compiled ssa.BasicBlock's labelPosition. - currentLabelPos *labelPosition - // orderedSSABlockLabelPos is the ordered list of labelPosition in the generated code for each ssa.BasicBlock. - orderedSSABlockLabelPos []*labelPosition - // returnLabelPos is the labelPosition for the return block. - returnLabelPos labelPosition - // perBlockHead and perBlockEnd are the head and tail of the instruction list per currently-compiled ssa.BasicBlock. - perBlockHead, perBlockEnd *instruction - // pendingInstructions are the instructions which are not yet emitted into the instruction list. - pendingInstructions []*instruction - // maxSSABlockID is the maximum ssa.BasicBlockID in the current function. - maxSSABlockID label - - regAlloc regalloc.Allocator[*instruction, *labelPosition, *regAllocFn] - regAllocFn regAllocFn - - amodePool wazevoapi.Pool[addressMode] - - // addendsWorkQueue is used during address lowering, defined here for reuse. - addendsWorkQueue wazevoapi.Queue[ssa.Value] - addends32 wazevoapi.Queue[addend32] - // addends64 is used during address lowering, defined here for reuse. - addends64 wazevoapi.Queue[regalloc.VReg] - unresolvedAddressModes []*instruction - - // condBrRelocs holds the conditional branches which need offset relocation. - condBrRelocs []condBrReloc - - // jmpTableTargets holds the labels of the jump table targets. - jmpTableTargets [][]uint32 - // jmpTableTargetNext is the index to the jmpTableTargets slice to be used for the next jump table. - jmpTableTargetsNext int - - // spillSlotSize is the size of the stack slot in bytes used for spilling registers. - // During the execution of the function, the stack looks like: - // - // - // (high address) - // +-----------------+ - // | ....... | - // | ret Y | - // | ....... | - // | ret 0 | - // | arg X | - // | ....... | - // | arg 1 | - // | arg 0 | - // | xxxxx | - // | ReturnAddress | - // +-----------------+ <<-| - // | ........... | | - // | spill slot M | | <--- spillSlotSize - // | ............ | | - // | spill slot 2 | | - // | spill slot 1 | <<-+ - // | clobbered N | - // | ........... | - // | clobbered 1 | - // | clobbered 0 | - // SP---> +-----------------+ - // (low address) - // - // and it represents the size of the space between FP and the first spilled slot. This must be a multiple of 16. - // Also note that this is only known after register allocation. - spillSlotSize int64 - spillSlots map[regalloc.VRegID]int64 // regalloc.VRegID to offset. - // clobberedRegs holds real-register backed VRegs saved at the function prologue, and restored at the epilogue. 
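- // They are reported by the register allocator via ClobberedRegisters.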
- clobberedRegs []regalloc.VReg - - maxRequiredStackSizeForCalls int64 - stackBoundsCheckDisabled bool - - regAllocStarted bool - } - - addend32 struct { - r regalloc.VReg - ext extendOp - } - - condBrReloc struct { - cbr *instruction - // currentLabelPos is the labelPosition within which condBr is defined. - currentLabelPos *labelPosition - // Next block's labelPosition. - nextLabel label - offset int64 - } -) - -type ( - // label represents a position in the generated code which is either - // a real instruction or the constant InstructionPool (e.g. jump tables). - // - // This is exactly the same as the traditional "label" in assembly code. - label uint32 - - // labelPosition represents the regions of the generated code which the label represents. - // This implements regalloc.Block. - labelPosition struct { - // sb is not nil if this corresponds to a ssa.BasicBlock. - sb ssa.BasicBlock - // cur is used to walk through the instructions in the block during the register allocation. - cur, - // begin and end are the first and last instructions of the block. - begin, end *instruction - // binaryOffset is the offset in the binary where the label is located. - binaryOffset int64 - } -) - -const ( - labelReturn label = math.MaxUint32 - labelInvalid = labelReturn - 1 -) - -// String implements backend.Machine. -func (l label) String() string { - return fmt.Sprintf("L%d", l) -} - -func resetLabelPosition(l *labelPosition) { - *l = labelPosition{} -} - -// NewBackend returns a new backend for arm64. -func NewBackend() backend.Machine { - m := &machine{ - spillSlots: make(map[regalloc.VRegID]int64), - regAlloc: regalloc.NewAllocator[*instruction, *labelPosition, *regAllocFn](regInfo), - amodePool: wazevoapi.NewPool[addressMode](resetAddressMode), - instrPool: wazevoapi.NewPool[instruction](resetInstruction), - labelPositionPool: wazevoapi.NewIDedPool[labelPosition](resetLabelPosition), - } - m.regAllocFn.m = m - return m -} - -func ssaBlockLabel(sb ssa.BasicBlock) label { - if sb.ReturnBlock() { - return labelReturn - } - return label(sb.ID()) -} - -// getOrAllocateSSABlockLabelPosition returns the labelPosition for the given basic block. -func (m *machine) getOrAllocateSSABlockLabelPosition(sb ssa.BasicBlock) *labelPosition { - if sb.ReturnBlock() { - m.returnLabelPos.sb = sb - return &m.returnLabelPos - } - - l := ssaBlockLabel(sb) - pos := m.labelPositionPool.GetOrAllocate(int(l)) - pos.sb = sb - return pos -} - -// LinkAdjacentBlocks implements backend.Machine. -func (m *machine) LinkAdjacentBlocks(prev, next ssa.BasicBlock) { - prevPos, nextPos := m.getOrAllocateSSABlockLabelPosition(prev), m.getOrAllocateSSABlockLabelPosition(next) - prevPos.end.next = nextPos.begin -} - -// StartBlock implements backend.Machine. -func (m *machine) StartBlock(blk ssa.BasicBlock) { - m.currentLabelPos = m.getOrAllocateSSABlockLabelPosition(blk) - labelPos := m.currentLabelPos - end := m.allocateNop() - m.perBlockHead, m.perBlockEnd = end, end - labelPos.begin, labelPos.end = end, end - m.orderedSSABlockLabelPos = append(m.orderedSSABlockLabelPos, labelPos) -} - -// EndBlock implements ExecutableContext. -func (m *machine) EndBlock() { - // Insert nop0 as the head of the block for convenience to simplify the logic of inserting instructions. 
- m.insertAtPerBlockHead(m.allocateNop()) - - m.currentLabelPos.begin = m.perBlockHead - - if m.currentLabelPos.sb.EntryBlock() { - m.rootInstr = m.perBlockHead - } -} - -func (m *machine) insertAtPerBlockHead(i *instruction) { - if m.perBlockHead == nil { - m.perBlockHead = i - m.perBlockEnd = i - return - } - - i.next = m.perBlockHead - m.perBlockHead.prev = i - m.perBlockHead = i -} - -// FlushPendingInstructions implements backend.Machine. -func (m *machine) FlushPendingInstructions() { - l := len(m.pendingInstructions) - if l == 0 { - return - } - for i := l - 1; i >= 0; i-- { // reverse because we lower instructions in reverse order. - m.insertAtPerBlockHead(m.pendingInstructions[i]) - } - m.pendingInstructions = m.pendingInstructions[:0] -} - -// RegAlloc implements backend.Machine Function. -func (m *machine) RegAlloc() { - m.regAllocStarted = true - m.regAlloc.DoAllocation(&m.regAllocFn) - // Now that we know the final spill slot size, we must align spillSlotSize to 16 bytes. - m.spillSlotSize = (m.spillSlotSize + 15) &^ 15 -} - -// Reset implements backend.Machine. -func (m *machine) Reset() { - m.clobberedRegs = m.clobberedRegs[:0] - for key := range m.spillSlots { - m.clobberedRegs = append(m.clobberedRegs, regalloc.VReg(key)) - } - for _, key := range m.clobberedRegs { - delete(m.spillSlots, regalloc.VRegID(key)) - } - m.clobberedRegs = m.clobberedRegs[:0] - m.regAllocStarted = false - m.regAlloc.Reset() - m.spillSlotSize = 0 - m.unresolvedAddressModes = m.unresolvedAddressModes[:0] - m.maxRequiredStackSizeForCalls = 0 - m.jmpTableTargetsNext = 0 - m.amodePool.Reset() - m.instrPool.Reset() - m.labelPositionPool.Reset() - m.pendingInstructions = m.pendingInstructions[:0] - m.perBlockHead, m.perBlockEnd, m.rootInstr = nil, nil, nil - m.orderedSSABlockLabelPos = m.orderedSSABlockLabelPos[:0] -} - -// StartLoweringFunction implements backend.Machine StartLoweringFunction. -func (m *machine) StartLoweringFunction(maxBlockID ssa.BasicBlockID) { - m.maxSSABlockID = label(maxBlockID) - m.nextLabel = label(maxBlockID) + 1 -} - -// SetCurrentABI implements backend.Machine SetCurrentABI. -func (m *machine) SetCurrentABI(abi *backend.FunctionABI) { - m.currentABI = abi -} - -// DisableStackCheck implements backend.Machine DisableStackCheck. -func (m *machine) DisableStackCheck() { - m.stackBoundsCheckDisabled = true -} - -// SetCompiler implements backend.Machine. -func (m *machine) SetCompiler(ctx backend.Compiler) { - m.compiler = ctx - m.regAllocFn.ssaB = ctx.SSABuilder() -} - -func (m *machine) insert(i *instruction) { - m.pendingInstructions = append(m.pendingInstructions, i) -} - -func (m *machine) insertBrTargetLabel() label { - nop, l := m.allocateBrTarget() - m.insert(nop) - return l -} - -func (m *machine) allocateBrTarget() (nop *instruction, l label) { - l = m.nextLabel - m.nextLabel++ - nop = m.allocateInstr() - nop.asNop0WithLabel(l) - pos := m.labelPositionPool.GetOrAllocate(int(l)) - pos.begin, pos.end = nop, nop - return -} - -// allocateInstr allocates an instruction. 
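-// Instructions allocated before register allocation begins are flagged with addedBeforeRegAlloc so that the
-// allocator's instruction iterators visit only them and skip anything inserted afterwards.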
-func (m *machine) allocateInstr() *instruction { - instr := m.instrPool.Allocate() - if !m.regAllocStarted { - instr.addedBeforeRegAlloc = true - } - return instr -} - -func resetInstruction(i *instruction) { - *i = instruction{} -} - -func (m *machine) allocateNop() *instruction { - instr := m.allocateInstr() - instr.asNop0() - return instr -} - -func (m *machine) resolveAddressingMode(arg0offset, ret0offset int64, i *instruction) { - amode := i.getAmode() - switch amode.kind { - case addressModeKindResultStackSpace: - amode.imm += ret0offset - case addressModeKindArgStackSpace: - amode.imm += arg0offset - default: - panic("BUG") - } - - var sizeInBits byte - switch i.kind { - case store8, uLoad8: - sizeInBits = 8 - case store16, uLoad16: - sizeInBits = 16 - case store32, fpuStore32, uLoad32, fpuLoad32: - sizeInBits = 32 - case store64, fpuStore64, uLoad64, fpuLoad64: - sizeInBits = 64 - case fpuStore128, fpuLoad128: - sizeInBits = 128 - default: - panic("BUG") - } - - if offsetFitsInAddressModeKindRegUnsignedImm12(sizeInBits, amode.imm) { - amode.kind = addressModeKindRegUnsignedImm12 - } else { - // This case, we load the offset into the temporary register, - // and then use it as the index register. - newPrev := m.lowerConstantI64AndInsert(i.prev, tmpRegVReg, amode.imm) - linkInstr(newPrev, i) - *amode = addressMode{kind: addressModeKindRegReg, rn: amode.rn, rm: tmpRegVReg, extOp: extendOpUXTX /* indicates rm reg is 64-bit */} - } -} - -// resolveRelativeAddresses resolves the relative addresses before encoding. -func (m *machine) resolveRelativeAddresses(ctx context.Context) { - for { - if len(m.unresolvedAddressModes) > 0 { - arg0offset, ret0offset := m.arg0OffsetFromSP(), m.ret0OffsetFromSP() - for _, i := range m.unresolvedAddressModes { - m.resolveAddressingMode(arg0offset, ret0offset, i) - } - } - - // Reuse the slice to gather the unresolved conditional branches. - m.condBrRelocs = m.condBrRelocs[:0] - - var fn string - var fnIndex int - var labelPosToLabel map[*labelPosition]label - if wazevoapi.PerfMapEnabled { - labelPosToLabel = make(map[*labelPosition]label) - for i := 0; i <= m.labelPositionPool.MaxIDEncountered(); i++ { - labelPosToLabel[m.labelPositionPool.Get(i)] = label(i) - } - - fn = wazevoapi.GetCurrentFunctionName(ctx) - fnIndex = wazevoapi.GetCurrentFunctionIndex(ctx) - } - - // Next, in order to determine the offsets of relative jumps, we have to calculate the size of each label. - var offset int64 - for i, pos := range m.orderedSSABlockLabelPos { - pos.binaryOffset = offset - var size int64 - for cur := pos.begin; ; cur = cur.next { - switch cur.kind { - case nop0: - l := cur.nop0Label() - if pos := m.labelPositionPool.Get(int(l)); pos != nil { - pos.binaryOffset = offset + size - } - case condBr: - if !cur.condBrOffsetResolved() { - var nextLabel label - if i < len(m.orderedSSABlockLabelPos)-1 { - // Note: this is only used when the block ends with fallthrough, - // therefore can be safely assumed that the next block exists when it's needed. 
- nextLabel = ssaBlockLabel(m.orderedSSABlockLabelPos[i+1].sb) - } - m.condBrRelocs = append(m.condBrRelocs, condBrReloc{ - cbr: cur, currentLabelPos: pos, offset: offset + size, - nextLabel: nextLabel, - }) - } - } - size += cur.size() - if cur == pos.end { - break - } - } - - if wazevoapi.PerfMapEnabled { - if size > 0 { - wazevoapi.PerfMap.AddModuleEntry(fnIndex, offset, uint64(size), fmt.Sprintf("%s:::::%s", fn, labelPosToLabel[pos])) - } - } - offset += size - } - - // Before resolving any offsets, we need to check if all the conditional branches can be resolved. - var needRerun bool - for i := range m.condBrRelocs { - reloc := &m.condBrRelocs[i] - cbr := reloc.cbr - offset := reloc.offset - - target := cbr.condBrLabel() - offsetOfTarget := m.labelPositionPool.Get(int(target)).binaryOffset - diff := offsetOfTarget - offset - if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 { - // This case the conditional branch is too huge. We place the trampoline instructions at the end of the current block, - // and jump to it. - m.insertConditionalJumpTrampoline(cbr, reloc.currentLabelPos, reloc.nextLabel) - // Then, we need to recall this function to fix up the label offsets - // as they have changed after the trampoline is inserted. - needRerun = true - } - } - if needRerun { - if wazevoapi.PerfMapEnabled { - wazevoapi.PerfMap.Clear() - } - } else { - break - } - } - - var currentOffset int64 - for cur := m.rootInstr; cur != nil; cur = cur.next { - switch cur.kind { - case br: - target := cur.brLabel() - offsetOfTarget := m.labelPositionPool.Get(int(target)).binaryOffset - diff := offsetOfTarget - currentOffset - divided := diff >> 2 - if divided < minSignedInt26 || divided > maxSignedInt26 { - // This means the currently compiled single function is extremely large. - panic("too large function that requires branch relocation of large unconditional branch larger than 26-bit range") - } - cur.brOffsetResolve(diff) - case condBr: - if !cur.condBrOffsetResolved() { - target := cur.condBrLabel() - offsetOfTarget := m.labelPositionPool.Get(int(target)).binaryOffset - diff := offsetOfTarget - currentOffset - if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 { - panic("BUG: branch relocation for large conditional branch larger than 19-bit range must be handled properly") - } - cur.condBrOffsetResolve(diff) - } - case brTableSequence: - tableIndex := cur.u1 - targets := m.jmpTableTargets[tableIndex] - for i := range targets { - l := label(targets[i]) - offsetOfTarget := m.labelPositionPool.Get(int(l)).binaryOffset - diff := offsetOfTarget - (currentOffset + brTableSequenceOffsetTableBegin) - targets[i] = uint32(diff) - } - cur.brTableSequenceOffsetsResolved() - case emitSourceOffsetInfo: - m.compiler.AddSourceOffsetInfo(currentOffset, cur.sourceOffsetInfo()) - } - currentOffset += cur.size() - } -} - -const ( - maxSignedInt26 = 1<<25 - 1 - minSignedInt26 = -(1 << 25) - - maxSignedInt19 = 1<<18 - 1 - minSignedInt19 = -(1 << 18) -) - -func (m *machine) insertConditionalJumpTrampoline(cbr *instruction, currentBlk *labelPosition, nextLabel label) { - cur := currentBlk.end - originalTarget := cbr.condBrLabel() - endNext := cur.next - - if cur.kind != br { - // If the current block ends with a conditional branch, we can just insert the trampoline after it. - // Otherwise, we need to insert "skip" instruction to skip the trampoline instructions. 
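- // Here there is no trailing unconditional br, so we branch over the trampoline to the next label in order
- // to preserve the fall-through behavior.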
- skip := m.allocateInstr() - skip.asBr(nextLabel) - cur = linkInstr(cur, skip) - } - - cbrNewTargetInstr, cbrNewTargetLabel := m.allocateBrTarget() - cbr.setCondBrTargets(cbrNewTargetLabel) - cur = linkInstr(cur, cbrNewTargetInstr) - - // Then insert the unconditional branch to the original, which should be possible to get encoded - // as 26-bit offset should be enough for any practical application. - br := m.allocateInstr() - br.asBr(originalTarget) - cur = linkInstr(cur, br) - - // Update the end of the current block. - currentBlk.end = cur - - linkInstr(cur, endNext) -} - -// Format implements backend.Machine. -func (m *machine) Format() string { - begins := map[*instruction]label{} - for l := label(0); l < m.nextLabel; l++ { - pos := m.labelPositionPool.Get(int(l)) - if pos != nil { - begins[pos.begin] = l - } - } - - var lines []string - for cur := m.rootInstr; cur != nil; cur = cur.next { - if l, ok := begins[cur]; ok { - var labelStr string - if l <= m.maxSSABlockID { - labelStr = fmt.Sprintf("%s (SSA Block: blk%d):", l, int(l)) - } else { - labelStr = fmt.Sprintf("%s:", l) - } - lines = append(lines, labelStr) - } - if cur.kind == nop0 { - continue - } - lines = append(lines, "\t"+cur.String()) - } - return "\n" + strings.Join(lines, "\n") + "\n" -} - -// InsertReturn implements backend.Machine. -func (m *machine) InsertReturn() { - i := m.allocateInstr() - i.asRet() - m.insert(i) -} - -func (m *machine) getVRegSpillSlotOffsetFromSP(id regalloc.VRegID, size byte) int64 { - offset, ok := m.spillSlots[id] - if !ok { - offset = m.spillSlotSize - // TODO: this should be aligned depending on the `size` to use Imm12 offset load/store as much as possible. - m.spillSlots[id] = offset - m.spillSlotSize += int64(size) - } - return offset + 16 // spill slot starts above the clobbered registers and the frame size. -} - -func (m *machine) clobberedRegSlotSize() int64 { - return int64(len(m.clobberedRegs) * 16) -} - -func (m *machine) arg0OffsetFromSP() int64 { - return m.frameSize() + - 16 + // 16-byte aligned return address - 16 // frame size saved below the clobbered registers. -} - -func (m *machine) ret0OffsetFromSP() int64 { - return m.arg0OffsetFromSP() + m.currentABI.ArgStackSize -} - -func (m *machine) requiredStackSize() int64 { - return m.maxRequiredStackSizeForCalls + - m.frameSize() + - 16 + // 16-byte aligned return address. - 16 // frame size saved below the clobbered registers. 
-} - -func (m *machine) frameSize() int64 { - s := m.clobberedRegSlotSize() + m.spillSlotSize - if s&0xf != 0 { - panic(fmt.Errorf("BUG: frame size %d is not 16-byte aligned", s)) - } - return s -} - -func (m *machine) addJmpTableTarget(targets ssa.Values) (index int) { - if m.jmpTableTargetsNext == len(m.jmpTableTargets) { - m.jmpTableTargets = append(m.jmpTableTargets, make([]uint32, 0, len(targets.View()))) - } - - index = m.jmpTableTargetsNext - m.jmpTableTargetsNext++ - m.jmpTableTargets[index] = m.jmpTableTargets[index][:0] - for _, targetBlockID := range targets.View() { - target := m.compiler.SSABuilder().BasicBlock(ssa.BasicBlockID(targetBlockID)) - m.jmpTableTargets[index] = append(m.jmpTableTargets[index], uint32(target.ID())) - } - return -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go deleted file mode 100644 index c646a8fab..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go +++ /dev/null @@ -1,467 +0,0 @@ -package arm64 - -import ( - "fmt" - - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" -) - -// PostRegAlloc implements backend.Machine. -func (m *machine) PostRegAlloc() { - m.setupPrologue() - m.postRegAlloc() -} - -// setupPrologue initializes the prologue of the function. -func (m *machine) setupPrologue() { - cur := m.rootInstr - prevInitInst := cur.next - - // - // (high address) (high address) - // SP----> +-----------------+ +------------------+ <----+ - // | ....... | | ....... | | - // | ret Y | | ret Y | | - // | ....... | | ....... | | - // | ret 0 | | ret 0 | | - // | arg X | | arg X | | size_of_arg_ret. - // | ....... | ====> | ....... | | - // | arg 1 | | arg 1 | | - // | arg 0 | | arg 0 | <----+ - // |-----------------| | size_of_arg_ret | - // | return address | - // +------------------+ <---- SP - // (low address) (low address) - - // Saves the return address (lr) and the size_of_arg_ret below the SP. - // size_of_arg_ret is used for stack unwinding. - cur = m.createReturnAddrAndSizeOfArgRetSlot(cur) - - if !m.stackBoundsCheckDisabled { - cur = m.insertStackBoundsCheck(m.requiredStackSize(), cur) - } - - // Decrement SP if spillSlotSize > 0. - if m.spillSlotSize == 0 && len(m.spillSlots) != 0 { - panic(fmt.Sprintf("BUG: spillSlotSize=%d, spillSlots=%v\n", m.spillSlotSize, m.spillSlots)) - } - - if regs := m.clobberedRegs; len(regs) > 0 { - // - // (high address) (high address) - // +-----------------+ +-----------------+ - // | ....... | | ....... | - // | ret Y | | ret Y | - // | ....... | | ....... | - // | ret 0 | | ret 0 | - // | arg X | | arg X | - // | ....... | | ....... | - // | arg 1 | | arg 1 | - // | arg 0 | | arg 0 | - // | size_of_arg_ret | | size_of_arg_ret | - // | ReturnAddress | | ReturnAddress | - // SP----> +-----------------+ ====> +-----------------+ - // (low address) | clobbered M | - // | ............ | - // | clobbered 0 | - // +-----------------+ <----- SP - // (low address) - // - _amode := addressModePreOrPostIndex(m, spVReg, - -16, // stack pointer must be 16-byte aligned. - true, // Decrement before store. - ) - for _, vr := range regs { - // TODO: pair stores to reduce the number of instructions. 
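- // Each clobbered register currently gets its own 16-byte pre-indexed slot, keeping SP 16-byte aligned.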
- store := m.allocateInstr() - store.asStore(operandNR(vr), _amode, regTypeToRegisterSizeInBits(vr.RegType())) - cur = linkInstr(cur, store) - } - } - - if size := m.spillSlotSize; size > 0 { - // Check if size is 16-byte aligned. - if size&0xf != 0 { - panic(fmt.Errorf("BUG: spill slot size %d is not 16-byte aligned", size)) - } - - cur = m.addsAddOrSubStackPointer(cur, spVReg, size, false) - - // At this point, the stack looks like: - // - // (high address) - // +------------------+ - // | ....... | - // | ret Y | - // | ....... | - // | ret 0 | - // | arg X | - // | ....... | - // | arg 1 | - // | arg 0 | - // | size_of_arg_ret | - // | ReturnAddress | - // +------------------+ - // | clobbered M | - // | ............ | - // | clobbered 0 | - // | spill slot N | - // | ............ | - // | spill slot 2 | - // | spill slot 0 | - // SP----> +------------------+ - // (low address) - } - - // We push the frame size into the stack to make it possible to unwind stack: - // - // - // (high address) (high address) - // +-----------------+ +-----------------+ - // | ....... | | ....... | - // | ret Y | | ret Y | - // | ....... | | ....... | - // | ret 0 | | ret 0 | - // | arg X | | arg X | - // | ....... | | ....... | - // | arg 1 | | arg 1 | - // | arg 0 | | arg 0 | - // | size_of_arg_ret | | size_of_arg_ret | - // | ReturnAddress | | ReturnAddress | - // +-----------------+ ==> +-----------------+ <----+ - // | clobbered M | | clobbered M | | - // | ............ | | ............ | | - // | clobbered 2 | | clobbered 2 | | - // | clobbered 1 | | clobbered 1 | | frame size - // | clobbered 0 | | clobbered 0 | | - // | spill slot N | | spill slot N | | - // | ............ | | ............ | | - // | spill slot 0 | | spill slot 0 | <----+ - // SP---> +-----------------+ | xxxxxx | ;; unused space to make it 16-byte aligned. - // | frame_size | - // +-----------------+ <---- SP - // (low address) - // - cur = m.createFrameSizeSlot(cur, m.frameSize()) - - linkInstr(cur, prevInitInst) -} - -func (m *machine) createReturnAddrAndSizeOfArgRetSlot(cur *instruction) *instruction { - // First we decrement the stack pointer to point the arg0 slot. - var sizeOfArgRetReg regalloc.VReg - s := int64(m.currentABI.AlignedArgResultStackSlotSize()) - if s > 0 { - cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s) - sizeOfArgRetReg = tmpRegVReg - - subSp := m.allocateInstr() - subSp.asALU(aluOpSub, spVReg, operandNR(spVReg), operandNR(sizeOfArgRetReg), true) - cur = linkInstr(cur, subSp) - } else { - sizeOfArgRetReg = xzrVReg - } - - // Saves the return address (lr) and the size_of_arg_ret below the SP. - // size_of_arg_ret is used for stack unwinding. - pstr := m.allocateInstr() - amode := addressModePreOrPostIndex(m, spVReg, -16, true /* decrement before store */) - pstr.asStorePair64(lrVReg, sizeOfArgRetReg, amode) - cur = linkInstr(cur, pstr) - return cur -} - -func (m *machine) createFrameSizeSlot(cur *instruction, s int64) *instruction { - var frameSizeReg regalloc.VReg - if s > 0 { - cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s) - frameSizeReg = tmpRegVReg - } else { - frameSizeReg = xzrVReg - } - _amode := addressModePreOrPostIndex(m, spVReg, - -16, // stack pointer must be 16-byte aligned. - true, // Decrement before store. - ) - store := m.allocateInstr() - store.asStore(operandNR(frameSizeReg), _amode, 64) - cur = linkInstr(cur, store) - return cur -} - -// postRegAlloc does multiple things while walking through the instructions: -// 1. Removes the redundant copy instruction. -// 2. 
Inserts the epilogue. -func (m *machine) postRegAlloc() { - for cur := m.rootInstr; cur != nil; cur = cur.next { - switch cur.kind { - case ret: - m.setupEpilogueAfter(cur.prev) - case loadConstBlockArg: - lc := cur - next := lc.next - m.pendingInstructions = m.pendingInstructions[:0] - m.lowerLoadConstantBlockArgAfterRegAlloc(lc) - for _, instr := range m.pendingInstructions { - cur = linkInstr(cur, instr) - } - linkInstr(cur, next) - m.pendingInstructions = m.pendingInstructions[:0] - default: - // Removes the redundant copy instruction. - if cur.IsCopy() && cur.rn.realReg() == cur.rd.RealReg() { - prev, next := cur.prev, cur.next - // Remove the copy instruction. - prev.next = next - if next != nil { - next.prev = prev - } - } - } - } -} - -func (m *machine) setupEpilogueAfter(cur *instruction) { - prevNext := cur.next - - // We've stored the frame size in the prologue, and now that we are about to return from this function, we won't need it anymore. - cur = m.addsAddOrSubStackPointer(cur, spVReg, 16, true) - - if s := m.spillSlotSize; s > 0 { - // Adjust SP to the original value: - // - // (high address) (high address) - // +-----------------+ +-----------------+ - // | ....... | | ....... | - // | ret Y | | ret Y | - // | ....... | | ....... | - // | ret 0 | | ret 0 | - // | arg X | | arg X | - // | ....... | | ....... | - // | arg 1 | | arg 1 | - // | arg 0 | | arg 0 | - // | xxxxx | | xxxxx | - // | ReturnAddress | | ReturnAddress | - // +-----------------+ ====> +-----------------+ - // | clobbered M | | clobbered M | - // | ............ | | ............ | - // | clobbered 1 | | clobbered 1 | - // | clobbered 0 | | clobbered 0 | - // | spill slot N | +-----------------+ <---- SP - // | ............ | - // | spill slot 0 | - // SP---> +-----------------+ - // (low address) - // - cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true) - } - - // First we need to restore the clobbered registers. - if len(m.clobberedRegs) > 0 { - // (high address) - // +-----------------+ +-----------------+ - // | ....... | | ....... | - // | ret Y | | ret Y | - // | ....... | | ....... | - // | ret 0 | | ret 0 | - // | arg X | | arg X | - // | ....... | | ....... | - // | arg 1 | | arg 1 | - // | arg 0 | | arg 0 | - // | xxxxx | | xxxxx | - // | ReturnAddress | | ReturnAddress | - // +-----------------+ ========> +-----------------+ <---- SP - // | clobbered M | - // | ........... | - // | clobbered 1 | - // | clobbered 0 | - // SP---> +-----------------+ - // (low address) - - l := len(m.clobberedRegs) - 1 - for i := range m.clobberedRegs { - vr := m.clobberedRegs[l-i] // reverse order to restore. - load := m.allocateInstr() - amode := addressModePreOrPostIndex(m, spVReg, - 16, // stack pointer must be 16-byte aligned. - false, // Increment after store. - ) - // TODO: pair loads to reduce the number of instructions. - switch regTypeToRegisterSizeInBits(vr.RegType()) { - case 64: // save int reg. - load.asULoad(vr, amode, 64) - case 128: // save vector reg. - load.asFpuLoad(vr, amode, 128) - } - cur = linkInstr(cur, load) - } - } - - // Reload the return address (lr). - // - // +-----------------+ +-----------------+ - // | ....... | | ....... | - // | ret Y | | ret Y | - // | ....... | | ....... | - // | ret 0 | | ret 0 | - // | arg X | | arg X | - // | ....... | ===> | ....... 
| - // | arg 1 | | arg 1 | - // | arg 0 | | arg 0 | - // | xxxxx | +-----------------+ <---- SP - // | ReturnAddress | - // SP----> +-----------------+ - - ldr := m.allocateInstr() - ldr.asULoad(lrVReg, - addressModePreOrPostIndex(m, spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64) - cur = linkInstr(cur, ldr) - - if s := int64(m.currentABI.AlignedArgResultStackSlotSize()); s > 0 { - cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true) - } - - linkInstr(cur, prevNext) -} - -// saveRequiredRegs is the set of registers that must be saved/restored during growing stack when there's insufficient -// stack space left. Basically this is the combination of CalleeSavedRegisters plus argument registers execpt for x0, -// which always points to the execution context whenever the native code is entered from Go. -var saveRequiredRegs = []regalloc.VReg{ - x1VReg, x2VReg, x3VReg, x4VReg, x5VReg, x6VReg, x7VReg, - x19VReg, x20VReg, x21VReg, x22VReg, x23VReg, x24VReg, x25VReg, x26VReg, x28VReg, lrVReg, - v0VReg, v1VReg, v2VReg, v3VReg, v4VReg, v5VReg, v6VReg, v7VReg, - v18VReg, v19VReg, v20VReg, v21VReg, v22VReg, v23VReg, v24VReg, v25VReg, v26VReg, v27VReg, v28VReg, v29VReg, v30VReg, v31VReg, -} - -// insertStackBoundsCheck will insert the instructions after `cur` to check the -// stack bounds, and if there's no sufficient spaces required for the function, -// exit the execution and try growing it in Go world. -// -// TODO: we should be able to share the instructions across all the functions to reduce the size of compiled executable. -func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instruction) *instruction { - if requiredStackSize%16 != 0 { - panic("BUG") - } - - if immm12op, ok := asImm12Operand(uint64(requiredStackSize)); ok { - // sub tmp, sp, #requiredStackSize - sub := m.allocateInstr() - sub.asALU(aluOpSub, tmpRegVReg, operandNR(spVReg), immm12op, true) - cur = linkInstr(cur, sub) - } else { - // This case, we first load the requiredStackSize into the temporary register, - cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize) - // Then subtract it. - sub := m.allocateInstr() - sub.asALU(aluOpSub, tmpRegVReg, operandNR(spVReg), operandNR(tmpRegVReg), true) - cur = linkInstr(cur, sub) - } - - tmp2 := x11VReg // Caller save, so it is safe to use it here in the prologue. - - // ldr tmp2, [executionContext #StackBottomPtr] - ldr := m.allocateInstr() - amode := m.amodePool.Allocate() - *amode = addressMode{ - kind: addressModeKindRegUnsignedImm12, - rn: x0VReg, // execution context is always the first argument. - imm: wazevoapi.ExecutionContextOffsetStackBottomPtr.I64(), - } - ldr.asULoad(tmp2, amode, 64) - cur = linkInstr(cur, ldr) - - // subs xzr, tmp, tmp2 - subs := m.allocateInstr() - subs.asALU(aluOpSubS, xzrVReg, operandNR(tmpRegVReg), operandNR(tmp2), true) - cur = linkInstr(cur, subs) - - // b.ge #imm - cbr := m.allocateInstr() - cbr.asCondBr(ge.asCond(), labelInvalid, false /* ignored */) - cur = linkInstr(cur, cbr) - - // Set the required stack size and set it to the exec context. - { - // First load the requiredStackSize into the temporary register, - cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize) - setRequiredStackSize := m.allocateInstr() - amode := m.amodePool.Allocate() - *amode = addressMode{ - kind: addressModeKindRegUnsignedImm12, - // Execution context is always the first argument. 
- rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.I64(), - } - setRequiredStackSize.asStore(operandNR(tmpRegVReg), amode, 64) - - cur = linkInstr(cur, setRequiredStackSize) - } - - ldrAddress := m.allocateInstr() - amode2 := m.amodePool.Allocate() - *amode2 = addressMode{ - kind: addressModeKindRegUnsignedImm12, - rn: x0VReg, // execution context is always the first argument - imm: wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.I64(), - } - ldrAddress.asULoad(tmpRegVReg, amode2, 64) - cur = linkInstr(cur, ldrAddress) - - // Then jumps to the stack grow call sequence's address, meaning - // transferring the control to the code compiled by CompileStackGrowCallSequence. - bl := m.allocateInstr() - bl.asCallIndirect(tmpRegVReg, nil) - cur = linkInstr(cur, bl) - - // Now that we know the entire code, we can finalize how many bytes - // we have to skip when the stack size is sufficient. - var cbrOffset int64 - for _cur := cbr; ; _cur = _cur.next { - cbrOffset += _cur.size() - if _cur == cur { - break - } - } - cbr.condBrOffsetResolve(cbrOffset) - return cur -} - -// CompileStackGrowCallSequence implements backend.Machine. -func (m *machine) CompileStackGrowCallSequence() []byte { - cur := m.allocateInstr() - cur.asNop0() - m.rootInstr = cur - - // Save the callee saved and argument registers. - cur = m.saveRegistersInExecutionContext(cur, saveRequiredRegs) - - // Save the current stack pointer. - cur = m.saveCurrentStackPointer(cur, x0VReg) - - // Set the exit status on the execution context. - cur = m.setExitCode(cur, x0VReg, wazevoapi.ExitCodeGrowStack) - - // Exit the execution. - cur = m.storeReturnAddressAndExit(cur) - - // After the exit, restore the saved registers. - cur = m.restoreRegistersInExecutionContext(cur, saveRequiredRegs) - - // Then goes back the original address of this stack grow call. - ret := m.allocateInstr() - ret.asRet() - linkInstr(cur, ret) - - m.encode(m.rootInstr) - return m.compiler.Buf() -} - -func (m *machine) addsAddOrSubStackPointer(cur *instruction, rd regalloc.VReg, diff int64, add bool) *instruction { - m.pendingInstructions = m.pendingInstructions[:0] - m.insertAddOrSubStackPointer(rd, diff, add) - for _, inserted := range m.pendingInstructions { - cur = linkInstr(cur, inserted) - } - return cur -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go deleted file mode 100644 index f2ed53ae5..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go +++ /dev/null @@ -1,351 +0,0 @@ -package arm64 - -// This file implements the interfaces required for register allocations. See backend.RegAllocFunctionMachine. - -import ( - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" -) - -// regAllocFn implements regalloc.Function. -type regAllocFn struct { - ssaB ssa.Builder - m *machine - loopNestingForestRoots []ssa.BasicBlock - blockIter int -} - -// PostOrderBlockIteratorBegin implements regalloc.Function. -func (f *regAllocFn) PostOrderBlockIteratorBegin() *labelPosition { - f.blockIter = len(f.m.orderedSSABlockLabelPos) - 1 - return f.PostOrderBlockIteratorNext() -} - -// PostOrderBlockIteratorNext implements regalloc.Function. 
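-// Post order here is simply the reverse of the block layout order recorded in orderedSSABlockLabelPos.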
-func (f *regAllocFn) PostOrderBlockIteratorNext() *labelPosition { - if f.blockIter < 0 { - return nil - } - b := f.m.orderedSSABlockLabelPos[f.blockIter] - f.blockIter-- - return b -} - -// ReversePostOrderBlockIteratorBegin implements regalloc.Function. -func (f *regAllocFn) ReversePostOrderBlockIteratorBegin() *labelPosition { - f.blockIter = 0 - return f.ReversePostOrderBlockIteratorNext() -} - -// ReversePostOrderBlockIteratorNext implements regalloc.Function. -func (f *regAllocFn) ReversePostOrderBlockIteratorNext() *labelPosition { - if f.blockIter >= len(f.m.orderedSSABlockLabelPos) { - return nil - } - b := f.m.orderedSSABlockLabelPos[f.blockIter] - f.blockIter++ - return b -} - -// ClobberedRegisters implements regalloc.Function. -func (f *regAllocFn) ClobberedRegisters(regs []regalloc.VReg) { - f.m.clobberedRegs = append(f.m.clobberedRegs[:0], regs...) -} - -// LoopNestingForestRoots implements regalloc.Function. -func (f *regAllocFn) LoopNestingForestRoots() int { - f.loopNestingForestRoots = f.ssaB.LoopNestingForestRoots() - return len(f.loopNestingForestRoots) -} - -// LoopNestingForestRoot implements regalloc.Function. -func (f *regAllocFn) LoopNestingForestRoot(i int) *labelPosition { - root := f.loopNestingForestRoots[i] - pos := f.m.getOrAllocateSSABlockLabelPosition(root) - return pos -} - -// LowestCommonAncestor implements regalloc.Function. -func (f *regAllocFn) LowestCommonAncestor(blk1, blk2 *labelPosition) *labelPosition { - sb := f.ssaB.LowestCommonAncestor(blk1.sb, blk2.sb) - pos := f.m.getOrAllocateSSABlockLabelPosition(sb) - return pos -} - -// Idom implements regalloc.Function. -func (f *regAllocFn) Idom(blk *labelPosition) *labelPosition { - sb := f.ssaB.Idom(blk.sb) - pos := f.m.getOrAllocateSSABlockLabelPosition(sb) - return pos -} - -// SwapBefore implements regalloc.Function. -func (f *regAllocFn) SwapBefore(x1, x2, tmp regalloc.VReg, instr *instruction) { - f.m.swap(instr.prev, x1, x2, tmp) -} - -// StoreRegisterBefore implements regalloc.Function. -func (f *regAllocFn) StoreRegisterBefore(v regalloc.VReg, instr *instruction) { - m := f.m - m.insertStoreRegisterAt(v, instr, false) -} - -// StoreRegisterAfter implements regalloc.Function. -func (f *regAllocFn) StoreRegisterAfter(v regalloc.VReg, instr *instruction) { - m := f.m - m.insertStoreRegisterAt(v, instr, true) -} - -// ReloadRegisterBefore implements regalloc.Function. -func (f *regAllocFn) ReloadRegisterBefore(v regalloc.VReg, instr *instruction) { - m := f.m - m.insertReloadRegisterAt(v, instr, false) -} - -// ReloadRegisterAfter implements regalloc.Function. -func (f *regAllocFn) ReloadRegisterAfter(v regalloc.VReg, instr *instruction) { - m := f.m - m.insertReloadRegisterAt(v, instr, true) -} - -// InsertMoveBefore implements regalloc.Function. -func (f *regAllocFn) InsertMoveBefore(dst, src regalloc.VReg, instr *instruction) { - f.m.insertMoveBefore(dst, src, instr) -} - -// LoopNestingForestChild implements regalloc.Function. -func (f *regAllocFn) LoopNestingForestChild(pos *labelPosition, i int) *labelPosition { - childSB := pos.sb.LoopNestingForestChildren()[i] - return f.m.getOrAllocateSSABlockLabelPosition(childSB) -} - -// Succ implements regalloc.Block. -func (f *regAllocFn) Succ(pos *labelPosition, i int) *labelPosition { - succSB := pos.sb.Succ(i) - if succSB.ReturnBlock() { - return nil - } - return f.m.getOrAllocateSSABlockLabelPosition(succSB) -} - -// Pred implements regalloc.Block. 
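The PostOrderBlockIterator and ReversePostOrderBlockIterator methods above are plain cursor walks over a slice that is already stored in reverse post-order, so walking it backwards yields post-order. A self-contained sketch of the same pattern with a hypothetical block type:

package main

import "fmt"

// block is a stand-in for the per-block position type; order is assumed to be
// kept in reverse post-order, as orderedSSABlockLabelPos is.
type block struct{ id int32 }

type iter struct {
	order []*block
	cur   int
}

// postOrderBegin/postOrderNext walk the slice backwards, which yields
// post-order because the slice itself is in reverse post-order.
func (it *iter) postOrderBegin() *block {
	it.cur = len(it.order) - 1
	return it.postOrderNext()
}

func (it *iter) postOrderNext() *block {
	if it.cur < 0 {
		return nil
	}
	b := it.order[it.cur]
	it.cur--
	return b
}

func main() {
	it := &iter{order: []*block{{id: 0}, {id: 1}, {id: 2}}}
	for b := it.postOrderBegin(); b != nil; b = it.postOrderNext() {
		fmt.Println("visit blk", b.id) // prints blk 2, 1, 0
	}
}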
-func (f *regAllocFn) Pred(pos *labelPosition, i int) *labelPosition { - predSB := pos.sb.Pred(i) - return f.m.getOrAllocateSSABlockLabelPosition(predSB) -} - -// BlockParams implements regalloc.Function. -func (f *regAllocFn) BlockParams(pos *labelPosition, regs *[]regalloc.VReg) []regalloc.VReg { - c := f.m.compiler - *regs = (*regs)[:0] - for i := 0; i < pos.sb.Params(); i++ { - v := c.VRegOf(pos.sb.Param(i)) - *regs = append(*regs, v) - } - return *regs -} - -// ID implements regalloc.Block. -func (pos *labelPosition) ID() int32 { - return int32(pos.sb.ID()) -} - -// InstrIteratorBegin implements regalloc.Block. -func (pos *labelPosition) InstrIteratorBegin() *instruction { - ret := pos.begin - pos.cur = ret - return ret -} - -// InstrIteratorNext implements regalloc.Block. -func (pos *labelPosition) InstrIteratorNext() *instruction { - for { - if pos.cur == pos.end { - return nil - } - instr := pos.cur.next - pos.cur = instr - if instr == nil { - return nil - } else if instr.addedBeforeRegAlloc { - // Only concerned about the instruction added before regalloc. - return instr - } - } -} - -// InstrRevIteratorBegin implements regalloc.Block. -func (pos *labelPosition) InstrRevIteratorBegin() *instruction { - pos.cur = pos.end - return pos.cur -} - -// InstrRevIteratorNext implements regalloc.Block. -func (pos *labelPosition) InstrRevIteratorNext() *instruction { - for { - if pos.cur == pos.begin { - return nil - } - instr := pos.cur.prev - pos.cur = instr - if instr == nil { - return nil - } else if instr.addedBeforeRegAlloc { - // Only concerned about the instruction added before regalloc. - return instr - } - } -} - -// FirstInstr implements regalloc.Block. -func (pos *labelPosition) FirstInstr() *instruction { return pos.begin } - -// LastInstrForInsertion implements regalloc.Block. -func (pos *labelPosition) LastInstrForInsertion() *instruction { - return lastInstrForInsertion(pos.begin, pos.end) -} - -// Preds implements regalloc.Block. -func (pos *labelPosition) Preds() int { return pos.sb.Preds() } - -// Entry implements regalloc.Block. -func (pos *labelPosition) Entry() bool { return pos.sb.EntryBlock() } - -// Succs implements regalloc.Block. -func (pos *labelPosition) Succs() int { return pos.sb.Succs() } - -// LoopHeader implements regalloc.Block. -func (pos *labelPosition) LoopHeader() bool { return pos.sb.LoopHeader() } - -// LoopNestingForestChildren implements regalloc.Block. -func (pos *labelPosition) LoopNestingForestChildren() int { - return len(pos.sb.LoopNestingForestChildren()) -} - -func (m *machine) swap(cur *instruction, x1, x2, tmp regalloc.VReg) { - prevNext := cur.next - var mov1, mov2, mov3 *instruction - if x1.RegType() == regalloc.RegTypeInt { - if !tmp.Valid() { - tmp = tmpRegVReg - } - mov1 = m.allocateInstr().asMove64(tmp, x1) - mov2 = m.allocateInstr().asMove64(x1, x2) - mov3 = m.allocateInstr().asMove64(x2, tmp) - cur = linkInstr(cur, mov1) - cur = linkInstr(cur, mov2) - cur = linkInstr(cur, mov3) - linkInstr(cur, prevNext) - } else { - if !tmp.Valid() { - r2 := x2.RealReg() - // Temporarily spill x1 to stack. - cur = m.insertStoreRegisterAt(x1, cur, true).prev - // Then move x2 to x1. - cur = linkInstr(cur, m.allocateInstr().asFpuMov128(x1, x2)) - linkInstr(cur, prevNext) - // Then reload the original value on x1 from stack to r2. 
- m.insertReloadRegisterAt(x1.SetRealReg(r2), cur, true) - } else { - mov1 = m.allocateInstr().asFpuMov128(tmp, x1) - mov2 = m.allocateInstr().asFpuMov128(x1, x2) - mov3 = m.allocateInstr().asFpuMov128(x2, tmp) - cur = linkInstr(cur, mov1) - cur = linkInstr(cur, mov2) - cur = linkInstr(cur, mov3) - linkInstr(cur, prevNext) - } - } -} - -func (m *machine) insertMoveBefore(dst, src regalloc.VReg, instr *instruction) { - typ := src.RegType() - if typ != dst.RegType() { - panic("BUG: src and dst must have the same type") - } - - mov := m.allocateInstr() - if typ == regalloc.RegTypeInt { - mov.asMove64(dst, src) - } else { - mov.asFpuMov128(dst, src) - } - - cur := instr.prev - prevNext := cur.next - cur = linkInstr(cur, mov) - linkInstr(cur, prevNext) -} - -func (m *machine) insertStoreRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction { - if !v.IsRealReg() { - panic("BUG: VReg must be backed by real reg to be stored") - } - - typ := m.compiler.TypeOf(v) - - var prevNext, cur *instruction - if after { - cur, prevNext = instr, instr.next - } else { - cur, prevNext = instr.prev, instr - } - - offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size()) - var amode *addressMode - cur, amode = m.resolveAddressModeForOffsetAndInsert(cur, offsetFromSP, typ.Bits(), spVReg, true) - store := m.allocateInstr() - store.asStore(operandNR(v), amode, typ.Bits()) - - cur = linkInstr(cur, store) - return linkInstr(cur, prevNext) -} - -func (m *machine) insertReloadRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction { - if !v.IsRealReg() { - panic("BUG: VReg must be backed by real reg to be stored") - } - - typ := m.compiler.TypeOf(v) - - var prevNext, cur *instruction - if after { - cur, prevNext = instr, instr.next - } else { - cur, prevNext = instr.prev, instr - } - - offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size()) - var amode *addressMode - cur, amode = m.resolveAddressModeForOffsetAndInsert(cur, offsetFromSP, typ.Bits(), spVReg, true) - load := m.allocateInstr() - switch typ { - case ssa.TypeI32, ssa.TypeI64: - load.asULoad(v, amode, typ.Bits()) - case ssa.TypeF32, ssa.TypeF64: - load.asFpuLoad(v, amode, typ.Bits()) - case ssa.TypeV128: - load.asFpuLoad(v, amode, 128) - default: - panic("TODO") - } - - cur = linkInstr(cur, load) - return linkInstr(cur, prevNext) -} - -func lastInstrForInsertion(begin, end *instruction) *instruction { - cur := end - for cur.kind == nop0 { - cur = cur.prev - if cur == begin { - return end - } - } - switch cur.kind { - case br: - return cur - default: - return end - } -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_relocation.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_relocation.go deleted file mode 100644 index 932fe842b..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_relocation.go +++ /dev/null @@ -1,122 +0,0 @@ -package arm64 - -import ( - "encoding/binary" - "fmt" - "math" - "sort" - - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" -) - -const ( - // trampolineCallSize is the size of the trampoline instruction sequence for each function in an island. - trampolineCallSize = 4*4 + 4 // Four instructions + 32-bit immediate. - - // Unconditional branch offset is encoded as divided by 4 in imm26. 
- // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/BL--Branch-with-Link-?lang=en - - maxUnconditionalBranchOffset = maxSignedInt26 * 4 - minUnconditionalBranchOffset = minSignedInt26 * 4 - - // trampolineIslandInterval is the range of the trampoline island. - // Half of the range is used for the trampoline island, and the other half is used for the function. - trampolineIslandInterval = (maxUnconditionalBranchOffset - 1) / 2 - - // maxNumFunctions explicitly specifies the maximum number of functions that can be allowed in a single executable. - maxNumFunctions = trampolineIslandInterval >> 6 - - // maxFunctionExecutableSize is the maximum size of a function that can exist in a trampoline island. - // Conservatively set to 1/4 of the trampoline island interval. - maxFunctionExecutableSize = trampolineIslandInterval >> 2 -) - -// CallTrampolineIslandInfo implements backend.Machine CallTrampolineIslandInfo. -func (m *machine) CallTrampolineIslandInfo(numFunctions int) (interval, size int, err error) { - if numFunctions > maxNumFunctions { - return 0, 0, fmt.Errorf("too many functions: %d > %d", numFunctions, maxNumFunctions) - } - return trampolineIslandInterval, trampolineCallSize * numFunctions, nil -} - -// ResolveRelocations implements backend.Machine ResolveRelocations. -func (m *machine) ResolveRelocations( - refToBinaryOffset []int, - importedFns int, - executable []byte, - relocations []backend.RelocationInfo, - callTrampolineIslandOffsets []int, -) { - for _, islandOffset := range callTrampolineIslandOffsets { - encodeCallTrampolineIsland(refToBinaryOffset, importedFns, islandOffset, executable) - } - - for _, r := range relocations { - instrOffset := r.Offset - calleeFnOffset := refToBinaryOffset[r.FuncRef] - diff := int64(calleeFnOffset) - (instrOffset) - // Check if the diff is within the range of the branch instruction. - if diff < minUnconditionalBranchOffset || diff > maxUnconditionalBranchOffset { - // Find the near trampoline island from callTrampolineIslandOffsets. - islandOffset := searchTrampolineIsland(callTrampolineIslandOffsets, int(instrOffset)) - islandTargetOffset := islandOffset + trampolineCallSize*int(r.FuncRef) - diff = int64(islandTargetOffset) - (instrOffset) - if diff < minUnconditionalBranchOffset || diff > maxUnconditionalBranchOffset { - panic("BUG in trampoline placement") - } - } - binary.LittleEndian.PutUint32(executable[instrOffset:instrOffset+4], encodeUnconditionalBranch(true, diff)) - } -} - -// encodeCallTrampolineIsland encodes a trampoline island for the given functions. -// Each island consists of a trampoline instruction sequence for each function. -// Each trampoline instruction sequence consists of 4 instructions + 32-bit immediate. -func encodeCallTrampolineIsland(refToBinaryOffset []int, importedFns int, islandOffset int, executable []byte) { - // We skip the imported functions: they don't need trampolines - // and are not accounted for. - binaryOffsets := refToBinaryOffset[importedFns:] - - for i := 0; i < len(binaryOffsets); i++ { - trampolineOffset := islandOffset + trampolineCallSize*i - - fnOffset := binaryOffsets[i] - diff := fnOffset - (trampolineOffset + 16) - if diff > math.MaxInt32 || diff < math.MinInt32 { - // This case even amd64 can't handle. 4GB is too big. - panic("too big binary") - } - - // The tmpReg, tmpReg2 is safe to overwrite (in fact any caller-saved register is safe to use). 
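ResolveRelocations above only reroutes a call through a trampoline island when the direct displacement no longer fits in the branch's scaled 26-bit immediate. A standalone sketch of that range test, with the 26-bit limits written out locally (the package derives the same bounds from maxSignedInt26/minSignedInt26):

package main

import "fmt"

// A 26-bit signed immediate, scaled by 4, bounds the reach of an AArch64
// unconditional branch; these mirror the constants above.
const (
	maxSignedInt26 = 1<<25 - 1
	minSignedInt26 = -(1 << 25)

	maxBranchOffset = maxSignedInt26 * 4
	minBranchOffset = minSignedInt26 * 4
)

// needsTrampoline reports whether a call at instrOffset to calleeOffset is out
// of direct-branch range and must go through a trampoline island instead.
func needsTrampoline(instrOffset, calleeOffset int64) bool {
	diff := calleeOffset - instrOffset
	return diff < minBranchOffset || diff > maxBranchOffset
}

func main() {
	fmt.Println(needsTrampoline(0, 1<<20)) // false: ~1MiB is well within ±128MiB
	fmt.Println(needsTrampoline(0, 1<<28)) // true: 256MiB exceeds the reach
}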
- tmpReg, tmpReg2 := regNumberInEncoding[tmpRegVReg.RealReg()], regNumberInEncoding[x11] - - // adr tmpReg, PC+16: load the address of #diff into tmpReg. - binary.LittleEndian.PutUint32(executable[trampolineOffset:], encodeAdr(tmpReg, 16)) - // ldrsw tmpReg2, [tmpReg]: Load #diff into tmpReg2. - binary.LittleEndian.PutUint32(executable[trampolineOffset+4:], - encodeLoadOrStore(sLoad32, tmpReg2, addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpRegVReg})) - // add tmpReg, tmpReg2, tmpReg: add #diff to the address of #diff, getting the absolute address of the function. - binary.LittleEndian.PutUint32(executable[trampolineOffset+8:], - encodeAluRRR(aluOpAdd, tmpReg, tmpReg, tmpReg2, true, false)) - // br tmpReg: branch to the function without overwriting the link register. - binary.LittleEndian.PutUint32(executable[trampolineOffset+12:], encodeUnconditionalBranchReg(tmpReg, false)) - // #diff - binary.LittleEndian.PutUint32(executable[trampolineOffset+16:], uint32(diff)) - } -} - -// searchTrampolineIsland finds the nearest trampoline island from callTrampolineIslandOffsets. -// Note that even if the offset is in the middle of two islands, it returns the latter one. -// That is ok because the island is always placed in the middle of the range. -// -// precondition: callTrampolineIslandOffsets is sorted in ascending order. -func searchTrampolineIsland(callTrampolineIslandOffsets []int, offset int) int { - l := len(callTrampolineIslandOffsets) - n := sort.Search(l, func(i int) bool { - return callTrampolineIslandOffsets[i] >= offset - }) - if n == l { - n = l - 1 - } - return callTrampolineIslandOffsets[n] -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/reg.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/reg.go deleted file mode 100644 index 45737516d..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/reg.go +++ /dev/null @@ -1,397 +0,0 @@ -package arm64 - -import ( - "fmt" - "strconv" - "strings" - - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" -) - -// Arm64-specific registers. -// -// See https://developer.arm.com/documentation/dui0801/a/Overview-of-AArch64-state/Predeclared-core-register-names-in-AArch64-state - -const ( - // General purpose registers. Note that we do not distinguish wn and xn registers - // because they are the same from the perspective of register allocator, and - // the size can be determined by the type of the instruction. - - x0 = regalloc.RealRegInvalid + 1 + iota - x1 - x2 - x3 - x4 - x5 - x6 - x7 - x8 - x9 - x10 - x11 - x12 - x13 - x14 - x15 - x16 - x17 - x18 - x19 - x20 - x21 - x22 - x23 - x24 - x25 - x26 - x27 - x28 - x29 - x30 - - // Vector registers. Note that we do not distinguish vn and dn, ... registers - // because they are the same from the perspective of register allocator, and - // the size can be determined by the type of the instruction. 
- - v0 - v1 - v2 - v3 - v4 - v5 - v6 - v7 - v8 - v9 - v10 - v11 - v12 - v13 - v14 - v15 - v16 - v17 - v18 - v19 - v20 - v21 - v22 - v23 - v24 - v25 - v26 - v27 - v28 - v29 - v30 - v31 - - // Special registers - - xzr - sp - lr = x30 - fp = x29 - tmp = x27 -) - -var ( - x0VReg = regalloc.FromRealReg(x0, regalloc.RegTypeInt) - x1VReg = regalloc.FromRealReg(x1, regalloc.RegTypeInt) - x2VReg = regalloc.FromRealReg(x2, regalloc.RegTypeInt) - x3VReg = regalloc.FromRealReg(x3, regalloc.RegTypeInt) - x4VReg = regalloc.FromRealReg(x4, regalloc.RegTypeInt) - x5VReg = regalloc.FromRealReg(x5, regalloc.RegTypeInt) - x6VReg = regalloc.FromRealReg(x6, regalloc.RegTypeInt) - x7VReg = regalloc.FromRealReg(x7, regalloc.RegTypeInt) - x8VReg = regalloc.FromRealReg(x8, regalloc.RegTypeInt) - x9VReg = regalloc.FromRealReg(x9, regalloc.RegTypeInt) - x10VReg = regalloc.FromRealReg(x10, regalloc.RegTypeInt) - x11VReg = regalloc.FromRealReg(x11, regalloc.RegTypeInt) - x12VReg = regalloc.FromRealReg(x12, regalloc.RegTypeInt) - x13VReg = regalloc.FromRealReg(x13, regalloc.RegTypeInt) - x14VReg = regalloc.FromRealReg(x14, regalloc.RegTypeInt) - x15VReg = regalloc.FromRealReg(x15, regalloc.RegTypeInt) - x16VReg = regalloc.FromRealReg(x16, regalloc.RegTypeInt) - x17VReg = regalloc.FromRealReg(x17, regalloc.RegTypeInt) - x18VReg = regalloc.FromRealReg(x18, regalloc.RegTypeInt) - x19VReg = regalloc.FromRealReg(x19, regalloc.RegTypeInt) - x20VReg = regalloc.FromRealReg(x20, regalloc.RegTypeInt) - x21VReg = regalloc.FromRealReg(x21, regalloc.RegTypeInt) - x22VReg = regalloc.FromRealReg(x22, regalloc.RegTypeInt) - x23VReg = regalloc.FromRealReg(x23, regalloc.RegTypeInt) - x24VReg = regalloc.FromRealReg(x24, regalloc.RegTypeInt) - x25VReg = regalloc.FromRealReg(x25, regalloc.RegTypeInt) - x26VReg = regalloc.FromRealReg(x26, regalloc.RegTypeInt) - x27VReg = regalloc.FromRealReg(x27, regalloc.RegTypeInt) - x28VReg = regalloc.FromRealReg(x28, regalloc.RegTypeInt) - x29VReg = regalloc.FromRealReg(x29, regalloc.RegTypeInt) - x30VReg = regalloc.FromRealReg(x30, regalloc.RegTypeInt) - v0VReg = regalloc.FromRealReg(v0, regalloc.RegTypeFloat) - v1VReg = regalloc.FromRealReg(v1, regalloc.RegTypeFloat) - v2VReg = regalloc.FromRealReg(v2, regalloc.RegTypeFloat) - v3VReg = regalloc.FromRealReg(v3, regalloc.RegTypeFloat) - v4VReg = regalloc.FromRealReg(v4, regalloc.RegTypeFloat) - v5VReg = regalloc.FromRealReg(v5, regalloc.RegTypeFloat) - v6VReg = regalloc.FromRealReg(v6, regalloc.RegTypeFloat) - v7VReg = regalloc.FromRealReg(v7, regalloc.RegTypeFloat) - v8VReg = regalloc.FromRealReg(v8, regalloc.RegTypeFloat) - v9VReg = regalloc.FromRealReg(v9, regalloc.RegTypeFloat) - v10VReg = regalloc.FromRealReg(v10, regalloc.RegTypeFloat) - v11VReg = regalloc.FromRealReg(v11, regalloc.RegTypeFloat) - v12VReg = regalloc.FromRealReg(v12, regalloc.RegTypeFloat) - v13VReg = regalloc.FromRealReg(v13, regalloc.RegTypeFloat) - v14VReg = regalloc.FromRealReg(v14, regalloc.RegTypeFloat) - v15VReg = regalloc.FromRealReg(v15, regalloc.RegTypeFloat) - v16VReg = regalloc.FromRealReg(v16, regalloc.RegTypeFloat) - v17VReg = regalloc.FromRealReg(v17, regalloc.RegTypeFloat) - v18VReg = regalloc.FromRealReg(v18, regalloc.RegTypeFloat) - v19VReg = regalloc.FromRealReg(v19, regalloc.RegTypeFloat) - v20VReg = regalloc.FromRealReg(v20, regalloc.RegTypeFloat) - v21VReg = regalloc.FromRealReg(v21, regalloc.RegTypeFloat) - v22VReg = regalloc.FromRealReg(v22, regalloc.RegTypeFloat) - v23VReg = regalloc.FromRealReg(v23, regalloc.RegTypeFloat) - v24VReg = 
regalloc.FromRealReg(v24, regalloc.RegTypeFloat) - v25VReg = regalloc.FromRealReg(v25, regalloc.RegTypeFloat) - v26VReg = regalloc.FromRealReg(v26, regalloc.RegTypeFloat) - v27VReg = regalloc.FromRealReg(v27, regalloc.RegTypeFloat) - // lr (link register) holds the return address at the function entry. - lrVReg = x30VReg - // tmpReg is used to perform spill/load on large stack offsets, and load large constants. - // Therefore, be cautious to use this register in the middle of the compilation, especially before the register allocation. - // This is the same as golang/go, but it's only described in the source code: - // https://github.com/golang/go/blob/18e17e2cb12837ea2c8582ecdb0cc780f49a1aac/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go#L59 - // https://github.com/golang/go/blob/18e17e2cb12837ea2c8582ecdb0cc780f49a1aac/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go#L13-L15 - tmpRegVReg = regalloc.FromRealReg(tmp, regalloc.RegTypeInt) - v28VReg = regalloc.FromRealReg(v28, regalloc.RegTypeFloat) - v29VReg = regalloc.FromRealReg(v29, regalloc.RegTypeFloat) - v30VReg = regalloc.FromRealReg(v30, regalloc.RegTypeFloat) - v31VReg = regalloc.FromRealReg(v31, regalloc.RegTypeFloat) - xzrVReg = regalloc.FromRealReg(xzr, regalloc.RegTypeInt) - spVReg = regalloc.FromRealReg(sp, regalloc.RegTypeInt) - fpVReg = regalloc.FromRealReg(fp, regalloc.RegTypeInt) -) - -var regNames = [...]string{ - x0: "x0", - x1: "x1", - x2: "x2", - x3: "x3", - x4: "x4", - x5: "x5", - x6: "x6", - x7: "x7", - x8: "x8", - x9: "x9", - x10: "x10", - x11: "x11", - x12: "x12", - x13: "x13", - x14: "x14", - x15: "x15", - x16: "x16", - x17: "x17", - x18: "x18", - x19: "x19", - x20: "x20", - x21: "x21", - x22: "x22", - x23: "x23", - x24: "x24", - x25: "x25", - x26: "x26", - x27: "x27", - x28: "x28", - x29: "x29", - x30: "x30", - xzr: "xzr", - sp: "sp", - v0: "v0", - v1: "v1", - v2: "v2", - v3: "v3", - v4: "v4", - v5: "v5", - v6: "v6", - v7: "v7", - v8: "v8", - v9: "v9", - v10: "v10", - v11: "v11", - v12: "v12", - v13: "v13", - v14: "v14", - v15: "v15", - v16: "v16", - v17: "v17", - v18: "v18", - v19: "v19", - v20: "v20", - v21: "v21", - v22: "v22", - v23: "v23", - v24: "v24", - v25: "v25", - v26: "v26", - v27: "v27", - v28: "v28", - v29: "v29", - v30: "v30", - v31: "v31", -} - -func formatVRegSized(r regalloc.VReg, size byte) (ret string) { - if r.IsRealReg() { - ret = regNames[r.RealReg()] - switch ret[0] { - case 'x': - switch size { - case 32: - ret = strings.Replace(ret, "x", "w", 1) - case 64: - default: - panic("BUG: invalid register size: " + strconv.Itoa(int(size))) - } - case 'v': - switch size { - case 32: - ret = strings.Replace(ret, "v", "s", 1) - case 64: - ret = strings.Replace(ret, "v", "d", 1) - case 128: - ret = strings.Replace(ret, "v", "q", 1) - default: - panic("BUG: invalid register size") - } - } - } else { - switch r.RegType() { - case regalloc.RegTypeInt: - switch size { - case 32: - ret = fmt.Sprintf("w%d?", r.ID()) - case 64: - ret = fmt.Sprintf("x%d?", r.ID()) - default: - panic("BUG: invalid register size: " + strconv.Itoa(int(size))) - } - case regalloc.RegTypeFloat: - switch size { - case 32: - ret = fmt.Sprintf("s%d?", r.ID()) - case 64: - ret = fmt.Sprintf("d%d?", r.ID()) - case 128: - ret = fmt.Sprintf("q%d?", r.ID()) - default: - panic("BUG: invalid register size") - } - default: - panic(fmt.Sprintf("BUG: invalid register type: %d for %s", r.RegType(), r)) - } - } - return -} - -func formatVRegWidthVec(r regalloc.VReg, width vecArrangement) (ret string) { - var id string - wspec := 
strings.ToLower(width.String()) - if r.IsRealReg() { - id = regNames[r.RealReg()][1:] - } else { - id = fmt.Sprintf("%d?", r.ID()) - } - ret = fmt.Sprintf("%s%s", wspec, id) - return -} - -func formatVRegVec(r regalloc.VReg, arr vecArrangement, index vecIndex) (ret string) { - id := fmt.Sprintf("v%d?", r.ID()) - if r.IsRealReg() { - id = regNames[r.RealReg()] - } - ret = fmt.Sprintf("%s.%s", id, strings.ToLower(arr.String())) - if index != vecIndexNone { - ret += fmt.Sprintf("[%d]", index) - } - return -} - -func regTypeToRegisterSizeInBits(r regalloc.RegType) byte { - switch r { - case regalloc.RegTypeInt: - return 64 - case regalloc.RegTypeFloat: - return 128 - default: - panic("BUG: invalid register type") - } -} - -var regNumberInEncoding = [...]uint32{ - x0: 0, - x1: 1, - x2: 2, - x3: 3, - x4: 4, - x5: 5, - x6: 6, - x7: 7, - x8: 8, - x9: 9, - x10: 10, - x11: 11, - x12: 12, - x13: 13, - x14: 14, - x15: 15, - x16: 16, - x17: 17, - x18: 18, - x19: 19, - x20: 20, - x21: 21, - x22: 22, - x23: 23, - x24: 24, - x25: 25, - x26: 26, - x27: 27, - x28: 28, - x29: 29, - x30: 30, - xzr: 31, - sp: 31, - v0: 0, - v1: 1, - v2: 2, - v3: 3, - v4: 4, - v5: 5, - v6: 6, - v7: 7, - v8: 8, - v9: 9, - v10: 10, - v11: 11, - v12: 12, - v13: 13, - v14: 14, - v15: 15, - v16: 16, - v17: 17, - v18: 18, - v19: 19, - v20: 20, - v21: 21, - v22: 22, - v23: 23, - v24: 24, - v25: 25, - v26: 26, - v27: 27, - v28: 28, - v29: 29, - v30: 30, - v31: 31, -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/unwind_stack.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/unwind_stack.go deleted file mode 100644 index a72b86f6b..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/unwind_stack.go +++ /dev/null @@ -1,84 +0,0 @@ -package arm64 - -import ( - "encoding/binary" - "reflect" - "unsafe" - - "github.com/tetratelabs/wazero/internal/wasmdebug" -) - -// UnwindStack implements wazevo.unwindStack. -func UnwindStack(sp, _, top uintptr, returnAddresses []uintptr) []uintptr { - l := int(top - sp) - - var stackBuf []byte - { - //nolint:staticcheck - hdr := (*reflect.SliceHeader)(unsafe.Pointer(&stackBuf)) - hdr.Data = sp - hdr.Len = l - hdr.Cap = l - } - - for i := uint64(0); i < uint64(l); { - // (high address) - // +-----------------+ - // | ....... | - // | ret Y | <----+ - // | ....... | | - // | ret 0 | | - // | arg X | | size_of_arg_ret - // | ....... | | - // | arg 1 | | - // | arg 0 | <----+ - // | size_of_arg_ret | - // | ReturnAddress | - // +-----------------+ <----+ - // | ........... | | - // | spill slot M | | - // | ............ | | - // | spill slot 2 | | - // | spill slot 1 | | frame size - // | spill slot 1 | | - // | clobbered N | | - // | ............ | | - // | clobbered 0 | <----+ - // | xxxxxx | ;; unused space to make it 16-byte aligned. - // | frame_size | - // +-----------------+ <---- SP - // (low address) - - frameSize := binary.LittleEndian.Uint64(stackBuf[i:]) - i += frameSize + - 16 // frame size + aligned space. - retAddr := binary.LittleEndian.Uint64(stackBuf[i:]) - i += 8 // ret addr. - sizeOfArgRet := binary.LittleEndian.Uint64(stackBuf[i:]) - i += 8 + sizeOfArgRet - returnAddresses = append(returnAddresses, uintptr(retAddr)) - if len(returnAddresses) == wasmdebug.MaxFrames { - break - } - } - return returnAddresses -} - -// GoCallStackView implements wazevo.goCallStackView. 
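UnwindStack above walks the raw stack bytes frame by frame using the layout in its comment: frame_size, the alignment padding plus frame body (skipped together as frameSize+16), the return address, size_of_arg_ret, and the arg/ret area. The sketch below builds a synthetic two-frame buffer with that layout and decodes it the same way; the sizes and addresses are invented for illustration.

package main

import (
	"encoding/binary"
	"fmt"
)

// buildFrame lays out one synthetic frame, from low to high address:
// frame_size, 8 bytes of alignment padding, the frame body, the return
// address, size_of_arg_ret, and the arg/ret area.
func buildFrame(frameSize, retAddr, sizeOfArgRet uint64) []byte {
	buf := make([]byte, 8+8+frameSize+8+8+sizeOfArgRet)
	binary.LittleEndian.PutUint64(buf[0:], frameSize)
	binary.LittleEndian.PutUint64(buf[8+8+frameSize:], retAddr)
	binary.LittleEndian.PutUint64(buf[8+8+frameSize+8:], sizeOfArgRet)
	return buf
}

func main() {
	// Two frames stacked on top of each other, walked the same way as UnwindStack.
	stack := append(buildFrame(32, 0x1111, 16), buildFrame(48, 0x2222, 0)...)

	var rets []uint64
	for i := uint64(0); i < uint64(len(stack)); {
		frameSize := binary.LittleEndian.Uint64(stack[i:])
		i += frameSize + 16 // frame_size field + alignment padding + frame body.
		rets = append(rets, binary.LittleEndian.Uint64(stack[i:]))
		i += 8 // return address.
		sizeOfArgRet := binary.LittleEndian.Uint64(stack[i:])
		i += 8 + sizeOfArgRet // size_of_arg_ret field + arg/ret area.
	}
	fmt.Printf("return addresses: %#x\n", rets) // [0x1111 0x2222]
}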
-func GoCallStackView(stackPointerBeforeGoCall *uint64) []uint64 { - // (high address) - // +-----------------+ <----+ - // | xxxxxxxxxxx | | ;; optional unused space to make it 16-byte aligned. - // ^ | arg[N]/ret[M] | | - // sliceSize | | ............ | | sliceSize - // | | arg[1]/ret[1] | | - // v | arg[0]/ret[0] | <----+ - // | sliceSize | - // | frame_size | - // +-----------------+ <---- stackPointerBeforeGoCall - // (low address) - ptr := unsafe.Pointer(stackPointerBeforeGoCall) - data := (*uint64)(unsafe.Add(ptr, 16)) // skips the (frame_size, sliceSize). - size := *(*uint64)(unsafe.Add(ptr, 8)) - return unsafe.Slice(data, size) -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/machine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/machine.go deleted file mode 100644 index 3a29e7cd6..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/machine.go +++ /dev/null @@ -1,119 +0,0 @@ -package backend - -import ( - "context" - - "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" - "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" -) - -type ( - // Machine is a backend for a specific ISA machine. - Machine interface { - // StartLoweringFunction is called when the compilation of the given function is started. - // The maxBlockID is the maximum ssa.BasicBlockID in the function. - StartLoweringFunction(maxBlockID ssa.BasicBlockID) - - // LinkAdjacentBlocks is called after finished lowering all blocks in order to create one single instruction list. - LinkAdjacentBlocks(prev, next ssa.BasicBlock) - - // StartBlock is called when the compilation of the given block is started. - // The order of this being called is the reverse post order of the ssa.BasicBlock(s) as we iterate with - // ssa.Builder BlockIteratorReversePostOrderBegin and BlockIteratorReversePostOrderEnd. - StartBlock(ssa.BasicBlock) - - // EndBlock is called when the compilation of the current block is finished. - EndBlock() - - // FlushPendingInstructions flushes the pending instructions to the buffer. - // This will be called after the lowering of each SSA Instruction. - FlushPendingInstructions() - - // DisableStackCheck disables the stack check for the current compilation for debugging/testing. - DisableStackCheck() - - // SetCurrentABI initializes the FunctionABI for the given signature. - SetCurrentABI(abi *FunctionABI) - - // SetCompiler sets the compilation context used for the lifetime of Machine. - // This is only called once per Machine, i.e. before the first compilation. - SetCompiler(Compiler) - - // LowerSingleBranch is called when the compilation of the given single branch is started. - LowerSingleBranch(b *ssa.Instruction) - - // LowerConditionalBranch is called when the compilation of the given conditional branch is started. - LowerConditionalBranch(b *ssa.Instruction) - - // LowerInstr is called for each instruction in the given block except for the ones marked as already lowered - // via Compiler.MarkLowered. The order is reverse, i.e. from the last instruction to the first one. - // - // Note: this can lower multiple instructions (which produce the inputs) at once whenever it's possible - // for optimization. - LowerInstr(*ssa.Instruction) - - // Reset resets the machine state for the next compilation. - Reset() - - // InsertMove inserts a move instruction from src to dst whose type is typ. 
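GoCallStackView above exposes the arg/ret slots laid out just above the frame_size and sliceSize words. The same view, restated with ordinary slicing over a hypothetical snapshot of that region:

package main

import "fmt"

func main() {
	// A made-up snapshot of the region at stackPointerBeforeGoCall, as 64-bit
	// words from low to high address: frame_size, sliceSize, then the arg/ret
	// slots that GoCallStackView hands back to the caller.
	stack := []uint64{
		0,          // frame_size (unused by the view)
		3,          // sliceSize: number of arg/ret slots that follow
		10, 20, 30, // arg[0]/ret[0] .. arg[2]/ret[2]
	}

	// GoCallStackView skips the two header words and returns a view of
	// sliceSize elements; the same effect with plain slicing:
	view := stack[2 : 2+stack[1]]
	fmt.Println(view) // [10 20 30]
}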
- InsertMove(dst, src regalloc.VReg, typ ssa.Type) - - // InsertReturn inserts the return instruction to return from the current function. - InsertReturn() - - // InsertLoadConstantBlockArg inserts the instruction(s) to load the constant value into the given regalloc.VReg. - InsertLoadConstantBlockArg(instr *ssa.Instruction, vr regalloc.VReg) - - // Format returns the string representation of the currently compiled machine code. - // This is only for testing purpose. - Format() string - - // RegAlloc does the register allocation after lowering. - RegAlloc() - - // PostRegAlloc does the post register allocation, e.g. setting up prologue/epilogue, redundant move elimination, etc. - PostRegAlloc() - - // ResolveRelocations resolves the relocations after emitting machine code. - // * refToBinaryOffset: the map from the function reference (ssa.FuncRef) to the executable offset. - // * importedFns: the max index of the imported functions at the beginning of refToBinaryOffset - // * executable: the binary to resolve the relocations. - // * relocations: the relocations to resolve. - // * callTrampolineIslandOffsets: the offsets of the trampoline islands in the executable. - ResolveRelocations( - refToBinaryOffset []int, - importedFns int, - executable []byte, - relocations []RelocationInfo, - callTrampolineIslandOffsets []int, - ) - - // Encode encodes the machine instructions to the Compiler. - Encode(ctx context.Context) error - - // CompileGoFunctionTrampoline compiles the trampoline function to call a Go function of the given exit code and signature. - CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte - - // CompileStackGrowCallSequence returns the sequence of instructions shared by all functions to - // call the stack grow builtin function. - CompileStackGrowCallSequence() []byte - - // CompileEntryPreamble returns the sequence of instructions shared by multiple functions to - // enter the function from Go. - CompileEntryPreamble(signature *ssa.Signature) []byte - - // LowerParams lowers the given parameters. - LowerParams(params []ssa.Value) - - // LowerReturns lowers the given returns. - LowerReturns(returns []ssa.Value) - - // ArgsResultsRegs returns the registers used for arguments and return values. - ArgsResultsRegs() (argResultInts, argResultFloats []regalloc.RealReg) - - // CallTrampolineIslandInfo returns the interval of the offset where the trampoline island is placed, and - // the size of the trampoline island. If islandSize is zero, the trampoline island is not used on this machine. - CallTrampolineIslandInfo(numFunctions int) (interval, islandSize int, err error) - } -) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/api.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/api.go deleted file mode 100644 index 5d15bd9dc..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/api.go +++ /dev/null @@ -1,124 +0,0 @@ -package regalloc - -import "fmt" - -// These interfaces are implemented by ISA-specific backends to abstract away the details, and allow the register -// allocators to work on any ISA. - -type ( - // Function is the top-level interface to do register allocation, which corresponds to a CFG containing - // Blocks(s). - // - // I is the type of the instruction, and B is the type of the basic block. 
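Because Function, Block, and Instr are ordinary type parameters, allocator-side helpers can traverse any backend's CFG without knowing the ISA. A minimal sketch of such a helper, assumed to live in this regalloc package, mirroring the nil-sentinel loop the allocator itself uses:

// forEachBlockReversePostOrder visits every block through the generic iterator
// pair; the zero value of B serves as the end-of-iteration sentinel, which is
// valid because Block embeds comparable.
func forEachBlockReversePostOrder[I Instr, B Block[I], F Function[I, B]](f F, visit func(B)) {
	var nilBlk B
	for blk := f.ReversePostOrderBlockIteratorBegin(); blk != nilBlk; blk = f.ReversePostOrderBlockIteratorNext() {
		visit(blk)
	}
}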
- Function[I Instr, B Block[I]] interface { - // PostOrderBlockIteratorBegin returns the first block in the post-order traversal of the CFG. - // In other words, the last blocks in the CFG will be returned first. - PostOrderBlockIteratorBegin() B - // PostOrderBlockIteratorNext returns the next block in the post-order traversal of the CFG. - PostOrderBlockIteratorNext() B - // ReversePostOrderBlockIteratorBegin returns the first block in the reverse post-order traversal of the CFG. - // In other words, the first blocks in the CFG will be returned first. - ReversePostOrderBlockIteratorBegin() B - // ReversePostOrderBlockIteratorNext returns the next block in the reverse post-order traversal of the CFG. - ReversePostOrderBlockIteratorNext() B - // ClobberedRegisters tell the clobbered registers by this function. - ClobberedRegisters([]VReg) - // LoopNestingForestRoots returns the number of roots of the loop nesting forest in a function. - LoopNestingForestRoots() int - // LoopNestingForestRoot returns the i-th root of the loop nesting forest in a function. - LoopNestingForestRoot(i int) B - // LowestCommonAncestor returns the lowest common ancestor of two blocks in the dominator tree. - LowestCommonAncestor(blk1, blk2 B) B - // Idom returns the immediate dominator of the given block. - Idom(blk B) B - - // LoopNestingForestChild returns the i-th child of the block in the loop nesting forest. - LoopNestingForestChild(b B, i int) B - // Pred returns the i-th predecessor of the block in the CFG. - Pred(b B, i int) B - // Succ returns the i-th successor of the block in the CFG. - Succ(b B, i int) B - // BlockParams returns the virtual registers used as the parameters of this block. - BlockParams(B, *[]VReg) []VReg - - // Followings are for rewriting the function. - - // SwapBefore swaps the two virtual registers at the end of the given block. - SwapBefore(x1, x2, tmp VReg, instr I) - // StoreRegisterBefore inserts store instruction(s) before the given instruction for the given virtual register. - StoreRegisterBefore(v VReg, instr I) - // StoreRegisterAfter inserts store instruction(s) after the given instruction for the given virtual register. - StoreRegisterAfter(v VReg, instr I) - // ReloadRegisterBefore inserts reload instruction(s) before the given instruction for the given virtual register. - ReloadRegisterBefore(v VReg, instr I) - // ReloadRegisterAfter inserts reload instruction(s) after the given instruction for the given virtual register. - ReloadRegisterAfter(v VReg, instr I) - // InsertMoveBefore inserts move instruction(s) before the given instruction for the given virtual registers. - InsertMoveBefore(dst, src VReg, instr I) - } - - // Block is a basic block in the CFG of a function, and it consists of multiple instructions, and predecessor Block(s). - // Right now, this corresponds to a ssa.BasicBlock lowered to the machine level. - Block[I Instr] interface { - comparable - // ID returns the unique identifier of this block which is ordered in the reverse post-order traversal of the CFG. - ID() int32 - // InstrIteratorBegin returns the first instruction in this block. Instructions added after lowering must be skipped. - // Note: multiple Instr(s) will not be held at the same time, so it's safe to use the same impl for the return Instr. - InstrIteratorBegin() I - // InstrIteratorNext returns the next instruction in this block. Instructions added after lowering must be skipped. 
- // Note: multiple Instr(s) will not be held at the same time, so it's safe to use the same impl for the return Instr. - InstrIteratorNext() I - // InstrRevIteratorBegin is the same as InstrIteratorBegin, but in the reverse order. - InstrRevIteratorBegin() I - // InstrRevIteratorNext is the same as InstrIteratorNext, but in the reverse order. - InstrRevIteratorNext() I - // FirstInstr returns the fist instruction in this block where instructions will be inserted after it. - FirstInstr() I - // LastInstrForInsertion returns the last instruction in this block where instructions will be inserted before it. - // Such insertions only happen when we need to insert spill/reload instructions to adjust the merge edges. - // At the time of register allocation, all the critical edges are already split, so there is no need - // to worry about the case where branching instruction has multiple successors. - // Therefore, usually, it is the nop instruction, but if the block ends with an unconditional branching, then it returns - // the unconditional branch, not the nop. In other words it is either nop or unconditional branch. - LastInstrForInsertion() I - // Preds returns the number of predecessors of this block in the CFG. - Preds() int - // Entry returns true if the block is for the entry block. - Entry() bool - // Succs returns the number of successors of this block in the CFG. - Succs() int - // LoopHeader returns true if this block is a loop header. - LoopHeader() bool - // LoopNestingForestChildren returns the number of children of this block in the loop nesting forest. - LoopNestingForestChildren() int - } - - // Instr is an instruction in a block, abstracting away the underlying ISA. - Instr interface { - comparable - fmt.Stringer - // Defs returns the virtual registers defined by this instruction. - Defs(*[]VReg) []VReg - // Uses returns the virtual registers used by this instruction. - // Note: multiple returned []VReg will not be held at the same time, so it's safe to use the same slice for this. - Uses(*[]VReg) []VReg - // AssignUse assigns the RealReg-allocated virtual register used by this instruction at the given index. - AssignUse(index int, v VReg) - // AssignDef assigns a RealReg-allocated virtual register defined by this instruction. - // This only accepts one register because we don't allocate registers for multi-def instructions (i.e. call instruction) - AssignDef(VReg) - // IsCopy returns true if this instruction is a move instruction between two registers. - // If true, the instruction is of the form of dst = src, and if the src and dst do not interfere with each other, - // we could coalesce them, and hence the copy can be eliminated from the final code. - IsCopy() bool - // IsCall returns true if this instruction is a call instruction. The result is used to insert - // caller saved register spills and restores. - IsCall() bool - // IsIndirectCall returns true if this instruction is an indirect call instruction which calls a function pointer. - // The result is used to insert caller saved register spills and restores. - IsIndirectCall() bool - // IsReturn returns true if this instruction is a return instruction. 
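Defs, Uses, and BlockParams above all share one calling convention: the caller passes a pointer to a reusable scratch slice, and the implementation truncates and refills it so repeated queries do not allocate. A small standalone sketch of that pattern with stand-in types:

package main

import "fmt"

// VReg stands in for regalloc.VReg here; the point is the scratch-slice
// convention, not the register representation.
type VReg uint64

type instr struct{ defs []VReg }

// Defs truncates the caller's scratch slice and refills it, reusing the same
// backing array across calls.
func (i *instr) Defs(scratch *[]VReg) []VReg {
	*scratch = (*scratch)[:0]
	*scratch = append(*scratch, i.defs...)
	return *scratch
}

func main() {
	var scratch []VReg
	a := &instr{defs: []VReg{1, 2}}
	b := &instr{defs: []VReg{3}}
	fmt.Println(a.Defs(&scratch)) // [1 2]
	fmt.Println(b.Defs(&scratch)) // [3] — same backing array, no new allocation
}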
- IsReturn() bool - } -) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/reg.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/reg.go deleted file mode 100644 index 46df807e6..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/reg.go +++ /dev/null @@ -1,123 +0,0 @@ -package regalloc - -import ( - "fmt" - - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" -) - -// VReg represents a register which is assigned to an SSA value. This is used to represent a register in the backend. -// A VReg may or may not be a physical register, and the info of physical register can be obtained by RealReg. -type VReg uint64 - -// VRegID is the lower 32bit of VReg, which is the pure identifier of VReg without RealReg info. -type VRegID uint32 - -// RealReg returns the RealReg of this VReg. -func (v VReg) RealReg() RealReg { - return RealReg(v >> 32) -} - -// IsRealReg returns true if this VReg is backed by a physical register. -func (v VReg) IsRealReg() bool { - return v.RealReg() != RealRegInvalid -} - -// FromRealReg returns a VReg from the given RealReg and RegType. -// This is used to represent a specific pre-colored register in the backend. -func FromRealReg(r RealReg, typ RegType) VReg { - rid := VRegID(r) - if rid > vRegIDReservedForRealNum { - panic(fmt.Sprintf("invalid real reg %d", r)) - } - return VReg(r).SetRealReg(r).SetRegType(typ) -} - -// SetRealReg sets the RealReg of this VReg and returns the updated VReg. -func (v VReg) SetRealReg(r RealReg) VReg { - return VReg(r)<<32 | (v & 0xff_00_ffffffff) -} - -// RegType returns the RegType of this VReg. -func (v VReg) RegType() RegType { - return RegType(v >> 40) -} - -// SetRegType sets the RegType of this VReg and returns the updated VReg. -func (v VReg) SetRegType(t RegType) VReg { - return VReg(t)<<40 | (v & 0x00_ff_ffffffff) -} - -// ID returns the VRegID of this VReg. -func (v VReg) ID() VRegID { - return VRegID(v & 0xffffffff) -} - -// Valid returns true if this VReg is Valid. -func (v VReg) Valid() bool { - return v.ID() != vRegIDInvalid && v.RegType() != RegTypeInvalid -} - -// RealReg represents a physical register. -type RealReg byte - -const RealRegInvalid RealReg = 0 - -const ( - vRegIDInvalid VRegID = 1 << 31 - VRegIDNonReservedBegin = vRegIDReservedForRealNum - vRegIDReservedForRealNum VRegID = 128 - VRegInvalid = VReg(vRegIDInvalid) -) - -// String implements fmt.Stringer. -func (r RealReg) String() string { - switch r { - case RealRegInvalid: - return "invalid" - default: - return fmt.Sprintf("r%d", r) - } -} - -// String implements fmt.Stringer. -func (v VReg) String() string { - if v.IsRealReg() { - return fmt.Sprintf("r%d", v.ID()) - } - return fmt.Sprintf("v%d?", v.ID()) -} - -// RegType represents the type of a register. -type RegType byte - -const ( - RegTypeInvalid RegType = iota - RegTypeInt - RegTypeFloat - NumRegType -) - -// String implements fmt.Stringer. -func (r RegType) String() string { - switch r { - case RegTypeInt: - return "int" - case RegTypeFloat: - return "float" - default: - return "invalid" - } -} - -// RegTypeOf returns the RegType of the given ssa.Type. 
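The VReg accessors above pack three fields into one uint64: the pure ID in the low 32 bits, the RealReg in bits 32-39, and the RegType in bits 40-47. A standalone restatement of that packing with plain integers:

package main

import "fmt"

func main() {
	const (
		id   = uint64(7) // VRegID
		real = uint64(3) // RealReg
		typ  = uint64(1) // RegTypeInt
	)
	v := id | real<<32 | typ<<40

	fmt.Println(v & 0xffffffff)   // 7: what ID() extracts
	fmt.Println((v >> 32) & 0xff) // 3: what RealReg() extracts
	fmt.Println(v >> 40)          // 1: what RegType() extracts
}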
-func RegTypeOf(p ssa.Type) RegType { - switch p { - case ssa.TypeI32, ssa.TypeI64: - return RegTypeInt - case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: - return RegTypeFloat - default: - panic("invalid type") - } -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go deleted file mode 100644 index a5857f4f2..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go +++ /dev/null @@ -1,1189 +0,0 @@ -// Package regalloc performs register allocation. The algorithm can work on any ISA by implementing the interfaces in -// api.go. -// -// References: -// - https://web.stanford.edu/class/archive/cs/cs143/cs143.1128/lectures/17/Slides17.pdf -// - https://en.wikipedia.org/wiki/Chaitin%27s_algorithm -// - https://llvm.org/ProjectsWithLLVM/2004-Fall-CS426-LS.pdf -// - https://pfalcon.github.io/ssabook/latest/book-full.pdf: Chapter 9. for liveness analysis. -// - https://github.com/golang/go/blob/release-branch.go1.21/src/cmd/compile/internal/ssa/regalloc.go -package regalloc - -import ( - "fmt" - "math" - "strings" - - "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" -) - -// NewAllocator returns a new Allocator. -func NewAllocator[I Instr, B Block[I], F Function[I, B]](allocatableRegs *RegisterInfo) Allocator[I, B, F] { - a := Allocator[I, B, F]{ - regInfo: allocatableRegs, - phiDefInstListPool: wazevoapi.NewPool[phiDefInstList[I]](resetPhiDefInstList[I]), - blockStates: wazevoapi.NewIDedPool[blockState[I, B, F]](resetBlockState[I, B, F]), - } - a.state.vrStates = wazevoapi.NewIDedPool[vrState[I, B, F]](resetVrState[I, B, F]) - a.state.reset() - for _, regs := range allocatableRegs.AllocatableRegisters { - for _, r := range regs { - a.allocatableSet = a.allocatableSet.add(r) - } - } - return a -} - -type ( - // RegisterInfo holds the statically-known ISA-specific register information. - RegisterInfo struct { - // AllocatableRegisters is a 2D array of allocatable RealReg, indexed by regTypeNum and regNum. - // The order matters: the first element is the most preferred one when allocating. - AllocatableRegisters [NumRegType][]RealReg - CalleeSavedRegisters RegSet - CallerSavedRegisters RegSet - RealRegToVReg []VReg - // RealRegName returns the name of the given RealReg for debugging. - RealRegName func(r RealReg) string - RealRegType func(r RealReg) RegType - } - - // Allocator is a register allocator. - Allocator[I Instr, B Block[I], F Function[I, B]] struct { - // regInfo is static per ABI/ISA, and is initialized by the machine during Machine.PrepareRegisterAllocator. - regInfo *RegisterInfo - // allocatableSet is a set of allocatable RealReg derived from regInfo. Static per ABI/ISA. - allocatableSet RegSet - allocatedCalleeSavedRegs []VReg - vs []VReg - ss []*vrState[I, B, F] - copies []_copy[I, B, F] - phiDefInstListPool wazevoapi.Pool[phiDefInstList[I]] - - // Followings are re-used during various places. - blks []B - reals []RealReg - - // Following two fields are updated while iterating the blocks in the reverse postorder. - state state[I, B, F] - blockStates wazevoapi.IDedPool[blockState[I, B, F]] - } - - // _copy represents a source and destination pair of a copy instruction. - _copy[I Instr, B Block[I], F Function[I, B]] struct { - src *vrState[I, B, F] - dstID VRegID - } - - // programCounter represents an opaque index into the program which is used to represents a LiveInterval of a VReg. 
- programCounter int32 - - state[I Instr, B Block[I], F Function[I, B]] struct { - argRealRegs []VReg - regsInUse regInUseSet[I, B, F] - vrStates wazevoapi.IDedPool[vrState[I, B, F]] - - currentBlockID int32 - - // allocatedRegSet is a set of RealReg that are allocated during the allocation phase. This is reset per function. - allocatedRegSet RegSet - } - - blockState[I Instr, B Block[I], F Function[I, B]] struct { - // liveIns is a list of VReg that are live at the beginning of the block. - liveIns []*vrState[I, B, F] - // seen is true if the block is visited during the liveness analysis. - seen bool - // visited is true if the block is visited during the allocation phase. - visited bool - startFromPredIndex int - // startRegs is a list of RealReg that are used at the beginning of the block. This is used to fix the merge edges. - startRegs regInUseSet[I, B, F] - // endRegs is a list of RealReg that are used at the end of the block. This is used to fix the merge edges. - endRegs regInUseSet[I, B, F] - } - - vrState[I Instr, B Block[I], f Function[I, B]] struct { - v VReg - r RealReg - // defInstr is the instruction that defines this value. If this is the phi value and not the entry block, this is nil. - defInstr I - // defBlk is the block that defines this value. If this is the phi value, this is the block whose arguments contain this value. - defBlk B - // lca = lowest common ancestor. This is the block that is the lowest common ancestor of all the blocks that - // reloads this value. This is used to determine the spill location. Only valid if spilled=true. - lca B - // lastUse is the program counter of the last use of this value. This changes while iterating the block, and - // should not be used across the blocks as it becomes invalid. To check the validity, use lastUseUpdatedAtBlockID. - lastUse programCounter - lastUseUpdatedAtBlockID int32 - // spilled is true if this value is spilled i.e. the value is reload from the stack somewhere in the program. - // - // Note that this field is used during liveness analysis for different purpose. This is used to determine the - // value is live-in or not. - spilled bool - // isPhi is true if this is a phi value. - isPhi bool - desiredLoc desiredLoc - // phiDefInstList is a list of instructions that defines this phi value. - // This is used to determine the spill location, and only valid if isPhi=true. - *phiDefInstList[I] - } - - // phiDefInstList is a linked list of instructions that defines a phi value. - phiDefInstList[I Instr] struct { - instr I - v VReg - next *phiDefInstList[I] - } - - // desiredLoc represents a desired location for a VReg. - desiredLoc uint16 - // desiredLocKind is a kind of desired location for a VReg. - desiredLocKind uint16 -) - -const ( - // desiredLocKindUnspecified is a kind of desired location for a VReg that is not specified. - desiredLocKindUnspecified desiredLocKind = iota - // desiredLocKindStack is a kind of desired location for a VReg that is on the stack, only used for the phi values. - desiredLocKindStack - // desiredLocKindReg is a kind of desired location for a VReg that is in a register. 
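desiredLoc, defined above, is another small packed value: the low two bits hold the kind (unspecified, stack, or register) and the remaining bits a RealReg, exactly as the helpers newDesiredLocReg, realReg, and stack encode it further below. A standalone restatement with plain integers:

package main

import "fmt"

func main() {
	const (
		kindUnspecified = uint16(0)
		kindStack       = uint16(1)
		kindReg         = uint16(2)
	)
	// newReg mirrors newDesiredLocReg: kind in the low two bits, RealReg above.
	newReg := func(r uint16) uint16 { return kindReg | r<<2 }

	d := newReg(5)
	fmt.Println(d&3 == kindReg)   // true: the desired location names a register
	fmt.Println(d >> 2)           // 5: the encoded RealReg
	fmt.Println(d&3 == kindStack) // false: not pinned to the stack
}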
- desiredLocKindReg - desiredLocUnspecified = desiredLoc(desiredLocKindUnspecified) - desiredLocStack = desiredLoc(desiredLocKindStack) -) - -func newDesiredLocReg(r RealReg) desiredLoc { - return desiredLoc(desiredLocKindReg) | desiredLoc(r<<2) -} - -func (d desiredLoc) realReg() RealReg { - return RealReg(d >> 2) -} - -func (d desiredLoc) stack() bool { - return d&3 == desiredLoc(desiredLocKindStack) -} - -func resetPhiDefInstList[I Instr](l *phiDefInstList[I]) { - var nilInstr I - l.instr = nilInstr - l.next = nil - l.v = VRegInvalid -} - -func (s *state[I, B, F]) dump(info *RegisterInfo) { //nolint:unused - fmt.Println("\t\tstate:") - fmt.Println("\t\t\targRealRegs:", s.argRealRegs) - fmt.Println("\t\t\tregsInUse", s.regsInUse.format(info)) - fmt.Println("\t\t\tallocatedRegSet:", s.allocatedRegSet.format(info)) - fmt.Println("\t\t\tused:", s.regsInUse.format(info)) - var strs []string - for i := 0; i <= s.vrStates.MaxIDEncountered(); i++ { - vs := s.vrStates.Get(i) - if vs == nil { - continue - } - if vs.r != RealRegInvalid { - strs = append(strs, fmt.Sprintf("(v%d: %s)", vs.v.ID(), info.RealRegName(vs.r))) - } - } - fmt.Println("\t\t\tvrStates:", strings.Join(strs, ", ")) -} - -func (s *state[I, B, F]) reset() { - s.argRealRegs = s.argRealRegs[:0] - s.vrStates.Reset() - s.allocatedRegSet = RegSet(0) - s.regsInUse.reset() - s.currentBlockID = -1 -} - -func resetVrState[I Instr, B Block[I], F Function[I, B]](vs *vrState[I, B, F]) { - vs.v = VRegInvalid - vs.r = RealRegInvalid - var nilInstr I - vs.defInstr = nilInstr - var nilBlk B - vs.defBlk = nilBlk - vs.spilled = false - vs.lastUse = -1 - vs.lastUseUpdatedAtBlockID = -1 - vs.lca = nilBlk - vs.isPhi = false - vs.phiDefInstList = nil - vs.desiredLoc = desiredLocUnspecified -} - -func (s *state[I, B, F]) getOrAllocateVRegState(v VReg) *vrState[I, B, F] { - st := s.vrStates.GetOrAllocate(int(v.ID())) - if st.v == VRegInvalid { - st.v = v - } - return st -} - -func (s *state[I, B, F]) getVRegState(v VRegID) *vrState[I, B, F] { - return s.vrStates.Get(int(v)) -} - -func (s *state[I, B, F]) useRealReg(r RealReg, vr *vrState[I, B, F]) { - s.regsInUse.add(r, vr) - vr.r = r - s.allocatedRegSet = s.allocatedRegSet.add(r) -} - -func (s *state[I, B, F]) releaseRealReg(r RealReg) { - current := s.regsInUse.get(r) - if current != nil { - s.regsInUse.remove(r) - current.r = RealRegInvalid - } -} - -// recordReload records that the given VReg is reloaded in the given block. -// This is used to determine the spill location by tracking the lowest common ancestor of all the blocks that reloads the value. -func (vs *vrState[I, B, F]) recordReload(f F, blk B) { - vs.spilled = true - var nilBlk B - if lca := vs.lca; lca == nilBlk { - if wazevoapi.RegAllocLoggingEnabled { - fmt.Printf("\t\tv%d is reloaded in blk%d,\n", vs.v.ID(), blk.ID()) - } - vs.lca = blk - } else if lca != blk { - if wazevoapi.RegAllocLoggingEnabled { - fmt.Printf("\t\tv%d is reloaded in blk%d, lca=%d\n", vs.v.ID(), blk.ID(), vs.lca.ID()) - } - vs.lca = f.LowestCommonAncestor(lca, blk) - if wazevoapi.RegAllocLoggingEnabled { - fmt.Printf("updated lca=%d\n", vs.lca.ID()) - } - } -} - -func (a *Allocator[I, B, F]) findOrSpillAllocatable(s *state[I, B, F], allocatable []RealReg, forbiddenMask RegSet, preferred RealReg) (r RealReg) { - r = RealRegInvalid - // First, check if the preferredMask has any allocatable register. 
- if preferred != RealRegInvalid && !forbiddenMask.has(preferred) && !s.regsInUse.has(preferred) { - return preferred - } - - var lastUseAt programCounter - var spillVReg VReg - for _, candidateReal := range allocatable { - if forbiddenMask.has(candidateReal) { - continue - } - - using := s.regsInUse.get(candidateReal) - if using == nil { - // This is not used at this point. - return candidateReal - } - - // Real registers in use should not be spilled, so we skip them. - // For example, if the register is used as an argument register, and it might be - // spilled and not reloaded when it ends up being used as a temporary to pass - // stack based argument. - if using.v.IsRealReg() { - continue - } - - isPreferred := candidateReal == preferred - - // last == -1 means the value won't be used anymore. - if last := using.lastUse; r == RealRegInvalid || isPreferred || last == -1 || (lastUseAt != -1 && last > lastUseAt) { - lastUseAt = last - r = candidateReal - spillVReg = using.v - if isPreferred { - break - } - } - } - - if r == RealRegInvalid { - panic("not found any allocatable register") - } - - if wazevoapi.RegAllocLoggingEnabled { - fmt.Printf("\tspilling v%d when lastUseAt=%d and regsInUse=%s\n", spillVReg.ID(), lastUseAt, s.regsInUse.format(a.regInfo)) - } - s.releaseRealReg(r) - return r -} - -func (s *state[I, B, F]) findAllocatable(allocatable []RealReg, forbiddenMask RegSet) RealReg { - for _, r := range allocatable { - if !s.regsInUse.has(r) && !forbiddenMask.has(r) { - return r - } - } - return RealRegInvalid -} - -func (s *state[I, B, F]) resetAt(bs *blockState[I, B, F]) { - s.regsInUse.range_(func(_ RealReg, vs *vrState[I, B, F]) { - vs.r = RealRegInvalid - }) - s.regsInUse.reset() - bs.endRegs.range_(func(r RealReg, vs *vrState[I, B, F]) { - if vs.lastUseUpdatedAtBlockID == s.currentBlockID && vs.lastUse == programCounterLiveIn { - s.regsInUse.add(r, vs) - vs.r = r - } - }) -} - -func resetBlockState[I Instr, B Block[I], F Function[I, B]](b *blockState[I, B, F]) { - b.seen = false - b.visited = false - b.endRegs.reset() - b.startRegs.reset() - b.startFromPredIndex = -1 - b.liveIns = b.liveIns[:0] -} - -func (b *blockState[I, B, F]) dump(a *RegisterInfo) { - fmt.Println("\t\tblockState:") - fmt.Println("\t\t\tstartRegs:", b.startRegs.format(a)) - fmt.Println("\t\t\tendRegs:", b.endRegs.format(a)) - fmt.Println("\t\t\tstartFromPredIndex:", b.startFromPredIndex) - fmt.Println("\t\t\tvisited:", b.visited) -} - -// DoAllocation performs register allocation on the given Function. -func (a *Allocator[I, B, F]) DoAllocation(f F) { - a.livenessAnalysis(f) - a.alloc(f) - a.determineCalleeSavedRealRegs(f) -} - -func (a *Allocator[I, B, F]) determineCalleeSavedRealRegs(f F) { - a.allocatedCalleeSavedRegs = a.allocatedCalleeSavedRegs[:0] - a.state.allocatedRegSet.Range(func(allocatedRealReg RealReg) { - if a.regInfo.CalleeSavedRegisters.has(allocatedRealReg) { - a.allocatedCalleeSavedRegs = append(a.allocatedCalleeSavedRegs, a.regInfo.RealRegToVReg[allocatedRealReg]) - } - }) - f.ClobberedRegisters(a.allocatedCalleeSavedRegs) -} - -func (a *Allocator[I, B, F]) getOrAllocateBlockState(blockID int32) *blockState[I, B, F] { - return a.blockStates.GetOrAllocate(int(blockID)) -} - -// phiBlk returns the block that defines the given phi value, nil otherwise. 
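findOrSpillAllocatable above first tries a free (or preferred) register, and only when everything allocatable is occupied does it evict the value whose last use lies furthest away, or one that is never used again (lastUse == -1). A simplified standalone model of that eviction choice, with hypothetical register names and use positions:

package main

import "fmt"

// pickSpillVictim models the heuristic: evict the value used furthest in the
// future (or never again, encoded as -1), since reloading it costs the least
// in the near term. Real-register-backed values are not candidates and are
// assumed to be filtered out before this point.
func pickSpillVictim(lastUse map[string]int) string {
	victim, best := "", -2
	for reg, last := range lastUse {
		if last == -1 { // never used again: ideal victim
			return reg
		}
		if victim == "" || last > best {
			victim, best = reg, last
		}
	}
	return victim
}

func main() {
	fmt.Println(pickSpillVictim(map[string]int{"x1": 10, "x2": 42, "x3": 17})) // x2
}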
-func (vs *vrState[I, B, F]) phiBlk() B { - if vs.isPhi { - return vs.defBlk - } - var nilBlk B - return nilBlk -} - -const ( - programCounterLiveIn = math.MinInt32 - programCounterLiveOut = math.MaxInt32 -) - -// liveAnalysis constructs Allocator.blockLivenessData. -// The algorithm here is described in https://pfalcon.github.io/ssabook/latest/book-full.pdf Chapter 9.2. -func (a *Allocator[I, B, F]) livenessAnalysis(f F) { - s := &a.state - - for i := VRegID(0); i < vRegIDReservedForRealNum; i++ { - s.getOrAllocateVRegState(VReg(i).SetRealReg(RealReg(i))) - } - - var nilBlk B - var nilInstr I - for blk := f.PostOrderBlockIteratorBegin(); blk != nilBlk; blk = f.PostOrderBlockIteratorNext() { - // We should gather phi value data. - for _, p := range f.BlockParams(blk, &a.vs) { - vs := s.getOrAllocateVRegState(p) - vs.isPhi = true - vs.defBlk = blk - } - - blkID := blk.ID() - info := a.getOrAllocateBlockState(blkID) - - a.ss = a.ss[:0] - const ( - flagDeleted = false - flagLive = true - ) - ns := blk.Succs() - for i := 0; i < ns; i++ { - succ := f.Succ(blk, i) - if succ == nilBlk { - continue - } - - succID := succ.ID() - succInfo := a.getOrAllocateBlockState(succID) - if !succInfo.seen { // This means the back edge. - continue - } - - for _, st := range succInfo.liveIns { - if st.phiBlk() != succ && st.spilled != flagLive { //nolint:gosimple - // We use .spilled field to store the flag. - st.spilled = flagLive - a.ss = append(a.ss, st) - } - } - } - - for instr := blk.InstrRevIteratorBegin(); instr != nilInstr; instr = blk.InstrRevIteratorNext() { - - var use, def VReg - var defIsPhi bool - for _, def = range instr.Defs(&a.vs) { - if !def.IsRealReg() { - st := s.getOrAllocateVRegState(def) - defIsPhi = st.isPhi - // Note: We use .spilled field to store the flag. - st.spilled = flagDeleted - } - } - for _, use = range instr.Uses(&a.vs) { - if !use.IsRealReg() { - st := s.getOrAllocateVRegState(use) - // Note: We use .spilled field to store the flag. - if st.spilled != flagLive { //nolint:gosimple - st.spilled = flagLive - a.ss = append(a.ss, st) - } - } - } - - if defIsPhi { - if use.Valid() && use.IsRealReg() { - // If the destination is a phi value, and the source is a real register, this is the beginning of the function. - a.state.argRealRegs = append(a.state.argRealRegs, use) - } - } - } - - for _, st := range a.ss { - // We use .spilled field to store the flag. - if st.spilled == flagLive { //nolint:gosimple - info.liveIns = append(info.liveIns, st) - st.spilled = false - } - } - - info.seen = true - } - - nrs := f.LoopNestingForestRoots() - for i := 0; i < nrs; i++ { - root := f.LoopNestingForestRoot(i) - a.loopTreeDFS(f, root) - } -} - -// loopTreeDFS implements the Algorithm 9.3 in the book in an iterative way. -func (a *Allocator[I, B, F]) loopTreeDFS(f F, entry B) { - a.blks = a.blks[:0] - a.blks = append(a.blks, entry) - - for len(a.blks) > 0 { - tail := len(a.blks) - 1 - loop := a.blks[tail] - a.blks = a.blks[:tail] - a.ss = a.ss[:0] - const ( - flagDone = false - flagPending = true - ) - info := a.getOrAllocateBlockState(loop.ID()) - for _, st := range info.liveIns { - if st.phiBlk() != loop { - a.ss = append(a.ss, st) - // We use .spilled field to store the flag. 
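Aside on the deleted livenessAnalysis above: within a block, live-ins are obtained by a reverse scan that removes each instruction's definitions and then adds its uses, seeded with the successors' live-ins (the code reuses the .spilled field as a temporary membership flag while doing so). A self-contained sketch of the per-block step; string-named values are used purely for illustration.

package main

import "fmt"

type instr struct{ defs, uses []string }

// blockLiveIn computes the live-in set of a straight-line block from its
// live-out set by scanning instructions in reverse: live = (live - defs) + uses.
func blockLiveIn(instrs []instr, liveOut map[string]bool) map[string]bool {
	live := make(map[string]bool, len(liveOut))
	for v := range liveOut {
		live[v] = true
	}
	for i := len(instrs) - 1; i >= 0; i-- {
		for _, d := range instrs[i].defs {
			delete(live, d)
		}
		for _, u := range instrs[i].uses {
			live[u] = true
		}
	}
	return live
}

func main() {
	blk := []instr{
		{defs: []string{"v1"}, uses: []string{"v0"}},
		{defs: []string{"v2"}, uses: []string{"v1", "v3"}},
	}
	fmt.Println(blockLiveIn(blk, map[string]bool{"v2": true})) // map[v0:true v3:true]
}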
- st.spilled = flagPending - } - } - - var siblingAddedView []*vrState[I, B, F] - cn := loop.LoopNestingForestChildren() - for i := 0; i < cn; i++ { - child := f.LoopNestingForestChild(loop, i) - childID := child.ID() - childInfo := a.getOrAllocateBlockState(childID) - - if i == 0 { - begin := len(childInfo.liveIns) - for _, st := range a.ss { - // We use .spilled field to store the flag. - if st.spilled == flagPending { //nolint:gosimple - st.spilled = flagDone - // TODO: deduplicate, though I don't think it has much impact. - childInfo.liveIns = append(childInfo.liveIns, st) - } - } - siblingAddedView = childInfo.liveIns[begin:] - } else { - // TODO: deduplicate, though I don't think it has much impact. - childInfo.liveIns = append(childInfo.liveIns, siblingAddedView...) - } - - if child.LoopHeader() { - a.blks = append(a.blks, child) - } - } - - if cn == 0 { - // If there's no forest child, we haven't cleared the .spilled field at this point. - for _, st := range a.ss { - st.spilled = false - } - } - } -} - -// alloc allocates registers for the given function by iterating the blocks in the reverse postorder. -// The algorithm here is derived from the Go compiler's allocator https://github.com/golang/go/blob/release-branch.go1.21/src/cmd/compile/internal/ssa/regalloc.go -// In short, this is a simply linear scan register allocation where each block inherits the register allocation state from -// one of its predecessors. Each block inherits the selected state and starts allocation from there. -// If there's a discrepancy in the end states between predecessors, the adjustments are made to ensure consistency after allocation is done (which we call "fixing merge state"). -// The spill instructions (store into the dedicated slots) are inserted after all the allocations and fixing merge states. That is because -// at the point, we all know where the reloads happen, and therefore we can know the best place to spill the values. More precisely, -// the spill happens in the block that is the lowest common ancestor of all the blocks that reloads the value. -// -// All of these logics are almost the same as Go's compiler which has a dedicated description in the source file ^^. -func (a *Allocator[I, B, F]) alloc(f F) { - // First we allocate each block in the reverse postorder (at least one predecessor should be allocated for each block). - var nilBlk B - for blk := f.ReversePostOrderBlockIteratorBegin(); blk != nilBlk; blk = f.ReversePostOrderBlockIteratorNext() { - if wazevoapi.RegAllocLoggingEnabled { - fmt.Printf("========== allocating blk%d ========\n", blk.ID()) - } - if blk.Entry() { - a.finalizeStartReg(f, blk) - } - a.allocBlock(f, blk) - } - // After the allocation, we all know the start and end state of each block. So we can fix the merge states. - for blk := f.ReversePostOrderBlockIteratorBegin(); blk != nilBlk; blk = f.ReversePostOrderBlockIteratorNext() { - a.fixMergeState(f, blk) - } - // Finally, we insert the spill instructions as we know all the places where the reloads happen. 
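Aside on the deleted loopTreeDFS above: the live-ins recorded at a loop header are pushed down into the nested loops with an explicit stack rather than recursion. The sketch below keeps only that propagation step and drops the phi exclusion and the .spilled flag bookkeeping; the node type and field names are invented for the example.

package main

import "fmt"

// node is a toy loop-nesting-forest node: a loop header, the values live into
// it, and the loops nested inside it.
type node struct {
	liveIns  []string
	children []*node
}

// propagate appends each loop's live-ins to every nested loop, iteratively,
// so values live around an outer loop are also treated as live in inner loops.
func propagate(root *node) {
	stack := []*node{root}
	for len(stack) > 0 {
		n := stack[len(stack)-1]
		stack = stack[:len(stack)-1]
		for _, c := range n.children {
			c.liveIns = append(c.liveIns, n.liveIns...) // the original notes deduplication as a TODO.
			stack = append(stack, c)
		}
	}
}

func main() {
	inner := &node{}
	outer := &node{liveIns: []string{"v7"}, children: []*node{inner}}
	propagate(outer)
	fmt.Println(inner.liveIns) // [v7]
}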
- a.scheduleSpills(f) -} - -func (a *Allocator[I, B, F]) updateLiveInVRState(liveness *blockState[I, B, F]) { - currentBlockID := a.state.currentBlockID - for _, vs := range liveness.liveIns { - vs.lastUse = programCounterLiveIn - vs.lastUseUpdatedAtBlockID = currentBlockID - } -} - -func (a *Allocator[I, B, F]) finalizeStartReg(f F, blk B) { - bID := blk.ID() - s := &a.state - currentBlkState := a.getOrAllocateBlockState(bID) - if currentBlkState.startFromPredIndex > -1 { - return - } - - s.currentBlockID = bID - a.updateLiveInVRState(currentBlkState) - - preds := blk.Preds() - var predState *blockState[I, B, F] - switch preds { - case 0: // This is the entry block. - case 1: - predID := f.Pred(blk, 0).ID() - predState = a.getOrAllocateBlockState(predID) - currentBlkState.startFromPredIndex = 0 - default: - // TODO: there should be some better heuristic to choose the predecessor. - for i := 0; i < preds; i++ { - predID := f.Pred(blk, i).ID() - if _predState := a.getOrAllocateBlockState(predID); _predState.visited { - predState = _predState - currentBlkState.startFromPredIndex = i - break - } - } - } - if predState == nil { - if !blk.Entry() { - panic(fmt.Sprintf("BUG: at lease one predecessor should be visited for blk%d", blk.ID())) - } - for _, u := range s.argRealRegs { - s.useRealReg(u.RealReg(), s.getVRegState(u.ID())) - } - currentBlkState.startFromPredIndex = 0 - } else { - if wazevoapi.RegAllocLoggingEnabled { - fmt.Printf("allocating blk%d starting from blk%d (on index=%d) \n", - bID, f.Pred(blk, currentBlkState.startFromPredIndex).ID(), currentBlkState.startFromPredIndex) - } - s.resetAt(predState) - } - - s.regsInUse.range_(func(allocated RealReg, v *vrState[I, B, F]) { - currentBlkState.startRegs.add(allocated, v) - }) - if wazevoapi.RegAllocLoggingEnabled { - fmt.Printf("finalized start reg for blk%d: %s\n", blk.ID(), currentBlkState.startRegs.format(a.regInfo)) - } -} - -func (a *Allocator[I, B, F]) allocBlock(f F, blk B) { - bID := blk.ID() - s := &a.state - currentBlkState := a.getOrAllocateBlockState(bID) - s.currentBlockID = bID - - if currentBlkState.startFromPredIndex < 0 { - panic("BUG: startFromPredIndex should be set in finalizeStartReg prior to allocBlock") - } - - // Clears the previous state. - s.regsInUse.range_(func(allocatedRealReg RealReg, vr *vrState[I, B, F]) { vr.r = RealRegInvalid }) - s.regsInUse.reset() - // Then set the start state. - currentBlkState.startRegs.range_(func(allocatedRealReg RealReg, vr *vrState[I, B, F]) { s.useRealReg(allocatedRealReg, vr) }) - - desiredUpdated := a.ss[:0] - - // Update the last use of each VReg. - a.copies = a.copies[:0] // Stores the copy instructions. - var pc programCounter - var nilInstr I - for instr := blk.InstrIteratorBegin(); instr != nilInstr; instr = blk.InstrIteratorNext() { - var useState *vrState[I, B, F] - for _, use := range instr.Uses(&a.vs) { - useState = s.getVRegState(use.ID()) - if !use.IsRealReg() { - useState.lastUse = pc - } - } - - if instr.IsCopy() { - def := instr.Defs(&a.vs)[0] - a.copies = append(a.copies, _copy[I, B, F]{src: useState, dstID: def.ID()}) - r := def.RealReg() - if r != RealRegInvalid { - if !useState.isPhi { // TODO: no idea why do we need this. - useState.desiredLoc = newDesiredLocReg(r) - desiredUpdated = append(desiredUpdated, useState) - } - } - } - pc++ - } - - // Mark all live-out values by checking live-in of the successors. - // While doing so, we also update the desired register values. 
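Aside on the deleted finalizeStartReg above: a block starts from the end state of one already-visited predecessor (the entry block instead seeds the argument registers), and the chosen assignment becomes the block's startRegs. A toy version of the "copy the first visited predecessor's end state" step, with illustrative types; the single-predecessor fast path and the predecessor-choice heuristic are left out.

package main

import "fmt"

type blockState struct {
	visited bool
	endRegs map[int]string // real register index -> virtual value held at block end
}

// startStateFor picks the first already-visited predecessor and copies its end
// assignment as the block's starting assignment; it returns nil when no
// predecessor has been visited yet (the entry-block case in the original).
func startStateFor(preds []*blockState) map[int]string {
	for _, p := range preds {
		if !p.visited {
			continue
		}
		start := make(map[int]string, len(p.endRegs))
		for r, v := range p.endRegs {
			start[r] = v
		}
		return start
	}
	return nil
}

func main() {
	a := &blockState{visited: true, endRegs: map[int]string{0: "v3", 5: "v9"}}
	b := &blockState{visited: false}
	fmt.Println(startStateFor([]*blockState{b, a})) // map[0:v3 5:v9]
}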
- var succ B - var nilBlk B - for i, ns := 0, blk.Succs(); i < ns; i++ { - succ = f.Succ(blk, i) - if succ == nilBlk { - continue - } - - succID := succ.ID() - succState := a.getOrAllocateBlockState(succID) - for _, st := range succState.liveIns { - if st.phiBlk() != succ { - st.lastUse = programCounterLiveOut - } - } - - if succState.startFromPredIndex > -1 { - if wazevoapi.RegAllocLoggingEnabled { - fmt.Printf("blk%d -> blk%d: start_regs: %s\n", bID, succID, succState.startRegs.format(a.regInfo)) - } - succState.startRegs.range_(func(allocatedRealReg RealReg, vs *vrState[I, B, F]) { - vs.desiredLoc = newDesiredLocReg(allocatedRealReg) - desiredUpdated = append(desiredUpdated, vs) - }) - for _, p := range f.BlockParams(succ, &a.vs) { - vs := s.getVRegState(p.ID()) - if vs.desiredLoc.realReg() == RealRegInvalid { - vs.desiredLoc = desiredLocStack - desiredUpdated = append(desiredUpdated, vs) - } - } - } - } - - // Propagate the desired register values from the end of the block to the beginning. - for _, instr := range a.copies { - defState := s.getVRegState(instr.dstID) - desired := defState.desiredLoc.realReg() - useState := instr.src - if useState.phiBlk() != succ && useState.desiredLoc == desiredLocUnspecified { - useState.desiredLoc = newDesiredLocReg(desired) - desiredUpdated = append(desiredUpdated, useState) - } - } - - pc = 0 - for instr := blk.InstrIteratorBegin(); instr != nilInstr; instr = blk.InstrIteratorNext() { - if wazevoapi.RegAllocLoggingEnabled { - fmt.Println(instr) - } - - var currentUsedSet RegSet - killSet := a.reals[:0] - - // Gather the set of registers that will be used in the current instruction. - uses := instr.Uses(&a.vs) - for _, use := range uses { - if use.IsRealReg() { - r := use.RealReg() - currentUsedSet = currentUsedSet.add(r) - if a.allocatableSet.has(r) { - killSet = append(killSet, r) - } - } else { - vs := s.getVRegState(use.ID()) - if r := vs.r; r != RealRegInvalid { - currentUsedSet = currentUsedSet.add(r) - } - } - } - - for i, use := range uses { - if !use.IsRealReg() { - vs := s.getVRegState(use.ID()) - killed := vs.lastUse == pc - r := vs.r - - if r == RealRegInvalid { - r = a.findOrSpillAllocatable(s, a.regInfo.AllocatableRegisters[use.RegType()], currentUsedSet, - // Prefer the desired register if it's available. - vs.desiredLoc.realReg()) - vs.recordReload(f, blk) - f.ReloadRegisterBefore(use.SetRealReg(r), instr) - s.useRealReg(r, vs) - } - if wazevoapi.RegAllocLoggingEnabled { - fmt.Printf("\ttrying to use v%v on %s\n", use.ID(), a.regInfo.RealRegName(r)) - } - instr.AssignUse(i, use.SetRealReg(r)) - currentUsedSet = currentUsedSet.add(r) - if killed { - if wazevoapi.RegAllocLoggingEnabled { - fmt.Printf("\tkill v%d with %s\n", use.ID(), a.regInfo.RealRegName(r)) - } - killSet = append(killSet, r) - } - } - } - - isIndirect := instr.IsIndirectCall() - if instr.IsCall() || isIndirect { - addr := RealRegInvalid - if isIndirect { - addr = a.vs[0].RealReg() - } - a.releaseCallerSavedRegs(addr) - } - - for _, r := range killSet { - s.releaseRealReg(r) - } - a.reals = killSet - - defs := instr.Defs(&a.vs) - switch len(defs) { - default: - // Some instructions define multiple values on real registers. - // E.g. call instructions (following calling convention) / div instruction on x64 that defines both rax and rdx. - // - // Note that currently I assume that such instructions define only the pre colored real registers, not the VRegs - // that require allocations. 
If we need to support such case, we need to add the logic to handle it here, - // though is there any such instruction? - for _, def := range defs { - if !def.IsRealReg() { - panic("BUG: multiple defs should be on real registers") - } - r := def.RealReg() - if s.regsInUse.has(r) { - s.releaseRealReg(r) - } - s.useRealReg(r, s.getVRegState(def.ID())) - } - case 0: - case 1: - def := defs[0] - vState := s.getVRegState(def.ID()) - if def.IsRealReg() { - r := def.RealReg() - if a.allocatableSet.has(r) { - if s.regsInUse.has(r) { - s.releaseRealReg(r) - } - s.useRealReg(r, vState) - } - } else { - r := vState.r - - if desired := vState.desiredLoc.realReg(); desired != RealRegInvalid { - if r != desired { - if (vState.isPhi && vState.defBlk == succ) || - // If this is not a phi and it's already assigned a real reg, - // this value has multiple definitions, hence we cannot assign the desired register. - (!s.regsInUse.has(desired) && r == RealRegInvalid) { - // If the phi value is passed via a real register, we force the value to be in the desired register. - if wazevoapi.RegAllocLoggingEnabled { - fmt.Printf("\t\tv%d is phi and desiredReg=%s\n", def.ID(), a.regInfo.RealRegName(desired)) - } - if r != RealRegInvalid { - // If the value is already in a different real register, we release it to change the state. - // Otherwise, multiple registers might have the same values at the end, which results in - // messing up the merge state reconciliation. - s.releaseRealReg(r) - } - r = desired - s.releaseRealReg(r) - s.useRealReg(r, vState) - } - } - } - - // Allocate a new real register if `def` is not currently assigned one. - // It can happen when multiple instructions define the same VReg (e.g. const loads). - if r == RealRegInvalid { - if instr.IsCopy() { - copySrc := instr.Uses(&a.vs)[0].RealReg() - if a.allocatableSet.has(copySrc) && !s.regsInUse.has(copySrc) { - r = copySrc - } - } - if r == RealRegInvalid { - typ := def.RegType() - r = a.findOrSpillAllocatable(s, a.regInfo.AllocatableRegisters[typ], RegSet(0), RealRegInvalid) - } - s.useRealReg(r, vState) - } - dr := def.SetRealReg(r) - instr.AssignDef(dr) - if wazevoapi.RegAllocLoggingEnabled { - fmt.Printf("\tdefining v%d with %s\n", def.ID(), a.regInfo.RealRegName(r)) - } - if vState.isPhi { - if vState.desiredLoc.stack() { // Stack based phi value. - f.StoreRegisterAfter(dr, instr) - // Release the real register as it's not used anymore. - s.releaseRealReg(r) - } else { - // Only the register based phis are necessary to track the defining instructions - // since the stack-based phis are already having stores inserted ^. - n := a.phiDefInstListPool.Allocate() - n.instr = instr - n.next = vState.phiDefInstList - n.v = dr - vState.phiDefInstList = n - } - } else { - vState.defInstr = instr - vState.defBlk = blk - } - } - } - if wazevoapi.RegAllocLoggingEnabled { - fmt.Println(instr) - } - pc++ - } - - s.regsInUse.range_(func(allocated RealReg, v *vrState[I, B, F]) { currentBlkState.endRegs.add(allocated, v) }) - - currentBlkState.visited = true - if wazevoapi.RegAllocLoggingEnabled { - currentBlkState.dump(a.regInfo) - } - - // Reset the desired end location. - for _, vs := range desiredUpdated { - vs.desiredLoc = desiredLocUnspecified - } - a.ss = desiredUpdated[:0] - - for i := 0; i < blk.Succs(); i++ { - succ := f.Succ(blk, i) - if succ == nilBlk { - continue - } - // If the successor is not visited yet, finalize the start state. 
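Aside on the def handling in the deleted allocBlock above: when the defining instruction is a plain copy and the copy's source register is allocatable and currently free, the destination reuses that register so the copy can effectively become a no-op. A small sketch of just that decision; the names and the pickAny callback are stand-ins, not the backend's API.

package main

import "fmt"

// chooseDefReg reuses the copy source register for the destination when the
// instruction is a copy and that register is free; otherwise it falls back to
// the regular allocation path.
func chooseDefReg(isCopy bool, srcReg int, free map[int]bool, pickAny func() int) int {
	if isCopy && free[srcReg] {
		return srcReg
	}
	return pickAny()
}

func main() {
	free := map[int]bool{3: true}
	pick := func() int { return 7 } // stand-in for the spill-or-allocate fallback.
	fmt.Println(chooseDefReg(true, 3, free, pick))  // 3: the copy source is reused.
	fmt.Println(chooseDefReg(false, 3, free, pick)) // 7: normal allocation.
}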
- a.finalizeStartReg(f, succ) - } -} - -func (a *Allocator[I, B, F]) releaseCallerSavedRegs(addrReg RealReg) { - s := &a.state - - for allocated := RealReg(0); allocated < 64; allocated++ { - if allocated == addrReg { // If this is the call indirect, we should not touch the addr register. - continue - } - if vs := s.regsInUse.get(allocated); vs != nil { - if vs.v.IsRealReg() { - continue // This is the argument register as it's already used by VReg backed by the corresponding RealReg. - } - if !a.regInfo.CallerSavedRegisters.has(allocated) { - // If this is not a caller-saved register, it is safe to keep it across the call. - continue - } - s.releaseRealReg(allocated) - } - } -} - -func (a *Allocator[I, B, F]) fixMergeState(f F, blk B) { - preds := blk.Preds() - if preds <= 1 { - return - } - - s := &a.state - - // Restores the state at the beginning of the block. - bID := blk.ID() - blkSt := a.getOrAllocateBlockState(bID) - desiredOccupants := &blkSt.startRegs - var desiredOccupantsSet RegSet - for i, v := range desiredOccupants { - if v != nil { - desiredOccupantsSet = desiredOccupantsSet.add(RealReg(i)) - } - } - - if wazevoapi.RegAllocLoggingEnabled { - fmt.Println("fixMergeState", blk.ID(), ":", desiredOccupants.format(a.regInfo)) - } - - s.currentBlockID = bID - a.updateLiveInVRState(blkSt) - - for i := 0; i < preds; i++ { - if i == blkSt.startFromPredIndex { - continue - } - - pred := f.Pred(blk, i) - predSt := a.getOrAllocateBlockState(pred.ID()) - - s.resetAt(predSt) - - // Finds the free registers if any. - intTmp, floatTmp := VRegInvalid, VRegInvalid - if intFree := s.findAllocatable( - a.regInfo.AllocatableRegisters[RegTypeInt], desiredOccupantsSet, - ); intFree != RealRegInvalid { - intTmp = FromRealReg(intFree, RegTypeInt) - } - if floatFree := s.findAllocatable( - a.regInfo.AllocatableRegisters[RegTypeFloat], desiredOccupantsSet, - ); floatFree != RealRegInvalid { - floatTmp = FromRealReg(floatFree, RegTypeFloat) - } - - for r := RealReg(0); r < 64; r++ { - desiredVReg := desiredOccupants.get(r) - if desiredVReg == nil { - continue - } - - currentVReg := s.regsInUse.get(r) - if currentVReg != nil && desiredVReg.v.ID() == currentVReg.v.ID() { - continue - } - - typ := desiredVReg.v.RegType() - var tmpRealReg VReg - if typ == RegTypeInt { - tmpRealReg = intTmp - } else { - tmpRealReg = floatTmp - } - a.reconcileEdge(f, r, pred, currentVReg, desiredVReg, tmpRealReg, typ) - } - } -} - -// reconcileEdge reconciles the register state between the current block and the predecessor for the real register `r`. -// -// - currentVReg is the current VReg value that sits on the register `r`. This can be VRegInvalid if the register is not used at the end of the predecessor. -// - desiredVReg is the desired VReg value that should be on the register `r`. -// - freeReg is the temporary register that can be used to swap the values, which may or may not be used. -// - typ is the register type of the `r`. -func (a *Allocator[I, B, F]) reconcileEdge(f F, - r RealReg, - pred B, - currentState, desiredState *vrState[I, B, F], - freeReg VReg, - typ RegType, -) { - desiredVReg := desiredState.v - currentVReg := VRegInvalid - if currentState != nil { - currentVReg = currentState.v - } - // There are four cases to consider: - // 1. currentVReg is valid, but desiredVReg is on the stack. - // 2. Both currentVReg and desiredVReg are valid. - // 3. Desired is on a different register than `r` and currentReg is not valid. - // 4. Desired is on the stack and currentReg is not valid. 
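Aside on the deleted releaseCallerSavedRegs above: before a call, every caller-saved register still holding a virtual value is released so the value gets reloaded later if needed, while the indirect-call address register is left alone. The original also keeps values that are themselves backed by real argument registers; this simplified sketch with illustrative types omits that case.

package main

import "fmt"

// releaseCallerSaved drops values held in caller-saved registers ahead of a
// call, keeping the indirect-call address register untouched.
func releaseCallerSaved(inUse map[int]string, callerSaved map[int]bool, addrReg int) {
	for r := range inUse {
		if r == addrReg || !callerSaved[r] {
			continue // callee-saved registers survive the call as-is.
		}
		delete(inUse, r) // the value will be reloaded from its spill slot if used again.
	}
}

func main() {
	inUse := map[int]string{0: "v1", 1: "v2", 19: "v3"}
	callerSaved := map[int]bool{0: true, 1: true}
	releaseCallerSaved(inUse, callerSaved, 1 /* addrReg */)
	fmt.Println(inUse) // map[1:v2 19:v3]
}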
- - s := &a.state - if currentVReg.Valid() { - er := desiredState.r - if er == RealRegInvalid { - // Case 1: currentVReg is valid, but desiredVReg is on the stack. - if wazevoapi.RegAllocLoggingEnabled { - fmt.Printf("\t\tv%d is desired to be on %s, but currently on the stack\n", - desiredVReg.ID(), a.regInfo.RealRegName(r), - ) - } - // We need to move the current value to the stack, and reload the desired value into the register. - // TODO: we can do better here. - f.StoreRegisterBefore(currentVReg.SetRealReg(r), pred.LastInstrForInsertion()) - s.releaseRealReg(r) - - desiredState.recordReload(f, pred) - f.ReloadRegisterBefore(desiredVReg.SetRealReg(r), pred.LastInstrForInsertion()) - s.useRealReg(r, desiredState) - return - } else { - // Case 2: Both currentVReg and desiredVReg are valid. - if wazevoapi.RegAllocLoggingEnabled { - fmt.Printf("\t\tv%d is desired to be on %s, but currently on %s\n", - desiredVReg.ID(), a.regInfo.RealRegName(r), a.regInfo.RealRegName(er), - ) - } - // This case, we need to swap the values between the current and desired values. - f.SwapBefore( - currentVReg.SetRealReg(r), - desiredVReg.SetRealReg(er), - freeReg, - pred.LastInstrForInsertion(), - ) - s.allocatedRegSet = s.allocatedRegSet.add(freeReg.RealReg()) - s.releaseRealReg(r) - s.releaseRealReg(er) - s.useRealReg(r, desiredState) - s.useRealReg(er, currentState) - if wazevoapi.RegAllocLoggingEnabled { - fmt.Printf("\t\tv%d previously on %s moved to %s\n", currentVReg.ID(), a.regInfo.RealRegName(r), a.regInfo.RealRegName(er)) - } - } - } else { - if wazevoapi.RegAllocLoggingEnabled { - fmt.Printf("\t\tv%d is desired to be on %s, current not used\n", - desiredVReg.ID(), a.regInfo.RealRegName(r), - ) - } - if currentReg := desiredState.r; currentReg != RealRegInvalid { - // Case 3: Desired is on a different register than `r` and currentReg is not valid. - // We simply need to move the desired value to the register. - f.InsertMoveBefore( - FromRealReg(r, typ), - desiredVReg.SetRealReg(currentReg), - pred.LastInstrForInsertion(), - ) - s.releaseRealReg(currentReg) - } else { - // Case 4: Both currentVReg and desiredVReg are not valid. - // We simply need to reload the desired value into the register. - desiredState.recordReload(f, pred) - f.ReloadRegisterBefore(desiredVReg.SetRealReg(r), pred.LastInstrForInsertion()) - } - s.useRealReg(r, desiredState) - } -} - -func (a *Allocator[I, B, F]) scheduleSpills(f F) { - states := a.state.vrStates - for i := 0; i <= states.MaxIDEncountered(); i++ { - vs := states.Get(i) - if vs == nil { - continue - } - if vs.spilled { - a.scheduleSpill(f, vs) - } - } -} - -func (a *Allocator[I, B, F]) scheduleSpill(f F, vs *vrState[I, B, F]) { - v := vs.v - // If the value is the phi value, we need to insert a spill after each phi definition. - if vs.isPhi { - for defInstr := vs.phiDefInstList; defInstr != nil; defInstr = defInstr.next { - f.StoreRegisterAfter(defInstr.v, defInstr.instr) - } - return - } - - pos := vs.lca - definingBlk := vs.defBlk - r := RealRegInvalid - var nilBlk B - if definingBlk == nilBlk { - panic(fmt.Sprintf("BUG: definingBlk should not be nil for %s. This is likley a bug in backend lowering logic", vs.v.String())) - } - if pos == nilBlk { - panic(fmt.Sprintf("BUG: pos should not be nil for %s. 
This is likley a bug in backend lowering logic", vs.v.String())) - } - - if wazevoapi.RegAllocLoggingEnabled { - fmt.Printf("v%d is spilled in blk%d, lca=blk%d\n", v.ID(), definingBlk.ID(), pos.ID()) - } - for pos != definingBlk { - st := a.getOrAllocateBlockState(pos.ID()) - for rr := RealReg(0); rr < 64; rr++ { - if vs := st.startRegs.get(rr); vs != nil && vs.v == v { - r = rr - // Already in the register, so we can place the spill at the beginning of the block. - break - } - } - - if r != RealRegInvalid { - break - } - - pos = f.Idom(pos) - } - - if pos == definingBlk { - defInstr := vs.defInstr - defInstr.Defs(&a.vs) - if wazevoapi.RegAllocLoggingEnabled { - fmt.Printf("schedule spill v%d after %v\n", v.ID(), defInstr) - } - f.StoreRegisterAfter(a.vs[0], defInstr) - } else { - // Found an ancestor block that holds the value in the register at the beginning of the block. - // We need to insert a spill before the last use. - first := pos.FirstInstr() - if wazevoapi.RegAllocLoggingEnabled { - fmt.Printf("schedule spill v%d before %v\n", v.ID(), first) - } - f.StoreRegisterAfter(v.SetRealReg(r), first) - } -} - -// Reset resets the allocator's internal state so that it can be reused. -func (a *Allocator[I, B, F]) Reset() { - a.state.reset() - a.blockStates.Reset() - a.phiDefInstListPool.Reset() - a.vs = a.vs[:0] -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go deleted file mode 100644 index ce84c9c0c..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go +++ /dev/null @@ -1,96 +0,0 @@ -package regalloc - -import ( - "fmt" - "strings" -) - -// NewRegSet returns a new RegSet with the given registers. -func NewRegSet(regs ...RealReg) RegSet { - var ret RegSet - for _, r := range regs { - ret = ret.add(r) - } - return ret -} - -// RegSet represents a set of registers. 
-type RegSet uint64 - -func (rs RegSet) format(info *RegisterInfo) string { //nolint:unused - var ret []string - for i := 0; i < 64; i++ { - if rs&(1<<uint(i)) != 0 { - ret = append(ret, info.RealRegName(RealReg(i))) - } - } - return strings.Join(ret, ", ") -} - -func (rs RegSet) has(r RealReg) bool { - return rs&(1<<uint(r)) != 0 -} - -func (rs RegSet) add(r RealReg) RegSet { - if r >= 64 { - return rs - } - return rs | 1<<uint(r) -} - -func (rs RegSet) Range(f func(allocatedRealReg RealReg)) { - for i := 0; i < 64; i++ { - if rs&(1<<uint(i)) != 0 { - f(RealReg(i)) - } - } -} - -type regInUseSet[I Instr, B Block[I], F Function[I, B]] [64]*vrState[I, B, F] - -func newRegInUseSet[I Instr, B Block[I], F Function[I, B]]() regInUseSet[I, B, F] { - var ret regInUseSet[I, B, F] - ret.reset() - return ret -} - -func (rs *regInUseSet[I, B, F]) reset() { - clear(rs[:]) -} - -func (rs *regInUseSet[I, B, F]) format(info *RegisterInfo) string { //nolint:unused - var ret []string - for i, vr := range rs { - if vr != nil { - ret = append(ret, fmt.Sprintf("(%s->v%d)", info.RealRegName(RealReg(i)), vr.v.ID())) - } - } - return strings.Join(ret, ", ") -} - -func (rs *regInUseSet[I, B, F]) has(r RealReg) bool { - return r < 64 && rs[r] != nil -} - -func (rs *regInUseSet[I, B, F]) get(r RealReg) *vrState[I, B, F] { - return rs[r] -} - -func (rs *regInUseSet[I, B, F]) remove(r RealReg) { - rs[r] = nil -} - -func (rs *regInUseSet[I, B, F]) add(r RealReg, vr *vrState[I, B, F]) { - if r >= 64 { - return - } - rs[r] = vr -} - -func (rs *regInUseSet[I, B, F]) range_(f func(allocatedRealReg RealReg, vr *vrState[I, B, F])) { - for i, vr := range rs { - if vr != nil { - f(RealReg(i), vr) - } - } -} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/vdef.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/vdef.go deleted file mode 100644 index 47a275a3a..000000000 --- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/vdef.go +++ /dev/null @@ -1,19 +0,0 @@ -package backend - -import ( - "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" -) - -// SSAValueDefinition represents a definition of an SSA value. -type SSAValueDefinition struct { - V ssa.Value - // Instr is not nil if this is a definition from an instruction. - Instr *ssa.Instruction - // RefCount is the number of references to the result. - RefCount uint32 -} - -// IsFromInstr returns true if this definition is from an instruction. -func (d *SSAValueDefinition) IsFromInstr() bool { - return d.Instr != nil -} |
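Aside on the deleted regset.go above: RegSet is a 64-bit bitset keyed by register index, with adds of out-of-range registers silently ignored, and regInUseSet applies the same indexing to a fixed array of per-register state pointers. An equivalent stand-alone sketch of the bitset half, with lower-cased names to avoid implying the package API:

package main

import "fmt"

// regSet is a 64-entry bitset over register indices; adding an index >= 64 is
// a no-op, matching the deleted RegSet's behavior.
type regSet uint64

func (rs regSet) add(r uint) regSet {
	if r >= 64 {
		return rs
	}
	return rs | 1<<r
}

func (rs regSet) has(r uint) bool { return r < 64 && rs&(1<<r) != 0 }

func (rs regSet) each(f func(r uint)) {
	for r := uint(0); r < 64; r++ {
		if rs.has(r) {
			f(r)
		}
	}
}

func main() {
	var rs regSet
	rs = rs.add(3).add(63).add(99)              // the out-of-range add is ignored.
	rs.each(func(r uint) { fmt.Print(r, " ") }) // 3 63
	fmt.Println(rs.has(4))                      // false
}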