Diffstat (limited to 'vendor/github.com/tetratelabs/wazero/internal/engine')
99 files changed, 51250 insertions, 0 deletions
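Editor's note: the bulk of this change is the new interpreter compiler (compiler.go below), which lowers a raw Wasm function body into interpreterir operations and decodes almost every instruction immediate as an unsigned LEB128 varint via leb128.LoadUint32 / leb128.DecodeUint32, using the returned byte count to advance c.pc. Purely as a self-contained illustration of that varint format — decodeUint32 here is a hypothetical re-implementation, not wazero's internal/leb128 package — the following sketch returns the same (value, byte-count, error) shape the compiler relies on:

package main

import (
	"errors"
	"fmt"
)

// decodeUint32 is an illustrative ULEB128 decoder (not wazero's): it returns
// the decoded value and the number of bytes consumed, mirroring the
// (value, n, err) shape that leb128.LoadUint32 provides to the compiler.
func decodeUint32(buf []byte) (uint32, uint64, error) {
	var result uint32
	var shift uint
	for i, b := range buf {
		if i >= 5 { // a uint32 needs at most five LEB128 bytes
			return 0, 0, errors.New("leb128: value exceeds 32 bits")
		}
		result |= uint32(b&0x7f) << shift
		if b&0x80 == 0 { // continuation bit clear: this was the last byte
			return result, uint64(i + 1), nil
		}
		shift += 7
	}
	return 0, 0, errors.New("leb128: truncated input")
}

func main() {
	v, n, err := decodeUint32([]byte{0xE5, 0x8E, 0x26}) // classic example: 624485
	fmt.Println(v, n, err)                              // 624485 3 <nil>
}

In the diff below, that byte count is exactly what the compiler adds to c.pc after reading each immediate.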
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/compiler.go b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/compiler.go new file mode 100644 index 000000000..56dfac620 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/compiler.go @@ -0,0 +1,3634 @@ +package interpreter + +import ( + "bytes" + "encoding/binary" + "fmt" + "math" + "strings" + + "github.com/tetratelabs/wazero/api" + "github.com/tetratelabs/wazero/internal/leb128" + "github.com/tetratelabs/wazero/internal/wasm" +) + +type controlFrameKind byte + +const ( + controlFrameKindBlockWithContinuationLabel controlFrameKind = iota + controlFrameKindBlockWithoutContinuationLabel + controlFrameKindFunction + controlFrameKindLoop + controlFrameKindIfWithElse + controlFrameKindIfWithoutElse +) + +type ( + controlFrame struct { + frameID uint32 + // originalStackLen holds the number of values on the stack + // when Start executing this control frame minus params for the block. + originalStackLenWithoutParam int + blockType *wasm.FunctionType + kind controlFrameKind + } + controlFrames struct{ frames []controlFrame } +) + +func (c *controlFrame) ensureContinuation() { + // Make sure that if the frame is block and doesn't have continuation, + // change the Kind so we can emit the continuation block + // later when we reach the End instruction of this frame. + if c.kind == controlFrameKindBlockWithoutContinuationLabel { + c.kind = controlFrameKindBlockWithContinuationLabel + } +} + +func (c *controlFrame) asLabel() label { + switch c.kind { + case controlFrameKindBlockWithContinuationLabel, + controlFrameKindBlockWithoutContinuationLabel: + return newLabel(labelKindContinuation, c.frameID) + case controlFrameKindLoop: + return newLabel(labelKindHeader, c.frameID) + case controlFrameKindFunction: + return newLabel(labelKindReturn, 0) + case controlFrameKindIfWithElse, + controlFrameKindIfWithoutElse: + return newLabel(labelKindContinuation, c.frameID) + } + panic(fmt.Sprintf("unreachable: a bug in interpreterir implementation: %v", c.kind)) +} + +func (c *controlFrames) functionFrame() *controlFrame { + // No need to check stack bound + // as we can assume that all the operations + // are valid thanks to validateFunction + // at module validation phase. + return &c.frames[0] +} + +func (c *controlFrames) get(n int) *controlFrame { + // No need to check stack bound + // as we can assume that all the operations + // are valid thanks to validateFunction + // at module validation phase. + return &c.frames[len(c.frames)-n-1] +} + +func (c *controlFrames) top() *controlFrame { + // No need to check stack bound + // as we can assume that all the operations + // are valid thanks to validateFunction + // at module validation phase. + return &c.frames[len(c.frames)-1] +} + +func (c *controlFrames) empty() bool { + return len(c.frames) == 0 +} + +func (c *controlFrames) pop() (frame *controlFrame) { + // No need to check stack bound + // as we can assume that all the operations + // are valid thanks to validateFunction + // at module validation phase. + frame = c.top() + c.frames = c.frames[:len(c.frames)-1] + return +} + +func (c *controlFrames) push(frame controlFrame) { + c.frames = append(c.frames, frame) +} + +func (c *compiler) initializeStack() { + // Reuse the existing slice. 
+ c.localIndexToStackHeightInUint64 = c.localIndexToStackHeightInUint64[:0] + var current int + for _, lt := range c.sig.Params { + c.localIndexToStackHeightInUint64 = append(c.localIndexToStackHeightInUint64, current) + if lt == wasm.ValueTypeV128 { + current++ + } + current++ + } + + if c.callFrameStackSizeInUint64 > 0 { + // We reserve the stack slots for result values below the return call frame slots. + if diff := c.sig.ResultNumInUint64 - c.sig.ParamNumInUint64; diff > 0 { + current += diff + } + } + + // Non-func param locals Start after the return call frame. + current += c.callFrameStackSizeInUint64 + + for _, lt := range c.localTypes { + c.localIndexToStackHeightInUint64 = append(c.localIndexToStackHeightInUint64, current) + if lt == wasm.ValueTypeV128 { + current++ + } + current++ + } + + // Push function arguments. + for _, t := range c.sig.Params { + c.stackPush(wasmValueTypeTounsignedType(t)) + } + + if c.callFrameStackSizeInUint64 > 0 { + // Reserve the stack slots for results. + for i := 0; i < c.sig.ResultNumInUint64-c.sig.ParamNumInUint64; i++ { + c.stackPush(unsignedTypeI64) + } + + // Reserve the stack slots for call frame. + for i := 0; i < c.callFrameStackSizeInUint64; i++ { + c.stackPush(unsignedTypeI64) + } + } +} + +// compiler is in charge of lowering raw Wasm function body to get compilationResult. +// This is created per *wasm.Module and reused for all functions in it to reduce memory allocations. +type compiler struct { + module *wasm.Module + enabledFeatures api.CoreFeatures + callFrameStackSizeInUint64 int + stack []unsignedType + currentFrameID uint32 + controlFrames controlFrames + unreachableState struct { + on bool + depth int + } + pc, currentOpPC uint64 + result compilationResult + + // body holds the code for the function's body where Wasm instructions are stored. + body []byte + // sig is the function type of the target function. + sig *wasm.FunctionType + // localTypes holds the target function locals' value types except function params. + localTypes []wasm.ValueType + // localIndexToStackHeightInUint64 maps the local index (starting with function params) to the stack height + // where the local is places. This is the necessary mapping for functions who contain vector type locals. + localIndexToStackHeightInUint64 []int + + // types hold all the function types in the module where the targe function exists. + types []wasm.FunctionType + // funcs holds the type indexes for all declared functions in the module where the target function exists. + funcs []uint32 + // globals holds the global types for all declared globals in the module where the target function exists. + globals []wasm.GlobalType + + // needSourceOffset is true if this module requires DWARF based stack trace. + needSourceOffset bool + // bodyOffsetInCodeSection is the offset of the body of this function in the original Wasm binary's code section. + bodyOffsetInCodeSection uint64 + + ensureTermination bool + // Pre-allocated bytes.Reader to be used in various places. + br *bytes.Reader + funcTypeToSigs funcTypeToIRSignatures + + next int +} + +//lint:ignore U1000 for debugging only. +func (c *compiler) stackDump() string { + strs := make([]string, 0, len(c.stack)) + for _, s := range c.stack { + strs = append(strs, s.String()) + } + return "[" + strings.Join(strs, ", ") + "]" +} + +func (c *compiler) markUnreachable() { + c.unreachableState.on = true +} + +func (c *compiler) resetUnreachable() { + c.unreachableState.on = false +} + +// memoryType is the type of memory in a compiled module. 
+type memoryType byte + +const ( + // memoryTypeNone indicates there is no memory. + memoryTypeNone memoryType = iota + // memoryTypeStandard indicates there is a non-shared memory. + memoryTypeStandard + // memoryTypeShared indicates there is a shared memory. + memoryTypeShared +) + +type compilationResult struct { + // Operations holds interpreterir operations compiled from Wasm instructions in a Wasm function. + Operations []unionOperation + + // IROperationSourceOffsetsInWasmBinary is index-correlated with Operation and maps each operation to the corresponding source instruction's + // offset in the original WebAssembly binary. + // Non nil only when the given Wasm module has the DWARF section. + IROperationSourceOffsetsInWasmBinary []uint64 + + // LabelCallers maps label to the number of callers to that label. + // Here "callers" means that the call-sites which jumps to the label with br, br_if or br_table + // instructions. + // + // Note: zero possible and allowed in wasm. e.g. + // + // (block + // (br 0) + // (block i32.const 1111) + // ) + // + // This example the label corresponding to `(block i32.const 1111)` is never be reached at runtime because `br 0` exits the function before we reach there + LabelCallers map[label]uint32 + // UsesMemory is true if this function might use memory. + UsesMemory bool + + // The following fields are per-module values, not per-function. + + // Globals holds all the declarations of globals in the module from which this function is compiled. + Globals []wasm.GlobalType + // Functions holds all the declarations of function in the module from which this function is compiled, including itself. + Functions []wasm.Index + // Types holds all the types in the module from which this function is compiled. + Types []wasm.FunctionType + // Memory indicates the type of memory of the module. + Memory memoryType + // HasTable is true if the module from which this function is compiled has table declaration. + HasTable bool + // HasDataInstances is true if the module has data instances which might be used by memory.init or data.drop instructions. + HasDataInstances bool + // HasDataInstances is true if the module has element instances which might be used by table.init or elem.drop instructions. + HasElementInstances bool +} + +// newCompiler returns the new *compiler for the given parameters. +// Use compiler.Next function to get compilation result per function. 
+func newCompiler(enabledFeatures api.CoreFeatures, callFrameStackSizeInUint64 int, module *wasm.Module, ensureTermination bool) (*compiler, error) { + functions, globals, mem, tables, err := module.AllDeclarations() + if err != nil { + return nil, err + } + + hasTable, hasDataInstances, hasElementInstances := len(tables) > 0, + len(module.DataSection) > 0, len(module.ElementSection) > 0 + + var mt memoryType + switch { + case mem == nil: + mt = memoryTypeNone + case mem.IsShared: + mt = memoryTypeShared + default: + mt = memoryTypeStandard + } + + types := module.TypeSection + + c := &compiler{ + module: module, + enabledFeatures: enabledFeatures, + controlFrames: controlFrames{}, + callFrameStackSizeInUint64: callFrameStackSizeInUint64, + result: compilationResult{ + Globals: globals, + Functions: functions, + Types: types, + Memory: mt, + HasTable: hasTable, + HasDataInstances: hasDataInstances, + HasElementInstances: hasElementInstances, + LabelCallers: map[label]uint32{}, + }, + globals: globals, + funcs: functions, + types: types, + ensureTermination: ensureTermination, + br: bytes.NewReader(nil), + funcTypeToSigs: funcTypeToIRSignatures{ + indirectCalls: make([]*signature, len(types)), + directCalls: make([]*signature, len(types)), + wasmTypes: types, + }, + needSourceOffset: module.DWARFLines != nil, + } + return c, nil +} + +// Next returns the next compilationResult for this compiler. +func (c *compiler) Next() (*compilationResult, error) { + funcIndex := c.next + code := &c.module.CodeSection[funcIndex] + sig := &c.types[c.module.FunctionSection[funcIndex]] + + // Reset the previous result. + c.result.Operations = c.result.Operations[:0] + c.result.IROperationSourceOffsetsInWasmBinary = c.result.IROperationSourceOffsetsInWasmBinary[:0] + c.result.UsesMemory = false + // Clears the existing entries in LabelCallers. + for frameID := uint32(0); frameID <= c.currentFrameID; frameID++ { + for k := labelKind(0); k < labelKindNum; k++ { + delete(c.result.LabelCallers, newLabel(k, frameID)) + } + } + // Reset the previous states. + c.pc = 0 + c.currentOpPC = 0 + c.currentFrameID = 0 + c.unreachableState.on, c.unreachableState.depth = false, 0 + + if err := c.compile(sig, code.Body, code.LocalTypes, code.BodyOffsetInCodeSection); err != nil { + return nil, err + } + c.next++ + return &c.result, nil +} + +// Compile lowers given function instance into interpreterir operations +// so that the resulting operations can be consumed by the interpreter +// or the compiler compilation engine. +func (c *compiler) compile(sig *wasm.FunctionType, body []byte, localTypes []wasm.ValueType, bodyOffsetInCodeSection uint64) error { + // Set function specific fields. + c.body = body + c.localTypes = localTypes + c.sig = sig + c.bodyOffsetInCodeSection = bodyOffsetInCodeSection + + // Reuses the underlying slices. + c.stack = c.stack[:0] + c.controlFrames.frames = c.controlFrames.frames[:0] + + c.initializeStack() + + // Emit const expressions for locals. + // Note that here we don't take function arguments + // into account, meaning that callers must push + // arguments before entering into the function body. + for _, t := range c.localTypes { + c.emitDefaultValue(t) + } + + // Insert the function control frame. + c.controlFrames.push(controlFrame{ + frameID: c.nextFrameID(), + blockType: c.sig, + kind: controlFrameKindFunction, + }) + + // Now, enter the function body. 
+ for !c.controlFrames.empty() && c.pc < uint64(len(c.body)) { + if err := c.handleInstruction(); err != nil { + return fmt.Errorf("handling instruction: %w", err) + } + } + return nil +} + +// Translate the current Wasm instruction to interpreterir's operations, +// and emit the results into c.results. +func (c *compiler) handleInstruction() error { + op := c.body[c.pc] + c.currentOpPC = c.pc + if false { + var instName string + if op == wasm.OpcodeVecPrefix { + instName = wasm.VectorInstructionName(c.body[c.pc+1]) + } else if op == wasm.OpcodeAtomicPrefix { + instName = wasm.AtomicInstructionName(c.body[c.pc+1]) + } else if op == wasm.OpcodeMiscPrefix { + instName = wasm.MiscInstructionName(c.body[c.pc+1]) + } else { + instName = wasm.InstructionName(op) + } + fmt.Printf("handling %s, unreachable_state(on=%v,depth=%d), stack=%v\n", + instName, c.unreachableState.on, c.unreachableState.depth, c.stack, + ) + } + + var peekValueType unsignedType + if len(c.stack) > 0 { + peekValueType = c.stackPeek() + } + + // Modify the stack according the current instruction. + // Note that some instructions will read "index" in + // applyToStack and advance c.pc inside the function. + index, err := c.applyToStack(op) + if err != nil { + return fmt.Errorf("apply stack failed for %s: %w", wasm.InstructionName(op), err) + } + // Now we handle each instruction, and + // emit the corresponding interpreterir operations to the results. +operatorSwitch: + switch op { + case wasm.OpcodeUnreachable: + c.emit(newOperationUnreachable()) + c.markUnreachable() + case wasm.OpcodeNop: + // Nop is noop! + case wasm.OpcodeBlock: + c.br.Reset(c.body[c.pc+1:]) + bt, num, err := wasm.DecodeBlockType(c.types, c.br, c.enabledFeatures) + if err != nil { + return fmt.Errorf("reading block type for block instruction: %w", err) + } + c.pc += num + + if c.unreachableState.on { + // If it is currently in unreachable, + // just remove the entire block. + c.unreachableState.depth++ + break operatorSwitch + } + + // Create a new frame -- entering this block. + frame := controlFrame{ + frameID: c.nextFrameID(), + originalStackLenWithoutParam: len(c.stack) - len(bt.Params), + kind: controlFrameKindBlockWithoutContinuationLabel, + blockType: bt, + } + c.controlFrames.push(frame) + + case wasm.OpcodeLoop: + c.br.Reset(c.body[c.pc+1:]) + bt, num, err := wasm.DecodeBlockType(c.types, c.br, c.enabledFeatures) + if err != nil { + return fmt.Errorf("reading block type for loop instruction: %w", err) + } + c.pc += num + + if c.unreachableState.on { + // If it is currently in unreachable, + // just remove the entire block. + c.unreachableState.depth++ + break operatorSwitch + } + + // Create a new frame -- entering loop. + frame := controlFrame{ + frameID: c.nextFrameID(), + originalStackLenWithoutParam: len(c.stack) - len(bt.Params), + kind: controlFrameKindLoop, + blockType: bt, + } + c.controlFrames.push(frame) + + // Prep labels for inside and the continuation of this loop. + loopLabel := newLabel(labelKindHeader, frame.frameID) + c.result.LabelCallers[loopLabel]++ + + // Emit the branch operation to enter inside the loop. + c.emit(newOperationBr(loopLabel)) + c.emit(newOperationLabel(loopLabel)) + + // Insert the exit code check on the loop header, which is the only necessary point in the function body + // to prevent infinite loop. + // + // Note that this is a little aggressive: this checks the exit code regardless the loop header is actually + // the loop. 
In other words, this checks even when no br/br_if/br_table instructions jumping to this loop + // exist. However, in reality, that shouldn't be an issue since such "noop" loop header will highly likely be + // optimized out by almost all guest language compilers which have the control flow optimization passes. + if c.ensureTermination { + c.emit(newOperationBuiltinFunctionCheckExitCode()) + } + case wasm.OpcodeIf: + c.br.Reset(c.body[c.pc+1:]) + bt, num, err := wasm.DecodeBlockType(c.types, c.br, c.enabledFeatures) + if err != nil { + return fmt.Errorf("reading block type for if instruction: %w", err) + } + c.pc += num + + if c.unreachableState.on { + // If it is currently in unreachable, + // just remove the entire block. + c.unreachableState.depth++ + break operatorSwitch + } + + // Create a new frame -- entering if. + frame := controlFrame{ + frameID: c.nextFrameID(), + originalStackLenWithoutParam: len(c.stack) - len(bt.Params), + // Note this will be set to controlFrameKindIfWithElse + // when else opcode found later. + kind: controlFrameKindIfWithoutElse, + blockType: bt, + } + c.controlFrames.push(frame) + + // Prep labels for if and else of this if. + thenLabel := newLabel(labelKindHeader, frame.frameID) + elseLabel := newLabel(labelKindElse, frame.frameID) + c.result.LabelCallers[thenLabel]++ + c.result.LabelCallers[elseLabel]++ + + // Emit the branch operation to enter the then block. + c.emit(newOperationBrIf(thenLabel, elseLabel, nopinclusiveRange)) + c.emit(newOperationLabel(thenLabel)) + case wasm.OpcodeElse: + frame := c.controlFrames.top() + if c.unreachableState.on && c.unreachableState.depth > 0 { + // If it is currently in unreachable, and the nested if, + // just remove the entire else block. + break operatorSwitch + } else if c.unreachableState.on { + // If it is currently in unreachable, and the non-nested if, + // reset the stack so we can correctly handle the else block. + top := c.controlFrames.top() + c.stack = c.stack[:top.originalStackLenWithoutParam] + top.kind = controlFrameKindIfWithElse + + // Re-push the parameters to the if block so that else block can use them. + for _, t := range frame.blockType.Params { + c.stackPush(wasmValueTypeTounsignedType(t)) + } + + // We are no longer unreachable in else frame, + // so emit the correct label, and reset the unreachable state. + elseLabel := newLabel(labelKindElse, frame.frameID) + c.resetUnreachable() + c.emit( + newOperationLabel(elseLabel), + ) + break operatorSwitch + } + + // Change the Kind of this If block, indicating that + // the if has else block. + frame.kind = controlFrameKindIfWithElse + + // We need to reset the stack so that + // the values pushed inside the then block + // do not affect the else block. + dropOp := newOperationDrop(c.getFrameDropRange(frame, false)) + + // Reset the stack manipulated by the then block, and re-push the block param types to the stack. + + c.stack = c.stack[:frame.originalStackLenWithoutParam] + for _, t := range frame.blockType.Params { + c.stackPush(wasmValueTypeTounsignedType(t)) + } + + // Prep labels for else and the continuation of this if block. + elseLabel := newLabel(labelKindElse, frame.frameID) + continuationLabel := newLabel(labelKindContinuation, frame.frameID) + c.result.LabelCallers[continuationLabel]++ + + // Emit the instructions for exiting the if loop, + // and then the initiation of else block. + c.emit(dropOp) + // Jump to the continuation of this block. + c.emit(newOperationBr(continuationLabel)) + // Initiate the else block. 
+ c.emit(newOperationLabel(elseLabel)) + case wasm.OpcodeEnd: + if c.unreachableState.on && c.unreachableState.depth > 0 { + c.unreachableState.depth-- + break operatorSwitch + } else if c.unreachableState.on { + c.resetUnreachable() + + frame := c.controlFrames.pop() + if c.controlFrames.empty() { + return nil + } + + c.stack = c.stack[:frame.originalStackLenWithoutParam] + for _, t := range frame.blockType.Results { + c.stackPush(wasmValueTypeTounsignedType(t)) + } + + continuationLabel := newLabel(labelKindContinuation, frame.frameID) + if frame.kind == controlFrameKindIfWithoutElse { + // Emit the else label. + elseLabel := newLabel(labelKindElse, frame.frameID) + c.result.LabelCallers[continuationLabel]++ + c.emit(newOperationLabel(elseLabel)) + c.emit(newOperationBr(continuationLabel)) + c.emit(newOperationLabel(continuationLabel)) + } else { + c.emit( + newOperationLabel(continuationLabel), + ) + } + + break operatorSwitch + } + + frame := c.controlFrames.pop() + + // We need to reset the stack so that + // the values pushed inside the block. + dropOp := newOperationDrop(c.getFrameDropRange(frame, true)) + c.stack = c.stack[:frame.originalStackLenWithoutParam] + + // Push the result types onto the stack. + for _, t := range frame.blockType.Results { + c.stackPush(wasmValueTypeTounsignedType(t)) + } + + // Emit the instructions according to the Kind of the current control frame. + switch frame.kind { + case controlFrameKindFunction: + if !c.controlFrames.empty() { + // Should never happen. If so, there's a bug in the translation. + panic("bug: found more function control frames") + } + // Return from function. + c.emit(dropOp) + c.emit(newOperationBr(newLabel(labelKindReturn, 0))) + case controlFrameKindIfWithoutElse: + // This case we have to emit "empty" else label. + elseLabel := newLabel(labelKindElse, frame.frameID) + continuationLabel := newLabel(labelKindContinuation, frame.frameID) + c.result.LabelCallers[continuationLabel] += 2 + c.emit(dropOp) + c.emit(newOperationBr(continuationLabel)) + // Emit the else which soon branches into the continuation. + c.emit(newOperationLabel(elseLabel)) + c.emit(newOperationBr(continuationLabel)) + // Initiate the continuation. + c.emit(newOperationLabel(continuationLabel)) + case controlFrameKindBlockWithContinuationLabel, + controlFrameKindIfWithElse: + continuationLabel := newLabel(labelKindContinuation, frame.frameID) + c.result.LabelCallers[continuationLabel]++ + c.emit(dropOp) + c.emit(newOperationBr(continuationLabel)) + c.emit(newOperationLabel(continuationLabel)) + case controlFrameKindLoop, controlFrameKindBlockWithoutContinuationLabel: + c.emit( + dropOp, + ) + default: + // Should never happen. If so, there's a bug in the translation. + panic(fmt.Errorf("bug: invalid control frame Kind: 0x%x", frame.kind)) + } + + case wasm.OpcodeBr: + targetIndex, n, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("read the target for br_if: %w", err) + } + c.pc += n + + if c.unreachableState.on { + // If it is currently in unreachable, br is no-op. + break operatorSwitch + } + + targetFrame := c.controlFrames.get(int(targetIndex)) + targetFrame.ensureContinuation() + dropOp := newOperationDrop(c.getFrameDropRange(targetFrame, false)) + targetID := targetFrame.asLabel() + c.result.LabelCallers[targetID]++ + c.emit(dropOp) + c.emit(newOperationBr(targetID)) + // Br operation is stack-polymorphic, and mark the state as unreachable. 
+ // That means subsequent instructions in the current control frame are "unreachable" + // and can be safely removed. + c.markUnreachable() + case wasm.OpcodeBrIf: + targetIndex, n, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("read the target for br_if: %w", err) + } + c.pc += n + + if c.unreachableState.on { + // If it is currently in unreachable, br-if is no-op. + break operatorSwitch + } + + targetFrame := c.controlFrames.get(int(targetIndex)) + targetFrame.ensureContinuation() + drop := c.getFrameDropRange(targetFrame, false) + target := targetFrame.asLabel() + c.result.LabelCallers[target]++ + + continuationLabel := newLabel(labelKindHeader, c.nextFrameID()) + c.result.LabelCallers[continuationLabel]++ + c.emit(newOperationBrIf(target, continuationLabel, drop)) + // Start emitting else block operations. + c.emit(newOperationLabel(continuationLabel)) + case wasm.OpcodeBrTable: + c.br.Reset(c.body[c.pc+1:]) + r := c.br + numTargets, n, err := leb128.DecodeUint32(r) + if err != nil { + return fmt.Errorf("error reading number of targets in br_table: %w", err) + } + c.pc += n + + if c.unreachableState.on { + // If it is currently in unreachable, br_table is no-op. + // But before proceeding to the next instruction, we must advance the pc + // according to the number of br_table targets. + for i := uint32(0); i <= numTargets; i++ { // inclusive as we also need to read the index of default target. + _, n, err := leb128.DecodeUint32(r) + if err != nil { + return fmt.Errorf("error reading target %d in br_table: %w", i, err) + } + c.pc += n + } + break operatorSwitch + } + + // Read the branch targets. + s := numTargets * 2 + targetLabels := make([]uint64, 2+s) // (label, inclusiveRange) * (default+numTargets) + for i := uint32(0); i < s; i += 2 { + l, n, err := leb128.DecodeUint32(r) + if err != nil { + return fmt.Errorf("error reading target %d in br_table: %w", i, err) + } + c.pc += n + targetFrame := c.controlFrames.get(int(l)) + targetFrame.ensureContinuation() + drop := c.getFrameDropRange(targetFrame, false) + targetLabel := targetFrame.asLabel() + targetLabels[i] = uint64(targetLabel) + targetLabels[i+1] = drop.AsU64() + c.result.LabelCallers[targetLabel]++ + } + + // Prep default target control frame. + l, n, err := leb128.DecodeUint32(r) + if err != nil { + return fmt.Errorf("error reading default target of br_table: %w", err) + } + c.pc += n + defaultTargetFrame := c.controlFrames.get(int(l)) + defaultTargetFrame.ensureContinuation() + defaultTargetDrop := c.getFrameDropRange(defaultTargetFrame, false) + defaultLabel := defaultTargetFrame.asLabel() + c.result.LabelCallers[defaultLabel]++ + targetLabels[s] = uint64(defaultLabel) + targetLabels[s+1] = defaultTargetDrop.AsU64() + c.emit(newOperationBrTable(targetLabels)) + + // br_table operation is stack-polymorphic, and mark the state as unreachable. + // That means subsequent instructions in the current control frame are "unreachable" + // and can be safely removed. + c.markUnreachable() + case wasm.OpcodeReturn: + functionFrame := c.controlFrames.functionFrame() + dropOp := newOperationDrop(c.getFrameDropRange(functionFrame, false)) + + // Cleanup the stack and then jmp to function frame's continuation (meaning return). + c.emit(dropOp) + c.emit(newOperationBr(functionFrame.asLabel())) + + // Return operation is stack-polymorphic, and mark the state as unreachable. + // That means subsequent instructions in the current control frame are "unreachable" + // and can be safely removed. 
+ c.markUnreachable() + case wasm.OpcodeCall: + c.emit( + newOperationCall(index), + ) + case wasm.OpcodeCallIndirect: + typeIndex := index + tableIndex, n, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("read target for br_table: %w", err) + } + c.pc += n + c.emit( + newOperationCallIndirect(typeIndex, tableIndex), + ) + case wasm.OpcodeDrop: + r := inclusiveRange{Start: 0, End: 0} + if peekValueType == unsignedTypeV128 { + // inclusiveRange is the range in uint64 representation, so dropping a vector value on top + // should be translated as drop [0..1] inclusively. + r.End++ + } + c.emit(newOperationDrop(r)) + case wasm.OpcodeSelect: + // If it is on the unreachable state, ignore the instruction. + if c.unreachableState.on { + break operatorSwitch + } + isTargetVector := c.stackPeek() == unsignedTypeV128 + c.emit( + newOperationSelect(isTargetVector), + ) + case wasm.OpcodeTypedSelect: + // Skips two bytes: vector size fixed to 1, and the value type for select. + c.pc += 2 + // If it is on the unreachable state, ignore the instruction. + if c.unreachableState.on { + break operatorSwitch + } + // Typed select is semantically equivalent to select at runtime. + isTargetVector := c.stackPeek() == unsignedTypeV128 + c.emit( + newOperationSelect(isTargetVector), + ) + case wasm.OpcodeLocalGet: + depth := c.localDepth(index) + if isVector := c.localType(index) == wasm.ValueTypeV128; !isVector { + c.emit( + // -1 because we already manipulated the stack before + // called localDepth ^^. + newOperationPick(depth-1, isVector), + ) + } else { + c.emit( + // -2 because we already manipulated the stack before + // called localDepth ^^. + newOperationPick(depth-2, isVector), + ) + } + case wasm.OpcodeLocalSet: + depth := c.localDepth(index) + + isVector := c.localType(index) == wasm.ValueTypeV128 + if isVector { + c.emit( + // +2 because we already popped the operands for this operation from the c.stack before + // called localDepth ^^, + newOperationSet(depth+2, isVector), + ) + } else { + c.emit( + // +1 because we already popped the operands for this operation from the c.stack before + // called localDepth ^^, + newOperationSet(depth+1, isVector), + ) + } + case wasm.OpcodeLocalTee: + depth := c.localDepth(index) + isVector := c.localType(index) == wasm.ValueTypeV128 + if isVector { + c.emit(newOperationPick(1, isVector)) + c.emit(newOperationSet(depth+2, isVector)) + } else { + c.emit( + newOperationPick(0, isVector)) + c.emit(newOperationSet(depth+1, isVector)) + } + case wasm.OpcodeGlobalGet: + c.emit( + newOperationGlobalGet(index), + ) + case wasm.OpcodeGlobalSet: + c.emit( + newOperationGlobalSet(index), + ) + case wasm.OpcodeI32Load: + imm, err := c.readMemoryArg(wasm.OpcodeI32LoadName) + if err != nil { + return err + } + c.emit(newOperationLoad(unsignedTypeI32, imm)) + case wasm.OpcodeI64Load: + imm, err := c.readMemoryArg(wasm.OpcodeI64LoadName) + if err != nil { + return err + } + c.emit(newOperationLoad(unsignedTypeI64, imm)) + case wasm.OpcodeF32Load: + imm, err := c.readMemoryArg(wasm.OpcodeF32LoadName) + if err != nil { + return err + } + c.emit(newOperationLoad(unsignedTypeF32, imm)) + case wasm.OpcodeF64Load: + imm, err := c.readMemoryArg(wasm.OpcodeF64LoadName) + if err != nil { + return err + } + c.emit(newOperationLoad(unsignedTypeF64, imm)) + case wasm.OpcodeI32Load8S: + imm, err := c.readMemoryArg(wasm.OpcodeI32Load8SName) + if err != nil { + return err + } + c.emit(newOperationLoad8(signedInt32, imm)) + case wasm.OpcodeI32Load8U: + imm, err := 
c.readMemoryArg(wasm.OpcodeI32Load8UName) + if err != nil { + return err + } + c.emit(newOperationLoad8(signedUint32, imm)) + case wasm.OpcodeI32Load16S: + imm, err := c.readMemoryArg(wasm.OpcodeI32Load16SName) + if err != nil { + return err + } + c.emit(newOperationLoad16(signedInt32, imm)) + case wasm.OpcodeI32Load16U: + imm, err := c.readMemoryArg(wasm.OpcodeI32Load16UName) + if err != nil { + return err + } + c.emit(newOperationLoad16(signedUint32, imm)) + case wasm.OpcodeI64Load8S: + imm, err := c.readMemoryArg(wasm.OpcodeI64Load8SName) + if err != nil { + return err + } + c.emit(newOperationLoad8(signedInt64, imm)) + case wasm.OpcodeI64Load8U: + imm, err := c.readMemoryArg(wasm.OpcodeI64Load8UName) + if err != nil { + return err + } + c.emit(newOperationLoad8(signedUint64, imm)) + case wasm.OpcodeI64Load16S: + imm, err := c.readMemoryArg(wasm.OpcodeI64Load16SName) + if err != nil { + return err + } + c.emit(newOperationLoad16(signedInt64, imm)) + case wasm.OpcodeI64Load16U: + imm, err := c.readMemoryArg(wasm.OpcodeI64Load16UName) + if err != nil { + return err + } + c.emit(newOperationLoad16(signedUint64, imm)) + case wasm.OpcodeI64Load32S: + imm, err := c.readMemoryArg(wasm.OpcodeI64Load32SName) + if err != nil { + return err + } + c.emit(newOperationLoad32(true, imm)) + case wasm.OpcodeI64Load32U: + imm, err := c.readMemoryArg(wasm.OpcodeI64Load32UName) + if err != nil { + return err + } + c.emit(newOperationLoad32(false, imm)) + case wasm.OpcodeI32Store: + imm, err := c.readMemoryArg(wasm.OpcodeI32StoreName) + if err != nil { + return err + } + c.emit( + newOperationStore(unsignedTypeI32, imm), + ) + case wasm.OpcodeI64Store: + imm, err := c.readMemoryArg(wasm.OpcodeI64StoreName) + if err != nil { + return err + } + c.emit( + newOperationStore(unsignedTypeI64, imm), + ) + case wasm.OpcodeF32Store: + imm, err := c.readMemoryArg(wasm.OpcodeF32StoreName) + if err != nil { + return err + } + c.emit( + newOperationStore(unsignedTypeF32, imm), + ) + case wasm.OpcodeF64Store: + imm, err := c.readMemoryArg(wasm.OpcodeF64StoreName) + if err != nil { + return err + } + c.emit( + newOperationStore(unsignedTypeF64, imm), + ) + case wasm.OpcodeI32Store8: + imm, err := c.readMemoryArg(wasm.OpcodeI32Store8Name) + if err != nil { + return err + } + c.emit( + newOperationStore8(imm), + ) + case wasm.OpcodeI32Store16: + imm, err := c.readMemoryArg(wasm.OpcodeI32Store16Name) + if err != nil { + return err + } + c.emit( + newOperationStore16(imm), + ) + case wasm.OpcodeI64Store8: + imm, err := c.readMemoryArg(wasm.OpcodeI64Store8Name) + if err != nil { + return err + } + c.emit( + newOperationStore8(imm), + ) + case wasm.OpcodeI64Store16: + imm, err := c.readMemoryArg(wasm.OpcodeI64Store16Name) + if err != nil { + return err + } + c.emit( + newOperationStore16(imm), + ) + case wasm.OpcodeI64Store32: + imm, err := c.readMemoryArg(wasm.OpcodeI64Store32Name) + if err != nil { + return err + } + c.emit( + newOperationStore32(imm), + ) + case wasm.OpcodeMemorySize: + c.result.UsesMemory = true + c.pc++ // Skip the reserved one byte. + c.emit( + newOperationMemorySize(), + ) + case wasm.OpcodeMemoryGrow: + c.result.UsesMemory = true + c.pc++ // Skip the reserved one byte. 
+ c.emit( + newOperationMemoryGrow(), + ) + case wasm.OpcodeI32Const: + val, num, err := leb128.LoadInt32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("reading i32.const value: %v", err) + } + c.pc += num + c.emit( + newOperationConstI32(uint32(val)), + ) + case wasm.OpcodeI64Const: + val, num, err := leb128.LoadInt64(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("reading i64.const value: %v", err) + } + c.pc += num + c.emit( + newOperationConstI64(uint64(val)), + ) + case wasm.OpcodeF32Const: + v := math.Float32frombits(binary.LittleEndian.Uint32(c.body[c.pc+1:])) + c.pc += 4 + c.emit( + newOperationConstF32(v), + ) + case wasm.OpcodeF64Const: + v := math.Float64frombits(binary.LittleEndian.Uint64(c.body[c.pc+1:])) + c.pc += 8 + c.emit( + newOperationConstF64(v), + ) + case wasm.OpcodeI32Eqz: + c.emit( + newOperationEqz(unsignedInt32), + ) + case wasm.OpcodeI32Eq: + c.emit( + newOperationEq(unsignedTypeI32), + ) + case wasm.OpcodeI32Ne: + c.emit( + newOperationNe(unsignedTypeI32), + ) + case wasm.OpcodeI32LtS: + c.emit( + newOperationLt(signedTypeInt32), + ) + case wasm.OpcodeI32LtU: + c.emit( + newOperationLt(signedTypeUint32), + ) + case wasm.OpcodeI32GtS: + c.emit( + newOperationGt(signedTypeInt32), + ) + case wasm.OpcodeI32GtU: + c.emit( + newOperationGt(signedTypeUint32), + ) + case wasm.OpcodeI32LeS: + c.emit( + newOperationLe(signedTypeInt32), + ) + case wasm.OpcodeI32LeU: + c.emit( + newOperationLe(signedTypeUint32), + ) + case wasm.OpcodeI32GeS: + c.emit( + newOperationGe(signedTypeInt32), + ) + case wasm.OpcodeI32GeU: + c.emit( + newOperationGe(signedTypeUint32), + ) + case wasm.OpcodeI64Eqz: + c.emit( + newOperationEqz(unsignedInt64), + ) + case wasm.OpcodeI64Eq: + c.emit( + newOperationEq(unsignedTypeI64), + ) + case wasm.OpcodeI64Ne: + c.emit( + newOperationNe(unsignedTypeI64), + ) + case wasm.OpcodeI64LtS: + c.emit( + newOperationLt(signedTypeInt64), + ) + case wasm.OpcodeI64LtU: + c.emit( + newOperationLt(signedTypeUint64), + ) + case wasm.OpcodeI64GtS: + c.emit( + newOperationGt(signedTypeInt64), + ) + case wasm.OpcodeI64GtU: + c.emit( + newOperationGt(signedTypeUint64), + ) + case wasm.OpcodeI64LeS: + c.emit( + newOperationLe(signedTypeInt64), + ) + case wasm.OpcodeI64LeU: + c.emit( + newOperationLe(signedTypeUint64), + ) + case wasm.OpcodeI64GeS: + c.emit( + newOperationGe(signedTypeInt64), + ) + case wasm.OpcodeI64GeU: + c.emit( + newOperationGe(signedTypeUint64), + ) + case wasm.OpcodeF32Eq: + c.emit( + newOperationEq(unsignedTypeF32), + ) + case wasm.OpcodeF32Ne: + c.emit( + newOperationNe(unsignedTypeF32), + ) + case wasm.OpcodeF32Lt: + c.emit( + newOperationLt(signedTypeFloat32), + ) + case wasm.OpcodeF32Gt: + c.emit( + newOperationGt(signedTypeFloat32), + ) + case wasm.OpcodeF32Le: + c.emit( + newOperationLe(signedTypeFloat32), + ) + case wasm.OpcodeF32Ge: + c.emit( + newOperationGe(signedTypeFloat32), + ) + case wasm.OpcodeF64Eq: + c.emit( + newOperationEq(unsignedTypeF64), + ) + case wasm.OpcodeF64Ne: + c.emit( + newOperationNe(unsignedTypeF64), + ) + case wasm.OpcodeF64Lt: + c.emit( + newOperationLt(signedTypeFloat64), + ) + case wasm.OpcodeF64Gt: + c.emit( + newOperationGt(signedTypeFloat64), + ) + case wasm.OpcodeF64Le: + c.emit( + newOperationLe(signedTypeFloat64), + ) + case wasm.OpcodeF64Ge: + c.emit( + newOperationGe(signedTypeFloat64), + ) + case wasm.OpcodeI32Clz: + c.emit( + newOperationClz(unsignedInt32), + ) + case wasm.OpcodeI32Ctz: + c.emit( + newOperationCtz(unsignedInt32), + ) + case wasm.OpcodeI32Popcnt: + c.emit( + 
newOperationPopcnt(unsignedInt32), + ) + case wasm.OpcodeI32Add: + c.emit( + newOperationAdd(unsignedTypeI32), + ) + case wasm.OpcodeI32Sub: + c.emit( + newOperationSub(unsignedTypeI32), + ) + case wasm.OpcodeI32Mul: + c.emit( + newOperationMul(unsignedTypeI32), + ) + case wasm.OpcodeI32DivS: + c.emit( + newOperationDiv(signedTypeInt32), + ) + case wasm.OpcodeI32DivU: + c.emit( + newOperationDiv(signedTypeUint32), + ) + case wasm.OpcodeI32RemS: + c.emit( + newOperationRem(signedInt32), + ) + case wasm.OpcodeI32RemU: + c.emit( + newOperationRem(signedUint32), + ) + case wasm.OpcodeI32And: + c.emit( + newOperationAnd(unsignedInt32), + ) + case wasm.OpcodeI32Or: + c.emit( + newOperationOr(unsignedInt32), + ) + case wasm.OpcodeI32Xor: + c.emit( + newOperationXor(unsignedInt64), + ) + case wasm.OpcodeI32Shl: + c.emit( + newOperationShl(unsignedInt32), + ) + case wasm.OpcodeI32ShrS: + c.emit( + newOperationShr(signedInt32), + ) + case wasm.OpcodeI32ShrU: + c.emit( + newOperationShr(signedUint32), + ) + case wasm.OpcodeI32Rotl: + c.emit( + newOperationRotl(unsignedInt32), + ) + case wasm.OpcodeI32Rotr: + c.emit( + newOperationRotr(unsignedInt32), + ) + case wasm.OpcodeI64Clz: + c.emit( + newOperationClz(unsignedInt64), + ) + case wasm.OpcodeI64Ctz: + c.emit( + newOperationCtz(unsignedInt64), + ) + case wasm.OpcodeI64Popcnt: + c.emit( + newOperationPopcnt(unsignedInt64), + ) + case wasm.OpcodeI64Add: + c.emit( + newOperationAdd(unsignedTypeI64), + ) + case wasm.OpcodeI64Sub: + c.emit( + newOperationSub(unsignedTypeI64), + ) + case wasm.OpcodeI64Mul: + c.emit( + newOperationMul(unsignedTypeI64), + ) + case wasm.OpcodeI64DivS: + c.emit( + newOperationDiv(signedTypeInt64), + ) + case wasm.OpcodeI64DivU: + c.emit( + newOperationDiv(signedTypeUint64), + ) + case wasm.OpcodeI64RemS: + c.emit( + newOperationRem(signedInt64), + ) + case wasm.OpcodeI64RemU: + c.emit( + newOperationRem(signedUint64), + ) + case wasm.OpcodeI64And: + c.emit( + newOperationAnd(unsignedInt64), + ) + case wasm.OpcodeI64Or: + c.emit( + newOperationOr(unsignedInt64), + ) + case wasm.OpcodeI64Xor: + c.emit( + newOperationXor(unsignedInt64), + ) + case wasm.OpcodeI64Shl: + c.emit( + newOperationShl(unsignedInt64), + ) + case wasm.OpcodeI64ShrS: + c.emit( + newOperationShr(signedInt64), + ) + case wasm.OpcodeI64ShrU: + c.emit( + newOperationShr(signedUint64), + ) + case wasm.OpcodeI64Rotl: + c.emit( + newOperationRotl(unsignedInt64), + ) + case wasm.OpcodeI64Rotr: + c.emit( + newOperationRotr(unsignedInt64), + ) + case wasm.OpcodeF32Abs: + c.emit( + newOperationAbs(f32), + ) + case wasm.OpcodeF32Neg: + c.emit( + newOperationNeg(f32), + ) + case wasm.OpcodeF32Ceil: + c.emit( + newOperationCeil(f32), + ) + case wasm.OpcodeF32Floor: + c.emit( + newOperationFloor(f32), + ) + case wasm.OpcodeF32Trunc: + c.emit( + newOperationTrunc(f32), + ) + case wasm.OpcodeF32Nearest: + c.emit( + newOperationNearest(f32), + ) + case wasm.OpcodeF32Sqrt: + c.emit( + newOperationSqrt(f32), + ) + case wasm.OpcodeF32Add: + c.emit( + newOperationAdd(unsignedTypeF32), + ) + case wasm.OpcodeF32Sub: + c.emit( + newOperationSub(unsignedTypeF32), + ) + case wasm.OpcodeF32Mul: + c.emit( + newOperationMul(unsignedTypeF32), + ) + case wasm.OpcodeF32Div: + c.emit( + newOperationDiv(signedTypeFloat32), + ) + case wasm.OpcodeF32Min: + c.emit( + newOperationMin(f32), + ) + case wasm.OpcodeF32Max: + c.emit( + newOperationMax(f32), + ) + case wasm.OpcodeF32Copysign: + c.emit( + newOperationCopysign(f32), + ) + case wasm.OpcodeF64Abs: + c.emit( + newOperationAbs(f64), + ) + 
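Editor's note: the long run of one-line cases above and below only tags each numeric opcode with its operand type (for example newOperationAbs(f32) or newOperationDiv(signedTypeUint32)); the arithmetic itself happens later in the interpreter, which is not part of this file. As a hypothetical sketch only — the real executor lives elsewhere in this package — this is roughly what consuming such an operation could look like, assuming (not shown in this diff) that f32 values sit on an operand stack as their IEEE-754 bit patterns widened to uint64:

package main

import (
	"fmt"
	"math"
)

// valueStack is a hypothetical operand stack holding raw 64-bit patterns;
// the layout is an assumption made for this illustration.
type valueStack []uint64

func (s *valueStack) push(v uint64) { *s = append(*s, v) }
func (s *valueStack) pop() uint64 {
	v := (*s)[len(*s)-1]
	*s = (*s)[:len(*s)-1]
	return v
}

// execAbsF32 sketches what executing an operation produced by
// newOperationAbs(f32) might amount to: pop raw bits, take |x|, push bits back.
func execAbsF32(s *valueStack) {
	x := math.Float32frombits(uint32(s.pop()))
	s.push(uint64(math.Float32bits(float32(math.Abs(float64(x))))))
}

func main() {
	var s valueStack
	s.push(uint64(math.Float32bits(-1.5))) // operand left by a prior f32.const
	execAbsF32(&s)
	fmt.Println(math.Float32frombits(uint32(s.pop()))) // 1.5
}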
case wasm.OpcodeF64Neg: + c.emit( + newOperationNeg(f64), + ) + case wasm.OpcodeF64Ceil: + c.emit( + newOperationCeil(f64), + ) + case wasm.OpcodeF64Floor: + c.emit( + newOperationFloor(f64), + ) + case wasm.OpcodeF64Trunc: + c.emit( + newOperationTrunc(f64), + ) + case wasm.OpcodeF64Nearest: + c.emit( + newOperationNearest(f64), + ) + case wasm.OpcodeF64Sqrt: + c.emit( + newOperationSqrt(f64), + ) + case wasm.OpcodeF64Add: + c.emit( + newOperationAdd(unsignedTypeF64), + ) + case wasm.OpcodeF64Sub: + c.emit( + newOperationSub(unsignedTypeF64), + ) + case wasm.OpcodeF64Mul: + c.emit( + newOperationMul(unsignedTypeF64), + ) + case wasm.OpcodeF64Div: + c.emit( + newOperationDiv(signedTypeFloat64), + ) + case wasm.OpcodeF64Min: + c.emit( + newOperationMin(f64), + ) + case wasm.OpcodeF64Max: + c.emit( + newOperationMax(f64), + ) + case wasm.OpcodeF64Copysign: + c.emit( + newOperationCopysign(f64), + ) + case wasm.OpcodeI32WrapI64: + c.emit( + newOperationI32WrapFromI64(), + ) + case wasm.OpcodeI32TruncF32S: + c.emit( + newOperationITruncFromF(f32, signedInt32, false), + ) + case wasm.OpcodeI32TruncF32U: + c.emit( + newOperationITruncFromF(f32, signedUint32, false), + ) + case wasm.OpcodeI32TruncF64S: + c.emit( + newOperationITruncFromF(f64, signedInt32, false), + ) + case wasm.OpcodeI32TruncF64U: + c.emit( + newOperationITruncFromF(f64, signedUint32, false), + ) + case wasm.OpcodeI64ExtendI32S: + c.emit( + newOperationExtend(true), + ) + case wasm.OpcodeI64ExtendI32U: + c.emit( + newOperationExtend(false), + ) + case wasm.OpcodeI64TruncF32S: + c.emit( + newOperationITruncFromF(f32, signedInt64, false), + ) + case wasm.OpcodeI64TruncF32U: + c.emit( + newOperationITruncFromF(f32, signedUint64, false), + ) + case wasm.OpcodeI64TruncF64S: + c.emit( + newOperationITruncFromF(f64, signedInt64, false), + ) + case wasm.OpcodeI64TruncF64U: + c.emit( + newOperationITruncFromF(f64, signedUint64, false), + ) + case wasm.OpcodeF32ConvertI32S: + c.emit( + newOperationFConvertFromI(signedInt32, f32), + ) + case wasm.OpcodeF32ConvertI32U: + c.emit( + newOperationFConvertFromI(signedUint32, f32), + ) + case wasm.OpcodeF32ConvertI64S: + c.emit( + newOperationFConvertFromI(signedInt64, f32), + ) + case wasm.OpcodeF32ConvertI64U: + c.emit( + newOperationFConvertFromI(signedUint64, f32), + ) + case wasm.OpcodeF32DemoteF64: + c.emit( + newOperationF32DemoteFromF64(), + ) + case wasm.OpcodeF64ConvertI32S: + c.emit( + newOperationFConvertFromI(signedInt32, f64), + ) + case wasm.OpcodeF64ConvertI32U: + c.emit( + newOperationFConvertFromI(signedUint32, f64), + ) + case wasm.OpcodeF64ConvertI64S: + c.emit( + newOperationFConvertFromI(signedInt64, f64), + ) + case wasm.OpcodeF64ConvertI64U: + c.emit( + newOperationFConvertFromI(signedUint64, f64), + ) + case wasm.OpcodeF64PromoteF32: + c.emit( + newOperationF64PromoteFromF32(), + ) + case wasm.OpcodeI32ReinterpretF32: + c.emit( + newOperationI32ReinterpretFromF32(), + ) + case wasm.OpcodeI64ReinterpretF64: + c.emit( + newOperationI64ReinterpretFromF64(), + ) + case wasm.OpcodeF32ReinterpretI32: + c.emit( + newOperationF32ReinterpretFromI32(), + ) + case wasm.OpcodeF64ReinterpretI64: + c.emit( + newOperationF64ReinterpretFromI64(), + ) + case wasm.OpcodeI32Extend8S: + c.emit( + newOperationSignExtend32From8(), + ) + case wasm.OpcodeI32Extend16S: + c.emit( + newOperationSignExtend32From16(), + ) + case wasm.OpcodeI64Extend8S: + c.emit( + newOperationSignExtend64From8(), + ) + case wasm.OpcodeI64Extend16S: + c.emit( + newOperationSignExtend64From16(), + ) + case 
wasm.OpcodeI64Extend32S: + c.emit( + newOperationSignExtend64From32(), + ) + case wasm.OpcodeRefFunc: + c.pc++ + index, num, err := leb128.LoadUint32(c.body[c.pc:]) + if err != nil { + return fmt.Errorf("failed to read function index for ref.func: %v", err) + } + c.pc += num - 1 + c.emit( + newOperationRefFunc(index), + ) + case wasm.OpcodeRefNull: + c.pc++ // Skip the type of reftype as every ref value is opaque pointer. + c.emit( + newOperationConstI64(0), + ) + case wasm.OpcodeRefIsNull: + // Simply compare the opaque pointer (i64) with zero. + c.emit( + newOperationEqz(unsignedInt64), + ) + case wasm.OpcodeTableGet: + c.pc++ + tableIndex, num, err := leb128.LoadUint32(c.body[c.pc:]) + if err != nil { + return fmt.Errorf("failed to read function index for table.get: %v", err) + } + c.pc += num - 1 + c.emit( + newOperationTableGet(tableIndex), + ) + case wasm.OpcodeTableSet: + c.pc++ + tableIndex, num, err := leb128.LoadUint32(c.body[c.pc:]) + if err != nil { + return fmt.Errorf("failed to read function index for table.set: %v", err) + } + c.pc += num - 1 + c.emit( + newOperationTableSet(tableIndex), + ) + case wasm.OpcodeMiscPrefix: + c.pc++ + // A misc opcode is encoded as an unsigned variable 32-bit integer. + miscOp, num, err := leb128.LoadUint32(c.body[c.pc:]) + if err != nil { + return fmt.Errorf("failed to read misc opcode: %v", err) + } + c.pc += num - 1 + switch byte(miscOp) { + case wasm.OpcodeMiscI32TruncSatF32S: + c.emit( + newOperationITruncFromF(f32, signedInt32, true), + ) + case wasm.OpcodeMiscI32TruncSatF32U: + c.emit( + newOperationITruncFromF(f32, signedUint32, true), + ) + case wasm.OpcodeMiscI32TruncSatF64S: + c.emit( + newOperationITruncFromF(f64, signedInt32, true), + ) + case wasm.OpcodeMiscI32TruncSatF64U: + c.emit( + newOperationITruncFromF(f64, signedUint32, true), + ) + case wasm.OpcodeMiscI64TruncSatF32S: + c.emit( + newOperationITruncFromF(f32, signedInt64, true), + ) + case wasm.OpcodeMiscI64TruncSatF32U: + c.emit( + newOperationITruncFromF(f32, signedUint64, true), + ) + case wasm.OpcodeMiscI64TruncSatF64S: + c.emit( + newOperationITruncFromF(f64, signedInt64, true), + ) + case wasm.OpcodeMiscI64TruncSatF64U: + c.emit( + newOperationITruncFromF(f64, signedUint64, true), + ) + case wasm.OpcodeMiscMemoryInit: + c.result.UsesMemory = true + dataIndex, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("reading i32.const value: %v", err) + } + c.pc += num + 1 // +1 to skip the memory index which is fixed to zero. + c.emit( + newOperationMemoryInit(dataIndex), + ) + case wasm.OpcodeMiscDataDrop: + dataIndex, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("reading i32.const value: %v", err) + } + c.pc += num + c.emit( + newOperationDataDrop(dataIndex), + ) + case wasm.OpcodeMiscMemoryCopy: + c.result.UsesMemory = true + c.pc += 2 // +2 to skip two memory indexes which are fixed to zero. + c.emit( + newOperationMemoryCopy(), + ) + case wasm.OpcodeMiscMemoryFill: + c.result.UsesMemory = true + c.pc += 1 // +1 to skip the memory index which is fixed to zero. + c.emit( + newOperationMemoryFill(), + ) + case wasm.OpcodeMiscTableInit: + elemIndex, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("reading i32.const value: %v", err) + } + c.pc += num + // Read table index which is fixed to zero currently. 
+ tableIndex, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("reading i32.const value: %v", err) + } + c.pc += num + c.emit( + newOperationTableInit(elemIndex, tableIndex), + ) + case wasm.OpcodeMiscElemDrop: + elemIndex, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("reading i32.const value: %v", err) + } + c.pc += num + c.emit( + newOperationElemDrop(elemIndex), + ) + case wasm.OpcodeMiscTableCopy: + // Read the source table inde.g. + dst, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("reading i32.const value: %v", err) + } + c.pc += num + // Read the destination table inde.g. + src, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("reading i32.const value: %v", err) + } + c.pc += num + c.emit( + newOperationTableCopy(src, dst), + ) + case wasm.OpcodeMiscTableGrow: + // Read the source table inde.g. + tableIndex, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("reading i32.const value: %v", err) + } + c.pc += num + c.emit( + newOperationTableGrow(tableIndex), + ) + case wasm.OpcodeMiscTableSize: + // Read the source table inde.g. + tableIndex, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("reading i32.const value: %v", err) + } + c.pc += num + c.emit( + newOperationTableSize(tableIndex), + ) + case wasm.OpcodeMiscTableFill: + // Read the source table index. + tableIndex, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("reading i32.const value: %v", err) + } + c.pc += num + c.emit( + newOperationTableFill(tableIndex), + ) + default: + return fmt.Errorf("unsupported misc instruction in interpreterir: 0x%x", op) + } + case wasm.OpcodeVecPrefix: + c.pc++ + switch vecOp := c.body[c.pc]; vecOp { + case wasm.OpcodeVecV128Const: + c.pc++ + lo := binary.LittleEndian.Uint64(c.body[c.pc : c.pc+8]) + c.pc += 8 + hi := binary.LittleEndian.Uint64(c.body[c.pc : c.pc+8]) + c.emit( + newOperationV128Const(lo, hi), + ) + c.pc += 7 + case wasm.OpcodeVecV128Load: + arg, err := c.readMemoryArg(wasm.OpcodeI32LoadName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType128, arg), + ) + case wasm.OpcodeVecV128Load8x8s: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load8x8SName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType8x8s, arg), + ) + case wasm.OpcodeVecV128Load8x8u: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load8x8UName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType8x8u, arg), + ) + case wasm.OpcodeVecV128Load16x4s: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load16x4SName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType16x4s, arg), + ) + case wasm.OpcodeVecV128Load16x4u: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load16x4UName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType16x4u, arg), + ) + case wasm.OpcodeVecV128Load32x2s: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load32x2SName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType32x2s, arg), + ) + case wasm.OpcodeVecV128Load32x2u: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load32x2UName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType32x2u, arg), + ) + case wasm.OpcodeVecV128Load8Splat: + arg, err := 
c.readMemoryArg(wasm.OpcodeVecV128Load8SplatName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType8Splat, arg), + ) + case wasm.OpcodeVecV128Load16Splat: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load16SplatName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType16Splat, arg), + ) + case wasm.OpcodeVecV128Load32Splat: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load32SplatName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType32Splat, arg), + ) + case wasm.OpcodeVecV128Load64Splat: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load64SplatName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType64Splat, arg), + ) + case wasm.OpcodeVecV128Load32zero: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load32zeroName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType32zero, arg), + ) + case wasm.OpcodeVecV128Load64zero: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load64zeroName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType64zero, arg), + ) + case wasm.OpcodeVecV128Load8Lane: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load8LaneName) + if err != nil { + return err + } + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128LoadLane(laneIndex, 8, arg), + ) + case wasm.OpcodeVecV128Load16Lane: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load16LaneName) + if err != nil { + return err + } + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128LoadLane(laneIndex, 16, arg), + ) + case wasm.OpcodeVecV128Load32Lane: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load32LaneName) + if err != nil { + return err + } + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128LoadLane(laneIndex, 32, arg), + ) + case wasm.OpcodeVecV128Load64Lane: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load64LaneName) + if err != nil { + return err + } + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128LoadLane(laneIndex, 64, arg), + ) + case wasm.OpcodeVecV128Store: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128StoreName) + if err != nil { + return err + } + c.emit( + newOperationV128Store(arg), + ) + case wasm.OpcodeVecV128Store8Lane: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Store8LaneName) + if err != nil { + return err + } + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128StoreLane(laneIndex, 8, arg), + ) + case wasm.OpcodeVecV128Store16Lane: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Store16LaneName) + if err != nil { + return err + } + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128StoreLane(laneIndex, 16, arg), + ) + case wasm.OpcodeVecV128Store32Lane: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Store32LaneName) + if err != nil { + return err + } + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128StoreLane(laneIndex, 32, arg), + ) + case wasm.OpcodeVecV128Store64Lane: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Store64LaneName) + if err != nil { + return err + } + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128StoreLane(laneIndex, 64, arg), + ) + case wasm.OpcodeVecI8x16ExtractLaneS: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ExtractLane(laneIndex, true, shapeI8x16), + ) + case wasm.OpcodeVecI8x16ExtractLaneU: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ExtractLane(laneIndex, false, shapeI8x16), + ) + case wasm.OpcodeVecI16x8ExtractLaneS: + 
c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ExtractLane(laneIndex, true, shapeI16x8), + ) + case wasm.OpcodeVecI16x8ExtractLaneU: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ExtractLane(laneIndex, false, shapeI16x8), + ) + case wasm.OpcodeVecI32x4ExtractLane: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ExtractLane(laneIndex, false, shapeI32x4), + ) + case wasm.OpcodeVecI64x2ExtractLane: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ExtractLane(laneIndex, false, shapeI64x2), + ) + case wasm.OpcodeVecF32x4ExtractLane: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ExtractLane(laneIndex, false, shapeF32x4), + ) + case wasm.OpcodeVecF64x2ExtractLane: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ExtractLane(laneIndex, false, shapeF64x2), + ) + case wasm.OpcodeVecI8x16ReplaceLane: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ReplaceLane(laneIndex, shapeI8x16), + ) + case wasm.OpcodeVecI16x8ReplaceLane: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ReplaceLane(laneIndex, shapeI16x8), + ) + case wasm.OpcodeVecI32x4ReplaceLane: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ReplaceLane(laneIndex, shapeI32x4), + ) + case wasm.OpcodeVecI64x2ReplaceLane: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ReplaceLane(laneIndex, shapeI64x2), + ) + case wasm.OpcodeVecF32x4ReplaceLane: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ReplaceLane(laneIndex, shapeF32x4), + ) + case wasm.OpcodeVecF64x2ReplaceLane: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ReplaceLane(laneIndex, shapeF64x2), + ) + case wasm.OpcodeVecI8x16Splat: + c.emit( + newOperationV128Splat(shapeI8x16), + ) + case wasm.OpcodeVecI16x8Splat: + c.emit( + newOperationV128Splat(shapeI16x8), + ) + case wasm.OpcodeVecI32x4Splat: + c.emit( + newOperationV128Splat(shapeI32x4), + ) + case wasm.OpcodeVecI64x2Splat: + c.emit( + newOperationV128Splat(shapeI64x2), + ) + case wasm.OpcodeVecF32x4Splat: + c.emit( + newOperationV128Splat(shapeF32x4), + ) + case wasm.OpcodeVecF64x2Splat: + c.emit( + newOperationV128Splat(shapeF64x2), + ) + case wasm.OpcodeVecI8x16Swizzle: + c.emit( + newOperationV128Swizzle(), + ) + case wasm.OpcodeVecV128i8x16Shuffle: + c.pc++ + lanes := make([]uint64, 16) + for i := uint64(0); i < 16; i++ { + lanes[i] = uint64(c.body[c.pc+i]) + } + op := newOperationV128Shuffle(lanes) + c.emit(op) + c.pc += 15 + case wasm.OpcodeVecV128AnyTrue: + c.emit( + newOperationV128AnyTrue(), + ) + case wasm.OpcodeVecI8x16AllTrue: + c.emit( + newOperationV128AllTrue(shapeI8x16), + ) + case wasm.OpcodeVecI16x8AllTrue: + c.emit( + newOperationV128AllTrue(shapeI16x8), + ) + case wasm.OpcodeVecI32x4AllTrue: + c.emit( + newOperationV128AllTrue(shapeI32x4), + ) + case wasm.OpcodeVecI64x2AllTrue: + c.emit( + newOperationV128AllTrue(shapeI64x2), + ) + case wasm.OpcodeVecI8x16BitMask: + c.emit( + newOperationV128BitMask(shapeI8x16), + ) + case wasm.OpcodeVecI16x8BitMask: + c.emit( + newOperationV128BitMask(shapeI16x8), + ) + case wasm.OpcodeVecI32x4BitMask: + c.emit( + newOperationV128BitMask(shapeI32x4), + ) + case wasm.OpcodeVecI64x2BitMask: + c.emit( + newOperationV128BitMask(shapeI64x2), + ) + case wasm.OpcodeVecV128And: + c.emit( + newOperationV128And(), + ) + case wasm.OpcodeVecV128Not: + c.emit( + newOperationV128Not(), + ) + case wasm.OpcodeVecV128Or: + c.emit( + newOperationV128Or(), + ) + case wasm.OpcodeVecV128Xor: + 
c.emit( + newOperationV128Xor(), + ) + case wasm.OpcodeVecV128Bitselect: + c.emit( + newOperationV128Bitselect(), + ) + case wasm.OpcodeVecV128AndNot: + c.emit( + newOperationV128AndNot(), + ) + case wasm.OpcodeVecI8x16Shl: + c.emit( + newOperationV128Shl(shapeI8x16), + ) + case wasm.OpcodeVecI8x16ShrS: + c.emit( + newOperationV128Shr(shapeI8x16, true), + ) + case wasm.OpcodeVecI8x16ShrU: + c.emit( + newOperationV128Shr(shapeI8x16, false), + ) + case wasm.OpcodeVecI16x8Shl: + c.emit( + newOperationV128Shl(shapeI16x8), + ) + case wasm.OpcodeVecI16x8ShrS: + c.emit( + newOperationV128Shr(shapeI16x8, true), + ) + case wasm.OpcodeVecI16x8ShrU: + c.emit( + newOperationV128Shr(shapeI16x8, false), + ) + case wasm.OpcodeVecI32x4Shl: + c.emit( + newOperationV128Shl(shapeI32x4), + ) + case wasm.OpcodeVecI32x4ShrS: + c.emit( + newOperationV128Shr(shapeI32x4, true), + ) + case wasm.OpcodeVecI32x4ShrU: + c.emit( + newOperationV128Shr(shapeI32x4, false), + ) + case wasm.OpcodeVecI64x2Shl: + c.emit( + newOperationV128Shl(shapeI64x2), + ) + case wasm.OpcodeVecI64x2ShrS: + c.emit( + newOperationV128Shr(shapeI64x2, true), + ) + case wasm.OpcodeVecI64x2ShrU: + c.emit( + newOperationV128Shr(shapeI64x2, false), + ) + case wasm.OpcodeVecI8x16Eq: + c.emit( + newOperationV128Cmp(v128CmpTypeI8x16Eq), + ) + case wasm.OpcodeVecI8x16Ne: + c.emit( + newOperationV128Cmp(v128CmpTypeI8x16Ne), + ) + case wasm.OpcodeVecI8x16LtS: + c.emit( + newOperationV128Cmp(v128CmpTypeI8x16LtS), + ) + case wasm.OpcodeVecI8x16LtU: + c.emit( + newOperationV128Cmp(v128CmpTypeI8x16LtU), + ) + case wasm.OpcodeVecI8x16GtS: + c.emit( + newOperationV128Cmp(v128CmpTypeI8x16GtS), + ) + case wasm.OpcodeVecI8x16GtU: + c.emit( + newOperationV128Cmp(v128CmpTypeI8x16GtU), + ) + case wasm.OpcodeVecI8x16LeS: + c.emit( + newOperationV128Cmp(v128CmpTypeI8x16LeS), + ) + case wasm.OpcodeVecI8x16LeU: + c.emit( + newOperationV128Cmp(v128CmpTypeI8x16LeU), + ) + case wasm.OpcodeVecI8x16GeS: + c.emit( + newOperationV128Cmp(v128CmpTypeI8x16GeS), + ) + case wasm.OpcodeVecI8x16GeU: + c.emit( + newOperationV128Cmp(v128CmpTypeI8x16GeU), + ) + case wasm.OpcodeVecI16x8Eq: + c.emit( + newOperationV128Cmp(v128CmpTypeI16x8Eq), + ) + case wasm.OpcodeVecI16x8Ne: + c.emit( + newOperationV128Cmp(v128CmpTypeI16x8Ne), + ) + case wasm.OpcodeVecI16x8LtS: + c.emit( + newOperationV128Cmp(v128CmpTypeI16x8LtS), + ) + case wasm.OpcodeVecI16x8LtU: + c.emit( + newOperationV128Cmp(v128CmpTypeI16x8LtU), + ) + case wasm.OpcodeVecI16x8GtS: + c.emit( + newOperationV128Cmp(v128CmpTypeI16x8GtS), + ) + case wasm.OpcodeVecI16x8GtU: + c.emit( + newOperationV128Cmp(v128CmpTypeI16x8GtU), + ) + case wasm.OpcodeVecI16x8LeS: + c.emit( + newOperationV128Cmp(v128CmpTypeI16x8LeS), + ) + case wasm.OpcodeVecI16x8LeU: + c.emit( + newOperationV128Cmp(v128CmpTypeI16x8LeU), + ) + case wasm.OpcodeVecI16x8GeS: + c.emit( + newOperationV128Cmp(v128CmpTypeI16x8GeS), + ) + case wasm.OpcodeVecI16x8GeU: + c.emit( + newOperationV128Cmp(v128CmpTypeI16x8GeU), + ) + case wasm.OpcodeVecI32x4Eq: + c.emit( + newOperationV128Cmp(v128CmpTypeI32x4Eq), + ) + case wasm.OpcodeVecI32x4Ne: + c.emit( + newOperationV128Cmp(v128CmpTypeI32x4Ne), + ) + case wasm.OpcodeVecI32x4LtS: + c.emit( + newOperationV128Cmp(v128CmpTypeI32x4LtS), + ) + case wasm.OpcodeVecI32x4LtU: + c.emit( + newOperationV128Cmp(v128CmpTypeI32x4LtU), + ) + case wasm.OpcodeVecI32x4GtS: + c.emit( + newOperationV128Cmp(v128CmpTypeI32x4GtS), + ) + case wasm.OpcodeVecI32x4GtU: + c.emit( + newOperationV128Cmp(v128CmpTypeI32x4GtU), + ) + case wasm.OpcodeVecI32x4LeS: + 
c.emit( + newOperationV128Cmp(v128CmpTypeI32x4LeS), + ) + case wasm.OpcodeVecI32x4LeU: + c.emit( + newOperationV128Cmp(v128CmpTypeI32x4LeU), + ) + case wasm.OpcodeVecI32x4GeS: + c.emit( + newOperationV128Cmp(v128CmpTypeI32x4GeS), + ) + case wasm.OpcodeVecI32x4GeU: + c.emit( + newOperationV128Cmp(v128CmpTypeI32x4GeU), + ) + case wasm.OpcodeVecI64x2Eq: + c.emit( + newOperationV128Cmp(v128CmpTypeI64x2Eq), + ) + case wasm.OpcodeVecI64x2Ne: + c.emit( + newOperationV128Cmp(v128CmpTypeI64x2Ne), + ) + case wasm.OpcodeVecI64x2LtS: + c.emit( + newOperationV128Cmp(v128CmpTypeI64x2LtS), + ) + case wasm.OpcodeVecI64x2GtS: + c.emit( + newOperationV128Cmp(v128CmpTypeI64x2GtS), + ) + case wasm.OpcodeVecI64x2LeS: + c.emit( + newOperationV128Cmp(v128CmpTypeI64x2LeS), + ) + case wasm.OpcodeVecI64x2GeS: + c.emit( + newOperationV128Cmp(v128CmpTypeI64x2GeS), + ) + case wasm.OpcodeVecF32x4Eq: + c.emit( + newOperationV128Cmp(v128CmpTypeF32x4Eq), + ) + case wasm.OpcodeVecF32x4Ne: + c.emit( + newOperationV128Cmp(v128CmpTypeF32x4Ne), + ) + case wasm.OpcodeVecF32x4Lt: + c.emit( + newOperationV128Cmp(v128CmpTypeF32x4Lt), + ) + case wasm.OpcodeVecF32x4Gt: + c.emit( + newOperationV128Cmp(v128CmpTypeF32x4Gt), + ) + case wasm.OpcodeVecF32x4Le: + c.emit( + newOperationV128Cmp(v128CmpTypeF32x4Le), + ) + case wasm.OpcodeVecF32x4Ge: + c.emit( + newOperationV128Cmp(v128CmpTypeF32x4Ge), + ) + case wasm.OpcodeVecF64x2Eq: + c.emit( + newOperationV128Cmp(v128CmpTypeF64x2Eq), + ) + case wasm.OpcodeVecF64x2Ne: + c.emit( + newOperationV128Cmp(v128CmpTypeF64x2Ne), + ) + case wasm.OpcodeVecF64x2Lt: + c.emit( + newOperationV128Cmp(v128CmpTypeF64x2Lt), + ) + case wasm.OpcodeVecF64x2Gt: + c.emit( + newOperationV128Cmp(v128CmpTypeF64x2Gt), + ) + case wasm.OpcodeVecF64x2Le: + c.emit( + newOperationV128Cmp(v128CmpTypeF64x2Le), + ) + case wasm.OpcodeVecF64x2Ge: + c.emit( + newOperationV128Cmp(v128CmpTypeF64x2Ge), + ) + case wasm.OpcodeVecI8x16Neg: + c.emit( + newOperationV128Neg(shapeI8x16), + ) + case wasm.OpcodeVecI16x8Neg: + c.emit( + newOperationV128Neg(shapeI16x8), + ) + case wasm.OpcodeVecI32x4Neg: + c.emit( + newOperationV128Neg(shapeI32x4), + ) + case wasm.OpcodeVecI64x2Neg: + c.emit( + newOperationV128Neg(shapeI64x2), + ) + case wasm.OpcodeVecF32x4Neg: + c.emit( + newOperationV128Neg(shapeF32x4), + ) + case wasm.OpcodeVecF64x2Neg: + c.emit( + newOperationV128Neg(shapeF64x2), + ) + case wasm.OpcodeVecI8x16Add: + c.emit( + newOperationV128Add(shapeI8x16), + ) + case wasm.OpcodeVecI16x8Add: + c.emit( + newOperationV128Add(shapeI16x8), + ) + case wasm.OpcodeVecI32x4Add: + c.emit( + newOperationV128Add(shapeI32x4), + ) + case wasm.OpcodeVecI64x2Add: + c.emit( + newOperationV128Add(shapeI64x2), + ) + case wasm.OpcodeVecF32x4Add: + c.emit( + newOperationV128Add(shapeF32x4), + ) + case wasm.OpcodeVecF64x2Add: + c.emit( + newOperationV128Add(shapeF64x2), + ) + case wasm.OpcodeVecI8x16Sub: + c.emit( + newOperationV128Sub(shapeI8x16), + ) + case wasm.OpcodeVecI16x8Sub: + c.emit( + newOperationV128Sub(shapeI16x8), + ) + case wasm.OpcodeVecI32x4Sub: + c.emit( + newOperationV128Sub(shapeI32x4), + ) + case wasm.OpcodeVecI64x2Sub: + c.emit( + newOperationV128Sub(shapeI64x2), + ) + case wasm.OpcodeVecF32x4Sub: + c.emit( + newOperationV128Sub(shapeF32x4), + ) + case wasm.OpcodeVecF64x2Sub: + c.emit( + newOperationV128Sub(shapeF64x2), + ) + case wasm.OpcodeVecI8x16AddSatS: + c.emit( + newOperationV128AddSat(shapeI8x16, true), + ) + case wasm.OpcodeVecI8x16AddSatU: + c.emit( + newOperationV128AddSat(shapeI8x16, false), + ) + case 
wasm.OpcodeVecI16x8AddSatS: + c.emit( + newOperationV128AddSat(shapeI16x8, true), + ) + case wasm.OpcodeVecI16x8AddSatU: + c.emit( + newOperationV128AddSat(shapeI16x8, false), + ) + case wasm.OpcodeVecI8x16SubSatS: + c.emit( + newOperationV128SubSat(shapeI8x16, true), + ) + case wasm.OpcodeVecI8x16SubSatU: + c.emit( + newOperationV128SubSat(shapeI8x16, false), + ) + case wasm.OpcodeVecI16x8SubSatS: + c.emit( + newOperationV128SubSat(shapeI16x8, true), + ) + case wasm.OpcodeVecI16x8SubSatU: + c.emit( + newOperationV128SubSat(shapeI16x8, false), + ) + case wasm.OpcodeVecI16x8Mul: + c.emit( + newOperationV128Mul(shapeI16x8), + ) + case wasm.OpcodeVecI32x4Mul: + c.emit( + newOperationV128Mul(shapeI32x4), + ) + case wasm.OpcodeVecI64x2Mul: + c.emit( + newOperationV128Mul(shapeI64x2), + ) + case wasm.OpcodeVecF32x4Mul: + c.emit( + newOperationV128Mul(shapeF32x4), + ) + case wasm.OpcodeVecF64x2Mul: + c.emit( + newOperationV128Mul(shapeF64x2), + ) + case wasm.OpcodeVecF32x4Sqrt: + c.emit( + newOperationV128Sqrt(shapeF32x4), + ) + case wasm.OpcodeVecF64x2Sqrt: + c.emit( + newOperationV128Sqrt(shapeF64x2), + ) + case wasm.OpcodeVecF32x4Div: + c.emit( + newOperationV128Div(shapeF32x4), + ) + case wasm.OpcodeVecF64x2Div: + c.emit( + newOperationV128Div(shapeF64x2), + ) + case wasm.OpcodeVecI8x16Abs: + c.emit( + newOperationV128Abs(shapeI8x16), + ) + case wasm.OpcodeVecI8x16Popcnt: + c.emit( + newOperationV128Popcnt(shapeI8x16), + ) + case wasm.OpcodeVecI16x8Abs: + c.emit( + newOperationV128Abs(shapeI16x8), + ) + case wasm.OpcodeVecI32x4Abs: + c.emit( + newOperationV128Abs(shapeI32x4), + ) + case wasm.OpcodeVecI64x2Abs: + c.emit( + newOperationV128Abs(shapeI64x2), + ) + case wasm.OpcodeVecF32x4Abs: + c.emit( + newOperationV128Abs(shapeF32x4), + ) + case wasm.OpcodeVecF64x2Abs: + c.emit( + newOperationV128Abs(shapeF64x2), + ) + case wasm.OpcodeVecI8x16MinS: + c.emit( + newOperationV128Min(shapeI8x16, true), + ) + case wasm.OpcodeVecI8x16MinU: + c.emit( + newOperationV128Min(shapeI8x16, false), + ) + case wasm.OpcodeVecI8x16MaxS: + c.emit( + newOperationV128Max(shapeI8x16, true), + ) + case wasm.OpcodeVecI8x16MaxU: + c.emit( + newOperationV128Max(shapeI8x16, false), + ) + case wasm.OpcodeVecI8x16AvgrU: + c.emit( + newOperationV128AvgrU(shapeI8x16), + ) + case wasm.OpcodeVecI16x8MinS: + c.emit( + newOperationV128Min(shapeI16x8, true), + ) + case wasm.OpcodeVecI16x8MinU: + c.emit( + newOperationV128Min(shapeI16x8, false), + ) + case wasm.OpcodeVecI16x8MaxS: + c.emit( + newOperationV128Max(shapeI16x8, true), + ) + case wasm.OpcodeVecI16x8MaxU: + c.emit( + newOperationV128Max(shapeI16x8, false), + ) + case wasm.OpcodeVecI16x8AvgrU: + c.emit( + newOperationV128AvgrU(shapeI16x8), + ) + case wasm.OpcodeVecI32x4MinS: + c.emit( + newOperationV128Min(shapeI32x4, true), + ) + case wasm.OpcodeVecI32x4MinU: + c.emit( + newOperationV128Min(shapeI32x4, false), + ) + case wasm.OpcodeVecI32x4MaxS: + c.emit( + newOperationV128Max(shapeI32x4, true), + ) + case wasm.OpcodeVecI32x4MaxU: + c.emit( + newOperationV128Max(shapeI32x4, false), + ) + case wasm.OpcodeVecF32x4Min: + c.emit( + newOperationV128Min(shapeF32x4, false), + ) + case wasm.OpcodeVecF32x4Max: + c.emit( + newOperationV128Max(shapeF32x4, false), + ) + case wasm.OpcodeVecF64x2Min: + c.emit( + newOperationV128Min(shapeF64x2, false), + ) + case wasm.OpcodeVecF64x2Max: + c.emit( + newOperationV128Max(shapeF64x2, false), + ) + case wasm.OpcodeVecF32x4Pmin: + c.emit( + newOperationV128Pmin(shapeF32x4), + ) + case wasm.OpcodeVecF32x4Pmax: + c.emit( + 
newOperationV128Pmax(shapeF32x4), + ) + case wasm.OpcodeVecF64x2Pmin: + c.emit( + newOperationV128Pmin(shapeF64x2), + ) + case wasm.OpcodeVecF64x2Pmax: + c.emit( + newOperationV128Pmax(shapeF64x2), + ) + case wasm.OpcodeVecF32x4Ceil: + c.emit( + newOperationV128Ceil(shapeF32x4), + ) + case wasm.OpcodeVecF32x4Floor: + c.emit( + newOperationV128Floor(shapeF32x4), + ) + case wasm.OpcodeVecF32x4Trunc: + c.emit( + newOperationV128Trunc(shapeF32x4), + ) + case wasm.OpcodeVecF32x4Nearest: + c.emit( + newOperationV128Nearest(shapeF32x4), + ) + case wasm.OpcodeVecF64x2Ceil: + c.emit( + newOperationV128Ceil(shapeF64x2), + ) + case wasm.OpcodeVecF64x2Floor: + c.emit( + newOperationV128Floor(shapeF64x2), + ) + case wasm.OpcodeVecF64x2Trunc: + c.emit( + newOperationV128Trunc(shapeF64x2), + ) + case wasm.OpcodeVecF64x2Nearest: + c.emit( + newOperationV128Nearest(shapeF64x2), + ) + case wasm.OpcodeVecI16x8ExtendLowI8x16S: + c.emit( + newOperationV128Extend(shapeI8x16, true, true), + ) + case wasm.OpcodeVecI16x8ExtendHighI8x16S: + c.emit( + newOperationV128Extend(shapeI8x16, true, false), + ) + case wasm.OpcodeVecI16x8ExtendLowI8x16U: + c.emit( + newOperationV128Extend(shapeI8x16, false, true), + ) + case wasm.OpcodeVecI16x8ExtendHighI8x16U: + c.emit( + newOperationV128Extend(shapeI8x16, false, false), + ) + case wasm.OpcodeVecI32x4ExtendLowI16x8S: + c.emit( + newOperationV128Extend(shapeI16x8, true, true), + ) + case wasm.OpcodeVecI32x4ExtendHighI16x8S: + c.emit( + newOperationV128Extend(shapeI16x8, true, false), + ) + case wasm.OpcodeVecI32x4ExtendLowI16x8U: + c.emit( + newOperationV128Extend(shapeI16x8, false, true), + ) + case wasm.OpcodeVecI32x4ExtendHighI16x8U: + c.emit( + newOperationV128Extend(shapeI16x8, false, false), + ) + case wasm.OpcodeVecI64x2ExtendLowI32x4S: + c.emit( + newOperationV128Extend(shapeI32x4, true, true), + ) + case wasm.OpcodeVecI64x2ExtendHighI32x4S: + c.emit( + newOperationV128Extend(shapeI32x4, true, false), + ) + case wasm.OpcodeVecI64x2ExtendLowI32x4U: + c.emit( + newOperationV128Extend(shapeI32x4, false, true), + ) + case wasm.OpcodeVecI64x2ExtendHighI32x4U: + c.emit( + newOperationV128Extend(shapeI32x4, false, false), + ) + case wasm.OpcodeVecI16x8Q15mulrSatS: + c.emit( + newOperationV128Q15mulrSatS(), + ) + case wasm.OpcodeVecI16x8ExtMulLowI8x16S: + c.emit( + newOperationV128ExtMul(shapeI8x16, true, true), + ) + case wasm.OpcodeVecI16x8ExtMulHighI8x16S: + c.emit( + newOperationV128ExtMul(shapeI8x16, true, false), + ) + case wasm.OpcodeVecI16x8ExtMulLowI8x16U: + c.emit( + newOperationV128ExtMul(shapeI8x16, false, true), + ) + case wasm.OpcodeVecI16x8ExtMulHighI8x16U: + c.emit( + newOperationV128ExtMul(shapeI8x16, false, false), + ) + case wasm.OpcodeVecI32x4ExtMulLowI16x8S: + c.emit( + newOperationV128ExtMul(shapeI16x8, true, true), + ) + case wasm.OpcodeVecI32x4ExtMulHighI16x8S: + c.emit( + newOperationV128ExtMul(shapeI16x8, true, false), + ) + case wasm.OpcodeVecI32x4ExtMulLowI16x8U: + c.emit( + newOperationV128ExtMul(shapeI16x8, false, true), + ) + case wasm.OpcodeVecI32x4ExtMulHighI16x8U: + c.emit( + newOperationV128ExtMul(shapeI16x8, false, false), + ) + case wasm.OpcodeVecI64x2ExtMulLowI32x4S: + c.emit( + newOperationV128ExtMul(shapeI32x4, true, true), + ) + case wasm.OpcodeVecI64x2ExtMulHighI32x4S: + c.emit( + newOperationV128ExtMul(shapeI32x4, true, false), + ) + case wasm.OpcodeVecI64x2ExtMulLowI32x4U: + c.emit( + newOperationV128ExtMul(shapeI32x4, false, true), + ) + case wasm.OpcodeVecI64x2ExtMulHighI32x4U: + c.emit( + newOperationV128ExtMul(shapeI32x4, 
false, false), + ) + case wasm.OpcodeVecI16x8ExtaddPairwiseI8x16S: + c.emit( + newOperationV128ExtAddPairwise(shapeI8x16, true), + ) + case wasm.OpcodeVecI16x8ExtaddPairwiseI8x16U: + c.emit( + newOperationV128ExtAddPairwise(shapeI8x16, false), + ) + case wasm.OpcodeVecI32x4ExtaddPairwiseI16x8S: + c.emit( + newOperationV128ExtAddPairwise(shapeI16x8, true), + ) + case wasm.OpcodeVecI32x4ExtaddPairwiseI16x8U: + c.emit( + newOperationV128ExtAddPairwise(shapeI16x8, false), + ) + case wasm.OpcodeVecF64x2PromoteLowF32x4Zero: + c.emit( + newOperationV128FloatPromote(), + ) + case wasm.OpcodeVecF32x4DemoteF64x2Zero: + c.emit( + newOperationV128FloatDemote(), + ) + case wasm.OpcodeVecF32x4ConvertI32x4S: + c.emit( + newOperationV128FConvertFromI(shapeF32x4, true), + ) + case wasm.OpcodeVecF32x4ConvertI32x4U: + c.emit( + newOperationV128FConvertFromI(shapeF32x4, false), + ) + case wasm.OpcodeVecF64x2ConvertLowI32x4S: + c.emit( + newOperationV128FConvertFromI(shapeF64x2, true), + ) + case wasm.OpcodeVecF64x2ConvertLowI32x4U: + c.emit( + newOperationV128FConvertFromI(shapeF64x2, false), + ) + case wasm.OpcodeVecI32x4DotI16x8S: + c.emit( + newOperationV128Dot(), + ) + case wasm.OpcodeVecI8x16NarrowI16x8S: + c.emit( + newOperationV128Narrow(shapeI16x8, true), + ) + case wasm.OpcodeVecI8x16NarrowI16x8U: + c.emit( + newOperationV128Narrow(shapeI16x8, false), + ) + case wasm.OpcodeVecI16x8NarrowI32x4S: + c.emit( + newOperationV128Narrow(shapeI32x4, true), + ) + case wasm.OpcodeVecI16x8NarrowI32x4U: + c.emit( + newOperationV128Narrow(shapeI32x4, false), + ) + case wasm.OpcodeVecI32x4TruncSatF32x4S: + c.emit( + newOperationV128ITruncSatFromF(shapeF32x4, true), + ) + case wasm.OpcodeVecI32x4TruncSatF32x4U: + c.emit( + newOperationV128ITruncSatFromF(shapeF32x4, false), + ) + case wasm.OpcodeVecI32x4TruncSatF64x2SZero: + c.emit( + newOperationV128ITruncSatFromF(shapeF64x2, true), + ) + case wasm.OpcodeVecI32x4TruncSatF64x2UZero: + c.emit( + newOperationV128ITruncSatFromF(shapeF64x2, false), + ) + default: + return fmt.Errorf("unsupported vector instruction in interpreterir: %s", wasm.VectorInstructionName(vecOp)) + } + case wasm.OpcodeAtomicPrefix: + c.pc++ + atomicOp := c.body[c.pc] + switch atomicOp { + case wasm.OpcodeAtomicMemoryWait32: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicMemoryWait32Name) + if err != nil { + return err + } + c.emit( + newOperationAtomicMemoryWait(unsignedTypeI32, imm), + ) + case wasm.OpcodeAtomicMemoryWait64: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicMemoryWait64Name) + if err != nil { + return err + } + c.emit( + newOperationAtomicMemoryWait(unsignedTypeI64, imm), + ) + case wasm.OpcodeAtomicMemoryNotify: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicMemoryNotifyName) + if err != nil { + return err + } + c.emit( + newOperationAtomicMemoryNotify(imm), + ) + case wasm.OpcodeAtomicFence: + // Skip immediate value + c.pc++ + _ = c.body[c.pc] + c.emit( + newOperationAtomicFence(), + ) + case wasm.OpcodeAtomicI32Load: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32LoadName) + if err != nil { + return err + } + c.emit( + newOperationAtomicLoad(unsignedTypeI32, imm), + ) + case wasm.OpcodeAtomicI64Load: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64LoadName) + if err != nil { + return err + } + c.emit( + newOperationAtomicLoad(unsignedTypeI64, imm), + ) + case wasm.OpcodeAtomicI32Load8U: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Load8UName) + if err != nil { + return err + } + c.emit( + newOperationAtomicLoad8(unsignedTypeI32, imm), + ) + case 
wasm.OpcodeAtomicI32Load16U: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Load16UName) + if err != nil { + return err + } + c.emit( + newOperationAtomicLoad16(unsignedTypeI32, imm), + ) + case wasm.OpcodeAtomicI64Load8U: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Load8UName) + if err != nil { + return err + } + c.emit( + newOperationAtomicLoad8(unsignedTypeI64, imm), + ) + case wasm.OpcodeAtomicI64Load16U: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Load16UName) + if err != nil { + return err + } + c.emit( + newOperationAtomicLoad16(unsignedTypeI64, imm), + ) + case wasm.OpcodeAtomicI64Load32U: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Load32UName) + if err != nil { + return err + } + c.emit( + newOperationAtomicLoad(unsignedTypeI32, imm), + ) + case wasm.OpcodeAtomicI32Store: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32StoreName) + if err != nil { + return err + } + c.emit( + newOperationAtomicStore(unsignedTypeI32, imm), + ) + case wasm.OpcodeAtomicI32Store8: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Store8Name) + if err != nil { + return err + } + c.emit( + newOperationAtomicStore8(unsignedTypeI32, imm), + ) + case wasm.OpcodeAtomicI32Store16: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Store16Name) + if err != nil { + return err + } + c.emit( + newOperationAtomicStore16(unsignedTypeI32, imm), + ) + case wasm.OpcodeAtomicI64Store: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64StoreName) + if err != nil { + return err + } + c.emit( + newOperationAtomicStore(unsignedTypeI64, imm), + ) + case wasm.OpcodeAtomicI64Store8: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Store8Name) + if err != nil { + return err + } + c.emit( + newOperationAtomicStore8(unsignedTypeI64, imm), + ) + case wasm.OpcodeAtomicI64Store16: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Store16Name) + if err != nil { + return err + } + c.emit( + newOperationAtomicStore16(unsignedTypeI64, imm), + ) + case wasm.OpcodeAtomicI64Store32: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Store32Name) + if err != nil { + return err + } + c.emit( + newOperationAtomicStore(unsignedTypeI32, imm), + ) + case wasm.OpcodeAtomicI32RmwAdd: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32RmwAddName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI32, imm, atomicArithmeticOpAdd), + ) + case wasm.OpcodeAtomicI64RmwAdd: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64RmwAddName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI64, imm, atomicArithmeticOpAdd), + ) + case wasm.OpcodeAtomicI32Rmw8AddU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw8AddUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8(unsignedTypeI32, imm, atomicArithmeticOpAdd), + ) + case wasm.OpcodeAtomicI64Rmw8AddU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw8AddUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8(unsignedTypeI64, imm, atomicArithmeticOpAdd), + ) + case wasm.OpcodeAtomicI32Rmw16AddU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw16AddUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16(unsignedTypeI32, imm, atomicArithmeticOpAdd), + ) + case wasm.OpcodeAtomicI64Rmw16AddU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw16AddUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16(unsignedTypeI64, imm, atomicArithmeticOpAdd), + ) + case wasm.OpcodeAtomicI64Rmw32AddU: + imm, err := 
c.readMemoryArg(wasm.OpcodeAtomicI64Rmw32AddUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI32, imm, atomicArithmeticOpAdd), + ) + case wasm.OpcodeAtomicI32RmwSub: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32RmwSubName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI32, imm, atomicArithmeticOpSub), + ) + case wasm.OpcodeAtomicI64RmwSub: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64RmwSubName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI64, imm, atomicArithmeticOpSub), + ) + case wasm.OpcodeAtomicI32Rmw8SubU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw8SubUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8(unsignedTypeI32, imm, atomicArithmeticOpSub), + ) + case wasm.OpcodeAtomicI64Rmw8SubU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw8SubUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8(unsignedTypeI64, imm, atomicArithmeticOpSub), + ) + case wasm.OpcodeAtomicI32Rmw16SubU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw16SubUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16(unsignedTypeI32, imm, atomicArithmeticOpSub), + ) + case wasm.OpcodeAtomicI64Rmw16SubU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw16SubUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16(unsignedTypeI64, imm, atomicArithmeticOpSub), + ) + case wasm.OpcodeAtomicI64Rmw32SubU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw32SubUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI32, imm, atomicArithmeticOpSub), + ) + case wasm.OpcodeAtomicI32RmwAnd: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32RmwAndName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI32, imm, atomicArithmeticOpAnd), + ) + case wasm.OpcodeAtomicI64RmwAnd: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64RmwAndName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI64, imm, atomicArithmeticOpAnd), + ) + case wasm.OpcodeAtomicI32Rmw8AndU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw8AndUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8(unsignedTypeI32, imm, atomicArithmeticOpAnd), + ) + case wasm.OpcodeAtomicI64Rmw8AndU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw8AndUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8(unsignedTypeI64, imm, atomicArithmeticOpAnd), + ) + case wasm.OpcodeAtomicI32Rmw16AndU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw16AndUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16(unsignedTypeI32, imm, atomicArithmeticOpAnd), + ) + case wasm.OpcodeAtomicI64Rmw16AndU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw16AndUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16(unsignedTypeI64, imm, atomicArithmeticOpAnd), + ) + case wasm.OpcodeAtomicI64Rmw32AndU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw32AndUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI32, imm, atomicArithmeticOpAnd), + ) + case wasm.OpcodeAtomicI32RmwOr: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32RmwOrName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI32, imm, atomicArithmeticOpOr), + ) + case wasm.OpcodeAtomicI64RmwOr: + imm, err := 
c.readMemoryArg(wasm.OpcodeAtomicI64RmwOrName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI64, imm, atomicArithmeticOpOr), + ) + case wasm.OpcodeAtomicI32Rmw8OrU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw8OrUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8(unsignedTypeI32, imm, atomicArithmeticOpOr), + ) + case wasm.OpcodeAtomicI64Rmw8OrU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw8OrUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8(unsignedTypeI64, imm, atomicArithmeticOpOr), + ) + case wasm.OpcodeAtomicI32Rmw16OrU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw16OrUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16(unsignedTypeI32, imm, atomicArithmeticOpOr), + ) + case wasm.OpcodeAtomicI64Rmw16OrU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw16OrUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16(unsignedTypeI64, imm, atomicArithmeticOpOr), + ) + case wasm.OpcodeAtomicI64Rmw32OrU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw32OrUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI32, imm, atomicArithmeticOpOr), + ) + case wasm.OpcodeAtomicI32RmwXor: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32RmwXorName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI32, imm, atomicArithmeticOpXor), + ) + case wasm.OpcodeAtomicI64RmwXor: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64RmwXorName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI64, imm, atomicArithmeticOpXor), + ) + case wasm.OpcodeAtomicI32Rmw8XorU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw8XorUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8(unsignedTypeI32, imm, atomicArithmeticOpXor), + ) + case wasm.OpcodeAtomicI64Rmw8XorU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw8XorUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8(unsignedTypeI64, imm, atomicArithmeticOpXor), + ) + case wasm.OpcodeAtomicI32Rmw16XorU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw16XorUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16(unsignedTypeI32, imm, atomicArithmeticOpXor), + ) + case wasm.OpcodeAtomicI64Rmw16XorU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw16XorUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16(unsignedTypeI64, imm, atomicArithmeticOpXor), + ) + case wasm.OpcodeAtomicI64Rmw32XorU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw32XorUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI32, imm, atomicArithmeticOpXor), + ) + case wasm.OpcodeAtomicI32RmwXchg: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32RmwXchgName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI32, imm, atomicArithmeticOpNop), + ) + case wasm.OpcodeAtomicI64RmwXchg: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64RmwXchgName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI64, imm, atomicArithmeticOpNop), + ) + case wasm.OpcodeAtomicI32Rmw8XchgU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw8XchgUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8(unsignedTypeI32, imm, atomicArithmeticOpNop), + ) + case wasm.OpcodeAtomicI64Rmw8XchgU: + imm, err := 
c.readMemoryArg(wasm.OpcodeAtomicI64Rmw8XchgUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8(unsignedTypeI64, imm, atomicArithmeticOpNop), + ) + case wasm.OpcodeAtomicI32Rmw16XchgU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw16XchgUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16(unsignedTypeI32, imm, atomicArithmeticOpNop), + ) + case wasm.OpcodeAtomicI64Rmw16XchgU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw16XchgUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16(unsignedTypeI64, imm, atomicArithmeticOpNop), + ) + case wasm.OpcodeAtomicI64Rmw32XchgU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw32XchgUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI32, imm, atomicArithmeticOpNop), + ) + case wasm.OpcodeAtomicI32RmwCmpxchg: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32RmwCmpxchgName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMWCmpxchg(unsignedTypeI32, imm), + ) + case wasm.OpcodeAtomicI64RmwCmpxchg: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64RmwCmpxchgName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMWCmpxchg(unsignedTypeI64, imm), + ) + case wasm.OpcodeAtomicI32Rmw8CmpxchgU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw8CmpxchgUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8Cmpxchg(unsignedTypeI32, imm), + ) + case wasm.OpcodeAtomicI64Rmw8CmpxchgU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw8CmpxchgUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8Cmpxchg(unsignedTypeI64, imm), + ) + case wasm.OpcodeAtomicI32Rmw16CmpxchgU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw16CmpxchgUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16Cmpxchg(unsignedTypeI32, imm), + ) + case wasm.OpcodeAtomicI64Rmw16CmpxchgU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw16CmpxchgUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16Cmpxchg(unsignedTypeI64, imm), + ) + case wasm.OpcodeAtomicI64Rmw32CmpxchgU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw32CmpxchgUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMWCmpxchg(unsignedTypeI32, imm), + ) + default: + return fmt.Errorf("unsupported atomic instruction in interpreterir: %s", wasm.AtomicInstructionName(atomicOp)) + } + default: + return fmt.Errorf("unsupported instruction in interpreterir: 0x%x", op) + } + + // Move the program counter to point to the next instruction. + c.pc++ + return nil +} + +func (c *compiler) nextFrameID() (id uint32) { + id = c.currentFrameID + 1 + c.currentFrameID++ + return +} + +func (c *compiler) applyToStack(opcode wasm.Opcode) (index uint32, err error) { + switch opcode { + case + // These are the opcodes that is coupled with "index" immediate + // and it DOES affect the signature of opcode. + wasm.OpcodeCall, + wasm.OpcodeCallIndirect, + wasm.OpcodeLocalGet, + wasm.OpcodeLocalSet, + wasm.OpcodeLocalTee, + wasm.OpcodeGlobalGet, + wasm.OpcodeGlobalSet: + // Assumes that we are at the opcode now so skip it before read immediates. + v, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return 0, fmt.Errorf("reading immediates: %w", err) + } + c.pc += num + index = v + default: + // Note that other opcodes are free of index + // as it doesn't affect the signature of opt code. 
+		// In other words, the "index" argument of wasmOpcodeSignature
+		// is ignored for those opcodes.
+	}
+
+	if c.unreachableState.on {
+		return 0, nil
+	}
+
+	// Retrieve the signature of the opcode.
+	s, err := c.wasmOpcodeSignature(opcode, index)
+	if err != nil {
+		return 0, err
+	}
+
+	// Manipulate the stack according to the signature.
+	// Note that the following algorithm assumes that
+	// the unknown type is unique in the signature,
+	// and is determined by the actual type on the stack.
+	// The determined type is stored in typeParam below.
+	var typeParam unsignedType
+	var typeParamFound bool
+	for i := range s.in {
+		want := s.in[len(s.in)-1-i]
+		actual := c.stackPop()
+		if want == unsignedTypeUnknown && typeParamFound {
+			want = typeParam
+		} else if want == unsignedTypeUnknown {
+			want = actual
+			typeParam = want
+			typeParamFound = true
+		}
+		if want != actual {
+			return 0, fmt.Errorf("input signature mismatch: want %s but have %s", want, actual)
+		}
+	}
+
+	for _, target := range s.out {
+		if target == unsignedTypeUnknown && !typeParamFound {
+			return 0, fmt.Errorf("cannot determine type of unknown result")
+		} else if target == unsignedTypeUnknown {
+			c.stackPush(typeParam)
+		} else {
+			c.stackPush(target)
+		}
+	}
+
+	return index, nil
+}
+
+func (c *compiler) stackPeek() (ret unsignedType) {
+	ret = c.stack[len(c.stack)-1]
+	return
+}
+
+func (c *compiler) stackPop() (ret unsignedType) {
+	// No need to check the stack bound
+	// as we can assume that all the operations
+	// are valid thanks to validateFunction
+	// at the module validation phase.
+	ret = c.stack[len(c.stack)-1]
+	c.stack = c.stack[:len(c.stack)-1]
+	return
+}
+
+func (c *compiler) stackPush(ts unsignedType) {
+	c.stack = append(c.stack, ts)
+}
+
+// emit adds the operation into the result.
+func (c *compiler) emit(op unionOperation) {
+	if !c.unreachableState.on {
+		switch op.Kind {
+		case operationKindDrop:
+			// If the drop range is nil,
+			// we can remove the operation entirely:
+			// that happens when the drop is unnecessary,
+			// i.e. when there's no need to adjust the stack before a jump.
+			if int64(op.U1) == -1 {
+				return
+			}
+		}
+		c.result.Operations = append(c.result.Operations, op)
+		if c.needSourceOffset {
+			c.result.IROperationSourceOffsetsInWasmBinary = append(c.result.IROperationSourceOffsetsInWasmBinary,
+				c.currentOpPC+c.bodyOffsetInCodeSection)
+		}
+	}
+}
+
+// emitDefaultValue emits a constant expression holding the zero value of the given type.
+func (c *compiler) emitDefaultValue(t wasm.ValueType) {
+	switch t {
+	case wasm.ValueTypeI32:
+		c.stackPush(unsignedTypeI32)
+		c.emit(newOperationConstI32(0))
+	case wasm.ValueTypeI64, wasm.ValueTypeExternref, wasm.ValueTypeFuncref:
+		c.stackPush(unsignedTypeI64)
+		c.emit(newOperationConstI64(0))
+	case wasm.ValueTypeF32:
+		c.stackPush(unsignedTypeF32)
+		c.emit(newOperationConstF32(0))
+	case wasm.ValueTypeF64:
+		c.stackPush(unsignedTypeF64)
+		c.emit(newOperationConstF64(0))
+	case wasm.ValueTypeV128:
+		c.stackPush(unsignedTypeV128)
+		c.emit(newOperationV128Const(0, 0))
+	}
+}
+
+// localDepth returns the depth (counted from the top of the stack)
+// of the n-th local.
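To make the height bookkeeping concrete, here is a rough standalone sketch (not part of this package) of how local indexes map to uint64 stack heights: every local takes one slot except v128 locals, which take two, exactly as initializeStack records in localIndexToStackHeightInUint64. localDepth below then converts such a height into a depth measured from the top of the stack. The local types used here are made up for illustration.

package main

import "fmt"

func main() {
	// Hypothetical locals: an i32, a v128, then an f64.
	localIsV128 := []bool{false, true, false}

	heights := make([]int, 0, len(localIsV128))
	current := 0
	for _, isV128 := range localIsV128 {
		heights = append(heights, current)
		if isV128 {
			current++ // a v128 local occupies an extra uint64 slot
		}
		current++
	}
	fmt.Println(heights) // [0 1 3]
}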
+func (c *compiler) localDepth(index wasm.Index) int { + height := c.localIndexToStackHeightInUint64[index] + return c.stackLenInUint64(len(c.stack)) - 1 - int(height) +} + +func (c *compiler) localType(index wasm.Index) (t wasm.ValueType) { + if params := uint32(len(c.sig.Params)); index < params { + t = c.sig.Params[index] + } else { + t = c.localTypes[index-params] + } + return +} + +// getFrameDropRange returns the range (starting from top of the stack) that spans across the (uint64) stack. The range is +// supposed to be dropped from the stack when the given frame exists or branch into it. +// +// * frame is the control frame which the call-site is trying to branch into or exit. +// * isEnd true if the call-site is handling wasm.OpcodeEnd. +func (c *compiler) getFrameDropRange(frame *controlFrame, isEnd bool) inclusiveRange { + var start int + if !isEnd && frame.kind == controlFrameKindLoop { + // If this is not End and the call-site is trying to branch into the Loop control frame, + // we have to Start executing from the beginning of the loop block. + // Therefore, we have to pass the inputs to the frame. + start = frame.blockType.ParamNumInUint64 + } else { + start = frame.blockType.ResultNumInUint64 + } + var end int + if frame.kind == controlFrameKindFunction { + // On the function return, we eliminate all the contents on the stack + // including locals (existing below of frame.originalStackLen) + end = c.stackLenInUint64(len(c.stack)) - 1 + } else { + end = c.stackLenInUint64(len(c.stack)) - 1 - c.stackLenInUint64(frame.originalStackLenWithoutParam) + } + if start <= end { + return inclusiveRange{Start: int32(start), End: int32(end)} + } else { + return nopinclusiveRange + } +} + +func (c *compiler) stackLenInUint64(ceil int) (ret int) { + for i := 0; i < ceil; i++ { + if c.stack[i] == unsignedTypeV128 { + ret += 2 + } else { + ret++ + } + } + return +} + +func (c *compiler) readMemoryArg(tag string) (memoryArg, error) { + c.result.UsesMemory = true + alignment, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return memoryArg{}, fmt.Errorf("reading alignment for %s: %w", tag, err) + } + c.pc += num + offset, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return memoryArg{}, fmt.Errorf("reading offset for %s: %w", tag, err) + } + c.pc += num + return memoryArg{Offset: offset, Alignment: alignment}, nil +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/format.go b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/format.go new file mode 100644 index 000000000..8af1d94b0 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/format.go @@ -0,0 +1,22 @@ +package interpreter + +import ( + "bytes" +) + +func format(ops []unionOperation) string { + buf := bytes.NewBuffer(nil) + + _, _ = buf.WriteString(".entrypoint\n") + for i := range ops { + op := &ops[i] + str := op.String() + isLabel := op.Kind == operationKindLabel + if !isLabel { + const indent = "\t" + str = indent + str + } + _, _ = buf.WriteString(str + "\n") + } + return buf.String() +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/interpreter.go b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/interpreter.go new file mode 100644 index 000000000..a89ddc457 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/interpreter.go @@ -0,0 +1,4583 @@ +package interpreter + +import ( + "context" + "encoding/binary" + "errors" + "fmt" + "math" + 
"math/bits" + "sync" + "unsafe" + + "github.com/tetratelabs/wazero/api" + "github.com/tetratelabs/wazero/experimental" + "github.com/tetratelabs/wazero/internal/expctxkeys" + "github.com/tetratelabs/wazero/internal/filecache" + "github.com/tetratelabs/wazero/internal/internalapi" + "github.com/tetratelabs/wazero/internal/moremath" + "github.com/tetratelabs/wazero/internal/wasm" + "github.com/tetratelabs/wazero/internal/wasmdebug" + "github.com/tetratelabs/wazero/internal/wasmruntime" +) + +// callStackCeiling is the maximum WebAssembly call frame stack height. This allows wazero to raise +// wasm.ErrCallStackOverflow instead of overflowing the Go runtime. +// +// The default value should suffice for most use cases. Those wishing to change this can via `go build -ldflags`. +var callStackCeiling = 2000 + +// engine is an interpreter implementation of wasm.Engine +type engine struct { + enabledFeatures api.CoreFeatures + compiledFunctions map[wasm.ModuleID][]compiledFunction // guarded by mutex. + mux sync.RWMutex +} + +func NewEngine(_ context.Context, enabledFeatures api.CoreFeatures, _ filecache.Cache) wasm.Engine { + return &engine{ + enabledFeatures: enabledFeatures, + compiledFunctions: map[wasm.ModuleID][]compiledFunction{}, + } +} + +// Close implements the same method as documented on wasm.Engine. +func (e *engine) Close() (err error) { + return +} + +// CompiledModuleCount implements the same method as documented on wasm.Engine. +func (e *engine) CompiledModuleCount() uint32 { + return uint32(len(e.compiledFunctions)) +} + +// DeleteCompiledModule implements the same method as documented on wasm.Engine. +func (e *engine) DeleteCompiledModule(m *wasm.Module) { + e.deleteCompiledFunctions(m) +} + +func (e *engine) deleteCompiledFunctions(module *wasm.Module) { + e.mux.Lock() + defer e.mux.Unlock() + delete(e.compiledFunctions, module.ID) +} + +func (e *engine) addCompiledFunctions(module *wasm.Module, fs []compiledFunction) { + e.mux.Lock() + defer e.mux.Unlock() + e.compiledFunctions[module.ID] = fs +} + +func (e *engine) getCompiledFunctions(module *wasm.Module) (fs []compiledFunction, ok bool) { + e.mux.RLock() + defer e.mux.RUnlock() + fs, ok = e.compiledFunctions[module.ID] + return +} + +// moduleEngine implements wasm.ModuleEngine +type moduleEngine struct { + // codes are the compiled functions in a module instances. + // The index is module instance-scoped. + functions []function + + // parentEngine holds *engine from which this module engine is created from. + parentEngine *engine +} + +// GetGlobalValue implements the same method as documented on wasm.ModuleEngine. +func (e *moduleEngine) GetGlobalValue(wasm.Index) (lo, hi uint64) { + panic("BUG: GetGlobalValue should never be called on interpreter mode") +} + +// SetGlobalValue implements the same method as documented on wasm.ModuleEngine. +func (e *moduleEngine) SetGlobalValue(idx wasm.Index, lo, hi uint64) { + panic("BUG: SetGlobalValue should never be called on interpreter mode") +} + +// OwnsGlobals implements the same method as documented on wasm.ModuleEngine. +func (e *moduleEngine) OwnsGlobals() bool { return false } + +// callEngine holds context per moduleEngine.Call, and shared across all the +// function calls originating from the same moduleEngine.Call execution. +// +// This implements api.Function. +type callEngine struct { + internalapi.WazeroOnlyType + + // stack contains the operands. + // Note that all the values are represented as uint64. + stack []uint64 + + // frames are the function call stack. 
+	frames []*callFrame
+
+	// f is the initial function for this call engine.
+	f *function
+
+	// stackIterator is reused by Listeners to walk the frames and the stack.
+	stackIterator stackIterator
+}
+
+func (e *moduleEngine) newCallEngine(compiled *function) *callEngine {
+	return &callEngine{f: compiled}
+}
+
+func (ce *callEngine) pushValue(v uint64) {
+	ce.stack = append(ce.stack, v)
+}
+
+func (ce *callEngine) pushValues(v []uint64) {
+	ce.stack = append(ce.stack, v...)
+}
+
+func (ce *callEngine) popValue() (v uint64) {
+	// No need to check the stack bound
+	// as we can assume that all the operations
+	// are valid thanks to validateFunction
+	// at the module validation phase
+	// and the interpreterir translation
+	// before compilation.
+	stackTopIndex := len(ce.stack) - 1
+	v = ce.stack[stackTopIndex]
+	ce.stack = ce.stack[:stackTopIndex]
+	return
+}
+
+func (ce *callEngine) popValues(v []uint64) {
+	stackTopIndex := len(ce.stack) - len(v)
+	copy(v, ce.stack[stackTopIndex:])
+	ce.stack = ce.stack[:stackTopIndex]
+}
+
+// peekValues peeks the top `count` values from the stack and returns them without popping.
+func (ce *callEngine) peekValues(count int) []uint64 {
+	if count == 0 {
+		return nil
+	}
+	stackLen := len(ce.stack)
+	return ce.stack[stackLen-count : stackLen]
+}
+
+func (ce *callEngine) drop(raw uint64) {
+	r := inclusiveRangeFromU64(raw)
+	if r.Start == -1 {
+		return
+	} else if r.Start == 0 {
+		ce.stack = ce.stack[:int32(len(ce.stack))-1-r.End]
+	} else {
+		newStack := ce.stack[:int32(len(ce.stack))-1-r.End]
+		newStack = append(newStack, ce.stack[int32(len(ce.stack))-r.Start:]...)
+		ce.stack = newStack
+	}
+}
+
+func (ce *callEngine) pushFrame(frame *callFrame) {
+	if callStackCeiling <= len(ce.frames) {
+		panic(wasmruntime.ErrRuntimeStackOverflow)
+	}
+	ce.frames = append(ce.frames, frame)
+}
+
+func (ce *callEngine) popFrame() (frame *callFrame) {
+	// No need to check the stack bound as we can assume that all the operations are valid thanks to validateFunction at
+	// the module validation phase and the interpreterir translation before compilation.
+	oneLess := len(ce.frames) - 1
+	frame = ce.frames[oneLess]
+	ce.frames = ce.frames[:oneLess]
+	return
+}
+
+type callFrame struct {
+	// pc is the program counter representing the current position in code.body.
+	pc uint64
+	// f is the compiled function used in this function frame.
+	f *function
+	// base is the index in the value stack at which this frame starts, used to determine
+	// the count of values on the stack that belong to this frame.
+	base int
+}
+
+type compiledFunction struct {
+	source              *wasm.Module
+	body                []unionOperation
+	listener            experimental.FunctionListener
+	offsetsInWasmBinary []uint64
+	hostFn              interface{}
+	ensureTermination   bool
+	index               wasm.Index
+}
+
+type function struct {
+	funcType       *wasm.FunctionType
+	moduleInstance *wasm.ModuleInstance
+	typeID         wasm.FunctionTypeID
+	parent         *compiledFunction
+}
+
+// functionFromUintptr resurrects the original *function from the given uintptr
+// which comes from either the funcref table or the OpcodeRefFunc instruction.
+func functionFromUintptr(ptr uintptr) *function {
+	// Wraps ptr in a double pointer in order to avoid the unsafe access as detected by the race detector.
+ // + // For example, if we have (*function)(unsafe.Pointer(ptr)) instead, then the race detector's "checkptr" + // subroutine wanrs as "checkptr: pointer arithmetic result points to invalid allocation" + // https://github.com/golang/go/blob/1ce7fcf139417d618c2730010ede2afb41664211/src/runtime/checkptr.go#L69 + var wrapped *uintptr = &ptr + return *(**function)(unsafe.Pointer(wrapped)) +} + +type snapshot struct { + stack []uint64 + frames []*callFrame + pc uint64 + + ret []uint64 + + ce *callEngine +} + +// Snapshot implements the same method as documented on experimental.Snapshotter. +func (ce *callEngine) Snapshot() experimental.Snapshot { + stack := make([]uint64, len(ce.stack)) + copy(stack, ce.stack) + + frames := make([]*callFrame, len(ce.frames)) + copy(frames, ce.frames) + + return &snapshot{ + stack: stack, + frames: frames, + ce: ce, + } +} + +// Restore implements the same method as documented on experimental.Snapshot. +func (s *snapshot) Restore(ret []uint64) { + s.ret = ret + panic(s) +} + +func (s *snapshot) doRestore() { + ce := s.ce + + ce.stack = s.stack + ce.frames = s.frames + ce.frames[len(ce.frames)-1].pc = s.pc + + copy(ce.stack[len(ce.stack)-len(s.ret):], s.ret) +} + +// Error implements the same method on error. +func (s *snapshot) Error() string { + return "unhandled snapshot restore, this generally indicates restore was called from a different " + + "exported function invocation than snapshot" +} + +// stackIterator implements experimental.StackIterator. +type stackIterator struct { + stack []uint64 + frames []*callFrame + started bool + fn *function + pc uint64 +} + +func (si *stackIterator) reset(stack []uint64, frames []*callFrame, f *function) { + si.fn = f + si.pc = 0 + si.stack = stack + si.frames = frames + si.started = false +} + +func (si *stackIterator) clear() { + si.stack = nil + si.frames = nil + si.started = false + si.fn = nil +} + +// Next implements the same method as documented on experimental.StackIterator. +func (si *stackIterator) Next() bool { + if !si.started { + si.started = true + return true + } + + if len(si.frames) == 0 { + return false + } + + frame := si.frames[len(si.frames)-1] + si.stack = si.stack[:frame.base] + si.fn = frame.f + si.pc = frame.pc + si.frames = si.frames[:len(si.frames)-1] + return true +} + +// Function implements the same method as documented on +// experimental.StackIterator. +func (si *stackIterator) Function() experimental.InternalFunction { + return internalFunction{si.fn} +} + +// ProgramCounter implements the same method as documented on +// experimental.StackIterator. +func (si *stackIterator) ProgramCounter() experimental.ProgramCounter { + return experimental.ProgramCounter(si.pc) +} + +// internalFunction implements experimental.InternalFunction. +type internalFunction struct{ *function } + +// Definition implements the same method as documented on +// experimental.InternalFunction. +func (f internalFunction) Definition() api.FunctionDefinition { + return f.definition() +} + +// SourceOffsetForPC implements the same method as documented on +// experimental.InternalFunction. +func (f internalFunction) SourceOffsetForPC(pc experimental.ProgramCounter) uint64 { + offsetsMap := f.parent.offsetsInWasmBinary + if uint64(pc) < uint64(len(offsetsMap)) { + return offsetsMap[pc] + } + return 0 +} + +// interpreter mode doesn't maintain call frames in the stack, so pass the zero size to the IR. +const callFrameStackSize = 0 + +// CompileModule implements the same method as documented on wasm.Engine. 
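For orientation, this is roughly how an embedder reaches CompileModule through wazero's public API when the interpreter engine is selected. The module path is hypothetical and error handling is kept minimal; it is a sketch, not a prescribed setup.

package main

import (
	"context"
	"log"
	"os"

	"github.com/tetratelabs/wazero"
)

func main() {
	ctx := context.Background()

	// Selecting the interpreter routes compilation through this engine.
	r := wazero.NewRuntimeWithConfig(ctx, wazero.NewRuntimeConfigInterpreter())
	defer r.Close(ctx)

	wasmBin, err := os.ReadFile("module.wasm") // hypothetical module path
	if err != nil {
		log.Fatal(err)
	}

	// Non-host functions are lowered to interpreterir once and cached per module.
	if _, err := r.CompileModule(ctx, wasmBin); err != nil {
		log.Fatal(err)
	}
}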
+func (e *engine) CompileModule(_ context.Context, module *wasm.Module, listeners []experimental.FunctionListener, ensureTermination bool) error { + if _, ok := e.getCompiledFunctions(module); ok { // cache hit! + return nil + } + + funcs := make([]compiledFunction, len(module.FunctionSection)) + irCompiler, err := newCompiler(e.enabledFeatures, callFrameStackSize, module, ensureTermination) + if err != nil { + return err + } + imported := module.ImportFunctionCount + for i := range module.CodeSection { + var lsn experimental.FunctionListener + if i < len(listeners) { + lsn = listeners[i] + } + + compiled := &funcs[i] + // If this is the host function, there's nothing to do as the runtime representation of + // host function in interpreter is its Go function itself as opposed to Wasm functions, + // which need to be compiled down to + if codeSeg := &module.CodeSection[i]; codeSeg.GoFunc != nil { + compiled.hostFn = codeSeg.GoFunc + } else { + ir, err := irCompiler.Next() + if err != nil { + return err + } + err = e.lowerIR(ir, compiled) + if err != nil { + def := module.FunctionDefinition(uint32(i) + module.ImportFunctionCount) + return fmt.Errorf("failed to lower func[%s] to interpreterir: %w", def.DebugName(), err) + } + } + compiled.source = module + compiled.ensureTermination = ensureTermination + compiled.listener = lsn + compiled.index = imported + uint32(i) + } + e.addCompiledFunctions(module, funcs) + return nil +} + +// NewModuleEngine implements the same method as documented on wasm.Engine. +func (e *engine) NewModuleEngine(module *wasm.Module, instance *wasm.ModuleInstance) (wasm.ModuleEngine, error) { + me := &moduleEngine{ + parentEngine: e, + functions: make([]function, len(module.FunctionSection)+int(module.ImportFunctionCount)), + } + + codes, ok := e.getCompiledFunctions(module) + if !ok { + return nil, errors.New("source module must be compiled before instantiation") + } + + for i := range codes { + c := &codes[i] + offset := i + int(module.ImportFunctionCount) + typeIndex := module.FunctionSection[i] + me.functions[offset] = function{ + moduleInstance: instance, + typeID: instance.TypeIDs[typeIndex], + funcType: &module.TypeSection[typeIndex], + parent: c, + } + } + return me, nil +} + +// lowerIR lowers the interpreterir operations to engine friendly struct. +func (e *engine) lowerIR(ir *compilationResult, ret *compiledFunction) error { + // Copy the body from the result. + ret.body = make([]unionOperation, len(ir.Operations)) + copy(ret.body, ir.Operations) + // Also copy the offsets if necessary. + if offsets := ir.IROperationSourceOffsetsInWasmBinary; len(offsets) > 0 { + ret.offsetsInWasmBinary = make([]uint64, len(offsets)) + copy(ret.offsetsInWasmBinary, offsets) + } + + labelAddressResolutions := [labelKindNum][]uint64{} + + // First, we iterate all labels, and resolve the address. + for i := range ret.body { + op := &ret.body[i] + switch op.Kind { + case operationKindLabel: + label := label(op.U1) + address := uint64(i) + + kind, fid := label.Kind(), label.FrameID() + frameToAddresses := labelAddressResolutions[label.Kind()] + // Expand the slice if necessary. + if diff := fid - len(frameToAddresses) + 1; diff > 0 { + for j := 0; j < diff; j++ { + frameToAddresses = append(frameToAddresses, 0) + } + } + frameToAddresses[fid] = address + labelAddressResolutions[kind] = frameToAddresses + } + } + + // Then resolve the label as the index to the body. 
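The two passes in lowerIR can be pictured with a toy body: first record the body index of every label, then rewrite each branch so its target is that index. This standalone sketch uses simplified op and label types, not the ones defined in this file.

package main

import "fmt"

type toyOp struct {
	isLabel bool
	labelID int    // which label this defines (labels) or jumps to (branches)
	target  uint64 // resolved body index, filled in for branches
}

func main() {
	body := []toyOp{
		{labelID: 1},                // a branch to label #1
		{isLabel: true, labelID: 0}, // label #0 lives at index 1
		{isLabel: true, labelID: 1}, // label #1 lives at index 2
		{labelID: 0},                // a branch to label #0
	}

	// Pass 1: record where each label lands in the body.
	addr := map[int]uint64{}
	for i, op := range body {
		if op.isLabel {
			addr[op.labelID] = uint64(i)
		}
	}
	// Pass 2: rewrite every branch target to that index.
	for i := range body {
		if !body[i].isLabel {
			body[i].target = addr[body[i].labelID]
		}
	}
	fmt.Println(body[0].target, body[3].target) // 2 1
}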
+ for i := range ret.body { + op := &ret.body[i] + switch op.Kind { + case operationKindBr: + e.setLabelAddress(&op.U1, label(op.U1), labelAddressResolutions) + case operationKindBrIf: + e.setLabelAddress(&op.U1, label(op.U1), labelAddressResolutions) + e.setLabelAddress(&op.U2, label(op.U2), labelAddressResolutions) + case operationKindBrTable: + for j := 0; j < len(op.Us); j += 2 { + target := op.Us[j] + e.setLabelAddress(&op.Us[j], label(target), labelAddressResolutions) + } + } + } + return nil +} + +func (e *engine) setLabelAddress(op *uint64, label label, labelAddressResolutions [labelKindNum][]uint64) { + if label.IsReturnTarget() { + // Jmp to the end of the possible binary. + *op = math.MaxUint64 + } else { + *op = labelAddressResolutions[label.Kind()][label.FrameID()] + } +} + +// ResolveImportedFunction implements wasm.ModuleEngine. +func (e *moduleEngine) ResolveImportedFunction(index, indexInImportedModule wasm.Index, importedModuleEngine wasm.ModuleEngine) { + imported := importedModuleEngine.(*moduleEngine) + e.functions[index] = imported.functions[indexInImportedModule] +} + +// ResolveImportedMemory implements wasm.ModuleEngine. +func (e *moduleEngine) ResolveImportedMemory(wasm.ModuleEngine) {} + +// DoneInstantiation implements wasm.ModuleEngine. +func (e *moduleEngine) DoneInstantiation() {} + +// FunctionInstanceReference implements the same method as documented on wasm.ModuleEngine. +func (e *moduleEngine) FunctionInstanceReference(funcIndex wasm.Index) wasm.Reference { + return uintptr(unsafe.Pointer(&e.functions[funcIndex])) +} + +// NewFunction implements the same method as documented on wasm.ModuleEngine. +func (e *moduleEngine) NewFunction(index wasm.Index) (ce api.Function) { + // Note: The input parameters are pre-validated, so a compiled function is only absent on close. Updates to + // code on close aren't locked, neither is this read. + compiled := &e.functions[index] + return e.newCallEngine(compiled) +} + +// LookupFunction implements the same method as documented on wasm.ModuleEngine. +func (e *moduleEngine) LookupFunction(t *wasm.TableInstance, typeId wasm.FunctionTypeID, tableOffset wasm.Index) (*wasm.ModuleInstance, wasm.Index) { + if tableOffset >= uint32(len(t.References)) { + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + } + rawPtr := t.References[tableOffset] + if rawPtr == 0 { + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + } + + tf := functionFromUintptr(rawPtr) + if tf.typeID != typeId { + panic(wasmruntime.ErrRuntimeIndirectCallTypeMismatch) + } + return tf.moduleInstance, tf.parent.index +} + +// Definition implements the same method as documented on api.Function. +func (ce *callEngine) Definition() api.FunctionDefinition { + return ce.f.definition() +} + +func (f *function) definition() api.FunctionDefinition { + compiled := f.parent + return compiled.source.FunctionDefinition(compiled.index) +} + +// Call implements the same method as documented on api.Function. +func (ce *callEngine) Call(ctx context.Context, params ...uint64) (results []uint64, err error) { + ft := ce.f.funcType + if n := ft.ParamNumInUint64; n != len(params) { + return nil, fmt.Errorf("expected %d params, but passed %d", n, len(params)) + } + return ce.call(ctx, params, nil) +} + +// CallWithStack implements the same method as documented on api.Function. 
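In practice the entry points above are reached through api.Function: parameters and results cross the boundary as raw uint64 values, one slot per i32/i64/f32/f64. A minimal sketch, assuming a hypothetical module.wasm that exports add(i32, i32) -> i32:

package main

import (
	"context"
	"fmt"
	"log"
	"os"

	"github.com/tetratelabs/wazero"
)

func main() {
	ctx := context.Background()
	r := wazero.NewRuntimeWithConfig(ctx, wazero.NewRuntimeConfigInterpreter())
	defer r.Close(ctx)

	wasmBin, err := os.ReadFile("module.wasm") // hypothetical module exporting "add"
	if err != nil {
		log.Fatal(err)
	}
	mod, err := r.Instantiate(ctx, wasmBin)
	if err != nil {
		log.Fatal(err)
	}

	// Call copies the results out of the value stack, so the returned slice is safe to keep.
	results, err := mod.ExportedFunction("add").Call(ctx, 2, 3)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(results[0]) // 5
}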
+func (ce *callEngine) CallWithStack(ctx context.Context, stack []uint64) error { + params, results, err := wasm.SplitCallStack(ce.f.funcType, stack) + if err != nil { + return err + } + _, err = ce.call(ctx, params, results) + return err +} + +func (ce *callEngine) call(ctx context.Context, params, results []uint64) (_ []uint64, err error) { + m := ce.f.moduleInstance + if ce.f.parent.ensureTermination { + select { + case <-ctx.Done(): + // If the provided context is already done, close the call context + // and return the error. + m.CloseWithCtxErr(ctx) + return nil, m.FailIfClosed() + default: + } + } + + if ctx.Value(expctxkeys.EnableSnapshotterKey{}) != nil { + ctx = context.WithValue(ctx, expctxkeys.SnapshotterKey{}, ce) + } + + defer func() { + // If the module closed during the call, and the call didn't err for another reason, set an ExitError. + if err == nil { + err = m.FailIfClosed() + } + // TODO: ^^ Will not fail if the function was imported from a closed module. + + if v := recover(); v != nil { + err = ce.recoverOnCall(ctx, m, v) + } + }() + + ce.pushValues(params) + + if ce.f.parent.ensureTermination { + done := m.CloseModuleOnCanceledOrTimeout(ctx) + defer done() + } + + ce.callFunction(ctx, m, ce.f) + + // This returns a safe copy of the results, instead of a slice view. If we + // returned a re-slice, the caller could accidentally or purposefully + // corrupt the stack of subsequent calls. + ft := ce.f.funcType + if results == nil && ft.ResultNumInUint64 > 0 { + results = make([]uint64, ft.ResultNumInUint64) + } + ce.popValues(results) + return results, nil +} + +// functionListenerInvocation captures arguments needed to perform function +// listener invocations when unwinding the call stack. +type functionListenerInvocation struct { + experimental.FunctionListener + def api.FunctionDefinition +} + +// recoverOnCall takes the recovered value `recoverOnCall`, and wraps it +// with the call frame stack traces. Also, reset the state of callEngine +// so that it can be used for the subsequent calls. +func (ce *callEngine) recoverOnCall(ctx context.Context, m *wasm.ModuleInstance, v interface{}) (err error) { + if s, ok := v.(*snapshot); ok { + // A snapshot that wasn't handled was created by a different call engine possibly from a nested wasm invocation, + // let it propagate up to be handled by the caller. + panic(s) + } + + builder := wasmdebug.NewErrorBuilder() + frameCount := len(ce.frames) + functionListeners := make([]functionListenerInvocation, 0, 16) + + if frameCount > wasmdebug.MaxFrames { + frameCount = wasmdebug.MaxFrames + } + for i := 0; i < frameCount; i++ { + frame := ce.popFrame() + f := frame.f + def := f.definition() + var sources []string + if parent := frame.f.parent; parent.body != nil && len(parent.offsetsInWasmBinary) > 0 { + sources = parent.source.DWARFLines.Line(parent.offsetsInWasmBinary[frame.pc]) + } + builder.AddFrame(def.DebugName(), def.ParamTypes(), def.ResultTypes(), sources) + if f.parent.listener != nil { + functionListeners = append(functionListeners, functionListenerInvocation{ + FunctionListener: f.parent.listener, + def: f.definition(), + }) + } + } + + err = builder.FromRecovered(v) + for i := range functionListeners { + functionListeners[i].Abort(ctx, m, functionListeners[i].def, err) + } + + // Allows the reuse of CallEngine. 
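The recovery path around this point can be summarized as: traps inside the interpreter loop are raised as panics, and the exported-call boundary turns them into an ordinary error before resetting the engine for reuse. A generic sketch of that pattern, with a made-up trap value rather than this package's error types:

package main

import (
	"errors"
	"fmt"
)

// callWithRecover mirrors the defer/recover used around ce.call: a panic raised
// by a trap becomes an ordinary error for the caller.
func callWithRecover(body func()) (err error) {
	defer func() {
		if v := recover(); v != nil {
			err = fmt.Errorf("wasm runtime error: %v", v)
		}
	}()
	body()
	return nil
}

func main() {
	err := callWithRecover(func() {
		panic(errors.New("unreachable executed")) // stand-in for a runtime trap
	})
	fmt.Println(err)
}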
+ ce.stack, ce.frames = ce.stack[:0], ce.frames[:0] + return +} + +func (ce *callEngine) callFunction(ctx context.Context, m *wasm.ModuleInstance, f *function) { + if f.parent.hostFn != nil { + ce.callGoFuncWithStack(ctx, m, f) + } else if lsn := f.parent.listener; lsn != nil { + ce.callNativeFuncWithListener(ctx, m, f, lsn) + } else { + ce.callNativeFunc(ctx, m, f) + } +} + +func (ce *callEngine) callGoFunc(ctx context.Context, m *wasm.ModuleInstance, f *function, stack []uint64) { + typ := f.funcType + lsn := f.parent.listener + if lsn != nil { + params := stack[:typ.ParamNumInUint64] + ce.stackIterator.reset(ce.stack, ce.frames, f) + lsn.Before(ctx, m, f.definition(), params, &ce.stackIterator) + ce.stackIterator.clear() + } + frame := &callFrame{f: f, base: len(ce.stack)} + ce.pushFrame(frame) + + fn := f.parent.hostFn + switch fn := fn.(type) { + case api.GoModuleFunction: + fn.Call(ctx, m, stack) + case api.GoFunction: + fn.Call(ctx, stack) + } + + ce.popFrame() + if lsn != nil { + // TODO: This doesn't get the error due to use of panic to propagate them. + results := stack[:typ.ResultNumInUint64] + lsn.After(ctx, m, f.definition(), results) + } +} + +func (ce *callEngine) callNativeFunc(ctx context.Context, m *wasm.ModuleInstance, f *function) { + frame := &callFrame{f: f, base: len(ce.stack)} + moduleInst := f.moduleInstance + functions := moduleInst.Engine.(*moduleEngine).functions + memoryInst := moduleInst.MemoryInstance + globals := moduleInst.Globals + tables := moduleInst.Tables + typeIDs := moduleInst.TypeIDs + dataInstances := moduleInst.DataInstances + elementInstances := moduleInst.ElementInstances + ce.pushFrame(frame) + body := frame.f.parent.body + bodyLen := uint64(len(body)) + for frame.pc < bodyLen { + op := &body[frame.pc] + // TODO: add description of each operation/case + // on, for example, how many args are used, + // how the stack is modified, etc. + switch op.Kind { + case operationKindBuiltinFunctionCheckExitCode: + if err := m.FailIfClosed(); err != nil { + panic(err) + } + frame.pc++ + case operationKindUnreachable: + panic(wasmruntime.ErrRuntimeUnreachable) + case operationKindBr: + frame.pc = op.U1 + case operationKindBrIf: + if ce.popValue() > 0 { + ce.drop(op.U3) + frame.pc = op.U1 + } else { + frame.pc = op.U2 + } + case operationKindBrTable: + v := ce.popValue() + defaultAt := uint64(len(op.Us))/2 - 1 + if v > defaultAt { + v = defaultAt + } + v *= 2 + ce.drop(op.Us[v+1]) + frame.pc = op.Us[v] + case operationKindCall: + func() { + if ctx.Value(expctxkeys.EnableSnapshotterKey{}) != nil { + defer func() { + if r := recover(); r != nil { + if s, ok := r.(*snapshot); ok && s.ce == ce { + s.doRestore() + frame = ce.frames[len(ce.frames)-1] + body = frame.f.parent.body + bodyLen = uint64(len(body)) + } else { + panic(r) + } + } + }() + } + ce.callFunction(ctx, f.moduleInstance, &functions[op.U1]) + }() + frame.pc++ + case operationKindCallIndirect: + offset := ce.popValue() + table := tables[op.U2] + if offset >= uint64(len(table.References)) { + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + } + rawPtr := table.References[offset] + if rawPtr == 0 { + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + } + + tf := functionFromUintptr(rawPtr) + if tf.typeID != typeIDs[op.U1] { + panic(wasmruntime.ErrRuntimeIndirectCallTypeMismatch) + } + + ce.callFunction(ctx, f.moduleInstance, tf) + frame.pc++ + case operationKindDrop: + ce.drop(op.U1) + frame.pc++ + case operationKindSelect: + c := ce.popValue() + if op.B3 { // Target is vector. 
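+				// A v128 operand spans two value-stack slots (low 64 bits below the high 64 bits),
+				// so both halves are popped and pushed as a pair.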
+ x2Hi, x2Lo := ce.popValue(), ce.popValue() + if c == 0 { + _, _ = ce.popValue(), ce.popValue() // discard the x1's lo and hi bits. + ce.pushValue(x2Lo) + ce.pushValue(x2Hi) + } + } else { + v2 := ce.popValue() + if c == 0 { + _ = ce.popValue() + ce.pushValue(v2) + } + } + frame.pc++ + case operationKindPick: + index := len(ce.stack) - 1 - int(op.U1) + ce.pushValue(ce.stack[index]) + if op.B3 { // V128 value target. + ce.pushValue(ce.stack[index+1]) + } + frame.pc++ + case operationKindSet: + if op.B3 { // V128 value target. + lowIndex := len(ce.stack) - 1 - int(op.U1) + highIndex := lowIndex + 1 + hi, lo := ce.popValue(), ce.popValue() + ce.stack[lowIndex], ce.stack[highIndex] = lo, hi + } else { + index := len(ce.stack) - 1 - int(op.U1) + ce.stack[index] = ce.popValue() + } + frame.pc++ + case operationKindGlobalGet: + g := globals[op.U1] + ce.pushValue(g.Val) + if g.Type.ValType == wasm.ValueTypeV128 { + ce.pushValue(g.ValHi) + } + frame.pc++ + case operationKindGlobalSet: + g := globals[op.U1] + if g.Type.ValType == wasm.ValueTypeV128 { + g.ValHi = ce.popValue() + } + g.Val = ce.popValue() + frame.pc++ + case operationKindLoad: + offset := ce.popMemoryOffset(op) + switch unsignedType(op.B1) { + case unsignedTypeI32, unsignedTypeF32: + if val, ok := memoryInst.ReadUint32Le(offset); !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } else { + ce.pushValue(uint64(val)) + } + case unsignedTypeI64, unsignedTypeF64: + if val, ok := memoryInst.ReadUint64Le(offset); !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } else { + ce.pushValue(val) + } + } + frame.pc++ + case operationKindLoad8: + val, ok := memoryInst.ReadByte(ce.popMemoryOffset(op)) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + + switch signedInt(op.B1) { + case signedInt32: + ce.pushValue(uint64(uint32(int8(val)))) + case signedInt64: + ce.pushValue(uint64(int8(val))) + case signedUint32, signedUint64: + ce.pushValue(uint64(val)) + } + frame.pc++ + case operationKindLoad16: + + val, ok := memoryInst.ReadUint16Le(ce.popMemoryOffset(op)) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + + switch signedInt(op.B1) { + case signedInt32: + ce.pushValue(uint64(uint32(int16(val)))) + case signedInt64: + ce.pushValue(uint64(int16(val))) + case signedUint32, signedUint64: + ce.pushValue(uint64(val)) + } + frame.pc++ + case operationKindLoad32: + val, ok := memoryInst.ReadUint32Le(ce.popMemoryOffset(op)) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + + if op.B1 == 1 { // Signed + ce.pushValue(uint64(int32(val))) + } else { + ce.pushValue(uint64(val)) + } + frame.pc++ + case operationKindStore: + val := ce.popValue() + offset := ce.popMemoryOffset(op) + switch unsignedType(op.B1) { + case unsignedTypeI32, unsignedTypeF32: + if !memoryInst.WriteUint32Le(offset, uint32(val)) { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + case unsignedTypeI64, unsignedTypeF64: + if !memoryInst.WriteUint64Le(offset, val) { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + } + frame.pc++ + case operationKindStore8: + val := byte(ce.popValue()) + offset := ce.popMemoryOffset(op) + if !memoryInst.WriteByte(offset, val) { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + frame.pc++ + case operationKindStore16: + val := uint16(ce.popValue()) + offset := ce.popMemoryOffset(op) + if !memoryInst.WriteUint16Le(offset, val) { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + frame.pc++ + case operationKindStore32: + val := 
uint32(ce.popValue()) + offset := ce.popMemoryOffset(op) + if !memoryInst.WriteUint32Le(offset, val) { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + frame.pc++ + case operationKindMemorySize: + ce.pushValue(uint64(memoryInst.Pages())) + frame.pc++ + case operationKindMemoryGrow: + n := ce.popValue() + if res, ok := memoryInst.Grow(uint32(n)); !ok { + ce.pushValue(uint64(0xffffffff)) // = -1 in signed 32-bit integer. + } else { + ce.pushValue(uint64(res)) + } + frame.pc++ + case operationKindConstI32, operationKindConstI64, + operationKindConstF32, operationKindConstF64: + ce.pushValue(op.U1) + frame.pc++ + case operationKindEq: + var b bool + switch unsignedType(op.B1) { + case unsignedTypeI32: + v2, v1 := ce.popValue(), ce.popValue() + b = uint32(v1) == uint32(v2) + case unsignedTypeI64: + v2, v1 := ce.popValue(), ce.popValue() + b = v1 == v2 + case unsignedTypeF32: + v2, v1 := ce.popValue(), ce.popValue() + b = math.Float32frombits(uint32(v2)) == math.Float32frombits(uint32(v1)) + case unsignedTypeF64: + v2, v1 := ce.popValue(), ce.popValue() + b = math.Float64frombits(v2) == math.Float64frombits(v1) + } + if b { + ce.pushValue(1) + } else { + ce.pushValue(0) + } + frame.pc++ + case operationKindNe: + var b bool + switch unsignedType(op.B1) { + case unsignedTypeI32, unsignedTypeI64: + v2, v1 := ce.popValue(), ce.popValue() + b = v1 != v2 + case unsignedTypeF32: + v2, v1 := ce.popValue(), ce.popValue() + b = math.Float32frombits(uint32(v2)) != math.Float32frombits(uint32(v1)) + case unsignedTypeF64: + v2, v1 := ce.popValue(), ce.popValue() + b = math.Float64frombits(v2) != math.Float64frombits(v1) + } + if b { + ce.pushValue(1) + } else { + ce.pushValue(0) + } + frame.pc++ + case operationKindEqz: + if ce.popValue() == 0 { + ce.pushValue(1) + } else { + ce.pushValue(0) + } + frame.pc++ + case operationKindLt: + v2 := ce.popValue() + v1 := ce.popValue() + var b bool + switch signedType(op.B1) { + case signedTypeInt32: + b = int32(v1) < int32(v2) + case signedTypeInt64: + b = int64(v1) < int64(v2) + case signedTypeUint32, signedTypeUint64: + b = v1 < v2 + case signedTypeFloat32: + b = math.Float32frombits(uint32(v1)) < math.Float32frombits(uint32(v2)) + case signedTypeFloat64: + b = math.Float64frombits(v1) < math.Float64frombits(v2) + } + if b { + ce.pushValue(1) + } else { + ce.pushValue(0) + } + frame.pc++ + case operationKindGt: + v2 := ce.popValue() + v1 := ce.popValue() + var b bool + switch signedType(op.B1) { + case signedTypeInt32: + b = int32(v1) > int32(v2) + case signedTypeInt64: + b = int64(v1) > int64(v2) + case signedTypeUint32, signedTypeUint64: + b = v1 > v2 + case signedTypeFloat32: + b = math.Float32frombits(uint32(v1)) > math.Float32frombits(uint32(v2)) + case signedTypeFloat64: + b = math.Float64frombits(v1) > math.Float64frombits(v2) + } + if b { + ce.pushValue(1) + } else { + ce.pushValue(0) + } + frame.pc++ + case operationKindLe: + v2 := ce.popValue() + v1 := ce.popValue() + var b bool + switch signedType(op.B1) { + case signedTypeInt32: + b = int32(v1) <= int32(v2) + case signedTypeInt64: + b = int64(v1) <= int64(v2) + case signedTypeUint32, signedTypeUint64: + b = v1 <= v2 + case signedTypeFloat32: + b = math.Float32frombits(uint32(v1)) <= math.Float32frombits(uint32(v2)) + case signedTypeFloat64: + b = math.Float64frombits(v1) <= math.Float64frombits(v2) + } + if b { + ce.pushValue(1) + } else { + ce.pushValue(0) + } + frame.pc++ + case operationKindGe: + v2 := ce.popValue() + v1 := ce.popValue() + var b bool + switch signedType(op.B1) { + case 
signedTypeInt32: + b = int32(v1) >= int32(v2) + case signedTypeInt64: + b = int64(v1) >= int64(v2) + case signedTypeUint32, signedTypeUint64: + b = v1 >= v2 + case signedTypeFloat32: + b = math.Float32frombits(uint32(v1)) >= math.Float32frombits(uint32(v2)) + case signedTypeFloat64: + b = math.Float64frombits(v1) >= math.Float64frombits(v2) + } + if b { + ce.pushValue(1) + } else { + ce.pushValue(0) + } + frame.pc++ + case operationKindAdd: + v2 := ce.popValue() + v1 := ce.popValue() + switch unsignedType(op.B1) { + case unsignedTypeI32: + v := uint32(v1) + uint32(v2) + ce.pushValue(uint64(v)) + case unsignedTypeI64: + ce.pushValue(v1 + v2) + case unsignedTypeF32: + ce.pushValue(addFloat32bits(uint32(v1), uint32(v2))) + case unsignedTypeF64: + v := math.Float64frombits(v1) + math.Float64frombits(v2) + ce.pushValue(math.Float64bits(v)) + } + frame.pc++ + case operationKindSub: + v2 := ce.popValue() + v1 := ce.popValue() + switch unsignedType(op.B1) { + case unsignedTypeI32: + ce.pushValue(uint64(uint32(v1) - uint32(v2))) + case unsignedTypeI64: + ce.pushValue(v1 - v2) + case unsignedTypeF32: + ce.pushValue(subFloat32bits(uint32(v1), uint32(v2))) + case unsignedTypeF64: + v := math.Float64frombits(v1) - math.Float64frombits(v2) + ce.pushValue(math.Float64bits(v)) + } + frame.pc++ + case operationKindMul: + v2 := ce.popValue() + v1 := ce.popValue() + switch unsignedType(op.B1) { + case unsignedTypeI32: + ce.pushValue(uint64(uint32(v1) * uint32(v2))) + case unsignedTypeI64: + ce.pushValue(v1 * v2) + case unsignedTypeF32: + ce.pushValue(mulFloat32bits(uint32(v1), uint32(v2))) + case unsignedTypeF64: + v := math.Float64frombits(v2) * math.Float64frombits(v1) + ce.pushValue(math.Float64bits(v)) + } + frame.pc++ + case operationKindClz: + v := ce.popValue() + if op.B1 == 0 { + // unsignedInt32 + ce.pushValue(uint64(bits.LeadingZeros32(uint32(v)))) + } else { + // unsignedInt64 + ce.pushValue(uint64(bits.LeadingZeros64(v))) + } + frame.pc++ + case operationKindCtz: + v := ce.popValue() + if op.B1 == 0 { + // unsignedInt32 + ce.pushValue(uint64(bits.TrailingZeros32(uint32(v)))) + } else { + // unsignedInt64 + ce.pushValue(uint64(bits.TrailingZeros64(v))) + } + frame.pc++ + case operationKindPopcnt: + v := ce.popValue() + if op.B1 == 0 { + // unsignedInt32 + ce.pushValue(uint64(bits.OnesCount32(uint32(v)))) + } else { + // unsignedInt64 + ce.pushValue(uint64(bits.OnesCount64(v))) + } + frame.pc++ + case operationKindDiv: + // If an integer, check we won't divide by zero. 
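+			// Signed integer division additionally traps on MinInt32/-1 and MinInt64/-1,
+			// which would overflow; float division never traps.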
+ t := signedType(op.B1) + v2, v1 := ce.popValue(), ce.popValue() + switch t { + case signedTypeFloat32, signedTypeFloat64: // not integers + default: + if v2 == 0 { + panic(wasmruntime.ErrRuntimeIntegerDivideByZero) + } + } + + switch t { + case signedTypeInt32: + d := int32(v2) + n := int32(v1) + if n == math.MinInt32 && d == -1 { + panic(wasmruntime.ErrRuntimeIntegerOverflow) + } + ce.pushValue(uint64(uint32(n / d))) + case signedTypeInt64: + d := int64(v2) + n := int64(v1) + if n == math.MinInt64 && d == -1 { + panic(wasmruntime.ErrRuntimeIntegerOverflow) + } + ce.pushValue(uint64(n / d)) + case signedTypeUint32: + d := uint32(v2) + n := uint32(v1) + ce.pushValue(uint64(n / d)) + case signedTypeUint64: + d := v2 + n := v1 + ce.pushValue(n / d) + case signedTypeFloat32: + ce.pushValue(divFloat32bits(uint32(v1), uint32(v2))) + case signedTypeFloat64: + ce.pushValue(math.Float64bits(math.Float64frombits(v1) / math.Float64frombits(v2))) + } + frame.pc++ + case operationKindRem: + v2, v1 := ce.popValue(), ce.popValue() + if v2 == 0 { + panic(wasmruntime.ErrRuntimeIntegerDivideByZero) + } + switch signedInt(op.B1) { + case signedInt32: + d := int32(v2) + n := int32(v1) + ce.pushValue(uint64(uint32(n % d))) + case signedInt64: + d := int64(v2) + n := int64(v1) + ce.pushValue(uint64(n % d)) + case signedUint32: + d := uint32(v2) + n := uint32(v1) + ce.pushValue(uint64(n % d)) + case signedUint64: + d := v2 + n := v1 + ce.pushValue(n % d) + } + frame.pc++ + case operationKindAnd: + v2 := ce.popValue() + v1 := ce.popValue() + if op.B1 == 0 { + // unsignedInt32 + ce.pushValue(uint64(uint32(v2) & uint32(v1))) + } else { + // unsignedInt64 + ce.pushValue(uint64(v2 & v1)) + } + frame.pc++ + case operationKindOr: + v2 := ce.popValue() + v1 := ce.popValue() + if op.B1 == 0 { + // unsignedInt32 + ce.pushValue(uint64(uint32(v2) | uint32(v1))) + } else { + // unsignedInt64 + ce.pushValue(uint64(v2 | v1)) + } + frame.pc++ + case operationKindXor: + v2 := ce.popValue() + v1 := ce.popValue() + if op.B1 == 0 { + // unsignedInt32 + ce.pushValue(uint64(uint32(v2) ^ uint32(v1))) + } else { + // unsignedInt64 + ce.pushValue(uint64(v2 ^ v1)) + } + frame.pc++ + case operationKindShl: + v2 := ce.popValue() + v1 := ce.popValue() + if op.B1 == 0 { + // unsignedInt32 + ce.pushValue(uint64(uint32(v1) << (uint32(v2) % 32))) + } else { + // unsignedInt64 + ce.pushValue(v1 << (v2 % 64)) + } + frame.pc++ + case operationKindShr: + v2 := ce.popValue() + v1 := ce.popValue() + switch signedInt(op.B1) { + case signedInt32: + ce.pushValue(uint64(uint32(int32(v1) >> (uint32(v2) % 32)))) + case signedInt64: + ce.pushValue(uint64(int64(v1) >> (v2 % 64))) + case signedUint32: + ce.pushValue(uint64(uint32(v1) >> (uint32(v2) % 32))) + case signedUint64: + ce.pushValue(v1 >> (v2 % 64)) + } + frame.pc++ + case operationKindRotl: + v2 := ce.popValue() + v1 := ce.popValue() + if op.B1 == 0 { + // unsignedInt32 + ce.pushValue(uint64(bits.RotateLeft32(uint32(v1), int(v2)))) + } else { + // unsignedInt64 + ce.pushValue(uint64(bits.RotateLeft64(v1, int(v2)))) + } + frame.pc++ + case operationKindRotr: + v2 := ce.popValue() + v1 := ce.popValue() + if op.B1 == 0 { + // unsignedInt32 + ce.pushValue(uint64(bits.RotateLeft32(uint32(v1), -int(v2)))) + } else { + // unsignedInt64 + ce.pushValue(uint64(bits.RotateLeft64(v1, -int(v2)))) + } + frame.pc++ + case operationKindAbs: + if op.B1 == 0 { + // float32 + const mask uint32 = 1 << 31 + ce.pushValue(uint64(uint32(ce.popValue()) &^ mask)) + } else { + // float64 + const mask uint64 = 1 << 63 + 
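+				// Clearing the IEEE 754 sign bit on the raw bits gives the absolute value
+				// without a float conversion round-trip.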
ce.pushValue(ce.popValue() &^ mask) + } + frame.pc++ + case operationKindNeg: + if op.B1 == 0 { + // float32 + v := -math.Float32frombits(uint32(ce.popValue())) + ce.pushValue(uint64(math.Float32bits(v))) + } else { + // float64 + v := -math.Float64frombits(ce.popValue()) + ce.pushValue(math.Float64bits(v)) + } + frame.pc++ + case operationKindCeil: + if op.B1 == 0 { + // float32 + v := moremath.WasmCompatCeilF32(math.Float32frombits(uint32(ce.popValue()))) + ce.pushValue(uint64(math.Float32bits(v))) + } else { + // float64 + v := moremath.WasmCompatCeilF64(math.Float64frombits(ce.popValue())) + ce.pushValue(math.Float64bits(v)) + } + frame.pc++ + case operationKindFloor: + if op.B1 == 0 { + // float32 + v := moremath.WasmCompatFloorF32(math.Float32frombits(uint32(ce.popValue()))) + ce.pushValue(uint64(math.Float32bits(v))) + } else { + // float64 + v := moremath.WasmCompatFloorF64(math.Float64frombits(ce.popValue())) + ce.pushValue(math.Float64bits(v)) + } + frame.pc++ + case operationKindTrunc: + if op.B1 == 0 { + // float32 + v := moremath.WasmCompatTruncF32(math.Float32frombits(uint32(ce.popValue()))) + ce.pushValue(uint64(math.Float32bits(v))) + } else { + // float64 + v := moremath.WasmCompatTruncF64(math.Float64frombits(ce.popValue())) + ce.pushValue(math.Float64bits(v)) + } + frame.pc++ + case operationKindNearest: + if op.B1 == 0 { + // float32 + f := math.Float32frombits(uint32(ce.popValue())) + ce.pushValue(uint64(math.Float32bits(moremath.WasmCompatNearestF32(f)))) + } else { + // float64 + f := math.Float64frombits(ce.popValue()) + ce.pushValue(math.Float64bits(moremath.WasmCompatNearestF64(f))) + } + frame.pc++ + case operationKindSqrt: + if op.B1 == 0 { + // float32 + v := math.Sqrt(float64(math.Float32frombits(uint32(ce.popValue())))) + ce.pushValue(uint64(math.Float32bits(float32(v)))) + } else { + // float64 + v := math.Sqrt(math.Float64frombits(ce.popValue())) + ce.pushValue(math.Float64bits(v)) + } + frame.pc++ + case operationKindMin: + if op.B1 == 0 { + // float32 + ce.pushValue(wasmCompatMin32bits(uint32(ce.popValue()), uint32(ce.popValue()))) + } else { + v2 := math.Float64frombits(ce.popValue()) + v1 := math.Float64frombits(ce.popValue()) + ce.pushValue(math.Float64bits(moremath.WasmCompatMin64(v1, v2))) + } + frame.pc++ + case operationKindMax: + if op.B1 == 0 { + ce.pushValue(wasmCompatMax32bits(uint32(ce.popValue()), uint32(ce.popValue()))) + } else { + // float64 + v2 := math.Float64frombits(ce.popValue()) + v1 := math.Float64frombits(ce.popValue()) + ce.pushValue(math.Float64bits(moremath.WasmCompatMax64(v1, v2))) + } + frame.pc++ + case operationKindCopysign: + if op.B1 == 0 { + // float32 + v2 := uint32(ce.popValue()) + v1 := uint32(ce.popValue()) + const signbit = 1 << 31 + ce.pushValue(uint64(v1&^signbit | v2&signbit)) + } else { + // float64 + v2 := ce.popValue() + v1 := ce.popValue() + const signbit = 1 << 63 + ce.pushValue(v1&^signbit | v2&signbit) + } + frame.pc++ + case operationKindI32WrapFromI64: + ce.pushValue(uint64(uint32(ce.popValue()))) + frame.pc++ + case operationKindITruncFromF: + if op.B1 == 0 { + // float32 + switch signedInt(op.B2) { + case signedInt32: + v := math.Trunc(float64(math.Float32frombits(uint32(ce.popValue())))) + if math.IsNaN(v) { // NaN cannot be compared with themselves, so we have to use IsNaN + if op.B3 { + // non-trapping conversion must cast nan to zero. 
+ v = 0 + } else { + panic(wasmruntime.ErrRuntimeInvalidConversionToInteger) + } + } else if v < math.MinInt32 || v > math.MaxInt32 { + if op.B3 { + // non-trapping conversion must "saturate" the value for overflowing sources. + if v < 0 { + v = math.MinInt32 + } else { + v = math.MaxInt32 + } + } else { + panic(wasmruntime.ErrRuntimeIntegerOverflow) + } + } + ce.pushValue(uint64(uint32(int32(v)))) + case signedInt64: + v := math.Trunc(float64(math.Float32frombits(uint32(ce.popValue())))) + res := int64(v) + if math.IsNaN(v) { // NaN cannot be compared with themselves, so we have to use IsNaN + if op.B3 { + // non-trapping conversion must cast nan to zero. + res = 0 + } else { + panic(wasmruntime.ErrRuntimeInvalidConversionToInteger) + } + } else if v < math.MinInt64 || v >= math.MaxInt64 { + // Note: math.MaxInt64 is rounded up to math.MaxInt64+1 in 64-bit float representation, + // and that's why we use '>=' not '>' to check overflow. + if op.B3 { + // non-trapping conversion must "saturate" the value for overflowing sources. + if v < 0 { + res = math.MinInt64 + } else { + res = math.MaxInt64 + } + } else { + panic(wasmruntime.ErrRuntimeIntegerOverflow) + } + } + ce.pushValue(uint64(res)) + case signedUint32: + v := math.Trunc(float64(math.Float32frombits(uint32(ce.popValue())))) + if math.IsNaN(v) { // NaN cannot be compared with themselves, so we have to use IsNaN + if op.B3 { + // non-trapping conversion must cast nan to zero. + v = 0 + } else { + panic(wasmruntime.ErrRuntimeInvalidConversionToInteger) + } + } else if v < 0 || v > math.MaxUint32 { + if op.B3 { + // non-trapping conversion must "saturate" the value for overflowing source. + if v < 0 { + v = 0 + } else { + v = math.MaxUint32 + } + } else { + panic(wasmruntime.ErrRuntimeIntegerOverflow) + } + } + ce.pushValue(uint64(uint32(v))) + case signedUint64: + v := math.Trunc(float64(math.Float32frombits(uint32(ce.popValue())))) + res := uint64(v) + if math.IsNaN(v) { // NaN cannot be compared with themselves, so we have to use IsNaN + if op.B3 { + // non-trapping conversion must cast nan to zero. + res = 0 + } else { + panic(wasmruntime.ErrRuntimeInvalidConversionToInteger) + } + } else if v < 0 || v >= math.MaxUint64 { + // Note: math.MaxUint64 is rounded up to math.MaxUint64+1 in 64-bit float representation, + // and that's why we use '>=' not '>' to check overflow. + if op.B3 { + // non-trapping conversion must "saturate" the value for overflowing source. + if v < 0 { + res = 0 + } else { + res = math.MaxUint64 + } + } else { + panic(wasmruntime.ErrRuntimeIntegerOverflow) + } + } + ce.pushValue(res) + } + } else { + // float64 + switch signedInt(op.B2) { + case signedInt32: + v := math.Trunc(math.Float64frombits(ce.popValue())) + if math.IsNaN(v) { // NaN cannot be compared with themselves, so we have to use IsNaN + if op.B3 { + // non-trapping conversion must cast nan to zero. + v = 0 + } else { + panic(wasmruntime.ErrRuntimeInvalidConversionToInteger) + } + } else if v < math.MinInt32 || v > math.MaxInt32 { + if op.B3 { + // non-trapping conversion must "saturate" the value for overflowing source. + if v < 0 { + v = math.MinInt32 + } else { + v = math.MaxInt32 + } + } else { + panic(wasmruntime.ErrRuntimeIntegerOverflow) + } + } + ce.pushValue(uint64(uint32(int32(v)))) + case signedInt64: + v := math.Trunc(math.Float64frombits(ce.popValue())) + res := int64(v) + if math.IsNaN(v) { // NaN cannot be compared with themselves, so we have to use IsNaN + if op.B3 { + // non-trapping conversion must cast nan to zero. 
+ res = 0 + } else { + panic(wasmruntime.ErrRuntimeInvalidConversionToInteger) + } + } else if v < math.MinInt64 || v >= math.MaxInt64 { + // Note: math.MaxInt64 is rounded up to math.MaxInt64+1 in 64-bit float representation, + // and that's why we use '>=' not '>' to check overflow. + if op.B3 { + // non-trapping conversion must "saturate" the value for overflowing source. + if v < 0 { + res = math.MinInt64 + } else { + res = math.MaxInt64 + } + } else { + panic(wasmruntime.ErrRuntimeIntegerOverflow) + } + } + ce.pushValue(uint64(res)) + case signedUint32: + v := math.Trunc(math.Float64frombits(ce.popValue())) + if math.IsNaN(v) { // NaN cannot be compared with themselves, so we have to use IsNaN + if op.B3 { + // non-trapping conversion must cast nan to zero. + v = 0 + } else { + panic(wasmruntime.ErrRuntimeInvalidConversionToInteger) + } + } else if v < 0 || v > math.MaxUint32 { + if op.B3 { + // non-trapping conversion must "saturate" the value for overflowing source. + if v < 0 { + v = 0 + } else { + v = math.MaxUint32 + } + } else { + panic(wasmruntime.ErrRuntimeIntegerOverflow) + } + } + ce.pushValue(uint64(uint32(v))) + case signedUint64: + v := math.Trunc(math.Float64frombits(ce.popValue())) + res := uint64(v) + if math.IsNaN(v) { // NaN cannot be compared with themselves, so we have to use IsNaN + if op.B3 { + // non-trapping conversion must cast nan to zero. + res = 0 + } else { + panic(wasmruntime.ErrRuntimeInvalidConversionToInteger) + } + } else if v < 0 || v >= math.MaxUint64 { + // Note: math.MaxUint64 is rounded up to math.MaxUint64+1 in 64-bit float representation, + // and that's why we use '>=' not '>' to check overflow. + if op.B3 { + // non-trapping conversion must "saturate" the value for overflowing source. + if v < 0 { + res = 0 + } else { + res = math.MaxUint64 + } + } else { + panic(wasmruntime.ErrRuntimeIntegerOverflow) + } + } + ce.pushValue(res) + } + } + frame.pc++ + case operationKindFConvertFromI: + switch signedInt(op.B1) { + case signedInt32: + if op.B2 == 0 { + // float32 + v := float32(int32(ce.popValue())) + ce.pushValue(uint64(math.Float32bits(v))) + } else { + // float64 + v := float64(int32(ce.popValue())) + ce.pushValue(math.Float64bits(v)) + } + case signedInt64: + if op.B2 == 0 { + // float32 + v := float32(int64(ce.popValue())) + ce.pushValue(uint64(math.Float32bits(v))) + } else { + // float64 + v := float64(int64(ce.popValue())) + ce.pushValue(math.Float64bits(v)) + } + case signedUint32: + if op.B2 == 0 { + // float32 + v := float32(uint32(ce.popValue())) + ce.pushValue(uint64(math.Float32bits(v))) + } else { + // float64 + v := float64(uint32(ce.popValue())) + ce.pushValue(math.Float64bits(v)) + } + case signedUint64: + if op.B2 == 0 { + // float32 + v := float32(ce.popValue()) + ce.pushValue(uint64(math.Float32bits(v))) + } else { + // float64 + v := float64(ce.popValue()) + ce.pushValue(math.Float64bits(v)) + } + } + frame.pc++ + case operationKindF32DemoteFromF64: + v := float32(math.Float64frombits(ce.popValue())) + ce.pushValue(uint64(math.Float32bits(v))) + frame.pc++ + case operationKindF64PromoteFromF32: + v := float64(math.Float32frombits(uint32(ce.popValue()))) + ce.pushValue(math.Float64bits(v)) + frame.pc++ + case operationKindExtend: + if op.B1 == 1 { + // Signed. 
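+				// i64.extend_i32_s: reinterpret the low 32 bits as int32, then sign-extend to 64 bits.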
+ v := int64(int32(ce.popValue())) + ce.pushValue(uint64(v)) + } else { + v := uint64(uint32(ce.popValue())) + ce.pushValue(v) + } + frame.pc++ + case operationKindSignExtend32From8: + v := uint32(int8(ce.popValue())) + ce.pushValue(uint64(v)) + frame.pc++ + case operationKindSignExtend32From16: + v := uint32(int16(ce.popValue())) + ce.pushValue(uint64(v)) + frame.pc++ + case operationKindSignExtend64From8: + v := int64(int8(ce.popValue())) + ce.pushValue(uint64(v)) + frame.pc++ + case operationKindSignExtend64From16: + v := int64(int16(ce.popValue())) + ce.pushValue(uint64(v)) + frame.pc++ + case operationKindSignExtend64From32: + v := int64(int32(ce.popValue())) + ce.pushValue(uint64(v)) + frame.pc++ + case operationKindMemoryInit: + dataInstance := dataInstances[op.U1] + copySize := ce.popValue() + inDataOffset := ce.popValue() + inMemoryOffset := ce.popValue() + if inDataOffset+copySize > uint64(len(dataInstance)) || + inMemoryOffset+copySize > uint64(len(memoryInst.Buffer)) { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } else if copySize != 0 { + copy(memoryInst.Buffer[inMemoryOffset:inMemoryOffset+copySize], dataInstance[inDataOffset:]) + } + frame.pc++ + case operationKindDataDrop: + dataInstances[op.U1] = nil + frame.pc++ + case operationKindMemoryCopy: + memLen := uint64(len(memoryInst.Buffer)) + copySize := ce.popValue() + sourceOffset := ce.popValue() + destinationOffset := ce.popValue() + if sourceOffset+copySize > memLen || destinationOffset+copySize > memLen { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } else if copySize != 0 { + copy(memoryInst.Buffer[destinationOffset:], + memoryInst.Buffer[sourceOffset:sourceOffset+copySize]) + } + frame.pc++ + case operationKindMemoryFill: + fillSize := ce.popValue() + value := byte(ce.popValue()) + offset := ce.popValue() + if fillSize+offset > uint64(len(memoryInst.Buffer)) { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } else if fillSize != 0 { + // Uses the copy trick for faster filling buffer. 
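+				// Each pass doubles the initialized prefix (1, 2, 4, ... bytes), so an
+				// n-byte fill needs only O(log n) calls to copy. See: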
+ // https://gist.github.com/taylorza/df2f89d5f9ab3ffd06865062a4cf015d + buf := memoryInst.Buffer[offset : offset+fillSize] + buf[0] = value + for i := 1; i < len(buf); i *= 2 { + copy(buf[i:], buf[:i]) + } + } + frame.pc++ + case operationKindTableInit: + elementInstance := elementInstances[op.U1] + copySize := ce.popValue() + inElementOffset := ce.popValue() + inTableOffset := ce.popValue() + table := tables[op.U2] + if inElementOffset+copySize > uint64(len(elementInstance)) || + inTableOffset+copySize > uint64(len(table.References)) { + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + } else if copySize != 0 { + copy(table.References[inTableOffset:inTableOffset+copySize], elementInstance[inElementOffset:]) + } + frame.pc++ + case operationKindElemDrop: + elementInstances[op.U1] = nil + frame.pc++ + case operationKindTableCopy: + srcTable, dstTable := tables[op.U1].References, tables[op.U2].References + copySize := ce.popValue() + sourceOffset := ce.popValue() + destinationOffset := ce.popValue() + if sourceOffset+copySize > uint64(len(srcTable)) || destinationOffset+copySize > uint64(len(dstTable)) { + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + } else if copySize != 0 { + copy(dstTable[destinationOffset:], srcTable[sourceOffset:sourceOffset+copySize]) + } + frame.pc++ + case operationKindRefFunc: + ce.pushValue(uint64(uintptr(unsafe.Pointer(&functions[op.U1])))) + frame.pc++ + case operationKindTableGet: + table := tables[op.U1] + + offset := ce.popValue() + if offset >= uint64(len(table.References)) { + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + } + + ce.pushValue(uint64(table.References[offset])) + frame.pc++ + case operationKindTableSet: + table := tables[op.U1] + ref := ce.popValue() + + offset := ce.popValue() + if offset >= uint64(len(table.References)) { + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + } + + table.References[offset] = uintptr(ref) // externrefs are opaque uint64. + frame.pc++ + case operationKindTableSize: + table := tables[op.U1] + ce.pushValue(uint64(len(table.References))) + frame.pc++ + case operationKindTableGrow: + table := tables[op.U1] + num, ref := ce.popValue(), ce.popValue() + ret := table.Grow(uint32(num), uintptr(ref)) + ce.pushValue(uint64(ret)) + frame.pc++ + case operationKindTableFill: + table := tables[op.U1] + num := ce.popValue() + ref := uintptr(ce.popValue()) + offset := ce.popValue() + if num+offset > uint64(len(table.References)) { + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + } else if num > 0 { + // Uses the copy trick for faster filling the region with the value. 
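+				// As with memory.fill above, the initialized region doubles each pass,
+				// so the loop is logarithmic in num. See: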
+ // https://gist.github.com/taylorza/df2f89d5f9ab3ffd06865062a4cf015d + targetRegion := table.References[offset : offset+num] + targetRegion[0] = ref + for i := 1; i < len(targetRegion); i *= 2 { + copy(targetRegion[i:], targetRegion[:i]) + } + } + frame.pc++ + case operationKindV128Const: + lo, hi := op.U1, op.U2 + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128Add: + yHigh, yLow := ce.popValue(), ce.popValue() + xHigh, xLow := ce.popValue(), ce.popValue() + switch op.B1 { + case shapeI8x16: + ce.pushValue( + uint64(uint8(xLow>>8)+uint8(yLow>>8))<<8 | uint64(uint8(xLow)+uint8(yLow)) | + uint64(uint8(xLow>>24)+uint8(yLow>>24))<<24 | uint64(uint8(xLow>>16)+uint8(yLow>>16))<<16 | + uint64(uint8(xLow>>40)+uint8(yLow>>40))<<40 | uint64(uint8(xLow>>32)+uint8(yLow>>32))<<32 | + uint64(uint8(xLow>>56)+uint8(yLow>>56))<<56 | uint64(uint8(xLow>>48)+uint8(yLow>>48))<<48, + ) + ce.pushValue( + uint64(uint8(xHigh>>8)+uint8(yHigh>>8))<<8 | uint64(uint8(xHigh)+uint8(yHigh)) | + uint64(uint8(xHigh>>24)+uint8(yHigh>>24))<<24 | uint64(uint8(xHigh>>16)+uint8(yHigh>>16))<<16 | + uint64(uint8(xHigh>>40)+uint8(yHigh>>40))<<40 | uint64(uint8(xHigh>>32)+uint8(yHigh>>32))<<32 | + uint64(uint8(xHigh>>56)+uint8(yHigh>>56))<<56 | uint64(uint8(xHigh>>48)+uint8(yHigh>>48))<<48, + ) + case shapeI16x8: + ce.pushValue( + uint64(uint16(xLow>>16+yLow>>16))<<16 | uint64(uint16(xLow)+uint16(yLow)) | + uint64(uint16(xLow>>48+yLow>>48))<<48 | uint64(uint16(xLow>>32+yLow>>32))<<32, + ) + ce.pushValue( + uint64(uint16(xHigh>>16)+uint16(yHigh>>16))<<16 | uint64(uint16(xHigh)+uint16(yHigh)) | + uint64(uint16(xHigh>>48)+uint16(yHigh>>48))<<48 | uint64(uint16(xHigh>>32)+uint16(yHigh>>32))<<32, + ) + case shapeI32x4: + ce.pushValue(uint64(uint32(xLow>>32)+uint32(yLow>>32))<<32 | uint64(uint32(xLow)+uint32(yLow))) + ce.pushValue(uint64(uint32(xHigh>>32)+uint32(yHigh>>32))<<32 | uint64(uint32(xHigh)+uint32(yHigh))) + case shapeI64x2: + ce.pushValue(xLow + yLow) + ce.pushValue(xHigh + yHigh) + case shapeF32x4: + ce.pushValue( + addFloat32bits(uint32(xLow), uint32(yLow)) | addFloat32bits(uint32(xLow>>32), uint32(yLow>>32))<<32, + ) + ce.pushValue( + addFloat32bits(uint32(xHigh), uint32(yHigh)) | addFloat32bits(uint32(xHigh>>32), uint32(yHigh>>32))<<32, + ) + case shapeF64x2: + ce.pushValue(math.Float64bits(math.Float64frombits(xLow) + math.Float64frombits(yLow))) + ce.pushValue(math.Float64bits(math.Float64frombits(xHigh) + math.Float64frombits(yHigh))) + } + frame.pc++ + case operationKindV128Sub: + yHigh, yLow := ce.popValue(), ce.popValue() + xHigh, xLow := ce.popValue(), ce.popValue() + switch op.B1 { + case shapeI8x16: + ce.pushValue( + uint64(uint8(xLow>>8)-uint8(yLow>>8))<<8 | uint64(uint8(xLow)-uint8(yLow)) | + uint64(uint8(xLow>>24)-uint8(yLow>>24))<<24 | uint64(uint8(xLow>>16)-uint8(yLow>>16))<<16 | + uint64(uint8(xLow>>40)-uint8(yLow>>40))<<40 | uint64(uint8(xLow>>32)-uint8(yLow>>32))<<32 | + uint64(uint8(xLow>>56)-uint8(yLow>>56))<<56 | uint64(uint8(xLow>>48)-uint8(yLow>>48))<<48, + ) + ce.pushValue( + uint64(uint8(xHigh>>8)-uint8(yHigh>>8))<<8 | uint64(uint8(xHigh)-uint8(yHigh)) | + uint64(uint8(xHigh>>24)-uint8(yHigh>>24))<<24 | uint64(uint8(xHigh>>16)-uint8(yHigh>>16))<<16 | + uint64(uint8(xHigh>>40)-uint8(yHigh>>40))<<40 | uint64(uint8(xHigh>>32)-uint8(yHigh>>32))<<32 | + uint64(uint8(xHigh>>56)-uint8(yHigh>>56))<<56 | uint64(uint8(xHigh>>48)-uint8(yHigh>>48))<<48, + ) + case shapeI16x8: + ce.pushValue( + uint64(uint16(xLow>>16)-uint16(yLow>>16))<<16 | uint64(uint16(xLow)-uint16(yLow)) | + 
uint64(uint16(xLow>>48)-uint16(yLow>>48))<<48 | uint64(uint16(xLow>>32)-uint16(yLow>>32))<<32, + ) + ce.pushValue( + uint64(uint16(xHigh>>16)-uint16(yHigh>>16))<<16 | uint64(uint16(xHigh)-uint16(yHigh)) | + uint64(uint16(xHigh>>48)-uint16(yHigh>>48))<<48 | uint64(uint16(xHigh>>32)-uint16(yHigh>>32))<<32, + ) + case shapeI32x4: + ce.pushValue(uint64(uint32(xLow>>32-yLow>>32))<<32 | uint64(uint32(xLow)-uint32(yLow))) + ce.pushValue(uint64(uint32(xHigh>>32-yHigh>>32))<<32 | uint64(uint32(xHigh)-uint32(yHigh))) + case shapeI64x2: + ce.pushValue(xLow - yLow) + ce.pushValue(xHigh - yHigh) + case shapeF32x4: + ce.pushValue( + subFloat32bits(uint32(xLow), uint32(yLow)) | subFloat32bits(uint32(xLow>>32), uint32(yLow>>32))<<32, + ) + ce.pushValue( + subFloat32bits(uint32(xHigh), uint32(yHigh)) | subFloat32bits(uint32(xHigh>>32), uint32(yHigh>>32))<<32, + ) + case shapeF64x2: + ce.pushValue(math.Float64bits(math.Float64frombits(xLow) - math.Float64frombits(yLow))) + ce.pushValue(math.Float64bits(math.Float64frombits(xHigh) - math.Float64frombits(yHigh))) + } + frame.pc++ + case operationKindV128Load: + offset := ce.popMemoryOffset(op) + switch op.B1 { + case v128LoadType128: + lo, ok := memoryInst.ReadUint64Le(offset) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(lo) + hi, ok := memoryInst.ReadUint64Le(offset + 8) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(hi) + case v128LoadType8x8s: + data, ok := memoryInst.Read(offset, 8) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue( + uint64(uint16(int8(data[3])))<<48 | uint64(uint16(int8(data[2])))<<32 | uint64(uint16(int8(data[1])))<<16 | uint64(uint16(int8(data[0]))), + ) + ce.pushValue( + uint64(uint16(int8(data[7])))<<48 | uint64(uint16(int8(data[6])))<<32 | uint64(uint16(int8(data[5])))<<16 | uint64(uint16(int8(data[4]))), + ) + case v128LoadType8x8u: + data, ok := memoryInst.Read(offset, 8) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue( + uint64(data[3])<<48 | uint64(data[2])<<32 | uint64(data[1])<<16 | uint64(data[0]), + ) + ce.pushValue( + uint64(data[7])<<48 | uint64(data[6])<<32 | uint64(data[5])<<16 | uint64(data[4]), + ) + case v128LoadType16x4s: + data, ok := memoryInst.Read(offset, 8) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue( + uint64(int16(binary.LittleEndian.Uint16(data[2:])))<<32 | + uint64(uint32(int16(binary.LittleEndian.Uint16(data)))), + ) + ce.pushValue( + uint64(uint32(int16(binary.LittleEndian.Uint16(data[6:]))))<<32 | + uint64(uint32(int16(binary.LittleEndian.Uint16(data[4:])))), + ) + case v128LoadType16x4u: + data, ok := memoryInst.Read(offset, 8) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue( + uint64(binary.LittleEndian.Uint16(data[2:]))<<32 | uint64(binary.LittleEndian.Uint16(data)), + ) + ce.pushValue( + uint64(binary.LittleEndian.Uint16(data[6:]))<<32 | uint64(binary.LittleEndian.Uint16(data[4:])), + ) + case v128LoadType32x2s: + data, ok := memoryInst.Read(offset, 8) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(uint64(int32(binary.LittleEndian.Uint32(data)))) + ce.pushValue(uint64(int32(binary.LittleEndian.Uint32(data[4:])))) + case v128LoadType32x2u: + data, ok := memoryInst.Read(offset, 8) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(uint64(binary.LittleEndian.Uint32(data))) + 
ce.pushValue(uint64(binary.LittleEndian.Uint32(data[4:]))) + case v128LoadType8Splat: + v, ok := memoryInst.ReadByte(offset) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + v8 := uint64(v)<<56 | uint64(v)<<48 | uint64(v)<<40 | uint64(v)<<32 | + uint64(v)<<24 | uint64(v)<<16 | uint64(v)<<8 | uint64(v) + ce.pushValue(v8) + ce.pushValue(v8) + case v128LoadType16Splat: + v, ok := memoryInst.ReadUint16Le(offset) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + v4 := uint64(v)<<48 | uint64(v)<<32 | uint64(v)<<16 | uint64(v) + ce.pushValue(v4) + ce.pushValue(v4) + case v128LoadType32Splat: + v, ok := memoryInst.ReadUint32Le(offset) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + vv := uint64(v)<<32 | uint64(v) + ce.pushValue(vv) + ce.pushValue(vv) + case v128LoadType64Splat: + lo, ok := memoryInst.ReadUint64Le(offset) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(lo) + ce.pushValue(lo) + case v128LoadType32zero: + lo, ok := memoryInst.ReadUint32Le(offset) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(uint64(lo)) + ce.pushValue(0) + case v128LoadType64zero: + lo, ok := memoryInst.ReadUint64Le(offset) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(lo) + ce.pushValue(0) + } + frame.pc++ + case operationKindV128LoadLane: + hi, lo := ce.popValue(), ce.popValue() + offset := ce.popMemoryOffset(op) + switch op.B1 { + case 8: + b, ok := memoryInst.ReadByte(offset) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + if op.B2 < 8 { + s := op.B2 << 3 + lo = (lo & ^(0xff << s)) | uint64(b)<<s + } else { + s := (op.B2 - 8) << 3 + hi = (hi & ^(0xff << s)) | uint64(b)<<s + } + case 16: + b, ok := memoryInst.ReadUint16Le(offset) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + if op.B2 < 4 { + s := op.B2 << 4 + lo = (lo & ^(0xff_ff << s)) | uint64(b)<<s + } else { + s := (op.B2 - 4) << 4 + hi = (hi & ^(0xff_ff << s)) | uint64(b)<<s + } + case 32: + b, ok := memoryInst.ReadUint32Le(offset) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + if op.B2 < 2 { + s := op.B2 << 5 + lo = (lo & ^(0xff_ff_ff_ff << s)) | uint64(b)<<s + } else { + s := (op.B2 - 2) << 5 + hi = (hi & ^(0xff_ff_ff_ff << s)) | uint64(b)<<s + } + case 64: + b, ok := memoryInst.ReadUint64Le(offset) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + if op.B2 == 0 { + lo = b + } else { + hi = b + } + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128Store: + hi, lo := ce.popValue(), ce.popValue() + offset := ce.popMemoryOffset(op) + // Write the upper bytes first to trigger an early error if the memory access is out of bounds. + // Otherwise, the lower bytes might be written to memory, but the upper bytes might not. 
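+			// The offset+8 bound is computed in uint64 so that a 32-bit wrap-around
+			// cannot slip past the check below.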
+ if uint64(offset)+8 > math.MaxUint32 { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + if ok := memoryInst.WriteUint64Le(offset+8, hi); !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + if ok := memoryInst.WriteUint64Le(offset, lo); !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + frame.pc++ + case operationKindV128StoreLane: + hi, lo := ce.popValue(), ce.popValue() + offset := ce.popMemoryOffset(op) + var ok bool + switch op.B1 { + case 8: + if op.B2 < 8 { + ok = memoryInst.WriteByte(offset, byte(lo>>(op.B2*8))) + } else { + ok = memoryInst.WriteByte(offset, byte(hi>>((op.B2-8)*8))) + } + case 16: + if op.B2 < 4 { + ok = memoryInst.WriteUint16Le(offset, uint16(lo>>(op.B2*16))) + } else { + ok = memoryInst.WriteUint16Le(offset, uint16(hi>>((op.B2-4)*16))) + } + case 32: + if op.B2 < 2 { + ok = memoryInst.WriteUint32Le(offset, uint32(lo>>(op.B2*32))) + } else { + ok = memoryInst.WriteUint32Le(offset, uint32(hi>>((op.B2-2)*32))) + } + case 64: + if op.B2 == 0 { + ok = memoryInst.WriteUint64Le(offset, lo) + } else { + ok = memoryInst.WriteUint64Le(offset, hi) + } + } + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + frame.pc++ + case operationKindV128ReplaceLane: + v := ce.popValue() + hi, lo := ce.popValue(), ce.popValue() + switch op.B1 { + case shapeI8x16: + if op.B2 < 8 { + s := op.B2 << 3 + lo = (lo & ^(0xff << s)) | uint64(byte(v))<<s + } else { + s := (op.B2 - 8) << 3 + hi = (hi & ^(0xff << s)) | uint64(byte(v))<<s + } + case shapeI16x8: + if op.B2 < 4 { + s := op.B2 << 4 + lo = (lo & ^(0xff_ff << s)) | uint64(uint16(v))<<s + } else { + s := (op.B2 - 4) << 4 + hi = (hi & ^(0xff_ff << s)) | uint64(uint16(v))<<s + } + case shapeI32x4, shapeF32x4: + if op.B2 < 2 { + s := op.B2 << 5 + lo = (lo & ^(0xff_ff_ff_ff << s)) | uint64(uint32(v))<<s + } else { + s := (op.B2 - 2) << 5 + hi = (hi & ^(0xff_ff_ff_ff << s)) | uint64(uint32(v))<<s + } + case shapeI64x2, shapeF64x2: + if op.B2 == 0 { + lo = v + } else { + hi = v + } + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128ExtractLane: + hi, lo := ce.popValue(), ce.popValue() + var v uint64 + switch op.B1 { + case shapeI8x16: + var u8 byte + if op.B2 < 8 { + u8 = byte(lo >> (op.B2 * 8)) + } else { + u8 = byte(hi >> ((op.B2 - 8) * 8)) + } + if op.B3 { + // sign-extend. + v = uint64(uint32(int8(u8))) + } else { + v = uint64(u8) + } + case shapeI16x8: + var u16 uint16 + if op.B2 < 4 { + u16 = uint16(lo >> (op.B2 * 16)) + } else { + u16 = uint16(hi >> ((op.B2 - 4) * 16)) + } + if op.B3 { + // sign-extend. 
+ v = uint64(uint32(int16(u16))) + } else { + v = uint64(u16) + } + case shapeI32x4, shapeF32x4: + if op.B2 < 2 { + v = uint64(uint32(lo >> (op.B2 * 32))) + } else { + v = uint64(uint32(hi >> ((op.B2 - 2) * 32))) + } + case shapeI64x2, shapeF64x2: + if op.B2 == 0 { + v = lo + } else { + v = hi + } + } + ce.pushValue(v) + frame.pc++ + case operationKindV128Splat: + v := ce.popValue() + var hi, lo uint64 + switch op.B1 { + case shapeI8x16: + v8 := uint64(byte(v))<<56 | uint64(byte(v))<<48 | uint64(byte(v))<<40 | uint64(byte(v))<<32 | + uint64(byte(v))<<24 | uint64(byte(v))<<16 | uint64(byte(v))<<8 | uint64(byte(v)) + hi, lo = v8, v8 + case shapeI16x8: + v4 := uint64(uint16(v))<<48 | uint64(uint16(v))<<32 | uint64(uint16(v))<<16 | uint64(uint16(v)) + hi, lo = v4, v4 + case shapeI32x4, shapeF32x4: + v2 := uint64(uint32(v))<<32 | uint64(uint32(v)) + lo, hi = v2, v2 + case shapeI64x2, shapeF64x2: + lo, hi = v, v + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128Swizzle: + idxHi, idxLo := ce.popValue(), ce.popValue() + baseHi, baseLo := ce.popValue(), ce.popValue() + var newVal [16]byte + for i := 0; i < 16; i++ { + var id byte + if i < 8 { + id = byte(idxLo >> (i * 8)) + } else { + id = byte(idxHi >> ((i - 8) * 8)) + } + if id < 8 { + newVal[i] = byte(baseLo >> (id * 8)) + } else if id < 16 { + newVal[i] = byte(baseHi >> ((id - 8) * 8)) + } + } + ce.pushValue(binary.LittleEndian.Uint64(newVal[:8])) + ce.pushValue(binary.LittleEndian.Uint64(newVal[8:])) + frame.pc++ + case operationKindV128Shuffle: + xHi, xLo, yHi, yLo := ce.popValue(), ce.popValue(), ce.popValue(), ce.popValue() + var newVal [16]byte + for i, l := range op.Us { + if l < 8 { + newVal[i] = byte(yLo >> (l * 8)) + } else if l < 16 { + newVal[i] = byte(yHi >> ((l - 8) * 8)) + } else if l < 24 { + newVal[i] = byte(xLo >> ((l - 16) * 8)) + } else if l < 32 { + newVal[i] = byte(xHi >> ((l - 24) * 8)) + } + } + ce.pushValue(binary.LittleEndian.Uint64(newVal[:8])) + ce.pushValue(binary.LittleEndian.Uint64(newVal[8:])) + frame.pc++ + case operationKindV128AnyTrue: + hi, lo := ce.popValue(), ce.popValue() + if hi != 0 || lo != 0 { + ce.pushValue(1) + } else { + ce.pushValue(0) + } + frame.pc++ + case operationKindV128AllTrue: + hi, lo := ce.popValue(), ce.popValue() + var ret bool + switch op.B1 { + case shapeI8x16: + ret = (uint8(lo) != 0) && (uint8(lo>>8) != 0) && (uint8(lo>>16) != 0) && (uint8(lo>>24) != 0) && + (uint8(lo>>32) != 0) && (uint8(lo>>40) != 0) && (uint8(lo>>48) != 0) && (uint8(lo>>56) != 0) && + (uint8(hi) != 0) && (uint8(hi>>8) != 0) && (uint8(hi>>16) != 0) && (uint8(hi>>24) != 0) && + (uint8(hi>>32) != 0) && (uint8(hi>>40) != 0) && (uint8(hi>>48) != 0) && (uint8(hi>>56) != 0) + case shapeI16x8: + ret = (uint16(lo) != 0) && (uint16(lo>>16) != 0) && (uint16(lo>>32) != 0) && (uint16(lo>>48) != 0) && + (uint16(hi) != 0) && (uint16(hi>>16) != 0) && (uint16(hi>>32) != 0) && (uint16(hi>>48) != 0) + case shapeI32x4: + ret = (uint32(lo) != 0) && (uint32(lo>>32) != 0) && + (uint32(hi) != 0) && (uint32(hi>>32) != 0) + case shapeI64x2: + ret = (lo != 0) && + (hi != 0) + } + if ret { + ce.pushValue(1) + } else { + ce.pushValue(0) + } + frame.pc++ + case operationKindV128BitMask: + // https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#bitmask-extraction + hi, lo := ce.popValue(), ce.popValue() + var res uint64 + switch op.B1 { + case shapeI8x16: + for i := 0; i < 8; i++ { + if int8(lo>>(i*8)) < 0 { + res |= 1 << i + } + } + for i := 0; i < 8; i++ { + if int8(hi>>(i*8)) < 0 { + res 
|= 1 << (i + 8) + } + } + case shapeI16x8: + for i := 0; i < 4; i++ { + if int16(lo>>(i*16)) < 0 { + res |= 1 << i + } + } + for i := 0; i < 4; i++ { + if int16(hi>>(i*16)) < 0 { + res |= 1 << (i + 4) + } + } + case shapeI32x4: + for i := 0; i < 2; i++ { + if int32(lo>>(i*32)) < 0 { + res |= 1 << i + } + } + for i := 0; i < 2; i++ { + if int32(hi>>(i*32)) < 0 { + res |= 1 << (i + 2) + } + } + case shapeI64x2: + if int64(lo) < 0 { + res |= 0b01 + } + if int(hi) < 0 { + res |= 0b10 + } + } + ce.pushValue(res) + frame.pc++ + case operationKindV128And: + x2Hi, x2Lo := ce.popValue(), ce.popValue() + x1Hi, x1Lo := ce.popValue(), ce.popValue() + ce.pushValue(x1Lo & x2Lo) + ce.pushValue(x1Hi & x2Hi) + frame.pc++ + case operationKindV128Not: + hi, lo := ce.popValue(), ce.popValue() + ce.pushValue(^lo) + ce.pushValue(^hi) + frame.pc++ + case operationKindV128Or: + x2Hi, x2Lo := ce.popValue(), ce.popValue() + x1Hi, x1Lo := ce.popValue(), ce.popValue() + ce.pushValue(x1Lo | x2Lo) + ce.pushValue(x1Hi | x2Hi) + frame.pc++ + case operationKindV128Xor: + x2Hi, x2Lo := ce.popValue(), ce.popValue() + x1Hi, x1Lo := ce.popValue(), ce.popValue() + ce.pushValue(x1Lo ^ x2Lo) + ce.pushValue(x1Hi ^ x2Hi) + frame.pc++ + case operationKindV128Bitselect: + // https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#bitwise-select + cHi, cLo := ce.popValue(), ce.popValue() + x2Hi, x2Lo := ce.popValue(), ce.popValue() + x1Hi, x1Lo := ce.popValue(), ce.popValue() + // v128.or(v128.and(v1, c), v128.and(v2, v128.not(c))) + ce.pushValue((x1Lo & cLo) | (x2Lo & (^cLo))) + ce.pushValue((x1Hi & cHi) | (x2Hi & (^cHi))) + frame.pc++ + case operationKindV128AndNot: + x2Hi, x2Lo := ce.popValue(), ce.popValue() + x1Hi, x1Lo := ce.popValue(), ce.popValue() + ce.pushValue(x1Lo & (^x2Lo)) + ce.pushValue(x1Hi & (^x2Hi)) + frame.pc++ + case operationKindV128Shl: + s := ce.popValue() + hi, lo := ce.popValue(), ce.popValue() + switch op.B1 { + case shapeI8x16: + s = s % 8 + lo = uint64(uint8(lo<<s)) | + uint64(uint8((lo>>8)<<s))<<8 | + uint64(uint8((lo>>16)<<s))<<16 | + uint64(uint8((lo>>24)<<s))<<24 | + uint64(uint8((lo>>32)<<s))<<32 | + uint64(uint8((lo>>40)<<s))<<40 | + uint64(uint8((lo>>48)<<s))<<48 | + uint64(uint8((lo>>56)<<s))<<56 + hi = uint64(uint8(hi<<s)) | + uint64(uint8((hi>>8)<<s))<<8 | + uint64(uint8((hi>>16)<<s))<<16 | + uint64(uint8((hi>>24)<<s))<<24 | + uint64(uint8((hi>>32)<<s))<<32 | + uint64(uint8((hi>>40)<<s))<<40 | + uint64(uint8((hi>>48)<<s))<<48 | + uint64(uint8((hi>>56)<<s))<<56 + case shapeI16x8: + s = s % 16 + lo = uint64(uint16(lo<<s)) | + uint64(uint16((lo>>16)<<s))<<16 | + uint64(uint16((lo>>32)<<s))<<32 | + uint64(uint16((lo>>48)<<s))<<48 + hi = uint64(uint16(hi<<s)) | + uint64(uint16((hi>>16)<<s))<<16 | + uint64(uint16((hi>>32)<<s))<<32 | + uint64(uint16((hi>>48)<<s))<<48 + case shapeI32x4: + s = s % 32 + lo = uint64(uint32(lo<<s)) | uint64(uint32((lo>>32)<<s))<<32 + hi = uint64(uint32(hi<<s)) | uint64(uint32((hi>>32)<<s))<<32 + case shapeI64x2: + s = s % 64 + lo = lo << s + hi = hi << s + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128Shr: + s := ce.popValue() + hi, lo := ce.popValue(), ce.popValue() + switch op.B1 { + case shapeI8x16: + s = s % 8 + if op.B3 { // signed + lo = uint64(uint8(int8(lo)>>s)) | + uint64(uint8(int8(lo>>8)>>s))<<8 | + uint64(uint8(int8(lo>>16)>>s))<<16 | + uint64(uint8(int8(lo>>24)>>s))<<24 | + uint64(uint8(int8(lo>>32)>>s))<<32 | + uint64(uint8(int8(lo>>40)>>s))<<40 | + uint64(uint8(int8(lo>>48)>>s))<<48 | + 
uint64(uint8(int8(lo>>56)>>s))<<56 + hi = uint64(uint8(int8(hi)>>s)) | + uint64(uint8(int8(hi>>8)>>s))<<8 | + uint64(uint8(int8(hi>>16)>>s))<<16 | + uint64(uint8(int8(hi>>24)>>s))<<24 | + uint64(uint8(int8(hi>>32)>>s))<<32 | + uint64(uint8(int8(hi>>40)>>s))<<40 | + uint64(uint8(int8(hi>>48)>>s))<<48 | + uint64(uint8(int8(hi>>56)>>s))<<56 + } else { + lo = uint64(uint8(lo)>>s) | + uint64(uint8(lo>>8)>>s)<<8 | + uint64(uint8(lo>>16)>>s)<<16 | + uint64(uint8(lo>>24)>>s)<<24 | + uint64(uint8(lo>>32)>>s)<<32 | + uint64(uint8(lo>>40)>>s)<<40 | + uint64(uint8(lo>>48)>>s)<<48 | + uint64(uint8(lo>>56)>>s)<<56 + hi = uint64(uint8(hi)>>s) | + uint64(uint8(hi>>8)>>s)<<8 | + uint64(uint8(hi>>16)>>s)<<16 | + uint64(uint8(hi>>24)>>s)<<24 | + uint64(uint8(hi>>32)>>s)<<32 | + uint64(uint8(hi>>40)>>s)<<40 | + uint64(uint8(hi>>48)>>s)<<48 | + uint64(uint8(hi>>56)>>s)<<56 + } + case shapeI16x8: + s = s % 16 + if op.B3 { // signed + lo = uint64(uint16(int16(lo)>>s)) | + uint64(uint16(int16(lo>>16)>>s))<<16 | + uint64(uint16(int16(lo>>32)>>s))<<32 | + uint64(uint16(int16(lo>>48)>>s))<<48 + hi = uint64(uint16(int16(hi)>>s)) | + uint64(uint16(int16(hi>>16)>>s))<<16 | + uint64(uint16(int16(hi>>32)>>s))<<32 | + uint64(uint16(int16(hi>>48)>>s))<<48 + } else { + lo = uint64(uint16(lo)>>s) | + uint64(uint16(lo>>16)>>s)<<16 | + uint64(uint16(lo>>32)>>s)<<32 | + uint64(uint16(lo>>48)>>s)<<48 + hi = uint64(uint16(hi)>>s) | + uint64(uint16(hi>>16)>>s)<<16 | + uint64(uint16(hi>>32)>>s)<<32 | + uint64(uint16(hi>>48)>>s)<<48 + } + case shapeI32x4: + s = s % 32 + if op.B3 { + lo = uint64(uint32(int32(lo)>>s)) | uint64(uint32(int32(lo>>32)>>s))<<32 + hi = uint64(uint32(int32(hi)>>s)) | uint64(uint32(int32(hi>>32)>>s))<<32 + } else { + lo = uint64(uint32(lo)>>s) | uint64(uint32(lo>>32)>>s)<<32 + hi = uint64(uint32(hi)>>s) | uint64(uint32(hi>>32)>>s)<<32 + } + case shapeI64x2: + s = s % 64 + if op.B3 { // signed + lo = uint64(int64(lo) >> s) + hi = uint64(int64(hi) >> s) + } else { + lo = lo >> s + hi = hi >> s + } + + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128Cmp: + x2Hi, x2Lo := ce.popValue(), ce.popValue() + x1Hi, x1Lo := ce.popValue(), ce.popValue() + var result []bool + switch op.B1 { + case v128CmpTypeI8x16Eq: + result = []bool{ + byte(x1Lo>>0) == byte(x2Lo>>0), byte(x1Lo>>8) == byte(x2Lo>>8), + byte(x1Lo>>16) == byte(x2Lo>>16), byte(x1Lo>>24) == byte(x2Lo>>24), + byte(x1Lo>>32) == byte(x2Lo>>32), byte(x1Lo>>40) == byte(x2Lo>>40), + byte(x1Lo>>48) == byte(x2Lo>>48), byte(x1Lo>>56) == byte(x2Lo>>56), + byte(x1Hi>>0) == byte(x2Hi>>0), byte(x1Hi>>8) == byte(x2Hi>>8), + byte(x1Hi>>16) == byte(x2Hi>>16), byte(x1Hi>>24) == byte(x2Hi>>24), + byte(x1Hi>>32) == byte(x2Hi>>32), byte(x1Hi>>40) == byte(x2Hi>>40), + byte(x1Hi>>48) == byte(x2Hi>>48), byte(x1Hi>>56) == byte(x2Hi>>56), + } + case v128CmpTypeI8x16Ne: + result = []bool{ + byte(x1Lo>>0) != byte(x2Lo>>0), byte(x1Lo>>8) != byte(x2Lo>>8), + byte(x1Lo>>16) != byte(x2Lo>>16), byte(x1Lo>>24) != byte(x2Lo>>24), + byte(x1Lo>>32) != byte(x2Lo>>32), byte(x1Lo>>40) != byte(x2Lo>>40), + byte(x1Lo>>48) != byte(x2Lo>>48), byte(x1Lo>>56) != byte(x2Lo>>56), + byte(x1Hi>>0) != byte(x2Hi>>0), byte(x1Hi>>8) != byte(x2Hi>>8), + byte(x1Hi>>16) != byte(x2Hi>>16), byte(x1Hi>>24) != byte(x2Hi>>24), + byte(x1Hi>>32) != byte(x2Hi>>32), byte(x1Hi>>40) != byte(x2Hi>>40), + byte(x1Hi>>48) != byte(x2Hi>>48), byte(x1Hi>>56) != byte(x2Hi>>56), + } + case v128CmpTypeI8x16LtS: + result = []bool{ + int8(x1Lo>>0) < int8(x2Lo>>0), int8(x1Lo>>8) < int8(x2Lo>>8), + 
int8(x1Lo>>16) < int8(x2Lo>>16), int8(x1Lo>>24) < int8(x2Lo>>24), + int8(x1Lo>>32) < int8(x2Lo>>32), int8(x1Lo>>40) < int8(x2Lo>>40), + int8(x1Lo>>48) < int8(x2Lo>>48), int8(x1Lo>>56) < int8(x2Lo>>56), + int8(x1Hi>>0) < int8(x2Hi>>0), int8(x1Hi>>8) < int8(x2Hi>>8), + int8(x1Hi>>16) < int8(x2Hi>>16), int8(x1Hi>>24) < int8(x2Hi>>24), + int8(x1Hi>>32) < int8(x2Hi>>32), int8(x1Hi>>40) < int8(x2Hi>>40), + int8(x1Hi>>48) < int8(x2Hi>>48), int8(x1Hi>>56) < int8(x2Hi>>56), + } + case v128CmpTypeI8x16LtU: + result = []bool{ + byte(x1Lo>>0) < byte(x2Lo>>0), byte(x1Lo>>8) < byte(x2Lo>>8), + byte(x1Lo>>16) < byte(x2Lo>>16), byte(x1Lo>>24) < byte(x2Lo>>24), + byte(x1Lo>>32) < byte(x2Lo>>32), byte(x1Lo>>40) < byte(x2Lo>>40), + byte(x1Lo>>48) < byte(x2Lo>>48), byte(x1Lo>>56) < byte(x2Lo>>56), + byte(x1Hi>>0) < byte(x2Hi>>0), byte(x1Hi>>8) < byte(x2Hi>>8), + byte(x1Hi>>16) < byte(x2Hi>>16), byte(x1Hi>>24) < byte(x2Hi>>24), + byte(x1Hi>>32) < byte(x2Hi>>32), byte(x1Hi>>40) < byte(x2Hi>>40), + byte(x1Hi>>48) < byte(x2Hi>>48), byte(x1Hi>>56) < byte(x2Hi>>56), + } + case v128CmpTypeI8x16GtS: + result = []bool{ + int8(x1Lo>>0) > int8(x2Lo>>0), int8(x1Lo>>8) > int8(x2Lo>>8), + int8(x1Lo>>16) > int8(x2Lo>>16), int8(x1Lo>>24) > int8(x2Lo>>24), + int8(x1Lo>>32) > int8(x2Lo>>32), int8(x1Lo>>40) > int8(x2Lo>>40), + int8(x1Lo>>48) > int8(x2Lo>>48), int8(x1Lo>>56) > int8(x2Lo>>56), + int8(x1Hi>>0) > int8(x2Hi>>0), int8(x1Hi>>8) > int8(x2Hi>>8), + int8(x1Hi>>16) > int8(x2Hi>>16), int8(x1Hi>>24) > int8(x2Hi>>24), + int8(x1Hi>>32) > int8(x2Hi>>32), int8(x1Hi>>40) > int8(x2Hi>>40), + int8(x1Hi>>48) > int8(x2Hi>>48), int8(x1Hi>>56) > int8(x2Hi>>56), + } + case v128CmpTypeI8x16GtU: + result = []bool{ + byte(x1Lo>>0) > byte(x2Lo>>0), byte(x1Lo>>8) > byte(x2Lo>>8), + byte(x1Lo>>16) > byte(x2Lo>>16), byte(x1Lo>>24) > byte(x2Lo>>24), + byte(x1Lo>>32) > byte(x2Lo>>32), byte(x1Lo>>40) > byte(x2Lo>>40), + byte(x1Lo>>48) > byte(x2Lo>>48), byte(x1Lo>>56) > byte(x2Lo>>56), + byte(x1Hi>>0) > byte(x2Hi>>0), byte(x1Hi>>8) > byte(x2Hi>>8), + byte(x1Hi>>16) > byte(x2Hi>>16), byte(x1Hi>>24) > byte(x2Hi>>24), + byte(x1Hi>>32) > byte(x2Hi>>32), byte(x1Hi>>40) > byte(x2Hi>>40), + byte(x1Hi>>48) > byte(x2Hi>>48), byte(x1Hi>>56) > byte(x2Hi>>56), + } + case v128CmpTypeI8x16LeS: + result = []bool{ + int8(x1Lo>>0) <= int8(x2Lo>>0), int8(x1Lo>>8) <= int8(x2Lo>>8), + int8(x1Lo>>16) <= int8(x2Lo>>16), int8(x1Lo>>24) <= int8(x2Lo>>24), + int8(x1Lo>>32) <= int8(x2Lo>>32), int8(x1Lo>>40) <= int8(x2Lo>>40), + int8(x1Lo>>48) <= int8(x2Lo>>48), int8(x1Lo>>56) <= int8(x2Lo>>56), + int8(x1Hi>>0) <= int8(x2Hi>>0), int8(x1Hi>>8) <= int8(x2Hi>>8), + int8(x1Hi>>16) <= int8(x2Hi>>16), int8(x1Hi>>24) <= int8(x2Hi>>24), + int8(x1Hi>>32) <= int8(x2Hi>>32), int8(x1Hi>>40) <= int8(x2Hi>>40), + int8(x1Hi>>48) <= int8(x2Hi>>48), int8(x1Hi>>56) <= int8(x2Hi>>56), + } + case v128CmpTypeI8x16LeU: + result = []bool{ + byte(x1Lo>>0) <= byte(x2Lo>>0), byte(x1Lo>>8) <= byte(x2Lo>>8), + byte(x1Lo>>16) <= byte(x2Lo>>16), byte(x1Lo>>24) <= byte(x2Lo>>24), + byte(x1Lo>>32) <= byte(x2Lo>>32), byte(x1Lo>>40) <= byte(x2Lo>>40), + byte(x1Lo>>48) <= byte(x2Lo>>48), byte(x1Lo>>56) <= byte(x2Lo>>56), + byte(x1Hi>>0) <= byte(x2Hi>>0), byte(x1Hi>>8) <= byte(x2Hi>>8), + byte(x1Hi>>16) <= byte(x2Hi>>16), byte(x1Hi>>24) <= byte(x2Hi>>24), + byte(x1Hi>>32) <= byte(x2Hi>>32), byte(x1Hi>>40) <= byte(x2Hi>>40), + byte(x1Hi>>48) <= byte(x2Hi>>48), byte(x1Hi>>56) <= byte(x2Hi>>56), + } + case v128CmpTypeI8x16GeS: + result = []bool{ + int8(x1Lo>>0) >= int8(x2Lo>>0), int8(x1Lo>>8) >= int8(x2Lo>>8), 
+ int8(x1Lo>>16) >= int8(x2Lo>>16), int8(x1Lo>>24) >= int8(x2Lo>>24), + int8(x1Lo>>32) >= int8(x2Lo>>32), int8(x1Lo>>40) >= int8(x2Lo>>40), + int8(x1Lo>>48) >= int8(x2Lo>>48), int8(x1Lo>>56) >= int8(x2Lo>>56), + int8(x1Hi>>0) >= int8(x2Hi>>0), int8(x1Hi>>8) >= int8(x2Hi>>8), + int8(x1Hi>>16) >= int8(x2Hi>>16), int8(x1Hi>>24) >= int8(x2Hi>>24), + int8(x1Hi>>32) >= int8(x2Hi>>32), int8(x1Hi>>40) >= int8(x2Hi>>40), + int8(x1Hi>>48) >= int8(x2Hi>>48), int8(x1Hi>>56) >= int8(x2Hi>>56), + } + case v128CmpTypeI8x16GeU: + result = []bool{ + byte(x1Lo>>0) >= byte(x2Lo>>0), byte(x1Lo>>8) >= byte(x2Lo>>8), + byte(x1Lo>>16) >= byte(x2Lo>>16), byte(x1Lo>>24) >= byte(x2Lo>>24), + byte(x1Lo>>32) >= byte(x2Lo>>32), byte(x1Lo>>40) >= byte(x2Lo>>40), + byte(x1Lo>>48) >= byte(x2Lo>>48), byte(x1Lo>>56) >= byte(x2Lo>>56), + byte(x1Hi>>0) >= byte(x2Hi>>0), byte(x1Hi>>8) >= byte(x2Hi>>8), + byte(x1Hi>>16) >= byte(x2Hi>>16), byte(x1Hi>>24) >= byte(x2Hi>>24), + byte(x1Hi>>32) >= byte(x2Hi>>32), byte(x1Hi>>40) >= byte(x2Hi>>40), + byte(x1Hi>>48) >= byte(x2Hi>>48), byte(x1Hi>>56) >= byte(x2Hi>>56), + } + case v128CmpTypeI16x8Eq: + result = []bool{ + uint16(x1Lo>>0) == uint16(x2Lo>>0), uint16(x1Lo>>16) == uint16(x2Lo>>16), + uint16(x1Lo>>32) == uint16(x2Lo>>32), uint16(x1Lo>>48) == uint16(x2Lo>>48), + uint16(x1Hi>>0) == uint16(x2Hi>>0), uint16(x1Hi>>16) == uint16(x2Hi>>16), + uint16(x1Hi>>32) == uint16(x2Hi>>32), uint16(x1Hi>>48) == uint16(x2Hi>>48), + } + case v128CmpTypeI16x8Ne: + result = []bool{ + uint16(x1Lo>>0) != uint16(x2Lo>>0), uint16(x1Lo>>16) != uint16(x2Lo>>16), + uint16(x1Lo>>32) != uint16(x2Lo>>32), uint16(x1Lo>>48) != uint16(x2Lo>>48), + uint16(x1Hi>>0) != uint16(x2Hi>>0), uint16(x1Hi>>16) != uint16(x2Hi>>16), + uint16(x1Hi>>32) != uint16(x2Hi>>32), uint16(x1Hi>>48) != uint16(x2Hi>>48), + } + case v128CmpTypeI16x8LtS: + result = []bool{ + int16(x1Lo>>0) < int16(x2Lo>>0), int16(x1Lo>>16) < int16(x2Lo>>16), + int16(x1Lo>>32) < int16(x2Lo>>32), int16(x1Lo>>48) < int16(x2Lo>>48), + int16(x1Hi>>0) < int16(x2Hi>>0), int16(x1Hi>>16) < int16(x2Hi>>16), + int16(x1Hi>>32) < int16(x2Hi>>32), int16(x1Hi>>48) < int16(x2Hi>>48), + } + case v128CmpTypeI16x8LtU: + result = []bool{ + uint16(x1Lo>>0) < uint16(x2Lo>>0), uint16(x1Lo>>16) < uint16(x2Lo>>16), + uint16(x1Lo>>32) < uint16(x2Lo>>32), uint16(x1Lo>>48) < uint16(x2Lo>>48), + uint16(x1Hi>>0) < uint16(x2Hi>>0), uint16(x1Hi>>16) < uint16(x2Hi>>16), + uint16(x1Hi>>32) < uint16(x2Hi>>32), uint16(x1Hi>>48) < uint16(x2Hi>>48), + } + case v128CmpTypeI16x8GtS: + result = []bool{ + int16(x1Lo>>0) > int16(x2Lo>>0), int16(x1Lo>>16) > int16(x2Lo>>16), + int16(x1Lo>>32) > int16(x2Lo>>32), int16(x1Lo>>48) > int16(x2Lo>>48), + int16(x1Hi>>0) > int16(x2Hi>>0), int16(x1Hi>>16) > int16(x2Hi>>16), + int16(x1Hi>>32) > int16(x2Hi>>32), int16(x1Hi>>48) > int16(x2Hi>>48), + } + case v128CmpTypeI16x8GtU: + result = []bool{ + uint16(x1Lo>>0) > uint16(x2Lo>>0), uint16(x1Lo>>16) > uint16(x2Lo>>16), + uint16(x1Lo>>32) > uint16(x2Lo>>32), uint16(x1Lo>>48) > uint16(x2Lo>>48), + uint16(x1Hi>>0) > uint16(x2Hi>>0), uint16(x1Hi>>16) > uint16(x2Hi>>16), + uint16(x1Hi>>32) > uint16(x2Hi>>32), uint16(x1Hi>>48) > uint16(x2Hi>>48), + } + case v128CmpTypeI16x8LeS: + result = []bool{ + int16(x1Lo>>0) <= int16(x2Lo>>0), int16(x1Lo>>16) <= int16(x2Lo>>16), + int16(x1Lo>>32) <= int16(x2Lo>>32), int16(x1Lo>>48) <= int16(x2Lo>>48), + int16(x1Hi>>0) <= int16(x2Hi>>0), int16(x1Hi>>16) <= int16(x2Hi>>16), + int16(x1Hi>>32) <= int16(x2Hi>>32), int16(x1Hi>>48) <= int16(x2Hi>>48), + } + case v128CmpTypeI16x8LeU: + 
result = []bool{ + uint16(x1Lo>>0) <= uint16(x2Lo>>0), uint16(x1Lo>>16) <= uint16(x2Lo>>16), + uint16(x1Lo>>32) <= uint16(x2Lo>>32), uint16(x1Lo>>48) <= uint16(x2Lo>>48), + uint16(x1Hi>>0) <= uint16(x2Hi>>0), uint16(x1Hi>>16) <= uint16(x2Hi>>16), + uint16(x1Hi>>32) <= uint16(x2Hi>>32), uint16(x1Hi>>48) <= uint16(x2Hi>>48), + } + case v128CmpTypeI16x8GeS: + result = []bool{ + int16(x1Lo>>0) >= int16(x2Lo>>0), int16(x1Lo>>16) >= int16(x2Lo>>16), + int16(x1Lo>>32) >= int16(x2Lo>>32), int16(x1Lo>>48) >= int16(x2Lo>>48), + int16(x1Hi>>0) >= int16(x2Hi>>0), int16(x1Hi>>16) >= int16(x2Hi>>16), + int16(x1Hi>>32) >= int16(x2Hi>>32), int16(x1Hi>>48) >= int16(x2Hi>>48), + } + case v128CmpTypeI16x8GeU: + result = []bool{ + uint16(x1Lo>>0) >= uint16(x2Lo>>0), uint16(x1Lo>>16) >= uint16(x2Lo>>16), + uint16(x1Lo>>32) >= uint16(x2Lo>>32), uint16(x1Lo>>48) >= uint16(x2Lo>>48), + uint16(x1Hi>>0) >= uint16(x2Hi>>0), uint16(x1Hi>>16) >= uint16(x2Hi>>16), + uint16(x1Hi>>32) >= uint16(x2Hi>>32), uint16(x1Hi>>48) >= uint16(x2Hi>>48), + } + case v128CmpTypeI32x4Eq: + result = []bool{ + uint32(x1Lo>>0) == uint32(x2Lo>>0), uint32(x1Lo>>32) == uint32(x2Lo>>32), + uint32(x1Hi>>0) == uint32(x2Hi>>0), uint32(x1Hi>>32) == uint32(x2Hi>>32), + } + case v128CmpTypeI32x4Ne: + result = []bool{ + uint32(x1Lo>>0) != uint32(x2Lo>>0), uint32(x1Lo>>32) != uint32(x2Lo>>32), + uint32(x1Hi>>0) != uint32(x2Hi>>0), uint32(x1Hi>>32) != uint32(x2Hi>>32), + } + case v128CmpTypeI32x4LtS: + result = []bool{ + int32(x1Lo>>0) < int32(x2Lo>>0), int32(x1Lo>>32) < int32(x2Lo>>32), + int32(x1Hi>>0) < int32(x2Hi>>0), int32(x1Hi>>32) < int32(x2Hi>>32), + } + case v128CmpTypeI32x4LtU: + result = []bool{ + uint32(x1Lo>>0) < uint32(x2Lo>>0), uint32(x1Lo>>32) < uint32(x2Lo>>32), + uint32(x1Hi>>0) < uint32(x2Hi>>0), uint32(x1Hi>>32) < uint32(x2Hi>>32), + } + case v128CmpTypeI32x4GtS: + result = []bool{ + int32(x1Lo>>0) > int32(x2Lo>>0), int32(x1Lo>>32) > int32(x2Lo>>32), + int32(x1Hi>>0) > int32(x2Hi>>0), int32(x1Hi>>32) > int32(x2Hi>>32), + } + case v128CmpTypeI32x4GtU: + result = []bool{ + uint32(x1Lo>>0) > uint32(x2Lo>>0), uint32(x1Lo>>32) > uint32(x2Lo>>32), + uint32(x1Hi>>0) > uint32(x2Hi>>0), uint32(x1Hi>>32) > uint32(x2Hi>>32), + } + case v128CmpTypeI32x4LeS: + result = []bool{ + int32(x1Lo>>0) <= int32(x2Lo>>0), int32(x1Lo>>32) <= int32(x2Lo>>32), + int32(x1Hi>>0) <= int32(x2Hi>>0), int32(x1Hi>>32) <= int32(x2Hi>>32), + } + case v128CmpTypeI32x4LeU: + result = []bool{ + uint32(x1Lo>>0) <= uint32(x2Lo>>0), uint32(x1Lo>>32) <= uint32(x2Lo>>32), + uint32(x1Hi>>0) <= uint32(x2Hi>>0), uint32(x1Hi>>32) <= uint32(x2Hi>>32), + } + case v128CmpTypeI32x4GeS: + result = []bool{ + int32(x1Lo>>0) >= int32(x2Lo>>0), int32(x1Lo>>32) >= int32(x2Lo>>32), + int32(x1Hi>>0) >= int32(x2Hi>>0), int32(x1Hi>>32) >= int32(x2Hi>>32), + } + case v128CmpTypeI32x4GeU: + result = []bool{ + uint32(x1Lo>>0) >= uint32(x2Lo>>0), uint32(x1Lo>>32) >= uint32(x2Lo>>32), + uint32(x1Hi>>0) >= uint32(x2Hi>>0), uint32(x1Hi>>32) >= uint32(x2Hi>>32), + } + case v128CmpTypeI64x2Eq: + result = []bool{x1Lo == x2Lo, x1Hi == x2Hi} + case v128CmpTypeI64x2Ne: + result = []bool{x1Lo != x2Lo, x1Hi != x2Hi} + case v128CmpTypeI64x2LtS: + result = []bool{int64(x1Lo) < int64(x2Lo), int64(x1Hi) < int64(x2Hi)} + case v128CmpTypeI64x2GtS: + result = []bool{int64(x1Lo) > int64(x2Lo), int64(x1Hi) > int64(x2Hi)} + case v128CmpTypeI64x2LeS: + result = []bool{int64(x1Lo) <= int64(x2Lo), int64(x1Hi) <= int64(x2Hi)} + case v128CmpTypeI64x2GeS: + result = []bool{int64(x1Lo) >= int64(x2Lo), int64(x1Hi) >= 
int64(x2Hi)} + case v128CmpTypeF32x4Eq: + result = []bool{ + math.Float32frombits(uint32(x1Lo>>0)) == math.Float32frombits(uint32(x2Lo>>0)), + math.Float32frombits(uint32(x1Lo>>32)) == math.Float32frombits(uint32(x2Lo>>32)), + math.Float32frombits(uint32(x1Hi>>0)) == math.Float32frombits(uint32(x2Hi>>0)), + math.Float32frombits(uint32(x1Hi>>32)) == math.Float32frombits(uint32(x2Hi>>32)), + } + case v128CmpTypeF32x4Ne: + result = []bool{ + math.Float32frombits(uint32(x1Lo>>0)) != math.Float32frombits(uint32(x2Lo>>0)), + math.Float32frombits(uint32(x1Lo>>32)) != math.Float32frombits(uint32(x2Lo>>32)), + math.Float32frombits(uint32(x1Hi>>0)) != math.Float32frombits(uint32(x2Hi>>0)), + math.Float32frombits(uint32(x1Hi>>32)) != math.Float32frombits(uint32(x2Hi>>32)), + } + case v128CmpTypeF32x4Lt: + result = []bool{ + math.Float32frombits(uint32(x1Lo>>0)) < math.Float32frombits(uint32(x2Lo>>0)), + math.Float32frombits(uint32(x1Lo>>32)) < math.Float32frombits(uint32(x2Lo>>32)), + math.Float32frombits(uint32(x1Hi>>0)) < math.Float32frombits(uint32(x2Hi>>0)), + math.Float32frombits(uint32(x1Hi>>32)) < math.Float32frombits(uint32(x2Hi>>32)), + } + case v128CmpTypeF32x4Gt: + result = []bool{ + math.Float32frombits(uint32(x1Lo>>0)) > math.Float32frombits(uint32(x2Lo>>0)), + math.Float32frombits(uint32(x1Lo>>32)) > math.Float32frombits(uint32(x2Lo>>32)), + math.Float32frombits(uint32(x1Hi>>0)) > math.Float32frombits(uint32(x2Hi>>0)), + math.Float32frombits(uint32(x1Hi>>32)) > math.Float32frombits(uint32(x2Hi>>32)), + } + case v128CmpTypeF32x4Le: + result = []bool{ + math.Float32frombits(uint32(x1Lo>>0)) <= math.Float32frombits(uint32(x2Lo>>0)), + math.Float32frombits(uint32(x1Lo>>32)) <= math.Float32frombits(uint32(x2Lo>>32)), + math.Float32frombits(uint32(x1Hi>>0)) <= math.Float32frombits(uint32(x2Hi>>0)), + math.Float32frombits(uint32(x1Hi>>32)) <= math.Float32frombits(uint32(x2Hi>>32)), + } + case v128CmpTypeF32x4Ge: + result = []bool{ + math.Float32frombits(uint32(x1Lo>>0)) >= math.Float32frombits(uint32(x2Lo>>0)), + math.Float32frombits(uint32(x1Lo>>32)) >= math.Float32frombits(uint32(x2Lo>>32)), + math.Float32frombits(uint32(x1Hi>>0)) >= math.Float32frombits(uint32(x2Hi>>0)), + math.Float32frombits(uint32(x1Hi>>32)) >= math.Float32frombits(uint32(x2Hi>>32)), + } + case v128CmpTypeF64x2Eq: + result = []bool{ + math.Float64frombits(x1Lo) == math.Float64frombits(x2Lo), + math.Float64frombits(x1Hi) == math.Float64frombits(x2Hi), + } + case v128CmpTypeF64x2Ne: + result = []bool{ + math.Float64frombits(x1Lo) != math.Float64frombits(x2Lo), + math.Float64frombits(x1Hi) != math.Float64frombits(x2Hi), + } + case v128CmpTypeF64x2Lt: + result = []bool{ + math.Float64frombits(x1Lo) < math.Float64frombits(x2Lo), + math.Float64frombits(x1Hi) < math.Float64frombits(x2Hi), + } + case v128CmpTypeF64x2Gt: + result = []bool{ + math.Float64frombits(x1Lo) > math.Float64frombits(x2Lo), + math.Float64frombits(x1Hi) > math.Float64frombits(x2Hi), + } + case v128CmpTypeF64x2Le: + result = []bool{ + math.Float64frombits(x1Lo) <= math.Float64frombits(x2Lo), + math.Float64frombits(x1Hi) <= math.Float64frombits(x2Hi), + } + case v128CmpTypeF64x2Ge: + result = []bool{ + math.Float64frombits(x1Lo) >= math.Float64frombits(x2Lo), + math.Float64frombits(x1Hi) >= math.Float64frombits(x2Hi), + } + } + + var retLo, retHi uint64 + laneNum := len(result) + switch laneNum { + case 16: + for i, b := range result { + if b { + if i < 8 { + retLo |= 0xff << (i * 8) + } else { + retHi |= 0xff << ((i - 8) * 8) + } + } + } + case 8: + for i, b 
:= range result { + if b { + if i < 4 { + retLo |= 0xffff << (i * 16) + } else { + retHi |= 0xffff << ((i - 4) * 16) + } + } + } + case 4: + for i, b := range result { + if b { + if i < 2 { + retLo |= 0xffff_ffff << (i * 32) + } else { + retHi |= 0xffff_ffff << ((i - 2) * 32) + } + } + } + case 2: + if result[0] { + retLo = ^uint64(0) + } + if result[1] { + retHi = ^uint64(0) + } + } + + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128AddSat: + x2hi, x2Lo := ce.popValue(), ce.popValue() + x1hi, x1Lo := ce.popValue(), ce.popValue() + + var retLo, retHi uint64 + + // Lane-wise addition while saturating the overflowing values. + // https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#saturating-integer-addition + switch op.B1 { + case shapeI8x16: + for i := 0; i < 16; i++ { + var v, w byte + if i < 8 { + v, w = byte(x1Lo>>(i*8)), byte(x2Lo>>(i*8)) + } else { + v, w = byte(x1hi>>((i-8)*8)), byte(x2hi>>((i-8)*8)) + } + + var uv uint64 + if op.B3 { // signed + if subbed := int64(int8(v)) + int64(int8(w)); subbed < math.MinInt8 { + uv = uint64(byte(0x80)) + } else if subbed > math.MaxInt8 { + uv = uint64(byte(0x7f)) + } else { + uv = uint64(byte(int8(subbed))) + } + } else { + if subbed := int64(v) + int64(w); subbed < 0 { + uv = uint64(byte(0)) + } else if subbed > math.MaxUint8 { + uv = uint64(byte(0xff)) + } else { + uv = uint64(byte(subbed)) + } + } + + if i < 8 { // first 8 lanes are on lower 64bits. + retLo |= uv << (i * 8) + } else { + retHi |= uv << ((i - 8) * 8) + } + } + case shapeI16x8: + for i := 0; i < 8; i++ { + var v, w uint16 + if i < 4 { + v, w = uint16(x1Lo>>(i*16)), uint16(x2Lo>>(i*16)) + } else { + v, w = uint16(x1hi>>((i-4)*16)), uint16(x2hi>>((i-4)*16)) + } + + var uv uint64 + if op.B3 { // signed + if added := int64(int16(v)) + int64(int16(w)); added < math.MinInt16 { + uv = uint64(uint16(0x8000)) + } else if added > math.MaxInt16 { + uv = uint64(uint16(0x7fff)) + } else { + uv = uint64(uint16(int16(added))) + } + } else { + if added := int64(v) + int64(w); added < 0 { + uv = uint64(uint16(0)) + } else if added > math.MaxUint16 { + uv = uint64(uint16(0xffff)) + } else { + uv = uint64(uint16(added)) + } + } + + if i < 4 { // first 4 lanes are on lower 64bits. + retLo |= uv << (i * 16) + } else { + retHi |= uv << ((i - 4) * 16) + } + } + } + + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128SubSat: + x2hi, x2Lo := ce.popValue(), ce.popValue() + x1hi, x1Lo := ce.popValue(), ce.popValue() + + var retLo, retHi uint64 + + // Lane-wise subtraction while saturating the overflowing values. 
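+ // As with the saturating addition above, each lane is widened to int64, the operation is applied,
+ // and the result is clamped back to the lane's range, e.g. a signed i8 lane clamps to [-128, 127]
+ // and an unsigned i8 lane to [0, 255].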
+ // https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#saturating-integer-subtraction + switch op.B1 { + case shapeI8x16: + for i := 0; i < 16; i++ { + var v, w byte + if i < 8 { + v, w = byte(x1Lo>>(i*8)), byte(x2Lo>>(i*8)) + } else { + v, w = byte(x1hi>>((i-8)*8)), byte(x2hi>>((i-8)*8)) + } + + var uv uint64 + if op.B3 { // signed + if subbed := int64(int8(v)) - int64(int8(w)); subbed < math.MinInt8 { + uv = uint64(byte(0x80)) + } else if subbed > math.MaxInt8 { + uv = uint64(byte(0x7f)) + } else { + uv = uint64(byte(int8(subbed))) + } + } else { + if subbed := int64(v) - int64(w); subbed < 0 { + uv = uint64(byte(0)) + } else if subbed > math.MaxUint8 { + uv = uint64(byte(0xff)) + } else { + uv = uint64(byte(subbed)) + } + } + + if i < 8 { + retLo |= uv << (i * 8) + } else { + retHi |= uv << ((i - 8) * 8) + } + } + case shapeI16x8: + for i := 0; i < 8; i++ { + var v, w uint16 + if i < 4 { + v, w = uint16(x1Lo>>(i*16)), uint16(x2Lo>>(i*16)) + } else { + v, w = uint16(x1hi>>((i-4)*16)), uint16(x2hi>>((i-4)*16)) + } + + var uv uint64 + if op.B3 { // signed + if subbed := int64(int16(v)) - int64(int16(w)); subbed < math.MinInt16 { + uv = uint64(uint16(0x8000)) + } else if subbed > math.MaxInt16 { + uv = uint64(uint16(0x7fff)) + } else { + uv = uint64(uint16(int16(subbed))) + } + } else { + if subbed := int64(v) - int64(w); subbed < 0 { + uv = uint64(uint16(0)) + } else if subbed > math.MaxUint16 { + uv = uint64(uint16(0xffff)) + } else { + uv = uint64(uint16(subbed)) + } + } + + if i < 4 { + retLo |= uv << (i * 16) + } else { + retHi |= uv << ((i - 4) * 16) + } + } + } + + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128Mul: + x2hi, x2lo := ce.popValue(), ce.popValue() + x1hi, x1lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + switch op.B1 { + case shapeI16x8: + retHi = uint64(uint16(x1hi)*uint16(x2hi)) | (uint64(uint16(x1hi>>16)*uint16(x2hi>>16)) << 16) | + (uint64(uint16(x1hi>>32)*uint16(x2hi>>32)) << 32) | (uint64(uint16(x1hi>>48)*uint16(x2hi>>48)) << 48) + retLo = uint64(uint16(x1lo)*uint16(x2lo)) | (uint64(uint16(x1lo>>16)*uint16(x2lo>>16)) << 16) | + (uint64(uint16(x1lo>>32)*uint16(x2lo>>32)) << 32) | (uint64(uint16(x1lo>>48)*uint16(x2lo>>48)) << 48) + case shapeI32x4: + retHi = uint64(uint32(x1hi)*uint32(x2hi)) | (uint64(uint32(x1hi>>32)*uint32(x2hi>>32)) << 32) + retLo = uint64(uint32(x1lo)*uint32(x2lo)) | (uint64(uint32(x1lo>>32)*uint32(x2lo>>32)) << 32) + case shapeI64x2: + retHi = x1hi * x2hi + retLo = x1lo * x2lo + case shapeF32x4: + retHi = mulFloat32bits(uint32(x1hi), uint32(x2hi)) | mulFloat32bits(uint32(x1hi>>32), uint32(x2hi>>32))<<32 + retLo = mulFloat32bits(uint32(x1lo), uint32(x2lo)) | mulFloat32bits(uint32(x1lo>>32), uint32(x2lo>>32))<<32 + case shapeF64x2: + retHi = math.Float64bits(math.Float64frombits(x1hi) * math.Float64frombits(x2hi)) + retLo = math.Float64bits(math.Float64frombits(x1lo) * math.Float64frombits(x2lo)) + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128Div: + x2hi, x2lo := ce.popValue(), ce.popValue() + x1hi, x1lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + if op.B1 == shapeF64x2 { + retHi = math.Float64bits(math.Float64frombits(x1hi) / math.Float64frombits(x2hi)) + retLo = math.Float64bits(math.Float64frombits(x1lo) / math.Float64frombits(x2lo)) + } else { + retHi = divFloat32bits(uint32(x1hi), uint32(x2hi)) | divFloat32bits(uint32(x1hi>>32), uint32(x2hi>>32))<<32 + retLo = divFloat32bits(uint32(x1lo), uint32(x2lo)) | 
divFloat32bits(uint32(x1lo>>32), uint32(x2lo>>32))<<32 + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128Neg: + hi, lo := ce.popValue(), ce.popValue() + switch op.B1 { + case shapeI8x16: + lo = uint64(-byte(lo)) | (uint64(-byte(lo>>8)) << 8) | + (uint64(-byte(lo>>16)) << 16) | (uint64(-byte(lo>>24)) << 24) | + (uint64(-byte(lo>>32)) << 32) | (uint64(-byte(lo>>40)) << 40) | + (uint64(-byte(lo>>48)) << 48) | (uint64(-byte(lo>>56)) << 56) + hi = uint64(-byte(hi)) | (uint64(-byte(hi>>8)) << 8) | + (uint64(-byte(hi>>16)) << 16) | (uint64(-byte(hi>>24)) << 24) | + (uint64(-byte(hi>>32)) << 32) | (uint64(-byte(hi>>40)) << 40) | + (uint64(-byte(hi>>48)) << 48) | (uint64(-byte(hi>>56)) << 56) + case shapeI16x8: + hi = uint64(-uint16(hi)) | (uint64(-uint16(hi>>16)) << 16) | + (uint64(-uint16(hi>>32)) << 32) | (uint64(-uint16(hi>>48)) << 48) + lo = uint64(-uint16(lo)) | (uint64(-uint16(lo>>16)) << 16) | + (uint64(-uint16(lo>>32)) << 32) | (uint64(-uint16(lo>>48)) << 48) + case shapeI32x4: + hi = uint64(-uint32(hi)) | (uint64(-uint32(hi>>32)) << 32) + lo = uint64(-uint32(lo)) | (uint64(-uint32(lo>>32)) << 32) + case shapeI64x2: + hi = -hi + lo = -lo + case shapeF32x4: + hi = uint64(math.Float32bits(-math.Float32frombits(uint32(hi)))) | + (uint64(math.Float32bits(-math.Float32frombits(uint32(hi>>32)))) << 32) + lo = uint64(math.Float32bits(-math.Float32frombits(uint32(lo)))) | + (uint64(math.Float32bits(-math.Float32frombits(uint32(lo>>32)))) << 32) + case shapeF64x2: + hi = math.Float64bits(-math.Float64frombits(hi)) + lo = math.Float64bits(-math.Float64frombits(lo)) + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128Sqrt: + hi, lo := ce.popValue(), ce.popValue() + if op.B1 == shapeF64x2 { + hi = math.Float64bits(math.Sqrt(math.Float64frombits(hi))) + lo = math.Float64bits(math.Sqrt(math.Float64frombits(lo))) + } else { + hi = uint64(math.Float32bits(float32(math.Sqrt(float64(math.Float32frombits(uint32(hi))))))) | + (uint64(math.Float32bits(float32(math.Sqrt(float64(math.Float32frombits(uint32(hi>>32))))))) << 32) + lo = uint64(math.Float32bits(float32(math.Sqrt(float64(math.Float32frombits(uint32(lo))))))) | + (uint64(math.Float32bits(float32(math.Sqrt(float64(math.Float32frombits(uint32(lo>>32))))))) << 32) + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128Abs: + hi, lo := ce.popValue(), ce.popValue() + switch op.B1 { + case shapeI8x16: + lo = uint64(i8Abs(byte(lo))) | (uint64(i8Abs(byte(lo>>8))) << 8) | + (uint64(i8Abs(byte(lo>>16))) << 16) | (uint64(i8Abs(byte(lo>>24))) << 24) | + (uint64(i8Abs(byte(lo>>32))) << 32) | (uint64(i8Abs(byte(lo>>40))) << 40) | + (uint64(i8Abs(byte(lo>>48))) << 48) | (uint64(i8Abs(byte(lo>>56))) << 56) + hi = uint64(i8Abs(byte(hi))) | (uint64(i8Abs(byte(hi>>8))) << 8) | + (uint64(i8Abs(byte(hi>>16))) << 16) | (uint64(i8Abs(byte(hi>>24))) << 24) | + (uint64(i8Abs(byte(hi>>32))) << 32) | (uint64(i8Abs(byte(hi>>40))) << 40) | + (uint64(i8Abs(byte(hi>>48))) << 48) | (uint64(i8Abs(byte(hi>>56))) << 56) + case shapeI16x8: + hi = uint64(i16Abs(uint16(hi))) | (uint64(i16Abs(uint16(hi>>16))) << 16) | + (uint64(i16Abs(uint16(hi>>32))) << 32) | (uint64(i16Abs(uint16(hi>>48))) << 48) + lo = uint64(i16Abs(uint16(lo))) | (uint64(i16Abs(uint16(lo>>16))) << 16) | + (uint64(i16Abs(uint16(lo>>32))) << 32) | (uint64(i16Abs(uint16(lo>>48))) << 48) + case shapeI32x4: + hi = uint64(i32Abs(uint32(hi))) | (uint64(i32Abs(uint32(hi>>32))) << 32) + lo = uint64(i32Abs(uint32(lo))) | 
(uint64(i32Abs(uint32(lo>>32))) << 32) + case shapeI64x2: + if int64(hi) < 0 { + hi = -hi + } + if int64(lo) < 0 { + lo = -lo + } + case shapeF32x4: + hi = hi &^ (1<<31 | 1<<63) + lo = lo &^ (1<<31 | 1<<63) + case shapeF64x2: + hi = hi &^ (1 << 63) + lo = lo &^ (1 << 63) + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128Popcnt: + hi, lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + for i := 0; i < 16; i++ { + var v byte + if i < 8 { + v = byte(lo >> (i * 8)) + } else { + v = byte(hi >> ((i - 8) * 8)) + } + + var cnt uint64 + for i := 0; i < 8; i++ { + if (v>>i)&0b1 != 0 { + cnt++ + } + } + + if i < 8 { + retLo |= cnt << (i * 8) + } else { + retHi |= cnt << ((i - 8) * 8) + } + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128Min: + x2hi, x2lo := ce.popValue(), ce.popValue() + x1hi, x1lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + switch op.B1 { + case shapeI8x16: + if op.B3 { // signed + retLo = uint64(i8MinS(uint8(x1lo>>8), uint8(x2lo>>8)))<<8 | uint64(i8MinS(uint8(x1lo), uint8(x2lo))) | + uint64(i8MinS(uint8(x1lo>>24), uint8(x2lo>>24)))<<24 | uint64(i8MinS(uint8(x1lo>>16), uint8(x2lo>>16)))<<16 | + uint64(i8MinS(uint8(x1lo>>40), uint8(x2lo>>40)))<<40 | uint64(i8MinS(uint8(x1lo>>32), uint8(x2lo>>32)))<<32 | + uint64(i8MinS(uint8(x1lo>>56), uint8(x2lo>>56)))<<56 | uint64(i8MinS(uint8(x1lo>>48), uint8(x2lo>>48)))<<48 + retHi = uint64(i8MinS(uint8(x1hi>>8), uint8(x2hi>>8)))<<8 | uint64(i8MinS(uint8(x1hi), uint8(x2hi))) | + uint64(i8MinS(uint8(x1hi>>24), uint8(x2hi>>24)))<<24 | uint64(i8MinS(uint8(x1hi>>16), uint8(x2hi>>16)))<<16 | + uint64(i8MinS(uint8(x1hi>>40), uint8(x2hi>>40)))<<40 | uint64(i8MinS(uint8(x1hi>>32), uint8(x2hi>>32)))<<32 | + uint64(i8MinS(uint8(x1hi>>56), uint8(x2hi>>56)))<<56 | uint64(i8MinS(uint8(x1hi>>48), uint8(x2hi>>48)))<<48 + } else { + retLo = uint64(i8MinU(uint8(x1lo>>8), uint8(x2lo>>8)))<<8 | uint64(i8MinU(uint8(x1lo), uint8(x2lo))) | + uint64(i8MinU(uint8(x1lo>>24), uint8(x2lo>>24)))<<24 | uint64(i8MinU(uint8(x1lo>>16), uint8(x2lo>>16)))<<16 | + uint64(i8MinU(uint8(x1lo>>40), uint8(x2lo>>40)))<<40 | uint64(i8MinU(uint8(x1lo>>32), uint8(x2lo>>32)))<<32 | + uint64(i8MinU(uint8(x1lo>>56), uint8(x2lo>>56)))<<56 | uint64(i8MinU(uint8(x1lo>>48), uint8(x2lo>>48)))<<48 + retHi = uint64(i8MinU(uint8(x1hi>>8), uint8(x2hi>>8)))<<8 | uint64(i8MinU(uint8(x1hi), uint8(x2hi))) | + uint64(i8MinU(uint8(x1hi>>24), uint8(x2hi>>24)))<<24 | uint64(i8MinU(uint8(x1hi>>16), uint8(x2hi>>16)))<<16 | + uint64(i8MinU(uint8(x1hi>>40), uint8(x2hi>>40)))<<40 | uint64(i8MinU(uint8(x1hi>>32), uint8(x2hi>>32)))<<32 | + uint64(i8MinU(uint8(x1hi>>56), uint8(x2hi>>56)))<<56 | uint64(i8MinU(uint8(x1hi>>48), uint8(x2hi>>48)))<<48 + } + case shapeI16x8: + if op.B3 { // signed + retLo = uint64(i16MinS(uint16(x1lo), uint16(x2lo))) | + uint64(i16MinS(uint16(x1lo>>16), uint16(x2lo>>16)))<<16 | + uint64(i16MinS(uint16(x1lo>>32), uint16(x2lo>>32)))<<32 | + uint64(i16MinS(uint16(x1lo>>48), uint16(x2lo>>48)))<<48 + retHi = uint64(i16MinS(uint16(x1hi), uint16(x2hi))) | + uint64(i16MinS(uint16(x1hi>>16), uint16(x2hi>>16)))<<16 | + uint64(i16MinS(uint16(x1hi>>32), uint16(x2hi>>32)))<<32 | + uint64(i16MinS(uint16(x1hi>>48), uint16(x2hi>>48)))<<48 + } else { + retLo = uint64(i16MinU(uint16(x1lo), uint16(x2lo))) | + uint64(i16MinU(uint16(x1lo>>16), uint16(x2lo>>16)))<<16 | + uint64(i16MinU(uint16(x1lo>>32), uint16(x2lo>>32)))<<32 | + uint64(i16MinU(uint16(x1lo>>48), uint16(x2lo>>48)))<<48 + retHi = 
uint64(i16MinU(uint16(x1hi), uint16(x2hi))) | + uint64(i16MinU(uint16(x1hi>>16), uint16(x2hi>>16)))<<16 | + uint64(i16MinU(uint16(x1hi>>32), uint16(x2hi>>32)))<<32 | + uint64(i16MinU(uint16(x1hi>>48), uint16(x2hi>>48)))<<48 + } + case shapeI32x4: + if op.B3 { // signed + retLo = uint64(i32MinS(uint32(x1lo), uint32(x2lo))) | + uint64(i32MinS(uint32(x1lo>>32), uint32(x2lo>>32)))<<32 + retHi = uint64(i32MinS(uint32(x1hi), uint32(x2hi))) | + uint64(i32MinS(uint32(x1hi>>32), uint32(x2hi>>32)))<<32 + } else { + retLo = uint64(i32MinU(uint32(x1lo), uint32(x2lo))) | + uint64(i32MinU(uint32(x1lo>>32), uint32(x2lo>>32)))<<32 + retHi = uint64(i32MinU(uint32(x1hi), uint32(x2hi))) | + uint64(i32MinU(uint32(x1hi>>32), uint32(x2hi>>32)))<<32 + } + case shapeF32x4: + retHi = wasmCompatMin32bits(uint32(x1hi), uint32(x2hi)) | + wasmCompatMin32bits(uint32(x1hi>>32), uint32(x2hi>>32))<<32 + retLo = wasmCompatMin32bits(uint32(x1lo), uint32(x2lo)) | + wasmCompatMin32bits(uint32(x1lo>>32), uint32(x2lo>>32))<<32 + case shapeF64x2: + retHi = math.Float64bits(moremath.WasmCompatMin64( + math.Float64frombits(x1hi), + math.Float64frombits(x2hi), + )) + retLo = math.Float64bits(moremath.WasmCompatMin64( + math.Float64frombits(x1lo), + math.Float64frombits(x2lo), + )) + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128Max: + x2hi, x2lo := ce.popValue(), ce.popValue() + x1hi, x1lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + switch op.B1 { + case shapeI8x16: + if op.B3 { // signed + retLo = uint64(i8MaxS(uint8(x1lo>>8), uint8(x2lo>>8)))<<8 | uint64(i8MaxS(uint8(x1lo), uint8(x2lo))) | + uint64(i8MaxS(uint8(x1lo>>24), uint8(x2lo>>24)))<<24 | uint64(i8MaxS(uint8(x1lo>>16), uint8(x2lo>>16)))<<16 | + uint64(i8MaxS(uint8(x1lo>>40), uint8(x2lo>>40)))<<40 | uint64(i8MaxS(uint8(x1lo>>32), uint8(x2lo>>32)))<<32 | + uint64(i8MaxS(uint8(x1lo>>56), uint8(x2lo>>56)))<<56 | uint64(i8MaxS(uint8(x1lo>>48), uint8(x2lo>>48)))<<48 + retHi = uint64(i8MaxS(uint8(x1hi>>8), uint8(x2hi>>8)))<<8 | uint64(i8MaxS(uint8(x1hi), uint8(x2hi))) | + uint64(i8MaxS(uint8(x1hi>>24), uint8(x2hi>>24)))<<24 | uint64(i8MaxS(uint8(x1hi>>16), uint8(x2hi>>16)))<<16 | + uint64(i8MaxS(uint8(x1hi>>40), uint8(x2hi>>40)))<<40 | uint64(i8MaxS(uint8(x1hi>>32), uint8(x2hi>>32)))<<32 | + uint64(i8MaxS(uint8(x1hi>>56), uint8(x2hi>>56)))<<56 | uint64(i8MaxS(uint8(x1hi>>48), uint8(x2hi>>48)))<<48 + } else { + retLo = uint64(i8MaxU(uint8(x1lo>>8), uint8(x2lo>>8)))<<8 | uint64(i8MaxU(uint8(x1lo), uint8(x2lo))) | + uint64(i8MaxU(uint8(x1lo>>24), uint8(x2lo>>24)))<<24 | uint64(i8MaxU(uint8(x1lo>>16), uint8(x2lo>>16)))<<16 | + uint64(i8MaxU(uint8(x1lo>>40), uint8(x2lo>>40)))<<40 | uint64(i8MaxU(uint8(x1lo>>32), uint8(x2lo>>32)))<<32 | + uint64(i8MaxU(uint8(x1lo>>56), uint8(x2lo>>56)))<<56 | uint64(i8MaxU(uint8(x1lo>>48), uint8(x2lo>>48)))<<48 + retHi = uint64(i8MaxU(uint8(x1hi>>8), uint8(x2hi>>8)))<<8 | uint64(i8MaxU(uint8(x1hi), uint8(x2hi))) | + uint64(i8MaxU(uint8(x1hi>>24), uint8(x2hi>>24)))<<24 | uint64(i8MaxU(uint8(x1hi>>16), uint8(x2hi>>16)))<<16 | + uint64(i8MaxU(uint8(x1hi>>40), uint8(x2hi>>40)))<<40 | uint64(i8MaxU(uint8(x1hi>>32), uint8(x2hi>>32)))<<32 | + uint64(i8MaxU(uint8(x1hi>>56), uint8(x2hi>>56)))<<56 | uint64(i8MaxU(uint8(x1hi>>48), uint8(x2hi>>48)))<<48 + } + case shapeI16x8: + if op.B3 { // signed + retLo = uint64(i16MaxS(uint16(x1lo), uint16(x2lo))) | + uint64(i16MaxS(uint16(x1lo>>16), uint16(x2lo>>16)))<<16 | + uint64(i16MaxS(uint16(x1lo>>32), uint16(x2lo>>32)))<<32 | + uint64(i16MaxS(uint16(x1lo>>48), 
uint16(x2lo>>48)))<<48 + retHi = uint64(i16MaxS(uint16(x1hi), uint16(x2hi))) | + uint64(i16MaxS(uint16(x1hi>>16), uint16(x2hi>>16)))<<16 | + uint64(i16MaxS(uint16(x1hi>>32), uint16(x2hi>>32)))<<32 | + uint64(i16MaxS(uint16(x1hi>>48), uint16(x2hi>>48)))<<48 + } else { + retLo = uint64(i16MaxU(uint16(x1lo), uint16(x2lo))) | + uint64(i16MaxU(uint16(x1lo>>16), uint16(x2lo>>16)))<<16 | + uint64(i16MaxU(uint16(x1lo>>32), uint16(x2lo>>32)))<<32 | + uint64(i16MaxU(uint16(x1lo>>48), uint16(x2lo>>48)))<<48 + retHi = uint64(i16MaxU(uint16(x1hi), uint16(x2hi))) | + uint64(i16MaxU(uint16(x1hi>>16), uint16(x2hi>>16)))<<16 | + uint64(i16MaxU(uint16(x1hi>>32), uint16(x2hi>>32)))<<32 | + uint64(i16MaxU(uint16(x1hi>>48), uint16(x2hi>>48)))<<48 + } + case shapeI32x4: + if op.B3 { // signed + retLo = uint64(i32MaxS(uint32(x1lo), uint32(x2lo))) | + uint64(i32MaxS(uint32(x1lo>>32), uint32(x2lo>>32)))<<32 + retHi = uint64(i32MaxS(uint32(x1hi), uint32(x2hi))) | + uint64(i32MaxS(uint32(x1hi>>32), uint32(x2hi>>32)))<<32 + } else { + retLo = uint64(i32MaxU(uint32(x1lo), uint32(x2lo))) | + uint64(i32MaxU(uint32(x1lo>>32), uint32(x2lo>>32)))<<32 + retHi = uint64(i32MaxU(uint32(x1hi), uint32(x2hi))) | + uint64(i32MaxU(uint32(x1hi>>32), uint32(x2hi>>32)))<<32 + } + case shapeF32x4: + retHi = wasmCompatMax32bits(uint32(x1hi), uint32(x2hi)) | + wasmCompatMax32bits(uint32(x1hi>>32), uint32(x2hi>>32))<<32 + retLo = wasmCompatMax32bits(uint32(x1lo), uint32(x2lo)) | + wasmCompatMax32bits(uint32(x1lo>>32), uint32(x2lo>>32))<<32 + case shapeF64x2: + retHi = math.Float64bits(moremath.WasmCompatMax64( + math.Float64frombits(x1hi), + math.Float64frombits(x2hi), + )) + retLo = math.Float64bits(moremath.WasmCompatMax64( + math.Float64frombits(x1lo), + math.Float64frombits(x2lo), + )) + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128AvgrU: + x2hi, x2lo := ce.popValue(), ce.popValue() + x1hi, x1lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + switch op.B1 { + case shapeI8x16: + retLo = uint64(i8RoundingAverage(uint8(x1lo>>8), uint8(x2lo>>8)))<<8 | uint64(i8RoundingAverage(uint8(x1lo), uint8(x2lo))) | + uint64(i8RoundingAverage(uint8(x1lo>>24), uint8(x2lo>>24)))<<24 | uint64(i8RoundingAverage(uint8(x1lo>>16), uint8(x2lo>>16)))<<16 | + uint64(i8RoundingAverage(uint8(x1lo>>40), uint8(x2lo>>40)))<<40 | uint64(i8RoundingAverage(uint8(x1lo>>32), uint8(x2lo>>32)))<<32 | + uint64(i8RoundingAverage(uint8(x1lo>>56), uint8(x2lo>>56)))<<56 | uint64(i8RoundingAverage(uint8(x1lo>>48), uint8(x2lo>>48)))<<48 + retHi = uint64(i8RoundingAverage(uint8(x1hi>>8), uint8(x2hi>>8)))<<8 | uint64(i8RoundingAverage(uint8(x1hi), uint8(x2hi))) | + uint64(i8RoundingAverage(uint8(x1hi>>24), uint8(x2hi>>24)))<<24 | uint64(i8RoundingAverage(uint8(x1hi>>16), uint8(x2hi>>16)))<<16 | + uint64(i8RoundingAverage(uint8(x1hi>>40), uint8(x2hi>>40)))<<40 | uint64(i8RoundingAverage(uint8(x1hi>>32), uint8(x2hi>>32)))<<32 | + uint64(i8RoundingAverage(uint8(x1hi>>56), uint8(x2hi>>56)))<<56 | uint64(i8RoundingAverage(uint8(x1hi>>48), uint8(x2hi>>48)))<<48 + case shapeI16x8: + retLo = uint64(i16RoundingAverage(uint16(x1lo), uint16(x2lo))) | + uint64(i16RoundingAverage(uint16(x1lo>>16), uint16(x2lo>>16)))<<16 | + uint64(i16RoundingAverage(uint16(x1lo>>32), uint16(x2lo>>32)))<<32 | + uint64(i16RoundingAverage(uint16(x1lo>>48), uint16(x2lo>>48)))<<48 + retHi = uint64(i16RoundingAverage(uint16(x1hi), uint16(x2hi))) | + uint64(i16RoundingAverage(uint16(x1hi>>16), uint16(x2hi>>16)))<<16 | + uint64(i16RoundingAverage(uint16(x1hi>>32), 
uint16(x2hi>>32)))<<32 | + uint64(i16RoundingAverage(uint16(x1hi>>48), uint16(x2hi>>48)))<<48 + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128Pmin: + x2hi, x2lo := ce.popValue(), ce.popValue() + x1hi, x1lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + if op.B1 == shapeF32x4 { + if flt32(math.Float32frombits(uint32(x2lo)), math.Float32frombits(uint32(x1lo))) { + retLo = x2lo & 0x00000000_ffffffff + } else { + retLo = x1lo & 0x00000000_ffffffff + } + if flt32(math.Float32frombits(uint32(x2lo>>32)), math.Float32frombits(uint32(x1lo>>32))) { + retLo |= x2lo & 0xffffffff_00000000 + } else { + retLo |= x1lo & 0xffffffff_00000000 + } + if flt32(math.Float32frombits(uint32(x2hi)), math.Float32frombits(uint32(x1hi))) { + retHi = x2hi & 0x00000000_ffffffff + } else { + retHi = x1hi & 0x00000000_ffffffff + } + if flt32(math.Float32frombits(uint32(x2hi>>32)), math.Float32frombits(uint32(x1hi>>32))) { + retHi |= x2hi & 0xffffffff_00000000 + } else { + retHi |= x1hi & 0xffffffff_00000000 + } + } else { + if flt64(math.Float64frombits(x2lo), math.Float64frombits(x1lo)) { + retLo = x2lo + } else { + retLo = x1lo + } + if flt64(math.Float64frombits(x2hi), math.Float64frombits(x1hi)) { + retHi = x2hi + } else { + retHi = x1hi + } + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128Pmax: + x2hi, x2lo := ce.popValue(), ce.popValue() + x1hi, x1lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + if op.B1 == shapeF32x4 { + if flt32(math.Float32frombits(uint32(x1lo)), math.Float32frombits(uint32(x2lo))) { + retLo = x2lo & 0x00000000_ffffffff + } else { + retLo = x1lo & 0x00000000_ffffffff + } + if flt32(math.Float32frombits(uint32(x1lo>>32)), math.Float32frombits(uint32(x2lo>>32))) { + retLo |= x2lo & 0xffffffff_00000000 + } else { + retLo |= x1lo & 0xffffffff_00000000 + } + if flt32(math.Float32frombits(uint32(x1hi)), math.Float32frombits(uint32(x2hi))) { + retHi = x2hi & 0x00000000_ffffffff + } else { + retHi = x1hi & 0x00000000_ffffffff + } + if flt32(math.Float32frombits(uint32(x1hi>>32)), math.Float32frombits(uint32(x2hi>>32))) { + retHi |= x2hi & 0xffffffff_00000000 + } else { + retHi |= x1hi & 0xffffffff_00000000 + } + } else { + if flt64(math.Float64frombits(x1lo), math.Float64frombits(x2lo)) { + retLo = x2lo + } else { + retLo = x1lo + } + if flt64(math.Float64frombits(x1hi), math.Float64frombits(x2hi)) { + retHi = x2hi + } else { + retHi = x1hi + } + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128Ceil: + hi, lo := ce.popValue(), ce.popValue() + if op.B1 == shapeF32x4 { + lo = uint64(math.Float32bits(moremath.WasmCompatCeilF32(math.Float32frombits(uint32(lo))))) | + (uint64(math.Float32bits(moremath.WasmCompatCeilF32(math.Float32frombits(uint32(lo>>32))))) << 32) + hi = uint64(math.Float32bits(moremath.WasmCompatCeilF32(math.Float32frombits(uint32(hi))))) | + (uint64(math.Float32bits(moremath.WasmCompatCeilF32(math.Float32frombits(uint32(hi>>32))))) << 32) + } else { + lo = math.Float64bits(moremath.WasmCompatCeilF64(math.Float64frombits(lo))) + hi = math.Float64bits(moremath.WasmCompatCeilF64(math.Float64frombits(hi))) + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128Floor: + hi, lo := ce.popValue(), ce.popValue() + if op.B1 == shapeF32x4 { + lo = uint64(math.Float32bits(moremath.WasmCompatFloorF32(math.Float32frombits(uint32(lo))))) | + (uint64(math.Float32bits(moremath.WasmCompatFloorF32(math.Float32frombits(uint32(lo>>32))))) << 32) + hi = 
uint64(math.Float32bits(moremath.WasmCompatFloorF32(math.Float32frombits(uint32(hi))))) | + (uint64(math.Float32bits(moremath.WasmCompatFloorF32(math.Float32frombits(uint32(hi>>32))))) << 32) + } else { + lo = math.Float64bits(moremath.WasmCompatFloorF64(math.Float64frombits(lo))) + hi = math.Float64bits(moremath.WasmCompatFloorF64(math.Float64frombits(hi))) + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128Trunc: + hi, lo := ce.popValue(), ce.popValue() + if op.B1 == shapeF32x4 { + lo = uint64(math.Float32bits(moremath.WasmCompatTruncF32(math.Float32frombits(uint32(lo))))) | + (uint64(math.Float32bits(moremath.WasmCompatTruncF32(math.Float32frombits(uint32(lo>>32))))) << 32) + hi = uint64(math.Float32bits(moremath.WasmCompatTruncF32(math.Float32frombits(uint32(hi))))) | + (uint64(math.Float32bits(moremath.WasmCompatTruncF32(math.Float32frombits(uint32(hi>>32))))) << 32) + } else { + lo = math.Float64bits(moremath.WasmCompatTruncF64(math.Float64frombits(lo))) + hi = math.Float64bits(moremath.WasmCompatTruncF64(math.Float64frombits(hi))) + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128Nearest: + hi, lo := ce.popValue(), ce.popValue() + if op.B1 == shapeF32x4 { + lo = uint64(math.Float32bits(moremath.WasmCompatNearestF32(math.Float32frombits(uint32(lo))))) | + (uint64(math.Float32bits(moremath.WasmCompatNearestF32(math.Float32frombits(uint32(lo>>32))))) << 32) + hi = uint64(math.Float32bits(moremath.WasmCompatNearestF32(math.Float32frombits(uint32(hi))))) | + (uint64(math.Float32bits(moremath.WasmCompatNearestF32(math.Float32frombits(uint32(hi>>32))))) << 32) + } else { + lo = math.Float64bits(moremath.WasmCompatNearestF64(math.Float64frombits(lo))) + hi = math.Float64bits(moremath.WasmCompatNearestF64(math.Float64frombits(hi))) + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128Extend: + hi, lo := ce.popValue(), ce.popValue() + var origin uint64 + if op.B3 { // use lower 64 bits + origin = lo + } else { + origin = hi + } + + signed := op.B2 == 1 + + var retHi, retLo uint64 + switch op.B1 { + case shapeI8x16: + for i := 0; i < 8; i++ { + v8 := byte(origin >> (i * 8)) + + var v16 uint16 + if signed { + v16 = uint16(int8(v8)) + } else { + v16 = uint16(v8) + } + + if i < 4 { + retLo |= uint64(v16) << (i * 16) + } else { + retHi |= uint64(v16) << ((i - 4) * 16) + } + } + case shapeI16x8: + for i := 0; i < 4; i++ { + v16 := uint16(origin >> (i * 16)) + + var v32 uint32 + if signed { + v32 = uint32(int16(v16)) + } else { + v32 = uint32(v16) + } + + if i < 2 { + retLo |= uint64(v32) << (i * 32) + } else { + retHi |= uint64(v32) << ((i - 2) * 32) + } + } + case shapeI32x4: + v32Lo := uint32(origin) + v32Hi := uint32(origin >> 32) + if signed { + retLo = uint64(int32(v32Lo)) + retHi = uint64(int32(v32Hi)) + } else { + retLo = uint64(v32Lo) + retHi = uint64(v32Hi) + } + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128ExtMul: + x2Hi, x2Lo := ce.popValue(), ce.popValue() + x1Hi, x1Lo := ce.popValue(), ce.popValue() + var x1, x2 uint64 + if op.B3 { // use lower 64 bits + x1, x2 = x1Lo, x2Lo + } else { + x1, x2 = x1Hi, x2Hi + } + + signed := op.B2 == 1 + + var retLo, retHi uint64 + switch op.B1 { + case shapeI8x16: + for i := 0; i < 8; i++ { + v1, v2 := byte(x1>>(i*8)), byte(x2>>(i*8)) + + var v16 uint16 + if signed { + v16 = uint16(int16(int8(v1)) * int16(int8(v2))) + } else { + v16 = uint16(v1) * uint16(v2) + } + + if i < 4 { + retLo |= uint64(v16) << (i * 16) + } else { + retHi |= 
uint64(v16) << ((i - 4) * 16) + } + } + case shapeI16x8: + for i := 0; i < 4; i++ { + v1, v2 := uint16(x1>>(i*16)), uint16(x2>>(i*16)) + + var v32 uint32 + if signed { + v32 = uint32(int32(int16(v1)) * int32(int16(v2))) + } else { + v32 = uint32(v1) * uint32(v2) + } + + if i < 2 { + retLo |= uint64(v32) << (i * 32) + } else { + retHi |= uint64(v32) << ((i - 2) * 32) + } + } + case shapeI32x4: + v1Lo, v2Lo := uint32(x1), uint32(x2) + v1Hi, v2Hi := uint32(x1>>32), uint32(x2>>32) + if signed { + retLo = uint64(int64(int32(v1Lo)) * int64(int32(v2Lo))) + retHi = uint64(int64(int32(v1Hi)) * int64(int32(v2Hi))) + } else { + retLo = uint64(v1Lo) * uint64(v2Lo) + retHi = uint64(v1Hi) * uint64(v2Hi) + } + } + + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128Q15mulrSatS: + x2hi, x2Lo := ce.popValue(), ce.popValue() + x1hi, x1Lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + for i := 0; i < 8; i++ { + var v, w int16 + if i < 4 { + v, w = int16(uint16(x1Lo>>(i*16))), int16(uint16(x2Lo>>(i*16))) + } else { + v, w = int16(uint16(x1hi>>((i-4)*16))), int16(uint16(x2hi>>((i-4)*16))) + } + + var uv uint64 + // https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#saturating-integer-q-format-rounding-multiplication + if calc := ((int32(v) * int32(w)) + 0x4000) >> 15; calc < math.MinInt16 { + uv = uint64(uint16(0x8000)) + } else if calc > math.MaxInt16 { + uv = uint64(uint16(0x7fff)) + } else { + uv = uint64(uint16(int16(calc))) + } + + if i < 4 { + retLo |= uv << (i * 16) + } else { + retHi |= uv << ((i - 4) * 16) + } + } + + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128ExtAddPairwise: + hi, lo := ce.popValue(), ce.popValue() + + signed := op.B3 + + var retLo, retHi uint64 + switch op.B1 { + case shapeI8x16: + for i := 0; i < 8; i++ { + var v1, v2 byte + if i < 4 { + v1, v2 = byte(lo>>((i*2)*8)), byte(lo>>((i*2+1)*8)) + } else { + v1, v2 = byte(hi>>(((i-4)*2)*8)), byte(hi>>(((i-4)*2+1)*8)) + } + + var v16 uint16 + if signed { + v16 = uint16(int16(int8(v1)) + int16(int8(v2))) + } else { + v16 = uint16(v1) + uint16(v2) + } + + if i < 4 { + retLo |= uint64(v16) << (i * 16) + } else { + retHi |= uint64(v16) << ((i - 4) * 16) + } + } + case shapeI16x8: + for i := 0; i < 4; i++ { + var v1, v2 uint16 + if i < 2 { + v1, v2 = uint16(lo>>((i*2)*16)), uint16(lo>>((i*2+1)*16)) + } else { + v1, v2 = uint16(hi>>(((i-2)*2)*16)), uint16(hi>>(((i-2)*2+1)*16)) + } + + var v32 uint32 + if signed { + v32 = uint32(int32(int16(v1)) + int32(int16(v2))) + } else { + v32 = uint32(v1) + uint32(v2) + } + + if i < 2 { + retLo |= uint64(v32) << (i * 32) + } else { + retHi |= uint64(v32) << ((i - 2) * 32) + } + } + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128FloatPromote: + _, toPromote := ce.popValue(), ce.popValue() + ce.pushValue(math.Float64bits(float64(math.Float32frombits(uint32(toPromote))))) + ce.pushValue(math.Float64bits(float64(math.Float32frombits(uint32(toPromote >> 32))))) + frame.pc++ + case operationKindV128FloatDemote: + hi, lo := ce.popValue(), ce.popValue() + ce.pushValue( + uint64(math.Float32bits(float32(math.Float64frombits(lo)))) | + (uint64(math.Float32bits(float32(math.Float64frombits(hi)))) << 32), + ) + ce.pushValue(0) + frame.pc++ + case operationKindV128FConvertFromI: + hi, lo := ce.popValue(), ce.popValue() + v1, v2, v3, v4 := uint32(lo), uint32(lo>>32), uint32(hi), uint32(hi>>32) + signed := op.B3 + + var retLo, retHi uint64 + switch op.B1 { // Destination shape. 
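+ // A f32x4 destination converts all four i32 lanes (f32x4.convert_i32x4_{s,u}); a f64x2
+ // destination converts only the two low lanes v1 and v2 (f64x2.convert_low_i32x4_{s,u})
+ // and ignores v3 and v4.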
+ case shapeF32x4: // f32x4 from signed/unsigned i32x4 + if signed { + retLo = uint64(math.Float32bits(float32(int32(v1)))) | + (uint64(math.Float32bits(float32(int32(v2)))) << 32) + retHi = uint64(math.Float32bits(float32(int32(v3)))) | + (uint64(math.Float32bits(float32(int32(v4)))) << 32) + } else { + retLo = uint64(math.Float32bits(float32(v1))) | + (uint64(math.Float32bits(float32(v2))) << 32) + retHi = uint64(math.Float32bits(float32(v3))) | + (uint64(math.Float32bits(float32(v4))) << 32) + } + case shapeF64x2: // f64x2 from signed/unsigned i32x4 + if signed { + retLo, retHi = math.Float64bits(float64(int32(v1))), math.Float64bits(float64(int32(v2))) + } else { + retLo, retHi = math.Float64bits(float64(v1)), math.Float64bits(float64(v2)) + } + } + + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128Narrow: + x2Hi, x2Lo := ce.popValue(), ce.popValue() + x1Hi, x1Lo := ce.popValue(), ce.popValue() + signed := op.B3 + + var retLo, retHi uint64 + switch op.B1 { + case shapeI16x8: // signed/unsigned i16x8 to i8x16 + for i := 0; i < 8; i++ { + var v16 uint16 + if i < 4 { + v16 = uint16(x1Lo >> (i * 16)) + } else { + v16 = uint16(x1Hi >> ((i - 4) * 16)) + } + + var v byte + if signed { + if s := int16(v16); s > math.MaxInt8 { + v = math.MaxInt8 + } else if s < math.MinInt8 { + s = math.MinInt8 + v = byte(s) + } else { + v = byte(v16) + } + } else { + if s := int16(v16); s > math.MaxUint8 { + v = math.MaxUint8 + } else if s < 0 { + v = 0 + } else { + v = byte(v16) + } + } + retLo |= uint64(v) << (i * 8) + } + for i := 0; i < 8; i++ { + var v16 uint16 + if i < 4 { + v16 = uint16(x2Lo >> (i * 16)) + } else { + v16 = uint16(x2Hi >> ((i - 4) * 16)) + } + + var v byte + if signed { + if s := int16(v16); s > math.MaxInt8 { + v = math.MaxInt8 + } else if s < math.MinInt8 { + s = math.MinInt8 + v = byte(s) + } else { + v = byte(v16) + } + } else { + if s := int16(v16); s > math.MaxUint8 { + v = math.MaxUint8 + } else if s < 0 { + v = 0 + } else { + v = byte(v16) + } + } + retHi |= uint64(v) << (i * 8) + } + case shapeI32x4: // signed/unsigned i32x4 to i16x8 + for i := 0; i < 4; i++ { + var v32 uint32 + if i < 2 { + v32 = uint32(x1Lo >> (i * 32)) + } else { + v32 = uint32(x1Hi >> ((i - 2) * 32)) + } + + var v uint16 + if signed { + if s := int32(v32); s > math.MaxInt16 { + v = math.MaxInt16 + } else if s < math.MinInt16 { + s = math.MinInt16 + v = uint16(s) + } else { + v = uint16(v32) + } + } else { + if s := int32(v32); s > math.MaxUint16 { + v = math.MaxUint16 + } else if s < 0 { + v = 0 + } else { + v = uint16(v32) + } + } + retLo |= uint64(v) << (i * 16) + } + + for i := 0; i < 4; i++ { + var v32 uint32 + if i < 2 { + v32 = uint32(x2Lo >> (i * 32)) + } else { + v32 = uint32(x2Hi >> ((i - 2) * 32)) + } + + var v uint16 + if signed { + if s := int32(v32); s > math.MaxInt16 { + v = math.MaxInt16 + } else if s < math.MinInt16 { + s = math.MinInt16 + v = uint16(s) + } else { + v = uint16(v32) + } + } else { + if s := int32(v32); s > math.MaxUint16 { + v = math.MaxUint16 + } else if s < 0 { + v = 0 + } else { + v = uint16(v32) + } + } + retHi |= uint64(v) << (i * 16) + } + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128Dot: + x2Hi, x2Lo := ce.popValue(), ce.popValue() + x1Hi, x1Lo := ce.popValue(), ce.popValue() + ce.pushValue( + uint64(uint32(int32(int16(x1Lo>>0))*int32(int16(x2Lo>>0))+int32(int16(x1Lo>>16))*int32(int16(x2Lo>>16)))) | + 
(uint64(uint32(int32(int16(x1Lo>>32))*int32(int16(x2Lo>>32))+int32(int16(x1Lo>>48))*int32(int16(x2Lo>>48)))) << 32), + ) + ce.pushValue( + uint64(uint32(int32(int16(x1Hi>>0))*int32(int16(x2Hi>>0))+int32(int16(x1Hi>>16))*int32(int16(x2Hi>>16)))) | + (uint64(uint32(int32(int16(x1Hi>>32))*int32(int16(x2Hi>>32))+int32(int16(x1Hi>>48))*int32(int16(x2Hi>>48)))) << 32), + ) + frame.pc++ + case operationKindV128ITruncSatFromF: + hi, lo := ce.popValue(), ce.popValue() + signed := op.B3 + var retLo, retHi uint64 + + switch op.B1 { + case shapeF32x4: // f32x4 to i32x4 + for i, f64 := range [4]float64{ + math.Trunc(float64(math.Float32frombits(uint32(lo)))), + math.Trunc(float64(math.Float32frombits(uint32(lo >> 32)))), + math.Trunc(float64(math.Float32frombits(uint32(hi)))), + math.Trunc(float64(math.Float32frombits(uint32(hi >> 32)))), + } { + + var v uint32 + if math.IsNaN(f64) { + v = 0 + } else if signed { + if f64 < math.MinInt32 { + f64 = math.MinInt32 + } else if f64 > math.MaxInt32 { + f64 = math.MaxInt32 + } + v = uint32(int32(f64)) + } else { + if f64 < 0 { + f64 = 0 + } else if f64 > math.MaxUint32 { + f64 = math.MaxUint32 + } + v = uint32(f64) + } + + if i < 2 { + retLo |= uint64(v) << (i * 32) + } else { + retHi |= uint64(v) << ((i - 2) * 32) + } + } + + case shapeF64x2: // f64x2 to i32x4 + for i, f := range [2]float64{ + math.Trunc(math.Float64frombits(lo)), + math.Trunc(math.Float64frombits(hi)), + } { + var v uint32 + if math.IsNaN(f) { + v = 0 + } else if signed { + if f < math.MinInt32 { + f = math.MinInt32 + } else if f > math.MaxInt32 { + f = math.MaxInt32 + } + v = uint32(int32(f)) + } else { + if f < 0 { + f = 0 + } else if f > math.MaxUint32 { + f = math.MaxUint32 + } + v = uint32(f) + } + + retLo |= uint64(v) << (i * 32) + } + } + + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindAtomicMemoryWait: + timeout := int64(ce.popValue()) + exp := ce.popValue() + offset := ce.popMemoryOffset(op) + // Runtime instead of validation error because the spec intends to allow binaries to include + // such instructions as long as they are not executed. 
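+ // The reader callback handed to Wait32/Wait64 below loads the current value while holding the
+ // memory mutex; per the threads proposal the pushed result is 0 (woken), 1 (loaded value did not
+ // equal the expected value) or 2 (timed out).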
+ if !memoryInst.Shared { + panic(wasmruntime.ErrRuntimeExpectedSharedMemory) + } + + switch unsignedType(op.B1) { + case unsignedTypeI32: + if offset%4 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + if int(offset) > len(memoryInst.Buffer)-4 { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(memoryInst.Wait32(offset, uint32(exp), timeout, func(mem *wasm.MemoryInstance, offset uint32) uint32 { + mem.Mux.Lock() + defer mem.Mux.Unlock() + value, _ := mem.ReadUint32Le(offset) + return value + })) + case unsignedTypeI64: + if offset%8 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + if int(offset) > len(memoryInst.Buffer)-8 { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(memoryInst.Wait64(offset, exp, timeout, func(mem *wasm.MemoryInstance, offset uint32) uint64 { + mem.Mux.Lock() + defer mem.Mux.Unlock() + value, _ := mem.ReadUint64Le(offset) + return value + })) + } + frame.pc++ + case operationKindAtomicMemoryNotify: + count := ce.popValue() + offset := ce.popMemoryOffset(op) + if offset%4 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + // Just a bounds check + if offset >= memoryInst.Size() { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + res := memoryInst.Notify(offset, uint32(count)) + ce.pushValue(uint64(res)) + frame.pc++ + case operationKindAtomicFence: + // Memory not required for fence only + if memoryInst != nil { + // An empty critical section can be used as a synchronization primitive, which is what + // fence is. Probably, there are no spectests or defined behavior to confirm this yet. + memoryInst.Mux.Lock() + memoryInst.Mux.Unlock() //nolint:staticcheck + } + frame.pc++ + case operationKindAtomicLoad: + offset := ce.popMemoryOffset(op) + switch unsignedType(op.B1) { + case unsignedTypeI32: + if offset%4 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + memoryInst.Mux.Lock() + val, ok := memoryInst.ReadUint32Le(offset) + memoryInst.Mux.Unlock() + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(uint64(val)) + case unsignedTypeI64: + if offset%8 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + memoryInst.Mux.Lock() + val, ok := memoryInst.ReadUint64Le(offset) + memoryInst.Mux.Unlock() + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(val) + } + frame.pc++ + case operationKindAtomicLoad8: + offset := ce.popMemoryOffset(op) + memoryInst.Mux.Lock() + val, ok := memoryInst.ReadByte(offset) + memoryInst.Mux.Unlock() + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(uint64(val)) + frame.pc++ + case operationKindAtomicLoad16: + offset := ce.popMemoryOffset(op) + if offset%2 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + memoryInst.Mux.Lock() + val, ok := memoryInst.ReadUint16Le(offset) + memoryInst.Mux.Unlock() + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(uint64(val)) + frame.pc++ + case operationKindAtomicStore: + val := ce.popValue() + offset := ce.popMemoryOffset(op) + switch unsignedType(op.B1) { + case unsignedTypeI32: + if offset%4 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + memoryInst.Mux.Lock() + ok := memoryInst.WriteUint32Le(offset, uint32(val)) + memoryInst.Mux.Unlock() + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + case unsignedTypeI64: + if offset%8 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + memoryInst.Mux.Lock() + ok := 
memoryInst.WriteUint64Le(offset, val) + memoryInst.Mux.Unlock() + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + } + frame.pc++ + case operationKindAtomicStore8: + val := byte(ce.popValue()) + offset := ce.popMemoryOffset(op) + memoryInst.Mux.Lock() + ok := memoryInst.WriteByte(offset, val) + memoryInst.Mux.Unlock() + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + frame.pc++ + case operationKindAtomicStore16: + val := uint16(ce.popValue()) + offset := ce.popMemoryOffset(op) + if offset%2 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + memoryInst.Mux.Lock() + ok := memoryInst.WriteUint16Le(offset, val) + memoryInst.Mux.Unlock() + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + frame.pc++ + case operationKindAtomicRMW: + val := ce.popValue() + offset := ce.popMemoryOffset(op) + switch unsignedType(op.B1) { + case unsignedTypeI32: + if offset%4 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + memoryInst.Mux.Lock() + old, ok := memoryInst.ReadUint32Le(offset) + if !ok { + memoryInst.Mux.Unlock() + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + var newVal uint32 + switch atomicArithmeticOp(op.B2) { + case atomicArithmeticOpAdd: + newVal = old + uint32(val) + case atomicArithmeticOpSub: + newVal = old - uint32(val) + case atomicArithmeticOpAnd: + newVal = old & uint32(val) + case atomicArithmeticOpOr: + newVal = old | uint32(val) + case atomicArithmeticOpXor: + newVal = old ^ uint32(val) + case atomicArithmeticOpNop: + newVal = uint32(val) + } + memoryInst.WriteUint32Le(offset, newVal) + memoryInst.Mux.Unlock() + ce.pushValue(uint64(old)) + case unsignedTypeI64: + if offset%8 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + memoryInst.Mux.Lock() + old, ok := memoryInst.ReadUint64Le(offset) + if !ok { + memoryInst.Mux.Unlock() + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + var newVal uint64 + switch atomicArithmeticOp(op.B2) { + case atomicArithmeticOpAdd: + newVal = old + val + case atomicArithmeticOpSub: + newVal = old - val + case atomicArithmeticOpAnd: + newVal = old & val + case atomicArithmeticOpOr: + newVal = old | val + case atomicArithmeticOpXor: + newVal = old ^ val + case atomicArithmeticOpNop: + newVal = val + } + memoryInst.WriteUint64Le(offset, newVal) + memoryInst.Mux.Unlock() + ce.pushValue(old) + } + frame.pc++ + case operationKindAtomicRMW8: + val := ce.popValue() + offset := ce.popMemoryOffset(op) + memoryInst.Mux.Lock() + old, ok := memoryInst.ReadByte(offset) + if !ok { + memoryInst.Mux.Unlock() + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + arg := byte(val) + var newVal byte + switch atomicArithmeticOp(op.B2) { + case atomicArithmeticOpAdd: + newVal = old + arg + case atomicArithmeticOpSub: + newVal = old - arg + case atomicArithmeticOpAnd: + newVal = old & arg + case atomicArithmeticOpOr: + newVal = old | arg + case atomicArithmeticOpXor: + newVal = old ^ arg + case atomicArithmeticOpNop: + newVal = arg + } + memoryInst.WriteByte(offset, newVal) + memoryInst.Mux.Unlock() + ce.pushValue(uint64(old)) + frame.pc++ + case operationKindAtomicRMW16: + val := ce.popValue() + offset := ce.popMemoryOffset(op) + if offset%2 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + memoryInst.Mux.Lock() + old, ok := memoryInst.ReadUint16Le(offset) + if !ok { + memoryInst.Mux.Unlock() + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + arg := uint16(val) + var newVal uint16 + switch atomicArithmeticOp(op.B2) { + case 
atomicArithmeticOpAdd: + newVal = old + arg + case atomicArithmeticOpSub: + newVal = old - arg + case atomicArithmeticOpAnd: + newVal = old & arg + case atomicArithmeticOpOr: + newVal = old | arg + case atomicArithmeticOpXor: + newVal = old ^ arg + case atomicArithmeticOpNop: + newVal = arg + } + memoryInst.WriteUint16Le(offset, newVal) + memoryInst.Mux.Unlock() + ce.pushValue(uint64(old)) + frame.pc++ + case operationKindAtomicRMWCmpxchg: + rep := ce.popValue() + exp := ce.popValue() + offset := ce.popMemoryOffset(op) + switch unsignedType(op.B1) { + case unsignedTypeI32: + if offset%4 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + memoryInst.Mux.Lock() + old, ok := memoryInst.ReadUint32Le(offset) + if !ok { + memoryInst.Mux.Unlock() + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + if old == uint32(exp) { + memoryInst.WriteUint32Le(offset, uint32(rep)) + } + memoryInst.Mux.Unlock() + ce.pushValue(uint64(old)) + case unsignedTypeI64: + if offset%8 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + memoryInst.Mux.Lock() + old, ok := memoryInst.ReadUint64Le(offset) + if !ok { + memoryInst.Mux.Unlock() + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + if old == exp { + memoryInst.WriteUint64Le(offset, rep) + } + memoryInst.Mux.Unlock() + ce.pushValue(old) + } + frame.pc++ + case operationKindAtomicRMW8Cmpxchg: + rep := byte(ce.popValue()) + exp := byte(ce.popValue()) + offset := ce.popMemoryOffset(op) + memoryInst.Mux.Lock() + old, ok := memoryInst.ReadByte(offset) + if !ok { + memoryInst.Mux.Unlock() + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + if old == exp { + memoryInst.WriteByte(offset, rep) + } + memoryInst.Mux.Unlock() + ce.pushValue(uint64(old)) + frame.pc++ + case operationKindAtomicRMW16Cmpxchg: + rep := uint16(ce.popValue()) + exp := uint16(ce.popValue()) + offset := ce.popMemoryOffset(op) + if offset%2 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + memoryInst.Mux.Lock() + old, ok := memoryInst.ReadUint16Le(offset) + if !ok { + memoryInst.Mux.Unlock() + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + if old == exp { + memoryInst.WriteUint16Le(offset, rep) + } + memoryInst.Mux.Unlock() + ce.pushValue(uint64(old)) + frame.pc++ + default: + frame.pc++ + } + } + ce.popFrame() +} + +func wasmCompatMax32bits(v1, v2 uint32) uint64 { + return uint64(math.Float32bits(moremath.WasmCompatMax32( + math.Float32frombits(v1), + math.Float32frombits(v2), + ))) +} + +func wasmCompatMin32bits(v1, v2 uint32) uint64 { + return uint64(math.Float32bits(moremath.WasmCompatMin32( + math.Float32frombits(v1), + math.Float32frombits(v2), + ))) +} + +func addFloat32bits(v1, v2 uint32) uint64 { + return uint64(math.Float32bits(math.Float32frombits(v1) + math.Float32frombits(v2))) +} + +func subFloat32bits(v1, v2 uint32) uint64 { + return uint64(math.Float32bits(math.Float32frombits(v1) - math.Float32frombits(v2))) +} + +func mulFloat32bits(v1, v2 uint32) uint64 { + return uint64(math.Float32bits(math.Float32frombits(v1) * math.Float32frombits(v2))) +} + +func divFloat32bits(v1, v2 uint32) uint64 { + return uint64(math.Float32bits(math.Float32frombits(v1) / math.Float32frombits(v2))) +} + +// https://www.w3.org/TR/2022/WD-wasm-core-2-20220419/exec/numerics.html#xref-exec-numerics-op-flt-mathrm-flt-n-z-1-z-2 +func flt32(z1, z2 float32) bool { + if z1 != z1 || z2 != z2 { + return false + } else if z1 == z2 { + return false + } else if math.IsInf(float64(z1), 1) { + return false + } else if math.IsInf(float64(z1), -1) { + return 
true + } else if math.IsInf(float64(z2), 1) { + return true + } else if math.IsInf(float64(z2), -1) { + return false + } + return z1 < z2 +} + +// https://www.w3.org/TR/2022/WD-wasm-core-2-20220419/exec/numerics.html#xref-exec-numerics-op-flt-mathrm-flt-n-z-1-z-2 +func flt64(z1, z2 float64) bool { + if z1 != z1 || z2 != z2 { + return false + } else if z1 == z2 { + return false + } else if math.IsInf(z1, 1) { + return false + } else if math.IsInf(z1, -1) { + return true + } else if math.IsInf(z2, 1) { + return true + } else if math.IsInf(z2, -1) { + return false + } + return z1 < z2 +} + +func i8RoundingAverage(v1, v2 byte) byte { + // https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#lane-wise-integer-rounding-average + return byte((uint16(v1) + uint16(v2) + uint16(1)) / 2) +} + +func i16RoundingAverage(v1, v2 uint16) uint16 { + // https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#lane-wise-integer-rounding-average + return uint16((uint32(v1) + uint32(v2) + 1) / 2) +} + +func i8Abs(v byte) byte { + if i := int8(v); i < 0 { + return byte(-i) + } else { + return byte(i) + } +} + +func i8MaxU(v1, v2 byte) byte { + if v1 < v2 { + return v2 + } else { + return v1 + } +} + +func i8MinU(v1, v2 byte) byte { + if v1 > v2 { + return v2 + } else { + return v1 + } +} + +func i8MaxS(v1, v2 byte) byte { + if int8(v1) < int8(v2) { + return v2 + } else { + return v1 + } +} + +func i8MinS(v1, v2 byte) byte { + if int8(v1) > int8(v2) { + return v2 + } else { + return v1 + } +} + +func i16MaxU(v1, v2 uint16) uint16 { + if v1 < v2 { + return v2 + } else { + return v1 + } +} + +func i16MinU(v1, v2 uint16) uint16 { + if v1 > v2 { + return v2 + } else { + return v1 + } +} + +func i16MaxS(v1, v2 uint16) uint16 { + if int16(v1) < int16(v2) { + return v2 + } else { + return v1 + } +} + +func i16MinS(v1, v2 uint16) uint16 { + if int16(v1) > int16(v2) { + return v2 + } else { + return v1 + } +} + +func i32MaxU(v1, v2 uint32) uint32 { + if v1 < v2 { + return v2 + } else { + return v1 + } +} + +func i32MinU(v1, v2 uint32) uint32 { + if v1 > v2 { + return v2 + } else { + return v1 + } +} + +func i32MaxS(v1, v2 uint32) uint32 { + if int32(v1) < int32(v2) { + return v2 + } else { + return v1 + } +} + +func i32MinS(v1, v2 uint32) uint32 { + if int32(v1) > int32(v2) { + return v2 + } else { + return v1 + } +} + +func i16Abs(v uint16) uint16 { + if i := int16(v); i < 0 { + return uint16(-i) + } else { + return uint16(i) + } +} + +func i32Abs(v uint32) uint32 { + if i := int32(v); i < 0 { + return uint32(-i) + } else { + return uint32(i) + } +} + +func (ce *callEngine) callNativeFuncWithListener(ctx context.Context, m *wasm.ModuleInstance, f *function, fnl experimental.FunctionListener) context.Context { + def, typ := f.definition(), f.funcType + + ce.stackIterator.reset(ce.stack, ce.frames, f) + fnl.Before(ctx, m, def, ce.peekValues(typ.ParamNumInUint64), &ce.stackIterator) + ce.stackIterator.clear() + ce.callNativeFunc(ctx, m, f) + fnl.After(ctx, m, def, ce.peekValues(typ.ResultNumInUint64)) + return ctx +} + +// popMemoryOffset takes a memory offset off the stack for use in load and store instructions. +// As the top of stack value is 64-bit, this ensures it is in range before returning it. 
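+// A small worked example of this effective-address check (the concrete values are
+// illustrative, not taken from any particular module): with op.U2 = 0x2000 (the static
+// memarg offset) and a dynamic address of 0xfffff000 popped from the stack, the sum is
+// 0x1_0000_1000, which exceeds math.MaxUint32, so the access traps with
+// wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess before any memory read or write happens.
+// Smaller sums are returned as a uint32 and then bounds-checked against the memory length
+// by the instruction that consumes the offset.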
+func (ce *callEngine) popMemoryOffset(op *unionOperation) uint32 { + offset := op.U2 + ce.popValue() + if offset > math.MaxUint32 { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + return uint32(offset) +} + +func (ce *callEngine) callGoFuncWithStack(ctx context.Context, m *wasm.ModuleInstance, f *function) { + typ := f.funcType + paramLen := typ.ParamNumInUint64 + resultLen := typ.ResultNumInUint64 + stackLen := paramLen + + // In the interpreter engine, ce.stack may only have capacity to store + // parameters. Grow when there are more results than parameters. + if growLen := resultLen - paramLen; growLen > 0 { + for i := 0; i < growLen; i++ { + ce.stack = append(ce.stack, 0) + } + stackLen += growLen + } + + // Pass the stack elements to the go function. + stack := ce.stack[len(ce.stack)-stackLen:] + ce.callGoFunc(ctx, m, f, stack) + + // Shrink the stack when there were more parameters than results. + if shrinkLen := paramLen - resultLen; shrinkLen > 0 { + ce.stack = ce.stack[0 : len(ce.stack)-shrinkLen] + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/operations.go b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/operations.go new file mode 100644 index 000000000..3087a718f --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/operations.go @@ -0,0 +1,2812 @@ +package interpreter + +import ( + "fmt" + "math" + "strings" +) + +// unsignedInt represents unsigned 32-bit or 64-bit integers. +type unsignedInt byte + +const ( + unsignedInt32 unsignedInt = iota + unsignedInt64 +) + +// String implements fmt.Stringer. +func (s unsignedInt) String() (ret string) { + switch s { + case unsignedInt32: + ret = "i32" + case unsignedInt64: + ret = "i64" + } + return +} + +// signedInt represents signed or unsigned integers. +type signedInt byte + +const ( + signedInt32 signedInt = iota + signedInt64 + signedUint32 + signedUint64 +) + +// String implements fmt.Stringer. +func (s signedInt) String() (ret string) { + switch s { + case signedUint32: + ret = "u32" + case signedUint64: + ret = "u64" + case signedInt32: + ret = "s32" + case signedInt64: + ret = "s64" + } + return +} + +// float represents the scalar double or single precision floating points. +type float byte + +const ( + f32 float = iota + f64 +) + +// String implements fmt.Stringer. +func (s float) String() (ret string) { + switch s { + case f32: + ret = "f32" + case f64: + ret = "f64" + } + return +} + +// unsignedType is the union of unsignedInt, float and V128 vector type. +type unsignedType byte + +const ( + unsignedTypeI32 unsignedType = iota + unsignedTypeI64 + unsignedTypeF32 + unsignedTypeF64 + unsignedTypeV128 + unsignedTypeUnknown +) + +// String implements fmt.Stringer. +func (s unsignedType) String() (ret string) { + switch s { + case unsignedTypeI32: + ret = "i32" + case unsignedTypeI64: + ret = "i64" + case unsignedTypeF32: + ret = "f32" + case unsignedTypeF64: + ret = "f64" + case unsignedTypeV128: + ret = "v128" + case unsignedTypeUnknown: + ret = "unknown" + } + return +} + +// signedType is the union of signedInt and float types. +type signedType byte + +const ( + signedTypeInt32 signedType = iota + signedTypeUint32 + signedTypeInt64 + signedTypeUint64 + signedTypeFloat32 + signedTypeFloat64 +) + +// String implements fmt.Stringer. 
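+// As a point of reference for how these enums are used by the constructors later in this
+// file: sign-agnostic instructions such as i32.add lower with the unsignedType/unsignedInt
+// variants (e.g. unsignedTypeI32), while sign-sensitive ones such as i32.div_s and
+// i32.div_u lower with signedType values (signedTypeInt32 and signedTypeUint32
+// respectively), since the engines must know which division semantics to apply.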
+func (s signedType) String() (ret string) { + switch s { + case signedTypeInt32: + ret = "s32" + case signedTypeUint32: + ret = "u32" + case signedTypeInt64: + ret = "s64" + case signedTypeUint64: + ret = "u64" + case signedTypeFloat32: + ret = "f32" + case signedTypeFloat64: + ret = "f64" + } + return +} + +// operationKind is the Kind of each implementation of Operation interface. +type operationKind uint16 + +// String implements fmt.Stringer. +func (o operationKind) String() (ret string) { + switch o { + case operationKindUnreachable: + ret = "Unreachable" + case operationKindLabel: + ret = "label" + case operationKindBr: + ret = "Br" + case operationKindBrIf: + ret = "BrIf" + case operationKindBrTable: + ret = "BrTable" + case operationKindCall: + ret = "Call" + case operationKindCallIndirect: + ret = "CallIndirect" + case operationKindDrop: + ret = "Drop" + case operationKindSelect: + ret = "Select" + case operationKindPick: + ret = "Pick" + case operationKindSet: + ret = "Swap" + case operationKindGlobalGet: + ret = "GlobalGet" + case operationKindGlobalSet: + ret = "GlobalSet" + case operationKindLoad: + ret = "Load" + case operationKindLoad8: + ret = "Load8" + case operationKindLoad16: + ret = "Load16" + case operationKindLoad32: + ret = "Load32" + case operationKindStore: + ret = "Store" + case operationKindStore8: + ret = "Store8" + case operationKindStore16: + ret = "Store16" + case operationKindStore32: + ret = "Store32" + case operationKindMemorySize: + ret = "MemorySize" + case operationKindMemoryGrow: + ret = "MemoryGrow" + case operationKindConstI32: + ret = "ConstI32" + case operationKindConstI64: + ret = "ConstI64" + case operationKindConstF32: + ret = "ConstF32" + case operationKindConstF64: + ret = "ConstF64" + case operationKindEq: + ret = "Eq" + case operationKindNe: + ret = "Ne" + case operationKindEqz: + ret = "Eqz" + case operationKindLt: + ret = "Lt" + case operationKindGt: + ret = "Gt" + case operationKindLe: + ret = "Le" + case operationKindGe: + ret = "Ge" + case operationKindAdd: + ret = "Add" + case operationKindSub: + ret = "Sub" + case operationKindMul: + ret = "Mul" + case operationKindClz: + ret = "Clz" + case operationKindCtz: + ret = "Ctz" + case operationKindPopcnt: + ret = "Popcnt" + case operationKindDiv: + ret = "Div" + case operationKindRem: + ret = "Rem" + case operationKindAnd: + ret = "And" + case operationKindOr: + ret = "Or" + case operationKindXor: + ret = "Xor" + case operationKindShl: + ret = "Shl" + case operationKindShr: + ret = "Shr" + case operationKindRotl: + ret = "Rotl" + case operationKindRotr: + ret = "Rotr" + case operationKindAbs: + ret = "Abs" + case operationKindNeg: + ret = "Neg" + case operationKindCeil: + ret = "Ceil" + case operationKindFloor: + ret = "Floor" + case operationKindTrunc: + ret = "Trunc" + case operationKindNearest: + ret = "Nearest" + case operationKindSqrt: + ret = "Sqrt" + case operationKindMin: + ret = "Min" + case operationKindMax: + ret = "Max" + case operationKindCopysign: + ret = "Copysign" + case operationKindI32WrapFromI64: + ret = "I32WrapFromI64" + case operationKindITruncFromF: + ret = "ITruncFromF" + case operationKindFConvertFromI: + ret = "FConvertFromI" + case operationKindF32DemoteFromF64: + ret = "F32DemoteFromF64" + case operationKindF64PromoteFromF32: + ret = "F64PromoteFromF32" + case operationKindI32ReinterpretFromF32: + ret = "I32ReinterpretFromF32" + case operationKindI64ReinterpretFromF64: + ret = "I64ReinterpretFromF64" + case operationKindF32ReinterpretFromI32: + ret = 
"F32ReinterpretFromI32" + case operationKindF64ReinterpretFromI64: + ret = "F64ReinterpretFromI64" + case operationKindExtend: + ret = "Extend" + case operationKindMemoryInit: + ret = "MemoryInit" + case operationKindDataDrop: + ret = "DataDrop" + case operationKindMemoryCopy: + ret = "MemoryCopy" + case operationKindMemoryFill: + ret = "MemoryFill" + case operationKindTableInit: + ret = "TableInit" + case operationKindElemDrop: + ret = "ElemDrop" + case operationKindTableCopy: + ret = "TableCopy" + case operationKindRefFunc: + ret = "RefFunc" + case operationKindTableGet: + ret = "TableGet" + case operationKindTableSet: + ret = "TableSet" + case operationKindTableSize: + ret = "TableSize" + case operationKindTableGrow: + ret = "TableGrow" + case operationKindTableFill: + ret = "TableFill" + case operationKindV128Const: + ret = "ConstV128" + case operationKindV128Add: + ret = "V128Add" + case operationKindV128Sub: + ret = "V128Sub" + case operationKindV128Load: + ret = "V128Load" + case operationKindV128LoadLane: + ret = "V128LoadLane" + case operationKindV128Store: + ret = "V128Store" + case operationKindV128StoreLane: + ret = "V128StoreLane" + case operationKindV128ExtractLane: + ret = "V128ExtractLane" + case operationKindV128ReplaceLane: + ret = "V128ReplaceLane" + case operationKindV128Splat: + ret = "V128Splat" + case operationKindV128Shuffle: + ret = "V128Shuffle" + case operationKindV128Swizzle: + ret = "V128Swizzle" + case operationKindV128AnyTrue: + ret = "V128AnyTrue" + case operationKindV128AllTrue: + ret = "V128AllTrue" + case operationKindV128And: + ret = "V128And" + case operationKindV128Not: + ret = "V128Not" + case operationKindV128Or: + ret = "V128Or" + case operationKindV128Xor: + ret = "V128Xor" + case operationKindV128Bitselect: + ret = "V128Bitselect" + case operationKindV128AndNot: + ret = "V128AndNot" + case operationKindV128BitMask: + ret = "V128BitMask" + case operationKindV128Shl: + ret = "V128Shl" + case operationKindV128Shr: + ret = "V128Shr" + case operationKindV128Cmp: + ret = "V128Cmp" + case operationKindSignExtend32From8: + ret = "SignExtend32From8" + case operationKindSignExtend32From16: + ret = "SignExtend32From16" + case operationKindSignExtend64From8: + ret = "SignExtend64From8" + case operationKindSignExtend64From16: + ret = "SignExtend64From16" + case operationKindSignExtend64From32: + ret = "SignExtend64From32" + case operationKindV128AddSat: + ret = "V128AddSat" + case operationKindV128SubSat: + ret = "V128SubSat" + case operationKindV128Mul: + ret = "V128Mul" + case operationKindV128Div: + ret = "V128Div" + case operationKindV128Neg: + ret = "V128Neg" + case operationKindV128Sqrt: + ret = "V128Sqrt" + case operationKindV128Abs: + ret = "V128Abs" + case operationKindV128Popcnt: + ret = "V128Popcnt" + case operationKindV128Min: + ret = "V128Min" + case operationKindV128Max: + ret = "V128Max" + case operationKindV128AvgrU: + ret = "V128AvgrU" + case operationKindV128Ceil: + ret = "V128Ceil" + case operationKindV128Floor: + ret = "V128Floor" + case operationKindV128Trunc: + ret = "V128Trunc" + case operationKindV128Nearest: + ret = "V128Nearest" + case operationKindV128Pmin: + ret = "V128Pmin" + case operationKindV128Pmax: + ret = "V128Pmax" + case operationKindV128Extend: + ret = "V128Extend" + case operationKindV128ExtMul: + ret = "V128ExtMul" + case operationKindV128Q15mulrSatS: + ret = "V128Q15mulrSatS" + case operationKindV128ExtAddPairwise: + ret = "V128ExtAddPairwise" + case operationKindV128FloatPromote: + ret = "V128FloatPromote" + case 
operationKindV128FloatDemote: + ret = "V128FloatDemote" + case operationKindV128FConvertFromI: + ret = "V128FConvertFromI" + case operationKindV128Dot: + ret = "V128Dot" + case operationKindV128Narrow: + ret = "V128Narrow" + case operationKindV128ITruncSatFromF: + ret = "V128ITruncSatFromF" + case operationKindBuiltinFunctionCheckExitCode: + ret = "BuiltinFunctionCheckExitCode" + case operationKindAtomicMemoryWait: + ret = "operationKindAtomicMemoryWait" + case operationKindAtomicMemoryNotify: + ret = "operationKindAtomicMemoryNotify" + case operationKindAtomicFence: + ret = "operationKindAtomicFence" + case operationKindAtomicLoad: + ret = "operationKindAtomicLoad" + case operationKindAtomicLoad8: + ret = "operationKindAtomicLoad8" + case operationKindAtomicLoad16: + ret = "operationKindAtomicLoad16" + case operationKindAtomicStore: + ret = "operationKindAtomicStore" + case operationKindAtomicStore8: + ret = "operationKindAtomicStore8" + case operationKindAtomicStore16: + ret = "operationKindAtomicStore16" + case operationKindAtomicRMW: + ret = "operationKindAtomicRMW" + case operationKindAtomicRMW8: + ret = "operationKindAtomicRMW8" + case operationKindAtomicRMW16: + ret = "operationKindAtomicRMW16" + case operationKindAtomicRMWCmpxchg: + ret = "operationKindAtomicRMWCmpxchg" + case operationKindAtomicRMW8Cmpxchg: + ret = "operationKindAtomicRMW8Cmpxchg" + case operationKindAtomicRMW16Cmpxchg: + ret = "operationKindAtomicRMW16Cmpxchg" + default: + panic(fmt.Errorf("unknown operation %d", o)) + } + return +} + +const ( + // operationKindUnreachable is the Kind for NewOperationUnreachable. + operationKindUnreachable operationKind = iota + // operationKindLabel is the Kind for NewOperationLabel. + operationKindLabel + // operationKindBr is the Kind for NewOperationBr. + operationKindBr + // operationKindBrIf is the Kind for NewOperationBrIf. + operationKindBrIf + // operationKindBrTable is the Kind for NewOperationBrTable. + operationKindBrTable + // operationKindCall is the Kind for NewOperationCall. + operationKindCall + // operationKindCallIndirect is the Kind for NewOperationCallIndirect. + operationKindCallIndirect + // operationKindDrop is the Kind for NewOperationDrop. + operationKindDrop + // operationKindSelect is the Kind for NewOperationSelect. + operationKindSelect + // operationKindPick is the Kind for NewOperationPick. + operationKindPick + // operationKindSet is the Kind for NewOperationSet. + operationKindSet + // operationKindGlobalGet is the Kind for NewOperationGlobalGet. + operationKindGlobalGet + // operationKindGlobalSet is the Kind for NewOperationGlobalSet. + operationKindGlobalSet + // operationKindLoad is the Kind for NewOperationLoad. + operationKindLoad + // operationKindLoad8 is the Kind for NewOperationLoad8. + operationKindLoad8 + // operationKindLoad16 is the Kind for NewOperationLoad16. + operationKindLoad16 + // operationKindLoad32 is the Kind for NewOperationLoad32. + operationKindLoad32 + // operationKindStore is the Kind for NewOperationStore. + operationKindStore + // operationKindStore8 is the Kind for NewOperationStore8. + operationKindStore8 + // operationKindStore16 is the Kind for NewOperationStore16. + operationKindStore16 + // operationKindStore32 is the Kind for NewOperationStore32. + operationKindStore32 + // operationKindMemorySize is the Kind for NewOperationMemorySize. + operationKindMemorySize + // operationKindMemoryGrow is the Kind for NewOperationMemoryGrow. 
+ operationKindMemoryGrow + // operationKindConstI32 is the Kind for NewOperationConstI32. + operationKindConstI32 + // operationKindConstI64 is the Kind for NewOperationConstI64. + operationKindConstI64 + // operationKindConstF32 is the Kind for NewOperationConstF32. + operationKindConstF32 + // operationKindConstF64 is the Kind for NewOperationConstF64. + operationKindConstF64 + // operationKindEq is the Kind for NewOperationEq. + operationKindEq + // operationKindNe is the Kind for NewOperationNe. + operationKindNe + // operationKindEqz is the Kind for NewOperationEqz. + operationKindEqz + // operationKindLt is the Kind for NewOperationLt. + operationKindLt + // operationKindGt is the Kind for NewOperationGt. + operationKindGt + // operationKindLe is the Kind for NewOperationLe. + operationKindLe + // operationKindGe is the Kind for NewOperationGe. + operationKindGe + // operationKindAdd is the Kind for NewOperationAdd. + operationKindAdd + // operationKindSub is the Kind for NewOperationSub. + operationKindSub + // operationKindMul is the Kind for NewOperationMul. + operationKindMul + // operationKindClz is the Kind for NewOperationClz. + operationKindClz + // operationKindCtz is the Kind for NewOperationCtz. + operationKindCtz + // operationKindPopcnt is the Kind for NewOperationPopcnt. + operationKindPopcnt + // operationKindDiv is the Kind for NewOperationDiv. + operationKindDiv + // operationKindRem is the Kind for NewOperationRem. + operationKindRem + // operationKindAnd is the Kind for NewOperationAnd. + operationKindAnd + // operationKindOr is the Kind for NewOperationOr. + operationKindOr + // operationKindXor is the Kind for NewOperationXor. + operationKindXor + // operationKindShl is the Kind for NewOperationShl. + operationKindShl + // operationKindShr is the Kind for NewOperationShr. + operationKindShr + // operationKindRotl is the Kind for NewOperationRotl. + operationKindRotl + // operationKindRotr is the Kind for NewOperationRotr. + operationKindRotr + // operationKindAbs is the Kind for NewOperationAbs. + operationKindAbs + // operationKindNeg is the Kind for NewOperationNeg. + operationKindNeg + // operationKindCeil is the Kind for NewOperationCeil. + operationKindCeil + // operationKindFloor is the Kind for NewOperationFloor. + operationKindFloor + // operationKindTrunc is the Kind for NewOperationTrunc. + operationKindTrunc + // operationKindNearest is the Kind for NewOperationNearest. + operationKindNearest + // operationKindSqrt is the Kind for NewOperationSqrt. + operationKindSqrt + // operationKindMin is the Kind for NewOperationMin. + operationKindMin + // operationKindMax is the Kind for NewOperationMax. + operationKindMax + // operationKindCopysign is the Kind for NewOperationCopysign. + operationKindCopysign + // operationKindI32WrapFromI64 is the Kind for NewOperationI32WrapFromI64. + operationKindI32WrapFromI64 + // operationKindITruncFromF is the Kind for NewOperationITruncFromF. + operationKindITruncFromF + // operationKindFConvertFromI is the Kind for NewOperationFConvertFromI. + operationKindFConvertFromI + // operationKindF32DemoteFromF64 is the Kind for NewOperationF32DemoteFromF64. + operationKindF32DemoteFromF64 + // operationKindF64PromoteFromF32 is the Kind for NewOperationF64PromoteFromF32. + operationKindF64PromoteFromF32 + // operationKindI32ReinterpretFromF32 is the Kind for NewOperationI32ReinterpretFromF32. + operationKindI32ReinterpretFromF32 + // operationKindI64ReinterpretFromF64 is the Kind for NewOperationI64ReinterpretFromF64. 
+ operationKindI64ReinterpretFromF64 + // operationKindF32ReinterpretFromI32 is the Kind for NewOperationF32ReinterpretFromI32. + operationKindF32ReinterpretFromI32 + // operationKindF64ReinterpretFromI64 is the Kind for NewOperationF64ReinterpretFromI64. + operationKindF64ReinterpretFromI64 + // operationKindExtend is the Kind for NewOperationExtend. + operationKindExtend + // operationKindSignExtend32From8 is the Kind for NewOperationSignExtend32From8. + operationKindSignExtend32From8 + // operationKindSignExtend32From16 is the Kind for NewOperationSignExtend32From16. + operationKindSignExtend32From16 + // operationKindSignExtend64From8 is the Kind for NewOperationSignExtend64From8. + operationKindSignExtend64From8 + // operationKindSignExtend64From16 is the Kind for NewOperationSignExtend64From16. + operationKindSignExtend64From16 + // operationKindSignExtend64From32 is the Kind for NewOperationSignExtend64From32. + operationKindSignExtend64From32 + // operationKindMemoryInit is the Kind for NewOperationMemoryInit. + operationKindMemoryInit + // operationKindDataDrop is the Kind for NewOperationDataDrop. + operationKindDataDrop + // operationKindMemoryCopy is the Kind for NewOperationMemoryCopy. + operationKindMemoryCopy + // operationKindMemoryFill is the Kind for NewOperationMemoryFill. + operationKindMemoryFill + // operationKindTableInit is the Kind for NewOperationTableInit. + operationKindTableInit + // operationKindElemDrop is the Kind for NewOperationElemDrop. + operationKindElemDrop + // operationKindTableCopy is the Kind for NewOperationTableCopy. + operationKindTableCopy + // operationKindRefFunc is the Kind for NewOperationRefFunc. + operationKindRefFunc + // operationKindTableGet is the Kind for NewOperationTableGet. + operationKindTableGet + // operationKindTableSet is the Kind for NewOperationTableSet. + operationKindTableSet + // operationKindTableSize is the Kind for NewOperationTableSize. + operationKindTableSize + // operationKindTableGrow is the Kind for NewOperationTableGrow. + operationKindTableGrow + // operationKindTableFill is the Kind for NewOperationTableFill. + operationKindTableFill + + // Vector value related instructions are prefixed by V128. + + // operationKindV128Const is the Kind for NewOperationV128Const. + operationKindV128Const + // operationKindV128Add is the Kind for NewOperationV128Add. + operationKindV128Add + // operationKindV128Sub is the Kind for NewOperationV128Sub. + operationKindV128Sub + // operationKindV128Load is the Kind for NewOperationV128Load. + operationKindV128Load + // operationKindV128LoadLane is the Kind for NewOperationV128LoadLane. + operationKindV128LoadLane + // operationKindV128Store is the Kind for NewOperationV128Store. + operationKindV128Store + // operationKindV128StoreLane is the Kind for NewOperationV128StoreLane. + operationKindV128StoreLane + // operationKindV128ExtractLane is the Kind for NewOperationV128ExtractLane. + operationKindV128ExtractLane + // operationKindV128ReplaceLane is the Kind for NewOperationV128ReplaceLane. + operationKindV128ReplaceLane + // operationKindV128Splat is the Kind for NewOperationV128Splat. + operationKindV128Splat + // operationKindV128Shuffle is the Kind for NewOperationV128Shuffle. + operationKindV128Shuffle + // operationKindV128Swizzle is the Kind for NewOperationV128Swizzle. + operationKindV128Swizzle + // operationKindV128AnyTrue is the Kind for NewOperationV128AnyTrue. + operationKindV128AnyTrue + // operationKindV128AllTrue is the Kind for NewOperationV128AllTrue. 
+ operationKindV128AllTrue + // operationKindV128BitMask is the Kind for NewOperationV128BitMask. + operationKindV128BitMask + // operationKindV128And is the Kind for NewOperationV128And. + operationKindV128And + // operationKindV128Not is the Kind for NewOperationV128Not. + operationKindV128Not + // operationKindV128Or is the Kind for NewOperationV128Or. + operationKindV128Or + // operationKindV128Xor is the Kind for NewOperationV128Xor. + operationKindV128Xor + // operationKindV128Bitselect is the Kind for NewOperationV128Bitselect. + operationKindV128Bitselect + // operationKindV128AndNot is the Kind for NewOperationV128AndNot. + operationKindV128AndNot + // operationKindV128Shl is the Kind for NewOperationV128Shl. + operationKindV128Shl + // operationKindV128Shr is the Kind for NewOperationV128Shr. + operationKindV128Shr + // operationKindV128Cmp is the Kind for NewOperationV128Cmp. + operationKindV128Cmp + // operationKindV128AddSat is the Kind for NewOperationV128AddSat. + operationKindV128AddSat + // operationKindV128SubSat is the Kind for NewOperationV128SubSat. + operationKindV128SubSat + // operationKindV128Mul is the Kind for NewOperationV128Mul. + operationKindV128Mul + // operationKindV128Div is the Kind for NewOperationV128Div. + operationKindV128Div + // operationKindV128Neg is the Kind for NewOperationV128Neg. + operationKindV128Neg + // operationKindV128Sqrt is the Kind for NewOperationV128Sqrt. + operationKindV128Sqrt + // operationKindV128Abs is the Kind for NewOperationV128Abs. + operationKindV128Abs + // operationKindV128Popcnt is the Kind for NewOperationV128Popcnt. + operationKindV128Popcnt + // operationKindV128Min is the Kind for NewOperationV128Min. + operationKindV128Min + // operationKindV128Max is the Kind for NewOperationV128Max. + operationKindV128Max + // operationKindV128AvgrU is the Kind for NewOperationV128AvgrU. + operationKindV128AvgrU + // operationKindV128Pmin is the Kind for NewOperationV128Pmin. + operationKindV128Pmin + // operationKindV128Pmax is the Kind for NewOperationV128Pmax. + operationKindV128Pmax + // operationKindV128Ceil is the Kind for NewOperationV128Ceil. + operationKindV128Ceil + // operationKindV128Floor is the Kind for NewOperationV128Floor. + operationKindV128Floor + // operationKindV128Trunc is the Kind for NewOperationV128Trunc. + operationKindV128Trunc + // operationKindV128Nearest is the Kind for NewOperationV128Nearest. + operationKindV128Nearest + // operationKindV128Extend is the Kind for NewOperationV128Extend. + operationKindV128Extend + // operationKindV128ExtMul is the Kind for NewOperationV128ExtMul. + operationKindV128ExtMul + // operationKindV128Q15mulrSatS is the Kind for NewOperationV128Q15mulrSatS. + operationKindV128Q15mulrSatS + // operationKindV128ExtAddPairwise is the Kind for NewOperationV128ExtAddPairwise. + operationKindV128ExtAddPairwise + // operationKindV128FloatPromote is the Kind for NewOperationV128FloatPromote. + operationKindV128FloatPromote + // operationKindV128FloatDemote is the Kind for NewOperationV128FloatDemote. + operationKindV128FloatDemote + // operationKindV128FConvertFromI is the Kind for NewOperationV128FConvertFromI. + operationKindV128FConvertFromI + // operationKindV128Dot is the Kind for NewOperationV128Dot. + operationKindV128Dot + // operationKindV128Narrow is the Kind for NewOperationV128Narrow. + operationKindV128Narrow + // operationKindV128ITruncSatFromF is the Kind for NewOperationV128ITruncSatFromF. 
+ operationKindV128ITruncSatFromF + + // operationKindBuiltinFunctionCheckExitCode is the Kind for NewOperationBuiltinFunctionCheckExitCode. + operationKindBuiltinFunctionCheckExitCode + + // operationKindAtomicMemoryWait is the kind for NewOperationAtomicMemoryWait. + operationKindAtomicMemoryWait + // operationKindAtomicMemoryNotify is the kind for NewOperationAtomicMemoryNotify. + operationKindAtomicMemoryNotify + // operationKindAtomicFence is the kind for NewOperationAtomicFence. + operationKindAtomicFence + // operationKindAtomicLoad is the kind for NewOperationAtomicLoad. + operationKindAtomicLoad + // operationKindAtomicLoad8 is the kind for NewOperationAtomicLoad8. + operationKindAtomicLoad8 + // operationKindAtomicLoad16 is the kind for NewOperationAtomicLoad16. + operationKindAtomicLoad16 + // operationKindAtomicStore is the kind for NewOperationAtomicStore. + operationKindAtomicStore + // operationKindAtomicStore8 is the kind for NewOperationAtomicStore8. + operationKindAtomicStore8 + // operationKindAtomicStore16 is the kind for NewOperationAtomicStore16. + operationKindAtomicStore16 + + // operationKindAtomicRMW is the kind for NewOperationAtomicRMW. + operationKindAtomicRMW + // operationKindAtomicRMW8 is the kind for NewOperationAtomicRMW8. + operationKindAtomicRMW8 + // operationKindAtomicRMW16 is the kind for NewOperationAtomicRMW16. + operationKindAtomicRMW16 + + // operationKindAtomicRMWCmpxchg is the kind for NewOperationAtomicRMWCmpxchg. + operationKindAtomicRMWCmpxchg + // operationKindAtomicRMW8Cmpxchg is the kind for NewOperationAtomicRMW8Cmpxchg. + operationKindAtomicRMW8Cmpxchg + // operationKindAtomicRMW16Cmpxchg is the kind for NewOperationAtomicRMW16Cmpxchg. + operationKindAtomicRMW16Cmpxchg + + // operationKindEnd is always placed at the bottom of this iota definition to be used in the test. + operationKindEnd +) + +// NewOperationBuiltinFunctionCheckExitCode is a constructor for unionOperation with Kind operationKindBuiltinFunctionCheckExitCode. +// +// OperationBuiltinFunctionCheckExitCode corresponds to the instruction to check the api.Module is already closed due to +// context.DeadlineExceeded, context.Canceled, or the explicit call of CloseWithExitCode on api.Module. +func newOperationBuiltinFunctionCheckExitCode() unionOperation { + return unionOperation{Kind: operationKindBuiltinFunctionCheckExitCode} +} + +// label is the unique identifier for each block in a single function in interpreterir +// where "block" consists of multiple operations, and must End with branching operations +// (e.g. operationKindBr or operationKindBrIf). +type label uint64 + +// Kind returns the labelKind encoded in this label. +func (l label) Kind() labelKind { + return labelKind(uint32(l)) +} + +// FrameID returns the frame id encoded in this label. +func (l label) FrameID() int { + return int(uint32(l >> 32)) +} + +// NewLabel is a constructor for a label. +func newLabel(kind labelKind, frameID uint32) label { + return label(kind) | label(frameID)<<32 +} + +// String implements fmt.Stringer. +func (l label) String() (ret string) { + frameID := l.FrameID() + switch l.Kind() { + case labelKindHeader: + ret = fmt.Sprintf(".L%d", frameID) + case labelKindElse: + ret = fmt.Sprintf(".L%d_else", frameID) + case labelKindContinuation: + ret = fmt.Sprintf(".L%d_cont", frameID) + case labelKindReturn: + return ".return" + } + return +} + +func (l label) IsReturnTarget() bool { + return l.Kind() == labelKindReturn +} + +// labelKind is the Kind of the label. 
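+// For example, assuming the labelKind iota order declared below (header=0, else=1,
+// continuation=2, return=3), newLabel(labelKindContinuation, 3) encodes to
+// 0x0000_0003_0000_0002: Kind() recovers labelKindContinuation from the low 32 bits,
+// FrameID() recovers frame id 3 from the high 32 bits, and String() renders ".L3_cont".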
+type labelKind = byte + +const ( + // labelKindHeader is the header for various blocks. For example, the "then" block of + // wasm.OpcodeIfName in Wasm has the label of this Kind. + labelKindHeader labelKind = iota + // labelKindElse is the Kind of label for "else" block of wasm.OpcodeIfName in Wasm. + labelKindElse + // labelKindContinuation is the Kind of label which is the continuation of blocks. + // For example, for wasm text like + // (func + // .... + // (if (local.get 0) (then (nop)) (else (nop))) + // return + // ) + // we have the continuation block (of if-block) corresponding to "return" opcode. + labelKindContinuation + labelKindReturn + labelKindNum +) + +// unionOperation implements Operation and is the compilation (engine.lowerIR) result of a interpreterir.Operation. +// +// Not all operations result in a unionOperation, e.g. interpreterir.OperationI32ReinterpretFromF32, and some operations are +// more complex than others, e.g. interpreterir.NewOperationBrTable. +// +// Note: This is a form of union type as it can store fields needed for any operation. Hence, most fields are opaque and +// only relevant when in context of its kind. +type unionOperation struct { + // Kind determines how to interpret the other fields in this struct. + Kind operationKind + B1, B2 byte + B3 bool + U1, U2 uint64 + U3 uint64 + Us []uint64 +} + +// String implements fmt.Stringer. +func (o unionOperation) String() string { + switch o.Kind { + case operationKindUnreachable, + operationKindSelect, + operationKindMemorySize, + operationKindMemoryGrow, + operationKindI32WrapFromI64, + operationKindF32DemoteFromF64, + operationKindF64PromoteFromF32, + operationKindI32ReinterpretFromF32, + operationKindI64ReinterpretFromF64, + operationKindF32ReinterpretFromI32, + operationKindF64ReinterpretFromI64, + operationKindSignExtend32From8, + operationKindSignExtend32From16, + operationKindSignExtend64From8, + operationKindSignExtend64From16, + operationKindSignExtend64From32, + operationKindMemoryInit, + operationKindDataDrop, + operationKindMemoryCopy, + operationKindMemoryFill, + operationKindTableInit, + operationKindElemDrop, + operationKindTableCopy, + operationKindRefFunc, + operationKindTableGet, + operationKindTableSet, + operationKindTableSize, + operationKindTableGrow, + operationKindTableFill, + operationKindBuiltinFunctionCheckExitCode: + return o.Kind.String() + + case operationKindCall, + operationKindGlobalGet, + operationKindGlobalSet: + return fmt.Sprintf("%s %d", o.Kind, o.B1) + + case operationKindLabel: + return label(o.U1).String() + + case operationKindBr: + return fmt.Sprintf("%s %s", o.Kind, label(o.U1).String()) + + case operationKindBrIf: + thenTarget := label(o.U1) + elseTarget := label(o.U2) + return fmt.Sprintf("%s %s, %s", o.Kind, thenTarget, elseTarget) + + case operationKindBrTable: + var targets []string + var defaultLabel label + if len(o.Us) > 0 { + targets = make([]string, len(o.Us)-1) + for i, t := range o.Us[1:] { + targets[i] = label(t).String() + } + defaultLabel = label(o.Us[0]) + } + return fmt.Sprintf("%s [%s] %s", o.Kind, strings.Join(targets, ","), defaultLabel) + + case operationKindCallIndirect: + return fmt.Sprintf("%s: type=%d, table=%d", o.Kind, o.U1, o.U2) + + case operationKindDrop: + start := int64(o.U1) + end := int64(o.U2) + return fmt.Sprintf("%s %d..%d", o.Kind, start, end) + + case operationKindPick, operationKindSet: + return fmt.Sprintf("%s %d (is_vector=%v)", o.Kind, o.U1, o.B3) + + case operationKindLoad, operationKindStore: + return 
fmt.Sprintf("%s.%s (align=%d, offset=%d)", unsignedType(o.B1), o.Kind, o.U1, o.U2) + + case operationKindLoad8, + operationKindLoad16: + return fmt.Sprintf("%s.%s (align=%d, offset=%d)", signedType(o.B1), o.Kind, o.U1, o.U2) + + case operationKindStore8, + operationKindStore16, + operationKindStore32: + return fmt.Sprintf("%s (align=%d, offset=%d)", o.Kind, o.U1, o.U2) + + case operationKindLoad32: + var t string + if o.B1 == 1 { + t = "i64" + } else { + t = "u64" + } + return fmt.Sprintf("%s.%s (align=%d, offset=%d)", t, o.Kind, o.U1, o.U2) + + case operationKindEq, + operationKindNe, + operationKindAdd, + operationKindSub, + operationKindMul: + return fmt.Sprintf("%s.%s", unsignedType(o.B1), o.Kind) + + case operationKindEqz, + operationKindClz, + operationKindCtz, + operationKindPopcnt, + operationKindAnd, + operationKindOr, + operationKindXor, + operationKindShl, + operationKindRotl, + operationKindRotr: + return fmt.Sprintf("%s.%s", unsignedInt(o.B1), o.Kind) + + case operationKindRem, operationKindShr: + return fmt.Sprintf("%s.%s", signedInt(o.B1), o.Kind) + + case operationKindLt, + operationKindGt, + operationKindLe, + operationKindGe, + operationKindDiv: + return fmt.Sprintf("%s.%s", signedType(o.B1), o.Kind) + + case operationKindAbs, + operationKindNeg, + operationKindCeil, + operationKindFloor, + operationKindTrunc, + operationKindNearest, + operationKindSqrt, + operationKindMin, + operationKindMax, + operationKindCopysign: + return fmt.Sprintf("%s.%s", float(o.B1), o.Kind) + + case operationKindConstI32, + operationKindConstI64: + return fmt.Sprintf("%s %#x", o.Kind, o.U1) + + case operationKindConstF32: + return fmt.Sprintf("%s %f", o.Kind, math.Float32frombits(uint32(o.U1))) + case operationKindConstF64: + return fmt.Sprintf("%s %f", o.Kind, math.Float64frombits(o.U1)) + + case operationKindITruncFromF: + return fmt.Sprintf("%s.%s.%s (non_trapping=%v)", signedInt(o.B2), o.Kind, float(o.B1), o.B3) + case operationKindFConvertFromI: + return fmt.Sprintf("%s.%s.%s", float(o.B2), o.Kind, signedInt(o.B1)) + case operationKindExtend: + var in, out string + if o.B3 { + in = "i32" + out = "i64" + } else { + in = "u32" + out = "u64" + } + return fmt.Sprintf("%s.%s.%s", out, o.Kind, in) + + case operationKindV128Const: + return fmt.Sprintf("%s [%#x, %#x]", o.Kind, o.U1, o.U2) + case operationKindV128Add, + operationKindV128Sub: + return fmt.Sprintf("%s (shape=%s)", o.Kind, shapeName(o.B1)) + case operationKindV128Load, + operationKindV128LoadLane, + operationKindV128Store, + operationKindV128StoreLane, + operationKindV128ExtractLane, + operationKindV128ReplaceLane, + operationKindV128Splat, + operationKindV128Shuffle, + operationKindV128Swizzle, + operationKindV128AnyTrue, + operationKindV128AllTrue, + operationKindV128BitMask, + operationKindV128And, + operationKindV128Not, + operationKindV128Or, + operationKindV128Xor, + operationKindV128Bitselect, + operationKindV128AndNot, + operationKindV128Shl, + operationKindV128Shr, + operationKindV128Cmp, + operationKindV128AddSat, + operationKindV128SubSat, + operationKindV128Mul, + operationKindV128Div, + operationKindV128Neg, + operationKindV128Sqrt, + operationKindV128Abs, + operationKindV128Popcnt, + operationKindV128Min, + operationKindV128Max, + operationKindV128AvgrU, + operationKindV128Pmin, + operationKindV128Pmax, + operationKindV128Ceil, + operationKindV128Floor, + operationKindV128Trunc, + operationKindV128Nearest, + operationKindV128Extend, + operationKindV128ExtMul, + operationKindV128Q15mulrSatS, + 
operationKindV128ExtAddPairwise, + operationKindV128FloatPromote, + operationKindV128FloatDemote, + operationKindV128FConvertFromI, + operationKindV128Dot, + operationKindV128Narrow: + return o.Kind.String() + + case operationKindV128ITruncSatFromF: + if o.B3 { + return fmt.Sprintf("%s.%sS", o.Kind, shapeName(o.B1)) + } else { + return fmt.Sprintf("%s.%sU", o.Kind, shapeName(o.B1)) + } + + case operationKindAtomicMemoryWait, + operationKindAtomicMemoryNotify, + operationKindAtomicFence, + operationKindAtomicLoad, + operationKindAtomicLoad8, + operationKindAtomicLoad16, + operationKindAtomicStore, + operationKindAtomicStore8, + operationKindAtomicStore16, + operationKindAtomicRMW, + operationKindAtomicRMW8, + operationKindAtomicRMW16, + operationKindAtomicRMWCmpxchg, + operationKindAtomicRMW8Cmpxchg, + operationKindAtomicRMW16Cmpxchg: + return o.Kind.String() + + default: + panic(fmt.Sprintf("TODO: %v", o.Kind)) + } +} + +// NewOperationUnreachable is a constructor for unionOperation with operationKindUnreachable +// +// This corresponds to wasm.OpcodeUnreachable. +// +// The engines are expected to exit the execution with wasmruntime.ErrRuntimeUnreachable error. +func newOperationUnreachable() unionOperation { + return unionOperation{Kind: operationKindUnreachable} +} + +// NewOperationLabel is a constructor for unionOperation with operationKindLabel. +// +// This is used to inform the engines of the beginning of a label. +func newOperationLabel(label label) unionOperation { + return unionOperation{Kind: operationKindLabel, U1: uint64(label)} +} + +// NewOperationBr is a constructor for unionOperation with operationKindBr. +// +// The engines are expected to branch into U1 label. +func newOperationBr(target label) unionOperation { + return unionOperation{Kind: operationKindBr, U1: uint64(target)} +} + +// NewOperationBrIf is a constructor for unionOperation with operationKindBrIf. +// +// The engines are expected to pop a value and branch into U1 label if the value equals 1. +// Otherwise, the code branches into U2 label. +func newOperationBrIf(thenTarget, elseTarget label, thenDrop inclusiveRange) unionOperation { + return unionOperation{ + Kind: operationKindBrIf, + U1: uint64(thenTarget), + U2: uint64(elseTarget), + U3: thenDrop.AsU64(), + } +} + +// NewOperationBrTable is a constructor for unionOperation with operationKindBrTable. +// +// This corresponds to wasm.OpcodeBrTableName except that the label +// here means the interpreterir level, not the ones of Wasm. +// +// The engines are expected to do the br_table operation based on the default (Us[len(Us)-1], Us[len(Us)-2]) and +// targets (Us[:len(Us)-1], Rs[:len(Us)-1]). More precisely, this pops a value from the stack (called "index") +// and decides which branch we go into next based on the value. +// +// For example, assume we have operations like {default: L_DEFAULT, targets: [L0, L1, L2]}. +// If "index" >= len(defaults), then branch into the L_DEFAULT label. +// Otherwise, we enter label of targets[index]. +func newOperationBrTable(targetLabelsAndRanges []uint64) unionOperation { + return unionOperation{ + Kind: operationKindBrTable, + Us: targetLabelsAndRanges, + } +} + +// NewOperationCall is a constructor for unionOperation with operationKindCall. +// +// This corresponds to wasm.OpcodeCallName, and engines are expected to +// enter into a function whose index equals OperationCall.FunctionIndex. 
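+// For illustration (the index below is hypothetical): lowering a Wasm `call 3`
+// instruction produces newOperationCall(3), i.e. U1 = 3, where 3 is an index into the
+// module's function index space (imported functions first, then module-defined ones), as
+// defined by the Wasm specification.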
+func newOperationCall(functionIndex uint32) unionOperation { + return unionOperation{Kind: operationKindCall, U1: uint64(functionIndex)} +} + +// NewOperationCallIndirect implements Operation. +// +// This corresponds to wasm.OpcodeCallIndirectName, and engines are expected to +// consume the one value from the top of stack (called "offset"), +// and make a function call against the function whose function address equals +// Tables[OperationCallIndirect.TableIndex][offset]. +// +// Note: This is called indirect function call in the sense that the target function is indirectly +// determined by the current state (top value) of the stack. +// Therefore, two checks are performed at runtime before entering the target function: +// 1) whether "offset" exceeds the length of table Tables[OperationCallIndirect.TableIndex]. +// 2) whether the type of the function table[offset] matches the function type specified by OperationCallIndirect.TypeIndex. +func newOperationCallIndirect(typeIndex, tableIndex uint32) unionOperation { + return unionOperation{Kind: operationKindCallIndirect, U1: uint64(typeIndex), U2: uint64(tableIndex)} +} + +// inclusiveRange is the range which spans across the value stack starting from the top to the bottom, and +// both boundary are included in the range. +type inclusiveRange struct { + Start, End int32 +} + +// AsU64 is be used to convert inclusiveRange to uint64 so that it can be stored in unionOperation. +func (i inclusiveRange) AsU64() uint64 { + return uint64(uint32(i.Start))<<32 | uint64(uint32(i.End)) +} + +// inclusiveRangeFromU64 retrieves inclusiveRange from the given uint64 which is stored in unionOperation. +func inclusiveRangeFromU64(v uint64) inclusiveRange { + return inclusiveRange{ + Start: int32(uint32(v >> 32)), + End: int32(uint32(v)), + } +} + +// nopinclusiveRange is inclusiveRange which corresponds to no-operation. +var nopinclusiveRange = inclusiveRange{Start: -1, End: -1} + +// NewOperationDrop is a constructor for unionOperation with operationKindDrop. +// +// The engines are expected to discard the values selected by NewOperationDrop.Depth which +// starts from the top of the stack to the bottom. +// +// depth spans across the uint64 value stack at runtime to be dropped by this operation. +func newOperationDrop(depth inclusiveRange) unionOperation { + return unionOperation{Kind: operationKindDrop, U1: depth.AsU64()} +} + +// NewOperationSelect is a constructor for unionOperation with operationKindSelect. +// +// This corresponds to wasm.OpcodeSelect. +// +// The engines are expected to pop three values, say [..., x2, x1, c], then if the value "c" equals zero, +// "x1" is pushed back onto the stack and, otherwise "x2" is pushed back. +// +// isTargetVector true if the selection target value's type is wasm.ValueTypeV128. +func newOperationSelect(isTargetVector bool) unionOperation { + return unionOperation{Kind: operationKindSelect, B3: isTargetVector} +} + +// NewOperationPick is a constructor for unionOperation with operationKindPick. +// +// The engines are expected to copy a value pointed by depth, and push the +// copied value onto the top of the stack. +// +// depth is the location of the pick target in the uint64 value stack at runtime. +// If isTargetVector=true, this points to the location of the lower 64-bits of the vector. 
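+// For example (illustrative stack contents), assuming depth is counted down from the top
+// of the value stack: with a uint64 stack [..., x2, x1, x0] where x0 is the top,
+// newOperationPick(1, false) copies x1 and leaves [..., x2, x1, x0, x1]. When
+// isTargetVector=true, depth addresses the lower 64 bits of the v128 value so that both
+// 64-bit halves can be copied.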
+func newOperationPick(depth int, isTargetVector bool) unionOperation { + return unionOperation{Kind: operationKindPick, U1: uint64(depth), B3: isTargetVector} +} + +// NewOperationSet is a constructor for unionOperation with operationKindSet. +// +// The engines are expected to set the top value of the stack to the location specified by +// depth. +// +// depth is the location of the set target in the uint64 value stack at runtime. +// If isTargetVector=true, this points the location of the lower 64-bits of the vector. +func newOperationSet(depth int, isTargetVector bool) unionOperation { + return unionOperation{Kind: operationKindSet, U1: uint64(depth), B3: isTargetVector} +} + +// NewOperationGlobalGet is a constructor for unionOperation with operationKindGlobalGet. +// +// The engines are expected to read the global value specified by OperationGlobalGet.Index, +// and push the copy of the value onto the stack. +// +// See wasm.OpcodeGlobalGet. +func newOperationGlobalGet(index uint32) unionOperation { + return unionOperation{Kind: operationKindGlobalGet, U1: uint64(index)} +} + +// NewOperationGlobalSet is a constructor for unionOperation with operationKindGlobalSet. +// +// The engines are expected to consume the value from the top of the stack, +// and write the value into the global specified by OperationGlobalSet.Index. +// +// See wasm.OpcodeGlobalSet. +func newOperationGlobalSet(index uint32) unionOperation { + return unionOperation{Kind: operationKindGlobalSet, U1: uint64(index)} +} + +// memoryArg is the "memarg" to all memory instructions. +// +// See https://www.w3.org/TR/2019/REC-wasm-core-1-20191205/#memory-instructions%E2%91%A0 +type memoryArg struct { + // Alignment the expected alignment (expressed as the exponent of a power of 2). Default to the natural alignment. + // + // "Natural alignment" is defined here as the smallest power of two that can hold the size of the value type. Ex + // wasm.ValueTypeI64 is encoded in 8 little-endian bytes. 2^3 = 8, so the natural alignment is three. + Alignment uint32 + + // Offset is the address offset added to the instruction's dynamic address operand, yielding a 33-bit effective + // address that is the zero-based index at which the memory is accessed. Default to zero. + Offset uint32 +} + +// NewOperationLoad is a constructor for unionOperation with operationKindLoad. +// +// This corresponds to wasm.OpcodeI32LoadName wasm.OpcodeI64LoadName wasm.OpcodeF32LoadName and wasm.OpcodeF64LoadName. +// +// The engines are expected to check the boundary of memory length, and exit the execution if this exceeds the boundary, +// otherwise load the corresponding value following the semantics of the corresponding WebAssembly instruction. +func newOperationLoad(unsignedType unsignedType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindLoad, B1: byte(unsignedType), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationLoad8 is a constructor for unionOperation with operationKindLoad8. +// +// This corresponds to wasm.OpcodeI32Load8SName wasm.OpcodeI32Load8UName wasm.OpcodeI64Load8SName wasm.OpcodeI64Load8UName. +// +// The engines are expected to check the boundary of memory length, and exit the execution if this exceeds the boundary, +// otherwise load the corresponding value following the semantics of the corresponding WebAssembly instruction. 
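+// For example (the memarg below is illustrative), lowering i32.load8_s with a static
+// offset of 8 could look like
+//
+// newOperationLoad8(signedInt32, memoryArg{Alignment: 0, Offset: 8})
+//
+// and the signedInt parameter tells the engines whether to sign-extend the loaded byte
+// (0xff becomes 0xffff_ffff, i.e. -1) or zero-extend it (0xff becomes 0x0000_00ff, i.e. 255).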
+func newOperationLoad8(signedInt signedInt, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindLoad8, B1: byte(signedInt), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationLoad16 is a constructor for unionOperation with operationKindLoad16. +// +// This corresponds to wasm.OpcodeI32Load16SName wasm.OpcodeI32Load16UName wasm.OpcodeI64Load16SName wasm.OpcodeI64Load16UName. +// +// The engines are expected to check the boundary of memory length, and exit the execution if this exceeds the boundary, +// otherwise load the corresponding value following the semantics of the corresponding WebAssembly instruction. +func newOperationLoad16(signedInt signedInt, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindLoad16, B1: byte(signedInt), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationLoad32 is a constructor for unionOperation with operationKindLoad32. +// +// This corresponds to wasm.OpcodeI64Load32SName wasm.OpcodeI64Load32UName. +// +// The engines are expected to check the boundary of memory length, and exit the execution if this exceeds the boundary, +// otherwise load the corresponding value following the semantics of the corresponding WebAssembly instruction. +func newOperationLoad32(signed bool, arg memoryArg) unionOperation { + sigB := byte(0) + if signed { + sigB = 1 + } + return unionOperation{Kind: operationKindLoad32, B1: sigB, U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationStore is a constructor for unionOperation with operationKindStore. +// +// # This corresponds to wasm.OpcodeI32StoreName wasm.OpcodeI64StoreName wasm.OpcodeF32StoreName wasm.OpcodeF64StoreName +// +// The engines are expected to check the boundary of memory length, and exit the execution if this exceeds the boundary, +// otherwise store the corresponding value following the semantics of the corresponding WebAssembly instruction. +func newOperationStore(unsignedType unsignedType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindStore, B1: byte(unsignedType), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationStore8 is a constructor for unionOperation with operationKindStore8. +// +// # This corresponds to wasm.OpcodeI32Store8Name wasm.OpcodeI64Store8Name +// +// The engines are expected to check the boundary of memory length, and exit the execution if this exceeds the boundary, +// otherwise store the corresponding value following the semantics of the corresponding WebAssembly instruction. +func newOperationStore8(arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindStore8, U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationStore16 is a constructor for unionOperation with operationKindStore16. +// +// # This corresponds to wasm.OpcodeI32Store16Name wasm.OpcodeI64Store16Name +// +// The engines are expected to check the boundary of memory length, and exit the execution if this exceeds the boundary, +// otherwise store the corresponding value following the semantics of the corresponding WebAssembly instruction. +func newOperationStore16(arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindStore16, U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationStore32 is a constructor for unionOperation with operationKindStore32. 
+// +// # This corresponds to wasm.OpcodeI64Store32Name +// +// The engines are expected to check the boundary of memory length, and exit the execution if this exceeds the boundary, +// otherwise store the corresponding value following the semantics of the corresponding WebAssembly instruction. +func newOperationStore32(arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindStore32, U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationMemorySize is a constructor for unionOperation with operationKindMemorySize. +// +// This corresponds to wasm.OpcodeMemorySize. +// +// The engines are expected to push the current page size of the memory onto the stack. +func newOperationMemorySize() unionOperation { + return unionOperation{Kind: operationKindMemorySize} +} + +// NewOperationMemoryGrow is a constructor for unionOperation with operationKindMemoryGrow. +// +// This corresponds to wasm.OpcodeMemoryGrow. +// +// The engines are expected to pop one value from the top of the stack, then +// execute wasm.MemoryInstance Grow with the value, and push the previous +// page size of the memory onto the stack. +func newOperationMemoryGrow() unionOperation { + return unionOperation{Kind: operationKindMemoryGrow} +} + +// NewOperationConstI32 is a constructor for unionOperation with OperationConstI32. +// +// This corresponds to wasm.OpcodeI32Const. +func newOperationConstI32(value uint32) unionOperation { + return unionOperation{Kind: operationKindConstI32, U1: uint64(value)} +} + +// NewOperationConstI64 is a constructor for unionOperation with OperationConstI64. +// +// This corresponds to wasm.OpcodeI64Const. +func newOperationConstI64(value uint64) unionOperation { + return unionOperation{Kind: operationKindConstI64, U1: value} +} + +// NewOperationConstF32 is a constructor for unionOperation with OperationConstF32. +// +// This corresponds to wasm.OpcodeF32Const. +func newOperationConstF32(value float32) unionOperation { + return unionOperation{Kind: operationKindConstF32, U1: uint64(math.Float32bits(value))} +} + +// NewOperationConstF64 is a constructor for unionOperation with OperationConstF64. +// +// This corresponds to wasm.OpcodeF64Const. +func newOperationConstF64(value float64) unionOperation { + return unionOperation{Kind: operationKindConstF64, U1: math.Float64bits(value)} +} + +// NewOperationEq is a constructor for unionOperation with operationKindEq. +// +// This corresponds to wasm.OpcodeI32EqName wasm.OpcodeI64EqName wasm.OpcodeF32EqName wasm.OpcodeF64EqName +func newOperationEq(b unsignedType) unionOperation { + return unionOperation{Kind: operationKindEq, B1: byte(b)} +} + +// NewOperationNe is a constructor for unionOperation with operationKindNe. +// +// This corresponds to wasm.OpcodeI32NeName wasm.OpcodeI64NeName wasm.OpcodeF32NeName wasm.OpcodeF64NeName +func newOperationNe(b unsignedType) unionOperation { + return unionOperation{Kind: operationKindNe, B1: byte(b)} +} + +// NewOperationEqz is a constructor for unionOperation with operationKindEqz. +// +// This corresponds to wasm.OpcodeI32EqzName wasm.OpcodeI64EqzName +func newOperationEqz(b unsignedInt) unionOperation { + return unionOperation{Kind: operationKindEqz, B1: byte(b)} +} + +// NewOperationLt is a constructor for unionOperation with operationKindLt. 
+// +// This corresponds to wasm.OpcodeI32LtS wasm.OpcodeI32LtU wasm.OpcodeI64LtS wasm.OpcodeI64LtU wasm.OpcodeF32Lt wasm.OpcodeF64Lt +func newOperationLt(b signedType) unionOperation { + return unionOperation{Kind: operationKindLt, B1: byte(b)} +} + +// NewOperationGt is a constructor for unionOperation with operationKindGt. +// +// This corresponds to wasm.OpcodeI32GtS wasm.OpcodeI32GtU wasm.OpcodeI64GtS wasm.OpcodeI64GtU wasm.OpcodeF32Gt wasm.OpcodeF64Gt +func newOperationGt(b signedType) unionOperation { + return unionOperation{Kind: operationKindGt, B1: byte(b)} +} + +// NewOperationLe is a constructor for unionOperation with operationKindLe. +// +// This corresponds to wasm.OpcodeI32LeS wasm.OpcodeI32LeU wasm.OpcodeI64LeS wasm.OpcodeI64LeU wasm.OpcodeF32Le wasm.OpcodeF64Le +func newOperationLe(b signedType) unionOperation { + return unionOperation{Kind: operationKindLe, B1: byte(b)} +} + +// NewOperationGe is a constructor for unionOperation with operationKindGe. +// +// This corresponds to wasm.OpcodeI32GeS wasm.OpcodeI32GeU wasm.OpcodeI64GeS wasm.OpcodeI64GeU wasm.OpcodeF32Ge wasm.OpcodeF64Ge +func newOperationGe(b signedType) unionOperation { + return unionOperation{Kind: operationKindGe, B1: byte(b)} +} + +// NewOperationAdd is a constructor for unionOperation with operationKindAdd. +// +// This corresponds to wasm.OpcodeI32AddName wasm.OpcodeI64AddName wasm.OpcodeF32AddName wasm.OpcodeF64AddName. +func newOperationAdd(b unsignedType) unionOperation { + return unionOperation{Kind: operationKindAdd, B1: byte(b)} +} + +// NewOperationSub is a constructor for unionOperation with operationKindSub. +// +// This corresponds to wasm.OpcodeI32SubName wasm.OpcodeI64SubName wasm.OpcodeF32SubName wasm.OpcodeF64SubName. +func newOperationSub(b unsignedType) unionOperation { + return unionOperation{Kind: operationKindSub, B1: byte(b)} +} + +// NewOperationMul is a constructor for unionOperation with operationKindMul. +// +// This corresponds to wasm.OpcodeI32MulName wasm.OpcodeI64MulName wasm.OpcodeF32MulName wasm.OpcodeF64MulName. +func newOperationMul(b unsignedType) unionOperation { + return unionOperation{Kind: operationKindMul, B1: byte(b)} +} + +// NewOperationClz is a constructor for unionOperation with operationKindClz. +// +// This corresponds to wasm.OpcodeI32ClzName wasm.OpcodeI64ClzName. +// +// The engines are expected to count the leading zeros in the +// current top of the stack, and push the count result. +// For example, stack of [..., 0x00_ff_ff_ff] results in [..., 8]. +// See wasm.OpcodeI32Clz wasm.OpcodeI64Clz +func newOperationClz(b unsignedInt) unionOperation { + return unionOperation{Kind: operationKindClz, B1: byte(b)} +} + +// NewOperationCtz is a constructor for unionOperation with operationKindCtz. +// +// This corresponds to wasm.OpcodeI32CtzName wasm.OpcodeI64CtzName. +// +// The engines are expected to count the trailing zeros in the +// current top of the stack, and push the count result. +// For example, stack of [..., 0xff_ff_ff_00] results in [..., 8]. +func newOperationCtz(b unsignedInt) unionOperation { + return unionOperation{Kind: operationKindCtz, B1: byte(b)} +} + +// NewOperationPopcnt is a constructor for unionOperation with operationKindPopcnt. +// +// This corresponds to wasm.OpcodeI32PopcntName wasm.OpcodeI64PopcntName.
+// +// The engines are expected to count up the number of set bits in the +// current top of the stack, and push the count result. +// For example, stack of [..., 0b00_00_00_11] results in [..., 2]. +func newOperationPopcnt(b unsignedInt) unionOperation { + return unionOperation{Kind: operationKindPopcnt, B1: byte(b)} +} + +// NewOperationDiv is a constructor for unionOperation with operationKindDiv. +// +// This corresponds to wasm.OpcodeI32DivS wasm.OpcodeI32DivU wasm.OpcodeI64DivS +// +// wasm.OpcodeI64DivU wasm.OpcodeF32Div wasm.OpcodeF64Div. +func newOperationDiv(b signedType) unionOperation { + return unionOperation{Kind: operationKindDiv, B1: byte(b)} +} + +// NewOperationRem is a constructor for unionOperation with operationKindRem. +// +// This corresponds to wasm.OpcodeI32RemS wasm.OpcodeI32RemU wasm.OpcodeI64RemS wasm.OpcodeI64RemU. +// +// The engines are expected to perform division on the top +// two values of integer type on the stack and puts the remainder of the result +// onto the stack. For example, stack [..., 10, 3] results in [..., 1] where +// the quotient is discarded. +// NewOperationRem is the constructor for OperationRem +func newOperationRem(b signedInt) unionOperation { + return unionOperation{Kind: operationKindRem, B1: byte(b)} +} + +// NewOperationAnd is a constructor for unionOperation with operationKindAnd. +// +// # This corresponds to wasm.OpcodeI32AndName wasm.OpcodeI64AndName +// +// The engines are expected to perform "And" operation on +// top two values on the stack, and pushes the result. +func newOperationAnd(b unsignedInt) unionOperation { + return unionOperation{Kind: operationKindAnd, B1: byte(b)} +} + +// NewOperationOr is a constructor for unionOperation with operationKindOr. +// +// # This corresponds to wasm.OpcodeI32OrName wasm.OpcodeI64OrName +// +// The engines are expected to perform "Or" operation on +// top two values on the stack, and pushes the result. +func newOperationOr(b unsignedInt) unionOperation { + return unionOperation{Kind: operationKindOr, B1: byte(b)} +} + +// NewOperationXor is a constructor for unionOperation with operationKindXor. +// +// # This corresponds to wasm.OpcodeI32XorName wasm.OpcodeI64XorName +// +// The engines are expected to perform "Xor" operation on +// top two values on the stack, and pushes the result. +func newOperationXor(b unsignedInt) unionOperation { + return unionOperation{Kind: operationKindXor, B1: byte(b)} +} + +// NewOperationShl is a constructor for unionOperation with operationKindShl. +// +// # This corresponds to wasm.OpcodeI32ShlName wasm.OpcodeI64ShlName +// +// The engines are expected to perform "Shl" operation on +// top two values on the stack, and pushes the result. +func newOperationShl(b unsignedInt) unionOperation { + return unionOperation{Kind: operationKindShl, B1: byte(b)} +} + +// NewOperationShr is a constructor for unionOperation with operationKindShr. +// +// # This corresponds to wasm.OpcodeI32ShrSName wasm.OpcodeI32ShrUName wasm.OpcodeI64ShrSName wasm.OpcodeI64ShrUName +// +// If OperationShr.Type is signed integer, then, the engines are expected to perform arithmetic right shift on the two +// top values on the stack, otherwise do the logical right shift. +func newOperationShr(b signedInt) unionOperation { + return unionOperation{Kind: operationKindShr, B1: byte(b)} +} + +// NewOperationRotl is a constructor for unionOperation with operationKindRotl. 
+//
+// # This corresponds to wasm.OpcodeI32RotlName wasm.OpcodeI64RotlName
+//
+// The engines are expected to perform "Rotl" operation on
+// top two values on the stack, and pushes the result.
+func newOperationRotl(b unsignedInt) unionOperation {
+	return unionOperation{Kind: operationKindRotl, B1: byte(b)}
+}
+
+// NewOperationRotr is a constructor for unionOperation with operationKindRotr.
+//
+// # This corresponds to wasm.OpcodeI32RotrName wasm.OpcodeI64RotrName
+//
+// The engines are expected to perform "Rotr" operation on
+// top two values on the stack, and pushes the result.
+func newOperationRotr(b unsignedInt) unionOperation {
+	return unionOperation{Kind: operationKindRotr, B1: byte(b)}
+}
+
+// NewOperationAbs is a constructor for unionOperation with operationKindAbs.
+//
+// This corresponds to wasm.OpcodeF32Abs wasm.OpcodeF64Abs
+func newOperationAbs(b float) unionOperation {
+	return unionOperation{Kind: operationKindAbs, B1: byte(b)}
+}
+
+// NewOperationNeg is a constructor for unionOperation with operationKindNeg.
+//
+// This corresponds to wasm.OpcodeF32Neg wasm.OpcodeF64Neg
+func newOperationNeg(b float) unionOperation {
+	return unionOperation{Kind: operationKindNeg, B1: byte(b)}
+}
+
+// NewOperationCeil is a constructor for unionOperation with operationKindCeil.
+//
+// This corresponds to wasm.OpcodeF32CeilName wasm.OpcodeF64CeilName
+func newOperationCeil(b float) unionOperation {
+	return unionOperation{Kind: operationKindCeil, B1: byte(b)}
+}
+
+// NewOperationFloor is a constructor for unionOperation with operationKindFloor.
+//
+// This corresponds to wasm.OpcodeF32FloorName wasm.OpcodeF64FloorName
+func newOperationFloor(b float) unionOperation {
+	return unionOperation{Kind: operationKindFloor, B1: byte(b)}
+}
+
+// NewOperationTrunc is a constructor for unionOperation with operationKindTrunc.
+//
+// This corresponds to wasm.OpcodeF32TruncName wasm.OpcodeF64TruncName
+func newOperationTrunc(b float) unionOperation {
+	return unionOperation{Kind: operationKindTrunc, B1: byte(b)}
+}
+
+// NewOperationNearest is a constructor for unionOperation with operationKindNearest.
+//
+// # This corresponds to wasm.OpcodeF32NearestName wasm.OpcodeF64NearestName
+//
+// Note: this is *not* equivalent to math.Round and instead has the same
+// semantics as LLVM's rint intrinsic. See https://llvm.org/docs/LangRef.html#llvm-rint-intrinsic.
+// For example, math.Round(-4.5) produces -5 while we want to produce -4.
+func newOperationNearest(b float) unionOperation {
+	return unionOperation{Kind: operationKindNearest, B1: byte(b)}
+}
+
+// NewOperationSqrt is a constructor for unionOperation with operationKindSqrt.
+//
+// This corresponds to wasm.OpcodeF32SqrtName wasm.OpcodeF64SqrtName
+func newOperationSqrt(b float) unionOperation {
+	return unionOperation{Kind: operationKindSqrt, B1: byte(b)}
+}
+
+// NewOperationMin is a constructor for unionOperation with operationKindMin.
+//
+// # This corresponds to wasm.OpcodeF32MinName wasm.OpcodeF64MinName
+//
+// The engines are expected to pop two values from the stack, and push back the minimum of
+// these two values onto the stack. For example, stack [..., 100.1, 1.9] results in [..., 1.9].
+//
+// Note: WebAssembly specifies that min/max must always return NaN if one of the values is NaN,
+// which is different behavior from math.Min.
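+//
+// For a single lane or scalar, the required NaN handling can be sketched in Go
+// as follows (illustration only, not the engine's actual implementation):
+//
+//	func wasmMin(x, y float64) float64 {
+//		if math.IsNaN(x) || math.IsNaN(y) {
+//			return math.NaN() // NaN always propagates, per the WebAssembly spec.
+//		}
+//		return math.Min(x, y)
+//	}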
+func newOperationMin(b float) unionOperation {
+	return unionOperation{Kind: operationKindMin, B1: byte(b)}
+}
+
+// NewOperationMax is a constructor for unionOperation with operationKindMax.
+//
+// # This corresponds to wasm.OpcodeF32MaxName wasm.OpcodeF64MaxName
+//
+// The engines are expected to pop two values from the stack, and push back the maximum of
+// these two values onto the stack. For example, stack [..., 100.1, 1.9] results in [..., 100.1].
+//
+// Note: WebAssembly specifies that min/max must always return NaN if one of the values is NaN,
+// which is different behavior from math.Max.
+func newOperationMax(b float) unionOperation {
+	return unionOperation{Kind: operationKindMax, B1: byte(b)}
+}
+
+// NewOperationCopysign is a constructor for unionOperation with operationKindCopysign.
+//
+// # This corresponds to wasm.OpcodeF32CopysignName wasm.OpcodeF64CopysignName
+//
+// The engines are expected to pop two float values from the stack, and copy the signbit of
+// the first-popped value to the last one.
+// For example, stack [..., 1.213, -5.0] results in [..., -1.213].
+func newOperationCopysign(b float) unionOperation {
+	return unionOperation{Kind: operationKindCopysign, B1: byte(b)}
+}
+
+// NewOperationI32WrapFromI64 is a constructor for unionOperation with operationKindI32WrapFromI64.
+//
+// This corresponds to wasm.OpcodeI32WrapI64 and equivalent to uint64(uint32(v)) in Go.
+//
+// The engines are expected to replace the 64-bit int on top of the stack
+// with the corresponding 32-bit integer.
+func newOperationI32WrapFromI64() unionOperation {
+	return unionOperation{Kind: operationKindI32WrapFromI64}
+}
+
+// NewOperationITruncFromF is a constructor for unionOperation with operationKindITruncFromF.
+//
+// This corresponds to
+//
+//	wasm.OpcodeI32TruncF32SName wasm.OpcodeI32TruncF32UName wasm.OpcodeI32TruncF64SName
+//	wasm.OpcodeI32TruncF64UName wasm.OpcodeI64TruncF32SName wasm.OpcodeI64TruncF32UName wasm.OpcodeI64TruncF64SName
+//	wasm.OpcodeI64TruncF64UName wasm.OpcodeI32TruncSatF32SName wasm.OpcodeI32TruncSatF32UName
+//	wasm.OpcodeI32TruncSatF64SName wasm.OpcodeI32TruncSatF64UName wasm.OpcodeI64TruncSatF32SName
+//	wasm.OpcodeI64TruncSatF32UName wasm.OpcodeI64TruncSatF64SName wasm.OpcodeI64TruncSatF64UName
+//
+// See [1] and [2] for when we encounter undefined behavior in the WebAssembly specification if nonTrapping == false.
+// To summarize, if the source float value is NaN or doesn't fit in the destination range of integers (incl. ±Inf),
+// then the runtime behavior is undefined. In wazero, the engines are expected to exit the execution in these undefined cases with
+// wasmruntime.ErrRuntimeInvalidConversionToInteger error.
+//
+// [1] https://www.w3.org/TR/2019/REC-wasm-core-1-20191205/#-hrefop-trunc-umathrmtruncmathsfu_m-n-z for unsigned integers.
+// [2] https://www.w3.org/TR/2019/REC-wasm-core-1-20191205/#-hrefop-trunc-smathrmtruncmathsfs_m-n-z for signed integers.
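+//
+// For the signed i32 <- f64 case, the non-trapping ("saturating") variant can be
+// sketched in Go as follows (illustration only; the helper name is hypothetical
+// and not part of this package):
+//
+//	func truncSatI32FromF64(f float64) int32 {
+//		switch {
+//		case math.IsNaN(f):
+//			return 0
+//		case f < math.MinInt32:
+//			return math.MinInt32
+//		case f > math.MaxInt32:
+//			return math.MaxInt32
+//		default:
+//			return int32(f) // Go's conversion truncates toward zero.
+//		}
+//	}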
+// +// nonTrapping true if this conversion is "nontrapping" in the sense of the +// https://github.com/WebAssembly/spec/blob/ce4b6c4d47eb06098cc7ab2e81f24748da822f20/proposals/nontrapping-float-to-int-conversion/Overview.md +func newOperationITruncFromF(inputType float, outputType signedInt, nonTrapping bool) unionOperation { + return unionOperation{ + Kind: operationKindITruncFromF, + B1: byte(inputType), + B2: byte(outputType), + B3: nonTrapping, + } +} + +// NewOperationFConvertFromI is a constructor for unionOperation with operationKindFConvertFromI. +// +// This corresponds to +// +// wasm.OpcodeF32ConvertI32SName wasm.OpcodeF32ConvertI32UName wasm.OpcodeF32ConvertI64SName wasm.OpcodeF32ConvertI64UName +// wasm.OpcodeF64ConvertI32SName wasm.OpcodeF64ConvertI32UName wasm.OpcodeF64ConvertI64SName wasm.OpcodeF64ConvertI64UName +// +// and equivalent to float32(uint32(x)), float32(int32(x)), etc in Go. +func newOperationFConvertFromI(inputType signedInt, outputType float) unionOperation { + return unionOperation{ + Kind: operationKindFConvertFromI, + B1: byte(inputType), + B2: byte(outputType), + } +} + +// NewOperationF32DemoteFromF64 is a constructor for unionOperation with operationKindF32DemoteFromF64. +// +// This corresponds to wasm.OpcodeF32DemoteF64 and is equivalent float32(float64(v)). +func newOperationF32DemoteFromF64() unionOperation { + return unionOperation{Kind: operationKindF32DemoteFromF64} +} + +// NewOperationF64PromoteFromF32 is a constructor for unionOperation with operationKindF64PromoteFromF32. +// +// This corresponds to wasm.OpcodeF64PromoteF32 and is equivalent float64(float32(v)). +func newOperationF64PromoteFromF32() unionOperation { + return unionOperation{Kind: operationKindF64PromoteFromF32} +} + +// NewOperationI32ReinterpretFromF32 is a constructor for unionOperation with operationKindI32ReinterpretFromF32. +// +// This corresponds to wasm.OpcodeI32ReinterpretF32Name. +func newOperationI32ReinterpretFromF32() unionOperation { + return unionOperation{Kind: operationKindI32ReinterpretFromF32} +} + +// NewOperationI64ReinterpretFromF64 is a constructor for unionOperation with operationKindI64ReinterpretFromF64. +// +// This corresponds to wasm.OpcodeI64ReinterpretF64Name. +func newOperationI64ReinterpretFromF64() unionOperation { + return unionOperation{Kind: operationKindI64ReinterpretFromF64} +} + +// NewOperationF32ReinterpretFromI32 is a constructor for unionOperation with operationKindF32ReinterpretFromI32. +// +// This corresponds to wasm.OpcodeF32ReinterpretI32Name. +func newOperationF32ReinterpretFromI32() unionOperation { + return unionOperation{Kind: operationKindF32ReinterpretFromI32} +} + +// NewOperationF64ReinterpretFromI64 is a constructor for unionOperation with operationKindF64ReinterpretFromI64. +// +// This corresponds to wasm.OpcodeF64ReinterpretI64Name. +func newOperationF64ReinterpretFromI64() unionOperation { + return unionOperation{Kind: operationKindF64ReinterpretFromI64} +} + +// NewOperationExtend is a constructor for unionOperation with operationKindExtend. +// +// # This corresponds to wasm.OpcodeI64ExtendI32SName wasm.OpcodeI64ExtendI32UName +// +// The engines are expected to extend the 32-bit signed or unsigned int on top of the stack +// as a 64-bit integer of corresponding signedness. For unsigned case, this is just reinterpreting the +// underlying bit pattern as 64-bit integer. For signed case, this is sign-extension which preserves the +// original integer's sign. 
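+//
+// As an illustration only (assuming the i32 operand lives in the low 32 bits of
+// a uint64 stack slot, as in this interpreter's value stack):
+//
+//	v := uint64(0x0000_0000_ffff_ffff)        // the i32 value -1 on the stack
+//	signed := uint64(int64(int32(uint32(v)))) // 0xffff_ffff_ffff_ffff, i.e. int64(-1)
+//	unsigned := uint64(uint32(v))             // 0x0000_0000_ffff_ffff, bits unchanged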
+func newOperationExtend(signed bool) unionOperation {
+	op := unionOperation{Kind: operationKindExtend}
+	if signed {
+		op.B1 = 1
+	}
+	return op
+}
+
+// NewOperationSignExtend32From8 is a constructor for unionOperation with operationKindSignExtend32From8.
+//
+// This corresponds to wasm.OpcodeI32Extend8SName.
+//
+// The engines are expected to sign-extend the first 8-bits of the 32-bit int as a signed 32-bit int.
+func newOperationSignExtend32From8() unionOperation {
+	return unionOperation{Kind: operationKindSignExtend32From8}
+}
+
+// NewOperationSignExtend32From16 is a constructor for unionOperation with operationKindSignExtend32From16.
+//
+// This corresponds to wasm.OpcodeI32Extend16SName.
+//
+// The engines are expected to sign-extend the first 16-bits of the 32-bit int as a signed 32-bit int.
+func newOperationSignExtend32From16() unionOperation {
+	return unionOperation{Kind: operationKindSignExtend32From16}
+}
+
+// NewOperationSignExtend64From8 is a constructor for unionOperation with operationKindSignExtend64From8.
+//
+// This corresponds to wasm.OpcodeI64Extend8SName.
+//
+// The engines are expected to sign-extend the first 8-bits of the 64-bit int as a signed 64-bit int.
+func newOperationSignExtend64From8() unionOperation {
+	return unionOperation{Kind: operationKindSignExtend64From8}
+}
+
+// NewOperationSignExtend64From16 is a constructor for unionOperation with operationKindSignExtend64From16.
+//
+// This corresponds to wasm.OpcodeI64Extend16SName.
+//
+// The engines are expected to sign-extend the first 16-bits of the 64-bit int as a signed 64-bit int.
+func newOperationSignExtend64From16() unionOperation {
+	return unionOperation{Kind: operationKindSignExtend64From16}
+}
+
+// NewOperationSignExtend64From32 is a constructor for unionOperation with operationKindSignExtend64From32.
+//
+// This corresponds to wasm.OpcodeI64Extend32SName.
+//
+// The engines are expected to sign-extend the first 32-bits of the 64-bit int as a signed 64-bit int.
+func newOperationSignExtend64From32() unionOperation {
+	return unionOperation{Kind: operationKindSignExtend64From32}
+}
+
+// NewOperationMemoryInit is a constructor for unionOperation with operationKindMemoryInit.
+//
+// This corresponds to wasm.OpcodeMemoryInitName.
+//
+// dataIndex is the index of the data instance in ModuleInstance.DataInstances
+// by which this operation instantiates a part of the memory.
+func newOperationMemoryInit(dataIndex uint32) unionOperation {
+	return unionOperation{Kind: operationKindMemoryInit, U1: uint64(dataIndex)}
+}
+
+// NewOperationDataDrop is a constructor for unionOperation with operationKindDataDrop.
+//
+// This corresponds to wasm.OpcodeDataDropName.
+//
+// dataIndex is the index of the data instance in ModuleInstance.DataInstances
+// which this operation drops.
+func newOperationDataDrop(dataIndex uint32) unionOperation {
+	return unionOperation{Kind: operationKindDataDrop, U1: uint64(dataIndex)}
+}
+
+// NewOperationMemoryCopy is a constructor for unionOperation with operationKindMemoryCopy.
+//
+// This corresponds to wasm.OpcodeMemoryCopyName.
+func newOperationMemoryCopy() unionOperation {
+	return unionOperation{Kind: operationKindMemoryCopy}
+}
+
+// NewOperationMemoryFill is a constructor for unionOperation with operationKindMemoryFill.
+func newOperationMemoryFill() unionOperation {
+	return unionOperation{Kind: operationKindMemoryFill}
+}
+
+// NewOperationTableInit is a constructor for unionOperation with operationKindTableInit.
+//
+// This corresponds to wasm.OpcodeTableInitName.
+// +// elemIndex is the index of the element by which this operation initializes a part of the table. +// tableIndex is the index of the table on which this operation initialize by the target element. +func newOperationTableInit(elemIndex, tableIndex uint32) unionOperation { + return unionOperation{Kind: operationKindTableInit, U1: uint64(elemIndex), U2: uint64(tableIndex)} +} + +// NewOperationElemDrop is a constructor for unionOperation with operationKindElemDrop. +// +// This corresponds to wasm.OpcodeElemDropName. +// +// elemIndex is the index of the element which this operation drops. +func newOperationElemDrop(elemIndex uint32) unionOperation { + return unionOperation{Kind: operationKindElemDrop, U1: uint64(elemIndex)} +} + +// NewOperationTableCopy implements Operation. +// +// This corresponds to wasm.OpcodeTableCopyName. +func newOperationTableCopy(srcTableIndex, dstTableIndex uint32) unionOperation { + return unionOperation{Kind: operationKindTableCopy, U1: uint64(srcTableIndex), U2: uint64(dstTableIndex)} +} + +// NewOperationRefFunc constructor for unionOperation with operationKindRefFunc. +// +// This corresponds to wasm.OpcodeRefFuncName, and engines are expected to +// push the opaque pointer value of engine specific func for the given FunctionIndex. +// +// Note: in wazero, we express any reference types (funcref or externref) as opaque pointers which is uint64. +// Therefore, the engine implementations emit instructions to push the address of *function onto the stack. +func newOperationRefFunc(functionIndex uint32) unionOperation { + return unionOperation{Kind: operationKindRefFunc, U1: uint64(functionIndex)} +} + +// NewOperationTableGet constructor for unionOperation with operationKindTableGet. +// +// This corresponds to wasm.OpcodeTableGetName. +func newOperationTableGet(tableIndex uint32) unionOperation { + return unionOperation{Kind: operationKindTableGet, U1: uint64(tableIndex)} +} + +// NewOperationTableSet constructor for unionOperation with operationKindTableSet. +// +// This corresponds to wasm.OpcodeTableSetName. +func newOperationTableSet(tableIndex uint32) unionOperation { + return unionOperation{Kind: operationKindTableSet, U1: uint64(tableIndex)} +} + +// NewOperationTableSize constructor for unionOperation with operationKindTableSize. +// +// This corresponds to wasm.OpcodeTableSizeName. +func newOperationTableSize(tableIndex uint32) unionOperation { + return unionOperation{Kind: operationKindTableSize, U1: uint64(tableIndex)} +} + +// NewOperationTableGrow constructor for unionOperation with operationKindTableGrow. +// +// This corresponds to wasm.OpcodeTableGrowName. +func newOperationTableGrow(tableIndex uint32) unionOperation { + return unionOperation{Kind: operationKindTableGrow, U1: uint64(tableIndex)} +} + +// NewOperationTableFill constructor for unionOperation with operationKindTableFill. +// +// This corresponds to wasm.OpcodeTableFillName. +func newOperationTableFill(tableIndex uint32) unionOperation { + return unionOperation{Kind: operationKindTableFill, U1: uint64(tableIndex)} +} + +// NewOperationV128Const constructor for unionOperation with operationKindV128Const +func newOperationV128Const(lo, hi uint64) unionOperation { + return unionOperation{Kind: operationKindV128Const, U1: lo, U2: hi} +} + +// shape corresponds to a shape of v128 values. 
+// https://webassembly.github.io/spec/core/syntax/instructions.html#syntax-shape +type shape = byte + +const ( + shapeI8x16 shape = iota + shapeI16x8 + shapeI32x4 + shapeI64x2 + shapeF32x4 + shapeF64x2 +) + +func shapeName(s shape) (ret string) { + switch s { + case shapeI8x16: + ret = "I8x16" + case shapeI16x8: + ret = "I16x8" + case shapeI32x4: + ret = "I32x4" + case shapeI64x2: + ret = "I64x2" + case shapeF32x4: + ret = "F32x4" + case shapeF64x2: + ret = "F64x2" + } + return +} + +// NewOperationV128Add constructor for unionOperation with operationKindV128Add. +// +// This corresponds to wasm.OpcodeVecI8x16AddName wasm.OpcodeVecI16x8AddName wasm.OpcodeVecI32x4AddName +// +// wasm.OpcodeVecI64x2AddName wasm.OpcodeVecF32x4AddName wasm.OpcodeVecF64x2AddName +func newOperationV128Add(shape shape) unionOperation { + return unionOperation{Kind: operationKindV128Add, B1: shape} +} + +// NewOperationV128Sub constructor for unionOperation with operationKindV128Sub. +// +// This corresponds to wasm.OpcodeVecI8x16SubName wasm.OpcodeVecI16x8SubName wasm.OpcodeVecI32x4SubName +// +// wasm.OpcodeVecI64x2SubName wasm.OpcodeVecF32x4SubName wasm.OpcodeVecF64x2SubName +func newOperationV128Sub(shape shape) unionOperation { + return unionOperation{Kind: operationKindV128Sub, B1: shape} +} + +// v128LoadType represents a type of wasm.OpcodeVecV128Load* instructions. +type v128LoadType = byte + +const ( + // v128LoadType128 corresponds to wasm.OpcodeVecV128LoadName. + v128LoadType128 v128LoadType = iota + // v128LoadType8x8s corresponds to wasm.OpcodeVecV128Load8x8SName. + v128LoadType8x8s + // v128LoadType8x8u corresponds to wasm.OpcodeVecV128Load8x8UName. + v128LoadType8x8u + // v128LoadType16x4s corresponds to wasm.OpcodeVecV128Load16x4SName + v128LoadType16x4s + // v128LoadType16x4u corresponds to wasm.OpcodeVecV128Load16x4UName + v128LoadType16x4u + // v128LoadType32x2s corresponds to wasm.OpcodeVecV128Load32x2SName + v128LoadType32x2s + // v128LoadType32x2u corresponds to wasm.OpcodeVecV128Load32x2UName + v128LoadType32x2u + // v128LoadType8Splat corresponds to wasm.OpcodeVecV128Load8SplatName + v128LoadType8Splat + // v128LoadType16Splat corresponds to wasm.OpcodeVecV128Load16SplatName + v128LoadType16Splat + // v128LoadType32Splat corresponds to wasm.OpcodeVecV128Load32SplatName + v128LoadType32Splat + // v128LoadType64Splat corresponds to wasm.OpcodeVecV128Load64SplatName + v128LoadType64Splat + // v128LoadType32zero corresponds to wasm.OpcodeVecV128Load32zeroName + v128LoadType32zero + // v128LoadType64zero corresponds to wasm.OpcodeVecV128Load64zeroName + v128LoadType64zero +) + +// NewOperationV128Load is a constructor for unionOperation with operationKindV128Load. +// +// This corresponds to +// +// wasm.OpcodeVecV128LoadName wasm.OpcodeVecV128Load8x8SName wasm.OpcodeVecV128Load8x8UName +// wasm.OpcodeVecV128Load16x4SName wasm.OpcodeVecV128Load16x4UName wasm.OpcodeVecV128Load32x2SName +// wasm.OpcodeVecV128Load32x2UName wasm.OpcodeVecV128Load8SplatName wasm.OpcodeVecV128Load16SplatName +// wasm.OpcodeVecV128Load32SplatName wasm.OpcodeVecV128Load64SplatName wasm.OpcodeVecV128Load32zeroName +// wasm.OpcodeVecV128Load64zeroName +func newOperationV128Load(loadType v128LoadType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindV128Load, B1: loadType, U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationV128LoadLane is a constructor for unionOperation with operationKindV128LoadLane. 
+//
+// This corresponds to wasm.OpcodeVecV128Load8LaneName wasm.OpcodeVecV128Load16LaneName
+//
+//	wasm.OpcodeVecV128Load32LaneName wasm.OpcodeVecV128Load64LaneName.
+//
+// laneIndex is >=0 && <(128/LaneSize).
+// laneSize is either 8, 16, 32, or 64.
+func newOperationV128LoadLane(laneIndex, laneSize byte, arg memoryArg) unionOperation {
+	return unionOperation{Kind: operationKindV128LoadLane, B1: laneSize, B2: laneIndex, U1: uint64(arg.Alignment), U2: uint64(arg.Offset)}
+}
+
+// NewOperationV128Store is a constructor for unionOperation with operationKindV128Store.
+//
+// This corresponds to wasm.OpcodeVecV128StoreName.
+func newOperationV128Store(arg memoryArg) unionOperation {
+	return unionOperation{
+		Kind: operationKindV128Store,
+		U1:   uint64(arg.Alignment),
+		U2:   uint64(arg.Offset),
+	}
+}
+
+// NewOperationV128StoreLane is a constructor for unionOperation with operationKindV128StoreLane.
+//
+// This corresponds to wasm.OpcodeVecV128Store8LaneName wasm.OpcodeVecV128Store16LaneName
+//
+//	wasm.OpcodeVecV128Store32LaneName wasm.OpcodeVecV128Store64LaneName.
+//
+// laneIndex is >=0 && <(128/LaneSize).
+// laneSize is either 8, 16, 32, or 64.
+func newOperationV128StoreLane(laneIndex byte, laneSize byte, arg memoryArg) unionOperation {
+	return unionOperation{
+		Kind: operationKindV128StoreLane,
+		B1:   laneSize,
+		B2:   laneIndex,
+		U1:   uint64(arg.Alignment),
+		U2:   uint64(arg.Offset),
+	}
+}
+
+// NewOperationV128ExtractLane is a constructor for unionOperation with operationKindV128ExtractLane.
+//
+// This corresponds to
+//
+//	wasm.OpcodeVecI8x16ExtractLaneSName wasm.OpcodeVecI8x16ExtractLaneUName
+//	wasm.OpcodeVecI16x8ExtractLaneSName wasm.OpcodeVecI16x8ExtractLaneUName
+//	wasm.OpcodeVecI32x4ExtractLaneName wasm.OpcodeVecI64x2ExtractLaneName
+//	wasm.OpcodeVecF32x4ExtractLaneName wasm.OpcodeVecF64x2ExtractLaneName.
+//
+// laneIndex is >=0 && <M where shape = NxM.
+// signed is used when shape is either i8x16 or i16x8 to specify whether to sign-extend or not.
+func newOperationV128ExtractLane(laneIndex byte, signed bool, shape shape) unionOperation {
+	return unionOperation{
+		Kind: operationKindV128ExtractLane,
+		B1:   shape,
+		B2:   laneIndex,
+		B3:   signed,
+	}
+}
+
+// NewOperationV128ReplaceLane is a constructor for unionOperation with operationKindV128ReplaceLane.
+//
+// This corresponds to
+//
+//	wasm.OpcodeVecI8x16ReplaceLaneName wasm.OpcodeVecI16x8ReplaceLaneName
+//	wasm.OpcodeVecI32x4ReplaceLaneName wasm.OpcodeVecI64x2ReplaceLaneName
+//	wasm.OpcodeVecF32x4ReplaceLaneName wasm.OpcodeVecF64x2ReplaceLaneName.
+//
+// laneIndex is >=0 && <M where shape = NxM.
+func newOperationV128ReplaceLane(laneIndex byte, shape shape) unionOperation {
+	return unionOperation{Kind: operationKindV128ReplaceLane, B1: shape, B2: laneIndex}
+}
+
+// NewOperationV128Splat is a constructor for unionOperation with operationKindV128Splat.
+//
+// This corresponds to
+//
+//	wasm.OpcodeVecI8x16SplatName wasm.OpcodeVecI16x8SplatName
+//	wasm.OpcodeVecI32x4SplatName wasm.OpcodeVecI64x2SplatName
+//	wasm.OpcodeVecF32x4SplatName wasm.OpcodeVecF64x2SplatName.
+func newOperationV128Splat(shape shape) unionOperation {
+	return unionOperation{Kind: operationKindV128Splat, B1: shape}
+}
+
+// NewOperationV128Shuffle is a constructor for unionOperation with operationKindV128Shuffle.
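+//
+// Each of the 16 lane indices selects one byte from the concatenation of the two
+// input vectors: indices 0-15 pick from the first operand, 16-31 from the second.
+// A per-byte sketch in Go (illustration only):
+//
+//	var a, b, dst [16]byte
+//	for i, l := range lanes { // lanes holds 16 indices in the range [0, 32)
+//		if l < 16 {
+//			dst[i] = a[l]
+//		} else {
+//			dst[i] = b[l-16]
+//		}
+//	}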
+func newOperationV128Shuffle(lanes []uint64) unionOperation { + return unionOperation{Kind: operationKindV128Shuffle, Us: lanes} +} + +// NewOperationV128Swizzle is a constructor for unionOperation with operationKindV128Swizzle. +// +// This corresponds to wasm.OpcodeVecI8x16SwizzleName. +func newOperationV128Swizzle() unionOperation { + return unionOperation{Kind: operationKindV128Swizzle} +} + +// NewOperationV128AnyTrue is a constructor for unionOperation with operationKindV128AnyTrue. +// +// This corresponds to wasm.OpcodeVecV128AnyTrueName. +func newOperationV128AnyTrue() unionOperation { + return unionOperation{Kind: operationKindV128AnyTrue} +} + +// NewOperationV128AllTrue is a constructor for unionOperation with operationKindV128AllTrue. +// +// This corresponds to +// +// wasm.OpcodeVecI8x16AllTrueName wasm.OpcodeVecI16x8AllTrueName +// wasm.OpcodeVecI32x4AllTrueName wasm.OpcodeVecI64x2AllTrueName. +func newOperationV128AllTrue(shape shape) unionOperation { + return unionOperation{Kind: operationKindV128AllTrue, B1: shape} +} + +// NewOperationV128BitMask is a constructor for unionOperation with operationKindV128BitMask. +// +// This corresponds to +// +// wasm.OpcodeVecI8x16BitMaskName wasm.OpcodeVecI16x8BitMaskName +// wasm.OpcodeVecI32x4BitMaskName wasm.OpcodeVecI64x2BitMaskName. +func newOperationV128BitMask(shape shape) unionOperation { + return unionOperation{Kind: operationKindV128BitMask, B1: shape} +} + +// NewOperationV128And is a constructor for unionOperation with operationKindV128And. +// +// This corresponds to wasm.OpcodeVecV128And. +func newOperationV128And() unionOperation { + return unionOperation{Kind: operationKindV128And} +} + +// NewOperationV128Not is a constructor for unionOperation with operationKindV128Not. +// +// This corresponds to wasm.OpcodeVecV128Not. +func newOperationV128Not() unionOperation { + return unionOperation{Kind: operationKindV128Not} +} + +// NewOperationV128Or is a constructor for unionOperation with operationKindV128Or. +// +// This corresponds to wasm.OpcodeVecV128Or. +func newOperationV128Or() unionOperation { + return unionOperation{Kind: operationKindV128Or} +} + +// NewOperationV128Xor is a constructor for unionOperation with operationKindV128Xor. +// +// This corresponds to wasm.OpcodeVecV128Xor. +func newOperationV128Xor() unionOperation { + return unionOperation{Kind: operationKindV128Xor} +} + +// NewOperationV128Bitselect is a constructor for unionOperation with operationKindV128Bitselect. +// +// This corresponds to wasm.OpcodeVecV128Bitselect. +func newOperationV128Bitselect() unionOperation { + return unionOperation{Kind: operationKindV128Bitselect} +} + +// NewOperationV128AndNot is a constructor for unionOperation with operationKindV128AndNot. +// +// This corresponds to wasm.OpcodeVecV128AndNot. +func newOperationV128AndNot() unionOperation { + return unionOperation{Kind: operationKindV128AndNot} +} + +// NewOperationV128Shl is a constructor for unionOperation with operationKindV128Shl. +// +// This corresponds to +// +// wasm.OpcodeVecI8x16ShlName wasm.OpcodeVecI16x8ShlName +// wasm.OpcodeVecI32x4ShlName wasm.OpcodeVecI64x2ShlName +func newOperationV128Shl(shape shape) unionOperation { + return unionOperation{Kind: operationKindV128Shl, B1: shape} +} + +// NewOperationV128Shr is a constructor for unionOperation with operationKindV128Shr. 
+// +// This corresponds to +// +// wasm.OpcodeVecI8x16ShrSName wasm.OpcodeVecI8x16ShrUName wasm.OpcodeVecI16x8ShrSName +// wasm.OpcodeVecI16x8ShrUName wasm.OpcodeVecI32x4ShrSName wasm.OpcodeVecI32x4ShrUName. +// wasm.OpcodeVecI64x2ShrSName wasm.OpcodeVecI64x2ShrUName. +func newOperationV128Shr(shape shape, signed bool) unionOperation { + return unionOperation{Kind: operationKindV128Shr, B1: shape, B3: signed} +} + +// NewOperationV128Cmp is a constructor for unionOperation with operationKindV128Cmp. +// +// This corresponds to +// +// wasm.OpcodeVecI8x16EqName, wasm.OpcodeVecI8x16NeName, wasm.OpcodeVecI8x16LtSName, wasm.OpcodeVecI8x16LtUName, wasm.OpcodeVecI8x16GtSName, +// wasm.OpcodeVecI8x16GtUName, wasm.OpcodeVecI8x16LeSName, wasm.OpcodeVecI8x16LeUName, wasm.OpcodeVecI8x16GeSName, wasm.OpcodeVecI8x16GeUName, +// wasm.OpcodeVecI16x8EqName, wasm.OpcodeVecI16x8NeName, wasm.OpcodeVecI16x8LtSName, wasm.OpcodeVecI16x8LtUName, wasm.OpcodeVecI16x8GtSName, +// wasm.OpcodeVecI16x8GtUName, wasm.OpcodeVecI16x8LeSName, wasm.OpcodeVecI16x8LeUName, wasm.OpcodeVecI16x8GeSName, wasm.OpcodeVecI16x8GeUName, +// wasm.OpcodeVecI32x4EqName, wasm.OpcodeVecI32x4NeName, wasm.OpcodeVecI32x4LtSName, wasm.OpcodeVecI32x4LtUName, wasm.OpcodeVecI32x4GtSName, +// wasm.OpcodeVecI32x4GtUName, wasm.OpcodeVecI32x4LeSName, wasm.OpcodeVecI32x4LeUName, wasm.OpcodeVecI32x4GeSName, wasm.OpcodeVecI32x4GeUName, +// wasm.OpcodeVecI64x2EqName, wasm.OpcodeVecI64x2NeName, wasm.OpcodeVecI64x2LtSName, wasm.OpcodeVecI64x2GtSName, wasm.OpcodeVecI64x2LeSName, +// wasm.OpcodeVecI64x2GeSName, wasm.OpcodeVecF32x4EqName, wasm.OpcodeVecF32x4NeName, wasm.OpcodeVecF32x4LtName, wasm.OpcodeVecF32x4GtName, +// wasm.OpcodeVecF32x4LeName, wasm.OpcodeVecF32x4GeName, wasm.OpcodeVecF64x2EqName, wasm.OpcodeVecF64x2NeName, wasm.OpcodeVecF64x2LtName, +// wasm.OpcodeVecF64x2GtName, wasm.OpcodeVecF64x2LeName, wasm.OpcodeVecF64x2GeName +func newOperationV128Cmp(cmpType v128CmpType) unionOperation { + return unionOperation{Kind: operationKindV128Cmp, B1: cmpType} +} + +// v128CmpType represents a type of vector comparison operation. +type v128CmpType = byte + +const ( + // v128CmpTypeI8x16Eq corresponds to wasm.OpcodeVecI8x16EqName. + v128CmpTypeI8x16Eq v128CmpType = iota + // v128CmpTypeI8x16Ne corresponds to wasm.OpcodeVecI8x16NeName. + v128CmpTypeI8x16Ne + // v128CmpTypeI8x16LtS corresponds to wasm.OpcodeVecI8x16LtSName. + v128CmpTypeI8x16LtS + // v128CmpTypeI8x16LtU corresponds to wasm.OpcodeVecI8x16LtUName. + v128CmpTypeI8x16LtU + // v128CmpTypeI8x16GtS corresponds to wasm.OpcodeVecI8x16GtSName. + v128CmpTypeI8x16GtS + // v128CmpTypeI8x16GtU corresponds to wasm.OpcodeVecI8x16GtUName. + v128CmpTypeI8x16GtU + // v128CmpTypeI8x16LeS corresponds to wasm.OpcodeVecI8x16LeSName. + v128CmpTypeI8x16LeS + // v128CmpTypeI8x16LeU corresponds to wasm.OpcodeVecI8x16LeUName. + v128CmpTypeI8x16LeU + // v128CmpTypeI8x16GeS corresponds to wasm.OpcodeVecI8x16GeSName. + v128CmpTypeI8x16GeS + // v128CmpTypeI8x16GeU corresponds to wasm.OpcodeVecI8x16GeUName. + v128CmpTypeI8x16GeU + // v128CmpTypeI16x8Eq corresponds to wasm.OpcodeVecI16x8EqName. + v128CmpTypeI16x8Eq + // v128CmpTypeI16x8Ne corresponds to wasm.OpcodeVecI16x8NeName. + v128CmpTypeI16x8Ne + // v128CmpTypeI16x8LtS corresponds to wasm.OpcodeVecI16x8LtSName. + v128CmpTypeI16x8LtS + // v128CmpTypeI16x8LtU corresponds to wasm.OpcodeVecI16x8LtUName. + v128CmpTypeI16x8LtU + // v128CmpTypeI16x8GtS corresponds to wasm.OpcodeVecI16x8GtSName. 
+ v128CmpTypeI16x8GtS + // v128CmpTypeI16x8GtU corresponds to wasm.OpcodeVecI16x8GtUName. + v128CmpTypeI16x8GtU + // v128CmpTypeI16x8LeS corresponds to wasm.OpcodeVecI16x8LeSName. + v128CmpTypeI16x8LeS + // v128CmpTypeI16x8LeU corresponds to wasm.OpcodeVecI16x8LeUName. + v128CmpTypeI16x8LeU + // v128CmpTypeI16x8GeS corresponds to wasm.OpcodeVecI16x8GeSName. + v128CmpTypeI16x8GeS + // v128CmpTypeI16x8GeU corresponds to wasm.OpcodeVecI16x8GeUName. + v128CmpTypeI16x8GeU + // v128CmpTypeI32x4Eq corresponds to wasm.OpcodeVecI32x4EqName. + v128CmpTypeI32x4Eq + // v128CmpTypeI32x4Ne corresponds to wasm.OpcodeVecI32x4NeName. + v128CmpTypeI32x4Ne + // v128CmpTypeI32x4LtS corresponds to wasm.OpcodeVecI32x4LtSName. + v128CmpTypeI32x4LtS + // v128CmpTypeI32x4LtU corresponds to wasm.OpcodeVecI32x4LtUName. + v128CmpTypeI32x4LtU + // v128CmpTypeI32x4GtS corresponds to wasm.OpcodeVecI32x4GtSName. + v128CmpTypeI32x4GtS + // v128CmpTypeI32x4GtU corresponds to wasm.OpcodeVecI32x4GtUName. + v128CmpTypeI32x4GtU + // v128CmpTypeI32x4LeS corresponds to wasm.OpcodeVecI32x4LeSName. + v128CmpTypeI32x4LeS + // v128CmpTypeI32x4LeU corresponds to wasm.OpcodeVecI32x4LeUName. + v128CmpTypeI32x4LeU + // v128CmpTypeI32x4GeS corresponds to wasm.OpcodeVecI32x4GeSName. + v128CmpTypeI32x4GeS + // v128CmpTypeI32x4GeU corresponds to wasm.OpcodeVecI32x4GeUName. + v128CmpTypeI32x4GeU + // v128CmpTypeI64x2Eq corresponds to wasm.OpcodeVecI64x2EqName. + v128CmpTypeI64x2Eq + // v128CmpTypeI64x2Ne corresponds to wasm.OpcodeVecI64x2NeName. + v128CmpTypeI64x2Ne + // v128CmpTypeI64x2LtS corresponds to wasm.OpcodeVecI64x2LtSName. + v128CmpTypeI64x2LtS + // v128CmpTypeI64x2GtS corresponds to wasm.OpcodeVecI64x2GtSName. + v128CmpTypeI64x2GtS + // v128CmpTypeI64x2LeS corresponds to wasm.OpcodeVecI64x2LeSName. + v128CmpTypeI64x2LeS + // v128CmpTypeI64x2GeS corresponds to wasm.OpcodeVecI64x2GeSName. + v128CmpTypeI64x2GeS + // v128CmpTypeF32x4Eq corresponds to wasm.OpcodeVecF32x4EqName. + v128CmpTypeF32x4Eq + // v128CmpTypeF32x4Ne corresponds to wasm.OpcodeVecF32x4NeName. + v128CmpTypeF32x4Ne + // v128CmpTypeF32x4Lt corresponds to wasm.OpcodeVecF32x4LtName. + v128CmpTypeF32x4Lt + // v128CmpTypeF32x4Gt corresponds to wasm.OpcodeVecF32x4GtName. + v128CmpTypeF32x4Gt + // v128CmpTypeF32x4Le corresponds to wasm.OpcodeVecF32x4LeName. + v128CmpTypeF32x4Le + // v128CmpTypeF32x4Ge corresponds to wasm.OpcodeVecF32x4GeName. + v128CmpTypeF32x4Ge + // v128CmpTypeF64x2Eq corresponds to wasm.OpcodeVecF64x2EqName. + v128CmpTypeF64x2Eq + // v128CmpTypeF64x2Ne corresponds to wasm.OpcodeVecF64x2NeName. + v128CmpTypeF64x2Ne + // v128CmpTypeF64x2Lt corresponds to wasm.OpcodeVecF64x2LtName. + v128CmpTypeF64x2Lt + // v128CmpTypeF64x2Gt corresponds to wasm.OpcodeVecF64x2GtName. + v128CmpTypeF64x2Gt + // v128CmpTypeF64x2Le corresponds to wasm.OpcodeVecF64x2LeName. + v128CmpTypeF64x2Le + // v128CmpTypeF64x2Ge corresponds to wasm.OpcodeVecF64x2GeName. + v128CmpTypeF64x2Ge +) + +// NewOperationV128AddSat is a constructor for unionOperation with operationKindV128AddSat. +// +// This corresponds to wasm.OpcodeVecI8x16AddSatUName wasm.OpcodeVecI8x16AddSatSName +// +// wasm.OpcodeVecI16x8AddSatUName wasm.OpcodeVecI16x8AddSatSName +// +// shape is either shapeI8x16 or shapeI16x8. +func newOperationV128AddSat(shape shape, signed bool) unionOperation { + return unionOperation{Kind: operationKindV128AddSat, B1: shape, B3: signed} +} + +// NewOperationV128SubSat is a constructor for unionOperation with operationKindV128SubSat. 
+//
+// This corresponds to wasm.OpcodeVecI8x16SubSatUName wasm.OpcodeVecI8x16SubSatSName
+//
+//	wasm.OpcodeVecI16x8SubSatUName wasm.OpcodeVecI16x8SubSatSName
+//
+// shape is either shapeI8x16 or shapeI16x8.
+func newOperationV128SubSat(shape shape, signed bool) unionOperation {
+	return unionOperation{Kind: operationKindV128SubSat, B1: shape, B3: signed}
+}
+
+// NewOperationV128Mul is a constructor for unionOperation with operationKindV128Mul
+//
+// This corresponds to wasm.OpcodeVecF32x4MulName wasm.OpcodeVecF64x2MulName
+//
+//	wasm.OpcodeVecI16x8MulName wasm.OpcodeVecI32x4MulName wasm.OpcodeVecI64x2MulName.
+//	shape is either shapeI16x8, shapeI32x4, shapeI64x2, shapeF32x4 or shapeF64x2.
+func newOperationV128Mul(shape shape) unionOperation {
+	return unionOperation{Kind: operationKindV128Mul, B1: shape}
+}
+
+// NewOperationV128Div is a constructor for unionOperation with operationKindV128Div.
+//
+// This corresponds to wasm.OpcodeVecF32x4DivName wasm.OpcodeVecF64x2DivName.
+// shape is either shapeF32x4 or shapeF64x2.
+func newOperationV128Div(shape shape) unionOperation {
+	return unionOperation{Kind: operationKindV128Div, B1: shape}
+}
+
+// NewOperationV128Neg is a constructor for unionOperation with operationKindV128Neg.
+//
+// This corresponds to wasm.OpcodeVecI8x16NegName wasm.OpcodeVecI16x8NegName wasm.OpcodeVecI32x4NegName
+//
+//	wasm.OpcodeVecI64x2NegName wasm.OpcodeVecF32x4NegName wasm.OpcodeVecF64x2NegName.
+func newOperationV128Neg(shape shape) unionOperation {
+	return unionOperation{Kind: operationKindV128Neg, B1: shape}
+}
+
+// NewOperationV128Sqrt is a constructor for unionOperation with operationKindV128Sqrt.
+//
+// shape is either shapeF32x4 or shapeF64x2.
+// This corresponds to wasm.OpcodeVecF32x4SqrtName wasm.OpcodeVecF64x2SqrtName.
+func newOperationV128Sqrt(shape shape) unionOperation {
+	return unionOperation{Kind: operationKindV128Sqrt, B1: shape}
+}
+
+// NewOperationV128Abs is a constructor for unionOperation with operationKindV128Abs.
+//
+// This corresponds to wasm.OpcodeVecI8x16AbsName wasm.OpcodeVecI16x8AbsName wasm.OpcodeVecI32x4AbsName
+//
+//	wasm.OpcodeVecI64x2AbsName wasm.OpcodeVecF32x4AbsName wasm.OpcodeVecF64x2AbsName.
+func newOperationV128Abs(shape shape) unionOperation {
+	return unionOperation{Kind: operationKindV128Abs, B1: shape}
+}
+
+// NewOperationV128Popcnt is a constructor for unionOperation with operationKindV128Popcnt.
+//
+// This corresponds to wasm.OpcodeVecI8x16PopcntName.
+func newOperationV128Popcnt(shape shape) unionOperation {
+	return unionOperation{Kind: operationKindV128Popcnt, B1: shape}
+}
+
+// NewOperationV128Min is a constructor for unionOperation with operationKindV128Min.
+//
+// This corresponds to
+//
+//	wasm.OpcodeVecI8x16MinSName wasm.OpcodeVecI8x16MinUName wasm.OpcodeVecI16x8MinSName wasm.OpcodeVecI16x8MinUName
+//	wasm.OpcodeVecI32x4MinSName wasm.OpcodeVecI32x4MinUName
+//	wasm.OpcodeVecF32x4MinName wasm.OpcodeVecF64x2MinName
+func newOperationV128Min(shape shape, signed bool) unionOperation {
+	return unionOperation{Kind: operationKindV128Min, B1: shape, B3: signed}
+}
+
+// NewOperationV128Max is a constructor for unionOperation with operationKindV128Max.
+// +// This corresponds to +// +// wasm.OpcodeVecI8x16MaxSName wasm.OpcodeVecI8x16MaxUName wasm.OpcodeVecI16x8MaxSName wasm.OpcodeVecI16x8MaxUName +// wasm.OpcodeVecI32x4MaxSName wasm.OpcodeVecI32x4MaxUName wasm.OpcodeVecI16x8MaxSName wasm.OpcodeVecI16x8MaxUName +// wasm.OpcodeVecF32x4MaxName wasm.OpcodeVecF64x2MaxName. +func newOperationV128Max(shape shape, signed bool) unionOperation { + return unionOperation{Kind: operationKindV128Max, B1: shape, B3: signed} +} + +// NewOperationV128AvgrU is a constructor for unionOperation with operationKindV128AvgrU. +// +// This corresponds to wasm.OpcodeVecI8x16AvgrUName. +func newOperationV128AvgrU(shape shape) unionOperation { + return unionOperation{Kind: operationKindV128AvgrU, B1: shape} +} + +// NewOperationV128Pmin is a constructor for unionOperation with operationKindV128Pmin. +// +// This corresponds to wasm.OpcodeVecF32x4PminName wasm.OpcodeVecF64x2PminName. +func newOperationV128Pmin(shape shape) unionOperation { + return unionOperation{Kind: operationKindV128Pmin, B1: shape} +} + +// NewOperationV128Pmax is a constructor for unionOperation with operationKindV128Pmax. +// +// This corresponds to wasm.OpcodeVecF32x4PmaxName wasm.OpcodeVecF64x2PmaxName. +func newOperationV128Pmax(shape shape) unionOperation { + return unionOperation{Kind: operationKindV128Pmax, B1: shape} +} + +// NewOperationV128Ceil is a constructor for unionOperation with operationKindV128Ceil. +// +// This corresponds to wasm.OpcodeVecF32x4CeilName wasm.OpcodeVecF64x2CeilName +func newOperationV128Ceil(shape shape) unionOperation { + return unionOperation{Kind: operationKindV128Ceil, B1: shape} +} + +// NewOperationV128Floor is a constructor for unionOperation with operationKindV128Floor. +// +// This corresponds to wasm.OpcodeVecF32x4FloorName wasm.OpcodeVecF64x2FloorName +func newOperationV128Floor(shape shape) unionOperation { + return unionOperation{Kind: operationKindV128Floor, B1: shape} +} + +// NewOperationV128Trunc is a constructor for unionOperation with operationKindV128Trunc. +// +// This corresponds to wasm.OpcodeVecF32x4TruncName wasm.OpcodeVecF64x2TruncName +func newOperationV128Trunc(shape shape) unionOperation { + return unionOperation{Kind: operationKindV128Trunc, B1: shape} +} + +// NewOperationV128Nearest is a constructor for unionOperation with operationKindV128Nearest. +// +// This corresponds to wasm.OpcodeVecF32x4NearestName wasm.OpcodeVecF64x2NearestName +func newOperationV128Nearest(shape shape) unionOperation { + return unionOperation{Kind: operationKindV128Nearest, B1: shape} +} + +// NewOperationV128Extend is a constructor for unionOperation with operationKindV128Extend. +// +// This corresponds to +// +// wasm.OpcodeVecI16x8ExtendLowI8x16SName wasm.OpcodeVecI16x8ExtendHighI8x16SName +// wasm.OpcodeVecI16x8ExtendLowI8x16UName wasm.OpcodeVecI16x8ExtendHighI8x16UName +// wasm.OpcodeVecI32x4ExtendLowI16x8SName wasm.OpcodeVecI32x4ExtendHighI16x8SName +// wasm.OpcodeVecI32x4ExtendLowI16x8UName wasm.OpcodeVecI32x4ExtendHighI16x8UName +// wasm.OpcodeVecI64x2ExtendLowI32x4SName wasm.OpcodeVecI64x2ExtendHighI32x4SName +// wasm.OpcodeVecI64x2ExtendLowI32x4UName wasm.OpcodeVecI64x2ExtendHighI32x4UName +// +// originshape is the shape of the original lanes for extension which is +// either shapeI8x16, shapeI16x8, or shapeI32x4. +// useLow true if it uses the lower half of vector for extension. 
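+//
+// For example, sign-extending the low half of an i8x16 vector into i16x8 can be
+// sketched per lane in Go as (illustration only):
+//
+//	var src [16]int8
+//	var dst [8]int16
+//	for i := 0; i < 8; i++ {
+//		dst[i] = int16(src[i]) // the high-half variant would read src[i+8]
+//	}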
+func newOperationV128Extend(originshape shape, signed bool, useLow bool) unionOperation {
+	op := unionOperation{Kind: operationKindV128Extend}
+	op.B1 = originshape
+	if signed {
+		op.B2 = 1
+	}
+	op.B3 = useLow
+	return op
+}
+
+// NewOperationV128ExtMul is a constructor for unionOperation with operationKindV128ExtMul.
+//
+// This corresponds to
+//
+//	wasm.OpcodeVecI16x8ExtMulLowI8x16SName wasm.OpcodeVecI16x8ExtMulLowI8x16UName
+//	wasm.OpcodeVecI16x8ExtMulHighI8x16SName wasm.OpcodeVecI16x8ExtMulHighI8x16UName
+//	wasm.OpcodeVecI32x4ExtMulLowI16x8SName wasm.OpcodeVecI32x4ExtMulLowI16x8UName
+//	wasm.OpcodeVecI32x4ExtMulHighI16x8SName wasm.OpcodeVecI32x4ExtMulHighI16x8UName
+//	wasm.OpcodeVecI64x2ExtMulLowI32x4SName wasm.OpcodeVecI64x2ExtMulLowI32x4UName
+//	wasm.OpcodeVecI64x2ExtMulHighI32x4SName wasm.OpcodeVecI64x2ExtMulHighI32x4UName.
+//
+// originshape is the shape of the original lanes for extension which is
+// either shapeI8x16, shapeI16x8, or shapeI32x4.
+// useLow true if it uses the lower half of vector for extension.
+func newOperationV128ExtMul(originshape shape, signed bool, useLow bool) unionOperation {
+	op := unionOperation{Kind: operationKindV128ExtMul}
+	op.B1 = originshape
+	if signed {
+		op.B2 = 1
+	}
+	op.B3 = useLow
+	return op
+}
+
+// NewOperationV128Q15mulrSatS is a constructor for unionOperation with operationKindV128Q15mulrSatS.
+//
+// This corresponds to wasm.OpcodeVecI16x8Q15mulrSatSName
+func newOperationV128Q15mulrSatS() unionOperation {
+	return unionOperation{Kind: operationKindV128Q15mulrSatS}
+}
+
+// NewOperationV128ExtAddPairwise is a constructor for unionOperation with operationKindV128ExtAddPairwise.
+//
+// This corresponds to
+//
+//	wasm.OpcodeVecI16x8ExtaddPairwiseI8x16SName wasm.OpcodeVecI16x8ExtaddPairwiseI8x16UName
+//	wasm.OpcodeVecI32x4ExtaddPairwiseI16x8SName wasm.OpcodeVecI32x4ExtaddPairwiseI16x8UName.
+//
+// originshape is the shape of the original lanes for extension which is
+// either shapeI8x16, or shapeI16x8.
+func newOperationV128ExtAddPairwise(originshape shape, signed bool) unionOperation {
+	return unionOperation{Kind: operationKindV128ExtAddPairwise, B1: originshape, B3: signed}
+}
+
+// NewOperationV128FloatPromote is a constructor for unionOperation with operationKindV128FloatPromote.
+//
+// This corresponds to wasm.OpcodeVecF64x2PromoteLowF32x4ZeroName
+// This discards the higher 64-bit of a vector, and promotes two
+// 32-bit floats in the lower 64-bit as two 64-bit floats.
+func newOperationV128FloatPromote() unionOperation {
+	return unionOperation{Kind: operationKindV128FloatPromote}
+}
+
+// NewOperationV128FloatDemote is a constructor for unionOperation with operationKindV128FloatDemote.
+//
+// This corresponds to wasm.OpcodeVecF32x4DemoteF64x2ZeroName.
+func newOperationV128FloatDemote() unionOperation {
+	return unionOperation{Kind: operationKindV128FloatDemote}
+}
+
+// NewOperationV128FConvertFromI is a constructor for unionOperation with operationKindV128FConvertFromI.
+//
+// This corresponds to
+//
+//	wasm.OpcodeVecF32x4ConvertI32x4SName wasm.OpcodeVecF32x4ConvertI32x4UName
+//	wasm.OpcodeVecF64x2ConvertLowI32x4SName wasm.OpcodeVecF64x2ConvertLowI32x4UName.
+//
+// destinationshape is the shape of the destination lanes for conversion which is
+// either shapeF32x4, or shapeF64x2.
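+//
+// For example, wasm.OpcodeVecF64x2ConvertLowI32x4SName converts the two low i32
+// lanes to two f64 lanes; a per-lane sketch in Go (illustration only):
+//
+//	var src [4]int32
+//	var dst [2]float64
+//	dst[0], dst[1] = float64(src[0]), float64(src[1])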
+func newOperationV128FConvertFromI(destinationshape shape, signed bool) unionOperation { + return unionOperation{Kind: operationKindV128FConvertFromI, B1: destinationshape, B3: signed} +} + +// NewOperationV128Dot is a constructor for unionOperation with operationKindV128Dot. +// +// This corresponds to wasm.OpcodeVecI32x4DotI16x8SName +func newOperationV128Dot() unionOperation { + return unionOperation{Kind: operationKindV128Dot} +} + +// NewOperationV128Narrow is a constructor for unionOperation with operationKindV128Narrow. +// +// This corresponds to +// +// wasm.OpcodeVecI8x16NarrowI16x8SName wasm.OpcodeVecI8x16NarrowI16x8UName +// wasm.OpcodeVecI16x8NarrowI32x4SName wasm.OpcodeVecI16x8NarrowI32x4UName. +// +// originshape is the shape of the original lanes for narrowing which is +// either shapeI16x8, or shapeI32x4. +func newOperationV128Narrow(originshape shape, signed bool) unionOperation { + return unionOperation{Kind: operationKindV128Narrow, B1: originshape, B3: signed} +} + +// NewOperationV128ITruncSatFromF is a constructor for unionOperation with operationKindV128ITruncSatFromF. +// +// This corresponds to +// +// wasm.OpcodeVecI32x4TruncSatF64x2UZeroName wasm.OpcodeVecI32x4TruncSatF64x2SZeroName +// wasm.OpcodeVecI32x4TruncSatF32x4UName wasm.OpcodeVecI32x4TruncSatF32x4SName. +// +// originshape is the shape of the original lanes for truncation which is +// either shapeF32x4, or shapeF64x2. +func newOperationV128ITruncSatFromF(originshape shape, signed bool) unionOperation { + return unionOperation{Kind: operationKindV128ITruncSatFromF, B1: originshape, B3: signed} +} + +// atomicArithmeticOp is the type for the operation kind of atomic arithmetic operations. +type atomicArithmeticOp byte + +const ( + // atomicArithmeticOpAdd is the kind for an add operation. + atomicArithmeticOpAdd atomicArithmeticOp = iota + // atomicArithmeticOpSub is the kind for a sub operation. + atomicArithmeticOpSub + // atomicArithmeticOpAnd is the kind for a bitwise and operation. + atomicArithmeticOpAnd + // atomicArithmeticOpOr is the kind for a bitwise or operation. + atomicArithmeticOpOr + // atomicArithmeticOpXor is the kind for a bitwise xor operation. + atomicArithmeticOpXor + // atomicArithmeticOpNop is the kind for a nop operation. + atomicArithmeticOpNop +) + +// NewOperationAtomicMemoryWait is a constructor for unionOperation with operationKindAtomicMemoryWait. +// +// This corresponds to +// +// wasm.OpcodeAtomicWait32Name wasm.OpcodeAtomicWait64Name +func newOperationAtomicMemoryWait(unsignedType unsignedType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindAtomicMemoryWait, B1: byte(unsignedType), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicMemoryNotify is a constructor for unionOperation with operationKindAtomicMemoryNotify. +// +// This corresponds to +// +// wasm.OpcodeAtomicNotifyName +func newOperationAtomicMemoryNotify(arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindAtomicMemoryNotify, U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicFence is a constructor for unionOperation with operationKindAtomicFence. +// +// This corresponds to +// +// wasm.OpcodeAtomicFenceName +func newOperationAtomicFence() unionOperation { + return unionOperation{Kind: operationKindAtomicFence} +} + +// NewOperationAtomicLoad is a constructor for unionOperation with operationKindAtomicLoad. 
+// +// This corresponds to +// +// wasm.OpcodeAtomicI32LoadName wasm.OpcodeAtomicI64LoadName +func newOperationAtomicLoad(unsignedType unsignedType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindAtomicLoad, B1: byte(unsignedType), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicLoad8 is a constructor for unionOperation with operationKindAtomicLoad8. +// +// This corresponds to +// +// wasm.OpcodeAtomicI32Load8UName wasm.OpcodeAtomicI64Load8UName +func newOperationAtomicLoad8(unsignedType unsignedType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindAtomicLoad8, B1: byte(unsignedType), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicLoad16 is a constructor for unionOperation with operationKindAtomicLoad16. +// +// This corresponds to +// +// wasm.OpcodeAtomicI32Load16UName wasm.OpcodeAtomicI64Load16UName +func newOperationAtomicLoad16(unsignedType unsignedType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindAtomicLoad16, B1: byte(unsignedType), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicStore is a constructor for unionOperation with operationKindAtomicStore. +// +// This corresponds to +// +// wasm.OpcodeAtomicI32StoreName wasm.OpcodeAtomicI64StoreName +func newOperationAtomicStore(unsignedType unsignedType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindAtomicStore, B1: byte(unsignedType), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicStore8 is a constructor for unionOperation with operationKindAtomicStore8. +// +// This corresponds to +// +// wasm.OpcodeAtomicI32Store8UName wasm.OpcodeAtomicI64Store8UName +func newOperationAtomicStore8(unsignedType unsignedType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindAtomicStore8, B1: byte(unsignedType), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicStore16 is a constructor for unionOperation with operationKindAtomicStore16. +// +// This corresponds to +// +// wasm.OpcodeAtomicI32Store16UName wasm.OpcodeAtomicI64Store16UName +func newOperationAtomicStore16(unsignedType unsignedType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindAtomicStore16, B1: byte(unsignedType), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicRMW is a constructor for unionOperation with operationKindAtomicRMW. +// +// This corresponds to +// +// wasm.OpcodeAtomicI32RMWAddName wasm.OpcodeAtomicI64RmwAddName +// wasm.OpcodeAtomicI32RMWSubName wasm.OpcodeAtomicI64RmwSubName +// wasm.OpcodeAtomicI32RMWAndName wasm.OpcodeAtomicI64RmwAndName +// wasm.OpcodeAtomicI32RMWOrName wasm.OpcodeAtomicI64RmwOrName +// wasm.OpcodeAtomicI32RMWXorName wasm.OpcodeAtomicI64RmwXorName +func newOperationAtomicRMW(unsignedType unsignedType, arg memoryArg, op atomicArithmeticOp) unionOperation { + return unionOperation{Kind: operationKindAtomicRMW, B1: byte(unsignedType), B2: byte(op), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicRMW8 is a constructor for unionOperation with operationKindAtomicRMW8. 
+// +// This corresponds to +// +// wasm.OpcodeAtomicI32RMW8AddUName wasm.OpcodeAtomicI64Rmw8AddUName +// wasm.OpcodeAtomicI32RMW8SubUName wasm.OpcodeAtomicI64Rmw8SubUName +// wasm.OpcodeAtomicI32RMW8AndUName wasm.OpcodeAtomicI64Rmw8AndUName +// wasm.OpcodeAtomicI32RMW8OrUName wasm.OpcodeAtomicI64Rmw8OrUName +// wasm.OpcodeAtomicI32RMW8XorUName wasm.OpcodeAtomicI64Rmw8XorUName +func newOperationAtomicRMW8(unsignedType unsignedType, arg memoryArg, op atomicArithmeticOp) unionOperation { + return unionOperation{Kind: operationKindAtomicRMW8, B1: byte(unsignedType), B2: byte(op), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicRMW16 is a constructor for unionOperation with operationKindAtomicRMW16. +// +// This corresponds to +// +// wasm.OpcodeAtomicI32RMW16AddUName wasm.OpcodeAtomicI64Rmw16AddUName +// wasm.OpcodeAtomicI32RMW16SubUName wasm.OpcodeAtomicI64Rmw16SubUName +// wasm.OpcodeAtomicI32RMW16AndUName wasm.OpcodeAtomicI64Rmw16AndUName +// wasm.OpcodeAtomicI32RMW16OrUName wasm.OpcodeAtomicI64Rmw16OrUName +// wasm.OpcodeAtomicI32RMW16XorUName wasm.OpcodeAtomicI64Rmw16XorUName +func newOperationAtomicRMW16(unsignedType unsignedType, arg memoryArg, op atomicArithmeticOp) unionOperation { + return unionOperation{Kind: operationKindAtomicRMW16, B1: byte(unsignedType), B2: byte(op), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicRMWCmpxchg is a constructor for unionOperation with operationKindAtomicRMWCmpxchg. +// +// This corresponds to +// +// wasm.OpcodeAtomicI32RMWCmpxchgName wasm.OpcodeAtomicI64RmwCmpxchgName +func newOperationAtomicRMWCmpxchg(unsignedType unsignedType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindAtomicRMWCmpxchg, B1: byte(unsignedType), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicRMW8Cmpxchg is a constructor for unionOperation with operationKindAtomicRMW8Cmpxchg. +// +// This corresponds to +// +// wasm.OpcodeAtomicI32RMW8CmpxchgUName wasm.OpcodeAtomicI64Rmw8CmpxchgUName +func newOperationAtomicRMW8Cmpxchg(unsignedType unsignedType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindAtomicRMW8Cmpxchg, B1: byte(unsignedType), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicRMW16Cmpxchg is a constructor for unionOperation with operationKindAtomicRMW16Cmpxchg. +// +// This corresponds to +// +// wasm.OpcodeAtomicI32RMW16CmpxchgUName wasm.OpcodeAtomicI64Rmw16CmpxchgUName +func newOperationAtomicRMW16Cmpxchg(unsignedType unsignedType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindAtomicRMW16Cmpxchg, B1: byte(unsignedType), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/signature.go b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/signature.go new file mode 100644 index 000000000..7b9d5602d --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/signature.go @@ -0,0 +1,767 @@ +package interpreter + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/wasm" +) + +// signature represents how a Wasm opcode +// manipulates the value stacks in terms of value types. 
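+//
+// For example, wasm.OpcodeI32Add pops two i32 values and pushes one i32, which the
+// table below expresses as signature_I32I32_I32. A hypothetical lookup might read
+// (illustration only, not this package's actual mapping code):
+//
+//	switch op {
+//	case wasm.OpcodeI32Add:
+//		return signature_I32I32_I32, nil
+//	case wasm.OpcodeF64Sqrt:
+//		return signature_F64_F64, nil
+//	}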
+type signature struct { + in, out []unsignedType +} + +var ( + signature_None_None = &signature{} + signature_Unknown_None = &signature{ + in: []unsignedType{unsignedTypeUnknown}, + } + signature_None_I32 = &signature{ + out: []unsignedType{unsignedTypeI32}, + } + signature_None_I64 = &signature{ + out: []unsignedType{unsignedTypeI64}, + } + signature_None_V128 = &signature{ + out: []unsignedType{unsignedTypeV128}, + } + signature_None_F32 = &signature{ + out: []unsignedType{unsignedTypeF32}, + } + signature_None_F64 = &signature{ + out: []unsignedType{unsignedTypeF64}, + } + signature_I32_None = &signature{ + in: []unsignedType{unsignedTypeI32}, + } + signature_I64_None = &signature{ + in: []unsignedType{unsignedTypeI64}, + } + signature_F32_None = &signature{ + in: []unsignedType{unsignedTypeF32}, + } + signature_F64_None = &signature{ + in: []unsignedType{unsignedTypeF64}, + } + signature_V128_None = &signature{ + in: []unsignedType{unsignedTypeV128}, + } + signature_I32_I32 = &signature{ + in: []unsignedType{unsignedTypeI32}, + out: []unsignedType{unsignedTypeI32}, + } + signature_I32_I64 = &signature{ + in: []unsignedType{unsignedTypeI32}, + out: []unsignedType{unsignedTypeI64}, + } + signature_I64_I64 = &signature{ + in: []unsignedType{unsignedTypeI64}, + out: []unsignedType{unsignedTypeI64}, + } + signature_I32_F32 = &signature{ + in: []unsignedType{unsignedTypeI32}, + out: []unsignedType{unsignedTypeF32}, + } + signature_I32_F64 = &signature{ + in: []unsignedType{unsignedTypeI32}, + out: []unsignedType{unsignedTypeF64}, + } + signature_I64_I32 = &signature{ + in: []unsignedType{unsignedTypeI64}, + out: []unsignedType{unsignedTypeI32}, + } + signature_I64_F32 = &signature{ + in: []unsignedType{unsignedTypeI64}, + out: []unsignedType{unsignedTypeF32}, + } + signature_I64_F64 = &signature{ + in: []unsignedType{unsignedTypeI64}, + out: []unsignedType{unsignedTypeF64}, + } + signature_F32_I32 = &signature{ + in: []unsignedType{unsignedTypeF32}, + out: []unsignedType{unsignedTypeI32}, + } + signature_F32_I64 = &signature{ + in: []unsignedType{unsignedTypeF32}, + out: []unsignedType{unsignedTypeI64}, + } + signature_F32_F64 = &signature{ + in: []unsignedType{unsignedTypeF32}, + out: []unsignedType{unsignedTypeF64}, + } + signature_F32_F32 = &signature{ + in: []unsignedType{unsignedTypeF32}, + out: []unsignedType{unsignedTypeF32}, + } + signature_F64_I32 = &signature{ + in: []unsignedType{unsignedTypeF64}, + out: []unsignedType{unsignedTypeI32}, + } + signature_F64_F32 = &signature{ + in: []unsignedType{unsignedTypeF64}, + out: []unsignedType{unsignedTypeF32}, + } + signature_F64_I64 = &signature{ + in: []unsignedType{unsignedTypeF64}, + out: []unsignedType{unsignedTypeI64}, + } + signature_F64_F64 = &signature{ + in: []unsignedType{unsignedTypeF64}, + out: []unsignedType{unsignedTypeF64}, + } + signature_I32I32_None = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeI32}, + } + + signature_I32I32_I32 = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeI32}, + out: []unsignedType{unsignedTypeI32}, + } + signature_I32I64_None = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeI64}, + } + signature_I32F32_None = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeF32}, + } + signature_I32F64_None = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeF64}, + } + signature_I64I32_I32 = &signature{ + in: []unsignedType{unsignedTypeI64, unsignedTypeI32}, + out: []unsignedType{unsignedTypeI32}, + } + signature_I64I64_I32 = &signature{ + 
in: []unsignedType{unsignedTypeI64, unsignedTypeI64}, + out: []unsignedType{unsignedTypeI32}, + } + signature_I64I64_I64 = &signature{ + in: []unsignedType{unsignedTypeI64, unsignedTypeI64}, + out: []unsignedType{unsignedTypeI64}, + } + signature_F32F32_I32 = &signature{ + in: []unsignedType{unsignedTypeF32, unsignedTypeF32}, + out: []unsignedType{unsignedTypeI32}, + } + signature_F32F32_F32 = &signature{ + in: []unsignedType{unsignedTypeF32, unsignedTypeF32}, + out: []unsignedType{unsignedTypeF32}, + } + signature_F64F64_I32 = &signature{ + in: []unsignedType{unsignedTypeF64, unsignedTypeF64}, + out: []unsignedType{unsignedTypeI32}, + } + signature_F64F64_F64 = &signature{ + in: []unsignedType{unsignedTypeF64, unsignedTypeF64}, + out: []unsignedType{unsignedTypeF64}, + } + signature_I32I32I32_None = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeI32, unsignedTypeI32}, + } + signature_I32I64I32_None = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeI64, unsignedTypeI32}, + } + signature_UnknownUnknownI32_Unknown = &signature{ + in: []unsignedType{unsignedTypeUnknown, unsignedTypeUnknown, unsignedTypeI32}, + out: []unsignedType{unsignedTypeUnknown}, + } + signature_V128V128_V128 = &signature{ + in: []unsignedType{unsignedTypeV128, unsignedTypeV128}, + out: []unsignedType{unsignedTypeV128}, + } + signature_V128V128V128_V32 = &signature{ + in: []unsignedType{unsignedTypeV128, unsignedTypeV128, unsignedTypeV128}, + out: []unsignedType{unsignedTypeV128}, + } + signature_I32_V128 = &signature{ + in: []unsignedType{unsignedTypeI32}, + out: []unsignedType{unsignedTypeV128}, + } + signature_I32V128_None = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeV128}, + } + signature_I32V128_V128 = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeV128}, + out: []unsignedType{unsignedTypeV128}, + } + signature_V128I32_V128 = &signature{ + in: []unsignedType{unsignedTypeV128, unsignedTypeI32}, + out: []unsignedType{unsignedTypeV128}, + } + signature_V128I64_V128 = &signature{ + in: []unsignedType{unsignedTypeV128, unsignedTypeI64}, + out: []unsignedType{unsignedTypeV128}, + } + signature_V128F32_V128 = &signature{ + in: []unsignedType{unsignedTypeV128, unsignedTypeF32}, + out: []unsignedType{unsignedTypeV128}, + } + signature_V128F64_V128 = &signature{ + in: []unsignedType{unsignedTypeV128, unsignedTypeF64}, + out: []unsignedType{unsignedTypeV128}, + } + signature_V128_I32 = &signature{ + in: []unsignedType{unsignedTypeV128}, + out: []unsignedType{unsignedTypeI32}, + } + signature_V128_I64 = &signature{ + in: []unsignedType{unsignedTypeV128}, + out: []unsignedType{unsignedTypeI64}, + } + signature_V128_F32 = &signature{ + in: []unsignedType{unsignedTypeV128}, + out: []unsignedType{unsignedTypeF32}, + } + signature_V128_F64 = &signature{ + in: []unsignedType{unsignedTypeV128}, + out: []unsignedType{unsignedTypeF64}, + } + signature_V128_V128 = &signature{ + in: []unsignedType{unsignedTypeV128}, + out: []unsignedType{unsignedTypeV128}, + } + signature_I64_V128 = &signature{ + in: []unsignedType{unsignedTypeI64}, + out: []unsignedType{unsignedTypeV128}, + } + signature_F32_V128 = &signature{ + in: []unsignedType{unsignedTypeF32}, + out: []unsignedType{unsignedTypeV128}, + } + signature_F64_V128 = &signature{ + in: []unsignedType{unsignedTypeF64}, + out: []unsignedType{unsignedTypeV128}, + } + signature_I32I64_I64 = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeI64}, + out: []unsignedType{unsignedTypeI64}, + } + signature_I32I32I64_I32 = 
&signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeI32, unsignedTypeI64}, + out: []unsignedType{unsignedTypeI32}, + } + signature_I32I64I64_I32 = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeI64, unsignedTypeI64}, + out: []unsignedType{unsignedTypeI32}, + } + signature_I32I32I32_I32 = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeI32, unsignedTypeI32}, + out: []unsignedType{unsignedTypeI32}, + } + signature_I32I64I64_I64 = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeI64, unsignedTypeI64}, + out: []unsignedType{unsignedTypeI64}, + } +) + +// wasmOpcodeSignature returns the signature of given Wasm opcode. +// Note that some of opcodes' signature vary depending on +// the function instance (for example, local types). +// "index" parameter is not used by most of opcodes. +// The returned signature is used for stack validation when lowering Wasm's opcodes to interpreterir. +func (c *compiler) wasmOpcodeSignature(op wasm.Opcode, index uint32) (*signature, error) { + switch op { + case wasm.OpcodeUnreachable, wasm.OpcodeNop, wasm.OpcodeBlock, wasm.OpcodeLoop: + return signature_None_None, nil + case wasm.OpcodeIf: + return signature_I32_None, nil + case wasm.OpcodeElse, wasm.OpcodeEnd, wasm.OpcodeBr: + return signature_None_None, nil + case wasm.OpcodeBrIf, wasm.OpcodeBrTable: + return signature_I32_None, nil + case wasm.OpcodeReturn: + return signature_None_None, nil + case wasm.OpcodeCall: + return c.funcTypeToSigs.get(c.funcs[index], false /* direct */), nil + case wasm.OpcodeCallIndirect: + return c.funcTypeToSigs.get(index, true /* call_indirect */), nil + case wasm.OpcodeDrop: + return signature_Unknown_None, nil + case wasm.OpcodeSelect, wasm.OpcodeTypedSelect: + return signature_UnknownUnknownI32_Unknown, nil + case wasm.OpcodeLocalGet: + inputLen := uint32(len(c.sig.Params)) + if l := uint32(len(c.localTypes)) + inputLen; index >= l { + return nil, fmt.Errorf("invalid local index for local.get %d >= %d", index, l) + } + var t wasm.ValueType + if index < inputLen { + t = c.sig.Params[index] + } else { + t = c.localTypes[index-inputLen] + } + return wasmValueTypeToUnsignedOutSignature(t), nil + case wasm.OpcodeLocalSet: + inputLen := uint32(len(c.sig.Params)) + if l := uint32(len(c.localTypes)) + inputLen; index >= l { + return nil, fmt.Errorf("invalid local index for local.get %d >= %d", index, l) + } + var t wasm.ValueType + if index < inputLen { + t = c.sig.Params[index] + } else { + t = c.localTypes[index-inputLen] + } + return wasmValueTypeToUnsignedInSignature(t), nil + case wasm.OpcodeLocalTee: + inputLen := uint32(len(c.sig.Params)) + if l := uint32(len(c.localTypes)) + inputLen; index >= l { + return nil, fmt.Errorf("invalid local index for local.get %d >= %d", index, l) + } + var t wasm.ValueType + if index < inputLen { + t = c.sig.Params[index] + } else { + t = c.localTypes[index-inputLen] + } + return wasmValueTypeToUnsignedInOutSignature(t), nil + case wasm.OpcodeGlobalGet: + if len(c.globals) <= int(index) { + return nil, fmt.Errorf("invalid global index for global.get %d >= %d", index, len(c.globals)) + } + return wasmValueTypeToUnsignedOutSignature(c.globals[index].ValType), nil + case wasm.OpcodeGlobalSet: + if len(c.globals) <= int(index) { + return nil, fmt.Errorf("invalid global index for global.get %d >= %d", index, len(c.globals)) + } + return wasmValueTypeToUnsignedInSignature(c.globals[index].ValType), nil + case wasm.OpcodeI32Load: + return signature_I32_I32, nil + case wasm.OpcodeI64Load: + return 
signature_I32_I64, nil + case wasm.OpcodeF32Load: + return signature_I32_F32, nil + case wasm.OpcodeF64Load: + return signature_I32_F64, nil + case wasm.OpcodeI32Load8S, wasm.OpcodeI32Load8U, wasm.OpcodeI32Load16S, wasm.OpcodeI32Load16U: + return signature_I32_I32, nil + case wasm.OpcodeI64Load8S, wasm.OpcodeI64Load8U, wasm.OpcodeI64Load16S, wasm.OpcodeI64Load16U, + wasm.OpcodeI64Load32S, wasm.OpcodeI64Load32U: + return signature_I32_I64, nil + case wasm.OpcodeI32Store: + return signature_I32I32_None, nil + case wasm.OpcodeI64Store: + return signature_I32I64_None, nil + case wasm.OpcodeF32Store: + return signature_I32F32_None, nil + case wasm.OpcodeF64Store: + return signature_I32F64_None, nil + case wasm.OpcodeI32Store8: + return signature_I32I32_None, nil + case wasm.OpcodeI32Store16: + return signature_I32I32_None, nil + case wasm.OpcodeI64Store8: + return signature_I32I64_None, nil + case wasm.OpcodeI64Store16: + return signature_I32I64_None, nil + case wasm.OpcodeI64Store32: + return signature_I32I64_None, nil + case wasm.OpcodeMemorySize: + return signature_None_I32, nil + case wasm.OpcodeMemoryGrow: + return signature_I32_I32, nil + case wasm.OpcodeI32Const: + return signature_None_I32, nil + case wasm.OpcodeI64Const: + return signature_None_I64, nil + case wasm.OpcodeF32Const: + return signature_None_F32, nil + case wasm.OpcodeF64Const: + return signature_None_F64, nil + case wasm.OpcodeI32Eqz: + return signature_I32_I32, nil + case wasm.OpcodeI32Eq, wasm.OpcodeI32Ne, wasm.OpcodeI32LtS, + wasm.OpcodeI32LtU, wasm.OpcodeI32GtS, wasm.OpcodeI32GtU, + wasm.OpcodeI32LeS, wasm.OpcodeI32LeU, wasm.OpcodeI32GeS, + wasm.OpcodeI32GeU: + return signature_I32I32_I32, nil + case wasm.OpcodeI64Eqz: + return signature_I64_I32, nil + case wasm.OpcodeI64Eq, wasm.OpcodeI64Ne, wasm.OpcodeI64LtS, + wasm.OpcodeI64LtU, wasm.OpcodeI64GtS, wasm.OpcodeI64GtU, + wasm.OpcodeI64LeS, wasm.OpcodeI64LeU, wasm.OpcodeI64GeS, + wasm.OpcodeI64GeU: + return signature_I64I64_I32, nil + case wasm.OpcodeF32Eq, wasm.OpcodeF32Ne, wasm.OpcodeF32Lt, + wasm.OpcodeF32Gt, wasm.OpcodeF32Le, wasm.OpcodeF32Ge: + return signature_F32F32_I32, nil + case wasm.OpcodeF64Eq, wasm.OpcodeF64Ne, wasm.OpcodeF64Lt, + wasm.OpcodeF64Gt, wasm.OpcodeF64Le, wasm.OpcodeF64Ge: + return signature_F64F64_I32, nil + case wasm.OpcodeI32Clz, wasm.OpcodeI32Ctz, wasm.OpcodeI32Popcnt: + return signature_I32_I32, nil + case wasm.OpcodeI32Add, wasm.OpcodeI32Sub, wasm.OpcodeI32Mul, + wasm.OpcodeI32DivS, wasm.OpcodeI32DivU, wasm.OpcodeI32RemS, + wasm.OpcodeI32RemU, wasm.OpcodeI32And, wasm.OpcodeI32Or, + wasm.OpcodeI32Xor, wasm.OpcodeI32Shl, wasm.OpcodeI32ShrS, + wasm.OpcodeI32ShrU, wasm.OpcodeI32Rotl, wasm.OpcodeI32Rotr: + return signature_I32I32_I32, nil + case wasm.OpcodeI64Clz, wasm.OpcodeI64Ctz, wasm.OpcodeI64Popcnt: + return signature_I64_I64, nil + case wasm.OpcodeI64Add, wasm.OpcodeI64Sub, wasm.OpcodeI64Mul, + wasm.OpcodeI64DivS, wasm.OpcodeI64DivU, wasm.OpcodeI64RemS, + wasm.OpcodeI64RemU, wasm.OpcodeI64And, wasm.OpcodeI64Or, + wasm.OpcodeI64Xor, wasm.OpcodeI64Shl, wasm.OpcodeI64ShrS, + wasm.OpcodeI64ShrU, wasm.OpcodeI64Rotl, wasm.OpcodeI64Rotr: + return signature_I64I64_I64, nil + case wasm.OpcodeF32Abs, wasm.OpcodeF32Neg, wasm.OpcodeF32Ceil, + wasm.OpcodeF32Floor, wasm.OpcodeF32Trunc, wasm.OpcodeF32Nearest, + wasm.OpcodeF32Sqrt: + return signature_F32_F32, nil + case wasm.OpcodeF32Add, wasm.OpcodeF32Sub, wasm.OpcodeF32Mul, + wasm.OpcodeF32Div, wasm.OpcodeF32Min, wasm.OpcodeF32Max, + wasm.OpcodeF32Copysign: + return signature_F32F32_F32, nil + case 
wasm.OpcodeF64Abs, wasm.OpcodeF64Neg, wasm.OpcodeF64Ceil, + wasm.OpcodeF64Floor, wasm.OpcodeF64Trunc, wasm.OpcodeF64Nearest, + wasm.OpcodeF64Sqrt: + return signature_F64_F64, nil + case wasm.OpcodeF64Add, wasm.OpcodeF64Sub, wasm.OpcodeF64Mul, + wasm.OpcodeF64Div, wasm.OpcodeF64Min, wasm.OpcodeF64Max, + wasm.OpcodeF64Copysign: + return signature_F64F64_F64, nil + case wasm.OpcodeI32WrapI64: + return signature_I64_I32, nil + case wasm.OpcodeI32TruncF32S, wasm.OpcodeI32TruncF32U: + return signature_F32_I32, nil + case wasm.OpcodeI32TruncF64S, wasm.OpcodeI32TruncF64U: + return signature_F64_I32, nil + case wasm.OpcodeI64ExtendI32S, wasm.OpcodeI64ExtendI32U: + return signature_I32_I64, nil + case wasm.OpcodeI64TruncF32S, wasm.OpcodeI64TruncF32U: + return signature_F32_I64, nil + case wasm.OpcodeI64TruncF64S, wasm.OpcodeI64TruncF64U: + return signature_F64_I64, nil + case wasm.OpcodeF32ConvertI32S, wasm.OpcodeF32ConvertI32U: + return signature_I32_F32, nil + case wasm.OpcodeF32ConvertI64S, wasm.OpcodeF32ConvertI64U: + return signature_I64_F32, nil + case wasm.OpcodeF32DemoteF64: + return signature_F64_F32, nil + case wasm.OpcodeF64ConvertI32S, wasm.OpcodeF64ConvertI32U: + return signature_I32_F64, nil + case wasm.OpcodeF64ConvertI64S, wasm.OpcodeF64ConvertI64U: + return signature_I64_F64, nil + case wasm.OpcodeF64PromoteF32: + return signature_F32_F64, nil + case wasm.OpcodeI32ReinterpretF32: + return signature_F32_I32, nil + case wasm.OpcodeI64ReinterpretF64: + return signature_F64_I64, nil + case wasm.OpcodeF32ReinterpretI32: + return signature_I32_F32, nil + case wasm.OpcodeF64ReinterpretI64: + return signature_I64_F64, nil + case wasm.OpcodeI32Extend8S, wasm.OpcodeI32Extend16S: + return signature_I32_I32, nil + case wasm.OpcodeI64Extend8S, wasm.OpcodeI64Extend16S, wasm.OpcodeI64Extend32S: + return signature_I64_I64, nil + case wasm.OpcodeTableGet: + // table.get takes table's offset and pushes the ref type value of opaque pointer as i64 value onto the stack. + return signature_I32_I64, nil + case wasm.OpcodeTableSet: + // table.set takes table's offset and the ref type value of opaque pointer as i64 value. + return signature_I32I64_None, nil + case wasm.OpcodeRefFunc: + // ref.func is translated as pushing the compiled function's opaque pointer (uint64) at interpreterir layer. + return signature_None_I64, nil + case wasm.OpcodeRefIsNull: + // ref.is_null is translated as checking if the uint64 on the top of the stack (opaque pointer) is zero or not. + return signature_I64_I32, nil + case wasm.OpcodeRefNull: + // ref.null is translated as i64.const 0. 
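As the comments above state, reference-typed values are opaque 64-bit pointers at this layer: ref.null becomes an i64 constant 0 and ref.is_null a zero check that yields an i32. A tiny illustrative sketch of that lowering:

```go
package main

import "fmt"

// Illustrative only: refs modeled as opaque uint64 "pointers".
func refNull() uint64 { return 0 } // ref.null lowers to an i64 constant 0.

// ref.is_null lowers to an i64 == 0 check producing an i32 (0 or 1).
func refIsNull(ref uint64) uint32 {
	if ref == 0 {
		return 1
	}
	return 0
}

func main() {
	fmt.Println(refIsNull(refNull()), refIsNull(0xdeadbeef)) // 1 0
}
```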
+ return signature_None_I64, nil + case wasm.OpcodeMiscPrefix: + switch miscOp := c.body[c.pc+1]; miscOp { + case wasm.OpcodeMiscI32TruncSatF32S, wasm.OpcodeMiscI32TruncSatF32U: + return signature_F32_I32, nil + case wasm.OpcodeMiscI32TruncSatF64S, wasm.OpcodeMiscI32TruncSatF64U: + return signature_F64_I32, nil + case wasm.OpcodeMiscI64TruncSatF32S, wasm.OpcodeMiscI64TruncSatF32U: + return signature_F32_I64, nil + case wasm.OpcodeMiscI64TruncSatF64S, wasm.OpcodeMiscI64TruncSatF64U: + return signature_F64_I64, nil + case wasm.OpcodeMiscMemoryInit, wasm.OpcodeMiscMemoryCopy, wasm.OpcodeMiscMemoryFill, + wasm.OpcodeMiscTableInit, wasm.OpcodeMiscTableCopy: + return signature_I32I32I32_None, nil + case wasm.OpcodeMiscDataDrop, wasm.OpcodeMiscElemDrop: + return signature_None_None, nil + case wasm.OpcodeMiscTableGrow: + return signature_I64I32_I32, nil + case wasm.OpcodeMiscTableSize: + return signature_None_I32, nil + case wasm.OpcodeMiscTableFill: + return signature_I32I64I32_None, nil + default: + return nil, fmt.Errorf("unsupported misc instruction in interpreterir: 0x%x", op) + } + case wasm.OpcodeVecPrefix: + switch vecOp := c.body[c.pc+1]; vecOp { + case wasm.OpcodeVecV128Const: + return signature_None_V128, nil + case wasm.OpcodeVecV128Load, wasm.OpcodeVecV128Load8x8s, wasm.OpcodeVecV128Load8x8u, + wasm.OpcodeVecV128Load16x4s, wasm.OpcodeVecV128Load16x4u, wasm.OpcodeVecV128Load32x2s, + wasm.OpcodeVecV128Load32x2u, wasm.OpcodeVecV128Load8Splat, wasm.OpcodeVecV128Load16Splat, + wasm.OpcodeVecV128Load32Splat, wasm.OpcodeVecV128Load64Splat, wasm.OpcodeVecV128Load32zero, + wasm.OpcodeVecV128Load64zero: + return signature_I32_V128, nil + case wasm.OpcodeVecV128Load8Lane, wasm.OpcodeVecV128Load16Lane, + wasm.OpcodeVecV128Load32Lane, wasm.OpcodeVecV128Load64Lane: + return signature_I32V128_V128, nil + case wasm.OpcodeVecV128Store, + wasm.OpcodeVecV128Store8Lane, + wasm.OpcodeVecV128Store16Lane, + wasm.OpcodeVecV128Store32Lane, + wasm.OpcodeVecV128Store64Lane: + return signature_I32V128_None, nil + case wasm.OpcodeVecI8x16ExtractLaneS, + wasm.OpcodeVecI8x16ExtractLaneU, + wasm.OpcodeVecI16x8ExtractLaneS, + wasm.OpcodeVecI16x8ExtractLaneU, + wasm.OpcodeVecI32x4ExtractLane: + return signature_V128_I32, nil + case wasm.OpcodeVecI64x2ExtractLane: + return signature_V128_I64, nil + case wasm.OpcodeVecF32x4ExtractLane: + return signature_V128_F32, nil + case wasm.OpcodeVecF64x2ExtractLane: + return signature_V128_F64, nil + case wasm.OpcodeVecI8x16ReplaceLane, wasm.OpcodeVecI16x8ReplaceLane, wasm.OpcodeVecI32x4ReplaceLane, + wasm.OpcodeVecI8x16Shl, wasm.OpcodeVecI8x16ShrS, wasm.OpcodeVecI8x16ShrU, + wasm.OpcodeVecI16x8Shl, wasm.OpcodeVecI16x8ShrS, wasm.OpcodeVecI16x8ShrU, + wasm.OpcodeVecI32x4Shl, wasm.OpcodeVecI32x4ShrS, wasm.OpcodeVecI32x4ShrU, + wasm.OpcodeVecI64x2Shl, wasm.OpcodeVecI64x2ShrS, wasm.OpcodeVecI64x2ShrU: + return signature_V128I32_V128, nil + case wasm.OpcodeVecI64x2ReplaceLane: + return signature_V128I64_V128, nil + case wasm.OpcodeVecF32x4ReplaceLane: + return signature_V128F32_V128, nil + case wasm.OpcodeVecF64x2ReplaceLane: + return signature_V128F64_V128, nil + case wasm.OpcodeVecI8x16Splat, + wasm.OpcodeVecI16x8Splat, + wasm.OpcodeVecI32x4Splat: + return signature_I32_V128, nil + case wasm.OpcodeVecI64x2Splat: + return signature_I64_V128, nil + case wasm.OpcodeVecF32x4Splat: + return signature_F32_V128, nil + case wasm.OpcodeVecF64x2Splat: + return signature_F64_V128, nil + case wasm.OpcodeVecV128i8x16Shuffle, wasm.OpcodeVecI8x16Swizzle, wasm.OpcodeVecV128And, 
wasm.OpcodeVecV128Or, wasm.OpcodeVecV128Xor, wasm.OpcodeVecV128AndNot: + return signature_V128V128_V128, nil + case wasm.OpcodeVecI8x16AllTrue, wasm.OpcodeVecI16x8AllTrue, wasm.OpcodeVecI32x4AllTrue, wasm.OpcodeVecI64x2AllTrue, + wasm.OpcodeVecV128AnyTrue, + wasm.OpcodeVecI8x16BitMask, wasm.OpcodeVecI16x8BitMask, wasm.OpcodeVecI32x4BitMask, wasm.OpcodeVecI64x2BitMask: + return signature_V128_I32, nil + case wasm.OpcodeVecV128Not, wasm.OpcodeVecI8x16Neg, wasm.OpcodeVecI16x8Neg, wasm.OpcodeVecI32x4Neg, wasm.OpcodeVecI64x2Neg, + wasm.OpcodeVecF32x4Neg, wasm.OpcodeVecF64x2Neg, wasm.OpcodeVecF32x4Sqrt, wasm.OpcodeVecF64x2Sqrt, + wasm.OpcodeVecI8x16Abs, wasm.OpcodeVecI8x16Popcnt, wasm.OpcodeVecI16x8Abs, wasm.OpcodeVecI32x4Abs, wasm.OpcodeVecI64x2Abs, + wasm.OpcodeVecF32x4Abs, wasm.OpcodeVecF64x2Abs, + wasm.OpcodeVecF32x4Ceil, wasm.OpcodeVecF32x4Floor, wasm.OpcodeVecF32x4Trunc, wasm.OpcodeVecF32x4Nearest, + wasm.OpcodeVecF64x2Ceil, wasm.OpcodeVecF64x2Floor, wasm.OpcodeVecF64x2Trunc, wasm.OpcodeVecF64x2Nearest, + wasm.OpcodeVecI16x8ExtendLowI8x16S, wasm.OpcodeVecI16x8ExtendHighI8x16S, wasm.OpcodeVecI16x8ExtendLowI8x16U, wasm.OpcodeVecI16x8ExtendHighI8x16U, + wasm.OpcodeVecI32x4ExtendLowI16x8S, wasm.OpcodeVecI32x4ExtendHighI16x8S, wasm.OpcodeVecI32x4ExtendLowI16x8U, wasm.OpcodeVecI32x4ExtendHighI16x8U, + wasm.OpcodeVecI64x2ExtendLowI32x4S, wasm.OpcodeVecI64x2ExtendHighI32x4S, wasm.OpcodeVecI64x2ExtendLowI32x4U, wasm.OpcodeVecI64x2ExtendHighI32x4U, + wasm.OpcodeVecI16x8ExtaddPairwiseI8x16S, wasm.OpcodeVecI16x8ExtaddPairwiseI8x16U, wasm.OpcodeVecI32x4ExtaddPairwiseI16x8S, wasm.OpcodeVecI32x4ExtaddPairwiseI16x8U, + wasm.OpcodeVecF64x2PromoteLowF32x4Zero, wasm.OpcodeVecF32x4DemoteF64x2Zero, + wasm.OpcodeVecF32x4ConvertI32x4S, wasm.OpcodeVecF32x4ConvertI32x4U, + wasm.OpcodeVecF64x2ConvertLowI32x4S, wasm.OpcodeVecF64x2ConvertLowI32x4U, + wasm.OpcodeVecI32x4TruncSatF32x4S, wasm.OpcodeVecI32x4TruncSatF32x4U, + wasm.OpcodeVecI32x4TruncSatF64x2SZero, wasm.OpcodeVecI32x4TruncSatF64x2UZero: + return signature_V128_V128, nil + case wasm.OpcodeVecV128Bitselect: + return signature_V128V128V128_V32, nil + case wasm.OpcodeVecI8x16Eq, wasm.OpcodeVecI8x16Ne, wasm.OpcodeVecI8x16LtS, wasm.OpcodeVecI8x16LtU, wasm.OpcodeVecI8x16GtS, + wasm.OpcodeVecI8x16GtU, wasm.OpcodeVecI8x16LeS, wasm.OpcodeVecI8x16LeU, wasm.OpcodeVecI8x16GeS, wasm.OpcodeVecI8x16GeU, + wasm.OpcodeVecI16x8Eq, wasm.OpcodeVecI16x8Ne, wasm.OpcodeVecI16x8LtS, wasm.OpcodeVecI16x8LtU, wasm.OpcodeVecI16x8GtS, + wasm.OpcodeVecI16x8GtU, wasm.OpcodeVecI16x8LeS, wasm.OpcodeVecI16x8LeU, wasm.OpcodeVecI16x8GeS, wasm.OpcodeVecI16x8GeU, + wasm.OpcodeVecI32x4Eq, wasm.OpcodeVecI32x4Ne, wasm.OpcodeVecI32x4LtS, wasm.OpcodeVecI32x4LtU, wasm.OpcodeVecI32x4GtS, + wasm.OpcodeVecI32x4GtU, wasm.OpcodeVecI32x4LeS, wasm.OpcodeVecI32x4LeU, wasm.OpcodeVecI32x4GeS, wasm.OpcodeVecI32x4GeU, + wasm.OpcodeVecI64x2Eq, wasm.OpcodeVecI64x2Ne, wasm.OpcodeVecI64x2LtS, wasm.OpcodeVecI64x2GtS, wasm.OpcodeVecI64x2LeS, + wasm.OpcodeVecI64x2GeS, wasm.OpcodeVecF32x4Eq, wasm.OpcodeVecF32x4Ne, wasm.OpcodeVecF32x4Lt, wasm.OpcodeVecF32x4Gt, + wasm.OpcodeVecF32x4Le, wasm.OpcodeVecF32x4Ge, wasm.OpcodeVecF64x2Eq, wasm.OpcodeVecF64x2Ne, wasm.OpcodeVecF64x2Lt, + wasm.OpcodeVecF64x2Gt, wasm.OpcodeVecF64x2Le, wasm.OpcodeVecF64x2Ge, + wasm.OpcodeVecI8x16Add, wasm.OpcodeVecI8x16AddSatS, wasm.OpcodeVecI8x16AddSatU, wasm.OpcodeVecI8x16Sub, + wasm.OpcodeVecI8x16SubSatS, wasm.OpcodeVecI8x16SubSatU, + wasm.OpcodeVecI16x8Add, wasm.OpcodeVecI16x8AddSatS, wasm.OpcodeVecI16x8AddSatU, wasm.OpcodeVecI16x8Sub, + 
wasm.OpcodeVecI16x8SubSatS, wasm.OpcodeVecI16x8SubSatU, wasm.OpcodeVecI16x8Mul, + wasm.OpcodeVecI32x4Add, wasm.OpcodeVecI32x4Sub, wasm.OpcodeVecI32x4Mul, + wasm.OpcodeVecI64x2Add, wasm.OpcodeVecI64x2Sub, wasm.OpcodeVecI64x2Mul, + wasm.OpcodeVecF32x4Add, wasm.OpcodeVecF32x4Sub, wasm.OpcodeVecF32x4Mul, wasm.OpcodeVecF32x4Div, + wasm.OpcodeVecF64x2Add, wasm.OpcodeVecF64x2Sub, wasm.OpcodeVecF64x2Mul, wasm.OpcodeVecF64x2Div, + wasm.OpcodeVecI8x16MinS, wasm.OpcodeVecI8x16MinU, wasm.OpcodeVecI8x16MaxS, wasm.OpcodeVecI8x16MaxU, wasm.OpcodeVecI8x16AvgrU, + wasm.OpcodeVecI16x8MinS, wasm.OpcodeVecI16x8MinU, wasm.OpcodeVecI16x8MaxS, wasm.OpcodeVecI16x8MaxU, wasm.OpcodeVecI16x8AvgrU, + wasm.OpcodeVecI32x4MinS, wasm.OpcodeVecI32x4MinU, wasm.OpcodeVecI32x4MaxS, wasm.OpcodeVecI32x4MaxU, + wasm.OpcodeVecF32x4Min, wasm.OpcodeVecF32x4Max, wasm.OpcodeVecF64x2Min, wasm.OpcodeVecF64x2Max, + wasm.OpcodeVecF32x4Pmin, wasm.OpcodeVecF32x4Pmax, wasm.OpcodeVecF64x2Pmin, wasm.OpcodeVecF64x2Pmax, + wasm.OpcodeVecI16x8Q15mulrSatS, + wasm.OpcodeVecI16x8ExtMulLowI8x16S, wasm.OpcodeVecI16x8ExtMulHighI8x16S, wasm.OpcodeVecI16x8ExtMulLowI8x16U, wasm.OpcodeVecI16x8ExtMulHighI8x16U, + wasm.OpcodeVecI32x4ExtMulLowI16x8S, wasm.OpcodeVecI32x4ExtMulHighI16x8S, wasm.OpcodeVecI32x4ExtMulLowI16x8U, wasm.OpcodeVecI32x4ExtMulHighI16x8U, + wasm.OpcodeVecI64x2ExtMulLowI32x4S, wasm.OpcodeVecI64x2ExtMulHighI32x4S, wasm.OpcodeVecI64x2ExtMulLowI32x4U, wasm.OpcodeVecI64x2ExtMulHighI32x4U, + wasm.OpcodeVecI32x4DotI16x8S, + wasm.OpcodeVecI8x16NarrowI16x8S, wasm.OpcodeVecI8x16NarrowI16x8U, wasm.OpcodeVecI16x8NarrowI32x4S, wasm.OpcodeVecI16x8NarrowI32x4U: + return signature_V128V128_V128, nil + default: + return nil, fmt.Errorf("unsupported vector instruction in interpreterir: %s", wasm.VectorInstructionName(vecOp)) + } + case wasm.OpcodeAtomicPrefix: + switch atomicOp := c.body[c.pc+1]; atomicOp { + case wasm.OpcodeAtomicMemoryNotify: + return signature_I32I32_I32, nil + case wasm.OpcodeAtomicMemoryWait32: + return signature_I32I32I64_I32, nil + case wasm.OpcodeAtomicMemoryWait64: + return signature_I32I64I64_I32, nil + case wasm.OpcodeAtomicFence: + return signature_None_None, nil + case wasm.OpcodeAtomicI32Load, wasm.OpcodeAtomicI32Load8U, wasm.OpcodeAtomicI32Load16U: + return signature_I32_I32, nil + case wasm.OpcodeAtomicI64Load, wasm.OpcodeAtomicI64Load8U, wasm.OpcodeAtomicI64Load16U, wasm.OpcodeAtomicI64Load32U: + return signature_I32_I64, nil + case wasm.OpcodeAtomicI32Store, wasm.OpcodeAtomicI32Store8, wasm.OpcodeAtomicI32Store16: + return signature_I32I32_None, nil + case wasm.OpcodeAtomicI64Store, wasm.OpcodeAtomicI64Store8, wasm.OpcodeAtomicI64Store16, wasm.OpcodeAtomicI64Store32: + return signature_I32I64_None, nil + case wasm.OpcodeAtomicI32RmwAdd, wasm.OpcodeAtomicI32RmwSub, wasm.OpcodeAtomicI32RmwAnd, wasm.OpcodeAtomicI32RmwOr, wasm.OpcodeAtomicI32RmwXor, wasm.OpcodeAtomicI32RmwXchg, + wasm.OpcodeAtomicI32Rmw8AddU, wasm.OpcodeAtomicI32Rmw8SubU, wasm.OpcodeAtomicI32Rmw8AndU, wasm.OpcodeAtomicI32Rmw8OrU, wasm.OpcodeAtomicI32Rmw8XorU, wasm.OpcodeAtomicI32Rmw8XchgU, + wasm.OpcodeAtomicI32Rmw16AddU, wasm.OpcodeAtomicI32Rmw16SubU, wasm.OpcodeAtomicI32Rmw16AndU, wasm.OpcodeAtomicI32Rmw16OrU, wasm.OpcodeAtomicI32Rmw16XorU, wasm.OpcodeAtomicI32Rmw16XchgU: + return signature_I32I32_I32, nil + case wasm.OpcodeAtomicI64RmwAdd, wasm.OpcodeAtomicI64RmwSub, wasm.OpcodeAtomicI64RmwAnd, wasm.OpcodeAtomicI64RmwOr, wasm.OpcodeAtomicI64RmwXor, wasm.OpcodeAtomicI64RmwXchg, + wasm.OpcodeAtomicI64Rmw8AddU, wasm.OpcodeAtomicI64Rmw8SubU, 
wasm.OpcodeAtomicI64Rmw8AndU, wasm.OpcodeAtomicI64Rmw8OrU, wasm.OpcodeAtomicI64Rmw8XorU, wasm.OpcodeAtomicI64Rmw8XchgU, + wasm.OpcodeAtomicI64Rmw16AddU, wasm.OpcodeAtomicI64Rmw16SubU, wasm.OpcodeAtomicI64Rmw16AndU, wasm.OpcodeAtomicI64Rmw16OrU, wasm.OpcodeAtomicI64Rmw16XorU, wasm.OpcodeAtomicI64Rmw16XchgU, + wasm.OpcodeAtomicI64Rmw32AddU, wasm.OpcodeAtomicI64Rmw32SubU, wasm.OpcodeAtomicI64Rmw32AndU, wasm.OpcodeAtomicI64Rmw32OrU, wasm.OpcodeAtomicI64Rmw32XorU, wasm.OpcodeAtomicI64Rmw32XchgU: + return signature_I32I64_I64, nil + case wasm.OpcodeAtomicI32RmwCmpxchg, wasm.OpcodeAtomicI32Rmw8CmpxchgU, wasm.OpcodeAtomicI32Rmw16CmpxchgU: + return signature_I32I32I32_I32, nil + case wasm.OpcodeAtomicI64RmwCmpxchg, wasm.OpcodeAtomicI64Rmw8CmpxchgU, wasm.OpcodeAtomicI64Rmw16CmpxchgU, wasm.OpcodeAtomicI64Rmw32CmpxchgU: + return signature_I32I64I64_I64, nil + default: + return nil, fmt.Errorf("unsupported atomic instruction in interpreterir: %s", wasm.AtomicInstructionName(atomicOp)) + } + default: + return nil, fmt.Errorf("unsupported instruction in interpreterir: 0x%x", op) + } +} + +// funcTypeToIRSignatures is the central cache for a module to get the *signature +// for function calls. +type funcTypeToIRSignatures struct { + directCalls []*signature + indirectCalls []*signature + wasmTypes []wasm.FunctionType +} + +// get returns the *signature for the direct or indirect function call against functions whose type is at `typeIndex`. +func (f *funcTypeToIRSignatures) get(typeIndex wasm.Index, indirect bool) *signature { + var sig *signature + if indirect { + sig = f.indirectCalls[typeIndex] + } else { + sig = f.directCalls[typeIndex] + } + if sig != nil { + return sig + } + + tp := &f.wasmTypes[typeIndex] + if indirect { + sig = &signature{ + in: make([]unsignedType, 0, len(tp.Params)+1), // +1 to reserve space for call indirect index. + out: make([]unsignedType, 0, len(tp.Results)), + } + } else { + sig = &signature{ + in: make([]unsignedType, 0, len(tp.Params)), + out: make([]unsignedType, 0, len(tp.Results)), + } + } + + for _, vt := range tp.Params { + sig.in = append(sig.in, wasmValueTypeTounsignedType(vt)) + } + for _, vt := range tp.Results { + sig.out = append(sig.out, wasmValueTypeTounsignedType(vt)) + } + + if indirect { + sig.in = append(sig.in, unsignedTypeI32) + f.indirectCalls[typeIndex] = sig + } else { + f.directCalls[typeIndex] = sig + } + return sig +} + +func wasmValueTypeTounsignedType(vt wasm.ValueType) unsignedType { + switch vt { + case wasm.ValueTypeI32: + return unsignedTypeI32 + case wasm.ValueTypeI64, + // From interpreterir layer, ref type values are opaque 64-bit pointers. + wasm.ValueTypeExternref, wasm.ValueTypeFuncref: + return unsignedTypeI64 + case wasm.ValueTypeF32: + return unsignedTypeF32 + case wasm.ValueTypeF64: + return unsignedTypeF64 + case wasm.ValueTypeV128: + return unsignedTypeV128 + } + panic("unreachable") +} + +func wasmValueTypeToUnsignedOutSignature(vt wasm.ValueType) *signature { + switch vt { + case wasm.ValueTypeI32: + return signature_None_I32 + case wasm.ValueTypeI64, + // From interpreterir layer, ref type values are opaque 64-bit pointers. 
+ wasm.ValueTypeExternref, wasm.ValueTypeFuncref: + return signature_None_I64 + case wasm.ValueTypeF32: + return signature_None_F32 + case wasm.ValueTypeF64: + return signature_None_F64 + case wasm.ValueTypeV128: + return signature_None_V128 + } + panic("unreachable") +} + +func wasmValueTypeToUnsignedInSignature(vt wasm.ValueType) *signature { + switch vt { + case wasm.ValueTypeI32: + return signature_I32_None + case wasm.ValueTypeI64, + // From interpreterir layer, ref type values are opaque 64-bit pointers. + wasm.ValueTypeExternref, wasm.ValueTypeFuncref: + return signature_I64_None + case wasm.ValueTypeF32: + return signature_F32_None + case wasm.ValueTypeF64: + return signature_F64_None + case wasm.ValueTypeV128: + return signature_V128_None + } + panic("unreachable") +} + +func wasmValueTypeToUnsignedInOutSignature(vt wasm.ValueType) *signature { + switch vt { + case wasm.ValueTypeI32: + return signature_I32_I32 + case wasm.ValueTypeI64, + // At interpreterir layer, ref type values are opaque 64-bit pointers. + wasm.ValueTypeExternref, wasm.ValueTypeFuncref: + return signature_I64_I64 + case wasm.ValueTypeF32: + return signature_F32_F32 + case wasm.ValueTypeF64: + return signature_F64_F64 + case wasm.ValueTypeV128: + return signature_V128_V128 + } + panic("unreachable") +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/abi.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/abi.go new file mode 100644 index 000000000..cf91c6b7a --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/abi.go @@ -0,0 +1,170 @@ +package backend + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +type ( + // FunctionABI represents the ABI information for a function which corresponds to a ssa.Signature. + FunctionABI struct { + Initialized bool + + Args, Rets []ABIArg + ArgStackSize, RetStackSize int64 + + ArgIntRealRegs byte + ArgFloatRealRegs byte + RetIntRealRegs byte + RetFloatRealRegs byte + } + + // ABIArg represents either argument or return value's location. + ABIArg struct { + // Index is the index of the argument. + Index int + // Kind is the kind of the argument. + Kind ABIArgKind + // Reg is valid if Kind == ABIArgKindReg. + // This VReg must be based on RealReg. + Reg regalloc.VReg + // Offset is valid if Kind == ABIArgKindStack. + // This is the offset from the beginning of either arg or ret stack slot. + Offset int64 + // Type is the type of the argument. + Type ssa.Type + } + + // ABIArgKind is the kind of ABI argument. + ABIArgKind byte +) + +const ( + // ABIArgKindReg represents an argument passed in a register. + ABIArgKindReg = iota + // ABIArgKindStack represents an argument passed in the stack. + ABIArgKindStack +) + +// String implements fmt.Stringer. +func (a *ABIArg) String() string { + return fmt.Sprintf("args[%d]: %s", a.Index, a.Kind) +} + +// String implements fmt.Stringer. +func (a ABIArgKind) String() string { + switch a { + case ABIArgKindReg: + return "reg" + case ABIArgKindStack: + return "stack" + default: + panic("BUG") + } +} + +// Init initializes the abiImpl for the given signature. 
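The Init/setABIArgs code that follows assigns each parameter or result to the next free integer or floating-point register and, once those run out, to an 8-byte (16-byte for vectors) stack slot. The sketch below illustrates that register-first classification with hypothetical types; it is not the vendored algorithm itself:

```go
package main

import "fmt"

// loc is a hypothetical stand-in for an ABI argument location: either a
// register number or a stack offset.
type loc struct {
	inReg  bool
	reg    int   // register number when inReg
	offset int64 // stack offset otherwise
}

func classify(isFloat []bool, intRegs, floatRegs int) []loc {
	locs := make([]loc, len(isFloat))
	var nextInt, nextFloat int
	var stackOffset int64
	for i, f := range isFloat {
		switch {
		case !f && nextInt < intRegs:
			locs[i] = loc{inReg: true, reg: nextInt}
			nextInt++
		case f && nextFloat < floatRegs:
			locs[i] = loc{inReg: true, reg: nextFloat}
			nextFloat++
		default:
			locs[i] = loc{offset: stackOffset}
			stackOffset += 8 // one 8-byte slot per spilled scalar
		}
	}
	return locs
}

func main() {
	// Three integer values with only two int registers available: the third spills.
	fmt.Println(classify([]bool{false, false, false}, 2, 4))
}
```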
+func (a *FunctionABI) Init(sig *ssa.Signature, argResultInts, argResultFloats []regalloc.RealReg) { + if len(a.Rets) < len(sig.Results) { + a.Rets = make([]ABIArg, len(sig.Results)) + } + a.Rets = a.Rets[:len(sig.Results)] + a.RetStackSize = a.setABIArgs(a.Rets, sig.Results, argResultInts, argResultFloats) + if argsNum := len(sig.Params); len(a.Args) < argsNum { + a.Args = make([]ABIArg, argsNum) + } + a.Args = a.Args[:len(sig.Params)] + a.ArgStackSize = a.setABIArgs(a.Args, sig.Params, argResultInts, argResultFloats) + + // Gather the real registers usages in arg/return. + a.ArgIntRealRegs, a.ArgFloatRealRegs = 0, 0 + a.RetIntRealRegs, a.RetFloatRealRegs = 0, 0 + for i := range a.Rets { + r := &a.Rets[i] + if r.Kind == ABIArgKindReg { + if r.Type.IsInt() { + a.RetIntRealRegs++ + } else { + a.RetFloatRealRegs++ + } + } + } + for i := range a.Args { + arg := &a.Args[i] + if arg.Kind == ABIArgKindReg { + if arg.Type.IsInt() { + a.ArgIntRealRegs++ + } else { + a.ArgFloatRealRegs++ + } + } + } + + a.Initialized = true +} + +// setABIArgs sets the ABI arguments in the given slice. This assumes that len(s) >= len(types) +// where if len(s) > len(types), the last elements of s is for the multi-return slot. +func (a *FunctionABI) setABIArgs(s []ABIArg, types []ssa.Type, ints, floats []regalloc.RealReg) (stackSize int64) { + il, fl := len(ints), len(floats) + + var stackOffset int64 + intParamIndex, floatParamIndex := 0, 0 + for i, typ := range types { + arg := &s[i] + arg.Index = i + arg.Type = typ + if typ.IsInt() { + if intParamIndex >= il { + arg.Kind = ABIArgKindStack + const slotSize = 8 // Align 8 bytes. + arg.Offset = stackOffset + stackOffset += slotSize + } else { + arg.Kind = ABIArgKindReg + arg.Reg = regalloc.FromRealReg(ints[intParamIndex], regalloc.RegTypeInt) + intParamIndex++ + } + } else { + if floatParamIndex >= fl { + arg.Kind = ABIArgKindStack + slotSize := int64(8) // Align at least 8 bytes. + if typ.Bits() == 128 { // Vector. + slotSize = 16 + } + arg.Offset = stackOffset + stackOffset += slotSize + } else { + arg.Kind = ABIArgKindReg + arg.Reg = regalloc.FromRealReg(floats[floatParamIndex], regalloc.RegTypeFloat) + floatParamIndex++ + } + } + } + return stackOffset +} + +func (a *FunctionABI) AlignedArgResultStackSlotSize() uint32 { + stackSlotSize := a.RetStackSize + a.ArgStackSize + // Align stackSlotSize to 16 bytes. + stackSlotSize = (stackSlotSize + 15) &^ 15 + // Check overflow 32-bit. + if stackSlotSize > 0xFFFFFFFF { + panic("ABI stack slot size overflow") + } + return uint32(stackSlotSize) +} + +func (a *FunctionABI) ABIInfoAsUint64() uint64 { + return uint64(a.ArgIntRealRegs)<<56 | + uint64(a.ArgFloatRealRegs)<<48 | + uint64(a.RetIntRealRegs)<<40 | + uint64(a.RetFloatRealRegs)<<32 | + uint64(a.AlignedArgResultStackSlotSize()) +} + +func ABIInfoFromUint64(info uint64) (argIntRealRegs, argFloatRealRegs, retIntRealRegs, retFloatRealRegs byte, stackSlotSize uint32) { + return byte(info >> 56), byte(info >> 48), byte(info >> 40), byte(info >> 32), uint32(info) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/backend.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/backend.go new file mode 100644 index 000000000..dd67da43e --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/backend.go @@ -0,0 +1,3 @@ +// Package backend must be free of Wasm-specific concept. In other words, +// this package must not import internal/wasm package. 
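ABIInfoAsUint64 above packs the four one-byte register counts and the 32-bit aligned stack-slot size into a single uint64, and ABIInfoFromUint64 unpacks it. A quick, self-contained round-trip check of the same shift layout:

```go
package main

import "fmt"

// pack/unpack mirror the shift layout shown above: four counts in the high
// bytes, the aligned stack-slot size in the low 32 bits.
func pack(argInt, argFloat, retInt, retFloat byte, stackSlotSize uint32) uint64 {
	return uint64(argInt)<<56 | uint64(argFloat)<<48 |
		uint64(retInt)<<40 | uint64(retFloat)<<32 | uint64(stackSlotSize)
}

func unpack(info uint64) (argInt, argFloat, retInt, retFloat byte, stackSlotSize uint32) {
	return byte(info >> 56), byte(info >> 48), byte(info >> 40), byte(info >> 32), uint32(info)
}

func main() {
	info := pack(2, 1, 1, 0, 32)
	fmt.Println(unpack(info)) // 2 1 1 0 32
}
```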
+package backend diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler.go new file mode 100644 index 000000000..59bbfe02d --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler.go @@ -0,0 +1,417 @@ +package backend + +import ( + "context" + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// NewCompiler returns a new Compiler that can generate a machine code. +func NewCompiler(ctx context.Context, mach Machine, builder ssa.Builder) Compiler { + return newCompiler(ctx, mach, builder) +} + +func newCompiler(_ context.Context, mach Machine, builder ssa.Builder) *compiler { + argResultInts, argResultFloats := mach.ArgsResultsRegs() + c := &compiler{ + mach: mach, ssaBuilder: builder, + nextVRegID: regalloc.VRegIDNonReservedBegin, + argResultInts: argResultInts, + argResultFloats: argResultFloats, + } + mach.SetCompiler(c) + return c +} + +// Compiler is the backend of wazevo which takes ssa.Builder and Machine, +// use the information there to emit the final machine code. +type Compiler interface { + // SSABuilder returns the ssa.Builder used by this compiler. + SSABuilder() ssa.Builder + + // Compile executes the following steps: + // 1. Lower() + // 2. RegAlloc() + // 3. Finalize() + // 4. Encode() + // + // Each step can be called individually for testing purpose, therefore they are exposed in this interface too. + // + // The returned byte slices are the machine code and the relocation information for the machine code. + // The caller is responsible for copying them immediately since the compiler may reuse the buffer. + Compile(ctx context.Context) (_ []byte, _ []RelocationInfo, _ error) + + // Lower lowers the given ssa.Instruction to the machine-specific instructions. + Lower() + + // RegAlloc performs the register allocation after Lower is called. + RegAlloc() + + // Finalize performs the finalization of the compilation, including machine code emission. + // This must be called after RegAlloc. + Finalize(ctx context.Context) error + + // Buf returns the buffer of the encoded machine code. This is only used for testing purpose. + Buf() []byte + + BufPtr() *[]byte + + // Format returns the debug string of the current state of the compiler. + Format() string + + // Init initializes the internal state of the compiler for the next compilation. + Init() + + // AllocateVReg allocates a new virtual register of the given type. + AllocateVReg(typ ssa.Type) regalloc.VReg + + // ValueDefinition returns the definition of the given value. + ValueDefinition(ssa.Value) *SSAValueDefinition + + // VRegOf returns the virtual register of the given ssa.Value. + VRegOf(value ssa.Value) regalloc.VReg + + // TypeOf returns the ssa.Type of the given virtual register. + TypeOf(regalloc.VReg) ssa.Type + + // MatchInstr returns true if the given definition is from an instruction with the given opcode, the current group ID, + // and a refcount of 1. That means, the instruction can be merged/swapped within the current instruction group. + MatchInstr(def *SSAValueDefinition, opcode ssa.Opcode) bool + + // MatchInstrOneOf is the same as MatchInstr but for multiple opcodes. If it matches one of ssa.Opcode, + // this returns the opcode. Otherwise, this returns ssa.OpcodeInvalid. 
+ // + // Note: caller should be careful to avoid excessive allocation on opcodes slice. + MatchInstrOneOf(def *SSAValueDefinition, opcodes []ssa.Opcode) ssa.Opcode + + // AddRelocationInfo appends the relocation information for the function reference at the current buffer offset. + AddRelocationInfo(funcRef ssa.FuncRef) + + // AddSourceOffsetInfo appends the source offset information for the given offset. + AddSourceOffsetInfo(executableOffset int64, sourceOffset ssa.SourceOffset) + + // SourceOffsetInfo returns the source offset information for the current buffer offset. + SourceOffsetInfo() []SourceOffsetInfo + + // EmitByte appends a byte to the buffer. Used during the code emission. + EmitByte(b byte) + + // Emit4Bytes appends 4 bytes to the buffer. Used during the code emission. + Emit4Bytes(b uint32) + + // Emit8Bytes appends 8 bytes to the buffer. Used during the code emission. + Emit8Bytes(b uint64) + + // GetFunctionABI returns the ABI information for the given signature. + GetFunctionABI(sig *ssa.Signature) *FunctionABI +} + +// RelocationInfo represents the relocation information for a call instruction. +type RelocationInfo struct { + // Offset represents the offset from the beginning of the machine code of either a function or the entire module. + Offset int64 + // Target is the target function of the call instruction. + FuncRef ssa.FuncRef +} + +// compiler implements Compiler. +type compiler struct { + mach Machine + currentGID ssa.InstructionGroupID + ssaBuilder ssa.Builder + // nextVRegID is the next virtual register ID to be allocated. + nextVRegID regalloc.VRegID + // ssaValueToVRegs maps ssa.ValueID to regalloc.VReg. + ssaValueToVRegs [] /* VRegID to */ regalloc.VReg + // ssaValueDefinitions maps ssa.ValueID to its definition. + ssaValueDefinitions []SSAValueDefinition + // ssaValueRefCounts is a cached list obtained by ssa.Builder.ValueRefCounts(). + ssaValueRefCounts []int + // returnVRegs is the list of virtual registers that store the return values. + returnVRegs []regalloc.VReg + varEdges [][2]regalloc.VReg + varEdgeTypes []ssa.Type + constEdges []struct { + cInst *ssa.Instruction + dst regalloc.VReg + } + vRegSet []bool + vRegIDs []regalloc.VRegID + tempRegs []regalloc.VReg + tmpVals []ssa.Value + ssaTypeOfVRegID [] /* VRegID to */ ssa.Type + buf []byte + relocations []RelocationInfo + sourceOffsets []SourceOffsetInfo + // abis maps ssa.SignatureID to the ABI implementation. + abis []FunctionABI + argResultInts, argResultFloats []regalloc.RealReg +} + +// SourceOffsetInfo is a data to associate the source offset with the executable offset. +type SourceOffsetInfo struct { + // SourceOffset is the source offset in the original source code. + SourceOffset ssa.SourceOffset + // ExecutableOffset is the offset in the compiled executable. + ExecutableOffset int64 +} + +// Compile implements Compiler.Compile. 
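The Compiler interface documented above notes that the byte slice returned by Compile aliases an internal buffer that may be reused, so callers must copy it immediately. The sketch below illustrates that ownership rule with a stand-in function type rather than the real interface:

```go
package main

import (
	"context"
	"fmt"
)

// compileFn is a minimal stand-in for the Compile method shown above: it
// returns a slice that aliases an internal, reused buffer.
type compileFn func(context.Context) ([]byte, error)

func compileAndCopy(ctx context.Context, compile compileFn) ([]byte, error) {
	body, err := compile(ctx)
	if err != nil {
		return nil, err
	}
	// Detach from the reusable buffer before the next compilation overwrites it.
	return append([]byte(nil), body...), nil
}

func main() {
	internal := []byte{0x90, 0x90}
	compile := func(context.Context) ([]byte, error) { return internal, nil }
	out, _ := compileAndCopy(context.Background(), compile)
	internal[0] = 0xCC // simulate buffer reuse by a later compilation
	fmt.Println(out)   // [144 144] — the copy is unaffected
}
```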
+func (c *compiler) Compile(ctx context.Context) ([]byte, []RelocationInfo, error) { + c.Lower() + if wazevoapi.PrintSSAToBackendIRLowering && wazevoapi.PrintEnabledIndex(ctx) { + fmt.Printf("[[[after lowering for %s ]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), c.Format()) + } + if wazevoapi.DeterministicCompilationVerifierEnabled { + wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "After lowering to ISA specific IR", c.Format()) + } + c.RegAlloc() + if wazevoapi.PrintRegisterAllocated && wazevoapi.PrintEnabledIndex(ctx) { + fmt.Printf("[[[after regalloc for %s]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), c.Format()) + } + if wazevoapi.DeterministicCompilationVerifierEnabled { + wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "After Register Allocation", c.Format()) + } + if err := c.Finalize(ctx); err != nil { + return nil, nil, err + } + if wazevoapi.PrintFinalizedMachineCode && wazevoapi.PrintEnabledIndex(ctx) { + fmt.Printf("[[[after finalize for %s]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), c.Format()) + } + if wazevoapi.DeterministicCompilationVerifierEnabled { + wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "After Finalization", c.Format()) + } + return c.buf, c.relocations, nil +} + +// RegAlloc implements Compiler.RegAlloc. +func (c *compiler) RegAlloc() { + c.mach.RegAlloc() +} + +// Finalize implements Compiler.Finalize. +func (c *compiler) Finalize(ctx context.Context) error { + c.mach.PostRegAlloc() + return c.mach.Encode(ctx) +} + +// setCurrentGroupID sets the current instruction group ID. +func (c *compiler) setCurrentGroupID(gid ssa.InstructionGroupID) { + c.currentGID = gid +} + +// assignVirtualRegisters assigns a virtual register to each ssa.ValueID Valid in the ssa.Builder. +func (c *compiler) assignVirtualRegisters() { + builder := c.ssaBuilder + refCounts := builder.ValueRefCounts() + c.ssaValueRefCounts = refCounts + + need := len(refCounts) + if need >= len(c.ssaValueToVRegs) { + c.ssaValueToVRegs = append(c.ssaValueToVRegs, make([]regalloc.VReg, need+1)...) + } + if need >= len(c.ssaValueDefinitions) { + c.ssaValueDefinitions = append(c.ssaValueDefinitions, make([]SSAValueDefinition, need+1)...) + } + + for blk := builder.BlockIteratorReversePostOrderBegin(); blk != nil; blk = builder.BlockIteratorReversePostOrderNext() { + // First we assign a virtual register to each parameter. + for i := 0; i < blk.Params(); i++ { + p := blk.Param(i) + pid := p.ID() + typ := p.Type() + vreg := c.AllocateVReg(typ) + c.ssaValueToVRegs[pid] = vreg + c.ssaValueDefinitions[pid] = SSAValueDefinition{BlockParamValue: p, BlkParamVReg: vreg} + c.ssaTypeOfVRegID[vreg.ID()] = p.Type() + } + + // Assigns each value to a virtual register produced by instructions. 
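assignVirtualRegisters above, like several other methods in this file, grows its ID-indexed slices on demand with append(make(...)) so the backing arrays can be reused across compilations. A small self-contained version of that idiom:

```go
package main

import "fmt"

// ensureLen grows s so that index id is addressable, reusing the existing
// backing array where possible — the same append(make(...)) idiom used above.
func ensureLen(s []int, id int) []int {
	if id < len(s) {
		return s
	}
	return append(s, make([]int, id+1-len(s))...)
}

func main() {
	var table []int
	table = ensureLen(table, 5)
	table[5] = 42
	fmt.Println(len(table), table[5]) // 6 42
}
```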
+ for cur := blk.Root(); cur != nil; cur = cur.Next() { + r, rs := cur.Returns() + var N int + if r.Valid() { + id := r.ID() + ssaTyp := r.Type() + typ := r.Type() + vReg := c.AllocateVReg(typ) + c.ssaValueToVRegs[id] = vReg + c.ssaValueDefinitions[id] = SSAValueDefinition{ + Instr: cur, + N: 0, + RefCount: refCounts[id], + } + c.ssaTypeOfVRegID[vReg.ID()] = ssaTyp + N++ + } + for _, r := range rs { + id := r.ID() + ssaTyp := r.Type() + vReg := c.AllocateVReg(ssaTyp) + c.ssaValueToVRegs[id] = vReg + c.ssaValueDefinitions[id] = SSAValueDefinition{ + Instr: cur, + N: N, + RefCount: refCounts[id], + } + c.ssaTypeOfVRegID[vReg.ID()] = ssaTyp + N++ + } + } + } + + for i, retBlk := 0, builder.ReturnBlock(); i < retBlk.Params(); i++ { + typ := retBlk.Param(i).Type() + vReg := c.AllocateVReg(typ) + c.returnVRegs = append(c.returnVRegs, vReg) + c.ssaTypeOfVRegID[vReg.ID()] = typ + } +} + +// AllocateVReg implements Compiler.AllocateVReg. +func (c *compiler) AllocateVReg(typ ssa.Type) regalloc.VReg { + regType := regalloc.RegTypeOf(typ) + r := regalloc.VReg(c.nextVRegID).SetRegType(regType) + + id := r.ID() + if int(id) >= len(c.ssaTypeOfVRegID) { + c.ssaTypeOfVRegID = append(c.ssaTypeOfVRegID, make([]ssa.Type, id+1)...) + } + c.ssaTypeOfVRegID[id] = typ + c.nextVRegID++ + return r +} + +// Init implements Compiler.Init. +func (c *compiler) Init() { + c.currentGID = 0 + c.nextVRegID = regalloc.VRegIDNonReservedBegin + c.returnVRegs = c.returnVRegs[:0] + c.mach.Reset() + c.varEdges = c.varEdges[:0] + c.constEdges = c.constEdges[:0] + c.buf = c.buf[:0] + c.sourceOffsets = c.sourceOffsets[:0] + c.relocations = c.relocations[:0] +} + +// ValueDefinition implements Compiler.ValueDefinition. +func (c *compiler) ValueDefinition(value ssa.Value) *SSAValueDefinition { + return &c.ssaValueDefinitions[value.ID()] +} + +// VRegOf implements Compiler.VRegOf. +func (c *compiler) VRegOf(value ssa.Value) regalloc.VReg { + return c.ssaValueToVRegs[value.ID()] +} + +// Format implements Compiler.Format. +func (c *compiler) Format() string { + return c.mach.Format() +} + +// TypeOf implements Compiler.Format. +func (c *compiler) TypeOf(v regalloc.VReg) ssa.Type { + return c.ssaTypeOfVRegID[v.ID()] +} + +// MatchInstr implements Compiler.MatchInstr. +func (c *compiler) MatchInstr(def *SSAValueDefinition, opcode ssa.Opcode) bool { + instr := def.Instr + return def.IsFromInstr() && + instr.Opcode() == opcode && + instr.GroupID() == c.currentGID && + def.RefCount < 2 +} + +// MatchInstrOneOf implements Compiler.MatchInstrOneOf. +func (c *compiler) MatchInstrOneOf(def *SSAValueDefinition, opcodes []ssa.Opcode) ssa.Opcode { + instr := def.Instr + if !def.IsFromInstr() { + return ssa.OpcodeInvalid + } + + if instr.GroupID() != c.currentGID { + return ssa.OpcodeInvalid + } + + if def.RefCount >= 2 { + return ssa.OpcodeInvalid + } + + opcode := instr.Opcode() + for _, op := range opcodes { + if opcode == op { + return opcode + } + } + return ssa.OpcodeInvalid +} + +// SSABuilder implements Compiler .SSABuilder. +func (c *compiler) SSABuilder() ssa.Builder { + return c.ssaBuilder +} + +// AddSourceOffsetInfo implements Compiler.AddSourceOffsetInfo. +func (c *compiler) AddSourceOffsetInfo(executableOffset int64, sourceOffset ssa.SourceOffset) { + c.sourceOffsets = append(c.sourceOffsets, SourceOffsetInfo{ + SourceOffset: sourceOffset, + ExecutableOffset: executableOffset, + }) +} + +// SourceOffsetInfo implements Compiler.SourceOffsetInfo. 
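MatchInstr above only treats a producer as fusable when it comes from an instruction with the requested opcode, belongs to the current instruction group, and has fewer than two references, so no other user depends on the value staying materialized. A self-contained illustration of that predicate with hypothetical types:

```go
package main

import "fmt"

// def is a hypothetical stand-in for a value definition.
type def struct {
	fromInstr bool
	opcode    int
	groupID   int
	refCount  int
}

// canMerge mirrors the MatchInstr conditions: same opcode, same group, and at
// most one user, so folding the producer into its consumer is safe.
func canMerge(d def, wantOpcode, currentGroup int) bool {
	return d.fromInstr && d.opcode == wantOpcode && d.groupID == currentGroup && d.refCount < 2
}

func main() {
	fmt.Println(canMerge(def{true, 7, 3, 1}, 7, 3)) // true
	fmt.Println(canMerge(def{true, 7, 3, 2}, 7, 3)) // false: value has another user
}
```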
+func (c *compiler) SourceOffsetInfo() []SourceOffsetInfo { + return c.sourceOffsets +} + +// AddRelocationInfo implements Compiler.AddRelocationInfo. +func (c *compiler) AddRelocationInfo(funcRef ssa.FuncRef) { + c.relocations = append(c.relocations, RelocationInfo{ + Offset: int64(len(c.buf)), + FuncRef: funcRef, + }) +} + +// Emit8Bytes implements Compiler.Emit8Bytes. +func (c *compiler) Emit8Bytes(b uint64) { + c.buf = append(c.buf, byte(b), byte(b>>8), byte(b>>16), byte(b>>24), byte(b>>32), byte(b>>40), byte(b>>48), byte(b>>56)) +} + +// Emit4Bytes implements Compiler.Emit4Bytes. +func (c *compiler) Emit4Bytes(b uint32) { + c.buf = append(c.buf, byte(b), byte(b>>8), byte(b>>16), byte(b>>24)) +} + +// EmitByte implements Compiler.EmitByte. +func (c *compiler) EmitByte(b byte) { + c.buf = append(c.buf, b) +} + +// Buf implements Compiler.Buf. +func (c *compiler) Buf() []byte { + return c.buf +} + +// BufPtr implements Compiler.BufPtr. +func (c *compiler) BufPtr() *[]byte { + return &c.buf +} + +func (c *compiler) GetFunctionABI(sig *ssa.Signature) *FunctionABI { + if int(sig.ID) >= len(c.abis) { + c.abis = append(c.abis, make([]FunctionABI, int(sig.ID)+1)...) + } + + abi := &c.abis[sig.ID] + if abi.Initialized { + return abi + } + + abi.Init(sig, c.argResultInts, c.argResultFloats) + return abi +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler_lower.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler_lower.go new file mode 100644 index 000000000..80e65668a --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler_lower.go @@ -0,0 +1,226 @@ +package backend + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// Lower implements Compiler.Lower. +func (c *compiler) Lower() { + c.assignVirtualRegisters() + c.mach.SetCurrentABI(c.GetFunctionABI(c.ssaBuilder.Signature())) + c.mach.ExecutableContext().StartLoweringFunction(c.ssaBuilder.BlockIDMax()) + c.lowerBlocks() +} + +// lowerBlocks lowers each block in the ssa.Builder. +func (c *compiler) lowerBlocks() { + builder := c.ssaBuilder + for blk := builder.BlockIteratorReversePostOrderBegin(); blk != nil; blk = builder.BlockIteratorReversePostOrderNext() { + c.lowerBlock(blk) + } + + ectx := c.mach.ExecutableContext() + // After lowering all blocks, we need to link adjacent blocks to layout one single instruction list. + var prev ssa.BasicBlock + for next := builder.BlockIteratorReversePostOrderBegin(); next != nil; next = builder.BlockIteratorReversePostOrderNext() { + if prev != nil { + ectx.LinkAdjacentBlocks(prev, next) + } + prev = next + } +} + +func (c *compiler) lowerBlock(blk ssa.BasicBlock) { + mach := c.mach + ectx := mach.ExecutableContext() + ectx.StartBlock(blk) + + // We traverse the instructions in reverse order because we might want to lower multiple + // instructions together. + cur := blk.Tail() + + // First gather the branching instructions at the end of the blocks. + var br0, br1 *ssa.Instruction + if cur.IsBranching() { + br0 = cur + cur = cur.Prev() + if cur != nil && cur.IsBranching() { + br1 = cur + cur = cur.Prev() + } + } + + if br0 != nil { + c.lowerBranches(br0, br1) + } + + if br1 != nil && br0 == nil { + panic("BUG? when a block has conditional branch but doesn't end with an unconditional branch?") + } + + // Now start lowering the non-branching instructions. 
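Emit4Bytes and Emit8Bytes above append the value one byte at a time, least-significant byte first. A quick check that this matches standard little-endian encoding:

```go
package main

import (
	"encoding/binary"
	"fmt"
)

// emit4 appends b little-endian, byte by byte, like Emit4Bytes above.
func emit4(buf []byte, b uint32) []byte {
	return append(buf, byte(b), byte(b>>8), byte(b>>16), byte(b>>24))
}

func main() {
	got := emit4(nil, 0x11223344)
	want := make([]byte, 4)
	binary.LittleEndian.PutUint32(want, 0x11223344)
	fmt.Printf("% x\n% x\n", got, want) // both: 44 33 22 11
}
```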
+ for ; cur != nil; cur = cur.Prev() { + c.setCurrentGroupID(cur.GroupID()) + if cur.Lowered() { + continue + } + + switch cur.Opcode() { + case ssa.OpcodeReturn: + rets := cur.ReturnVals() + if len(rets) > 0 { + c.mach.LowerReturns(rets) + } + c.mach.InsertReturn() + default: + mach.LowerInstr(cur) + } + ectx.FlushPendingInstructions() + } + + // Finally, if this is the entry block, we have to insert copies of arguments from the real location to the VReg. + if blk.EntryBlock() { + c.lowerFunctionArguments(blk) + } + + ectx.EndBlock() +} + +// lowerBranches is called right after StartBlock and before any LowerInstr call if +// there are branches to the given block. br0 is the very end of the block and b1 is the before the br0 if it exists. +// At least br0 is not nil, but br1 can be nil if there's no branching before br0. +// +// See ssa.Instruction IsBranching, and the comment on ssa.BasicBlock. +func (c *compiler) lowerBranches(br0, br1 *ssa.Instruction) { + ectx := c.mach.ExecutableContext() + + c.setCurrentGroupID(br0.GroupID()) + c.mach.LowerSingleBranch(br0) + ectx.FlushPendingInstructions() + if br1 != nil { + c.setCurrentGroupID(br1.GroupID()) + c.mach.LowerConditionalBranch(br1) + ectx.FlushPendingInstructions() + } + + if br0.Opcode() == ssa.OpcodeJump { + _, args, target := br0.BranchData() + argExists := len(args) != 0 + if argExists && br1 != nil { + panic("BUG: critical edge split failed") + } + if argExists && target.ReturnBlock() { + if len(args) > 0 { + c.mach.LowerReturns(args) + } + } else if argExists { + c.lowerBlockArguments(args, target) + } + } + ectx.FlushPendingInstructions() +} + +func (c *compiler) lowerFunctionArguments(entry ssa.BasicBlock) { + ectx := c.mach.ExecutableContext() + + c.tmpVals = c.tmpVals[:0] + for i := 0; i < entry.Params(); i++ { + p := entry.Param(i) + if c.ssaValueRefCounts[p.ID()] > 0 { + c.tmpVals = append(c.tmpVals, p) + } else { + // If the argument is not used, we can just pass an invalid value. + c.tmpVals = append(c.tmpVals, ssa.ValueInvalid) + } + } + c.mach.LowerParams(c.tmpVals) + ectx.FlushPendingInstructions() +} + +// lowerBlockArguments lowers how to pass arguments to the given successor block. +func (c *compiler) lowerBlockArguments(args []ssa.Value, succ ssa.BasicBlock) { + if len(args) != succ.Params() { + panic("BUG: mismatched number of arguments") + } + + c.varEdges = c.varEdges[:0] + c.varEdgeTypes = c.varEdgeTypes[:0] + c.constEdges = c.constEdges[:0] + for i := 0; i < len(args); i++ { + dst := succ.Param(i) + src := args[i] + + dstReg := c.VRegOf(dst) + srcDef := c.ssaValueDefinitions[src.ID()] + if srcDef.IsFromInstr() && srcDef.Instr.Constant() { + c.constEdges = append(c.constEdges, struct { + cInst *ssa.Instruction + dst regalloc.VReg + }{cInst: srcDef.Instr, dst: dstReg}) + } else { + srcReg := c.VRegOf(src) + // Even when the src=dst, insert the move so that we can keep such registers keep-alive. + c.varEdges = append(c.varEdges, [2]regalloc.VReg{srcReg, dstReg}) + c.varEdgeTypes = append(c.varEdgeTypes, src.Type()) + } + } + + // Check if there's an overlap among the dsts and srcs in varEdges. + c.vRegIDs = c.vRegIDs[:0] + for _, edge := range c.varEdges { + src := edge[0].ID() + if int(src) >= len(c.vRegSet) { + c.vRegSet = append(c.vRegSet, make([]bool, src+1)...) + } + c.vRegSet[src] = true + c.vRegIDs = append(c.vRegIDs, src) + } + separated := true + for _, edge := range c.varEdges { + dst := edge[1].ID() + if int(dst) >= len(c.vRegSet) { + c.vRegSet = append(c.vRegSet, make([]bool, dst+1)...) 
+ } else { + if c.vRegSet[dst] { + separated = false + break + } + } + } + for _, id := range c.vRegIDs { + c.vRegSet[id] = false // reset for the next use. + } + + if separated { + // If there's no overlap, we can simply move the source to destination. + for i, edge := range c.varEdges { + src, dst := edge[0], edge[1] + c.mach.InsertMove(dst, src, c.varEdgeTypes[i]) + } + } else { + // Otherwise, we allocate a temporary registers and move the source to the temporary register, + // + // First move all of them to temporary registers. + c.tempRegs = c.tempRegs[:0] + for i, edge := range c.varEdges { + src := edge[0] + typ := c.varEdgeTypes[i] + temp := c.AllocateVReg(typ) + c.tempRegs = append(c.tempRegs, temp) + c.mach.InsertMove(temp, src, typ) + } + // Then move the temporary registers to the destination. + for i, edge := range c.varEdges { + temp := c.tempRegs[i] + dst := edge[1] + c.mach.InsertMove(dst, temp, c.varEdgeTypes[i]) + } + } + + // Finally, move the constants. + for _, edge := range c.constEdges { + cInst, dst := edge.cInst, edge.dst + c.mach.InsertLoadConstantBlockArg(cInst, dst) + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go new file mode 100644 index 000000000..81c6a6b62 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go @@ -0,0 +1,219 @@ +package backend + +import ( + "fmt" + "math" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +type ExecutableContext interface { + // StartLoweringFunction is called when the lowering of the given function is started. + // maximumBlockID is the maximum value of ssa.BasicBlockID existing in the function. + StartLoweringFunction(maximumBlockID ssa.BasicBlockID) + + // LinkAdjacentBlocks is called after finished lowering all blocks in order to create one single instruction list. + LinkAdjacentBlocks(prev, next ssa.BasicBlock) + + // StartBlock is called when the compilation of the given block is started. + // The order of this being called is the reverse post order of the ssa.BasicBlock(s) as we iterate with + // ssa.Builder BlockIteratorReversePostOrderBegin and BlockIteratorReversePostOrderEnd. + StartBlock(ssa.BasicBlock) + + // EndBlock is called when the compilation of the current block is finished. + EndBlock() + + // FlushPendingInstructions flushes the pending instructions to the buffer. + // This will be called after the lowering of each SSA Instruction. + FlushPendingInstructions() +} + +type ExecutableContextT[Instr any] struct { + CurrentSSABlk ssa.BasicBlock + + // InstrPool is the InstructionPool of instructions. + InstructionPool wazevoapi.Pool[Instr] + asNop func(*Instr) + setNext func(*Instr, *Instr) + setPrev func(*Instr, *Instr) + + // RootInstr is the root instruction of the executable. + RootInstr *Instr + labelPositionPool wazevoapi.Pool[LabelPosition[Instr]] + NextLabel Label + // LabelPositions maps a label to the instructions of the region which the label represents. + LabelPositions map[Label]*LabelPosition[Instr] + OrderedBlockLabels []*LabelPosition[Instr] + + // PerBlockHead and PerBlockEnd are the head and tail of the instruction list per currently-compiled ssa.BasicBlock. + PerBlockHead, PerBlockEnd *Instr + // PendingInstructions are the instructions which are not yet emitted into the instruction list. 
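+ // They are flushed to the head of the per-block list by FlushPendingInstructions;
+ // the reverse iteration there preserves their original order.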
+ PendingInstructions []*Instr + + // SsaBlockIDToLabels maps an SSA block ID to the label. + SsaBlockIDToLabels []Label +} + +func NewExecutableContextT[Instr any]( + resetInstruction func(*Instr), + setNext func(*Instr, *Instr), + setPrev func(*Instr, *Instr), + asNop func(*Instr), +) *ExecutableContextT[Instr] { + return &ExecutableContextT[Instr]{ + InstructionPool: wazevoapi.NewPool[Instr](resetInstruction), + asNop: asNop, + setNext: setNext, + setPrev: setPrev, + labelPositionPool: wazevoapi.NewPool[LabelPosition[Instr]](resetLabelPosition[Instr]), + LabelPositions: make(map[Label]*LabelPosition[Instr]), + NextLabel: LabelInvalid, + } +} + +func resetLabelPosition[T any](l *LabelPosition[T]) { + *l = LabelPosition[T]{} +} + +// StartLoweringFunction implements ExecutableContext. +func (e *ExecutableContextT[Instr]) StartLoweringFunction(max ssa.BasicBlockID) { + imax := int(max) + if len(e.SsaBlockIDToLabels) <= imax { + // Eagerly allocate labels for the blocks since the underlying slice will be used for the next iteration. + e.SsaBlockIDToLabels = append(e.SsaBlockIDToLabels, make([]Label, imax+1)...) + } +} + +func (e *ExecutableContextT[Instr]) StartBlock(blk ssa.BasicBlock) { + e.CurrentSSABlk = blk + + l := e.SsaBlockIDToLabels[e.CurrentSSABlk.ID()] + if l == LabelInvalid { + l = e.AllocateLabel() + e.SsaBlockIDToLabels[blk.ID()] = l + } + + end := e.allocateNop0() + e.PerBlockHead, e.PerBlockEnd = end, end + + labelPos, ok := e.LabelPositions[l] + if !ok { + labelPos = e.AllocateLabelPosition(l) + e.LabelPositions[l] = labelPos + } + e.OrderedBlockLabels = append(e.OrderedBlockLabels, labelPos) + labelPos.Begin, labelPos.End = end, end + labelPos.SB = blk +} + +// EndBlock implements ExecutableContext. +func (e *ExecutableContextT[T]) EndBlock() { + // Insert nop0 as the head of the block for convenience to simplify the logic of inserting instructions. + e.insertAtPerBlockHead(e.allocateNop0()) + + l := e.SsaBlockIDToLabels[e.CurrentSSABlk.ID()] + e.LabelPositions[l].Begin = e.PerBlockHead + + if e.CurrentSSABlk.EntryBlock() { + e.RootInstr = e.PerBlockHead + } +} + +func (e *ExecutableContextT[T]) insertAtPerBlockHead(i *T) { + if e.PerBlockHead == nil { + e.PerBlockHead = i + e.PerBlockEnd = i + return + } + e.setNext(i, e.PerBlockHead) + e.setPrev(e.PerBlockHead, i) + e.PerBlockHead = i +} + +// FlushPendingInstructions implements ExecutableContext. +func (e *ExecutableContextT[T]) FlushPendingInstructions() { + l := len(e.PendingInstructions) + if l == 0 { + return + } + for i := l - 1; i >= 0; i-- { // reverse because we lower instructions in reverse order. + e.insertAtPerBlockHead(e.PendingInstructions[i]) + } + e.PendingInstructions = e.PendingInstructions[:0] +} + +func (e *ExecutableContextT[T]) Reset() { + e.labelPositionPool.Reset() + e.InstructionPool.Reset() + for l := Label(0); l <= e.NextLabel; l++ { + delete(e.LabelPositions, l) + } + e.PendingInstructions = e.PendingInstructions[:0] + e.OrderedBlockLabels = e.OrderedBlockLabels[:0] + e.RootInstr = nil + e.SsaBlockIDToLabels = e.SsaBlockIDToLabels[:0] + e.PerBlockHead, e.PerBlockEnd = nil, nil + e.NextLabel = LabelInvalid +} + +// AllocateLabel allocates an unused label. 
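+// NextLabel starts at LabelInvalid (0) and is incremented before being returned,
+// so LabelInvalid itself is never handed out.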
+func (e *ExecutableContextT[T]) AllocateLabel() Label { + e.NextLabel++ + return e.NextLabel +} + +func (e *ExecutableContextT[T]) AllocateLabelPosition(la Label) *LabelPosition[T] { + l := e.labelPositionPool.Allocate() + l.L = la + return l +} + +func (e *ExecutableContextT[T]) GetOrAllocateSSABlockLabel(blk ssa.BasicBlock) Label { + if blk.ReturnBlock() { + return LabelReturn + } + l := e.SsaBlockIDToLabels[blk.ID()] + if l == LabelInvalid { + l = e.AllocateLabel() + e.SsaBlockIDToLabels[blk.ID()] = l + } + return l +} + +func (e *ExecutableContextT[T]) allocateNop0() *T { + i := e.InstructionPool.Allocate() + e.asNop(i) + return i +} + +// LinkAdjacentBlocks implements backend.Machine. +func (e *ExecutableContextT[T]) LinkAdjacentBlocks(prev, next ssa.BasicBlock) { + prevLabelPos := e.LabelPositions[e.GetOrAllocateSSABlockLabel(prev)] + nextLabelPos := e.LabelPositions[e.GetOrAllocateSSABlockLabel(next)] + e.setNext(prevLabelPos.End, nextLabelPos.Begin) +} + +// LabelPosition represents the regions of the generated code which the label represents. +type LabelPosition[Instr any] struct { + SB ssa.BasicBlock + L Label + Begin, End *Instr + BinaryOffset int64 +} + +// Label represents a position in the generated code which is either +// a real instruction or the constant InstructionPool (e.g. jump tables). +// +// This is exactly the same as the traditional "label" in assembly code. +type Label uint32 + +const ( + LabelInvalid Label = 0 + LabelReturn Label = math.MaxUint32 +) + +// String implements backend.Machine. +func (l Label) String() string { + return fmt.Sprintf("L%d", l) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/go_call.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/go_call.go new file mode 100644 index 000000000..6fe6d7b3c --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/go_call.go @@ -0,0 +1,33 @@ +package backend + +import "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + +// GoFunctionCallRequiredStackSize returns the size of the stack required for the Go function call. +// argBegin is the index of the first argument in the signature which is not either execution context or module context. +func GoFunctionCallRequiredStackSize(sig *ssa.Signature, argBegin int) (ret, retUnaligned int64) { + var paramNeededInBytes, resultNeededInBytes int64 + for _, p := range sig.Params[argBegin:] { + s := int64(p.Size()) + if s < 8 { + s = 8 // We use uint64 for all basic types, except SIMD v128. + } + paramNeededInBytes += s + } + for _, r := range sig.Results { + s := int64(r.Size()) + if s < 8 { + s = 8 // We use uint64 for all basic types, except SIMD v128. + } + resultNeededInBytes += s + } + + if paramNeededInBytes > resultNeededInBytes { + ret = paramNeededInBytes + } else { + ret = resultNeededInBytes + } + retUnaligned = ret + // Align to 16 bytes. 
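+ // (x + 15) &^ 15 rounds x up to the next multiple of 16, e.g. 24 -> 32, 32 -> 32.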
+ ret = (ret + 15) &^ 15 + return +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi.go new file mode 100644 index 000000000..130f8c621 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi.go @@ -0,0 +1,186 @@ +package amd64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// For the details of the ABI, see: +// https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/cmd/compile/abi-internal.md#amd64-architecture + +var ( + intArgResultRegs = []regalloc.RealReg{rax, rbx, rcx, rdi, rsi, r8, r9, r10, r11} + floatArgResultRegs = []regalloc.RealReg{xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7} +) + +var regInfo = ®alloc.RegisterInfo{ + AllocatableRegisters: [regalloc.NumRegType][]regalloc.RealReg{ + regalloc.RegTypeInt: { + rax, rcx, rdx, rbx, rsi, rdi, r8, r9, r10, r11, r12, r13, r14, r15, + }, + regalloc.RegTypeFloat: { + xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, + }, + }, + CalleeSavedRegisters: regalloc.NewRegSet( + rdx, r12, r13, r14, r15, + xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, + ), + CallerSavedRegisters: regalloc.NewRegSet( + rax, rcx, rbx, rsi, rdi, r8, r9, r10, r11, + xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, + ), + RealRegToVReg: []regalloc.VReg{ + rax: raxVReg, rcx: rcxVReg, rdx: rdxVReg, rbx: rbxVReg, rsp: rspVReg, rbp: rbpVReg, rsi: rsiVReg, rdi: rdiVReg, + r8: r8VReg, r9: r9VReg, r10: r10VReg, r11: r11VReg, r12: r12VReg, r13: r13VReg, r14: r14VReg, r15: r15VReg, + xmm0: xmm0VReg, xmm1: xmm1VReg, xmm2: xmm2VReg, xmm3: xmm3VReg, xmm4: xmm4VReg, xmm5: xmm5VReg, xmm6: xmm6VReg, + xmm7: xmm7VReg, xmm8: xmm8VReg, xmm9: xmm9VReg, xmm10: xmm10VReg, xmm11: xmm11VReg, xmm12: xmm12VReg, + xmm13: xmm13VReg, xmm14: xmm14VReg, xmm15: xmm15VReg, + }, + RealRegName: func(r regalloc.RealReg) string { return regNames[r] }, + RealRegType: func(r regalloc.RealReg) regalloc.RegType { + if r < xmm0 { + return regalloc.RegTypeInt + } + return regalloc.RegTypeFloat + }, +} + +// ArgsResultsRegs implements backend.Machine. +func (m *machine) ArgsResultsRegs() (argResultInts, argResultFloats []regalloc.RealReg) { + return intArgResultRegs, floatArgResultRegs +} + +// LowerParams implements backend.Machine. +func (m *machine) LowerParams(args []ssa.Value) { + a := m.currentABI + + for i, ssaArg := range args { + if !ssaArg.Valid() { + continue + } + reg := m.c.VRegOf(ssaArg) + arg := &a.Args[i] + if arg.Kind == backend.ABIArgKindReg { + m.InsertMove(reg, arg.Reg, arg.Type) + } else { + // + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | ReturnAddress | + // | Caller_RBP | + // +-----------------+ <-- RBP + // | ........... | + // | clobbered M | + // | ............ | + // | clobbered 0 | + // | spill slot N | + // | ........... | + // | spill slot 0 | + // RSP--> +-----------------+ + // (low address) + + // Load the value from the arg stack slot above the current RBP. 
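+ // The +16 skips the ReturnAddress and Caller_RBP slots shown in the
+ // diagram above.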
+ load := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmRBPReg(uint32(arg.Offset + 16))) + switch arg.Type { + case ssa.TypeI32: + load.asMovzxRmR(extModeLQ, mem, reg) + case ssa.TypeI64: + load.asMov64MR(mem, reg) + case ssa.TypeF32: + load.asXmmUnaryRmR(sseOpcodeMovss, mem, reg) + case ssa.TypeF64: + load.asXmmUnaryRmR(sseOpcodeMovsd, mem, reg) + case ssa.TypeV128: + load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, reg) + default: + panic("BUG") + } + m.insert(load) + } + } +} + +// LowerReturns implements backend.Machine. +func (m *machine) LowerReturns(rets []ssa.Value) { + // Load the XMM registers first as it might need a temporary register to inline + // constant return. + a := m.currentABI + for i, ret := range rets { + r := &a.Rets[i] + if !r.Type.IsInt() { + m.LowerReturn(ret, r) + } + } + // Then load the GPR registers. + for i, ret := range rets { + r := &a.Rets[i] + if r.Type.IsInt() { + m.LowerReturn(ret, r) + } + } +} + +func (m *machine) LowerReturn(ret ssa.Value, r *backend.ABIArg) { + reg := m.c.VRegOf(ret) + if def := m.c.ValueDefinition(ret); def.IsFromInstr() { + // Constant instructions are inlined. + if inst := def.Instr; inst.Constant() { + m.insertLoadConstant(inst, reg) + } + } + if r.Kind == backend.ABIArgKindReg { + m.InsertMove(r.Reg, reg, ret.Type()) + } else { + // + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | ReturnAddress | + // | Caller_RBP | + // +-----------------+ <-- RBP + // | ........... | + // | clobbered M | + // | ............ | + // | clobbered 0 | + // | spill slot N | + // | ........... | + // | spill slot 0 | + // RSP--> +-----------------+ + // (low address) + + // Store the value to the return stack slot above the current RBP. + store := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmRBPReg(uint32(m.currentABI.ArgStackSize + 16 + r.Offset))) + switch r.Type { + case ssa.TypeI32: + store.asMovRM(reg, mem, 4) + case ssa.TypeI64: + store.asMovRM(reg, mem, 8) + case ssa.TypeF32: + store.asXmmMovRM(sseOpcodeMovss, reg, mem) + case ssa.TypeF64: + store.asXmmMovRM(sseOpcodeMovsd, reg, mem) + case ssa.TypeV128: + store.asXmmMovRM(sseOpcodeMovdqu, reg, mem) + } + m.insert(store) + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.go new file mode 100644 index 000000000..cbf1cfdc5 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.go @@ -0,0 +1,9 @@ +package amd64 + +// entrypoint enters the machine code generated by this backend which begins with the preamble generated by functionABI.EmitGoEntryPreamble below. +// This implements wazevo.entrypoint, and see the comments there for detail. +func entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultPtr *uint64, goAllocatedStackSlicePtr uintptr) + +// afterGoFunctionCallEntrypoint enters the machine code after growing the stack. +// This implements wazevo.afterGoFunctionCallEntrypoint, and see the comments there for detail. 
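+// Both functions are implemented in assembly in abi_entry_amd64.s below.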
+func afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.s b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.s new file mode 100644 index 000000000..e9cb131d1 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.s @@ -0,0 +1,29 @@ +#include "funcdata.h" +#include "textflag.h" + +// entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultPtr *uint64, goAllocatedStackSlicePtr uintptr +TEXT ·entrypoint(SB), NOSPLIT|NOFRAME, $0-48 + MOVQ preambleExecutable+0(FP), R11 + MOVQ functionExectuable+8(FP), R14 + MOVQ executionContextPtr+16(FP), AX // First argument is passed in AX. + MOVQ moduleContextPtr+24(FP), BX // Second argument is passed in BX. + MOVQ paramResultSlicePtr+32(FP), R12 + MOVQ goAllocatedStackSlicePtr+40(FP), R13 + JMP R11 + +// afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr) +TEXT ·afterGoFunctionCallEntrypoint(SB), NOSPLIT|NOFRAME, $0-32 + MOVQ executable+0(FP), CX + MOVQ executionContextPtr+8(FP), AX // First argument is passed in AX. + + // Save the stack pointer and frame pointer. + MOVQ BP, 16(AX) // 16 == ExecutionContextOffsetOriginalFramePointer + MOVQ SP, 24(AX) // 24 == ExecutionContextOffsetOriginalStackPointer + + // Then set the stack pointer and frame pointer to the values we got from the Go runtime. + MOVQ framePointer+24(FP), BP + + // WARNING: do not update SP before BP, because the Go translates (FP) as (SP) + 8. + MOVQ stackPointer+16(FP), SP + + JMP CX diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_preamble.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_preamble.go new file mode 100644 index 000000000..882d06c06 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_preamble.go @@ -0,0 +1,248 @@ +package amd64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +var ( + executionContextPtrReg = raxVReg + + // Followings are callee saved registers. They can be used freely in the entry preamble + // since the preamble is called via Go assembly function which has stack-based ABI. + + // savedExecutionContextPtr also must be a callee-saved reg so that they can be used in the prologue and epilogue. + savedExecutionContextPtr = rdxVReg + // paramResultSlicePtr must match with entrypoint function in abi_entry_amd64.s. + paramResultSlicePtr = r12VReg + // goAllocatedStackPtr must match with entrypoint function in abi_entry_amd64.s. + goAllocatedStackPtr = r13VReg + // functionExecutable must match with entrypoint function in abi_entry_amd64.s. + functionExecutable = r14VReg + tmpIntReg = r15VReg + tmpXmmReg = xmm15VReg +) + +// CompileEntryPreamble implements backend.Machine. 
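+// The generated preamble copies the parameters from the Go-side parameter/result slice
+// into their ABI locations, calls the actual function, writes the results back into
+// the same slice, and finally restores the original RSP/RBP.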
+func (m *machine) CompileEntryPreamble(sig *ssa.Signature) []byte { + root := m.compileEntryPreamble(sig) + m.encodeWithoutSSA(root) + buf := m.c.Buf() + return buf +} + +func (m *machine) compileEntryPreamble(sig *ssa.Signature) *instruction { + abi := backend.FunctionABI{} + abi.Init(sig, intArgResultRegs, floatArgResultRegs) + + root := m.allocateNop() + + //// ----------------------------------- prologue ----------------------------------- //// + + // First, we save executionContextPtrReg into a callee-saved register so that it can be used in epilogue as well. + // mov %executionContextPtrReg, %savedExecutionContextPtr + cur := m.move64(executionContextPtrReg, savedExecutionContextPtr, root) + + // Next is to save the original RBP and RSP into the execution context. + cur = m.saveOriginalRSPRBP(cur) + + // Now set the RSP to the Go-allocated stack pointer. + // mov %goAllocatedStackPtr, %rsp + cur = m.move64(goAllocatedStackPtr, rspVReg, cur) + + if stackSlotSize := abi.AlignedArgResultStackSlotSize(); stackSlotSize > 0 { + // Allocate stack slots for the arguments and return values. + // sub $stackSlotSize, %rsp + spDec := m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(uint32(stackSlotSize)), rspVReg, true) + cur = linkInstr(cur, spDec) + } + + var offset uint32 + for i := range abi.Args { + if i < 2 { + // module context ptr and execution context ptr are passed in rax and rbx by the Go assembly function. + continue + } + arg := &abi.Args[i] + cur = m.goEntryPreamblePassArg(cur, paramResultSlicePtr, offset, arg) + if arg.Type == ssa.TypeV128 { + offset += 16 + } else { + offset += 8 + } + } + + // Zero out RBP so that the unwind/stack growth code can correctly detect the end of the stack. + zerosRbp := m.allocateInstr().asAluRmiR(aluRmiROpcodeXor, newOperandReg(rbpVReg), rbpVReg, true) + cur = linkInstr(cur, zerosRbp) + + // Now ready to call the real function. Note that at this point stack pointer is already set to the Go-allocated, + // which is aligned to 16 bytes. + call := m.allocateInstr().asCallIndirect(newOperandReg(functionExecutable), &abi) + cur = linkInstr(cur, call) + + //// ----------------------------------- epilogue ----------------------------------- //// + + // Read the results from regs and the stack, and set them correctly into the paramResultSlicePtr. + offset = 0 + for i := range abi.Rets { + r := &abi.Rets[i] + cur = m.goEntryPreamblePassResult(cur, paramResultSlicePtr, offset, r, uint32(abi.ArgStackSize)) + if r.Type == ssa.TypeV128 { + offset += 16 + } else { + offset += 8 + } + } + + // Finally, restore the original RBP and RSP. + cur = m.restoreOriginalRSPRBP(cur) + + ret := m.allocateInstr().asRet() + linkInstr(cur, ret) + return root +} + +// saveOriginalRSPRBP saves the original RSP and RBP into the execution context. +func (m *machine) saveOriginalRSPRBP(cur *instruction) *instruction { + // mov %rbp, wazevoapi.ExecutionContextOffsetOriginalFramePointer(%executionContextPtrReg) + // mov %rsp, wazevoapi.ExecutionContextOffsetOriginalStackPointer(%executionContextPtrReg) + cur = m.loadOrStore64AtExecutionCtx(executionContextPtrReg, wazevoapi.ExecutionContextOffsetOriginalFramePointer, rbpVReg, true, cur) + cur = m.loadOrStore64AtExecutionCtx(executionContextPtrReg, wazevoapi.ExecutionContextOffsetOriginalStackPointer, rspVReg, true, cur) + return cur +} + +// restoreOriginalRSPRBP restores the original RSP and RBP from the execution context. 
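+// It reads them via savedExecutionContextPtr rather than RAX, since RAX is
+// caller-saved and may have been clobbered by the call.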
+func (m *machine) restoreOriginalRSPRBP(cur *instruction) *instruction { + // mov wazevoapi.ExecutionContextOffsetOriginalFramePointer(%executionContextPtrReg), %rbp + // mov wazevoapi.ExecutionContextOffsetOriginalStackPointer(%executionContextPtrReg), %rsp + cur = m.loadOrStore64AtExecutionCtx(savedExecutionContextPtr, wazevoapi.ExecutionContextOffsetOriginalFramePointer, rbpVReg, false, cur) + cur = m.loadOrStore64AtExecutionCtx(savedExecutionContextPtr, wazevoapi.ExecutionContextOffsetOriginalStackPointer, rspVReg, false, cur) + return cur +} + +func (m *machine) move64(src, dst regalloc.VReg, prev *instruction) *instruction { + mov := m.allocateInstr().asMovRR(src, dst, true) + return linkInstr(prev, mov) +} + +func (m *machine) loadOrStore64AtExecutionCtx(execCtx regalloc.VReg, offset wazevoapi.Offset, r regalloc.VReg, store bool, prev *instruction) *instruction { + mem := newOperandMem(m.newAmodeImmReg(offset.U32(), execCtx)) + instr := m.allocateInstr() + if store { + instr.asMovRM(r, mem, 8) + } else { + instr.asMov64MR(mem, r) + } + return linkInstr(prev, instr) +} + +// This is for debugging. +func (m *machine) linkUD2(cur *instruction) *instruction { //nolint + return linkInstr(cur, m.allocateInstr().asUD2()) +} + +func (m *machine) goEntryPreamblePassArg(cur *instruction, paramSlicePtr regalloc.VReg, offsetInParamSlice uint32, arg *backend.ABIArg) *instruction { + var dst regalloc.VReg + argTyp := arg.Type + if arg.Kind == backend.ABIArgKindStack { + // Caller saved registers ca + switch argTyp { + case ssa.TypeI32, ssa.TypeI64: + dst = tmpIntReg + case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: + dst = tmpXmmReg + default: + panic("BUG") + } + } else { + dst = arg.Reg + } + + load := m.allocateInstr() + a := newOperandMem(m.newAmodeImmReg(offsetInParamSlice, paramSlicePtr)) + switch arg.Type { + case ssa.TypeI32: + load.asMovzxRmR(extModeLQ, a, dst) + case ssa.TypeI64: + load.asMov64MR(a, dst) + case ssa.TypeF32: + load.asXmmUnaryRmR(sseOpcodeMovss, a, dst) + case ssa.TypeF64: + load.asXmmUnaryRmR(sseOpcodeMovsd, a, dst) + case ssa.TypeV128: + load.asXmmUnaryRmR(sseOpcodeMovdqu, a, dst) + } + + cur = linkInstr(cur, load) + if arg.Kind == backend.ABIArgKindStack { + // Store back to the stack. + store := m.allocateInstr() + a := newOperandMem(m.newAmodeImmReg(uint32(arg.Offset), rspVReg)) + switch arg.Type { + case ssa.TypeI32: + store.asMovRM(dst, a, 4) + case ssa.TypeI64: + store.asMovRM(dst, a, 8) + case ssa.TypeF32: + store.asXmmMovRM(sseOpcodeMovss, dst, a) + case ssa.TypeF64: + store.asXmmMovRM(sseOpcodeMovsd, dst, a) + case ssa.TypeV128: + store.asXmmMovRM(sseOpcodeMovdqu, dst, a) + } + cur = linkInstr(cur, store) + } + return cur +} + +func (m *machine) goEntryPreamblePassResult(cur *instruction, resultSlicePtr regalloc.VReg, offsetInResultSlice uint32, result *backend.ABIArg, resultStackSlotBeginOffset uint32) *instruction { + var r regalloc.VReg + if result.Kind == backend.ABIArgKindStack { + // Load the value to the temporary. 
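+ // Stack-based results are first loaded into tmpIntReg/tmpXmmReg and then
+ // stored into the result slice below.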
+ load := m.allocateInstr() + offset := resultStackSlotBeginOffset + uint32(result.Offset) + a := newOperandMem(m.newAmodeImmReg(offset, rspVReg)) + switch result.Type { + case ssa.TypeI32: + r = tmpIntReg + load.asMovzxRmR(extModeLQ, a, r) + case ssa.TypeI64: + r = tmpIntReg + load.asMov64MR(a, r) + case ssa.TypeF32: + r = tmpXmmReg + load.asXmmUnaryRmR(sseOpcodeMovss, a, r) + case ssa.TypeF64: + r = tmpXmmReg + load.asXmmUnaryRmR(sseOpcodeMovsd, a, r) + case ssa.TypeV128: + r = tmpXmmReg + load.asXmmUnaryRmR(sseOpcodeMovdqu, a, r) + default: + panic("BUG") + } + cur = linkInstr(cur, load) + } else { + r = result.Reg + } + + store := m.allocateInstr() + a := newOperandMem(m.newAmodeImmReg(offsetInResultSlice, resultSlicePtr)) + switch result.Type { + case ssa.TypeI32: + store.asMovRM(r, a, 4) + case ssa.TypeI64: + store.asMovRM(r, a, 8) + case ssa.TypeF32: + store.asXmmMovRM(sseOpcodeMovss, r, a) + case ssa.TypeF64: + store.asXmmMovRM(sseOpcodeMovsd, r, a) + case ssa.TypeV128: + store.asXmmMovRM(sseOpcodeMovdqu, r, a) + } + + return linkInstr(cur, store) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_go_call.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_go_call.go new file mode 100644 index 000000000..751050aff --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_go_call.go @@ -0,0 +1,443 @@ +package amd64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +var calleeSavedVRegs = []regalloc.VReg{ + rdxVReg, r12VReg, r13VReg, r14VReg, r15VReg, + xmm8VReg, xmm9VReg, xmm10VReg, xmm11VReg, xmm12VReg, xmm13VReg, xmm14VReg, xmm15VReg, +} + +// CompileGoFunctionTrampoline implements backend.Machine. +func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte { + ectx := m.ectx + argBegin := 1 // Skips exec context by default. + if needModuleContextPtr { + argBegin++ + } + + abi := &backend.FunctionABI{} + abi.Init(sig, intArgResultRegs, floatArgResultRegs) + m.currentABI = abi + + cur := m.allocateNop() + ectx.RootInstr = cur + + // Execution context is always the first argument. + execCtrPtr := raxVReg + + // First we update RBP and RSP just like the normal prologue. + // + // (high address) (high address) + // RBP ----> +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | ====> | ....... | + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | Return Addr | | Return Addr | + // RSP ----> +-----------------+ | Caller_RBP | + // (low address) +-----------------+ <----- RSP, RBP + // + cur = m.setupRBPRSP(cur) + + goSliceSizeAligned, goSliceSizeAlignedUnaligned := backend.GoFunctionCallRequiredStackSize(sig, argBegin) + cur = m.insertStackBoundsCheck(goSliceSizeAligned+8 /* size of the Go slice */, cur) + + // Save the callee saved registers. + cur = m.saveRegistersInExecutionContext(cur, execCtrPtr, calleeSavedVRegs) + + if needModuleContextPtr { + moduleCtrPtr := rbxVReg // Module context is always the second argument. 
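+ // Stash the module context pointer in the execution context
+ // (GoFunctionCallCalleeModuleContextOpaque) so the Go side of the call can retrieve it.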
+ mem := m.newAmodeImmReg( + wazevoapi.ExecutionContextOffsetGoFunctionCallCalleeModuleContextOpaque.U32(), + execCtrPtr) + store := m.allocateInstr().asMovRM(moduleCtrPtr, newOperandMem(mem), 8) + cur = linkInstr(cur, store) + } + + // Now let's advance the RSP to the stack slot for the arguments. + // + // (high address) (high address) + // +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | =======> | ....... | + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | Return Addr | | Return Addr | + // | Caller_RBP | | Caller_RBP | + // RBP,RSP --> +-----------------+ +-----------------+ <----- RBP + // (low address) | arg[N]/ret[M] | + // | .......... | + // | arg[1]/ret[1] | + // | arg[0]/ret[0] | + // +-----------------+ <----- RSP + // (low address) + // + // where the region of "arg[0]/ret[0] ... arg[N]/ret[M]" is the stack used by the Go functions, + // therefore will be accessed as the usual []uint64. So that's where we need to pass/receive + // the arguments/return values to/from Go function. + cur = m.addRSP(-int32(goSliceSizeAligned), cur) + + // Next, we need to store all the arguments to the stack in the typical Wasm stack style. + var offsetInGoSlice int32 + for i := range abi.Args[argBegin:] { + arg := &abi.Args[argBegin+i] + var v regalloc.VReg + if arg.Kind == backend.ABIArgKindReg { + v = arg.Reg + } else { + // We have saved callee saved registers, so we can use them. + if arg.Type.IsInt() { + v = r15VReg + } else { + v = xmm15VReg + } + mem := newOperandMem(m.newAmodeImmReg(uint32(arg.Offset+16 /* to skip caller_rbp and ret_addr */), rbpVReg)) + load := m.allocateInstr() + switch arg.Type { + case ssa.TypeI32: + load.asMovzxRmR(extModeLQ, mem, v) + case ssa.TypeI64: + load.asMov64MR(mem, v) + case ssa.TypeF32: + load.asXmmUnaryRmR(sseOpcodeMovss, mem, v) + case ssa.TypeF64: + load.asXmmUnaryRmR(sseOpcodeMovsd, mem, v) + case ssa.TypeV128: + load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v) + default: + panic("BUG") + } + cur = linkInstr(cur, load) + } + + store := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmReg(uint32(offsetInGoSlice), rspVReg)) + switch arg.Type { + case ssa.TypeI32: + store.asMovRM(v, mem, 4) + offsetInGoSlice += 8 // always uint64 rep. + case ssa.TypeI64: + store.asMovRM(v, mem, 8) + offsetInGoSlice += 8 + case ssa.TypeF32: + store.asXmmMovRM(sseOpcodeMovss, v, mem) + offsetInGoSlice += 8 // always uint64 rep. + case ssa.TypeF64: + store.asXmmMovRM(sseOpcodeMovsd, v, mem) + offsetInGoSlice += 8 + case ssa.TypeV128: + store.asXmmMovRM(sseOpcodeMovdqu, v, mem) + offsetInGoSlice += 16 + default: + panic("BUG") + } + cur = linkInstr(cur, store) + } + + // Finally we push the size of the slice to the stack so the stack looks like: + // + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | Return Addr | + // | Caller_RBP | + // +-----------------+ <----- RBP + // | arg[N]/ret[M] | + // | .......... | + // | arg[1]/ret[1] | + // | arg[0]/ret[0] | + // | slice size | + // +-----------------+ <----- RSP + // (low address) + // + // push $sliceSize + cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandImm32(uint32(goSliceSizeAlignedUnaligned)))) + + // Load the exitCode to the register. + exitCodeReg := r12VReg // Callee saved which is already saved. 
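+ // Materialize the exit code in r12, then record it together with the current
+ // RSP/RBP in the execution context before exiting to the Go runtime.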
+ cur = linkInstr(cur, m.allocateInstr().asImm(exitCodeReg, uint64(exitCode), false)) + + saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtrPtr, exitCodeReg) + cur = linkInstr(cur, setExitCode) + cur = linkInstr(cur, saveRsp) + cur = linkInstr(cur, saveRbp) + + // Ready to exit the execution. + cur = m.storeReturnAddressAndExit(cur, execCtrPtr) + + // We don't need the slice size anymore, so pop it. + cur = m.addRSP(8, cur) + + // Ready to set up the results. + offsetInGoSlice = 0 + // To avoid overwriting with the execution context pointer by the result, we need to track the offset, + // and defer the restoration of the result to the end of this function. + var argOverlapWithExecCtxOffset int32 = -1 + for i := range abi.Rets { + r := &abi.Rets[i] + var v regalloc.VReg + isRegResult := r.Kind == backend.ABIArgKindReg + if isRegResult { + v = r.Reg + if v.RealReg() == execCtrPtr.RealReg() { + argOverlapWithExecCtxOffset = offsetInGoSlice + offsetInGoSlice += 8 // always uint64 rep. + continue + } + } else { + if r.Type.IsInt() { + v = r15VReg + } else { + v = xmm15VReg + } + } + + load := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmReg(uint32(offsetInGoSlice), rspVReg)) + switch r.Type { + case ssa.TypeI32: + load.asMovzxRmR(extModeLQ, mem, v) + offsetInGoSlice += 8 // always uint64 rep. + case ssa.TypeI64: + load.asMov64MR(mem, v) + offsetInGoSlice += 8 + case ssa.TypeF32: + load.asXmmUnaryRmR(sseOpcodeMovss, mem, v) + offsetInGoSlice += 8 // always uint64 rep. + case ssa.TypeF64: + load.asXmmUnaryRmR(sseOpcodeMovsd, mem, v) + offsetInGoSlice += 8 + case ssa.TypeV128: + load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v) + offsetInGoSlice += 16 + default: + panic("BUG") + } + cur = linkInstr(cur, load) + + if !isRegResult { + // We need to store it back to the result slot above rbp. + store := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmReg(uint32(abi.ArgStackSize+r.Offset+16 /* to skip caller_rbp and ret_addr */), rbpVReg)) + switch r.Type { + case ssa.TypeI32: + store.asMovRM(v, mem, 4) + case ssa.TypeI64: + store.asMovRM(v, mem, 8) + case ssa.TypeF32: + store.asXmmMovRM(sseOpcodeMovss, v, mem) + case ssa.TypeF64: + store.asXmmMovRM(sseOpcodeMovsd, v, mem) + case ssa.TypeV128: + store.asXmmMovRM(sseOpcodeMovdqu, v, mem) + default: + panic("BUG") + } + cur = linkInstr(cur, store) + } + } + + // Before return, we need to restore the callee saved registers. + cur = m.restoreRegistersInExecutionContext(cur, execCtrPtr, calleeSavedVRegs) + + if argOverlapWithExecCtxOffset >= 0 { + // At this point execCtt is not used anymore, so we can finally store the + // result to the register which overlaps with the execution context pointer. + mem := newOperandMem(m.newAmodeImmReg(uint32(argOverlapWithExecCtxOffset), rspVReg)) + load := m.allocateInstr().asMov64MR(mem, execCtrPtr) + cur = linkInstr(cur, load) + } + + // Finally ready to return. 
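+ // revertRBPRSP undoes setupRBPRSP from the prologue so that ret returns to the caller.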
+ cur = m.revertRBPRSP(cur) + linkInstr(cur, m.allocateInstr().asRet()) + + m.encodeWithoutSSA(ectx.RootInstr) + return m.c.Buf() +} + +func (m *machine) saveRegistersInExecutionContext(cur *instruction, execCtx regalloc.VReg, regs []regalloc.VReg) *instruction { + offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64() + for _, v := range regs { + store := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmReg(uint32(offset), execCtx)) + switch v.RegType() { + case regalloc.RegTypeInt: + store.asMovRM(v, mem, 8) + case regalloc.RegTypeFloat: + store.asXmmMovRM(sseOpcodeMovdqu, v, mem) + default: + panic("BUG") + } + cur = linkInstr(cur, store) + offset += 16 // See execution context struct. Each register is 16 bytes-aligned unconditionally. + } + return cur +} + +func (m *machine) restoreRegistersInExecutionContext(cur *instruction, execCtx regalloc.VReg, regs []regalloc.VReg) *instruction { + offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64() + for _, v := range regs { + load := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmReg(uint32(offset), execCtx)) + switch v.RegType() { + case regalloc.RegTypeInt: + load.asMov64MR(mem, v) + case regalloc.RegTypeFloat: + load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v) + default: + panic("BUG") + } + cur = linkInstr(cur, load) + offset += 16 // See execution context struct. Each register is 16 bytes-aligned unconditionally. + } + return cur +} + +func (m *machine) storeReturnAddressAndExit(cur *instruction, execCtx regalloc.VReg) *instruction { + readRip := m.allocateInstr() + cur = linkInstr(cur, readRip) + + ripReg := r12VReg // Callee saved which is already saved. + saveRip := m.allocateInstr().asMovRM( + ripReg, + newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetGoCallReturnAddress.U32(), execCtx)), + 8, + ) + cur = linkInstr(cur, saveRip) + + exit := m.allocateExitSeq(execCtx) + cur = linkInstr(cur, exit) + + nop, l := m.allocateBrTarget() + cur = linkInstr(cur, nop) + readRip.asLEA(newOperandLabel(l), ripReg) + return cur +} + +// saveRequiredRegs is the set of registers that must be saved/restored during growing stack when there's insufficient +// stack space left. Basically this is the all allocatable registers except for RSP and RBP, and RAX which contains the +// execution context pointer. ExecCtx pointer is always the first argument so we don't need to save it. +var stackGrowSaveVRegs = []regalloc.VReg{ + rdxVReg, r12VReg, r13VReg, r14VReg, r15VReg, + rcxVReg, rbxVReg, rsiVReg, rdiVReg, r8VReg, r9VReg, r10VReg, r11VReg, + xmm8VReg, xmm9VReg, xmm10VReg, xmm11VReg, xmm12VReg, xmm13VReg, xmm14VReg, xmm15VReg, + xmm0VReg, xmm1VReg, xmm2VReg, xmm3VReg, xmm4VReg, xmm5VReg, xmm6VReg, xmm7VReg, +} + +// CompileStackGrowCallSequence implements backend.Machine. +func (m *machine) CompileStackGrowCallSequence() []byte { + ectx := m.ectx + + cur := m.allocateNop() + ectx.RootInstr = cur + + cur = m.setupRBPRSP(cur) + + // Execution context is always the first argument. + execCtrPtr := raxVReg + + // Save the callee saved and argument registers. + cur = m.saveRegistersInExecutionContext(cur, execCtrPtr, stackGrowSaveVRegs) + + // Load the exitCode to the register. + exitCodeReg := r12VReg // Already saved. 
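+ // Same exit protocol as CompileGoFunctionTrampoline, but with the fixed
+ // ExitCodeGrowStack exit code.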
+ cur = linkInstr(cur, m.allocateInstr().asImm(exitCodeReg, uint64(wazevoapi.ExitCodeGrowStack), false)) + + saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtrPtr, exitCodeReg) + cur = linkInstr(cur, setExitCode) + cur = linkInstr(cur, saveRsp) + cur = linkInstr(cur, saveRbp) + + // Ready to exit the execution. + cur = m.storeReturnAddressAndExit(cur, execCtrPtr) + + // After the exit, restore the saved registers. + cur = m.restoreRegistersInExecutionContext(cur, execCtrPtr, stackGrowSaveVRegs) + + // Finally ready to return. + cur = m.revertRBPRSP(cur) + linkInstr(cur, m.allocateInstr().asRet()) + + m.encodeWithoutSSA(ectx.RootInstr) + return m.c.Buf() +} + +// insertStackBoundsCheck will insert the instructions after `cur` to check the +// stack bounds, and if there's no sufficient spaces required for the function, +// exit the execution and try growing it in Go world. +func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instruction) *instruction { + // add $requiredStackSize, %rsp ;; Temporarily update the sp. + // cmp ExecutionContextOffsetStackBottomPtr(%rax), %rsp ;; Compare the stack bottom and the sp. + // ja .ok + // sub $requiredStackSize, %rsp ;; Reverse the temporary update. + // pushq r15 ;; save the temporary. + // mov $requiredStackSize, %r15 + // mov %15, ExecutionContextOffsetStackGrowRequiredSize(%rax) ;; Set the required size in the execution context. + // popq r15 ;; restore the temporary. + // callq *ExecutionContextOffsetStackGrowCallTrampolineAddress(%rax) ;; Call the Go function to grow the stack. + // jmp .cont + // .ok: + // sub $requiredStackSize, %rsp ;; Reverse the temporary update. + // .cont: + cur = m.addRSP(-int32(requiredStackSize), cur) + cur = linkInstr(cur, m.allocateInstr().asCmpRmiR(true, + newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackBottomPtr.U32(), raxVReg)), + rspVReg, true)) + + ja := m.allocateInstr() + cur = linkInstr(cur, ja) + + cur = m.addRSP(int32(requiredStackSize), cur) + + // Save the temporary. + + cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(r15VReg))) + // Load the required size to the temporary. + cur = linkInstr(cur, m.allocateInstr().asImm(r15VReg, uint64(requiredStackSize), true)) + // Set the required size in the execution context. + cur = linkInstr(cur, m.allocateInstr().asMovRM(r15VReg, + newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.U32(), raxVReg)), 8)) + // Restore the temporary. + cur = linkInstr(cur, m.allocateInstr().asPop64(r15VReg)) + // Call the Go function to grow the stack. + cur = linkInstr(cur, m.allocateInstr().asCallIndirect(newOperandMem(m.newAmodeImmReg( + wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.U32(), raxVReg)), nil)) + // Jump to the continuation. + jmpToCont := m.allocateInstr() + cur = linkInstr(cur, jmpToCont) + + // .ok: + okInstr, ok := m.allocateBrTarget() + cur = linkInstr(cur, okInstr) + ja.asJmpIf(condNBE, newOperandLabel(ok)) + // On the ok path, we only need to reverse the temporary update. 
+ cur = m.addRSP(int32(requiredStackSize), cur) + + // .cont: + contInstr, cont := m.allocateBrTarget() + cur = linkInstr(cur, contInstr) + jmpToCont.asJmp(newOperandLabel(cont)) + + return cur +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/cond.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/cond.go new file mode 100644 index 000000000..75cbeab75 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/cond.go @@ -0,0 +1,168 @@ +package amd64 + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +type cond byte + +const ( + // condO represents (overflow) condition. + condO cond = iota + // condNO represents (no overflow) condition. + condNO + // condB represents (< unsigned) condition. + condB + // condNB represents (>= unsigned) condition. + condNB + // condZ represents (zero) condition. + condZ + // condNZ represents (not-zero) condition. + condNZ + // condBE represents (<= unsigned) condition. + condBE + // condNBE represents (> unsigned) condition. + condNBE + // condS represents (negative) condition. + condS + // condNS represents (not-negative) condition. + condNS + // condP represents (parity) condition. + condP + // condNP represents (not parity) condition. + condNP + // condL represents (< signed) condition. + condL + // condNL represents (>= signed) condition. + condNL + // condLE represents (<= signed) condition. + condLE + // condNLE represents (> signed) condition. + condNLE + + condInvalid +) + +func (c cond) String() string { + switch c { + case condO: + return "o" + case condNO: + return "no" + case condB: + return "b" + case condNB: + return "nb" + case condZ: + return "z" + case condNZ: + return "nz" + case condBE: + return "be" + case condNBE: + return "nbe" + case condS: + return "s" + case condNS: + return "ns" + case condL: + return "l" + case condNL: + return "nl" + case condLE: + return "le" + case condNLE: + return "nle" + case condP: + return "p" + case condNP: + return "np" + default: + panic("unreachable") + } +} + +func condFromSSAIntCmpCond(origin ssa.IntegerCmpCond) cond { + switch origin { + case ssa.IntegerCmpCondEqual: + return condZ + case ssa.IntegerCmpCondNotEqual: + return condNZ + case ssa.IntegerCmpCondSignedLessThan: + return condL + case ssa.IntegerCmpCondSignedGreaterThanOrEqual: + return condNL + case ssa.IntegerCmpCondSignedGreaterThan: + return condNLE + case ssa.IntegerCmpCondSignedLessThanOrEqual: + return condLE + case ssa.IntegerCmpCondUnsignedLessThan: + return condB + case ssa.IntegerCmpCondUnsignedGreaterThanOrEqual: + return condNB + case ssa.IntegerCmpCondUnsignedGreaterThan: + return condNBE + case ssa.IntegerCmpCondUnsignedLessThanOrEqual: + return condBE + default: + panic("unreachable") + } +} + +func condFromSSAFloatCmpCond(origin ssa.FloatCmpCond) cond { + switch origin { + case ssa.FloatCmpCondGreaterThanOrEqual: + return condNB + case ssa.FloatCmpCondGreaterThan: + return condNBE + case ssa.FloatCmpCondEqual, ssa.FloatCmpCondNotEqual, ssa.FloatCmpCondLessThan, ssa.FloatCmpCondLessThanOrEqual: + panic(fmt.Sprintf("cond %s must be treated as a special case", origin)) + default: + panic("unreachable") + } +} + +func (c cond) encoding() byte { + return byte(c) +} + +func (c cond) invert() cond { + switch c { + case condO: + return condNO + case condNO: + return condO + case condB: + return condNB + case condNB: + return condB + case condZ: + return condNZ + case condNZ: + 
return condZ + case condBE: + return condNBE + case condNBE: + return condBE + case condS: + return condNS + case condNS: + return condS + case condP: + return condNP + case condNP: + return condP + case condL: + return condNL + case condNL: + return condL + case condLE: + return condNLE + case condNLE: + return condLE + default: + panic("unreachable") + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/ext.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/ext.go new file mode 100644 index 000000000..5e731e822 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/ext.go @@ -0,0 +1,35 @@ +package amd64 + +// extMode represents the mode of extension in movzx/movsx. +type extMode byte + +const ( + // extModeBL represents Byte -> Longword. + extModeBL extMode = iota + // extModeBQ represents Byte -> Quadword. + extModeBQ + // extModeWL represents Word -> Longword. + extModeWL + // extModeWQ represents Word -> Quadword. + extModeWQ + // extModeLQ represents Longword -> Quadword. + extModeLQ +) + +// String implements fmt.Stringer. +func (e extMode) String() string { + switch e { + case extModeBL: + return "bl" + case extModeBQ: + return "bq" + case extModeWL: + return "wl" + case extModeWQ: + return "wq" + case extModeLQ: + return "lq" + default: + panic("BUG: invalid ext mode") + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr.go new file mode 100644 index 000000000..d27e79c0e --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr.go @@ -0,0 +1,2472 @@ +package amd64 + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +type instruction struct { + prev, next *instruction + op1, op2 operand + u1, u2 uint64 + b1 bool + addedBeforeRegAlloc bool + kind instructionKind +} + +// Next implements regalloc.Instr. +func (i *instruction) Next() regalloc.Instr { + return i.next +} + +// Prev implements regalloc.Instr. +func (i *instruction) Prev() regalloc.Instr { + return i.prev +} + +// IsCall implements regalloc.Instr. +func (i *instruction) IsCall() bool { return i.kind == call } + +// IsIndirectCall implements regalloc.Instr. +func (i *instruction) IsIndirectCall() bool { return i.kind == callIndirect } + +// IsReturn implements regalloc.Instr. +func (i *instruction) IsReturn() bool { return i.kind == ret } + +// AddedBeforeRegAlloc implements regalloc.Instr. +func (i *instruction) AddedBeforeRegAlloc() bool { return i.addedBeforeRegAlloc } + +// String implements regalloc.Instr. 
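+// The output is intended for debugging/tracing and loosely follows AT&T operand
+// order (source first, destination last).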
+func (i *instruction) String() string { + switch i.kind { + case nop0: + return "nop" + case sourceOffsetInfo: + return fmt.Sprintf("source_offset_info %d", i.u1) + case ret: + return "ret" + case imm: + if i.b1 { + return fmt.Sprintf("movabsq $%d, %s", int64(i.u1), i.op2.format(true)) + } else { + return fmt.Sprintf("movl $%d, %s", int32(i.u1), i.op2.format(false)) + } + case aluRmiR: + return fmt.Sprintf("%s %s, %s", aluRmiROpcode(i.u1), i.op1.format(i.b1), i.op2.format(i.b1)) + case movRR: + if i.b1 { + return fmt.Sprintf("movq %s, %s", i.op1.format(true), i.op2.format(true)) + } else { + return fmt.Sprintf("movl %s, %s", i.op1.format(false), i.op2.format(false)) + } + case xmmRmR: + return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(false), i.op2.format(false)) + case gprToXmm: + return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(i.b1), i.op2.format(i.b1)) + case xmmUnaryRmR: + return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(false), i.op2.format(false)) + case xmmUnaryRmRImm: + return fmt.Sprintf("%s $%d, %s, %s", sseOpcode(i.u1), roundingMode(i.u2), i.op1.format(false), i.op2.format(false)) + case unaryRmR: + var suffix string + if i.b1 { + suffix = "q" + } else { + suffix = "l" + } + return fmt.Sprintf("%s%s %s, %s", unaryRmROpcode(i.u1), suffix, i.op1.format(i.b1), i.op2.format(i.b1)) + case not: + var op string + if i.b1 { + op = "notq" + } else { + op = "notl" + } + return fmt.Sprintf("%s %s", op, i.op1.format(i.b1)) + case neg: + var op string + if i.b1 { + op = "negq" + } else { + op = "negl" + } + return fmt.Sprintf("%s %s", op, i.op1.format(i.b1)) + case div: + var prefix string + var op string + if i.b1 { + op = "divq" + } else { + op = "divl" + } + if i.u1 != 0 { + prefix = "i" + } + return fmt.Sprintf("%s%s %s", prefix, op, i.op1.format(i.b1)) + case mulHi: + signed, _64 := i.u1 != 0, i.b1 + var op string + switch { + case signed && _64: + op = "imulq" + case !signed && _64: + op = "mulq" + case signed && !_64: + op = "imull" + case !signed && !_64: + op = "mull" + } + return fmt.Sprintf("%s %s", op, i.op1.format(i.b1)) + case signExtendData: + var op string + if i.b1 { + op = "cqo" + } else { + op = "cdq" + } + return op + case movzxRmR: + return fmt.Sprintf("movzx.%s %s, %s", extMode(i.u1), i.op1.format(true), i.op2.format(true)) + case mov64MR: + return fmt.Sprintf("movq %s, %s", i.op1.format(true), i.op2.format(true)) + case lea: + return fmt.Sprintf("lea %s, %s", i.op1.format(true), i.op2.format(true)) + case movsxRmR: + return fmt.Sprintf("movsx.%s %s, %s", extMode(i.u1), i.op1.format(true), i.op2.format(true)) + case movRM: + var suffix string + switch i.u1 { + case 1: + suffix = "b" + case 2: + suffix = "w" + case 4: + suffix = "l" + case 8: + suffix = "q" + } + return fmt.Sprintf("mov.%s %s, %s", suffix, i.op1.format(true), i.op2.format(true)) + case shiftR: + var suffix string + if i.b1 { + suffix = "q" + } else { + suffix = "l" + } + return fmt.Sprintf("%s%s %s, %s", shiftROp(i.u1), suffix, i.op1.format(false), i.op2.format(i.b1)) + case xmmRmiReg: + return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(true), i.op2.format(true)) + case cmpRmiR: + var op, suffix string + if i.u1 != 0 { + op = "cmp" + } else { + op = "test" + } + if i.b1 { + suffix = "q" + } else { + suffix = "l" + } + if op == "test" && i.op1.kind == operandKindMem { + // Print consistently with AT&T syntax. 
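+ // test only sets flags and is symmetric in its operands, so swapping
+ // them here affects the printed form only.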
+ return fmt.Sprintf("%s%s %s, %s", op, suffix, i.op2.format(i.b1), i.op1.format(i.b1)) + } + return fmt.Sprintf("%s%s %s, %s", op, suffix, i.op1.format(i.b1), i.op2.format(i.b1)) + case setcc: + return fmt.Sprintf("set%s %s", cond(i.u1), i.op2.format(true)) + case cmove: + var suffix string + if i.b1 { + suffix = "q" + } else { + suffix = "l" + } + return fmt.Sprintf("cmov%s%s %s, %s", cond(i.u1), suffix, i.op1.format(i.b1), i.op2.format(i.b1)) + case push64: + return fmt.Sprintf("pushq %s", i.op1.format(true)) + case pop64: + return fmt.Sprintf("popq %s", i.op1.format(true)) + case xmmMovRM: + return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(true), i.op2.format(true)) + case xmmLoadConst: + panic("TODO") + case xmmToGpr: + return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(i.b1), i.op2.format(i.b1)) + case cvtUint64ToFloatSeq: + panic("TODO") + case cvtFloatToSintSeq: + panic("TODO") + case cvtFloatToUintSeq: + panic("TODO") + case xmmMinMaxSeq: + panic("TODO") + case xmmCmpRmR: + return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(false), i.op2.format(false)) + case xmmRmRImm: + op := sseOpcode(i.u1) + r1, r2 := i.op1.format(op == sseOpcodePextrq || op == sseOpcodePinsrq), + i.op2.format(op == sseOpcodePextrq || op == sseOpcodePinsrq) + return fmt.Sprintf("%s $%d, %s, %s", op, i.u2, r1, r2) + case jmp: + return fmt.Sprintf("jmp %s", i.op1.format(true)) + case jmpIf: + return fmt.Sprintf("j%s %s", cond(i.u1), i.op1.format(true)) + case jmpTableIsland: + return fmt.Sprintf("jump_table_island: jmp_table_index=%d", i.u1) + case exitSequence: + return fmt.Sprintf("exit_sequence %s", i.op1.format(true)) + case ud2: + return "ud2" + case call: + return fmt.Sprintf("call %s", ssa.FuncRef(i.u1)) + case callIndirect: + return fmt.Sprintf("callq *%s", i.op1.format(true)) + case xchg: + var suffix string + switch i.u1 { + case 1: + suffix = "b" + case 2: + suffix = "w" + case 4: + suffix = "l" + case 8: + suffix = "q" + } + return fmt.Sprintf("xchg.%s %s, %s", suffix, i.op1.format(true), i.op2.format(true)) + case zeros: + return fmt.Sprintf("xor %s, %s", i.op2.format(true), i.op2.format(true)) + case fcvtToSintSequence: + execCtx, src, tmpGp, tmpGp2, tmpXmm, src64, dst64, sat := i.fcvtToSintSequenceData() + return fmt.Sprintf( + "fcvtToSintSequence execCtx=%s, src=%s, tmpGp=%s, tmpGp2=%s, tmpXmm=%s, src64=%v, dst64=%v, sat=%v", + formatVRegSized(execCtx, true), + formatVRegSized(src, true), + formatVRegSized(tmpGp, true), + formatVRegSized(tmpGp2, true), + formatVRegSized(tmpXmm, true), src64, dst64, sat) + case fcvtToUintSequence: + execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2, src64, dst64, sat := i.fcvtToUintSequenceData() + return fmt.Sprintf( + "fcvtToUintSequence execCtx=%s, src=%s, tmpGp=%s, tmpGp2=%s, tmpXmm=%s, tmpXmm2=%s, src64=%v, dst64=%v, sat=%v", + formatVRegSized(execCtx, true), + formatVRegSized(src, true), + formatVRegSized(tmpGp, true), + formatVRegSized(tmpGp2, true), + formatVRegSized(tmpXmm, true), + formatVRegSized(tmpXmm2, true), src64, dst64, sat) + case idivRemSequence: + execCtx, divisor, tmpGp, isDiv, signed, _64 := i.idivRemSequenceData() + return fmt.Sprintf("idivRemSequence execCtx=%s, divisor=%s, tmpGp=%s, isDiv=%v, signed=%v, _64=%v", + formatVRegSized(execCtx, true), formatVRegSized(divisor, _64), formatVRegSized(tmpGp, _64), isDiv, signed, _64) + case defineUninitializedReg: + return fmt.Sprintf("defineUninitializedReg %s", i.op2.format(true)) + case xmmCMov: + return fmt.Sprintf("xmmcmov%s %s, %s", cond(i.u1), i.op1.format(true), 
i.op2.format(true)) + case blendvpd: + return fmt.Sprintf("blendvpd %s, %s, %%xmm0", i.op1.format(false), i.op2.format(false)) + case mfence: + return "mfence" + case lockcmpxchg: + var suffix string + switch i.u1 { + case 1: + suffix = "b" + case 2: + suffix = "w" + case 4: + suffix = "l" + case 8: + suffix = "q" + } + return fmt.Sprintf("lock cmpxchg.%s %s, %s", suffix, i.op1.format(true), i.op2.format(true)) + case lockxadd: + var suffix string + switch i.u1 { + case 1: + suffix = "b" + case 2: + suffix = "w" + case 4: + suffix = "l" + case 8: + suffix = "q" + } + return fmt.Sprintf("lock xadd.%s %s, %s", suffix, i.op1.format(true), i.op2.format(true)) + + case nopUseReg: + return fmt.Sprintf("nop_use_reg %s", i.op1.format(true)) + + default: + panic(fmt.Sprintf("BUG: %d", int(i.kind))) + } +} + +// Defs implements regalloc.Instr. +func (i *instruction) Defs(regs *[]regalloc.VReg) []regalloc.VReg { + *regs = (*regs)[:0] + switch dk := defKinds[i.kind]; dk { + case defKindNone: + case defKindOp2: + *regs = append(*regs, i.op2.reg()) + case defKindCall: + _, _, retIntRealRegs, retFloatRealRegs, _ := backend.ABIInfoFromUint64(i.u2) + for i := byte(0); i < retIntRealRegs; i++ { + *regs = append(*regs, regInfo.RealRegToVReg[intArgResultRegs[i]]) + } + for i := byte(0); i < retFloatRealRegs; i++ { + *regs = append(*regs, regInfo.RealRegToVReg[floatArgResultRegs[i]]) + } + case defKindDivRem: + _, _, _, isDiv, _, _ := i.idivRemSequenceData() + if isDiv { + *regs = append(*regs, raxVReg) + } else { + *regs = append(*regs, rdxVReg) + } + default: + panic(fmt.Sprintf("BUG: invalid defKind \"%s\" for %s", dk, i)) + } + return *regs +} + +// Uses implements regalloc.Instr. +func (i *instruction) Uses(regs *[]regalloc.VReg) []regalloc.VReg { + *regs = (*regs)[:0] + switch uk := useKinds[i.kind]; uk { + case useKindNone: + case useKindOp1Op2Reg, useKindOp1RegOp2: + opAny, opReg := &i.op1, &i.op2 + if uk == useKindOp1RegOp2 { + opAny, opReg = opReg, opAny + } + // The destination operand (op2) can be only reg, + // the source operand (op1) can be imm32, reg or mem. 
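+ // For a mem operand, its base/index registers are reported as uses as well.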
+ switch opAny.kind { + case operandKindReg: + *regs = append(*regs, opAny.reg()) + case operandKindMem: + opAny.addressMode().uses(regs) + case operandKindImm32: + default: + panic(fmt.Sprintf("BUG: invalid operand: %s", i)) + } + if opReg.kind != operandKindReg { + panic(fmt.Sprintf("BUG: invalid operand: %s", i)) + } + *regs = append(*regs, opReg.reg()) + case useKindOp1: + op := i.op1 + switch op.kind { + case operandKindReg: + *regs = append(*regs, op.reg()) + case operandKindMem: + op.addressMode().uses(regs) + case operandKindImm32, operandKindLabel: + default: + panic(fmt.Sprintf("BUG: invalid operand: %s", i)) + } + case useKindCallInd: + op := i.op1 + switch op.kind { + case operandKindReg: + *regs = append(*regs, op.reg()) + case operandKindMem: + op.addressMode().uses(regs) + default: + panic(fmt.Sprintf("BUG: invalid operand: %s", i)) + } + fallthrough + case useKindCall: + argIntRealRegs, argFloatRealRegs, _, _, _ := backend.ABIInfoFromUint64(i.u2) + for i := byte(0); i < argIntRealRegs; i++ { + *regs = append(*regs, regInfo.RealRegToVReg[intArgResultRegs[i]]) + } + for i := byte(0); i < argFloatRealRegs; i++ { + *regs = append(*regs, regInfo.RealRegToVReg[floatArgResultRegs[i]]) + } + case useKindFcvtToSintSequence: + execCtx, src, tmpGp, tmpGp2, tmpXmm, _, _, _ := i.fcvtToSintSequenceData() + *regs = append(*regs, execCtx, src, tmpGp, tmpGp2, tmpXmm) + case useKindFcvtToUintSequence: + execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2, _, _, _ := i.fcvtToUintSequenceData() + *regs = append(*regs, execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2) + case useKindDivRem: + execCtx, divisor, tmpGp, _, _, _ := i.idivRemSequenceData() + // idiv uses rax and rdx as implicit operands. + *regs = append(*regs, raxVReg, rdxVReg, execCtx, divisor, tmpGp) + case useKindBlendvpd: + *regs = append(*regs, xmm0VReg) + + opAny, opReg := &i.op1, &i.op2 + switch opAny.kind { + case operandKindReg: + *regs = append(*regs, opAny.reg()) + case operandKindMem: + opAny.addressMode().uses(regs) + default: + panic(fmt.Sprintf("BUG: invalid operand: %s", i)) + } + if opReg.kind != operandKindReg { + panic(fmt.Sprintf("BUG: invalid operand: %s", i)) + } + *regs = append(*regs, opReg.reg()) + + case useKindRaxOp1RegOp2: + opReg, opAny := &i.op1, &i.op2 + *regs = append(*regs, raxVReg, opReg.reg()) + switch opAny.kind { + case operandKindReg: + *regs = append(*regs, opAny.reg()) + case operandKindMem: + opAny.addressMode().uses(regs) + default: + panic(fmt.Sprintf("BUG: invalid operand: %s", i)) + } + if opReg.kind != operandKindReg { + panic(fmt.Sprintf("BUG: invalid operand: %s", i)) + } + + default: + panic(fmt.Sprintf("BUG: invalid useKind %s for %s", uk, i)) + } + return *regs +} + +// AssignUse implements regalloc.Instr. 
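The Defs/Uses pair above is table-driven: defKinds and useKinds (declared near the end of this file) map every instructionKind to a fixed shape, so the register allocator never needs a per-kind type switch, and for call/callIndirect the ABI summary packed into u2 is unpacked via backend.ABIInfoFromUint64 to recover the real argument and result registers. A minimal sketch of the intended calling pattern, written as if it lived in this package (instruction is unexported); the walker itself is illustrative and not part of wazero:

	// collectDefsUses walks a doubly-linked instruction list and reports each
	// instruction's defined and used virtual registers. A single scratch slice
	// is reused so the loop performs no per-instruction allocation.
	func collectDefsUses(head *instruction, visit func(isDef bool, v regalloc.VReg)) {
		var scratch []regalloc.VReg
		for cur := head; cur != nil; cur = cur.next {
			for _, d := range cur.Defs(&scratch) {
				visit(true, d)
			}
			for _, u := range cur.Uses(&scratch) {
				visit(false, u)
			}
		}
	}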
+func (i *instruction) AssignUse(index int, v regalloc.VReg) { + switch uk := useKinds[i.kind]; uk { + case useKindNone: + case useKindCallInd: + if index != 0 { + panic("BUG") + } + op := &i.op1 + switch op.kind { + case operandKindReg: + op.setReg(v) + case operandKindMem: + op.addressMode().assignUses(index, v) + default: + panic("BUG") + } + case useKindOp1Op2Reg, useKindOp1RegOp2: + op, opMustBeReg := &i.op1, &i.op2 + if uk == useKindOp1RegOp2 { + op, opMustBeReg = opMustBeReg, op + } + switch op.kind { + case operandKindReg: + if index == 0 { + op.setReg(v) + } else if index == 1 { + opMustBeReg.setReg(v) + } else { + panic("BUG") + } + case operandKindMem: + nregs := op.addressMode().nregs() + if index < nregs { + op.addressMode().assignUses(index, v) + } else if index == nregs { + opMustBeReg.setReg(v) + } else { + panic("BUG") + } + case operandKindImm32: + if index == 0 { + opMustBeReg.setReg(v) + } else { + panic("BUG") + } + default: + panic(fmt.Sprintf("BUG: invalid operand pair: %s", i)) + } + case useKindOp1: + op := &i.op1 + switch op.kind { + case operandKindReg: + if index != 0 { + panic("BUG") + } + op.setReg(v) + case operandKindMem: + op.addressMode().assignUses(index, v) + default: + panic(fmt.Sprintf("BUG: invalid operand: %s", i)) + } + case useKindFcvtToSintSequence: + switch index { + case 0: + i.op1.addressMode().base = v + case 1: + i.op1.addressMode().index = v + case 2: + i.op2.addressMode().base = v + case 3: + i.op2.addressMode().index = v + case 4: + i.u1 = uint64(v) + default: + panic("BUG") + } + case useKindFcvtToUintSequence: + switch index { + case 0: + i.op1.addressMode().base = v + case 1: + i.op1.addressMode().index = v + case 2: + i.op2.addressMode().base = v + case 3: + i.op2.addressMode().index = v + case 4: + i.u1 = uint64(v) + case 5: + i.u2 = uint64(v) + default: + panic("BUG") + } + case useKindDivRem: + switch index { + case 0: + if v != raxVReg { + panic("BUG") + } + case 1: + if v != rdxVReg { + panic("BUG") + } + case 2: + i.op1.setReg(v) + case 3: + i.op2.setReg(v) + case 4: + i.u1 = uint64(v) + default: + panic("BUG") + } + case useKindBlendvpd: + op, opMustBeReg := &i.op1, &i.op2 + if index == 0 { + if v.RealReg() != xmm0 { + panic("BUG") + } + } else { + switch op.kind { + case operandKindReg: + switch index { + case 1: + op.setReg(v) + case 2: + opMustBeReg.setReg(v) + default: + panic("BUG") + } + case operandKindMem: + nregs := op.addressMode().nregs() + index-- + if index < nregs { + op.addressMode().assignUses(index, v) + } else if index == nregs { + opMustBeReg.setReg(v) + } else { + panic("BUG") + } + default: + panic(fmt.Sprintf("BUG: invalid operand pair: %s", i)) + } + } + + case useKindRaxOp1RegOp2: + switch index { + case 0: + if v.RealReg() != rax { + panic("BUG") + } + case 1: + i.op1.setReg(v) + default: + op := &i.op2 + switch op.kind { + case operandKindReg: + switch index { + case 1: + op.setReg(v) + case 2: + op.setReg(v) + default: + panic("BUG") + } + case operandKindMem: + nregs := op.addressMode().nregs() + index -= 2 + if index < nregs { + op.addressMode().assignUses(index, v) + } else if index == nregs { + op.setReg(v) + } else { + panic("BUG") + } + default: + panic(fmt.Sprintf("BUG: invalid operand pair: %s", i)) + } + } + default: + panic(fmt.Sprintf("BUG: invalid useKind %s for %s", uk, i)) + } +} + +// AssignDef implements regalloc.Instr. 
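AssignUse above and AssignDef just below are the write-back half of the regalloc.Instr contract: once the allocator has chosen concrete registers, it hands them back one position at a time, in the same index order in which Uses reported them (address-mode registers first, then the register-only operand). A hedged sketch of that call pattern; the helper and its arguments are invented for illustration:

	// applyAllocation rewrites an instruction in place with the allocator's
	// choices. useRegs must be ordered exactly as Uses() returned the uses.
	func applyAllocation(i *instruction, useRegs []regalloc.VReg, defReg regalloc.VReg, hasDef bool) {
		for idx, v := range useRegs {
			i.AssignUse(idx, v)
		}
		if hasDef {
			i.AssignDef(defReg)
		}
	}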
+func (i *instruction) AssignDef(reg regalloc.VReg) { + switch dk := defKinds[i.kind]; dk { + case defKindNone: + case defKindOp2: + i.op2.setReg(reg) + default: + panic(fmt.Sprintf("BUG: invalid defKind \"%s\" for %s", dk, i)) + } +} + +// IsCopy implements regalloc.Instr. +func (i *instruction) IsCopy() bool { + k := i.kind + if k == movRR { + return true + } + if k == xmmUnaryRmR { + if i.op1.kind == operandKindReg { + sse := sseOpcode(i.u1) + return sse == sseOpcodeMovss || sse == sseOpcodeMovsd || sse == sseOpcodeMovdqu + } + } + return false +} + +func resetInstruction(i *instruction) { + *i = instruction{} +} + +func setNext(i *instruction, next *instruction) { + i.next = next +} + +func setPrev(i *instruction, prev *instruction) { + i.prev = prev +} + +func asNop(i *instruction) { + i.kind = nop0 +} + +func (i *instruction) asNop0WithLabel(label backend.Label) *instruction { //nolint + i.kind = nop0 + i.u1 = uint64(label) + return i +} + +func (i *instruction) nop0Label() backend.Label { + return backend.Label(i.u1) +} + +type instructionKind byte + +const ( + nop0 instructionKind = iota + 1 + + // Integer arithmetic/bit-twiddling: (add sub and or xor mul, etc.) (32 64) (reg addr imm) reg + aluRmiR + + // Instructions on GPR that only read src and defines dst (dst is not modified): bsr, etc. + unaryRmR + + // Bitwise not + not + + // Integer negation + neg + + // Integer quotient and remainder: (div idiv) $rax $rdx (reg addr) + div + + // The high bits (RDX) of a (un)signed multiply: RDX:RAX := RAX * rhs. + mulHi + + // Do a sign-extend based on the sign of the value in rax into rdx: (cwd cdq cqo) + // or al into ah: (cbw) + signExtendData + + // Constant materialization: (imm32 imm64) reg. + // Either: movl $imm32, %reg32 or movabsq $imm64, %reg64. + imm + + // GPR to GPR move: mov (64 32) reg reg. + movRR + + // movzxRmR is zero-extended loads or move (R to R), except for 64 bits: movz (bl bq wl wq lq) addr reg. + // Note that the lq variant doesn't really exist since the default zero-extend rule makes it + // unnecessary. For that case we emit the equivalent "movl AM, reg32". + movzxRmR + + // mov64MR is a plain 64-bit integer load, since movzxRmR can't represent that. + mov64MR + + // Loads the memory address of addr into dst. + lea + + // Sign-extended loads and moves: movs (bl bq wl wq lq) addr reg. + movsxRmR + + // Integer stores: mov (b w l q) reg addr. + movRM + + // Arithmetic shifts: (shl shr sar) (b w l q) imm reg. + shiftR + + // Arithmetic SIMD shifts. + xmmRmiReg + + // Integer comparisons/tests: cmp or test (b w l q) (reg addr imm) reg. + cmpRmiR + + // Materializes the requested condition code in the destination reg. + setcc + + // Integer conditional move. + // Overwrites the destination register. + cmove + + // pushq (reg addr imm) + push64 + + // popq reg + pop64 + + // XMM (scalar or vector) binary op: (add sub and or xor mul adc? sbb?) (32 64) (reg addr) reg + xmmRmR + + // XMM (scalar or vector) unary op: mov between XMM registers (32 64) (reg addr) reg. + // + // This differs from xmmRmR in that the dst register of xmmUnaryRmR is not used in the + // computation of the instruction dst value and so does not have to be a previously valid + // value. This is characteristic of mov instructions. + xmmUnaryRmR + + // XMM (scalar or vector) unary op with immediate: roundss, roundsd, etc. 
+ // + // This differs from XMM_RM_R_IMM in that the dst register of + // XmmUnaryRmRImm is not used in the computation of the instruction dst + // value and so does not have to be a previously valid value. + xmmUnaryRmRImm + + // XMM (scalar or vector) unary op (from xmm to mem): stores, movd, movq + xmmMovRM + + // XMM (vector) unary op (to move a constant value into an xmm register): movups + xmmLoadConst + + // XMM (scalar) unary op (from xmm to integer reg): movd, movq, cvtts{s,d}2si + xmmToGpr + + // XMM (scalar) unary op (from integer to float reg): movd, movq, cvtsi2s{s,d} + gprToXmm + + // Converts an unsigned int64 to a float32/float64. + cvtUint64ToFloatSeq + + // Converts a scalar xmm to a signed int32/int64. + cvtFloatToSintSeq + + // Converts a scalar xmm to an unsigned int32/int64. + cvtFloatToUintSeq + + // A sequence to compute min/max with the proper NaN semantics for xmm registers. + xmmMinMaxSeq + + // Float comparisons/tests: cmp (b w l q) (reg addr imm) reg. + xmmCmpRmR + + // A binary XMM instruction with an 8-bit immediate: e.g. cmp (ps pd) imm (reg addr) reg + xmmRmRImm + + // Direct call: call simm32. + // Note that the offset is the relative to the *current RIP*, which points to the first byte of the next instruction. + call + + // Indirect call: callq (reg mem). + callIndirect + + // Return. + ret + + // Jump: jmp (reg, mem, imm32 or label) + jmp + + // Jump conditionally: jcond cond label. + jmpIf + + // jmpTableIsland is to emit the jump table. + jmpTableIsland + + // exitSequence exits the execution and go back to the Go world. + exitSequence + + // An instruction that will always trigger the illegal instruction exception. + ud2 + + // xchg is described in https://www.felixcloutier.com/x86/xchg. + // This instruction uses two operands, where one of them can be a memory address, and swaps their values. + // If the dst is a memory address, the execution is atomic. + xchg + + // lockcmpxchg is the cmpxchg instruction https://www.felixcloutier.com/x86/cmpxchg with a lock prefix. + lockcmpxchg + + // zeros puts zeros into the destination register. This is implemented as xor reg, reg for + // either integer or XMM registers. The reason why we have this instruction instead of using aluRmiR + // is that it requires the already-defined registers. From reg alloc's perspective, this defines + // the destination register and takes no inputs. + zeros + + // sourceOffsetInfo is a dummy instruction to emit source offset info. + // The existence of this instruction does not affect the execution. + sourceOffsetInfo + + // defineUninitializedReg is a no-op instruction that defines a register without a defining instruction. + defineUninitializedReg + + // fcvtToSintSequence is a sequence of instructions to convert a float to a signed integer. + fcvtToSintSequence + + // fcvtToUintSequence is a sequence of instructions to convert a float to an unsigned integer. + fcvtToUintSequence + + // xmmCMov is a conditional move instruction for XMM registers. Lowered after register allocation. + xmmCMov + + // idivRemSequence is a sequence of instructions to compute both the quotient and remainder of a division. + idivRemSequence + + // blendvpd is https://www.felixcloutier.com/x86/blendvpd. + blendvpd + + // mfence is https://www.felixcloutier.com/x86/mfence + mfence + + // lockxadd is xadd https://www.felixcloutier.com/x86/xadd with a lock prefix. + lockxadd + + // nopUseReg is a meta instruction that uses one register and does nothing. 
+ nopUseReg + + instrMax +) + +func (i *instruction) asMFence() *instruction { + i.kind = mfence + return i +} + +func (i *instruction) asNopUseReg(r regalloc.VReg) *instruction { + i.kind = nopUseReg + i.op1 = newOperandReg(r) + return i +} + +func (i *instruction) asIdivRemSequence(execCtx, divisor, tmpGp regalloc.VReg, isDiv, signed, _64 bool) *instruction { + i.kind = idivRemSequence + i.op1 = newOperandReg(execCtx) + i.op2 = newOperandReg(divisor) + i.u1 = uint64(tmpGp) + if isDiv { + i.u2 |= 1 + } + if signed { + i.u2 |= 2 + } + if _64 { + i.u2 |= 4 + } + return i +} + +func (i *instruction) idivRemSequenceData() ( + execCtx, divisor, tmpGp regalloc.VReg, isDiv, signed, _64 bool, +) { + if i.kind != idivRemSequence { + panic("BUG") + } + return i.op1.reg(), i.op2.reg(), regalloc.VReg(i.u1), i.u2&1 != 0, i.u2&2 != 0, i.u2&4 != 0 +} + +func (i *instruction) asXmmCMov(cc cond, x operand, rd regalloc.VReg, size byte) *instruction { + i.kind = xmmCMov + i.op1 = x + i.op2 = newOperandReg(rd) + i.u1 = uint64(cc) + i.u2 = uint64(size) + return i +} + +func (i *instruction) asDefineUninitializedReg(r regalloc.VReg) *instruction { + i.kind = defineUninitializedReg + i.op2 = newOperandReg(r) + return i +} + +func (m *machine) allocateFcvtToUintSequence( + execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2 regalloc.VReg, + src64, dst64, sat bool, +) *instruction { + i := m.allocateInstr() + i.kind = fcvtToUintSequence + op1a := m.amodePool.Allocate() + op2a := m.amodePool.Allocate() + i.op1 = newOperandMem(op1a) + i.op2 = newOperandMem(op2a) + if src64 { + op1a.imm32 = 1 + } else { + op1a.imm32 = 0 + } + if dst64 { + op1a.imm32 |= 2 + } + if sat { + op1a.imm32 |= 4 + } + + op1a.base = execCtx + op1a.index = src + op2a.base = tmpGp + op2a.index = tmpGp2 + i.u1 = uint64(tmpXmm) + i.u2 = uint64(tmpXmm2) + return i +} + +func (i *instruction) fcvtToUintSequenceData() ( + execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2 regalloc.VReg, src64, dst64, sat bool, +) { + if i.kind != fcvtToUintSequence { + panic("BUG") + } + op1a := i.op1.addressMode() + op2a := i.op2.addressMode() + return op1a.base, op1a.index, op2a.base, op2a.index, regalloc.VReg(i.u1), regalloc.VReg(i.u2), + op1a.imm32&1 != 0, op1a.imm32&2 != 0, op1a.imm32&4 != 0 +} + +func (m *machine) allocateFcvtToSintSequence( + execCtx, src, tmpGp, tmpGp2, tmpXmm regalloc.VReg, + src64, dst64, sat bool, +) *instruction { + i := m.allocateInstr() + i.kind = fcvtToSintSequence + op1a := m.amodePool.Allocate() + op2a := m.amodePool.Allocate() + i.op1 = newOperandMem(op1a) + i.op2 = newOperandMem(op2a) + op1a.base = execCtx + op1a.index = src + op2a.base = tmpGp + op2a.index = tmpGp2 + i.u1 = uint64(tmpXmm) + if src64 { + i.u2 = 1 + } else { + i.u2 = 0 + } + if dst64 { + i.u2 |= 2 + } + if sat { + i.u2 |= 4 + } + return i +} + +func (i *instruction) fcvtToSintSequenceData() ( + execCtx, src, tmpGp, tmpGp2, tmpXmm regalloc.VReg, src64, dst64, sat bool, +) { + if i.kind != fcvtToSintSequence { + panic("BUG") + } + op1a := i.op1.addressMode() + op2a := i.op2.addressMode() + return op1a.base, op1a.index, op2a.base, op2a.index, regalloc.VReg(i.u1), + i.u2&1 != 0, i.u2&2 != 0, i.u2&4 != 0 +} + +func (k instructionKind) String() string { + switch k { + case nop0: + return "nop" + case ret: + return "ret" + case imm: + return "imm" + case aluRmiR: + return "aluRmiR" + case movRR: + return "movRR" + case xmmRmR: + return "xmmRmR" + case gprToXmm: + return "gprToXmm" + case xmmUnaryRmR: + return "xmmUnaryRmR" + case xmmUnaryRmRImm: + return "xmmUnaryRmRImm" + case 
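The sequence pseudo-instructions above keep the instruction struct small by bit-packing their flags: asIdivRemSequence folds isDiv/signed/_64 into bits 0-2 of u2, and the fcvt sequences do the same through either u2 or the spare imm32 of a borrowed address mode. A tiny round-trip sketch, with placeholder register arguments, showing that the accessor recovers exactly what was packed:

	// idivFlagsRoundTrip packs and immediately unpacks the idivRemSequence flags.
	func idivFlagsRoundTrip(execCtx, divisor, tmp regalloc.VReg) {
		i := &instruction{}
		i.asIdivRemSequence(execCtx, divisor, tmp, true /*isDiv*/, false /*signed*/, true /*_64*/)
		_, _, _, isDiv, signed, is64 := i.idivRemSequenceData()
		fmt.Println(isDiv, signed, is64) // prints: true false true
	}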
unaryRmR: + return "unaryRmR" + case not: + return "not" + case neg: + return "neg" + case div: + return "div" + case mulHi: + return "mulHi" + case signExtendData: + return "signExtendData" + case movzxRmR: + return "movzxRmR" + case mov64MR: + return "mov64MR" + case lea: + return "lea" + case movsxRmR: + return "movsxRmR" + case movRM: + return "movRM" + case shiftR: + return "shiftR" + case xmmRmiReg: + return "xmmRmiReg" + case cmpRmiR: + return "cmpRmiR" + case setcc: + return "setcc" + case cmove: + return "cmove" + case push64: + return "push64" + case pop64: + return "pop64" + case xmmMovRM: + return "xmmMovRM" + case xmmLoadConst: + return "xmmLoadConst" + case xmmToGpr: + return "xmmToGpr" + case cvtUint64ToFloatSeq: + return "cvtUint64ToFloatSeq" + case cvtFloatToSintSeq: + return "cvtFloatToSintSeq" + case cvtFloatToUintSeq: + return "cvtFloatToUintSeq" + case xmmMinMaxSeq: + return "xmmMinMaxSeq" + case xmmCmpRmR: + return "xmmCmpRmR" + case xmmRmRImm: + return "xmmRmRImm" + case jmpIf: + return "jmpIf" + case jmp: + return "jmp" + case jmpTableIsland: + return "jmpTableIsland" + case exitSequence: + return "exit_sequence" + case ud2: + return "ud2" + case xchg: + return "xchg" + case zeros: + return "zeros" + case fcvtToSintSequence: + return "fcvtToSintSequence" + case fcvtToUintSequence: + return "fcvtToUintSequence" + case xmmCMov: + return "xmmCMov" + case idivRemSequence: + return "idivRemSequence" + case mfence: + return "mfence" + case lockcmpxchg: + return "lockcmpxchg" + case lockxadd: + return "lockxadd" + default: + panic("BUG") + } +} + +type aluRmiROpcode byte + +const ( + aluRmiROpcodeAdd aluRmiROpcode = iota + 1 + aluRmiROpcodeSub + aluRmiROpcodeAnd + aluRmiROpcodeOr + aluRmiROpcodeXor + aluRmiROpcodeMul +) + +func (a aluRmiROpcode) String() string { + switch a { + case aluRmiROpcodeAdd: + return "add" + case aluRmiROpcodeSub: + return "sub" + case aluRmiROpcodeAnd: + return "and" + case aluRmiROpcodeOr: + return "or" + case aluRmiROpcodeXor: + return "xor" + case aluRmiROpcodeMul: + return "imul" + default: + panic("BUG") + } +} + +func (i *instruction) asJmpIf(cond cond, target operand) *instruction { + i.kind = jmpIf + i.u1 = uint64(cond) + i.op1 = target + return i +} + +// asJmpTableSequence is used to emit the jump table. +// targetSliceIndex is the index of the target slice in machine.jmpTableTargets. 
+func (i *instruction) asJmpTableSequence(targetSliceIndex int, targetCount int) *instruction { + i.kind = jmpTableIsland + i.u1 = uint64(targetSliceIndex) + i.u2 = uint64(targetCount) + return i +} + +func (i *instruction) asJmp(target operand) *instruction { + i.kind = jmp + i.op1 = target + return i +} + +func (i *instruction) jmpLabel() backend.Label { + switch i.kind { + case jmp, jmpIf, lea, xmmUnaryRmR: + return i.op1.label() + default: + panic("BUG") + } +} + +func (i *instruction) asLEA(target operand, rd regalloc.VReg) *instruction { + i.kind = lea + i.op1 = target + i.op2 = newOperandReg(rd) + return i +} + +func (i *instruction) asCall(ref ssa.FuncRef, abi *backend.FunctionABI) *instruction { + i.kind = call + i.u1 = uint64(ref) + if abi != nil { + i.u2 = abi.ABIInfoAsUint64() + } + return i +} + +func (i *instruction) asCallIndirect(ptr operand, abi *backend.FunctionABI) *instruction { + if ptr.kind != operandKindReg && ptr.kind != operandKindMem { + panic("BUG") + } + i.kind = callIndirect + i.op1 = ptr + if abi != nil { + i.u2 = abi.ABIInfoAsUint64() + } + return i +} + +func (i *instruction) asRet() *instruction { + i.kind = ret + return i +} + +func (i *instruction) asImm(dst regalloc.VReg, value uint64, _64 bool) *instruction { + i.kind = imm + i.op2 = newOperandReg(dst) + i.u1 = value + i.b1 = _64 + return i +} + +func (i *instruction) asAluRmiR(op aluRmiROpcode, rm operand, rd regalloc.VReg, _64 bool) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem && rm.kind != operandKindImm32 { + panic("BUG") + } + i.kind = aluRmiR + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + i.b1 = _64 + return i +} + +func (i *instruction) asZeros(dst regalloc.VReg) *instruction { + i.kind = zeros + i.op2 = newOperandReg(dst) + return i +} + +func (i *instruction) asBlendvpd(rm operand, rd regalloc.VReg) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = blendvpd + i.op1 = rm + i.op2 = newOperandReg(rd) + return i +} + +func (i *instruction) asXmmRmR(op sseOpcode, rm operand, rd regalloc.VReg) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = xmmRmR + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + return i +} + +func (i *instruction) asXmmRmRImm(op sseOpcode, imm uint8, rm operand, rd regalloc.VReg) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = xmmRmRImm + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + i.u2 = uint64(imm) + return i +} + +func (i *instruction) asGprToXmm(op sseOpcode, rm operand, rd regalloc.VReg, _64 bool) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = gprToXmm + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + i.b1 = _64 + return i +} + +func (i *instruction) asEmitSourceOffsetInfo(l ssa.SourceOffset) *instruction { + i.kind = sourceOffsetInfo + i.u1 = uint64(l) + return i +} + +func (i *instruction) sourceOffsetInfo() ssa.SourceOffset { + return ssa.SourceOffset(i.u1) +} + +func (i *instruction) asXmmToGpr(op sseOpcode, rm, rd regalloc.VReg, _64 bool) *instruction { + i.kind = xmmToGpr + i.op1 = newOperandReg(rm) + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + i.b1 = _64 + return i +} + +func (i *instruction) asMovRM(rm regalloc.VReg, rd operand, size byte) *instruction { + if rd.kind != operandKindMem { + panic("BUG") + } + i.kind = movRM + i.op1 = 
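The as* constructors above all follow one convention: they overwrite a zeroed instruction in place and return it, so lowering code can chain allocation and initialization in a single expression (in the real backend the instruction comes from the machine's pool via m.allocateInstr; the struct literal below only keeps the sketch standalone). For example, a 64-bit register-register add plus a register clear could be built as:

	// lowerAddAndZero builds "dst += src" and "tmp = 0" as two instructions.
	func lowerAddAndZero(dst, src, tmp regalloc.VReg) (add, zero *instruction) {
		add = (&instruction{}).asAluRmiR(aluRmiROpcodeAdd, newOperandReg(src), dst, true /*64-bit*/)
		zero = (&instruction{}).asZeros(tmp)
		return
	}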
newOperandReg(rm) + i.op2 = rd + i.u1 = uint64(size) + return i +} + +func (i *instruction) asMovsxRmR(ext extMode, src operand, rd regalloc.VReg) *instruction { + if src.kind != operandKindReg && src.kind != operandKindMem { + panic("BUG") + } + i.kind = movsxRmR + i.op1 = src + i.op2 = newOperandReg(rd) + i.u1 = uint64(ext) + return i +} + +func (i *instruction) asMovzxRmR(ext extMode, src operand, rd regalloc.VReg) *instruction { + if src.kind != operandKindReg && src.kind != operandKindMem { + panic("BUG") + } + i.kind = movzxRmR + i.op1 = src + i.op2 = newOperandReg(rd) + i.u1 = uint64(ext) + return i +} + +func (i *instruction) asSignExtendData(_64 bool) *instruction { + i.kind = signExtendData + i.b1 = _64 + return i +} + +func (i *instruction) asUD2() *instruction { + i.kind = ud2 + return i +} + +func (i *instruction) asDiv(rn operand, signed bool, _64 bool) *instruction { + i.kind = div + i.op1 = rn + i.b1 = _64 + if signed { + i.u1 = 1 + } + return i +} + +func (i *instruction) asMov64MR(rm operand, rd regalloc.VReg) *instruction { + if rm.kind != operandKindMem { + panic("BUG") + } + i.kind = mov64MR + i.op1 = rm + i.op2 = newOperandReg(rd) + return i +} + +func (i *instruction) asMovRR(rm, rd regalloc.VReg, _64 bool) *instruction { + i.kind = movRR + i.op1 = newOperandReg(rm) + i.op2 = newOperandReg(rd) + i.b1 = _64 + return i +} + +func (i *instruction) asNot(rm operand, _64 bool) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = not + i.op1 = rm + i.b1 = _64 + return i +} + +func (i *instruction) asNeg(rm operand, _64 bool) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = neg + i.op1 = rm + i.b1 = _64 + return i +} + +func (i *instruction) asMulHi(rm operand, signed, _64 bool) *instruction { + if rm.kind != operandKindReg && (rm.kind != operandKindMem) { + panic("BUG") + } + i.kind = mulHi + i.op1 = rm + i.b1 = _64 + if signed { + i.u1 = 1 + } + return i +} + +func (i *instruction) asUnaryRmR(op unaryRmROpcode, rm operand, rd regalloc.VReg, _64 bool) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = unaryRmR + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + i.b1 = _64 + return i +} + +func (i *instruction) asShiftR(op shiftROp, amount operand, rd regalloc.VReg, _64 bool) *instruction { + if amount.kind != operandKindReg && amount.kind != operandKindImm32 { + panic("BUG") + } + i.kind = shiftR + i.op1 = amount + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + i.b1 = _64 + return i +} + +func (i *instruction) asXmmRmiReg(op sseOpcode, rm operand, rd regalloc.VReg) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindImm32 && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = xmmRmiReg + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + return i +} + +func (i *instruction) asCmpRmiR(cmp bool, rm operand, rn regalloc.VReg, _64 bool) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindImm32 && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = cmpRmiR + i.op1 = rm + i.op2 = newOperandReg(rn) + if cmp { + i.u1 = 1 + } + i.b1 = _64 + return i +} + +func (i *instruction) asSetcc(c cond, rd regalloc.VReg) *instruction { + i.kind = setcc + i.op2 = newOperandReg(rd) + i.u1 = uint64(c) + return i +} + +func (i *instruction) asCmove(c cond, rm operand, rd regalloc.VReg, _64 bool) *instruction { + i.kind = cmove + i.op1 = rm + i.op2 = 
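asCmpRmiR and asSetcc above pair up into the usual compare-then-materialize lowering: the first emits a cmp (or a test, when its bool argument is false) that sets EFLAGS, and the second copies the chosen condition into a byte register. A sketch of that pairing; condNZ is assumed to be one of this package's cond constants and does not appear in this excerpt:

	// lowerNotEqual sets the low byte of dst to (x != y); callers typically
	// zero dst beforehand so the upper bits are well defined.
	func lowerNotEqual(x, y, dst regalloc.VReg) (cmp, set *instruction) {
		cmp = (&instruction{}).asCmpRmiR(true /*cmp, not test*/, newOperandReg(y), x, true /*64-bit*/)
		set = (&instruction{}).asSetcc(condNZ, dst)
		return
	}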
newOperandReg(rd) + i.u1 = uint64(c) + i.b1 = _64 + return i +} + +func (m *machine) allocateExitSeq(execCtx regalloc.VReg) *instruction { + i := m.allocateInstr() + i.kind = exitSequence + i.op1 = newOperandReg(execCtx) + // Allocate the address mode that will be used in encoding the exit sequence. + i.op2 = newOperandMem(m.amodePool.Allocate()) + return i +} + +func (i *instruction) asXmmUnaryRmR(op sseOpcode, rm operand, rd regalloc.VReg) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = xmmUnaryRmR + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + return i +} + +func (i *instruction) asXmmUnaryRmRImm(op sseOpcode, imm byte, rm operand, rd regalloc.VReg) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = xmmUnaryRmRImm + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + i.u2 = uint64(imm) + return i +} + +func (i *instruction) asXmmCmpRmR(op sseOpcode, rm operand, rd regalloc.VReg) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = xmmCmpRmR + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + return i +} + +func (i *instruction) asXmmMovRM(op sseOpcode, rm regalloc.VReg, rd operand) *instruction { + if rd.kind != operandKindMem { + panic("BUG") + } + i.kind = xmmMovRM + i.op1 = newOperandReg(rm) + i.op2 = rd + i.u1 = uint64(op) + return i +} + +func (i *instruction) asPop64(rm regalloc.VReg) *instruction { + i.kind = pop64 + i.op1 = newOperandReg(rm) + return i +} + +func (i *instruction) asPush64(op operand) *instruction { + if op.kind != operandKindReg && op.kind != operandKindMem && op.kind != operandKindImm32 { + panic("BUG") + } + i.kind = push64 + i.op1 = op + return i +} + +func (i *instruction) asXCHG(rm regalloc.VReg, rd operand, size byte) *instruction { + i.kind = xchg + i.op1 = newOperandReg(rm) + i.op2 = rd + i.u1 = uint64(size) + return i +} + +func (i *instruction) asLockCmpXCHG(rm regalloc.VReg, rd *amode, size byte) *instruction { + i.kind = lockcmpxchg + i.op1 = newOperandReg(rm) + i.op2 = newOperandMem(rd) + i.u1 = uint64(size) + return i +} + +func (i *instruction) asLockXAdd(rm regalloc.VReg, rd *amode, size byte) *instruction { + i.kind = lockxadd + i.op1 = newOperandReg(rm) + i.op2 = newOperandMem(rd) + i.u1 = uint64(size) + return i +} + +type unaryRmROpcode byte + +const ( + unaryRmROpcodeBsr unaryRmROpcode = iota + unaryRmROpcodeBsf + unaryRmROpcodeLzcnt + unaryRmROpcodeTzcnt + unaryRmROpcodePopcnt +) + +func (u unaryRmROpcode) String() string { + switch u { + case unaryRmROpcodeBsr: + return "bsr" + case unaryRmROpcodeBsf: + return "bsf" + case unaryRmROpcodeLzcnt: + return "lzcnt" + case unaryRmROpcodeTzcnt: + return "tzcnt" + case unaryRmROpcodePopcnt: + return "popcnt" + default: + panic("BUG") + } +} + +type shiftROp byte + +const ( + shiftROpRotateLeft shiftROp = 0 + shiftROpRotateRight shiftROp = 1 + shiftROpShiftLeft shiftROp = 4 + shiftROpShiftRightLogical shiftROp = 5 + shiftROpShiftRightArithmetic shiftROp = 7 +) + +func (s shiftROp) String() string { + switch s { + case shiftROpRotateLeft: + return "rol" + case shiftROpRotateRight: + return "ror" + case shiftROpShiftLeft: + return "shl" + case shiftROpShiftRightLogical: + return "shr" + case shiftROpShiftRightArithmetic: + return "sar" + default: + panic("BUG") + } +} + +type sseOpcode byte + +const ( + sseOpcodeInvalid sseOpcode = iota + sseOpcodeAddps + sseOpcodeAddpd + sseOpcodeAddss + 
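The shiftROp values a few lines above are deliberately non-contiguous: 0, 1, 4, 5 and 7 are exactly the /digit opcode-extension numbers the x86 shift/rotate group uses in the ModRM reg field for rol, ror, shl, shr and sar, so an encoder can feed the constant straight into ModRM. For instance, a constant left shift built with the asShiftR constructor above, say

	i.asShiftR(shiftROpShiftLeft, amount, dst, true)

can be emitted as the C1 /4 ib form with shiftROpShiftLeft (4) serving directly as the /digit.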
sseOpcodeAddsd + sseOpcodeAndps + sseOpcodeAndpd + sseOpcodeAndnps + sseOpcodeAndnpd + sseOpcodeBlendvps + sseOpcodeBlendvpd + sseOpcodeComiss + sseOpcodeComisd + sseOpcodeCmpps + sseOpcodeCmppd + sseOpcodeCmpss + sseOpcodeCmpsd + sseOpcodeCvtdq2ps + sseOpcodeCvtdq2pd + sseOpcodeCvtsd2ss + sseOpcodeCvtsd2si + sseOpcodeCvtsi2ss + sseOpcodeCvtsi2sd + sseOpcodeCvtss2si + sseOpcodeCvtss2sd + sseOpcodeCvttps2dq + sseOpcodeCvttss2si + sseOpcodeCvttsd2si + sseOpcodeDivps + sseOpcodeDivpd + sseOpcodeDivss + sseOpcodeDivsd + sseOpcodeInsertps + sseOpcodeMaxps + sseOpcodeMaxpd + sseOpcodeMaxss + sseOpcodeMaxsd + sseOpcodeMinps + sseOpcodeMinpd + sseOpcodeMinss + sseOpcodeMinsd + sseOpcodeMovaps + sseOpcodeMovapd + sseOpcodeMovd + sseOpcodeMovdqa + sseOpcodeMovdqu + sseOpcodeMovlhps + sseOpcodeMovmskps + sseOpcodeMovmskpd + sseOpcodeMovq + sseOpcodeMovss + sseOpcodeMovsd + sseOpcodeMovups + sseOpcodeMovupd + sseOpcodeMulps + sseOpcodeMulpd + sseOpcodeMulss + sseOpcodeMulsd + sseOpcodeOrps + sseOpcodeOrpd + sseOpcodePabsb + sseOpcodePabsw + sseOpcodePabsd + sseOpcodePackssdw + sseOpcodePacksswb + sseOpcodePackusdw + sseOpcodePackuswb + sseOpcodePaddb + sseOpcodePaddd + sseOpcodePaddq + sseOpcodePaddw + sseOpcodePaddsb + sseOpcodePaddsw + sseOpcodePaddusb + sseOpcodePaddusw + sseOpcodePalignr + sseOpcodePand + sseOpcodePandn + sseOpcodePavgb + sseOpcodePavgw + sseOpcodePcmpeqb + sseOpcodePcmpeqw + sseOpcodePcmpeqd + sseOpcodePcmpeqq + sseOpcodePcmpgtb + sseOpcodePcmpgtw + sseOpcodePcmpgtd + sseOpcodePcmpgtq + sseOpcodePextrb + sseOpcodePextrw + sseOpcodePextrd + sseOpcodePextrq + sseOpcodePinsrb + sseOpcodePinsrw + sseOpcodePinsrd + sseOpcodePinsrq + sseOpcodePmaddwd + sseOpcodePmaxsb + sseOpcodePmaxsw + sseOpcodePmaxsd + sseOpcodePmaxub + sseOpcodePmaxuw + sseOpcodePmaxud + sseOpcodePminsb + sseOpcodePminsw + sseOpcodePminsd + sseOpcodePminub + sseOpcodePminuw + sseOpcodePminud + sseOpcodePmovmskb + sseOpcodePmovsxbd + sseOpcodePmovsxbw + sseOpcodePmovsxbq + sseOpcodePmovsxwd + sseOpcodePmovsxwq + sseOpcodePmovsxdq + sseOpcodePmovzxbd + sseOpcodePmovzxbw + sseOpcodePmovzxbq + sseOpcodePmovzxwd + sseOpcodePmovzxwq + sseOpcodePmovzxdq + sseOpcodePmulld + sseOpcodePmullw + sseOpcodePmuludq + sseOpcodePor + sseOpcodePshufb + sseOpcodePshufd + sseOpcodePsllw + sseOpcodePslld + sseOpcodePsllq + sseOpcodePsraw + sseOpcodePsrad + sseOpcodePsrlw + sseOpcodePsrld + sseOpcodePsrlq + sseOpcodePsubb + sseOpcodePsubd + sseOpcodePsubq + sseOpcodePsubw + sseOpcodePsubsb + sseOpcodePsubsw + sseOpcodePsubusb + sseOpcodePsubusw + sseOpcodePtest + sseOpcodePunpckhbw + sseOpcodePunpcklbw + sseOpcodePxor + sseOpcodeRcpss + sseOpcodeRoundps + sseOpcodeRoundpd + sseOpcodeRoundss + sseOpcodeRoundsd + sseOpcodeRsqrtss + sseOpcodeSqrtps + sseOpcodeSqrtpd + sseOpcodeSqrtss + sseOpcodeSqrtsd + sseOpcodeSubps + sseOpcodeSubpd + sseOpcodeSubss + sseOpcodeSubsd + sseOpcodeUcomiss + sseOpcodeUcomisd + sseOpcodeXorps + sseOpcodeXorpd + sseOpcodePmulhrsw + sseOpcodeUnpcklps + sseOpcodeCvtps2pd + sseOpcodeCvtpd2ps + sseOpcodeCvttpd2dq + sseOpcodeShufps + sseOpcodePmaddubsw +) + +func (s sseOpcode) String() string { + switch s { + case sseOpcodeInvalid: + return "invalid" + case sseOpcodeAddps: + return "addps" + case sseOpcodeAddpd: + return "addpd" + case sseOpcodeAddss: + return "addss" + case sseOpcodeAddsd: + return "addsd" + case sseOpcodeAndps: + return "andps" + case sseOpcodeAndpd: + return "andpd" + case sseOpcodeAndnps: + return "andnps" + case sseOpcodeAndnpd: + return "andnpd" + case sseOpcodeBlendvps: + return "blendvps" + 
case sseOpcodeBlendvpd: + return "blendvpd" + case sseOpcodeComiss: + return "comiss" + case sseOpcodeComisd: + return "comisd" + case sseOpcodeCmpps: + return "cmpps" + case sseOpcodeCmppd: + return "cmppd" + case sseOpcodeCmpss: + return "cmpss" + case sseOpcodeCmpsd: + return "cmpsd" + case sseOpcodeCvtdq2ps: + return "cvtdq2ps" + case sseOpcodeCvtdq2pd: + return "cvtdq2pd" + case sseOpcodeCvtsd2ss: + return "cvtsd2ss" + case sseOpcodeCvtsd2si: + return "cvtsd2si" + case sseOpcodeCvtsi2ss: + return "cvtsi2ss" + case sseOpcodeCvtsi2sd: + return "cvtsi2sd" + case sseOpcodeCvtss2si: + return "cvtss2si" + case sseOpcodeCvtss2sd: + return "cvtss2sd" + case sseOpcodeCvttps2dq: + return "cvttps2dq" + case sseOpcodeCvttss2si: + return "cvttss2si" + case sseOpcodeCvttsd2si: + return "cvttsd2si" + case sseOpcodeDivps: + return "divps" + case sseOpcodeDivpd: + return "divpd" + case sseOpcodeDivss: + return "divss" + case sseOpcodeDivsd: + return "divsd" + case sseOpcodeInsertps: + return "insertps" + case sseOpcodeMaxps: + return "maxps" + case sseOpcodeMaxpd: + return "maxpd" + case sseOpcodeMaxss: + return "maxss" + case sseOpcodeMaxsd: + return "maxsd" + case sseOpcodeMinps: + return "minps" + case sseOpcodeMinpd: + return "minpd" + case sseOpcodeMinss: + return "minss" + case sseOpcodeMinsd: + return "minsd" + case sseOpcodeMovaps: + return "movaps" + case sseOpcodeMovapd: + return "movapd" + case sseOpcodeMovd: + return "movd" + case sseOpcodeMovdqa: + return "movdqa" + case sseOpcodeMovdqu: + return "movdqu" + case sseOpcodeMovlhps: + return "movlhps" + case sseOpcodeMovmskps: + return "movmskps" + case sseOpcodeMovmskpd: + return "movmskpd" + case sseOpcodeMovq: + return "movq" + case sseOpcodeMovss: + return "movss" + case sseOpcodeMovsd: + return "movsd" + case sseOpcodeMovups: + return "movups" + case sseOpcodeMovupd: + return "movupd" + case sseOpcodeMulps: + return "mulps" + case sseOpcodeMulpd: + return "mulpd" + case sseOpcodeMulss: + return "mulss" + case sseOpcodeMulsd: + return "mulsd" + case sseOpcodeOrps: + return "orps" + case sseOpcodeOrpd: + return "orpd" + case sseOpcodePabsb: + return "pabsb" + case sseOpcodePabsw: + return "pabsw" + case sseOpcodePabsd: + return "pabsd" + case sseOpcodePackssdw: + return "packssdw" + case sseOpcodePacksswb: + return "packsswb" + case sseOpcodePackusdw: + return "packusdw" + case sseOpcodePackuswb: + return "packuswb" + case sseOpcodePaddb: + return "paddb" + case sseOpcodePaddd: + return "paddd" + case sseOpcodePaddq: + return "paddq" + case sseOpcodePaddw: + return "paddw" + case sseOpcodePaddsb: + return "paddsb" + case sseOpcodePaddsw: + return "paddsw" + case sseOpcodePaddusb: + return "paddusb" + case sseOpcodePaddusw: + return "paddusw" + case sseOpcodePalignr: + return "palignr" + case sseOpcodePand: + return "pand" + case sseOpcodePandn: + return "pandn" + case sseOpcodePavgb: + return "pavgb" + case sseOpcodePavgw: + return "pavgw" + case sseOpcodePcmpeqb: + return "pcmpeqb" + case sseOpcodePcmpeqw: + return "pcmpeqw" + case sseOpcodePcmpeqd: + return "pcmpeqd" + case sseOpcodePcmpeqq: + return "pcmpeqq" + case sseOpcodePcmpgtb: + return "pcmpgtb" + case sseOpcodePcmpgtw: + return "pcmpgtw" + case sseOpcodePcmpgtd: + return "pcmpgtd" + case sseOpcodePcmpgtq: + return "pcmpgtq" + case sseOpcodePextrb: + return "pextrb" + case sseOpcodePextrw: + return "pextrw" + case sseOpcodePextrd: + return "pextrd" + case sseOpcodePextrq: + return "pextrq" + case sseOpcodePinsrb: + return "pinsrb" + case sseOpcodePinsrw: + return "pinsrw" + case 
sseOpcodePinsrd: + return "pinsrd" + case sseOpcodePinsrq: + return "pinsrq" + case sseOpcodePmaddwd: + return "pmaddwd" + case sseOpcodePmaxsb: + return "pmaxsb" + case sseOpcodePmaxsw: + return "pmaxsw" + case sseOpcodePmaxsd: + return "pmaxsd" + case sseOpcodePmaxub: + return "pmaxub" + case sseOpcodePmaxuw: + return "pmaxuw" + case sseOpcodePmaxud: + return "pmaxud" + case sseOpcodePminsb: + return "pminsb" + case sseOpcodePminsw: + return "pminsw" + case sseOpcodePminsd: + return "pminsd" + case sseOpcodePminub: + return "pminub" + case sseOpcodePminuw: + return "pminuw" + case sseOpcodePminud: + return "pminud" + case sseOpcodePmovmskb: + return "pmovmskb" + case sseOpcodePmovsxbd: + return "pmovsxbd" + case sseOpcodePmovsxbw: + return "pmovsxbw" + case sseOpcodePmovsxbq: + return "pmovsxbq" + case sseOpcodePmovsxwd: + return "pmovsxwd" + case sseOpcodePmovsxwq: + return "pmovsxwq" + case sseOpcodePmovsxdq: + return "pmovsxdq" + case sseOpcodePmovzxbd: + return "pmovzxbd" + case sseOpcodePmovzxbw: + return "pmovzxbw" + case sseOpcodePmovzxbq: + return "pmovzxbq" + case sseOpcodePmovzxwd: + return "pmovzxwd" + case sseOpcodePmovzxwq: + return "pmovzxwq" + case sseOpcodePmovzxdq: + return "pmovzxdq" + case sseOpcodePmulld: + return "pmulld" + case sseOpcodePmullw: + return "pmullw" + case sseOpcodePmuludq: + return "pmuludq" + case sseOpcodePor: + return "por" + case sseOpcodePshufb: + return "pshufb" + case sseOpcodePshufd: + return "pshufd" + case sseOpcodePsllw: + return "psllw" + case sseOpcodePslld: + return "pslld" + case sseOpcodePsllq: + return "psllq" + case sseOpcodePsraw: + return "psraw" + case sseOpcodePsrad: + return "psrad" + case sseOpcodePsrlw: + return "psrlw" + case sseOpcodePsrld: + return "psrld" + case sseOpcodePsrlq: + return "psrlq" + case sseOpcodePsubb: + return "psubb" + case sseOpcodePsubd: + return "psubd" + case sseOpcodePsubq: + return "psubq" + case sseOpcodePsubw: + return "psubw" + case sseOpcodePsubsb: + return "psubsb" + case sseOpcodePsubsw: + return "psubsw" + case sseOpcodePsubusb: + return "psubusb" + case sseOpcodePsubusw: + return "psubusw" + case sseOpcodePtest: + return "ptest" + case sseOpcodePunpckhbw: + return "punpckhbw" + case sseOpcodePunpcklbw: + return "punpcklbw" + case sseOpcodePxor: + return "pxor" + case sseOpcodeRcpss: + return "rcpss" + case sseOpcodeRoundps: + return "roundps" + case sseOpcodeRoundpd: + return "roundpd" + case sseOpcodeRoundss: + return "roundss" + case sseOpcodeRoundsd: + return "roundsd" + case sseOpcodeRsqrtss: + return "rsqrtss" + case sseOpcodeSqrtps: + return "sqrtps" + case sseOpcodeSqrtpd: + return "sqrtpd" + case sseOpcodeSqrtss: + return "sqrtss" + case sseOpcodeSqrtsd: + return "sqrtsd" + case sseOpcodeSubps: + return "subps" + case sseOpcodeSubpd: + return "subpd" + case sseOpcodeSubss: + return "subss" + case sseOpcodeSubsd: + return "subsd" + case sseOpcodeUcomiss: + return "ucomiss" + case sseOpcodeUcomisd: + return "ucomisd" + case sseOpcodeXorps: + return "xorps" + case sseOpcodeXorpd: + return "xorpd" + case sseOpcodePmulhrsw: + return "pmulhrsw" + case sseOpcodeUnpcklps: + return "unpcklps" + case sseOpcodeCvtps2pd: + return "cvtps2pd" + case sseOpcodeCvtpd2ps: + return "cvtpd2ps" + case sseOpcodeCvttpd2dq: + return "cvttpd2dq" + case sseOpcodeShufps: + return "shufps" + case sseOpcodePmaddubsw: + return "pmaddubsw" + default: + panic("BUG") + } +} + +type roundingMode uint8 + +const ( + roundingModeNearest roundingMode = iota + roundingModeDown + roundingModeUp + roundingModeZero +) + +func 
(r roundingMode) String() string { + switch r { + case roundingModeNearest: + return "nearest" + case roundingModeDown: + return "down" + case roundingModeUp: + return "up" + case roundingModeZero: + return "zero" + default: + panic("BUG") + } +} + +// cmpPred is the immediate value for a comparison operation in xmmRmRImm. +type cmpPred uint8 + +const ( + // cmpPredEQ_OQ is Equal (ordered, non-signaling) + cmpPredEQ_OQ cmpPred = iota + // cmpPredLT_OS is Less-than (ordered, signaling) + cmpPredLT_OS + // cmpPredLE_OS is Less-than-or-equal (ordered, signaling) + cmpPredLE_OS + // cmpPredUNORD_Q is Unordered (non-signaling) + cmpPredUNORD_Q + // cmpPredNEQ_UQ is Not-equal (unordered, non-signaling) + cmpPredNEQ_UQ + // cmpPredNLT_US is Not-less-than (unordered, signaling) + cmpPredNLT_US + // cmpPredNLE_US is Not-less-than-or-equal (unordered, signaling) + cmpPredNLE_US + // cmpPredORD_Q is Ordered (non-signaling) + cmpPredORD_Q + // cmpPredEQ_UQ is Equal (unordered, non-signaling) + cmpPredEQ_UQ + // cmpPredNGE_US is Not-greater-than-or-equal (unordered, signaling) + cmpPredNGE_US + // cmpPredNGT_US is Not-greater-than (unordered, signaling) + cmpPredNGT_US + // cmpPredFALSE_OQ is False (ordered, non-signaling) + cmpPredFALSE_OQ + // cmpPredNEQ_OQ is Not-equal (ordered, non-signaling) + cmpPredNEQ_OQ + // cmpPredGE_OS is Greater-than-or-equal (ordered, signaling) + cmpPredGE_OS + // cmpPredGT_OS is Greater-than (ordered, signaling) + cmpPredGT_OS + // cmpPredTRUE_UQ is True (unordered, non-signaling) + cmpPredTRUE_UQ + // Equal (ordered, signaling) + cmpPredEQ_OS + // Less-than (ordered, nonsignaling) + cmpPredLT_OQ + // Less-than-or-equal (ordered, nonsignaling) + cmpPredLE_OQ + // Unordered (signaling) + cmpPredUNORD_S + // Not-equal (unordered, signaling) + cmpPredNEQ_US + // Not-less-than (unordered, nonsignaling) + cmpPredNLT_UQ + // Not-less-than-or-equal (unordered, nonsignaling) + cmpPredNLE_UQ + // Ordered (signaling) + cmpPredORD_S + // Equal (unordered, signaling) + cmpPredEQ_US + // Not-greater-than-or-equal (unordered, non-signaling) + cmpPredNGE_UQ + // Not-greater-than (unordered, nonsignaling) + cmpPredNGT_UQ + // False (ordered, signaling) + cmpPredFALSE_OS + // Not-equal (ordered, signaling) + cmpPredNEQ_OS + // Greater-than-or-equal (ordered, nonsignaling) + cmpPredGE_OQ + // Greater-than (ordered, nonsignaling) + cmpPredGT_OQ + // True (unordered, signaling) + cmpPredTRUE_US +) + +func (r cmpPred) String() string { + switch r { + case cmpPredEQ_OQ: + return "eq_oq" + case cmpPredLT_OS: + return "lt_os" + case cmpPredLE_OS: + return "le_os" + case cmpPredUNORD_Q: + return "unord_q" + case cmpPredNEQ_UQ: + return "neq_uq" + case cmpPredNLT_US: + return "nlt_us" + case cmpPredNLE_US: + return "nle_us" + case cmpPredORD_Q: + return "ord_q" + case cmpPredEQ_UQ: + return "eq_uq" + case cmpPredNGE_US: + return "nge_us" + case cmpPredNGT_US: + return "ngt_us" + case cmpPredFALSE_OQ: + return "false_oq" + case cmpPredNEQ_OQ: + return "neq_oq" + case cmpPredGE_OS: + return "ge_os" + case cmpPredGT_OS: + return "gt_os" + case cmpPredTRUE_UQ: + return "true_uq" + case cmpPredEQ_OS: + return "eq_os" + case cmpPredLT_OQ: + return "lt_oq" + case cmpPredLE_OQ: + return "le_oq" + case cmpPredUNORD_S: + return "unord_s" + case cmpPredNEQ_US: + return "neq_us" + case cmpPredNLT_UQ: + return "nlt_uq" + case cmpPredNLE_UQ: + return "nle_uq" + case cmpPredORD_S: + return "ord_s" + case cmpPredEQ_US: + return "eq_us" + case cmpPredNGE_UQ: + return "nge_uq" + case cmpPredNGT_UQ: + return 
"ngt_uq" + case cmpPredFALSE_OS: + return "false_os" + case cmpPredNEQ_OS: + return "neq_os" + case cmpPredGE_OQ: + return "ge_oq" + case cmpPredGT_OQ: + return "gt_oq" + case cmpPredTRUE_US: + return "true_us" + default: + panic("BUG") + } +} + +func linkInstr(prev, next *instruction) *instruction { + prev.next = next + next.prev = prev + return next +} + +type defKind byte + +const ( + defKindNone defKind = iota + 1 + defKindOp2 + defKindCall + defKindDivRem +) + +var defKinds = [instrMax]defKind{ + nop0: defKindNone, + ret: defKindNone, + movRR: defKindOp2, + movRM: defKindNone, + xmmMovRM: defKindNone, + aluRmiR: defKindNone, + shiftR: defKindNone, + imm: defKindOp2, + unaryRmR: defKindOp2, + xmmRmiReg: defKindNone, + xmmUnaryRmR: defKindOp2, + xmmUnaryRmRImm: defKindOp2, + xmmCmpRmR: defKindNone, + xmmRmR: defKindNone, + xmmRmRImm: defKindNone, + mov64MR: defKindOp2, + movsxRmR: defKindOp2, + movzxRmR: defKindOp2, + gprToXmm: defKindOp2, + xmmToGpr: defKindOp2, + cmove: defKindNone, + call: defKindCall, + callIndirect: defKindCall, + ud2: defKindNone, + jmp: defKindNone, + jmpIf: defKindNone, + jmpTableIsland: defKindNone, + cmpRmiR: defKindNone, + exitSequence: defKindNone, + lea: defKindOp2, + setcc: defKindOp2, + zeros: defKindOp2, + sourceOffsetInfo: defKindNone, + fcvtToSintSequence: defKindNone, + defineUninitializedReg: defKindOp2, + fcvtToUintSequence: defKindNone, + xmmCMov: defKindOp2, + idivRemSequence: defKindDivRem, + blendvpd: defKindNone, + mfence: defKindNone, + xchg: defKindNone, + lockcmpxchg: defKindNone, + lockxadd: defKindNone, + neg: defKindNone, + nopUseReg: defKindNone, +} + +// String implements fmt.Stringer. +func (d defKind) String() string { + switch d { + case defKindNone: + return "none" + case defKindOp2: + return "op2" + case defKindCall: + return "call" + case defKindDivRem: + return "divrem" + default: + return "invalid" + } +} + +type useKind byte + +const ( + useKindNone useKind = iota + 1 + useKindOp1 + // useKindOp1Op2Reg is Op1 can be any operand, Op2 must be a register. + useKindOp1Op2Reg + // useKindOp1RegOp2 is Op1 must be a register, Op2 can be any operand. + useKindOp1RegOp2 + // useKindRaxOp1RegOp2 is Op1 must be a register, Op2 can be any operand, and RAX is used. 
+ useKindRaxOp1RegOp2 + useKindDivRem + useKindBlendvpd + useKindCall + useKindCallInd + useKindFcvtToSintSequence + useKindFcvtToUintSequence +) + +var useKinds = [instrMax]useKind{ + nop0: useKindNone, + ret: useKindNone, + movRR: useKindOp1, + movRM: useKindOp1RegOp2, + xmmMovRM: useKindOp1RegOp2, + cmove: useKindOp1Op2Reg, + aluRmiR: useKindOp1Op2Reg, + shiftR: useKindOp1Op2Reg, + imm: useKindNone, + unaryRmR: useKindOp1, + xmmRmiReg: useKindOp1Op2Reg, + xmmUnaryRmR: useKindOp1, + xmmUnaryRmRImm: useKindOp1, + xmmCmpRmR: useKindOp1Op2Reg, + xmmRmR: useKindOp1Op2Reg, + xmmRmRImm: useKindOp1Op2Reg, + mov64MR: useKindOp1, + movzxRmR: useKindOp1, + movsxRmR: useKindOp1, + gprToXmm: useKindOp1, + xmmToGpr: useKindOp1, + call: useKindCall, + callIndirect: useKindCallInd, + ud2: useKindNone, + jmpIf: useKindOp1, + jmp: useKindOp1, + cmpRmiR: useKindOp1Op2Reg, + exitSequence: useKindOp1, + lea: useKindOp1, + jmpTableIsland: useKindNone, + setcc: useKindNone, + zeros: useKindNone, + sourceOffsetInfo: useKindNone, + fcvtToSintSequence: useKindFcvtToSintSequence, + defineUninitializedReg: useKindNone, + fcvtToUintSequence: useKindFcvtToUintSequence, + xmmCMov: useKindOp1, + idivRemSequence: useKindDivRem, + blendvpd: useKindBlendvpd, + mfence: useKindNone, + xchg: useKindOp1RegOp2, + lockcmpxchg: useKindRaxOp1RegOp2, + lockxadd: useKindOp1RegOp2, + neg: useKindOp1, + nopUseReg: useKindOp1, +} + +func (u useKind) String() string { + switch u { + case useKindNone: + return "none" + case useKindOp1: + return "op1" + case useKindOp1Op2Reg: + return "op1op2Reg" + case useKindOp1RegOp2: + return "op1RegOp2" + case useKindCall: + return "call" + case useKindCallInd: + return "callInd" + default: + return "invalid" + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr_encoding.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr_encoding.go new file mode 100644 index 000000000..6637b428c --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr_encoding.go @@ -0,0 +1,1683 @@ +package amd64 + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +func (i *instruction) encode(c backend.Compiler) (needsLabelResolution bool) { + switch kind := i.kind; kind { + case nop0, sourceOffsetInfo, defineUninitializedReg, fcvtToSintSequence, fcvtToUintSequence, nopUseReg: + case ret: + encodeRet(c) + case imm: + dst := regEncodings[i.op2.reg().RealReg()] + con := i.u1 + if i.b1 { // 64 bit. + if lower32willSignExtendTo64(con) { + // Sign extend mov(imm32). 
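			// (Illustration: a constant such as 0xFFFF_FFFF_8000_0000 passes the
			// lower32willSignExtendTo64 check, so only its low four bytes are
			// emitted and the CPU's sign extension recreates the full value,
			// whereas 0x0000_0001_0000_0000 fails it and takes the 10-byte
			// movabsq path in the else branch below.)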
+ encodeRegReg(c, + legacyPrefixesNone, + 0xc7, 1, + 0, + dst, + rexInfo(0).setW(), + ) + c.Emit4Bytes(uint32(con)) + } else { + c.EmitByte(rexEncodingW | dst.rexBit()) + c.EmitByte(0xb8 | dst.encoding()) + c.Emit8Bytes(con) + } + } else { + if dst.rexBit() > 0 { + c.EmitByte(rexEncodingDefault | 0x1) + } + c.EmitByte(0xb8 | dst.encoding()) + c.Emit4Bytes(uint32(con)) + } + + case aluRmiR: + var rex rexInfo + if i.b1 { + rex = rex.setW() + } else { + rex = rex.clearW() + } + + dst := regEncodings[i.op2.reg().RealReg()] + + aluOp := aluRmiROpcode(i.u1) + if aluOp == aluRmiROpcodeMul { + op1 := i.op1 + const regMemOpc, regMemOpcNum = 0x0FAF, 2 + switch op1.kind { + case operandKindReg: + src := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, legacyPrefixesNone, regMemOpc, regMemOpcNum, dst, src, rex) + case operandKindMem: + m := i.op1.addressMode() + encodeRegMem(c, legacyPrefixesNone, regMemOpc, regMemOpcNum, dst, m, rex) + case operandKindImm32: + imm8 := lower8willSignExtendTo32(op1.imm32()) + var opc uint32 + if imm8 { + opc = 0x6b + } else { + opc = 0x69 + } + encodeRegReg(c, legacyPrefixesNone, opc, 1, dst, dst, rex) + if imm8 { + c.EmitByte(byte(op1.imm32())) + } else { + c.Emit4Bytes(op1.imm32()) + } + default: + panic("BUG: invalid operand kind") + } + } else { + const opcodeNum = 1 + var opcR, opcM, subOpcImm uint32 + switch aluOp { + case aluRmiROpcodeAdd: + opcR, opcM, subOpcImm = 0x01, 0x03, 0x0 + case aluRmiROpcodeSub: + opcR, opcM, subOpcImm = 0x29, 0x2b, 0x5 + case aluRmiROpcodeAnd: + opcR, opcM, subOpcImm = 0x21, 0x23, 0x4 + case aluRmiROpcodeOr: + opcR, opcM, subOpcImm = 0x09, 0x0b, 0x1 + case aluRmiROpcodeXor: + opcR, opcM, subOpcImm = 0x31, 0x33, 0x6 + default: + panic("BUG: invalid aluRmiROpcode") + } + + op1 := i.op1 + switch op1.kind { + case operandKindReg: + src := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, legacyPrefixesNone, opcR, opcodeNum, src, dst, rex) + case operandKindMem: + m := i.op1.addressMode() + encodeRegMem(c, legacyPrefixesNone, opcM, opcodeNum, dst, m, rex) + case operandKindImm32: + imm8 := lower8willSignExtendTo32(op1.imm32()) + var opc uint32 + if imm8 { + opc = 0x83 + } else { + opc = 0x81 + } + encodeRegReg(c, legacyPrefixesNone, opc, opcodeNum, regEnc(subOpcImm), dst, rex) + if imm8 { + c.EmitByte(byte(op1.imm32())) + } else { + c.Emit4Bytes(op1.imm32()) + } + default: + panic("BUG: invalid operand kind") + } + } + + case movRR: + src := regEncodings[i.op1.reg().RealReg()] + dst := regEncodings[i.op2.reg().RealReg()] + var rex rexInfo + if i.b1 { + rex = rex.setW() + } else { + rex = rex.clearW() + } + encodeRegReg(c, legacyPrefixesNone, 0x89, 1, src, dst, rex) + + case xmmRmR, blendvpd: + op := sseOpcode(i.u1) + var legPrex legacyPrefixes + var opcode uint32 + var opcodeNum uint32 + switch op { + case sseOpcodeAddps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F58, 2 + case sseOpcodeAddpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F58, 2 + case sseOpcodeAddss: + legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F58, 2 + case sseOpcodeAddsd: + legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F58, 2 + case sseOpcodeAndps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F54, 2 + case sseOpcodeAndpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F54, 2 + case sseOpcodeAndnps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F55, 2 + case sseOpcodeAndnpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F55, 2 + case sseOpcodeBlendvps: + legPrex, opcode, opcodeNum = 
legacyPrefixes0x66, 0x0F3814, 3 + case sseOpcodeBlendvpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3815, 3 + case sseOpcodeDivps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F5E, 2 + case sseOpcodeDivpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F5E, 2 + case sseOpcodeDivss: + legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5E, 2 + case sseOpcodeDivsd: + legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F5E, 2 + case sseOpcodeMaxps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F5F, 2 + case sseOpcodeMaxpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F5F, 2 + case sseOpcodeMaxss: + legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5F, 2 + case sseOpcodeMaxsd: + legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F5F, 2 + case sseOpcodeMinps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F5D, 2 + case sseOpcodeMinpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F5D, 2 + case sseOpcodeMinss: + legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5D, 2 + case sseOpcodeMinsd: + legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F5D, 2 + case sseOpcodeMovlhps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F16, 2 + case sseOpcodeMovsd: + legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F10, 2 + case sseOpcodeMulps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F59, 2 + case sseOpcodeMulpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F59, 2 + case sseOpcodeMulss: + legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F59, 2 + case sseOpcodeMulsd: + legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F59, 2 + case sseOpcodeOrpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F56, 2 + case sseOpcodeOrps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F56, 2 + case sseOpcodePackssdw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F6B, 2 + case sseOpcodePacksswb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F63, 2 + case sseOpcodePackusdw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F382B, 3 + case sseOpcodePackuswb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F67, 2 + case sseOpcodePaddb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FFC, 2 + case sseOpcodePaddd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FFE, 2 + case sseOpcodePaddq: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FD4, 2 + case sseOpcodePaddw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FFD, 2 + case sseOpcodePaddsb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FEC, 2 + case sseOpcodePaddsw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FED, 2 + case sseOpcodePaddusb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDC, 2 + case sseOpcodePaddusw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDD, 2 + case sseOpcodePand: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDB, 2 + case sseOpcodePandn: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDF, 2 + case sseOpcodePavgb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FE0, 2 + case sseOpcodePavgw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FE3, 2 + case sseOpcodePcmpeqb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F74, 2 + case sseOpcodePcmpeqw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F75, 2 + case sseOpcodePcmpeqd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F76, 2 + case sseOpcodePcmpeqq: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3829, 3 + case sseOpcodePcmpgtb: + legPrex, opcode, 
opcodeNum = legacyPrefixes0x66, 0x0F64, 2 + case sseOpcodePcmpgtw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F65, 2 + case sseOpcodePcmpgtd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F66, 2 + case sseOpcodePcmpgtq: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3837, 3 + case sseOpcodePmaddwd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FF5, 2 + case sseOpcodePmaxsb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383C, 3 + case sseOpcodePmaxsw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FEE, 2 + case sseOpcodePmaxsd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383D, 3 + case sseOpcodePmaxub: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDE, 2 + case sseOpcodePmaxuw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383E, 3 + case sseOpcodePmaxud: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383F, 3 + case sseOpcodePminsb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3838, 3 + case sseOpcodePminsw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FEA, 2 + case sseOpcodePminsd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3839, 3 + case sseOpcodePminub: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDA, 2 + case sseOpcodePminuw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383A, 3 + case sseOpcodePminud: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383B, 3 + case sseOpcodePmulld: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3840, 3 + case sseOpcodePmullw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FD5, 2 + case sseOpcodePmuludq: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FF4, 2 + case sseOpcodePor: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FEB, 2 + case sseOpcodePshufb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3800, 3 + case sseOpcodePsubb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FF8, 2 + case sseOpcodePsubd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FFA, 2 + case sseOpcodePsubq: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FFB, 2 + case sseOpcodePsubw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FF9, 2 + case sseOpcodePsubsb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FE8, 2 + case sseOpcodePsubsw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FE9, 2 + case sseOpcodePsubusb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FD8, 2 + case sseOpcodePsubusw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FD9, 2 + case sseOpcodePunpckhbw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F68, 2 + case sseOpcodePunpcklbw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F60, 2 + case sseOpcodePxor: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FEF, 2 + case sseOpcodeSubps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F5C, 2 + case sseOpcodeSubpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F5C, 2 + case sseOpcodeSubss: + legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5C, 2 + case sseOpcodeSubsd: + legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F5C, 2 + case sseOpcodeXorps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F57, 2 + case sseOpcodeXorpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F57, 2 + case sseOpcodePmulhrsw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F380B, 3 + case sseOpcodeUnpcklps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F14, 2 + case sseOpcodePmaddubsw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 
0x0F3804, 3 + default: + if kind == blendvpd { + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3815, 3 + } else { + panic(fmt.Sprintf("Unsupported sseOpcode: %s", op)) + } + } + + dst := regEncodings[i.op2.reg().RealReg()] + + rex := rexInfo(0).clearW() + op1 := i.op1 + if op1.kind == operandKindReg { + src := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, legPrex, opcode, opcodeNum, dst, src, rex) + } else if i.op1.kind == operandKindMem { + m := i.op1.addressMode() + encodeRegMem(c, legPrex, opcode, opcodeNum, dst, m, rex) + } else { + panic("BUG: invalid operand kind") + } + + case gprToXmm: + var legPrefix legacyPrefixes + var opcode uint32 + const opcodeNum = 2 + switch sseOpcode(i.u1) { + case sseOpcodeMovd, sseOpcodeMovq: + legPrefix, opcode = legacyPrefixes0x66, 0x0f6e + case sseOpcodeCvtsi2ss: + legPrefix, opcode = legacyPrefixes0xF3, 0x0f2a + case sseOpcodeCvtsi2sd: + legPrefix, opcode = legacyPrefixes0xF2, 0x0f2a + default: + panic(fmt.Sprintf("Unsupported sseOpcode: %s", sseOpcode(i.u1))) + } + + var rex rexInfo + if i.b1 { + rex = rex.setW() + } else { + rex = rex.clearW() + } + dst := regEncodings[i.op2.reg().RealReg()] + + op1 := i.op1 + if op1.kind == operandKindReg { + src := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, legPrefix, opcode, opcodeNum, dst, src, rex) + } else if i.op1.kind == operandKindMem { + m := i.op1.addressMode() + encodeRegMem(c, legPrefix, opcode, opcodeNum, dst, m, rex) + } else { + panic("BUG: invalid operand kind") + } + + case xmmUnaryRmR: + var prefix legacyPrefixes + var opcode uint32 + var opcodeNum uint32 + op := sseOpcode(i.u1) + switch op { + case sseOpcodeCvtss2sd: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5A, 2 + case sseOpcodeCvtsd2ss: + prefix, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F5A, 2 + case sseOpcodeMovaps: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F28, 2 + case sseOpcodeMovapd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F28, 2 + case sseOpcodeMovdqa: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F6F, 2 + case sseOpcodeMovdqu: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F6F, 2 + case sseOpcodeMovsd: + prefix, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F10, 2 + case sseOpcodeMovss: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F10, 2 + case sseOpcodeMovups: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F10, 2 + case sseOpcodeMovupd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F10, 2 + case sseOpcodePabsb: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F381C, 3 + case sseOpcodePabsw: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F381D, 3 + case sseOpcodePabsd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F381E, 3 + case sseOpcodePmovsxbd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3821, 3 + case sseOpcodePmovsxbw: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3820, 3 + case sseOpcodePmovsxbq: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3822, 3 + case sseOpcodePmovsxwd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3823, 3 + case sseOpcodePmovsxwq: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3824, 3 + case sseOpcodePmovsxdq: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3825, 3 + case sseOpcodePmovzxbd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3831, 3 + case sseOpcodePmovzxbw: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3830, 3 + case sseOpcodePmovzxbq: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3832, 3 + case sseOpcodePmovzxwd: + 
prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3833, 3 + case sseOpcodePmovzxwq: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3834, 3 + case sseOpcodePmovzxdq: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3835, 3 + case sseOpcodeSqrtps: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F51, 2 + case sseOpcodeSqrtpd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F51, 2 + case sseOpcodeSqrtss: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F51, 2 + case sseOpcodeSqrtsd: + prefix, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F51, 2 + case sseOpcodeXorps: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F57, 2 + case sseOpcodeXorpd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F57, 2 + case sseOpcodeCvtdq2ps: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F5B, 2 + case sseOpcodeCvtdq2pd: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0FE6, 2 + case sseOpcodeCvtps2pd: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F5A, 2 + case sseOpcodeCvtpd2ps: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F5A, 2 + case sseOpcodeCvttps2dq: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5B, 2 + case sseOpcodeCvttpd2dq: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0FE6, 2 + default: + panic(fmt.Sprintf("Unsupported sseOpcode: %s", op)) + } + + dst := regEncodings[i.op2.reg().RealReg()] + + rex := rexInfo(0).clearW() + op1 := i.op1 + if op1.kind == operandKindReg { + src := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, prefix, opcode, opcodeNum, dst, src, rex) + } else if i.op1.kind == operandKindMem { + m := i.op1.addressMode() + needsLabelResolution = encodeRegMem(c, prefix, opcode, opcodeNum, dst, m, rex) + } else { + panic("BUG: invalid operand kind") + } + + case xmmUnaryRmRImm: + var prefix legacyPrefixes + var opcode uint32 + var opcodeNum uint32 + op := sseOpcode(i.u1) + switch op { + case sseOpcodeRoundps: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f3a08, 3 + case sseOpcodeRoundss: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f3a0a, 3 + case sseOpcodeRoundpd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f3a09, 3 + case sseOpcodeRoundsd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f3a0b, 3 + } + rex := rexInfo(0).clearW() + dst := regEncodings[i.op2.reg().RealReg()] + op1 := i.op1 + if op1.kind == operandKindReg { + src := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, prefix, opcode, opcodeNum, dst, src, rex) + } else if i.op1.kind == operandKindMem { + m := i.op1.addressMode() + encodeRegMem(c, prefix, opcode, opcodeNum, dst, m, rex) + } else { + panic("BUG: invalid operand kind") + } + + c.EmitByte(byte(i.u2)) + + case unaryRmR: + var prefix legacyPrefixes + var opcode uint32 + var opcodeNum uint32 + op := unaryRmROpcode(i.u1) + // We assume size is either 32 or 64. + switch op { + case unaryRmROpcodeBsr: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0fbd, 2 + case unaryRmROpcodeBsf: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0fbc, 2 + case unaryRmROpcodeLzcnt: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0fbd, 2 + case unaryRmROpcodeTzcnt: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0fbc, 2 + case unaryRmROpcodePopcnt: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0fb8, 2 + default: + panic(fmt.Sprintf("Unsupported unaryRmROpcode: %s", op)) + } + + dst := regEncodings[i.op2.reg().RealReg()] + + rex := rexInfo(0) + if i.b1 { // 64 bit. 
+ rex = rexInfo(0).setW() + } else { + rex = rexInfo(0).clearW() + } + op1 := i.op1 + if op1.kind == operandKindReg { + src := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, prefix, opcode, opcodeNum, dst, src, rex) + } else if i.op1.kind == operandKindMem { + m := i.op1.addressMode() + encodeRegMem(c, prefix, opcode, opcodeNum, dst, m, rex) + } else { + panic("BUG: invalid operand kind") + } + + case not: + var prefix legacyPrefixes + src := regEncodings[i.op1.reg().RealReg()] + rex := rexInfo(0) + if i.b1 { // 64 bit. + rex = rexInfo(0).setW() + } else { + rex = rexInfo(0).clearW() + } + subopcode := uint8(2) + encodeEncEnc(c, prefix, 0xf7, 1, subopcode, uint8(src), rex) + + case neg: + var prefix legacyPrefixes + src := regEncodings[i.op1.reg().RealReg()] + rex := rexInfo(0) + if i.b1 { // 64 bit. + rex = rexInfo(0).setW() + } else { + rex = rexInfo(0).clearW() + } + subopcode := uint8(3) + encodeEncEnc(c, prefix, 0xf7, 1, subopcode, uint8(src), rex) + + case div: + rex := rexInfo(0) + if i.b1 { // 64 bit. + rex = rexInfo(0).setW() + } else { + rex = rexInfo(0).clearW() + } + var subopcode uint8 + if i.u1 != 0 { // Signed. + subopcode = 7 + } else { + subopcode = 6 + } + + divisor := i.op1 + if divisor.kind == operandKindReg { + src := regEncodings[divisor.reg().RealReg()] + encodeEncEnc(c, legacyPrefixesNone, 0xf7, 1, subopcode, uint8(src), rex) + } else if divisor.kind == operandKindMem { + m := divisor.addressMode() + encodeEncMem(c, legacyPrefixesNone, 0xf7, 1, subopcode, m, rex) + } else { + panic("BUG: invalid operand kind") + } + + case mulHi: + var prefix legacyPrefixes + rex := rexInfo(0) + if i.b1 { // 64 bit. + rex = rexInfo(0).setW() + } else { + rex = rexInfo(0).clearW() + } + + signed := i.u1 != 0 + var subopcode uint8 + if signed { + subopcode = 5 + } else { + subopcode = 4 + } + + // src1 is implicitly rax, + // dst_lo is implicitly rax, + // dst_hi is implicitly rdx. + src2 := i.op1 + if src2.kind == operandKindReg { + src := regEncodings[src2.reg().RealReg()] + encodeEncEnc(c, prefix, 0xf7, 1, subopcode, uint8(src), rex) + } else if src2.kind == operandKindMem { + m := src2.addressMode() + encodeEncMem(c, prefix, 0xf7, 1, subopcode, m, rex) + } else { + panic("BUG: invalid operand kind") + } + + case signExtendData: + if i.b1 { // 64 bit. 
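+ // 0x48 is a REX prefix with only the W bit set, so 0x48 0x99 encodes CQO
+ // (sign-extend RAX into RDX:RAX); a bare 0x99 encodes CDQ (EAX into EDX:EAX).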
+ c.EmitByte(0x48) + c.EmitByte(0x99) + } else { + c.EmitByte(0x99) + } + case movzxRmR, movsxRmR: + signed := i.kind == movsxRmR + + ext := extMode(i.u1) + var opcode uint32 + var opcodeNum uint32 + var rex rexInfo + switch ext { + case extModeBL: + if signed { + opcode, opcodeNum, rex = 0x0fbe, 2, rex.clearW() + } else { + opcode, opcodeNum, rex = 0x0fb6, 2, rex.clearW() + } + case extModeBQ: + if signed { + opcode, opcodeNum, rex = 0x0fbe, 2, rex.setW() + } else { + opcode, opcodeNum, rex = 0x0fb6, 2, rex.setW() + } + case extModeWL: + if signed { + opcode, opcodeNum, rex = 0x0fbf, 2, rex.clearW() + } else { + opcode, opcodeNum, rex = 0x0fb7, 2, rex.clearW() + } + case extModeWQ: + if signed { + opcode, opcodeNum, rex = 0x0fbf, 2, rex.setW() + } else { + opcode, opcodeNum, rex = 0x0fb7, 2, rex.setW() + } + case extModeLQ: + if signed { + opcode, opcodeNum, rex = 0x63, 1, rex.setW() + } else { + opcode, opcodeNum, rex = 0x8b, 1, rex.clearW() + } + default: + panic("BUG: invalid extMode") + } + + op := i.op1 + dst := regEncodings[i.op2.reg().RealReg()] + switch op.kind { + case operandKindReg: + src := regEncodings[op.reg().RealReg()] + if ext == extModeBL || ext == extModeBQ { + // Some destinations must be encoded with REX.R = 1. + if e := src.encoding(); e >= 4 && e <= 7 { + rex = rex.always() + } + } + encodeRegReg(c, legacyPrefixesNone, opcode, opcodeNum, dst, src, rex) + case operandKindMem: + m := op.addressMode() + encodeRegMem(c, legacyPrefixesNone, opcode, opcodeNum, dst, m, rex) + default: + panic("BUG: invalid operand kind") + } + + case mov64MR: + m := i.op1.addressMode() + encodeLoad64(c, m, i.op2.reg().RealReg()) + + case lea: + needsLabelResolution = true + dst := regEncodings[i.op2.reg().RealReg()] + rex := rexInfo(0).setW() + const opcode, opcodeNum = 0x8d, 1 + switch i.op1.kind { + case operandKindMem: + a := i.op1.addressMode() + encodeRegMem(c, legacyPrefixesNone, opcode, opcodeNum, dst, a, rex) + case operandKindLabel: + rex.encode(c, regRexBit(byte(dst)), 0) + c.EmitByte(byte((opcode) & 0xff)) + + // Indicate "LEAQ [RIP + 32bit displacement]. + // https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing + c.EmitByte(encodeModRM(0b00, dst.encoding(), 0b101)) + + // This will be resolved later, so we just emit a placeholder (0xffffffff for testing). + c.Emit4Bytes(0xffffffff) + default: + panic("BUG: invalid operand kind") + } + + case movRM: + m := i.op2.addressMode() + src := regEncodings[i.op1.reg().RealReg()] + + var rex rexInfo + switch i.u1 { + case 1: + if e := src.encoding(); e >= 4 && e <= 7 { + rex = rex.always() + } + encodeRegMem(c, legacyPrefixesNone, 0x88, 1, src, m, rex.clearW()) + case 2: + encodeRegMem(c, legacyPrefixes0x66, 0x89, 1, src, m, rex.clearW()) + case 4: + encodeRegMem(c, legacyPrefixesNone, 0x89, 1, src, m, rex.clearW()) + case 8: + encodeRegMem(c, legacyPrefixesNone, 0x89, 1, src, m, rex.setW()) + default: + panic(fmt.Sprintf("BUG: invalid size %d: %s", i.u1, i.String())) + } + + case shiftR: + src := regEncodings[i.op2.reg().RealReg()] + amount := i.op1 + + var opcode uint32 + var prefix legacyPrefixes + rex := rexInfo(0) + if i.b1 { // 64 bit. 
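+ // REX.W picks the 64-bit form of the shift; the shift kind itself is encoded as the
+ // ModRM /digit, which is why i.u1 is passed as the reg operand to encodeEncEnc below.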
+ rex = rexInfo(0).setW() + } else { + rex = rexInfo(0).clearW() + } + + switch amount.kind { + case operandKindReg: + if amount.reg() != rcxVReg { + panic("BUG: invalid reg operand: must be rcx") + } + opcode, prefix = 0xd3, legacyPrefixesNone + encodeEncEnc(c, prefix, opcode, 1, uint8(i.u1), uint8(src), rex) + case operandKindImm32: + opcode, prefix = 0xc1, legacyPrefixesNone + encodeEncEnc(c, prefix, opcode, 1, uint8(i.u1), uint8(src), rex) + c.EmitByte(byte(amount.imm32())) + default: + panic("BUG: invalid operand kind") + } + case xmmRmiReg: + const legPrefix = legacyPrefixes0x66 + rex := rexInfo(0).clearW() + dst := regEncodings[i.op2.reg().RealReg()] + + var opcode uint32 + var regDigit uint8 + + op := sseOpcode(i.u1) + op1 := i.op1 + if i.op1.kind == operandKindImm32 { + switch op { + case sseOpcodePsllw: + opcode, regDigit = 0x0f71, 6 + case sseOpcodePslld: + opcode, regDigit = 0x0f72, 6 + case sseOpcodePsllq: + opcode, regDigit = 0x0f73, 6 + case sseOpcodePsraw: + opcode, regDigit = 0x0f71, 4 + case sseOpcodePsrad: + opcode, regDigit = 0x0f72, 4 + case sseOpcodePsrlw: + opcode, regDigit = 0x0f71, 2 + case sseOpcodePsrld: + opcode, regDigit = 0x0f72, 2 + case sseOpcodePsrlq: + opcode, regDigit = 0x0f73, 2 + default: + panic("invalid opcode") + } + + encodeEncEnc(c, legPrefix, opcode, 2, regDigit, uint8(dst), rex) + imm32 := op1.imm32() + if imm32 > 0xff&imm32 { + panic("immediate value does not fit 1 byte") + } + c.EmitByte(uint8(imm32)) + } else { + switch op { + case sseOpcodePsllw: + opcode = 0x0ff1 + case sseOpcodePslld: + opcode = 0x0ff2 + case sseOpcodePsllq: + opcode = 0x0ff3 + case sseOpcodePsraw: + opcode = 0x0fe1 + case sseOpcodePsrad: + opcode = 0x0fe2 + case sseOpcodePsrlw: + opcode = 0x0fd1 + case sseOpcodePsrld: + opcode = 0x0fd2 + case sseOpcodePsrlq: + opcode = 0x0fd3 + default: + panic("invalid opcode") + } + + if op1.kind == operandKindReg { + reg := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, legPrefix, opcode, 2, dst, reg, rex) + } else if op1.kind == operandKindMem { + m := op1.addressMode() + encodeRegMem(c, legPrefix, opcode, 2, dst, m, rex) + } else { + panic("BUG: invalid operand kind") + } + } + + case cmpRmiR: + var opcode uint32 + isCmp := i.u1 != 0 + rex := rexInfo(0) + _64 := i.b1 + if _64 { // 64 bit. + rex = rex.setW() + } else { + rex = rex.clearW() + } + dst := regEncodings[i.op2.reg().RealReg()] + op1 := i.op1 + switch op1.kind { + case operandKindReg: + reg := regEncodings[op1.reg().RealReg()] + if isCmp { + opcode = 0x39 + } else { + opcode = 0x85 + } + // Here we swap the encoding of the operands for CMP to be consistent with the output of LLVM/GCC. 
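+ // With opcode 0x39 (CMP r/m, r), op1 lands in ModRM.reg and dst in ModRM.rm,
+ // which is the reg-reg form GCC/LLVM emit.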
+ encodeRegReg(c, legacyPrefixesNone, opcode, 1, reg, dst, rex) + + case operandKindMem: + if isCmp { + opcode = 0x3b + } else { + opcode = 0x85 + } + m := op1.addressMode() + encodeRegMem(c, legacyPrefixesNone, opcode, 1, dst, m, rex) + + case operandKindImm32: + imm32 := op1.imm32() + useImm8 := isCmp && lower8willSignExtendTo32(imm32) + var subopcode uint8 + + switch { + case isCmp && useImm8: + opcode, subopcode = 0x83, 7 + case isCmp && !useImm8: + opcode, subopcode = 0x81, 7 + default: + opcode, subopcode = 0xf7, 0 + } + encodeEncEnc(c, legacyPrefixesNone, opcode, 1, subopcode, uint8(dst), rex) + if useImm8 { + c.EmitByte(uint8(imm32)) + } else { + c.Emit4Bytes(imm32) + } + + default: + panic("BUG: invalid operand kind") + } + case setcc: + cc := cond(i.u1) + dst := regEncodings[i.op2.reg().RealReg()] + rex := rexInfo(0).clearW().always() + opcode := uint32(0x0f90) + uint32(cc) + encodeEncEnc(c, legacyPrefixesNone, opcode, 2, 0, uint8(dst), rex) + case cmove: + cc := cond(i.u1) + dst := regEncodings[i.op2.reg().RealReg()] + rex := rexInfo(0) + if i.b1 { // 64 bit. + rex = rex.setW() + } else { + rex = rex.clearW() + } + opcode := uint32(0x0f40) + uint32(cc) + src := i.op1 + switch src.kind { + case operandKindReg: + srcReg := regEncodings[src.reg().RealReg()] + encodeRegReg(c, legacyPrefixesNone, opcode, 2, dst, srcReg, rex) + case operandKindMem: + m := src.addressMode() + encodeRegMem(c, legacyPrefixesNone, opcode, 2, dst, m, rex) + default: + panic("BUG: invalid operand kind") + } + case push64: + op := i.op1 + + switch op.kind { + case operandKindReg: + dst := regEncodings[op.reg().RealReg()] + if dst.rexBit() > 0 { + c.EmitByte(rexEncodingDefault | 0x1) + } + c.EmitByte(0x50 | dst.encoding()) + case operandKindMem: + m := op.addressMode() + encodeRegMem( + c, legacyPrefixesNone, 0xff, 1, regEnc(6), m, rexInfo(0).clearW(), + ) + case operandKindImm32: + c.EmitByte(0x68) + c.Emit4Bytes(op.imm32()) + default: + panic("BUG: invalid operand kind") + } + + case pop64: + dst := regEncodings[i.op1.reg().RealReg()] + if dst.rexBit() > 0 { + c.EmitByte(rexEncodingDefault | 0x1) + } + c.EmitByte(0x58 | dst.encoding()) + + case xmmMovRM: + var legPrefix legacyPrefixes + var opcode uint32 + const opcodeNum = 2 + switch sseOpcode(i.u1) { + case sseOpcodeMovaps: + legPrefix, opcode = legacyPrefixesNone, 0x0f29 + case sseOpcodeMovapd: + legPrefix, opcode = legacyPrefixes0x66, 0x0f29 + case sseOpcodeMovdqa: + legPrefix, opcode = legacyPrefixes0x66, 0x0f7f + case sseOpcodeMovdqu: + legPrefix, opcode = legacyPrefixes0xF3, 0x0f7f + case sseOpcodeMovss: + legPrefix, opcode = legacyPrefixes0xF3, 0x0f11 + case sseOpcodeMovsd: + legPrefix, opcode = legacyPrefixes0xF2, 0x0f11 + case sseOpcodeMovups: + legPrefix, opcode = legacyPrefixesNone, 0x0f11 + case sseOpcodeMovupd: + legPrefix, opcode = legacyPrefixes0x66, 0x0f11 + default: + panic(fmt.Sprintf("Unsupported sseOpcode: %s", sseOpcode(i.u1))) + } + + dst := regEncodings[i.op1.reg().RealReg()] + encodeRegMem(c, legPrefix, opcode, opcodeNum, dst, i.op2.addressMode(), rexInfo(0).clearW()) + case xmmLoadConst: + panic("TODO") + case xmmToGpr: + var legPrefix legacyPrefixes + var opcode uint32 + var argSwap bool + const opcodeNum = 2 + switch sseOpcode(i.u1) { + case sseOpcodeMovd, sseOpcodeMovq: + legPrefix, opcode, argSwap = legacyPrefixes0x66, 0x0f7e, false + case sseOpcodeMovmskps: + legPrefix, opcode, argSwap = legacyPrefixesNone, 0x0f50, true + case sseOpcodeMovmskpd: + legPrefix, opcode, argSwap = legacyPrefixes0x66, 0x0f50, true + case 
sseOpcodePmovmskb: + legPrefix, opcode, argSwap = legacyPrefixes0x66, 0x0fd7, true + case sseOpcodeCvttss2si: + legPrefix, opcode, argSwap = legacyPrefixes0xF3, 0x0f2c, true + case sseOpcodeCvttsd2si: + legPrefix, opcode, argSwap = legacyPrefixes0xF2, 0x0f2c, true + default: + panic(fmt.Sprintf("Unsupported sseOpcode: %s", sseOpcode(i.u1))) + } + + var rex rexInfo + if i.b1 { + rex = rex.setW() + } else { + rex = rex.clearW() + } + src := regEncodings[i.op1.reg().RealReg()] + dst := regEncodings[i.op2.reg().RealReg()] + if argSwap { + src, dst = dst, src + } + encodeRegReg(c, legPrefix, opcode, opcodeNum, src, dst, rex) + + case cvtUint64ToFloatSeq: + panic("TODO") + case cvtFloatToSintSeq: + panic("TODO") + case cvtFloatToUintSeq: + panic("TODO") + case xmmMinMaxSeq: + panic("TODO") + case xmmCmpRmR: + var prefix legacyPrefixes + var opcode uint32 + var opcodeNum uint32 + rex := rexInfo(0) + _64 := i.b1 + if _64 { // 64 bit. + rex = rex.setW() + } else { + rex = rex.clearW() + } + + op := sseOpcode(i.u1) + switch op { + case sseOpcodePtest: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f3817, 3 + case sseOpcodeUcomisd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f2e, 2 + case sseOpcodeUcomiss: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0f2e, 2 + default: + panic(fmt.Sprintf("Unsupported sseOpcode: %s", op)) + } + + dst := regEncodings[i.op2.reg().RealReg()] + op1 := i.op1 + switch op1.kind { + case operandKindReg: + reg := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, prefix, opcode, opcodeNum, dst, reg, rex) + + case operandKindMem: + m := op1.addressMode() + encodeRegMem(c, prefix, opcode, opcodeNum, dst, m, rex) + + default: + panic("BUG: invalid operand kind") + } + case xmmRmRImm: + op := sseOpcode(i.u1) + var legPrex legacyPrefixes + var opcode uint32 + var opcodeNum uint32 + var swap bool + switch op { + case sseOpcodeCmpps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0FC2, 2 + case sseOpcodeCmppd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FC2, 2 + case sseOpcodeCmpss: + legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0FC2, 2 + case sseOpcodeCmpsd: + legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0FC2, 2 + case sseOpcodeInsertps: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A21, 3 + case sseOpcodePalignr: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A0F, 3 + case sseOpcodePinsrb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A20, 3 + case sseOpcodePinsrw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FC4, 2 + case sseOpcodePinsrd, sseOpcodePinsrq: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A22, 3 + case sseOpcodePextrb: + swap = true + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A14, 3 + case sseOpcodePextrw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FC5, 2 + case sseOpcodePextrd, sseOpcodePextrq: + swap = true + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A16, 3 + case sseOpcodePshufd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F70, 2 + case sseOpcodeRoundps: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A08, 3 + case sseOpcodeRoundpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A09, 3 + case sseOpcodeShufps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0FC6, 2 + default: + panic(fmt.Sprintf("Unsupported sseOpcode: %s", op)) + } + + dst := regEncodings[i.op2.reg().RealReg()] + + var rex rexInfo + if op == sseOpcodePextrq || op == sseOpcodePinsrq { + rex = rexInfo(0).setW() + } else { + rex = 
rexInfo(0).clearW() + } + op1 := i.op1 + if op1.kind == operandKindReg { + src := regEncodings[op1.reg().RealReg()] + if swap { + src, dst = dst, src + } + encodeRegReg(c, legPrex, opcode, opcodeNum, dst, src, rex) + } else if i.op1.kind == operandKindMem { + if swap { + panic("BUG: this is not possible to encode") + } + m := i.op1.addressMode() + encodeRegMem(c, legPrex, opcode, opcodeNum, dst, m, rex) + } else { + panic("BUG: invalid operand kind") + } + + c.EmitByte(byte(i.u2)) + + case jmp: + const ( + regMemOpcode = 0xff + regMemOpcodeNum = 1 + regMemSubOpcode = 4 + ) + op := i.op1 + switch op.kind { + case operandKindLabel: + needsLabelResolution = true + fallthrough + case operandKindImm32: + c.EmitByte(0xe9) + c.Emit4Bytes(op.imm32()) + case operandKindMem: + m := op.addressMode() + encodeRegMem(c, + legacyPrefixesNone, + regMemOpcode, regMemOpcodeNum, + regMemSubOpcode, m, rexInfo(0).clearW(), + ) + case operandKindReg: + r := op.reg().RealReg() + encodeRegReg( + c, + legacyPrefixesNone, + regMemOpcode, regMemOpcodeNum, + regMemSubOpcode, + regEncodings[r], rexInfo(0).clearW(), + ) + default: + panic("BUG: invalid operand kind") + } + + case jmpIf: + op := i.op1 + switch op.kind { + case operandKindLabel: + needsLabelResolution = true + fallthrough + case operandKindImm32: + c.EmitByte(0x0f) + c.EmitByte(0x80 | cond(i.u1).encoding()) + c.Emit4Bytes(op.imm32()) + default: + panic("BUG: invalid operand kind") + } + + case jmpTableIsland: + needsLabelResolution = true + for tc := uint64(0); tc < i.u2; tc++ { + c.Emit8Bytes(0) + } + + case exitSequence: + execCtx := i.op1.reg() + allocatedAmode := i.op2.addressMode() + + // Restore the RBP, RSP, and return to the Go code: + *allocatedAmode = amode{ + kindWithShift: uint32(amodeImmReg), base: execCtx, + imm32: wazevoapi.ExecutionContextOffsetOriginalFramePointer.U32(), + } + encodeLoad64(c, allocatedAmode, rbp) + allocatedAmode.imm32 = wazevoapi.ExecutionContextOffsetOriginalStackPointer.U32() + encodeLoad64(c, allocatedAmode, rsp) + encodeRet(c) + + case ud2: + c.EmitByte(0x0f) + c.EmitByte(0x0b) + + case call: + c.EmitByte(0xe8) + // Meaning that the call target is a function value, and requires relocation. + c.AddRelocationInfo(ssa.FuncRef(i.u1)) + // Note that this is zero as a placeholder for the call target if it's a function value. + c.Emit4Bytes(uint32(i.u2)) + + case callIndirect: + op := i.op1 + + const opcodeNum = 1 + const opcode = 0xff + rex := rexInfo(0).clearW() + switch op.kind { + case operandKindReg: + dst := regEncodings[op.reg().RealReg()] + encodeRegReg(c, + legacyPrefixesNone, + opcode, opcodeNum, + regEnc(2), + dst, + rex, + ) + case operandKindMem: + m := op.addressMode() + encodeRegMem(c, + legacyPrefixesNone, + opcode, opcodeNum, + regEnc(2), + m, + rex, + ) + default: + panic("BUG: invalid operand kind") + } + + case xchg: + src, dst := regEncodings[i.op1.reg().RealReg()], i.op2 + size := i.u1 + + var rex rexInfo + var opcode uint32 + lp := legacyPrefixesNone + switch size { + case 8: + opcode = 0x87 + rex = rexInfo(0).setW() + case 4: + opcode = 0x87 + rex = rexInfo(0).clearW() + case 2: + lp = legacyPrefixes0x66 + opcode = 0x87 + rex = rexInfo(0).clearW() + case 1: + opcode = 0x86 + if i.op2.kind == operandKindReg { + panic("TODO?: xchg on two 1-byte registers") + } + // Some destinations must be encoded with REX.R = 1. 
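+ // Without any REX prefix, encodings 4-7 select AH/CH/DH/BH; forcing an (otherwise
+ // empty) REX prefix makes them select SPL/BPL/SIL/DIL instead.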
+ if e := src.encoding(); e >= 4 && e <= 7 { + rex = rexInfo(0).always() + } + default: + panic(fmt.Sprintf("BUG: invalid size %d: %s", size, i.String())) + } + + switch dst.kind { + case operandKindMem: + m := dst.addressMode() + encodeRegMem(c, lp, opcode, 1, src, m, rex) + case operandKindReg: + r := dst.reg().RealReg() + encodeRegReg(c, lp, opcode, 1, src, regEncodings[r], rex) + default: + panic("BUG: invalid operand kind") + } + + case lockcmpxchg: + src, dst := regEncodings[i.op1.reg().RealReg()], i.op2 + size := i.u1 + + var rex rexInfo + var opcode uint32 + lp := legacyPrefixes0xF0 // Lock prefix. + switch size { + case 8: + opcode = 0x0FB1 + rex = rexInfo(0).setW() + case 4: + opcode = 0x0FB1 + rex = rexInfo(0).clearW() + case 2: + lp = legacyPrefixes0x660xF0 // Legacy prefix + Lock prefix. + opcode = 0x0FB1 + rex = rexInfo(0).clearW() + case 1: + opcode = 0x0FB0 + // Some destinations must be encoded with REX.R = 1. + if e := src.encoding(); e >= 4 && e <= 7 { + rex = rexInfo(0).always() + } + default: + panic(fmt.Sprintf("BUG: invalid size %d: %s", size, i.String())) + } + + switch dst.kind { + case operandKindMem: + m := dst.addressMode() + encodeRegMem(c, lp, opcode, 2, src, m, rex) + default: + panic("BUG: invalid operand kind") + } + + case lockxadd: + src, dst := regEncodings[i.op1.reg().RealReg()], i.op2 + size := i.u1 + + var rex rexInfo + var opcode uint32 + lp := legacyPrefixes0xF0 // Lock prefix. + switch size { + case 8: + opcode = 0x0FC1 + rex = rexInfo(0).setW() + case 4: + opcode = 0x0FC1 + rex = rexInfo(0).clearW() + case 2: + lp = legacyPrefixes0x660xF0 // Legacy prefix + Lock prefix. + opcode = 0x0FC1 + rex = rexInfo(0).clearW() + case 1: + opcode = 0x0FC0 + // Some destinations must be encoded with REX.R = 1. + if e := src.encoding(); e >= 4 && e <= 7 { + rex = rexInfo(0).always() + } + default: + panic(fmt.Sprintf("BUG: invalid size %d: %s", size, i.String())) + } + + switch dst.kind { + case operandKindMem: + m := dst.addressMode() + encodeRegMem(c, lp, opcode, 2, src, m, rex) + default: + panic("BUG: invalid operand kind") + } + + case zeros: + r := i.op2.reg() + if r.RegType() == regalloc.RegTypeInt { + i.asAluRmiR(aluRmiROpcodeXor, newOperandReg(r), r, true) + } else { + i.asXmmRmR(sseOpcodePxor, newOperandReg(r), r) + } + i.encode(c) + + case mfence: + // https://www.felixcloutier.com/x86/mfence + c.EmitByte(0x0f) + c.EmitByte(0xae) + c.EmitByte(0xf0) + + default: + panic(fmt.Sprintf("TODO: %v", i.kind)) + } + return +} + +func encodeLoad64(c backend.Compiler, m *amode, rd regalloc.RealReg) { + dst := regEncodings[rd] + encodeRegMem(c, legacyPrefixesNone, 0x8b, 1, dst, m, rexInfo(0).setW()) +} + +func encodeRet(c backend.Compiler) { + c.EmitByte(0xc3) +} + +func encodeEncEnc( + c backend.Compiler, + legPrefixes legacyPrefixes, + opcodes uint32, + opcodeNum uint32, + r uint8, + rm uint8, + rex rexInfo, +) { + legPrefixes.encode(c) + rex.encode(c, r>>3, rm>>3) + + for opcodeNum > 0 { + opcodeNum-- + c.EmitByte(byte((opcodes >> (opcodeNum << 3)) & 0xff)) + } + c.EmitByte(encodeModRM(3, r&7, rm&7)) +} + +func encodeRegReg( + c backend.Compiler, + legPrefixes legacyPrefixes, + opcodes uint32, + opcodeNum uint32, + r regEnc, + rm regEnc, + rex rexInfo, +) { + encodeEncEnc(c, legPrefixes, opcodes, opcodeNum, uint8(r), uint8(rm), rex) +} + +func encodeModRM(mod byte, reg byte, rm byte) byte { + return mod<<6 | reg<<3 | rm +} + +func encodeSIB(shift byte, encIndex byte, encBase byte) byte { + return shift<<6 | encIndex<<3 | encBase +} + +func encodeRegMem( + c 
backend.Compiler, legPrefixes legacyPrefixes, opcodes uint32, opcodeNum uint32, r regEnc, m *amode, rex rexInfo, +) (needsLabelResolution bool) { + needsLabelResolution = encodeEncMem(c, legPrefixes, opcodes, opcodeNum, uint8(r), m, rex) + return +} + +func encodeEncMem( + c backend.Compiler, legPrefixes legacyPrefixes, opcodes uint32, opcodeNum uint32, r uint8, m *amode, rex rexInfo, +) (needsLabelResolution bool) { + legPrefixes.encode(c) + + const ( + modNoDisplacement = 0b00 + modShortDisplacement = 0b01 + modLongDisplacement = 0b10 + + useSBI = 4 // the encoding of rsp or r12 register. + ) + + switch m.kind() { + case amodeImmReg, amodeImmRBP: + base := m.base.RealReg() + baseEnc := regEncodings[base] + + rex.encode(c, regRexBit(r), baseEnc.rexBit()) + + for opcodeNum > 0 { + opcodeNum-- + c.EmitByte(byte((opcodes >> (opcodeNum << 3)) & 0xff)) + } + + // SIB byte is the last byte of the memory encoding before the displacement + const sibByte = 0x24 // == encodeSIB(0, 4, 4) + + immZero, baseRbp, baseR13 := m.imm32 == 0, base == rbp, base == r13 + short := lower8willSignExtendTo32(m.imm32) + rspOrR12 := base == rsp || base == r12 + + if immZero && !baseRbp && !baseR13 { // rbp or r13 can't be used as base for without displacement encoding. + c.EmitByte(encodeModRM(modNoDisplacement, regEncoding(r), baseEnc.encoding())) + if rspOrR12 { + c.EmitByte(sibByte) + } + } else if short { // Note: this includes the case where m.imm32 == 0 && base == rbp || base == r13. + c.EmitByte(encodeModRM(modShortDisplacement, regEncoding(r), baseEnc.encoding())) + if rspOrR12 { + c.EmitByte(sibByte) + } + c.EmitByte(byte(m.imm32)) + } else { + c.EmitByte(encodeModRM(modLongDisplacement, regEncoding(r), baseEnc.encoding())) + if rspOrR12 { + c.EmitByte(sibByte) + } + c.Emit4Bytes(m.imm32) + } + + case amodeRegRegShift: + base := m.base.RealReg() + baseEnc := regEncodings[base] + index := m.index.RealReg() + indexEnc := regEncodings[index] + + if index == rsp { + panic("BUG: rsp can't be used as index of addressing mode") + } + + rex.encodeForIndex(c, regEnc(r), indexEnc, baseEnc) + + for opcodeNum > 0 { + opcodeNum-- + c.EmitByte(byte((opcodes >> (opcodeNum << 3)) & 0xff)) + } + + immZero, baseRbp, baseR13 := m.imm32 == 0, base == rbp, base == r13 + if immZero && !baseRbp && !baseR13 { // rbp or r13 can't be used as base for without displacement encoding. (curious why? because it's interpreted as RIP relative addressing). + c.EmitByte(encodeModRM(modNoDisplacement, regEncoding(r), useSBI)) + c.EmitByte(encodeSIB(m.shift(), indexEnc.encoding(), baseEnc.encoding())) + } else if lower8willSignExtendTo32(m.imm32) { + c.EmitByte(encodeModRM(modShortDisplacement, regEncoding(r), useSBI)) + c.EmitByte(encodeSIB(m.shift(), indexEnc.encoding(), baseEnc.encoding())) + c.EmitByte(byte(m.imm32)) + } else { + c.EmitByte(encodeModRM(modLongDisplacement, regEncoding(r), useSBI)) + c.EmitByte(encodeSIB(m.shift(), indexEnc.encoding(), baseEnc.encoding())) + c.Emit4Bytes(m.imm32) + } + + case amodeRipRel: + rex.encode(c, regRexBit(r), 0) + for opcodeNum > 0 { + opcodeNum-- + c.EmitByte(byte((opcodes >> (opcodeNum << 3)) & 0xff)) + } + + // Indicate "LEAQ [RIP + 32bit displacement]. + // https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing + c.EmitByte(encodeModRM(0b00, regEncoding(r), 0b101)) + + // This will be resolved later, so we just emit a placeholder. 
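+ // e.g. an x86-64 `lea rax, [rip+disp32]` is 48 8D 05 xx xx xx xx; the trailing four
+ // bytes are the RIP-relative displacement patched once the label offset is known.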
+ needsLabelResolution = true + c.Emit4Bytes(0) + + default: + panic("BUG: invalid addressing mode") + } + return +} + +const ( + rexEncodingDefault byte = 0x40 + rexEncodingW = rexEncodingDefault | 0x08 +) + +// rexInfo is a bit set to indicate: +// +// 0x01: W bit must be cleared. +// 0x02: REX prefix must be emitted. +type rexInfo byte + +func (ri rexInfo) setW() rexInfo { + return ri | 0x01 +} + +func (ri rexInfo) clearW() rexInfo { + return ri & 0x02 +} + +func (ri rexInfo) always() rexInfo { + return ri | 0x02 +} + +func (ri rexInfo) notAlways() rexInfo { //nolint + return ri & 0x01 +} + +func (ri rexInfo) encode(c backend.Compiler, r uint8, b uint8) { + var w byte = 0 + if ri&0x01 != 0 { + w = 0x01 + } + rex := rexEncodingDefault | w<<3 | r<<2 | b + if rex != rexEncodingDefault || ri&0x02 != 0 { + c.EmitByte(rex) + } +} + +func (ri rexInfo) encodeForIndex(c backend.Compiler, encR regEnc, encIndex regEnc, encBase regEnc) { + var w byte = 0 + if ri&0x01 != 0 { + w = 0x01 + } + r := encR.rexBit() + x := encIndex.rexBit() + b := encBase.rexBit() + rex := byte(0x40) | w<<3 | r<<2 | x<<1 | b + if rex != 0x40 || ri&0x02 != 0 { + c.EmitByte(rex) + } +} + +type regEnc byte + +func (r regEnc) rexBit() byte { + return regRexBit(byte(r)) +} + +func (r regEnc) encoding() byte { + return regEncoding(byte(r)) +} + +func regRexBit(r byte) byte { + return r >> 3 +} + +func regEncoding(r byte) byte { + return r & 0x07 +} + +var regEncodings = [...]regEnc{ + rax: 0b000, + rcx: 0b001, + rdx: 0b010, + rbx: 0b011, + rsp: 0b100, + rbp: 0b101, + rsi: 0b110, + rdi: 0b111, + r8: 0b1000, + r9: 0b1001, + r10: 0b1010, + r11: 0b1011, + r12: 0b1100, + r13: 0b1101, + r14: 0b1110, + r15: 0b1111, + xmm0: 0b000, + xmm1: 0b001, + xmm2: 0b010, + xmm3: 0b011, + xmm4: 0b100, + xmm5: 0b101, + xmm6: 0b110, + xmm7: 0b111, + xmm8: 0b1000, + xmm9: 0b1001, + xmm10: 0b1010, + xmm11: 0b1011, + xmm12: 0b1100, + xmm13: 0b1101, + xmm14: 0b1110, + xmm15: 0b1111, +} + +type legacyPrefixes byte + +const ( + legacyPrefixesNone legacyPrefixes = iota + legacyPrefixes0x66 + legacyPrefixes0xF0 + legacyPrefixes0x660xF0 + legacyPrefixes0xF2 + legacyPrefixes0xF3 +) + +func (p legacyPrefixes) encode(c backend.Compiler) { + switch p { + case legacyPrefixesNone: + case legacyPrefixes0x66: + c.EmitByte(0x66) + case legacyPrefixes0xF0: + c.EmitByte(0xf0) + case legacyPrefixes0x660xF0: + c.EmitByte(0x66) + c.EmitByte(0xf0) + case legacyPrefixes0xF2: + c.EmitByte(0xf2) + case legacyPrefixes0xF3: + c.EmitByte(0xf3) + default: + panic("BUG: invalid legacy prefix") + } +} + +func lower32willSignExtendTo64(x uint64) bool { + xs := int64(x) + return xs == int64(uint64(int32(xs))) +} + +func lower8willSignExtendTo32(x uint32) bool { + xs := int32(x) + return xs == ((xs << 24) >> 24) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_constant.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_constant.go new file mode 100644 index 000000000..55d05ef63 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_constant.go @@ -0,0 +1,71 @@ +package amd64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// lowerConstant allocates a new VReg and inserts the instruction to load the constant value. 
+func (m *machine) lowerConstant(instr *ssa.Instruction) (vr regalloc.VReg) { + val := instr.Return() + valType := val.Type() + + vr = m.c.AllocateVReg(valType) + m.insertLoadConstant(instr, vr) + return +} + +// InsertLoadConstantBlockArg implements backend.Machine. +func (m *machine) InsertLoadConstantBlockArg(instr *ssa.Instruction, vr regalloc.VReg) { + m.insertLoadConstant(instr, vr) +} + +func (m *machine) insertLoadConstant(instr *ssa.Instruction, vr regalloc.VReg) { + val := instr.Return() + valType := val.Type() + v := instr.ConstantVal() + + bits := valType.Bits() + if bits < 64 { // Clear the redundant bits just in case it's unexpectedly sign-extended, etc. + v = v & ((1 << valType.Bits()) - 1) + } + + switch valType { + case ssa.TypeF32, ssa.TypeF64: + m.lowerFconst(vr, v, bits == 64) + case ssa.TypeI32, ssa.TypeI64: + m.lowerIconst(vr, v, bits == 64) + default: + panic("BUG") + } +} + +func (m *machine) lowerFconst(dst regalloc.VReg, c uint64, _64 bool) { + if c == 0 { + xor := m.allocateInstr().asZeros(dst) + m.insert(xor) + } else { + var tmpType ssa.Type + if _64 { + tmpType = ssa.TypeI64 + } else { + tmpType = ssa.TypeI32 + } + tmpInt := m.c.AllocateVReg(tmpType) + loadToGP := m.allocateInstr().asImm(tmpInt, c, _64) + m.insert(loadToGP) + + movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpInt), dst, _64) + m.insert(movToXmm) + } +} + +func (m *machine) lowerIconst(dst regalloc.VReg, c uint64, _64 bool) { + i := m.allocateInstr() + if c == 0 { + i.asZeros(dst) + } else { + i.asImm(dst, c, _64) + } + m.insert(i) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_mem.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_mem.go new file mode 100644 index 000000000..bee673d25 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_mem.go @@ -0,0 +1,187 @@ +package amd64 + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +var addendsMatchOpcodes = [...]ssa.Opcode{ssa.OpcodeUExtend, ssa.OpcodeSExtend, ssa.OpcodeIadd, ssa.OpcodeIconst, ssa.OpcodeIshl} + +type addend struct { + r regalloc.VReg + off int64 + shift byte +} + +func (a addend) String() string { + return fmt.Sprintf("addend{r=%s, off=%d, shift=%d}", a.r, a.off, a.shift) +} + +// lowerToAddressMode converts a pointer to an addressMode that can be used as an operand for load/store instructions. +func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32) (am *amode) { + def := m.c.ValueDefinition(ptr) + + if offsetBase&0x80000000 != 0 { + // Special casing the huge base offset whose MSB is set. In x64, the immediate is always + // sign-extended, but our IR semantics requires the offset base is always unsigned. + // Note that this should be extremely rare or even this shouldn't hit in the real application, + // therefore we don't need to optimize this case in my opinion. 
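+ // e.g. offsetBase = 0x8000_0000 does not fit the signed 32-bit displacement
+ // (it would sign-extend to -2GiB), so fold it into a register and use displacement 0.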
+ + a := m.lowerAddend(def) + off64 := a.off + int64(offsetBase) + offsetBaseReg := m.c.AllocateVReg(ssa.TypeI64) + m.lowerIconst(offsetBaseReg, uint64(off64), true) + if a.r != regalloc.VRegInvalid { + return m.newAmodeRegRegShift(0, offsetBaseReg, a.r, a.shift) + } else { + return m.newAmodeImmReg(0, offsetBaseReg) + } + } + + if op := m.c.MatchInstrOneOf(def, addendsMatchOpcodes[:]); op == ssa.OpcodeIadd { + add := def.Instr + x, y := add.Arg2() + xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) + ax := m.lowerAddend(xDef) + ay := m.lowerAddend(yDef) + add.MarkLowered() + return m.lowerAddendsToAmode(ax, ay, offsetBase) + } else { + // If it is not an Iadd, then we lower the one addend. + a := m.lowerAddend(def) + // off is always 0 if r is valid. + if a.r != regalloc.VRegInvalid { + if a.shift != 0 { + tmpReg := m.c.AllocateVReg(ssa.TypeI64) + m.lowerIconst(tmpReg, 0, true) + return m.newAmodeRegRegShift(offsetBase, tmpReg, a.r, a.shift) + } + return m.newAmodeImmReg(offsetBase, a.r) + } else { + off64 := a.off + int64(offsetBase) + tmpReg := m.c.AllocateVReg(ssa.TypeI64) + m.lowerIconst(tmpReg, uint64(off64), true) + return m.newAmodeImmReg(0, tmpReg) + } + } +} + +func (m *machine) lowerAddendsToAmode(x, y addend, offBase uint32) *amode { + if x.r != regalloc.VRegInvalid && x.off != 0 || y.r != regalloc.VRegInvalid && y.off != 0 { + panic("invalid input") + } + + u64 := uint64(x.off+y.off) + uint64(offBase) + if u64 != 0 { + if _, ok := asImm32(u64, false); !ok { + tmpReg := m.c.AllocateVReg(ssa.TypeI64) + m.lowerIconst(tmpReg, u64, true) + // Blank u64 as it has been already lowered. + u64 = 0 + + if x.r == regalloc.VRegInvalid { + x.r = tmpReg + } else if y.r == regalloc.VRegInvalid { + y.r = tmpReg + } else { + // We already know that either rx or ry is invalid, + // so we overwrite it with the temporary register. + panic("BUG") + } + } + } + + u32 := uint32(u64) + switch { + // We assume rx, ry are valid iff offx, offy are 0. + case x.r != regalloc.VRegInvalid && y.r != regalloc.VRegInvalid: + switch { + case x.shift != 0 && y.shift != 0: + // Cannot absorb two shifted registers, must lower one to a shift instruction. + shifted := m.allocateInstr() + shifted.asShiftR(shiftROpShiftLeft, newOperandImm32(uint32(x.shift)), x.r, true) + m.insert(shifted) + + return m.newAmodeRegRegShift(u32, x.r, y.r, y.shift) + case x.shift != 0 && y.shift == 0: + // Swap base and index. + x, y = y, x + fallthrough + default: + return m.newAmodeRegRegShift(u32, x.r, y.r, y.shift) + } + case x.r == regalloc.VRegInvalid && y.r != regalloc.VRegInvalid: + x, y = y, x + fallthrough + case x.r != regalloc.VRegInvalid && y.r == regalloc.VRegInvalid: + if x.shift != 0 { + zero := m.c.AllocateVReg(ssa.TypeI64) + m.lowerIconst(zero, 0, true) + return m.newAmodeRegRegShift(u32, zero, x.r, x.shift) + } + return m.newAmodeImmReg(u32, x.r) + default: // Both are invalid: use the offset. + tmpReg := m.c.AllocateVReg(ssa.TypeI64) + m.lowerIconst(tmpReg, u64, true) + return m.newAmodeImmReg(0, tmpReg) + } +} + +func (m *machine) lowerAddend(x *backend.SSAValueDefinition) addend { + if x.IsFromBlockParam() { + return addend{x.BlkParamVReg, 0, 0} + } + // Ensure the addend is not referenced in multiple places; we will discard nested Iadds. 
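+ // (MatchInstrOneOf only matches a single-use defining instruction, so values referenced
+ // elsewhere fall through to getOperand_Reg below.)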
+ op := m.c.MatchInstrOneOf(x, addendsMatchOpcodes[:]) + if op != ssa.OpcodeInvalid && op != ssa.OpcodeIadd { + return m.lowerAddendFromInstr(x.Instr) + } + p := m.getOperand_Reg(x) + return addend{p.reg(), 0, 0} +} + +// lowerAddendFromInstr takes an instruction returns a Vreg and an offset that can be used in an address mode. +// The Vreg is regalloc.VRegInvalid if the addend cannot be lowered to a register. +// The offset is 0 if the addend can be lowered to a register. +func (m *machine) lowerAddendFromInstr(instr *ssa.Instruction) addend { + instr.MarkLowered() + switch op := instr.Opcode(); op { + case ssa.OpcodeIconst: + u64 := instr.ConstantVal() + if instr.Return().Type().Bits() == 32 { + return addend{regalloc.VRegInvalid, int64(int32(u64)), 0} // sign-extend. + } else { + return addend{regalloc.VRegInvalid, int64(u64), 0} + } + case ssa.OpcodeUExtend, ssa.OpcodeSExtend: + input := instr.Arg() + inputDef := m.c.ValueDefinition(input) + if input.Type().Bits() != 32 { + panic("BUG: invalid input type " + input.Type().String()) + } + constInst := inputDef.IsFromInstr() && inputDef.Instr.Constant() + switch { + case constInst && op == ssa.OpcodeSExtend: + return addend{regalloc.VRegInvalid, int64(uint32(inputDef.Instr.ConstantVal())), 0} + case constInst && op == ssa.OpcodeUExtend: + return addend{regalloc.VRegInvalid, int64(int32(inputDef.Instr.ConstantVal())), 0} // sign-extend! + default: + r := m.getOperand_Reg(inputDef) + return addend{r.reg(), 0, 0} + } + case ssa.OpcodeIshl: + // If the addend is a shift, we can only handle it if the shift amount is a constant. + x, amount := instr.Arg2() + amountDef := m.c.ValueDefinition(amount) + if amountDef.IsFromInstr() && amountDef.Instr.Constant() && amountDef.Instr.ConstantVal() <= 3 { + r := m.getOperand_Reg(m.c.ValueDefinition(x)) + return addend{r.reg(), 0, uint8(amountDef.Instr.ConstantVal())} + } + r := m.getOperand_Reg(m.c.ValueDefinition(x)) + return addend{r.reg(), 0, 0} + } + panic("BUG: invalid opcode") +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go new file mode 100644 index 000000000..310ad2203 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go @@ -0,0 +1,3611 @@ +package amd64 + +import ( + "context" + "encoding/binary" + "fmt" + "math" + "strings" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" + "github.com/tetratelabs/wazero/internal/platform" +) + +// NewBackend returns a new backend for arm64. 
+func NewBackend() backend.Machine { + ectx := backend.NewExecutableContextT[instruction]( + resetInstruction, + setNext, + setPrev, + asNop, + ) + return &machine{ + ectx: ectx, + cpuFeatures: platform.CpuFeatures, + regAlloc: regalloc.NewAllocator(regInfo), + spillSlots: map[regalloc.VRegID]int64{}, + amodePool: wazevoapi.NewPool[amode](nil), + constSwizzleMaskConstIndex: -1, + constSqmulRoundSatIndex: -1, + constI8x16SHLMaskTableIndex: -1, + constI8x16LogicalSHRMaskTableIndex: -1, + constF64x2CvtFromIMaskIndex: -1, + constTwop52Index: -1, + constI32sMaxOnF64x2Index: -1, + constI32uMaxOnF64x2Index: -1, + constAllOnesI8x16Index: -1, + constAllOnesI16x8Index: -1, + constExtAddPairwiseI16x8uMask1Index: -1, + constExtAddPairwiseI16x8uMask2Index: -1, + } +} + +type ( + // machine implements backend.Machine for amd64. + machine struct { + c backend.Compiler + ectx *backend.ExecutableContextT[instruction] + stackBoundsCheckDisabled bool + + amodePool wazevoapi.Pool[amode] + + cpuFeatures platform.CpuFeatureFlags + + regAlloc regalloc.Allocator + regAllocFn *backend.RegAllocFunction[*instruction, *machine] + regAllocStarted bool + + spillSlotSize int64 + spillSlots map[regalloc.VRegID]int64 + currentABI *backend.FunctionABI + clobberedRegs []regalloc.VReg + + maxRequiredStackSizeForCalls int64 + + labelResolutionPends []labelResolutionPend + + jmpTableTargets [][]uint32 + consts []_const + + constSwizzleMaskConstIndex, constSqmulRoundSatIndex, + constI8x16SHLMaskTableIndex, constI8x16LogicalSHRMaskTableIndex, + constF64x2CvtFromIMaskIndex, constTwop52Index, + constI32sMaxOnF64x2Index, constI32uMaxOnF64x2Index, + constAllOnesI8x16Index, constAllOnesI16x8Index, + constExtAddPairwiseI16x8uMask1Index, constExtAddPairwiseI16x8uMask2Index int + } + + _const struct { + lo, hi uint64 + _var []byte + label *labelPosition + } + + labelResolutionPend struct { + instr *instruction + instrOffset int64 + // imm32Offset is the offset of the last 4 bytes of the instruction. + imm32Offset int64 + } + + labelPosition = backend.LabelPosition[instruction] +) + +func (m *machine) getOrAllocateConstLabel(i *int, _var []byte) backend.Label { + index := *i + if index == -1 { + label := m.allocateLabel() + index = len(m.consts) + m.consts = append(m.consts, _const{ + _var: _var, + label: label, + }) + *i = index + } + return m.consts[index].label.L +} + +// Reset implements backend.Machine. +func (m *machine) Reset() { + m.consts = m.consts[:0] + m.clobberedRegs = m.clobberedRegs[:0] + for key := range m.spillSlots { + m.clobberedRegs = append(m.clobberedRegs, regalloc.VReg(key)) + } + for _, key := range m.clobberedRegs { + delete(m.spillSlots, regalloc.VRegID(key)) + } + + m.stackBoundsCheckDisabled = false + m.ectx.Reset() + + m.regAllocFn.Reset() + m.regAlloc.Reset() + m.regAllocStarted = false + m.clobberedRegs = m.clobberedRegs[:0] + + m.spillSlotSize = 0 + m.maxRequiredStackSizeForCalls = 0 + + m.amodePool.Reset() + m.jmpTableTargets = m.jmpTableTargets[:0] + m.constSwizzleMaskConstIndex = -1 + m.constSqmulRoundSatIndex = -1 + m.constI8x16SHLMaskTableIndex = -1 + m.constI8x16LogicalSHRMaskTableIndex = -1 + m.constF64x2CvtFromIMaskIndex = -1 + m.constTwop52Index = -1 + m.constI32sMaxOnF64x2Index = -1 + m.constI32uMaxOnF64x2Index = -1 + m.constAllOnesI8x16Index = -1 + m.constAllOnesI16x8Index = -1 + m.constExtAddPairwiseI16x8uMask1Index = -1 + m.constExtAddPairwiseI16x8uMask2Index = -1 +} + +// ExecutableContext implements backend.Machine. 
+func (m *machine) ExecutableContext() backend.ExecutableContext { return m.ectx } + +// DisableStackCheck implements backend.Machine. +func (m *machine) DisableStackCheck() { m.stackBoundsCheckDisabled = true } + +// SetCompiler implements backend.Machine. +func (m *machine) SetCompiler(c backend.Compiler) { + m.c = c + m.regAllocFn = backend.NewRegAllocFunction[*instruction, *machine](m, c.SSABuilder(), c) +} + +// SetCurrentABI implements backend.Machine. +func (m *machine) SetCurrentABI(abi *backend.FunctionABI) { + m.currentABI = abi +} + +// RegAlloc implements backend.Machine. +func (m *machine) RegAlloc() { + rf := m.regAllocFn + for _, pos := range m.ectx.OrderedBlockLabels { + rf.AddBlock(pos.SB, pos.L, pos.Begin, pos.End) + } + + m.regAllocStarted = true + m.regAlloc.DoAllocation(rf) + // Now that we know the final spill slot size, we must align spillSlotSize to 16 bytes. + m.spillSlotSize = (m.spillSlotSize + 15) &^ 15 +} + +// InsertReturn implements backend.Machine. +func (m *machine) InsertReturn() { + i := m.allocateInstr().asRet() + m.insert(i) +} + +// LowerSingleBranch implements backend.Machine. +func (m *machine) LowerSingleBranch(b *ssa.Instruction) { + ectx := m.ectx + switch b.Opcode() { + case ssa.OpcodeJump: + _, _, targetBlk := b.BranchData() + if b.IsFallthroughJump() { + return + } + jmp := m.allocateInstr() + target := ectx.GetOrAllocateSSABlockLabel(targetBlk) + if target == backend.LabelReturn { + jmp.asRet() + } else { + jmp.asJmp(newOperandLabel(target)) + } + m.insert(jmp) + case ssa.OpcodeBrTable: + index, target := b.BrTableData() + m.lowerBrTable(index, target) + default: + panic("BUG: unexpected branch opcode" + b.Opcode().String()) + } +} + +func (m *machine) addJmpTableTarget(targets []ssa.BasicBlock) (index int) { + // TODO: reuse the slice! + labels := make([]uint32, len(targets)) + for j, target := range targets { + labels[j] = uint32(m.ectx.GetOrAllocateSSABlockLabel(target)) + } + index = len(m.jmpTableTargets) + m.jmpTableTargets = append(m.jmpTableTargets, labels) + return +} + +var condBranchMatches = [...]ssa.Opcode{ssa.OpcodeIcmp, ssa.OpcodeFcmp} + +func (m *machine) lowerBrTable(index ssa.Value, targets []ssa.BasicBlock) { + _v := m.getOperand_Reg(m.c.ValueDefinition(index)) + v := m.copyToTmp(_v.reg()) + + // First, we need to do the bounds check. + maxIndex := m.c.AllocateVReg(ssa.TypeI32) + m.lowerIconst(maxIndex, uint64(len(targets)-1), false) + cmp := m.allocateInstr().asCmpRmiR(true, newOperandReg(maxIndex), v, false) + m.insert(cmp) + + // Then do the conditional move maxIndex to v if v > maxIndex. + cmov := m.allocateInstr().asCmove(condNB, newOperandReg(maxIndex), v, false) + m.insert(cmov) + + // Now that v has the correct index. Load the address of the jump table into the addr. + addr := m.c.AllocateVReg(ssa.TypeI64) + leaJmpTableAddr := m.allocateInstr() + m.insert(leaJmpTableAddr) + + // Then add the target's offset into jmpTableAddr. + loadTargetOffsetFromJmpTable := m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, + // Shift by 3 because each entry is 8 bytes. + newOperandMem(m.newAmodeRegRegShift(0, addr, v, 3)), addr, true) + m.insert(loadTargetOffsetFromJmpTable) + + // Now ready to jump. 
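+ // addr now holds the jump-table base plus the selected 8-byte entry, i.e. the target
+ // address once the table entries are resolved.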
+ jmp := m.allocateInstr().asJmp(newOperandReg(addr)) + m.insert(jmp) + + jmpTableBegin, jmpTableBeginLabel := m.allocateBrTarget() + m.insert(jmpTableBegin) + leaJmpTableAddr.asLEA(newOperandLabel(jmpTableBeginLabel), addr) + + jmpTable := m.allocateInstr() + targetSliceIndex := m.addJmpTableTarget(targets) + jmpTable.asJmpTableSequence(targetSliceIndex, len(targets)) + m.insert(jmpTable) +} + +// LowerConditionalBranch implements backend.Machine. +func (m *machine) LowerConditionalBranch(b *ssa.Instruction) { + exctx := m.ectx + cval, args, targetBlk := b.BranchData() + if len(args) > 0 { + panic(fmt.Sprintf( + "conditional branch shouldn't have args; likely a bug in critical edge splitting: from %s to %s", + exctx.CurrentSSABlk, + targetBlk, + )) + } + + target := exctx.GetOrAllocateSSABlockLabel(targetBlk) + cvalDef := m.c.ValueDefinition(cval) + + switch m.c.MatchInstrOneOf(cvalDef, condBranchMatches[:]) { + case ssa.OpcodeIcmp: + cvalInstr := cvalDef.Instr + x, y, c := cvalInstr.IcmpData() + + cc := condFromSSAIntCmpCond(c) + if b.Opcode() == ssa.OpcodeBrz { + cc = cc.invert() + } + + // First, perform the comparison and set the flag. + xd, yd := m.c.ValueDefinition(x), m.c.ValueDefinition(y) + if !m.tryLowerBandToFlag(xd, yd) { + m.lowerIcmpToFlag(xd, yd, x.Type() == ssa.TypeI64) + } + + // Then perform the conditional branch. + m.insert(m.allocateInstr().asJmpIf(cc, newOperandLabel(target))) + cvalDef.Instr.MarkLowered() + case ssa.OpcodeFcmp: + cvalInstr := cvalDef.Instr + + f1, f2, and := m.lowerFcmpToFlags(cvalInstr) + isBrz := b.Opcode() == ssa.OpcodeBrz + if isBrz { + f1 = f1.invert() + } + if f2 == condInvalid { + m.insert(m.allocateInstr().asJmpIf(f1, newOperandLabel(target))) + } else { + if isBrz { + f2 = f2.invert() + and = !and + } + jmp1, jmp2 := m.allocateInstr(), m.allocateInstr() + m.insert(jmp1) + m.insert(jmp2) + notTaken, notTakenLabel := m.allocateBrTarget() + m.insert(notTaken) + if and { + jmp1.asJmpIf(f1.invert(), newOperandLabel(notTakenLabel)) + jmp2.asJmpIf(f2, newOperandLabel(target)) + } else { + jmp1.asJmpIf(f1, newOperandLabel(target)) + jmp2.asJmpIf(f2, newOperandLabel(target)) + } + } + + cvalDef.Instr.MarkLowered() + default: + v := m.getOperand_Reg(cvalDef) + + var cc cond + if b.Opcode() == ssa.OpcodeBrz { + cc = condZ + } else { + cc = condNZ + } + + // Perform test %v, %v to set the flag. + cmp := m.allocateInstr().asCmpRmiR(false, v, v.reg(), false) + m.insert(cmp) + m.insert(m.allocateInstr().asJmpIf(cc, newOperandLabel(target))) + } +} + +// LowerInstr implements backend.Machine. +func (m *machine) LowerInstr(instr *ssa.Instruction) { + if l := instr.SourceOffset(); l.Valid() { + info := m.allocateInstr().asEmitSourceOffsetInfo(l) + m.insert(info) + } + + switch op := instr.Opcode(); op { + case ssa.OpcodeBrz, ssa.OpcodeBrnz, ssa.OpcodeJump, ssa.OpcodeBrTable: + panic("BUG: branching instructions are handled by LowerBranches") + case ssa.OpcodeReturn: + panic("BUG: return must be handled by backend.Compiler") + case ssa.OpcodeIconst, ssa.OpcodeF32const, ssa.OpcodeF64const: // Constant instructions are inlined. 
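+ // Constants are materialized at each use site via lowerConstant/insertLoadConstant
+ // (see lower_constant.go), so no code is emitted here.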
+ case ssa.OpcodeCall, ssa.OpcodeCallIndirect: + m.lowerCall(instr) + case ssa.OpcodeStore, ssa.OpcodeIstore8, ssa.OpcodeIstore16, ssa.OpcodeIstore32: + m.lowerStore(instr) + case ssa.OpcodeIadd: + m.lowerAluRmiROp(instr, aluRmiROpcodeAdd) + case ssa.OpcodeIsub: + m.lowerAluRmiROp(instr, aluRmiROpcodeSub) + case ssa.OpcodeImul: + m.lowerAluRmiROp(instr, aluRmiROpcodeMul) + case ssa.OpcodeSdiv, ssa.OpcodeUdiv, ssa.OpcodeSrem, ssa.OpcodeUrem: + isDiv := op == ssa.OpcodeSdiv || op == ssa.OpcodeUdiv + isSigned := op == ssa.OpcodeSdiv || op == ssa.OpcodeSrem + m.lowerIDivRem(instr, isDiv, isSigned) + case ssa.OpcodeBand: + m.lowerAluRmiROp(instr, aluRmiROpcodeAnd) + case ssa.OpcodeBor: + m.lowerAluRmiROp(instr, aluRmiROpcodeOr) + case ssa.OpcodeBxor: + m.lowerAluRmiROp(instr, aluRmiROpcodeXor) + case ssa.OpcodeIshl: + m.lowerShiftR(instr, shiftROpShiftLeft) + case ssa.OpcodeSshr: + m.lowerShiftR(instr, shiftROpShiftRightArithmetic) + case ssa.OpcodeUshr: + m.lowerShiftR(instr, shiftROpShiftRightLogical) + case ssa.OpcodeRotl: + m.lowerShiftR(instr, shiftROpRotateLeft) + case ssa.OpcodeRotr: + m.lowerShiftR(instr, shiftROpRotateRight) + case ssa.OpcodeClz: + m.lowerClz(instr) + case ssa.OpcodeCtz: + m.lowerCtz(instr) + case ssa.OpcodePopcnt: + m.lowerUnaryRmR(instr, unaryRmROpcodePopcnt) + case ssa.OpcodeFadd, ssa.OpcodeFsub, ssa.OpcodeFmul, ssa.OpcodeFdiv: + m.lowerXmmRmR(instr) + case ssa.OpcodeFabs: + m.lowerFabsFneg(instr) + case ssa.OpcodeFneg: + m.lowerFabsFneg(instr) + case ssa.OpcodeCeil: + m.lowerRound(instr, roundingModeUp) + case ssa.OpcodeFloor: + m.lowerRound(instr, roundingModeDown) + case ssa.OpcodeTrunc: + m.lowerRound(instr, roundingModeZero) + case ssa.OpcodeNearest: + m.lowerRound(instr, roundingModeNearest) + case ssa.OpcodeFmin, ssa.OpcodeFmax: + m.lowerFminFmax(instr) + case ssa.OpcodeFcopysign: + m.lowerFcopysign(instr) + case ssa.OpcodeBitcast: + m.lowerBitcast(instr) + case ssa.OpcodeSqrt: + m.lowerSqrt(instr) + case ssa.OpcodeFpromote: + v := instr.Arg() + rn := m.getOperand_Reg(m.c.ValueDefinition(v)) + rd := m.c.VRegOf(instr.Return()) + cnt := m.allocateInstr() + cnt.asXmmUnaryRmR(sseOpcodeCvtss2sd, rn, rd) + m.insert(cnt) + case ssa.OpcodeFdemote: + v := instr.Arg() + rn := m.getOperand_Reg(m.c.ValueDefinition(v)) + rd := m.c.VRegOf(instr.Return()) + cnt := m.allocateInstr() + cnt.asXmmUnaryRmR(sseOpcodeCvtsd2ss, rn, rd) + m.insert(cnt) + case ssa.OpcodeFcvtToSint, ssa.OpcodeFcvtToSintSat: + x, ctx := instr.Arg2() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + ctxVReg := m.c.VRegOf(ctx) + m.lowerFcvtToSint(ctxVReg, rn.reg(), rd, x.Type() == ssa.TypeF64, + instr.Return().Type().Bits() == 64, op == ssa.OpcodeFcvtToSintSat) + case ssa.OpcodeFcvtToUint, ssa.OpcodeFcvtToUintSat: + x, ctx := instr.Arg2() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + ctxVReg := m.c.VRegOf(ctx) + m.lowerFcvtToUint(ctxVReg, rn.reg(), rd, x.Type() == ssa.TypeF64, + instr.Return().Type().Bits() == 64, op == ssa.OpcodeFcvtToUintSat) + case ssa.OpcodeFcvtFromSint: + x := instr.Arg() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := newOperandReg(m.c.VRegOf(instr.Return())) + m.lowerFcvtFromSint(rn, rd, + x.Type() == ssa.TypeI64, instr.Return().Type().Bits() == 64) + case ssa.OpcodeFcvtFromUint: + x := instr.Arg() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := newOperandReg(m.c.VRegOf(instr.Return())) + m.lowerFcvtFromUint(rn, rd, x.Type() == ssa.TypeI64, + instr.Return().Type().Bits() == 64) + case 
ssa.OpcodeVanyTrue: + m.lowerVanyTrue(instr) + case ssa.OpcodeVallTrue: + m.lowerVallTrue(instr) + case ssa.OpcodeVhighBits: + m.lowerVhighBits(instr) + case ssa.OpcodeVbnot: + m.lowerVbnot(instr) + case ssa.OpcodeVband: + x, y := instr.Arg2() + m.lowerVbBinOp(sseOpcodePand, x, y, instr.Return()) + case ssa.OpcodeVbor: + x, y := instr.Arg2() + m.lowerVbBinOp(sseOpcodePor, x, y, instr.Return()) + case ssa.OpcodeVbxor: + x, y := instr.Arg2() + m.lowerVbBinOp(sseOpcodePxor, x, y, instr.Return()) + case ssa.OpcodeVbandnot: + m.lowerVbandnot(instr, sseOpcodePandn) + case ssa.OpcodeVbitselect: + m.lowerVbitselect(instr) + case ssa.OpcodeVIadd: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePaddb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePaddw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePaddd + case ssa.VecLaneI64x2: + vecOp = sseOpcodePaddq + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVSaddSat: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePaddsb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePaddsw + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVUaddSat: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePaddusb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePaddusw + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVIsub: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePsubb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePsubw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePsubd + case ssa.VecLaneI64x2: + vecOp = sseOpcodePsubq + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVSsubSat: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePsubsb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePsubsw + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVUsubSat: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePsubusb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePsubusw + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVImul: + m.lowerVImul(instr) + case ssa.OpcodeVIneg: + x, lane := instr.ArgWithLane() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePsubb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePsubw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePsubd + case ssa.VecLaneI64x2: + vecOp = sseOpcodePsubq + default: + panic("BUG") + } + + tmp := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asZeros(tmp)) + + i := m.allocateInstr() + i.asXmmRmR(vecOp, rn, tmp) + m.insert(i) + + m.copyTo(tmp, rd) + case ssa.OpcodeVFadd: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneF32x4: + vecOp = sseOpcodeAddps + case ssa.VecLaneF64x2: + vecOp = sseOpcodeAddpd + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVFsub: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneF32x4: + vecOp = sseOpcodeSubps + case ssa.VecLaneF64x2: + vecOp = sseOpcodeSubpd + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVFdiv: + x, y, lane := instr.Arg2WithLane() + var 
vecOp sseOpcode + switch lane { + case ssa.VecLaneF32x4: + vecOp = sseOpcodeDivps + case ssa.VecLaneF64x2: + vecOp = sseOpcodeDivpd + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVFmul: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneF32x4: + vecOp = sseOpcodeMulps + case ssa.VecLaneF64x2: + vecOp = sseOpcodeMulpd + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVFneg: + x, lane := instr.ArgWithLane() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + + tmp := m.c.AllocateVReg(ssa.TypeV128) + + var shiftOp, xorOp sseOpcode + var shiftAmt uint32 + switch lane { + case ssa.VecLaneF32x4: + shiftOp, shiftAmt, xorOp = sseOpcodePslld, 31, sseOpcodeXorps + case ssa.VecLaneF64x2: + shiftOp, shiftAmt, xorOp = sseOpcodePsllq, 63, sseOpcodeXorpd + } + + zero := m.allocateInstr() + zero.asZeros(tmp) + m.insert(zero) + + // Set all bits on tmp by CMPPD with arg=0 (== pseudo CMPEQPD instruction). + // See https://www.felixcloutier.com/x86/cmpps + // + // Note: if we do not clear all the bits ^ with XORPS, this might end up not setting ones on some lane + // if the lane is NaN. + cmp := m.allocateInstr() + cmp.asXmmRmRImm(sseOpcodeCmppd, uint8(cmpPredEQ_UQ), newOperandReg(tmp), tmp) + m.insert(cmp) + + // Do the left shift on each lane to set only the most significant bit in each. + i := m.allocateInstr() + i.asXmmRmiReg(shiftOp, newOperandImm32(shiftAmt), tmp) + m.insert(i) + + // Get the negated result by XOR on each lane with tmp. + i = m.allocateInstr() + i.asXmmRmR(xorOp, rn, tmp) + m.insert(i) + + m.copyTo(tmp, rd) + + case ssa.OpcodeVSqrt: + x, lane := instr.ArgWithLane() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + + var vecOp sseOpcode + switch lane { + case ssa.VecLaneF32x4: + vecOp = sseOpcodeSqrtps + case ssa.VecLaneF64x2: + vecOp = sseOpcodeSqrtpd + } + i := m.allocateInstr() + i.asXmmUnaryRmR(vecOp, rn, rd) + m.insert(i) + + case ssa.OpcodeVImin: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePminsb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePminsw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePminsd + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVUmin: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePminub + case ssa.VecLaneI16x8: + vecOp = sseOpcodePminuw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePminud + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVImax: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePmaxsb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePmaxsw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePmaxsd + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVUmax: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePmaxub + case ssa.VecLaneI16x8: + vecOp = sseOpcodePmaxuw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePmaxud + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVAvgRound: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePavgb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePavgw + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVIcmp: + x, y, c, lane := instr.VIcmpData() + 
m.lowerVIcmp(x, y, c, instr.Return(), lane) + + case ssa.OpcodeVFcmp: + x, y, c, lane := instr.VFcmpData() + m.lowerVFcmp(x, y, c, instr.Return(), lane) + + case ssa.OpcodeExtractlane: + x, index, signed, lane := instr.ExtractlaneData() + m.lowerExtractLane(x, index, signed, instr.Return(), lane) + + case ssa.OpcodeInsertlane: + x, y, index, lane := instr.InsertlaneData() + m.lowerInsertLane(x, y, index, instr.Return(), lane) + + case ssa.OpcodeSwizzle: + x, y, _ := instr.Arg2WithLane() + m.lowerSwizzle(x, y, instr.Return()) + + case ssa.OpcodeShuffle: + x, y, lo, hi := instr.ShuffleData() + m.lowerShuffle(x, y, lo, hi, instr.Return()) + + case ssa.OpcodeSplat: + x, lane := instr.ArgWithLane() + m.lowerSplat(x, instr.Return(), lane) + + case ssa.OpcodeSqmulRoundSat: + x, y := instr.Arg2() + m.lowerSqmulRoundSat(x, y, instr.Return()) + + case ssa.OpcodeVZeroExtLoad: + ptr, offset, typ := instr.VZeroExtLoadData() + var sseOp sseOpcode + // Both movss and movsd clears the higher bits of the destination register upt 128 bits. + // https://www.felixcloutier.com/x86/movss + // https://www.felixcloutier.com/x86/movsd + if typ == ssa.TypeF32 { + sseOp = sseOpcodeMovss + } else { + sseOp = sseOpcodeMovsd + } + mem := m.lowerToAddressMode(ptr, offset) + dst := m.c.VRegOf(instr.Return()) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, newOperandMem(mem), dst)) + + case ssa.OpcodeVMinPseudo: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneF32x4: + vecOp = sseOpcodeMinps + case ssa.VecLaneF64x2: + vecOp = sseOpcodeMinpd + default: + panic("BUG: unexpected lane type") + } + m.lowerVbBinOpUnaligned(vecOp, y, x, instr.Return()) + + case ssa.OpcodeVMaxPseudo: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneF32x4: + vecOp = sseOpcodeMaxps + case ssa.VecLaneF64x2: + vecOp = sseOpcodeMaxpd + default: + panic("BUG: unexpected lane type") + } + m.lowerVbBinOpUnaligned(vecOp, y, x, instr.Return()) + + case ssa.OpcodeVIshl: + x, y, lane := instr.Arg2WithLane() + m.lowerVIshl(x, y, instr.Return(), lane) + + case ssa.OpcodeVSshr: + x, y, lane := instr.Arg2WithLane() + m.lowerVSshr(x, y, instr.Return(), lane) + + case ssa.OpcodeVUshr: + x, y, lane := instr.Arg2WithLane() + m.lowerVUshr(x, y, instr.Return(), lane) + + case ssa.OpcodeVCeil: + x, lane := instr.ArgWithLane() + m.lowerVRound(x, instr.Return(), 0x2, lane == ssa.VecLaneF64x2) + + case ssa.OpcodeVFloor: + x, lane := instr.ArgWithLane() + m.lowerVRound(x, instr.Return(), 0x1, lane == ssa.VecLaneF64x2) + + case ssa.OpcodeVTrunc: + x, lane := instr.ArgWithLane() + m.lowerVRound(x, instr.Return(), 0x3, lane == ssa.VecLaneF64x2) + + case ssa.OpcodeVNearest: + x, lane := instr.ArgWithLane() + m.lowerVRound(x, instr.Return(), 0x0, lane == ssa.VecLaneF64x2) + + case ssa.OpcodeExtIaddPairwise: + x, lane, signed := instr.ExtIaddPairwiseData() + m.lowerExtIaddPairwise(x, instr.Return(), lane, signed) + + case ssa.OpcodeUwidenLow, ssa.OpcodeSwidenLow: + x, lane := instr.ArgWithLane() + m.lowerWidenLow(x, instr.Return(), lane, op == ssa.OpcodeSwidenLow) + + case ssa.OpcodeUwidenHigh, ssa.OpcodeSwidenHigh: + x, lane := instr.ArgWithLane() + m.lowerWidenHigh(x, instr.Return(), lane, op == ssa.OpcodeSwidenHigh) + + case ssa.OpcodeLoadSplat: + ptr, offset, lane := instr.LoadSplatData() + m.lowerLoadSplat(ptr, offset, instr.Return(), lane) + + case ssa.OpcodeVFcvtFromUint, ssa.OpcodeVFcvtFromSint: + x, lane := instr.ArgWithLane() + m.lowerVFcvtFromInt(x, instr.Return(), lane, op == 
ssa.OpcodeVFcvtFromSint) + + case ssa.OpcodeVFcvtToSintSat, ssa.OpcodeVFcvtToUintSat: + x, lane := instr.ArgWithLane() + m.lowerVFcvtToIntSat(x, instr.Return(), lane, op == ssa.OpcodeVFcvtToSintSat) + + case ssa.OpcodeSnarrow, ssa.OpcodeUnarrow: + x, y, lane := instr.Arg2WithLane() + m.lowerNarrow(x, y, instr.Return(), lane, op == ssa.OpcodeSnarrow) + + case ssa.OpcodeFvpromoteLow: + x := instr.Arg() + src := m.getOperand_Reg(m.c.ValueDefinition(x)) + dst := m.c.VRegOf(instr.Return()) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtps2pd, src, dst)) + + case ssa.OpcodeFvdemote: + x := instr.Arg() + src := m.getOperand_Reg(m.c.ValueDefinition(x)) + dst := m.c.VRegOf(instr.Return()) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtpd2ps, src, dst)) + + case ssa.OpcodeWideningPairwiseDotProductS: + x, y := instr.Arg2() + m.lowerWideningPairwiseDotProductS(x, y, instr.Return()) + + case ssa.OpcodeVIabs: + m.lowerVIabs(instr) + case ssa.OpcodeVIpopcnt: + m.lowerVIpopcnt(instr) + case ssa.OpcodeVFmin: + m.lowerVFmin(instr) + case ssa.OpcodeVFmax: + m.lowerVFmax(instr) + case ssa.OpcodeVFabs: + m.lowerVFabs(instr) + case ssa.OpcodeUndefined: + m.insert(m.allocateInstr().asUD2()) + case ssa.OpcodeExitWithCode: + execCtx, code := instr.ExitWithCodeData() + m.lowerExitWithCode(m.c.VRegOf(execCtx), code) + case ssa.OpcodeExitIfTrueWithCode: + execCtx, c, code := instr.ExitIfTrueWithCodeData() + m.lowerExitIfTrueWithCode(m.c.VRegOf(execCtx), c, code) + case ssa.OpcodeLoad: + ptr, offset, typ := instr.LoadData() + dst := m.c.VRegOf(instr.Return()) + m.lowerLoad(ptr, offset, typ, dst) + case ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32, ssa.OpcodeSload8, ssa.OpcodeSload16, ssa.OpcodeSload32: + ptr, offset, _ := instr.LoadData() + ret := m.c.VRegOf(instr.Return()) + m.lowerExtLoad(op, ptr, offset, ret) + case ssa.OpcodeVconst: + result := m.c.VRegOf(instr.Return()) + lo, hi := instr.VconstData() + m.lowerVconst(result, lo, hi) + case ssa.OpcodeSExtend, ssa.OpcodeUExtend: + from, to, signed := instr.ExtendData() + m.lowerExtend(instr.Arg(), instr.Return(), from, to, signed) + case ssa.OpcodeIcmp: + m.lowerIcmp(instr) + case ssa.OpcodeFcmp: + m.lowerFcmp(instr) + case ssa.OpcodeSelect: + cval, x, y := instr.SelectData() + m.lowerSelect(x, y, cval, instr.Return()) + case ssa.OpcodeIreduce: + rn := m.getOperand_Mem_Reg(m.c.ValueDefinition(instr.Arg())) + retVal := instr.Return() + rd := m.c.VRegOf(retVal) + + if retVal.Type() != ssa.TypeI32 { + panic("TODO?: Ireduce to non-i32") + } + m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, rn, rd)) + + case ssa.OpcodeAtomicLoad: + ptr := instr.Arg() + size := instr.AtomicTargetSize() + dst := m.c.VRegOf(instr.Return()) + + // At this point, the ptr is ensured to be aligned, so using a normal load is atomic. 
+ // https://github.com/golang/go/blob/adead1a93f472affa97c494ef19f2f492ee6f34a/src/runtime/internal/atomic/atomic_amd64.go#L30 + mem := newOperandMem(m.lowerToAddressMode(ptr, 0)) + load := m.allocateInstr() + switch size { + case 8: + load.asMov64MR(mem, dst) + case 4: + load.asMovzxRmR(extModeLQ, mem, dst) + case 2: + load.asMovzxRmR(extModeWQ, mem, dst) + case 1: + load.asMovzxRmR(extModeBQ, mem, dst) + default: + panic("BUG") + } + m.insert(load) + + case ssa.OpcodeFence: + m.insert(m.allocateInstr().asMFence()) + + case ssa.OpcodeAtomicStore: + ptr, _val := instr.Arg2() + size := instr.AtomicTargetSize() + + val := m.getOperand_Reg(m.c.ValueDefinition(_val)) + // The content on the val register will be overwritten by xchg, so we need to copy it to a temporary register. + copied := m.copyToTmp(val.reg()) + + mem := newOperandMem(m.lowerToAddressMode(ptr, 0)) + store := m.allocateInstr().asXCHG(copied, mem, byte(size)) + m.insert(store) + + case ssa.OpcodeAtomicCas: + addr, exp, repl := instr.Arg3() + size := instr.AtomicTargetSize() + m.lowerAtomicCas(addr, exp, repl, size, instr.Return()) + + case ssa.OpcodeAtomicRmw: + addr, val := instr.Arg2() + atomicOp, size := instr.AtomicRmwData() + m.lowerAtomicRmw(atomicOp, addr, val, size, instr.Return()) + + default: + panic("TODO: lowering " + op.String()) + } +} + +func (m *machine) lowerAtomicRmw(op ssa.AtomicRmwOp, addr, val ssa.Value, size uint64, ret ssa.Value) { + mem := m.lowerToAddressMode(addr, 0) + _val := m.getOperand_Reg(m.c.ValueDefinition(val)) + + switch op { + case ssa.AtomicRmwOpAdd, ssa.AtomicRmwOpSub: + valCopied := m.copyToTmp(_val.reg()) + if op == ssa.AtomicRmwOpSub { + // Negate the value. + m.insert(m.allocateInstr().asNeg(newOperandReg(valCopied), true)) + } + m.insert(m.allocateInstr().asLockXAdd(valCopied, mem, byte(size))) + m.clearHigherBitsForAtomic(valCopied, size, ret.Type()) + m.copyTo(valCopied, m.c.VRegOf(ret)) + + case ssa.AtomicRmwOpAnd, ssa.AtomicRmwOpOr, ssa.AtomicRmwOpXor: + accumulator := raxVReg + // Reserve rax for the accumulator to make regalloc happy. + // Note: do this initialization before defining valCopied, because it might be the same register and + // if that happens, the unnecessary load/store will be performed inside the loop. + // This can be mitigated in any way once the register allocator is clever enough. + m.insert(m.allocateInstr().asDefineUninitializedReg(accumulator)) + + // Copy the value to a temporary register. + valCopied := m.copyToTmp(_val.reg()) + m.clearHigherBitsForAtomic(valCopied, size, ret.Type()) + + memOp := newOperandMem(mem) + tmp := m.c.AllocateVReg(ssa.TypeI64) + beginLoop, beginLoopLabel := m.allocateBrTarget() + { + m.insert(beginLoop) + // Reset the value on tmp by the original value. + m.copyTo(valCopied, tmp) + // Load the current value at the memory location into accumulator. + switch size { + case 1: + m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, memOp, accumulator)) + case 2: + m.insert(m.allocateInstr().asMovzxRmR(extModeWQ, memOp, accumulator)) + case 4: + m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, memOp, accumulator)) + case 8: + m.insert(m.allocateInstr().asMov64MR(memOp, accumulator)) + default: + panic("BUG") + } + // Then perform the logical operation on the accumulator and the value on tmp. 
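+ // (The result is computed into tmp, not into the accumulator: LOCK CMPXCHG below
+ // expects the previously loaded memory value in RAX and the new value in its
+ // register operand.)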
+ switch op { + case ssa.AtomicRmwOpAnd: + m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, newOperandReg(accumulator), tmp, true)) + case ssa.AtomicRmwOpOr: + m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeOr, newOperandReg(accumulator), tmp, true)) + case ssa.AtomicRmwOpXor: + m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeXor, newOperandReg(accumulator), tmp, true)) + default: + panic("BUG") + } + // Finally, try compare-exchange the value at the memory location with the tmp. + m.insert(m.allocateInstr().asLockCmpXCHG(tmp, memOp.addressMode(), byte(size))) + // If it succeeds, ZF will be set, and we can break the loop. + m.insert(m.allocateInstr().asJmpIf(condNZ, newOperandLabel(beginLoopLabel))) + } + + // valCopied must be alive at the end of the loop. + m.insert(m.allocateInstr().asNopUseReg(valCopied)) + + // At this point, accumulator contains the result. + m.clearHigherBitsForAtomic(accumulator, size, ret.Type()) + m.copyTo(accumulator, m.c.VRegOf(ret)) + + case ssa.AtomicRmwOpXchg: + valCopied := m.copyToTmp(_val.reg()) + + m.insert(m.allocateInstr().asXCHG(valCopied, newOperandMem(mem), byte(size))) + m.clearHigherBitsForAtomic(valCopied, size, ret.Type()) + m.copyTo(valCopied, m.c.VRegOf(ret)) + + default: + panic("BUG") + } +} + +func (m *machine) lowerAtomicCas(addr, exp, repl ssa.Value, size uint64, ret ssa.Value) { + mem := m.lowerToAddressMode(addr, 0) + expOp := m.getOperand_Reg(m.c.ValueDefinition(exp)) + replOp := m.getOperand_Reg(m.c.ValueDefinition(repl)) + + accumulator := raxVReg + m.copyTo(expOp.reg(), accumulator) + m.insert(m.allocateInstr().asLockCmpXCHG(replOp.reg(), mem, byte(size))) + m.clearHigherBitsForAtomic(accumulator, size, ret.Type()) + m.copyTo(accumulator, m.c.VRegOf(ret)) +} + +func (m *machine) clearHigherBitsForAtomic(r regalloc.VReg, valSize uint64, resultType ssa.Type) { + switch resultType { + case ssa.TypeI32: + switch valSize { + case 1: + m.insert(m.allocateInstr().asMovzxRmR(extModeBL, newOperandReg(r), r)) + case 2: + m.insert(m.allocateInstr().asMovzxRmR(extModeWL, newOperandReg(r), r)) + } + case ssa.TypeI64: + switch valSize { + case 1: + m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(r), r)) + case 2: + m.insert(m.allocateInstr().asMovzxRmR(extModeWQ, newOperandReg(r), r)) + case 4: + m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, newOperandReg(r), r)) + } + } +} + +func (m *machine) lowerFcmp(instr *ssa.Instruction) { + f1, f2, and := m.lowerFcmpToFlags(instr) + rd := m.c.VRegOf(instr.Return()) + if f2 == condInvalid { + tmp := m.c.AllocateVReg(ssa.TypeI32) + m.insert(m.allocateInstr().asSetcc(f1, tmp)) + // On amd64, setcc only sets the first byte of the register, so we need to zero extend it to match + // the semantics of Icmp that sets either 0 or 1. 
+ m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(tmp), rd)) + } else { + tmp1, tmp2 := m.c.AllocateVReg(ssa.TypeI32), m.c.AllocateVReg(ssa.TypeI32) + m.insert(m.allocateInstr().asSetcc(f1, tmp1)) + m.insert(m.allocateInstr().asSetcc(f2, tmp2)) + var op aluRmiROpcode + if and { + op = aluRmiROpcodeAnd + } else { + op = aluRmiROpcodeOr + } + m.insert(m.allocateInstr().asAluRmiR(op, newOperandReg(tmp1), tmp2, false)) + m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(tmp2), rd)) + } +} + +func (m *machine) lowerIcmp(instr *ssa.Instruction) { + x, y, c := instr.IcmpData() + m.lowerIcmpToFlag(m.c.ValueDefinition(x), m.c.ValueDefinition(y), x.Type() == ssa.TypeI64) + rd := m.c.VRegOf(instr.Return()) + tmp := m.c.AllocateVReg(ssa.TypeI32) + m.insert(m.allocateInstr().asSetcc(condFromSSAIntCmpCond(c), tmp)) + // On amd64, setcc only sets the first byte of the register, so we need to zero extend it to match + // the semantics of Icmp that sets either 0 or 1. + m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(tmp), rd)) +} + +func (m *machine) lowerSelect(x, y, cval, ret ssa.Value) { + xo, yo := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)), m.getOperand_Reg(m.c.ValueDefinition(y)) + rd := m.c.VRegOf(ret) + + var cond cond + cvalDef := m.c.ValueDefinition(cval) + switch m.c.MatchInstrOneOf(cvalDef, condBranchMatches[:]) { + case ssa.OpcodeIcmp: + icmp := cvalDef.Instr + xc, yc, cc := icmp.IcmpData() + m.lowerIcmpToFlag(m.c.ValueDefinition(xc), m.c.ValueDefinition(yc), xc.Type() == ssa.TypeI64) + cond = condFromSSAIntCmpCond(cc) + icmp.Lowered() + default: // TODO: match ssa.OpcodeFcmp for optimization, but seems a bit complex. + cv := m.getOperand_Reg(cvalDef) + test := m.allocateInstr().asCmpRmiR(false, cv, cv.reg(), false) + m.insert(test) + cond = condNZ + } + + if typ := x.Type(); typ.IsInt() { + _64 := typ.Bits() == 64 + mov := m.allocateInstr() + tmp := m.c.AllocateVReg(typ) + switch yo.kind { + case operandKindReg: + mov.asMovRR(yo.reg(), tmp, _64) + case operandKindMem: + if _64 { + mov.asMov64MR(yo, tmp) + } else { + mov.asMovzxRmR(extModeLQ, yo, tmp) + } + default: + panic("BUG") + } + m.insert(mov) + cmov := m.allocateInstr().asCmove(cond, xo, tmp, _64) + m.insert(cmov) + m.insert(m.allocateInstr().asMovRR(tmp, rd, _64)) + } else { + mov := m.allocateInstr() + tmp := m.c.AllocateVReg(typ) + switch typ { + case ssa.TypeF32: + mov.asXmmUnaryRmR(sseOpcodeMovss, yo, tmp) + case ssa.TypeF64: + mov.asXmmUnaryRmR(sseOpcodeMovsd, yo, tmp) + case ssa.TypeV128: + mov.asXmmUnaryRmR(sseOpcodeMovdqu, yo, tmp) + default: + panic("BUG") + } + m.insert(mov) + + cmov := m.allocateInstr().asXmmCMov(cond, xo, tmp, typ.Size()) + m.insert(cmov) + + m.copyTo(tmp, rd) + } +} + +func (m *machine) lowerXmmCmovAfterRegAlloc(i *instruction) { + x := i.op1 + rd := i.op2.reg() + cond := cond(i.u1) + + jcc := m.allocateInstr() + m.insert(jcc) + + mov := m.allocateInstr() + switch i.u2 { + case 4: + mov.asXmmUnaryRmR(sseOpcodeMovss, x, rd) + case 8: + mov.asXmmUnaryRmR(sseOpcodeMovsd, x, rd) + case 16: + mov.asXmmUnaryRmR(sseOpcodeMovdqu, x, rd) + default: + panic("BUG") + } + m.insert(mov) + + nop, end := m.allocateBrTarget() + m.insert(nop) + jcc.asJmpIf(cond.invert(), newOperandLabel(end)) +} + +func (m *machine) lowerExtend(_arg, ret ssa.Value, from, to byte, signed bool) { + rd0 := m.c.VRegOf(ret) + arg := m.getOperand_Mem_Reg(m.c.ValueDefinition(_arg)) + + rd := m.c.AllocateVReg(ret.Type()) + + ext := m.allocateInstr() + switch { + case from == 8 && to == 16 && signed: + 
ext.asMovsxRmR(extModeBQ, arg, rd) + case from == 8 && to == 16 && !signed: + ext.asMovzxRmR(extModeBL, arg, rd) + case from == 8 && to == 32 && signed: + ext.asMovsxRmR(extModeBL, arg, rd) + case from == 8 && to == 32 && !signed: + ext.asMovzxRmR(extModeBQ, arg, rd) + case from == 8 && to == 64 && signed: + ext.asMovsxRmR(extModeBQ, arg, rd) + case from == 8 && to == 64 && !signed: + ext.asMovzxRmR(extModeBQ, arg, rd) + case from == 16 && to == 32 && signed: + ext.asMovsxRmR(extModeWL, arg, rd) + case from == 16 && to == 32 && !signed: + ext.asMovzxRmR(extModeWL, arg, rd) + case from == 16 && to == 64 && signed: + ext.asMovsxRmR(extModeWQ, arg, rd) + case from == 16 && to == 64 && !signed: + ext.asMovzxRmR(extModeWQ, arg, rd) + case from == 32 && to == 64 && signed: + ext.asMovsxRmR(extModeLQ, arg, rd) + case from == 32 && to == 64 && !signed: + ext.asMovzxRmR(extModeLQ, arg, rd) + default: + panic(fmt.Sprintf("BUG: unhandled extend: from=%d, to=%d, signed=%t", from, to, signed)) + } + m.insert(ext) + + m.copyTo(rd, rd0) +} + +func (m *machine) lowerVconst(dst regalloc.VReg, lo, hi uint64) { + if lo == 0 && hi == 0 { + m.insert(m.allocateInstr().asZeros(dst)) + return + } + + load := m.allocateInstr() + constLabel := m.allocateLabel() + m.consts = append(m.consts, _const{label: constLabel, lo: lo, hi: hi}) + load.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(constLabel.L)), dst) + m.insert(load) +} + +func (m *machine) lowerCtz(instr *ssa.Instruction) { + if m.cpuFeatures.HasExtra(platform.CpuExtraFeatureAmd64ABM) { + m.lowerUnaryRmR(instr, unaryRmROpcodeTzcnt) + } else { + // On processors that do not support TZCNT, the BSF instruction is + // executed instead. The key difference between TZCNT and BSF + // instruction is that if source operand is zero, the content of + // destination operand is undefined. + // https://www.felixcloutier.com/x86/tzcnt.html + + x := instr.Arg() + if !x.Type().IsInt() { + panic("BUG?") + } + _64 := x.Type().Bits() == 64 + + xDef := m.c.ValueDefinition(x) + tmp := m.c.AllocateVReg(x.Type()) + rm := m.getOperand_Reg(xDef) + + // First, we have to check if the target is non-zero. + test := m.allocateInstr() + test.asCmpRmiR(false, rm, rm.reg(), _64) + m.insert(test) + + jmpNz := m.allocateInstr() + m.insert(jmpNz) + + // If the value is zero, we just push the const value. + m.lowerIconst(tmp, uint64(x.Type().Bits()), _64) + + // Now jump right after the non-zero case. + jmpAtEnd := m.allocateInstr() + m.insert(jmpAtEnd) + + // jmpNz target label is set here. + nop, nz := m.allocateBrTarget() + jmpNz.asJmpIf(condNZ, newOperandLabel(nz)) + m.insert(nop) + + // Emit the non-zero case. + bsr := m.allocateInstr() + bsr.asUnaryRmR(unaryRmROpcodeBsf, rm, tmp, _64) + m.insert(bsr) + + // jmpAtEnd target label is set here. + nopEnd, end := m.allocateBrTarget() + jmpAtEnd.asJmp(newOperandLabel(end)) + m.insert(nopEnd) + + m.copyTo(tmp, m.c.VRegOf(instr.Return())) + } +} + +func (m *machine) lowerClz(instr *ssa.Instruction) { + if m.cpuFeatures.HasExtra(platform.CpuExtraFeatureAmd64ABM) { + m.lowerUnaryRmR(instr, unaryRmROpcodeLzcnt) + } else { + // On processors that do not support LZCNT, we combine BSR (calculating + // most significant set bit) with XOR. This logic is described in + // "Replace Raw Assembly Code with Builtin Intrinsics" section in: + // https://developer.apple.com/documentation/apple-silicon/addressing-architectural-differences-in-your-macos-code. 
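+ // For example, for the 32-bit input 0x10, BSR yields 4 (the index of the highest
+ // set bit), and 4 XOR 31 = 27, which is exactly the leading-zero count.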
+ + x := instr.Arg() + if !x.Type().IsInt() { + panic("BUG?") + } + _64 := x.Type().Bits() == 64 + + xDef := m.c.ValueDefinition(x) + rm := m.getOperand_Reg(xDef) + tmp := m.c.AllocateVReg(x.Type()) + + // First, we have to check if the rm is non-zero as BSR is undefined + // on zero. See https://www.felixcloutier.com/x86/bsr. + test := m.allocateInstr() + test.asCmpRmiR(false, rm, rm.reg(), _64) + m.insert(test) + + jmpNz := m.allocateInstr() + m.insert(jmpNz) + + // If the value is zero, we just push the const value. + m.lowerIconst(tmp, uint64(x.Type().Bits()), _64) + + // Now jump right after the non-zero case. + jmpAtEnd := m.allocateInstr() + m.insert(jmpAtEnd) + + // jmpNz target label is set here. + nop, nz := m.allocateBrTarget() + jmpNz.asJmpIf(condNZ, newOperandLabel(nz)) + m.insert(nop) + + // Emit the non-zero case. + bsr := m.allocateInstr() + bsr.asUnaryRmR(unaryRmROpcodeBsr, rm, tmp, _64) + m.insert(bsr) + + // Now we XOR the value with the bit length minus one. + xor := m.allocateInstr() + xor.asAluRmiR(aluRmiROpcodeXor, newOperandImm32(uint32(x.Type().Bits()-1)), tmp, _64) + m.insert(xor) + + // jmpAtEnd target label is set here. + nopEnd, end := m.allocateBrTarget() + jmpAtEnd.asJmp(newOperandLabel(end)) + m.insert(nopEnd) + + m.copyTo(tmp, m.c.VRegOf(instr.Return())) + } +} + +func (m *machine) lowerUnaryRmR(si *ssa.Instruction, op unaryRmROpcode) { + x := si.Arg() + if !x.Type().IsInt() { + panic("BUG?") + } + _64 := x.Type().Bits() == 64 + + xDef := m.c.ValueDefinition(x) + rm := m.getOperand_Mem_Reg(xDef) + rd := m.c.VRegOf(si.Return()) + + instr := m.allocateInstr() + instr.asUnaryRmR(op, rm, rd, _64) + m.insert(instr) +} + +func (m *machine) lowerLoad(ptr ssa.Value, offset uint32, typ ssa.Type, dst regalloc.VReg) { + mem := newOperandMem(m.lowerToAddressMode(ptr, offset)) + load := m.allocateInstr() + switch typ { + case ssa.TypeI32: + load.asMovzxRmR(extModeLQ, mem, dst) + case ssa.TypeI64: + load.asMov64MR(mem, dst) + case ssa.TypeF32: + load.asXmmUnaryRmR(sseOpcodeMovss, mem, dst) + case ssa.TypeF64: + load.asXmmUnaryRmR(sseOpcodeMovsd, mem, dst) + case ssa.TypeV128: + load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, dst) + default: + panic("BUG") + } + m.insert(load) +} + +func (m *machine) lowerExtLoad(op ssa.Opcode, ptr ssa.Value, offset uint32, dst regalloc.VReg) { + mem := newOperandMem(m.lowerToAddressMode(ptr, offset)) + load := m.allocateInstr() + switch op { + case ssa.OpcodeUload8: + load.asMovzxRmR(extModeBQ, mem, dst) + case ssa.OpcodeUload16: + load.asMovzxRmR(extModeWQ, mem, dst) + case ssa.OpcodeUload32: + load.asMovzxRmR(extModeLQ, mem, dst) + case ssa.OpcodeSload8: + load.asMovsxRmR(extModeBQ, mem, dst) + case ssa.OpcodeSload16: + load.asMovsxRmR(extModeWQ, mem, dst) + case ssa.OpcodeSload32: + load.asMovsxRmR(extModeLQ, mem, dst) + default: + panic("BUG") + } + m.insert(load) +} + +func (m *machine) lowerExitIfTrueWithCode(execCtx regalloc.VReg, cond ssa.Value, code wazevoapi.ExitCode) { + condDef := m.c.ValueDefinition(cond) + if !m.c.MatchInstr(condDef, ssa.OpcodeIcmp) { + panic("TODO: ExitIfTrue must come after Icmp at the moment: " + condDef.Instr.Opcode().String()) + } + cvalInstr := condDef.Instr + cvalInstr.MarkLowered() + + // We need to copy the execution context to a temp register, because if it's spilled, + // it might end up being reloaded inside the exiting branch. 
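+ // Below, the comparison sets the flags and the conditional jump is emitted with
+ // the inverted condition, so that the exit sequence is skipped (by jumping to its
+ // continuation label) whenever the original condition does not hold.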
+ execCtxTmp := m.copyToTmp(execCtx) + + x, y, c := cvalInstr.IcmpData() + xx, yy := m.c.ValueDefinition(x), m.c.ValueDefinition(y) + if !m.tryLowerBandToFlag(xx, yy) { + m.lowerIcmpToFlag(xx, yy, x.Type() == ssa.TypeI64) + } + + jmpIf := m.allocateInstr() + m.insert(jmpIf) + l := m.lowerExitWithCode(execCtxTmp, code) + jmpIf.asJmpIf(condFromSSAIntCmpCond(c).invert(), newOperandLabel(l)) +} + +func (m *machine) tryLowerBandToFlag(x, y *backend.SSAValueDefinition) (ok bool) { + var target *backend.SSAValueDefinition + if x.IsFromInstr() && x.Instr.Constant() && x.Instr.ConstantVal() == 0 { + if m.c.MatchInstr(y, ssa.OpcodeBand) { + target = y + } + } + + if y.IsFromInstr() && y.Instr.Constant() && y.Instr.ConstantVal() == 0 { + if m.c.MatchInstr(x, ssa.OpcodeBand) { + target = x + } + } + + if target == nil { + return false + } + + bandInstr := target.Instr + bandX, bandY := bandInstr.Arg2() + + xx := m.getOperand_Reg(m.c.ValueDefinition(bandX)) + yy := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(bandY)) + test := m.allocateInstr().asCmpRmiR(false, yy, xx.reg(), bandX.Type() == ssa.TypeI64) + m.insert(test) + bandInstr.MarkLowered() + return true +} + +func (m *machine) allocateExitInstructions(execCtx, exitCodeReg regalloc.VReg) (saveRsp, saveRbp, setExitCode *instruction) { + saveRsp = m.allocateInstr().asMovRM( + rspVReg, + newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.U32(), execCtx)), + 8, + ) + + saveRbp = m.allocateInstr().asMovRM( + rbpVReg, + newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetFramePointerBeforeGoCall.U32(), execCtx)), + 8, + ) + setExitCode = m.allocateInstr().asMovRM( + exitCodeReg, + newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetExitCodeOffset.U32(), execCtx)), + 4, + ) + return +} + +func (m *machine) lowerExitWithCode(execCtx regalloc.VReg, code wazevoapi.ExitCode) (afterLabel backend.Label) { + exitCodeReg := rbpVReg + saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtx, exitCodeReg) + + // Set save RSP, RBP, and write exit code. + m.insert(saveRsp) + m.insert(saveRbp) + m.lowerIconst(exitCodeReg, uint64(code), false) + m.insert(setExitCode) + + ripReg := rbpVReg + + // Next is to save the current address for stack unwinding. + nop, currentAddrLabel := m.allocateBrTarget() + m.insert(nop) + readRip := m.allocateInstr().asLEA(newOperandLabel(currentAddrLabel), ripReg) + m.insert(readRip) + saveRip := m.allocateInstr().asMovRM( + ripReg, + newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetGoCallReturnAddress.U32(), execCtx)), + 8, + ) + m.insert(saveRip) + + // Finally exit. + exitSq := m.allocateExitSeq(execCtx) + m.insert(exitSq) + + // Return the label for continuation. + continuation, afterLabel := m.allocateBrTarget() + m.insert(continuation) + return afterLabel +} + +func (m *machine) lowerAluRmiROp(si *ssa.Instruction, op aluRmiROpcode) { + x, y := si.Arg2() + if !x.Type().IsInt() { + panic("BUG?") + } + + _64 := x.Type().Bits() == 64 + + xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) + + // TODO: commutative args can be swapped if one of them is an immediate. + rn := m.getOperand_Reg(xDef) + rm := m.getOperand_Mem_Imm32_Reg(yDef) + rd := m.c.VRegOf(si.Return()) + + // rn is being overwritten, so we first copy its value to a temp register, + // in case it is referenced again later. 
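+ // (x86 ALU instructions are two-operand: the destination register is also a
+ // source, so the operation is performed on a fresh temporary instead of on rn.)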
+ tmp := m.copyToTmp(rn.reg()) + + alu := m.allocateInstr() + alu.asAluRmiR(op, rm, tmp, _64) + m.insert(alu) + + // tmp now contains the result, we copy it to the dest register. + m.copyTo(tmp, rd) +} + +func (m *machine) lowerShiftR(si *ssa.Instruction, op shiftROp) { + x, amt := si.Arg2() + if !x.Type().IsInt() { + panic("BUG?") + } + _64 := x.Type().Bits() == 64 + + xDef, amtDef := m.c.ValueDefinition(x), m.c.ValueDefinition(amt) + + opAmt := m.getOperand_Imm32_Reg(amtDef) + rx := m.getOperand_Reg(xDef) + rd := m.c.VRegOf(si.Return()) + + // rx is being overwritten, so we first copy its value to a temp register, + // in case it is referenced again later. + tmpDst := m.copyToTmp(rx.reg()) + + if opAmt.kind == operandKindReg { + // If opAmt is a register we must copy its value to rcx, + // because shiftR encoding mandates that the shift amount is in rcx. + m.copyTo(opAmt.reg(), rcxVReg) + + alu := m.allocateInstr() + alu.asShiftR(op, newOperandReg(rcxVReg), tmpDst, _64) + m.insert(alu) + + } else { + alu := m.allocateInstr() + alu.asShiftR(op, opAmt, tmpDst, _64) + m.insert(alu) + } + + // tmp now contains the result, we copy it to the dest register. + m.copyTo(tmpDst, rd) +} + +func (m *machine) lowerXmmRmR(instr *ssa.Instruction) { + x, y := instr.Arg2() + if !x.Type().IsFloat() { + panic("BUG?") + } + _64 := x.Type().Bits() == 64 + + var op sseOpcode + if _64 { + switch instr.Opcode() { + case ssa.OpcodeFadd: + op = sseOpcodeAddsd + case ssa.OpcodeFsub: + op = sseOpcodeSubsd + case ssa.OpcodeFmul: + op = sseOpcodeMulsd + case ssa.OpcodeFdiv: + op = sseOpcodeDivsd + default: + panic("BUG") + } + } else { + switch instr.Opcode() { + case ssa.OpcodeFadd: + op = sseOpcodeAddss + case ssa.OpcodeFsub: + op = sseOpcodeSubss + case ssa.OpcodeFmul: + op = sseOpcodeMulss + case ssa.OpcodeFdiv: + op = sseOpcodeDivss + default: + panic("BUG") + } + } + + xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) + rn := m.getOperand_Reg(yDef) + rm := m.getOperand_Reg(xDef) + rd := m.c.VRegOf(instr.Return()) + + // rm is being overwritten, so we first copy its value to a temp register, + // in case it is referenced again later. 
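+ // (tmp starts as a copy of x and the SSE instruction computes tmp = tmp <op> y,
+ // which keeps the operand order correct for the non-commutative Fsub and Fdiv.)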
+ tmp := m.copyToTmp(rm.reg()) + + xmm := m.allocateInstr().asXmmRmR(op, rn, tmp) + m.insert(xmm) + + m.copyTo(tmp, rd) +} + +func (m *machine) lowerSqrt(instr *ssa.Instruction) { + x := instr.Arg() + if !x.Type().IsFloat() { + panic("BUG") + } + _64 := x.Type().Bits() == 64 + var op sseOpcode + if _64 { + op = sseOpcodeSqrtsd + } else { + op = sseOpcodeSqrtss + } + + xDef := m.c.ValueDefinition(x) + rm := m.getOperand_Mem_Reg(xDef) + rd := m.c.VRegOf(instr.Return()) + + xmm := m.allocateInstr().asXmmUnaryRmR(op, rm, rd) + m.insert(xmm) +} + +func (m *machine) lowerFabsFneg(instr *ssa.Instruction) { + x := instr.Arg() + if !x.Type().IsFloat() { + panic("BUG") + } + _64 := x.Type().Bits() == 64 + var op sseOpcode + var mask uint64 + if _64 { + switch instr.Opcode() { + case ssa.OpcodeFabs: + mask, op = 0x7fffffffffffffff, sseOpcodeAndpd + case ssa.OpcodeFneg: + mask, op = 0x8000000000000000, sseOpcodeXorpd + } + } else { + switch instr.Opcode() { + case ssa.OpcodeFabs: + mask, op = 0x7fffffff, sseOpcodeAndps + case ssa.OpcodeFneg: + mask, op = 0x80000000, sseOpcodeXorps + } + } + + tmp := m.c.AllocateVReg(x.Type()) + + xDef := m.c.ValueDefinition(x) + rm := m.getOperand_Reg(xDef) + rd := m.c.VRegOf(instr.Return()) + + m.lowerFconst(tmp, mask, _64) + + xmm := m.allocateInstr().asXmmRmR(op, rm, tmp) + m.insert(xmm) + + m.copyTo(tmp, rd) +} + +func (m *machine) lowerStore(si *ssa.Instruction) { + value, ptr, offset, storeSizeInBits := si.StoreData() + rm := m.getOperand_Reg(m.c.ValueDefinition(value)) + mem := newOperandMem(m.lowerToAddressMode(ptr, offset)) + + store := m.allocateInstr() + switch value.Type() { + case ssa.TypeI32: + store.asMovRM(rm.reg(), mem, storeSizeInBits/8) + case ssa.TypeI64: + store.asMovRM(rm.reg(), mem, storeSizeInBits/8) + case ssa.TypeF32: + store.asXmmMovRM(sseOpcodeMovss, rm.reg(), mem) + case ssa.TypeF64: + store.asXmmMovRM(sseOpcodeMovsd, rm.reg(), mem) + case ssa.TypeV128: + store.asXmmMovRM(sseOpcodeMovdqu, rm.reg(), mem) + default: + panic("BUG") + } + m.insert(store) +} + +func (m *machine) lowerCall(si *ssa.Instruction) { + isDirectCall := si.Opcode() == ssa.OpcodeCall + var indirectCalleePtr ssa.Value + var directCallee ssa.FuncRef + var sigID ssa.SignatureID + var args []ssa.Value + var isMemmove bool + if isDirectCall { + directCallee, sigID, args = si.CallData() + } else { + indirectCalleePtr, sigID, args, isMemmove = si.CallIndirectData() + } + calleeABI := m.c.GetFunctionABI(m.c.SSABuilder().ResolveSignature(sigID)) + + stackSlotSize := int64(calleeABI.AlignedArgResultStackSlotSize()) + if m.maxRequiredStackSizeForCalls < stackSlotSize+16 { + m.maxRequiredStackSizeForCalls = stackSlotSize + 16 // 16 == return address + RBP. + } + + // Note: See machine.SetupPrologue for the stack layout. + // The stack pointer decrease/increase will be inserted later in the compilation. + + for i, arg := range args { + reg := m.c.VRegOf(arg) + def := m.c.ValueDefinition(arg) + m.callerGenVRegToFunctionArg(calleeABI, i, reg, def, stackSlotSize) + } + + if isMemmove { + // Go's memmove *might* use all xmm0-xmm15, so we need to release them. 
+ // https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/cmd/compile/abi-internal.md#architecture-specifics + // https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/runtime/memmove_amd64.s#L271-L286 + for i := regalloc.RealReg(0); i < 16; i++ { + m.insert(m.allocateInstr().asDefineUninitializedReg(regInfo.RealRegToVReg[xmm0+i])) + } + } + + if isDirectCall { + call := m.allocateInstr().asCall(directCallee, calleeABI) + m.insert(call) + } else { + ptrOp := m.getOperand_Mem_Reg(m.c.ValueDefinition(indirectCalleePtr)) + callInd := m.allocateInstr().asCallIndirect(ptrOp, calleeABI) + m.insert(callInd) + } + + if isMemmove { + for i := regalloc.RealReg(0); i < 16; i++ { + m.insert(m.allocateInstr().asNopUseReg(regInfo.RealRegToVReg[xmm0+i])) + } + } + + var index int + r1, rs := si.Returns() + if r1.Valid() { + m.callerGenFunctionReturnVReg(calleeABI, 0, m.c.VRegOf(r1), stackSlotSize) + index++ + } + + for _, r := range rs { + m.callerGenFunctionReturnVReg(calleeABI, index, m.c.VRegOf(r), stackSlotSize) + index++ + } +} + +// callerGenVRegToFunctionArg is the opposite of GenFunctionArgToVReg, which is used to generate the +// caller side of the function call. +func (m *machine) callerGenVRegToFunctionArg(a *backend.FunctionABI, argIndex int, reg regalloc.VReg, def *backend.SSAValueDefinition, stackSlotSize int64) { + arg := &a.Args[argIndex] + if def != nil && def.IsFromInstr() { + // Constant instructions are inlined. + if inst := def.Instr; inst.Constant() { + m.insertLoadConstant(inst, reg) + } + } + if arg.Kind == backend.ABIArgKindReg { + m.InsertMove(arg.Reg, reg, arg.Type) + } else { + store := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmReg( + // -stackSlotSize because the stack pointer is not yet decreased. + uint32(arg.Offset-stackSlotSize), rspVReg)) + switch arg.Type { + case ssa.TypeI32: + store.asMovRM(reg, mem, 4) + case ssa.TypeI64: + store.asMovRM(reg, mem, 8) + case ssa.TypeF32: + store.asXmmMovRM(sseOpcodeMovss, reg, mem) + case ssa.TypeF64: + store.asXmmMovRM(sseOpcodeMovsd, reg, mem) + case ssa.TypeV128: + store.asXmmMovRM(sseOpcodeMovdqu, reg, mem) + default: + panic("BUG") + } + m.insert(store) + } +} + +func (m *machine) callerGenFunctionReturnVReg(a *backend.FunctionABI, retIndex int, reg regalloc.VReg, stackSlotSize int64) { + r := &a.Rets[retIndex] + if r.Kind == backend.ABIArgKindReg { + m.InsertMove(reg, r.Reg, r.Type) + } else { + load := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmReg( + // -stackSlotSize because the stack pointer is not yet decreased. + uint32(a.ArgStackSize+r.Offset-stackSlotSize), rspVReg)) + switch r.Type { + case ssa.TypeI32: + load.asMovzxRmR(extModeLQ, mem, reg) + case ssa.TypeI64: + load.asMov64MR(mem, reg) + case ssa.TypeF32: + load.asXmmUnaryRmR(sseOpcodeMovss, mem, reg) + case ssa.TypeF64: + load.asXmmUnaryRmR(sseOpcodeMovsd, mem, reg) + case ssa.TypeV128: + load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, reg) + default: + panic("BUG") + } + m.insert(load) + } +} + +// InsertMove implements backend.Machine. 
+func (m *machine) InsertMove(dst, src regalloc.VReg, typ ssa.Type) { + switch typ { + case ssa.TypeI32, ssa.TypeI64: + i := m.allocateInstr().asMovRR(src, dst, typ.Bits() == 64) + m.insert(i) + case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: + var op sseOpcode + switch typ { + case ssa.TypeF32: + op = sseOpcodeMovss + case ssa.TypeF64: + op = sseOpcodeMovsd + case ssa.TypeV128: + op = sseOpcodeMovdqa + } + i := m.allocateInstr().asXmmUnaryRmR(op, newOperandReg(src), dst) + m.insert(i) + default: + panic("BUG") + } +} + +// Format implements backend.Machine. +func (m *machine) Format() string { + ectx := m.ectx + begins := map[*instruction]backend.Label{} + for l, pos := range ectx.LabelPositions { + begins[pos.Begin] = l + } + + irBlocks := map[backend.Label]ssa.BasicBlockID{} + for i, l := range ectx.SsaBlockIDToLabels { + irBlocks[l] = ssa.BasicBlockID(i) + } + + var lines []string + for cur := ectx.RootInstr; cur != nil; cur = cur.next { + if l, ok := begins[cur]; ok { + var labelStr string + if blkID, ok := irBlocks[l]; ok { + labelStr = fmt.Sprintf("%s (SSA Block: %s):", l, blkID) + } else { + labelStr = fmt.Sprintf("%s:", l) + } + lines = append(lines, labelStr) + } + if cur.kind == nop0 { + continue + } + lines = append(lines, "\t"+cur.String()) + } + for _, vc := range m.consts { + if vc._var == nil { + lines = append(lines, fmt.Sprintf("%s: const [%d %d]", vc.label.L, vc.lo, vc.hi)) + } else { + lines = append(lines, fmt.Sprintf("%s: const %#x", vc.label.L, vc._var)) + } + } + return "\n" + strings.Join(lines, "\n") + "\n" +} + +func (m *machine) encodeWithoutSSA(root *instruction) { + m.labelResolutionPends = m.labelResolutionPends[:0] + ectx := m.ectx + + bufPtr := m.c.BufPtr() + for cur := root; cur != nil; cur = cur.next { + offset := int64(len(*bufPtr)) + if cur.kind == nop0 { + l := cur.nop0Label() + if pos, ok := ectx.LabelPositions[l]; ok { + pos.BinaryOffset = offset + } + } + + needLabelResolution := cur.encode(m.c) + if needLabelResolution { + m.labelResolutionPends = append(m.labelResolutionPends, + labelResolutionPend{instr: cur, imm32Offset: int64(len(*bufPtr)) - 4}, + ) + } + } + + for i := range m.labelResolutionPends { + p := &m.labelResolutionPends[i] + switch p.instr.kind { + case jmp, jmpIf, lea: + target := p.instr.jmpLabel() + targetOffset := ectx.LabelPositions[target].BinaryOffset + imm32Offset := p.imm32Offset + jmpOffset := int32(targetOffset - (p.imm32Offset + 4)) // +4 because RIP points to the next instruction. + binary.LittleEndian.PutUint32((*bufPtr)[imm32Offset:], uint32(jmpOffset)) + default: + panic("BUG") + } + } +} + +// Encode implements backend.Machine Encode. 
+func (m *machine) Encode(ctx context.Context) (err error) { + ectx := m.ectx + bufPtr := m.c.BufPtr() + + var fn string + var fnIndex int + var labelToSSABlockID map[backend.Label]ssa.BasicBlockID + if wazevoapi.PerfMapEnabled { + fn = wazevoapi.GetCurrentFunctionName(ctx) + labelToSSABlockID = make(map[backend.Label]ssa.BasicBlockID) + for i, l := range ectx.SsaBlockIDToLabels { + labelToSSABlockID[l] = ssa.BasicBlockID(i) + } + fnIndex = wazevoapi.GetCurrentFunctionIndex(ctx) + } + + m.labelResolutionPends = m.labelResolutionPends[:0] + for _, pos := range ectx.OrderedBlockLabels { + offset := int64(len(*bufPtr)) + pos.BinaryOffset = offset + for cur := pos.Begin; cur != pos.End.next; cur = cur.next { + offset := int64(len(*bufPtr)) + + switch cur.kind { + case nop0: + l := cur.nop0Label() + if pos, ok := ectx.LabelPositions[l]; ok { + pos.BinaryOffset = offset + } + case sourceOffsetInfo: + m.c.AddSourceOffsetInfo(offset, cur.sourceOffsetInfo()) + } + + needLabelResolution := cur.encode(m.c) + if needLabelResolution { + m.labelResolutionPends = append(m.labelResolutionPends, + labelResolutionPend{instr: cur, instrOffset: offset, imm32Offset: int64(len(*bufPtr)) - 4}, + ) + } + } + + if wazevoapi.PerfMapEnabled { + l := pos.L + var labelStr string + if blkID, ok := labelToSSABlockID[l]; ok { + labelStr = fmt.Sprintf("%s::SSA_Block[%s]", l, blkID) + } else { + labelStr = l.String() + } + size := int64(len(*bufPtr)) - offset + wazevoapi.PerfMap.AddModuleEntry(fnIndex, offset, uint64(size), fmt.Sprintf("%s:::::%s", fn, labelStr)) + } + } + + for i := range m.consts { + offset := int64(len(*bufPtr)) + vc := &m.consts[i] + vc.label.BinaryOffset = offset + if vc._var == nil { + lo, hi := vc.lo, vc.hi + m.c.Emit8Bytes(lo) + m.c.Emit8Bytes(hi) + } else { + for _, b := range vc._var { + m.c.EmitByte(b) + } + } + } + + buf := *bufPtr + for i := range m.labelResolutionPends { + p := &m.labelResolutionPends[i] + switch p.instr.kind { + case jmp, jmpIf, lea, xmmUnaryRmR: + target := p.instr.jmpLabel() + targetOffset := ectx.LabelPositions[target].BinaryOffset + imm32Offset := p.imm32Offset + jmpOffset := int32(targetOffset - (p.imm32Offset + 4)) // +4 because RIP points to the next instruction. + binary.LittleEndian.PutUint32(buf[imm32Offset:], uint32(jmpOffset)) + case jmpTableIsland: + tableBegin := p.instrOffset + // Each entry is the offset from the beginning of the jmpTableIsland instruction in 8 bytes. + targets := m.jmpTableTargets[p.instr.u1] + for i, l := range targets { + targetOffset := ectx.LabelPositions[backend.Label(l)].BinaryOffset + jmpOffset := targetOffset - tableBegin + binary.LittleEndian.PutUint64(buf[tableBegin+int64(i)*8:], uint64(jmpOffset)) + } + default: + panic("BUG") + } + } + return +} + +// ResolveRelocations implements backend.Machine. +func (m *machine) ResolveRelocations(refToBinaryOffset []int, binary []byte, relocations []backend.RelocationInfo, _ []int) { + for _, r := range relocations { + offset := r.Offset + calleeFnOffset := refToBinaryOffset[r.FuncRef] + // offset is the offset of the last 4 bytes of the call instruction. + callInstrOffsetBytes := binary[offset : offset+4] + diff := int64(calleeFnOffset) - (offset + 4) // +4 because we want the offset of the next instruction (In x64, RIP always points to the next instruction). 
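+ // Patch the 4-byte relative offset of the call in place, least significant byte
+ // first, since x86-64 immediates are little-endian.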
+ callInstrOffsetBytes[0] = byte(diff) + callInstrOffsetBytes[1] = byte(diff >> 8) + callInstrOffsetBytes[2] = byte(diff >> 16) + callInstrOffsetBytes[3] = byte(diff >> 24) + } +} + +// CallTrampolineIslandInfo implements backend.Machine CallTrampolineIslandInfo. +func (m *machine) CallTrampolineIslandInfo(_ int) (_, _ int, _ error) { return } + +func (m *machine) lowerIcmpToFlag(xd, yd *backend.SSAValueDefinition, _64 bool) { + x := m.getOperand_Reg(xd) + y := m.getOperand_Mem_Imm32_Reg(yd) + cmp := m.allocateInstr().asCmpRmiR(true, y, x.reg(), _64) + m.insert(cmp) +} + +func (m *machine) lowerFcmpToFlags(instr *ssa.Instruction) (f1, f2 cond, and bool) { + x, y, c := instr.FcmpData() + switch c { + case ssa.FloatCmpCondEqual: + f1, f2 = condNP, condZ + and = true + case ssa.FloatCmpCondNotEqual: + f1, f2 = condP, condNZ + case ssa.FloatCmpCondLessThan: + f1 = condFromSSAFloatCmpCond(ssa.FloatCmpCondGreaterThan) + f2 = condInvalid + x, y = y, x + case ssa.FloatCmpCondLessThanOrEqual: + f1 = condFromSSAFloatCmpCond(ssa.FloatCmpCondGreaterThanOrEqual) + f2 = condInvalid + x, y = y, x + default: + f1 = condFromSSAFloatCmpCond(c) + f2 = condInvalid + } + + var opc sseOpcode + if x.Type() == ssa.TypeF32 { + opc = sseOpcodeUcomiss + } else { + opc = sseOpcodeUcomisd + } + + xr := m.getOperand_Reg(m.c.ValueDefinition(x)) + yr := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + m.insert(m.allocateInstr().asXmmCmpRmR(opc, yr, xr.reg())) + return +} + +// allocateInstr allocates an instruction. +func (m *machine) allocateInstr() *instruction { + instr := m.ectx.InstructionPool.Allocate() + if !m.regAllocStarted { + instr.addedBeforeRegAlloc = true + } + return instr +} + +func (m *machine) allocateNop() *instruction { + instr := m.allocateInstr() + instr.kind = nop0 + return instr +} + +func (m *machine) insert(i *instruction) { + ectx := m.ectx + ectx.PendingInstructions = append(ectx.PendingInstructions, i) +} + +func (m *machine) allocateBrTarget() (nop *instruction, l backend.Label) { //nolint + pos := m.allocateLabel() + l = pos.L + nop = m.allocateInstr() + nop.asNop0WithLabel(l) + pos.Begin, pos.End = nop, nop + return +} + +func (m *machine) allocateLabel() *labelPosition { + ectx := m.ectx + l := ectx.AllocateLabel() + pos := ectx.AllocateLabelPosition(l) + ectx.LabelPositions[l] = pos + return pos +} + +func (m *machine) getVRegSpillSlotOffsetFromSP(id regalloc.VRegID, size byte) int64 { + offset, ok := m.spillSlots[id] + if !ok { + offset = m.spillSlotSize + m.spillSlots[id] = offset + m.spillSlotSize += int64(size) + } + return offset +} + +func (m *machine) copyTo(src regalloc.VReg, dst regalloc.VReg) { + mov := m.allocateInstr() + if src.RegType() == regalloc.RegTypeInt { + mov.asMovRR(src, dst, true) + } else { + mov.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(src), dst) + } + m.insert(mov) +} + +func (m *machine) copyToTmp(v regalloc.VReg) regalloc.VReg { + typ := m.c.TypeOf(v) + tmp := m.c.AllocateVReg(typ) + m.copyTo(v, tmp) + return tmp +} + +func (m *machine) requiredStackSize() int64 { + return m.maxRequiredStackSizeForCalls + + m.frameSize() + + 16 + // Need for stack checking. + 16 // return address and the caller RBP. 
+} + +func (m *machine) frameSize() int64 { + s := m.clobberedRegSlotSize() + m.spillSlotSize + if s&0xf != 0 { + panic(fmt.Errorf("BUG: frame size %d is not 16-byte aligned", s)) + } + return s +} + +func (m *machine) clobberedRegSlotSize() int64 { + return int64(len(m.clobberedRegs) * 16) +} + +func (m *machine) lowerIDivRem(si *ssa.Instruction, isDiv bool, signed bool) { + x, y, execCtx := si.Arg3() + + dividend := m.getOperand_Reg(m.c.ValueDefinition(x)) + divisor := m.getOperand_Reg(m.c.ValueDefinition(y)) + ctxVReg := m.c.VRegOf(execCtx) + tmpGp := m.c.AllocateVReg(si.Return().Type()) + + m.copyTo(dividend.reg(), raxVReg) + m.insert(m.allocateInstr().asDefineUninitializedReg(rdxVReg)) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp)) + seq := m.allocateInstr().asIdivRemSequence(ctxVReg, divisor.reg(), tmpGp, isDiv, signed, x.Type().Bits() == 64) + m.insert(seq) + rd := m.c.VRegOf(si.Return()) + if isDiv { + m.copyTo(raxVReg, rd) + } else { + m.copyTo(rdxVReg, rd) + } +} + +func (m *machine) lowerIDivRemSequenceAfterRegAlloc(i *instruction) { + execCtx, divisor, tmpGp, isDiv, signed, _64 := i.idivRemSequenceData() + + dividend := raxVReg + + // Ensure yr is not zero. + test := m.allocateInstr() + test.asCmpRmiR(false, newOperandReg(divisor), divisor, _64) + m.insert(test) + + jnz := m.allocateInstr() + m.insert(jnz) + + nz := m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerDivisionByZero) + + // If not zero, we can proceed with the division. + jnz.asJmpIf(condNZ, newOperandLabel(nz)) + + var ifRemNeg1 *instruction + if signed { + var neg1 uint64 + if _64 { + neg1 = 0xffffffffffffffff + } else { + neg1 = 0xffffffff + } + m.lowerIconst(tmpGp, neg1, _64) + + if isDiv { + // For signed division, we have to have branches for "math.MinInt{32,64} / -1" + // case which results in the floating point exception via division error as + // the resulting value exceeds the maximum of signed int. + + // First, we check if the divisor is -1. + cmp := m.allocateInstr() + cmp.asCmpRmiR(true, newOperandReg(tmpGp), divisor, _64) + m.insert(cmp) + + ifNotNeg1 := m.allocateInstr() + m.insert(ifNotNeg1) + + var minInt uint64 + if _64 { + minInt = 0x8000000000000000 + } else { + minInt = 0x80000000 + } + m.lowerIconst(tmpGp, minInt, _64) + + // Next we check if the quotient is the most negative value for the signed integer, i.e. + // if we are trying to do (math.MinInt32 / -1) or (math.MinInt64 / -1) respectively. + cmp2 := m.allocateInstr() + cmp2.asCmpRmiR(true, newOperandReg(tmpGp), dividend, _64) + m.insert(cmp2) + + ifNotMinInt := m.allocateInstr() + m.insert(ifNotMinInt) + + // Trap if we are trying to do (math.MinInt32 / -1) or (math.MinInt64 / -1), + // as that is the overflow in division as the result becomes 2^31 which is larger than + // the maximum of signed 32-bit int (2^31-1). + end := m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) + ifNotNeg1.asJmpIf(condNZ, newOperandLabel(end)) + ifNotMinInt.asJmpIf(condNZ, newOperandLabel(end)) + } else { + // If it is remainder, zeros DX register and compare the divisor to -1. + xor := m.allocateInstr().asZeros(rdxVReg) + m.insert(xor) + + // We check if the divisor is -1. + cmp := m.allocateInstr() + cmp.asCmpRmiR(true, newOperandReg(tmpGp), divisor, _64) + m.insert(cmp) + + ifRemNeg1 = m.allocateInstr() + m.insert(ifRemNeg1) + } + + // Sign-extend DX register to have 2*x.Type().Bits() dividend over DX and AX registers. 
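+ // On x86-64 this is CDQ (32-bit) or CQO (64-bit): the sign bit of EAX/RAX is
+ // replicated into EDX/RDX so that the signed division sees a double-width dividend.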
+ sed := m.allocateInstr() + sed.asSignExtendData(_64) + m.insert(sed) + } else { + // Zeros DX register to have 2*x.Type().Bits() dividend over DX and AX registers. + zeros := m.allocateInstr().asZeros(rdxVReg) + m.insert(zeros) + } + + div := m.allocateInstr() + div.asDiv(newOperandReg(divisor), signed, _64) + m.insert(div) + + nop, end := m.allocateBrTarget() + m.insert(nop) + // If we are compiling a Rem instruction, when the divisor is -1 we land at the end of the function. + if ifRemNeg1 != nil { + ifRemNeg1.asJmpIf(condZ, newOperandLabel(end)) + } +} + +func (m *machine) lowerRound(instr *ssa.Instruction, imm roundingMode) { + x := instr.Arg() + if !x.Type().IsFloat() { + panic("BUG?") + } + var op sseOpcode + if x.Type().Bits() == 64 { + op = sseOpcodeRoundsd + } else { + op = sseOpcodeRoundss + } + + xDef := m.c.ValueDefinition(x) + rm := m.getOperand_Mem_Reg(xDef) + rd := m.c.VRegOf(instr.Return()) + + xmm := m.allocateInstr().asXmmUnaryRmRImm(op, uint8(imm), rm, rd) + m.insert(xmm) +} + +func (m *machine) lowerFminFmax(instr *ssa.Instruction) { + x, y := instr.Arg2() + if !x.Type().IsFloat() { + panic("BUG?") + } + + _64 := x.Type().Bits() == 64 + isMin := instr.Opcode() == ssa.OpcodeFmin + var minMaxOp sseOpcode + + switch { + case _64 && isMin: + minMaxOp = sseOpcodeMinpd + case _64 && !isMin: + minMaxOp = sseOpcodeMaxpd + case !_64 && isMin: + minMaxOp = sseOpcodeMinps + case !_64 && !isMin: + minMaxOp = sseOpcodeMaxps + } + + xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) + rm := m.getOperand_Reg(xDef) + // We cannot ensure that y is aligned to 16 bytes, so we have to use it on reg. + rn := m.getOperand_Reg(yDef) + rd := m.c.VRegOf(instr.Return()) + + tmp := m.copyToTmp(rm.reg()) + + // Check if this is (either x1 or x2 is NaN) or (x1 equals x2) case. + cmp := m.allocateInstr() + if _64 { + cmp.asXmmCmpRmR(sseOpcodeUcomisd, rn, tmp) + } else { + cmp.asXmmCmpRmR(sseOpcodeUcomiss, rn, tmp) + } + m.insert(cmp) + + // At this point, we have the three cases of conditional flags below + // (See https://www.felixcloutier.com/x86/ucomiss#operation for detail.) + // + // 1) Two values are NaN-free and different: All flags are cleared. + // 2) Two values are NaN-free and equal: Only ZF flags is set. + // 3) One of Two values is NaN: ZF, PF and CF flags are set. + + // Jump instruction to handle 1) case by checking the ZF flag + // as ZF is only set for 2) and 3) cases. + nanFreeOrDiffJump := m.allocateInstr() + m.insert(nanFreeOrDiffJump) + + // Start handling 2) and 3). + + // Jump if one of two values is NaN by checking the parity flag (PF). + ifIsNan := m.allocateInstr() + m.insert(ifIsNan) + + // Start handling 2) NaN-free and equal. + + // Before we exit this case, we have to ensure that positive zero (or negative zero for min instruction) is + // returned if two values are positive and negative zeros. + var op sseOpcode + switch { + case !_64 && isMin: + op = sseOpcodeOrps + case _64 && isMin: + op = sseOpcodeOrpd + case !_64 && !isMin: + op = sseOpcodeAndps + case _64 && !isMin: + op = sseOpcodeAndpd + } + orAnd := m.allocateInstr() + orAnd.asXmmRmR(op, rn, tmp) + m.insert(orAnd) + + // Done, jump to end. + sameExitJump := m.allocateInstr() + m.insert(sameExitJump) + + // Start handling 3) either is NaN. + isNanTarget, isNan := m.allocateBrTarget() + m.insert(isNanTarget) + ifIsNan.asJmpIf(condP, newOperandLabel(isNan)) + + // We emit the ADD instruction to produce the NaN in tmp. 
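+ // (Per IEEE 754, an addition with a NaN operand produces a quiet NaN, so tmp ends
+ // up holding NaN regardless of which of the two inputs was the NaN.)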
+ add := m.allocateInstr() + if _64 { + add.asXmmRmR(sseOpcodeAddsd, rn, tmp) + } else { + add.asXmmRmR(sseOpcodeAddss, rn, tmp) + } + m.insert(add) + + // Exit from the NaN case branch. + nanExitJmp := m.allocateInstr() + m.insert(nanExitJmp) + + // Start handling 1). + doMinMaxTarget, doMinMax := m.allocateBrTarget() + m.insert(doMinMaxTarget) + nanFreeOrDiffJump.asJmpIf(condNZ, newOperandLabel(doMinMax)) + + // Now handle the NaN-free and different values case. + minMax := m.allocateInstr() + minMax.asXmmRmR(minMaxOp, rn, tmp) + m.insert(minMax) + + endNop, end := m.allocateBrTarget() + m.insert(endNop) + nanExitJmp.asJmp(newOperandLabel(end)) + sameExitJump.asJmp(newOperandLabel(end)) + + m.copyTo(tmp, rd) +} + +func (m *machine) lowerFcopysign(instr *ssa.Instruction) { + x, y := instr.Arg2() + if !x.Type().IsFloat() { + panic("BUG") + } + + _64 := x.Type().Bits() == 64 + + xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) + rm := m.getOperand_Reg(xDef) + rn := m.getOperand_Reg(yDef) + rd := m.c.VRegOf(instr.Return()) + + // Clear the non-sign bits of src via AND with the mask. + var opAnd, opOr sseOpcode + var signMask uint64 + if _64 { + signMask, opAnd, opOr = 0x8000000000000000, sseOpcodeAndpd, sseOpcodeOrpd + } else { + signMask, opAnd, opOr = 0x80000000, sseOpcodeAndps, sseOpcodeOrps + } + + signBitReg := m.c.AllocateVReg(x.Type()) + m.lowerFconst(signBitReg, signMask, _64) + nonSignBitReg := m.c.AllocateVReg(x.Type()) + m.lowerFconst(nonSignBitReg, ^signMask, _64) + + // Extract the sign bits of rn. + and := m.allocateInstr().asXmmRmR(opAnd, rn, signBitReg) + m.insert(and) + + // Clear the sign bit of dst via AND with the non-sign bit mask. + xor := m.allocateInstr().asXmmRmR(opAnd, rm, nonSignBitReg) + m.insert(xor) + + // Copy the sign bits of src to dst via OR. 
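+	// Together with the two ANDs above (note that the instruction held in the
+	// variable named xor is in fact an AND with the inverted mask), this
+	// implements the usual bit-level copysign formula. Illustrative scalar
+	// sketch for the 64-bit case:
+	//
+	//	func copysign64bits(x, y uint64) uint64 {
+	//		const signMask = 0x8000000000000000
+	//		return (x &^ signMask) | (y & signMask)
+	//	}
+	//
+	// where x supplies the magnitude and y supplies the sign bit.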
+ or := m.allocateInstr().asXmmRmR(opOr, newOperandReg(signBitReg), nonSignBitReg) + m.insert(or) + + m.copyTo(nonSignBitReg, rd) +} + +func (m *machine) lowerBitcast(instr *ssa.Instruction) { + x, dstTyp := instr.BitcastData() + srcTyp := x.Type() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + switch { + case srcTyp == ssa.TypeF32 && dstTyp == ssa.TypeI32: + cvt := m.allocateInstr().asXmmToGpr(sseOpcodeMovd, rn.reg(), rd, false) + m.insert(cvt) + case srcTyp == ssa.TypeI32 && dstTyp == ssa.TypeF32: + cvt := m.allocateInstr().asGprToXmm(sseOpcodeMovd, rn, rd, false) + m.insert(cvt) + case srcTyp == ssa.TypeF64 && dstTyp == ssa.TypeI64: + cvt := m.allocateInstr().asXmmToGpr(sseOpcodeMovq, rn.reg(), rd, true) + m.insert(cvt) + case srcTyp == ssa.TypeI64 && dstTyp == ssa.TypeF64: + cvt := m.allocateInstr().asGprToXmm(sseOpcodeMovq, rn, rd, true) + m.insert(cvt) + default: + panic(fmt.Sprintf("invalid bitcast from %s to %s", srcTyp, dstTyp)) + } +} + +func (m *machine) lowerFcvtToSint(ctxVReg, rn, rd regalloc.VReg, src64, dst64, sat bool) { + var tmpXmm regalloc.VReg + if dst64 { + tmpXmm = m.c.AllocateVReg(ssa.TypeF64) + } else { + tmpXmm = m.c.AllocateVReg(ssa.TypeF32) + } + + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpXmm)) + tmpGp, tmpGp2 := m.c.AllocateVReg(ssa.TypeI64), m.c.AllocateVReg(ssa.TypeI64) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp)) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp2)) + + m.insert(m.allocateFcvtToSintSequence(ctxVReg, rn, tmpGp, tmpGp2, tmpXmm, src64, dst64, sat)) + m.copyTo(tmpGp, rd) +} + +func (m *machine) lowerFcvtToSintSequenceAfterRegalloc(i *instruction) { + execCtx, src, tmpGp, tmpGp2, tmpXmm, src64, dst64, sat := i.fcvtToSintSequenceData() + var cmpOp, truncOp sseOpcode + if src64 { + cmpOp, truncOp = sseOpcodeUcomisd, sseOpcodeCvttsd2si + } else { + cmpOp, truncOp = sseOpcodeUcomiss, sseOpcodeCvttss2si + } + + trunc := m.allocateInstr() + trunc.asXmmToGpr(truncOp, src, tmpGp, dst64) + m.insert(trunc) + + // Check if the dst operand was INT_MIN, by checking it against 1. + cmp1 := m.allocateInstr() + cmp1.asCmpRmiR(true, newOperandImm32(1), tmpGp, dst64) + m.insert(cmp1) + + // If no overflow, then we are done. + doneTarget, done := m.allocateBrTarget() + ifNoOverflow := m.allocateInstr() + ifNoOverflow.asJmpIf(condNO, newOperandLabel(done)) + m.insert(ifNoOverflow) + + // Now, check for NaN. + cmpNan := m.allocateInstr() + cmpNan.asXmmCmpRmR(cmpOp, newOperandReg(src), src) + m.insert(cmpNan) + + // We allocate the "non-nan target" here, but we will insert it later. + notNanTarget, notNaN := m.allocateBrTarget() + ifNotNan := m.allocateInstr() + ifNotNan.asJmpIf(condNP, newOperandLabel(notNaN)) + m.insert(ifNotNan) + + if sat { + // If NaN and saturating, return 0. + zeroDst := m.allocateInstr().asZeros(tmpGp) + m.insert(zeroDst) + + jmpEnd := m.allocateInstr() + jmpEnd.asJmp(newOperandLabel(done)) + m.insert(jmpEnd) + + // Otherwise: + m.insert(notNanTarget) + + // Zero-out the tmp register. + zero := m.allocateInstr().asZeros(tmpXmm) + m.insert(zero) + + cmpXmm := m.allocateInstr().asXmmCmpRmR(cmpOp, newOperandReg(tmpXmm), src) + m.insert(cmpXmm) + + // if >= jump to end. + jmpEnd2 := m.allocateInstr() + jmpEnd2.asJmpIf(condB, newOperandLabel(done)) + m.insert(jmpEnd2) + + // Otherwise, saturate to INT_MAX. + if dst64 { + m.lowerIconst(tmpGp, math.MaxInt64, dst64) + } else { + m.lowerIconst(tmpGp, math.MaxInt32, dst64) + } + + } else { + + // If non-sat, NaN, trap. 
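+		// (The Wasm spec requires the non-saturating trunc instructions to trap
+		// on a NaN input; the saturating variants handled above return 0
+		// instead.)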
+ m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeInvalidConversionToInteger) + + // Otherwise, we will jump here. + m.insert(notNanTarget) + + // jump over trap if src larger than threshold + condAboveThreshold := condNB + + // The magic constants are various combination of minInt for int[32|64] represented as float[32|64]. + var minInt uint64 + switch { + case src64 && dst64: + minInt = 0xc3e0000000000000 + case src64 && !dst64: + condAboveThreshold = condNBE + minInt = 0xC1E0_0000_0020_0000 + case !src64 && dst64: + minInt = 0xDF00_0000 + case !src64 && !dst64: + minInt = 0xCF00_0000 + } + + loadToGP := m.allocateInstr().asImm(tmpGp2, minInt, src64) + m.insert(loadToGP) + + movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp2), tmpXmm, src64) + m.insert(movToXmm) + + cmpXmm := m.allocateInstr().asXmmCmpRmR(cmpOp, newOperandReg(tmpXmm), src) + m.insert(cmpXmm) + + jmpIfLarger := m.allocateInstr() + checkPositiveTarget, checkPositive := m.allocateBrTarget() + jmpIfLarger.asJmpIf(condAboveThreshold, newOperandLabel(checkPositive)) + m.insert(jmpIfLarger) + + m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) + + // If positive, it was a real overflow. + m.insert(checkPositiveTarget) + + // Zero out the temp register. + xorpd := m.allocateInstr() + xorpd.asXmmRmR(sseOpcodeXorpd, newOperandReg(tmpXmm), tmpXmm) + m.insert(xorpd) + + pos := m.allocateInstr() + pos.asXmmCmpRmR(cmpOp, newOperandReg(src), tmpXmm) + m.insert(pos) + + // If >= jump to end. + jmp := m.allocateInstr().asJmpIf(condNB, newOperandLabel(done)) + m.insert(jmp) + m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) + } + + m.insert(doneTarget) +} + +func (m *machine) lowerFcvtToUint(ctxVReg, rn, rd regalloc.VReg, src64, dst64, sat bool) { + tmpXmm, tmpXmm2 := m.c.AllocateVReg(ssa.TypeF64), m.c.AllocateVReg(ssa.TypeF64) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpXmm)) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpXmm2)) + tmpGp, tmpGp2 := m.c.AllocateVReg(ssa.TypeI64), m.c.AllocateVReg(ssa.TypeI64) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp)) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp2)) + + m.insert(m.allocateFcvtToUintSequence( + ctxVReg, rn, tmpGp, tmpGp2, tmpXmm, tmpXmm2, src64, dst64, sat, + )) + m.copyTo(tmpGp, rd) +} + +func (m *machine) lowerFcvtToUintSequenceAfterRegalloc(i *instruction) { + execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2, src64, dst64, sat := i.fcvtToUintSequenceData() + + var subOp, cmpOp, truncOp sseOpcode + if src64 { + subOp, cmpOp, truncOp = sseOpcodeSubsd, sseOpcodeUcomisd, sseOpcodeCvttsd2si + } else { + subOp, cmpOp, truncOp = sseOpcodeSubss, sseOpcodeUcomiss, sseOpcodeCvttss2si + } + + doneTarget, done := m.allocateBrTarget() + + switch { + case src64 && dst64: + loadToGP := m.allocateInstr().asImm(tmpGp, 0x43e0000000000000, true) + m.insert(loadToGP) + movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, true) + m.insert(movToXmm) + case src64 && !dst64: + loadToGP := m.allocateInstr().asImm(tmpGp, 0x41e0000000000000, true) + m.insert(loadToGP) + movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, true) + m.insert(movToXmm) + case !src64 && dst64: + loadToGP := m.allocateInstr().asImm(tmpGp, 0x5f000000, false) + m.insert(loadToGP) + movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, false) + m.insert(movToXmm) + case !src64 && !dst64: + loadToGP := m.allocateInstr().asImm(tmpGp, 
0x4f000000, false) + m.insert(loadToGP) + movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, false) + m.insert(movToXmm) + } + + cmp := m.allocateInstr() + cmp.asXmmCmpRmR(cmpOp, newOperandReg(tmpXmm), src) + m.insert(cmp) + + // If above `tmp` ("large threshold"), jump to `ifAboveThreshold` + ifAboveThresholdTarget, ifAboveThreshold := m.allocateBrTarget() + jmpIfAboveThreshold := m.allocateInstr() + jmpIfAboveThreshold.asJmpIf(condNB, newOperandLabel(ifAboveThreshold)) + m.insert(jmpIfAboveThreshold) + + ifNotNaNTarget, ifNotNaN := m.allocateBrTarget() + jmpIfNotNaN := m.allocateInstr() + jmpIfNotNaN.asJmpIf(condNP, newOperandLabel(ifNotNaN)) + m.insert(jmpIfNotNaN) + + // If NaN, handle the error condition. + if sat { + // On NaN, saturating, we just return 0. + zeros := m.allocateInstr().asZeros(tmpGp) + m.insert(zeros) + + jmpEnd := m.allocateInstr() + jmpEnd.asJmp(newOperandLabel(done)) + m.insert(jmpEnd) + } else { + // On NaN, non-saturating, we trap. + m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeInvalidConversionToInteger) + } + + // If not NaN, land here. + m.insert(ifNotNaNTarget) + + // Truncation happens here. + + trunc := m.allocateInstr() + trunc.asXmmToGpr(truncOp, src, tmpGp, dst64) + m.insert(trunc) + + // Check if the result is negative. + cmpNeg := m.allocateInstr() + cmpNeg.asCmpRmiR(true, newOperandImm32(0), tmpGp, dst64) + m.insert(cmpNeg) + + // If non-neg, jump to end. + jmpIfNonNeg := m.allocateInstr() + jmpIfNonNeg.asJmpIf(condNL, newOperandLabel(done)) + m.insert(jmpIfNonNeg) + + if sat { + // If the input was "small" (< 2**(width -1)), the only way to get an integer + // overflow is because the input was too small: saturate to the min value, i.e. 0. + zeros := m.allocateInstr().asZeros(tmpGp) + m.insert(zeros) + + jmpEnd := m.allocateInstr() + jmpEnd.asJmp(newOperandLabel(done)) + m.insert(jmpEnd) + } else { + // If not saturating, trap. + m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) + } + + // If above the threshold, land here. + m.insert(ifAboveThresholdTarget) + + // tmpDiff := threshold - rn. + copySrc := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(src), tmpXmm2) + m.insert(copySrc) + + sub := m.allocateInstr() + sub.asXmmRmR(subOp, newOperandReg(tmpXmm), tmpXmm2) // must be -0x8000000000000000 + m.insert(sub) + + trunc2 := m.allocateInstr() + trunc2.asXmmToGpr(truncOp, tmpXmm2, tmpGp, dst64) + m.insert(trunc2) + + // Check if the result is negative. + cmpNeg2 := m.allocateInstr().asCmpRmiR(true, newOperandImm32(0), tmpGp, dst64) + m.insert(cmpNeg2) + + ifNextLargeTarget, ifNextLarge := m.allocateBrTarget() + jmpIfNextLarge := m.allocateInstr() + jmpIfNextLarge.asJmpIf(condNL, newOperandLabel(ifNextLarge)) + m.insert(jmpIfNextLarge) + + if sat { + // The input was "large" (>= maxInt), so the only way to get an integer + // overflow is because the input was too large: saturate to the max value. + var maxInt uint64 + if dst64 { + maxInt = math.MaxUint64 + } else { + maxInt = math.MaxUint32 + } + m.lowerIconst(tmpGp, maxInt, dst64) + + jmpToEnd := m.allocateInstr() + jmpToEnd.asJmp(newOperandLabel(done)) + m.insert(jmpToEnd) + } else { + // If not saturating, trap. 
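+		// (Reaching here means the truncation of src-2^(N-1) still produced a
+		// negative result, i.e. the input is at least 2^N for an N-bit
+		// destination and cannot be represented, so the non-saturating
+		// conversion must trap with an integer overflow.)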
+ m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) + } + + m.insert(ifNextLargeTarget) + + var op operand + if dst64 { + m.lowerIconst(tmpGp2, 0x8000000000000000, true) + op = newOperandReg(tmpGp2) + } else { + op = newOperandImm32(0x80000000) + } + + add := m.allocateInstr() + add.asAluRmiR(aluRmiROpcodeAdd, op, tmpGp, dst64) + m.insert(add) + + m.insert(doneTarget) +} + +func (m *machine) lowerFcvtFromSint(rn, rd operand, src64, dst64 bool) { + var op sseOpcode + if dst64 { + op = sseOpcodeCvtsi2sd + } else { + op = sseOpcodeCvtsi2ss + } + + trunc := m.allocateInstr() + trunc.asGprToXmm(op, rn, rd.reg(), src64) + m.insert(trunc) +} + +func (m *machine) lowerFcvtFromUint(rn, rd operand, src64, dst64 bool) { + var op sseOpcode + if dst64 { + op = sseOpcodeCvtsi2sd + } else { + op = sseOpcodeCvtsi2ss + } + + // Src is 32 bit, then we just perform the conversion with 64 bit width. + // + // See the following link for why we use 64bit conversion for unsigned 32bit integer sources: + // https://stackoverflow.com/questions/41495498/fpu-operations-generated-by-gcc-during-casting-integer-to-float. + // + // Here's the summary: + // >> CVTSI2SS is indeed designed for converting a signed integer to a scalar single-precision float, + // >> not an unsigned integer like you have here. So what gives? Well, a 64-bit processor has 64-bit wide + // >> registers available, so the unsigned 32-bit input values can be stored as signed 64-bit intermediate values, + // >> which allows CVTSI2SS to be used after all. + // + if !src64 { + // Before we convert, we have to clear the higher 32-bits of the 64-bit register + // to get the correct result. + tmp := m.c.AllocateVReg(ssa.TypeI32) + m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, rn, tmp)) + m.insert(m.allocateInstr().asGprToXmm(op, newOperandReg(tmp), rd.reg(), true)) + return + } + + // If uint64, we have to do a bit more work. + endTarget, end := m.allocateBrTarget() + + var tmpXmm regalloc.VReg + if dst64 { + tmpXmm = m.c.AllocateVReg(ssa.TypeF64) + } else { + tmpXmm = m.c.AllocateVReg(ssa.TypeF32) + } + + // Check if the most significant bit (sign bit) is set. + test := m.allocateInstr() + test.asCmpRmiR(false, rn, rn.reg(), src64) + m.insert(test) + + // Jump if the sign bit is set. + ifSignTarget, ifSign := m.allocateBrTarget() + jmpIfNeg := m.allocateInstr() + jmpIfNeg.asJmpIf(condS, newOperandLabel(ifSign)) + m.insert(jmpIfNeg) + + // If the sign bit is not set, we could fit the unsigned int into float32/float64. + // So, we convert it to float and emit jump instruction to exit from this branch. + cvt := m.allocateInstr() + cvt.asGprToXmm(op, rn, tmpXmm, src64) + m.insert(cvt) + + // We are done, jump to end. + jmpEnd := m.allocateInstr() + jmpEnd.asJmp(newOperandLabel(end)) + m.insert(jmpEnd) + + // Now handling the case where sign-bit is set. 
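+	// The value no longer fits into a signed 64-bit integer, so the usual
+	// trick is applied: halve it, OR the dropped low bit back in as a sticky
+	// bit so the final rounding stays correct, convert the halved value, and
+	// then double the result. Roughly (illustrative sketch):
+	//
+	//	half := (v >> 1) | (v & 1)
+	//	f := float64(int64(half))
+	//	result := f + f
+	//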
+ // We emit the following sequences: + // mov %rn, %tmp + // shr 1, %tmp + // mov %rn, %tmp2 + // and 1, %tmp2 + // or %tmp2, %tmp + // cvtsi2ss %tmp, %xmm0 + // addsd %xmm0, %xmm0 + m.insert(ifSignTarget) + + tmp := m.copyToTmp(rn.reg()) + shr := m.allocateInstr() + shr.asShiftR(shiftROpShiftRightLogical, newOperandImm32(1), tmp, src64) + m.insert(shr) + + tmp2 := m.copyToTmp(rn.reg()) + and := m.allocateInstr() + and.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), tmp2, src64) + m.insert(and) + + or := m.allocateInstr() + or.asAluRmiR(aluRmiROpcodeOr, newOperandReg(tmp2), tmp, src64) + m.insert(or) + + cvt2 := m.allocateInstr() + cvt2.asGprToXmm(op, newOperandReg(tmp), tmpXmm, src64) + m.insert(cvt2) + + addsd := m.allocateInstr() + if dst64 { + addsd.asXmmRmR(sseOpcodeAddsd, newOperandReg(tmpXmm), tmpXmm) + } else { + addsd.asXmmRmR(sseOpcodeAddss, newOperandReg(tmpXmm), tmpXmm) + } + m.insert(addsd) + + m.insert(endTarget) + m.copyTo(tmpXmm, rd.reg()) +} + +func (m *machine) lowerVanyTrue(instr *ssa.Instruction) { + x := instr.Arg() + rm := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + + tmp := m.c.AllocateVReg(ssa.TypeI32) + + cmp := m.allocateInstr() + cmp.asXmmCmpRmR(sseOpcodePtest, rm, rm.reg()) + m.insert(cmp) + + setcc := m.allocateInstr() + setcc.asSetcc(condNZ, tmp) + m.insert(setcc) + + // Clear the irrelevant bits. + and := m.allocateInstr() + and.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), tmp, false) + m.insert(and) + + m.copyTo(tmp, rd) +} + +func (m *machine) lowerVallTrue(instr *ssa.Instruction) { + x, lane := instr.ArgWithLane() + var op sseOpcode + switch lane { + case ssa.VecLaneI8x16: + op = sseOpcodePcmpeqb + case ssa.VecLaneI16x8: + op = sseOpcodePcmpeqw + case ssa.VecLaneI32x4: + op = sseOpcodePcmpeqd + case ssa.VecLaneI64x2: + op = sseOpcodePcmpeqq + } + rm := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + + tmp := m.c.AllocateVReg(ssa.TypeV128) + + zeros := m.allocateInstr() + zeros.asZeros(tmp) + m.insert(zeros) + + pcmp := m.allocateInstr() + pcmp.asXmmRmR(op, rm, tmp) + m.insert(pcmp) + + test := m.allocateInstr() + test.asXmmCmpRmR(sseOpcodePtest, newOperandReg(tmp), tmp) + m.insert(test) + + tmp2 := m.c.AllocateVReg(ssa.TypeI32) + + setcc := m.allocateInstr() + setcc.asSetcc(condZ, tmp2) + m.insert(setcc) + + // Clear the irrelevant bits. 
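+	// (SETcc writes only the low 8 bits of its destination register, so the
+	// upper bits of tmp2 may still hold stale data; masking with 1 yields a
+	// clean 0/1 value.)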
+ and := m.allocateInstr() + and.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), tmp2, false) + m.insert(and) + + m.copyTo(tmp2, rd) +} + +func (m *machine) lowerVhighBits(instr *ssa.Instruction) { + x, lane := instr.ArgWithLane() + rm := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + switch lane { + case ssa.VecLaneI8x16: + mov := m.allocateInstr() + mov.asXmmToGpr(sseOpcodePmovmskb, rm.reg(), rd, false) + m.insert(mov) + + case ssa.VecLaneI16x8: + // When we have: + // R1 = [R1(w1), R1(w2), R1(w3), R1(w4), R1(w5), R1(w6), R1(w7), R1(v8)] + // R2 = [R2(w1), R2(w2), R2(w3), R2(v4), R2(w5), R2(w6), R2(w7), R2(v8)] + // where RX(wn) is n-th signed word (16-bit) of RX register, + // + // "PACKSSWB R1, R2" produces + // R1 = [ + // byte_sat(R1(w1)), byte_sat(R1(w2)), byte_sat(R1(w3)), byte_sat(R1(w4)), + // byte_sat(R1(w5)), byte_sat(R1(w6)), byte_sat(R1(w7)), byte_sat(R1(w8)), + // byte_sat(R2(w1)), byte_sat(R2(w2)), byte_sat(R2(w3)), byte_sat(R2(w4)), + // byte_sat(R2(w5)), byte_sat(R2(w6)), byte_sat(R2(w7)), byte_sat(R2(w8)), + // ] + // where R1 is the destination register, and + // byte_sat(w) = int8(w) if w fits as signed 8-bit, + // 0x80 if w is less than 0x80 + // 0x7F if w is greater than 0x7f + // + // See https://www.felixcloutier.com/x86/packsswb:packssdw for detail. + // + // Therefore, v.register ends up having i-th and (i+8)-th bit set if i-th lane is negative (for i in 0..8). + tmp := m.copyToTmp(rm.reg()) + res := m.c.AllocateVReg(ssa.TypeI32) + + pak := m.allocateInstr() + pak.asXmmRmR(sseOpcodePacksswb, rm, tmp) + m.insert(pak) + + mov := m.allocateInstr() + mov.asXmmToGpr(sseOpcodePmovmskb, tmp, res, false) + m.insert(mov) + + // Clear the higher bits than 8. + shr := m.allocateInstr() + shr.asShiftR(shiftROpShiftRightLogical, newOperandImm32(8), res, false) + m.insert(shr) + + m.copyTo(res, rd) + + case ssa.VecLaneI32x4: + mov := m.allocateInstr() + mov.asXmmToGpr(sseOpcodeMovmskps, rm.reg(), rd, true) + m.insert(mov) + + case ssa.VecLaneI64x2: + mov := m.allocateInstr() + mov.asXmmToGpr(sseOpcodeMovmskpd, rm.reg(), rd, true) + m.insert(mov) + } +} + +func (m *machine) lowerVbnot(instr *ssa.Instruction) { + x := instr.Arg() + xDef := m.c.ValueDefinition(x) + rm := m.getOperand_Reg(xDef) + rd := m.c.VRegOf(instr.Return()) + + tmp := m.copyToTmp(rm.reg()) + tmp2 := m.c.AllocateVReg(ssa.TypeV128) + + // Ensure tmp2 is considered defined by regalloc. + m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2)) + + // Set all bits on tmp register. + pak := m.allocateInstr() + pak.asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp2), tmp2) + m.insert(pak) + + // Then XOR with tmp to reverse all bits on v.register. 
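+	// (There is no packed NOT instruction, so NOT x is computed as
+	// x XOR all-ones; the PCMPEQD of tmp2 with itself above is a
+	// register-only way to materialize the all-ones constant.)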
+ xor := m.allocateInstr() + xor.asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp) + m.insert(xor) + + m.copyTo(tmp, rd) +} + +func (m *machine) lowerSplat(x, ret ssa.Value, lane ssa.VecLane) { + tmpDst := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst)) + + switch lane { + case ssa.VecLaneI8x16: + tmp := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmp)) + xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, 0, xx, tmpDst)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpDst)) + case ssa.VecLaneI16x8: + xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 0, xx, tmpDst)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 1, xx, tmpDst)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst)) + case ssa.VecLaneI32x4: + xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, 0, xx, tmpDst)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst)) + case ssa.VecLaneI64x2: + xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, xx, tmpDst)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, xx, tmpDst)) + case ssa.VecLaneF32x4: + xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeInsertps, 0, xx, tmpDst)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst)) + case ssa.VecLaneF64x2: + xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, xx, tmpDst)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMovlhps, xx, tmpDst)) + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + m.copyTo(tmpDst, m.c.VRegOf(ret)) +} + +func (m *machine) lowerShuffle(x, y ssa.Value, lo, hi uint64, ret ssa.Value) { + var xMask, yMask [2]uint64 + for i := 0; i < 8; i++ { + loLane := byte(lo >> (i * 8)) + if loLane < 16 { + xMask[0] |= uint64(loLane) << (i * 8) + yMask[0] |= uint64(0x80) << (i * 8) + } else { + xMask[0] |= uint64(0x80) << (i * 8) + yMask[0] |= uint64(loLane-16) << (i * 8) + } + hiLane := byte(hi >> (i * 8)) + if hiLane < 16 { + xMask[1] |= uint64(hiLane) << (i * 8) + yMask[1] |= uint64(0x80) << (i * 8) + } else { + xMask[1] |= uint64(0x80) << (i * 8) + yMask[1] |= uint64(hiLane-16) << (i * 8) + } + } + + xmaskLabel := m.allocateLabel() + m.consts = append(m.consts, _const{lo: xMask[0], hi: xMask[1], label: xmaskLabel}) + ymaskLabel := m.allocateLabel() + m.consts = append(m.consts, _const{lo: yMask[0], hi: yMask[1], label: ymaskLabel}) + + xx, yy := m.getOperand_Reg(m.c.ValueDefinition(x)), m.getOperand_Reg(m.c.ValueDefinition(y)) + tmpX, tmpY := m.copyToTmp(xx.reg()), m.copyToTmp(yy.reg()) + + // Apply mask to X. + tmp := m.c.AllocateVReg(ssa.TypeV128) + loadMaskLo := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(xmaskLabel.L)), tmp) + m.insert(loadMaskLo) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpX)) + + // Apply mask to Y. 
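+	// (As for X: PSHUFB replaces each destination byte with the source byte
+	// selected by the corresponding mask byte, or with zero when the mask byte
+	// has its high bit (0x80) set. The two shuffled vectors therefore have
+	// disjoint non-zero lanes and can simply be ORed together below.)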
+ loadMaskHi := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(ymaskLabel.L)), tmp) + m.insert(loadMaskHi) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpY)) + + // Combine the results. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeOrps, newOperandReg(tmpX), tmpY)) + + m.copyTo(tmpY, m.c.VRegOf(ret)) +} + +func (m *machine) lowerVbBinOpUnaligned(op sseOpcode, x, y, ret ssa.Value) { + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rm := m.getOperand_Reg(m.c.ValueDefinition(y)) + rd := m.c.VRegOf(ret) + + tmp := m.copyToTmp(rn.reg()) + + binOp := m.allocateInstr() + binOp.asXmmRmR(op, rm, tmp) + m.insert(binOp) + + m.copyTo(tmp, rd) +} + +func (m *machine) lowerVbBinOp(op sseOpcode, x, y, ret ssa.Value) { + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rm := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + rd := m.c.VRegOf(ret) + + tmp := m.copyToTmp(rn.reg()) + + binOp := m.allocateInstr() + binOp.asXmmRmR(op, rm, tmp) + m.insert(binOp) + + m.copyTo(tmp, rd) +} + +func (m *machine) lowerVFcmp(x, y ssa.Value, c ssa.FloatCmpCond, ret ssa.Value, lane ssa.VecLane) { + var cmpOp sseOpcode + switch lane { + case ssa.VecLaneF32x4: + cmpOp = sseOpcodeCmpps + case ssa.VecLaneF64x2: + cmpOp = sseOpcodeCmppd + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + xx, yy := m.c.ValueDefinition(x), m.c.ValueDefinition(y) + var cmpImm cmpPred + switch c { + case ssa.FloatCmpCondGreaterThan: + yy, xx = xx, yy + cmpImm = cmpPredLT_OS + case ssa.FloatCmpCondGreaterThanOrEqual: + yy, xx = xx, yy + cmpImm = cmpPredLE_OS + case ssa.FloatCmpCondEqual: + cmpImm = cmpPredEQ_OQ + case ssa.FloatCmpCondNotEqual: + cmpImm = cmpPredNEQ_UQ + case ssa.FloatCmpCondLessThan: + cmpImm = cmpPredLT_OS + case ssa.FloatCmpCondLessThanOrEqual: + cmpImm = cmpPredLE_OS + default: + panic(fmt.Sprintf("invalid float comparison condition: %s", c)) + } + + tmp := m.c.AllocateVReg(ssa.TypeV128) + xxx := m.getOperand_Mem_Reg(xx) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, xxx, tmp)) + + rm := m.getOperand_Mem_Reg(yy) + m.insert(m.allocateInstr().asXmmRmRImm(cmpOp, byte(cmpImm), rm, tmp)) + + m.copyTo(tmp, m.c.VRegOf(ret)) +} + +func (m *machine) lowerVIcmp(x, y ssa.Value, c ssa.IntegerCmpCond, ret ssa.Value, lane ssa.VecLane) { + var eq, gt, maxu, minu, mins sseOpcode + switch lane { + case ssa.VecLaneI8x16: + eq, gt, maxu, minu, mins = sseOpcodePcmpeqb, sseOpcodePcmpgtb, sseOpcodePmaxub, sseOpcodePminub, sseOpcodePminsb + case ssa.VecLaneI16x8: + eq, gt, maxu, minu, mins = sseOpcodePcmpeqw, sseOpcodePcmpgtw, sseOpcodePmaxuw, sseOpcodePminuw, sseOpcodePminsw + case ssa.VecLaneI32x4: + eq, gt, maxu, minu, mins = sseOpcodePcmpeqd, sseOpcodePcmpgtd, sseOpcodePmaxud, sseOpcodePminud, sseOpcodePminsd + case ssa.VecLaneI64x2: + eq, gt = sseOpcodePcmpeqq, sseOpcodePcmpgtq + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + tmp := m.c.AllocateVReg(ssa.TypeV128) + var op operand + switch c { + case ssa.IntegerCmpCondSignedLessThanOrEqual: + if lane == ssa.VecLaneI64x2 { + x := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + // Copy x to tmp. + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, x, tmp)) + op = m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + } else { + y := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + // Copy y to tmp. 
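+			// (There is no packed less-or-equal comparison; for these lanes
+			// x <= y is computed below as pcmpeq(min_s(x, y), x), with y held
+			// in tmp and x used as the second operand.)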
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, y, tmp)) + op = m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + } + case ssa.IntegerCmpCondSignedGreaterThanOrEqual: + if lane == ssa.VecLaneI64x2 { + y := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + // Copy y to tmp. + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, y, tmp)) + op = m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + } else { + x := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + // Copy x to tmp. + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, x, tmp)) + op = m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + } + case ssa.IntegerCmpCondSignedLessThan, ssa.IntegerCmpCondUnsignedLessThan, ssa.IntegerCmpCondUnsignedLessThanOrEqual: + y := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + // Copy y to tmp. + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, y, tmp)) + op = m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + default: + x := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + // Copy x to tmp. + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, x, tmp)) + op = m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + } + + switch c { + case ssa.IntegerCmpCondEqual: + m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) + case ssa.IntegerCmpCondNotEqual: + // First we compare for equality. + m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) + // Then flip the bits. To do so, we set all bits on tmp2. + tmp2 := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2)) + m.insert(m.allocateInstr().asXmmRmR(eq, newOperandReg(tmp2), tmp2)) + // And then xor with tmp. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp)) + case ssa.IntegerCmpCondSignedGreaterThan, ssa.IntegerCmpCondSignedLessThan: + m.insert(m.allocateInstr().asXmmRmR(gt, op, tmp)) + case ssa.IntegerCmpCondSignedGreaterThanOrEqual, ssa.IntegerCmpCondSignedLessThanOrEqual: + if lane == ssa.VecLaneI64x2 { + m.insert(m.allocateInstr().asXmmRmR(gt, op, tmp)) + // Then flip the bits. To do so, we set all bits on tmp2. + tmp2 := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2)) + m.insert(m.allocateInstr().asXmmRmR(eq, newOperandReg(tmp2), tmp2)) + // And then xor with tmp. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp)) + } else { + // First take min of x and y. + m.insert(m.allocateInstr().asXmmRmR(mins, op, tmp)) + // Then compare for equality. + m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) + } + case ssa.IntegerCmpCondUnsignedGreaterThan, ssa.IntegerCmpCondUnsignedLessThan: + // First maxu of x and y. + m.insert(m.allocateInstr().asXmmRmR(maxu, op, tmp)) + // Then compare for equality. + m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) + // Then flip the bits. To do so, we set all bits on tmp2. + tmp2 := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2)) + m.insert(m.allocateInstr().asXmmRmR(eq, newOperandReg(tmp2), tmp2)) + // And then xor with tmp. 
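+		// (This XOR with the all-ones tmp2 is the concluding NOT: SSE has no
+		// packed unsigned comparison, so the strict form a > b is derived as
+		// NOT(max_u(a, b) == b), with the operand roles swapped above for the
+		// less-than variant.)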
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp)) + case ssa.IntegerCmpCondUnsignedGreaterThanOrEqual, ssa.IntegerCmpCondUnsignedLessThanOrEqual: + m.insert(m.allocateInstr().asXmmRmR(minu, op, tmp)) + m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) + default: + panic("BUG") + } + + m.copyTo(tmp, m.c.VRegOf(ret)) +} + +func (m *machine) lowerVbandnot(instr *ssa.Instruction, op sseOpcode) { + x, y := instr.Arg2() + xDef := m.c.ValueDefinition(x) + yDef := m.c.ValueDefinition(y) + rm, rn := m.getOperand_Reg(xDef), m.getOperand_Reg(yDef) + rd := m.c.VRegOf(instr.Return()) + + tmp := m.copyToTmp(rn.reg()) + + // pandn between rn, rm. + pand := m.allocateInstr() + pand.asXmmRmR(sseOpcodePandn, rm, tmp) + m.insert(pand) + + m.copyTo(tmp, rd) +} + +func (m *machine) lowerVbitselect(instr *ssa.Instruction) { + c, x, y := instr.SelectData() + xDef := m.c.ValueDefinition(x) + yDef := m.c.ValueDefinition(y) + rm, rn := m.getOperand_Reg(xDef), m.getOperand_Reg(yDef) + creg := m.getOperand_Reg(m.c.ValueDefinition(c)) + rd := m.c.VRegOf(instr.Return()) + + tmpC := m.copyToTmp(creg.reg()) + tmpX := m.copyToTmp(rm.reg()) + + // And between c, x (overwrites x). + pand := m.allocateInstr() + pand.asXmmRmR(sseOpcodePand, creg, tmpX) + m.insert(pand) + + // Andn between y, c (overwrites c). + pandn := m.allocateInstr() + pandn.asXmmRmR(sseOpcodePandn, rn, tmpC) + m.insert(pandn) + + por := m.allocateInstr() + por.asXmmRmR(sseOpcodePor, newOperandReg(tmpC), tmpX) + m.insert(por) + + m.copyTo(tmpX, rd) +} + +func (m *machine) lowerVFmin(instr *ssa.Instruction) { + x, y, lane := instr.Arg2WithLane() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rm := m.getOperand_Reg(m.c.ValueDefinition(y)) + rd := m.c.VRegOf(instr.Return()) + + var min, cmp, andn, or, srl /* shift right logical */ sseOpcode + var shiftNumToInverseNaN uint32 + if lane == ssa.VecLaneF32x4 { + min, cmp, andn, or, srl, shiftNumToInverseNaN = sseOpcodeMinps, sseOpcodeCmpps, sseOpcodeAndnps, sseOpcodeOrps, sseOpcodePsrld, 0xa + } else { + min, cmp, andn, or, srl, shiftNumToInverseNaN = sseOpcodeMinpd, sseOpcodeCmppd, sseOpcodeAndnpd, sseOpcodeOrpd, sseOpcodePsrlq, 0xd + } + + tmp1 := m.copyToTmp(rn.reg()) + tmp2 := m.copyToTmp(rm.reg()) + + // tmp1=min(rn, rm) + minIns1 := m.allocateInstr() + minIns1.asXmmRmR(min, rn, tmp2) + m.insert(minIns1) + + // tmp2=min(rm, rn) + minIns2 := m.allocateInstr() + minIns2.asXmmRmR(min, rm, tmp1) + m.insert(minIns2) + + // tmp3:=tmp1=min(rn, rm) + tmp3 := m.copyToTmp(tmp1) + + // tmp1 = -0 if (rn == -0 || rm == -0) && rn != NaN && rm !=NaN + // NaN if rn == NaN || rm == NaN + // min(rm, rm) otherwise + orIns := m.allocateInstr() + orIns.asXmmRmR(or, newOperandReg(tmp2), tmp1) + m.insert(orIns) + + // tmp3 is originally min(rn,rm). 
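+	// The remaining instructions fix up the two cases that MINPS/MINPD alone
+	// do not handle for Wasm: the OR above already forces the sign bit when
+	// the two lanes are +0 and -0 (so -0 is returned), and the UNORD compare
+	// below marks the NaN lanes so they can be turned into a quiet NaN at the
+	// end.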
+ // tmp3 = 0^ (set all bits) if rn == NaN || rm == NaN + // 0 otherwise + cmpIns := m.allocateInstr() + cmpIns.asXmmRmRImm(cmp, uint8(cmpPredUNORD_Q), newOperandReg(tmp2), tmp3) + m.insert(cmpIns) + + // tmp1 = -0 if (rn == -0 || rm == -0) && rn != NaN && rm !=NaN + // ^0 if rn == NaN || rm == NaN + // min(v1, v2) otherwise + orIns2 := m.allocateInstr() + orIns2.asXmmRmR(or, newOperandReg(tmp3), tmp1) + m.insert(orIns2) + + // tmp3 = set all bits on the mantissa bits + // 0 otherwise + shift := m.allocateInstr() + shift.asXmmRmiReg(srl, newOperandImm32(shiftNumToInverseNaN), tmp3) + m.insert(shift) + + // tmp3 = tmp1 and !tmp3 + // = -0 if (rn == -0 || rm == -0) && rn != NaN && rm !=NaN + // set all bits on exponential and sign bit (== NaN) if rn == NaN || rm == NaN + // min(rn, rm) otherwise + andnIns := m.allocateInstr() + andnIns.asXmmRmR(andn, newOperandReg(tmp1), tmp3) + m.insert(andnIns) + + m.copyTo(tmp3, rd) +} + +func (m *machine) lowerVFmax(instr *ssa.Instruction) { + x, y, lane := instr.Arg2WithLane() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rm := m.getOperand_Reg(m.c.ValueDefinition(y)) + rd := m.c.VRegOf(instr.Return()) + + var max, cmp, andn, or, xor, sub, srl /* shift right logical */ sseOpcode + var shiftNumToInverseNaN uint32 + if lane == ssa.VecLaneF32x4 { + max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = sseOpcodeMaxps, sseOpcodeCmpps, sseOpcodeAndnps, sseOpcodeOrps, sseOpcodeXorps, sseOpcodeSubps, sseOpcodePsrld, 0xa + } else { + max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = sseOpcodeMaxpd, sseOpcodeCmppd, sseOpcodeAndnpd, sseOpcodeOrpd, sseOpcodeXorpd, sseOpcodeSubpd, sseOpcodePsrlq, 0xd + } + + tmp0 := m.copyToTmp(rm.reg()) + tmp1 := m.copyToTmp(rn.reg()) + + // tmp0=max(rn, rm) + maxIns1 := m.allocateInstr() + maxIns1.asXmmRmR(max, rn, tmp0) + m.insert(maxIns1) + + // tmp1=max(rm, rn) + maxIns2 := m.allocateInstr() + maxIns2.asXmmRmR(max, rm, tmp1) + m.insert(maxIns2) + + // tmp2=max(rm, rn) + tmp2 := m.copyToTmp(tmp1) + + // tmp2 = -0 if (rn == -0 && rm == 0) || (rn == 0 && rm == -0) + // 0 if (rn == 0 && rm == 0) + // -0 if (rn == -0 && rm == -0) + // v1^v2 if rn == NaN || rm == NaN + // 0 otherwise + xorInstr := m.allocateInstr() + xorInstr.asXmmRmR(xor, newOperandReg(tmp0), tmp2) + m.insert(xorInstr) + // tmp1 = -0 if (rn == -0 && rm == 0) || (rn == 0 && rm == -0) + // 0 if (rn == 0 && rm == 0) + // -0 if (rn == -0 && rm == -0) + // NaN if rn == NaN || rm == NaN + // max(v1, v2) otherwise + orInstr := m.allocateInstr() + orInstr.asXmmRmR(or, newOperandReg(tmp2), tmp1) + m.insert(orInstr) + + tmp3 := m.copyToTmp(tmp1) + + // tmp3 = 0 if (rn == -0 && rm == 0) || (rn == 0 && rm == -0) || (rn == 0 && rm == 0) + // -0 if (rn == -0 && rm == -0) + // NaN if rn == NaN || rm == NaN + // max(v1, v2) otherwise + // + // Note: -0 - (-0) = 0 (!= -0) in floating point operation. 
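+	// The subtraction below relies on exactly that: in the mixed +0/-0 case
+	// tmp3 becomes +0, which is what Wasm fmax must return, while lanes where
+	// both inputs are -0 keep -0 and ordinary lanes are effectively unchanged;
+	// NaN lanes are repaired afterwards.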
+ subIns := m.allocateInstr() + subIns.asXmmRmR(sub, newOperandReg(tmp2), tmp3) + m.insert(subIns) + + // tmp1 = 0^ if rn == NaN || rm == NaN + cmpIns := m.allocateInstr() + cmpIns.asXmmRmRImm(cmp, uint8(cmpPredUNORD_Q), newOperandReg(tmp1), tmp1) + m.insert(cmpIns) + + // tmp1 = set all bits on the mantissa bits + // 0 otherwise + shift := m.allocateInstr() + shift.asXmmRmiReg(srl, newOperandImm32(shiftNumToInverseNaN), tmp1) + m.insert(shift) + + andnIns := m.allocateInstr() + andnIns.asXmmRmR(andn, newOperandReg(tmp3), tmp1) + m.insert(andnIns) + + m.copyTo(tmp1, rd) +} + +func (m *machine) lowerVFabs(instr *ssa.Instruction) { + x, lane := instr.ArgWithLane() + rm := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + + tmp := m.c.AllocateVReg(ssa.TypeV128) + + def := m.allocateInstr() + def.asDefineUninitializedReg(tmp) + m.insert(def) + + // Set all bits on tmp. + pcmp := m.allocateInstr() + pcmp.asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp), tmp) + m.insert(pcmp) + + switch lane { + case ssa.VecLaneF32x4: + // Shift right packed single floats by 1 to clear the sign bits. + shift := m.allocateInstr() + shift.asXmmRmiReg(sseOpcodePsrld, newOperandImm32(1), tmp) + m.insert(shift) + // Clear the sign bit of rm. + andp := m.allocateInstr() + andp.asXmmRmR(sseOpcodeAndpd, rm, tmp) + m.insert(andp) + case ssa.VecLaneF64x2: + // Shift right packed single floats by 1 to clear the sign bits. + shift := m.allocateInstr() + shift.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(1), tmp) + m.insert(shift) + // Clear the sign bit of rm. + andp := m.allocateInstr() + andp.asXmmRmR(sseOpcodeAndps, rm, tmp) + m.insert(andp) + } + + m.copyTo(tmp, rd) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_pro_epi_logue.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_pro_epi_logue.go new file mode 100644 index 000000000..8fa974c66 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_pro_epi_logue.go @@ -0,0 +1,304 @@ +package amd64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" +) + +// PostRegAlloc implements backend.Machine. +func (m *machine) PostRegAlloc() { + m.setupPrologue() + m.postRegAlloc() +} + +func (m *machine) setupPrologue() { + cur := m.ectx.RootInstr + prevInitInst := cur.next + + // At this point, we have the stack layout as follows: + // + // (high address) + // +-----------------+ <----- RBP (somewhere in the middle of the stack) + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | Return Addr | + // RSP ----> +-----------------+ + // (low address) + + // First, we push the RBP, and update the RBP to the current RSP. + // + // (high address) (high address) + // RBP ----> +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | ====> | ....... 
| + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | Return Addr | | Return Addr | + // RSP ----> +-----------------+ | Caller_RBP | + // (low address) +-----------------+ <----- RSP, RBP + // + cur = m.setupRBPRSP(cur) + + if !m.stackBoundsCheckDisabled { + cur = m.insertStackBoundsCheck(m.requiredStackSize(), cur) + } + + // + // (high address) + // +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | | ....... | + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | xxxxx | | xxxxx | + // | Return Addr | | Return Addr | + // | Caller_RBP | ====> | Caller_RBP | + // RBP,RSP->+-----------------+ +-----------------+ <----- RBP + // (low address) | clobbered M | + // | clobbered 1 | + // | ........... | + // | clobbered 0 | + // +-----------------+ <----- RSP + // + if regs := m.clobberedRegs; len(regs) > 0 { + for i := range regs { + r := regs[len(regs)-1-i] // Reverse order. + if r.RegType() == regalloc.RegTypeInt { + cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(r))) + } else { + // Push the XMM register is not supported by the PUSH instruction. + cur = m.addRSP(-16, cur) + push := m.allocateInstr().asXmmMovRM( + sseOpcodeMovdqu, r, newOperandMem(m.newAmodeImmReg(0, rspVReg)), + ) + cur = linkInstr(cur, push) + } + } + } + + if size := m.spillSlotSize; size > 0 { + // Simply decrease the RSP to allocate the spill slots. + // sub $size, %rsp + cur = linkInstr(cur, m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(uint32(size)), rspVReg, true)) + + // At this point, we have the stack layout as follows: + // + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | ReturnAddress | + // | Caller_RBP | + // +-----------------+ <--- RBP + // | clobbered M | + // | ............ | + // | clobbered 1 | + // | clobbered 0 | + // | spill slot N | + // | ............ | + // | spill slot 0 | + // +-----------------+ <--- RSP + // (low address) + } + + linkInstr(cur, prevInitInst) +} + +// postRegAlloc does multiple things while walking through the instructions: +// 1. Inserts the epilogue code. +// 2. Removes the redundant copy instruction. +// 3. Inserts the dec/inc RSP instruction right before/after the call instruction. +// 4. Lowering that is supposed to be done after regalloc. 
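+//
+// Each pseudo-instruction case below follows the same pattern: it resets
+// ectx.PendingInstructions, lowers the placeholder into it, and then splices
+// the pending instructions into the linked list in place of the placeholder.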
+func (m *machine) postRegAlloc() { + ectx := m.ectx + for cur := ectx.RootInstr; cur != nil; cur = cur.next { + switch k := cur.kind; k { + case ret: + m.setupEpilogueAfter(cur.prev) + continue + case fcvtToSintSequence, fcvtToUintSequence: + m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0] + if k == fcvtToSintSequence { + m.lowerFcvtToSintSequenceAfterRegalloc(cur) + } else { + m.lowerFcvtToUintSequenceAfterRegalloc(cur) + } + prev := cur.prev + next := cur.next + cur := prev + for _, instr := range m.ectx.PendingInstructions { + cur = linkInstr(cur, instr) + } + linkInstr(cur, next) + continue + case xmmCMov: + m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0] + m.lowerXmmCmovAfterRegAlloc(cur) + prev := cur.prev + next := cur.next + cur := prev + for _, instr := range m.ectx.PendingInstructions { + cur = linkInstr(cur, instr) + } + linkInstr(cur, next) + continue + case idivRemSequence: + m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0] + m.lowerIDivRemSequenceAfterRegAlloc(cur) + prev := cur.prev + next := cur.next + cur := prev + for _, instr := range m.ectx.PendingInstructions { + cur = linkInstr(cur, instr) + } + linkInstr(cur, next) + continue + case call, callIndirect: + // At this point, reg alloc is done, therefore we can safely insert dec/inc RPS instruction + // right before/after the call instruction. If this is done before reg alloc, the stack slot + // can point to the wrong location and therefore results in a wrong value. + call := cur + next := call.next + _, _, _, _, size := backend.ABIInfoFromUint64(call.u2) + if size > 0 { + dec := m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(size), rspVReg, true) + linkInstr(call.prev, dec) + linkInstr(dec, call) + inc := m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(size), rspVReg, true) + linkInstr(call, inc) + linkInstr(inc, next) + } + continue + } + + // Removes the redundant copy instruction. + if cur.IsCopy() && cur.op1.reg().RealReg() == cur.op2.reg().RealReg() { + prev, next := cur.prev, cur.next + // Remove the copy instruction. + prev.next = next + if next != nil { + next.prev = prev + } + } + } +} + +func (m *machine) setupEpilogueAfter(cur *instruction) { + prevNext := cur.next + + // At this point, we have the stack layout as follows: + // + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | ReturnAddress | + // | Caller_RBP | + // +-----------------+ <--- RBP + // | clobbered M | + // | ............ | + // | clobbered 1 | + // | clobbered 0 | + // | spill slot N | + // | ............ | + // | spill slot 0 | + // +-----------------+ <--- RSP + // (low address) + + if size := m.spillSlotSize; size > 0 { + // Simply increase the RSP to free the spill slots. + // add $size, %rsp + cur = linkInstr(cur, m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(uint32(size)), rspVReg, true)) + } + + // + // (high address) + // +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | | ....... | + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | ReturnAddress | | ReturnAddress | + // | Caller_RBP | | Caller_RBP | + // RBP ---> +-----------------+ ========> +-----------------+ <---- RSP, RBP + // | clobbered M | + // | ............ 
| + // | clobbered 1 | + // | clobbered 0 | + // RSP ---> +-----------------+ + // (low address) + // + if regs := m.clobberedRegs; len(regs) > 0 { + for _, r := range regs { + if r.RegType() == regalloc.RegTypeInt { + cur = linkInstr(cur, m.allocateInstr().asPop64(r)) + } else { + // Pop the XMM register is not supported by the POP instruction. + pop := m.allocateInstr().asXmmUnaryRmR( + sseOpcodeMovdqu, newOperandMem(m.newAmodeImmReg(0, rspVReg)), r, + ) + cur = linkInstr(cur, pop) + cur = m.addRSP(16, cur) + } + } + } + + // Now roll back the RSP to RBP, and pop the caller's RBP. + cur = m.revertRBPRSP(cur) + + linkInstr(cur, prevNext) +} + +func (m *machine) addRSP(offset int32, cur *instruction) *instruction { + if offset == 0 { + return cur + } + opcode := aluRmiROpcodeAdd + if offset < 0 { + opcode = aluRmiROpcodeSub + offset = -offset + } + return linkInstr(cur, m.allocateInstr().asAluRmiR(opcode, newOperandImm32(uint32(offset)), rspVReg, true)) +} + +func (m *machine) setupRBPRSP(cur *instruction) *instruction { + cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(rbpVReg))) + cur = linkInstr(cur, m.allocateInstr().asMovRR(rspVReg, rbpVReg, true)) + return cur +} + +func (m *machine) revertRBPRSP(cur *instruction) *instruction { + cur = linkInstr(cur, m.allocateInstr().asMovRR(rbpVReg, rspVReg, true)) + cur = linkInstr(cur, m.allocateInstr().asPop64(rbpVReg)) + return cur +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_regalloc.go new file mode 100644 index 000000000..0bb28ee9e --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_regalloc.go @@ -0,0 +1,153 @@ +package amd64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// InsertMoveBefore implements backend.RegAllocFunctionMachine. +func (m *machine) InsertMoveBefore(dst, src regalloc.VReg, instr *instruction) { + typ := src.RegType() + if typ != dst.RegType() { + panic("BUG: src and dst must have the same type") + } + + mov := m.allocateInstr() + if typ == regalloc.RegTypeInt { + mov.asMovRR(src, dst, true) + } else { + mov.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(src), dst) + } + + cur := instr.prev + prevNext := cur.next + cur = linkInstr(cur, mov) + linkInstr(cur, prevNext) +} + +// InsertStoreRegisterAt implements backend.RegAllocFunctionMachine. 
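+//
+// It stores the value of v, which must be backed by a real register, into its
+// spill slot (addressed relative to RSP) immediately before or after instr,
+// depending on the `after` flag.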
+func (m *machine) InsertStoreRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction { + if !v.IsRealReg() { + panic("BUG: VReg must be backed by real reg to be stored") + } + + typ := m.c.TypeOf(v) + + var prevNext, cur *instruction + if after { + cur, prevNext = instr, instr.next + } else { + cur, prevNext = instr.prev, instr + } + + offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size()) + store := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmReg(uint32(offsetFromSP), rspVReg)) + switch typ { + case ssa.TypeI32: + store.asMovRM(v, mem, 4) + case ssa.TypeI64: + store.asMovRM(v, mem, 8) + case ssa.TypeF32: + store.asXmmMovRM(sseOpcodeMovss, v, mem) + case ssa.TypeF64: + store.asXmmMovRM(sseOpcodeMovsd, v, mem) + case ssa.TypeV128: + store.asXmmMovRM(sseOpcodeMovdqu, v, mem) + } + + cur = linkInstr(cur, store) + return linkInstr(cur, prevNext) +} + +// InsertReloadRegisterAt implements backend.RegAllocFunctionMachine. +func (m *machine) InsertReloadRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction { + if !v.IsRealReg() { + panic("BUG: VReg must be backed by real reg to be stored") + } + + typ := m.c.TypeOf(v) + var prevNext, cur *instruction + if after { + cur, prevNext = instr, instr.next + } else { + cur, prevNext = instr.prev, instr + } + + // Load the value to the temporary. + load := m.allocateInstr() + offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size()) + a := newOperandMem(m.newAmodeImmReg(uint32(offsetFromSP), rspVReg)) + switch typ { + case ssa.TypeI32: + load.asMovzxRmR(extModeLQ, a, v) + case ssa.TypeI64: + load.asMov64MR(a, v) + case ssa.TypeF32: + load.asXmmUnaryRmR(sseOpcodeMovss, a, v) + case ssa.TypeF64: + load.asXmmUnaryRmR(sseOpcodeMovsd, a, v) + case ssa.TypeV128: + load.asXmmUnaryRmR(sseOpcodeMovdqu, a, v) + default: + panic("BUG") + } + + cur = linkInstr(cur, load) + return linkInstr(cur, prevNext) +} + +// ClobberedRegisters implements backend.RegAllocFunctionMachine. +func (m *machine) ClobberedRegisters(regs []regalloc.VReg) { + m.clobberedRegs = append(m.clobberedRegs[:0], regs...) +} + +// Swap implements backend.RegAllocFunctionMachine. +func (m *machine) Swap(cur *instruction, x1, x2, tmp regalloc.VReg) { + if x1.RegType() == regalloc.RegTypeInt { + prevNext := cur.next + xc := m.allocateInstr().asXCHG(x1, newOperandReg(x2), 8) + cur = linkInstr(cur, xc) + linkInstr(cur, prevNext) + } else { + if tmp.Valid() { + prevNext := cur.next + m.InsertMoveBefore(tmp, x1, prevNext) + m.InsertMoveBefore(x1, x2, prevNext) + m.InsertMoveBefore(x2, tmp, prevNext) + } else { + prevNext := cur.next + r2 := x2.RealReg() + // Temporarily spill x1 to stack. + cur = m.InsertStoreRegisterAt(x1, cur, true).prev + // Then move x2 to x1. + cur = linkInstr(cur, m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqa, newOperandReg(x2), x1)) + linkInstr(cur, prevNext) + // Then reload the original value on x1 from stack to r2. + m.InsertReloadRegisterAt(x1.SetRealReg(r2), cur, true) + } + } +} + +// LastInstrForInsertion implements backend.RegAllocFunctionMachine. +func (m *machine) LastInstrForInsertion(begin, end *instruction) *instruction { + cur := end + for cur.kind == nop0 { + cur = cur.prev + if cur == begin { + return end + } + } + switch cur.kind { + case jmp: + return cur + default: + return end + } +} + +// SSABlockLabel implements backend.RegAllocFunctionMachine. 
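+//
+// It returns the backend label assigned to the machine basic block lowered
+// from the given SSA basic block.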
+func (m *machine) SSABlockLabel(id ssa.BasicBlockID) backend.Label { + return m.ectx.SsaBlockIDToLabels[id] +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_vec.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_vec.go new file mode 100644 index 000000000..539a8b754 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_vec.go @@ -0,0 +1,992 @@ +package amd64 + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +var swizzleMask = [16]byte{ + 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, + 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, +} + +func (m *machine) lowerSwizzle(x, y ssa.Value, ret ssa.Value) { + masklabel := m.getOrAllocateConstLabel(&m.constSwizzleMaskConstIndex, swizzleMask[:]) + + // Load mask to maskReg. + maskReg := m.c.AllocateVReg(ssa.TypeV128) + loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(masklabel)), maskReg) + m.insert(loadMask) + + // Copy x and y to tmp registers. + xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + tmpDst := m.copyToTmp(xx.reg()) + yy := m.getOperand_Reg(m.c.ValueDefinition(y)) + tmpX := m.copyToTmp(yy.reg()) + + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddusb, newOperandReg(maskReg), tmpX)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmpX), tmpDst)) + + // Copy the result to the destination register. + m.copyTo(tmpDst, m.c.VRegOf(ret)) +} + +func (m *machine) lowerInsertLane(x, y ssa.Value, index byte, ret ssa.Value, lane ssa.VecLane) { + // Copy x to tmp. + tmpDst := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, m.getOperand_Mem_Reg(m.c.ValueDefinition(x)), tmpDst)) + + yy := m.getOperand_Reg(m.c.ValueDefinition(y)) + switch lane { + case ssa.VecLaneI8x16: + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, index, yy, tmpDst)) + case ssa.VecLaneI16x8: + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, index, yy, tmpDst)) + case ssa.VecLaneI32x4: + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, index, yy, tmpDst)) + case ssa.VecLaneI64x2: + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, index, yy, tmpDst)) + case ssa.VecLaneF32x4: + // In INSERTPS instruction, the destination index is encoded at 4 and 5 bits of the argument. + // See https://www.felixcloutier.com/x86/insertps + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeInsertps, index<<4, yy, tmpDst)) + case ssa.VecLaneF64x2: + if index == 0 { + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, yy, tmpDst)) + } else { + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMovlhps, yy, tmpDst)) + } + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + m.copyTo(tmpDst, m.c.VRegOf(ret)) +} + +func (m *machine) lowerExtractLane(x ssa.Value, index byte, signed bool, ret ssa.Value, lane ssa.VecLane) { + // Pextr variants are used to extract a lane from a vector register. 
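+	// PEXTRB/PEXTRW/PEXTRD/PEXTRQ write the selected lane into a general
+	// purpose register; for the narrow integer lanes the MOVSX/MOVZX that
+	// follows widens the result with the signedness requested by
+	// extract_lane_s / extract_lane_u.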
+ xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + + tmpDst := m.c.AllocateVReg(ret.Type()) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst)) + switch lane { + case ssa.VecLaneI8x16: + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrb, index, xx, tmpDst)) + if signed { + m.insert(m.allocateInstr().asMovsxRmR(extModeBL, newOperandReg(tmpDst), tmpDst)) + } else { + m.insert(m.allocateInstr().asMovzxRmR(extModeBL, newOperandReg(tmpDst), tmpDst)) + } + case ssa.VecLaneI16x8: + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrw, index, xx, tmpDst)) + if signed { + m.insert(m.allocateInstr().asMovsxRmR(extModeWL, newOperandReg(tmpDst), tmpDst)) + } else { + m.insert(m.allocateInstr().asMovzxRmR(extModeWL, newOperandReg(tmpDst), tmpDst)) + } + case ssa.VecLaneI32x4: + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrd, index, xx, tmpDst)) + case ssa.VecLaneI64x2: + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, index, xx, tmpDst)) + case ssa.VecLaneF32x4: + if index == 0 { + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovss, xx, tmpDst)) + } else { + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, index, xx, tmpDst)) + } + case ssa.VecLaneF64x2: + if index == 0 { + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, xx, tmpDst)) + } else { + m.copyTo(xx.reg(), tmpDst) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0b00_00_11_10, newOperandReg(tmpDst), tmpDst)) + } + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + m.copyTo(tmpDst, m.c.VRegOf(ret)) +} + +var sqmulRoundSat = [16]byte{ + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, +} + +func (m *machine) lowerSqmulRoundSat(x, y, ret ssa.Value) { + // See https://github.com/WebAssembly/simd/pull/365 for the following logic. + maskLabel := m.getOrAllocateConstLabel(&m.constSqmulRoundSatIndex, sqmulRoundSat[:]) + + tmp := m.c.AllocateVReg(ssa.TypeV128) + loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), tmp) + m.insert(loadMask) + + xx, yy := m.getOperand_Reg(m.c.ValueDefinition(x)), m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + tmpX := m.copyToTmp(xx.reg()) + + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmulhrsw, yy, tmpX)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmpX), tmp)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmpX)) + + m.copyTo(tmpX, m.c.VRegOf(ret)) +} + +func (m *machine) lowerVUshr(x, y, ret ssa.Value, lane ssa.VecLane) { + switch lane { + case ssa.VecLaneI8x16: + m.lowerVUshri8x16(x, y, ret) + case ssa.VecLaneI16x8, ssa.VecLaneI32x4, ssa.VecLaneI64x2: + m.lowerShr(x, y, ret, lane, false) + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } +} + +// i8x16LogicalSHRMaskTable is necessary for emulating non-existent packed bytes logical right shifts on amd64. +// The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits. +var i8x16LogicalSHRMaskTable = [8 * 16]byte{ // (the number of possible shift amount 0, 1, ..., 7.) * 16 bytes. 
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // for 1 shift + 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, // for 2 shift + 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, // for 3 shift + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, // for 4 shift + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, // for 5 shift + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, // for 6 shift + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // for 7 shift +} + +func (m *machine) lowerVUshri8x16(x, y, ret ssa.Value) { + tmpGpReg := m.c.AllocateVReg(ssa.TypeI32) + // Load the modulo 8 mask to tmpReg. + m.lowerIconst(tmpGpReg, 0x7, false) + // Take the modulo 8 of the shift amount. + shiftAmt := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)) + m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, shiftAmt, tmpGpReg, false)) + + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xx := m.copyToTmp(_xx.reg()) + + vecTmp := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), vecTmp, false)) + m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrlw, newOperandReg(vecTmp), xx)) + + maskTableLabel := m.getOrAllocateConstLabel(&m.constI8x16LogicalSHRMaskTableIndex, i8x16LogicalSHRMaskTable[:]) + base := m.c.AllocateVReg(ssa.TypeI64) + lea := m.allocateInstr().asLEA(newOperandLabel(maskTableLabel), base) + m.insert(lea) + + // Shift tmpGpReg by 4 to multiply the shift amount by 16. + m.insert(m.allocateInstr().asShiftR(shiftROpShiftLeft, newOperandImm32(4), tmpGpReg, false)) + + mem := m.newAmodeRegRegShift(0, base, tmpGpReg, 0) + loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(mem), vecTmp) + m.insert(loadMask) + + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePand, newOperandReg(vecTmp), xx)) + m.copyTo(xx, m.c.VRegOf(ret)) +} + +func (m *machine) lowerVSshr(x, y, ret ssa.Value, lane ssa.VecLane) { + switch lane { + case ssa.VecLaneI8x16: + m.lowerVSshri8x16(x, y, ret) + case ssa.VecLaneI16x8, ssa.VecLaneI32x4: + m.lowerShr(x, y, ret, lane, true) + case ssa.VecLaneI64x2: + m.lowerVSshri64x2(x, y, ret) + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } +} + +func (m *machine) lowerVSshri8x16(x, y, ret ssa.Value) { + shiftAmtReg := m.c.AllocateVReg(ssa.TypeI32) + // Load the modulo 8 mask to tmpReg. + m.lowerIconst(shiftAmtReg, 0x7, false) + // Take the modulo 8 of the shift amount. + shiftAmt := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)) + m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, shiftAmt, shiftAmtReg, false)) + + // Copy the x value to two temporary registers. 
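+ // There is no packed arithmetic shift for bytes on amd64, so each byte is widened into
+ // a word (duplicated into both halves via PUNPCKLBW/PUNPCKHBW below), shifted with PSRAW
+ // by the amount plus 8, and re-packed with PACKSSWB. For example, for b = 0x90 (-112)
+ // and shift amount 2, the widened word 0x9090 shifted arithmetically by 10 gives 0xffe4
+ // (-28), which is exactly -112 >> 2.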
+ _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xx := m.copyToTmp(_xx.reg()) + vecTmp := m.c.AllocateVReg(ssa.TypeV128) + m.copyTo(xx, vecTmp) + + // Assuming that we have + // xx = [b1, ..., b16] + // vecTmp = [b1, ..., b16] + // at this point, then we use PUNPCKLBW and PUNPCKHBW to produce: + // xx = [b1, b1, b2, b2, ..., b8, b8] + // vecTmp = [b9, b9, b10, b10, ..., b16, b16] + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePunpcklbw, newOperandReg(xx), xx)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePunpckhbw, newOperandReg(vecTmp), vecTmp)) + + // Adding 8 to the shift amount, and then move the amount to vecTmp2. + vecTmp2 := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(8), shiftAmtReg, false)) + m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(shiftAmtReg), vecTmp2, false)) + + // Perform the word packed arithmetic right shifts on vreg and vecTmp. + // This changes these two registers as: + // xx = [xxx, b1 >> s, xxx, b2 >> s, ..., xxx, b8 >> s] + // vecTmp = [xxx, b9 >> s, xxx, b10 >> s, ..., xxx, b16 >> s] + // where xxx is 1 or 0 depending on each byte's sign, and ">>" is the arithmetic shift on a byte. + m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsraw, newOperandReg(vecTmp2), xx)) + m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsraw, newOperandReg(vecTmp2), vecTmp)) + + // Finally, we can get the result by packing these two word vectors. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePacksswb, newOperandReg(vecTmp), xx)) + + m.copyTo(xx, m.c.VRegOf(ret)) +} + +func (m *machine) lowerVSshri64x2(x, y, ret ssa.Value) { + // Load the shift amount to RCX. + shiftAmt := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, shiftAmt, rcxVReg)) + + tmpGp := m.c.AllocateVReg(ssa.TypeI64) + + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xxReg := m.copyToTmp(_xx.reg()) + + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, 0, newOperandReg(xxReg), tmpGp)) + m.insert(m.allocateInstr().asShiftR(shiftROpShiftRightArithmetic, newOperandReg(rcxVReg), tmpGp, true)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, newOperandReg(tmpGp), xxReg)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, 1, newOperandReg(xxReg), tmpGp)) + m.insert(m.allocateInstr().asShiftR(shiftROpShiftRightArithmetic, newOperandReg(rcxVReg), tmpGp, true)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, newOperandReg(tmpGp), xxReg)) + + m.copyTo(xxReg, m.c.VRegOf(ret)) +} + +func (m *machine) lowerShr(x, y, ret ssa.Value, lane ssa.VecLane, signed bool) { + var modulo uint64 + var shiftOp sseOpcode + switch lane { + case ssa.VecLaneI16x8: + modulo = 0xf + if signed { + shiftOp = sseOpcodePsraw + } else { + shiftOp = sseOpcodePsrlw + } + case ssa.VecLaneI32x4: + modulo = 0x1f + if signed { + shiftOp = sseOpcodePsrad + } else { + shiftOp = sseOpcodePsrld + } + case ssa.VecLaneI64x2: + modulo = 0x3f + if signed { + panic("BUG") + } + shiftOp = sseOpcodePsrlq + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xx := m.copyToTmp(_xx.reg()) + + tmpGpReg := m.c.AllocateVReg(ssa.TypeI32) + // Load the modulo 8 mask to tmpReg. + m.lowerIconst(tmpGpReg, modulo, false) + // Take the modulo 8 of the shift amount. 
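+ // The masking matters because the x86 packed shifts saturate rather than wrap: a count
+ // of lane-width or more zeroes (or, for the arithmetic forms, sign-fills) every lane,
+ // whereas Wasm defines the shift count modulo the lane width.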
+ m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, + m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)), tmpGpReg, false)) + // And move it to a xmm register. + tmpVec := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), tmpVec, false)) + + // Then do the actual shift. + m.insert(m.allocateInstr().asXmmRmiReg(shiftOp, newOperandReg(tmpVec), xx)) + + m.copyTo(xx, m.c.VRegOf(ret)) +} + +func (m *machine) lowerVIshl(x, y, ret ssa.Value, lane ssa.VecLane) { + var modulo uint64 + var shiftOp sseOpcode + var isI8x16 bool + switch lane { + case ssa.VecLaneI8x16: + isI8x16 = true + modulo = 0x7 + shiftOp = sseOpcodePsllw + case ssa.VecLaneI16x8: + modulo = 0xf + shiftOp = sseOpcodePsllw + case ssa.VecLaneI32x4: + modulo = 0x1f + shiftOp = sseOpcodePslld + case ssa.VecLaneI64x2: + modulo = 0x3f + shiftOp = sseOpcodePsllq + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xx := m.copyToTmp(_xx.reg()) + + tmpGpReg := m.c.AllocateVReg(ssa.TypeI32) + // Load the modulo 8 mask to tmpReg. + m.lowerIconst(tmpGpReg, modulo, false) + // Take the modulo 8 of the shift amount. + m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, + m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)), tmpGpReg, false)) + // And move it to a xmm register. + tmpVec := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), tmpVec, false)) + + // Then do the actual shift. + m.insert(m.allocateInstr().asXmmRmiReg(shiftOp, newOperandReg(tmpVec), xx)) + + if isI8x16 { + maskTableLabel := m.getOrAllocateConstLabel(&m.constI8x16SHLMaskTableIndex, i8x16SHLMaskTable[:]) + base := m.c.AllocateVReg(ssa.TypeI64) + lea := m.allocateInstr().asLEA(newOperandLabel(maskTableLabel), base) + m.insert(lea) + + // Shift tmpGpReg by 4 to multiply the shift amount by 16. + m.insert(m.allocateInstr().asShiftR(shiftROpShiftLeft, newOperandImm32(4), tmpGpReg, false)) + + mem := m.newAmodeRegRegShift(0, base, tmpGpReg, 0) + loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(mem), tmpVec) + m.insert(loadMask) + + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePand, newOperandReg(tmpVec), xx)) + } + + m.copyTo(xx, m.c.VRegOf(ret)) +} + +// i8x16SHLMaskTable is necessary for emulating non-existent packed bytes left shifts on amd64. +// The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits. +var i8x16SHLMaskTable = [8 * 16]byte{ // (the number of possible shift amount 0, 1, ..., 7.) * 16 bytes. 
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, // for 1 shift + 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, // for 2 shift + 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, // for 3 shift + 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // for 4 shift + 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, // for 5 shift + 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, // for 6 shift + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, // for 7 shift +} + +func (m *machine) lowerVRound(x, ret ssa.Value, imm byte, _64 bool) { + xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + var round sseOpcode + if _64 { + round = sseOpcodeRoundpd + } else { + round = sseOpcodeRoundps + } + m.insert(m.allocateInstr().asXmmUnaryRmRImm(round, imm, xx, m.c.VRegOf(ret))) +} + +var ( + allOnesI8x16 = [16]byte{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1} + allOnesI16x8 = [16]byte{0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0} + extAddPairwiseI16x8uMask1 = [16]byte{0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80} + extAddPairwiseI16x8uMask2 = [16]byte{0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00} +) + +func (m *machine) lowerExtIaddPairwise(x, ret ssa.Value, srcLane ssa.VecLane, signed bool) { + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xx := m.copyToTmp(_xx.reg()) + switch srcLane { + case ssa.VecLaneI8x16: + allOneReg := m.c.AllocateVReg(ssa.TypeV128) + mask := m.getOrAllocateConstLabel(&m.constAllOnesI8x16Index, allOnesI8x16[:]) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), allOneReg)) + + var resultReg regalloc.VReg + if signed { + resultReg = allOneReg + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddubsw, newOperandReg(xx), resultReg)) + } else { + // Interpreter tmp (all ones) as signed byte meaning that all the multiply-add is unsigned. + resultReg = xx + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddubsw, newOperandReg(allOneReg), resultReg)) + } + m.copyTo(resultReg, m.c.VRegOf(ret)) + + case ssa.VecLaneI16x8: + if signed { + allOnesReg := m.c.AllocateVReg(ssa.TypeV128) + mask := m.getOrAllocateConstLabel(&m.constAllOnesI16x8Index, allOnesI16x8[:]) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), allOnesReg)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, newOperandReg(allOnesReg), xx)) + m.copyTo(xx, m.c.VRegOf(ret)) + } else { + maskReg := m.c.AllocateVReg(ssa.TypeV128) + mask := m.getOrAllocateConstLabel(&m.constExtAddPairwiseI16x8uMask1Index, extAddPairwiseI16x8uMask1[:]) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg)) + + // Flip the sign bits on xx. 
+ // + // Assuming that xx = [w1, ..., w8], now we have, + // xx[i] = int8(-w1) for i = 0...8 + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(maskReg), xx)) + + mask = m.getOrAllocateConstLabel(&m.constAllOnesI16x8Index, allOnesI16x8[:]) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg)) + + // For i = 0,..4 (as this results in i32x4 lanes), now we have + // xx[i] = int32(-wn + -w(n+1)) = int32(-(wn + w(n+1))) + // c.assembler.CompileRegisterToRegister(amd64.PMADDWD, tmp, vr) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, newOperandReg(maskReg), xx)) + + mask = m.getOrAllocateConstLabel(&m.constExtAddPairwiseI16x8uMask2Index, extAddPairwiseI16x8uMask2[:]) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg)) + + // vr[i] = int32(-(wn + w(n+1))) + int32(math.MaxInt16+1) = int32((wn + w(n+1))) = uint32(wn + w(n+1)). + // c.assembler.CompileRegisterToRegister(amd64.PADDD, tmp, vr) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddd, newOperandReg(maskReg), xx)) + + m.copyTo(xx, m.c.VRegOf(ret)) + } + default: + panic(fmt.Sprintf("invalid lane type: %s", srcLane)) + } +} + +func (m *machine) lowerWidenLow(x, ret ssa.Value, lane ssa.VecLane, signed bool) { + var sseOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + if signed { + sseOp = sseOpcodePmovsxbw + } else { + sseOp = sseOpcodePmovzxbw + } + case ssa.VecLaneI16x8: + if signed { + sseOp = sseOpcodePmovsxwd + } else { + sseOp = sseOpcodePmovzxwd + } + case ssa.VecLaneI32x4: + if signed { + sseOp = sseOpcodePmovsxdq + } else { + sseOp = sseOpcodePmovzxdq + } + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, xx, m.c.VRegOf(ret))) +} + +func (m *machine) lowerWidenHigh(x, ret ssa.Value, lane ssa.VecLane, signed bool) { + tmp := m.c.AllocateVReg(ssa.TypeV128) + xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + m.copyTo(xx.reg(), tmp) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePalignr, 8, newOperandReg(tmp), tmp)) + + var sseOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + if signed { + sseOp = sseOpcodePmovsxbw + } else { + sseOp = sseOpcodePmovzxbw + } + case ssa.VecLaneI16x8: + if signed { + sseOp = sseOpcodePmovsxwd + } else { + sseOp = sseOpcodePmovzxwd + } + case ssa.VecLaneI32x4: + if signed { + sseOp = sseOpcodePmovsxdq + } else { + sseOp = sseOpcodePmovzxdq + } + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, newOperandReg(tmp), m.c.VRegOf(ret))) +} + +func (m *machine) lowerLoadSplat(ptr ssa.Value, offset uint32, ret ssa.Value, lane ssa.VecLane) { + tmpDst, tmpGp := m.c.AllocateVReg(ssa.TypeV128), m.c.AllocateVReg(ssa.TypeI64) + am := newOperandMem(m.lowerToAddressMode(ptr, offset)) + + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst)) + switch lane { + case ssa.VecLaneI8x16: + m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, am, tmpGp)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, 0, newOperandReg(tmpGp), tmpDst)) + tmpZeroVec := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asZeros(tmpZeroVec)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmpZeroVec), tmpDst)) + case ssa.VecLaneI16x8: + m.insert(m.allocateInstr().asMovzxRmR(extModeWQ, am, tmpGp)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 0, 
newOperandReg(tmpGp), tmpDst))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 1, newOperandReg(tmpGp), tmpDst))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst))
+ case ssa.VecLaneI32x4:
+ m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, am, tmpGp))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, 0, newOperandReg(tmpGp), tmpDst))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst))
+ case ssa.VecLaneI64x2:
+ m.insert(m.allocateInstr().asMov64MR(am, tmpGp))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, newOperandReg(tmpGp), tmpDst))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, newOperandReg(tmpGp), tmpDst))
+ default:
+ panic(fmt.Sprintf("invalid lane type: %s", lane))
+ }
+
+ m.copyTo(tmpDst, m.c.VRegOf(ret))
+}
+
+var f64x2CvtFromIMask = [16]byte{
+ 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+}
+
+func (m *machine) lowerVFcvtFromInt(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
+ switch lane {
+ case ssa.VecLaneF32x4:
+ if signed {
+ xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, xx, m.c.VRegOf(ret)))
+ } else {
+ xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+ // Copy the value to two temporary registers.
+ tmp := m.copyToTmp(xx.reg())
+ tmp2 := m.copyToTmp(xx.reg())
+
+ // Clear the upper bits of each 32-bit element so that tmp keeps only the low bits, which convert to float32 exactly.
+ m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePslld, newOperandImm32(0xa), tmp))
+ m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(0xa), tmp))
+
+ // Subtract tmp (the low bits) from tmp2, so that tmp2 keeps only the upper bits.
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePsubd, newOperandReg(tmp), tmp2))
+
+ // Convert the low bits held in tmp.
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp), tmp))
+
+ // Logically shift tmp2 right by one (halving it) and convert, so tmp2 holds half of the upper-bits contribution.
+ m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(1), tmp2))
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp2), tmp2))
+
+ // Double it to recover the full upper-bits contribution.
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddps, newOperandReg(tmp2), tmp2))
+
+ // Get the conversion result by adding tmp (the converted low bits) into tmp2.
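+ // Both partial conversions above are exact (each part has few enough significant bits
+ // to fit a float32 mantissa), so this final ADDPS is the only step that can round,
+ // yielding the correctly rounded unsigned conversion.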
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddps, newOperandReg(tmp), tmp2))
+
+ m.copyTo(tmp2, m.c.VRegOf(ret))
+ }
+ case ssa.VecLaneF64x2:
+ if signed {
+ xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2pd, xx, m.c.VRegOf(ret)))
+ } else {
+ maskReg := m.c.AllocateVReg(ssa.TypeV128)
+ maskLabel := m.getOrAllocateConstLabel(&m.constF64x2CvtFromIMaskIndex, f64x2CvtFromIMask[:])
+ // maskReg = [0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), maskReg))
+
+ _xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+ xx := m.copyToTmp(_xx.reg())
+
+ // Given that we have xx = [d1, d2, d3, d4], this results in
+ //	xx = [d1, [0x00, 0x00, 0x30, 0x43], d2, [0x00, 0x00, 0x30, 0x43]]
+ //	   = [float64(uint32(d1)) + 0x1.0p52, float64(uint32(d2)) + 0x1.0p52]
+ //     ^See https://stackoverflow.com/questions/13269523/can-all-32-bit-ints-be-exactly-represented-as-a-double
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeUnpcklps, newOperandReg(maskReg), xx))
+
+ // maskReg = [float64(0x1.0p52), float64(0x1.0p52)]
+ maskLabel = m.getOrAllocateConstLabel(&m.constTwop52Index, twop52[:])
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), maskReg))
+
+ // Now, we get the result as
+ //	xx = [float64(uint32(d1)), float64(uint32(d2))]
+ // because the following equality always holds:
+ //	float64(0x1.0p52 + float64(uint32(x))) - float64(0x1.0p52 + float64(uint32(y))) = float64(uint32(x)) - float64(uint32(y))
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeSubpd, newOperandReg(maskReg), xx))
+
+ m.copyTo(xx, m.c.VRegOf(ret))
+ }
+ default:
+ panic(fmt.Sprintf("invalid lane type: %s", lane))
+ }
+}
+
+var (
+ // i32sMaxOnF64x2 holds math.MaxInt32(=2147483647.0) on two f64 lanes.
+ i32sMaxOnF64x2 = [16]byte{
+ 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
+ 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
+ }
+
+ // i32uMaxOnF64x2 holds math.MaxUint32(=4294967295.0) on two f64 lanes.
+ i32uMaxOnF64x2 = [16]byte{
+ 0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
+ 0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
+ }
+
+ // twop52 holds two float64(0x1.0p52) on two f64 lanes. 0x1.0p52 is special in the sense that,
+ // with this exponent, the low bits of the mantissa hold an exact uint32 value, so after
+ // arithmetic such as addition or subtraction the resulting float64 carries that 32-bit
+ // integer bit-for-bit in its mantissa.
+ //
+ // Note: the name twop52 is common across various compiler ecosystems.
+ // E.g. https://github.com/llvm/llvm-project/blob/92ab024f81e5b64e258b7c3baaf213c7c26fcf40/compiler-rt/lib/builtins/floatdidf.c#L28
+ // E.g. https://opensource.apple.com/source/clang/clang-425.0.24/src/projects/compiler-rt/lib/floatdidf.c.auto.html
+ twop52 = [16]byte{
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
+ }
+)
+
+func (m *machine) lowerVFcvtToIntSat(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
+ _xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+ xx := m.copyToTmp(_xx.reg())
+
+ switch lane {
+ case ssa.VecLaneF32x4:
+ if signed {
+ tmp := m.copyToTmp(xx)
+
+ // Assuming we have xx = [v1, v2, v3, v4].
+ // + // Set all bits if lane is not NaN on tmp. + // tmp[i] = 0xffffffff if vi != NaN + // = 0 if vi == NaN + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmpps, uint8(cmpPredEQ_OQ), newOperandReg(tmp), tmp)) + + // Clear NaN lanes on xx, meaning that + // xx[i] = vi if vi != NaN + // 0 if vi == NaN + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(tmp), xx)) + + // tmp[i] = ^vi if vi != NaN + // = 0xffffffff if vi == NaN + // which means that tmp[i] & 0x80000000 != 0 if and only if vi is negative. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeXorps, newOperandReg(xx), tmp)) + + // xx[i] = int32(vi) if vi != NaN and xx is not overflowing. + // = 0x80000000 if vi != NaN and xx is overflowing (See https://www.felixcloutier.com/x86/cvttps2dq) + // = 0 if vi == NaN + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(xx), xx)) + + // Below, we have to convert 0x80000000 into 0x7FFFFFFF for positive overflowing lane. + // + // tmp[i] = 0x80000000 if vi is positive + // = any satisfying any&0x80000000 = 0 if vi is negative or zero. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(xx), tmp)) + + // Arithmetic right shifting tmp by 31, meaning that we have + // tmp[i] = 0xffffffff if vi is positive, 0 otherwise. + m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrad, newOperandImm32(0x1f), tmp)) + + // Flipping 0x80000000 if vi is positive, otherwise keep intact. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), xx)) + } else { + tmp := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asZeros(tmp)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMaxps, newOperandReg(tmp), xx)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp), tmp)) + m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(0x1), tmp)) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp), tmp)) + tmp2 := m.copyToTmp(xx) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(xx), xx)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeSubps, newOperandReg(tmp), tmp2)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmpps, uint8(cmpPredLE_OS), newOperandReg(tmp2), tmp)) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(tmp2), tmp2)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp2)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaxsd, newOperandReg(tmp), tmp2)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddd, newOperandReg(tmp2), xx)) + } + + case ssa.VecLaneF64x2: + tmp2 := m.c.AllocateVReg(ssa.TypeV128) + if signed { + tmp := m.copyToTmp(xx) + + // Set all bits for non-NaN lanes, zeros otherwise. + // I.e. tmp[i] = 0xffffffff_ffffffff if vi != NaN, 0 otherwise. + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmppd, uint8(cmpPredEQ_OQ), newOperandReg(tmp), tmp)) + + maskLabel := m.getOrAllocateConstLabel(&m.constI32sMaxOnF64x2Index, i32sMaxOnF64x2[:]) + // Load the 2147483647 into tmp2's each lane. + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), tmp2)) + + // tmp[i] = 2147483647 if vi != NaN, 0 otherwise. 
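+ // 2147483647.0 is used because CVTTPD2DQ returns the 0x80000000 "integer indefinite"
+ // for anything outside the int32 range: clamping positive lanes to it below gives the
+ // positive saturation required by Wasm, while out-of-range negative lanes already
+ // produce 0x80000000 == math.MinInt32, the saturated value on that side.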
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(tmp2), tmp)) + + // MINPD returns the source register's value as-is, so we have + // xx[i] = vi if vi != NaN + // = 0 if vi == NaN + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMinpd, newOperandReg(tmp), xx)) + + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttpd2dq, newOperandReg(xx), xx)) + } else { + tmp := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asZeros(tmp)) + + // xx[i] = vi if vi != NaN && vi > 0 + // = 0 if vi == NaN || vi <= 0 + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMaxpd, newOperandReg(tmp), xx)) + + // tmp2[i] = float64(math.MaxUint32) = math.MaxUint32 + maskIndex := m.getOrAllocateConstLabel(&m.constI32uMaxOnF64x2Index, i32uMaxOnF64x2[:]) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskIndex)), tmp2)) + + // xx[i] = vi if vi != NaN && vi > 0 && vi <= math.MaxUint32 + // = 0 otherwise + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMinpd, newOperandReg(tmp2), xx)) + + // Round the floating points into integer. + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeRoundpd, 0x3, newOperandReg(xx), xx)) + + // tmp2[i] = float64(0x1.0p52) + maskIndex = m.getOrAllocateConstLabel(&m.constTwop52Index, twop52[:]) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskIndex)), tmp2)) + + // xx[i] = float64(0x1.0p52) + float64(uint32(vi)) if vi != NaN && vi > 0 && vi <= math.MaxUint32 + // = 0 otherwise + // + // This means that xx[i] holds exactly the same bit of uint32(vi) in its lower 32-bits. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddpd, newOperandReg(tmp2), xx)) + + // At this point, we have + // xx = [uint32(v0), float64(0x1.0p52), uint32(v1), float64(0x1.0p52)] + // tmp = [0, 0, 0, 0] + // as 32x4 lanes. Therefore, SHUFPS with 0b00_00_10_00 results in + // xx = [xx[00], xx[10], tmp[00], tmp[00]] = [xx[00], xx[10], 0, 0] + // meaning that for i = 0 and 1, we have + // xx[i] = uint32(vi) if vi != NaN && vi > 0 && vi <= math.MaxUint32 + // = 0 otherwise. 
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeShufps, 0b00_00_10_00, newOperandReg(tmp), xx)) + } + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + m.copyTo(xx, m.c.VRegOf(ret)) +} + +func (m *machine) lowerNarrow(x, y, ret ssa.Value, lane ssa.VecLane, signed bool) { + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xx := m.copyToTmp(_xx.reg()) + yy := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + + var sseOp sseOpcode + switch lane { + case ssa.VecLaneI16x8: + if signed { + sseOp = sseOpcodePacksswb + } else { + sseOp = sseOpcodePackuswb + } + case ssa.VecLaneI32x4: + if signed { + sseOp = sseOpcodePackssdw + } else { + sseOp = sseOpcodePackusdw + } + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + m.insert(m.allocateInstr().asXmmRmR(sseOp, yy, xx)) + m.copyTo(xx, m.c.VRegOf(ret)) +} + +func (m *machine) lowerWideningPairwiseDotProductS(x, y, ret ssa.Value) { + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xx := m.copyToTmp(_xx.reg()) + yy := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, yy, xx)) + m.copyTo(xx, m.c.VRegOf(ret)) +} + +func (m *machine) lowerVIabs(instr *ssa.Instruction) { + x, lane := instr.ArgWithLane() + rd := m.c.VRegOf(instr.Return()) + + if lane == ssa.VecLaneI64x2 { + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + + blendReg := xmm0VReg + m.insert(m.allocateInstr().asDefineUninitializedReg(blendReg)) + + tmp := m.copyToTmp(_xx.reg()) + xx := m.copyToTmp(_xx.reg()) + + // Clear all bits on blendReg. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(blendReg), blendReg)) + // Subtract xx from blendMaskReg. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePsubq, newOperandReg(xx), blendReg)) + // Copy the subtracted value ^^ back into tmp. + m.copyTo(blendReg, xx) + + m.insert(m.allocateInstr().asBlendvpd(newOperandReg(tmp), xx)) + + m.copyTo(xx, rd) + } else { + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePabsb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePabsw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePabsd + } + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + + i := m.allocateInstr() + i.asXmmUnaryRmR(vecOp, rn, rd) + m.insert(i) + } +} + +func (m *machine) lowerVIpopcnt(instr *ssa.Instruction) { + x := instr.Arg() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + + tmp1 := m.c.AllocateVReg(ssa.TypeV128) + m.lowerVconst(tmp1, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f) + + // Copy input into tmp2. + tmp2 := m.copyToTmp(rn.reg()) + + // Given that we have: + // rm = [b1, ..., b16] where bn = hn:ln and hn and ln are higher and lower 4-bits of bn. + // + // Take PAND on tmp1 and tmp2, so that we mask out all the higher bits. + // tmp2 = [l1, ..., l16]. + pand := m.allocateInstr() + pand.asXmmRmR(sseOpcodePand, newOperandReg(tmp1), tmp2) + m.insert(pand) + + // Do logical (packed word) right shift by 4 on rm and PAND against the mask (tmp1); meaning that we have + // tmp3 = [h1, ...., h16]. 
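+ // Each nibble is then used as a PSHUFB index into the 16-entry popcount table loaded
+ // below, and the two per-nibble counts are summed per byte. For example, for the byte
+ // 0xb5, the low nibble 0x5 contributes 2 and the high nibble 0xb contributes 3, so the
+ // resulting lane is 5.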
+ tmp3 := m.copyToTmp(rn.reg()) + psrlw := m.allocateInstr() + psrlw.asXmmRmiReg(sseOpcodePsrlw, newOperandImm32(4), tmp3) + m.insert(psrlw) + + pand2 := m.allocateInstr() + pand2.asXmmRmR(sseOpcodePand, newOperandReg(tmp1), tmp3) + m.insert(pand2) + + // Read the popcntTable into tmp4, and we have + // tmp4 = [0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04] + tmp4 := m.c.AllocateVReg(ssa.TypeV128) + m.lowerVconst(tmp4, 0x03_02_02_01_02_01_01_00, 0x04_03_03_02_03_02_02_01) + + // Make a copy for later. + tmp5 := m.copyToTmp(tmp4) + + // tmp4 = [popcnt(l1), ..., popcnt(l16)]. + pshufb := m.allocateInstr() + pshufb.asXmmRmR(sseOpcodePshufb, newOperandReg(tmp2), tmp4) + m.insert(pshufb) + + pshufb2 := m.allocateInstr() + pshufb2.asXmmRmR(sseOpcodePshufb, newOperandReg(tmp3), tmp5) + m.insert(pshufb2) + + // tmp4 + tmp5 is the result. + paddb := m.allocateInstr() + paddb.asXmmRmR(sseOpcodePaddb, newOperandReg(tmp4), tmp5) + m.insert(paddb) + + m.copyTo(tmp5, rd) +} + +func (m *machine) lowerVImul(instr *ssa.Instruction) { + x, y, lane := instr.Arg2WithLane() + rd := m.c.VRegOf(instr.Return()) + if lane == ssa.VecLaneI64x2 { + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rm := m.getOperand_Reg(m.c.ValueDefinition(y)) + // Assuming that we have + // rm = [p1, p2] = [p1_lo, p1_hi, p2_lo, p2_high] + // rn = [q1, q2] = [q1_lo, q1_hi, q2_lo, q2_high] + // where pN and qN are 64-bit (quad word) lane, and pN_lo, pN_hi, qN_lo and qN_hi are 32-bit (double word) lane. + + // Copy rn into tmp1. + tmp1 := m.copyToTmp(rn.reg()) + + // And do the logical right shift by 32-bit on tmp1, which makes tmp1 = [0, p1_high, 0, p2_high] + shift := m.allocateInstr() + shift.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(32), tmp1) + m.insert(shift) + + // Execute "pmuludq rm,tmp1", which makes tmp1 = [p1_high*q1_lo, p2_high*q2_lo] where each lane is 64-bit. + mul := m.allocateInstr() + mul.asXmmRmR(sseOpcodePmuludq, rm, tmp1) + m.insert(mul) + + // Copy rm value into tmp2. + tmp2 := m.copyToTmp(rm.reg()) + + // And do the logical right shift by 32-bit on tmp2, which makes tmp2 = [0, q1_high, 0, q2_high] + shift2 := m.allocateInstr() + shift2.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(32), tmp2) + m.insert(shift2) + + // Execute "pmuludq rm,tmp2", which makes tmp2 = [p1_lo*q1_high, p2_lo*q2_high] where each lane is 64-bit. + mul2 := m.allocateInstr() + mul2.asXmmRmR(sseOpcodePmuludq, rn, tmp2) + m.insert(mul2) + + // Adds tmp1 and tmp2 and do the logical left shift by 32-bit, + // which makes tmp1 = [(p1_lo*q1_high+p1_high*q1_lo)<<32, (p2_lo*q2_high+p2_high*q2_lo)<<32] + add := m.allocateInstr() + add.asXmmRmR(sseOpcodePaddq, newOperandReg(tmp2), tmp1) + m.insert(add) + + shift3 := m.allocateInstr() + shift3.asXmmRmiReg(sseOpcodePsllq, newOperandImm32(32), tmp1) + m.insert(shift3) + + // Copy rm value into tmp3. + tmp3 := m.copyToTmp(rm.reg()) + + // "pmuludq rm,tmp3" makes tmp3 = [p1_lo*q1_lo, p2_lo*q2_lo] where each lane is 64-bit. 
+ mul3 := m.allocateInstr() + mul3.asXmmRmR(sseOpcodePmuludq, rn, tmp3) + m.insert(mul3) + + // Finally, we get the result by computing tmp1 + tmp3, + // which makes tmp1 = [(p1_lo*q1_high+p1_high*q1_lo)<<32+p1_lo*q1_lo, (p2_lo*q2_high+p2_high*q2_lo)<<32+p2_lo*q2_lo] + add2 := m.allocateInstr() + add2.asXmmRmR(sseOpcodePaddq, newOperandReg(tmp3), tmp1) + m.insert(add2) + + m.copyTo(tmp1, rd) + + } else { + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI16x8: + vecOp = sseOpcodePmullw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePmulld + default: + panic("unsupported: " + lane.String()) + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/operands.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/operands.go new file mode 100644 index 000000000..c6fcb8673 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/operands.go @@ -0,0 +1,346 @@ +package amd64 + +import ( + "fmt" + "unsafe" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +type operand struct { + kind operandKind + data uint64 +} + +type operandKind byte + +const ( + // operandKindReg is an operand which is an integer Register. + operandKindReg operandKind = iota + 1 + + // operandKindMem is a value in Memory. + // 32, 64, or 128 bit value. + operandKindMem + + // operandKindImm32 is a signed-32-bit integer immediate value. + operandKindImm32 + + // operandKindLabel is a label. + operandKindLabel +) + +// String implements fmt.Stringer. +func (o operandKind) String() string { + switch o { + case operandKindReg: + return "reg" + case operandKindMem: + return "mem" + case operandKindImm32: + return "imm32" + case operandKindLabel: + return "label" + default: + panic("BUG: invalid operand kind") + } +} + +// format returns the string representation of the operand. +// _64 is only for the case where the operand is a register, and it's integer. 
+func (o *operand) format(_64 bool) string { + switch o.kind { + case operandKindReg: + return formatVRegSized(o.reg(), _64) + case operandKindMem: + return o.addressMode().String() + case operandKindImm32: + return fmt.Sprintf("$%d", int32(o.imm32())) + case operandKindLabel: + return backend.Label(o.imm32()).String() + default: + panic(fmt.Sprintf("BUG: invalid operand: %s", o.kind)) + } +} + +//go:inline +func (o *operand) reg() regalloc.VReg { + return regalloc.VReg(o.data) +} + +//go:inline +func (o *operand) setReg(r regalloc.VReg) { + o.data = uint64(r) +} + +//go:inline +func (o *operand) addressMode() *amode { + return wazevoapi.PtrFromUintptr[amode](uintptr(o.data)) +} + +//go:inline +func (o *operand) imm32() uint32 { + return uint32(o.data) +} + +func (o *operand) label() backend.Label { + switch o.kind { + case operandKindLabel: + return backend.Label(o.data) + case operandKindMem: + mem := o.addressMode() + if mem.kind() != amodeRipRel { + panic("BUG: invalid label") + } + return backend.Label(mem.imm32) + default: + panic("BUG: invalid operand kind") + } +} + +func newOperandLabel(label backend.Label) operand { + return operand{kind: operandKindLabel, data: uint64(label)} +} + +func newOperandReg(r regalloc.VReg) operand { + return operand{kind: operandKindReg, data: uint64(r)} +} + +func newOperandImm32(imm32 uint32) operand { + return operand{kind: operandKindImm32, data: uint64(imm32)} +} + +func newOperandMem(amode *amode) operand { + return operand{kind: operandKindMem, data: uint64(uintptr(unsafe.Pointer(amode)))} +} + +// amode is a memory operand (addressing mode). +type amode struct { + kindWithShift uint32 + imm32 uint32 + base regalloc.VReg + + // For amodeRegRegShift: + index regalloc.VReg +} + +type amodeKind byte + +const ( + // amodeRegRegShift calculates sign-extend-32-to-64(Immediate) + base + amodeImmReg amodeKind = iota + 1 + + // amodeImmRBP is the same as amodeImmReg, but the base register is fixed to RBP. + // The only differece is that it doesn't tell the register allocator to use RBP which is distracting for the + // register allocator. + amodeImmRBP + + // amodeRegRegShift calculates sign-extend-32-to-64(Immediate) + base + (Register2 << Shift) + amodeRegRegShift + + // amodeRipRel is a RIP-relative addressing mode specified by the label. + amodeRipRel + + // TODO: there are other addressing modes such as the one without base register. 
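+ // For reference, an amodeRegRegShift with imm32 = 8, base = rax, index = rcx and
+ // shift = 1 addresses 8 + rax + (rcx << 1), and String() renders it as "8(%rax,%rcx,2)".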
+) + +func (a *amode) kind() amodeKind { + return amodeKind(a.kindWithShift & 0xff) +} + +func (a *amode) shift() byte { + return byte(a.kindWithShift >> 8) +} + +func (a *amode) uses(rs *[]regalloc.VReg) { + switch a.kind() { + case amodeImmReg: + *rs = append(*rs, a.base) + case amodeRegRegShift: + *rs = append(*rs, a.base, a.index) + case amodeImmRBP, amodeRipRel: + default: + panic("BUG: invalid amode kind") + } +} + +func (a *amode) nregs() int { + switch a.kind() { + case amodeImmReg: + return 1 + case amodeRegRegShift: + return 2 + case amodeImmRBP, amodeRipRel: + return 0 + default: + panic("BUG: invalid amode kind") + } +} + +func (a *amode) assignUses(i int, reg regalloc.VReg) { + switch a.kind() { + case amodeImmReg: + if i == 0 { + a.base = reg + } else { + panic("BUG: invalid amode assignment") + } + case amodeRegRegShift: + if i == 0 { + a.base = reg + } else if i == 1 { + a.index = reg + } else { + panic("BUG: invalid amode assignment") + } + default: + panic("BUG: invalid amode assignment") + } +} + +func (m *machine) newAmodeImmReg(imm32 uint32, base regalloc.VReg) *amode { + ret := m.amodePool.Allocate() + *ret = amode{kindWithShift: uint32(amodeImmReg), imm32: imm32, base: base} + return ret +} + +func (m *machine) newAmodeImmRBPReg(imm32 uint32) *amode { + ret := m.amodePool.Allocate() + *ret = amode{kindWithShift: uint32(amodeImmRBP), imm32: imm32, base: rbpVReg} + return ret +} + +func (m *machine) newAmodeRegRegShift(imm32 uint32, base, index regalloc.VReg, shift byte) *amode { + if shift > 3 { + panic(fmt.Sprintf("BUG: invalid shift (must be 3>=): %d", shift)) + } + ret := m.amodePool.Allocate() + *ret = amode{kindWithShift: uint32(amodeRegRegShift) | uint32(shift)<<8, imm32: imm32, base: base, index: index} + return ret +} + +func (m *machine) newAmodeRipRel(label backend.Label) *amode { + ret := m.amodePool.Allocate() + *ret = amode{kindWithShift: uint32(amodeRipRel), imm32: uint32(label)} + return ret +} + +// String implements fmt.Stringer. +func (a *amode) String() string { + switch a.kind() { + case amodeImmReg, amodeImmRBP: + if a.imm32 == 0 { + return fmt.Sprintf("(%s)", formatVRegSized(a.base, true)) + } + return fmt.Sprintf("%d(%s)", int32(a.imm32), formatVRegSized(a.base, true)) + case amodeRegRegShift: + shift := 1 << a.shift() + if a.imm32 == 0 { + return fmt.Sprintf( + "(%s,%s,%d)", + formatVRegSized(a.base, true), formatVRegSized(a.index, true), shift) + } + return fmt.Sprintf( + "%d(%s,%s,%d)", + int32(a.imm32), formatVRegSized(a.base, true), formatVRegSized(a.index, true), shift) + case amodeRipRel: + return fmt.Sprintf("%s(%%rip)", backend.Label(a.imm32)) + default: + panic("BUG: invalid amode kind") + } +} + +func (m *machine) getOperand_Mem_Reg(def *backend.SSAValueDefinition) (op operand) { + if def.IsFromBlockParam() { + return newOperandReg(def.BlkParamVReg) + } + + if def.SSAValue().Type() == ssa.TypeV128 { + // SIMD instructions require strict memory alignment, so we don't support the memory operand for V128 at the moment. 
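+ // (Unaligned full-register loads such as MOVDQU are fine, but folding a load into a
+ // legacy SSE instruction as a 128-bit memory operand generally requires 16-byte
+ // alignment, which Wasm linear memory does not guarantee.)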
+ return m.getOperand_Reg(def) + } + + if m.c.MatchInstr(def, ssa.OpcodeLoad) { + instr := def.Instr + ptr, offset, _ := instr.LoadData() + op = newOperandMem(m.lowerToAddressMode(ptr, offset)) + instr.MarkLowered() + return op + } + return m.getOperand_Reg(def) +} + +func (m *machine) getOperand_Mem_Imm32_Reg(def *backend.SSAValueDefinition) (op operand) { + if def.IsFromBlockParam() { + return newOperandReg(def.BlkParamVReg) + } + + if m.c.MatchInstr(def, ssa.OpcodeLoad) { + instr := def.Instr + ptr, offset, _ := instr.LoadData() + op = newOperandMem(m.lowerToAddressMode(ptr, offset)) + instr.MarkLowered() + return op + } + return m.getOperand_Imm32_Reg(def) +} + +func (m *machine) getOperand_Imm32_Reg(def *backend.SSAValueDefinition) (op operand) { + if def.IsFromBlockParam() { + return newOperandReg(def.BlkParamVReg) + } + + instr := def.Instr + if instr.Constant() { + // If the operation is 64-bit, x64 sign-extends the 32-bit immediate value. + // Therefore, we need to check if the immediate value is within the 32-bit range and if the sign bit is set, + // we should not use the immediate value. + if op, ok := asImm32Operand(instr.ConstantVal(), instr.Return().Type() == ssa.TypeI32); ok { + instr.MarkLowered() + return op + } + } + return m.getOperand_Reg(def) +} + +func asImm32Operand(val uint64, allowSignExt bool) (operand, bool) { + if imm32, ok := asImm32(val, allowSignExt); ok { + return newOperandImm32(imm32), true + } + return operand{}, false +} + +func asImm32(val uint64, allowSignExt bool) (uint32, bool) { + u32val := uint32(val) + if uint64(u32val) != val { + return 0, false + } + if !allowSignExt && u32val&0x80000000 != 0 { + return 0, false + } + return u32val, true +} + +func (m *machine) getOperand_Reg(def *backend.SSAValueDefinition) (op operand) { + var v regalloc.VReg + if def.IsFromBlockParam() { + v = def.BlkParamVReg + } else { + instr := def.Instr + if instr.Constant() { + // We inline all the constant instructions so that we could reduce the register usage. + v = m.lowerConstant(instr) + instr.MarkLowered() + } else { + if n := def.N; n == 0 { + v = m.c.VRegOf(instr.Return()) + } else { + _, rs := instr.Returns() + v = m.c.VRegOf(rs[n-1]) + } + } + } + return newOperandReg(v) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect.go new file mode 100644 index 000000000..5219837e3 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect.go @@ -0,0 +1,11 @@ +//go:build !tinygo + +package amd64 + +import "reflect" + +// setSliceLimits sets both Cap and Len for the given reflected slice. +func setSliceLimits(s *reflect.SliceHeader, limit uintptr) { + s.Len = int(limit) + s.Cap = int(limit) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect_tinygo.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect_tinygo.go new file mode 100644 index 000000000..df4cf46ec --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect_tinygo.go @@ -0,0 +1,11 @@ +//go:build tinygo + +package amd64 + +import "reflect" + +// setSliceLimits sets both Cap and Len for the given reflected slice. 
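+// The limit is assigned without the int conversion used in reflect.go because TinyGo's
+// reflect.SliceHeader declares Len and Cap as uintptr rather than int.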
+func setSliceLimits(s *reflect.SliceHeader, limit uintptr) {
+ s.Len = limit
+ s.Cap = limit
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reg.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reg.go
new file mode 100644
index 000000000..4aec856fa
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reg.go
@@ -0,0 +1,181 @@
+package amd64
+
+import (
+ "fmt"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+)
+
+// Amd64-specific registers.
+const (
+ // rax is a gp register.
+ rax = regalloc.RealRegInvalid + 1 + iota
+ // rcx is a gp register.
+ rcx
+ // rdx is a gp register.
+ rdx
+ // rbx is a gp register.
+ rbx
+ // rsp is a gp register.
+ rsp
+ // rbp is a gp register.
+ rbp
+ // rsi is a gp register.
+ rsi
+ // rdi is a gp register.
+ rdi
+ // r8 is a gp register.
+ r8
+ // r9 is a gp register.
+ r9
+ // r10 is a gp register.
+ r10
+ // r11 is a gp register.
+ r11
+ // r12 is a gp register.
+ r12
+ // r13 is a gp register.
+ r13
+ // r14 is a gp register.
+ r14
+ // r15 is a gp register.
+ r15
+
+ // xmm0 is a vector register.
+ xmm0
+ // xmm1 is a vector register.
+ xmm1
+ // xmm2 is a vector register.
+ xmm2
+ // xmm3 is a vector register.
+ xmm3
+ // xmm4 is a vector register.
+ xmm4
+ // xmm5 is a vector register.
+ xmm5
+ // xmm6 is a vector register.
+ xmm6
+ // xmm7 is a vector register.
+ xmm7
+ // xmm8 is a vector register.
+ xmm8
+ // xmm9 is a vector register.
+ xmm9
+ // xmm10 is a vector register.
+ xmm10
+ // xmm11 is a vector register.
+ xmm11
+ // xmm12 is a vector register.
+ xmm12
+ // xmm13 is a vector register.
+ xmm13
+ // xmm14 is a vector register.
+ xmm14
+ // xmm15 is a vector register.
+ xmm15 +) + +var ( + raxVReg = regalloc.FromRealReg(rax, regalloc.RegTypeInt) + rcxVReg = regalloc.FromRealReg(rcx, regalloc.RegTypeInt) + rdxVReg = regalloc.FromRealReg(rdx, regalloc.RegTypeInt) + rbxVReg = regalloc.FromRealReg(rbx, regalloc.RegTypeInt) + rspVReg = regalloc.FromRealReg(rsp, regalloc.RegTypeInt) + rbpVReg = regalloc.FromRealReg(rbp, regalloc.RegTypeInt) + rsiVReg = regalloc.FromRealReg(rsi, regalloc.RegTypeInt) + rdiVReg = regalloc.FromRealReg(rdi, regalloc.RegTypeInt) + r8VReg = regalloc.FromRealReg(r8, regalloc.RegTypeInt) + r9VReg = regalloc.FromRealReg(r9, regalloc.RegTypeInt) + r10VReg = regalloc.FromRealReg(r10, regalloc.RegTypeInt) + r11VReg = regalloc.FromRealReg(r11, regalloc.RegTypeInt) + r12VReg = regalloc.FromRealReg(r12, regalloc.RegTypeInt) + r13VReg = regalloc.FromRealReg(r13, regalloc.RegTypeInt) + r14VReg = regalloc.FromRealReg(r14, regalloc.RegTypeInt) + r15VReg = regalloc.FromRealReg(r15, regalloc.RegTypeInt) + + xmm0VReg = regalloc.FromRealReg(xmm0, regalloc.RegTypeFloat) + xmm1VReg = regalloc.FromRealReg(xmm1, regalloc.RegTypeFloat) + xmm2VReg = regalloc.FromRealReg(xmm2, regalloc.RegTypeFloat) + xmm3VReg = regalloc.FromRealReg(xmm3, regalloc.RegTypeFloat) + xmm4VReg = regalloc.FromRealReg(xmm4, regalloc.RegTypeFloat) + xmm5VReg = regalloc.FromRealReg(xmm5, regalloc.RegTypeFloat) + xmm6VReg = regalloc.FromRealReg(xmm6, regalloc.RegTypeFloat) + xmm7VReg = regalloc.FromRealReg(xmm7, regalloc.RegTypeFloat) + xmm8VReg = regalloc.FromRealReg(xmm8, regalloc.RegTypeFloat) + xmm9VReg = regalloc.FromRealReg(xmm9, regalloc.RegTypeFloat) + xmm10VReg = regalloc.FromRealReg(xmm10, regalloc.RegTypeFloat) + xmm11VReg = regalloc.FromRealReg(xmm11, regalloc.RegTypeFloat) + xmm12VReg = regalloc.FromRealReg(xmm12, regalloc.RegTypeFloat) + xmm13VReg = regalloc.FromRealReg(xmm13, regalloc.RegTypeFloat) + xmm14VReg = regalloc.FromRealReg(xmm14, regalloc.RegTypeFloat) + xmm15VReg = regalloc.FromRealReg(xmm15, regalloc.RegTypeFloat) +) + +var regNames = [...]string{ + rax: "rax", + rcx: "rcx", + rdx: "rdx", + rbx: "rbx", + rsp: "rsp", + rbp: "rbp", + rsi: "rsi", + rdi: "rdi", + r8: "r8", + r9: "r9", + r10: "r10", + r11: "r11", + r12: "r12", + r13: "r13", + r14: "r14", + r15: "r15", + xmm0: "xmm0", + xmm1: "xmm1", + xmm2: "xmm2", + xmm3: "xmm3", + xmm4: "xmm4", + xmm5: "xmm5", + xmm6: "xmm6", + xmm7: "xmm7", + xmm8: "xmm8", + xmm9: "xmm9", + xmm10: "xmm10", + xmm11: "xmm11", + xmm12: "xmm12", + xmm13: "xmm13", + xmm14: "xmm14", + xmm15: "xmm15", +} + +func formatVRegSized(r regalloc.VReg, _64 bool) string { + if r.IsRealReg() { + if r.RegType() == regalloc.RegTypeInt { + rr := r.RealReg() + orig := regNames[rr] + if rr <= rdi { + if _64 { + return "%" + orig + } else { + return "%e" + orig[1:] + } + } else { + if _64 { + return "%" + orig + } else { + return "%" + orig + "d" + } + } + } else { + return "%" + regNames[r.RealReg()] + } + } else { + if r.RegType() == regalloc.RegTypeInt { + if _64 { + return fmt.Sprintf("%%r%d?", r.ID()) + } else { + return fmt.Sprintf("%%r%dd?", r.ID()) + } + } else { + return fmt.Sprintf("%%xmm%d?", r.ID()) + } + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/stack.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/stack.go new file mode 100644 index 000000000..05ba5f027 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/stack.go @@ -0,0 +1,128 @@ +package amd64 + +import ( + "encoding/binary" + "reflect" + "unsafe" + 
+ "github.com/tetratelabs/wazero/internal/wasmdebug" +) + +func stackView(rbp, top uintptr) []byte { + var stackBuf []byte + { + // TODO: use unsafe.Slice after floor version is set to Go 1.20. + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&stackBuf)) + hdr.Data = rbp + setSliceLimits(hdr, top-rbp) + } + return stackBuf +} + +// UnwindStack implements wazevo.unwindStack. +func UnwindStack(_, rbp, top uintptr, returnAddresses []uintptr) []uintptr { + stackBuf := stackView(rbp, top) + + for i := uint64(0); i < uint64(len(stackBuf)); { + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | ReturnAddress | + // | Caller_RBP | + // +-----------------+ <---- Caller_RBP + // | ........... | + // | clobbered M | + // | ............ | + // | clobbered 0 | + // | spill slot N | + // | ............ | + // | spill slot 0 | + // | ReturnAddress | + // | Caller_RBP | + // +-----------------+ <---- RBP + // (low address) + + callerRBP := binary.LittleEndian.Uint64(stackBuf[i:]) + retAddr := binary.LittleEndian.Uint64(stackBuf[i+8:]) + returnAddresses = append(returnAddresses, uintptr(retAddr)) + i = callerRBP - uint64(rbp) + if len(returnAddresses) == wasmdebug.MaxFrames { + break + } + } + return returnAddresses +} + +// GoCallStackView implements wazevo.goCallStackView. +func GoCallStackView(stackPointerBeforeGoCall *uint64) []uint64 { + // (high address) + // +-----------------+ <----+ + // | xxxxxxxxxxx | | ;; optional unused space to make it 16-byte aligned. + // ^ | arg[N]/ret[M] | | + // sliceSize | | ............ | | SizeInBytes/8 + // | | arg[1]/ret[1] | | + // v | arg[0]/ret[0] | <----+ + // | SizeInBytes | + // +-----------------+ <---- stackPointerBeforeGoCall + // (low address) + data := unsafe.Pointer(uintptr(unsafe.Pointer(stackPointerBeforeGoCall)) + 8) + size := *stackPointerBeforeGoCall / 8 + return unsafe.Slice((*uint64)(data), int(size)) +} + +func AdjustClonedStack(oldRsp, oldTop, rsp, rbp, top uintptr) { + diff := uint64(rsp - oldRsp) + + newBuf := stackView(rbp, top) + for i := uint64(0); i < uint64(len(newBuf)); { + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | ReturnAddress | + // | Caller_RBP | + // +-----------------+ <---- Caller_RBP + // | ........... | + // | clobbered M | + // | ............ | + // | clobbered 0 | + // | spill slot N | + // | ............ | + // | spill slot 0 | + // | ReturnAddress | + // | Caller_RBP | + // +-----------------+ <---- RBP + // (low address) + + callerRBP := binary.LittleEndian.Uint64(newBuf[i:]) + if callerRBP == 0 { + // End of stack. 
+ break + } + if i64 := int64(callerRBP); i64 < int64(oldRsp) || i64 >= int64(oldTop) { + panic("BUG: callerRBP is out of range") + } + if int(callerRBP) < 0 { + panic("BUG: callerRBP is negative") + } + adjustedCallerRBP := callerRBP + diff + if int(adjustedCallerRBP) < 0 { + panic("BUG: adjustedCallerRBP is negative") + } + binary.LittleEndian.PutUint64(newBuf[i:], adjustedCallerRBP) + i = adjustedCallerRBP - uint64(rbp) + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go new file mode 100644 index 000000000..6615471c6 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go @@ -0,0 +1,332 @@ +package arm64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// References: +// * https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/cmd/compile/abi-internal.md#arm64-architecture +// * https://developer.arm.com/documentation/102374/0101/Procedure-Call-Standard + +var ( + intParamResultRegs = []regalloc.RealReg{x0, x1, x2, x3, x4, x5, x6, x7} + floatParamResultRegs = []regalloc.RealReg{v0, v1, v2, v3, v4, v5, v6, v7} +) + +var regInfo = ®alloc.RegisterInfo{ + AllocatableRegisters: [regalloc.NumRegType][]regalloc.RealReg{ + // We don't allocate: + // - x18: Reserved by the macOS: https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms#Respect-the-purpose-of-specific-CPU-registers + // - x28: Reserved by Go runtime. + // - x27(=tmpReg): because of the reason described on tmpReg. + regalloc.RegTypeInt: { + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x19, x20, x21, x22, x23, x24, x25, + x26, x29, x30, + // These are the argument/return registers. Less preferred in the allocation. + x7, x6, x5, x4, x3, x2, x1, x0, + }, + regalloc.RegTypeFloat: { + v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, + // These are the argument/return registers. Less preferred in the allocation. 
+ v7, v6, v5, v4, v3, v2, v1, v0, + }, + }, + CalleeSavedRegisters: regalloc.NewRegSet( + x19, x20, x21, x22, x23, x24, x25, x26, x28, + v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, + ), + CallerSavedRegisters: regalloc.NewRegSet( + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x29, x30, + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, + ), + RealRegToVReg: []regalloc.VReg{ + x0: x0VReg, x1: x1VReg, x2: x2VReg, x3: x3VReg, x4: x4VReg, x5: x5VReg, x6: x6VReg, x7: x7VReg, x8: x8VReg, x9: x9VReg, x10: x10VReg, x11: x11VReg, x12: x12VReg, x13: x13VReg, x14: x14VReg, x15: x15VReg, x16: x16VReg, x17: x17VReg, x18: x18VReg, x19: x19VReg, x20: x20VReg, x21: x21VReg, x22: x22VReg, x23: x23VReg, x24: x24VReg, x25: x25VReg, x26: x26VReg, x27: x27VReg, x28: x28VReg, x29: x29VReg, x30: x30VReg, + v0: v0VReg, v1: v1VReg, v2: v2VReg, v3: v3VReg, v4: v4VReg, v5: v5VReg, v6: v6VReg, v7: v7VReg, v8: v8VReg, v9: v9VReg, v10: v10VReg, v11: v11VReg, v12: v12VReg, v13: v13VReg, v14: v14VReg, v15: v15VReg, v16: v16VReg, v17: v17VReg, v18: v18VReg, v19: v19VReg, v20: v20VReg, v21: v21VReg, v22: v22VReg, v23: v23VReg, v24: v24VReg, v25: v25VReg, v26: v26VReg, v27: v27VReg, v28: v28VReg, v29: v29VReg, v30: v30VReg, v31: v31VReg, + }, + RealRegName: func(r regalloc.RealReg) string { return regNames[r] }, + RealRegType: func(r regalloc.RealReg) regalloc.RegType { + if r < v0 { + return regalloc.RegTypeInt + } + return regalloc.RegTypeFloat + }, +} + +// ArgsResultsRegs implements backend.Machine. +func (m *machine) ArgsResultsRegs() (argResultInts, argResultFloats []regalloc.RealReg) { + return intParamResultRegs, floatParamResultRegs +} + +// LowerParams implements backend.FunctionABI. +func (m *machine) LowerParams(args []ssa.Value) { + a := m.currentABI + + for i, ssaArg := range args { + if !ssaArg.Valid() { + continue + } + reg := m.compiler.VRegOf(ssaArg) + arg := &a.Args[i] + if arg.Kind == backend.ABIArgKindReg { + m.InsertMove(reg, arg.Reg, arg.Type) + } else { + // TODO: we could use pair load if there's consecutive loads for the same type. + // + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | <-| + // | ReturnAddress | | + // +-----------------+ | + // | ........... | | + // | clobbered M | | argStackOffset: is unknown at this point of compilation. + // | ............ | | + // | clobbered 0 | | + // | spill slot N | | + // | ........... | | + // | spill slot 0 | | + // SP---> +-----------------+ <-+ + // (low address) + + bits := arg.Type.Bits() + // At this point of compilation, we don't yet know how much space exist below the return address. + // So we instruct the address mode to add the `argStackOffset` to the offset at the later phase of compilation. + amode := addressMode{imm: arg.Offset, rn: spVReg, kind: addressModeKindArgStackSpace} + load := m.allocateInstr() + switch arg.Type { + case ssa.TypeI32, ssa.TypeI64: + load.asULoad(operandNR(reg), amode, bits) + case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: + load.asFpuLoad(operandNR(reg), amode, bits) + default: + panic("BUG") + } + m.insert(load) + m.unresolvedAddressModes = append(m.unresolvedAddressModes, load) + } + } +} + +// LowerReturns lowers the given returns. 
+func (m *machine) LowerReturns(rets []ssa.Value) { + a := m.currentABI + + l := len(rets) - 1 + for i := range rets { + // Reverse order in order to avoid overwriting the stack returns existing in the return registers. + ret := rets[l-i] + r := &a.Rets[l-i] + reg := m.compiler.VRegOf(ret) + if def := m.compiler.ValueDefinition(ret); def.IsFromInstr() { + // Constant instructions are inlined. + if inst := def.Instr; inst.Constant() { + val := inst.Return() + valType := val.Type() + v := inst.ConstantVal() + m.insertLoadConstant(v, valType, reg) + } + } + if r.Kind == backend.ABIArgKindReg { + m.InsertMove(r.Reg, reg, ret.Type()) + } else { + // TODO: we could use pair store if there's consecutive stores for the same type. + // + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | <-+ + // | arg X | | + // | ....... | | + // | arg 1 | | + // | arg 0 | | + // | ReturnAddress | | + // +-----------------+ | + // | ........... | | + // | spill slot M | | retStackOffset: is unknown at this point of compilation. + // | ............ | | + // | spill slot 2 | | + // | spill slot 1 | | + // | clobbered 0 | | + // | clobbered 1 | | + // | ........... | | + // | clobbered N | | + // SP---> +-----------------+ <-+ + // (low address) + + bits := r.Type.Bits() + + // At this point of compilation, we don't yet know how much space exist below the return address. + // So we instruct the address mode to add the `retStackOffset` to the offset at the later phase of compilation. + amode := addressMode{imm: r.Offset, rn: spVReg, kind: addressModeKindResultStackSpace} + store := m.allocateInstr() + store.asStore(operandNR(reg), amode, bits) + m.insert(store) + m.unresolvedAddressModes = append(m.unresolvedAddressModes, store) + } + } +} + +// callerGenVRegToFunctionArg is the opposite of GenFunctionArgToVReg, which is used to generate the +// caller side of the function call. +func (m *machine) callerGenVRegToFunctionArg(a *backend.FunctionABI, argIndex int, reg regalloc.VReg, def *backend.SSAValueDefinition, slotBegin int64) { + arg := &a.Args[argIndex] + if def != nil && def.IsFromInstr() { + // Constant instructions are inlined. + if inst := def.Instr; inst.Constant() { + val := inst.Return() + valType := val.Type() + v := inst.ConstantVal() + m.insertLoadConstant(v, valType, reg) + } + } + if arg.Kind == backend.ABIArgKindReg { + m.InsertMove(arg.Reg, reg, arg.Type) + } else { + // TODO: we could use pair store if there's consecutive stores for the same type. + // + // Note that at this point, stack pointer is already adjusted. + bits := arg.Type.Bits() + amode := m.resolveAddressModeForOffset(arg.Offset-slotBegin, bits, spVReg, false) + store := m.allocateInstr() + store.asStore(operandNR(reg), amode, bits) + m.insert(store) + } +} + +func (m *machine) callerGenFunctionReturnVReg(a *backend.FunctionABI, retIndex int, reg regalloc.VReg, slotBegin int64) { + r := &a.Rets[retIndex] + if r.Kind == backend.ABIArgKindReg { + m.InsertMove(reg, r.Reg, r.Type) + } else { + // TODO: we could use pair load if there's consecutive loads for the same type. 
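+		// Stack results live just above the argument area of the reserved arg/ret slot, hence ArgStackSize is added to the result's own offset.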
+ amode := m.resolveAddressModeForOffset(a.ArgStackSize+r.Offset-slotBegin, r.Type.Bits(), spVReg, false) + ldr := m.allocateInstr() + switch r.Type { + case ssa.TypeI32, ssa.TypeI64: + ldr.asULoad(operandNR(reg), amode, r.Type.Bits()) + case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: + ldr.asFpuLoad(operandNR(reg), amode, r.Type.Bits()) + default: + panic("BUG") + } + m.insert(ldr) + } +} + +func (m *machine) resolveAddressModeForOffsetAndInsert(cur *instruction, offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) (*instruction, addressMode) { + exct := m.executableContext + exct.PendingInstructions = exct.PendingInstructions[:0] + mode := m.resolveAddressModeForOffset(offset, dstBits, rn, allowTmpRegUse) + for _, instr := range exct.PendingInstructions { + cur = linkInstr(cur, instr) + } + return cur, mode +} + +func (m *machine) resolveAddressModeForOffset(offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) addressMode { + if rn.RegType() != regalloc.RegTypeInt { + panic("BUG: rn should be a pointer: " + formatVRegSized(rn, 64)) + } + var amode addressMode + if offsetFitsInAddressModeKindRegUnsignedImm12(dstBits, offset) { + amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: rn, imm: offset} + } else if offsetFitsInAddressModeKindRegSignedImm9(offset) { + amode = addressMode{kind: addressModeKindRegSignedImm9, rn: rn, imm: offset} + } else { + var indexReg regalloc.VReg + if allowTmpRegUse { + m.lowerConstantI64(tmpRegVReg, offset) + indexReg = tmpRegVReg + } else { + indexReg = m.compiler.AllocateVReg(ssa.TypeI64) + m.lowerConstantI64(indexReg, offset) + } + amode = addressMode{kind: addressModeKindRegReg, rn: rn, rm: indexReg, extOp: extendOpUXTX /* indicates index rm is 64-bit */} + } + return amode +} + +func (m *machine) lowerCall(si *ssa.Instruction) { + isDirectCall := si.Opcode() == ssa.OpcodeCall + var indirectCalleePtr ssa.Value + var directCallee ssa.FuncRef + var sigID ssa.SignatureID + var args []ssa.Value + if isDirectCall { + directCallee, sigID, args = si.CallData() + } else { + indirectCalleePtr, sigID, args, _ /* on arm64, the calling convention is compatible with the Go runtime */ = si.CallIndirectData() + } + calleeABI := m.compiler.GetFunctionABI(m.compiler.SSABuilder().ResolveSignature(sigID)) + + stackSlotSize := int64(calleeABI.AlignedArgResultStackSlotSize()) + if m.maxRequiredStackSizeForCalls < stackSlotSize+16 { + m.maxRequiredStackSizeForCalls = stackSlotSize + 16 // return address frame. 
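+		// maxRequiredStackSizeForCalls tracks the largest arg/ret slot (plus the 16-byte return-address frame) needed by any call site in this function.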
+ } + + for i, arg := range args { + reg := m.compiler.VRegOf(arg) + def := m.compiler.ValueDefinition(arg) + m.callerGenVRegToFunctionArg(calleeABI, i, reg, def, stackSlotSize) + } + + if isDirectCall { + call := m.allocateInstr() + call.asCall(directCallee, calleeABI) + m.insert(call) + } else { + ptr := m.compiler.VRegOf(indirectCalleePtr) + callInd := m.allocateInstr() + callInd.asCallIndirect(ptr, calleeABI) + m.insert(callInd) + } + + var index int + r1, rs := si.Returns() + if r1.Valid() { + m.callerGenFunctionReturnVReg(calleeABI, 0, m.compiler.VRegOf(r1), stackSlotSize) + index++ + } + + for _, r := range rs { + m.callerGenFunctionReturnVReg(calleeABI, index, m.compiler.VRegOf(r), stackSlotSize) + index++ + } +} + +func (m *machine) insertAddOrSubStackPointer(rd regalloc.VReg, diff int64, add bool) { + if imm12Operand, ok := asImm12Operand(uint64(diff)); ok { + alu := m.allocateInstr() + var ao aluOp + if add { + ao = aluOpAdd + } else { + ao = aluOpSub + } + alu.asALU(ao, operandNR(rd), operandNR(spVReg), imm12Operand, true) + m.insert(alu) + } else { + m.lowerConstantI64(tmpRegVReg, diff) + alu := m.allocateInstr() + var ao aluOp + if add { + ao = aluOpAdd + } else { + ao = aluOpSub + } + alu.asALU(ao, operandNR(rd), operandNR(spVReg), operandNR(tmpRegVReg), true) + m.insert(alu) + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.go new file mode 100644 index 000000000..5f0c613df --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.go @@ -0,0 +1,9 @@ +package arm64 + +// entrypoint enters the machine code generated by this backend which begins with the preamble generated by functionABI.EmitGoEntryPreamble below. +// This implements wazevo.entrypoint, and see the comments there for detail. +func entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultPtr *uint64, goAllocatedStackSlicePtr uintptr) + +// afterGoFunctionCallEntrypoint enters the machine code after growing the stack. +// This implements wazevo.afterGoFunctionCallEntrypoint, and see the comments there for detail. +func afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.s b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.s new file mode 100644 index 000000000..0b579f852 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.s @@ -0,0 +1,29 @@ +//go:build arm64 + +#include "funcdata.h" +#include "textflag.h" + +// See the comments on EmitGoEntryPreamble for what this function is supposed to do. +TEXT ·entrypoint(SB), NOSPLIT|NOFRAME, $0-48 + MOVD preambleExecutable+0(FP), R27 + MOVD functionExectuable+8(FP), R24 + MOVD executionContextPtr+16(FP), R0 + MOVD moduleContextPtr+24(FP), R1 + MOVD paramResultSlicePtr+32(FP), R19 + MOVD goAllocatedStackSlicePtr+40(FP), R26 + JMP (R27) + +TEXT ·afterGoFunctionCallEntrypoint(SB), NOSPLIT|NOFRAME, $0-32 + MOVD goCallReturnAddress+0(FP), R20 + MOVD executionContextPtr+8(FP), R0 + MOVD stackPointer+16(FP), R19 + + // Save the current FP(R29), SP and LR(R30) into the wazevo.executionContext (stored in R0). 
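+	// Offsets 16, 24, and 32 correspond to ExecutionContextOffsets.OriginalFramePointer, OriginalStackPointer, and GoReturnAddress, respectively.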
+ MOVD R29, 16(R0) // Store FP(R29) into [RO, #ExecutionContextOffsets.OriginalFramePointer] + MOVD RSP, R27 // Move SP to R27 (temporary register) since SP cannot be stored directly in str instructions. + MOVD R27, 24(R0) // Store R27 into [RO, #ExecutionContextOffsets.OriginalFramePointer] + MOVD R30, 32(R0) // Store R30 into [R0, #ExecutionContextOffsets.GoReturnAddress] + + // Load the new stack pointer (which sits somewhere in Go-allocated stack) into SP. + MOVD R19, RSP + JMP (R20) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go new file mode 100644 index 000000000..7a9cceb33 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go @@ -0,0 +1,230 @@ +package arm64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// CompileEntryPreamble implements backend.Machine. This assumes `entrypoint` function (in abi_go_entry_arm64.s) passes: +// +// 1. First (execution context ptr) and Second arguments are already passed in x0, and x1. +// 2. param/result slice ptr in x19; the pointer to []uint64{} which is used to pass arguments and accept return values. +// 3. Go-allocated stack slice ptr in x26. +// 4. Function executable in x24. +// +// also SP and FP are correct Go-runtime-based values, and LR is the return address to the Go-side caller. +func (m *machine) CompileEntryPreamble(signature *ssa.Signature) []byte { + root := m.constructEntryPreamble(signature) + m.encode(root) + return m.compiler.Buf() +} + +var ( + executionContextPtrReg = x0VReg + // callee-saved regs so that they can be used in the prologue and epilogue. + paramResultSlicePtr = x19VReg + savedExecutionContextPtr = x20VReg + // goAllocatedStackPtr is not used in the epilogue. + goAllocatedStackPtr = x26VReg + // paramResultSliceCopied is not used in the epilogue. + paramResultSliceCopied = x25VReg + // tmpRegVReg is not used in the epilogue. + functionExecutable = x24VReg +) + +func (m *machine) goEntryPreamblePassArg(cur *instruction, paramSlicePtr regalloc.VReg, arg *backend.ABIArg, argStartOffsetFromSP int64) *instruction { + typ := arg.Type + bits := typ.Bits() + isStackArg := arg.Kind == backend.ABIArgKindStack + + var loadTargetReg operand + if !isStackArg { + loadTargetReg = operandNR(arg.Reg) + } else { + switch typ { + case ssa.TypeI32, ssa.TypeI64: + loadTargetReg = operandNR(x15VReg) + case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: + loadTargetReg = operandNR(v15VReg) + default: + panic("TODO?") + } + } + + var postIndexImm int64 + if typ == ssa.TypeV128 { + postIndexImm = 16 // v128 is represented as 2x64-bit in Go slice. 
+ } else { + postIndexImm = 8 + } + loadMode := addressMode{kind: addressModeKindPostIndex, rn: paramSlicePtr, imm: postIndexImm} + + instr := m.allocateInstr() + switch typ { + case ssa.TypeI32: + instr.asULoad(loadTargetReg, loadMode, 32) + case ssa.TypeI64: + instr.asULoad(loadTargetReg, loadMode, 64) + case ssa.TypeF32: + instr.asFpuLoad(loadTargetReg, loadMode, 32) + case ssa.TypeF64: + instr.asFpuLoad(loadTargetReg, loadMode, 64) + case ssa.TypeV128: + instr.asFpuLoad(loadTargetReg, loadMode, 128) + } + cur = linkInstr(cur, instr) + + if isStackArg { + var storeMode addressMode + cur, storeMode = m.resolveAddressModeForOffsetAndInsert(cur, argStartOffsetFromSP+arg.Offset, bits, spVReg, true) + toStack := m.allocateInstr() + toStack.asStore(loadTargetReg, storeMode, bits) + cur = linkInstr(cur, toStack) + } + return cur +} + +func (m *machine) goEntryPreamblePassResult(cur *instruction, resultSlicePtr regalloc.VReg, result *backend.ABIArg, resultStartOffsetFromSP int64) *instruction { + isStackArg := result.Kind == backend.ABIArgKindStack + typ := result.Type + bits := typ.Bits() + + var storeTargetReg operand + if !isStackArg { + storeTargetReg = operandNR(result.Reg) + } else { + switch typ { + case ssa.TypeI32, ssa.TypeI64: + storeTargetReg = operandNR(x15VReg) + case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: + storeTargetReg = operandNR(v15VReg) + default: + panic("TODO?") + } + } + + var postIndexImm int64 + if typ == ssa.TypeV128 { + postIndexImm = 16 // v128 is represented as 2x64-bit in Go slice. + } else { + postIndexImm = 8 + } + + if isStackArg { + var loadMode addressMode + cur, loadMode = m.resolveAddressModeForOffsetAndInsert(cur, resultStartOffsetFromSP+result.Offset, bits, spVReg, true) + toReg := m.allocateInstr() + switch typ { + case ssa.TypeI32, ssa.TypeI64: + toReg.asULoad(storeTargetReg, loadMode, bits) + case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: + toReg.asFpuLoad(storeTargetReg, loadMode, bits) + default: + panic("TODO?") + } + cur = linkInstr(cur, toReg) + } + + mode := addressMode{kind: addressModeKindPostIndex, rn: resultSlicePtr, imm: postIndexImm} + instr := m.allocateInstr() + instr.asStore(storeTargetReg, mode, bits) + cur = linkInstr(cur, instr) + return cur +} + +func (m *machine) constructEntryPreamble(sig *ssa.Signature) (root *instruction) { + abi := backend.FunctionABI{} + abi.Init(sig, intParamResultRegs, floatParamResultRegs) + + root = m.allocateNop() + + //// ----------------------------------- prologue ----------------------------------- //// + + // First, we save executionContextPtrReg into a callee-saved register so that it can be used in epilogue as well. + // mov savedExecutionContextPtr, x0 + cur := m.move64(savedExecutionContextPtr, executionContextPtrReg, root) + + // Next, save the current FP, SP and LR into the wazevo.executionContext: + // str fp, [savedExecutionContextPtr, #OriginalFramePointer] + // mov tmp, sp ;; sp cannot be str'ed directly. 
+ // str sp, [savedExecutionContextPtr, #OriginalStackPointer] + // str lr, [savedExecutionContextPtr, #GoReturnAddress] + cur = m.loadOrStoreAtExecutionContext(fpVReg, wazevoapi.ExecutionContextOffsetOriginalFramePointer, true, cur) + cur = m.move64(tmpRegVReg, spVReg, cur) + cur = m.loadOrStoreAtExecutionContext(tmpRegVReg, wazevoapi.ExecutionContextOffsetOriginalStackPointer, true, cur) + cur = m.loadOrStoreAtExecutionContext(lrVReg, wazevoapi.ExecutionContextOffsetGoReturnAddress, true, cur) + + // Then, move the Go-allocated stack pointer to SP: + // mov sp, goAllocatedStackPtr + cur = m.move64(spVReg, goAllocatedStackPtr, cur) + + prReg := paramResultSlicePtr + if len(abi.Args) > 2 && len(abi.Rets) > 0 { + // paramResultSlicePtr is modified during the execution of goEntryPreamblePassArg, + // so copy it to another reg. + cur = m.move64(paramResultSliceCopied, paramResultSlicePtr, cur) + prReg = paramResultSliceCopied + } + + stackSlotSize := int64(abi.AlignedArgResultStackSlotSize()) + for i := range abi.Args { + if i < 2 { + // module context ptr and execution context ptr are passed in x0 and x1 by the Go assembly function. + continue + } + arg := &abi.Args[i] + cur = m.goEntryPreamblePassArg(cur, prReg, arg, -stackSlotSize) + } + + // Call the real function. + bl := m.allocateInstr() + bl.asCallIndirect(functionExecutable, &abi) + cur = linkInstr(cur, bl) + + ///// ----------------------------------- epilogue ----------------------------------- ///// + + // Store the register results into paramResultSlicePtr. + for i := range abi.Rets { + cur = m.goEntryPreamblePassResult(cur, paramResultSlicePtr, &abi.Rets[i], abi.ArgStackSize-stackSlotSize) + } + + // Finally, restore the FP, SP and LR, and return to the Go code. + // ldr fp, [savedExecutionContextPtr, #OriginalFramePointer] + // ldr tmp, [savedExecutionContextPtr, #OriginalStackPointer] + // mov sp, tmp ;; sp cannot be str'ed directly. 
+ // ldr lr, [savedExecutionContextPtr, #GoReturnAddress] + // ret ;; --> return to the Go code + cur = m.loadOrStoreAtExecutionContext(fpVReg, wazevoapi.ExecutionContextOffsetOriginalFramePointer, false, cur) + cur = m.loadOrStoreAtExecutionContext(tmpRegVReg, wazevoapi.ExecutionContextOffsetOriginalStackPointer, false, cur) + cur = m.move64(spVReg, tmpRegVReg, cur) + cur = m.loadOrStoreAtExecutionContext(lrVReg, wazevoapi.ExecutionContextOffsetGoReturnAddress, false, cur) + retInst := m.allocateInstr() + retInst.asRet() + linkInstr(cur, retInst) + return +} + +func (m *machine) move64(dst, src regalloc.VReg, prev *instruction) *instruction { + instr := m.allocateInstr() + instr.asMove64(dst, src) + return linkInstr(prev, instr) +} + +func (m *machine) loadOrStoreAtExecutionContext(d regalloc.VReg, offset wazevoapi.Offset, store bool, prev *instruction) *instruction { + instr := m.allocateInstr() + mode := addressMode{kind: addressModeKindRegUnsignedImm12, rn: savedExecutionContextPtr, imm: offset.I64()} + if store { + instr.asStore(operandNR(d), mode, 64) + } else { + instr.asULoad(operandNR(d), mode, 64) + } + return linkInstr(prev, instr) +} + +func linkInstr(prev, next *instruction) *instruction { + prev.next = next + next.prev = prev + return next +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go new file mode 100644 index 000000000..466b1f960 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go @@ -0,0 +1,428 @@ +package arm64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +var calleeSavedRegistersSorted = []regalloc.VReg{ + x19VReg, x20VReg, x21VReg, x22VReg, x23VReg, x24VReg, x25VReg, x26VReg, x28VReg, + v18VReg, v19VReg, v20VReg, v21VReg, v22VReg, v23VReg, v24VReg, v25VReg, v26VReg, v27VReg, v28VReg, v29VReg, v30VReg, v31VReg, +} + +// CompileGoFunctionTrampoline implements backend.Machine. +func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte { + exct := m.executableContext + argBegin := 1 // Skips exec context by default. + if needModuleContextPtr { + argBegin++ + } + + abi := &backend.FunctionABI{} + abi.Init(sig, intParamResultRegs, floatParamResultRegs) + m.currentABI = abi + + cur := m.allocateInstr() + cur.asNop0() + exct.RootInstr = cur + + // Execution context is always the first argument. + execCtrPtr := x0VReg + + // In the following, we create the following stack layout: + // + // (high address) + // SP ------> +-----------------+ <----+ + // | ....... | | + // | ret Y | | + // | ....... | | + // | ret 0 | | + // | arg X | | size_of_arg_ret + // | ....... | | + // | arg 1 | | + // | arg 0 | <----+ <-------- originalArg0Reg + // | size_of_arg_ret | + // | ReturnAddress | + // +-----------------+ <----+ + // | xxxx | | ;; might be padded to make it 16-byte aligned. + // +--->| arg[N]/ret[M] | | + // sliceSize| | ............ 
| | goCallStackSize + // | | arg[1]/ret[1] | | + // +--->| arg[0]/ret[0] | <----+ <-------- arg0ret0AddrReg + // | sliceSize | + // | frame_size | + // +-----------------+ + // (low address) + // + // where the region of "arg[0]/ret[0] ... arg[N]/ret[M]" is the stack used by the Go functions, + // therefore will be accessed as the usual []uint64. So that's where we need to pass/receive + // the arguments/return values. + + // First of all, to update the SP, and create "ReturnAddress + size_of_arg_ret". + cur = m.createReturnAddrAndSizeOfArgRetSlot(cur) + + const frameInfoSize = 16 // == frame_size + sliceSize. + + // Next, we should allocate the stack for the Go function call if necessary. + goCallStackSize, sliceSizeInBytes := backend.GoFunctionCallRequiredStackSize(sig, argBegin) + cur = m.insertStackBoundsCheck(goCallStackSize+frameInfoSize, cur) + + originalArg0Reg := x17VReg // Caller save, so we can use it for whatever we want. + if m.currentABI.AlignedArgResultStackSlotSize() > 0 { + // At this point, SP points to `ReturnAddress`, so add 16 to get the original arg 0 slot. + cur = m.addsAddOrSubStackPointer(cur, originalArg0Reg, frameInfoSize, true) + } + + // Save the callee saved registers. + cur = m.saveRegistersInExecutionContext(cur, calleeSavedRegistersSorted) + + if needModuleContextPtr { + offset := wazevoapi.ExecutionContextOffsetGoFunctionCallCalleeModuleContextOpaque.I64() + if !offsetFitsInAddressModeKindRegUnsignedImm12(64, offset) { + panic("BUG: too large or un-aligned offset for goFunctionCallCalleeModuleContextOpaque in execution context") + } + + // Module context is always the second argument. + moduleCtrPtr := x1VReg + store := m.allocateInstr() + amode := addressMode{kind: addressModeKindRegUnsignedImm12, rn: execCtrPtr, imm: offset} + store.asStore(operandNR(moduleCtrPtr), amode, 64) + cur = linkInstr(cur, store) + } + + // Advances the stack pointer. + cur = m.addsAddOrSubStackPointer(cur, spVReg, goCallStackSize, false) + + // Copy the pointer to x15VReg. + arg0ret0AddrReg := x15VReg // Caller save, so we can use it for whatever we want. + copySp := m.allocateInstr() + copySp.asMove64(arg0ret0AddrReg, spVReg) + cur = linkInstr(cur, copySp) + + // Next, we need to store all the arguments to the stack in the typical Wasm stack style. + for i := range abi.Args[argBegin:] { + arg := &abi.Args[argBegin+i] + store := m.allocateInstr() + var v regalloc.VReg + if arg.Kind == backend.ABIArgKindReg { + v = arg.Reg + } else { + cur, v = m.goFunctionCallLoadStackArg(cur, originalArg0Reg, arg, + // Caller save, so we can use it for whatever we want. + x11VReg, v11VReg) + } + + var sizeInBits byte + if arg.Type == ssa.TypeV128 { + sizeInBits = 128 + } else { + sizeInBits = 64 + } + store.asStore(operandNR(v), + addressMode{ + kind: addressModeKindPostIndex, + rn: arg0ret0AddrReg, imm: int64(sizeInBits / 8), + }, sizeInBits) + cur = linkInstr(cur, store) + } + + // Finally, now that we've advanced SP to arg[0]/ret[0], we allocate `frame_size + sliceSize`. 
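+	// These two values form the 16-byte (frameInfoSize) header pushed just below arg[0]/ret[0]; see the stack layout diagram above.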
+ var frameSizeReg, sliceSizeReg regalloc.VReg + if goCallStackSize > 0 { + cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, goCallStackSize) + frameSizeReg = tmpRegVReg + cur = m.lowerConstantI64AndInsert(cur, x16VReg, sliceSizeInBytes/8) + sliceSizeReg = x16VReg + } else { + frameSizeReg = xzrVReg + sliceSizeReg = xzrVReg + } + _amode := addressModePreOrPostIndex(spVReg, -16, true) + storeP := m.allocateInstr() + storeP.asStorePair64(frameSizeReg, sliceSizeReg, _amode) + cur = linkInstr(cur, storeP) + + // Set the exit status on the execution context. + cur = m.setExitCode(cur, x0VReg, exitCode) + + // Save the current stack pointer. + cur = m.saveCurrentStackPointer(cur, x0VReg) + + // Exit the execution. + cur = m.storeReturnAddressAndExit(cur) + + // After the call, we need to restore the callee saved registers. + cur = m.restoreRegistersInExecutionContext(cur, calleeSavedRegistersSorted) + + // Get the pointer to the arg[0]/ret[0]: We need to skip `frame_size + sliceSize`. + if len(abi.Rets) > 0 { + cur = m.addsAddOrSubStackPointer(cur, arg0ret0AddrReg, frameInfoSize, true) + } + + // Advances the SP so that it points to `ReturnAddress`. + cur = m.addsAddOrSubStackPointer(cur, spVReg, frameInfoSize+goCallStackSize, true) + ldr := m.allocateInstr() + // And load the return address. + ldr.asULoad(operandNR(lrVReg), + addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64) + cur = linkInstr(cur, ldr) + + originalRet0Reg := x17VReg // Caller save, so we can use it for whatever we want. + if m.currentABI.RetStackSize > 0 { + cur = m.addsAddOrSubStackPointer(cur, originalRet0Reg, m.currentABI.ArgStackSize, true) + } + + // Make the SP point to the original address (above the result slot). + if s := int64(m.currentABI.AlignedArgResultStackSlotSize()); s > 0 { + cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true) + } + + for i := range abi.Rets { + r := &abi.Rets[i] + if r.Kind == backend.ABIArgKindReg { + loadIntoReg := m.allocateInstr() + mode := addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg} + switch r.Type { + case ssa.TypeI32: + mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. + loadIntoReg.asULoad(operandNR(r.Reg), mode, 32) + case ssa.TypeI64: + mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. + loadIntoReg.asULoad(operandNR(r.Reg), mode, 64) + case ssa.TypeF32: + mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. + loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 32) + case ssa.TypeF64: + mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. + loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 64) + case ssa.TypeV128: + mode.imm = 16 + loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 128) + default: + panic("TODO") + } + cur = linkInstr(cur, loadIntoReg) + } else { + // First we need to load the value to a temporary just like ^^. + intTmp, floatTmp := x11VReg, v11VReg + loadIntoTmpReg := m.allocateInstr() + mode := addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg} + var resultReg regalloc.VReg + switch r.Type { + case ssa.TypeI32: + mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. + loadIntoTmpReg.asULoad(operandNR(intTmp), mode, 32) + resultReg = intTmp + case ssa.TypeI64: + mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. 
+ loadIntoTmpReg.asULoad(operandNR(intTmp), mode, 64) + resultReg = intTmp + case ssa.TypeF32: + mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. + loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 32) + resultReg = floatTmp + case ssa.TypeF64: + mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. + loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 64) + resultReg = floatTmp + case ssa.TypeV128: + mode.imm = 16 + loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 128) + resultReg = floatTmp + default: + panic("TODO") + } + cur = linkInstr(cur, loadIntoTmpReg) + cur = m.goFunctionCallStoreStackResult(cur, originalRet0Reg, r, resultReg) + } + } + + ret := m.allocateInstr() + ret.asRet() + linkInstr(cur, ret) + + m.encode(m.executableContext.RootInstr) + return m.compiler.Buf() +} + +func (m *machine) saveRegistersInExecutionContext(cur *instruction, regs []regalloc.VReg) *instruction { + offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64() + for _, v := range regs { + store := m.allocateInstr() + var sizeInBits byte + switch v.RegType() { + case regalloc.RegTypeInt: + sizeInBits = 64 + case regalloc.RegTypeFloat: + sizeInBits = 128 + } + store.asStore(operandNR(v), + addressMode{ + kind: addressModeKindRegUnsignedImm12, + // Execution context is always the first argument. + rn: x0VReg, imm: offset, + }, sizeInBits) + store.prev = cur + cur.next = store + cur = store + offset += 16 // Imm12 must be aligned 16 for vector regs, so we unconditionally store regs at the offset of multiple of 16. + } + return cur +} + +func (m *machine) restoreRegistersInExecutionContext(cur *instruction, regs []regalloc.VReg) *instruction { + offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64() + for _, v := range regs { + load := m.allocateInstr() + var as func(dst operand, amode addressMode, sizeInBits byte) + var sizeInBits byte + switch v.RegType() { + case regalloc.RegTypeInt: + as = load.asULoad + sizeInBits = 64 + case regalloc.RegTypeFloat: + as = load.asFpuLoad + sizeInBits = 128 + } + as(operandNR(v), + addressMode{ + kind: addressModeKindRegUnsignedImm12, + // Execution context is always the first argument. + rn: x0VReg, imm: offset, + }, sizeInBits) + cur = linkInstr(cur, load) + offset += 16 // Imm12 must be aligned 16 for vector regs, so we unconditionally load regs at the offset of multiple of 16. + } + return cur +} + +func (m *machine) lowerConstantI64AndInsert(cur *instruction, dst regalloc.VReg, v int64) *instruction { + exct := m.executableContext + exct.PendingInstructions = exct.PendingInstructions[:0] + m.lowerConstantI64(dst, v) + for _, instr := range exct.PendingInstructions { + cur = linkInstr(cur, instr) + } + return cur +} + +func (m *machine) lowerConstantI32AndInsert(cur *instruction, dst regalloc.VReg, v int32) *instruction { + exct := m.executableContext + exct.PendingInstructions = exct.PendingInstructions[:0] + m.lowerConstantI32(dst, v) + for _, instr := range exct.PendingInstructions { + cur = linkInstr(cur, instr) + } + return cur +} + +func (m *machine) setExitCode(cur *instruction, execCtr regalloc.VReg, exitCode wazevoapi.ExitCode) *instruction { + constReg := x17VReg // caller-saved, so we can use it. + cur = m.lowerConstantI32AndInsert(cur, constReg, int32(exitCode)) + + // Set the exit status on the execution context. 
+ setExistStatus := m.allocateInstr() + setExistStatus.asStore(operandNR(constReg), + addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: execCtr, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(), + }, 32) + cur = linkInstr(cur, setExistStatus) + return cur +} + +func (m *machine) storeReturnAddressAndExit(cur *instruction) *instruction { + // Read the return address into tmp, and store it in the execution context. + adr := m.allocateInstr() + adr.asAdr(tmpRegVReg, exitSequenceSize+8) + cur = linkInstr(cur, adr) + + storeReturnAddr := m.allocateInstr() + storeReturnAddr.asStore(operandNR(tmpRegVReg), + addressMode{ + kind: addressModeKindRegUnsignedImm12, + // Execution context is always the first argument. + rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(), + }, 64) + cur = linkInstr(cur, storeReturnAddr) + + // Exit the execution. + trapSeq := m.allocateInstr() + trapSeq.asExitSequence(x0VReg) + cur = linkInstr(cur, trapSeq) + return cur +} + +func (m *machine) saveCurrentStackPointer(cur *instruction, execCtr regalloc.VReg) *instruction { + // Save the current stack pointer: + // mov tmp, sp, + // str tmp, [exec_ctx, #stackPointerBeforeGoCall] + movSp := m.allocateInstr() + movSp.asMove64(tmpRegVReg, spVReg) + cur = linkInstr(cur, movSp) + + strSp := m.allocateInstr() + strSp.asStore(operandNR(tmpRegVReg), + addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: execCtr, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(), + }, 64) + cur = linkInstr(cur, strSp) + return cur +} + +func (m *machine) goFunctionCallLoadStackArg(cur *instruction, originalArg0Reg regalloc.VReg, arg *backend.ABIArg, intVReg, floatVReg regalloc.VReg) (*instruction, regalloc.VReg) { + load := m.allocateInstr() + var result regalloc.VReg + mode := addressMode{kind: addressModeKindPostIndex, rn: originalArg0Reg} + switch arg.Type { + case ssa.TypeI32: + mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. + load.asULoad(operandNR(intVReg), mode, 32) + result = intVReg + case ssa.TypeI64: + mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. + load.asULoad(operandNR(intVReg), mode, 64) + result = intVReg + case ssa.TypeF32: + mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. + load.asFpuLoad(operandNR(floatVReg), mode, 32) + result = floatVReg + case ssa.TypeF64: + mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. 
+ load.asFpuLoad(operandNR(floatVReg), mode, 64) + result = floatVReg + case ssa.TypeV128: + mode.imm = 16 + load.asFpuLoad(operandNR(floatVReg), mode, 128) + result = floatVReg + default: + panic("TODO") + } + + cur = linkInstr(cur, load) + return cur, result +} + +func (m *machine) goFunctionCallStoreStackResult(cur *instruction, originalRet0Reg regalloc.VReg, result *backend.ABIArg, resultVReg regalloc.VReg) *instruction { + store := m.allocateInstr() + mode := addressMode{kind: addressModeKindPostIndex, rn: originalRet0Reg} + var sizeInBits byte + switch result.Type { + case ssa.TypeI32, ssa.TypeF32: + mode.imm = 8 + sizeInBits = 32 + case ssa.TypeI64, ssa.TypeF64: + mode.imm = 8 + sizeInBits = 64 + case ssa.TypeV128: + mode.imm = 16 + sizeInBits = 128 + default: + panic("TODO") + } + store.asStore(operandNR(resultVReg), mode, sizeInBits) + return linkInstr(cur, store) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/cond.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/cond.go new file mode 100644 index 000000000..6f6cdd1b2 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/cond.go @@ -0,0 +1,215 @@ +package arm64 + +import ( + "strconv" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +type ( + cond uint64 + condKind byte +) + +const ( + // condKindRegisterZero represents a condition which checks if the register is zero. + // This indicates that the instruction must be encoded as CBZ: + // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/CBZ--Compare-and-Branch-on-Zero- + condKindRegisterZero condKind = iota + // condKindRegisterNotZero indicates that the instruction must be encoded as CBNZ: + // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/CBNZ--Compare-and-Branch-on-Nonzero- + condKindRegisterNotZero + // condKindCondFlagSet indicates that the instruction must be encoded as B.cond: + // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/B-cond--Branch-conditionally- + condKindCondFlagSet +) + +// kind returns the kind of condition which is stored in the first two bits. +func (c cond) kind() condKind { + return condKind(c & 0b11) +} + +func (c cond) asUint64() uint64 { + return uint64(c) +} + +// register returns the register for register conditions. +// This panics if the condition is not a register condition (condKindRegisterZero or condKindRegisterNotZero). +func (c cond) register() regalloc.VReg { + if c.kind() != condKindRegisterZero && c.kind() != condKindRegisterNotZero { + panic("condition is not a register") + } + return regalloc.VReg(c >> 2) +} + +func registerAsRegZeroCond(r regalloc.VReg) cond { + return cond(r)<<2 | cond(condKindRegisterZero) +} + +func registerAsRegNotZeroCond(r regalloc.VReg) cond { + return cond(r)<<2 | cond(condKindRegisterNotZero) +} + +func (c cond) flag() condFlag { + if c.kind() != condKindCondFlagSet { + panic("condition is not a flag") + } + return condFlag(c >> 2) +} + +func (c condFlag) asCond() cond { + return cond(c)<<2 | cond(condKindCondFlagSet) +} + +// condFlag represents a condition flag for conditional branches. +// The value matches the encoding of condition flags in the ARM64 instruction set. 
+// https://developer.arm.com/documentation/den0024/a/The-A64-instruction-set/Data-processing-instructions/Conditional-instructions +type condFlag uint8 + +const ( + eq condFlag = iota // eq represents "equal" + ne // ne represents "not equal" + hs // hs represents "higher or same" + lo // lo represents "lower" + mi // mi represents "minus or negative result" + pl // pl represents "plus or positive result" + vs // vs represents "overflow set" + vc // vc represents "overflow clear" + hi // hi represents "higher" + ls // ls represents "lower or same" + ge // ge represents "greater or equal" + lt // lt represents "less than" + gt // gt represents "greater than" + le // le represents "less than or equal" + al // al represents "always" + nv // nv represents "never" +) + +// invert returns the inverted condition. +func (c condFlag) invert() condFlag { + switch c { + case eq: + return ne + case ne: + return eq + case hs: + return lo + case lo: + return hs + case mi: + return pl + case pl: + return mi + case vs: + return vc + case vc: + return vs + case hi: + return ls + case ls: + return hi + case ge: + return lt + case lt: + return ge + case gt: + return le + case le: + return gt + case al: + return nv + case nv: + return al + default: + panic(c) + } +} + +// String implements fmt.Stringer. +func (c condFlag) String() string { + switch c { + case eq: + return "eq" + case ne: + return "ne" + case hs: + return "hs" + case lo: + return "lo" + case mi: + return "mi" + case pl: + return "pl" + case vs: + return "vs" + case vc: + return "vc" + case hi: + return "hi" + case ls: + return "ls" + case ge: + return "ge" + case lt: + return "lt" + case gt: + return "gt" + case le: + return "le" + case al: + return "al" + case nv: + return "nv" + default: + panic(strconv.Itoa(int(c))) + } +} + +// condFlagFromSSAIntegerCmpCond returns the condition flag for the given ssa.IntegerCmpCond. +func condFlagFromSSAIntegerCmpCond(c ssa.IntegerCmpCond) condFlag { + switch c { + case ssa.IntegerCmpCondEqual: + return eq + case ssa.IntegerCmpCondNotEqual: + return ne + case ssa.IntegerCmpCondSignedLessThan: + return lt + case ssa.IntegerCmpCondSignedGreaterThanOrEqual: + return ge + case ssa.IntegerCmpCondSignedGreaterThan: + return gt + case ssa.IntegerCmpCondSignedLessThanOrEqual: + return le + case ssa.IntegerCmpCondUnsignedLessThan: + return lo + case ssa.IntegerCmpCondUnsignedGreaterThanOrEqual: + return hs + case ssa.IntegerCmpCondUnsignedGreaterThan: + return hi + case ssa.IntegerCmpCondUnsignedLessThanOrEqual: + return ls + default: + panic(c) + } +} + +// condFlagFromSSAFloatCmpCond returns the condition flag for the given ssa.FloatCmpCond. 
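+// An unordered comparison (either operand NaN) sets NZCV to 0b0011 on arm64, so the flags chosen
+// here (eq, ne, mi, ls, gt, ge) all evaluate to false for NaN operands except ne, matching Wasm
+// float comparison semantics.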
+func condFlagFromSSAFloatCmpCond(c ssa.FloatCmpCond) condFlag { + switch c { + case ssa.FloatCmpCondEqual: + return eq + case ssa.FloatCmpCondNotEqual: + return ne + case ssa.FloatCmpCondLessThan: + return mi + case ssa.FloatCmpCondLessThanOrEqual: + return ls + case ssa.FloatCmpCondGreaterThan: + return gt + case ssa.FloatCmpCondGreaterThanOrEqual: + return ge + default: + panic(c) + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go new file mode 100644 index 000000000..8aabc5997 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go @@ -0,0 +1,2545 @@ +package arm64 + +import ( + "fmt" + "math" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +type ( + // instruction represents either a real instruction in arm64, or the meta instructions + // that are convenient for code generation. For example, inline constants are also treated + // as instructions. + // + // Basically, each instruction knows how to get encoded in binaries. Hence, the final output of compilation + // can be considered equivalent to the sequence of such instructions. + // + // Each field is interpreted depending on the kind. + // + // TODO: optimize the layout later once the impl settles. + instruction struct { + prev, next *instruction + u1, u2, u3 uint64 + rd, rm, rn, ra operand + amode addressMode + kind instructionKind + addedBeforeRegAlloc bool + } + + // instructionKind represents the kind of instruction. + // This controls how the instruction struct is interpreted. + instructionKind byte +) + +func asNop0(i *instruction) { + i.kind = nop0 +} + +func setNext(i, next *instruction) { + i.next = next +} + +func setPrev(i, prev *instruction) { + i.prev = prev +} + +// IsCall implements regalloc.Instr IsCall. +func (i *instruction) IsCall() bool { + return i.kind == call +} + +// IsIndirectCall implements regalloc.Instr IsIndirectCall. +func (i *instruction) IsIndirectCall() bool { + return i.kind == callInd +} + +// IsReturn implements regalloc.Instr IsReturn. +func (i *instruction) IsReturn() bool { + return i.kind == ret +} + +// Next implements regalloc.Instr Next. +func (i *instruction) Next() regalloc.Instr { + return i.next +} + +// Prev implements regalloc.Instr Prev. +func (i *instruction) Prev() regalloc.Instr { + return i.prev +} + +// AddedBeforeRegAlloc implements regalloc.Instr AddedBeforeRegAlloc. 
+func (i *instruction) AddedBeforeRegAlloc() bool { + return i.addedBeforeRegAlloc +} + +type defKind byte + +const ( + defKindNone defKind = iota + 1 + defKindRD + defKindCall +) + +var defKinds = [numInstructionKinds]defKind{ + adr: defKindRD, + aluRRR: defKindRD, + aluRRRR: defKindRD, + aluRRImm12: defKindRD, + aluRRBitmaskImm: defKindRD, + aluRRRShift: defKindRD, + aluRRImmShift: defKindRD, + aluRRRExtend: defKindRD, + bitRR: defKindRD, + movZ: defKindRD, + movK: defKindRD, + movN: defKindRD, + mov32: defKindRD, + mov64: defKindRD, + fpuMov64: defKindRD, + fpuMov128: defKindRD, + fpuRR: defKindRD, + fpuRRR: defKindRD, + nop0: defKindNone, + call: defKindCall, + callInd: defKindCall, + ret: defKindNone, + store8: defKindNone, + store16: defKindNone, + store32: defKindNone, + store64: defKindNone, + exitSequence: defKindNone, + condBr: defKindNone, + br: defKindNone, + brTableSequence: defKindNone, + cSet: defKindRD, + extend: defKindRD, + fpuCmp: defKindNone, + uLoad8: defKindRD, + uLoad16: defKindRD, + uLoad32: defKindRD, + sLoad8: defKindRD, + sLoad16: defKindRD, + sLoad32: defKindRD, + uLoad64: defKindRD, + fpuLoad32: defKindRD, + fpuLoad64: defKindRD, + fpuLoad128: defKindRD, + vecLoad1R: defKindRD, + loadFpuConst32: defKindRD, + loadFpuConst64: defKindRD, + loadFpuConst128: defKindRD, + fpuStore32: defKindNone, + fpuStore64: defKindNone, + fpuStore128: defKindNone, + udf: defKindNone, + cSel: defKindRD, + fpuCSel: defKindRD, + movToVec: defKindRD, + movFromVec: defKindRD, + movFromVecSigned: defKindRD, + vecDup: defKindRD, + vecDupElement: defKindRD, + vecExtract: defKindRD, + vecMisc: defKindRD, + vecMovElement: defKindRD, + vecLanes: defKindRD, + vecShiftImm: defKindRD, + vecTbl: defKindRD, + vecTbl2: defKindRD, + vecPermute: defKindRD, + vecRRR: defKindRD, + vecRRRRewrite: defKindNone, + fpuToInt: defKindRD, + intToFpu: defKindRD, + cCmpImm: defKindNone, + movToFPSR: defKindNone, + movFromFPSR: defKindRD, + emitSourceOffsetInfo: defKindNone, + atomicRmw: defKindRD, + atomicCas: defKindNone, + atomicLoad: defKindRD, + atomicStore: defKindNone, + dmb: defKindNone, + loadConstBlockArg: defKindRD, +} + +// Defs returns the list of regalloc.VReg that are defined by the instruction. +// In order to reduce the number of allocations, the caller can pass the slice to be used. +func (i *instruction) Defs(regs *[]regalloc.VReg) []regalloc.VReg { + *regs = (*regs)[:0] + switch defKinds[i.kind] { + case defKindNone: + case defKindRD: + *regs = append(*regs, i.rd.nr()) + case defKindCall: + _, _, retIntRealRegs, retFloatRealRegs, _ := backend.ABIInfoFromUint64(i.u2) + for i := byte(0); i < retIntRealRegs; i++ { + *regs = append(*regs, regInfo.RealRegToVReg[intParamResultRegs[i]]) + } + for i := byte(0); i < retFloatRealRegs; i++ { + *regs = append(*regs, regInfo.RealRegToVReg[floatParamResultRegs[i]]) + } + default: + panic(fmt.Sprintf("defKind for %v not defined", i)) + } + return *regs +} + +// AssignDef implements regalloc.Instr AssignDef. 
+func (i *instruction) AssignDef(reg regalloc.VReg) { + switch defKinds[i.kind] { + case defKindNone: + case defKindRD: + i.rd = i.rd.assignReg(reg) + case defKindCall: + panic("BUG: call instructions shouldn't be assigned") + default: + panic(fmt.Sprintf("defKind for %v not defined", i)) + } +} + +type useKind byte + +const ( + useKindNone useKind = iota + 1 + useKindRN + useKindRNRM + useKindRNRMRA + useKindRNRN1RM + useKindCall + useKindCallInd + useKindAMode + useKindRNAMode + useKindCond + // useKindRDRewrite indicates an instruction where RD is used both as a source and destination. + // A temporary register for RD must be allocated explicitly with the source copied to this + // register before the instruction and the value copied from this register to the instruction + // return register. + useKindRDRewrite +) + +var useKinds = [numInstructionKinds]useKind{ + udf: useKindNone, + aluRRR: useKindRNRM, + aluRRRR: useKindRNRMRA, + aluRRImm12: useKindRN, + aluRRBitmaskImm: useKindRN, + aluRRRShift: useKindRNRM, + aluRRImmShift: useKindRN, + aluRRRExtend: useKindRNRM, + bitRR: useKindRN, + movZ: useKindNone, + movK: useKindNone, + movN: useKindNone, + mov32: useKindRN, + mov64: useKindRN, + fpuMov64: useKindRN, + fpuMov128: useKindRN, + fpuRR: useKindRN, + fpuRRR: useKindRNRM, + nop0: useKindNone, + call: useKindCall, + callInd: useKindCallInd, + ret: useKindNone, + store8: useKindRNAMode, + store16: useKindRNAMode, + store32: useKindRNAMode, + store64: useKindRNAMode, + exitSequence: useKindRN, + condBr: useKindCond, + br: useKindNone, + brTableSequence: useKindRN, + cSet: useKindNone, + extend: useKindRN, + fpuCmp: useKindRNRM, + uLoad8: useKindAMode, + uLoad16: useKindAMode, + uLoad32: useKindAMode, + sLoad8: useKindAMode, + sLoad16: useKindAMode, + sLoad32: useKindAMode, + uLoad64: useKindAMode, + fpuLoad32: useKindAMode, + fpuLoad64: useKindAMode, + fpuLoad128: useKindAMode, + fpuStore32: useKindRNAMode, + fpuStore64: useKindRNAMode, + fpuStore128: useKindRNAMode, + loadFpuConst32: useKindNone, + loadFpuConst64: useKindNone, + loadFpuConst128: useKindNone, + vecLoad1R: useKindRN, + cSel: useKindRNRM, + fpuCSel: useKindRNRM, + movToVec: useKindRN, + movFromVec: useKindRN, + movFromVecSigned: useKindRN, + vecDup: useKindRN, + vecDupElement: useKindRN, + vecExtract: useKindRNRM, + cCmpImm: useKindRN, + vecMisc: useKindRN, + vecMovElement: useKindRN, + vecLanes: useKindRN, + vecShiftImm: useKindRN, + vecTbl: useKindRNRM, + vecTbl2: useKindRNRN1RM, + vecRRR: useKindRNRM, + vecRRRRewrite: useKindRDRewrite, + vecPermute: useKindRNRM, + fpuToInt: useKindRN, + intToFpu: useKindRN, + movToFPSR: useKindRN, + movFromFPSR: useKindNone, + adr: useKindNone, + emitSourceOffsetInfo: useKindNone, + atomicRmw: useKindRNRM, + atomicCas: useKindRDRewrite, + atomicLoad: useKindRN, + atomicStore: useKindRNRM, + loadConstBlockArg: useKindNone, + dmb: useKindNone, +} + +// Uses returns the list of regalloc.VReg that are used by the instruction. +// In order to reduce the number of allocations, the caller can pass the slice to be used. 
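+// The contents of *regs are overwritten on every call, and the returned slice aliases it.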
+func (i *instruction) Uses(regs *[]regalloc.VReg) []regalloc.VReg { + *regs = (*regs)[:0] + switch useKinds[i.kind] { + case useKindNone: + case useKindRN: + if rn := i.rn.reg(); rn.Valid() { + *regs = append(*regs, rn) + } + case useKindRNRM: + if rn := i.rn.reg(); rn.Valid() { + *regs = append(*regs, rn) + } + if rm := i.rm.reg(); rm.Valid() { + *regs = append(*regs, rm) + } + case useKindRNRMRA: + if rn := i.rn.reg(); rn.Valid() { + *regs = append(*regs, rn) + } + if rm := i.rm.reg(); rm.Valid() { + *regs = append(*regs, rm) + } + if ra := i.ra.reg(); ra.Valid() { + *regs = append(*regs, ra) + } + case useKindRNRN1RM: + if rn := i.rn.reg(); rn.Valid() && rn.IsRealReg() { + rn1 := regalloc.FromRealReg(rn.RealReg()+1, rn.RegType()) + *regs = append(*regs, rn, rn1) + } + if rm := i.rm.reg(); rm.Valid() { + *regs = append(*regs, rm) + } + case useKindAMode: + if amodeRN := i.amode.rn; amodeRN.Valid() { + *regs = append(*regs, amodeRN) + } + if amodeRM := i.amode.rm; amodeRM.Valid() { + *regs = append(*regs, amodeRM) + } + case useKindRNAMode: + *regs = append(*regs, i.rn.reg()) + if amodeRN := i.amode.rn; amodeRN.Valid() { + *regs = append(*regs, amodeRN) + } + if amodeRM := i.amode.rm; amodeRM.Valid() { + *regs = append(*regs, amodeRM) + } + case useKindCond: + cnd := cond(i.u1) + if cnd.kind() != condKindCondFlagSet { + *regs = append(*regs, cnd.register()) + } + case useKindCallInd: + *regs = append(*regs, i.rn.nr()) + fallthrough + case useKindCall: + argIntRealRegs, argFloatRealRegs, _, _, _ := backend.ABIInfoFromUint64(i.u2) + for i := byte(0); i < argIntRealRegs; i++ { + *regs = append(*regs, regInfo.RealRegToVReg[intParamResultRegs[i]]) + } + for i := byte(0); i < argFloatRealRegs; i++ { + *regs = append(*regs, regInfo.RealRegToVReg[floatParamResultRegs[i]]) + } + case useKindRDRewrite: + *regs = append(*regs, i.rn.reg()) + *regs = append(*regs, i.rm.reg()) + *regs = append(*regs, i.rd.reg()) + default: + panic(fmt.Sprintf("useKind for %v not defined", i)) + } + return *regs +} + +func (i *instruction) AssignUse(index int, reg regalloc.VReg) { + switch useKinds[i.kind] { + case useKindNone: + case useKindRN: + if rn := i.rn.reg(); rn.Valid() { + i.rn = i.rn.assignReg(reg) + } + case useKindRNRM: + if index == 0 { + if rn := i.rn.reg(); rn.Valid() { + i.rn = i.rn.assignReg(reg) + } + } else { + if rm := i.rm.reg(); rm.Valid() { + i.rm = i.rm.assignReg(reg) + } + } + case useKindRDRewrite: + if index == 0 { + if rn := i.rn.reg(); rn.Valid() { + i.rn = i.rn.assignReg(reg) + } + } else if index == 1 { + if rm := i.rm.reg(); rm.Valid() { + i.rm = i.rm.assignReg(reg) + } + } else { + if rd := i.rd.reg(); rd.Valid() { + i.rd = i.rd.assignReg(reg) + } + } + case useKindRNRN1RM: + if index == 0 { + if rn := i.rn.reg(); rn.Valid() { + i.rn = i.rn.assignReg(reg) + } + if rn1 := i.rn.reg() + 1; rn1.Valid() { + i.rm = i.rm.assignReg(reg + 1) + } + } else { + if rm := i.rm.reg(); rm.Valid() { + i.rm = i.rm.assignReg(reg) + } + } + case useKindRNRMRA: + if index == 0 { + if rn := i.rn.reg(); rn.Valid() { + i.rn = i.rn.assignReg(reg) + } + } else if index == 1 { + if rm := i.rm.reg(); rm.Valid() { + i.rm = i.rm.assignReg(reg) + } + } else { + if ra := i.ra.reg(); ra.Valid() { + i.ra = i.ra.assignReg(reg) + } + } + case useKindAMode: + if index == 0 { + if amodeRN := i.amode.rn; amodeRN.Valid() { + i.amode.rn = reg + } + } else { + if amodeRM := i.amode.rm; amodeRM.Valid() { + i.amode.rm = reg + } + } + case useKindRNAMode: + if index == 0 { + i.rn = i.rn.assignReg(reg) + } else if index == 1 { + 
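+			// Use index 1 is the address-mode base register (amode.rn); index 2, handled below, is the index register (amode.rm).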
if amodeRN := i.amode.rn; amodeRN.Valid() { + i.amode.rn = reg + } else { + panic("BUG") + } + } else { + if amodeRM := i.amode.rm; amodeRM.Valid() { + i.amode.rm = reg + } else { + panic("BUG") + } + } + case useKindCond: + c := cond(i.u1) + switch c.kind() { + case condKindRegisterZero: + i.u1 = uint64(registerAsRegZeroCond(reg)) + case condKindRegisterNotZero: + i.u1 = uint64(registerAsRegNotZeroCond(reg)) + } + case useKindCall: + panic("BUG: call instructions shouldn't be assigned") + case useKindCallInd: + i.rn = i.rn.assignReg(reg) + default: + panic(fmt.Sprintf("useKind for %v not defined", i)) + } +} + +func (i *instruction) asCall(ref ssa.FuncRef, abi *backend.FunctionABI) { + i.kind = call + i.u1 = uint64(ref) + if abi != nil { + i.u2 = abi.ABIInfoAsUint64() + } +} + +func (i *instruction) asCallIndirect(ptr regalloc.VReg, abi *backend.FunctionABI) { + i.kind = callInd + i.rn = operandNR(ptr) + if abi != nil { + i.u2 = abi.ABIInfoAsUint64() + } +} + +func (i *instruction) callFuncRef() ssa.FuncRef { + return ssa.FuncRef(i.u1) +} + +// shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false) +func (i *instruction) asMOVZ(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) { + i.kind = movZ + i.rd = operandNR(dst) + i.u1 = imm + i.u2 = shift + if dst64bit { + i.u3 = 1 + } +} + +// shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false) +func (i *instruction) asMOVK(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) { + i.kind = movK + i.rd = operandNR(dst) + i.u1 = imm + i.u2 = shift + if dst64bit { + i.u3 = 1 + } +} + +// shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false) +func (i *instruction) asMOVN(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) { + i.kind = movN + i.rd = operandNR(dst) + i.u1 = imm + i.u2 = shift + if dst64bit { + i.u3 = 1 + } +} + +func (i *instruction) asNop0() *instruction { + i.kind = nop0 + return i +} + +func (i *instruction) asNop0WithLabel(l label) { + i.kind = nop0 + i.u1 = uint64(l) +} + +func (i *instruction) nop0Label() label { + return label(i.u1) +} + +func (i *instruction) asRet() { + i.kind = ret +} + +func (i *instruction) asStorePair64(src1, src2 regalloc.VReg, amode addressMode) { + i.kind = storeP64 + i.rn = operandNR(src1) + i.rm = operandNR(src2) + i.amode = amode +} + +func (i *instruction) asLoadPair64(src1, src2 regalloc.VReg, amode addressMode) { + i.kind = loadP64 + i.rn = operandNR(src1) + i.rm = operandNR(src2) + i.amode = amode +} + +func (i *instruction) asStore(src operand, amode addressMode, sizeInBits byte) { + switch sizeInBits { + case 8: + i.kind = store8 + case 16: + i.kind = store16 + case 32: + if src.reg().RegType() == regalloc.RegTypeInt { + i.kind = store32 + } else { + i.kind = fpuStore32 + } + case 64: + if src.reg().RegType() == regalloc.RegTypeInt { + i.kind = store64 + } else { + i.kind = fpuStore64 + } + case 128: + i.kind = fpuStore128 + } + i.rn = src + i.amode = amode +} + +func (i *instruction) asSLoad(dst operand, amode addressMode, sizeInBits byte) { + switch sizeInBits { + case 8: + i.kind = sLoad8 + case 16: + i.kind = sLoad16 + case 32: + i.kind = sLoad32 + default: + panic("BUG") + } + i.rd = dst + i.amode = amode +} + +func (i *instruction) asULoad(dst operand, amode addressMode, sizeInBits byte) { + switch sizeInBits { + case 8: + i.kind = uLoad8 + case 16: + i.kind = uLoad16 + case 32: + i.kind = uLoad32 + case 64: + 
i.kind = uLoad64 + } + i.rd = dst + i.amode = amode +} + +func (i *instruction) asFpuLoad(dst operand, amode addressMode, sizeInBits byte) { + switch sizeInBits { + case 32: + i.kind = fpuLoad32 + case 64: + i.kind = fpuLoad64 + case 128: + i.kind = fpuLoad128 + } + i.rd = dst + i.amode = amode +} + +func (i *instruction) asVecLoad1R(rd, rn operand, arr vecArrangement) { + // NOTE: currently only has support for no-offset loads, though it is suspicious that + // we would need to support offset load (that is only available for post-index). + i.kind = vecLoad1R + i.rd = rd + i.rn = rn + i.u1 = uint64(arr) +} + +func (i *instruction) asCSet(rd regalloc.VReg, mask bool, c condFlag) { + i.kind = cSet + i.rd = operandNR(rd) + i.u1 = uint64(c) + if mask { + i.u2 = 1 + } +} + +func (i *instruction) asCSel(rd, rn, rm operand, c condFlag, _64bit bool) { + i.kind = cSel + i.rd = rd + i.rn = rn + i.rm = rm + i.u1 = uint64(c) + if _64bit { + i.u3 = 1 + } +} + +func (i *instruction) asFpuCSel(rd, rn, rm operand, c condFlag, _64bit bool) { + i.kind = fpuCSel + i.rd = rd + i.rn = rn + i.rm = rm + i.u1 = uint64(c) + if _64bit { + i.u3 = 1 + } +} + +func (i *instruction) asBr(target label) { + if target == labelReturn { + panic("BUG: call site should special case for returnLabel") + } + i.kind = br + i.u1 = uint64(target) +} + +func (i *instruction) asBrTableSequence(indexReg regalloc.VReg, targetIndex, targetCounts int) { + i.kind = brTableSequence + i.rn = operandNR(indexReg) + i.u1 = uint64(targetIndex) + i.u2 = uint64(targetCounts) +} + +func (i *instruction) brTableSequenceOffsetsResolved() { + i.u3 = 1 // indicate that the offsets are resolved, for debugging. +} + +func (i *instruction) brLabel() label { + return label(i.u1) +} + +// brOffsetResolved is called when the target label is resolved. +func (i *instruction) brOffsetResolve(offset int64) { + i.u2 = uint64(offset) + i.u3 = 1 // indicate that the offset is resolved, for debugging. +} + +func (i *instruction) brOffset() int64 { + return int64(i.u2) +} + +// asCondBr encodes a conditional branch instruction. is64bit is only needed when cond is not flag. +func (i *instruction) asCondBr(c cond, target label, is64bit bool) { + i.kind = condBr + i.u1 = c.asUint64() + i.u2 = uint64(target) + if is64bit { + i.u3 = 1 + } +} + +func (i *instruction) setCondBrTargets(target label) { + i.u2 = uint64(target) +} + +func (i *instruction) condBrLabel() label { + return label(i.u2) +} + +// condBrOffsetResolve is called when the target label is resolved. +func (i *instruction) condBrOffsetResolve(offset int64) { + i.rd.data = uint64(offset) + i.rd.data2 = 1 // indicate that the offset is resolved, for debugging. +} + +// condBrOffsetResolved returns true if condBrOffsetResolve is already called. 
+func (i *instruction) condBrOffsetResolved() bool { + return i.rd.data2 == 1 +} + +func (i *instruction) condBrOffset() int64 { + return int64(i.rd.data) +} + +func (i *instruction) condBrCond() cond { + return cond(i.u1) +} + +func (i *instruction) condBr64bit() bool { + return i.u3 == 1 +} + +func (i *instruction) asLoadFpuConst32(rd regalloc.VReg, raw uint64) { + i.kind = loadFpuConst32 + i.u1 = raw + i.rd = operandNR(rd) +} + +func (i *instruction) asLoadFpuConst64(rd regalloc.VReg, raw uint64) { + i.kind = loadFpuConst64 + i.u1 = raw + i.rd = operandNR(rd) +} + +func (i *instruction) asLoadFpuConst128(rd regalloc.VReg, lo, hi uint64) { + i.kind = loadFpuConst128 + i.u1 = lo + i.u2 = hi + i.rd = operandNR(rd) +} + +func (i *instruction) asFpuCmp(rn, rm operand, is64bit bool) { + i.kind = fpuCmp + i.rn, i.rm = rn, rm + if is64bit { + i.u3 = 1 + } +} + +func (i *instruction) asCCmpImm(rn operand, imm uint64, c condFlag, flag byte, is64bit bool) { + i.kind = cCmpImm + i.rn = rn + i.rm.data = imm + i.u1 = uint64(c) + i.u2 = uint64(flag) + if is64bit { + i.u3 = 1 + } +} + +// asALU setups a basic ALU instruction. +func (i *instruction) asALU(aluOp aluOp, rd, rn, rm operand, dst64bit bool) { + switch rm.kind { + case operandKindNR: + i.kind = aluRRR + case operandKindSR: + i.kind = aluRRRShift + case operandKindER: + i.kind = aluRRRExtend + case operandKindImm12: + i.kind = aluRRImm12 + default: + panic("BUG") + } + i.u1 = uint64(aluOp) + i.rd, i.rn, i.rm = rd, rn, rm + if dst64bit { + i.u3 = 1 + } +} + +// asALU setups a basic ALU instruction. +func (i *instruction) asALURRRR(aluOp aluOp, rd, rn, rm, ra operand, dst64bit bool) { + i.kind = aluRRRR + i.u1 = uint64(aluOp) + i.rd, i.rn, i.rm, i.ra = rd, rn, rm, ra + if dst64bit { + i.u3 = 1 + } +} + +// asALUShift setups a shift based ALU instruction. +func (i *instruction) asALUShift(aluOp aluOp, rd, rn, rm operand, dst64bit bool) { + switch rm.kind { + case operandKindNR: + i.kind = aluRRR // If the shift amount op is a register, then the instruction is encoded as a normal ALU instruction with two register operands. 
+ case operandKindShiftImm: + i.kind = aluRRImmShift + default: + panic("BUG") + } + i.u1 = uint64(aluOp) + i.rd, i.rn, i.rm = rd, rn, rm + if dst64bit { + i.u3 = 1 + } +} + +func (i *instruction) asALUBitmaskImm(aluOp aluOp, rd, rn regalloc.VReg, imm uint64, dst64bit bool) { + i.kind = aluRRBitmaskImm + i.u1 = uint64(aluOp) + i.rn, i.rd = operandNR(rn), operandNR(rd) + i.u2 = imm + if dst64bit { + i.u3 = 1 + } +} + +func (i *instruction) asMovToFPSR(rn regalloc.VReg) { + i.kind = movToFPSR + i.rn = operandNR(rn) +} + +func (i *instruction) asMovFromFPSR(rd regalloc.VReg) { + i.kind = movFromFPSR + i.rd = operandNR(rd) +} + +func (i *instruction) asBitRR(bitOp bitOp, rd, rn regalloc.VReg, is64bit bool) { + i.kind = bitRR + i.rn, i.rd = operandNR(rn), operandNR(rd) + i.u1 = uint64(bitOp) + if is64bit { + i.u2 = 1 + } +} + +func (i *instruction) asFpuRRR(op fpuBinOp, rd, rn, rm operand, dst64bit bool) { + i.kind = fpuRRR + i.u1 = uint64(op) + i.rd, i.rn, i.rm = rd, rn, rm + if dst64bit { + i.u3 = 1 + } +} + +func (i *instruction) asFpuRR(op fpuUniOp, rd, rn operand, dst64bit bool) { + i.kind = fpuRR + i.u1 = uint64(op) + i.rd, i.rn = rd, rn + if dst64bit { + i.u3 = 1 + } +} + +func (i *instruction) asExtend(rd, rn regalloc.VReg, fromBits, toBits byte, signed bool) { + i.kind = extend + i.rn, i.rd = operandNR(rn), operandNR(rd) + i.u1 = uint64(fromBits) + i.u2 = uint64(toBits) + if signed { + i.u3 = 1 + } +} + +func (i *instruction) asMove32(rd, rn regalloc.VReg) { + i.kind = mov32 + i.rn, i.rd = operandNR(rn), operandNR(rd) +} + +func (i *instruction) asMove64(rd, rn regalloc.VReg) *instruction { + i.kind = mov64 + i.rn, i.rd = operandNR(rn), operandNR(rd) + return i +} + +func (i *instruction) asFpuMov64(rd, rn regalloc.VReg) { + i.kind = fpuMov64 + i.rn, i.rd = operandNR(rn), operandNR(rd) +} + +func (i *instruction) asFpuMov128(rd, rn regalloc.VReg) *instruction { + i.kind = fpuMov128 + i.rn, i.rd = operandNR(rn), operandNR(rd) + return i +} + +func (i *instruction) asMovToVec(rd, rn operand, arr vecArrangement, index vecIndex) { + i.kind = movToVec + i.rd = rd + i.rn = rn + i.u1, i.u2 = uint64(arr), uint64(index) +} + +func (i *instruction) asMovFromVec(rd, rn operand, arr vecArrangement, index vecIndex, signed bool) { + if signed { + i.kind = movFromVecSigned + } else { + i.kind = movFromVec + } + i.rd = rd + i.rn = rn + i.u1, i.u2 = uint64(arr), uint64(index) +} + +func (i *instruction) asVecDup(rd, rn operand, arr vecArrangement) { + i.kind = vecDup + i.u1 = uint64(arr) + i.rn, i.rd = rn, rd +} + +func (i *instruction) asVecDupElement(rd, rn operand, arr vecArrangement, index vecIndex) { + i.kind = vecDupElement + i.u1 = uint64(arr) + i.rn, i.rd = rn, rd + i.u2 = uint64(index) +} + +func (i *instruction) asVecExtract(rd, rn, rm operand, arr vecArrangement, index uint32) { + i.kind = vecExtract + i.u1 = uint64(arr) + i.rn, i.rm, i.rd = rn, rm, rd + i.u2 = uint64(index) +} + +func (i *instruction) asVecMovElement(rd, rn operand, arr vecArrangement, rdIndex, rnIndex vecIndex) { + i.kind = vecMovElement + i.u1 = uint64(arr) + i.u2, i.u3 = uint64(rdIndex), uint64(rnIndex) + i.rn, i.rd = rn, rd +} + +func (i *instruction) asVecMisc(op vecOp, rd, rn operand, arr vecArrangement) { + i.kind = vecMisc + i.u1 = uint64(op) + i.rn, i.rd = rn, rd + i.u2 = uint64(arr) +} + +func (i *instruction) asVecLanes(op vecOp, rd, rn operand, arr vecArrangement) { + i.kind = vecLanes + i.u1 = uint64(op) + i.rn, i.rd = rn, rd + i.u2 = uint64(arr) +} + +func (i *instruction) asVecShiftImm(op vecOp, rd, rn, rm 
operand, arr vecArrangement) *instruction { + i.kind = vecShiftImm + i.u1 = uint64(op) + i.rn, i.rm, i.rd = rn, rm, rd + i.u2 = uint64(arr) + return i +} + +func (i *instruction) asVecTbl(nregs byte, rd, rn, rm operand, arr vecArrangement) { + switch nregs { + case 0, 1: + i.kind = vecTbl + case 2: + i.kind = vecTbl2 + if !rn.reg().IsRealReg() { + panic("rn is not a RealReg") + } + if rn.realReg() == v31 { + panic("rn cannot be v31") + } + default: + panic(fmt.Sprintf("unsupported number of registers %d", nregs)) + } + i.rn, i.rm, i.rd = rn, rm, rd + i.u2 = uint64(arr) +} + +func (i *instruction) asVecPermute(op vecOp, rd, rn, rm operand, arr vecArrangement) { + i.kind = vecPermute + i.u1 = uint64(op) + i.rn, i.rm, i.rd = rn, rm, rd + i.u2 = uint64(arr) +} + +func (i *instruction) asVecRRR(op vecOp, rd, rn, rm operand, arr vecArrangement) *instruction { + i.kind = vecRRR + i.u1 = uint64(op) + i.rn, i.rd, i.rm = rn, rd, rm + i.u2 = uint64(arr) + return i +} + +// asVecRRRRewrite encodes a vector instruction that rewrites the destination register. +// IMPORTANT: the destination register must be already defined before this instruction. +func (i *instruction) asVecRRRRewrite(op vecOp, rd, rn, rm operand, arr vecArrangement) { + i.kind = vecRRRRewrite + i.u1 = uint64(op) + i.rn, i.rd, i.rm = rn, rd, rm + i.u2 = uint64(arr) +} + +func (i *instruction) IsCopy() bool { + op := i.kind + // We do not include mov32 as it is not a copy instruction in the sense that it does not preserve the upper 32 bits, + // and it is only used in the translation of IReduce, not the actual copy indeed. + return op == mov64 || op == fpuMov64 || op == fpuMov128 +} + +// String implements fmt.Stringer. +func (i *instruction) String() (str string) { + is64SizeBitToSize := func(u3 uint64) byte { + if u3 == 0 { + return 32 + } + return 64 + } + + switch i.kind { + case nop0: + if i.u1 != 0 { + l := label(i.u1) + str = fmt.Sprintf("%s:", l) + } else { + str = "nop0" + } + case aluRRR: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(), + formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), + i.rm.format(size)) + case aluRRRR: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("%s %s, %s, %s, %s", aluOp(i.u1).String(), + formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size), formatVRegSized(i.ra.nr(), size)) + case aluRRImm12: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(), + formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), i.rm.format(size)) + case aluRRBitmaskImm: + size := is64SizeBitToSize(i.u3) + rd, rn := formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size) + if size == 32 { + str = fmt.Sprintf("%s %s, %s, #%#x", aluOp(i.u1).String(), rd, rn, uint32(i.u2)) + } else { + str = fmt.Sprintf("%s %s, %s, #%#x", aluOp(i.u1).String(), rd, rn, i.u2) + } + case aluRRImmShift: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("%s %s, %s, %#x", + aluOp(i.u1).String(), + formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rn.nr(), size), + i.rm.shiftImm(), + ) + case aluRRRShift: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("%s %s, %s, %s", + aluOp(i.u1).String(), + formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rn.nr(), size), + i.rm.format(size), + ) + case aluRRRExtend: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(), + formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rn.nr(), 
size), + // Regardless of the source size, the register is formatted in 32-bit. + i.rm.format(32), + ) + case bitRR: + size := is64SizeBitToSize(i.u2) + str = fmt.Sprintf("%s %s, %s", + bitOp(i.u1), + formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rn.nr(), size), + ) + case uLoad8: + str = fmt.Sprintf("ldrb %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + case sLoad8: + str = fmt.Sprintf("ldrsb %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + case uLoad16: + str = fmt.Sprintf("ldrh %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + case sLoad16: + str = fmt.Sprintf("ldrsh %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + case uLoad32: + str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + case sLoad32: + str = fmt.Sprintf("ldrs %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + case uLoad64: + str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 64), i.amode.format(64)) + case store8: + str = fmt.Sprintf("strb %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(8)) + case store16: + str = fmt.Sprintf("strh %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(16)) + case store32: + str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(32)) + case store64: + str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.amode.format(64)) + case storeP64: + str = fmt.Sprintf("stp %s, %s, %s", + formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.amode.format(64)) + case loadP64: + str = fmt.Sprintf("ldp %s, %s, %s", + formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.amode.format(64)) + case mov64: + str = fmt.Sprintf("mov %s, %s", + formatVRegSized(i.rd.nr(), 64), + formatVRegSized(i.rn.nr(), 64)) + case mov32: + str = fmt.Sprintf("mov %s, %s", formatVRegSized(i.rd.nr(), 32), formatVRegSized(i.rn.nr(), 32)) + case movZ: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("movz %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16) + case movN: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("movn %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16) + case movK: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("movk %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16) + case extend: + fromBits, toBits := byte(i.u1), byte(i.u2) + + var signedStr string + if i.u3 == 1 { + signedStr = "s" + } else { + signedStr = "u" + } + var fromStr string + switch fromBits { + case 8: + fromStr = "b" + case 16: + fromStr = "h" + case 32: + fromStr = "w" + } + str = fmt.Sprintf("%sxt%s %s, %s", signedStr, fromStr, formatVRegSized(i.rd.nr(), toBits), formatVRegSized(i.rn.nr(), 32)) + case cSel: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("csel %s, %s, %s, %s", + formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rn.nr(), size), + formatVRegSized(i.rm.nr(), size), + condFlag(i.u1), + ) + case cSet: + if i.u2 != 0 { + str = fmt.Sprintf("csetm %s, %s", formatVRegSized(i.rd.nr(), 64), condFlag(i.u1)) + } else { + str = fmt.Sprintf("cset %s, %s", formatVRegSized(i.rd.nr(), 64), condFlag(i.u1)) + } + case cCmpImm: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("ccmp %s, #%#x, #%#x, %s", + formatVRegSized(i.rn.nr(), size), i.rm.data, + i.u2&0b1111, + condFlag(i.u1)) + case fpuMov64: + str = fmt.Sprintf("mov %s, %s", + formatVRegVec(i.rd.nr(), vecArrangement8B, vecIndexNone), + formatVRegVec(i.rn.nr(), vecArrangement8B, vecIndexNone)) + case 
fpuMov128: + str = fmt.Sprintf("mov %s, %s", + formatVRegVec(i.rd.nr(), vecArrangement16B, vecIndexNone), + formatVRegVec(i.rn.nr(), vecArrangement16B, vecIndexNone)) + case fpuMovFromVec: + panic("TODO") + case fpuRR: + dstSz := is64SizeBitToSize(i.u3) + srcSz := dstSz + op := fpuUniOp(i.u1) + switch op { + case fpuUniOpCvt32To64: + srcSz = 32 + case fpuUniOpCvt64To32: + srcSz = 64 + } + str = fmt.Sprintf("%s %s, %s", op.String(), + formatVRegSized(i.rd.nr(), dstSz), formatVRegSized(i.rn.nr(), srcSz)) + case fpuRRR: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("%s %s, %s, %s", fpuBinOp(i.u1).String(), + formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size)) + case fpuRRI: + panic("TODO") + case fpuRRRR: + panic("TODO") + case fpuCmp: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("fcmp %s, %s", + formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size)) + case fpuLoad32: + str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + case fpuStore32: + str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(64)) + case fpuLoad64: + str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 64), i.amode.format(64)) + case fpuStore64: + str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.amode.format(64)) + case fpuLoad128: + str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 128), i.amode.format(64)) + case fpuStore128: + str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 128), i.amode.format(64)) + case loadFpuConst32: + str = fmt.Sprintf("ldr %s, #8; b 8; data.f32 %f", formatVRegSized(i.rd.nr(), 32), math.Float32frombits(uint32(i.u1))) + case loadFpuConst64: + str = fmt.Sprintf("ldr %s, #8; b 16; data.f64 %f", formatVRegSized(i.rd.nr(), 64), math.Float64frombits(i.u1)) + case loadFpuConst128: + str = fmt.Sprintf("ldr %s, #8; b 32; data.v128 %016x %016x", + formatVRegSized(i.rd.nr(), 128), i.u1, i.u2) + case fpuToInt: + var op, src, dst string + if signed := i.u1 == 1; signed { + op = "fcvtzs" + } else { + op = "fcvtzu" + } + if src64 := i.u2 == 1; src64 { + src = formatVRegWidthVec(i.rn.nr(), vecArrangementD) + } else { + src = formatVRegWidthVec(i.rn.nr(), vecArrangementS) + } + if dst64 := i.u3 == 1; dst64 { + dst = formatVRegSized(i.rd.nr(), 64) + } else { + dst = formatVRegSized(i.rd.nr(), 32) + } + str = fmt.Sprintf("%s %s, %s", op, dst, src) + + case intToFpu: + var op, src, dst string + if signed := i.u1 == 1; signed { + op = "scvtf" + } else { + op = "ucvtf" + } + if src64 := i.u2 == 1; src64 { + src = formatVRegSized(i.rn.nr(), 64) + } else { + src = formatVRegSized(i.rn.nr(), 32) + } + if dst64 := i.u3 == 1; dst64 { + dst = formatVRegWidthVec(i.rd.nr(), vecArrangementD) + } else { + dst = formatVRegWidthVec(i.rd.nr(), vecArrangementS) + } + str = fmt.Sprintf("%s %s, %s", op, dst, src) + case fpuCSel: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("fcsel %s, %s, %s, %s", + formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rn.nr(), size), + formatVRegSized(i.rm.nr(), size), + condFlag(i.u1), + ) + case movToVec: + var size byte + arr := vecArrangement(i.u1) + switch arr { + case vecArrangementB, vecArrangementH, vecArrangementS: + size = 32 + case vecArrangementD: + size = 64 + default: + panic("unsupported arrangement " + arr.String()) + } + str = fmt.Sprintf("ins %s, %s", formatVRegVec(i.rd.nr(), arr, vecIndex(i.u2)), formatVRegSized(i.rn.nr(), size)) + case movFromVec, movFromVecSigned: + var size byte + var opcode 
string + arr := vecArrangement(i.u1) + signed := i.kind == movFromVecSigned + switch arr { + case vecArrangementB, vecArrangementH, vecArrangementS: + size = 32 + if signed { + opcode = "smov" + } else { + opcode = "umov" + } + case vecArrangementD: + size = 64 + if signed { + opcode = "smov" + } else { + opcode = "mov" + } + default: + panic("unsupported arrangement " + arr.String()) + } + str = fmt.Sprintf("%s %s, %s", opcode, formatVRegSized(i.rd.nr(), size), formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2))) + case vecDup: + str = fmt.Sprintf("dup %s, %s", + formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone), + formatVRegSized(i.rn.nr(), 64), + ) + case vecDupElement: + arr := vecArrangement(i.u1) + str = fmt.Sprintf("dup %s, %s", + formatVRegVec(i.rd.nr(), arr, vecIndexNone), + formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2)), + ) + case vecDupFromFpu: + panic("TODO") + case vecExtract: + str = fmt.Sprintf("ext %s, %s, %s, #%d", + formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone), + formatVRegVec(i.rn.nr(), vecArrangement(i.u1), vecIndexNone), + formatVRegVec(i.rm.nr(), vecArrangement(i.u1), vecIndexNone), + uint32(i.u2), + ) + case vecExtend: + panic("TODO") + case vecMovElement: + str = fmt.Sprintf("mov %s, %s", + formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndex(i.u2)), + formatVRegVec(i.rn.nr(), vecArrangement(i.u1), vecIndex(i.u3)), + ) + case vecMiscNarrow: + panic("TODO") + case vecRRR, vecRRRRewrite: + str = fmt.Sprintf("%s %s, %s, %s", + vecOp(i.u1), + formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone), + formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone), + formatVRegVec(i.rm.nr(), vecArrangement(i.u2), vecIndexNone), + ) + case vecMisc: + vop := vecOp(i.u1) + if vop == vecOpCmeq0 { + str = fmt.Sprintf("cmeq %s, %s, #0", + formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone), + formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone)) + } else { + str = fmt.Sprintf("%s %s, %s", + vop, + formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone), + formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone)) + } + case vecLanes: + arr := vecArrangement(i.u2) + var destArr vecArrangement + switch arr { + case vecArrangement8B, vecArrangement16B: + destArr = vecArrangementH + case vecArrangement4H, vecArrangement8H: + destArr = vecArrangementS + case vecArrangement4S: + destArr = vecArrangementD + default: + panic("invalid arrangement " + arr.String()) + } + str = fmt.Sprintf("%s %s, %s", + vecOp(i.u1), + formatVRegWidthVec(i.rd.nr(), destArr), + formatVRegVec(i.rn.nr(), arr, vecIndexNone)) + case vecShiftImm: + arr := vecArrangement(i.u2) + str = fmt.Sprintf("%s %s, %s, #%d", + vecOp(i.u1), + formatVRegVec(i.rd.nr(), arr, vecIndexNone), + formatVRegVec(i.rn.nr(), arr, vecIndexNone), + i.rm.shiftImm()) + case vecTbl: + arr := vecArrangement(i.u2) + str = fmt.Sprintf("tbl %s, { %s }, %s", + formatVRegVec(i.rd.nr(), arr, vecIndexNone), + formatVRegVec(i.rn.nr(), vecArrangement16B, vecIndexNone), + formatVRegVec(i.rm.nr(), arr, vecIndexNone)) + case vecTbl2: + arr := vecArrangement(i.u2) + rd, rn, rm := i.rd.nr(), i.rn.nr(), i.rm.nr() + rn1 := regalloc.FromRealReg(rn.RealReg()+1, rn.RegType()) + str = fmt.Sprintf("tbl %s, { %s, %s }, %s", + formatVRegVec(rd, arr, vecIndexNone), + formatVRegVec(rn, vecArrangement16B, vecIndexNone), + formatVRegVec(rn1, vecArrangement16B, vecIndexNone), + formatVRegVec(rm, arr, vecIndexNone)) + case vecPermute: + arr := vecArrangement(i.u2) + str = fmt.Sprintf("%s %s, %s, %s", + vecOp(i.u1), + 
formatVRegVec(i.rd.nr(), arr, vecIndexNone), + formatVRegVec(i.rn.nr(), arr, vecIndexNone), + formatVRegVec(i.rm.nr(), arr, vecIndexNone)) + case movToFPSR: + str = fmt.Sprintf("msr fpsr, %s", formatVRegSized(i.rn.nr(), 64)) + case movFromFPSR: + str = fmt.Sprintf("mrs %s fpsr", formatVRegSized(i.rd.nr(), 64)) + case call: + str = fmt.Sprintf("bl %s", ssa.FuncRef(i.u1)) + case callInd: + str = fmt.Sprintf("bl %s", formatVRegSized(i.rn.nr(), 64)) + case ret: + str = "ret" + case br: + target := label(i.u1) + if i.u3 != 0 { + str = fmt.Sprintf("b #%#x (%s)", i.brOffset(), target.String()) + } else { + str = fmt.Sprintf("b %s", target.String()) + } + case condBr: + size := is64SizeBitToSize(i.u3) + c := cond(i.u1) + target := label(i.u2) + switch c.kind() { + case condKindRegisterZero: + if !i.condBrOffsetResolved() { + str = fmt.Sprintf("cbz %s, (%s)", formatVRegSized(c.register(), size), target.String()) + } else { + str = fmt.Sprintf("cbz %s, #%#x %s", formatVRegSized(c.register(), size), i.condBrOffset(), target.String()) + } + case condKindRegisterNotZero: + if offset := i.condBrOffset(); offset != 0 { + str = fmt.Sprintf("cbnz %s, #%#x (%s)", formatVRegSized(c.register(), size), offset, target.String()) + } else { + str = fmt.Sprintf("cbnz %s, %s", formatVRegSized(c.register(), size), target.String()) + } + case condKindCondFlagSet: + if offset := i.condBrOffset(); offset != 0 { + if target == labelInvalid { + str = fmt.Sprintf("b.%s #%#x", c.flag(), offset) + } else { + str = fmt.Sprintf("b.%s #%#x, (%s)", c.flag(), offset, target.String()) + } + } else { + str = fmt.Sprintf("b.%s %s", c.flag(), target.String()) + } + } + case adr: + str = fmt.Sprintf("adr %s, #%#x", formatVRegSized(i.rd.nr(), 64), int64(i.u1)) + case brTableSequence: + targetIndex := i.u1 + str = fmt.Sprintf("br_table_sequence %s, table_index=%d", formatVRegSized(i.rn.nr(), 64), targetIndex) + case exitSequence: + str = fmt.Sprintf("exit_sequence %s", formatVRegSized(i.rn.nr(), 64)) + case atomicRmw: + m := atomicRmwOp(i.u1).String() + size := byte(32) + switch i.u2 { + case 8: + size = 64 + case 2: + m = m + "h" + case 1: + m = m + "b" + } + str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), 64)) + case atomicCas: + m := "casal" + size := byte(32) + switch i.u2 { + case 8: + size = 64 + case 2: + m = m + "h" + case 1: + m = m + "b" + } + str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rn.nr(), 64)) + case atomicLoad: + m := "ldar" + size := byte(32) + switch i.u2 { + case 8: + size = 64 + case 2: + m = m + "h" + case 1: + m = m + "b" + } + str = fmt.Sprintf("%s %s, %s", m, formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), 64)) + case atomicStore: + m := "stlr" + size := byte(32) + switch i.u2 { + case 8: + size = 64 + case 2: + m = m + "h" + case 1: + m = m + "b" + } + str = fmt.Sprintf("%s %s, %s", m, formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rn.nr(), 64)) + case dmb: + str = "dmb" + case udf: + str = "udf" + case emitSourceOffsetInfo: + str = fmt.Sprintf("source_offset_info %d", ssa.SourceOffset(i.u1)) + case vecLoad1R: + str = fmt.Sprintf("ld1r {%s}, [%s]", formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone), formatVRegSized(i.rn.nr(), 64)) + case loadConstBlockArg: + str = fmt.Sprintf("load_const_block_arg %s, %#x", formatVRegSized(i.rd.nr(), 64), i.u1) + default: + panic(i.kind) + } + return +} + +func (i *instruction) asAdr(rd 
regalloc.VReg, offset int64) { + i.kind = adr + i.rd = operandNR(rd) + i.u1 = uint64(offset) +} + +func (i *instruction) asAtomicRmw(op atomicRmwOp, rn, rs, rt operand, size uint64) { + i.kind = atomicRmw + i.rd, i.rn, i.rm = rt, rn, rs + i.u1 = uint64(op) + i.u2 = size +} + +func (i *instruction) asAtomicCas(rn, rs, rt operand, size uint64) { + i.kind = atomicCas + i.rm, i.rn, i.rd = rt, rn, rs + i.u2 = size +} + +func (i *instruction) asAtomicLoad(rn, rt operand, size uint64) { + i.kind = atomicLoad + i.rn, i.rd = rn, rt + i.u2 = size +} + +func (i *instruction) asAtomicStore(rn, rt operand, size uint64) { + i.kind = atomicStore + i.rn, i.rm = rn, rt + i.u2 = size +} + +func (i *instruction) asDMB() { + i.kind = dmb +} + +// TODO: delete unnecessary things. +const ( + // nop0 represents a no-op of zero size. + nop0 instructionKind = iota + 1 + // aluRRR represents an ALU operation with two register sources and a register destination. + aluRRR + // aluRRRR represents an ALU operation with three register sources and a register destination. + aluRRRR + // aluRRImm12 represents an ALU operation with a register source and an immediate-12 source, with a register destination. + aluRRImm12 + // aluRRBitmaskImm represents an ALU operation with a register source and a bitmask immediate, with a register destination. + aluRRBitmaskImm + // aluRRImmShift represents an ALU operation with a register source and an immediate-shifted source, with a register destination. + aluRRImmShift + // aluRRRShift represents an ALU operation with two register sources, one of which can be shifted, with a register destination. + aluRRRShift + // aluRRRExtend represents an ALU operation with two register sources, one of which can be extended, with a register destination. + aluRRRExtend + // bitRR represents a bit op instruction with a single register source. + bitRR + // uLoad8 represents an unsigned 8-bit load. + uLoad8 + // sLoad8 represents a signed 8-bit load into 64-bit register. + sLoad8 + // uLoad16 represents an unsigned 16-bit load into 64-bit register. + uLoad16 + // sLoad16 represents a signed 16-bit load into 64-bit register. + sLoad16 + // uLoad32 represents an unsigned 32-bit load into 64-bit register. + uLoad32 + // sLoad32 represents a signed 32-bit load into 64-bit register. + sLoad32 + // uLoad64 represents a 64-bit load. + uLoad64 + // store8 represents an 8-bit store. + store8 + // store16 represents a 16-bit store. + store16 + // store32 represents a 32-bit store. + store32 + // store64 represents a 64-bit store. + store64 + // storeP64 represents a store of a pair of registers. + storeP64 + // loadP64 represents a load of a pair of registers. + loadP64 + // mov64 represents a MOV instruction. These are encoded as ORR's but we keep them separate for better handling. + mov64 + // mov32 represents a 32-bit MOV. This zeroes the top 32 bits of the destination. + mov32 + // movZ represents a MOVZ with a 16-bit immediate. + movZ + // movN represents a MOVN with a 16-bit immediate. + movN + // movK represents a MOVK with a 16-bit immediate. + movK + // extend represents a sign- or zero-extend operation. + extend + // cSel represents a conditional-select operation. + cSel + // cSet represents a conditional-set operation. + cSet + // cCmpImm represents a conditional comparison with an immediate. + cCmpImm + // fpuMov64 represents a FPU move. Distinct from a vector-register move; moving just 64 bits appears to be significantly faster. + fpuMov64 + // fpuMov128 represents a vector register move. 
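+	//
+	// Illustrative aside, not part of the original source (i0..i3 and dst are
+	// hypothetical): a 64-bit constant with no compact encoding is typically
+	// materialized with one movZ followed by up to three movK instructions, 16 bits
+	// at a time; the shift argument of asMOVZ/asMOVK above is given in units of 16
+	// bits. For example, loading 0x123456789ABCDEF0 into dst:
+	//
+	//	i0.asMOVZ(dst, 0xDEF0, 0, true) // movz dst, #0xdef0, lsl 0   (clears the remaining bits)
+	//	i1.asMOVK(dst, 0x9ABC, 1, true) // movk dst, #0x9abc, lsl 16
+	//	i2.asMOVK(dst, 0x5678, 2, true) // movk dst, #0x5678, lsl 32
+	//	i3.asMOVK(dst, 0x1234, 3, true) // movk dst, #0x1234, lsl 48
+	//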
+ fpuMov128 + // fpuMovFromVec represents a move to scalar from a vector element. + fpuMovFromVec + // fpuRR represents a 1-op FPU instruction. + fpuRR + // fpuRRR represents a 2-op FPU instruction. + fpuRRR + // fpuRRI represents a 2-op FPU instruction with immediate value. + fpuRRI + // fpuRRRR represents a 3-op FPU instruction. + fpuRRRR + // fpuCmp represents a FPU comparison, either 32 or 64 bit. + fpuCmp + // fpuLoad32 represents a floating-point load, single-precision (32 bit). + fpuLoad32 + // fpuStore32 represents a floating-point store, single-precision (32 bit). + fpuStore32 + // fpuLoad64 represents a floating-point load, double-precision (64 bit). + fpuLoad64 + // fpuStore64 represents a floating-point store, double-precision (64 bit). + fpuStore64 + // fpuLoad128 represents a floating-point/vector load, 128 bit. + fpuLoad128 + // fpuStore128 represents a floating-point/vector store, 128 bit. + fpuStore128 + // loadFpuConst32 represents a load of a 32-bit floating-point constant. + loadFpuConst32 + // loadFpuConst64 represents a load of a 64-bit floating-point constant. + loadFpuConst64 + // loadFpuConst128 represents a load of a 128-bit floating-point constant. + loadFpuConst128 + // vecLoad1R represents a load of a one single-element structure that replicates to all lanes of a vector. + vecLoad1R + // fpuToInt represents a conversion from FP to integer. + fpuToInt + // intToFpu represents a conversion from integer to FP. + intToFpu + // fpuCSel represents a 32/64-bit FP conditional select. + fpuCSel + // movToVec represents a move to a vector element from a GPR. + movToVec + // movFromVec represents an unsigned move from a vector element to a GPR. + movFromVec + // movFromVecSigned represents a signed move from a vector element to a GPR. + movFromVecSigned + // vecDup represents a duplication of general-purpose register to vector. + vecDup + // vecDupElement represents a duplication of a vector element to vector or scalar. + vecDupElement + // vecDupFromFpu represents a duplication of scalar to vector. + vecDupFromFpu + // vecExtract represents a vector extraction operation. + vecExtract + // vecExtend represents a vector extension operation. + vecExtend + // vecMovElement represents a move vector element to another vector element operation. + vecMovElement + // vecMiscNarrow represents a vector narrowing operation. + vecMiscNarrow + // vecRRR represents a vector ALU operation. + vecRRR + // vecRRRRewrite is exactly the same as vecRRR except that this rewrites the destination register. + // For example, BSL instruction rewrites the destination register, and the existing value influences the result. + // Therefore, the "destination" register in vecRRRRewrite will be treated as "use" which makes the register outlive + // the instruction while this instruction doesn't have "def" in the context of register allocation. + vecRRRRewrite + // vecMisc represents a vector two register miscellaneous instruction. + vecMisc + // vecLanes represents a vector instruction across lanes. + vecLanes + // vecShiftImm represents a SIMD scalar shift by immediate instruction. + vecShiftImm + // vecTbl represents a table vector lookup - single register table. + vecTbl + // vecTbl2 represents a table vector lookup - two register table. + vecTbl2 + // vecPermute represents a vector permute instruction. + vecPermute + // movToNZCV represents a move to the FPSR. + movToFPSR + // movFromNZCV represents a move from the FPSR. + movFromFPSR + // call represents a machine call instruction. 
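+	//
+	// Illustrative aside, not part of the original source (fnRef, ptr and i are
+	// hypothetical): a direct call carries an ssa.FuncRef whose target address is only
+	// patched in by the relocation pass at encoding time, while an indirect call
+	// branches and links through a register operand.
+	//
+	//	i.asCall(fnRef, nil)       // direct call; encoded as a placeholder plus relocation info
+	//	i.asCallIndirect(ptr, nil) // branch-with-link through the register holding the code pointer
+	//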
+ call + // callInd represents a machine indirect-call instruction. + callInd + // ret represents a machine return instruction. + ret + // br represents an unconditional branch. + br + // condBr represents a conditional branch. + condBr + // adr represents a compute the address (using a PC-relative offset) of a memory location. + adr + // brTableSequence represents a jump-table sequence. + brTableSequence + // exitSequence consists of multiple instructions, and exits the execution immediately. + // See encodeExitSequence. + exitSequence + // atomicRmw represents an atomic read-modify-write operation with two register sources and a register destination. + atomicRmw + // atomicCas represents an atomic compare-and-swap operation with three register sources. The value is loaded to + // the source register containing the comparison value. + atomicCas + // atomicLoad represents an atomic load with one source register and a register destination. + atomicLoad + // atomicStore represents an atomic store with two source registers and no destination. + atomicStore + // dmb represents the data memory barrier instruction in inner-shareable (ish) mode. + dmb + // UDF is the undefined instruction. For debugging only. + udf + // loadConstBlockArg represents a load of a constant block argument. + loadConstBlockArg + + // emitSourceOffsetInfo is a dummy instruction to emit source offset info. + // The existence of this instruction does not affect the execution. + emitSourceOffsetInfo + + // ------------------- do not define below this line ------------------- + numInstructionKinds +) + +func (i *instruction) asLoadConstBlockArg(v uint64, typ ssa.Type, dst regalloc.VReg) *instruction { + i.kind = loadConstBlockArg + i.u1 = v + i.u2 = uint64(typ) + i.rd = operandNR(dst) + return i +} + +func (i *instruction) loadConstBlockArgData() (v uint64, typ ssa.Type, dst regalloc.VReg) { + return i.u1, ssa.Type(i.u2), i.rd.nr() +} + +func (i *instruction) asEmitSourceOffsetInfo(l ssa.SourceOffset) *instruction { + i.kind = emitSourceOffsetInfo + i.u1 = uint64(l) + return i +} + +func (i *instruction) sourceOffsetInfo() ssa.SourceOffset { + return ssa.SourceOffset(i.u1) +} + +func (i *instruction) asUDF() *instruction { + i.kind = udf + return i +} + +func (i *instruction) asFpuToInt(rd, rn operand, rdSigned, src64bit, dst64bit bool) { + i.kind = fpuToInt + i.rn = rn + i.rd = rd + if rdSigned { + i.u1 = 1 + } + if src64bit { + i.u2 = 1 + } + if dst64bit { + i.u3 = 1 + } +} + +func (i *instruction) asIntToFpu(rd, rn operand, rnSigned, src64bit, dst64bit bool) { + i.kind = intToFpu + i.rn = rn + i.rd = rd + if rnSigned { + i.u1 = 1 + } + if src64bit { + i.u2 = 1 + } + if dst64bit { + i.u3 = 1 + } +} + +func (i *instruction) asExitSequence(ctx regalloc.VReg) *instruction { + i.kind = exitSequence + i.rn = operandNR(ctx) + return i +} + +// aluOp determines the type of ALU operation. Instructions whose kind is one of +// aluRRR, aluRRRR, aluRRImm12, aluRRBitmaskImm, aluRRImmShift, aluRRRShift and aluRRRExtend +// would use this type. 
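+//
+// Illustrative aside, not part of the original source (the operands are hypothetical):
+// the concrete instruction kind is not chosen by the caller but derived by asALU from
+// the form of the rm operand, so a single aluOp such as aluOpAdd can lower to several
+// encodings:
+//
+//	i.asALU(aluOpAdd, rd, rn, operandNR(rm), true) // plain register form -> kind aluRRR
+//	// an rm operand in imm12, shifted-register or extended-register form instead
+//	// selects aluRRImm12, aluRRRShift or aluRRRExtend respectively (see asALU above).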
+type aluOp int + +func (a aluOp) String() string { + switch a { + case aluOpAdd: + return "add" + case aluOpSub: + return "sub" + case aluOpOrr: + return "orr" + case aluOpOrn: + return "orn" + case aluOpAnd: + return "and" + case aluOpAnds: + return "ands" + case aluOpBic: + return "bic" + case aluOpEor: + return "eor" + case aluOpAddS: + return "adds" + case aluOpSubS: + return "subs" + case aluOpSMulH: + return "sMulH" + case aluOpUMulH: + return "uMulH" + case aluOpSDiv: + return "sdiv" + case aluOpUDiv: + return "udiv" + case aluOpRotR: + return "ror" + case aluOpLsr: + return "lsr" + case aluOpAsr: + return "asr" + case aluOpLsl: + return "lsl" + case aluOpMAdd: + return "madd" + case aluOpMSub: + return "msub" + } + panic(int(a)) +} + +const ( + // 32/64-bit Add. + aluOpAdd aluOp = iota + // 32/64-bit Subtract. + aluOpSub + // 32/64-bit Bitwise OR. + aluOpOrr + // 32/64-bit Bitwise OR NOT. + aluOpOrn + // 32/64-bit Bitwise AND. + aluOpAnd + // 32/64-bit Bitwise ANDS. + aluOpAnds + // 32/64-bit Bitwise AND NOT. + aluOpBic + // 32/64-bit Bitwise XOR (Exclusive OR). + aluOpEor + // 32/64-bit Add setting flags. + aluOpAddS + // 32/64-bit Subtract setting flags. + aluOpSubS + // Signed multiply, high-word result. + aluOpSMulH + // Unsigned multiply, high-word result. + aluOpUMulH + // 64-bit Signed divide. + aluOpSDiv + // 64-bit Unsigned divide. + aluOpUDiv + // 32/64-bit Rotate right. + aluOpRotR + // 32/64-bit Logical shift right. + aluOpLsr + // 32/64-bit Arithmetic shift right. + aluOpAsr + // 32/64-bit Logical shift left. + aluOpLsl /// Multiply-add + + // MAdd and MSub are only applicable for aluRRRR. + aluOpMAdd + aluOpMSub +) + +// vecOp determines the type of vector operation. Instructions whose kind is one of +// vecOpCnt would use this type. +type vecOp int + +// String implements fmt.Stringer. 
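+//
+// Illustrative aside, not part of the original source (rd, rn, rm, ra are hypothetical
+// operands): aluOpMAdd and aluOpMSub above are only used with the aluRRRR kind, built
+// via asALURRRR. In AArch64, madd computes ra + rn*rm and msub computes ra - rn*rm; a
+// plain multiply is a madd with ra set to the zero register.
+//
+//	i.asALURRRR(aluOpMAdd, rd, rn, rm, ra, true) // madd rd, rn, rm, ra (64-bit): rd = ra + rn*rm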
+func (b vecOp) String() string { + switch b { + case vecOpCnt: + return "cnt" + case vecOpCmeq: + return "cmeq" + case vecOpCmgt: + return "cmgt" + case vecOpCmhi: + return "cmhi" + case vecOpCmge: + return "cmge" + case vecOpCmhs: + return "cmhs" + case vecOpFcmeq: + return "fcmeq" + case vecOpFcmgt: + return "fcmgt" + case vecOpFcmge: + return "fcmge" + case vecOpCmeq0: + return "cmeq0" + case vecOpUaddlv: + return "uaddlv" + case vecOpBit: + return "bit" + case vecOpBic: + return "bic" + case vecOpBsl: + return "bsl" + case vecOpNot: + return "not" + case vecOpAnd: + return "and" + case vecOpOrr: + return "orr" + case vecOpEOR: + return "eor" + case vecOpFadd: + return "fadd" + case vecOpAdd: + return "add" + case vecOpAddp: + return "addp" + case vecOpAddv: + return "addv" + case vecOpSub: + return "sub" + case vecOpFsub: + return "fsub" + case vecOpSmin: + return "smin" + case vecOpUmin: + return "umin" + case vecOpUminv: + return "uminv" + case vecOpSmax: + return "smax" + case vecOpUmax: + return "umax" + case vecOpUmaxp: + return "umaxp" + case vecOpUrhadd: + return "urhadd" + case vecOpFmul: + return "fmul" + case vecOpSqrdmulh: + return "sqrdmulh" + case vecOpMul: + return "mul" + case vecOpUmlal: + return "umlal" + case vecOpFdiv: + return "fdiv" + case vecOpFsqrt: + return "fsqrt" + case vecOpAbs: + return "abs" + case vecOpFabs: + return "fabs" + case vecOpNeg: + return "neg" + case vecOpFneg: + return "fneg" + case vecOpFrintp: + return "frintp" + case vecOpFrintm: + return "frintm" + case vecOpFrintn: + return "frintn" + case vecOpFrintz: + return "frintz" + case vecOpFcvtl: + return "fcvtl" + case vecOpFcvtn: + return "fcvtn" + case vecOpFcvtzu: + return "fcvtzu" + case vecOpFcvtzs: + return "fcvtzs" + case vecOpScvtf: + return "scvtf" + case vecOpUcvtf: + return "ucvtf" + case vecOpSqxtn: + return "sqxtn" + case vecOpUqxtn: + return "uqxtn" + case vecOpSqxtun: + return "sqxtun" + case vecOpRev64: + return "rev64" + case vecOpXtn: + return "xtn" + case vecOpShll: + return "shll" + case vecOpSshl: + return "sshl" + case vecOpSshll: + return "sshll" + case vecOpUshl: + return "ushl" + case vecOpUshll: + return "ushll" + case vecOpSshr: + return "sshr" + case vecOpZip1: + return "zip1" + case vecOpFmin: + return "fmin" + case vecOpFmax: + return "fmax" + case vecOpSmull: + return "smull" + case vecOpSmull2: + return "smull2" + } + panic(int(b)) +} + +const ( + vecOpCnt vecOp = iota + vecOpCmeq0 + vecOpCmeq + vecOpCmgt + vecOpCmhi + vecOpCmge + vecOpCmhs + vecOpFcmeq + vecOpFcmgt + vecOpFcmge + vecOpUaddlv + vecOpBit + vecOpBic + vecOpBsl + vecOpNot + vecOpAnd + vecOpOrr + vecOpEOR + vecOpAdd + vecOpFadd + vecOpAddv + vecOpSqadd + vecOpUqadd + vecOpAddp + vecOpSub + vecOpFsub + vecOpSqsub + vecOpUqsub + vecOpSmin + vecOpUmin + vecOpUminv + vecOpFmin + vecOpSmax + vecOpUmax + vecOpUmaxp + vecOpFmax + vecOpUrhadd + vecOpMul + vecOpFmul + vecOpSqrdmulh + vecOpUmlal + vecOpFdiv + vecOpFsqrt + vecOpAbs + vecOpFabs + vecOpNeg + vecOpFneg + vecOpFrintm + vecOpFrintn + vecOpFrintp + vecOpFrintz + vecOpFcvtl + vecOpFcvtn + vecOpFcvtzs + vecOpFcvtzu + vecOpScvtf + vecOpUcvtf + vecOpSqxtn + vecOpSqxtun + vecOpUqxtn + vecOpRev64 + vecOpXtn + vecOpShll + vecOpSshl + vecOpSshll + vecOpUshl + vecOpUshll + vecOpSshr + vecOpZip1 + vecOpSmull + vecOpSmull2 +) + +// bitOp determines the type of bitwise operation. Instructions whose kind is one of +// bitOpRbit and bitOpClz would use this type. +type bitOp int + +// String implements fmt.Stringer. 
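+//
+// Illustrative aside, not part of the original source (the operands are hypothetical):
+// a lane-wise vector operation pairs a vecOp with a vecArrangement, and String() above
+// renders the arrangement as the lane suffix, e.g. something like "add v0.4s, v1.4s, v2.4s":
+//
+//	i.asVecRRR(vecOpAdd, rd, rn, rm, vecArrangement4S) // 4 x 32-bit integer add
+//	// ops that also read the destination (bsl, bit, umlal) must be built with
+//	// asVecRRRRewrite instead, so the register allocator treats rd as a use.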
+func (b bitOp) String() string { + switch b { + case bitOpRbit: + return "rbit" + case bitOpClz: + return "clz" + } + panic(int(b)) +} + +const ( + // 32/64-bit Rbit. + bitOpRbit bitOp = iota + // 32/64-bit Clz. + bitOpClz +) + +// fpuUniOp represents a unary floating-point unit (FPU) operation. +type fpuUniOp byte + +const ( + fpuUniOpNeg fpuUniOp = iota + fpuUniOpCvt32To64 + fpuUniOpCvt64To32 + fpuUniOpSqrt + fpuUniOpRoundPlus + fpuUniOpRoundMinus + fpuUniOpRoundZero + fpuUniOpRoundNearest + fpuUniOpAbs +) + +// String implements the fmt.Stringer. +func (f fpuUniOp) String() string { + switch f { + case fpuUniOpNeg: + return "fneg" + case fpuUniOpCvt32To64: + return "fcvt" + case fpuUniOpCvt64To32: + return "fcvt" + case fpuUniOpSqrt: + return "fsqrt" + case fpuUniOpRoundPlus: + return "frintp" + case fpuUniOpRoundMinus: + return "frintm" + case fpuUniOpRoundZero: + return "frintz" + case fpuUniOpRoundNearest: + return "frintn" + case fpuUniOpAbs: + return "fabs" + } + panic(int(f)) +} + +// fpuBinOp represents a binary floating-point unit (FPU) operation. +type fpuBinOp byte + +const ( + fpuBinOpAdd = iota + fpuBinOpSub + fpuBinOpMul + fpuBinOpDiv + fpuBinOpMax + fpuBinOpMin +) + +// String implements the fmt.Stringer. +func (f fpuBinOp) String() string { + switch f { + case fpuBinOpAdd: + return "fadd" + case fpuBinOpSub: + return "fsub" + case fpuBinOpMul: + return "fmul" + case fpuBinOpDiv: + return "fdiv" + case fpuBinOpMax: + return "fmax" + case fpuBinOpMin: + return "fmin" + } + panic(int(f)) +} + +// extMode represents the mode of a register operand extension. +// For example, aluRRRExtend instructions need this info to determine the extensions. +type extMode byte + +const ( + extModeNone extMode = iota + // extModeZeroExtend64 suggests a zero-extension to 32 bits if the original bit size is less than 32. + extModeZeroExtend32 + // extModeSignExtend64 stands for a sign-extension to 32 bits if the original bit size is less than 32. + extModeSignExtend32 + // extModeZeroExtend64 suggests a zero-extension to 64 bits if the original bit size is less than 64. + extModeZeroExtend64 + // extModeSignExtend64 stands for a sign-extension to 64 bits if the original bit size is less than 64. + extModeSignExtend64 +) + +func (e extMode) bits() byte { + switch e { + case extModeZeroExtend32, extModeSignExtend32: + return 32 + case extModeZeroExtend64, extModeSignExtend64: + return 64 + default: + return 0 + } +} + +func (e extMode) signed() bool { + switch e { + case extModeSignExtend32, extModeSignExtend64: + return true + default: + return false + } +} + +func extModeOf(t ssa.Type, signed bool) extMode { + switch t.Bits() { + case 32: + if signed { + return extModeSignExtend32 + } + return extModeZeroExtend32 + case 64: + if signed { + return extModeSignExtend64 + } + return extModeZeroExtend64 + default: + panic("TODO? do we need narrower than 32 bits?") + } +} + +type extendOp byte + +const ( + extendOpUXTB extendOp = 0b000 + extendOpUXTH extendOp = 0b001 + extendOpUXTW extendOp = 0b010 + // extendOpUXTX does nothing, but convenient symbol that officially exists. See: + // https://stackoverflow.com/questions/72041372/what-do-the-uxtx-and-sxtx-extensions-mean-for-32-bit-aarch64-adds-instruct + extendOpUXTX extendOp = 0b011 + extendOpSXTB extendOp = 0b100 + extendOpSXTH extendOp = 0b101 + extendOpSXTW extendOp = 0b110 + // extendOpSXTX does nothing, but convenient symbol that officially exists. 
See: + // https://stackoverflow.com/questions/72041372/what-do-the-uxtx-and-sxtx-extensions-mean-for-32-bit-aarch64-adds-instruct + extendOpSXTX extendOp = 0b111 + extendOpNone extendOp = 0xff +) + +func (e extendOp) srcBits() byte { + switch e { + case extendOpUXTB, extendOpSXTB: + return 8 + case extendOpUXTH, extendOpSXTH: + return 16 + case extendOpUXTW, extendOpSXTW: + return 32 + case extendOpUXTX, extendOpSXTX: + return 64 + } + panic(int(e)) +} + +func (e extendOp) String() string { + switch e { + case extendOpUXTB: + return "UXTB" + case extendOpUXTH: + return "UXTH" + case extendOpUXTW: + return "UXTW" + case extendOpUXTX: + return "UXTX" + case extendOpSXTB: + return "SXTB" + case extendOpSXTH: + return "SXTH" + case extendOpSXTW: + return "SXTW" + case extendOpSXTX: + return "SXTX" + } + panic(int(e)) +} + +func extendOpFrom(signed bool, from byte) extendOp { + switch from { + case 8: + if signed { + return extendOpSXTB + } + return extendOpUXTB + case 16: + if signed { + return extendOpSXTH + } + return extendOpUXTH + case 32: + if signed { + return extendOpSXTW + } + return extendOpUXTW + case 64: + if signed { + return extendOpSXTX + } + return extendOpUXTX + } + panic("invalid extendOpFrom") +} + +type shiftOp byte + +const ( + shiftOpLSL shiftOp = 0b00 + shiftOpLSR shiftOp = 0b01 + shiftOpASR shiftOp = 0b10 + shiftOpROR shiftOp = 0b11 +) + +func (s shiftOp) String() string { + switch s { + case shiftOpLSL: + return "lsl" + case shiftOpLSR: + return "lsr" + case shiftOpASR: + return "asr" + case shiftOpROR: + return "ror" + } + panic(int(s)) +} + +const exitSequenceSize = 6 * 4 // 6 instructions as in encodeExitSequence. + +// size returns the size of the instruction in encoded bytes. +func (i *instruction) size() int64 { + switch i.kind { + case exitSequence: + return exitSequenceSize // 5 instructions as in encodeExitSequence. + case nop0, loadConstBlockArg: + return 0 + case emitSourceOffsetInfo: + return 0 + case loadFpuConst32: + if i.u1 == 0 { + return 4 // zero loading can be encoded as a single instruction. + } + return 4 + 4 + 4 + case loadFpuConst64: + if i.u1 == 0 { + return 4 // zero loading can be encoded as a single instruction. + } + return 4 + 4 + 8 + case loadFpuConst128: + if i.u1 == 0 && i.u2 == 0 { + return 4 // zero loading can be encoded as a single instruction. + } + return 4 + 4 + 16 + case brTableSequence: + return 4*4 + int64(i.u2)*4 + default: + return 4 + } +} + +// vecArrangement is the arrangement of data within a vector register. +type vecArrangement byte + +const ( + // vecArrangementNone is an arrangement indicating no data is stored. + vecArrangementNone vecArrangement = iota + // vecArrangement8B is an arrangement of 8 bytes (64-bit vector) + vecArrangement8B + // vecArrangement16B is an arrangement of 16 bytes (128-bit vector) + vecArrangement16B + // vecArrangement4H is an arrangement of 4 half precisions (64-bit vector) + vecArrangement4H + // vecArrangement8H is an arrangement of 8 half precisions (128-bit vector) + vecArrangement8H + // vecArrangement2S is an arrangement of 2 single precisions (64-bit vector) + vecArrangement2S + // vecArrangement4S is an arrangement of 4 single precisions (128-bit vector) + vecArrangement4S + // vecArrangement1D is an arrangement of 1 double precision (64-bit vector) + vecArrangement1D + // vecArrangement2D is an arrangement of 2 double precisions (128-bit vector) + vecArrangement2D + + // Assign each vector size specifier to a vector arrangement ID. 
+ // Instructions can only have an arrangement or a size specifier, but not both, so it + // simplifies the internal representation of vector instructions by being able to + // store either into the same field. + + // vecArrangementB is a size specifier of byte + vecArrangementB + // vecArrangementH is a size specifier of word (16-bit) + vecArrangementH + // vecArrangementS is a size specifier of double word (32-bit) + vecArrangementS + // vecArrangementD is a size specifier of quad word (64-bit) + vecArrangementD + // vecArrangementQ is a size specifier of the entire vector (128-bit) + vecArrangementQ +) + +// String implements fmt.Stringer +func (v vecArrangement) String() (ret string) { + switch v { + case vecArrangement8B: + ret = "8B" + case vecArrangement16B: + ret = "16B" + case vecArrangement4H: + ret = "4H" + case vecArrangement8H: + ret = "8H" + case vecArrangement2S: + ret = "2S" + case vecArrangement4S: + ret = "4S" + case vecArrangement1D: + ret = "1D" + case vecArrangement2D: + ret = "2D" + case vecArrangementB: + ret = "B" + case vecArrangementH: + ret = "H" + case vecArrangementS: + ret = "S" + case vecArrangementD: + ret = "D" + case vecArrangementQ: + ret = "Q" + case vecArrangementNone: + ret = "none" + default: + panic(v) + } + return +} + +// vecIndex is the index of an element of a vector register +type vecIndex byte + +// vecIndexNone indicates no vector index specified. +const vecIndexNone = ^vecIndex(0) + +func ssaLaneToArrangement(lane ssa.VecLane) vecArrangement { + switch lane { + case ssa.VecLaneI8x16: + return vecArrangement16B + case ssa.VecLaneI16x8: + return vecArrangement8H + case ssa.VecLaneI32x4: + return vecArrangement4S + case ssa.VecLaneI64x2: + return vecArrangement2D + case ssa.VecLaneF32x4: + return vecArrangement4S + case ssa.VecLaneF64x2: + return vecArrangement2D + default: + panic(lane) + } +} + +// atomicRmwOp is the type of atomic read-modify-write operation. +type atomicRmwOp byte + +const ( + // atomicRmwOpAdd is an atomic add operation. + atomicRmwOpAdd atomicRmwOp = iota + // atomicRmwOpClr is an atomic clear operation, i.e. AND NOT. + atomicRmwOpClr + // atomicRmwOpSet is an atomic set operation, i.e. OR. + atomicRmwOpSet + // atomicRmwOpEor is an atomic exclusive OR operation. + atomicRmwOpEor + // atomicRmwOpSwp is an atomic swap operation. + atomicRmwOpSwp +) + +// String implements fmt.Stringer +func (a atomicRmwOp) String() string { + switch a { + case atomicRmwOpAdd: + return "ldaddal" + case atomicRmwOpClr: + return "ldclral" + case atomicRmwOpSet: + return "ldsetal" + case atomicRmwOpEor: + return "ldeoral" + case atomicRmwOpSwp: + return "swpal" + } + panic(fmt.Sprintf("unknown atomicRmwOp: %d", a)) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go new file mode 100644 index 000000000..227a96474 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go @@ -0,0 +1,2351 @@ +package arm64 + +import ( + "context" + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// Encode implements backend.Machine Encode. 
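+//
+// Illustrative note, not part of the original source: encoding is a two-step process.
+// The machine first lays every instruction out using size() (4 bytes for most kinds,
+// longer for constant loads, branch tables and the exit sequence) so that branch
+// offsets can be resolved, and only then walks the instruction list emitting machine
+// code words, roughly:
+//
+//	m.resolveRelativeAddresses(ctx)          // assign byte offsets using size()
+//	m.encode(m.executableContext.RootInstr)  // walk the linked list and emit words
+//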
+func (m *machine) Encode(ctx context.Context) error { + m.resolveRelativeAddresses(ctx) + m.encode(m.executableContext.RootInstr) + if l := len(m.compiler.Buf()); l > maxFunctionExecutableSize { + return fmt.Errorf("function size exceeds the limit: %d > %d", l, maxFunctionExecutableSize) + } + return nil +} + +func (m *machine) encode(root *instruction) { + for cur := root; cur != nil; cur = cur.next { + cur.encode(m) + } +} + +func (i *instruction) encode(m *machine) { + c := m.compiler + switch kind := i.kind; kind { + case nop0, emitSourceOffsetInfo, loadConstBlockArg: + case exitSequence: + encodeExitSequence(c, i.rn.reg()) + case ret: + // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/RET--Return-from-subroutine-?lang=en + c.Emit4Bytes(encodeRet()) + case br: + imm := i.brOffset() + c.Emit4Bytes(encodeUnconditionalBranch(false, imm)) + case call: + // We still don't know the exact address of the function to call, so we emit a placeholder. + c.AddRelocationInfo(i.callFuncRef()) + c.Emit4Bytes(encodeUnconditionalBranch(true, 0)) // 0 = placeholder + case callInd: + c.Emit4Bytes(encodeUnconditionalBranchReg(regNumberInEncoding[i.rn.realReg()], true)) + case store8, store16, store32, store64, fpuStore32, fpuStore64, fpuStore128: + c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rn.realReg()], i.amode)) + case uLoad8, uLoad16, uLoad32, uLoad64, sLoad8, sLoad16, sLoad32, fpuLoad32, fpuLoad64, fpuLoad128: + c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rd.realReg()], i.amode)) + case vecLoad1R: + c.Emit4Bytes(encodeVecLoad1R( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + vecArrangement(i.u1))) + case condBr: + imm19 := i.condBrOffset() + if imm19%4 != 0 { + panic("imm26 for branch must be a multiple of 4") + } + + imm19U32 := uint32(imm19/4) & 0b111_11111111_11111111 + brCond := i.condBrCond() + switch brCond.kind() { + case condKindRegisterZero: + rt := regNumberInEncoding[brCond.register().RealReg()] + c.Emit4Bytes(encodeCBZCBNZ(rt, false, imm19U32, i.condBr64bit())) + case condKindRegisterNotZero: + rt := regNumberInEncoding[brCond.register().RealReg()] + c.Emit4Bytes(encodeCBZCBNZ(rt, true, imm19U32, i.condBr64bit())) + case condKindCondFlagSet: + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/B-cond--Branch-conditionally- + fl := brCond.flag() + c.Emit4Bytes(0b01010100<<24 | (imm19U32 << 5) | uint32(fl)) + default: + panic("BUG") + } + case movN: + c.Emit4Bytes(encodeMoveWideImmediate(0b00, regNumberInEncoding[i.rd.realReg()], i.u1, i.u2, i.u3)) + case movZ: + c.Emit4Bytes(encodeMoveWideImmediate(0b10, regNumberInEncoding[i.rd.realReg()], i.u1, i.u2, i.u3)) + case movK: + c.Emit4Bytes(encodeMoveWideImmediate(0b11, regNumberInEncoding[i.rd.realReg()], i.u1, i.u2, i.u3)) + case mov32: + to, from := i.rd.realReg(), i.rn.realReg() + c.Emit4Bytes(encodeAsMov32(regNumberInEncoding[from], regNumberInEncoding[to])) + case mov64: + to, from := i.rd.realReg(), i.rn.realReg() + toIsSp := to == sp + fromIsSp := from == sp + c.Emit4Bytes(encodeMov64(regNumberInEncoding[to], regNumberInEncoding[from], toIsSp, fromIsSp)) + case loadP64, storeP64: + rt, rt2 := regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()] + amode := i.amode + rn := regNumberInEncoding[amode.rn.RealReg()] + var pre bool + switch amode.kind { + case addressModeKindPostIndex: + case addressModeKindPreIndex: + pre = true + default: + panic("BUG") + } + 
c.Emit4Bytes(encodePreOrPostIndexLoadStorePair64(pre, kind == loadP64, rn, rt, rt2, amode.imm)) + case loadFpuConst32: + rd := regNumberInEncoding[i.rd.realReg()] + if i.u1 == 0 { + c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement8B)) + } else { + encodeLoadFpuConst32(c, rd, i.u1) + } + case loadFpuConst64: + rd := regNumberInEncoding[i.rd.realReg()] + if i.u1 == 0 { + c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement8B)) + } else { + encodeLoadFpuConst64(c, regNumberInEncoding[i.rd.realReg()], i.u1) + } + case loadFpuConst128: + rd := regNumberInEncoding[i.rd.realReg()] + lo, hi := i.u1, i.u2 + if lo == 0 && hi == 0 { + c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement16B)) + } else { + encodeLoadFpuConst128(c, rd, lo, hi) + } + case aluRRRR: + c.Emit4Bytes(encodeAluRRRR( + aluOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + regNumberInEncoding[i.ra.realReg()], + uint32(i.u3), + )) + case aluRRImmShift: + c.Emit4Bytes(encodeAluRRImm( + aluOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + uint32(i.rm.shiftImm()), + uint32(i.u3), + )) + case aluRRR: + rn := i.rn.realReg() + c.Emit4Bytes(encodeAluRRR( + aluOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[rn], + regNumberInEncoding[i.rm.realReg()], + i.u3 == 1, + rn == sp, + )) + case aluRRRExtend: + rm, exo, to := i.rm.er() + c.Emit4Bytes(encodeAluRRRExtend( + aluOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[rm.RealReg()], + exo, + to, + )) + case aluRRRShift: + r, amt, sop := i.rm.sr() + c.Emit4Bytes(encodeAluRRRShift( + aluOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[r.RealReg()], + uint32(amt), + sop, + i.u3 == 1, + )) + case aluRRBitmaskImm: + c.Emit4Bytes(encodeAluBitmaskImmediate( + aluOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + i.u2, + i.u3 == 1, + )) + case bitRR: + c.Emit4Bytes(encodeBitRR( + bitOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + uint32(i.u2)), + ) + case aluRRImm12: + imm12, shift := i.rm.imm12() + c.Emit4Bytes(encodeAluRRImm12( + aluOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + imm12, shift, + i.u3 == 1, + )) + case fpuRRR: + c.Emit4Bytes(encodeFpuRRR( + fpuBinOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + i.u3 == 1, + )) + case fpuMov64, fpuMov128: + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/MOV--vector---Move-vector--an-alias-of-ORR--vector--register-- + rd := regNumberInEncoding[i.rd.realReg()] + rn := regNumberInEncoding[i.rn.realReg()] + var q uint32 + if kind == fpuMov128 { + q = 0b1 + } + c.Emit4Bytes(q<<30 | 0b1110101<<21 | rn<<16 | 0b000111<<10 | rn<<5 | rd) + case cSet: + rd := regNumberInEncoding[i.rd.realReg()] + cf := condFlag(i.u1) + if i.u2 == 1 { + // https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/CSETM--Conditional-Set-Mask--an-alias-of-CSINV- + // Note that we set 64bit version here. 
+ c.Emit4Bytes(0b1101101010011111<<16 | uint32(cf.invert())<<12 | 0b011111<<5 | rd) + } else { + // https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/CSET--Conditional-Set--an-alias-of-CSINC- + // Note that we set 64bit version here. + c.Emit4Bytes(0b1001101010011111<<16 | uint32(cf.invert())<<12 | 0b111111<<5 | rd) + } + case extend: + c.Emit4Bytes(encodeExtend(i.u3 == 1, byte(i.u1), byte(i.u2), regNumberInEncoding[i.rd.realReg()], regNumberInEncoding[i.rn.realReg()])) + case fpuCmp: + // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/FCMP--Floating-point-quiet-Compare--scalar--?lang=en + rn, rm := regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()] + var ftype uint32 + if i.u3 == 1 { + ftype = 0b01 // double precision. + } + c.Emit4Bytes(0b1111<<25 | ftype<<22 | 1<<21 | rm<<16 | 0b1<<13 | rn<<5) + case udf: + // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/UDF--Permanently-Undefined-?lang=en + if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable { + c.Emit4Bytes(dummyInstruction) + } else { + c.Emit4Bytes(0) + } + case adr: + c.Emit4Bytes(encodeAdr(regNumberInEncoding[i.rd.realReg()], uint32(i.u1))) + case cSel: + c.Emit4Bytes(encodeConditionalSelect( + kind, + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + condFlag(i.u1), + i.u3 == 1, + )) + case fpuCSel: + c.Emit4Bytes(encodeFpuCSel( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + condFlag(i.u1), + i.u3 == 1, + )) + case movToVec: + c.Emit4Bytes(encodeMoveToVec( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + vecArrangement(byte(i.u1)), + vecIndex(i.u2), + )) + case movFromVec, movFromVecSigned: + c.Emit4Bytes(encodeMoveFromVec( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + vecArrangement(byte(i.u1)), + vecIndex(i.u2), + i.kind == movFromVecSigned, + )) + case vecDup: + c.Emit4Bytes(encodeVecDup( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + vecArrangement(byte(i.u1)))) + case vecDupElement: + c.Emit4Bytes(encodeVecDupElement( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + vecArrangement(byte(i.u1)), + vecIndex(i.u2))) + case vecExtract: + c.Emit4Bytes(encodeVecExtract( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + vecArrangement(byte(i.u1)), + uint32(i.u2))) + case vecPermute: + c.Emit4Bytes(encodeVecPermute( + vecOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + vecArrangement(byte(i.u2)))) + case vecMovElement: + c.Emit4Bytes(encodeVecMovElement( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + vecArrangement(i.u1), + uint32(i.u2), uint32(i.u3), + )) + case vecMisc: + c.Emit4Bytes(encodeAdvancedSIMDTwoMisc( + vecOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + vecArrangement(i.u2), + )) + case vecLanes: + c.Emit4Bytes(encodeVecLanes( + vecOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + vecArrangement(i.u2), + )) + case vecShiftImm: + c.Emit4Bytes(encodeVecShiftImm( + vecOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + uint32(i.rm.shiftImm()), + 
vecArrangement(i.u2), + )) + case vecTbl: + c.Emit4Bytes(encodeVecTbl( + 1, + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + vecArrangement(i.u2)), + ) + case vecTbl2: + c.Emit4Bytes(encodeVecTbl( + 2, + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + vecArrangement(i.u2)), + ) + case brTableSequence: + targets := m.jmpTableTargets[i.u1] + encodeBrTableSequence(c, i.rn.reg(), targets) + case fpuToInt, intToFpu: + c.Emit4Bytes(encodeCnvBetweenFloatInt(i)) + case fpuRR: + c.Emit4Bytes(encodeFloatDataOneSource( + fpuUniOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + i.u3 == 1, + )) + case vecRRR: + if op := vecOp(i.u1); op == vecOpBsl || op == vecOpBit || op == vecOpUmlal { + panic(fmt.Sprintf("vecOp %s must use vecRRRRewrite instead of vecRRR", op.String())) + } + fallthrough + case vecRRRRewrite: + c.Emit4Bytes(encodeVecRRR( + vecOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + vecArrangement(i.u2), + )) + case cCmpImm: + // Conditional compare (immediate) in https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en + sf := uint32(i.u3 & 0b1) + nzcv := uint32(i.u2 & 0b1111) + cond := uint32(condFlag(i.u1)) + imm := uint32(i.rm.data & 0b11111) + rn := regNumberInEncoding[i.rn.realReg()] + c.Emit4Bytes( + sf<<31 | 0b111101001<<22 | imm<<16 | cond<<12 | 0b1<<11 | rn<<5 | nzcv, + ) + case movFromFPSR: + rt := regNumberInEncoding[i.rd.realReg()] + c.Emit4Bytes(encodeSystemRegisterMove(rt, true)) + case movToFPSR: + rt := regNumberInEncoding[i.rn.realReg()] + c.Emit4Bytes(encodeSystemRegisterMove(rt, false)) + case atomicRmw: + c.Emit4Bytes(encodeAtomicRmw( + atomicRmwOp(i.u1), + regNumberInEncoding[i.rm.realReg()], + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + uint32(i.u2), + )) + case atomicCas: + c.Emit4Bytes(encodeAtomicCas( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rm.realReg()], + regNumberInEncoding[i.rn.realReg()], + uint32(i.u2), + )) + case atomicLoad: + c.Emit4Bytes(encodeAtomicLoadStore( + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rd.realReg()], + uint32(i.u2), + 1, + )) + case atomicStore: + c.Emit4Bytes(encodeAtomicLoadStore( + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + uint32(i.u2), + 0, + )) + case dmb: + c.Emit4Bytes(encodeDMB()) + default: + panic(i.String()) + } +} + +func encodeMov64(rd, rn uint32, toIsSp, fromIsSp bool) uint32 { + if toIsSp || fromIsSp { + // This is an alias of ADD (immediate): + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/MOV--to-from-SP---Move-between-register-and-stack-pointer--an-alias-of-ADD--immediate-- + return encodeAddSubtractImmediate(0b100, 0, 0, rn, rd) + } else { + // This is an alias of ORR (shifted register): + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/MOV--register---Move--register---an-alias-of-ORR--shifted-register-- + return encodeLogicalShiftedRegister(0b101, 0, rn, 0, regNumberInEncoding[xzr], rd) + } +} + +// encodeSystemRegisterMove encodes as "System register move" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en +// +// Note that currently we only supports 
read/write of FPSR. +func encodeSystemRegisterMove(rt uint32, fromSystem bool) uint32 { + ret := 0b11010101<<24 | 0b11011<<16 | 0b01000100<<8 | 0b001<<5 | rt + if fromSystem { + ret |= 0b1 << 21 + } + return ret +} + +// encodeVecRRR encodes as either "Advanced SIMD three *" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func encodeVecRRR(op vecOp, rd, rn, rm uint32, arr vecArrangement) uint32 { + switch op { + case vecOpBit: + _, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, 0b10 /* always has size 0b10 */, 0b1, q) + case vecOpBic: + if arr > vecArrangement16B { + panic("unsupported arrangement: " + arr.String()) + } + _, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, 0b01 /* always has size 0b01 */, 0b0, q) + case vecOpBsl: + if arr > vecArrangement16B { + panic("unsupported arrangement: " + arr.String()) + } + _, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, 0b01 /* always has size 0b01 */, 0b1, q) + case vecOpAnd: + if arr > vecArrangement16B { + panic("unsupported arrangement: " + arr.String()) + } + _, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, 0b00 /* always has size 0b00 */, 0b0, q) + case vecOpOrr: + _, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, 0b10 /* always has size 0b10 */, 0b0, q) + case vecOpEOR: + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, size, 0b1, q) + case vecOpCmeq: + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10001, size, 0b1, q) + case vecOpCmgt: + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00110, size, 0b0, q) + case vecOpCmhi: + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00110, size, 0b1, q) + case vecOpCmge: + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00111, size, 0b0, q) + case vecOpCmhs: + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00111, size, 0b1, q) + case vecOpFcmeq: + var size, q uint32 + switch arr { + case vecArrangement4S: + size, q = 0b00, 0b1 + case vecArrangement2S: + size, q = 0b00, 0b0 + case vecArrangement2D: + size, q = 0b01, 0b1 + default: + panic("unsupported arrangement: " + arr.String()) + } + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11100, size, 0b0, q) + case vecOpFcmgt: + if arr < vecArrangement2S || arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11100, size, 0b1, q) + case vecOpFcmge: + var size, q uint32 + switch arr { + case vecArrangement4S: + size, q = 0b00, 0b1 + case vecArrangement2S: + size, q = 0b00, 0b0 + case vecArrangement2D: + size, q = 0b01, 0b1 + default: + panic("unsupported arrangement: " + arr.String()) + } + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11100, size, 0b1, q) + case vecOpAdd: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10000, size, 0b0, q) + case vecOpSqadd: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return 
encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00001, size, 0b0, q) + case vecOpUqadd: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00001, size, 0b1, q) + case vecOpAddp: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10111, size, 0b0, q) + case vecOpSqsub: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00101, size, 0b0, q) + case vecOpUqsub: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00101, size, 0b1, q) + case vecOpSub: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10000, size, 0b1, q) + case vecOpFmin: + if arr < vecArrangement2S || arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11110, size, 0b0, q) + case vecOpSmin: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01101, size, 0b0, q) + case vecOpUmin: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01101, size, 0b1, q) + case vecOpFmax: + var size, q uint32 + switch arr { + case vecArrangement4S: + size, q = 0b00, 0b1 + case vecArrangement2S: + size, q = 0b00, 0b0 + case vecArrangement2D: + size, q = 0b01, 0b1 + default: + panic("unsupported arrangement: " + arr.String()) + } + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11110, size, 0b0, q) + case vecOpFadd: + var size, q uint32 + switch arr { + case vecArrangement4S: + size, q = 0b00, 0b1 + case vecArrangement2S: + size, q = 0b00, 0b0 + case vecArrangement2D: + size, q = 0b01, 0b1 + default: + panic("unsupported arrangement: " + arr.String()) + } + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11010, size, 0b0, q) + case vecOpFsub: + if arr < vecArrangement2S || arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11010, size, 0b0, q) + case vecOpFmul: + var size, q uint32 + switch arr { + case vecArrangement4S: + size, q = 0b00, 0b1 + case vecArrangement2S: + size, q = 0b00, 0b0 + case vecArrangement2D: + size, q = 0b01, 0b1 + default: + panic("unsupported arrangement: " + arr.String()) + } + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11011, size, 0b1, q) + case vecOpSqrdmulh: + if arr < vecArrangement4H || arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10110, size, 0b1, q) + case vecOpFdiv: + var size, q uint32 + switch arr { + case vecArrangement4S: + size, q = 0b00, 0b1 + case vecArrangement2S: + size, q = 0b00, 0b0 + case vecArrangement2D: + size, q = 0b01, 0b1 + default: + panic("unsupported arrangement: " + arr.String()) + } + return 
encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11111, size, 0b1, q) + case vecOpSmax: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01100, size, 0b0, q) + case vecOpUmax: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01100, size, 0b1, q) + case vecOpUmaxp: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10100, size, 0b1, q) + case vecOpUrhadd: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00010, size, 0b1, q) + case vecOpMul: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10011, size, 0b0, q) + case vecOpUmlal: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeDifferent(rd, rn, rm, 0b1000, size, 0b1, q) + case vecOpSshl: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01000, size, 0b0, q) + case vecOpUshl: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01000, size, 0b1, q) + + case vecOpSmull: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, _ := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeDifferent(rd, rn, rm, 0b1100, size, 0b0, 0b0) + + case vecOpSmull2: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, _ := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeDifferent(rd, rn, rm, 0b1100, size, 0b0, 0b1) + + default: + panic("TODO: " + op.String()) + } +} + +func arrToSizeQEncoded(arr vecArrangement) (size, q uint32) { + switch arr { + case vecArrangement16B: + q = 0b1 + fallthrough + case vecArrangement8B: + size = 0b00 + case vecArrangement8H: + q = 0b1 + fallthrough + case vecArrangement4H: + size = 0b01 + case vecArrangement4S: + q = 0b1 + fallthrough + case vecArrangement2S: + size = 0b10 + case vecArrangement2D: + q = 0b1 + fallthrough + case vecArrangement1D: + size = 0b11 + default: + panic("BUG") + } + return +} + +// encodeAdvancedSIMDThreeSame encodes as "Advanced SIMD three same" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func encodeAdvancedSIMDThreeSame(rd, rn, rm, opcode, size, U, Q uint32) uint32 { + return Q<<30 | U<<29 | 0b111<<25 | size<<22 | 0b1<<21 | rm<<16 | opcode<<11 | 0b1<<10 | rn<<5 | rd +} + +// encodeAdvancedSIMDThreeDifferent encodes as "Advanced SIMD three different" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func encodeAdvancedSIMDThreeDifferent(rd, rn, rm, opcode, size, U, Q uint32) uint32 { + return Q<<30 | U<<29 | 0b111<<25 | size<<22 | 0b1<<21 | rm<<16 | opcode<<12 | rn<<5 | rd +} 
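+
+// As a quick sanity check of the layout above: ADD V0.16B, V1.16B, V2.16B corresponds to
+//
+//	encodeAdvancedSIMDThreeSame(0 /*rd*/, 1 /*rn*/, 2 /*rm*/, 0b10000, 0b00, 0 /*U*/, 1 /*Q*/)
+//
+// which evaluates to 0x4E228420: Q=1 selects the 128-bit form, U=0 and opcode=0b10000 select integer ADD.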
+ +// encodeFloatDataOneSource encodes as "Floating-point data-processing (1 source)" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp +func encodeFloatDataOneSource(op fpuUniOp, rd, rn uint32, dst64bit bool) uint32 { + var opcode, ptype uint32 + switch op { + case fpuUniOpCvt32To64: + opcode = 0b000101 + case fpuUniOpCvt64To32: + opcode = 0b000100 + ptype = 0b01 + case fpuUniOpNeg: + opcode = 0b000010 + if dst64bit { + ptype = 0b01 + } + case fpuUniOpSqrt: + opcode = 0b000011 + if dst64bit { + ptype = 0b01 + } + case fpuUniOpRoundPlus: + opcode = 0b001001 + if dst64bit { + ptype = 0b01 + } + case fpuUniOpRoundMinus: + opcode = 0b001010 + if dst64bit { + ptype = 0b01 + } + case fpuUniOpRoundZero: + opcode = 0b001011 + if dst64bit { + ptype = 0b01 + } + case fpuUniOpRoundNearest: + opcode = 0b001000 + if dst64bit { + ptype = 0b01 + } + case fpuUniOpAbs: + opcode = 0b000001 + if dst64bit { + ptype = 0b01 + } + default: + panic("BUG") + } + return 0b1111<<25 | ptype<<22 | 0b1<<21 | opcode<<15 | 0b1<<14 | rn<<5 | rd +} + +// encodeCnvBetweenFloatInt encodes as "Conversion between floating-point and integer" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func encodeCnvBetweenFloatInt(i *instruction) uint32 { + rd := regNumberInEncoding[i.rd.realReg()] + rn := regNumberInEncoding[i.rn.realReg()] + + var opcode uint32 + var rmode uint32 + var ptype uint32 + var sf uint32 + switch i.kind { + case intToFpu: // Either UCVTF or SCVTF. + rmode = 0b00 + + signed := i.u1 == 1 + src64bit := i.u2 == 1 + dst64bit := i.u3 == 1 + if signed { + opcode = 0b010 + } else { + opcode = 0b011 + } + if src64bit { + sf = 0b1 + } + if dst64bit { + ptype = 0b01 + } else { + ptype = 0b00 + } + case fpuToInt: // Either FCVTZU or FCVTZS. + rmode = 0b11 + + signed := i.u1 == 1 + src64bit := i.u2 == 1 + dst64bit := i.u3 == 1 + + if signed { + opcode = 0b000 + } else { + opcode = 0b001 + } + if dst64bit { + sf = 0b1 + } + if src64bit { + ptype = 0b01 + } else { + ptype = 0b00 + } + } + return sf<<31 | 0b1111<<25 | ptype<<22 | 0b1<<21 | rmode<<19 | opcode<<16 | rn<<5 | rd +} + +// encodeAdr encodes a PC-relative ADR instruction. +// https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/ADR--Form-PC-relative-address- +func encodeAdr(rd uint32, offset uint32) uint32 { + if offset >= 1<<20 { + panic("BUG: too large adr instruction") + } + return offset&0b11<<29 | 0b1<<28 | offset&0b1111111111_1111111100<<3 | rd +} + +// encodeFpuCSel encodes as "Floating-point conditional select" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func encodeFpuCSel(rd, rn, rm uint32, c condFlag, _64bit bool) uint32 { + var ftype uint32 + if _64bit { + ftype = 0b01 // double precision. 
+ } + return 0b1111<<25 | ftype<<22 | 0b1<<21 | rm<<16 | uint32(c)<<12 | 0b11<<10 | rn<<5 | rd +} + +// encodeMoveToVec encodes as "Move general-purpose register to a vector element" (represented as `ins`) in +// https://developer.arm.com/documentation/dui0801/g/A64-SIMD-Vector-Instructions/MOV--vector--from-general- +// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/MOV--from-general---Move-general-purpose-register-to-a-vector-element--an-alias-of-INS--general--?lang=en +func encodeMoveToVec(rd, rn uint32, arr vecArrangement, index vecIndex) uint32 { + var imm5 uint32 + switch arr { + case vecArrangementB: + imm5 |= 0b1 + imm5 |= uint32(index) << 1 + if index > 0b1111 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 15", index)) + } + case vecArrangementH: + imm5 |= 0b10 + imm5 |= uint32(index) << 2 + if index > 0b111 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 7", index)) + } + case vecArrangementS: + imm5 |= 0b100 + imm5 |= uint32(index) << 3 + if index > 0b11 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 3", index)) + } + case vecArrangementD: + imm5 |= 0b1000 + imm5 |= uint32(index) << 4 + if index > 0b1 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 1", index)) + } + default: + panic("Unsupported arrangement " + arr.String()) + } + + return 0b01001110000<<21 | imm5<<16 | 0b000111<<10 | rn<<5 | rd +} + +// encodeMoveToVec encodes as "Move vector element to another vector element, mov (element)" (represented as `ins`) in +// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/MOV--element---Move-vector-element-to-another-vector-element--an-alias-of-INS--element--?lang=en +// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/INS--element---Insert-vector-element-from-another-vector-element-?lang=en +func encodeVecMovElement(rd, rn uint32, arr vecArrangement, srcIndex, dstIndex uint32) uint32 { + var imm4, imm5 uint32 + switch arr { + case vecArrangementB: + imm5 |= 0b1 + imm5 |= srcIndex << 1 + imm4 = dstIndex + if srcIndex > 0b1111 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 15", srcIndex)) + } + case vecArrangementH: + imm5 |= 0b10 + imm5 |= srcIndex << 2 + imm4 = dstIndex << 1 + if srcIndex > 0b111 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 7", srcIndex)) + } + case vecArrangementS: + imm5 |= 0b100 + imm5 |= srcIndex << 3 + imm4 = dstIndex << 2 + if srcIndex > 0b11 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 3", srcIndex)) + } + case vecArrangementD: + imm5 |= 0b1000 + imm5 |= srcIndex << 4 + imm4 = dstIndex << 3 + if srcIndex > 0b1 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 1", srcIndex)) + } + default: + panic("Unsupported arrangement " + arr.String()) + } + + return 0b01101110000<<21 | imm5<<16 | imm4<<11 | 0b1<<10 | rn<<5 | rd +} + +// encodeUnconditionalBranchReg encodes as "Unconditional branch (register)" in: +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en +func encodeUnconditionalBranchReg(rn uint32, link bool) uint32 { + var opc uint32 + if link { + opc = 0b0001 + } + return 0b1101011<<25 | opc<<21 | 0b11111<<16 | rn<<5 +} + +// encodeMoveFromVec encodes as "Move vector element to a general-purpose register" +// (represented as `umov` when dest is 32-bit, `umov` 
otherwise) in +// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/UMOV--Unsigned-Move-vector-element-to-general-purpose-register-?lang=en +// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/MOV--to-general---Move-vector-element-to-general-purpose-register--an-alias-of-UMOV-?lang=en +func encodeMoveFromVec(rd, rn uint32, arr vecArrangement, index vecIndex, signed bool) uint32 { + var op, imm4, q, imm5 uint32 + switch { + case arr == vecArrangementB: + imm5 |= 0b1 + imm5 |= uint32(index) << 1 + if index > 0b1111 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 15", index)) + } + case arr == vecArrangementH: + imm5 |= 0b10 + imm5 |= uint32(index) << 2 + if index > 0b111 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 7", index)) + } + case arr == vecArrangementS && signed: + q = 0b1 + fallthrough + case arr == vecArrangementS: + imm5 |= 0b100 + imm5 |= uint32(index) << 3 + if index > 0b11 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 3", index)) + } + case arr == vecArrangementD && !signed: + imm5 |= 0b1000 + imm5 |= uint32(index) << 4 + q = 0b1 + if index > 0b1 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 1", index)) + } + default: + panic("Unsupported arrangement " + arr.String()) + } + if signed { + op, imm4 = 0, 0b0101 + } else { + op, imm4 = 0, 0b0111 + } + return op<<29 | 0b01110000<<21 | q<<30 | imm5<<16 | imm4<<11 | 1<<10 | rn<<5 | rd +} + +// encodeVecDup encodes as "Duplicate general-purpose register to vector" DUP (general) +// (represented as `dup`) +// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/DUP--general---Duplicate-general-purpose-register-to-vector-?lang=en +func encodeVecDup(rd, rn uint32, arr vecArrangement) uint32 { + var q, imm5 uint32 + switch arr { + case vecArrangement8B: + q, imm5 = 0b0, 0b1 + case vecArrangement16B: + q, imm5 = 0b1, 0b1 + case vecArrangement4H: + q, imm5 = 0b0, 0b10 + case vecArrangement8H: + q, imm5 = 0b1, 0b10 + case vecArrangement2S: + q, imm5 = 0b0, 0b100 + case vecArrangement4S: + q, imm5 = 0b1, 0b100 + case vecArrangement2D: + q, imm5 = 0b1, 0b1000 + default: + panic("Unsupported arrangement " + arr.String()) + } + return q<<30 | 0b001110000<<21 | imm5<<16 | 0b000011<<10 | rn<<5 | rd +} + +// encodeVecDup encodes as "Duplicate vector element to vector or scalar" DUP (element). +// (represented as `dup`) +// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/DUP--element---Duplicate-vector-element-to-vector-or-scalar- +func encodeVecDupElement(rd, rn uint32, arr vecArrangement, srcIndex vecIndex) uint32 { + var q, imm5 uint32 + q = 0b1 + switch arr { + case vecArrangementB: + imm5 |= 0b1 + imm5 |= uint32(srcIndex) << 1 + case vecArrangementH: + imm5 |= 0b10 + imm5 |= uint32(srcIndex) << 2 + case vecArrangementS: + imm5 |= 0b100 + imm5 |= uint32(srcIndex) << 3 + case vecArrangementD: + imm5 |= 0b1000 + imm5 |= uint32(srcIndex) << 4 + default: + panic("unsupported arrangement" + arr.String()) + } + + return q<<30 | 0b001110000<<21 | imm5<<16 | 0b1<<10 | rn<<5 | rd +} + +// encodeVecExtract encodes as "Advanced SIMD extract." +// Currently only `ext` is defined. 
+// https://developer.arm.com/documentation/ddi0602/2023-06/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp +// https://developer.arm.com/documentation/ddi0602/2023-06/SIMD-FP-Instructions/EXT--Extract-vector-from-pair-of-vectors-?lang=en +func encodeVecExtract(rd, rn, rm uint32, arr vecArrangement, index uint32) uint32 { + var q, imm4 uint32 + switch arr { + case vecArrangement8B: + q, imm4 = 0, 0b0111&uint32(index) + case vecArrangement16B: + q, imm4 = 1, 0b1111&uint32(index) + default: + panic("Unsupported arrangement " + arr.String()) + } + return q<<30 | 0b101110000<<21 | rm<<16 | imm4<<11 | rn<<5 | rd +} + +// encodeVecPermute encodes as "Advanced SIMD permute." +// https://developer.arm.com/documentation/ddi0602/2023-06/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp +func encodeVecPermute(op vecOp, rd, rn, rm uint32, arr vecArrangement) uint32 { + var q, size, opcode uint32 + switch op { + case vecOpZip1: + opcode = 0b011 + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q = arrToSizeQEncoded(arr) + default: + panic("TODO: " + op.String()) + } + return q<<30 | 0b001110<<24 | size<<22 | rm<<16 | opcode<<12 | 0b10<<10 | rn<<5 | rd +} + +// encodeConditionalSelect encodes as "Conditional select" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en#condsel +func encodeConditionalSelect(kind instructionKind, rd, rn, rm uint32, c condFlag, _64bit bool) uint32 { + if kind != cSel { + panic("TODO: support other conditional select") + } + + ret := 0b110101<<23 | rm<<16 | uint32(c)<<12 | rn<<5 | rd + if _64bit { + ret |= 0b1 << 31 + } + return ret +} + +const dummyInstruction uint32 = 0x14000000 // "b 0" + +// encodeLoadFpuConst32 encodes the following three instructions: +// +// ldr s8, #8 ;; literal load of data.f32 +// b 8 ;; skip the data +// data.f32 xxxxxxx +func encodeLoadFpuConst32(c backend.Compiler, rd uint32, rawF32 uint64) { + c.Emit4Bytes( + // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/LDR--literal--SIMD-FP---Load-SIMD-FP-Register--PC-relative-literal--?lang=en + 0b111<<26 | (0x8/4)<<5 | rd, + ) + c.Emit4Bytes(encodeUnconditionalBranch(false, 8)) // b 8 + if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable { + // Inlined data.f32 cannot be disassembled, so we add a dummy instruction here. + c.Emit4Bytes(dummyInstruction) + } else { + c.Emit4Bytes(uint32(rawF32)) // data.f32 xxxxxxx + } +} + +// encodeLoadFpuConst64 encodes the following three instructions: +// +// ldr d8, #8 ;; literal load of data.f64 +// b 12 ;; skip the data +// data.f64 xxxxxxx +func encodeLoadFpuConst64(c backend.Compiler, rd uint32, rawF64 uint64) { + c.Emit4Bytes( + // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/LDR--literal--SIMD-FP---Load-SIMD-FP-Register--PC-relative-literal--?lang=en + 0b1<<30 | 0b111<<26 | (0x8/4)<<5 | rd, + ) + c.Emit4Bytes(encodeUnconditionalBranch(false, 12)) // b 12 + if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable { + // Inlined data.f64 cannot be disassembled, so we add dummy instructions here. 
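+ // The two dummy words below stand in for the 8-byte f64 payload, keeping the sequence at its
+ // fixed 16-byte length so the ldr/b offsets above stay valid.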
+ c.Emit4Bytes(dummyInstruction) + c.Emit4Bytes(dummyInstruction) + } else { + // data.f64 xxxxxxx + c.Emit4Bytes(uint32(rawF64)) + c.Emit4Bytes(uint32(rawF64 >> 32)) + } +} + +// encodeLoadFpuConst128 encodes the following three instructions: +// +// ldr v8, #8 ;; literal load of data.f64 +// b 20 ;; skip the data +// data.v128 xxxxxxx +func encodeLoadFpuConst128(c backend.Compiler, rd uint32, lo, hi uint64) { + c.Emit4Bytes( + // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/LDR--literal--SIMD-FP---Load-SIMD-FP-Register--PC-relative-literal--?lang=en + 0b1<<31 | 0b111<<26 | (0x8/4)<<5 | rd, + ) + c.Emit4Bytes(encodeUnconditionalBranch(false, 20)) // b 20 + if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable { + // Inlined data.v128 cannot be disassembled, so we add dummy instructions here. + c.Emit4Bytes(dummyInstruction) + c.Emit4Bytes(dummyInstruction) + c.Emit4Bytes(dummyInstruction) + c.Emit4Bytes(dummyInstruction) + } else { + // data.v128 xxxxxxx + c.Emit4Bytes(uint32(lo)) + c.Emit4Bytes(uint32(lo >> 32)) + c.Emit4Bytes(uint32(hi)) + c.Emit4Bytes(uint32(hi >> 32)) + } +} + +// encodeAluRRRR encodes as Data-processing (3 source) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en +func encodeAluRRRR(op aluOp, rd, rn, rm, ra, _64bit uint32) uint32 { + var oO, op31 uint32 + switch op { + case aluOpMAdd: + op31, oO = 0b000, 0b0 + case aluOpMSub: + op31, oO = 0b000, 0b1 + default: + panic("TODO/BUG") + } + return _64bit<<31 | 0b11011<<24 | op31<<21 | rm<<16 | oO<<15 | ra<<10 | rn<<5 | rd +} + +// encodeBitRR encodes as Data-processing (1 source) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en +func encodeBitRR(op bitOp, rd, rn, _64bit uint32) uint32 { + var opcode2, opcode uint32 + switch op { + case bitOpRbit: + opcode2, opcode = 0b00000, 0b000000 + case bitOpClz: + opcode2, opcode = 0b00000, 0b000100 + default: + panic("TODO/BUG") + } + return _64bit<<31 | 0b1_0_11010110<<21 | opcode2<<15 | opcode<<10 | rn<<5 | rd +} + +func encodeAsMov32(rn, rd uint32) uint32 { + // This is an alias of ORR (shifted register): + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/MOV--register---Move--register---an-alias-of-ORR--shifted-register-- + return encodeLogicalShiftedRegister(0b001, 0, rn, 0, regNumberInEncoding[xzr], rd) +} + +// encodeExtend encodes extension instructions. 
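+// For example, a signed byte-to-64-bit extension (SXTB Xd, Wn) is the alias SBFM Xd, Xn, #0, #7,
+// which is exactly the bit pattern produced by the signed/from==8/to==64 case below.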
+func encodeExtend(signed bool, from, to byte, rd, rn uint32) uint32 { + // UTXB: https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/UXTB--Unsigned-Extend-Byte--an-alias-of-UBFM-?lang=en + // UTXH: https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/UXTH--Unsigned-Extend-Halfword--an-alias-of-UBFM-?lang=en + // STXB: https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/SXTB--Signed-Extend-Byte--an-alias-of-SBFM- + // STXH: https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/SXTH--Sign-Extend-Halfword--an-alias-of-SBFM- + // STXW: https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/SXTW--Sign-Extend-Word--an-alias-of-SBFM- + var _31to10 uint32 + switch { + case !signed && from == 8 && to == 32: + // 32-bit UXTB + _31to10 = 0b0101001100000000000111 + case !signed && from == 16 && to == 32: + // 32-bit UXTH + _31to10 = 0b0101001100000000001111 + case !signed && from == 8 && to == 64: + // 64-bit UXTB + _31to10 = 0b0101001100000000000111 + case !signed && from == 16 && to == 64: + // 64-bit UXTH + _31to10 = 0b0101001100000000001111 + case !signed && from == 32 && to == 64: + return encodeAsMov32(rn, rd) + case signed && from == 8 && to == 32: + // 32-bit SXTB + _31to10 = 0b0001001100000000000111 + case signed && from == 16 && to == 32: + // 32-bit SXTH + _31to10 = 0b0001001100000000001111 + case signed && from == 8 && to == 64: + // 64-bit SXTB + _31to10 = 0b1001001101000000000111 + case signed && from == 16 && to == 64: + // 64-bit SXTH + _31to10 = 0b1001001101000000001111 + case signed && from == 32 && to == 64: + // SXTW + _31to10 = 0b1001001101000000011111 + default: + panic("BUG") + } + return _31to10<<10 | rn<<5 | rd +} + +func encodeLoadOrStore(kind instructionKind, rt uint32, amode addressMode) uint32 { + var _22to31 uint32 + var bits int64 + switch kind { + case uLoad8: + _22to31 = 0b0011100001 + bits = 8 + case sLoad8: + _22to31 = 0b0011100010 + bits = 8 + case uLoad16: + _22to31 = 0b0111100001 + bits = 16 + case sLoad16: + _22to31 = 0b0111100010 + bits = 16 + case uLoad32: + _22to31 = 0b1011100001 + bits = 32 + case sLoad32: + _22to31 = 0b1011100010 + bits = 32 + case uLoad64: + _22to31 = 0b1111100001 + bits = 64 + case fpuLoad32: + _22to31 = 0b1011110001 + bits = 32 + case fpuLoad64: + _22to31 = 0b1111110001 + bits = 64 + case fpuLoad128: + _22to31 = 0b0011110011 + bits = 128 + case store8: + _22to31 = 0b0011100000 + bits = 8 + case store16: + _22to31 = 0b0111100000 + bits = 16 + case store32: + _22to31 = 0b1011100000 + bits = 32 + case store64: + _22to31 = 0b1111100000 + bits = 64 + case fpuStore32: + _22to31 = 0b1011110000 + bits = 32 + case fpuStore64: + _22to31 = 0b1111110000 + bits = 64 + case fpuStore128: + _22to31 = 0b0011110010 + bits = 128 + default: + panic("BUG") + } + + switch amode.kind { + case addressModeKindRegScaledExtended: + return encodeLoadOrStoreExtended(_22to31, + regNumberInEncoding[amode.rn.RealReg()], + regNumberInEncoding[amode.rm.RealReg()], + rt, true, amode.extOp) + case addressModeKindRegScaled: + return encodeLoadOrStoreExtended(_22to31, + regNumberInEncoding[amode.rn.RealReg()], regNumberInEncoding[amode.rm.RealReg()], + rt, true, extendOpNone) + case addressModeKindRegExtended: + return encodeLoadOrStoreExtended(_22to31, + regNumberInEncoding[amode.rn.RealReg()], regNumberInEncoding[amode.rm.RealReg()], + rt, false, amode.extOp) + case addressModeKindRegReg: + return encodeLoadOrStoreExtended(_22to31, + 
regNumberInEncoding[amode.rn.RealReg()], regNumberInEncoding[amode.rm.RealReg()], + rt, false, extendOpNone) + case addressModeKindRegSignedImm9: + // e.g. https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDUR--Load-Register--unscaled-- + return encodeLoadOrStoreSIMM9(_22to31, 0b00 /* unscaled */, regNumberInEncoding[amode.rn.RealReg()], rt, amode.imm) + case addressModeKindPostIndex: + return encodeLoadOrStoreSIMM9(_22to31, 0b01 /* post index */, regNumberInEncoding[amode.rn.RealReg()], rt, amode.imm) + case addressModeKindPreIndex: + return encodeLoadOrStoreSIMM9(_22to31, 0b11 /* pre index */, regNumberInEncoding[amode.rn.RealReg()], rt, amode.imm) + case addressModeKindRegUnsignedImm12: + // "unsigned immediate" in https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Loads-and-Stores?lang=en + rn := regNumberInEncoding[amode.rn.RealReg()] + imm := amode.imm + div := bits / 8 + if imm != 0 && !offsetFitsInAddressModeKindRegUnsignedImm12(byte(bits), imm) { + panic("BUG") + } + imm /= div + return _22to31<<22 | 0b1<<24 | uint32(imm&0b111111111111)<<10 | rn<<5 | rt + default: + panic("BUG") + } +} + +// encodeVecLoad1R encodes as Load one single-element structure and Replicate to all lanes (of one register) in +// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/LD1R--Load-one-single-element-structure-and-Replicate-to-all-lanes--of-one-register--?lang=en#sa_imm +func encodeVecLoad1R(rt, rn uint32, arr vecArrangement) uint32 { + size, q := arrToSizeQEncoded(arr) + return q<<30 | 0b001101010000001100<<12 | size<<10 | rn<<5 | rt +} + +// encodeAluBitmaskImmediate encodes as Logical (immediate) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en +func encodeAluBitmaskImmediate(op aluOp, rd, rn uint32, imm uint64, _64bit bool) uint32 { + var _31to23 uint32 + switch op { + case aluOpAnd: + _31to23 = 0b00_100100 + case aluOpOrr: + _31to23 = 0b01_100100 + case aluOpEor: + _31to23 = 0b10_100100 + case aluOpAnds: + _31to23 = 0b11_100100 + default: + panic("BUG") + } + if _64bit { + _31to23 |= 0b1 << 8 + } + immr, imms, N := bitmaskImmediate(imm, _64bit) + return _31to23<<23 | uint32(N)<<22 | uint32(immr)<<16 | uint32(imms)<<10 | rn<<5 | rd +} + +func bitmaskImmediate(c uint64, is64bit bool) (immr, imms, N byte) { + var size uint32 + switch { + case c != c>>32|c<<32: + size = 64 + case c != c>>16|c<<48: + size = 32 + c = uint64(int32(c)) + case c != c>>8|c<<56: + size = 16 + c = uint64(int16(c)) + case c != c>>4|c<<60: + size = 8 + c = uint64(int8(c)) + case c != c>>2|c<<62: + size = 4 + c = uint64(int64(c<<60) >> 60) + default: + size = 2 + c = uint64(int64(c<<62) >> 62) + } + + neg := false + if int64(c) < 0 { + c = ^c + neg = true + } + + onesSize, nonZeroPos := getOnesSequenceSize(c) + if neg { + nonZeroPos = onesSize + nonZeroPos + onesSize = size - onesSize + } + + var mode byte = 32 + if is64bit && size == 64 { + N, mode = 0b1, 64 + } + + immr = byte((size - nonZeroPos) & (size - 1) & uint32(mode-1)) + imms = byte((onesSize - 1) | 63&^(size<<1-1)) + return +} + +func getOnesSequenceSize(x uint64) (size, nonZeroPos uint32) { + // Take 0b00111000 for example: + y := getLowestBit(x) // = 0b0000100 + nonZeroPos = setBitPos(y) // = 2 + size = setBitPos(x+y) - nonZeroPos // = setBitPos(0b0100000) - 2 = 5 - 2 = 3 + return +} + +func setBitPos(x uint64) (ret uint32) { + for ; ; ret++ { + if x == 0b1 { + break + } + x = x >> 1 + } + return +} + +// 
encodeLoadOrStoreExtended encodes store/load instruction as "extended register offset" in Load/store register (register offset): +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Loads-and-Stores?lang=en +func encodeLoadOrStoreExtended(_22to32 uint32, rn, rm, rt uint32, scaled bool, extOp extendOp) uint32 { + var option uint32 + switch extOp { + case extendOpUXTW: + option = 0b010 + case extendOpSXTW: + option = 0b110 + case extendOpNone: + option = 0b111 + default: + panic("BUG") + } + var s uint32 + if scaled { + s = 0b1 + } + return _22to32<<22 | 0b1<<21 | rm<<16 | option<<13 | s<<12 | 0b10<<10 | rn<<5 | rt +} + +// encodeLoadOrStoreSIMM9 encodes store/load instruction as one of post-index, pre-index or unscaled immediate as in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Loads-and-Stores?lang=en +func encodeLoadOrStoreSIMM9(_22to32, _1011 uint32, rn, rt uint32, imm9 int64) uint32 { + return _22to32<<22 | (uint32(imm9)&0b111111111)<<12 | _1011<<10 | rn<<5 | rt +} + +// encodeFpuRRR encodes as single or double precision (depending on `_64bit`) of Floating-point data-processing (2 source) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func encodeFpuRRR(op fpuBinOp, rd, rn, rm uint32, _64bit bool) (ret uint32) { + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADD--vector--Add-vectors--scalar--floating-point-and-integer- + var opcode uint32 + switch op { + case fpuBinOpAdd: + opcode = 0b0010 + case fpuBinOpSub: + opcode = 0b0011 + case fpuBinOpMul: + opcode = 0b0000 + case fpuBinOpDiv: + opcode = 0b0001 + case fpuBinOpMax: + opcode = 0b0100 + case fpuBinOpMin: + opcode = 0b0101 + default: + panic("BUG") + } + var ptype uint32 + if _64bit { + ptype = 0b01 + } + return 0b1111<<25 | ptype<<22 | 0b1<<21 | rm<<16 | opcode<<12 | 0b1<<11 | rn<<5 | rd +} + +// encodeAluRRImm12 encodes as Add/subtract (immediate) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en +func encodeAluRRImm12(op aluOp, rd, rn uint32, imm12 uint16, shiftBit byte, _64bit bool) uint32 { + var _31to24 uint32 + switch op { + case aluOpAdd: + _31to24 = 0b00_10001 + case aluOpAddS: + _31to24 = 0b01_10001 + case aluOpSub: + _31to24 = 0b10_10001 + case aluOpSubS: + _31to24 = 0b11_10001 + default: + panic("BUG") + } + if _64bit { + _31to24 |= 0b1 << 7 + } + return _31to24<<24 | uint32(shiftBit)<<22 | uint32(imm12&0b111111111111)<<10 | rn<<5 | rd +} + +// encodeAluRRR encodes as Data Processing (shifted register), depending on aluOp. +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_shift +func encodeAluRRRShift(op aluOp, rd, rn, rm, amount uint32, shiftOp shiftOp, _64bit bool) uint32 { + var _31to24 uint32 + var opc, n uint32 + switch op { + case aluOpAdd: + _31to24 = 0b00001011 + case aluOpAddS: + _31to24 = 0b00101011 + case aluOpSub: + _31to24 = 0b01001011 + case aluOpSubS: + _31to24 = 0b01101011 + case aluOpAnd, aluOpOrr, aluOpEor, aluOpAnds: + // "Logical (shifted register)". 
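+ // opc selects AND/ORR/EOR/ANDS in the inner switch; n stays 0 since these are the
+ // non-inverted forms (N=1 would give BIC/ORN/EON/BICS).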
+ switch op { + case aluOpAnd: + // all zeros + case aluOpOrr: + opc = 0b01 + case aluOpEor: + opc = 0b10 + case aluOpAnds: + opc = 0b11 + } + _31to24 = 0b000_01010 + default: + panic(op.String()) + } + + if _64bit { + _31to24 |= 0b1 << 7 + } + + var shift uint32 + switch shiftOp { + case shiftOpLSL: + shift = 0b00 + case shiftOpLSR: + shift = 0b01 + case shiftOpASR: + shift = 0b10 + default: + panic(shiftOp.String()) + } + return opc<<29 | n<<21 | _31to24<<24 | shift<<22 | rm<<16 | (amount << 10) | (rn << 5) | rd +} + +// "Add/subtract (extended register)" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_ext +func encodeAluRRRExtend(ao aluOp, rd, rn, rm uint32, extOp extendOp, to byte) uint32 { + var s, op uint32 + switch ao { + case aluOpAdd: + op = 0b0 + case aluOpAddS: + op, s = 0b0, 0b1 + case aluOpSub: + op = 0b1 + case aluOpSubS: + op, s = 0b1, 0b1 + default: + panic("BUG: extended register operand can be used only for add/sub") + } + + var sf uint32 + if to == 64 { + sf = 0b1 + } + + var option uint32 + switch extOp { + case extendOpUXTB: + option = 0b000 + case extendOpUXTH: + option = 0b001 + case extendOpUXTW: + option = 0b010 + case extendOpSXTB: + option = 0b100 + case extendOpSXTH: + option = 0b101 + case extendOpSXTW: + option = 0b110 + case extendOpSXTX, extendOpUXTX: + panic(fmt.Sprintf("%s is essentially noop, and should be handled much earlier than encoding", extOp.String())) + } + return sf<<31 | op<<30 | s<<29 | 0b1011001<<21 | rm<<16 | option<<13 | rn<<5 | rd +} + +// encodeAluRRR encodes as Data Processing (register), depending on aluOp. +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en +func encodeAluRRR(op aluOp, rd, rn, rm uint32, _64bit, isRnSp bool) uint32 { + var _31to21, _15to10 uint32 + switch op { + case aluOpAdd: + if isRnSp { + // "Extended register" with UXTW. + _31to21 = 0b00001011_001 + _15to10 = 0b011000 + } else { + // "Shifted register" with shift = 0 + _31to21 = 0b00001011_000 + } + case aluOpAddS: + if isRnSp { + panic("TODO") + } + // "Shifted register" with shift = 0 + _31to21 = 0b00101011_000 + case aluOpSub: + if isRnSp { + // "Extended register" with UXTW. + _31to21 = 0b01001011_001 + _15to10 = 0b011000 + } else { + // "Shifted register" with shift = 0 + _31to21 = 0b01001011_000 + } + case aluOpSubS: + if isRnSp { + panic("TODO") + } + // "Shifted register" with shift = 0 + _31to21 = 0b01101011_000 + case aluOpAnd, aluOpOrr, aluOpOrn, aluOpEor, aluOpAnds: + // "Logical (shifted register)". + var opc, n uint32 + switch op { + case aluOpAnd: + // all zeros + case aluOpOrr: + opc = 0b01 + case aluOpOrn: + opc = 0b01 + n = 1 + case aluOpEor: + opc = 0b10 + case aluOpAnds: + opc = 0b11 + } + _31to21 = 0b000_01010_000 | opc<<8 | n + case aluOpLsl, aluOpAsr, aluOpLsr, aluOpRotR: + // "Data-processing (2 source)". + _31to21 = 0b00011010_110 + switch op { + case aluOpLsl: + _15to10 = 0b001000 + case aluOpLsr: + _15to10 = 0b001001 + case aluOpAsr: + _15to10 = 0b001010 + case aluOpRotR: + _15to10 = 0b001011 + } + case aluOpSDiv: + // "Data-processing (2 source)". + _31to21 = 0b11010110 + _15to10 = 0b000011 + case aluOpUDiv: + // "Data-processing (2 source)". 
+ _31to21 = 0b11010110 + _15to10 = 0b000010 + default: + panic(op.String()) + } + if _64bit { + _31to21 |= 0b1 << 10 + } + return _31to21<<21 | rm<<16 | (_15to10 << 10) | (rn << 5) | rd +} + +// encodeLogicalShiftedRegister encodes as Logical (shifted register) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en +func encodeLogicalShiftedRegister(sf_opc uint32, shift_N uint32, rm uint32, imm6 uint32, rn, rd uint32) (ret uint32) { + ret = sf_opc << 29 + ret |= 0b01010 << 24 + ret |= shift_N << 21 + ret |= rm << 16 + ret |= imm6 << 10 + ret |= rn << 5 + ret |= rd + return +} + +// encodeAddSubtractImmediate encodes as Add/subtract (immediate) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en +func encodeAddSubtractImmediate(sf_op_s uint32, sh uint32, imm12 uint32, rn, rd uint32) (ret uint32) { + ret = sf_op_s << 29 + ret |= 0b100010 << 23 + ret |= sh << 22 + ret |= imm12 << 10 + ret |= rn << 5 + ret |= rd + return +} + +// encodePreOrPostIndexLoadStorePair64 encodes as Load/store pair (pre/post-indexed) in +// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers- +// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STP--Store-Pair-of-Registers- +func encodePreOrPostIndexLoadStorePair64(pre bool, load bool, rn, rt, rt2 uint32, imm7 int64) (ret uint32) { + if imm7%8 != 0 { + panic("imm7 for pair load/store must be a multiple of 8") + } + imm7 /= 8 + ret = rt + ret |= rn << 5 + ret |= rt2 << 10 + ret |= (uint32(imm7) & 0b1111111) << 15 + if load { + ret |= 0b1 << 22 + } + ret |= 0b101010001 << 23 + if pre { + ret |= 0b1 << 24 + } + return +} + +// encodeUnconditionalBranch encodes as B or BL instructions: +// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/B--Branch- +// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/BL--Branch-with-Link- +func encodeUnconditionalBranch(link bool, imm26 int64) (ret uint32) { + if imm26%4 != 0 { + panic("imm26 for branch must be a multiple of 4") + } + imm26 /= 4 + ret = uint32(imm26 & 0b11_11111111_11111111_11111111) + ret |= 0b101 << 26 + if link { + ret |= 0b1 << 31 + } + return +} + +// encodeCBZCBNZ encodes as either CBZ or CBNZ: +// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CBZ--Compare-and-Branch-on-Zero- +// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CBNZ--Compare-and-Branch-on-Nonzero- +func encodeCBZCBNZ(rt uint32, nz bool, imm19 uint32, _64bit bool) (ret uint32) { + ret = rt + ret |= imm19 << 5 + if nz { + ret |= 1 << 24 + } + ret |= 0b11010 << 25 + if _64bit { + ret |= 1 << 31 + } + return +} + +// encodeMoveWideImmediate encodes as either MOVZ, MOVN or MOVK, as Move wide (immediate) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en +// +// "shift" must have been divided by 16 at this point. 
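+// For example, MOVZ X0, #0x1234, LSL #16 corresponds to opc=0b10 (MOVZ), rd=0, imm=0x1234,
+// shift=1 and _64bit=1, which assembles to 0xD2A24680.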
+func encodeMoveWideImmediate(opc uint32, rd uint32, imm, shift, _64bit uint64) (ret uint32) { + ret = rd + ret |= uint32(imm&0xffff) << 5 + ret |= (uint32(shift)) << 21 + ret |= 0b100101 << 23 + ret |= opc << 29 + ret |= uint32(_64bit) << 31 + return +} + +// encodeAluRRImm encodes as "Bitfield" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en#log_imm +func encodeAluRRImm(op aluOp, rd, rn, amount, _64bit uint32) uint32 { + var opc uint32 + var immr, imms uint32 + switch op { + case aluOpLsl: + // LSL (immediate) is an alias for UBFM. + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/UBFM--Unsigned-Bitfield-Move-?lang=en + opc = 0b10 + if amount == 0 { + // This can be encoded as NOP, but we don't do it for consistency: lsr xn, xm, #0 + immr = 0 + if _64bit == 1 { + imms = 0b111111 + } else { + imms = 0b11111 + } + } else { + if _64bit == 1 { + immr = 64 - amount + } else { + immr = (32 - amount) & 0b11111 + } + imms = immr - 1 + } + case aluOpLsr: + // LSR (immediate) is an alias for UBFM. + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LSR--immediate---Logical-Shift-Right--immediate---an-alias-of-UBFM-?lang=en + opc = 0b10 + imms, immr = 0b011111|_64bit<<5, amount + case aluOpAsr: + // ASR (immediate) is an alias for SBFM. + // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/SBFM--Signed-Bitfield-Move-?lang=en + opc = 0b00 + imms, immr = 0b011111|_64bit<<5, amount + default: + panic(op.String()) + } + return _64bit<<31 | opc<<29 | 0b100110<<23 | _64bit<<22 | immr<<16 | imms<<10 | rn<<5 | rd +} + +// encodeVecLanes encodes as Data Processing (Advanced SIMD across lanes) depending on vecOp in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func encodeVecLanes(op vecOp, rd uint32, rn uint32, arr vecArrangement) uint32 { + var u, q, size, opcode uint32 + switch arr { + case vecArrangement8B: + q, size = 0b0, 0b00 + case vecArrangement16B: + q, size = 0b1, 0b00 + case vecArrangement4H: + q, size = 0, 0b01 + case vecArrangement8H: + q, size = 1, 0b01 + case vecArrangement4S: + q, size = 1, 0b10 + default: + panic("unsupported arrangement: " + arr.String()) + } + switch op { + case vecOpUaddlv: + u, opcode = 1, 0b00011 + case vecOpUminv: + u, opcode = 1, 0b11010 + case vecOpAddv: + u, opcode = 0, 0b11011 + default: + panic("unsupported or illegal vecOp: " + op.String()) + } + return q<<30 | u<<29 | 0b1110<<24 | size<<22 | 0b11000<<17 | opcode<<12 | 0b10<<10 | rn<<5 | rd +} + +// encodeVecLanes encodes as Data Processing (Advanced SIMD scalar shift by immediate) depending on vecOp in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func encodeVecShiftImm(op vecOp, rd uint32, rn, amount uint32, arr vecArrangement) uint32 { + var u, q, immh, immb, opcode uint32 + switch op { + case vecOpSshll: + u, opcode = 0b0, 0b10100 + case vecOpUshll: + u, opcode = 0b1, 0b10100 + case vecOpSshr: + u, opcode = 0, 0b00000 + default: + panic("unsupported or illegal vecOp: " + op.String()) + } + switch arr { + case vecArrangement16B: + q = 0b1 + fallthrough + case vecArrangement8B: + immh = 0b0001 + immb = 8 - uint32(amount&0b111) + case vecArrangement8H: + q = 0b1 + fallthrough + case vecArrangement4H: + v := 16 - uint32(amount&0b1111) + immb = v & 0b111 + immh = 0b0010 
| (v >> 3) + case vecArrangement4S: + q = 0b1 + fallthrough + case vecArrangement2S: + v := 32 - uint32(amount&0b11111) + immb = v & 0b111 + immh = 0b0100 | (v >> 3) + case vecArrangement2D: + q = 0b1 + v := 64 - uint32(amount&0b111111) + immb = v & 0b111 + immh = 0b1000 | (v >> 3) + default: + panic("unsupported arrangement: " + arr.String()) + } + return q<<30 | u<<29 | 0b011110<<23 | immh<<19 | immb<<16 | 0b000001<<10 | opcode<<11 | 0b1<<10 | rn<<5 | rd +} + +// encodeVecTbl encodes as Data Processing (Advanced SIMD table lookup) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp +// +// Note: tblOp may encode tbl1, tbl2... in the future. Currently, it is ignored. +func encodeVecTbl(nregs, rd, rn, rm uint32, arr vecArrangement) uint32 { + var q, op2, len, op uint32 + + switch nregs { + case 1: + // tbl: single-register + len = 0b00 + case 2: + // tbl2: 2-register table + len = 0b01 + default: + panic(fmt.Sprintf("unsupported number or registers %d", nregs)) + } + switch arr { + case vecArrangement8B: + q = 0b0 + case vecArrangement16B: + q = 0b1 + default: + panic("unsupported arrangement: " + arr.String()) + } + + return q<<30 | 0b001110<<24 | op2<<22 | rm<<16 | len<<13 | op<<12 | rn<<5 | rd +} + +// encodeVecMisc encodes as Data Processing (Advanced SIMD two-register miscellaneous) depending on vecOp in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp +func encodeAdvancedSIMDTwoMisc(op vecOp, rd, rn uint32, arr vecArrangement) uint32 { + var q, u, size, opcode uint32 + switch op { + case vecOpCnt: + opcode = 0b00101 + switch arr { + case vecArrangement8B: + q, size = 0b0, 0b00 + case vecArrangement16B: + q, size = 0b1, 0b00 + default: + panic("unsupported arrangement: " + arr.String()) + } + case vecOpCmeq0: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + opcode = 0b01001 + size, q = arrToSizeQEncoded(arr) + case vecOpNot: + u = 1 + opcode = 0b00101 + switch arr { + case vecArrangement8B: + q, size = 0b0, 0b00 + case vecArrangement16B: + q, size = 0b1, 0b00 + default: + panic("unsupported arrangement: " + arr.String()) + } + case vecOpAbs: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + opcode = 0b01011 + u = 0b0 + size, q = arrToSizeQEncoded(arr) + case vecOpNeg: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + opcode = 0b01011 + u = 0b1 + size, q = arrToSizeQEncoded(arr) + case vecOpFabs: + if arr < vecArrangement2S || arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + opcode = 0b01111 + u = 0b0 + size, q = arrToSizeQEncoded(arr) + case vecOpFneg: + if arr < vecArrangement2S || arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + opcode = 0b01111 + u = 0b1 + size, q = arrToSizeQEncoded(arr) + case vecOpFrintm: + u = 0b0 + opcode = 0b11001 + switch arr { + case vecArrangement2S: + q, size = 0b0, 0b00 + case vecArrangement4S: + q, size = 0b1, 0b00 + case vecArrangement2D: + q, size = 0b1, 0b01 + default: + panic("unsupported arrangement: " + arr.String()) + } + case vecOpFrintn: + u = 0b0 + opcode = 0b11000 + switch arr { + case vecArrangement2S: + q, size = 0b0, 0b00 + case vecArrangement4S: + q, size = 0b1, 0b00 + case vecArrangement2D: + q, size = 0b1, 0b01 + default: + panic("unsupported 
arrangement: " + arr.String()) + } + case vecOpFrintp: + u = 0b0 + opcode = 0b11000 + if arr < vecArrangement2S || arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q = arrToSizeQEncoded(arr) + case vecOpFrintz: + u = 0b0 + opcode = 0b11001 + if arr < vecArrangement2S || arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q = arrToSizeQEncoded(arr) + case vecOpFsqrt: + if arr < vecArrangement2S || arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + opcode = 0b11111 + u = 0b1 + size, q = arrToSizeQEncoded(arr) + case vecOpFcvtl: + opcode = 0b10111 + u = 0b0 + switch arr { + case vecArrangement2S: + size, q = 0b01, 0b0 + case vecArrangement4H: + size, q = 0b00, 0b0 + default: + panic("unsupported arrangement: " + arr.String()) + } + case vecOpFcvtn: + opcode = 0b10110 + u = 0b0 + switch arr { + case vecArrangement2S: + size, q = 0b01, 0b0 + case vecArrangement4H: + size, q = 0b00, 0b0 + default: + panic("unsupported arrangement: " + arr.String()) + } + case vecOpFcvtzs: + opcode = 0b11011 + u = 0b0 + switch arr { + case vecArrangement2S: + q, size = 0b0, 0b10 + case vecArrangement4S: + q, size = 0b1, 0b10 + case vecArrangement2D: + q, size = 0b1, 0b11 + default: + panic("unsupported arrangement: " + arr.String()) + } + case vecOpFcvtzu: + opcode = 0b11011 + u = 0b1 + switch arr { + case vecArrangement2S: + q, size = 0b0, 0b10 + case vecArrangement4S: + q, size = 0b1, 0b10 + case vecArrangement2D: + q, size = 0b1, 0b11 + default: + panic("unsupported arrangement: " + arr.String()) + } + case vecOpScvtf: + opcode = 0b11101 + u = 0b0 + switch arr { + case vecArrangement4S: + q, size = 0b1, 0b00 + case vecArrangement2S: + q, size = 0b0, 0b00 + case vecArrangement2D: + q, size = 0b1, 0b01 + default: + panic("unsupported arrangement: " + arr.String()) + } + case vecOpUcvtf: + opcode = 0b11101 + u = 0b1 + switch arr { + case vecArrangement4S: + q, size = 0b1, 0b00 + case vecArrangement2S: + q, size = 0b0, 0b00 + case vecArrangement2D: + q, size = 0b1, 0b01 + default: + panic("unsupported arrangement: " + arr.String()) + } + case vecOpSqxtn: + // When q == 1 it encodes sqxtn2 (operates on upper 64 bits). + opcode = 0b10100 + u = 0b0 + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q = arrToSizeQEncoded(arr) + case vecOpUqxtn: + // When q == 1 it encodes uqxtn2 (operates on upper 64 bits). + opcode = 0b10100 + u = 0b1 + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q = arrToSizeQEncoded(arr) + case vecOpSqxtun: + // When q == 1 it encodes sqxtun2 (operates on upper 64 bits). 
+ opcode = 0b10010 // 0b10100 + u = 0b1 + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q = arrToSizeQEncoded(arr) + case vecOpRev64: + opcode = 0b00000 + size, q = arrToSizeQEncoded(arr) + case vecOpXtn: + u = 0b0 + opcode = 0b10010 + size, q = arrToSizeQEncoded(arr) + case vecOpShll: + u = 0b1 + opcode = 0b10011 + switch arr { + case vecArrangement8B: + q, size = 0b0, 0b00 + case vecArrangement4H: + q, size = 0b0, 0b01 + case vecArrangement2S: + q, size = 0b0, 0b10 + default: + panic("unsupported arrangement: " + arr.String()) + } + default: + panic("unsupported or illegal vecOp: " + op.String()) + } + return q<<30 | u<<29 | 0b01110<<24 | size<<22 | 0b10000<<17 | opcode<<12 | 0b10<<10 | rn<<5 | rd +} + +// brTableSequenceOffsetTableBegin is the offset inside the brTableSequence where the table begins after 4 instructions +const brTableSequenceOffsetTableBegin = 16 + +func encodeBrTableSequence(c backend.Compiler, index regalloc.VReg, targets []uint32) { + tmpRegNumber := regNumberInEncoding[tmp] + indexNumber := regNumberInEncoding[index.RealReg()] + + // adr tmpReg, PC+16 (PC+16 is the address of the first label offset) + // ldrsw index, [tmpReg, index, UXTW 2] ;; index = int64(*(tmpReg + index*8)) + // add tmpReg, tmpReg, index + // br tmpReg + // [offset_to_l1, offset_to_l2, ..., offset_to_lN] + c.Emit4Bytes(encodeAdr(tmpRegNumber, 16)) + c.Emit4Bytes(encodeLoadOrStore(sLoad32, indexNumber, + addressMode{kind: addressModeKindRegScaledExtended, rn: tmpRegVReg, rm: index, extOp: extendOpUXTW}, + )) + c.Emit4Bytes(encodeAluRRR(aluOpAdd, tmpRegNumber, tmpRegNumber, indexNumber, true, false)) + c.Emit4Bytes(encodeUnconditionalBranchReg(tmpRegNumber, false)) + + // Offsets are resolved in ResolveRelativeAddress phase. + for _, offset := range targets { + if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable { + // Inlined offset tables cannot be disassembled properly, so pad dummy instructions to make the debugging easier. + c.Emit4Bytes(dummyInstruction) + } else { + c.Emit4Bytes(offset) + } + } +} + +// encodeExitSequence matches the implementation detail of functionABI.emitGoEntryPreamble. +func encodeExitSequence(c backend.Compiler, ctxReg regalloc.VReg) { + // Restore the FP, SP and LR, and return to the Go code: + // ldr lr, [ctxReg, #GoReturnAddress] + // ldr fp, [ctxReg, #OriginalFramePointer] + // ldr tmp, [ctxReg, #OriginalStackPointer] + // mov sp, tmp ;; sp cannot be str'ed directly. + // ret ;; --> return to the Go code + + var ctxEvicted bool + if ctx := ctxReg.RealReg(); ctx == fp || ctx == lr { + // In order to avoid overwriting the context register, we move ctxReg to tmp. 
+ c.Emit4Bytes(encodeMov64(regNumberInEncoding[tmp], regNumberInEncoding[ctx], false, false)) + ctxReg = tmpRegVReg + ctxEvicted = true + } + + restoreLr := encodeLoadOrStore( + uLoad64, + regNumberInEncoding[lr], + addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: ctxReg, + imm: wazevoapi.ExecutionContextOffsetGoReturnAddress.I64(), + }, + ) + + restoreFp := encodeLoadOrStore( + uLoad64, + regNumberInEncoding[fp], + addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: ctxReg, + imm: wazevoapi.ExecutionContextOffsetOriginalFramePointer.I64(), + }, + ) + + restoreSpToTmp := encodeLoadOrStore( + uLoad64, + regNumberInEncoding[tmp], + addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: ctxReg, + imm: wazevoapi.ExecutionContextOffsetOriginalStackPointer.I64(), + }, + ) + + movTmpToSp := encodeAddSubtractImmediate(0b100, 0, 0, + regNumberInEncoding[tmp], regNumberInEncoding[sp]) + + c.Emit4Bytes(restoreFp) + c.Emit4Bytes(restoreLr) + c.Emit4Bytes(restoreSpToTmp) + c.Emit4Bytes(movTmpToSp) + c.Emit4Bytes(encodeRet()) + if !ctxEvicted { + // In order to have the fixed-length exit sequence, we need to padd the binary. + // Since this will never be reached, we insert a dummy instruction. + c.Emit4Bytes(dummyInstruction) + } +} + +func encodeRet() uint32 { + // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/RET--Return-from-subroutine-?lang=en + return 0b1101011001011111<<16 | regNumberInEncoding[lr]<<5 +} + +func encodeAtomicRmw(op atomicRmwOp, rs, rt, rn uint32, size uint32) uint32 { + var _31to21, _15to10, sz uint32 + + switch size { + case 8: + sz = 0b11 + case 4: + sz = 0b10 + case 2: + sz = 0b01 + case 1: + sz = 0b00 + } + + _31to21 = 0b00111000_111 | sz<<9 + + switch op { + case atomicRmwOpAdd: + _15to10 = 0b000000 + case atomicRmwOpClr: + _15to10 = 0b000100 + case atomicRmwOpSet: + _15to10 = 0b001100 + case atomicRmwOpEor: + _15to10 = 0b001000 + case atomicRmwOpSwp: + _15to10 = 0b100000 + } + + return _31to21<<21 | rs<<16 | _15to10<<10 | rn<<5 | rt +} + +func encodeAtomicCas(rs, rt, rn uint32, size uint32) uint32 { + var _31to21, _15to10, sz uint32 + + switch size { + case 8: + sz = 0b11 + case 4: + sz = 0b10 + case 2: + sz = 0b01 + case 1: + sz = 0b00 + } + + _31to21 = 0b00001000_111 | sz<<9 + _15to10 = 0b111111 + + return _31to21<<21 | rs<<16 | _15to10<<10 | rn<<5 | rt +} + +func encodeAtomicLoadStore(rn, rt, size, l uint32) uint32 { + var _31to21, _20to16, _15to10, sz uint32 + + switch size { + case 8: + sz = 0b11 + case 4: + sz = 0b10 + case 2: + sz = 0b01 + case 1: + sz = 0b00 + } + + _31to21 = 0b00001000_100 | sz<<9 | l<<1 + _20to16 = 0b11111 + _15to10 = 0b111111 + + return _31to21<<21 | _20to16<<16 | _15to10<<10 | rn<<5 | rt +} + +func encodeDMB() uint32 { + return 0b11010101000000110011101110111111 +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go new file mode 100644 index 000000000..698b382d4 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go @@ -0,0 +1,301 @@ +package arm64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// lowerConstant allocates a new VReg and inserts the instruction to load the constant value. 
+func (m *machine) lowerConstant(instr *ssa.Instruction) (vr regalloc.VReg) { + val := instr.Return() + valType := val.Type() + + vr = m.compiler.AllocateVReg(valType) + v := instr.ConstantVal() + m.insertLoadConstant(v, valType, vr) + return +} + +// InsertLoadConstantBlockArg implements backend.Machine. +func (m *machine) InsertLoadConstantBlockArg(instr *ssa.Instruction, vr regalloc.VReg) { + val := instr.Return() + valType := val.Type() + v := instr.ConstantVal() + load := m.allocateInstr() + load.asLoadConstBlockArg(v, valType, vr) + m.insert(load) +} + +func (m *machine) lowerLoadConstantBlockArgAfterRegAlloc(i *instruction) { + v, typ, dst := i.loadConstBlockArgData() + m.insertLoadConstant(v, typ, dst) +} + +func (m *machine) insertLoadConstant(v uint64, valType ssa.Type, vr regalloc.VReg) { + if valType.Bits() < 64 { // Clear the redundant bits just in case it's unexpectedly sign-extended, etc. + v = v & ((1 << valType.Bits()) - 1) + } + + switch valType { + case ssa.TypeF32: + loadF := m.allocateInstr() + loadF.asLoadFpuConst32(vr, v) + m.insert(loadF) + case ssa.TypeF64: + loadF := m.allocateInstr() + loadF.asLoadFpuConst64(vr, v) + m.insert(loadF) + case ssa.TypeI32: + if v == 0 { + m.InsertMove(vr, xzrVReg, ssa.TypeI32) + } else { + m.lowerConstantI32(vr, int32(v)) + } + case ssa.TypeI64: + if v == 0 { + m.InsertMove(vr, xzrVReg, ssa.TypeI64) + } else { + m.lowerConstantI64(vr, int64(v)) + } + default: + panic("TODO") + } +} + +// The following logics are based on the old asm/arm64 package. +// https://github.com/tetratelabs/wazero/blob/39f2ff23a6d609e10c82b9cc0b981f6de5b87a9c/internal/asm/arm64/impl.go + +func (m *machine) lowerConstantI32(dst regalloc.VReg, c int32) { + // Following the logic here: + // https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1637 + ic := int64(uint32(c)) + if ic >= 0 && (ic <= 0xfff || (ic&0xfff) == 0 && (uint64(ic>>12) <= 0xfff)) { + if isBitMaskImmediate(uint64(c), false) { + m.lowerConstViaBitMaskImmediate(uint64(uint32(c)), dst, false) + return + } + } + + if t := const16bitAligned(int64(uint32(c))); t >= 0 { + // If the const can fit within 16-bit alignment, for example, 0xffff, 0xffff_0000 or 0xffff_0000_0000_0000 + // We could load it into temporary with movk. + m.insertMOVZ(dst, uint64(uint32(c)>>(16*t)), t, false) + } else if t := const16bitAligned(int64(^c)); t >= 0 { + // Also, if the inverse of the const can fit within 16-bit range, do the same ^^. + m.insertMOVN(dst, uint64(^c>>(16*t)), t, false) + } else if isBitMaskImmediate(uint64(uint32(c)), false) { + m.lowerConstViaBitMaskImmediate(uint64(c), dst, false) + } else { + // Otherwise, we use MOVZ and MOVK to load it. + c16 := uint16(c) + m.insertMOVZ(dst, uint64(c16), 0, false) + c16 = uint16(uint32(c) >> 16) + m.insertMOVK(dst, uint64(c16), 1, false) + } +} + +func (m *machine) lowerConstantI64(dst regalloc.VReg, c int64) { + // Following the logic here: + // https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1798-L1852 + if c >= 0 && (c <= 0xfff || (c&0xfff) == 0 && (uint64(c>>12) <= 0xfff)) { + if isBitMaskImmediate(uint64(c), true) { + m.lowerConstViaBitMaskImmediate(uint64(c), dst, true) + return + } + } + + if t := const16bitAligned(c); t >= 0 { + // If the const can fit within 16-bit alignment, for example, 0xffff, 0xffff_0000 or 0xffff_0000_0000_0000 + // We could load it into temporary with movk. 
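lowerConstantI32 above (and lowerConstantI64 just below) emit a single MOVZ when every set bit of the constant sits in one 16-bit halfword, and a single MOVN when the same holds for its bitwise inverse; const16bitAligned, defined further down in this file, reports which halfword that is. The following standalone sketch (illustration only) reimplements that check to show where a few sample constants land.

package main

import "fmt"

// halfwordSlot returns s/16 when v has set bits only inside the 16-bit halfword
// starting at bit s (s = 0, 16, 32 or 48), and -1 otherwise. It mirrors the
// const16bitAligned helper defined later in the file.
func halfwordSlot(v uint64) int {
	for s := 0; s < 64; s += 16 {
		if v&^(uint64(0xffff)<<uint(s)) == 0 {
			return s / 16
		}
	}
	return -1
}

func main() {
	fmt.Println(halfwordSlot(0xffff0000)) // 1 -> one movz #0xffff, lsl #16
	v := ^uint64(0xffff)                  // 0xffffffffffff0000
	fmt.Println(halfwordSlot(v), halfwordSlot(^v)) // -1 0 -> one movn #0xffff, lsl #0
	fmt.Println(halfwordSlot(0x00ff00ff)) // -1 -> bitmask immediate or movz+movk fallback
}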
+ m.insertMOVZ(dst, uint64(c)>>(16*t), t, true) + } else if t := const16bitAligned(^c); t >= 0 { + // Also, if the reverse of the const can fit within 16-bit range, do the same ^^. + m.insertMOVN(dst, uint64(^c)>>(16*t), t, true) + } else if isBitMaskImmediate(uint64(c), true) { + m.lowerConstViaBitMaskImmediate(uint64(c), dst, true) + } else { + m.load64bitConst(c, dst) + } +} + +func (m *machine) lowerConstViaBitMaskImmediate(c uint64, dst regalloc.VReg, b64 bool) { + instr := m.allocateInstr() + instr.asALUBitmaskImm(aluOpOrr, dst, xzrVReg, c, b64) + m.insert(instr) +} + +// isBitMaskImmediate determines if the value can be encoded as "bitmask immediate". +// +// Such an immediate is a 32-bit or 64-bit pattern viewed as a vector of identical elements of size e = 2, 4, 8, 16, 32, or 64 bits. +// Each element contains the same sub-pattern: a single run of 1 to e-1 non-zero bits, rotated by 0 to e-1 bits. +// +// See https://developer.arm.com/documentation/dui0802/b/A64-General-Instructions/MOV--bitmask-immediate- +func isBitMaskImmediate(x uint64, _64 bool) bool { + // All zeros and ones are not "bitmask immediate" by definition. + if x == 0 || (_64 && x == 0xffff_ffff_ffff_ffff) || (!_64 && x == 0xffff_ffff) { + return false + } + + switch { + case x != x>>32|x<<32: + // e = 64 + case x != x>>16|x<<48: + // e = 32 (x == x>>32|x<<32). + // e.g. 0x00ff_ff00_00ff_ff00 + x = uint64(int32(x)) + case x != x>>8|x<<56: + // e = 16 (x == x>>16|x<<48). + // e.g. 0x00ff_00ff_00ff_00ff + x = uint64(int16(x)) + case x != x>>4|x<<60: + // e = 8 (x == x>>8|x<<56). + // e.g. 0x0f0f_0f0f_0f0f_0f0f + x = uint64(int8(x)) + default: + // e = 4 or 2. + return true + } + return sequenceOfSetbits(x) || sequenceOfSetbits(^x) +} + +// sequenceOfSetbits returns true if the number's binary representation is the sequence set bit (1). +// For example: 0b1110 -> true, 0b1010 -> false +func sequenceOfSetbits(x uint64) bool { + y := getLowestBit(x) + // If x is a sequence of set bit, this should results in the number + // with only one set bit (i.e. power of two). + y += x + return (y-1)&y == 0 +} + +func getLowestBit(x uint64) uint64 { + return x & (^x + 1) +} + +// const16bitAligned check if the value is on the 16-bit alignment. +// If so, returns the shift num divided by 16, and otherwise -1. +func const16bitAligned(v int64) (ret int) { + ret = -1 + for s := 0; s < 64; s += 16 { + if (uint64(v) &^ (uint64(0xffff) << uint(s))) == 0 { + ret = s / 16 + break + } + } + return +} + +// load64bitConst loads a 64-bit constant into the register, following the same logic to decide how to load large 64-bit +// consts as in the Go assembler. +// +// See https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L6632-L6759 +func (m *machine) load64bitConst(c int64, dst regalloc.VReg) { + var bits [4]uint64 + var zeros, negs int + for i := 0; i < 4; i++ { + bits[i] = uint64(c) >> uint(i*16) & 0xffff + if v := bits[i]; v == 0 { + zeros++ + } else if v == 0xffff { + negs++ + } + } + + if zeros == 3 { + // one MOVZ instruction. + for i, v := range bits { + if v != 0 { + m.insertMOVZ(dst, v, i, true) + } + } + } else if negs == 3 { + // one MOVN instruction. + for i, v := range bits { + if v != 0xffff { + v = ^v + m.insertMOVN(dst, v, i, true) + } + } + } else if zeros == 2 { + // one MOVZ then one OVK. + var movz bool + for i, v := range bits { + if !movz && v != 0 { // MOVZ. 
+ m.insertMOVZ(dst, v, i, true) + movz = true + } else if v != 0 { + m.insertMOVK(dst, v, i, true) + } + } + + } else if negs == 2 { + // one MOVN then one or two MOVK. + var movn bool + for i, v := range bits { // Emit MOVN. + if !movn && v != 0xffff { + v = ^v + // https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN + m.insertMOVN(dst, v, i, true) + movn = true + } else if v != 0xffff { + m.insertMOVK(dst, v, i, true) + } + } + + } else if zeros == 1 { + // one MOVZ then two MOVK. + var movz bool + for i, v := range bits { + if !movz && v != 0 { // MOVZ. + m.insertMOVZ(dst, v, i, true) + movz = true + } else if v != 0 { + m.insertMOVK(dst, v, i, true) + } + } + + } else if negs == 1 { + // one MOVN then two MOVK. + var movn bool + for i, v := range bits { // Emit MOVN. + if !movn && v != 0xffff { + v = ^v + // https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN + m.insertMOVN(dst, v, i, true) + movn = true + } else if v != 0xffff { + m.insertMOVK(dst, v, i, true) + } + } + + } else { + // one MOVZ then up to three MOVK. + var movz bool + for i, v := range bits { + if !movz && v != 0 { // MOVZ. + m.insertMOVZ(dst, v, i, true) + movz = true + } else if v != 0 { + m.insertMOVK(dst, v, i, true) + } + } + } +} + +func (m *machine) insertMOVZ(dst regalloc.VReg, v uint64, shift int, dst64 bool) { + instr := m.allocateInstr() + instr.asMOVZ(dst, v, uint64(shift), dst64) + m.insert(instr) +} + +func (m *machine) insertMOVK(dst regalloc.VReg, v uint64, shift int, dst64 bool) { + instr := m.allocateInstr() + instr.asMOVK(dst, v, uint64(shift), dst64) + m.insert(instr) +} + +func (m *machine) insertMOVN(dst regalloc.VReg, v uint64, shift int, dst64 bool) { + instr := m.allocateInstr() + instr.asMOVN(dst, v, uint64(shift), dst64) + m.insert(instr) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go new file mode 100644 index 000000000..2bb234e8c --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go @@ -0,0 +1,2221 @@ +package arm64 + +// Files prefixed as lower_instr** do the instruction selection, meaning that lowering SSA level instructions +// into machine specific instructions. +// +// Importantly, what the lower** functions does includes tree-matching; find the pattern from the given instruction tree, +// and merge the multiple instructions if possible. It can be considered as "N:1" instruction selection. + +import ( + "fmt" + "math" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// LowerSingleBranch implements backend.Machine. 
+func (m *machine) LowerSingleBranch(br *ssa.Instruction) { + ectx := m.executableContext + switch br.Opcode() { + case ssa.OpcodeJump: + _, _, targetBlk := br.BranchData() + if br.IsFallthroughJump() { + return + } + b := m.allocateInstr() + target := ectx.GetOrAllocateSSABlockLabel(targetBlk) + if target == labelReturn { + b.asRet() + } else { + b.asBr(target) + } + m.insert(b) + case ssa.OpcodeBrTable: + m.lowerBrTable(br) + default: + panic("BUG: unexpected branch opcode" + br.Opcode().String()) + } +} + +func (m *machine) lowerBrTable(i *ssa.Instruction) { + index, targets := i.BrTableData() + indexOperand := m.getOperand_NR(m.compiler.ValueDefinition(index), extModeNone) + + // Firstly, we have to do the bounds check of the index, and + // set it to the default target (sitting at the end of the list) if it's out of bounds. + + // mov maxIndexReg #maximum_index + // subs wzr, index, maxIndexReg + // csel adjustedIndex, maxIndexReg, index, hs ;; if index is higher or equal than maxIndexReg. + maxIndexReg := m.compiler.AllocateVReg(ssa.TypeI32) + m.lowerConstantI32(maxIndexReg, int32(len(targets)-1)) + subs := m.allocateInstr() + subs.asALU(aluOpSubS, operandNR(xzrVReg), indexOperand, operandNR(maxIndexReg), false) + m.insert(subs) + csel := m.allocateInstr() + adjustedIndex := m.compiler.AllocateVReg(ssa.TypeI32) + csel.asCSel(operandNR(adjustedIndex), operandNR(maxIndexReg), indexOperand, hs, false) + m.insert(csel) + + brSequence := m.allocateInstr() + + tableIndex := m.addJmpTableTarget(targets) + brSequence.asBrTableSequence(adjustedIndex, tableIndex, len(targets)) + m.insert(brSequence) +} + +// LowerConditionalBranch implements backend.Machine. +func (m *machine) LowerConditionalBranch(b *ssa.Instruction) { + exctx := m.executableContext + cval, args, targetBlk := b.BranchData() + if len(args) > 0 { + panic(fmt.Sprintf( + "conditional branch shouldn't have args; likely a bug in critical edge splitting: from %s to %s", + exctx.CurrentSSABlk, + targetBlk, + )) + } + + target := exctx.GetOrAllocateSSABlockLabel(targetBlk) + cvalDef := m.compiler.ValueDefinition(cval) + + switch { + case m.compiler.MatchInstr(cvalDef, ssa.OpcodeIcmp): // This case, we can use the ALU flag set by SUBS instruction. + cvalInstr := cvalDef.Instr + x, y, c := cvalInstr.IcmpData() + cc, signed := condFlagFromSSAIntegerCmpCond(c), c.Signed() + if b.Opcode() == ssa.OpcodeBrz { + cc = cc.invert() + } + + if !m.tryLowerBandToFlag(x, y) { + m.lowerIcmpToFlag(x, y, signed) + } + cbr := m.allocateInstr() + cbr.asCondBr(cc.asCond(), target, false /* ignored */) + m.insert(cbr) + cvalDef.Instr.MarkLowered() + case m.compiler.MatchInstr(cvalDef, ssa.OpcodeFcmp): // This case we can use the Fpu flag directly. 
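Back in lowerBrTable above, the SUBS+CSEL pair clamps any out-of-range index onto the last entry of the target list, which holds the default target; using the unsigned hs condition also catches negative i32 indexes, since they appear as very large unsigned values. A minimal Go sketch of the same clamp, for illustration only:

package main

import "fmt"

// clampBrTableIndex mirrors "subs wzr, index, maxIndex; csel adjusted, maxIndex, index, hs":
// any index that is >= maxIndex as an unsigned 32-bit value is replaced by maxIndex.
func clampBrTableIndex(index uint32, numTargets int) uint32 {
	maxIndex := uint32(numTargets - 1) // the default target sits at the end of the list.
	if index >= maxIndex {             // "hs" = unsigned higher-or-same.
		return maxIndex
	}
	return index
}

func main() {
	fmt.Println(clampBrTableIndex(2, 4))          // 2: in range.
	fmt.Println(clampBrTableIndex(9, 4))          // 3: out of range -> default target.
	fmt.Println(clampBrTableIndex(0xffffffff, 4)) // 3: negative i32 seen as a huge unsigned value.
}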
+ cvalInstr := cvalDef.Instr + x, y, c := cvalInstr.FcmpData() + cc := condFlagFromSSAFloatCmpCond(c) + if b.Opcode() == ssa.OpcodeBrz { + cc = cc.invert() + } + m.lowerFcmpToFlag(x, y) + cbr := m.allocateInstr() + cbr.asCondBr(cc.asCond(), target, false /* ignored */) + m.insert(cbr) + cvalDef.Instr.MarkLowered() + default: + rn := m.getOperand_NR(cvalDef, extModeNone) + var c cond + if b.Opcode() == ssa.OpcodeBrz { + c = registerAsRegZeroCond(rn.nr()) + } else { + c = registerAsRegNotZeroCond(rn.nr()) + } + cbr := m.allocateInstr() + cbr.asCondBr(c, target, false) + m.insert(cbr) + } +} + +func (m *machine) tryLowerBandToFlag(x, y ssa.Value) (ok bool) { + xx := m.compiler.ValueDefinition(x) + yy := m.compiler.ValueDefinition(y) + if xx.IsFromInstr() && xx.Instr.Constant() && xx.Instr.ConstantVal() == 0 { + if m.compiler.MatchInstr(yy, ssa.OpcodeBand) { + bandInstr := yy.Instr + m.lowerBitwiseAluOp(bandInstr, aluOpAnds, true) + ok = true + bandInstr.MarkLowered() + return + } + } + + if yy.IsFromInstr() && yy.Instr.Constant() && yy.Instr.ConstantVal() == 0 { + if m.compiler.MatchInstr(xx, ssa.OpcodeBand) { + bandInstr := xx.Instr + m.lowerBitwiseAluOp(bandInstr, aluOpAnds, true) + ok = true + bandInstr.MarkLowered() + return + } + } + return +} + +// LowerInstr implements backend.Machine. +func (m *machine) LowerInstr(instr *ssa.Instruction) { + if l := instr.SourceOffset(); l.Valid() { + info := m.allocateInstr().asEmitSourceOffsetInfo(l) + m.insert(info) + } + + switch op := instr.Opcode(); op { + case ssa.OpcodeBrz, ssa.OpcodeBrnz, ssa.OpcodeJump, ssa.OpcodeBrTable: + panic("BUG: branching instructions are handled by LowerBranches") + case ssa.OpcodeReturn: + panic("BUG: return must be handled by backend.Compiler") + case ssa.OpcodeIadd, ssa.OpcodeIsub: + m.lowerSubOrAdd(instr, op == ssa.OpcodeIadd) + case ssa.OpcodeFadd, ssa.OpcodeFsub, ssa.OpcodeFmul, ssa.OpcodeFdiv, ssa.OpcodeFmax, ssa.OpcodeFmin: + m.lowerFpuBinOp(instr) + case ssa.OpcodeIconst, ssa.OpcodeF32const, ssa.OpcodeF64const: // Constant instructions are inlined. 
+ case ssa.OpcodeExitWithCode: + execCtx, code := instr.ExitWithCodeData() + m.lowerExitWithCode(m.compiler.VRegOf(execCtx), code) + case ssa.OpcodeExitIfTrueWithCode: + execCtx, c, code := instr.ExitIfTrueWithCodeData() + m.lowerExitIfTrueWithCode(m.compiler.VRegOf(execCtx), c, code) + case ssa.OpcodeStore, ssa.OpcodeIstore8, ssa.OpcodeIstore16, ssa.OpcodeIstore32: + m.lowerStore(instr) + case ssa.OpcodeLoad: + dst := instr.Return() + ptr, offset, typ := instr.LoadData() + m.lowerLoad(ptr, offset, typ, dst) + case ssa.OpcodeVZeroExtLoad: + dst := instr.Return() + ptr, offset, typ := instr.VZeroExtLoadData() + m.lowerLoad(ptr, offset, typ, dst) + case ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32, ssa.OpcodeSload8, ssa.OpcodeSload16, ssa.OpcodeSload32: + ptr, offset, _ := instr.LoadData() + ret := m.compiler.VRegOf(instr.Return()) + m.lowerExtLoad(op, ptr, offset, ret) + case ssa.OpcodeCall, ssa.OpcodeCallIndirect: + m.lowerCall(instr) + case ssa.OpcodeIcmp: + m.lowerIcmp(instr) + case ssa.OpcodeVIcmp: + m.lowerVIcmp(instr) + case ssa.OpcodeVFcmp: + m.lowerVFcmp(instr) + case ssa.OpcodeVCeil: + m.lowerVecMisc(vecOpFrintp, instr) + case ssa.OpcodeVFloor: + m.lowerVecMisc(vecOpFrintm, instr) + case ssa.OpcodeVTrunc: + m.lowerVecMisc(vecOpFrintz, instr) + case ssa.OpcodeVNearest: + m.lowerVecMisc(vecOpFrintn, instr) + case ssa.OpcodeVMaxPseudo: + m.lowerVMinMaxPseudo(instr, true) + case ssa.OpcodeVMinPseudo: + m.lowerVMinMaxPseudo(instr, false) + case ssa.OpcodeBand: + m.lowerBitwiseAluOp(instr, aluOpAnd, false) + case ssa.OpcodeBor: + m.lowerBitwiseAluOp(instr, aluOpOrr, false) + case ssa.OpcodeBxor: + m.lowerBitwiseAluOp(instr, aluOpEor, false) + case ssa.OpcodeIshl: + m.lowerShifts(instr, extModeNone, aluOpLsl) + case ssa.OpcodeSshr: + if instr.Return().Type().Bits() == 64 { + m.lowerShifts(instr, extModeSignExtend64, aluOpAsr) + } else { + m.lowerShifts(instr, extModeSignExtend32, aluOpAsr) + } + case ssa.OpcodeUshr: + if instr.Return().Type().Bits() == 64 { + m.lowerShifts(instr, extModeZeroExtend64, aluOpLsr) + } else { + m.lowerShifts(instr, extModeZeroExtend32, aluOpLsr) + } + case ssa.OpcodeRotl: + m.lowerRotl(instr) + case ssa.OpcodeRotr: + m.lowerRotr(instr) + case ssa.OpcodeSExtend, ssa.OpcodeUExtend: + from, to, signed := instr.ExtendData() + m.lowerExtend(instr.Arg(), instr.Return(), from, to, signed) + case ssa.OpcodeFcmp: + x, y, c := instr.FcmpData() + m.lowerFcmp(x, y, instr.Return(), c) + case ssa.OpcodeImul: + x, y := instr.Arg2() + result := instr.Return() + m.lowerImul(x, y, result) + case ssa.OpcodeUndefined: + undef := m.allocateInstr() + undef.asUDF() + m.insert(undef) + case ssa.OpcodeSelect: + c, x, y := instr.SelectData() + if x.Type() == ssa.TypeV128 { + rc := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.lowerSelectVec(rc, rn, rm, rd) + } else { + m.lowerSelect(c, x, y, instr.Return()) + } + case ssa.OpcodeClz: + x := instr.Arg() + result := instr.Return() + m.lowerClz(x, result) + case ssa.OpcodeCtz: + x := instr.Arg() + result := instr.Return() + m.lowerCtz(x, result) + case ssa.OpcodePopcnt: + x := instr.Arg() + result := instr.Return() + m.lowerPopcnt(x, result) + case ssa.OpcodeFcvtToSint, ssa.OpcodeFcvtToSintSat: + x, ctx := instr.Arg2() + result := instr.Return() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := 
operandNR(m.compiler.VRegOf(result)) + ctxVReg := m.compiler.VRegOf(ctx) + m.lowerFpuToInt(rd, rn, ctxVReg, true, x.Type() == ssa.TypeF64, + result.Type().Bits() == 64, op == ssa.OpcodeFcvtToSintSat) + case ssa.OpcodeFcvtToUint, ssa.OpcodeFcvtToUintSat: + x, ctx := instr.Arg2() + result := instr.Return() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(result)) + ctxVReg := m.compiler.VRegOf(ctx) + m.lowerFpuToInt(rd, rn, ctxVReg, false, x.Type() == ssa.TypeF64, + result.Type().Bits() == 64, op == ssa.OpcodeFcvtToUintSat) + case ssa.OpcodeFcvtFromSint: + x := instr.Arg() + result := instr.Return() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(result)) + m.lowerIntToFpu(rd, rn, true, x.Type() == ssa.TypeI64, result.Type().Bits() == 64) + case ssa.OpcodeFcvtFromUint: + x := instr.Arg() + result := instr.Return() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(result)) + m.lowerIntToFpu(rd, rn, false, x.Type() == ssa.TypeI64, result.Type().Bits() == 64) + case ssa.OpcodeFdemote: + v := instr.Arg() + rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + cnt := m.allocateInstr() + cnt.asFpuRR(fpuUniOpCvt64To32, rd, rn, false) + m.insert(cnt) + case ssa.OpcodeFpromote: + v := instr.Arg() + rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + cnt := m.allocateInstr() + cnt.asFpuRR(fpuUniOpCvt32To64, rd, rn, true) + m.insert(cnt) + case ssa.OpcodeIreduce: + rn := m.getOperand_NR(m.compiler.ValueDefinition(instr.Arg()), extModeNone) + retVal := instr.Return() + rd := m.compiler.VRegOf(retVal) + + if retVal.Type() != ssa.TypeI32 { + panic("TODO?: Ireduce to non-i32") + } + mov := m.allocateInstr() + mov.asMove32(rd, rn.reg()) + m.insert(mov) + case ssa.OpcodeFneg: + m.lowerFpuUniOp(fpuUniOpNeg, instr.Arg(), instr.Return()) + case ssa.OpcodeSqrt: + m.lowerFpuUniOp(fpuUniOpSqrt, instr.Arg(), instr.Return()) + case ssa.OpcodeCeil: + m.lowerFpuUniOp(fpuUniOpRoundPlus, instr.Arg(), instr.Return()) + case ssa.OpcodeFloor: + m.lowerFpuUniOp(fpuUniOpRoundMinus, instr.Arg(), instr.Return()) + case ssa.OpcodeTrunc: + m.lowerFpuUniOp(fpuUniOpRoundZero, instr.Arg(), instr.Return()) + case ssa.OpcodeNearest: + m.lowerFpuUniOp(fpuUniOpRoundNearest, instr.Arg(), instr.Return()) + case ssa.OpcodeFabs: + m.lowerFpuUniOp(fpuUniOpAbs, instr.Arg(), instr.Return()) + case ssa.OpcodeBitcast: + m.lowerBitcast(instr) + case ssa.OpcodeFcopysign: + x, y := instr.Arg2() + m.lowerFcopysign(x, y, instr.Return()) + case ssa.OpcodeSdiv, ssa.OpcodeUdiv: + x, y, ctx := instr.Arg3() + ctxVReg := m.compiler.VRegOf(ctx) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.lowerIDiv(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSdiv) + case ssa.OpcodeSrem, ssa.OpcodeUrem: + x, y, ctx := instr.Arg3() + ctxVReg := m.compiler.VRegOf(ctx) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.lowerIRem(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSrem) + case ssa.OpcodeVconst: + result := m.compiler.VRegOf(instr.Return()) + lo, hi := 
instr.VconstData() + v := m.allocateInstr() + v.asLoadFpuConst128(result, lo, hi) + m.insert(v) + case ssa.OpcodeVbnot: + x := instr.Arg() + ins := m.allocateInstr() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + ins.asVecMisc(vecOpNot, rd, rn, vecArrangement16B) + m.insert(ins) + case ssa.OpcodeVbxor: + x, y := instr.Arg2() + m.lowerVecRRR(vecOpEOR, x, y, instr.Return(), vecArrangement16B) + case ssa.OpcodeVbor: + x, y := instr.Arg2() + m.lowerVecRRR(vecOpOrr, x, y, instr.Return(), vecArrangement16B) + case ssa.OpcodeVband: + x, y := instr.Arg2() + m.lowerVecRRR(vecOpAnd, x, y, instr.Return(), vecArrangement16B) + case ssa.OpcodeVbandnot: + x, y := instr.Arg2() + m.lowerVecRRR(vecOpBic, x, y, instr.Return(), vecArrangement16B) + case ssa.OpcodeVbitselect: + c, x, y := instr.SelectData() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + creg := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone) + tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + // creg is overwritten by BSL, so we need to move it to the result register before the instruction + // in case when it is used somewhere else. + mov := m.allocateInstr() + mov.asFpuMov128(tmp.nr(), creg.nr()) + m.insert(mov) + + ins := m.allocateInstr() + ins.asVecRRRRewrite(vecOpBsl, tmp, rn, rm, vecArrangement16B) + m.insert(ins) + + mov2 := m.allocateInstr() + rd := m.compiler.VRegOf(instr.Return()) + mov2.asFpuMov128(rd, tmp.nr()) + m.insert(mov2) + case ssa.OpcodeVanyTrue, ssa.OpcodeVallTrue: + x, lane := instr.ArgWithLane() + var arr vecArrangement + if op == ssa.OpcodeVallTrue { + arr = ssaLaneToArrangement(lane) + } + rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.lowerVcheckTrue(op, rm, rd, arr) + case ssa.OpcodeVhighBits: + x, lane := instr.ArgWithLane() + rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + arr := ssaLaneToArrangement(lane) + m.lowerVhighBits(rm, rd, arr) + case ssa.OpcodeVIadd: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpAdd, x, y, instr.Return(), arr) + case ssa.OpcodeExtIaddPairwise: + v, lane, signed := instr.ExtIaddPairwiseData() + vv := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) + + tmpLo, tmpHi := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)), operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + var widen vecOp + if signed { + widen = vecOpSshll + } else { + widen = vecOpUshll + } + + var loArr, hiArr, dstArr vecArrangement + switch lane { + case ssa.VecLaneI8x16: + loArr, hiArr, dstArr = vecArrangement8B, vecArrangement16B, vecArrangement8H + case ssa.VecLaneI16x8: + loArr, hiArr, dstArr = vecArrangement4H, vecArrangement8H, vecArrangement4S + case ssa.VecLaneI32x4: + loArr, hiArr, dstArr = vecArrangement2S, vecArrangement4S, vecArrangement2D + default: + panic("unsupported lane " + lane.String()) + } + + widenLo := m.allocateInstr().asVecShiftImm(widen, tmpLo, vv, operandShiftImm(0), loArr) + widenHi := m.allocateInstr().asVecShiftImm(widen, tmpHi, vv, operandShiftImm(0), hiArr) + addp := m.allocateInstr().asVecRRR(vecOpAddp, operandNR(m.compiler.VRegOf(instr.Return())), tmpLo, tmpHi, dstArr) + m.insert(widenLo) + m.insert(widenHi) + m.insert(addp) + + case ssa.OpcodeVSaddSat: + x, y, lane := instr.Arg2WithLane() + arr := 
ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpSqadd, x, y, instr.Return(), arr) + case ssa.OpcodeVUaddSat: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpUqadd, x, y, instr.Return(), arr) + case ssa.OpcodeVIsub: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpSub, x, y, instr.Return(), arr) + case ssa.OpcodeVSsubSat: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpSqsub, x, y, instr.Return(), arr) + case ssa.OpcodeVUsubSat: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpUqsub, x, y, instr.Return(), arr) + case ssa.OpcodeVImin: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpSmin, x, y, instr.Return(), arr) + case ssa.OpcodeVUmin: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpUmin, x, y, instr.Return(), arr) + case ssa.OpcodeVImax: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpSmax, x, y, instr.Return(), arr) + case ssa.OpcodeVUmax: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpUmax, x, y, instr.Return(), arr) + case ssa.OpcodeVAvgRound: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpUrhadd, x, y, instr.Return(), arr) + case ssa.OpcodeVImul: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.lowerVIMul(rd, rn, rm, arr) + case ssa.OpcodeVIabs: + m.lowerVecMisc(vecOpAbs, instr) + case ssa.OpcodeVIneg: + m.lowerVecMisc(vecOpNeg, instr) + case ssa.OpcodeVIpopcnt: + m.lowerVecMisc(vecOpCnt, instr) + case ssa.OpcodeVIshl, + ssa.OpcodeVSshr, ssa.OpcodeVUshr: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.lowerVShift(op, rd, rn, rm, arr) + case ssa.OpcodeVSqrt: + m.lowerVecMisc(vecOpFsqrt, instr) + case ssa.OpcodeVFabs: + m.lowerVecMisc(vecOpFabs, instr) + case ssa.OpcodeVFneg: + m.lowerVecMisc(vecOpFneg, instr) + case ssa.OpcodeVFmin: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpFmin, x, y, instr.Return(), arr) + case ssa.OpcodeVFmax: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpFmax, x, y, instr.Return(), arr) + case ssa.OpcodeVFadd: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpFadd, x, y, instr.Return(), arr) + case ssa.OpcodeVFsub: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpFsub, x, y, instr.Return(), arr) + case ssa.OpcodeVFmul: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpFmul, x, y, instr.Return(), arr) + case ssa.OpcodeSqmulRoundSat: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpSqrdmulh, x, y, instr.Return(), arr) + case ssa.OpcodeVFdiv: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpFdiv, x, y, instr.Return(), 
arr) + case ssa.OpcodeVFcvtToSintSat, ssa.OpcodeVFcvtToUintSat: + x, lane := instr.ArgWithLane() + arr := ssaLaneToArrangement(lane) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.lowerVfpuToInt(rd, rn, arr, op == ssa.OpcodeVFcvtToSintSat) + case ssa.OpcodeVFcvtFromSint, ssa.OpcodeVFcvtFromUint: + x, lane := instr.ArgWithLane() + arr := ssaLaneToArrangement(lane) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.lowerVfpuFromInt(rd, rn, arr, op == ssa.OpcodeVFcvtFromSint) + case ssa.OpcodeSwidenLow, ssa.OpcodeUwidenLow: + x, lane := instr.ArgWithLane() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + + var arr vecArrangement + switch lane { + case ssa.VecLaneI8x16: + arr = vecArrangement8B + case ssa.VecLaneI16x8: + arr = vecArrangement4H + case ssa.VecLaneI32x4: + arr = vecArrangement2S + } + + shll := m.allocateInstr() + if signed := op == ssa.OpcodeSwidenLow; signed { + shll.asVecShiftImm(vecOpSshll, rd, rn, operandShiftImm(0), arr) + } else { + shll.asVecShiftImm(vecOpUshll, rd, rn, operandShiftImm(0), arr) + } + m.insert(shll) + case ssa.OpcodeSwidenHigh, ssa.OpcodeUwidenHigh: + x, lane := instr.ArgWithLane() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + + arr := ssaLaneToArrangement(lane) + + shll := m.allocateInstr() + if signed := op == ssa.OpcodeSwidenHigh; signed { + shll.asVecShiftImm(vecOpSshll, rd, rn, operandShiftImm(0), arr) + } else { + shll.asVecShiftImm(vecOpUshll, rd, rn, operandShiftImm(0), arr) + } + m.insert(shll) + + case ssa.OpcodeSnarrow, ssa.OpcodeUnarrow: + x, y, lane := instr.Arg2WithLane() + var arr, arr2 vecArrangement + switch lane { + case ssa.VecLaneI16x8: // I16x8 + arr = vecArrangement8B + arr2 = vecArrangement16B // Implies sqxtn2. + case ssa.VecLaneI32x4: + arr = vecArrangement4H + arr2 = vecArrangement8H // Implies sqxtn2. + default: + panic("unsupported lane " + lane.String()) + } + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + + tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + loQxtn := m.allocateInstr() + hiQxtn := m.allocateInstr() + if signed := op == ssa.OpcodeSnarrow; signed { + // Narrow lanes on rn and write them into lower-half of rd. + loQxtn.asVecMisc(vecOpSqxtn, tmp, rn, arr) // low + // Narrow lanes on rm and write them into higher-half of rd. + hiQxtn.asVecMisc(vecOpSqxtn, tmp, rm, arr2) // high (sqxtn2) + } else { + // Narrow lanes on rn and write them into lower-half of rd. + loQxtn.asVecMisc(vecOpSqxtun, tmp, rn, arr) // low + // Narrow lanes on rm and write them into higher-half of rd. 
+ hiQxtn.asVecMisc(vecOpSqxtun, tmp, rm, arr2) // high (sqxtn2) + } + m.insert(loQxtn) + m.insert(hiQxtn) + + mov := m.allocateInstr() + mov.asFpuMov128(rd.nr(), tmp.nr()) + m.insert(mov) + case ssa.OpcodeFvpromoteLow: + x, lane := instr.ArgWithLane() + if lane != ssa.VecLaneF32x4 { + panic("unsupported lane type " + lane.String()) + } + ins := m.allocateInstr() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + ins.asVecMisc(vecOpFcvtl, rd, rn, vecArrangement2S) + m.insert(ins) + case ssa.OpcodeFvdemote: + x, lane := instr.ArgWithLane() + if lane != ssa.VecLaneF64x2 { + panic("unsupported lane type " + lane.String()) + } + ins := m.allocateInstr() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + ins.asVecMisc(vecOpFcvtn, rd, rn, vecArrangement2S) + m.insert(ins) + case ssa.OpcodeExtractlane: + x, index, signed, lane := instr.ExtractlaneData() + + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + + mov := m.allocateInstr() + switch lane { + case ssa.VecLaneI8x16: + mov.asMovFromVec(rd, rn, vecArrangementB, vecIndex(index), signed) + case ssa.VecLaneI16x8: + mov.asMovFromVec(rd, rn, vecArrangementH, vecIndex(index), signed) + case ssa.VecLaneI32x4: + mov.asMovFromVec(rd, rn, vecArrangementS, vecIndex(index), signed) + case ssa.VecLaneI64x2: + mov.asMovFromVec(rd, rn, vecArrangementD, vecIndex(index), signed) + case ssa.VecLaneF32x4: + mov.asVecMovElement(rd, rn, vecArrangementS, vecIndex(0), vecIndex(index)) + case ssa.VecLaneF64x2: + mov.asVecMovElement(rd, rn, vecArrangementD, vecIndex(0), vecIndex(index)) + default: + panic("unsupported lane: " + lane.String()) + } + + m.insert(mov) + + case ssa.OpcodeInsertlane: + x, y, index, lane := instr.InsertlaneData() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + // Initially mov rn to tmp. + mov1 := m.allocateInstr() + mov1.asFpuMov128(tmpReg.nr(), rn.nr()) + m.insert(mov1) + + // movToVec and vecMovElement do not clear the remaining bits to zero, + // thus, we can mov rm in-place to tmp. + mov2 := m.allocateInstr() + switch lane { + case ssa.VecLaneI8x16: + mov2.asMovToVec(tmpReg, rm, vecArrangementB, vecIndex(index)) + case ssa.VecLaneI16x8: + mov2.asMovToVec(tmpReg, rm, vecArrangementH, vecIndex(index)) + case ssa.VecLaneI32x4: + mov2.asMovToVec(tmpReg, rm, vecArrangementS, vecIndex(index)) + case ssa.VecLaneI64x2: + mov2.asMovToVec(tmpReg, rm, vecArrangementD, vecIndex(index)) + case ssa.VecLaneF32x4: + mov2.asVecMovElement(tmpReg, rm, vecArrangementS, vecIndex(index), vecIndex(0)) + case ssa.VecLaneF64x2: + mov2.asVecMovElement(tmpReg, rm, vecArrangementD, vecIndex(index), vecIndex(0)) + } + m.insert(mov2) + + // Finally mov tmp to rd. 
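Stepping back to the Snarrow/Unarrow lowering above: each sqxtn (signed) or sqxtun (signed input, unsigned result) lane saturates a wide lane to the narrower range, with the first source filling the low half of the result and the second the high half. The sketch below is not from this package; it only shows the scalar saturation rule for the signed i16-to-i8 flavour.

package main

import "fmt"

// narrowS16ToS8 saturates a signed 16-bit lane to the signed 8-bit range,
// which is what each sqxtn lane does in the Snarrow lowering above.
func narrowS16ToS8(v int16) int8 {
	switch {
	case v > 127:
		return 127
	case v < -128:
		return -128
	default:
		return int8(v)
	}
}

func main() {
	for _, v := range []int16{100, 300, -5, -4000} {
		fmt.Println(narrowS16ToS8(v)) // 100, 127, -5, -128
	}
}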
+ mov3 := m.allocateInstr() + mov3.asFpuMov128(rd.nr(), tmpReg.nr()) + m.insert(mov3) + + case ssa.OpcodeSwizzle: + x, y, lane := instr.Arg2WithLane() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + + arr := ssaLaneToArrangement(lane) + + // tbl <rd>.<arr>, { <rn>.<arr> }, <rm>.<arr> + tbl1 := m.allocateInstr() + tbl1.asVecTbl(1, rd, rn, rm, arr) + m.insert(tbl1) + + case ssa.OpcodeShuffle: + x, y, lane1, lane2 := instr.ShuffleData() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + + m.lowerShuffle(rd, rn, rm, lane1, lane2) + + case ssa.OpcodeSplat: + x, lane := instr.ArgWithLane() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + + dup := m.allocateInstr() + switch lane { + case ssa.VecLaneI8x16: + dup.asVecDup(rd, rn, vecArrangement16B) + case ssa.VecLaneI16x8: + dup.asVecDup(rd, rn, vecArrangement8H) + case ssa.VecLaneI32x4: + dup.asVecDup(rd, rn, vecArrangement4S) + case ssa.VecLaneI64x2: + dup.asVecDup(rd, rn, vecArrangement2D) + case ssa.VecLaneF32x4: + dup.asVecDupElement(rd, rn, vecArrangementS, vecIndex(0)) + case ssa.VecLaneF64x2: + dup.asVecDupElement(rd, rn, vecArrangementD, vecIndex(0)) + } + m.insert(dup) + + case ssa.OpcodeWideningPairwiseDotProductS: + x, y := instr.Arg2() + xx, yy := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone), + m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + tmp, tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)), operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + m.insert(m.allocateInstr().asVecRRR(vecOpSmull, tmp, xx, yy, vecArrangement8H)) + m.insert(m.allocateInstr().asVecRRR(vecOpSmull2, tmp2, xx, yy, vecArrangement8H)) + m.insert(m.allocateInstr().asVecRRR(vecOpAddp, tmp, tmp, tmp2, vecArrangement4S)) + + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.insert(m.allocateInstr().asFpuMov128(rd.nr(), tmp.nr())) + + case ssa.OpcodeLoadSplat: + ptr, offset, lane := instr.LoadSplatData() + m.lowerLoadSplat(ptr, offset, lane, instr.Return()) + + case ssa.OpcodeAtomicRmw: + m.lowerAtomicRmw(instr) + + case ssa.OpcodeAtomicCas: + m.lowerAtomicCas(instr) + + case ssa.OpcodeAtomicLoad: + m.lowerAtomicLoad(instr) + + case ssa.OpcodeAtomicStore: + m.lowerAtomicStore(instr) + + case ssa.OpcodeFence: + instr := m.allocateInstr() + instr.asDMB() + m.insert(instr) + + default: + panic("TODO: lowering " + op.String()) + } + m.executableContext.FlushPendingInstructions() +} + +func (m *machine) lowerShuffle(rd, rn, rm operand, lane1, lane2 uint64) { + // `tbl2` requires 2 consecutive registers, so we arbitrarily pick v29, v30. + vReg, wReg := v29VReg, v30VReg + + // Initialize v29, v30 to rn, rm. + movv := m.allocateInstr() + movv.asFpuMov128(vReg, rn.nr()) + m.insert(movv) + + movw := m.allocateInstr() + movw.asFpuMov128(wReg, rm.nr()) + m.insert(movw) + + // `lane1`, `lane2` are already encoded as two u64s with the right layout: + // lane1 := lane[7]<<56 | ... | lane[1]<<8 | lane[0] + // lane2 := lane[15]<<56 | ... | lane[9]<<8 | lane[8] + // Thus, we can use loadFpuConst128. 
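The byte layout described in the comment above can be reproduced with plain shifts; this standalone sketch (illustration only) packs sixteen byte-selector indices into the two 64-bit halves in the order lowerShuffle expects before they are handed to loadFpuConst128.

package main

import "fmt"

// packShuffleLanes packs 16 byte indices into the (lane1, lane2) pair used above:
// lane1 holds lanes[0..7] (lanes[0] in the lowest byte), lane2 holds lanes[8..15].
func packShuffleLanes(lanes [16]byte) (lane1, lane2 uint64) {
	for i := 0; i < 8; i++ {
		lane1 |= uint64(lanes[i]) << (8 * i)
		lane2 |= uint64(lanes[i+8]) << (8 * i)
	}
	return
}

func main() {
	// Identity shuffle: byte i selects byte i of the concatenated 32-byte input.
	var identity [16]byte
	for i := range identity {
		identity[i] = byte(i)
	}
	lane1, lane2 := packShuffleLanes(identity)
	fmt.Printf("0x%016x 0x%016x\n", lane1, lane2) // 0x0706050403020100 0x0f0e0d0c0b0a0908
}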
+ tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + lfc := m.allocateInstr() + lfc.asLoadFpuConst128(tmp.nr(), lane1, lane2) + m.insert(lfc) + + // tbl <rd>.16b, { <vReg>.16B, <wReg>.16b }, <tmp>.16b + tbl2 := m.allocateInstr() + tbl2.asVecTbl(2, rd, operandNR(vReg), tmp, vecArrangement16B) + m.insert(tbl2) +} + +func (m *machine) lowerVShift(op ssa.Opcode, rd, rn, rm operand, arr vecArrangement) { + var modulo byte + switch arr { + case vecArrangement16B: + modulo = 0x7 // Modulo 8. + case vecArrangement8H: + modulo = 0xf // Modulo 16. + case vecArrangement4S: + modulo = 0x1f // Modulo 32. + case vecArrangement2D: + modulo = 0x3f // Modulo 64. + default: + panic("unsupported arrangment " + arr.String()) + } + + rtmp := operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) + vtmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + and := m.allocateInstr() + and.asALUBitmaskImm(aluOpAnd, rtmp.nr(), rm.nr(), uint64(modulo), true) + m.insert(and) + + if op != ssa.OpcodeVIshl { + // Negate the amount to make this as right shift. + neg := m.allocateInstr() + neg.asALU(aluOpSub, rtmp, operandNR(xzrVReg), rtmp, true) + m.insert(neg) + } + + // Copy the shift amount into a vector register as sshl/ushl requires it to be there. + dup := m.allocateInstr() + dup.asVecDup(vtmp, rtmp, arr) + m.insert(dup) + + if op == ssa.OpcodeVIshl || op == ssa.OpcodeVSshr { + sshl := m.allocateInstr() + sshl.asVecRRR(vecOpSshl, rd, rn, vtmp, arr) + m.insert(sshl) + } else { + ushl := m.allocateInstr() + ushl.asVecRRR(vecOpUshl, rd, rn, vtmp, arr) + m.insert(ushl) + } +} + +func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangement) { + tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + // Special case VallTrue for i64x2. + if op == ssa.OpcodeVallTrue && arr == vecArrangement2D { + // cmeq v3?.2d, v2?.2d, #0 + // addp v3?.2d, v3?.2d, v3?.2d + // fcmp v3?, v3? + // cset dst, eq + + ins := m.allocateInstr() + ins.asVecMisc(vecOpCmeq0, tmp, rm, vecArrangement2D) + m.insert(ins) + + addp := m.allocateInstr() + addp.asVecRRR(vecOpAddp, tmp, tmp, tmp, vecArrangement2D) + m.insert(addp) + + fcmp := m.allocateInstr() + fcmp.asFpuCmp(tmp, tmp, true) + m.insert(fcmp) + + cset := m.allocateInstr() + cset.asCSet(rd.nr(), false, eq) + m.insert(cset) + + return + } + + // Create a scalar value with umaxp or uminv, then compare it against zero. + ins := m.allocateInstr() + if op == ssa.OpcodeVanyTrue { + // umaxp v4?.16b, v2?.16b, v2?.16b + ins.asVecRRR(vecOpUmaxp, tmp, rm, rm, vecArrangement16B) + } else { + // uminv d4?, v2?.4s + ins.asVecLanes(vecOpUminv, tmp, rm, arr) + } + m.insert(ins) + + // mov x3?, v4?.d[0] + // ccmp x3?, #0x0, #0x0, al + // cset x3?, ne + // mov x0, x3? + + movv := m.allocateInstr() + movv.asMovFromVec(rd, tmp, vecArrangementD, vecIndex(0), false) + m.insert(movv) + + fc := m.allocateInstr() + fc.asCCmpImm(rd, uint64(0), al, 0, true) + m.insert(fc) + + cset := m.allocateInstr() + cset.asCSet(rd.nr(), false, ne) + m.insert(cset) +} + +func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) { + r0 := operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) + v0 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + v1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + switch arr { + case vecArrangement16B: + // sshr v6?.16b, v2?.16b, #7 + // movz x4?, #0x201, lsl 0 + // movk x4?, #0x804, lsl 16 + // movk x4?, #0x2010, lsl 32 + // movk x4?, #0x8040, lsl 48 + // dup v5?.2d, x4? 
+ // and v6?.16b, v6?.16b, v5?.16b + // ext v5?.16b, v6?.16b, v6?.16b, #8 + // zip1 v5?.16b, v6?.16b, v5?.16b + // addv s5?, v5?.8h + // umov s3?, v5?.h[0] + + // Right arithmetic shift on the original vector and store the result into v1. So we have: + // v1[i] = 0xff if vi<0, 0 otherwise. + sshr := m.allocateInstr() + sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(7), vecArrangement16B) + m.insert(sshr) + + // Load the bit mask into r0. + m.insertMOVZ(r0.nr(), 0x0201, 0, true) + m.insertMOVK(r0.nr(), 0x0804, 1, true) + m.insertMOVK(r0.nr(), 0x2010, 2, true) + m.insertMOVK(r0.nr(), 0x8040, 3, true) + + // dup r0 to v0. + dup := m.allocateInstr() + dup.asVecDup(v0, r0, vecArrangement2D) + m.insert(dup) + + // Lane-wise logical AND with the bit mask, meaning that we have + // v[i] = (1 << i) if vi<0, 0 otherwise. + // + // Below, we use the following notation: + // wi := (1 << i) if vi<0, 0 otherwise. + and := m.allocateInstr() + and.asVecRRR(vecOpAnd, v1, v1, v0, vecArrangement16B) + m.insert(and) + + // Swap the lower and higher 8 byte elements, and write it into v0, meaning that we have + // v0[i] = w(i+8) if i < 8, w(i-8) otherwise. + ext := m.allocateInstr() + ext.asVecExtract(v0, v1, v1, vecArrangement16B, uint32(8)) + m.insert(ext) + + // v = [w0, w8, ..., w7, w15] + zip1 := m.allocateInstr() + zip1.asVecPermute(vecOpZip1, v0, v1, v0, vecArrangement16B) + m.insert(zip1) + + // v.h[0] = w0 + ... + w15 + addv := m.allocateInstr() + addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H) + m.insert(addv) + + // Extract the v.h[0] as the result. + movfv := m.allocateInstr() + movfv.asMovFromVec(rd, v0, vecArrangementH, vecIndex(0), false) + m.insert(movfv) + case vecArrangement8H: + // sshr v6?.8h, v2?.8h, #15 + // movz x4?, #0x1, lsl 0 + // movk x4?, #0x2, lsl 16 + // movk x4?, #0x4, lsl 32 + // movk x4?, #0x8, lsl 48 + // dup v5?.2d, x4? + // lsl x4?, x4?, 0x4 + // ins v5?.d[1], x4? + // and v5?.16b, v6?.16b, v5?.16b + // addv s5?, v5?.8h + // umov s3?, v5?.h[0] + + // Right arithmetic shift on the original vector and store the result into v1. So we have: + // v[i] = 0xffff if vi<0, 0 otherwise. + sshr := m.allocateInstr() + sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(15), vecArrangement8H) + m.insert(sshr) + + // Load the bit mask into r0. + m.lowerConstantI64(r0.nr(), 0x0008000400020001) + + // dup r0 to vector v0. + dup := m.allocateInstr() + dup.asVecDup(v0, r0, vecArrangement2D) + m.insert(dup) + + lsl := m.allocateInstr() + lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(4), true) + m.insert(lsl) + + movv := m.allocateInstr() + movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1)) + m.insert(movv) + + // Lane-wise logical AND with the bitmask, meaning that we have + // v[i] = (1 << i) if vi<0, 0 otherwise for i=0..3 + // = (1 << (i+4)) if vi<0, 0 otherwise for i=3..7 + and := m.allocateInstr() + and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B) + m.insert(and) + + addv := m.allocateInstr() + addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H) + m.insert(addv) + + movfv := m.allocateInstr() + movfv.asMovFromVec(rd, v0, vecArrangementH, vecIndex(0), false) + m.insert(movfv) + case vecArrangement4S: + // sshr v6?.8h, v2?.8h, #15 + // movz x4?, #0x1, lsl 0 + // movk x4?, #0x2, lsl 16 + // movk x4?, #0x4, lsl 32 + // movk x4?, #0x8, lsl 48 + // dup v5?.2d, x4? + // lsl x4?, x4?, 0x4 + // ins v5?.d[1], x4? + // and v5?.16b, v6?.16b, v5?.16b + // addv s5?, v5?.8h + // umov s3?, v5?.h[0] + + // Right arithmetic shift on the original vector and store the result into v1. 
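lowerVhighBits implements the Wasm bitmask family: it collapses the sign bit of every lane into one scalar. The MOVZ/MOVK-built weights used above are simply 1<<laneIndex, so the whole sshr/and/addv dance computes the same result as the scalar loop below (a sketch for the i8x16 case, not code from this package).

package main

import "fmt"

// i8x16BitMask collects the sign bit of each of the 16 byte lanes into bit i of the
// result, which is what the sshr/and/addv sequence above computes on the vector side.
func i8x16BitMask(lanes [16]int8) (mask uint32) {
	for i, v := range lanes {
		if v < 0 {
			mask |= 1 << uint(i)
		}
	}
	return
}

func main() {
	var lanes [16]int8
	lanes[0], lanes[3], lanes[15] = -1, -128, -7
	fmt.Printf("0b%016b\n", i8x16BitMask(lanes)) // bits 0, 3 and 15 set: 0b1000000000001001
}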
So we have: + // v[i] = 0xffffffff if vi<0, 0 otherwise. + sshr := m.allocateInstr() + sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(31), vecArrangement4S) + m.insert(sshr) + + // Load the bit mask into r0. + m.lowerConstantI64(r0.nr(), 0x0000000200000001) + + // dup r0 to vector v0. + dup := m.allocateInstr() + dup.asVecDup(v0, r0, vecArrangement2D) + m.insert(dup) + + lsl := m.allocateInstr() + lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(2), true) + m.insert(lsl) + + movv := m.allocateInstr() + movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1)) + m.insert(movv) + + // Lane-wise logical AND with the bitmask, meaning that we have + // v[i] = (1 << i) if vi<0, 0 otherwise for i in [0, 1] + // = (1 << (i+4)) if vi<0, 0 otherwise for i in [2, 3] + and := m.allocateInstr() + and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B) + m.insert(and) + + addv := m.allocateInstr() + addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement4S) + m.insert(addv) + + movfv := m.allocateInstr() + movfv.asMovFromVec(rd, v0, vecArrangementS, vecIndex(0), false) + m.insert(movfv) + case vecArrangement2D: + // mov d3?, v2?.d[0] + // mov x4?, v2?.d[1] + // lsr x4?, x4?, 0x3f + // lsr d3?, d3?, 0x3f + // add s3?, s3?, w4?, lsl #1 + + // Move the lower 64-bit int into result. + movv0 := m.allocateInstr() + movv0.asMovFromVec(rd, rm, vecArrangementD, vecIndex(0), false) + m.insert(movv0) + + // Move the higher 64-bit int into r0. + movv1 := m.allocateInstr() + movv1.asMovFromVec(r0, rm, vecArrangementD, vecIndex(1), false) + m.insert(movv1) + + // Move the sign bit into the least significant bit. + lsr1 := m.allocateInstr() + lsr1.asALUShift(aluOpLsr, r0, r0, operandShiftImm(63), true) + m.insert(lsr1) + + lsr2 := m.allocateInstr() + lsr2.asALUShift(aluOpLsr, rd, rd, operandShiftImm(63), true) + m.insert(lsr2) + + // rd = (r0<<1) | rd + lsl := m.allocateInstr() + lsl.asALU(aluOpAdd, rd, rd, operandSR(r0.nr(), 1, shiftOpLSL), false) + m.insert(lsl) + default: + panic("Unsupported " + arr.String()) + } +} + +func (m *machine) lowerVecMisc(op vecOp, instr *ssa.Instruction) { + x, lane := instr.ArgWithLane() + arr := ssaLaneToArrangement(lane) + ins := m.allocateInstr() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + ins.asVecMisc(op, rd, rn, arr) + m.insert(ins) +} + +func (m *machine) lowerVecRRR(op vecOp, x, y, ret ssa.Value, arr vecArrangement) { + ins := m.allocateInstr() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(ret)) + ins.asVecRRR(op, rd, rn, rm, arr) + m.insert(ins) +} + +func (m *machine) lowerVIMul(rd, rn, rm operand, arr vecArrangement) { + if arr != vecArrangement2D { + mul := m.allocateInstr() + mul.asVecRRR(vecOpMul, rd, rn, rm, arr) + m.insert(mul) + } else { + tmp1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + tmp3 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + tmpRes := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + // Following the algorithm in https://chromium-review.googlesource.com/c/v8/v8/+/1781696 + rev64 := m.allocateInstr() + rev64.asVecMisc(vecOpRev64, tmp2, rm, vecArrangement4S) + m.insert(rev64) + + mul := m.allocateInstr() + mul.asVecRRR(vecOpMul, tmp2, tmp2, rn, vecArrangement4S) + m.insert(mul) + + xtn1 := m.allocateInstr() + xtn1.asVecMisc(vecOpXtn, tmp1, rn, vecArrangement2S) + m.insert(xtn1) + 
+ addp := m.allocateInstr() + addp.asVecRRR(vecOpAddp, tmp2, tmp2, tmp2, vecArrangement4S) + m.insert(addp) + + xtn2 := m.allocateInstr() + xtn2.asVecMisc(vecOpXtn, tmp3, rm, vecArrangement2S) + m.insert(xtn2) + + // Note: do not write the result directly into result yet. This is the same reason as in bsl. + // In short, in UMLAL instruction, the result register is also one of the source register, and + // the value on the result register is significant. + shll := m.allocateInstr() + shll.asVecMisc(vecOpShll, tmpRes, tmp2, vecArrangement2S) + m.insert(shll) + + umlal := m.allocateInstr() + umlal.asVecRRRRewrite(vecOpUmlal, tmpRes, tmp3, tmp1, vecArrangement2S) + m.insert(umlal) + + mov := m.allocateInstr() + mov.asFpuMov128(rd.nr(), tmpRes.nr()) + m.insert(mov) + } +} + +func (m *machine) lowerVMinMaxPseudo(instr *ssa.Instruction, max bool) { + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + + // Note: this usage of tmp is important. + // BSL modifies the destination register, so we need to use a temporary register so that + // the actual definition of the destination register happens *after* the BSL instruction. + // That way, we can force the spill instruction to be inserted after the BSL instruction. + tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + fcmgt := m.allocateInstr() + if max { + fcmgt.asVecRRR(vecOpFcmgt, tmp, rm, rn, arr) + } else { + // If min, swap the args. + fcmgt.asVecRRR(vecOpFcmgt, tmp, rn, rm, arr) + } + m.insert(fcmgt) + + bsl := m.allocateInstr() + bsl.asVecRRRRewrite(vecOpBsl, tmp, rm, rn, vecArrangement16B) + m.insert(bsl) + + res := operandNR(m.compiler.VRegOf(instr.Return())) + mov2 := m.allocateInstr() + mov2.asFpuMov128(res.nr(), tmp.nr()) + m.insert(mov2) +} + +func (m *machine) lowerIRem(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) { + div := m.allocateInstr() + + if signed { + div.asALU(aluOpSDiv, rd, rn, rm, _64bit) + } else { + div.asALU(aluOpUDiv, rd, rn, rm, _64bit) + } + m.insert(div) + + // Check if rm is zero: + m.exitIfNot(execCtxVReg, registerAsRegNotZeroCond(rm.nr()), _64bit, wazevoapi.ExitCodeIntegerDivisionByZero) + + // rd = rn-rd*rm by MSUB instruction. + msub := m.allocateInstr() + msub.asALURRRR(aluOpMSub, rd, rd, rm, rn, _64bit) + m.insert(msub) +} + +func (m *machine) lowerIDiv(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) { + div := m.allocateInstr() + + if signed { + div.asALU(aluOpSDiv, rd, rn, rm, _64bit) + } else { + div.asALU(aluOpUDiv, rd, rn, rm, _64bit) + } + m.insert(div) + + // Check if rm is zero: + m.exitIfNot(execCtxVReg, registerAsRegNotZeroCond(rm.nr()), _64bit, wazevoapi.ExitCodeIntegerDivisionByZero) + + if signed { + // We need to check the signed overflow which happens iff "math.MinInt{32,64} / -1" + minusOneCheck := m.allocateInstr() + // Sets eq condition if rm == -1. + minusOneCheck.asALU(aluOpAddS, operandNR(xzrVReg), rm, operandImm12(1, 0), _64bit) + m.insert(minusOneCheck) + + ccmp := m.allocateInstr() + // If eq condition is set, sets the flag by the result based on "rn - 1", otherwise clears the flag. + ccmp.asCCmpImm(rn, 1, eq, 0, _64bit) + m.insert(ccmp) + + // Check the overflow flag. + m.exitIfNot(execCtxVReg, vs.invert().asCond(), false, wazevoapi.ExitCodeIntegerOverflow) + } +} + +// exitIfNot emits a conditional branch to exit if the condition is not met. 
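lowerIRem above recovers the remainder from the quotient with MSUB (rd = rn - rd*rm), and lowerIDiv guards the two trapping cases of Wasm integer division: a zero divisor, and the single signed overflow MinInt{32,64} / -1 (the remainder does not trap on that pair). Below is a standalone Go sketch of those rules for the 64-bit signed case; the real code raises wazevoapi exit codes rather than returning errors.

package main

import (
	"errors"
	"fmt"
	"math"
)

// wasmSDiv64 mirrors the guards in lowerIDiv for the signed 64-bit case: trap on
// division by zero and on the single overflowing input pair MinInt64 / -1.
func wasmSDiv64(n, d int64) (int64, error) {
	if d == 0 {
		return 0, errors.New("integer division by zero")
	}
	if n == math.MinInt64 && d == -1 {
		return 0, errors.New("integer overflow")
	}
	return n / d, nil
}

// wasmSRem64 mirrors lowerIRem: only division by zero traps, and the remainder is
// recovered from the quotient via the MSUB identity rem = n - (n/d)*d.
func wasmSRem64(n, d int64) (int64, error) {
	if d == 0 {
		return 0, errors.New("integer division by zero")
	}
	if n == math.MinInt64 && d == -1 {
		return 0, nil // the quotient would overflow, but the remainder is well-defined: 0.
	}
	return n - (n/d)*d, nil
}

func main() {
	fmt.Println(wasmSDiv64(7, -2))             // -3 <nil>
	fmt.Println(wasmSRem64(7, -2))             // 1 <nil>
	fmt.Println(wasmSDiv64(math.MinInt64, -1)) // 0 integer overflow
	fmt.Println(wasmSRem64(math.MinInt64, -1)) // 0 <nil>
}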
+// If `c` (cond type) is a register, `cond64bit` must be chosen to indicate whether the register is 32-bit or 64-bit. +// Otherwise, `cond64bit` is ignored. +func (m *machine) exitIfNot(execCtxVReg regalloc.VReg, c cond, cond64bit bool, code wazevoapi.ExitCode) { + execCtxTmp := m.copyToTmp(execCtxVReg) + + cbr := m.allocateInstr() + m.insert(cbr) + m.lowerExitWithCode(execCtxTmp, code) + // Conditional branch target is after exit. + l := m.insertBrTargetLabel() + cbr.asCondBr(c, l, cond64bit) +} + +func (m *machine) lowerFcopysign(x, y, ret ssa.Value) { + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + var tmpI, tmpF operand + _64 := x.Type() == ssa.TypeF64 + if _64 { + tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) + tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) + } else { + tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF32)) + tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI32)) + } + rd := m.compiler.VRegOf(ret) + m.lowerFcopysignImpl(operandNR(rd), rn, rm, tmpI, tmpF, _64) +} + +func (m *machine) lowerFcopysignImpl(rd, rn, rm, tmpI, tmpF operand, _64bit bool) { + // This is exactly the same code emitted by GCC for "__builtin_copysign": + // + // mov x0, -9223372036854775808 + // fmov d2, x0 + // vbit v0.8b, v1.8b, v2.8b + // + + setMSB := m.allocateInstr() + if _64bit { + m.lowerConstantI64(tmpI.nr(), math.MinInt64) + setMSB.asMovToVec(tmpF, tmpI, vecArrangementD, vecIndex(0)) + } else { + m.lowerConstantI32(tmpI.nr(), math.MinInt32) + setMSB.asMovToVec(tmpF, tmpI, vecArrangementS, vecIndex(0)) + } + m.insert(setMSB) + + tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) + + mov := m.allocateInstr() + mov.asFpuMov64(tmpReg.nr(), rn.nr()) + m.insert(mov) + + vbit := m.allocateInstr() + vbit.asVecRRRRewrite(vecOpBit, tmpReg, rm, tmpF, vecArrangement8B) + m.insert(vbit) + + movDst := m.allocateInstr() + movDst.asFpuMov64(rd.nr(), tmpReg.nr()) + m.insert(movDst) +} + +func (m *machine) lowerBitcast(instr *ssa.Instruction) { + v, dstType := instr.BitcastData() + srcType := v.Type() + rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + srcInt := srcType.IsInt() + dstInt := dstType.IsInt() + switch { + case srcInt && !dstInt: // Int to Float: + mov := m.allocateInstr() + var arr vecArrangement + if srcType.Bits() == 64 { + arr = vecArrangementD + } else { + arr = vecArrangementS + } + mov.asMovToVec(rd, rn, arr, vecIndex(0)) + m.insert(mov) + case !srcInt && dstInt: // Float to Int: + mov := m.allocateInstr() + var arr vecArrangement + if dstType.Bits() == 64 { + arr = vecArrangementD + } else { + arr = vecArrangementS + } + mov.asMovFromVec(rd, rn, arr, vecIndex(0), false) + m.insert(mov) + default: + panic("TODO?BUG?") + } +} + +func (m *machine) lowerFpuUniOp(op fpuUniOp, in, out ssa.Value) { + rn := m.getOperand_NR(m.compiler.ValueDefinition(in), extModeNone) + rd := operandNR(m.compiler.VRegOf(out)) + + neg := m.allocateInstr() + neg.asFpuRR(op, rd, rn, in.Type().Bits() == 64) + m.insert(neg) +} + +func (m *machine) lowerFpuToInt(rd, rn operand, ctx regalloc.VReg, signed, src64bit, dst64bit, nonTrapping bool) { + if !nonTrapping { + // First of all, we have to clear the FPU flags. + flagClear := m.allocateInstr() + flagClear.asMovToFPSR(xzrVReg) + m.insert(flagClear) + } + + // Then, do the conversion which doesn't trap inherently. 
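Returning to lowerFcopysignImpl above: VBIT with a mask whose only set bit is the MSB copies exactly the sign bit of rm into the destination, which is all copysign needs. Below is a scalar Go sketch of the same bit manipulation for float64 (illustration only; math.Copysign is the stdlib equivalent of what the vector sequence computes per lane).

package main

import (
	"fmt"
	"math"
)

// copysign64 inserts the sign bit of from into the magnitude of to, the same
// bit-level operation the VBIT-with-MSB-mask sequence above performs on a lane.
func copysign64(to, from float64) float64 {
	const signMask uint64 = 1 << 63 // the MSB-only mask materialized via lowerConstantI64(math.MinInt64).
	bits := math.Float64bits(to)&^signMask | math.Float64bits(from)&signMask
	return math.Float64frombits(bits)
}

func main() {
	fmt.Println(copysign64(3.5, -2.0))  // -3.5
	fmt.Println(copysign64(-1.25, 5.0)) // 1.25
	fmt.Println(copysign64(3.5, -2.0) == math.Copysign(3.5, -2.0)) // true
}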
+ cvt := m.allocateInstr() + cvt.asFpuToInt(rd, rn, signed, src64bit, dst64bit) + m.insert(cvt) + + if !nonTrapping { + tmpReg := m.compiler.AllocateVReg(ssa.TypeI64) + + // After the conversion, check the FPU flags. + getFlag := m.allocateInstr() + getFlag.asMovFromFPSR(tmpReg) + m.insert(getFlag) + + execCtx := m.copyToTmp(ctx) + _rn := operandNR(m.copyToTmp(rn.nr())) + + // Check if the conversion was undefined by comparing the status with 1. + // See https://developer.arm.com/documentation/ddi0595/2020-12/AArch64-Registers/FPSR--Floating-point-Status-Register + alu := m.allocateInstr() + alu.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpReg), operandImm12(1, 0), true) + m.insert(alu) + + // If it is not undefined, we can return the result. + ok := m.allocateInstr() + m.insert(ok) + + // Otherwise, we have to choose the status depending on it is overflow or NaN conversion. + + // Comparing itself to check if it is a NaN. + fpuCmp := m.allocateInstr() + fpuCmp.asFpuCmp(_rn, _rn, src64bit) + m.insert(fpuCmp) + // If the VC flag is not set (== VS flag is set), it is a NaN. + m.exitIfNot(execCtx, vc.asCond(), false, wazevoapi.ExitCodeInvalidConversionToInteger) + // Otherwise, it is an overflow. + m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) + + // Conditional branch target is after exit. + l := m.insertBrTargetLabel() + ok.asCondBr(ne.asCond(), l, false /* ignored */) + } +} + +func (m *machine) lowerIntToFpu(rd, rn operand, signed, src64bit, dst64bit bool) { + cvt := m.allocateInstr() + cvt.asIntToFpu(rd, rn, signed, src64bit, dst64bit) + m.insert(cvt) +} + +func (m *machine) lowerFpuBinOp(si *ssa.Instruction) { + instr := m.allocateInstr() + var op fpuBinOp + switch si.Opcode() { + case ssa.OpcodeFadd: + op = fpuBinOpAdd + case ssa.OpcodeFsub: + op = fpuBinOpSub + case ssa.OpcodeFmul: + op = fpuBinOpMul + case ssa.OpcodeFdiv: + op = fpuBinOpDiv + case ssa.OpcodeFmax: + op = fpuBinOpMax + case ssa.OpcodeFmin: + op = fpuBinOpMin + } + x, y := si.Arg2() + xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) + rn := m.getOperand_NR(xDef, extModeNone) + rm := m.getOperand_NR(yDef, extModeNone) + rd := operandNR(m.compiler.VRegOf(si.Return())) + instr.asFpuRRR(op, rd, rn, rm, x.Type().Bits() == 64) + m.insert(instr) +} + +func (m *machine) lowerSubOrAdd(si *ssa.Instruction, add bool) { + x, y := si.Arg2() + if !x.Type().IsInt() { + panic("BUG?") + } + + xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) + rn := m.getOperand_NR(xDef, extModeNone) + rm, yNegated := m.getOperand_MaybeNegatedImm12_ER_SR_NR(yDef, extModeNone) + + var aop aluOp + switch { + case add && !yNegated: // rn+rm = x+y + aop = aluOpAdd + case add && yNegated: // rn-rm = x-(-y) = x+y + aop = aluOpSub + case !add && !yNegated: // rn-rm = x-y + aop = aluOpSub + case !add && yNegated: // rn+rm = x-(-y) = x-y + aop = aluOpAdd + } + rd := operandNR(m.compiler.VRegOf(si.Return())) + alu := m.allocateInstr() + alu.asALU(aop, rd, rn, rm, x.Type().Bits() == 64) + m.insert(alu) +} + +// InsertMove implements backend.Machine. 
+func (m *machine) InsertMove(dst, src regalloc.VReg, typ ssa.Type) { + instr := m.allocateInstr() + switch typ { + case ssa.TypeI32, ssa.TypeI64: + instr.asMove64(dst, src) + case ssa.TypeF32, ssa.TypeF64: + instr.asFpuMov64(dst, src) + case ssa.TypeV128: + instr.asFpuMov128(dst, src) + default: + panic("TODO") + } + m.insert(instr) +} + +func (m *machine) lowerIcmp(si *ssa.Instruction) { + x, y, c := si.IcmpData() + flag := condFlagFromSSAIntegerCmpCond(c) + + in64bit := x.Type().Bits() == 64 + var ext extMode + if in64bit { + if c.Signed() { + ext = extModeSignExtend64 + } else { + ext = extModeZeroExtend64 + } + } else { + if c.Signed() { + ext = extModeSignExtend32 + } else { + ext = extModeZeroExtend32 + } + } + + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext) + rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), ext) + alu := m.allocateInstr() + alu.asALU(aluOpSubS, operandNR(xzrVReg), rn, rm, in64bit) + m.insert(alu) + + cset := m.allocateInstr() + cset.asCSet(m.compiler.VRegOf(si.Return()), false, flag) + m.insert(cset) +} + +func (m *machine) lowerVIcmp(si *ssa.Instruction) { + x, y, c, lane := si.VIcmpData() + flag := condFlagFromSSAIntegerCmpCond(c) + arr := ssaLaneToArrangement(lane) + + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(si.Return())) + + switch flag { + case eq: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr) + m.insert(cmp) + case ne: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr) + m.insert(cmp) + not := m.allocateInstr() + not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B) + m.insert(not) + case ge: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmge, rd, rn, rm, arr) + m.insert(cmp) + case gt: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmgt, rd, rn, rm, arr) + m.insert(cmp) + case le: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmge, rd, rm, rn, arr) // rm, rn are swapped + m.insert(cmp) + case lt: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmgt, rd, rm, rn, arr) // rm, rn are swapped + m.insert(cmp) + case hs: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmhs, rd, rn, rm, arr) + m.insert(cmp) + case hi: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmhi, rd, rn, rm, arr) + m.insert(cmp) + case ls: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmhs, rd, rm, rn, arr) // rm, rn are swapped + m.insert(cmp) + case lo: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmhi, rd, rm, rn, arr) // rm, rn are swapped + m.insert(cmp) + } +} + +func (m *machine) lowerVFcmp(si *ssa.Instruction) { + x, y, c, lane := si.VFcmpData() + flag := condFlagFromSSAFloatCmpCond(c) + arr := ssaLaneToArrangement(lane) + + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(si.Return())) + + switch flag { + case eq: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr) + m.insert(cmp) + case ne: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr) + m.insert(cmp) + not := m.allocateInstr() + not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B) + m.insert(not) + case ge: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpFcmge, rd, rn, rm, arr) + m.insert(cmp) + case gt: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpFcmgt, rd, rn, rm, arr) + m.insert(cmp) + case mi: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpFcmgt, rd, rm, rn, arr) // 
rm, rn are swapped + m.insert(cmp) + case ls: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpFcmge, rd, rm, rn, arr) // rm, rn are swapped + m.insert(cmp) + } +} + +func (m *machine) lowerVfpuToInt(rd, rn operand, arr vecArrangement, signed bool) { + cvt := m.allocateInstr() + if signed { + cvt.asVecMisc(vecOpFcvtzs, rd, rn, arr) + } else { + cvt.asVecMisc(vecOpFcvtzu, rd, rn, arr) + } + m.insert(cvt) + + if arr == vecArrangement2D { + narrow := m.allocateInstr() + if signed { + narrow.asVecMisc(vecOpSqxtn, rd, rd, vecArrangement2S) + } else { + narrow.asVecMisc(vecOpUqxtn, rd, rd, vecArrangement2S) + } + m.insert(narrow) + } +} + +func (m *machine) lowerVfpuFromInt(rd, rn operand, arr vecArrangement, signed bool) { + cvt := m.allocateInstr() + if signed { + cvt.asVecMisc(vecOpScvtf, rd, rn, arr) + } else { + cvt.asVecMisc(vecOpUcvtf, rd, rn, arr) + } + m.insert(cvt) +} + +func (m *machine) lowerShifts(si *ssa.Instruction, ext extMode, aluOp aluOp) { + x, amount := si.Arg2() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext) + rm := m.getOperand_ShiftImm_NR(m.compiler.ValueDefinition(amount), ext, x.Type().Bits()) + rd := operandNR(m.compiler.VRegOf(si.Return())) + + alu := m.allocateInstr() + alu.asALUShift(aluOp, rd, rn, rm, x.Type().Bits() == 64) + m.insert(alu) +} + +func (m *machine) lowerBitwiseAluOp(si *ssa.Instruction, op aluOp, ignoreResult bool) { + x, y := si.Arg2() + + xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) + rn := m.getOperand_NR(xDef, extModeNone) + + var rd operand + if ignoreResult { + rd = operandNR(xzrVReg) + } else { + rd = operandNR(m.compiler.VRegOf(si.Return())) + } + + _64 := x.Type().Bits() == 64 + alu := m.allocateInstr() + if instr := yDef.Instr; instr != nil && instr.Constant() { + c := instr.ConstantVal() + if isBitMaskImmediate(c, _64) { + // Constant bit wise operations can be lowered to a single instruction. + alu.asALUBitmaskImm(op, rd.nr(), rn.nr(), c, _64) + m.insert(alu) + return + } + } + + rm := m.getOperand_SR_NR(yDef, extModeNone) + alu.asALU(op, rd, rn, rm, _64) + m.insert(alu) +} + +func (m *machine) lowerRotl(si *ssa.Instruction) { + x, y := si.Arg2() + r := si.Return() + _64 := r.Type().Bits() == 64 + + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + var tmp operand + if _64 { + tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) + } else { + tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI32)) + } + rd := operandNR(m.compiler.VRegOf(r)) + + // Encode rotl as neg + rotr: neg is a sub against the zero-reg. + m.lowerRotlImpl(rd, rn, rm, tmp, _64) +} + +func (m *machine) lowerRotlImpl(rd, rn, rm, tmp operand, is64bit bool) { + // Encode rotl as neg + rotr: neg is a sub against the zero-reg. 
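+	// This works because rotl(x, n) == rotr(x, width-n), and negating n yields width-n modulo the register
+	// width, which is exactly the amount ROR consumes from its shift register.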
+ neg := m.allocateInstr() + neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), rm, is64bit) + m.insert(neg) + alu := m.allocateInstr() + alu.asALU(aluOpRotR, rd, rn, tmp, is64bit) + m.insert(alu) +} + +func (m *machine) lowerRotr(si *ssa.Instruction) { + x, y := si.Arg2() + + xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) + rn := m.getOperand_NR(xDef, extModeNone) + rm := m.getOperand_NR(yDef, extModeNone) + rd := operandNR(m.compiler.VRegOf(si.Return())) + + alu := m.allocateInstr() + alu.asALU(aluOpRotR, rd, rn, rm, si.Return().Type().Bits() == 64) + m.insert(alu) +} + +func (m *machine) lowerExtend(arg, ret ssa.Value, from, to byte, signed bool) { + rd := m.compiler.VRegOf(ret) + def := m.compiler.ValueDefinition(arg) + + if instr := def.Instr; !signed && from == 32 && instr != nil { + // We can optimize out the unsigned extend because: + // Writes to the W register set bits [63:32] of the X register to zero + // https://developer.arm.com/documentation/den0024/a/An-Introduction-to-the-ARMv8-Instruction-Sets/The-ARMv8-instruction-sets/Distinguishing-between-32-bit-and-64-bit-A64-instructions + switch instr.Opcode() { + case + ssa.OpcodeIadd, ssa.OpcodeIsub, ssa.OpcodeLoad, + ssa.OpcodeBand, ssa.OpcodeBor, ssa.OpcodeBnot, + ssa.OpcodeIshl, ssa.OpcodeUshr, ssa.OpcodeSshr, + ssa.OpcodeRotl, ssa.OpcodeRotr, + ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32: + // So, if the argument is the result of a 32-bit operation, we can just copy the register. + // It is highly likely that this copy will be optimized out after register allocation. + rn := m.compiler.VRegOf(arg) + mov := m.allocateInstr() + // Note: do not use move32 as it will be lowered to a 32-bit move, which is not copy (that is actually the impl of UExtend). + mov.asMove64(rd, rn) + m.insert(mov) + return + default: + } + } + rn := m.getOperand_NR(def, extModeNone) + + ext := m.allocateInstr() + ext.asExtend(rd, rn.nr(), from, to, signed) + m.insert(ext) +} + +func (m *machine) lowerFcmp(x, y, result ssa.Value, c ssa.FloatCmpCond) { + rn, rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone), m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + + fc := m.allocateInstr() + fc.asFpuCmp(rn, rm, x.Type().Bits() == 64) + m.insert(fc) + + cset := m.allocateInstr() + cset.asCSet(m.compiler.VRegOf(result), false, condFlagFromSSAFloatCmpCond(c)) + m.insert(cset) +} + +func (m *machine) lowerImul(x, y, result ssa.Value) { + rd := m.compiler.VRegOf(result) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + + // TODO: if this comes before Add/Sub, we could merge it by putting it into the place of xzrVReg. 
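+	// As emitted below, the multiply is a plain MADD with xzr as the addend, i.e. rd = rn*rm + 0.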
+ + mul := m.allocateInstr() + mul.asALURRRR(aluOpMAdd, operandNR(rd), rn, rm, operandNR(xzrVReg), x.Type().Bits() == 64) + m.insert(mul) +} + +func (m *machine) lowerClz(x, result ssa.Value) { + rd := m.compiler.VRegOf(result) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + clz := m.allocateInstr() + clz.asBitRR(bitOpClz, rd, rn.nr(), x.Type().Bits() == 64) + m.insert(clz) +} + +func (m *machine) lowerCtz(x, result ssa.Value) { + rd := m.compiler.VRegOf(result) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rbit := m.allocateInstr() + _64 := x.Type().Bits() == 64 + var tmpReg regalloc.VReg + if _64 { + tmpReg = m.compiler.AllocateVReg(ssa.TypeI64) + } else { + tmpReg = m.compiler.AllocateVReg(ssa.TypeI32) + } + rbit.asBitRR(bitOpRbit, tmpReg, rn.nr(), _64) + m.insert(rbit) + + clz := m.allocateInstr() + clz.asBitRR(bitOpClz, rd, tmpReg, _64) + m.insert(clz) +} + +func (m *machine) lowerPopcnt(x, result ssa.Value) { + // arm64 doesn't have an instruction for population count on scalar register, + // so we use the vector instruction `cnt`. + // This is exactly what the official Go implements bits.OneCount. + // For example, "func () int { return bits.OneCount(10) }" is compiled as + // + // MOVD $10, R0 ;; Load 10. + // FMOVD R0, F0 + // VCNT V0.B8, V0.B8 + // UADDLV V0.B8, V0 + // + // In aarch64 asm, FMOVD is encoded as `ins`, VCNT is `cnt`, + // and the registers may use different names. In our encoding we use the following + // instructions: + // + // ins v0.d[0], x0 ;; mov from GPR to vec (FMOV above) is encoded as INS + // cnt v0.16b, v0.16b ;; we use vec arrangement 16b + // uaddlv h0, v0.8b ;; h0 is still v0 with the dest width specifier 'H', implied when src arrangement is 8b + // mov x5, v0.d[0] ;; finally we mov the result back to a GPR + // + + rd := operandNR(m.compiler.VRegOf(result)) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + + rf1 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) + ins := m.allocateInstr() + ins.asMovToVec(rf1, rn, vecArrangementD, vecIndex(0)) + m.insert(ins) + + rf2 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) + cnt := m.allocateInstr() + cnt.asVecMisc(vecOpCnt, rf2, rf1, vecArrangement16B) + m.insert(cnt) + + rf3 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) + uaddlv := m.allocateInstr() + uaddlv.asVecLanes(vecOpUaddlv, rf3, rf2, vecArrangement8B) + m.insert(uaddlv) + + mov := m.allocateInstr() + mov.asMovFromVec(rd, rf3, vecArrangementD, vecIndex(0), false) + m.insert(mov) +} + +// lowerExitWithCode lowers the lowerExitWithCode takes a context pointer as argument. 
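+// It stores the given exit code, the current stack pointer, and the address of the exit point into the
+// execution context pointed to by execCtxVReg, and then emits the exit sequence.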
+func (m *machine) lowerExitWithCode(execCtxVReg regalloc.VReg, code wazevoapi.ExitCode) { + tmpReg1 := m.compiler.AllocateVReg(ssa.TypeI32) + loadExitCodeConst := m.allocateInstr() + loadExitCodeConst.asMOVZ(tmpReg1, uint64(code), 0, true) + + setExitCode := m.allocateInstr() + setExitCode.asStore(operandNR(tmpReg1), + addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(), + }, 32) + + // In order to unwind the stack, we also need to push the current stack pointer: + tmp2 := m.compiler.AllocateVReg(ssa.TypeI64) + movSpToTmp := m.allocateInstr() + movSpToTmp.asMove64(tmp2, spVReg) + strSpToExecCtx := m.allocateInstr() + strSpToExecCtx.asStore(operandNR(tmp2), + addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(), + }, 64) + // Also the address of this exit. + tmp3 := m.compiler.AllocateVReg(ssa.TypeI64) + currentAddrToTmp := m.allocateInstr() + currentAddrToTmp.asAdr(tmp3, 0) + storeCurrentAddrToExecCtx := m.allocateInstr() + storeCurrentAddrToExecCtx.asStore(operandNR(tmp3), + addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(), + }, 64) + + exitSeq := m.allocateInstr() + exitSeq.asExitSequence(execCtxVReg) + + m.insert(loadExitCodeConst) + m.insert(setExitCode) + m.insert(movSpToTmp) + m.insert(strSpToExecCtx) + m.insert(currentAddrToTmp) + m.insert(storeCurrentAddrToExecCtx) + m.insert(exitSeq) +} + +func (m *machine) lowerIcmpToFlag(x, y ssa.Value, signed bool) { + if x.Type() != y.Type() { + panic( + fmt.Sprintf("TODO(maybe): support icmp with different types: v%d=%s != v%d=%s", + x.ID(), x.Type(), y.ID(), y.Type())) + } + + extMod := extModeOf(x.Type(), signed) + + // First operand must be in pure register form. + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extMod) + // Second operand can be in any of Imm12, ER, SR, or NR form supported by the SUBS instructions. + rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), extMod) + + alu := m.allocateInstr() + // subs zr, rn, rm + alu.asALU( + aluOpSubS, + // We don't need the result, just need to set flags. + operandNR(xzrVReg), + rn, + rm, + x.Type().Bits() == 64, + ) + m.insert(alu) +} + +func (m *machine) lowerFcmpToFlag(x, y ssa.Value) { + if x.Type() != y.Type() { + panic("TODO(maybe): support icmp with different types") + } + + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + cmp := m.allocateInstr() + cmp.asFpuCmp(rn, rm, x.Type().Bits() == 64) + m.insert(cmp) +} + +func (m *machine) lowerExitIfTrueWithCode(execCtxVReg regalloc.VReg, cond ssa.Value, code wazevoapi.ExitCode) { + condDef := m.compiler.ValueDefinition(cond) + if !m.compiler.MatchInstr(condDef, ssa.OpcodeIcmp) { + panic("TODO: OpcodeExitIfTrueWithCode must come after Icmp at the moment: " + condDef.Instr.Opcode().String()) + } + condDef.Instr.MarkLowered() + + cvalInstr := condDef.Instr + x, y, c := cvalInstr.IcmpData() + signed := c.Signed() + + if !m.tryLowerBandToFlag(x, y) { + m.lowerIcmpToFlag(x, y, signed) + } + + // We need to copy the execution context to a temp register, because if it's spilled, + // it might end up being reloaded inside the exiting branch. + execCtxTmp := m.copyToTmp(execCtxVReg) + + // We have to skip the entire exit sequence if the condition is false. 
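+	// As in exitIfNot above, the conditional branch is allocated first and patched to target the label that is
+	// inserted after the exit sequence.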
+ cbr := m.allocateInstr() + m.insert(cbr) + m.lowerExitWithCode(execCtxTmp, code) + // conditional branch target is after exit. + l := m.insertBrTargetLabel() + cbr.asCondBr(condFlagFromSSAIntegerCmpCond(c).invert().asCond(), l, false /* ignored */) +} + +func (m *machine) lowerSelect(c, x, y, result ssa.Value) { + cvalDef := m.compiler.ValueDefinition(c) + + var cc condFlag + switch { + case m.compiler.MatchInstr(cvalDef, ssa.OpcodeIcmp): // This case, we can use the ALU flag set by SUBS instruction. + cvalInstr := cvalDef.Instr + x, y, c := cvalInstr.IcmpData() + cc = condFlagFromSSAIntegerCmpCond(c) + m.lowerIcmpToFlag(x, y, c.Signed()) + cvalDef.Instr.MarkLowered() + case m.compiler.MatchInstr(cvalDef, ssa.OpcodeFcmp): // This case we can use the Fpu flag directly. + cvalInstr := cvalDef.Instr + x, y, c := cvalInstr.FcmpData() + cc = condFlagFromSSAFloatCmpCond(c) + m.lowerFcmpToFlag(x, y) + cvalDef.Instr.MarkLowered() + default: + rn := m.getOperand_NR(cvalDef, extModeNone) + if c.Type() != ssa.TypeI32 && c.Type() != ssa.TypeI64 { + panic("TODO?BUG?: support select with non-integer condition") + } + alu := m.allocateInstr() + // subs zr, rn, zr + alu.asALU( + aluOpSubS, + // We don't need the result, just need to set flags. + operandNR(xzrVReg), + rn, + operandNR(xzrVReg), + c.Type().Bits() == 64, + ) + m.insert(alu) + cc = ne + } + + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + + rd := operandNR(m.compiler.VRegOf(result)) + switch x.Type() { + case ssa.TypeI32, ssa.TypeI64: + // csel rd, rn, rm, cc + csel := m.allocateInstr() + csel.asCSel(rd, rn, rm, cc, x.Type().Bits() == 64) + m.insert(csel) + case ssa.TypeF32, ssa.TypeF64: + // fcsel rd, rn, rm, cc + fcsel := m.allocateInstr() + fcsel.asFpuCSel(rd, rn, rm, cc, x.Type().Bits() == 64) + m.insert(fcsel) + default: + panic("BUG") + } +} + +func (m *machine) lowerSelectVec(rc, rn, rm, rd operand) { + // First check if `rc` is zero or not. + checkZero := m.allocateInstr() + checkZero.asALU(aluOpSubS, operandNR(xzrVReg), rc, operandNR(xzrVReg), false) + m.insert(checkZero) + + // Then use CSETM to set all bits to one if `rc` is zero. + allOnesOrZero := m.compiler.AllocateVReg(ssa.TypeI64) + cset := m.allocateInstr() + cset.asCSet(allOnesOrZero, true, ne) + m.insert(cset) + + // Then move the bits to the result vector register. + tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + dup := m.allocateInstr() + dup.asVecDup(tmp2, operandNR(allOnesOrZero), vecArrangement2D) + m.insert(dup) + + // Now that `tmp2` has either all bits one or zero depending on `rc`, + // we can use bsl to select between `rn` and `rm`. + ins := m.allocateInstr() + ins.asVecRRRRewrite(vecOpBsl, tmp2, rn, rm, vecArrangement16B) + m.insert(ins) + + // Finally, move the result to the destination register. 
+ mov2 := m.allocateInstr() + mov2.asFpuMov128(rd.nr(), tmp2.nr()) + m.insert(mov2) +} + +func (m *machine) lowerAtomicRmw(si *ssa.Instruction) { + ssaOp, size := si.AtomicRmwData() + + var op atomicRmwOp + var negateArg bool + var flipArg bool + switch ssaOp { + case ssa.AtomicRmwOpAdd: + op = atomicRmwOpAdd + case ssa.AtomicRmwOpSub: + op = atomicRmwOpAdd + negateArg = true + case ssa.AtomicRmwOpAnd: + op = atomicRmwOpClr + flipArg = true + case ssa.AtomicRmwOpOr: + op = atomicRmwOpSet + case ssa.AtomicRmwOpXor: + op = atomicRmwOpEor + case ssa.AtomicRmwOpXchg: + op = atomicRmwOpSwp + default: + panic(fmt.Sprintf("unknown ssa atomic rmw op: %s", ssaOp)) + } + + addr, val := si.Arg2() + addrDef, valDef := m.compiler.ValueDefinition(addr), m.compiler.ValueDefinition(val) + rn := m.getOperand_NR(addrDef, extModeNone) + rt := operandNR(m.compiler.VRegOf(si.Return())) + rs := m.getOperand_NR(valDef, extModeNone) + + _64 := si.Return().Type().Bits() == 64 + var tmp operand + if _64 { + tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) + } else { + tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI32)) + } + m.lowerAtomicRmwImpl(op, rn, rs, rt, tmp, size, negateArg, flipArg, _64) +} + +func (m *machine) lowerAtomicRmwImpl(op atomicRmwOp, rn, rs, rt, tmp operand, size uint64, negateArg, flipArg, dst64bit bool) { + switch { + case negateArg: + neg := m.allocateInstr() + neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), rs, dst64bit) + m.insert(neg) + case flipArg: + flip := m.allocateInstr() + flip.asALU(aluOpOrn, tmp, operandNR(xzrVReg), rs, dst64bit) + m.insert(flip) + default: + tmp = rs + } + + rmw := m.allocateInstr() + rmw.asAtomicRmw(op, rn, tmp, rt, size) + m.insert(rmw) +} + +func (m *machine) lowerAtomicCas(si *ssa.Instruction) { + addr, exp, repl := si.Arg3() + size := si.AtomicTargetSize() + + addrDef, expDef, replDef := m.compiler.ValueDefinition(addr), m.compiler.ValueDefinition(exp), m.compiler.ValueDefinition(repl) + rn := m.getOperand_NR(addrDef, extModeNone) + rt := m.getOperand_NR(replDef, extModeNone) + rs := m.getOperand_NR(expDef, extModeNone) + tmp := operandNR(m.compiler.AllocateVReg(si.Return().Type())) + + _64 := si.Return().Type().Bits() == 64 + // rs is overwritten by CAS, so we need to move it to the result register before the instruction + // in case when it is used somewhere else. 
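+	// (CAS writes the value loaded from memory back into the "expected" register operand, so the expected value
+	// is copied into tmp first and tmp, not rs, is handed to lowerAtomicCasImpl below.)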
+ mov := m.allocateInstr() + if _64 { + mov.asMove64(tmp.nr(), rs.nr()) + } else { + mov.asMove32(tmp.nr(), rs.nr()) + } + m.insert(mov) + + m.lowerAtomicCasImpl(rn, tmp, rt, size) + + mov2 := m.allocateInstr() + rd := m.compiler.VRegOf(si.Return()) + if _64 { + mov2.asMove64(rd, tmp.nr()) + } else { + mov2.asMove32(rd, tmp.nr()) + } + m.insert(mov2) +} + +func (m *machine) lowerAtomicCasImpl(rn, rs, rt operand, size uint64) { + cas := m.allocateInstr() + cas.asAtomicCas(rn, rs, rt, size) + m.insert(cas) +} + +func (m *machine) lowerAtomicLoad(si *ssa.Instruction) { + addr := si.Arg() + size := si.AtomicTargetSize() + + addrDef := m.compiler.ValueDefinition(addr) + rn := m.getOperand_NR(addrDef, extModeNone) + rt := operandNR(m.compiler.VRegOf(si.Return())) + + m.lowerAtomicLoadImpl(rn, rt, size) +} + +func (m *machine) lowerAtomicLoadImpl(rn, rt operand, size uint64) { + ld := m.allocateInstr() + ld.asAtomicLoad(rn, rt, size) + m.insert(ld) +} + +func (m *machine) lowerAtomicStore(si *ssa.Instruction) { + addr, val := si.Arg2() + size := si.AtomicTargetSize() + + addrDef := m.compiler.ValueDefinition(addr) + valDef := m.compiler.ValueDefinition(val) + rn := m.getOperand_NR(addrDef, extModeNone) + rt := m.getOperand_NR(valDef, extModeNone) + + m.lowerAtomicStoreImpl(rn, rt, size) +} + +func (m *machine) lowerAtomicStoreImpl(rn, rt operand, size uint64) { + ld := m.allocateInstr() + ld.asAtomicStore(rn, rt, size) + m.insert(ld) +} + +// copyToTmp copies the given regalloc.VReg to a temporary register. This is called before cbr to avoid the regalloc issue +// e.g. reload happening in the middle of the exit sequence which is not the path the normal path executes +func (m *machine) copyToTmp(v regalloc.VReg) regalloc.VReg { + typ := m.compiler.TypeOf(v) + mov := m.allocateInstr() + tmp := m.compiler.AllocateVReg(typ) + if typ.IsInt() { + mov.asMove64(tmp, v) + } else { + mov.asFpuMov128(tmp, v) + } + m.insert(mov) + return tmp +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr_operands.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr_operands.go new file mode 100644 index 000000000..d9fbf1789 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr_operands.go @@ -0,0 +1,350 @@ +package arm64 + +// This file contains the logic to "find and determine operands" for instructions. +// In order to finalize the form of an operand, we might end up merging/eliminating +// the source instructions into an operand whenever possible. + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +type ( + // operand represents an operand of an instruction whose type is determined by the kind. + operand struct { + kind operandKind + data, data2 uint64 + } + operandKind byte +) + +// Here's the list of operand kinds. We use the abbreviation of the kind name not only for these consts, +// but also names of functions which return the operand of the kind. +const ( + // operandKindNR represents "NormalRegister" (NR). This is literally the register without any special operation unlike others. + operandKindNR operandKind = iota + // operandKindSR represents "Shifted Register" (SR). This is a register which is shifted by a constant. + // Some of the arm64 instructions can take this kind of operand. 
+ operandKindSR + // operandKindER represents "Extended Register (ER). This is a register which is sign/zero-extended to a larger size. + // Some of the arm64 instructions can take this kind of operand. + operandKindER + // operandKindImm12 represents "Immediate 12" (Imm12). This is a 12-bit immediate value which can be either shifted by 12 or not. + // See asImm12 function for detail. + operandKindImm12 + // operandKindShiftImm represents "Shifted Immediate" (ShiftImm) used by shift operations. + operandKindShiftImm +) + +// String implements fmt.Stringer for debugging. +func (o operand) format(size byte) string { + switch o.kind { + case operandKindNR: + return formatVRegSized(o.nr(), size) + case operandKindSR: + r, amt, sop := o.sr() + return fmt.Sprintf("%s, %s #%d", formatVRegSized(r, size), sop, amt) + case operandKindER: + r, eop, _ := o.er() + return fmt.Sprintf("%s %s", formatVRegSized(r, size), eop) + case operandKindImm12: + imm12, shiftBit := o.imm12() + if shiftBit == 1 { + return fmt.Sprintf("#%#x", uint64(imm12)<<12) + } else { + return fmt.Sprintf("#%#x", imm12) + } + default: + panic(fmt.Sprintf("unknown operand kind: %d", o.kind)) + } +} + +// operandNR encodes the given VReg as an operand of operandKindNR. +func operandNR(r regalloc.VReg) operand { + return operand{kind: operandKindNR, data: uint64(r)} +} + +// nr decodes the underlying VReg assuming the operand is of operandKindNR. +func (o operand) nr() regalloc.VReg { + return regalloc.VReg(o.data) +} + +// operandER encodes the given VReg as an operand of operandKindER. +func operandER(r regalloc.VReg, eop extendOp, to byte) operand { + if to < 32 { + panic("TODO?BUG?: when we need to extend to less than 32 bits?") + } + return operand{kind: operandKindER, data: uint64(r), data2: uint64(eop)<<32 | uint64(to)} +} + +// er decodes the underlying VReg, extend operation, and the target size assuming the operand is of operandKindER. +func (o operand) er() (r regalloc.VReg, eop extendOp, to byte) { + return regalloc.VReg(o.data), extendOp(o.data2>>32) & 0xff, byte(o.data2 & 0xff) +} + +// operandSR encodes the given VReg as an operand of operandKindSR. +func operandSR(r regalloc.VReg, amt byte, sop shiftOp) operand { + return operand{kind: operandKindSR, data: uint64(r), data2: uint64(amt)<<32 | uint64(sop)} +} + +// sr decodes the underlying VReg, shift amount, and shift operation assuming the operand is of operandKindSR. +func (o operand) sr() (r regalloc.VReg, amt byte, sop shiftOp) { + return regalloc.VReg(o.data), byte(o.data2>>32) & 0xff, shiftOp(o.data2) & 0xff +} + +// operandImm12 encodes the given imm12 as an operand of operandKindImm12. +func operandImm12(imm12 uint16, shiftBit byte) operand { + return operand{kind: operandKindImm12, data: uint64(imm12) | uint64(shiftBit)<<32} +} + +// imm12 decodes the underlying imm12 data assuming the operand is of operandKindImm12. +func (o operand) imm12() (v uint16, shiftBit byte) { + return uint16(o.data), byte(o.data >> 32) +} + +// operandShiftImm encodes the given amount as an operand of operandKindShiftImm. +func operandShiftImm(amount byte) operand { + return operand{kind: operandKindShiftImm, data: uint64(amount)} +} + +// shiftImm decodes the underlying shift amount data assuming the operand is of operandKindShiftImm. +func (o operand) shiftImm() byte { + return byte(o.data) +} + +// reg returns the register of the operand if applicable. 
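+// The immediate kinds (operandKindImm12 and operandKindShiftImm) carry no register and yield regalloc.VRegInvalid.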
+func (o operand) reg() regalloc.VReg { + switch o.kind { + case operandKindNR: + return o.nr() + case operandKindSR: + r, _, _ := o.sr() + return r + case operandKindER: + r, _, _ := o.er() + return r + case operandKindImm12: + // Does not have a register. + case operandKindShiftImm: + // Does not have a register. + default: + panic(o.kind) + } + return regalloc.VRegInvalid +} + +func (o operand) realReg() regalloc.RealReg { + return o.nr().RealReg() +} + +func (o operand) assignReg(v regalloc.VReg) operand { + switch o.kind { + case operandKindNR: + return operandNR(v) + case operandKindSR: + _, amt, sop := o.sr() + return operandSR(v, amt, sop) + case operandKindER: + _, eop, to := o.er() + return operandER(v, eop, to) + case operandKindImm12: + // Does not have a register. + case operandKindShiftImm: + // Does not have a register. + } + panic(o.kind) +} + +// ensureValueNR returns an operand of either operandKindER, operandKindSR, or operandKindNR from the given value (defined by `def). +// +// `mode` is used to extend the operand if the bit length is smaller than mode.bits(). +// If the operand can be expressed as operandKindImm12, `mode` is ignored. +func (m *machine) getOperand_Imm12_ER_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) { + if def.IsFromBlockParam() { + return operandNR(def.BlkParamVReg) + } + + instr := def.Instr + if instr.Opcode() == ssa.OpcodeIconst { + if imm12Op, ok := asImm12Operand(instr.ConstantVal()); ok { + instr.MarkLowered() + return imm12Op + } + } + return m.getOperand_ER_SR_NR(def, mode) +} + +// getOperand_MaybeNegatedImm12_ER_SR_NR is almost the same as getOperand_Imm12_ER_SR_NR, but this might negate the immediate value. +// If the immediate value is negated, the second return value is true, otherwise always false. +func (m *machine) getOperand_MaybeNegatedImm12_ER_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand, negatedImm12 bool) { + if def.IsFromBlockParam() { + return operandNR(def.BlkParamVReg), false + } + + instr := def.Instr + if instr.Opcode() == ssa.OpcodeIconst { + c := instr.ConstantVal() + if imm12Op, ok := asImm12Operand(c); ok { + instr.MarkLowered() + return imm12Op, false + } + + signExtended := int64(c) + if def.SSAValue().Type().Bits() == 32 { + signExtended = (signExtended << 32) >> 32 + } + negatedWithoutSign := -signExtended + if imm12Op, ok := asImm12Operand(uint64(negatedWithoutSign)); ok { + instr.MarkLowered() + return imm12Op, true + } + } + return m.getOperand_ER_SR_NR(def, mode), false +} + +// ensureValueNR returns an operand of either operandKindER, operandKindSR, or operandKindNR from the given value (defined by `def). +// +// `mode` is used to extend the operand if the bit length is smaller than mode.bits(). 
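+// For example, a value produced by ssa.OpcodeSExtend from 32 to 64 bits can be folded into an extended-register
+// (ER) operand rather than being materialized by a separate extend instruction, provided `mode` allows it.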
+func (m *machine) getOperand_ER_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) { + if def.IsFromBlockParam() { + return operandNR(def.BlkParamVReg) + } + + if m.compiler.MatchInstr(def, ssa.OpcodeSExtend) || m.compiler.MatchInstr(def, ssa.OpcodeUExtend) { + extInstr := def.Instr + + signed := extInstr.Opcode() == ssa.OpcodeSExtend + innerExtFromBits, innerExtToBits := extInstr.ExtendFromToBits() + modeBits, modeSigned := mode.bits(), mode.signed() + if mode == extModeNone || innerExtToBits == modeBits { + eop := extendOpFrom(signed, innerExtFromBits) + extArg := m.getOperand_NR(m.compiler.ValueDefinition(extInstr.Arg()), extModeNone) + op = operandER(extArg.nr(), eop, innerExtToBits) + extInstr.MarkLowered() + return + } + + if innerExtToBits > modeBits { + panic("BUG?TODO?: need the results of inner extension to be larger than the mode") + } + + switch { + case (!signed && !modeSigned) || (signed && modeSigned): + // Two sign/zero extensions are equivalent to one sign/zero extension for the larger size. + eop := extendOpFrom(modeSigned, innerExtFromBits) + op = operandER(m.compiler.VRegOf(extInstr.Arg()), eop, modeBits) + extInstr.MarkLowered() + case (signed && !modeSigned) || (!signed && modeSigned): + // We need to {sign, zero}-extend the result of the {zero,sign} extension. + eop := extendOpFrom(modeSigned, innerExtToBits) + op = operandER(m.compiler.VRegOf(extInstr.Return()), eop, modeBits) + // Note that we failed to merge the inner extension instruction this case. + } + return + } + return m.getOperand_SR_NR(def, mode) +} + +// ensureValueNR returns an operand of either operandKindSR or operandKindNR from the given value (defined by `def). +// +// `mode` is used to extend the operand if the bit length is smaller than mode.bits(). +func (m *machine) getOperand_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) { + if def.IsFromBlockParam() { + return operandNR(def.BlkParamVReg) + } + + if m.compiler.MatchInstr(def, ssa.OpcodeIshl) { + // Check if the shift amount is constant instruction. + targetVal, amountVal := def.Instr.Arg2() + targetVReg := m.getOperand_NR(m.compiler.ValueDefinition(targetVal), extModeNone).nr() + amountDef := m.compiler.ValueDefinition(amountVal) + if amountDef.IsFromInstr() && amountDef.Instr.Constant() { + // If that is the case, we can use the shifted register operand (SR). + c := byte(amountDef.Instr.ConstantVal()) & (targetVal.Type().Bits() - 1) // Clears the unnecessary bits. + def.Instr.MarkLowered() + amountDef.Instr.MarkLowered() + return operandSR(targetVReg, c, shiftOpLSL) + } + } + return m.getOperand_NR(def, mode) +} + +// getOperand_ShiftImm_NR returns an operand of either operandKindShiftImm or operandKindNR from the given value (defined by `def). +func (m *machine) getOperand_ShiftImm_NR(def *backend.SSAValueDefinition, mode extMode, shiftBitWidth byte) (op operand) { + if def.IsFromBlockParam() { + return operandNR(def.BlkParamVReg) + } + + instr := def.Instr + if instr.Constant() { + amount := byte(instr.ConstantVal()) & (shiftBitWidth - 1) // Clears the unnecessary bits. + return operandShiftImm(amount) + } + return m.getOperand_NR(def, mode) +} + +// ensureValueNR returns an operand of operandKindNR from the given value (defined by `def). +// +// `mode` is used to extend the operand if the bit length is smaller than mode.bits(). 
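+// Constants are materialized inline, and when `mode` requires widening a 32-bit value to 64 bits an explicit
+// extend instruction is inserted and the extended register is returned instead.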
+func (m *machine) getOperand_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) { + var v regalloc.VReg + if def.IsFromBlockParam() { + v = def.BlkParamVReg + } else { + instr := def.Instr + if instr.Constant() { + // We inline all the constant instructions so that we could reduce the register usage. + v = m.lowerConstant(instr) + instr.MarkLowered() + } else { + if n := def.N; n == 0 { + v = m.compiler.VRegOf(instr.Return()) + } else { + _, rs := instr.Returns() + v = m.compiler.VRegOf(rs[n-1]) + } + } + } + + r := v + switch inBits := def.SSAValue().Type().Bits(); { + case mode == extModeNone: + case inBits == 32 && (mode == extModeZeroExtend32 || mode == extModeSignExtend32): + case inBits == 32 && mode == extModeZeroExtend64: + extended := m.compiler.AllocateVReg(ssa.TypeI64) + ext := m.allocateInstr() + ext.asExtend(extended, v, 32, 64, false) + m.insert(ext) + r = extended + case inBits == 32 && mode == extModeSignExtend64: + extended := m.compiler.AllocateVReg(ssa.TypeI64) + ext := m.allocateInstr() + ext.asExtend(extended, v, 32, 64, true) + m.insert(ext) + r = extended + case inBits == 64 && (mode == extModeZeroExtend64 || mode == extModeSignExtend64): + } + return operandNR(r) +} + +func asImm12Operand(val uint64) (op operand, ok bool) { + v, shiftBit, ok := asImm12(val) + if !ok { + return operand{}, false + } + return operandImm12(v, shiftBit), true +} + +func asImm12(val uint64) (v uint16, shiftBit byte, ok bool) { + const mask1, mask2 uint64 = 0xfff, 0xfff_000 + if val&^mask1 == 0 { + return uint16(val), 0, true + } else if val&^mask2 == 0 { + return uint16(val >> 12), 1, true + } else { + return 0, 0, false + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go new file mode 100644 index 000000000..4842eaa38 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go @@ -0,0 +1,440 @@ +package arm64 + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +type ( + // addressMode represents an ARM64 addressing mode. + // + // https://developer.arm.com/documentation/102374/0101/Loads-and-stores---addressing + // TODO: use the bit-packed layout like operand struct. + addressMode struct { + kind addressModeKind + rn, rm regalloc.VReg + extOp extendOp + imm int64 + } + + // addressModeKind represents the kind of ARM64 addressing mode. + addressModeKind byte +) + +const ( + // addressModeKindRegExtended takes a base register and an index register. The index register is sign/zero-extended, + // and then scaled by bits(type)/8. + // + // e.g. 
+ // - ldrh w1, [x2, w3, SXTW #1] ;; sign-extended and scaled by 2 (== LSL #1) + // - strh w1, [x2, w3, UXTW #1] ;; zero-extended and scaled by 2 (== LSL #1) + // - ldr w1, [x2, w3, SXTW #2] ;; sign-extended and scaled by 4 (== LSL #2) + // - str x1, [x2, w3, UXTW #3] ;; zero-extended and scaled by 8 (== LSL #3) + // + // See the following pages: + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--register---Load-Register-Halfword--register-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--register---Load-Register--register-- + addressModeKindRegScaledExtended addressModeKind = iota + + // addressModeKindRegScaled is the same as addressModeKindRegScaledExtended, but without extension factor. + addressModeKindRegScaled + + // addressModeKindRegScaled is the same as addressModeKindRegScaledExtended, but without scale factor. + addressModeKindRegExtended + + // addressModeKindRegReg takes a base register and an index register. The index register is not either scaled or extended. + addressModeKindRegReg + + // addressModeKindRegSignedImm9 takes a base register and a 9-bit "signed" immediate offset (-256 to 255). + // The immediate will be sign-extended, and be added to the base register. + // This is a.k.a. "unscaled" since the immediate is not scaled. + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDUR--Load-Register--unscaled-- + addressModeKindRegSignedImm9 + + // addressModeKindRegUnsignedImm12 takes a base register and a 12-bit "unsigned" immediate offset. scaled by + // the size of the type. In other words, the actual offset will be imm12 * bits(type)/8. + // See "Unsigned offset" in the following pages: + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate-- + addressModeKindRegUnsignedImm12 + + // addressModePostIndex takes a base register and a 9-bit "signed" immediate offset. + // After the load/store, the base register will be updated by the offset. + // + // Note that when this is used for pair load/store, the offset will be 7-bit "signed" immediate offset. + // + // See "Post-index" in the following pages for examples: + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers- + addressModeKindPostIndex + + // addressModePostIndex takes a base register and a 9-bit "signed" immediate offset. + // Before the load/store, the base register will be updated by the offset. + // + // Note that when this is used for pair load/store, the offset will be 7-bit "signed" immediate offset. 
+ // + // See "Pre-index" in the following pages for examples: + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers- + addressModeKindPreIndex + + // addressModeKindArgStackSpace is used to resolve the address of the argument stack space + // exiting right above the stack pointer. Since we don't know the exact stack space needed for a function + // at a compilation phase, this is used as a placeholder and further lowered to a real addressing mode like above. + addressModeKindArgStackSpace + + // addressModeKindResultStackSpace is used to resolve the address of the result stack space + // exiting right above the stack pointer. Since we don't know the exact stack space needed for a function + // at a compilation phase, this is used as a placeholder and further lowered to a real addressing mode like above. + addressModeKindResultStackSpace +) + +func (a addressMode) format(dstSizeBits byte) (ret string) { + base := formatVRegSized(a.rn, 64) + if rn := a.rn; rn.RegType() != regalloc.RegTypeInt { + panic("invalid base register type: " + a.rn.RegType().String()) + } else if rn.IsRealReg() && v0 <= a.rn.RealReg() && a.rn.RealReg() <= v30 { + panic("BUG: likely a bug in reg alloc or reset behavior") + } + + switch a.kind { + case addressModeKindRegScaledExtended: + amount := a.sizeInBitsToShiftAmount(dstSizeBits) + ret = fmt.Sprintf("[%s, %s, %s #%#x]", base, formatVRegSized(a.rm, a.indexRegBits()), a.extOp, amount) + case addressModeKindRegScaled: + amount := a.sizeInBitsToShiftAmount(dstSizeBits) + ret = fmt.Sprintf("[%s, %s, lsl #%#x]", base, formatVRegSized(a.rm, a.indexRegBits()), amount) + case addressModeKindRegExtended: + ret = fmt.Sprintf("[%s, %s, %s]", base, formatVRegSized(a.rm, a.indexRegBits()), a.extOp) + case addressModeKindRegReg: + ret = fmt.Sprintf("[%s, %s]", base, formatVRegSized(a.rm, a.indexRegBits())) + case addressModeKindRegSignedImm9: + if a.imm != 0 { + ret = fmt.Sprintf("[%s, #%#x]", base, a.imm) + } else { + ret = fmt.Sprintf("[%s]", base) + } + case addressModeKindRegUnsignedImm12: + if a.imm != 0 { + ret = fmt.Sprintf("[%s, #%#x]", base, a.imm) + } else { + ret = fmt.Sprintf("[%s]", base) + } + case addressModeKindPostIndex: + ret = fmt.Sprintf("[%s], #%#x", base, a.imm) + case addressModeKindPreIndex: + ret = fmt.Sprintf("[%s, #%#x]!", base, a.imm) + case addressModeKindArgStackSpace: + ret = fmt.Sprintf("[#arg_space, #%#x]", a.imm) + case addressModeKindResultStackSpace: + ret = fmt.Sprintf("[#ret_space, #%#x]", a.imm) + } + return +} + +func addressModePreOrPostIndex(rn regalloc.VReg, imm int64, preIndex bool) addressMode { + if !offsetFitsInAddressModeKindRegSignedImm9(imm) { + panic(fmt.Sprintf("BUG: offset %#x does not fit in addressModeKindRegSignedImm9", imm)) + } + if preIndex { + return addressMode{kind: addressModeKindPreIndex, rn: rn, imm: imm} + } else { + return addressMode{kind: addressModeKindPostIndex, rn: rn, imm: imm} + } +} + +func offsetFitsInAddressModeKindRegUnsignedImm12(dstSizeInBits byte, offset int64) bool { + divisor := int64(dstSizeInBits) / 8 + return 0 < offset && offset%divisor == 0 && offset/divisor < 
4096 +} + +func offsetFitsInAddressModeKindRegSignedImm9(offset int64) bool { + return -256 <= offset && offset <= 255 +} + +func (a addressMode) indexRegBits() byte { + bits := a.extOp.srcBits() + if bits != 32 && bits != 64 { + panic("invalid index register for address mode. it must be either 32 or 64 bits") + } + return bits +} + +func (a addressMode) sizeInBitsToShiftAmount(sizeInBits byte) (lsl byte) { + switch sizeInBits { + case 8: + lsl = 0 + case 16: + lsl = 1 + case 32: + lsl = 2 + case 64: + lsl = 3 + } + return +} + +func extLoadSignSize(op ssa.Opcode) (size byte, signed bool) { + switch op { + case ssa.OpcodeUload8: + size, signed = 8, false + case ssa.OpcodeUload16: + size, signed = 16, false + case ssa.OpcodeUload32: + size, signed = 32, false + case ssa.OpcodeSload8: + size, signed = 8, true + case ssa.OpcodeSload16: + size, signed = 16, true + case ssa.OpcodeSload32: + size, signed = 32, true + default: + panic("BUG") + } + return +} + +func (m *machine) lowerExtLoad(op ssa.Opcode, ptr ssa.Value, offset uint32, ret regalloc.VReg) { + size, signed := extLoadSignSize(op) + amode := m.lowerToAddressMode(ptr, offset, size) + load := m.allocateInstr() + if signed { + load.asSLoad(operandNR(ret), amode, size) + } else { + load.asULoad(operandNR(ret), amode, size) + } + m.insert(load) +} + +func (m *machine) lowerLoad(ptr ssa.Value, offset uint32, typ ssa.Type, ret ssa.Value) { + amode := m.lowerToAddressMode(ptr, offset, typ.Bits()) + + dst := m.compiler.VRegOf(ret) + load := m.allocateInstr() + switch typ { + case ssa.TypeI32, ssa.TypeI64: + load.asULoad(operandNR(dst), amode, typ.Bits()) + case ssa.TypeF32, ssa.TypeF64: + load.asFpuLoad(operandNR(dst), amode, typ.Bits()) + case ssa.TypeV128: + load.asFpuLoad(operandNR(dst), amode, 128) + default: + panic("TODO") + } + m.insert(load) +} + +func (m *machine) lowerLoadSplat(ptr ssa.Value, offset uint32, lane ssa.VecLane, ret ssa.Value) { + // vecLoad1R has offset address mode (base+imm) only for post index, so we simply add the offset to the base. + base := m.getOperand_NR(m.compiler.ValueDefinition(ptr), extModeNone).nr() + offsetReg := m.compiler.AllocateVReg(ssa.TypeI64) + m.lowerConstantI64(offsetReg, int64(offset)) + addedBase := m.addReg64ToReg64(base, offsetReg) + + rd := operandNR(m.compiler.VRegOf(ret)) + + ld1r := m.allocateInstr() + ld1r.asVecLoad1R(rd, operandNR(addedBase), ssaLaneToArrangement(lane)) + m.insert(ld1r) +} + +func (m *machine) lowerStore(si *ssa.Instruction) { + // TODO: merge consecutive stores into a single pair store instruction. + value, ptr, offset, storeSizeInBits := si.StoreData() + amode := m.lowerToAddressMode(ptr, offset, storeSizeInBits) + + valueOp := m.getOperand_NR(m.compiler.ValueDefinition(value), extModeNone) + store := m.allocateInstr() + store.asStore(valueOp, amode, storeSizeInBits) + m.insert(store) +} + +// lowerToAddressMode converts a pointer to an addressMode that can be used as an operand for load/store instructions. +func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32, size byte) (amode addressMode) { + // TODO: currently the instruction selection logic doesn't support addressModeKindRegScaledExtended and + // addressModeKindRegScaled since collectAddends doesn't take ssa.OpcodeIshl into account. This should be fixed + // to support more efficient address resolution. 
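+	// The pointer is decomposed into 64-bit addends, 32-bit (to-be-extended) addends, and a constant offset;
+	// lowerToAddressModeFromAddends then folds as much of that as possible into a single addressing mode and
+	// adds any leftovers back with explicit ADD instructions.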
+ + a32s, a64s, offset := m.collectAddends(ptr) + offset += int64(offsetBase) + return m.lowerToAddressModeFromAddends(a32s, a64s, size, offset) +} + +// lowerToAddressModeFromAddends creates an addressMode from a list of addends collected by collectAddends. +// During the construction, this might emit additional instructions. +// +// Extracted as a separate function for easy testing. +func (m *machine) lowerToAddressModeFromAddends(a32s *wazevoapi.Queue[addend32], a64s *wazevoapi.Queue[regalloc.VReg], size byte, offset int64) (amode addressMode) { + switch a64sExist, a32sExist := !a64s.Empty(), !a32s.Empty(); { + case a64sExist && a32sExist: + var base regalloc.VReg + base = a64s.Dequeue() + var a32 addend32 + a32 = a32s.Dequeue() + amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: a32.r, extOp: a32.ext} + case a64sExist && offsetFitsInAddressModeKindRegUnsignedImm12(size, offset): + var base regalloc.VReg + base = a64s.Dequeue() + amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: offset} + offset = 0 + case a64sExist && offsetFitsInAddressModeKindRegSignedImm9(offset): + var base regalloc.VReg + base = a64s.Dequeue() + amode = addressMode{kind: addressModeKindRegSignedImm9, rn: base, imm: offset} + offset = 0 + case a64sExist: + var base regalloc.VReg + base = a64s.Dequeue() + if !a64s.Empty() { + index := a64s.Dequeue() + amode = addressMode{kind: addressModeKindRegReg, rn: base, rm: index, extOp: extendOpUXTX /* indicates index reg is 64-bit */} + } else { + amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0} + } + case a32sExist: + base32 := a32s.Dequeue() + + // First we need 64-bit base. + base := m.compiler.AllocateVReg(ssa.TypeI64) + baseExt := m.allocateInstr() + var signed bool + if base32.ext == extendOpSXTW { + signed = true + } + baseExt.asExtend(base, base32.r, 32, 64, signed) + m.insert(baseExt) + + if !a32s.Empty() { + index := a32s.Dequeue() + amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: index.r, extOp: index.ext} + } else { + amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0} + } + default: // Only static offsets. + tmpReg := m.compiler.AllocateVReg(ssa.TypeI64) + m.lowerConstantI64(tmpReg, offset) + amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpReg, imm: 0} + offset = 0 + } + + baseReg := amode.rn + if offset > 0 { + baseReg = m.addConstToReg64(baseReg, offset) // baseReg += offset + } + + for !a64s.Empty() { + a64 := a64s.Dequeue() + baseReg = m.addReg64ToReg64(baseReg, a64) // baseReg += a64 + } + + for !a32s.Empty() { + a32 := a32s.Dequeue() + baseReg = m.addRegToReg64Ext(baseReg, a32.r, a32.ext) // baseReg += (a32 extended to 64-bit) + } + amode.rn = baseReg + return +} + +var addendsMatchOpcodes = [4]ssa.Opcode{ssa.OpcodeUExtend, ssa.OpcodeSExtend, ssa.OpcodeIadd, ssa.OpcodeIconst} + +func (m *machine) collectAddends(ptr ssa.Value) (addends32 *wazevoapi.Queue[addend32], addends64 *wazevoapi.Queue[regalloc.VReg], offset int64) { + m.addendsWorkQueue.Reset() + m.addends32.Reset() + m.addends64.Reset() + m.addendsWorkQueue.Enqueue(ptr) + + for !m.addendsWorkQueue.Empty() { + v := m.addendsWorkQueue.Dequeue() + + def := m.compiler.ValueDefinition(v) + switch op := m.compiler.MatchInstrOneOf(def, addendsMatchOpcodes[:]); op { + case ssa.OpcodeIadd: + // If the addend is an add, we recursively collect its operands. 
+ x, y := def.Instr.Arg2() + m.addendsWorkQueue.Enqueue(x) + m.addendsWorkQueue.Enqueue(y) + def.Instr.MarkLowered() + case ssa.OpcodeIconst: + // If the addend is constant, we just statically merge it into the offset. + ic := def.Instr + u64 := ic.ConstantVal() + if ic.Return().Type().Bits() == 32 { + offset += int64(int32(u64)) // sign-extend. + } else { + offset += int64(u64) + } + def.Instr.MarkLowered() + case ssa.OpcodeUExtend, ssa.OpcodeSExtend: + input := def.Instr.Arg() + if input.Type().Bits() != 32 { + panic("illegal size: " + input.Type().String()) + } + + var ext extendOp + if op == ssa.OpcodeUExtend { + ext = extendOpUXTW + } else { + ext = extendOpSXTW + } + + inputDef := m.compiler.ValueDefinition(input) + constInst := inputDef.IsFromInstr() && inputDef.Instr.Constant() + switch { + case constInst && ext == extendOpUXTW: + // Zero-extension of a 32-bit constant can be merged into the offset. + offset += int64(uint32(inputDef.Instr.ConstantVal())) + case constInst && ext == extendOpSXTW: + // Sign-extension of a 32-bit constant can be merged into the offset. + offset += int64(int32(inputDef.Instr.ConstantVal())) // sign-extend! + default: + m.addends32.Enqueue(addend32{r: m.getOperand_NR(inputDef, extModeNone).nr(), ext: ext}) + } + def.Instr.MarkLowered() + continue + default: + // If the addend is not one of them, we simply use it as-is (without merging!), optionally zero-extending it. + m.addends64.Enqueue(m.getOperand_NR(def, extModeZeroExtend64 /* optional zero ext */).nr()) + } + } + return &m.addends32, &m.addends64, offset +} + +func (m *machine) addConstToReg64(r regalloc.VReg, c int64) (rd regalloc.VReg) { + rd = m.compiler.AllocateVReg(ssa.TypeI64) + alu := m.allocateInstr() + if imm12Op, ok := asImm12Operand(uint64(c)); ok { + alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), imm12Op, true) + } else if imm12Op, ok = asImm12Operand(uint64(-c)); ok { + alu.asALU(aluOpSub, operandNR(rd), operandNR(r), imm12Op, true) + } else { + tmp := m.compiler.AllocateVReg(ssa.TypeI64) + m.load64bitConst(c, tmp) + alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), operandNR(tmp), true) + } + m.insert(alu) + return +} + +func (m *machine) addReg64ToReg64(rn, rm regalloc.VReg) (rd regalloc.VReg) { + rd = m.compiler.AllocateVReg(ssa.TypeI64) + alu := m.allocateInstr() + alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandNR(rm), true) + m.insert(alu) + return +} + +func (m *machine) addRegToReg64Ext(rn, rm regalloc.VReg, ext extendOp) (rd regalloc.VReg) { + rd = m.compiler.AllocateVReg(ssa.TypeI64) + alu := m.allocateInstr() + alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandER(rm, ext, 64), true) + m.insert(alu) + return +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go new file mode 100644 index 000000000..b435d9ba9 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go @@ -0,0 +1,515 @@ +package arm64 + +import ( + "context" + "fmt" + "strings" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +type ( + // machine implements backend.Machine. 
+ machine struct { + compiler backend.Compiler + executableContext *backend.ExecutableContextT[instruction] + currentABI *backend.FunctionABI + + regAlloc regalloc.Allocator + regAllocFn *backend.RegAllocFunction[*instruction, *machine] + + // addendsWorkQueue is used during address lowering, defined here for reuse. + addendsWorkQueue wazevoapi.Queue[ssa.Value] + addends32 wazevoapi.Queue[addend32] + // addends64 is used during address lowering, defined here for reuse. + addends64 wazevoapi.Queue[regalloc.VReg] + unresolvedAddressModes []*instruction + + // condBrRelocs holds the conditional branches which need offset relocation. + condBrRelocs []condBrReloc + + // jmpTableTargets holds the labels of the jump table targets. + jmpTableTargets [][]uint32 + + // spillSlotSize is the size of the stack slot in bytes used for spilling registers. + // During the execution of the function, the stack looks like: + // + // + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | xxxxx | + // | ReturnAddress | + // +-----------------+ <<-| + // | ........... | | + // | spill slot M | | <--- spillSlotSize + // | ............ | | + // | spill slot 2 | | + // | spill slot 1 | <<-+ + // | clobbered N | + // | ........... | + // | clobbered 1 | + // | clobbered 0 | + // SP---> +-----------------+ + // (low address) + // + // and it represents the size of the space between FP and the first spilled slot. This must be a multiple of 16. + // Also note that this is only known after register allocation. + spillSlotSize int64 + spillSlots map[regalloc.VRegID]int64 // regalloc.VRegID to offset. + // clobberedRegs holds real-register backed VRegs saved at the function prologue, and restored at the epilogue. + clobberedRegs []regalloc.VReg + + maxRequiredStackSizeForCalls int64 + stackBoundsCheckDisabled bool + + regAllocStarted bool + } + + addend32 struct { + r regalloc.VReg + ext extendOp + } + + condBrReloc struct { + cbr *instruction + // currentLabelPos is the labelPosition within which condBr is defined. + currentLabelPos *labelPosition + // Next block's labelPosition. + nextLabel label + offset int64 + } + + labelPosition = backend.LabelPosition[instruction] + label = backend.Label +) + +const ( + labelReturn = backend.LabelReturn + labelInvalid = backend.LabelInvalid +) + +// NewBackend returns a new backend for arm64. +func NewBackend() backend.Machine { + m := &machine{ + spillSlots: make(map[regalloc.VRegID]int64), + executableContext: newExecutableContext(), + regAlloc: regalloc.NewAllocator(regInfo), + } + return m +} + +func newExecutableContext() *backend.ExecutableContextT[instruction] { + return backend.NewExecutableContextT[instruction](resetInstruction, setNext, setPrev, asNop0) +} + +// ExecutableContext implements backend.Machine. +func (m *machine) ExecutableContext() backend.ExecutableContext { + return m.executableContext +} + +// RegAlloc implements backend.Machine Function. +func (m *machine) RegAlloc() { + rf := m.regAllocFn + for _, pos := range m.executableContext.OrderedBlockLabels { + rf.AddBlock(pos.SB, pos.L, pos.Begin, pos.End) + } + + m.regAllocStarted = true + m.regAlloc.DoAllocation(rf) + // Now that we know the final spill slot size, we must align spillSlotSize to 16 bytes. + m.spillSlotSize = (m.spillSlotSize + 15) &^ 15 +} + +// Reset implements backend.Machine. 
+func (m *machine) Reset() { + m.clobberedRegs = m.clobberedRegs[:0] + for key := range m.spillSlots { + m.clobberedRegs = append(m.clobberedRegs, regalloc.VReg(key)) + } + for _, key := range m.clobberedRegs { + delete(m.spillSlots, regalloc.VRegID(key)) + } + m.clobberedRegs = m.clobberedRegs[:0] + m.regAllocStarted = false + m.regAlloc.Reset() + m.regAllocFn.Reset() + m.spillSlotSize = 0 + m.unresolvedAddressModes = m.unresolvedAddressModes[:0] + m.maxRequiredStackSizeForCalls = 0 + m.executableContext.Reset() + m.jmpTableTargets = m.jmpTableTargets[:0] +} + +// SetCurrentABI implements backend.Machine SetCurrentABI. +func (m *machine) SetCurrentABI(abi *backend.FunctionABI) { + m.currentABI = abi +} + +// DisableStackCheck implements backend.Machine DisableStackCheck. +func (m *machine) DisableStackCheck() { + m.stackBoundsCheckDisabled = true +} + +// SetCompiler implements backend.Machine. +func (m *machine) SetCompiler(ctx backend.Compiler) { + m.compiler = ctx + m.regAllocFn = backend.NewRegAllocFunction[*instruction, *machine](m, ctx.SSABuilder(), ctx) +} + +func (m *machine) insert(i *instruction) { + ectx := m.executableContext + ectx.PendingInstructions = append(ectx.PendingInstructions, i) +} + +func (m *machine) insertBrTargetLabel() label { + nop, l := m.allocateBrTarget() + m.insert(nop) + return l +} + +func (m *machine) allocateBrTarget() (nop *instruction, l label) { + ectx := m.executableContext + l = ectx.AllocateLabel() + nop = m.allocateInstr() + nop.asNop0WithLabel(l) + pos := ectx.AllocateLabelPosition(l) + pos.Begin, pos.End = nop, nop + ectx.LabelPositions[l] = pos + return +} + +// allocateInstr allocates an instruction. +func (m *machine) allocateInstr() *instruction { + instr := m.executableContext.InstructionPool.Allocate() + if !m.regAllocStarted { + instr.addedBeforeRegAlloc = true + } + return instr +} + +func resetInstruction(i *instruction) { + *i = instruction{} +} + +func (m *machine) allocateNop() *instruction { + instr := m.allocateInstr() + instr.asNop0() + return instr +} + +func (m *machine) resolveAddressingMode(arg0offset, ret0offset int64, i *instruction) { + amode := &i.amode + switch amode.kind { + case addressModeKindResultStackSpace: + amode.imm += ret0offset + case addressModeKindArgStackSpace: + amode.imm += arg0offset + default: + panic("BUG") + } + + var sizeInBits byte + switch i.kind { + case store8, uLoad8: + sizeInBits = 8 + case store16, uLoad16: + sizeInBits = 16 + case store32, fpuStore32, uLoad32, fpuLoad32: + sizeInBits = 32 + case store64, fpuStore64, uLoad64, fpuLoad64: + sizeInBits = 64 + case fpuStore128, fpuLoad128: + sizeInBits = 128 + default: + panic("BUG") + } + + if offsetFitsInAddressModeKindRegUnsignedImm12(sizeInBits, amode.imm) { + amode.kind = addressModeKindRegUnsignedImm12 + } else { + // This case, we load the offset into the temporary register, + // and then use it as the index register. + newPrev := m.lowerConstantI64AndInsert(i.prev, tmpRegVReg, amode.imm) + linkInstr(newPrev, i) + *amode = addressMode{kind: addressModeKindRegReg, rn: amode.rn, rm: tmpRegVReg, extOp: extendOpUXTX /* indicates rm reg is 64-bit */} + } +} + +// resolveRelativeAddresses resolves the relative addresses before encoding. 
+func (m *machine) resolveRelativeAddresses(ctx context.Context) { + ectx := m.executableContext + for { + if len(m.unresolvedAddressModes) > 0 { + arg0offset, ret0offset := m.arg0OffsetFromSP(), m.ret0OffsetFromSP() + for _, i := range m.unresolvedAddressModes { + m.resolveAddressingMode(arg0offset, ret0offset, i) + } + } + + // Reuse the slice to gather the unresolved conditional branches. + m.condBrRelocs = m.condBrRelocs[:0] + + var fn string + var fnIndex int + var labelToSSABlockID map[label]ssa.BasicBlockID + if wazevoapi.PerfMapEnabled { + fn = wazevoapi.GetCurrentFunctionName(ctx) + labelToSSABlockID = make(map[label]ssa.BasicBlockID) + for i, l := range ectx.SsaBlockIDToLabels { + labelToSSABlockID[l] = ssa.BasicBlockID(i) + } + fnIndex = wazevoapi.GetCurrentFunctionIndex(ctx) + } + + // Next, in order to determine the offsets of relative jumps, we have to calculate the size of each label. + var offset int64 + for i, pos := range ectx.OrderedBlockLabels { + pos.BinaryOffset = offset + var size int64 + for cur := pos.Begin; ; cur = cur.next { + switch cur.kind { + case nop0: + l := cur.nop0Label() + if pos, ok := ectx.LabelPositions[l]; ok { + pos.BinaryOffset = offset + size + } + case condBr: + if !cur.condBrOffsetResolved() { + var nextLabel label + if i < len(ectx.OrderedBlockLabels)-1 { + // Note: this is only used when the block ends with fallthrough, + // therefore can be safely assumed that the next block exists when it's needed. + nextLabel = ectx.OrderedBlockLabels[i+1].L + } + m.condBrRelocs = append(m.condBrRelocs, condBrReloc{ + cbr: cur, currentLabelPos: pos, offset: offset + size, + nextLabel: nextLabel, + }) + } + } + size += cur.size() + if cur == pos.End { + break + } + } + + if wazevoapi.PerfMapEnabled { + if size > 0 { + l := pos.L + var labelStr string + if blkID, ok := labelToSSABlockID[l]; ok { + labelStr = fmt.Sprintf("%s::SSA_Block[%s]", l, blkID) + } else { + labelStr = l.String() + } + wazevoapi.PerfMap.AddModuleEntry(fnIndex, offset, uint64(size), fmt.Sprintf("%s:::::%s", fn, labelStr)) + } + } + offset += size + } + + // Before resolving any offsets, we need to check if all the conditional branches can be resolved. + var needRerun bool + for i := range m.condBrRelocs { + reloc := &m.condBrRelocs[i] + cbr := reloc.cbr + offset := reloc.offset + + target := cbr.condBrLabel() + offsetOfTarget := ectx.LabelPositions[target].BinaryOffset + diff := offsetOfTarget - offset + if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 { + // This case the conditional branch is too huge. We place the trampoline instructions at the end of the current block, + // and jump to it. + m.insertConditionalJumpTrampoline(cbr, reloc.currentLabelPos, reloc.nextLabel) + // Then, we need to recall this function to fix up the label offsets + // as they have changed after the trampoline is inserted. + needRerun = true + } + } + if needRerun { + if wazevoapi.PerfMapEnabled { + wazevoapi.PerfMap.Clear() + } + } else { + break + } + } + + var currentOffset int64 + for cur := ectx.RootInstr; cur != nil; cur = cur.next { + switch cur.kind { + case br: + target := cur.brLabel() + offsetOfTarget := ectx.LabelPositions[target].BinaryOffset + diff := offsetOfTarget - currentOffset + divided := diff >> 2 + if divided < minSignedInt26 || divided > maxSignedInt26 { + // This means the currently compiled single function is extremely large. 
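// A rough sketch for intuition: an unconditional B encodes a signed 26-bit word
// offset, so it can reach roughly +/-128MiB from the branch itself, whereas B.cond
// only has a 19-bit word offset (about +/-1MiB) and is therefore rescued by the
// trampoline insertion in the loop above. In Go terms the test applied here is simply:
//
//	divided := diff >> 2 // byte offset -> word offset
//	fits := divided >= minSignedInt26 && divided <= maxSignedInt26
//
// No trampoline mechanism exists for the 26-bit case, hence the panic below.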
+ panic("too large function that requires branch relocation of large unconditional branch larger than 26-bit range") + } + cur.brOffsetResolve(diff) + case condBr: + if !cur.condBrOffsetResolved() { + target := cur.condBrLabel() + offsetOfTarget := ectx.LabelPositions[target].BinaryOffset + diff := offsetOfTarget - currentOffset + if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 { + panic("BUG: branch relocation for large conditional branch larger than 19-bit range must be handled properly") + } + cur.condBrOffsetResolve(diff) + } + case brTableSequence: + tableIndex := cur.u1 + targets := m.jmpTableTargets[tableIndex] + for i := range targets { + l := label(targets[i]) + offsetOfTarget := ectx.LabelPositions[l].BinaryOffset + diff := offsetOfTarget - (currentOffset + brTableSequenceOffsetTableBegin) + targets[i] = uint32(diff) + } + cur.brTableSequenceOffsetsResolved() + case emitSourceOffsetInfo: + m.compiler.AddSourceOffsetInfo(currentOffset, cur.sourceOffsetInfo()) + } + currentOffset += cur.size() + } +} + +const ( + maxSignedInt26 = 1<<25 - 1 + minSignedInt26 = -(1 << 25) + + maxSignedInt19 = 1<<18 - 1 + minSignedInt19 = -(1 << 18) +) + +func (m *machine) insertConditionalJumpTrampoline(cbr *instruction, currentBlk *labelPosition, nextLabel label) { + cur := currentBlk.End + originalTarget := cbr.condBrLabel() + endNext := cur.next + + if cur.kind != br { + // If the current block ends with a conditional branch, we can just insert the trampoline after it. + // Otherwise, we need to insert "skip" instruction to skip the trampoline instructions. + skip := m.allocateInstr() + skip.asBr(nextLabel) + cur = linkInstr(cur, skip) + } + + cbrNewTargetInstr, cbrNewTargetLabel := m.allocateBrTarget() + cbr.setCondBrTargets(cbrNewTargetLabel) + cur = linkInstr(cur, cbrNewTargetInstr) + + // Then insert the unconditional branch to the original, which should be possible to get encoded + // as 26-bit offset should be enough for any practical application. + br := m.allocateInstr() + br.asBr(originalTarget) + cur = linkInstr(cur, br) + + // Update the end of the current block. + currentBlk.End = cur + + linkInstr(cur, endNext) +} + +// Format implements backend.Machine. +func (m *machine) Format() string { + ectx := m.executableContext + begins := map[*instruction]label{} + for l, pos := range ectx.LabelPositions { + begins[pos.Begin] = l + } + + irBlocks := map[label]ssa.BasicBlockID{} + for i, l := range ectx.SsaBlockIDToLabels { + irBlocks[l] = ssa.BasicBlockID(i) + } + + var lines []string + for cur := ectx.RootInstr; cur != nil; cur = cur.next { + if l, ok := begins[cur]; ok { + var labelStr string + if blkID, ok := irBlocks[l]; ok { + labelStr = fmt.Sprintf("%s (SSA Block: %s):", l, blkID) + } else { + labelStr = fmt.Sprintf("%s:", l) + } + lines = append(lines, labelStr) + } + if cur.kind == nop0 { + continue + } + lines = append(lines, "\t"+cur.String()) + } + return "\n" + strings.Join(lines, "\n") + "\n" +} + +// InsertReturn implements backend.Machine. +func (m *machine) InsertReturn() { + i := m.allocateInstr() + i.asRet() + m.insert(i) +} + +func (m *machine) getVRegSpillSlotOffsetFromSP(id regalloc.VRegID, size byte) int64 { + offset, ok := m.spillSlots[id] + if !ok { + offset = m.spillSlotSize + // TODO: this should be aligned depending on the `size` to use Imm12 offset load/store as much as possible. + m.spillSlots[id] = offset + m.spillSlotSize += int64(size) + } + return offset + 16 // spill slot starts above the clobbered registers and the frame size. 
+} + +func (m *machine) clobberedRegSlotSize() int64 { + return int64(len(m.clobberedRegs) * 16) +} + +func (m *machine) arg0OffsetFromSP() int64 { + return m.frameSize() + + 16 + // 16-byte aligned return address + 16 // frame size saved below the clobbered registers. +} + +func (m *machine) ret0OffsetFromSP() int64 { + return m.arg0OffsetFromSP() + m.currentABI.ArgStackSize +} + +func (m *machine) requiredStackSize() int64 { + return m.maxRequiredStackSizeForCalls + + m.frameSize() + + 16 + // 16-byte aligned return address. + 16 // frame size saved below the clobbered registers. +} + +func (m *machine) frameSize() int64 { + s := m.clobberedRegSlotSize() + m.spillSlotSize + if s&0xf != 0 { + panic(fmt.Errorf("BUG: frame size %d is not 16-byte aligned", s)) + } + return s +} + +func (m *machine) addJmpTableTarget(targets []ssa.BasicBlock) (index int) { + // TODO: reuse the slice! + labels := make([]uint32, len(targets)) + for j, target := range targets { + labels[j] = uint32(m.executableContext.GetOrAllocateSSABlockLabel(target)) + } + index = len(m.jmpTableTargets) + m.jmpTableTargets = append(m.jmpTableTargets, labels) + return +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go new file mode 100644 index 000000000..466fac464 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go @@ -0,0 +1,469 @@ +package arm64 + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// PostRegAlloc implements backend.Machine. +func (m *machine) PostRegAlloc() { + m.setupPrologue() + m.postRegAlloc() +} + +// setupPrologue initializes the prologue of the function. +func (m *machine) setupPrologue() { + ectx := m.executableContext + + cur := ectx.RootInstr + prevInitInst := cur.next + + // + // (high address) (high address) + // SP----> +-----------------+ +------------------+ <----+ + // | ....... | | ....... | | + // | ret Y | | ret Y | | + // | ....... | | ....... | | + // | ret 0 | | ret 0 | | + // | arg X | | arg X | | size_of_arg_ret. + // | ....... | ====> | ....... | | + // | arg 1 | | arg 1 | | + // | arg 0 | | arg 0 | <----+ + // |-----------------| | size_of_arg_ret | + // | return address | + // +------------------+ <---- SP + // (low address) (low address) + + // Saves the return address (lr) and the size_of_arg_ret below the SP. + // size_of_arg_ret is used for stack unwinding. + cur = m.createReturnAddrAndSizeOfArgRetSlot(cur) + + if !m.stackBoundsCheckDisabled { + cur = m.insertStackBoundsCheck(m.requiredStackSize(), cur) + } + + // Decrement SP if spillSlotSize > 0. + if m.spillSlotSize == 0 && len(m.spillSlots) != 0 { + panic(fmt.Sprintf("BUG: spillSlotSize=%d, spillSlots=%v\n", m.spillSlotSize, m.spillSlots)) + } + + if regs := m.clobberedRegs; len(regs) > 0 { + // + // (high address) (high address) + // +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | | ....... 
| + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | size_of_arg_ret | | size_of_arg_ret | + // | ReturnAddress | | ReturnAddress | + // SP----> +-----------------+ ====> +-----------------+ + // (low address) | clobbered M | + // | ............ | + // | clobbered 0 | + // +-----------------+ <----- SP + // (low address) + // + _amode := addressModePreOrPostIndex(spVReg, + -16, // stack pointer must be 16-byte aligned. + true, // Decrement before store. + ) + for _, vr := range regs { + // TODO: pair stores to reduce the number of instructions. + store := m.allocateInstr() + store.asStore(operandNR(vr), _amode, regTypeToRegisterSizeInBits(vr.RegType())) + cur = linkInstr(cur, store) + } + } + + if size := m.spillSlotSize; size > 0 { + // Check if size is 16-byte aligned. + if size&0xf != 0 { + panic(fmt.Errorf("BUG: spill slot size %d is not 16-byte aligned", size)) + } + + cur = m.addsAddOrSubStackPointer(cur, spVReg, size, false) + + // At this point, the stack looks like: + // + // (high address) + // +------------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | size_of_arg_ret | + // | ReturnAddress | + // +------------------+ + // | clobbered M | + // | ............ | + // | clobbered 0 | + // | spill slot N | + // | ............ | + // | spill slot 2 | + // | spill slot 0 | + // SP----> +------------------+ + // (low address) + } + + // We push the frame size into the stack to make it possible to unwind stack: + // + // + // (high address) (high address) + // +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | | ....... | + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | size_of_arg_ret | | size_of_arg_ret | + // | ReturnAddress | | ReturnAddress | + // +-----------------+ ==> +-----------------+ <----+ + // | clobbered M | | clobbered M | | + // | ............ | | ............ | | + // | clobbered 2 | | clobbered 2 | | + // | clobbered 1 | | clobbered 1 | | frame size + // | clobbered 0 | | clobbered 0 | | + // | spill slot N | | spill slot N | | + // | ............ | | ............ | | + // | spill slot 0 | | spill slot 0 | <----+ + // SP---> +-----------------+ | xxxxxx | ;; unused space to make it 16-byte aligned. + // | frame_size | + // +-----------------+ <---- SP + // (low address) + // + cur = m.createFrameSizeSlot(cur, m.frameSize()) + + linkInstr(cur, prevInitInst) +} + +func (m *machine) createReturnAddrAndSizeOfArgRetSlot(cur *instruction) *instruction { + // First we decrement the stack pointer to point the arg0 slot. + var sizeOfArgRetReg regalloc.VReg + s := int64(m.currentABI.AlignedArgResultStackSlotSize()) + if s > 0 { + cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s) + sizeOfArgRetReg = tmpRegVReg + + subSp := m.allocateInstr() + subSp.asALU(aluOpSub, operandNR(spVReg), operandNR(spVReg), operandNR(sizeOfArgRetReg), true) + cur = linkInstr(cur, subSp) + } else { + sizeOfArgRetReg = xzrVReg + } + + // Saves the return address (lr) and the size_of_arg_ret below the SP. + // size_of_arg_ret is used for stack unwinding. 
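// As a rough sketch of the emitted machine code (assuming the pairing of lr with
// either the tmp register holding size_of_arg_ret, or xzr when there are no
// stack-passed args/results), the store pair built below is essentially:
//
//	stp x30, xN, [sp, #-16]!
//
// i.e. one pre-indexed store pair that pushes both values and keeps SP 16-byte aligned.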
+ pstr := m.allocateInstr() + amode := addressModePreOrPostIndex(spVReg, -16, true /* decrement before store */) + pstr.asStorePair64(lrVReg, sizeOfArgRetReg, amode) + cur = linkInstr(cur, pstr) + return cur +} + +func (m *machine) createFrameSizeSlot(cur *instruction, s int64) *instruction { + var frameSizeReg regalloc.VReg + if s > 0 { + cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s) + frameSizeReg = tmpRegVReg + } else { + frameSizeReg = xzrVReg + } + _amode := addressModePreOrPostIndex(spVReg, + -16, // stack pointer must be 16-byte aligned. + true, // Decrement before store. + ) + store := m.allocateInstr() + store.asStore(operandNR(frameSizeReg), _amode, 64) + cur = linkInstr(cur, store) + return cur +} + +// postRegAlloc does multiple things while walking through the instructions: +// 1. Removes the redundant copy instruction. +// 2. Inserts the epilogue. +func (m *machine) postRegAlloc() { + ectx := m.executableContext + for cur := ectx.RootInstr; cur != nil; cur = cur.next { + switch cur.kind { + case ret: + m.setupEpilogueAfter(cur.prev) + case loadConstBlockArg: + lc := cur + next := lc.next + m.executableContext.PendingInstructions = m.executableContext.PendingInstructions[:0] + m.lowerLoadConstantBlockArgAfterRegAlloc(lc) + for _, instr := range m.executableContext.PendingInstructions { + cur = linkInstr(cur, instr) + } + linkInstr(cur, next) + m.executableContext.PendingInstructions = m.executableContext.PendingInstructions[:0] + default: + // Removes the redundant copy instruction. + if cur.IsCopy() && cur.rn.realReg() == cur.rd.realReg() { + prev, next := cur.prev, cur.next + // Remove the copy instruction. + prev.next = next + if next != nil { + next.prev = prev + } + } + } + } +} + +func (m *machine) setupEpilogueAfter(cur *instruction) { + prevNext := cur.next + + // We've stored the frame size in the prologue, and now that we are about to return from this function, we won't need it anymore. + cur = m.addsAddOrSubStackPointer(cur, spVReg, 16, true) + + if s := m.spillSlotSize; s > 0 { + // Adjust SP to the original value: + // + // (high address) (high address) + // +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | | ....... | + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | xxxxx | | xxxxx | + // | ReturnAddress | | ReturnAddress | + // +-----------------+ ====> +-----------------+ + // | clobbered M | | clobbered M | + // | ............ | | ............ | + // | clobbered 1 | | clobbered 1 | + // | clobbered 0 | | clobbered 0 | + // | spill slot N | +-----------------+ <---- SP + // | ............ | + // | spill slot 0 | + // SP---> +-----------------+ + // (low address) + // + cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true) + } + + // First we need to restore the clobbered registers. + if len(m.clobberedRegs) > 0 { + // (high address) + // +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | | ....... | + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | xxxxx | | xxxxx | + // | ReturnAddress | | ReturnAddress | + // +-----------------+ ========> +-----------------+ <---- SP + // | clobbered M | + // | ........... 
| + // | clobbered 1 | + // | clobbered 0 | + // SP---> +-----------------+ + // (low address) + + l := len(m.clobberedRegs) - 1 + for i := range m.clobberedRegs { + vr := m.clobberedRegs[l-i] // reverse order to restore. + load := m.allocateInstr() + amode := addressModePreOrPostIndex(spVReg, + 16, // stack pointer must be 16-byte aligned. + false, // Increment after store. + ) + // TODO: pair loads to reduce the number of instructions. + switch regTypeToRegisterSizeInBits(vr.RegType()) { + case 64: // save int reg. + load.asULoad(operandNR(vr), amode, 64) + case 128: // save vector reg. + load.asFpuLoad(operandNR(vr), amode, 128) + } + cur = linkInstr(cur, load) + } + } + + // Reload the return address (lr). + // + // +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | ===> | ....... | + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | xxxxx | +-----------------+ <---- SP + // | ReturnAddress | + // SP----> +-----------------+ + + ldr := m.allocateInstr() + ldr.asULoad(operandNR(lrVReg), + addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64) + cur = linkInstr(cur, ldr) + + if s := int64(m.currentABI.AlignedArgResultStackSlotSize()); s > 0 { + cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true) + } + + linkInstr(cur, prevNext) +} + +// saveRequiredRegs is the set of registers that must be saved/restored during growing stack when there's insufficient +// stack space left. Basically this is the combination of CalleeSavedRegisters plus argument registers execpt for x0, +// which always points to the execution context whenever the native code is entered from Go. +var saveRequiredRegs = []regalloc.VReg{ + x1VReg, x2VReg, x3VReg, x4VReg, x5VReg, x6VReg, x7VReg, + x19VReg, x20VReg, x21VReg, x22VReg, x23VReg, x24VReg, x25VReg, x26VReg, x28VReg, lrVReg, + v0VReg, v1VReg, v2VReg, v3VReg, v4VReg, v5VReg, v6VReg, v7VReg, + v18VReg, v19VReg, v20VReg, v21VReg, v22VReg, v23VReg, v24VReg, v25VReg, v26VReg, v27VReg, v28VReg, v29VReg, v30VReg, v31VReg, +} + +// insertStackBoundsCheck will insert the instructions after `cur` to check the +// stack bounds, and if there's no sufficient spaces required for the function, +// exit the execution and try growing it in Go world. +// +// TODO: we should be able to share the instructions across all the functions to reduce the size of compiled executable. +func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instruction) *instruction { + if requiredStackSize%16 != 0 { + panic("BUG") + } + + if immm12op, ok := asImm12Operand(uint64(requiredStackSize)); ok { + // sub tmp, sp, #requiredStackSize + sub := m.allocateInstr() + sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), immm12op, true) + cur = linkInstr(cur, sub) + } else { + // This case, we first load the requiredStackSize into the temporary register, + cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize) + // Then subtract it. + sub := m.allocateInstr() + sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), operandNR(tmpRegVReg), true) + cur = linkInstr(cur, sub) + } + + tmp2 := x11VReg // Caller save, so it is safe to use it here in the prologue. 
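// The sequence assembled around here amounts to the following pseudo-Go check
// (a sketch; execCtx stands for the execution context that x0 always points to,
// and the field names mirror the wazevoapi offsets used below):
//
//	if sp-requiredStackSize < execCtx.stackBottomPtr {
//		execCtx.stackGrowRequiredSize = requiredStackSize
//		call(execCtx.stackGrowCallTrampolineAddress) // grow the stack in Go, then resume here.
//	}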
+ + // ldr tmp2, [executionContext #StackBottomPtr] + ldr := m.allocateInstr() + ldr.asULoad(operandNR(tmp2), addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: x0VReg, // execution context is always the first argument. + imm: wazevoapi.ExecutionContextOffsetStackBottomPtr.I64(), + }, 64) + cur = linkInstr(cur, ldr) + + // subs xzr, tmp, tmp2 + subs := m.allocateInstr() + subs.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpRegVReg), operandNR(tmp2), true) + cur = linkInstr(cur, subs) + + // b.ge #imm + cbr := m.allocateInstr() + cbr.asCondBr(ge.asCond(), labelInvalid, false /* ignored */) + cur = linkInstr(cur, cbr) + + // Set the required stack size and set it to the exec context. + { + // First load the requiredStackSize into the temporary register, + cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize) + setRequiredStackSize := m.allocateInstr() + setRequiredStackSize.asStore(operandNR(tmpRegVReg), + addressMode{ + kind: addressModeKindRegUnsignedImm12, + // Execution context is always the first argument. + rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.I64(), + }, 64) + + cur = linkInstr(cur, setRequiredStackSize) + } + + ldrAddress := m.allocateInstr() + ldrAddress.asULoad(operandNR(tmpRegVReg), addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: x0VReg, // execution context is always the first argument + imm: wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.I64(), + }, 64) + cur = linkInstr(cur, ldrAddress) + + // Then jumps to the stack grow call sequence's address, meaning + // transferring the control to the code compiled by CompileStackGrowCallSequence. + bl := m.allocateInstr() + bl.asCallIndirect(tmpRegVReg, nil) + cur = linkInstr(cur, bl) + + // Now that we know the entire code, we can finalize how many bytes + // we have to skip when the stack size is sufficient. + var cbrOffset int64 + for _cur := cbr; ; _cur = _cur.next { + cbrOffset += _cur.size() + if _cur == cur { + break + } + } + cbr.condBrOffsetResolve(cbrOffset) + return cur +} + +// CompileStackGrowCallSequence implements backend.Machine. +func (m *machine) CompileStackGrowCallSequence() []byte { + ectx := m.executableContext + + cur := m.allocateInstr() + cur.asNop0() + ectx.RootInstr = cur + + // Save the callee saved and argument registers. + cur = m.saveRegistersInExecutionContext(cur, saveRequiredRegs) + + // Save the current stack pointer. + cur = m.saveCurrentStackPointer(cur, x0VReg) + + // Set the exit status on the execution context. + cur = m.setExitCode(cur, x0VReg, wazevoapi.ExitCodeGrowStack) + + // Exit the execution. + cur = m.storeReturnAddressAndExit(cur) + + // After the exit, restore the saved registers. + cur = m.restoreRegistersInExecutionContext(cur, saveRequiredRegs) + + // Then goes back the original address of this stack grow call. 
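// Putting the steps above together, the shared sequence behaves roughly like the
// following pseudo-Go (a sketch; the helper names are stand-ins for the
// save/restore/exit sequences emitted above):
//
//	saveRegisters(execCtx, saveRequiredRegs)
//	saveStackPointer(execCtx)
//	execCtx.exitCode = ExitCodeGrowStack
//	exitToGo()                                  // the Go side grows the stack.
//	restoreRegisters(execCtx, saveRequiredRegs)
//	return                                      // back to the function that needed more stack.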
+ ret := m.allocateInstr() + ret.asRet() + linkInstr(cur, ret) + + m.encode(ectx.RootInstr) + return m.compiler.Buf() +} + +func (m *machine) addsAddOrSubStackPointer(cur *instruction, rd regalloc.VReg, diff int64, add bool) *instruction { + ectx := m.executableContext + + ectx.PendingInstructions = ectx.PendingInstructions[:0] + m.insertAddOrSubStackPointer(rd, diff, add) + for _, inserted := range ectx.PendingInstructions { + cur = linkInstr(cur, inserted) + } + return cur +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go new file mode 100644 index 000000000..1c8793b73 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go @@ -0,0 +1,152 @@ +package arm64 + +// This file implements the interfaces required for register allocations. See backend.RegAllocFunctionMachine. + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// ClobberedRegisters implements backend.RegAllocFunctionMachine. +func (m *machine) ClobberedRegisters(regs []regalloc.VReg) { + m.clobberedRegs = append(m.clobberedRegs[:0], regs...) +} + +// Swap implements backend.RegAllocFunctionMachine. +func (m *machine) Swap(cur *instruction, x1, x2, tmp regalloc.VReg) { + prevNext := cur.next + var mov1, mov2, mov3 *instruction + if x1.RegType() == regalloc.RegTypeInt { + if !tmp.Valid() { + tmp = tmpRegVReg + } + mov1 = m.allocateInstr().asMove64(tmp, x1) + mov2 = m.allocateInstr().asMove64(x1, x2) + mov3 = m.allocateInstr().asMove64(x2, tmp) + cur = linkInstr(cur, mov1) + cur = linkInstr(cur, mov2) + cur = linkInstr(cur, mov3) + linkInstr(cur, prevNext) + } else { + if !tmp.Valid() { + r2 := x2.RealReg() + // Temporarily spill x1 to stack. + cur = m.InsertStoreRegisterAt(x1, cur, true).prev + // Then move x2 to x1. + cur = linkInstr(cur, m.allocateInstr().asFpuMov128(x1, x2)) + linkInstr(cur, prevNext) + // Then reload the original value on x1 from stack to r2. + m.InsertReloadRegisterAt(x1.SetRealReg(r2), cur, true) + } else { + mov1 = m.allocateInstr().asFpuMov128(tmp, x1) + mov2 = m.allocateInstr().asFpuMov128(x1, x2) + mov3 = m.allocateInstr().asFpuMov128(x2, tmp) + cur = linkInstr(cur, mov1) + cur = linkInstr(cur, mov2) + cur = linkInstr(cur, mov3) + linkInstr(cur, prevNext) + } + } +} + +// InsertMoveBefore implements backend.RegAllocFunctionMachine. +func (m *machine) InsertMoveBefore(dst, src regalloc.VReg, instr *instruction) { + typ := src.RegType() + if typ != dst.RegType() { + panic("BUG: src and dst must have the same type") + } + + mov := m.allocateInstr() + if typ == regalloc.RegTypeInt { + mov.asMove64(dst, src) + } else { + mov.asFpuMov128(dst, src) + } + + cur := instr.prev + prevNext := cur.next + cur = linkInstr(cur, mov) + linkInstr(cur, prevNext) +} + +// SSABlockLabel implements backend.RegAllocFunctionMachine. +func (m *machine) SSABlockLabel(id ssa.BasicBlockID) backend.Label { + return m.executableContext.SsaBlockIDToLabels[id] +} + +// InsertStoreRegisterAt implements backend.RegAllocFunctionMachine. 
+func (m *machine) InsertStoreRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction { + if !v.IsRealReg() { + panic("BUG: VReg must be backed by real reg to be stored") + } + + typ := m.compiler.TypeOf(v) + + var prevNext, cur *instruction + if after { + cur, prevNext = instr, instr.next + } else { + cur, prevNext = instr.prev, instr + } + + offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size()) + var amode addressMode + cur, amode = m.resolveAddressModeForOffsetAndInsert(cur, offsetFromSP, typ.Bits(), spVReg, true) + store := m.allocateInstr() + store.asStore(operandNR(v), amode, typ.Bits()) + + cur = linkInstr(cur, store) + return linkInstr(cur, prevNext) +} + +// InsertReloadRegisterAt implements backend.RegAllocFunctionMachine. +func (m *machine) InsertReloadRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction { + if !v.IsRealReg() { + panic("BUG: VReg must be backed by real reg to be stored") + } + + typ := m.compiler.TypeOf(v) + + var prevNext, cur *instruction + if after { + cur, prevNext = instr, instr.next + } else { + cur, prevNext = instr.prev, instr + } + + offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size()) + var amode addressMode + cur, amode = m.resolveAddressModeForOffsetAndInsert(cur, offsetFromSP, typ.Bits(), spVReg, true) + load := m.allocateInstr() + switch typ { + case ssa.TypeI32, ssa.TypeI64: + load.asULoad(operandNR(v), amode, typ.Bits()) + case ssa.TypeF32, ssa.TypeF64: + load.asFpuLoad(operandNR(v), amode, typ.Bits()) + case ssa.TypeV128: + load.asFpuLoad(operandNR(v), amode, 128) + default: + panic("TODO") + } + + cur = linkInstr(cur, load) + return linkInstr(cur, prevNext) +} + +// LastInstrForInsertion implements backend.RegAllocFunctionMachine. +func (m *machine) LastInstrForInsertion(begin, end *instruction) *instruction { + cur := end + for cur.kind == nop0 { + cur = cur.prev + if cur == begin { + return end + } + } + switch cur.kind { + case br: + return cur + default: + return end + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_relocation.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_relocation.go new file mode 100644 index 000000000..83902d927 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_relocation.go @@ -0,0 +1,117 @@ +package arm64 + +import ( + "encoding/binary" + "fmt" + "math" + "sort" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" +) + +const ( + // trampolineCallSize is the size of the trampoline instruction sequence for each function in an island. + trampolineCallSize = 4*4 + 4 // Four instructions + 32-bit immediate. + + // Unconditional branch offset is encoded as divided by 4 in imm26. + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/BL--Branch-with-Link-?lang=en + + maxUnconditionalBranchOffset = maxSignedInt26 * 4 + minUnconditionalBranchOffset = minSignedInt26 * 4 + + // trampolineIslandInterval is the range of the trampoline island. + // Half of the range is used for the trampoline island, and the other half is used for the function. + trampolineIslandInterval = maxUnconditionalBranchOffset / 2 + + // maxNumFunctions explicitly specifies the maximum number of functions that can be allowed in a single executable. + maxNumFunctions = trampolineIslandInterval >> 6 + + // maxFunctionExecutableSize is the maximum size of a function that can exist in a trampoline island. 
+ // Conservatively set to 1/4 of the trampoline island interval. + maxFunctionExecutableSize = trampolineIslandInterval >> 2 +) + +// CallTrampolineIslandInfo implements backend.Machine CallTrampolineIslandInfo. +func (m *machine) CallTrampolineIslandInfo(numFunctions int) (interval, size int, err error) { + if numFunctions > maxNumFunctions { + return 0, 0, fmt.Errorf("too many functions: %d > %d", numFunctions, maxNumFunctions) + } + return trampolineIslandInterval, trampolineCallSize * numFunctions, nil +} + +// ResolveRelocations implements backend.Machine ResolveRelocations. +func (m *machine) ResolveRelocations( + refToBinaryOffset []int, + executable []byte, + relocations []backend.RelocationInfo, + callTrampolineIslandOffsets []int, +) { + for _, islandOffset := range callTrampolineIslandOffsets { + encodeCallTrampolineIsland(refToBinaryOffset, islandOffset, executable) + } + + for _, r := range relocations { + instrOffset := r.Offset + calleeFnOffset := refToBinaryOffset[r.FuncRef] + diff := int64(calleeFnOffset) - (instrOffset) + // Check if the diff is within the range of the branch instruction. + if diff < minUnconditionalBranchOffset || diff > maxUnconditionalBranchOffset { + // Find the near trampoline island from callTrampolineIslandOffsets. + islandOffset := searchTrampolineIsland(callTrampolineIslandOffsets, int(instrOffset)) + islandTargetOffset := islandOffset + trampolineCallSize*int(r.FuncRef) + diff = int64(islandTargetOffset) - (instrOffset) + if diff < minUnconditionalBranchOffset || diff > maxUnconditionalBranchOffset { + panic("BUG in trampoline placement") + } + } + binary.LittleEndian.PutUint32(executable[instrOffset:instrOffset+4], encodeUnconditionalBranch(true, diff)) + } +} + +// encodeCallTrampolineIsland encodes a trampoline island for the given functions. +// Each island consists of a trampoline instruction sequence for each function. +// Each trampoline instruction sequence consists of 4 instructions + 32-bit immediate. +func encodeCallTrampolineIsland(refToBinaryOffset []int, islandOffset int, executable []byte) { + for i := 0; i < len(refToBinaryOffset); i++ { + trampolineOffset := islandOffset + trampolineCallSize*i + + fnOffset := refToBinaryOffset[i] + diff := fnOffset - (trampolineOffset + 16) + if diff > math.MaxInt32 || diff < math.MinInt32 { + // This case even amd64 can't handle. 4GB is too big. + panic("too big binary") + } + + // The tmpReg, tmpReg2 is safe to overwrite (in fact any caller-saved register is safe to use). + tmpReg, tmpReg2 := regNumberInEncoding[tmpRegVReg.RealReg()], regNumberInEncoding[x11] + + // adr tmpReg, PC+16: load the address of #diff into tmpReg. + binary.LittleEndian.PutUint32(executable[trampolineOffset:], encodeAdr(tmpReg, 16)) + // ldrsw tmpReg2, [tmpReg]: Load #diff into tmpReg2. + binary.LittleEndian.PutUint32(executable[trampolineOffset+4:], + encodeLoadOrStore(sLoad32, tmpReg2, addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpRegVReg})) + // add tmpReg, tmpReg2, tmpReg: add #diff to the address of #diff, getting the absolute address of the function. + binary.LittleEndian.PutUint32(executable[trampolineOffset+8:], + encodeAluRRR(aluOpAdd, tmpReg, tmpReg, tmpReg2, true, false)) + // br tmpReg: branch to the function without overwriting the link register. 
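// Read together, each per-function trampoline entry is a five-word sequence along
// these lines (a sketch; tmpReg/tmpReg2 are assumed to resolve to x27/x11 as
// defined in reg.go, and the literal is the 32-bit displacement computed above):
//
//	adr   x27, #16       ; address of the trailing literal
//	ldrsw x11, [x27]     ; load the signed 32-bit displacement
//	add   x27, x27, x11  ; absolute address of the target function
//	br    x27            ; jump without touching lr
//	.word diff           ; displacement from the literal to the target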
+ binary.LittleEndian.PutUint32(executable[trampolineOffset+12:], encodeUnconditionalBranchReg(tmpReg, false)) + // #diff + binary.LittleEndian.PutUint32(executable[trampolineOffset+16:], uint32(diff)) + } +} + +// searchTrampolineIsland finds the nearest trampoline island from callTrampolineIslandOffsets. +// Note that even if the offset is in the middle of two islands, it returns the latter one. +// That is ok because the island is always placed in the middle of the range. +// +// precondition: callTrampolineIslandOffsets is sorted in ascending order. +func searchTrampolineIsland(callTrampolineIslandOffsets []int, offset int) int { + l := len(callTrampolineIslandOffsets) + n := sort.Search(l, func(i int) bool { + return callTrampolineIslandOffsets[i] >= offset + }) + if n == l { + n = l - 1 + } + return callTrampolineIslandOffsets[n] +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/reg.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/reg.go new file mode 100644 index 000000000..45737516d --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/reg.go @@ -0,0 +1,397 @@ +package arm64 + +import ( + "fmt" + "strconv" + "strings" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" +) + +// Arm64-specific registers. +// +// See https://developer.arm.com/documentation/dui0801/a/Overview-of-AArch64-state/Predeclared-core-register-names-in-AArch64-state + +const ( + // General purpose registers. Note that we do not distinguish wn and xn registers + // because they are the same from the perspective of register allocator, and + // the size can be determined by the type of the instruction. + + x0 = regalloc.RealRegInvalid + 1 + iota + x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11 + x12 + x13 + x14 + x15 + x16 + x17 + x18 + x19 + x20 + x21 + x22 + x23 + x24 + x25 + x26 + x27 + x28 + x29 + x30 + + // Vector registers. Note that we do not distinguish vn and dn, ... registers + // because they are the same from the perspective of register allocator, and + // the size can be determined by the type of the instruction. 
+ + v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7 + v8 + v9 + v10 + v11 + v12 + v13 + v14 + v15 + v16 + v17 + v18 + v19 + v20 + v21 + v22 + v23 + v24 + v25 + v26 + v27 + v28 + v29 + v30 + v31 + + // Special registers + + xzr + sp + lr = x30 + fp = x29 + tmp = x27 +) + +var ( + x0VReg = regalloc.FromRealReg(x0, regalloc.RegTypeInt) + x1VReg = regalloc.FromRealReg(x1, regalloc.RegTypeInt) + x2VReg = regalloc.FromRealReg(x2, regalloc.RegTypeInt) + x3VReg = regalloc.FromRealReg(x3, regalloc.RegTypeInt) + x4VReg = regalloc.FromRealReg(x4, regalloc.RegTypeInt) + x5VReg = regalloc.FromRealReg(x5, regalloc.RegTypeInt) + x6VReg = regalloc.FromRealReg(x6, regalloc.RegTypeInt) + x7VReg = regalloc.FromRealReg(x7, regalloc.RegTypeInt) + x8VReg = regalloc.FromRealReg(x8, regalloc.RegTypeInt) + x9VReg = regalloc.FromRealReg(x9, regalloc.RegTypeInt) + x10VReg = regalloc.FromRealReg(x10, regalloc.RegTypeInt) + x11VReg = regalloc.FromRealReg(x11, regalloc.RegTypeInt) + x12VReg = regalloc.FromRealReg(x12, regalloc.RegTypeInt) + x13VReg = regalloc.FromRealReg(x13, regalloc.RegTypeInt) + x14VReg = regalloc.FromRealReg(x14, regalloc.RegTypeInt) + x15VReg = regalloc.FromRealReg(x15, regalloc.RegTypeInt) + x16VReg = regalloc.FromRealReg(x16, regalloc.RegTypeInt) + x17VReg = regalloc.FromRealReg(x17, regalloc.RegTypeInt) + x18VReg = regalloc.FromRealReg(x18, regalloc.RegTypeInt) + x19VReg = regalloc.FromRealReg(x19, regalloc.RegTypeInt) + x20VReg = regalloc.FromRealReg(x20, regalloc.RegTypeInt) + x21VReg = regalloc.FromRealReg(x21, regalloc.RegTypeInt) + x22VReg = regalloc.FromRealReg(x22, regalloc.RegTypeInt) + x23VReg = regalloc.FromRealReg(x23, regalloc.RegTypeInt) + x24VReg = regalloc.FromRealReg(x24, regalloc.RegTypeInt) + x25VReg = regalloc.FromRealReg(x25, regalloc.RegTypeInt) + x26VReg = regalloc.FromRealReg(x26, regalloc.RegTypeInt) + x27VReg = regalloc.FromRealReg(x27, regalloc.RegTypeInt) + x28VReg = regalloc.FromRealReg(x28, regalloc.RegTypeInt) + x29VReg = regalloc.FromRealReg(x29, regalloc.RegTypeInt) + x30VReg = regalloc.FromRealReg(x30, regalloc.RegTypeInt) + v0VReg = regalloc.FromRealReg(v0, regalloc.RegTypeFloat) + v1VReg = regalloc.FromRealReg(v1, regalloc.RegTypeFloat) + v2VReg = regalloc.FromRealReg(v2, regalloc.RegTypeFloat) + v3VReg = regalloc.FromRealReg(v3, regalloc.RegTypeFloat) + v4VReg = regalloc.FromRealReg(v4, regalloc.RegTypeFloat) + v5VReg = regalloc.FromRealReg(v5, regalloc.RegTypeFloat) + v6VReg = regalloc.FromRealReg(v6, regalloc.RegTypeFloat) + v7VReg = regalloc.FromRealReg(v7, regalloc.RegTypeFloat) + v8VReg = regalloc.FromRealReg(v8, regalloc.RegTypeFloat) + v9VReg = regalloc.FromRealReg(v9, regalloc.RegTypeFloat) + v10VReg = regalloc.FromRealReg(v10, regalloc.RegTypeFloat) + v11VReg = regalloc.FromRealReg(v11, regalloc.RegTypeFloat) + v12VReg = regalloc.FromRealReg(v12, regalloc.RegTypeFloat) + v13VReg = regalloc.FromRealReg(v13, regalloc.RegTypeFloat) + v14VReg = regalloc.FromRealReg(v14, regalloc.RegTypeFloat) + v15VReg = regalloc.FromRealReg(v15, regalloc.RegTypeFloat) + v16VReg = regalloc.FromRealReg(v16, regalloc.RegTypeFloat) + v17VReg = regalloc.FromRealReg(v17, regalloc.RegTypeFloat) + v18VReg = regalloc.FromRealReg(v18, regalloc.RegTypeFloat) + v19VReg = regalloc.FromRealReg(v19, regalloc.RegTypeFloat) + v20VReg = regalloc.FromRealReg(v20, regalloc.RegTypeFloat) + v21VReg = regalloc.FromRealReg(v21, regalloc.RegTypeFloat) + v22VReg = regalloc.FromRealReg(v22, regalloc.RegTypeFloat) + v23VReg = regalloc.FromRealReg(v23, regalloc.RegTypeFloat) + v24VReg = 
regalloc.FromRealReg(v24, regalloc.RegTypeFloat) + v25VReg = regalloc.FromRealReg(v25, regalloc.RegTypeFloat) + v26VReg = regalloc.FromRealReg(v26, regalloc.RegTypeFloat) + v27VReg = regalloc.FromRealReg(v27, regalloc.RegTypeFloat) + // lr (link register) holds the return address at the function entry. + lrVReg = x30VReg + // tmpReg is used to perform spill/load on large stack offsets, and load large constants. + // Therefore, be cautious to use this register in the middle of the compilation, especially before the register allocation. + // This is the same as golang/go, but it's only described in the source code: + // https://github.com/golang/go/blob/18e17e2cb12837ea2c8582ecdb0cc780f49a1aac/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go#L59 + // https://github.com/golang/go/blob/18e17e2cb12837ea2c8582ecdb0cc780f49a1aac/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go#L13-L15 + tmpRegVReg = regalloc.FromRealReg(tmp, regalloc.RegTypeInt) + v28VReg = regalloc.FromRealReg(v28, regalloc.RegTypeFloat) + v29VReg = regalloc.FromRealReg(v29, regalloc.RegTypeFloat) + v30VReg = regalloc.FromRealReg(v30, regalloc.RegTypeFloat) + v31VReg = regalloc.FromRealReg(v31, regalloc.RegTypeFloat) + xzrVReg = regalloc.FromRealReg(xzr, regalloc.RegTypeInt) + spVReg = regalloc.FromRealReg(sp, regalloc.RegTypeInt) + fpVReg = regalloc.FromRealReg(fp, regalloc.RegTypeInt) +) + +var regNames = [...]string{ + x0: "x0", + x1: "x1", + x2: "x2", + x3: "x3", + x4: "x4", + x5: "x5", + x6: "x6", + x7: "x7", + x8: "x8", + x9: "x9", + x10: "x10", + x11: "x11", + x12: "x12", + x13: "x13", + x14: "x14", + x15: "x15", + x16: "x16", + x17: "x17", + x18: "x18", + x19: "x19", + x20: "x20", + x21: "x21", + x22: "x22", + x23: "x23", + x24: "x24", + x25: "x25", + x26: "x26", + x27: "x27", + x28: "x28", + x29: "x29", + x30: "x30", + xzr: "xzr", + sp: "sp", + v0: "v0", + v1: "v1", + v2: "v2", + v3: "v3", + v4: "v4", + v5: "v5", + v6: "v6", + v7: "v7", + v8: "v8", + v9: "v9", + v10: "v10", + v11: "v11", + v12: "v12", + v13: "v13", + v14: "v14", + v15: "v15", + v16: "v16", + v17: "v17", + v18: "v18", + v19: "v19", + v20: "v20", + v21: "v21", + v22: "v22", + v23: "v23", + v24: "v24", + v25: "v25", + v26: "v26", + v27: "v27", + v28: "v28", + v29: "v29", + v30: "v30", + v31: "v31", +} + +func formatVRegSized(r regalloc.VReg, size byte) (ret string) { + if r.IsRealReg() { + ret = regNames[r.RealReg()] + switch ret[0] { + case 'x': + switch size { + case 32: + ret = strings.Replace(ret, "x", "w", 1) + case 64: + default: + panic("BUG: invalid register size: " + strconv.Itoa(int(size))) + } + case 'v': + switch size { + case 32: + ret = strings.Replace(ret, "v", "s", 1) + case 64: + ret = strings.Replace(ret, "v", "d", 1) + case 128: + ret = strings.Replace(ret, "v", "q", 1) + default: + panic("BUG: invalid register size") + } + } + } else { + switch r.RegType() { + case regalloc.RegTypeInt: + switch size { + case 32: + ret = fmt.Sprintf("w%d?", r.ID()) + case 64: + ret = fmt.Sprintf("x%d?", r.ID()) + default: + panic("BUG: invalid register size: " + strconv.Itoa(int(size))) + } + case regalloc.RegTypeFloat: + switch size { + case 32: + ret = fmt.Sprintf("s%d?", r.ID()) + case 64: + ret = fmt.Sprintf("d%d?", r.ID()) + case 128: + ret = fmt.Sprintf("q%d?", r.ID()) + default: + panic("BUG: invalid register size") + } + default: + panic(fmt.Sprintf("BUG: invalid register type: %d for %s", r.RegType(), r)) + } + } + return +} + +func formatVRegWidthVec(r regalloc.VReg, width vecArrangement) (ret string) { + var id string + wspec := 
strings.ToLower(width.String()) + if r.IsRealReg() { + id = regNames[r.RealReg()][1:] + } else { + id = fmt.Sprintf("%d?", r.ID()) + } + ret = fmt.Sprintf("%s%s", wspec, id) + return +} + +func formatVRegVec(r regalloc.VReg, arr vecArrangement, index vecIndex) (ret string) { + id := fmt.Sprintf("v%d?", r.ID()) + if r.IsRealReg() { + id = regNames[r.RealReg()] + } + ret = fmt.Sprintf("%s.%s", id, strings.ToLower(arr.String())) + if index != vecIndexNone { + ret += fmt.Sprintf("[%d]", index) + } + return +} + +func regTypeToRegisterSizeInBits(r regalloc.RegType) byte { + switch r { + case regalloc.RegTypeInt: + return 64 + case regalloc.RegTypeFloat: + return 128 + default: + panic("BUG: invalid register type") + } +} + +var regNumberInEncoding = [...]uint32{ + x0: 0, + x1: 1, + x2: 2, + x3: 3, + x4: 4, + x5: 5, + x6: 6, + x7: 7, + x8: 8, + x9: 9, + x10: 10, + x11: 11, + x12: 12, + x13: 13, + x14: 14, + x15: 15, + x16: 16, + x17: 17, + x18: 18, + x19: 19, + x20: 20, + x21: 21, + x22: 22, + x23: 23, + x24: 24, + x25: 25, + x26: 26, + x27: 27, + x28: 28, + x29: 29, + x30: 30, + xzr: 31, + sp: 31, + v0: 0, + v1: 1, + v2: 2, + v3: 3, + v4: 4, + v5: 5, + v6: 6, + v7: 7, + v8: 8, + v9: 9, + v10: 10, + v11: 11, + v12: 12, + v13: 13, + v14: 14, + v15: 15, + v16: 16, + v17: 17, + v18: 18, + v19: 19, + v20: 20, + v21: 21, + v22: 22, + v23: 23, + v24: 24, + v25: 25, + v26: 26, + v27: 27, + v28: 28, + v29: 29, + v30: 30, + v31: 31, +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/unwind_stack.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/unwind_stack.go new file mode 100644 index 000000000..edb0e36e3 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/unwind_stack.go @@ -0,0 +1,90 @@ +package arm64 + +import ( + "encoding/binary" + "reflect" + "unsafe" + + "github.com/tetratelabs/wazero/internal/wasmdebug" +) + +// UnwindStack implements wazevo.unwindStack. +func UnwindStack(sp, _, top uintptr, returnAddresses []uintptr) []uintptr { + l := int(top - sp) + + var stackBuf []byte + { + // TODO: use unsafe.Slice after floor version is set to Go 1.20. + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&stackBuf)) + hdr.Data = sp + hdr.Len = l + hdr.Cap = l + } + + for i := uint64(0); i < uint64(l); { + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | <----+ + // | ....... | | + // | ret 0 | | + // | arg X | | size_of_arg_ret + // | ....... | | + // | arg 1 | | + // | arg 0 | <----+ + // | size_of_arg_ret | + // | ReturnAddress | + // +-----------------+ <----+ + // | ........... | | + // | spill slot M | | + // | ............ | | + // | spill slot 2 | | + // | spill slot 1 | | frame size + // | spill slot 1 | | + // | clobbered N | | + // | ............ | | + // | clobbered 0 | <----+ + // | xxxxxx | ;; unused space to make it 16-byte aligned. + // | frame_size | + // +-----------------+ <---- SP + // (low address) + + frameSize := binary.LittleEndian.Uint64(stackBuf[i:]) + i += frameSize + + 16 // frame size + aligned space. + retAddr := binary.LittleEndian.Uint64(stackBuf[i:]) + i += 8 // ret addr. + sizeOfArgRet := binary.LittleEndian.Uint64(stackBuf[i:]) + i += 8 + sizeOfArgRet + returnAddresses = append(returnAddresses, uintptr(retAddr)) + if len(returnAddresses) == wasmdebug.MaxFrames { + break + } + } + return returnAddresses +} + +// GoCallStackView implements wazevo.goCallStackView. 
+func GoCallStackView(stackPointerBeforeGoCall *uint64) []uint64 { + // (high address) + // +-----------------+ <----+ + // | xxxxxxxxxxx | | ;; optional unused space to make it 16-byte aligned. + // ^ | arg[N]/ret[M] | | + // sliceSize | | ............ | | sliceSize + // | | arg[1]/ret[1] | | + // v | arg[0]/ret[0] | <----+ + // | sliceSize | + // | frame_size | + // +-----------------+ <---- stackPointerBeforeGoCall + // (low address) + ptr := unsafe.Pointer(stackPointerBeforeGoCall) + size := *(*uint64)(unsafe.Add(ptr, 8)) + var view []uint64 + { + sh := (*reflect.SliceHeader)(unsafe.Pointer(&view)) + sh.Data = uintptr(unsafe.Add(ptr, 16)) // skips the (frame_size, sliceSize). + sh.Len = int(size) + sh.Cap = int(size) + } + return view +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/machine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/machine.go new file mode 100644 index 000000000..54ce89e46 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/machine.go @@ -0,0 +1,100 @@ +package backend + +import ( + "context" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +type ( + // Machine is a backend for a specific ISA machine. + Machine interface { + ExecutableContext() ExecutableContext + + // DisableStackCheck disables the stack check for the current compilation for debugging/testing. + DisableStackCheck() + + // SetCurrentABI initializes the FunctionABI for the given signature. + SetCurrentABI(abi *FunctionABI) + + // SetCompiler sets the compilation context used for the lifetime of Machine. + // This is only called once per Machine, i.e. before the first compilation. + SetCompiler(Compiler) + + // LowerSingleBranch is called when the compilation of the given single branch is started. + LowerSingleBranch(b *ssa.Instruction) + + // LowerConditionalBranch is called when the compilation of the given conditional branch is started. + LowerConditionalBranch(b *ssa.Instruction) + + // LowerInstr is called for each instruction in the given block except for the ones marked as already lowered + // via Compiler.MarkLowered. The order is reverse, i.e. from the last instruction to the first one. + // + // Note: this can lower multiple instructions (which produce the inputs) at once whenever it's possible + // for optimization. + LowerInstr(*ssa.Instruction) + + // Reset resets the machine state for the next compilation. + Reset() + + // InsertMove inserts a move instruction from src to dst whose type is typ. + InsertMove(dst, src regalloc.VReg, typ ssa.Type) + + // InsertReturn inserts the return instruction to return from the current function. + InsertReturn() + + // InsertLoadConstantBlockArg inserts the instruction(s) to load the constant value into the given regalloc.VReg. + InsertLoadConstantBlockArg(instr *ssa.Instruction, vr regalloc.VReg) + + // Format returns the string representation of the currently compiled machine code. + // This is only for testing purpose. + Format() string + + // RegAlloc does the register allocation after lowering. + RegAlloc() + + // PostRegAlloc does the post register allocation, e.g. setting up prologue/epilogue, redundant move elimination, etc. + PostRegAlloc() + + // ResolveRelocations resolves the relocations after emitting machine code. 
+ // * refToBinaryOffset: the map from the function reference (ssa.FuncRef) to the executable offset. + // * executable: the binary to resolve the relocations. + // * relocations: the relocations to resolve. + // * callTrampolineIslandOffsets: the offsets of the trampoline islands in the executable. + ResolveRelocations( + refToBinaryOffset []int, + executable []byte, + relocations []RelocationInfo, + callTrampolineIslandOffsets []int, + ) + + // Encode encodes the machine instructions to the Compiler. + Encode(ctx context.Context) error + + // CompileGoFunctionTrampoline compiles the trampoline function to call a Go function of the given exit code and signature. + CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte + + // CompileStackGrowCallSequence returns the sequence of instructions shared by all functions to + // call the stack grow builtin function. + CompileStackGrowCallSequence() []byte + + // CompileEntryPreamble returns the sequence of instructions shared by multiple functions to + // enter the function from Go. + CompileEntryPreamble(signature *ssa.Signature) []byte + + // LowerParams lowers the given parameters. + LowerParams(params []ssa.Value) + + // LowerReturns lowers the given returns. + LowerReturns(returns []ssa.Value) + + // ArgsResultsRegs returns the registers used for arguments and return values. + ArgsResultsRegs() (argResultInts, argResultFloats []regalloc.RealReg) + + // CallTrampolineIslandInfo returns the interval of the offset where the trampoline island is placed, and + // the size of the trampoline island. If islandSize is zero, the trampoline island is not used on this machine. + CallTrampolineIslandInfo(numFunctions int) (interval, islandSize int, err error) + } +) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go new file mode 100644 index 000000000..3f36c84e5 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go @@ -0,0 +1,319 @@ +package backend + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// RegAllocFunctionMachine is the interface for the machine specific logic that will be used in RegAllocFunction. +type RegAllocFunctionMachine[I regalloc.InstrConstraint] interface { + // InsertMoveBefore inserts the move instruction from src to dst before the given instruction. + InsertMoveBefore(dst, src regalloc.VReg, instr I) + // InsertStoreRegisterAt inserts the instruction(s) to store the given virtual register at the given instruction. + // If after is true, the instruction(s) will be inserted after the given instruction, otherwise before. + InsertStoreRegisterAt(v regalloc.VReg, instr I, after bool) I + // InsertReloadRegisterAt inserts the instruction(s) to reload the given virtual register at the given instruction. + // If after is true, the instruction(s) will be inserted after the given instruction, otherwise before. + InsertReloadRegisterAt(v regalloc.VReg, instr I, after bool) I + // ClobberedRegisters is called when the register allocation is done and the clobbered registers are known. + ClobberedRegisters(regs []regalloc.VReg) + // Swap swaps the two virtual registers after the given instruction. + Swap(cur I, x1, x2, tmp regalloc.VReg) + // LastInstrForInsertion implements LastInstrForInsertion of regalloc.Function. 
See its comment for details. + LastInstrForInsertion(begin, end I) I + // SSABlockLabel returns the label of the given ssa.BasicBlockID. + SSABlockLabel(id ssa.BasicBlockID) Label +} + +type ( + // RegAllocFunction implements regalloc.Function. + RegAllocFunction[I regalloc.InstrConstraint, m RegAllocFunctionMachine[I]] struct { + m m + ssb ssa.Builder + c Compiler + // iter is the iterator for reversePostOrderBlocks + iter int + reversePostOrderBlocks []RegAllocBlock[I, m] + // labelToRegAllocBlockIndex maps label to the index of reversePostOrderBlocks. + labelToRegAllocBlockIndex map[Label]int + loopNestingForestRoots []ssa.BasicBlock + } + + // RegAllocBlock implements regalloc.Block. + RegAllocBlock[I regalloc.InstrConstraint, m RegAllocFunctionMachine[I]] struct { + // f is the function this instruction belongs to. Used to reuse the regAllocFunctionImpl.predsSlice slice for Defs() and Uses(). + f *RegAllocFunction[I, m] + sb ssa.BasicBlock + l Label + begin, end I + loopNestingForestChildren []ssa.BasicBlock + cur I + id int + cachedLastInstrForInsertion I + } +) + +// NewRegAllocFunction returns a new RegAllocFunction. +func NewRegAllocFunction[I regalloc.InstrConstraint, M RegAllocFunctionMachine[I]](m M, ssb ssa.Builder, c Compiler) *RegAllocFunction[I, M] { + return &RegAllocFunction[I, M]{ + m: m, + ssb: ssb, + c: c, + labelToRegAllocBlockIndex: make(map[Label]int), + } +} + +// AddBlock adds a new block to the function. +func (f *RegAllocFunction[I, M]) AddBlock(sb ssa.BasicBlock, l Label, begin, end I) { + i := len(f.reversePostOrderBlocks) + f.reversePostOrderBlocks = append(f.reversePostOrderBlocks, RegAllocBlock[I, M]{ + f: f, + sb: sb, + l: l, + begin: begin, + end: end, + id: int(sb.ID()), + }) + f.labelToRegAllocBlockIndex[l] = i +} + +// Reset resets the function for the next compilation. +func (f *RegAllocFunction[I, M]) Reset() { + f.reversePostOrderBlocks = f.reversePostOrderBlocks[:0] + f.iter = 0 +} + +// StoreRegisterAfter implements regalloc.Function StoreRegisterAfter. +func (f *RegAllocFunction[I, M]) StoreRegisterAfter(v regalloc.VReg, instr regalloc.Instr) { + m := f.m + m.InsertStoreRegisterAt(v, instr.(I), true) +} + +// ReloadRegisterBefore implements regalloc.Function ReloadRegisterBefore. +func (f *RegAllocFunction[I, M]) ReloadRegisterBefore(v regalloc.VReg, instr regalloc.Instr) { + m := f.m + m.InsertReloadRegisterAt(v, instr.(I), false) +} + +// ReloadRegisterAfter implements regalloc.Function ReloadRegisterAfter. +func (f *RegAllocFunction[I, M]) ReloadRegisterAfter(v regalloc.VReg, instr regalloc.Instr) { + m := f.m + m.InsertReloadRegisterAt(v, instr.(I), true) +} + +// StoreRegisterBefore implements regalloc.Function StoreRegisterBefore. +func (f *RegAllocFunction[I, M]) StoreRegisterBefore(v regalloc.VReg, instr regalloc.Instr) { + m := f.m + m.InsertStoreRegisterAt(v, instr.(I), false) +} + +// ClobberedRegisters implements regalloc.Function ClobberedRegisters. +func (f *RegAllocFunction[I, M]) ClobberedRegisters(regs []regalloc.VReg) { + f.m.ClobberedRegisters(regs) +} + +// SwapBefore implements regalloc.Function SwapBefore. +func (f *RegAllocFunction[I, M]) SwapBefore(x1, x2, tmp regalloc.VReg, instr regalloc.Instr) { + f.m.Swap(instr.Prev().(I), x1, x2, tmp) +} + +// PostOrderBlockIteratorBegin implements regalloc.Function PostOrderBlockIteratorBegin. 
+func (f *RegAllocFunction[I, M]) PostOrderBlockIteratorBegin() regalloc.Block { + f.iter = len(f.reversePostOrderBlocks) - 1 + return f.PostOrderBlockIteratorNext() +} + +// PostOrderBlockIteratorNext implements regalloc.Function PostOrderBlockIteratorNext. +func (f *RegAllocFunction[I, M]) PostOrderBlockIteratorNext() regalloc.Block { + if f.iter < 0 { + return nil + } + b := &f.reversePostOrderBlocks[f.iter] + f.iter-- + return b +} + +// ReversePostOrderBlockIteratorBegin implements regalloc.Function ReversePostOrderBlockIteratorBegin. +func (f *RegAllocFunction[I, M]) ReversePostOrderBlockIteratorBegin() regalloc.Block { + f.iter = 0 + return f.ReversePostOrderBlockIteratorNext() +} + +// ReversePostOrderBlockIteratorNext implements regalloc.Function ReversePostOrderBlockIteratorNext. +func (f *RegAllocFunction[I, M]) ReversePostOrderBlockIteratorNext() regalloc.Block { + if f.iter >= len(f.reversePostOrderBlocks) { + return nil + } + b := &f.reversePostOrderBlocks[f.iter] + f.iter++ + return b +} + +// LoopNestingForestRoots implements regalloc.Function LoopNestingForestRoots. +func (f *RegAllocFunction[I, M]) LoopNestingForestRoots() int { + f.loopNestingForestRoots = f.ssb.LoopNestingForestRoots() + return len(f.loopNestingForestRoots) +} + +// LoopNestingForestRoot implements regalloc.Function LoopNestingForestRoot. +func (f *RegAllocFunction[I, M]) LoopNestingForestRoot(i int) regalloc.Block { + blk := f.loopNestingForestRoots[i] + l := f.m.SSABlockLabel(blk.ID()) + index := f.labelToRegAllocBlockIndex[l] + return &f.reversePostOrderBlocks[index] +} + +// InsertMoveBefore implements regalloc.Function InsertMoveBefore. +func (f *RegAllocFunction[I, M]) InsertMoveBefore(dst, src regalloc.VReg, instr regalloc.Instr) { + f.m.InsertMoveBefore(dst, src, instr.(I)) +} + +// LowestCommonAncestor implements regalloc.Function LowestCommonAncestor. +func (f *RegAllocFunction[I, M]) LowestCommonAncestor(blk1, blk2 regalloc.Block) regalloc.Block { + ret := f.ssb.LowestCommonAncestor(blk1.(*RegAllocBlock[I, M]).sb, blk2.(*RegAllocBlock[I, M]).sb) + l := f.m.SSABlockLabel(ret.ID()) + index := f.labelToRegAllocBlockIndex[l] + return &f.reversePostOrderBlocks[index] +} + +// Idom implements regalloc.Function Idom. +func (f *RegAllocFunction[I, M]) Idom(blk regalloc.Block) regalloc.Block { + builder := f.ssb + idom := builder.Idom(blk.(*RegAllocBlock[I, M]).sb) + if idom == nil { + panic("BUG: idom must not be nil") + } + l := f.m.SSABlockLabel(idom.ID()) + index := f.labelToRegAllocBlockIndex[l] + return &f.reversePostOrderBlocks[index] +} + +// ID implements regalloc.Block. +func (r *RegAllocBlock[I, m]) ID() int32 { return int32(r.id) } + +// BlockParams implements regalloc.Block. +func (r *RegAllocBlock[I, m]) BlockParams(regs *[]regalloc.VReg) []regalloc.VReg { + c := r.f.c + *regs = (*regs)[:0] + for i := 0; i < r.sb.Params(); i++ { + v := c.VRegOf(r.sb.Param(i)) + *regs = append(*regs, v) + } + return *regs +} + +// InstrIteratorBegin implements regalloc.Block. +func (r *RegAllocBlock[I, m]) InstrIteratorBegin() regalloc.Instr { + r.cur = r.begin + return r.cur +} + +// InstrIteratorNext implements regalloc.Block. +func (r *RegAllocBlock[I, m]) InstrIteratorNext() regalloc.Instr { + for { + if r.cur == r.end { + return nil + } + instr := r.cur.Next() + r.cur = instr.(I) + if instr == nil { + return nil + } else if instr.AddedBeforeRegAlloc() { + // Only concerned about the instruction added before regalloc. + return instr + } + } +} + +// InstrRevIteratorBegin implements regalloc.Block. 
+func (r *RegAllocBlock[I, m]) InstrRevIteratorBegin() regalloc.Instr { + r.cur = r.end + return r.cur +} + +// InstrRevIteratorNext implements regalloc.Block. +func (r *RegAllocBlock[I, m]) InstrRevIteratorNext() regalloc.Instr { + for { + if r.cur == r.begin { + return nil + } + instr := r.cur.Prev() + r.cur = instr.(I) + if instr == nil { + return nil + } else if instr.AddedBeforeRegAlloc() { + // Only concerned about the instruction added before regalloc. + return instr + } + } +} + +// FirstInstr implements regalloc.Block. +func (r *RegAllocBlock[I, m]) FirstInstr() regalloc.Instr { + return r.begin +} + +// EndInstr implements regalloc.Block. +func (r *RegAllocBlock[I, m]) EndInstr() regalloc.Instr { + return r.end +} + +// LastInstrForInsertion implements regalloc.Block. +func (r *RegAllocBlock[I, m]) LastInstrForInsertion() regalloc.Instr { + var nil I + if r.cachedLastInstrForInsertion == nil { + r.cachedLastInstrForInsertion = r.f.m.LastInstrForInsertion(r.begin, r.end) + } + return r.cachedLastInstrForInsertion +} + +// Preds implements regalloc.Block. +func (r *RegAllocBlock[I, m]) Preds() int { return r.sb.Preds() } + +// Pred implements regalloc.Block. +func (r *RegAllocBlock[I, m]) Pred(i int) regalloc.Block { + sb := r.sb + pred := sb.Pred(i) + l := r.f.m.SSABlockLabel(pred.ID()) + index := r.f.labelToRegAllocBlockIndex[l] + return &r.f.reversePostOrderBlocks[index] +} + +// Entry implements regalloc.Block. +func (r *RegAllocBlock[I, m]) Entry() bool { return r.sb.EntryBlock() } + +// Succs implements regalloc.Block. +func (r *RegAllocBlock[I, m]) Succs() int { + return r.sb.Succs() +} + +// Succ implements regalloc.Block. +func (r *RegAllocBlock[I, m]) Succ(i int) regalloc.Block { + sb := r.sb + succ := sb.Succ(i) + if succ.ReturnBlock() { + return nil + } + l := r.f.m.SSABlockLabel(succ.ID()) + index := r.f.labelToRegAllocBlockIndex[l] + return &r.f.reversePostOrderBlocks[index] +} + +// LoopHeader implements regalloc.Block. +func (r *RegAllocBlock[I, m]) LoopHeader() bool { + return r.sb.LoopHeader() +} + +// LoopNestingForestChildren implements regalloc.Block. +func (r *RegAllocBlock[I, m]) LoopNestingForestChildren() int { + r.loopNestingForestChildren = r.sb.LoopNestingForestChildren() + return len(r.loopNestingForestChildren) +} + +// LoopNestingForestChild implements regalloc.Block. +func (r *RegAllocBlock[I, m]) LoopNestingForestChild(i int) regalloc.Block { + blk := r.loopNestingForestChildren[i] + l := r.f.m.SSABlockLabel(blk.ID()) + index := r.f.labelToRegAllocBlockIndex[l] + return &r.f.reversePostOrderBlocks[index] +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/api.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/api.go new file mode 100644 index 000000000..23157b478 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/api.go @@ -0,0 +1,136 @@ +package regalloc + +import "fmt" + +// These interfaces are implemented by ISA-specific backends to abstract away the details, and allow the register +// allocators to work on any ISA. +// +// TODO: the interfaces are not stabilized yet, especially x64 will need some changes. E.g. x64 has an addressing mode +// where index can be in memory. 
That kind of info will be useful to reduce the register pressure, and should be leveraged
+// by the register allocators, like https://docs.rs/regalloc2/latest/regalloc2/enum.OperandConstraint.html
+
+type (
+ // Function is the top-level interface to do register allocation, which corresponds to a CFG containing
+ // Block(s).
+ Function interface {
+ // PostOrderBlockIteratorBegin returns the first block in the post-order traversal of the CFG.
+ // In other words, the last blocks in the CFG will be returned first.
+ PostOrderBlockIteratorBegin() Block
+ // PostOrderBlockIteratorNext returns the next block in the post-order traversal of the CFG.
+ PostOrderBlockIteratorNext() Block
+ // ReversePostOrderBlockIteratorBegin returns the first block in the reverse post-order traversal of the CFG.
+ // In other words, the first blocks in the CFG will be returned first.
+ ReversePostOrderBlockIteratorBegin() Block
+ // ReversePostOrderBlockIteratorNext returns the next block in the reverse post-order traversal of the CFG.
+ ReversePostOrderBlockIteratorNext() Block
+ // ClobberedRegisters tells the backend which registers are clobbered by this function.
+ ClobberedRegisters([]VReg)
+ // LoopNestingForestRoots returns the number of roots of the loop nesting forest in a function.
+ LoopNestingForestRoots() int
+ // LoopNestingForestRoot returns the i-th root of the loop nesting forest in a function.
+ LoopNestingForestRoot(i int) Block
+ // LowestCommonAncestor returns the lowest common ancestor of two blocks in the dominator tree.
+ LowestCommonAncestor(blk1, blk2 Block) Block
+ // Idom returns the immediate dominator of the given block.
+ Idom(blk Block) Block
+
+ // The following methods are used to rewrite the function.
+
+ // SwapBefore swaps the two virtual registers just before the given instruction, using tmp as a temporary if needed.
+ SwapBefore(x1, x2, tmp VReg, instr Instr)
+ // StoreRegisterBefore inserts store instruction(s) before the given instruction for the given virtual register.
+ StoreRegisterBefore(v VReg, instr Instr)
+ // StoreRegisterAfter inserts store instruction(s) after the given instruction for the given virtual register.
+ StoreRegisterAfter(v VReg, instr Instr)
+ // ReloadRegisterBefore inserts reload instruction(s) before the given instruction for the given virtual register.
+ ReloadRegisterBefore(v VReg, instr Instr)
+ // ReloadRegisterAfter inserts reload instruction(s) after the given instruction for the given virtual register.
+ ReloadRegisterAfter(v VReg, instr Instr)
+ // InsertMoveBefore inserts move instruction(s) before the given instruction for the given virtual registers.
+ InsertMoveBefore(dst, src VReg, instr Instr)
+ }
+
+ // Block is a basic block in the CFG of a function, and it consists of multiple instructions, and predecessor Block(s).
+ Block interface {
+ // ID returns the unique identifier of this block which is ordered in the reverse post-order traversal of the CFG.
+ ID() int32
+ // BlockParams returns the virtual registers used as the parameters of this block.
+ BlockParams(*[]VReg) []VReg
+ // InstrIteratorBegin returns the first instruction in this block. Instructions added after lowering must be skipped.
+ // Note: multiple Instr(s) will not be held at the same time, so it's safe to use the same impl for the return Instr.
+ InstrIteratorBegin() Instr
+ // InstrIteratorNext returns the next instruction in this block. Instructions added after lowering must be skipped.
+ // Note: multiple Instr(s) will not be held at the same time, so it's safe to use the same impl for the return Instr.
+ InstrIteratorNext() Instr + // InstrRevIteratorBegin is the same as InstrIteratorBegin, but in the reverse order. + InstrRevIteratorBegin() Instr + // InstrRevIteratorNext is the same as InstrIteratorNext, but in the reverse order. + InstrRevIteratorNext() Instr + // FirstInstr returns the fist instruction in this block where instructions will be inserted after it. + FirstInstr() Instr + // EndInstr returns the end instruction in this block. + EndInstr() Instr + // LastInstrForInsertion returns the last instruction in this block where instructions will be inserted before it. + // Such insertions only happen when we need to insert spill/reload instructions to adjust the merge edges. + // At the time of register allocation, all the critical edges are already split, so there is no need + // to worry about the case where branching instruction has multiple successors. + // Therefore, usually, it is the nop instruction, but if the block ends with an unconditional branching, then it returns + // the unconditional branch, not the nop. In other words it is either nop or unconditional branch. + LastInstrForInsertion() Instr + // Preds returns the number of predecessors of this block in the CFG. + Preds() int + // Pred returns the i-th predecessor of this block in the CFG. + Pred(i int) Block + // Entry returns true if the block is for the entry block. + Entry() bool + // Succs returns the number of successors of this block in the CFG. + Succs() int + // Succ returns the i-th successor of this block in the CFG. + Succ(i int) Block + // LoopHeader returns true if this block is a loop header. + LoopHeader() bool + // LoopNestingForestChildren returns the number of children of this block in the loop nesting forest. + LoopNestingForestChildren() int + // LoopNestingForestChild returns the i-th child of this block in the loop nesting forest. + LoopNestingForestChild(i int) Block + } + + // Instr is an instruction in a block, abstracting away the underlying ISA. + Instr interface { + fmt.Stringer + // Next returns the next instruction in the same block. + Next() Instr + // Prev returns the previous instruction in the same block. + Prev() Instr + // Defs returns the virtual registers defined by this instruction. + Defs(*[]VReg) []VReg + // Uses returns the virtual registers used by this instruction. + // Note: multiple returned []VReg will not be held at the same time, so it's safe to use the same slice for this. + Uses(*[]VReg) []VReg + // AssignUse assigns the RealReg-allocated virtual register used by this instruction at the given index. + AssignUse(index int, v VReg) + // AssignDef assigns a RealReg-allocated virtual register defined by this instruction. + // This only accepts one register because we don't allocate registers for multi-def instructions (i.e. call instruction) + AssignDef(VReg) + // IsCopy returns true if this instruction is a move instruction between two registers. + // If true, the instruction is of the form of dst = src, and if the src and dst do not interfere with each other, + // we could coalesce them, and hence the copy can be eliminated from the final code. + IsCopy() bool + // IsCall returns true if this instruction is a call instruction. The result is used to insert + // caller saved register spills and restores. + IsCall() bool + // IsIndirectCall returns true if this instruction is an indirect call instruction which calls a function pointer. + // The result is used to insert caller saved register spills and restores. 
+ IsIndirectCall() bool + // IsReturn returns true if this instruction is a return instruction. + IsReturn() bool + // AddedBeforeRegAlloc returns true if this instruction is added before register allocation. + AddedBeforeRegAlloc() bool + } + + // InstrConstraint is an interface for arch-specific instruction constraints. + InstrConstraint interface { + comparable + Instr + } +) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/reg.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/reg.go new file mode 100644 index 000000000..46df807e6 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/reg.go @@ -0,0 +1,123 @@ +package regalloc + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// VReg represents a register which is assigned to an SSA value. This is used to represent a register in the backend. +// A VReg may or may not be a physical register, and the info of physical register can be obtained by RealReg. +type VReg uint64 + +// VRegID is the lower 32bit of VReg, which is the pure identifier of VReg without RealReg info. +type VRegID uint32 + +// RealReg returns the RealReg of this VReg. +func (v VReg) RealReg() RealReg { + return RealReg(v >> 32) +} + +// IsRealReg returns true if this VReg is backed by a physical register. +func (v VReg) IsRealReg() bool { + return v.RealReg() != RealRegInvalid +} + +// FromRealReg returns a VReg from the given RealReg and RegType. +// This is used to represent a specific pre-colored register in the backend. +func FromRealReg(r RealReg, typ RegType) VReg { + rid := VRegID(r) + if rid > vRegIDReservedForRealNum { + panic(fmt.Sprintf("invalid real reg %d", r)) + } + return VReg(r).SetRealReg(r).SetRegType(typ) +} + +// SetRealReg sets the RealReg of this VReg and returns the updated VReg. +func (v VReg) SetRealReg(r RealReg) VReg { + return VReg(r)<<32 | (v & 0xff_00_ffffffff) +} + +// RegType returns the RegType of this VReg. +func (v VReg) RegType() RegType { + return RegType(v >> 40) +} + +// SetRegType sets the RegType of this VReg and returns the updated VReg. +func (v VReg) SetRegType(t RegType) VReg { + return VReg(t)<<40 | (v & 0x00_ff_ffffffff) +} + +// ID returns the VRegID of this VReg. +func (v VReg) ID() VRegID { + return VRegID(v & 0xffffffff) +} + +// Valid returns true if this VReg is Valid. +func (v VReg) Valid() bool { + return v.ID() != vRegIDInvalid && v.RegType() != RegTypeInvalid +} + +// RealReg represents a physical register. +type RealReg byte + +const RealRegInvalid RealReg = 0 + +const ( + vRegIDInvalid VRegID = 1 << 31 + VRegIDNonReservedBegin = vRegIDReservedForRealNum + vRegIDReservedForRealNum VRegID = 128 + VRegInvalid = VReg(vRegIDInvalid) +) + +// String implements fmt.Stringer. +func (r RealReg) String() string { + switch r { + case RealRegInvalid: + return "invalid" + default: + return fmt.Sprintf("r%d", r) + } +} + +// String implements fmt.Stringer. +func (v VReg) String() string { + if v.IsRealReg() { + return fmt.Sprintf("r%d", v.ID()) + } + return fmt.Sprintf("v%d?", v.ID()) +} + +// RegType represents the type of a register. +type RegType byte + +const ( + RegTypeInvalid RegType = iota + RegTypeInt + RegTypeFloat + NumRegType +) + +// String implements fmt.Stringer. 
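The setters above pack three fields into a single 64-bit VReg: the VRegID in the low 32 bits, the RealReg in bits 32-39, and the RegType in bits 40-47. A small standalone sketch of that round trip, using plain uint64 arithmetic and made-up values rather than the vendored types:

package main

import "fmt"

func main() {
	// Mirror regalloc.VReg's layout: ID in bits 0-31, RealReg in bits 32-39, RegType in bits 40-47.
	const (
		id      uint64 = 7 // would be a VRegID
		realReg uint64 = 3 // would be a RealReg
		regType uint64 = 1 // would be RegTypeInt
	)
	v := id | realReg<<32 | regType<<40

	fmt.Println(v&0xffffffff, (v>>32)&0xff, v>>40) // 7 3 1, i.e. ID(), RealReg(), RegType()
}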
+func (r RegType) String() string { + switch r { + case RegTypeInt: + return "int" + case RegTypeFloat: + return "float" + default: + return "invalid" + } +} + +// RegTypeOf returns the RegType of the given ssa.Type. +func RegTypeOf(p ssa.Type) RegType { + switch p { + case ssa.TypeI32, ssa.TypeI64: + return RegTypeInt + case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: + return RegTypeFloat + default: + panic("invalid type") + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go new file mode 100644 index 000000000..b4450d56f --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go @@ -0,0 +1,1212 @@ +// Package regalloc performs register allocation. The algorithm can work on any ISA by implementing the interfaces in +// api.go. +// +// References: +// - https://web.stanford.edu/class/archive/cs/cs143/cs143.1128/lectures/17/Slides17.pdf +// - https://en.wikipedia.org/wiki/Chaitin%27s_algorithm +// - https://llvm.org/ProjectsWithLLVM/2004-Fall-CS426-LS.pdf +// - https://pfalcon.github.io/ssabook/latest/book-full.pdf: Chapter 9. for liveness analysis. +// - https://github.com/golang/go/blob/release-branch.go1.21/src/cmd/compile/internal/ssa/regalloc.go +package regalloc + +import ( + "fmt" + "math" + "strings" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// NewAllocator returns a new Allocator. +func NewAllocator(allocatableRegs *RegisterInfo) Allocator { + a := Allocator{ + regInfo: allocatableRegs, + phiDefInstListPool: wazevoapi.NewPool[phiDefInstList](resetPhiDefInstList), + blockStates: wazevoapi.NewIDedPool[blockState](resetBlockState), + } + a.state.vrStates = wazevoapi.NewIDedPool[vrState](resetVrState) + a.state.reset() + for _, regs := range allocatableRegs.AllocatableRegisters { + for _, r := range regs { + a.allocatableSet = a.allocatableSet.add(r) + } + } + return a +} + +type ( + // RegisterInfo holds the statically-known ISA-specific register information. + RegisterInfo struct { + // AllocatableRegisters is a 2D array of allocatable RealReg, indexed by regTypeNum and regNum. + // The order matters: the first element is the most preferred one when allocating. + AllocatableRegisters [NumRegType][]RealReg + CalleeSavedRegisters RegSet + CallerSavedRegisters RegSet + RealRegToVReg []VReg + // RealRegName returns the name of the given RealReg for debugging. + RealRegName func(r RealReg) string + RealRegType func(r RealReg) RegType + } + + // Allocator is a register allocator. + Allocator struct { + // regInfo is static per ABI/ISA, and is initialized by the machine during Machine.PrepareRegisterAllocator. + regInfo *RegisterInfo + // allocatableSet is a set of allocatable RealReg derived from regInfo. Static per ABI/ISA. + allocatableSet RegSet + allocatedCalleeSavedRegs []VReg + vs []VReg + vs2 []VRegID + phiDefInstListPool wazevoapi.Pool[phiDefInstList] + + // Followings are re-used during various places. + blks []Block + reals []RealReg + currentOccupants regInUseSet + + // Following two fields are updated while iterating the blocks in the reverse postorder. + state state + blockStates wazevoapi.IDedPool[blockState] + } + + // programCounter represents an opaque index into the program which is used to represents a LiveInterval of a VReg. 
+ programCounter int32 + + state struct { + argRealRegs []VReg + regsInUse regInUseSet + vrStates wazevoapi.IDedPool[vrState] + + currentBlockID int32 + + // allocatedRegSet is a set of RealReg that are allocated during the allocation phase. This is reset per function. + allocatedRegSet RegSet + } + + blockState struct { + // liveIns is a list of VReg that are live at the beginning of the block. + liveIns []VRegID + // seen is true if the block is visited during the liveness analysis. + seen bool + // visited is true if the block is visited during the allocation phase. + visited bool + startFromPredIndex int + // startRegs is a list of RealReg that are used at the beginning of the block. This is used to fix the merge edges. + startRegs regInUseSet + // endRegs is a list of RealReg that are used at the end of the block. This is used to fix the merge edges. + endRegs regInUseSet + } + + vrState struct { + v VReg + r RealReg + // defInstr is the instruction that defines this value. If this is the phi value and not the entry block, this is nil. + defInstr Instr + // defBlk is the block that defines this value. If this is the phi value, this is the block whose arguments contain this value. + defBlk Block + // lca = lowest common ancestor. This is the block that is the lowest common ancestor of all the blocks that + // reloads this value. This is used to determine the spill location. Only valid if spilled=true. + lca Block + // lastUse is the program counter of the last use of this value. This changes while iterating the block, and + // should not be used across the blocks as it becomes invalid. To check the validity, use lastUseUpdatedAtBlockID. + lastUse programCounter + lastUseUpdatedAtBlockID int32 + // spilled is true if this value is spilled i.e. the value is reload from the stack somewhere in the program. + // + // Note that this field is used during liveness analysis for different purpose. This is used to determine the + // value is live-in or not. + spilled bool + // isPhi is true if this is a phi value. + isPhi bool + desiredLoc desiredLoc + // phiDefInstList is a list of instructions that defines this phi value. + // This is used to determine the spill location, and only valid if isPhi=true. + *phiDefInstList + } + + // phiDefInstList is a linked list of instructions that defines a phi value. + phiDefInstList struct { + instr Instr + v VReg + next *phiDefInstList + } + + // desiredLoc represents a desired location for a VReg. + desiredLoc uint16 + // desiredLocKind is a kind of desired location for a VReg. + desiredLocKind uint16 +) + +const ( + // desiredLocKindUnspecified is a kind of desired location for a VReg that is not specified. + desiredLocKindUnspecified desiredLocKind = iota + // desiredLocKindStack is a kind of desired location for a VReg that is on the stack, only used for the phi values. + desiredLocKindStack + // desiredLocKindReg is a kind of desired location for a VReg that is in a register. 
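+ // The kind lives in the low two bits of desiredLoc; for desiredLocKindReg, the RealReg is
+ // packed into the bits above them (see newDesiredLocReg and desiredLoc.realReg below).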
+ desiredLocKindReg + desiredLocUnspecified = desiredLoc(desiredLocKindUnspecified) + desiredLocStack = desiredLoc(desiredLocKindStack) +) + +func newDesiredLocReg(r RealReg) desiredLoc { + return desiredLoc(desiredLocKindReg) | desiredLoc(r<<2) +} + +func (d desiredLoc) realReg() RealReg { + return RealReg(d >> 2) +} + +func (d desiredLoc) stack() bool { + return d&3 == desiredLoc(desiredLocKindStack) +} + +func resetPhiDefInstList(l *phiDefInstList) { + l.instr = nil + l.next = nil + l.v = VRegInvalid +} + +func (s *state) dump(info *RegisterInfo) { //nolint:unused + fmt.Println("\t\tstate:") + fmt.Println("\t\t\targRealRegs:", s.argRealRegs) + fmt.Println("\t\t\tregsInUse", s.regsInUse.format(info)) + fmt.Println("\t\t\tallocatedRegSet:", s.allocatedRegSet.format(info)) + fmt.Println("\t\t\tused:", s.regsInUse.format(info)) + var strs []string + for i := 0; i <= s.vrStates.MaxIDEncountered(); i++ { + vs := s.vrStates.Get(i) + if vs == nil { + continue + } + if vs.r != RealRegInvalid { + strs = append(strs, fmt.Sprintf("(v%d: %s)", vs.v.ID(), info.RealRegName(vs.r))) + } + } + fmt.Println("\t\t\tvrStates:", strings.Join(strs, ", ")) +} + +func (s *state) reset() { + s.argRealRegs = s.argRealRegs[:0] + s.vrStates.Reset() + s.allocatedRegSet = RegSet(0) + s.regsInUse.reset() + s.currentBlockID = -1 +} + +func (s *state) setVRegState(v VReg, r RealReg) { + id := int(v.ID()) + st := s.vrStates.GetOrAllocate(id) + st.r = r + st.v = v +} + +func resetVrState(vs *vrState) { + vs.v = VRegInvalid + vs.r = RealRegInvalid + vs.defInstr = nil + vs.defBlk = nil + vs.spilled = false + vs.lastUse = -1 + vs.lastUseUpdatedAtBlockID = -1 + vs.lca = nil + vs.isPhi = false + vs.phiDefInstList = nil + vs.desiredLoc = desiredLocUnspecified +} + +func (s *state) getVRegState(v VRegID) *vrState { + return s.vrStates.GetOrAllocate(int(v)) +} + +func (s *state) useRealReg(r RealReg, v VReg) { + if s.regsInUse.has(r) { + panic("BUG: useRealReg: the given real register is already used") + } + s.regsInUse.add(r, v) + s.setVRegState(v, r) + s.allocatedRegSet = s.allocatedRegSet.add(r) +} + +func (s *state) releaseRealReg(r RealReg) { + current := s.regsInUse.get(r) + if current.Valid() { + s.regsInUse.remove(r) + s.setVRegState(current, RealRegInvalid) + } +} + +// recordReload records that the given VReg is reloaded in the given block. +// This is used to determine the spill location by tracking the lowest common ancestor of all the blocks that reloads the value. +func (vs *vrState) recordReload(f Function, blk Block) { + vs.spilled = true + if vs.lca == nil { + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\t\tv%d is reloaded in blk%d,\n", vs.v.ID(), blk.ID()) + } + vs.lca = blk + } else { + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\t\tv%d is reloaded in blk%d, lca=%d\n", vs.v.ID(), blk.ID(), vs.lca.ID()) + } + vs.lca = f.LowestCommonAncestor(vs.lca, blk) + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("updated lca=%d\n", vs.lca.ID()) + } + } +} + +func (s *state) findOrSpillAllocatable(a *Allocator, allocatable []RealReg, forbiddenMask RegSet, preferred RealReg) (r RealReg) { + r = RealRegInvalid + // First, check if the preferredMask has any allocatable register. + if preferred != RealRegInvalid && !forbiddenMask.has(preferred) && !s.regsInUse.has(preferred) { + for _, candidateReal := range allocatable { + // TODO: we should ensure the preferred register is in the allocatable set in the first place, + // but right now, just in case, we check it here. 
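+ // Returning here confirms the preferred register is both free and allocatable. Otherwise, the
+ // loop further below picks a spill victim: the occupied register whose value has the furthest
+ // recorded last use (or is not used anymore), never one that is backed by a real register.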
+ if candidateReal == preferred { + return preferred + } + } + } + + var lastUseAt programCounter + var spillVReg VReg + for _, candidateReal := range allocatable { + if forbiddenMask.has(candidateReal) { + continue + } + + using := s.regsInUse.get(candidateReal) + if using == VRegInvalid { + // This is not used at this point. + return candidateReal + } + + // Real registers in use should not be spilled, so we skip them. + // For example, if the register is used as an argument register, and it might be + // spilled and not reloaded when it ends up being used as a temporary to pass + // stack based argument. + if using.IsRealReg() { + continue + } + + isPreferred := candidateReal == preferred + + // last == -1 means the value won't be used anymore. + if last := s.getVRegState(using.ID()).lastUse; r == RealRegInvalid || isPreferred || last == -1 || (lastUseAt != -1 && last > lastUseAt) { + lastUseAt = last + r = candidateReal + spillVReg = using + if isPreferred { + break + } + } + } + + if r == RealRegInvalid { + panic("not found any allocatable register") + } + + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\tspilling v%d when lastUseAt=%d and regsInUse=%s\n", spillVReg.ID(), lastUseAt, s.regsInUse.format(a.regInfo)) + } + s.releaseRealReg(r) + return r +} + +func (s *state) findAllocatable(allocatable []RealReg, forbiddenMask RegSet) RealReg { + for _, r := range allocatable { + if !s.regsInUse.has(r) && !forbiddenMask.has(r) { + return r + } + } + return RealRegInvalid +} + +func (s *state) resetAt(bs *blockState) { + s.regsInUse.range_(func(_ RealReg, vr VReg) { + s.setVRegState(vr, RealRegInvalid) + }) + s.regsInUse.reset() + bs.endRegs.range_(func(r RealReg, v VReg) { + id := int(v.ID()) + st := s.vrStates.GetOrAllocate(id) + if st.lastUseUpdatedAtBlockID == s.currentBlockID && st.lastUse == programCounterLiveIn { + s.regsInUse.add(r, v) + s.setVRegState(v, r) + } + }) +} + +func resetBlockState(b *blockState) { + b.seen = false + b.visited = false + b.endRegs.reset() + b.startRegs.reset() + b.startFromPredIndex = -1 + b.liveIns = b.liveIns[:0] +} + +func (b *blockState) dump(a *RegisterInfo) { + fmt.Println("\t\tblockState:") + fmt.Println("\t\t\tstartRegs:", b.startRegs.format(a)) + fmt.Println("\t\t\tendRegs:", b.endRegs.format(a)) + fmt.Println("\t\t\tstartFromPredIndex:", b.startFromPredIndex) + fmt.Println("\t\t\tvisited:", b.visited) +} + +// DoAllocation performs register allocation on the given Function. +func (a *Allocator) DoAllocation(f Function) { + a.livenessAnalysis(f) + a.alloc(f) + a.determineCalleeSavedRealRegs(f) +} + +func (a *Allocator) determineCalleeSavedRealRegs(f Function) { + a.allocatedCalleeSavedRegs = a.allocatedCalleeSavedRegs[:0] + a.state.allocatedRegSet.Range(func(allocatedRealReg RealReg) { + if a.regInfo.CalleeSavedRegisters.has(allocatedRealReg) { + a.allocatedCalleeSavedRegs = append(a.allocatedCalleeSavedRegs, a.regInfo.RealRegToVReg[allocatedRealReg]) + } + }) + f.ClobberedRegisters(a.allocatedCalleeSavedRegs) +} + +func (a *Allocator) getOrAllocateBlockState(blockID int32) *blockState { + return a.blockStates.GetOrAllocate(int(blockID)) +} + +// phiBlk returns the block that defines the given phi value, nil otherwise. +func (s *state) phiBlk(v VRegID) Block { + vs := s.getVRegState(v) + if vs.isPhi { + return vs.defBlk + } + return nil +} + +const ( + programCounterLiveIn = math.MinInt32 + programCounterLiveOut = math.MaxInt32 +) + +// liveAnalysis constructs Allocator.blockLivenessData. 
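+// In short, each block is scanned backwards: a use marks a value live, a definition kills it, and
+// the live-ins of the already-seen successors (minus the successors' own phi params) are merged in.
+// loopTreeDFS then patches loop bodies up so that values live across a loop stay live throughout it.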
+// The algorithm here is described in https://pfalcon.github.io/ssabook/latest/book-full.pdf Chapter 9.2. +func (a *Allocator) livenessAnalysis(f Function) { + s := &a.state + for blk := f.PostOrderBlockIteratorBegin(); blk != nil; blk = f.PostOrderBlockIteratorNext() { // Order doesn't matter. + + // We should gather phi value data. + for _, p := range blk.BlockParams(&a.vs) { + vs := s.getVRegState(p.ID()) + vs.isPhi = true + vs.defBlk = blk + } + } + + for blk := f.PostOrderBlockIteratorBegin(); blk != nil; blk = f.PostOrderBlockIteratorNext() { + blkID := blk.ID() + info := a.getOrAllocateBlockState(blkID) + + a.vs2 = a.vs2[:0] + const ( + flagDeleted = false + flagLive = true + ) + ns := blk.Succs() + for i := 0; i < ns; i++ { + succ := blk.Succ(i) + if succ == nil { + continue + } + + succID := succ.ID() + succInfo := a.getOrAllocateBlockState(succID) + if !succInfo.seen { // This means the back edge. + continue + } + + for _, v := range succInfo.liveIns { + if s.phiBlk(v) != succ { + st := s.getVRegState(v) + // We use .spilled field to store the flag. + st.spilled = flagLive + a.vs2 = append(a.vs2, v) + } + } + } + + for instr := blk.InstrRevIteratorBegin(); instr != nil; instr = blk.InstrRevIteratorNext() { + + var use, def VReg + for _, def = range instr.Defs(&a.vs) { + if !def.IsRealReg() { + id := def.ID() + st := s.getVRegState(id) + // We use .spilled field to store the flag. + st.spilled = flagDeleted + a.vs2 = append(a.vs2, id) + } + } + for _, use = range instr.Uses(&a.vs) { + if !use.IsRealReg() { + id := use.ID() + st := s.getVRegState(id) + // We use .spilled field to store the flag. + st.spilled = flagLive + a.vs2 = append(a.vs2, id) + } + } + + if def.Valid() && s.phiBlk(def.ID()) != nil { + if use.Valid() && use.IsRealReg() { + // If the destination is a phi value, and the source is a real register, this is the beginning of the function. + a.state.argRealRegs = append(a.state.argRealRegs, use) + } + } + } + + for _, v := range a.vs2 { + st := s.getVRegState(v) + // We use .spilled field to store the flag. + if st.spilled == flagLive { //nolint:gosimple + info.liveIns = append(info.liveIns, v) + st.spilled = false + } + } + + info.seen = true + } + + nrs := f.LoopNestingForestRoots() + for i := 0; i < nrs; i++ { + root := f.LoopNestingForestRoot(i) + a.loopTreeDFS(root) + } +} + +// loopTreeDFS implements the Algorithm 9.3 in the book in an iterative way. +func (a *Allocator) loopTreeDFS(entry Block) { + a.blks = a.blks[:0] + a.blks = append(a.blks, entry) + + s := &a.state + for len(a.blks) > 0 { + tail := len(a.blks) - 1 + loop := a.blks[tail] + a.blks = a.blks[:tail] + a.vs2 = a.vs2[:0] + const ( + flagDone = false + flagPending = true + ) + info := a.getOrAllocateBlockState(loop.ID()) + for _, v := range info.liveIns { + if s.phiBlk(v) != loop { + a.vs2 = append(a.vs2, v) + st := s.getVRegState(v) + // We use .spilled field to store the flag. + st.spilled = flagPending + } + } + + var siblingAddedView []VRegID + cn := loop.LoopNestingForestChildren() + for i := 0; i < cn; i++ { + child := loop.LoopNestingForestChild(i) + childID := child.ID() + childInfo := a.getOrAllocateBlockState(childID) + + if i == 0 { + begin := len(childInfo.liveIns) + for _, v := range a.vs2 { + st := s.getVRegState(v) + // We use .spilled field to store the flag. + if st.spilled == flagPending { //nolint:gosimple + st.spilled = flagDone + // TODO: deduplicate, though I don't think it has much impact. 
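+ // Propagate this live-in of the loop header to the child as well, so that values which are
+ // live across the loop remain live throughout the entire loop body.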
+ childInfo.liveIns = append(childInfo.liveIns, v) + } + } + siblingAddedView = childInfo.liveIns[begin:] + } else { + // TODO: deduplicate, though I don't think it has much impact. + childInfo.liveIns = append(childInfo.liveIns, siblingAddedView...) + } + + if child.LoopHeader() { + a.blks = append(a.blks, child) + } + } + + if cn == 0 { + // If there's no forest child, we haven't cleared the .spilled field at this point. + for _, v := range a.vs2 { + st := s.getVRegState(v) + st.spilled = false + } + } + } +} + +// alloc allocates registers for the given function by iterating the blocks in the reverse postorder. +// The algorithm here is derived from the Go compiler's allocator https://github.com/golang/go/blob/release-branch.go1.21/src/cmd/compile/internal/ssa/regalloc.go +// In short, this is a simply linear scan register allocation where each block inherits the register allocation state from +// one of its predecessors. Each block inherits the selected state and starts allocation from there. +// If there's a discrepancy in the end states between predecessors, the adjustments are made to ensure consistency after allocation is done (which we call "fixing merge state"). +// The spill instructions (store into the dedicated slots) are inserted after all the allocations and fixing merge states. That is because +// at the point, we all know where the reloads happen, and therefore we can know the best place to spill the values. More precisely, +// the spill happens in the block that is the lowest common ancestor of all the blocks that reloads the value. +// +// All of these logics are almost the same as Go's compiler which has a dedicated description in the source file ^^. +func (a *Allocator) alloc(f Function) { + // First we allocate each block in the reverse postorder (at least one predecessor should be allocated for each block). + for blk := f.ReversePostOrderBlockIteratorBegin(); blk != nil; blk = f.ReversePostOrderBlockIteratorNext() { + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("========== allocating blk%d ========\n", blk.ID()) + } + if blk.Entry() { + a.finalizeStartReg(blk) + } + a.allocBlock(f, blk) + } + // After the allocation, we all know the start and end state of each block. So we can fix the merge states. + for blk := f.ReversePostOrderBlockIteratorBegin(); blk != nil; blk = f.ReversePostOrderBlockIteratorNext() { + a.fixMergeState(f, blk) + } + // Finally, we insert the spill instructions as we know all the places where the reloads happen. + a.scheduleSpills(f) +} + +func (a *Allocator) updateLiveInVRState(liveness *blockState) { + currentBlockID := a.state.currentBlockID + for _, v := range liveness.liveIns { + vs := a.state.getVRegState(v) + vs.lastUse = programCounterLiveIn + vs.lastUseUpdatedAtBlockID = currentBlockID + } +} + +func (a *Allocator) finalizeStartReg(blk Block) { + bID := blk.ID() + liveness := a.getOrAllocateBlockState(bID) + s := &a.state + currentBlkState := a.getOrAllocateBlockState(bID) + if currentBlkState.startFromPredIndex > -1 { + return + } + + s.currentBlockID = bID + a.updateLiveInVRState(liveness) + + preds := blk.Preds() + var predState *blockState + switch preds { + case 0: // This is the entry block. + case 1: + predID := blk.Pred(0).ID() + predState = a.getOrAllocateBlockState(predID) + currentBlkState.startFromPredIndex = 0 + default: + // TODO: there should be some better heuristic to choose the predecessor. 
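+ // For now, simply inherit the end state of the first predecessor that has already been
+ // allocated; the reverse post-order traversal means at least one such predecessor should
+ // exist for any non-entry block (the panic below guards that assumption).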
+ for i := 0; i < preds; i++ { + predID := blk.Pred(i).ID() + if _predState := a.getOrAllocateBlockState(predID); _predState.visited { + predState = _predState + currentBlkState.startFromPredIndex = i + break + } + } + } + if predState == nil { + if !blk.Entry() { + panic(fmt.Sprintf("BUG: at lease one predecessor should be visited for blk%d", blk.ID())) + } + for _, u := range s.argRealRegs { + s.useRealReg(u.RealReg(), u) + } + currentBlkState.startFromPredIndex = 0 + } else if predState != nil { + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("allocating blk%d starting from blk%d (on index=%d) \n", + bID, blk.Pred(currentBlkState.startFromPredIndex).ID(), currentBlkState.startFromPredIndex) + } + s.resetAt(predState) + } + + s.regsInUse.range_(func(allocated RealReg, v VReg) { + currentBlkState.startRegs.add(allocated, v) + }) + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("finalized start reg for blk%d: %s\n", blk.ID(), currentBlkState.startRegs.format(a.regInfo)) + } +} + +func (a *Allocator) allocBlock(f Function, blk Block) { + bID := blk.ID() + s := &a.state + currentBlkState := a.getOrAllocateBlockState(bID) + s.currentBlockID = bID + + if currentBlkState.startFromPredIndex < 0 { + panic("BUG: startFromPredIndex should be set in finalizeStartReg prior to allocBlock") + } + + // Clears the previous state. + s.regsInUse.range_(func(allocatedRealReg RealReg, vr VReg) { + s.setVRegState(vr, RealRegInvalid) + }) + s.regsInUse.reset() + // Then set the start state. + currentBlkState.startRegs.range_(func(allocatedRealReg RealReg, vr VReg) { + s.useRealReg(allocatedRealReg, vr) + }) + + desiredUpdated := a.vs2[:0] + + // Update the last use of each VReg. + var pc programCounter + for instr := blk.InstrIteratorBegin(); instr != nil; instr = blk.InstrIteratorNext() { + var use, def VReg + for _, use = range instr.Uses(&a.vs) { + if !use.IsRealReg() { + s.getVRegState(use.ID()).lastUse = pc + } + } + + if instr.IsCopy() { + def = instr.Defs(&a.vs)[0] + r := def.RealReg() + if r != RealRegInvalid { + useID := use.ID() + vs := s.getVRegState(useID) + if !vs.isPhi { // TODO: no idea why do we need this. + vs.desiredLoc = newDesiredLocReg(r) + desiredUpdated = append(desiredUpdated, useID) + } + } + } + pc++ + } + + // Mark all live-out values by checking live-in of the successors. + // While doing so, we also update the desired register values. + var succ Block + for i, ns := 0, blk.Succs(); i < ns; i++ { + succ = blk.Succ(i) + if succ == nil { + continue + } + + succID := succ.ID() + succState := a.getOrAllocateBlockState(succID) + for _, v := range succState.liveIns { + if s.phiBlk(v) != succ { + st := s.getVRegState(v) + st.lastUse = programCounterLiveOut + } + } + + if succState.startFromPredIndex > -1 { + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("blk%d -> blk%d: start_regs: %s\n", bID, succID, succState.startRegs.format(a.regInfo)) + } + succState.startRegs.range_(func(allocatedRealReg RealReg, vr VReg) { + vs := s.getVRegState(vr.ID()) + vs.desiredLoc = newDesiredLocReg(allocatedRealReg) + desiredUpdated = append(desiredUpdated, vr.ID()) + }) + for _, p := range succ.BlockParams(&a.vs) { + vs := s.getVRegState(p.ID()) + if vs.desiredLoc.realReg() == RealRegInvalid { + vs.desiredLoc = desiredLocStack + desiredUpdated = append(desiredUpdated, p.ID()) + } + } + } + } + + // Propagate the desired register values from the end of the block to the beginning. 
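+ // Walking the instructions backwards, a copy transfers the desired register of its destination
+ // to its source (unless the source already has a desired location or is a phi param of the
+ // successor), so that the source can later be allocated directly into the register the copy
+ // wants, turning the copy into a no-op that can be coalesced away.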
+ for instr := blk.InstrRevIteratorBegin(); instr != nil; instr = blk.InstrRevIteratorNext() { + if instr.IsCopy() { + def := instr.Defs(&a.vs)[0] + defState := s.getVRegState(def.ID()) + desired := defState.desiredLoc.realReg() + if desired == RealRegInvalid { + continue + } + + use := instr.Uses(&a.vs)[0] + useID := use.ID() + useState := s.getVRegState(useID) + if s.phiBlk(useID) != succ && useState.desiredLoc == desiredLocUnspecified { + useState.desiredLoc = newDesiredLocReg(desired) + desiredUpdated = append(desiredUpdated, useID) + } + } + } + + pc = 0 + for instr := blk.InstrIteratorBegin(); instr != nil; instr = blk.InstrIteratorNext() { + if wazevoapi.RegAllocLoggingEnabled { + fmt.Println(instr) + } + + var currentUsedSet RegSet + killSet := a.reals[:0] + + // Gather the set of registers that will be used in the current instruction. + for _, use := range instr.Uses(&a.vs) { + if use.IsRealReg() { + r := use.RealReg() + currentUsedSet = currentUsedSet.add(r) + if a.allocatableSet.has(r) { + killSet = append(killSet, r) + } + } else { + vs := s.getVRegState(use.ID()) + if r := vs.r; r != RealRegInvalid { + currentUsedSet = currentUsedSet.add(r) + } + } + } + + for i, use := range instr.Uses(&a.vs) { + if !use.IsRealReg() { + vs := s.getVRegState(use.ID()) + killed := vs.lastUse == pc + r := vs.r + + if r == RealRegInvalid { + r = s.findOrSpillAllocatable(a, a.regInfo.AllocatableRegisters[use.RegType()], currentUsedSet, + // Prefer the desired register if it's available. + vs.desiredLoc.realReg()) + vs.recordReload(f, blk) + f.ReloadRegisterBefore(use.SetRealReg(r), instr) + s.useRealReg(r, use) + } + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\ttrying to use v%v on %s\n", use.ID(), a.regInfo.RealRegName(r)) + } + instr.AssignUse(i, use.SetRealReg(r)) + currentUsedSet = currentUsedSet.add(r) + if killed { + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\tkill v%d with %s\n", use.ID(), a.regInfo.RealRegName(r)) + } + killSet = append(killSet, r) + } + } + } + + isIndirect := instr.IsIndirectCall() + call := instr.IsCall() || isIndirect + if call { + addr := RealRegInvalid + if instr.IsIndirectCall() { + addr = a.vs[0].RealReg() + } + a.releaseCallerSavedRegs(addr) + } + + for _, r := range killSet { + s.releaseRealReg(r) + } + a.reals = killSet + + defs := instr.Defs(&a.vs) + switch { + case len(defs) > 1: + // Some instructions define multiple values on real registers. + // E.g. call instructions (following calling convention) / div instruction on x64 that defines both rax and rdx. + // + // Note that currently I assume that such instructions define only the pre colored real registers, not the VRegs + // that require allocations. If we need to support such case, we need to add the logic to handle it here, + // though is there any such instruction? 
+ for _, def := range defs { + if !def.IsRealReg() { + panic("BUG: multiple defs should be on real registers") + } + r := def.RealReg() + if s.regsInUse.has(r) { + s.releaseRealReg(r) + } + s.useRealReg(r, def) + } + case len(defs) == 1: + def := defs[0] + if def.IsRealReg() { + r := def.RealReg() + if a.allocatableSet.has(r) { + if s.regsInUse.has(r) { + s.releaseRealReg(r) + } + s.useRealReg(r, def) + } + } else { + vState := s.getVRegState(def.ID()) + r := vState.r + + if desired := vState.desiredLoc.realReg(); desired != RealRegInvalid { + if r != desired { + if (vState.isPhi && vState.defBlk == succ) || + // If this is not a phi and it's already assigned a real reg, + // this value has multiple definitions, hence we cannot assign the desired register. + (!s.regsInUse.has(desired) && r == RealRegInvalid) { + // If the phi value is passed via a real register, we force the value to be in the desired register. + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\t\tv%d is phi and desiredReg=%s\n", def.ID(), a.regInfo.RealRegName(desired)) + } + if r != RealRegInvalid { + // If the value is already in a different real register, we release it to change the state. + // Otherwise, multiple registers might have the same values at the end, which results in + // messing up the merge state reconciliation. + s.releaseRealReg(r) + } + r = desired + s.releaseRealReg(r) + s.useRealReg(r, def) + } + } + } + + // Allocate a new real register if `def` is not currently assigned one. + // It can happen when multiple instructions define the same VReg (e.g. const loads). + if r == RealRegInvalid { + if instr.IsCopy() { + copySrc := instr.Uses(&a.vs)[0].RealReg() + if a.allocatableSet.has(copySrc) && !s.regsInUse.has(copySrc) { + r = copySrc + } + } + if r == RealRegInvalid { + typ := def.RegType() + r = s.findOrSpillAllocatable(a, a.regInfo.AllocatableRegisters[typ], RegSet(0), RealRegInvalid) + } + s.useRealReg(r, def) + } + dr := def.SetRealReg(r) + instr.AssignDef(dr) + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\tdefining v%d with %s\n", def.ID(), a.regInfo.RealRegName(r)) + } + if vState.isPhi { + if vState.desiredLoc.stack() { // Stack based phi value. + f.StoreRegisterAfter(dr, instr) + // Release the real register as it's not used anymore. + s.releaseRealReg(r) + } else { + // Only the register based phis are necessary to track the defining instructions + // since the stack-based phis are already having stores inserted ^. + n := a.phiDefInstListPool.Allocate() + n.instr = instr + n.next = vState.phiDefInstList + n.v = dr + vState.phiDefInstList = n + } + } else { + vState.defInstr = instr + vState.defBlk = blk + } + } + } + if wazevoapi.RegAllocLoggingEnabled { + fmt.Println(instr) + } + pc++ + } + + s.regsInUse.range_(func(allocated RealReg, v VReg) { + currentBlkState.endRegs.add(allocated, v) + }) + + currentBlkState.visited = true + if wazevoapi.RegAllocLoggingEnabled { + currentBlkState.dump(a.regInfo) + } + + // Reset the desired end location. + for _, v := range desiredUpdated { + vs := s.getVRegState(v) + vs.desiredLoc = desiredLocUnspecified + } + a.vs2 = desiredUpdated[:0] + + for i := 0; i < blk.Succs(); i++ { + succ := blk.Succ(i) + if succ == nil { + continue + } + // If the successor is not visited yet, finalize the start state. 
+ a.finalizeStartReg(succ) + } +} + +func (a *Allocator) releaseCallerSavedRegs(addrReg RealReg) { + s := &a.state + + for i := 0; i < 64; i++ { + allocated := RealReg(i) + if allocated == addrReg { // If this is the call indirect, we should not touch the addr register. + continue + } + if v := s.regsInUse.get(allocated); v.Valid() { + if v.IsRealReg() { + continue // This is the argument register as it's already used by VReg backed by the corresponding RealReg. + } + if !a.regInfo.CallerSavedRegisters.has(allocated) { + // If this is not a caller-saved register, it is safe to keep it across the call. + continue + } + s.releaseRealReg(allocated) + } + } +} + +func (a *Allocator) fixMergeState(f Function, blk Block) { + preds := blk.Preds() + if preds <= 1 { + return + } + + s := &a.state + + // Restores the state at the beginning of the block. + bID := blk.ID() + blkSt := a.getOrAllocateBlockState(bID) + desiredOccupants := &blkSt.startRegs + aliveOnRegVRegs := make(map[VReg]RealReg) + for i := 0; i < 64; i++ { + r := RealReg(i) + if v := blkSt.startRegs.get(r); v.Valid() { + aliveOnRegVRegs[v] = r + } + } + + if wazevoapi.RegAllocLoggingEnabled { + fmt.Println("fixMergeState", blk.ID(), ":", desiredOccupants.format(a.regInfo)) + } + + s.currentBlockID = bID + a.updateLiveInVRState(a.getOrAllocateBlockState(bID)) + + currentOccupants := &a.currentOccupants + for i := 0; i < preds; i++ { + currentOccupants.reset() + if i == blkSt.startFromPredIndex { + continue + } + + currentOccupantsRev := make(map[VReg]RealReg) + pred := blk.Pred(i) + predSt := a.getOrAllocateBlockState(pred.ID()) + for ii := 0; ii < 64; ii++ { + r := RealReg(ii) + if v := predSt.endRegs.get(r); v.Valid() { + if _, ok := aliveOnRegVRegs[v]; !ok { + continue + } + currentOccupants.add(r, v) + currentOccupantsRev[v] = r + } + } + + s.resetAt(predSt) + + // Finds the free registers if any. + intTmp, floatTmp := VRegInvalid, VRegInvalid + if intFree := s.findAllocatable( + a.regInfo.AllocatableRegisters[RegTypeInt], desiredOccupants.set, + ); intFree != RealRegInvalid { + intTmp = FromRealReg(intFree, RegTypeInt) + } + if floatFree := s.findAllocatable( + a.regInfo.AllocatableRegisters[RegTypeFloat], desiredOccupants.set, + ); floatFree != RealRegInvalid { + floatTmp = FromRealReg(floatFree, RegTypeFloat) + } + + if wazevoapi.RegAllocLoggingEnabled { + fmt.Println("\t", pred.ID(), ":", currentOccupants.format(a.regInfo)) + } + + for ii := 0; ii < 64; ii++ { + r := RealReg(ii) + desiredVReg := desiredOccupants.get(r) + if !desiredVReg.Valid() { + continue + } + + currentVReg := currentOccupants.get(r) + if desiredVReg.ID() == currentVReg.ID() { + continue + } + + typ := desiredVReg.RegType() + var tmpRealReg VReg + if typ == RegTypeInt { + tmpRealReg = intTmp + } else { + tmpRealReg = floatTmp + } + a.reconcileEdge(f, r, pred, currentOccupants, currentOccupantsRev, currentVReg, desiredVReg, tmpRealReg, typ) + } + } +} + +func (a *Allocator) reconcileEdge(f Function, + r RealReg, + pred Block, + currentOccupants *regInUseSet, + currentOccupantsRev map[VReg]RealReg, + currentVReg, desiredVReg VReg, + freeReg VReg, + typ RegType, +) { + s := &a.state + if currentVReg.Valid() { + // Both are on reg. + er, ok := currentOccupantsRev[desiredVReg] + if !ok { + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\t\tv%d is desired to be on %s, but currently on the stack\n", + desiredVReg.ID(), a.regInfo.RealRegName(r), + ) + } + // This case is that the desired value is on the stack, but currentVReg is on the target register. 
+ // We need to move the current value to the stack, and reload the desired value. + // TODO: we can do better here. + f.StoreRegisterBefore(currentVReg.SetRealReg(r), pred.LastInstrForInsertion()) + delete(currentOccupantsRev, currentVReg) + + s.getVRegState(desiredVReg.ID()).recordReload(f, pred) + f.ReloadRegisterBefore(desiredVReg.SetRealReg(r), pred.LastInstrForInsertion()) + currentOccupants.add(r, desiredVReg) + currentOccupantsRev[desiredVReg] = r + return + } + + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\t\tv%d is desired to be on %s, but currently on %s\n", + desiredVReg.ID(), a.regInfo.RealRegName(r), a.regInfo.RealRegName(er), + ) + } + f.SwapBefore( + currentVReg.SetRealReg(r), + desiredVReg.SetRealReg(er), + freeReg, + pred.LastInstrForInsertion(), + ) + s.allocatedRegSet = s.allocatedRegSet.add(freeReg.RealReg()) + currentOccupantsRev[desiredVReg] = r + currentOccupantsRev[currentVReg] = er + currentOccupants.add(r, desiredVReg) + currentOccupants.add(er, currentVReg) + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\t\tv%d previously on %s moved to %s\n", currentVReg.ID(), a.regInfo.RealRegName(r), a.regInfo.RealRegName(er)) + } + } else { + // Desired is on reg, but currently the target register is not used. + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\t\tv%d is desired to be on %s, current not used\n", + desiredVReg.ID(), a.regInfo.RealRegName(r), + ) + } + if currentReg, ok := currentOccupantsRev[desiredVReg]; ok { + f.InsertMoveBefore( + FromRealReg(r, typ), + desiredVReg.SetRealReg(currentReg), + pred.LastInstrForInsertion(), + ) + currentOccupants.remove(currentReg) + } else { + s.getVRegState(desiredVReg.ID()).recordReload(f, pred) + f.ReloadRegisterBefore(desiredVReg.SetRealReg(r), pred.LastInstrForInsertion()) + } + currentOccupantsRev[desiredVReg] = r + currentOccupants.add(r, desiredVReg) + } + + if wazevoapi.RegAllocLoggingEnabled { + fmt.Println("\t", pred.ID(), ":", currentOccupants.format(a.regInfo)) + } +} + +func (a *Allocator) scheduleSpills(f Function) { + states := a.state.vrStates + for i := 0; i <= states.MaxIDEncountered(); i++ { + vs := states.Get(i) + if vs == nil { + continue + } + if vs.spilled { + a.scheduleSpill(f, vs) + } + } +} + +func (a *Allocator) scheduleSpill(f Function, vs *vrState) { + v := vs.v + // If the value is the phi value, we need to insert a spill after each phi definition. + if vs.isPhi { + for defInstr := vs.phiDefInstList; defInstr != nil; defInstr = defInstr.next { + f.StoreRegisterAfter(defInstr.v, defInstr.instr) + } + return + } + + pos := vs.lca + definingBlk := vs.defBlk + r := RealRegInvalid + if definingBlk == nil { + panic(fmt.Sprintf("BUG: definingBlk should not be nil for %s. This is likley a bug in backend lowering logic", vs.v.String())) + } + if pos == nil { + panic(fmt.Sprintf("BUG: pos should not be nil for %s. This is likley a bug in backend lowering logic", vs.v.String())) + } + + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("v%d is spilled in blk%d, lca=blk%d\n", v.ID(), definingBlk.ID(), pos.ID()) + } + for pos != definingBlk { + st := a.getOrAllocateBlockState(pos.ID()) + for ii := 0; ii < 64; ii++ { + rr := RealReg(ii) + if st.startRegs.get(rr) == v { + r = rr + // Already in the register, so we can place the spill at the beginning of the block. 
+ break + } + } + + if r != RealRegInvalid { + break + } + + pos = f.Idom(pos) + } + + if pos == definingBlk { + defInstr := vs.defInstr + defInstr.Defs(&a.vs) + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("schedule spill v%d after %v\n", v.ID(), defInstr) + } + f.StoreRegisterAfter(a.vs[0], defInstr) + } else { + // Found an ancestor block that holds the value in the register at the beginning of the block. + // We need to insert a spill before the last use. + first := pos.FirstInstr() + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("schedule spill v%d before %v\n", v.ID(), first) + } + f.StoreRegisterAfter(v.SetRealReg(r), first) + } +} + +// Reset resets the allocator's internal state so that it can be reused. +func (a *Allocator) Reset() { + a.state.reset() + a.blockStates.Reset() + a.phiDefInstListPool.Reset() + a.vs = a.vs[:0] +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go new file mode 100644 index 000000000..e9bf60661 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go @@ -0,0 +1,108 @@ +package regalloc + +import ( + "fmt" + "strings" +) + +// NewRegSet returns a new RegSet with the given registers. +func NewRegSet(regs ...RealReg) RegSet { + var ret RegSet + for _, r := range regs { + ret = ret.add(r) + } + return ret +} + +// RegSet represents a set of registers. +type RegSet uint64 + +func (rs RegSet) format(info *RegisterInfo) string { //nolint:unused + var ret []string + for i := 0; i < 64; i++ { + if rs&(1<<uint(i)) != 0 { + ret = append(ret, info.RealRegName(RealReg(i))) + } + } + return strings.Join(ret, ", ") +} + +func (rs RegSet) has(r RealReg) bool { + return rs&(1<<uint(r)) != 0 +} + +func (rs RegSet) add(r RealReg) RegSet { + if r >= 64 { + return rs + } + return rs | 1<<uint(r) +} + +func (rs RegSet) Range(f func(allocatedRealReg RealReg)) { + for i := 0; i < 64; i++ { + if rs&(1<<uint(i)) != 0 { + f(RealReg(i)) + } + } +} + +type regInUseSet struct { + set RegSet + vrs [64]VReg +} + +func (rs *regInUseSet) reset() { + rs.set = 0 + for i := range rs.vrs { + rs.vrs[i] = VRegInvalid + } +} + +func (rs *regInUseSet) format(info *RegisterInfo) string { //nolint:unused + var ret []string + for i := 0; i < 64; i++ { + if rs.set&(1<<uint(i)) != 0 { + vr := rs.vrs[i] + ret = append(ret, fmt.Sprintf("(%s->v%d)", info.RealRegName(RealReg(i)), vr.ID())) + } + } + return strings.Join(ret, ", ") +} + +func (rs *regInUseSet) has(r RealReg) bool { + if r >= 64 { + return false + } + return rs.set&(1<<uint(r)) != 0 +} + +func (rs *regInUseSet) get(r RealReg) VReg { + if r >= 64 { + return VRegInvalid + } + return rs.vrs[r] +} + +func (rs *regInUseSet) remove(r RealReg) { + if r >= 64 { + return + } + rs.set &= ^(1 << uint(r)) + rs.vrs[r] = VRegInvalid +} + +func (rs *regInUseSet) add(r RealReg, vr VReg) { + if r >= 64 { + return + } + rs.set |= 1 << uint(r) + rs.vrs[r] = vr +} + +func (rs *regInUseSet) range_(f func(allocatedRealReg RealReg, vr VReg)) { + for i := 0; i < 64; i++ { + if rs.set&(1<<uint(i)) != 0 { + f(RealReg(i), rs.vrs[i]) + } + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/vdef.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/vdef.go new file mode 100644 index 000000000..edfa962b5 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/vdef.go @@ -0,0 +1,43 @@ 
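regset.go above represents a register set as a single 64-bit mask, with regInUseSet pairing that mask with a 64-entry array recording which VReg currently occupies each RealReg. A minimal standalone sketch of the same bitset idea, with simplified, made-up names (illustrative only, not the vendored code):

package main

import "fmt"

type realReg byte
type regSet uint64

func (rs regSet) add(r realReg) regSet { return rs | 1<<uint(r) }
func (rs regSet) has(r realReg) bool   { return rs&(1<<uint(r)) != 0 }
func (rs regSet) del(r realReg) regSet { return rs &^ (1 << uint(r)) }

func main() {
	var rs regSet
	rs = rs.add(3).add(10)
	fmt.Println(rs.has(3), rs.has(10), rs.has(4)) // true true false
	fmt.Println(rs.del(3).has(3))                 // false
}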
+package backend + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// SSAValueDefinition represents a definition of an SSA value. +type SSAValueDefinition struct { + // BlockParamValue is valid if Instr == nil + BlockParamValue ssa.Value + + // BlkParamVReg is valid if Instr == nil + BlkParamVReg regalloc.VReg + + // Instr is not nil if this is a definition from an instruction. + Instr *ssa.Instruction + // N is the index of the return value in the instr's return values list. + N int + // RefCount is the number of references to the result. + RefCount int +} + +func (d *SSAValueDefinition) IsFromInstr() bool { + return d.Instr != nil +} + +func (d *SSAValueDefinition) IsFromBlockParam() bool { + return d.Instr == nil +} + +func (d *SSAValueDefinition) SSAValue() ssa.Value { + if d.IsFromBlockParam() { + return d.BlockParamValue + } else { + r, rs := d.Instr.Returns() + if d.N == 0 { + return r + } else { + return rs[d.N-1] + } + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/call_engine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/call_engine.go new file mode 100644 index 000000000..3379c4dde --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/call_engine.go @@ -0,0 +1,722 @@ +package wazevo + +import ( + "context" + "encoding/binary" + "fmt" + "reflect" + "runtime" + "sync/atomic" + "unsafe" + + "github.com/tetratelabs/wazero/api" + "github.com/tetratelabs/wazero/experimental" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" + "github.com/tetratelabs/wazero/internal/expctxkeys" + "github.com/tetratelabs/wazero/internal/internalapi" + "github.com/tetratelabs/wazero/internal/wasm" + "github.com/tetratelabs/wazero/internal/wasmdebug" + "github.com/tetratelabs/wazero/internal/wasmruntime" +) + +type ( + // callEngine implements api.Function. + callEngine struct { + internalapi.WazeroOnly + stack []byte + // stackTop is the pointer to the *aligned* top of the stack. This must be updated + // whenever the stack is changed. This is passed to the assembly function + // at the very beginning of api.Function Call/CallWithStack. + stackTop uintptr + // executable is the pointer to the executable code for this function. + executable *byte + preambleExecutable *byte + // parent is the *moduleEngine from which this callEngine is created. + parent *moduleEngine + // indexInModule is the index of the function in the module. + indexInModule wasm.Index + // sizeOfParamResultSlice is the size of the parameter/result slice. + sizeOfParamResultSlice int + requiredParams int + // execCtx holds various information to be read/written by assembly functions. + execCtx executionContext + // execCtxPtr holds the pointer to the executionContext which doesn't change after callEngine is created. + execCtxPtr uintptr + numberOfResults int + stackIteratorImpl stackIterator + } + + // executionContext is the struct to be read/written by assembly functions. + executionContext struct { + // exitCode holds the wazevoapi.ExitCode describing the state of the function execution. + exitCode wazevoapi.ExitCode + // callerModuleContextPtr holds the moduleContextOpaque for Go function calls. + callerModuleContextPtr *byte + // originalFramePointer holds the original frame pointer of the caller of the assembly function. 
+ originalFramePointer uintptr + // originalStackPointer holds the original stack pointer of the caller of the assembly function. + originalStackPointer uintptr + // goReturnAddress holds the return address to go back to the caller of the assembly function. + goReturnAddress uintptr + // stackBottomPtr holds the pointer to the bottom of the stack. + stackBottomPtr *byte + // goCallReturnAddress holds the return address to go back to the caller of the Go function. + goCallReturnAddress *byte + // stackPointerBeforeGoCall holds the stack pointer before calling a Go function. + stackPointerBeforeGoCall *uint64 + // stackGrowRequiredSize holds the required size of stack grow. + stackGrowRequiredSize uintptr + // memoryGrowTrampolineAddress holds the address of memory grow trampoline function. + memoryGrowTrampolineAddress *byte + // stackGrowCallTrampolineAddress holds the address of stack grow trampoline function. + stackGrowCallTrampolineAddress *byte + // checkModuleExitCodeTrampolineAddress holds the address of check-module-exit-code function. + checkModuleExitCodeTrampolineAddress *byte + // savedRegisters is the opaque space for save/restore registers. + // We want to align 16 bytes for each register, so we use [64][2]uint64. + savedRegisters [64][2]uint64 + // goFunctionCallCalleeModuleContextOpaque is the pointer to the target Go function's moduleContextOpaque. + goFunctionCallCalleeModuleContextOpaque uintptr + // tableGrowTrampolineAddress holds the address of table grow trampoline function. + tableGrowTrampolineAddress *byte + // refFuncTrampolineAddress holds the address of ref-func trampoline function. + refFuncTrampolineAddress *byte + // memmoveAddress holds the address of memmove function implemented by Go runtime. See memmove.go. + memmoveAddress uintptr + // framePointerBeforeGoCall holds the frame pointer before calling a Go function. Note: only used in amd64. + framePointerBeforeGoCall uintptr + // memoryWait32TrampolineAddress holds the address of memory_wait32 trampoline function. + memoryWait32TrampolineAddress *byte + // memoryWait64TrampolineAddress holds the address of memory_wait64 trampoline function. + memoryWait64TrampolineAddress *byte + // memoryNotifyTrampolineAddress holds the address of the memory_notify trampoline function. + memoryNotifyTrampolineAddress *byte + } +) + +func (c *callEngine) requiredInitialStackSize() int { + const initialStackSizeDefault = 10240 + stackSize := initialStackSizeDefault + paramResultInBytes := c.sizeOfParamResultSlice * 8 * 2 // * 8 because uint64 is 8 bytes, and *2 because we need both separated param/result slots. + required := paramResultInBytes + 32 + 16 // 32 is enough to accommodate the call frame info, and 16 exists just in case when []byte is not aligned to 16 bytes. + if required > stackSize { + stackSize = required + } + return stackSize +} + +func (c *callEngine) init() { + stackSize := c.requiredInitialStackSize() + if wazevoapi.StackGuardCheckEnabled { + stackSize += wazevoapi.StackGuardCheckGuardPageSize + } + c.stack = make([]byte, stackSize) + c.stackTop = alignedStackTop(c.stack) + if wazevoapi.StackGuardCheckEnabled { + c.execCtx.stackBottomPtr = &c.stack[wazevoapi.StackGuardCheckGuardPageSize] + } else { + c.execCtx.stackBottomPtr = &c.stack[0] + } + c.execCtxPtr = uintptr(unsafe.Pointer(&c.execCtx)) +} + +// alignedStackTop returns the 16-byte aligned stack top of the given stack. +// 16 bytes should be good for all platforms (arm64/amd64).
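+// As a worked example (hypothetical address, for illustration only): if the last byte of the stack slice sits at 0xc000_1234_3f, then stackAddr&15 == 0xf and the returned top is 0xc000_1234_30, i.e. the address rounded down to the nearest 16-byte boundary inside the slice.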
+func alignedStackTop(s []byte) uintptr { + stackAddr := uintptr(unsafe.Pointer(&s[len(s)-1])) + return stackAddr - (stackAddr & (16 - 1)) +} + +// Definition implements api.Function. +func (c *callEngine) Definition() api.FunctionDefinition { + return c.parent.module.Source.FunctionDefinition(c.indexInModule) +} + +// Call implements api.Function. +func (c *callEngine) Call(ctx context.Context, params ...uint64) ([]uint64, error) { + if c.requiredParams != len(params) { + return nil, fmt.Errorf("expected %d params, but passed %d", c.requiredParams, len(params)) + } + paramResultSlice := make([]uint64, c.sizeOfParamResultSlice) + copy(paramResultSlice, params) + if err := c.callWithStack(ctx, paramResultSlice); err != nil { + return nil, err + } + return paramResultSlice[:c.numberOfResults], nil +} + +func (c *callEngine) addFrame(builder wasmdebug.ErrorBuilder, addr uintptr) (def api.FunctionDefinition, listener experimental.FunctionListener) { + eng := c.parent.parent.parent + cm := eng.compiledModuleOfAddr(addr) + if cm == nil { + // This case, the module might have been closed and deleted from the engine. + // We fall back to searching the imported modules that can be referenced from this callEngine. + + // First, we check itself. + if checkAddrInBytes(addr, c.parent.parent.executable) { + cm = c.parent.parent + } else { + // Otherwise, search all imported modules. TODO: maybe recursive, but not sure it's useful in practice. + p := c.parent + for i := range p.importedFunctions { + candidate := p.importedFunctions[i].me.parent + if checkAddrInBytes(addr, candidate.executable) { + cm = candidate + break + } + } + } + } + + if cm != nil { + index := cm.functionIndexOf(addr) + def = cm.module.FunctionDefinition(cm.module.ImportFunctionCount + index) + var sources []string + if dw := cm.module.DWARFLines; dw != nil { + sourceOffset := cm.getSourceOffset(addr) + sources = dw.Line(sourceOffset) + } + builder.AddFrame(def.DebugName(), def.ParamTypes(), def.ResultTypes(), sources) + if len(cm.listeners) > 0 { + listener = cm.listeners[index] + } + } + return +} + +// CallWithStack implements api.Function. +func (c *callEngine) CallWithStack(ctx context.Context, paramResultStack []uint64) (err error) { + if c.sizeOfParamResultSlice > len(paramResultStack) { + return fmt.Errorf("need %d params, but stack size is %d", c.sizeOfParamResultSlice, len(paramResultStack)) + } + return c.callWithStack(ctx, paramResultStack) +} + +// CallWithStack implements api.Function. +func (c *callEngine) callWithStack(ctx context.Context, paramResultStack []uint64) (err error) { + snapshotEnabled := ctx.Value(expctxkeys.EnableSnapshotterKey{}) != nil + if snapshotEnabled { + ctx = context.WithValue(ctx, expctxkeys.SnapshotterKey{}, c) + } + + if wazevoapi.StackGuardCheckEnabled { + defer func() { + wazevoapi.CheckStackGuardPage(c.stack) + }() + } + + p := c.parent + ensureTermination := p.parent.ensureTermination + m := p.module + if ensureTermination { + select { + case <-ctx.Done(): + // If the provided context is already done, close the module and return the error. + m.CloseWithCtxErr(ctx) + return m.FailIfClosed() + default: + } + } + + var paramResultPtr *uint64 + if len(paramResultStack) > 0 { + paramResultPtr = ¶mResultStack[0] + } + defer func() { + r := recover() + if s, ok := r.(*snapshot); ok { + // A snapshot that wasn't handled was created by a different call engine possibly from a nested wasm invocation, + // let it propagate up to be handled by the caller. 
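+ // (Snapshot restoration works by panicking with the *snapshot value: snapshotRecoverFn on the owning callEngine recovers it and re-installs the saved stack via doRestore, so a snapshot owned by another callEngine must be propagated by re-panicking here rather than being treated as an error.)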
+ panic(s) + } + if r != nil { + type listenerForAbort struct { + def api.FunctionDefinition + lsn experimental.FunctionListener + } + + var listeners []listenerForAbort + builder := wasmdebug.NewErrorBuilder() + def, lsn := c.addFrame(builder, uintptr(unsafe.Pointer(c.execCtx.goCallReturnAddress))) + if lsn != nil { + listeners = append(listeners, listenerForAbort{def, lsn}) + } + returnAddrs := unwindStack( + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), + c.execCtx.framePointerBeforeGoCall, + c.stackTop, + nil, + ) + for _, retAddr := range returnAddrs[:len(returnAddrs)-1] { // the last return addr is the trampoline, so we skip it. + def, lsn = c.addFrame(builder, retAddr) + if lsn != nil { + listeners = append(listeners, listenerForAbort{def, lsn}) + } + } + err = builder.FromRecovered(r) + + for _, lsn := range listeners { + lsn.lsn.Abort(ctx, m, lsn.def, err) + } + } else { + if err != wasmruntime.ErrRuntimeStackOverflow { // Stackoverflow case shouldn't be panic (to avoid extreme stack unwinding). + err = c.parent.module.FailIfClosed() + } + } + + if err != nil { + // Ensures that we can reuse this callEngine even after an error. + c.execCtx.exitCode = wazevoapi.ExitCodeOK + } + }() + + if ensureTermination { + done := m.CloseModuleOnCanceledOrTimeout(ctx) + defer done() + } + + if c.stackTop&(16-1) != 0 { + panic("BUG: stack must be aligned to 16 bytes") + } + entrypoint(c.preambleExecutable, c.executable, c.execCtxPtr, c.parent.opaquePtr, paramResultPtr, c.stackTop) + for { + switch ec := c.execCtx.exitCode; ec & wazevoapi.ExitCodeMask { + case wazevoapi.ExitCodeOK: + return nil + case wazevoapi.ExitCodeGrowStack: + oldsp := uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)) + oldTop := c.stackTop + oldStack := c.stack + var newsp, newfp uintptr + if wazevoapi.StackGuardCheckEnabled { + newsp, newfp, err = c.growStackWithGuarded() + } else { + newsp, newfp, err = c.growStack() + } + if err != nil { + return err + } + adjustClonedStack(oldsp, oldTop, newsp, newfp, c.stackTop) + // Old stack must be alive until the new stack is adjusted. + runtime.KeepAlive(oldStack) + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, newsp, newfp) + case wazevoapi.ExitCodeGrowMemory: + mod := c.callerModuleInstance() + mem := mod.MemoryInstance + s := goCallStackView(c.execCtx.stackPointerBeforeGoCall) + argRes := &s[0] + if res, ok := mem.Grow(uint32(*argRes)); !ok { + *argRes = uint64(0xffffffff) // = -1 in signed 32-bit integer. + } else { + *argRes = uint64(res) + calleeOpaque := opaqueViewFromPtr(uintptr(unsafe.Pointer(c.execCtx.callerModuleContextPtr))) + if mod.Source.MemorySection != nil { // Local memory. + putLocalMemory(calleeOpaque, 8 /* local memory begins at 8 */, mem) + } else { + // Imported memory's owner at offset 16 of the callerModuleContextPtr. 
+ opaquePtr := uintptr(binary.LittleEndian.Uint64(calleeOpaque[16:])) + importedMemOwner := opaqueViewFromPtr(opaquePtr) + putLocalMemory(importedMemOwner, 8 /* local memory begins at 8 */, mem) + } + } + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeTableGrow: + mod := c.callerModuleInstance() + s := goCallStackView(c.execCtx.stackPointerBeforeGoCall) + tableIndex, num, ref := uint32(s[0]), uint32(s[1]), uintptr(s[2]) + table := mod.Tables[tableIndex] + s[0] = uint64(uint32(int32(table.Grow(num, ref)))) + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeCallGoFunction: + index := wazevoapi.GoFunctionIndexFromExitCode(ec) + f := hostModuleGoFuncFromOpaque[api.GoFunction](index, c.execCtx.goFunctionCallCalleeModuleContextOpaque) + func() { + if snapshotEnabled { + defer snapshotRecoverFn(c) + } + f.Call(ctx, goCallStackView(c.execCtx.stackPointerBeforeGoCall)) + }() + // Back to the native code. + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeCallGoFunctionWithListener: + index := wazevoapi.GoFunctionIndexFromExitCode(ec) + f := hostModuleGoFuncFromOpaque[api.GoFunction](index, c.execCtx.goFunctionCallCalleeModuleContextOpaque) + listeners := hostModuleListenersSliceFromOpaque(c.execCtx.goFunctionCallCalleeModuleContextOpaque) + s := goCallStackView(c.execCtx.stackPointerBeforeGoCall) + // Call Listener.Before. + callerModule := c.callerModuleInstance() + listener := listeners[index] + hostModule := hostModuleFromOpaque(c.execCtx.goFunctionCallCalleeModuleContextOpaque) + def := hostModule.FunctionDefinition(wasm.Index(index)) + listener.Before(ctx, callerModule, def, s, c.stackIterator(true)) + // Call into the Go function. + func() { + if snapshotEnabled { + defer snapshotRecoverFn(c) + } + f.Call(ctx, s) + }() + // Call Listener.After. + listener.After(ctx, callerModule, def, s) + // Back to the native code. + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeCallGoModuleFunction: + index := wazevoapi.GoFunctionIndexFromExitCode(ec) + f := hostModuleGoFuncFromOpaque[api.GoModuleFunction](index, c.execCtx.goFunctionCallCalleeModuleContextOpaque) + mod := c.callerModuleInstance() + func() { + if snapshotEnabled { + defer snapshotRecoverFn(c) + } + f.Call(ctx, mod, goCallStackView(c.execCtx.stackPointerBeforeGoCall)) + }() + // Back to the native code. 
+ c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeCallGoModuleFunctionWithListener: + index := wazevoapi.GoFunctionIndexFromExitCode(ec) + f := hostModuleGoFuncFromOpaque[api.GoModuleFunction](index, c.execCtx.goFunctionCallCalleeModuleContextOpaque) + listeners := hostModuleListenersSliceFromOpaque(c.execCtx.goFunctionCallCalleeModuleContextOpaque) + s := goCallStackView(c.execCtx.stackPointerBeforeGoCall) + // Call Listener.Before. + callerModule := c.callerModuleInstance() + listener := listeners[index] + hostModule := hostModuleFromOpaque(c.execCtx.goFunctionCallCalleeModuleContextOpaque) + def := hostModule.FunctionDefinition(wasm.Index(index)) + listener.Before(ctx, callerModule, def, s, c.stackIterator(true)) + // Call into the Go function. + func() { + if snapshotEnabled { + defer snapshotRecoverFn(c) + } + f.Call(ctx, callerModule, s) + }() + // Call Listener.After. + listener.After(ctx, callerModule, def, s) + // Back to the native code. + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeCallListenerBefore: + stack := goCallStackView(c.execCtx.stackPointerBeforeGoCall) + index := wasm.Index(stack[0]) + mod := c.callerModuleInstance() + listener := mod.Engine.(*moduleEngine).listeners[index] + def := mod.Source.FunctionDefinition(index + mod.Source.ImportFunctionCount) + listener.Before(ctx, mod, def, stack[1:], c.stackIterator(false)) + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeCallListenerAfter: + stack := goCallStackView(c.execCtx.stackPointerBeforeGoCall) + index := wasm.Index(stack[0]) + mod := c.callerModuleInstance() + listener := mod.Engine.(*moduleEngine).listeners[index] + def := mod.Source.FunctionDefinition(index + mod.Source.ImportFunctionCount) + listener.After(ctx, mod, def, stack[1:]) + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeCheckModuleExitCode: + // Note: this operation must be done in Go, not native code. The reason is that + // native code cannot be preempted and that means it can block forever if there are not + // enough OS threads (which we don't have control over). 
+ if err := m.FailIfClosed(); err != nil { + panic(err) + } + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeRefFunc: + mod := c.callerModuleInstance() + s := goCallStackView(c.execCtx.stackPointerBeforeGoCall) + funcIndex := wasm.Index(s[0]) + ref := mod.Engine.FunctionInstanceReference(funcIndex) + s[0] = uint64(ref) + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeMemoryWait32: + mod := c.callerModuleInstance() + mem := mod.MemoryInstance + if !mem.Shared { + panic(wasmruntime.ErrRuntimeExpectedSharedMemory) + } + + s := goCallStackView(c.execCtx.stackPointerBeforeGoCall) + timeout, exp, addr := int64(s[0]), uint32(s[1]), uintptr(s[2]) + base := uintptr(unsafe.Pointer(&mem.Buffer[0])) + + offset := uint32(addr - base) + res := mem.Wait32(offset, exp, timeout, func(mem *wasm.MemoryInstance, offset uint32) uint32 { + addr := unsafe.Add(unsafe.Pointer(&mem.Buffer[0]), offset) + return atomic.LoadUint32((*uint32)(addr)) + }) + s[0] = res + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeMemoryWait64: + mod := c.callerModuleInstance() + mem := mod.MemoryInstance + if !mem.Shared { + panic(wasmruntime.ErrRuntimeExpectedSharedMemory) + } + + s := goCallStackView(c.execCtx.stackPointerBeforeGoCall) + timeout, exp, addr := int64(s[0]), uint64(s[1]), uintptr(s[2]) + base := uintptr(unsafe.Pointer(&mem.Buffer[0])) + + offset := uint32(addr - base) + res := mem.Wait64(offset, exp, timeout, func(mem *wasm.MemoryInstance, offset uint32) uint64 { + addr := unsafe.Add(unsafe.Pointer(&mem.Buffer[0]), offset) + return atomic.LoadUint64((*uint64)(addr)) + }) + s[0] = uint64(res) + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeMemoryNotify: + mod := c.callerModuleInstance() + mem := mod.MemoryInstance + + s := goCallStackView(c.execCtx.stackPointerBeforeGoCall) + count, addr := uint32(s[0]), s[1] + offset := uint32(uintptr(addr) - uintptr(unsafe.Pointer(&mem.Buffer[0]))) + res := mem.Notify(offset, count) + s[0] = uint64(res) + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeUnreachable: + panic(wasmruntime.ErrRuntimeUnreachable) + case wazevoapi.ExitCodeMemoryOutOfBounds: + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + case wazevoapi.ExitCodeTableOutOfBounds: + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + case wazevoapi.ExitCodeIndirectCallNullPointer: + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + case wazevoapi.ExitCodeIndirectCallTypeMismatch: + panic(wasmruntime.ErrRuntimeIndirectCallTypeMismatch) + case wazevoapi.ExitCodeIntegerOverflow: + panic(wasmruntime.ErrRuntimeIntegerOverflow) + case 
wazevoapi.ExitCodeIntegerDivisionByZero: + panic(wasmruntime.ErrRuntimeIntegerDivideByZero) + case wazevoapi.ExitCodeInvalidConversionToInteger: + panic(wasmruntime.ErrRuntimeInvalidConversionToInteger) + case wazevoapi.ExitCodeUnalignedAtomic: + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + default: + panic("BUG") + } + } +} + +func (c *callEngine) callerModuleInstance() *wasm.ModuleInstance { + return moduleInstanceFromOpaquePtr(c.execCtx.callerModuleContextPtr) +} + +func opaqueViewFromPtr(ptr uintptr) []byte { + var opaque []byte + sh := (*reflect.SliceHeader)(unsafe.Pointer(&opaque)) + sh.Data = ptr + setSliceLimits(sh, 24, 24) + return opaque +} + +const callStackCeiling = uintptr(50000000) // in uint64 (8 bytes) == 400000000 bytes in total == 400mb. + +func (c *callEngine) growStackWithGuarded() (newSP uintptr, newFP uintptr, err error) { + if wazevoapi.StackGuardCheckEnabled { + wazevoapi.CheckStackGuardPage(c.stack) + } + newSP, newFP, err = c.growStack() + if err != nil { + return + } + if wazevoapi.StackGuardCheckEnabled { + c.execCtx.stackBottomPtr = &c.stack[wazevoapi.StackGuardCheckGuardPageSize] + } + return +} + +// growStack grows the stack, and returns the new stack pointer. +func (c *callEngine) growStack() (newSP, newFP uintptr, err error) { + currentLen := uintptr(len(c.stack)) + if callStackCeiling < currentLen { + err = wasmruntime.ErrRuntimeStackOverflow + return + } + + newLen := 2*currentLen + c.execCtx.stackGrowRequiredSize + 16 // Stack might be aligned to 16 bytes, so add 16 bytes just in case. + newSP, newFP, c.stackTop, c.stack = c.cloneStack(newLen) + c.execCtx.stackBottomPtr = &c.stack[0] + return +} + +func (c *callEngine) cloneStack(l uintptr) (newSP, newFP, newTop uintptr, newStack []byte) { + newStack = make([]byte, l) + + relSp := c.stackTop - uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)) + relFp := c.stackTop - c.execCtx.framePointerBeforeGoCall + + // Copy the existing contents in the previous Go-allocated stack into the new one. + var prevStackAligned, newStackAligned []byte + { + sh := (*reflect.SliceHeader)(unsafe.Pointer(&prevStackAligned)) + sh.Data = c.stackTop - relSp + setSliceLimits(sh, relSp, relSp) + } + newTop = alignedStackTop(newStack) + { + newSP = newTop - relSp + newFP = newTop - relFp + sh := (*reflect.SliceHeader)(unsafe.Pointer(&newStackAligned)) + sh.Data = newSP + setSliceLimits(sh, relSp, relSp) + } + copy(newStackAligned, prevStackAligned) + return +} + +func (c *callEngine) stackIterator(onHostCall bool) experimental.StackIterator { + c.stackIteratorImpl.reset(c, onHostCall) + return &c.stackIteratorImpl +} + +// stackIterator implements experimental.StackIterator. +type stackIterator struct { + retAddrs []uintptr + retAddrCursor int + eng *engine + pc uint64 + + currentDef *wasm.FunctionDefinition +} + +func (si *stackIterator) reset(c *callEngine, onHostCall bool) { + if onHostCall { + si.retAddrs = append(si.retAddrs[:0], uintptr(unsafe.Pointer(c.execCtx.goCallReturnAddress))) + } else { + si.retAddrs = si.retAddrs[:0] + } + si.retAddrs = unwindStack(uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall, c.stackTop, si.retAddrs) + si.retAddrs = si.retAddrs[:len(si.retAddrs)-1] // the last return addr is the trampoline, so we skip it. + si.retAddrCursor = 0 + si.eng = c.parent.parent.parent +} + +// Next implements the same method as documented on experimental.StackIterator. 
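+// A typical (hypothetical) consumer is an experimental.FunctionListener whose Before hook drains the iterator with a plain loop such as: for si.Next() { def := si.Function().Definition(); _ = def.DebugName() }. +// The walk ends once a return address no longer falls inside any live compiled module, since Next then returns false for that frame.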
+func (si *stackIterator) Next() bool { + if si.retAddrCursor >= len(si.retAddrs) { + return false + } + + addr := si.retAddrs[si.retAddrCursor] + cm := si.eng.compiledModuleOfAddr(addr) + if cm != nil { + index := cm.functionIndexOf(addr) + def := cm.module.FunctionDefinition(cm.module.ImportFunctionCount + index) + si.currentDef = def + si.retAddrCursor++ + si.pc = uint64(addr) + return true + } + return false +} + +// ProgramCounter implements the same method as documented on experimental.StackIterator. +func (si *stackIterator) ProgramCounter() experimental.ProgramCounter { + return experimental.ProgramCounter(si.pc) +} + +// Function implements the same method as documented on experimental.StackIterator. +func (si *stackIterator) Function() experimental.InternalFunction { + return si +} + +// Definition implements the same method as documented on experimental.InternalFunction. +func (si *stackIterator) Definition() api.FunctionDefinition { + return si.currentDef +} + +// SourceOffsetForPC implements the same method as documented on experimental.InternalFunction. +func (si *stackIterator) SourceOffsetForPC(pc experimental.ProgramCounter) uint64 { + upc := uintptr(pc) + cm := si.eng.compiledModuleOfAddr(upc) + return cm.getSourceOffset(upc) +} + +// snapshot implements experimental.Snapshot +type snapshot struct { + sp, fp, top uintptr + returnAddress *byte + stack []byte + savedRegisters [64][2]uint64 + ret []uint64 + c *callEngine +} + +// Snapshot implements the same method as documented on experimental.Snapshotter. +func (c *callEngine) Snapshot() experimental.Snapshot { + returnAddress := c.execCtx.goCallReturnAddress + oldTop, oldSp := c.stackTop, uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)) + newSP, newFP, newTop, newStack := c.cloneStack(uintptr(len(c.stack)) + 16) + adjustClonedStack(oldSp, oldTop, newSP, newFP, newTop) + return &snapshot{ + sp: newSP, + fp: newFP, + top: newTop, + savedRegisters: c.execCtx.savedRegisters, + returnAddress: returnAddress, + stack: newStack, + c: c, + } +} + +// Restore implements the same method as documented on experimental.Snapshot. +func (s *snapshot) Restore(ret []uint64) { + s.ret = ret + panic(s) +} + +func (s *snapshot) doRestore() { + spp := *(**uint64)(unsafe.Pointer(&s.sp)) + view := goCallStackView(spp) + copy(view, s.ret) + + c := s.c + c.stack = s.stack + c.stackTop = s.top + ec := &c.execCtx + ec.stackBottomPtr = &c.stack[0] + ec.stackPointerBeforeGoCall = spp + ec.framePointerBeforeGoCall = s.fp + ec.goCallReturnAddress = s.returnAddress + ec.savedRegisters = s.savedRegisters +} + +// Error implements the same method on error. 
+func (s *snapshot) Error() string { + return "unhandled snapshot restore, this generally indicates restore was called from a different " + + "exported function invocation than snapshot" +} + +func snapshotRecoverFn(c *callEngine) { + if r := recover(); r != nil { + if s, ok := r.(*snapshot); ok && s.c == c { + s.doRestore() + } else { + panic(r) + } + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/engine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/engine.go new file mode 100644 index 000000000..f02b905fc --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/engine.go @@ -0,0 +1,843 @@ +package wazevo + +import ( + "context" + "encoding/hex" + "errors" + "fmt" + "runtime" + "sort" + "sync" + "unsafe" + + "github.com/tetratelabs/wazero/api" + "github.com/tetratelabs/wazero/experimental" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/frontend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" + "github.com/tetratelabs/wazero/internal/filecache" + "github.com/tetratelabs/wazero/internal/platform" + "github.com/tetratelabs/wazero/internal/version" + "github.com/tetratelabs/wazero/internal/wasm" +) + +type ( + // engine implements wasm.Engine. + engine struct { + wazeroVersion string + fileCache filecache.Cache + compiledModules map[wasm.ModuleID]*compiledModule + // sortedCompiledModules is a list of compiled modules sorted by the initial address of the executable. + sortedCompiledModules []*compiledModule + mux sync.RWMutex + // sharedFunctions is compiled functions shared by all modules. + sharedFunctions *sharedFunctions + // setFinalizer defaults to runtime.SetFinalizer, but overridable for tests. + setFinalizer func(obj interface{}, finalizer interface{}) + + // The followings are reused for compiling shared functions. + machine backend.Machine + be backend.Compiler + } + + sharedFunctions struct { + // memoryGrowExecutable is a compiled trampoline executable for memory.grow builtin function. + memoryGrowExecutable []byte + // checkModuleExitCode is a compiled trampoline executable for checking module instance exit code. This + // is used when ensureTermination is true. + checkModuleExitCode []byte + // stackGrowExecutable is a compiled executable for growing stack builtin function. + stackGrowExecutable []byte + // tableGrowExecutable is a compiled trampoline executable for table.grow builtin function. + tableGrowExecutable []byte + // refFuncExecutable is a compiled trampoline executable for ref.func builtin function. + refFuncExecutable []byte + // memoryWait32Executable is a compiled trampoline executable for memory.wait32 builtin function + memoryWait32Executable []byte + // memoryWait64Executable is a compiled trampoline executable for memory.wait64 builtin function + memoryWait64Executable []byte + // memoryNotifyExecutable is a compiled trampoline executable for memory.notify builtin function + memoryNotifyExecutable []byte + listenerBeforeTrampolines map[*wasm.FunctionType][]byte + listenerAfterTrampolines map[*wasm.FunctionType][]byte + } + + // compiledModule is a compiled variant of a wasm.Module and ready to be used for instantiation. + compiledModule struct { + *executables + // functionOffsets maps a local function index to the offset in the executable. 
+ functionOffsets []int + parent *engine + module *wasm.Module + ensureTermination bool + listeners []experimental.FunctionListener + listenerBeforeTrampolines []*byte + listenerAfterTrampolines []*byte + + // The followings are only available for non host modules. + + offsets wazevoapi.ModuleContextOffsetData + sharedFunctions *sharedFunctions + sourceMap sourceMap + } + + executables struct { + executable []byte + entryPreambles [][]byte + } +) + +// sourceMap is a mapping from the offset of the executable to the offset of the original wasm binary. +type sourceMap struct { + // executableOffsets is a sorted list of offsets of the executable. This is index-correlated with wasmBinaryOffsets, + // in other words executableOffsets[i] is the offset of the executable which corresponds to the offset of a Wasm + // binary pointed by wasmBinaryOffsets[i]. + executableOffsets []uintptr + // wasmBinaryOffsets is the counterpart of executableOffsets. + wasmBinaryOffsets []uint64 +} + +var _ wasm.Engine = (*engine)(nil) + +// NewEngine returns the implementation of wasm.Engine. +func NewEngine(ctx context.Context, _ api.CoreFeatures, fc filecache.Cache) wasm.Engine { + machine := newMachine() + be := backend.NewCompiler(ctx, machine, ssa.NewBuilder()) + e := &engine{ + compiledModules: make(map[wasm.ModuleID]*compiledModule), + setFinalizer: runtime.SetFinalizer, + machine: machine, + be: be, + fileCache: fc, + wazeroVersion: version.GetWazeroVersion(), + } + e.compileSharedFunctions() + return e +} + +// CompileModule implements wasm.Engine. +func (e *engine) CompileModule(ctx context.Context, module *wasm.Module, listeners []experimental.FunctionListener, ensureTermination bool) (err error) { + if wazevoapi.PerfMapEnabled { + wazevoapi.PerfMap.Lock() + defer wazevoapi.PerfMap.Unlock() + } + + if _, ok, err := e.getCompiledModule(module, listeners, ensureTermination); ok { // cache hit! 
+ return nil + } else if err != nil { + return err + } + + if wazevoapi.DeterministicCompilationVerifierEnabled { + ctx = wazevoapi.NewDeterministicCompilationVerifierContext(ctx, len(module.CodeSection)) + } + cm, err := e.compileModule(ctx, module, listeners, ensureTermination) + if err != nil { + return err + } + if err = e.addCompiledModule(module, cm); err != nil { + return err + } + + if wazevoapi.DeterministicCompilationVerifierEnabled { + for i := 0; i < wazevoapi.DeterministicCompilationVerifyingIter; i++ { + _, err := e.compileModule(ctx, module, listeners, ensureTermination) + if err != nil { + return err + } + } + } + + if len(listeners) > 0 { + cm.listeners = listeners + cm.listenerBeforeTrampolines = make([]*byte, len(module.TypeSection)) + cm.listenerAfterTrampolines = make([]*byte, len(module.TypeSection)) + for i := range module.TypeSection { + typ := &module.TypeSection[i] + before, after := e.getListenerTrampolineForType(typ) + cm.listenerBeforeTrampolines[i] = before + cm.listenerAfterTrampolines[i] = after + } + } + return nil +} + +func (exec *executables) compileEntryPreambles(m *wasm.Module, machine backend.Machine, be backend.Compiler) { + exec.entryPreambles = make([][]byte, len(m.TypeSection)) + for i := range m.TypeSection { + typ := &m.TypeSection[i] + sig := frontend.SignatureForWasmFunctionType(typ) + be.Init() + buf := machine.CompileEntryPreamble(&sig) + executable := mmapExecutable(buf) + exec.entryPreambles[i] = executable + + if wazevoapi.PerfMapEnabled { + wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&executable[0])), + uint64(len(executable)), fmt.Sprintf("entry_preamble::type=%s", typ.String())) + } + } +} + +func (e *engine) compileModule(ctx context.Context, module *wasm.Module, listeners []experimental.FunctionListener, ensureTermination bool) (*compiledModule, error) { + withListener := len(listeners) > 0 + cm := &compiledModule{ + offsets: wazevoapi.NewModuleContextOffsetData(module, withListener), parent: e, module: module, + ensureTermination: ensureTermination, + executables: &executables{}, + } + + if module.IsHostModule { + return e.compileHostModule(ctx, module, listeners) + } + + importedFns, localFns := int(module.ImportFunctionCount), len(module.FunctionSection) + if localFns == 0 { + return cm, nil + } + + rels := make([]backend.RelocationInfo, 0) + refToBinaryOffset := make([]int, importedFns+localFns) + + if wazevoapi.DeterministicCompilationVerifierEnabled { + // The compilation must be deterministic regardless of the order of functions being compiled. + wazevoapi.DeterministicCompilationVerifierRandomizeIndexes(ctx) + } + + needSourceInfo := module.DWARFLines != nil + + // Creates new compiler instances which are reused for each function. + ssaBuilder := ssa.NewBuilder() + fe := frontend.NewFrontendCompiler(module, ssaBuilder, &cm.offsets, ensureTermination, withListener, needSourceInfo) + machine := newMachine() + be := backend.NewCompiler(ctx, machine, ssaBuilder) + + cm.executables.compileEntryPreambles(module, machine, be) + + totalSize := 0 // Total binary size of the executable. + cm.functionOffsets = make([]int, localFns) + bodies := make([][]byte, localFns) + + // Trampoline relocation related variables. + trampolineInterval, callTrampolineIslandSize, err := machine.CallTrampolineIslandInfo(localFns) + if err != nil { + return nil, err + } + needCallTrampoline := callTrampolineIslandSize > 0 + var callTrampolineIslandOffsets []int // Holds the offsets of trampoline islands. 
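+ // Trampoline islands (a summary of the mechanism below, not additional logic): on targets whose near-call range is limited, CallTrampolineIslandInfo reports a positive island size, and the loop below reserves an island every trampolineInterval bytes of emitted code so that ResolveRelocations can reach any callee either directly or through the nearest island.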
+ + for i := range module.CodeSection { + if wazevoapi.DeterministicCompilationVerifierEnabled { + i = wazevoapi.DeterministicCompilationVerifierGetRandomizedLocalFunctionIndex(ctx, i) + } + + fidx := wasm.Index(i + importedFns) + + if wazevoapi.NeedFunctionNameInContext { + def := module.FunctionDefinition(fidx) + name := def.DebugName() + if len(def.ExportNames()) > 0 { + name = def.ExportNames()[0] + } + ctx = wazevoapi.SetCurrentFunctionName(ctx, i, fmt.Sprintf("[%d/%d]%s", i, len(module.CodeSection)-1, name)) + } + + needListener := len(listeners) > 0 && listeners[i] != nil + body, relsPerFunc, err := e.compileLocalWasmFunction(ctx, module, wasm.Index(i), fe, ssaBuilder, be, needListener) + if err != nil { + return nil, fmt.Errorf("compile function %d/%d: %v", i, len(module.CodeSection)-1, err) + } + + // Align 16-bytes boundary. + totalSize = (totalSize + 15) &^ 15 + cm.functionOffsets[i] = totalSize + + if needSourceInfo { + // At the beginning of the function, we add the offset of the function body so that + // we can resolve the source location of the call site of before listener call. + cm.sourceMap.executableOffsets = append(cm.sourceMap.executableOffsets, uintptr(totalSize)) + cm.sourceMap.wasmBinaryOffsets = append(cm.sourceMap.wasmBinaryOffsets, module.CodeSection[i].BodyOffsetInCodeSection) + + for _, info := range be.SourceOffsetInfo() { + cm.sourceMap.executableOffsets = append(cm.sourceMap.executableOffsets, uintptr(totalSize)+uintptr(info.ExecutableOffset)) + cm.sourceMap.wasmBinaryOffsets = append(cm.sourceMap.wasmBinaryOffsets, uint64(info.SourceOffset)) + } + } + + fref := frontend.FunctionIndexToFuncRef(fidx) + refToBinaryOffset[fref] = totalSize + + // At this point, relocation offsets are relative to the start of the function body, + // so we adjust it to the start of the executable. + for _, r := range relsPerFunc { + r.Offset += int64(totalSize) + rels = append(rels, r) + } + + bodies[i] = body + totalSize += len(body) + if wazevoapi.PrintMachineCodeHexPerFunction { + fmt.Printf("[[[machine code for %s]]]\n%s\n\n", wazevoapi.GetCurrentFunctionName(ctx), hex.EncodeToString(body)) + } + + if needCallTrampoline { + // If the total size exceeds the trampoline interval, we need to add a trampoline island. + if totalSize/trampolineInterval > len(callTrampolineIslandOffsets) { + callTrampolineIslandOffsets = append(callTrampolineIslandOffsets, totalSize) + totalSize += callTrampolineIslandSize + } + } + } + + // Allocate executable memory and then copy the generated machine code. + executable, err := platform.MmapCodeSegment(totalSize) + if err != nil { + panic(err) + } + cm.executable = executable + + for i, b := range bodies { + offset := cm.functionOffsets[i] + copy(executable[offset:], b) + } + + if wazevoapi.PerfMapEnabled { + wazevoapi.PerfMap.Flush(uintptr(unsafe.Pointer(&executable[0])), cm.functionOffsets) + } + + if needSourceInfo { + for i := range cm.sourceMap.executableOffsets { + cm.sourceMap.executableOffsets[i] += uintptr(unsafe.Pointer(&cm.executable[0])) + } + } + + // Resolve relocations for local function calls. + if len(rels) > 0 { + machine.ResolveRelocations(refToBinaryOffset, executable, rels, callTrampolineIslandOffsets) + } + + if runtime.GOARCH == "arm64" { + // On arm64, we cannot give all of rwx at the same time, so we change it to exec. 
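+ // (The segment is mapped writable while the machine code is copied in above and is only flipped to read+execute here, mirroring what mmapExecutable does for the shared trampolines.)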
+ if err = platform.MprotectRX(executable); err != nil { + return nil, err + } + } + cm.sharedFunctions = e.sharedFunctions + e.setFinalizer(cm.executables, executablesFinalizer) + return cm, nil +} + +func (e *engine) compileLocalWasmFunction( + ctx context.Context, + module *wasm.Module, + localFunctionIndex wasm.Index, + fe *frontend.Compiler, + ssaBuilder ssa.Builder, + be backend.Compiler, + needListener bool, +) (body []byte, rels []backend.RelocationInfo, err error) { + typIndex := module.FunctionSection[localFunctionIndex] + typ := &module.TypeSection[typIndex] + codeSeg := &module.CodeSection[localFunctionIndex] + + // Initializes both frontend and backend compilers. + fe.Init(localFunctionIndex, typIndex, typ, codeSeg.LocalTypes, codeSeg.Body, needListener, codeSeg.BodyOffsetInCodeSection) + be.Init() + + // Lower Wasm to SSA. + fe.LowerToSSA() + if wazevoapi.PrintSSA && wazevoapi.PrintEnabledIndex(ctx) { + fmt.Printf("[[[SSA for %s]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), ssaBuilder.Format()) + } + + if wazevoapi.DeterministicCompilationVerifierEnabled { + wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "SSA", ssaBuilder.Format()) + } + + // Run SSA-level optimization passes. + ssaBuilder.RunPasses() + + if wazevoapi.PrintOptimizedSSA && wazevoapi.PrintEnabledIndex(ctx) { + fmt.Printf("[[[Optimized SSA for %s]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), ssaBuilder.Format()) + } + + if wazevoapi.DeterministicCompilationVerifierEnabled { + wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "Optimized SSA", ssaBuilder.Format()) + } + + // Now our ssaBuilder contains the necessary information to further lower them to + // machine code. + original, rels, err := be.Compile(ctx) + if err != nil { + return nil, nil, fmt.Errorf("ssa->machine code: %v", err) + } + + // TODO: optimize as zero copy. + copied := make([]byte, len(original)) + copy(copied, original) + return copied, rels, nil +} + +func (e *engine) compileHostModule(ctx context.Context, module *wasm.Module, listeners []experimental.FunctionListener) (*compiledModule, error) { + machine := newMachine() + be := backend.NewCompiler(ctx, machine, ssa.NewBuilder()) + + num := len(module.CodeSection) + cm := &compiledModule{module: module, listeners: listeners, executables: &executables{}} + cm.functionOffsets = make([]int, num) + totalSize := 0 // Total binary size of the executable. + bodies := make([][]byte, num) + var sig ssa.Signature + for i := range module.CodeSection { + totalSize = (totalSize + 15) &^ 15 + cm.functionOffsets[i] = totalSize + + typIndex := module.FunctionSection[i] + typ := &module.TypeSection[typIndex] + + // We can relax until the index fits together in ExitCode as we do in wazevoapi.ExitCodeCallGoModuleFunctionWithIndex. + // However, 1 << 16 should be large enough for a real use case. + const hostFunctionNumMaximum = 1 << 16 + if i >= hostFunctionNumMaximum { + return nil, fmt.Errorf("too many host functions (maximum %d)", hostFunctionNumMaximum) + } + + sig.ID = ssa.SignatureID(typIndex) // This is important since we reuse the `machine` which caches the ABI based on the SignatureID. + sig.Params = append(sig.Params[:0], + ssa.TypeI64, // First argument must be exec context. + ssa.TypeI64, // The second argument is the moduleContextOpaque of this host module. 
+ ) + for _, t := range typ.Params { + sig.Params = append(sig.Params, frontend.WasmTypeToSSAType(t)) + } + + sig.Results = sig.Results[:0] + for _, t := range typ.Results { + sig.Results = append(sig.Results, frontend.WasmTypeToSSAType(t)) + } + + c := &module.CodeSection[i] + if c.GoFunc == nil { + panic("BUG: GoFunc must be set for host module") + } + + withListener := len(listeners) > 0 && listeners[i] != nil + var exitCode wazevoapi.ExitCode + fn := c.GoFunc + switch fn.(type) { + case api.GoModuleFunction: + exitCode = wazevoapi.ExitCodeCallGoModuleFunctionWithIndex(i, withListener) + case api.GoFunction: + exitCode = wazevoapi.ExitCodeCallGoFunctionWithIndex(i, withListener) + } + + be.Init() + machine.CompileGoFunctionTrampoline(exitCode, &sig, true) + if err := be.Finalize(ctx); err != nil { + return nil, err + } + body := be.Buf() + + if wazevoapi.PerfMapEnabled { + name := module.FunctionDefinition(wasm.Index(i)).DebugName() + wazevoapi.PerfMap.AddModuleEntry(i, + int64(totalSize), + uint64(len(body)), + fmt.Sprintf("trampoline:%s", name)) + } + + // TODO: optimize as zero copy. + copied := make([]byte, len(body)) + copy(copied, body) + bodies[i] = copied + totalSize += len(body) + } + + if totalSize == 0 { + // Empty module. + return cm, nil + } + + // Allocate executable memory and then copy the generated machine code. + executable, err := platform.MmapCodeSegment(totalSize) + if err != nil { + panic(err) + } + cm.executable = executable + + for i, b := range bodies { + offset := cm.functionOffsets[i] + copy(executable[offset:], b) + } + + if wazevoapi.PerfMapEnabled { + wazevoapi.PerfMap.Flush(uintptr(unsafe.Pointer(&executable[0])), cm.functionOffsets) + } + + if runtime.GOARCH == "arm64" { + // On arm64, we cannot give all of rwx at the same time, so we change it to exec. + if err = platform.MprotectRX(executable); err != nil { + return nil, err + } + } + e.setFinalizer(cm.executables, executablesFinalizer) + return cm, nil +} + +// Close implements wasm.Engine. +func (e *engine) Close() (err error) { + e.mux.Lock() + defer e.mux.Unlock() + e.sortedCompiledModules = nil + e.compiledModules = nil + e.sharedFunctions = nil + return nil +} + +// CompiledModuleCount implements wasm.Engine. +func (e *engine) CompiledModuleCount() uint32 { + e.mux.RLock() + defer e.mux.RUnlock() + return uint32(len(e.compiledModules)) +} + +// DeleteCompiledModule implements wasm.Engine. 
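+// Deleting a module also removes it from sortedCompiledModules, so later compiledModuleOfAddr lookups for addresses inside its executable resolve to nil instead of a stale entry.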
+func (e *engine) DeleteCompiledModule(m *wasm.Module) { + e.mux.Lock() + defer e.mux.Unlock() + cm, ok := e.compiledModules[m.ID] + if ok { + if len(cm.executable) > 0 { + e.deleteCompiledModuleFromSortedList(cm) + } + delete(e.compiledModules, m.ID) + } +} + +func (e *engine) addCompiledModuleToSortedList(cm *compiledModule) { + ptr := uintptr(unsafe.Pointer(&cm.executable[0])) + + index := sort.Search(len(e.sortedCompiledModules), func(i int) bool { + return uintptr(unsafe.Pointer(&e.sortedCompiledModules[i].executable[0])) >= ptr + }) + e.sortedCompiledModules = append(e.sortedCompiledModules, nil) + copy(e.sortedCompiledModules[index+1:], e.sortedCompiledModules[index:]) + e.sortedCompiledModules[index] = cm +} + +func (e *engine) deleteCompiledModuleFromSortedList(cm *compiledModule) { + ptr := uintptr(unsafe.Pointer(&cm.executable[0])) + + index := sort.Search(len(e.sortedCompiledModules), func(i int) bool { + return uintptr(unsafe.Pointer(&e.sortedCompiledModules[i].executable[0])) >= ptr + }) + if index >= len(e.sortedCompiledModules) { + return + } + copy(e.sortedCompiledModules[index:], e.sortedCompiledModules[index+1:]) + e.sortedCompiledModules = e.sortedCompiledModules[:len(e.sortedCompiledModules)-1] +} + +func (e *engine) compiledModuleOfAddr(addr uintptr) *compiledModule { + e.mux.RLock() + defer e.mux.RUnlock() + + index := sort.Search(len(e.sortedCompiledModules), func(i int) bool { + return uintptr(unsafe.Pointer(&e.sortedCompiledModules[i].executable[0])) > addr + }) + index -= 1 + if index < 0 { + return nil + } + candidate := e.sortedCompiledModules[index] + if checkAddrInBytes(addr, candidate.executable) { + // If a module is already deleted, the found module may have been wrong. + return candidate + } + return nil +} + +func checkAddrInBytes(addr uintptr, b []byte) bool { + return uintptr(unsafe.Pointer(&b[0])) <= addr && addr <= uintptr(unsafe.Pointer(&b[len(b)-1])) +} + +// NewModuleEngine implements wasm.Engine. +func (e *engine) NewModuleEngine(m *wasm.Module, mi *wasm.ModuleInstance) (wasm.ModuleEngine, error) { + me := &moduleEngine{} + + // Note: imported functions are resolved in moduleEngine.ResolveImportedFunction. 
+ me.importedFunctions = make([]importedFunction, m.ImportFunctionCount) + + compiled, ok := e.getCompiledModuleFromMemory(m) + if !ok { + return nil, errors.New("source module must be compiled before instantiation") + } + me.parent = compiled + me.module = mi + me.listeners = compiled.listeners + + if m.IsHostModule { + me.opaque = buildHostModuleOpaque(m, compiled.listeners) + me.opaquePtr = &me.opaque[0] + } else { + if size := compiled.offsets.TotalSize; size != 0 { + opaque := newAlignedOpaque(size) + me.opaque = opaque + me.opaquePtr = &opaque[0] + } + } + return me, nil +} + +func (e *engine) compileSharedFunctions() { + e.sharedFunctions = &sharedFunctions{ + listenerBeforeTrampolines: make(map[*wasm.FunctionType][]byte), + listenerAfterTrampolines: make(map[*wasm.FunctionType][]byte), + } + + e.be.Init() + { + src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeGrowMemory, &ssa.Signature{ + Params: []ssa.Type{ssa.TypeI64 /* exec context */, ssa.TypeI32}, + Results: []ssa.Type{ssa.TypeI32}, + }, false) + e.sharedFunctions.memoryGrowExecutable = mmapExecutable(src) + if wazevoapi.PerfMapEnabled { + exe := e.sharedFunctions.memoryGrowExecutable + wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "memory_grow_trampoline") + } + } + + e.be.Init() + { + src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeTableGrow, &ssa.Signature{ + Params: []ssa.Type{ssa.TypeI64 /* exec context */, ssa.TypeI32 /* table index */, ssa.TypeI32 /* num */, ssa.TypeI64 /* ref */}, + Results: []ssa.Type{ssa.TypeI32}, + }, false) + e.sharedFunctions.tableGrowExecutable = mmapExecutable(src) + if wazevoapi.PerfMapEnabled { + exe := e.sharedFunctions.tableGrowExecutable + wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "table_grow_trampoline") + } + } + + e.be.Init() + { + src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeCheckModuleExitCode, &ssa.Signature{ + Params: []ssa.Type{ssa.TypeI32 /* exec context */}, + Results: []ssa.Type{ssa.TypeI32}, + }, false) + e.sharedFunctions.checkModuleExitCode = mmapExecutable(src) + if wazevoapi.PerfMapEnabled { + exe := e.sharedFunctions.checkModuleExitCode + wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "check_module_exit_code_trampoline") + } + } + + e.be.Init() + { + src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeRefFunc, &ssa.Signature{ + Params: []ssa.Type{ssa.TypeI64 /* exec context */, ssa.TypeI32 /* function index */}, + Results: []ssa.Type{ssa.TypeI64}, // returns the function reference. + }, false) + e.sharedFunctions.refFuncExecutable = mmapExecutable(src) + if wazevoapi.PerfMapEnabled { + exe := e.sharedFunctions.refFuncExecutable + wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "ref_func_trampoline") + } + } + + e.be.Init() + { + src := e.machine.CompileStackGrowCallSequence() + e.sharedFunctions.stackGrowExecutable = mmapExecutable(src) + if wazevoapi.PerfMapEnabled { + exe := e.sharedFunctions.stackGrowExecutable + wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "stack_grow_trampoline") + } + } + + e.be.Init() + { + src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeMemoryWait32, &ssa.Signature{ + // exec context, timeout, expected, addr + Params: []ssa.Type{ssa.TypeI64, ssa.TypeI64, ssa.TypeI32, ssa.TypeI64}, + // Returns the status. 
+ Results: []ssa.Type{ssa.TypeI32}, + }, false) + e.sharedFunctions.memoryWait32Executable = mmapExecutable(src) + if wazevoapi.PerfMapEnabled { + exe := e.sharedFunctions.memoryWait32Executable + wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "memory_wait32_trampoline") + } + } + + e.be.Init() + { + src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeMemoryWait64, &ssa.Signature{ + // exec context, timeout, expected, addr + Params: []ssa.Type{ssa.TypeI64, ssa.TypeI64, ssa.TypeI64, ssa.TypeI64}, + // Returns the status. + Results: []ssa.Type{ssa.TypeI32}, + }, false) + e.sharedFunctions.memoryWait64Executable = mmapExecutable(src) + if wazevoapi.PerfMapEnabled { + exe := e.sharedFunctions.memoryWait64Executable + wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "memory_wait64_trampoline") + } + } + + e.be.Init() + { + src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeMemoryNotify, &ssa.Signature{ + // exec context, count, addr + Params: []ssa.Type{ssa.TypeI64, ssa.TypeI32, ssa.TypeI64}, + // Returns the number notified. + Results: []ssa.Type{ssa.TypeI32}, + }, false) + e.sharedFunctions.memoryNotifyExecutable = mmapExecutable(src) + if wazevoapi.PerfMapEnabled { + exe := e.sharedFunctions.memoryNotifyExecutable + wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "memory_notify_trampoline") + } + } + + e.setFinalizer(e.sharedFunctions, sharedFunctionsFinalizer) +} + +func sharedFunctionsFinalizer(sf *sharedFunctions) { + if err := platform.MunmapCodeSegment(sf.memoryGrowExecutable); err != nil { + panic(err) + } + if err := platform.MunmapCodeSegment(sf.checkModuleExitCode); err != nil { + panic(err) + } + if err := platform.MunmapCodeSegment(sf.stackGrowExecutable); err != nil { + panic(err) + } + if err := platform.MunmapCodeSegment(sf.tableGrowExecutable); err != nil { + panic(err) + } + if err := platform.MunmapCodeSegment(sf.refFuncExecutable); err != nil { + panic(err) + } + if err := platform.MunmapCodeSegment(sf.memoryWait32Executable); err != nil { + panic(err) + } + if err := platform.MunmapCodeSegment(sf.memoryWait64Executable); err != nil { + panic(err) + } + if err := platform.MunmapCodeSegment(sf.memoryNotifyExecutable); err != nil { + panic(err) + } + for _, f := range sf.listenerBeforeTrampolines { + if err := platform.MunmapCodeSegment(f); err != nil { + panic(err) + } + } + for _, f := range sf.listenerAfterTrampolines { + if err := platform.MunmapCodeSegment(f); err != nil { + panic(err) + } + } + + sf.memoryGrowExecutable = nil + sf.checkModuleExitCode = nil + sf.stackGrowExecutable = nil + sf.tableGrowExecutable = nil + sf.refFuncExecutable = nil + sf.memoryWait32Executable = nil + sf.memoryWait64Executable = nil + sf.memoryNotifyExecutable = nil + sf.listenerBeforeTrampolines = nil + sf.listenerAfterTrampolines = nil +} + +func executablesFinalizer(exec *executables) { + if len(exec.executable) > 0 { + if err := platform.MunmapCodeSegment(exec.executable); err != nil { + panic(err) + } + } + exec.executable = nil + + for _, f := range exec.entryPreambles { + if err := platform.MunmapCodeSegment(f); err != nil { + panic(err) + } + } + exec.entryPreambles = nil +} + +func mmapExecutable(src []byte) []byte { + executable, err := platform.MmapCodeSegment(len(src)) + if err != nil { + panic(err) + } + + copy(executable, src) + + if runtime.GOARCH == "arm64" { + // On arm64, we cannot give all of rwx at the same time, so we change it to exec. 
+ if err = platform.MprotectRX(executable); err != nil { + panic(err) + } + } + return executable +} + +func (cm *compiledModule) functionIndexOf(addr uintptr) wasm.Index { + addr -= uintptr(unsafe.Pointer(&cm.executable[0])) + offset := cm.functionOffsets + index := sort.Search(len(offset), func(i int) bool { + return offset[i] > int(addr) + }) + index-- + if index < 0 { + panic("BUG") + } + return wasm.Index(index) +} + +func (e *engine) getListenerTrampolineForType(functionType *wasm.FunctionType) (before, after *byte) { + e.mux.Lock() + defer e.mux.Unlock() + + beforeBuf, ok := e.sharedFunctions.listenerBeforeTrampolines[functionType] + afterBuf := e.sharedFunctions.listenerAfterTrampolines[functionType] + if ok { + return &beforeBuf[0], &afterBuf[0] + } + + beforeSig, afterSig := frontend.SignatureForListener(functionType) + + e.be.Init() + buf := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeCallListenerBefore, beforeSig, false) + beforeBuf = mmapExecutable(buf) + + e.be.Init() + buf = e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeCallListenerAfter, afterSig, false) + afterBuf = mmapExecutable(buf) + + e.sharedFunctions.listenerBeforeTrampolines[functionType] = beforeBuf + e.sharedFunctions.listenerAfterTrampolines[functionType] = afterBuf + return &beforeBuf[0], &afterBuf[0] +} + +func (cm *compiledModule) getSourceOffset(pc uintptr) uint64 { + offsets := cm.sourceMap.executableOffsets + if len(offsets) == 0 { + return 0 + } + + index := sort.Search(len(offsets), func(i int) bool { + return offsets[i] >= pc + }) + + index-- + if index < 0 { + return 0 + } + return cm.sourceMap.wasmBinaryOffsets[index] +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/engine_cache.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/engine_cache.go new file mode 100644 index 000000000..f7c0450ae --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/engine_cache.go @@ -0,0 +1,296 @@ +package wazevo + +import ( + "bytes" + "context" + "crypto/sha256" + "encoding/binary" + "fmt" + "hash/crc32" + "io" + "runtime" + "unsafe" + + "github.com/tetratelabs/wazero/experimental" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" + "github.com/tetratelabs/wazero/internal/filecache" + "github.com/tetratelabs/wazero/internal/platform" + "github.com/tetratelabs/wazero/internal/u32" + "github.com/tetratelabs/wazero/internal/u64" + "github.com/tetratelabs/wazero/internal/wasm" +) + +var crc = crc32.MakeTable(crc32.Castagnoli) + +// fileCacheKey returns a key for the file cache. +// In order to avoid collisions with the existing compiler, we do not use m.ID directly, +// but instead we rehash it with magic. 
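+// In other words, the key is effectively sha256(m.ID || magic) rather than m.ID itself, so entries written by this engine should never collide with entries the older compiler stores under the same filecache.Cache.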
+func fileCacheKey(m *wasm.Module) (ret filecache.Key) { + s := sha256.New() + s.Write(m.ID[:]) + s.Write(magic) + s.Sum(ret[:0]) + return +} + +func (e *engine) addCompiledModule(module *wasm.Module, cm *compiledModule) (err error) { + e.addCompiledModuleToMemory(module, cm) + if !module.IsHostModule && e.fileCache != nil { + err = e.addCompiledModuleToCache(module, cm) + } + return +} + +func (e *engine) getCompiledModule(module *wasm.Module, listeners []experimental.FunctionListener, ensureTermination bool) (cm *compiledModule, ok bool, err error) { + cm, ok = e.getCompiledModuleFromMemory(module) + if ok { + return + } + cm, ok, err = e.getCompiledModuleFromCache(module) + if ok { + cm.parent = e + cm.module = module + cm.sharedFunctions = e.sharedFunctions + cm.ensureTermination = ensureTermination + cm.offsets = wazevoapi.NewModuleContextOffsetData(module, len(listeners) > 0) + if len(listeners) > 0 { + cm.listeners = listeners + cm.listenerBeforeTrampolines = make([]*byte, len(module.TypeSection)) + cm.listenerAfterTrampolines = make([]*byte, len(module.TypeSection)) + for i := range module.TypeSection { + typ := &module.TypeSection[i] + before, after := e.getListenerTrampolineForType(typ) + cm.listenerBeforeTrampolines[i] = before + cm.listenerAfterTrampolines[i] = after + } + } + e.addCompiledModuleToMemory(module, cm) + ssaBuilder := ssa.NewBuilder() + machine := newMachine() + be := backend.NewCompiler(context.Background(), machine, ssaBuilder) + cm.executables.compileEntryPreambles(module, machine, be) + + // Set the finalizer. + e.setFinalizer(cm.executables, executablesFinalizer) + } + return +} + +func (e *engine) addCompiledModuleToMemory(m *wasm.Module, cm *compiledModule) { + e.mux.Lock() + defer e.mux.Unlock() + e.compiledModules[m.ID] = cm + if len(cm.executable) > 0 { + e.addCompiledModuleToSortedList(cm) + } +} + +func (e *engine) getCompiledModuleFromMemory(module *wasm.Module) (cm *compiledModule, ok bool) { + e.mux.RLock() + defer e.mux.RUnlock() + cm, ok = e.compiledModules[module.ID] + return +} + +func (e *engine) addCompiledModuleToCache(module *wasm.Module, cm *compiledModule) (err error) { + if e.fileCache == nil || module.IsHostModule { + return + } + err = e.fileCache.Add(fileCacheKey(module), serializeCompiledModule(e.wazeroVersion, cm)) + return +} + +func (e *engine) getCompiledModuleFromCache(module *wasm.Module) (cm *compiledModule, hit bool, err error) { + if e.fileCache == nil || module.IsHostModule { + return + } + + // Check if the entries exist in the external cache. + var cached io.ReadCloser + cached, hit, err = e.fileCache.Get(fileCacheKey(module)) + if !hit || err != nil { + return + } + + // Otherwise, we hit the cache on external cache. + // We retrieve *code structures from `cached`. + var staleCache bool + // Note: cached.Close is ensured to be called in deserializeCodes. + cm, staleCache, err = deserializeCompiledModule(e.wazeroVersion, cached) + if err != nil { + hit = false + return + } else if staleCache { + return nil, false, e.fileCache.Delete(fileCacheKey(module)) + } + return +} + +var magic = []byte{'W', 'A', 'Z', 'E', 'V', 'O'} + +func serializeCompiledModule(wazeroVersion string, cm *compiledModule) io.Reader { + buf := bytes.NewBuffer(nil) + // First 6 byte: WAZEVO header. + buf.Write(magic) + // Next 1 byte: length of version: + buf.WriteByte(byte(len(wazeroVersion))) + // Version of wazero. + buf.WriteString(wazeroVersion) + // Number of *code (== locally defined functions in the module): 4 bytes. 
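+ // (For reference, the overall layout written by this function is, with integers little-endian:
+ //   magic (6 bytes) | version length (1 byte) | version string
+ //   | function count (u32) | one u64 offset per function
+ //   | executable length (u64) | executable bytes | CRC32 (Castagnoli) of executable (u32)
+ //   | source-map flag (1 byte) | if 1: entry count (u64) then (wasm offset u64, executable-relative offset u64) pairs.
+ // This is just a summary of the writes below; deserializeCompiledModule reads the same layout back.)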
+ buf.Write(u32.LeBytes(uint32(len(cm.functionOffsets)))) + for _, offset := range cm.functionOffsets { + // The offset of this function in the executable (8 bytes). + buf.Write(u64.LeBytes(uint64(offset))) + } + // The length of code segment (8 bytes). + buf.Write(u64.LeBytes(uint64(len(cm.executable)))) + // Append the native code. + buf.Write(cm.executable) + // Append checksum. + checksum := crc32.Checksum(cm.executable, crc) + buf.Write(u32.LeBytes(checksum)) + if sm := cm.sourceMap; len(sm.executableOffsets) > 0 { + buf.WriteByte(1) // indicates that source map is present. + l := len(sm.wasmBinaryOffsets) + buf.Write(u64.LeBytes(uint64(l))) + executableAddr := uintptr(unsafe.Pointer(&cm.executable[0])) + for i := 0; i < l; i++ { + buf.Write(u64.LeBytes(sm.wasmBinaryOffsets[i])) + // executableOffsets is absolute address, so we need to subtract executableAddr. + buf.Write(u64.LeBytes(uint64(sm.executableOffsets[i] - executableAddr))) + } + } else { + buf.WriteByte(0) // indicates that source map is not present. + } + return bytes.NewReader(buf.Bytes()) +} + +func deserializeCompiledModule(wazeroVersion string, reader io.ReadCloser) (cm *compiledModule, staleCache bool, err error) { + defer reader.Close() + cacheHeaderSize := len(magic) + 1 /* version size */ + len(wazeroVersion) + 4 /* number of functions */ + + // Read the header before the native code. + header := make([]byte, cacheHeaderSize) + n, err := reader.Read(header) + if err != nil { + return nil, false, fmt.Errorf("compilationcache: error reading header: %v", err) + } + + if n != cacheHeaderSize { + return nil, false, fmt.Errorf("compilationcache: invalid header length: %d", n) + } + + if !bytes.Equal(header[:len(magic)], magic) { + return nil, false, fmt.Errorf( + "compilationcache: invalid magic number: got %s but want %s", magic, header[:len(magic)]) + } + + // Check the version compatibility. + versionSize := int(header[len(magic)]) + + cachedVersionBegin, cachedVersionEnd := len(magic)+1, len(magic)+1+versionSize + if cachedVersionEnd >= len(header) { + staleCache = true + return + } else if cachedVersion := string(header[cachedVersionBegin:cachedVersionEnd]); cachedVersion != wazeroVersion { + staleCache = true + return + } + + functionsNum := binary.LittleEndian.Uint32(header[len(header)-4:]) + cm = &compiledModule{functionOffsets: make([]int, functionsNum), executables: &executables{}} + + var eightBytes [8]byte + for i := uint32(0); i < functionsNum; i++ { + // Read the offset of each function in the executable. 
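+ // Each offset was written as a little-endian u64 relative to the start of the executable
+ // (see serializeCompiledModule), so it stays valid after the code is mmapped at a new address below.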
+ var offset uint64 + if offset, err = readUint64(reader, &eightBytes); err != nil { + err = fmt.Errorf("compilationcache: error reading func[%d] executable offset: %v", i, err) + return + } + cm.functionOffsets[i] = int(offset) + } + + executableLen, err := readUint64(reader, &eightBytes) + if err != nil { + err = fmt.Errorf("compilationcache: error reading executable size: %v", err) + return + } + + if executableLen > 0 { + executable, err := platform.MmapCodeSegment(int(executableLen)) + if err != nil { + err = fmt.Errorf("compilationcache: error mmapping executable (len=%d): %v", executableLen, err) + return nil, false, err + } + + _, err = io.ReadFull(reader, executable) + if err != nil { + err = fmt.Errorf("compilationcache: error reading executable (len=%d): %v", executableLen, err) + return nil, false, err + } + + expected := crc32.Checksum(executable, crc) + if _, err = io.ReadFull(reader, eightBytes[:4]); err != nil { + return nil, false, fmt.Errorf("compilationcache: could not read checksum: %v", err) + } else if checksum := binary.LittleEndian.Uint32(eightBytes[:4]); expected != checksum { + return nil, false, fmt.Errorf("compilationcache: checksum mismatch (expected %d, got %d)", expected, checksum) + } + + if runtime.GOARCH == "arm64" { + // On arm64, we cannot give all of rwx at the same time, so we change it to exec. + if err = platform.MprotectRX(executable); err != nil { + return nil, false, err + } + } + cm.executable = executable + } + + if _, err := io.ReadFull(reader, eightBytes[:1]); err != nil { + return nil, false, fmt.Errorf("compilationcache: error reading source map presence: %v", err) + } + + if eightBytes[0] == 1 { + sm := &cm.sourceMap + sourceMapLen, err := readUint64(reader, &eightBytes) + if err != nil { + err = fmt.Errorf("compilationcache: error reading source map length: %v", err) + return nil, false, err + } + executableOffset := uintptr(unsafe.Pointer(&cm.executable[0])) + for i := uint64(0); i < sourceMapLen; i++ { + wasmBinaryOffset, err := readUint64(reader, &eightBytes) + if err != nil { + err = fmt.Errorf("compilationcache: error reading source map[%d] wasm binary offset: %v", i, err) + return nil, false, err + } + executableRelativeOffset, err := readUint64(reader, &eightBytes) + if err != nil { + err = fmt.Errorf("compilationcache: error reading source map[%d] executable offset: %v", i, err) + return nil, false, err + } + sm.wasmBinaryOffsets = append(sm.wasmBinaryOffsets, wasmBinaryOffset) + // executableOffsets is absolute address, so we need to add executableOffset. + sm.executableOffsets = append(sm.executableOffsets, uintptr(executableRelativeOffset)+executableOffset) + } + } + return +} + +// readUint64 strictly reads an uint64 in little-endian byte order, using the +// given array as a buffer. This returns io.EOF if less than 8 bytes were read. +func readUint64(reader io.Reader, b *[8]byte) (uint64, error) { + s := b[0:8] + n, err := reader.Read(s) + if err != nil { + return 0, err + } else if n < 8 { // more strict than reader.Read + return 0, io.EOF + } + + // Read the u64 from the underlying buffer. 
+ ret := binary.LittleEndian.Uint64(s) + return ret, nil +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/entrypoint_amd64.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/entrypoint_amd64.go new file mode 100644 index 000000000..18f60af3a --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/entrypoint_amd64.go @@ -0,0 +1,15 @@ +//go:build amd64 && !tinygo + +package wazevo + +import _ "unsafe" + +// entrypoint is implemented by the backend. +// +//go:linkname entrypoint github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64.entrypoint +func entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultStackPtr *uint64, goAllocatedStackSlicePtr uintptr) + +// entrypoint is implemented by the backend. +// +//go:linkname afterGoFunctionCallEntrypoint github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64.afterGoFunctionCallEntrypoint +func afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/entrypoint_arm64.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/entrypoint_arm64.go new file mode 100644 index 000000000..e16d64f65 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/entrypoint_arm64.go @@ -0,0 +1,15 @@ +//go:build arm64 && !tinygo + +package wazevo + +import _ "unsafe" + +// entrypoint is implemented by the backend. +// +//go:linkname entrypoint github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64.entrypoint +func entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultStackPtr *uint64, goAllocatedStackSlicePtr uintptr) + +// entrypoint is implemented by the backend. +// +//go:linkname afterGoFunctionCallEntrypoint github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64.afterGoFunctionCallEntrypoint +func afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/entrypoint_others.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/entrypoint_others.go new file mode 100644 index 000000000..8f9d64b2b --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/entrypoint_others.go @@ -0,0 +1,15 @@ +//go:build (!arm64 && !amd64) || tinygo + +package wazevo + +import ( + "runtime" +) + +func entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultStackPtr *uint64, goAllocatedStackSlicePtr uintptr) { + panic(runtime.GOARCH) +} + +func afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr) { + panic(runtime.GOARCH) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/frontend.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/frontend.go new file mode 100644 index 000000000..873a35a55 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/frontend.go @@ -0,0 +1,594 @@ +// Package frontend implements the translation of WebAssembly to SSA IR using the ssa package. 
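+//
+// As a rough illustration (not a test case from this package), a Wasm body like
+//
+//	(func (param i32 i32) (result i32)
+//	  local.get 0
+//	  local.get 1
+//	  i32.add)
+//
+// is lowered to an SSA function whose signature has two extra leading i64 parameters
+// (the execution context and module context pointers, see Compiler.LowerToSSA) and whose
+// body is essentially a single Iadd over the block parameters corresponding to the locals,
+// plus the usual return plumbing.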
+package frontend + +import ( + "bytes" + "math" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" + "github.com/tetratelabs/wazero/internal/wasm" +) + +// Compiler is in charge of lowering Wasm to SSA IR, and does the optimization +// on top of it in architecture-independent way. +type Compiler struct { + // Per-module data that is used across all functions. + + m *wasm.Module + offset *wazevoapi.ModuleContextOffsetData + // ssaBuilder is a ssa.Builder used by this frontend. + ssaBuilder ssa.Builder + signatures map[*wasm.FunctionType]*ssa.Signature + listenerSignatures map[*wasm.FunctionType][2]*ssa.Signature + memoryGrowSig ssa.Signature + memoryWait32Sig ssa.Signature + memoryWait64Sig ssa.Signature + memoryNotifySig ssa.Signature + checkModuleExitCodeSig ssa.Signature + tableGrowSig ssa.Signature + refFuncSig ssa.Signature + memmoveSig ssa.Signature + ensureTermination bool + + // Followings are reset by per function. + + // wasmLocalToVariable maps the index (considered as wasm.Index of locals) + // to the corresponding ssa.Variable. + wasmLocalToVariable [] /* local index to */ ssa.Variable + wasmLocalFunctionIndex wasm.Index + wasmFunctionTypeIndex wasm.Index + wasmFunctionTyp *wasm.FunctionType + wasmFunctionLocalTypes []wasm.ValueType + wasmFunctionBody []byte + wasmFunctionBodyOffsetInCodeSection uint64 + memoryBaseVariable, memoryLenVariable ssa.Variable + needMemory bool + memoryShared bool + globalVariables []ssa.Variable + globalVariablesTypes []ssa.Type + mutableGlobalVariablesIndexes []wasm.Index // index to ^. + needListener bool + needSourceOffsetInfo bool + // br is reused during lowering. + br *bytes.Reader + loweringState loweringState + + knownSafeBounds [] /* ssa.ValueID to */ knownSafeBound + knownSafeBoundsSet []ssa.ValueID + + knownSafeBoundsAtTheEndOfBlocks [] /* ssa.BlockID to */ knownSafeBoundsAtTheEndOfBlock + varLengthKnownSafeBoundWithIDPool wazevoapi.VarLengthPool[knownSafeBoundWithID] + + execCtxPtrValue, moduleCtxPtrValue ssa.Value + + // Following are reused for the known safe bounds analysis. + + pointers []int + bounds [][]knownSafeBoundWithID +} + +type ( + // knownSafeBound represents a known safe bound for a value. + knownSafeBound struct { + // bound is a constant upper bound for the value. + bound uint64 + // absoluteAddr is the absolute address of the value. + absoluteAddr ssa.Value + } + // knownSafeBoundWithID is a knownSafeBound with the ID of the value. + knownSafeBoundWithID struct { + knownSafeBound + id ssa.ValueID + } + knownSafeBoundsAtTheEndOfBlock = wazevoapi.VarLength[knownSafeBoundWithID] +) + +var knownSafeBoundsAtTheEndOfBlockNil = wazevoapi.NewNilVarLength[knownSafeBoundWithID]() + +// NewFrontendCompiler returns a frontend Compiler. 
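+// A single Compiler is reused for all functions in a module: construct it once, then for each
+// function call Init followed by LowerToSSA. A rough sketch (argument values are placeholders):
+//
+//	fc := NewFrontendCompiler(m, ssa.NewBuilder(), offsets, false, false, false)
+//	fc.Init(funcIdx, typeIdx, typ, localTypes, body, false, bodyOffset)
+//	fc.LowerToSSA()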
+func NewFrontendCompiler(m *wasm.Module, ssaBuilder ssa.Builder, offset *wazevoapi.ModuleContextOffsetData, ensureTermination bool, listenerOn bool, sourceInfo bool) *Compiler { + c := &Compiler{ + m: m, + ssaBuilder: ssaBuilder, + br: bytes.NewReader(nil), + offset: offset, + ensureTermination: ensureTermination, + needSourceOffsetInfo: sourceInfo, + varLengthKnownSafeBoundWithIDPool: wazevoapi.NewVarLengthPool[knownSafeBoundWithID](), + } + c.declareSignatures(listenerOn) + return c +} + +func (c *Compiler) declareSignatures(listenerOn bool) { + m := c.m + c.signatures = make(map[*wasm.FunctionType]*ssa.Signature, len(m.TypeSection)+2) + if listenerOn { + c.listenerSignatures = make(map[*wasm.FunctionType][2]*ssa.Signature, len(m.TypeSection)) + } + for i := range m.TypeSection { + wasmSig := &m.TypeSection[i] + sig := SignatureForWasmFunctionType(wasmSig) + sig.ID = ssa.SignatureID(i) + c.signatures[wasmSig] = &sig + c.ssaBuilder.DeclareSignature(&sig) + + if listenerOn { + beforeSig, afterSig := SignatureForListener(wasmSig) + beforeSig.ID = ssa.SignatureID(i) + ssa.SignatureID(len(m.TypeSection)) + afterSig.ID = ssa.SignatureID(i) + ssa.SignatureID(len(m.TypeSection))*2 + c.listenerSignatures[wasmSig] = [2]*ssa.Signature{beforeSig, afterSig} + c.ssaBuilder.DeclareSignature(beforeSig) + c.ssaBuilder.DeclareSignature(afterSig) + } + } + + begin := ssa.SignatureID(len(m.TypeSection)) + if listenerOn { + begin *= 3 + } + c.memoryGrowSig = ssa.Signature{ + ID: begin, + // Takes execution context and the page size to grow. + Params: []ssa.Type{ssa.TypeI64, ssa.TypeI32}, + // Returns the previous page size. + Results: []ssa.Type{ssa.TypeI32}, + } + c.ssaBuilder.DeclareSignature(&c.memoryGrowSig) + + c.checkModuleExitCodeSig = ssa.Signature{ + ID: c.memoryGrowSig.ID + 1, + // Only takes execution context. + Params: []ssa.Type{ssa.TypeI64}, + } + c.ssaBuilder.DeclareSignature(&c.checkModuleExitCodeSig) + + c.tableGrowSig = ssa.Signature{ + ID: c.checkModuleExitCodeSig.ID + 1, + Params: []ssa.Type{ssa.TypeI64 /* exec context */, ssa.TypeI32 /* table index */, ssa.TypeI32 /* num */, ssa.TypeI64 /* ref */}, + // Returns the previous size. + Results: []ssa.Type{ssa.TypeI32}, + } + c.ssaBuilder.DeclareSignature(&c.tableGrowSig) + + c.refFuncSig = ssa.Signature{ + ID: c.tableGrowSig.ID + 1, + Params: []ssa.Type{ssa.TypeI64 /* exec context */, ssa.TypeI32 /* func index */}, + // Returns the function reference. + Results: []ssa.Type{ssa.TypeI64}, + } + c.ssaBuilder.DeclareSignature(&c.refFuncSig) + + c.memmoveSig = ssa.Signature{ + ID: c.refFuncSig.ID + 1, + // dst, src, and the byte count. + Params: []ssa.Type{ssa.TypeI64, ssa.TypeI64, ssa.TypeI64}, + } + + c.ssaBuilder.DeclareSignature(&c.memmoveSig) + + c.memoryWait32Sig = ssa.Signature{ + ID: c.memmoveSig.ID + 1, + // exec context, timeout, expected, addr + Params: []ssa.Type{ssa.TypeI64, ssa.TypeI64, ssa.TypeI32, ssa.TypeI64}, + // Returns the status. + Results: []ssa.Type{ssa.TypeI32}, + } + c.ssaBuilder.DeclareSignature(&c.memoryWait32Sig) + + c.memoryWait64Sig = ssa.Signature{ + ID: c.memoryWait32Sig.ID + 1, + // exec context, timeout, expected, addr + Params: []ssa.Type{ssa.TypeI64, ssa.TypeI64, ssa.TypeI64, ssa.TypeI64}, + // Returns the status. + Results: []ssa.Type{ssa.TypeI32}, + } + c.ssaBuilder.DeclareSignature(&c.memoryWait64Sig) + + c.memoryNotifySig = ssa.Signature{ + ID: c.memoryWait64Sig.ID + 1, + // exec context, count, addr + Params: []ssa.Type{ssa.TypeI64, ssa.TypeI32, ssa.TypeI64}, + // Returns the number notified. 
+ Results: []ssa.Type{ssa.TypeI32}, + } + c.ssaBuilder.DeclareSignature(&c.memoryNotifySig) +} + +// SignatureForWasmFunctionType returns the ssa.Signature for the given wasm.FunctionType. +func SignatureForWasmFunctionType(typ *wasm.FunctionType) ssa.Signature { + sig := ssa.Signature{ + // +2 to pass moduleContextPtr and executionContextPtr. See the inline comment LowerToSSA. + Params: make([]ssa.Type, len(typ.Params)+2), + Results: make([]ssa.Type, len(typ.Results)), + } + sig.Params[0] = executionContextPtrTyp + sig.Params[1] = moduleContextPtrTyp + for j, typ := range typ.Params { + sig.Params[j+2] = WasmTypeToSSAType(typ) + } + for j, typ := range typ.Results { + sig.Results[j] = WasmTypeToSSAType(typ) + } + return sig +} + +// Init initializes the state of frontendCompiler and make it ready for a next function. +func (c *Compiler) Init(idx, typIndex wasm.Index, typ *wasm.FunctionType, localTypes []wasm.ValueType, body []byte, needListener bool, bodyOffsetInCodeSection uint64) { + c.ssaBuilder.Init(c.signatures[typ]) + c.loweringState.reset() + + c.wasmFunctionTypeIndex = typIndex + c.wasmLocalFunctionIndex = idx + c.wasmFunctionTyp = typ + c.wasmFunctionLocalTypes = localTypes + c.wasmFunctionBody = body + c.wasmFunctionBodyOffsetInCodeSection = bodyOffsetInCodeSection + c.needListener = needListener + c.clearSafeBounds() + c.varLengthKnownSafeBoundWithIDPool.Reset() + c.knownSafeBoundsAtTheEndOfBlocks = c.knownSafeBoundsAtTheEndOfBlocks[:0] +} + +// Note: this assumes 64-bit platform (I believe we won't have 32-bit backend ;)). +const executionContextPtrTyp, moduleContextPtrTyp = ssa.TypeI64, ssa.TypeI64 + +// LowerToSSA lowers the current function to SSA function which will be held by ssaBuilder. +// After calling this, the caller will be able to access the SSA info in *Compiler.ssaBuilder. +// +// Note that this only does the naive lowering, and do not do any optimization, instead the caller is expected to do so. +func (c *Compiler) LowerToSSA() { + builder := c.ssaBuilder + + // Set up the entry block. + entryBlock := builder.AllocateBasicBlock() + builder.SetCurrentBlock(entryBlock) + + // Functions always take two parameters in addition to Wasm-level parameters: + // + // 1. executionContextPtr: pointer to the *executionContext in wazevo package. + // This will be used to exit the execution in the face of trap, plus used for host function calls. + // + // 2. moduleContextPtr: pointer to the *moduleContextOpaque in wazevo package. + // This will be used to access memory, etc. Also, this will be used during host function calls. + // + // Note: it's clear that sometimes a function won't need them. For example, + // if the function doesn't trap and doesn't make function call, then + // we might be able to eliminate the parameter. However, if that function + // can be called via call_indirect, then we cannot eliminate because the + // signature won't match with the expected one. + // TODO: maybe there's some way to do this optimization without glitches, but so far I have no clue about the feasibility. + // + // Note: In Wasmtime or many other runtimes, moduleContextPtr is called "vmContext". Also note that `moduleContextPtr` + // is wazero-specific since other runtimes can naturally use the OS-level signal to do this job thanks to the fact that + // they can use native stack vs wazero cannot use Go-routine stack and have to use Go-runtime allocated []byte as a stack. 
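+ //
+ // So the SSA-level signature of every Wasm function compiled here is effectively
+ // (exec_ctx i64, module_ctx i64, <wasm params...>) -> (<wasm results...>),
+ // which is what SignatureForWasmFunctionType constructs above.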
+ c.execCtxPtrValue = entryBlock.AddParam(builder, executionContextPtrTyp) + c.moduleCtxPtrValue = entryBlock.AddParam(builder, moduleContextPtrTyp) + builder.AnnotateValue(c.execCtxPtrValue, "exec_ctx") + builder.AnnotateValue(c.moduleCtxPtrValue, "module_ctx") + + for i, typ := range c.wasmFunctionTyp.Params { + st := WasmTypeToSSAType(typ) + variable := builder.DeclareVariable(st) + value := entryBlock.AddParam(builder, st) + builder.DefineVariable(variable, value, entryBlock) + c.setWasmLocalVariable(wasm.Index(i), variable) + } + c.declareWasmLocals(entryBlock) + c.declareNecessaryVariables() + + c.lowerBody(entryBlock) +} + +// localVariable returns the SSA variable for the given Wasm local index. +func (c *Compiler) localVariable(index wasm.Index) ssa.Variable { + return c.wasmLocalToVariable[index] +} + +func (c *Compiler) setWasmLocalVariable(index wasm.Index, variable ssa.Variable) { + idx := int(index) + if idx >= len(c.wasmLocalToVariable) { + c.wasmLocalToVariable = append(c.wasmLocalToVariable, make([]ssa.Variable, idx+1-len(c.wasmLocalToVariable))...) + } + c.wasmLocalToVariable[idx] = variable +} + +// declareWasmLocals declares the SSA variables for the Wasm locals. +func (c *Compiler) declareWasmLocals(entry ssa.BasicBlock) { + localCount := wasm.Index(len(c.wasmFunctionTyp.Params)) + for i, typ := range c.wasmFunctionLocalTypes { + st := WasmTypeToSSAType(typ) + variable := c.ssaBuilder.DeclareVariable(st) + c.setWasmLocalVariable(wasm.Index(i)+localCount, variable) + + zeroInst := c.ssaBuilder.AllocateInstruction() + switch st { + case ssa.TypeI32: + zeroInst.AsIconst32(0) + case ssa.TypeI64: + zeroInst.AsIconst64(0) + case ssa.TypeF32: + zeroInst.AsF32const(0) + case ssa.TypeF64: + zeroInst.AsF64const(0) + case ssa.TypeV128: + zeroInst.AsVconst(0, 0) + default: + panic("TODO: " + wasm.ValueTypeName(typ)) + } + + c.ssaBuilder.InsertInstruction(zeroInst) + value := zeroInst.Return() + c.ssaBuilder.DefineVariable(variable, value, entry) + } +} + +func (c *Compiler) declareNecessaryVariables() { + if c.needMemory = c.m.MemorySection != nil; c.needMemory { + c.memoryShared = c.m.MemorySection.IsShared + } else if c.needMemory = c.m.ImportMemoryCount > 0; c.needMemory { + for _, imp := range c.m.ImportSection { + if imp.Type == wasm.ExternTypeMemory { + c.memoryShared = imp.DescMem.IsShared + break + } + } + } + + if c.needMemory { + c.memoryBaseVariable = c.ssaBuilder.DeclareVariable(ssa.TypeI64) + c.memoryLenVariable = c.ssaBuilder.DeclareVariable(ssa.TypeI64) + } + + c.globalVariables = c.globalVariables[:0] + c.mutableGlobalVariablesIndexes = c.mutableGlobalVariablesIndexes[:0] + c.globalVariablesTypes = c.globalVariablesTypes[:0] + for _, imp := range c.m.ImportSection { + if imp.Type == wasm.ExternTypeGlobal { + desc := imp.DescGlobal + c.declareWasmGlobal(desc.ValType, desc.Mutable) + } + } + for _, g := range c.m.GlobalSection { + desc := g.Type + c.declareWasmGlobal(desc.ValType, desc.Mutable) + } + + // TODO: add tables. +} + +func (c *Compiler) declareWasmGlobal(typ wasm.ValueType, mutable bool) { + var st ssa.Type + switch typ { + case wasm.ValueTypeI32: + st = ssa.TypeI32 + case wasm.ValueTypeI64, + // Both externref and funcref are represented as I64 since we only support 64-bit platforms. 
+ wasm.ValueTypeExternref, wasm.ValueTypeFuncref: + st = ssa.TypeI64 + case wasm.ValueTypeF32: + st = ssa.TypeF32 + case wasm.ValueTypeF64: + st = ssa.TypeF64 + case wasm.ValueTypeV128: + st = ssa.TypeV128 + default: + panic("TODO: " + wasm.ValueTypeName(typ)) + } + v := c.ssaBuilder.DeclareVariable(st) + index := wasm.Index(len(c.globalVariables)) + c.globalVariables = append(c.globalVariables, v) + c.globalVariablesTypes = append(c.globalVariablesTypes, st) + if mutable { + c.mutableGlobalVariablesIndexes = append(c.mutableGlobalVariablesIndexes, index) + } +} + +// WasmTypeToSSAType converts wasm.ValueType to ssa.Type. +func WasmTypeToSSAType(vt wasm.ValueType) ssa.Type { + switch vt { + case wasm.ValueTypeI32: + return ssa.TypeI32 + case wasm.ValueTypeI64, + // Both externref and funcref are represented as I64 since we only support 64-bit platforms. + wasm.ValueTypeExternref, wasm.ValueTypeFuncref: + return ssa.TypeI64 + case wasm.ValueTypeF32: + return ssa.TypeF32 + case wasm.ValueTypeF64: + return ssa.TypeF64 + case wasm.ValueTypeV128: + return ssa.TypeV128 + default: + panic("TODO: " + wasm.ValueTypeName(vt)) + } +} + +// addBlockParamsFromWasmTypes adds the block parameters to the given block. +func (c *Compiler) addBlockParamsFromWasmTypes(tps []wasm.ValueType, blk ssa.BasicBlock) { + for _, typ := range tps { + st := WasmTypeToSSAType(typ) + blk.AddParam(c.ssaBuilder, st) + } +} + +// formatBuilder outputs the constructed SSA function as a string with a source information. +func (c *Compiler) formatBuilder() string { + return c.ssaBuilder.Format() +} + +// SignatureForListener returns the signatures for the listener functions. +func SignatureForListener(wasmSig *wasm.FunctionType) (*ssa.Signature, *ssa.Signature) { + beforeSig := &ssa.Signature{} + beforeSig.Params = make([]ssa.Type, len(wasmSig.Params)+2) + beforeSig.Params[0] = ssa.TypeI64 // Execution context. + beforeSig.Params[1] = ssa.TypeI32 // Function index. + for i, p := range wasmSig.Params { + beforeSig.Params[i+2] = WasmTypeToSSAType(p) + } + afterSig := &ssa.Signature{} + afterSig.Params = make([]ssa.Type, len(wasmSig.Results)+2) + afterSig.Params[0] = ssa.TypeI64 // Execution context. + afterSig.Params[1] = ssa.TypeI32 // Function index. + for i, p := range wasmSig.Results { + afterSig.Params[i+2] = WasmTypeToSSAType(p) + } + return beforeSig, afterSig +} + +// isBoundSafe returns true if the given value is known to be safe to access up to the given bound. +func (c *Compiler) getKnownSafeBound(v ssa.ValueID) *knownSafeBound { + if int(v) >= len(c.knownSafeBounds) { + return nil + } + return &c.knownSafeBounds[v] +} + +// recordKnownSafeBound records the given safe bound for the given value. +func (c *Compiler) recordKnownSafeBound(v ssa.ValueID, safeBound uint64, absoluteAddr ssa.Value) { + if int(v) >= len(c.knownSafeBounds) { + c.knownSafeBounds = append(c.knownSafeBounds, make([]knownSafeBound, v+1)...) + } + + if exiting := c.knownSafeBounds[v]; exiting.bound == 0 { + c.knownSafeBounds[v] = knownSafeBound{ + bound: safeBound, + absoluteAddr: absoluteAddr, + } + c.knownSafeBoundsSet = append(c.knownSafeBoundsSet, v) + } else if safeBound > exiting.bound { + c.knownSafeBounds[v].bound = safeBound + } +} + +// clearSafeBounds clears the known safe bounds. 
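+// Only the entries recorded in knownSafeBoundsSet are touched, so the cost is proportional
+// to the number of bounds recorded since the previous clear rather than to len(knownSafeBounds).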
+func (c *Compiler) clearSafeBounds() { + for _, v := range c.knownSafeBoundsSet { + ptr := &c.knownSafeBounds[v] + ptr.bound = 0 + ptr.absoluteAddr = ssa.ValueInvalid + } + c.knownSafeBoundsSet = c.knownSafeBoundsSet[:0] +} + +// resetAbsoluteAddressInSafeBounds resets the absolute addresses recorded in the known safe bounds. +func (c *Compiler) resetAbsoluteAddressInSafeBounds() { + for _, v := range c.knownSafeBoundsSet { + ptr := &c.knownSafeBounds[v] + ptr.absoluteAddr = ssa.ValueInvalid + } +} + +func (k *knownSafeBound) valid() bool { + return k != nil && k.bound > 0 +} + +func (c *Compiler) allocateVarLengthValues(_cap int, vs ...ssa.Value) ssa.Values { + builder := c.ssaBuilder + pool := builder.VarLengthPool() + args := pool.Allocate(_cap) + args = args.Append(builder.VarLengthPool(), vs...) + return args +} + +func (c *Compiler) finalizeKnownSafeBoundsAtTheEndOfBlock(bID ssa.BasicBlockID) { + _bID := int(bID) + if l := len(c.knownSafeBoundsAtTheEndOfBlocks); _bID >= l { + c.knownSafeBoundsAtTheEndOfBlocks = append(c.knownSafeBoundsAtTheEndOfBlocks, + make([]knownSafeBoundsAtTheEndOfBlock, _bID+1-len(c.knownSafeBoundsAtTheEndOfBlocks))...) + for i := l; i < len(c.knownSafeBoundsAtTheEndOfBlocks); i++ { + c.knownSafeBoundsAtTheEndOfBlocks[i] = knownSafeBoundsAtTheEndOfBlockNil + } + } + p := &c.varLengthKnownSafeBoundWithIDPool + size := len(c.knownSafeBoundsSet) + allocated := c.varLengthKnownSafeBoundWithIDPool.Allocate(size) + // Sort the known safe bounds by the value ID so that we can use the intersection algorithm in initializeCurrentBlockKnownBounds. + sortSSAValueIDs(c.knownSafeBoundsSet) + for _, vID := range c.knownSafeBoundsSet { + kb := c.knownSafeBounds[vID] + allocated = allocated.Append(p, knownSafeBoundWithID{ + knownSafeBound: kb, + id: vID, + }) + } + c.knownSafeBoundsAtTheEndOfBlocks[bID] = allocated + c.clearSafeBounds() +} + +func (c *Compiler) initializeCurrentBlockKnownBounds() { + currentBlk := c.ssaBuilder.CurrentBlock() + switch preds := currentBlk.Preds(); preds { + case 0: + case 1: + pred := currentBlk.Pred(0).ID() + for _, kb := range c.getKnownSafeBoundsAtTheEndOfBlocks(pred).View() { + // Unless the block is sealed, we cannot assume the absolute address is valid: + // later we might add another predecessor that has no visibility of that value. + addr := ssa.ValueInvalid + if currentBlk.Sealed() { + addr = kb.absoluteAddr + } + c.recordKnownSafeBound(kb.id, kb.bound, addr) + } + default: + c.pointers = c.pointers[:0] + c.bounds = c.bounds[:0] + for i := 0; i < preds; i++ { + c.bounds = append(c.bounds, c.getKnownSafeBoundsAtTheEndOfBlocks(currentBlk.Pred(i).ID()).View()) + c.pointers = append(c.pointers, 0) + } + + // If there are multiple predecessors, we need to find the intersection of the known safe bounds. + + outer: + for { + smallestID := ssa.ValueID(math.MaxUint32) + for i, ptr := range c.pointers { + if ptr >= len(c.bounds[i]) { + break outer + } + cb := &c.bounds[i][ptr] + if id := cb.id; id < smallestID { + smallestID = cb.id + } + } + + // Check if current elements are the same across all lists. + same := true + minBound := uint64(math.MaxUint64) + for i := 0; i < preds; i++ { + cb := &c.bounds[i][c.pointers[i]] + if cb.id != smallestID { + same = false + break + } else { + if cb.bound < minBound { + minBound = cb.bound + } + } + } + + if same { // All elements are the same. + // Absolute address cannot be used in the intersection since the value might be only defined in one of the predecessors. 
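+ //
+ // Tiny worked example (illustrative IDs and bounds): if one predecessor ends with
+ // {v1: 16, v3: 8} and another with {v1: 4, v2: 32}, only v1 survives the merge,
+ // with bound min(16, 4) = 4.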
+ c.recordKnownSafeBound(smallestID, minBound, ssa.ValueInvalid) + } + + // Move pointer(s) for the smallest ID forward (if same, move all). + for i := 0; i < preds; i++ { + cb := &c.bounds[i][c.pointers[i]] + if cb.id == smallestID { + c.pointers[i]++ + } + } + } + } +} + +func (c *Compiler) getKnownSafeBoundsAtTheEndOfBlocks(id ssa.BasicBlockID) knownSafeBoundsAtTheEndOfBlock { + if int(id) >= len(c.knownSafeBoundsAtTheEndOfBlocks) { + return knownSafeBoundsAtTheEndOfBlockNil + } + return c.knownSafeBoundsAtTheEndOfBlocks[id] +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/lower.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/lower.go new file mode 100644 index 000000000..5096a6365 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/lower.go @@ -0,0 +1,4268 @@ +package frontend + +import ( + "encoding/binary" + "fmt" + "math" + "runtime" + "strings" + + "github.com/tetratelabs/wazero/api" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" + "github.com/tetratelabs/wazero/internal/leb128" + "github.com/tetratelabs/wazero/internal/wasm" +) + +type ( + // loweringState is used to keep the state of lowering. + loweringState struct { + // values holds the values on the Wasm stack. + values []ssa.Value + controlFrames []controlFrame + unreachable bool + unreachableDepth int + tmpForBrTable []uint32 + pc int + } + controlFrame struct { + kind controlFrameKind + // originalStackLen holds the number of values on the Wasm stack + // when start executing this control frame minus params for the block. + originalStackLenWithoutParam int + // blk is the loop header if this is loop, and is the else-block if this is an if frame. + blk, + // followingBlock is the basic block we enter if we reach "end" of block. + followingBlock ssa.BasicBlock + blockType *wasm.FunctionType + // clonedArgs hold the arguments to Else block. + clonedArgs ssa.Values + } + + controlFrameKind byte +) + +// String implements fmt.Stringer for debugging. +func (l *loweringState) String() string { + var str []string + for _, v := range l.values { + str = append(str, fmt.Sprintf("v%v", v.ID())) + } + var frames []string + for i := range l.controlFrames { + frames = append(frames, l.controlFrames[i].kind.String()) + } + return fmt.Sprintf("\n\tunreachable=%v(depth=%d)\n\tstack: %s\n\tcontrol frames: %s", + l.unreachable, l.unreachableDepth, + strings.Join(str, ", "), + strings.Join(frames, ", "), + ) +} + +const ( + controlFrameKindFunction = iota + 1 + controlFrameKindLoop + controlFrameKindIfWithElse + controlFrameKindIfWithoutElse + controlFrameKindBlock +) + +// String implements fmt.Stringer for debugging. +func (k controlFrameKind) String() string { + switch k { + case controlFrameKindFunction: + return "function" + case controlFrameKindLoop: + return "loop" + case controlFrameKindIfWithElse: + return "if_with_else" + case controlFrameKindIfWithoutElse: + return "if_without_else" + case controlFrameKindBlock: + return "block" + default: + panic(k) + } +} + +// isLoop returns true if this is a loop frame. +func (ctrl *controlFrame) isLoop() bool { + return ctrl.kind == controlFrameKindLoop +} + +// reset resets the state of loweringState for reuse. 
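+// The backing slices are truncated rather than reallocated, so their capacity is reused
+// across functions.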
+func (l *loweringState) reset() { + l.values = l.values[:0] + l.controlFrames = l.controlFrames[:0] + l.pc = 0 + l.unreachable = false + l.unreachableDepth = 0 +} + +func (l *loweringState) peek() (ret ssa.Value) { + tail := len(l.values) - 1 + return l.values[tail] +} + +func (l *loweringState) pop() (ret ssa.Value) { + tail := len(l.values) - 1 + ret = l.values[tail] + l.values = l.values[:tail] + return +} + +func (l *loweringState) push(ret ssa.Value) { + l.values = append(l.values, ret) +} + +func (c *Compiler) nPeekDup(n int) ssa.Values { + if n == 0 { + return ssa.ValuesNil + } + + l := c.state() + tail := len(l.values) + + args := c.allocateVarLengthValues(n) + args = args.Append(c.ssaBuilder.VarLengthPool(), l.values[tail-n:tail]...) + return args +} + +func (l *loweringState) ctrlPop() (ret controlFrame) { + tail := len(l.controlFrames) - 1 + ret = l.controlFrames[tail] + l.controlFrames = l.controlFrames[:tail] + return +} + +func (l *loweringState) ctrlPush(ret controlFrame) { + l.controlFrames = append(l.controlFrames, ret) +} + +func (l *loweringState) ctrlPeekAt(n int) (ret *controlFrame) { + tail := len(l.controlFrames) - 1 + return &l.controlFrames[tail-n] +} + +// lowerBody lowers the body of the Wasm function to the SSA form. +func (c *Compiler) lowerBody(entryBlk ssa.BasicBlock) { + c.ssaBuilder.Seal(entryBlk) + + if c.needListener { + c.callListenerBefore() + } + + // Pushes the empty control frame which corresponds to the function return. + c.loweringState.ctrlPush(controlFrame{ + kind: controlFrameKindFunction, + blockType: c.wasmFunctionTyp, + followingBlock: c.ssaBuilder.ReturnBlock(), + }) + + for c.loweringState.pc < len(c.wasmFunctionBody) { + blkBeforeLowering := c.ssaBuilder.CurrentBlock() + c.lowerCurrentOpcode() + blkAfterLowering := c.ssaBuilder.CurrentBlock() + if blkBeforeLowering != blkAfterLowering { + // In Wasm, once a block exits, that means we've done compiling the block. + // Therefore, we finalize the known bounds at the end of the block for the exiting block. + c.finalizeKnownSafeBoundsAtTheEndOfBlock(blkBeforeLowering.ID()) + // After that, we initialize the known bounds for the new compilation target block. + c.initializeCurrentBlockKnownBounds() + } + } +} + +func (c *Compiler) state() *loweringState { + return &c.loweringState +} + +func (c *Compiler) lowerCurrentOpcode() { + op := c.wasmFunctionBody[c.loweringState.pc] + + if c.needSourceOffsetInfo { + c.ssaBuilder.SetCurrentSourceOffset( + ssa.SourceOffset(c.loweringState.pc) + ssa.SourceOffset(c.wasmFunctionBodyOffsetInCodeSection), + ) + } + + builder := c.ssaBuilder + state := c.state() + switch op { + case wasm.OpcodeI32Const: + c := c.readI32s() + if state.unreachable { + break + } + + iconst := builder.AllocateInstruction().AsIconst32(uint32(c)).Insert(builder) + value := iconst.Return() + state.push(value) + case wasm.OpcodeI64Const: + c := c.readI64s() + if state.unreachable { + break + } + iconst := builder.AllocateInstruction().AsIconst64(uint64(c)).Insert(builder) + value := iconst.Return() + state.push(value) + case wasm.OpcodeF32Const: + f32 := c.readF32() + if state.unreachable { + break + } + f32const := builder.AllocateInstruction(). + AsF32const(f32). + Insert(builder). + Return() + state.push(f32const) + case wasm.OpcodeF64Const: + f64 := c.readF64() + if state.unreachable { + break + } + f64const := builder.AllocateInstruction(). + AsF64const(f64). + Insert(builder). 
+ Return() + state.push(f64const) + case wasm.OpcodeI32Add, wasm.OpcodeI64Add: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + iadd := builder.AllocateInstruction() + iadd.AsIadd(x, y) + builder.InsertInstruction(iadd) + value := iadd.Return() + state.push(value) + case wasm.OpcodeI32Sub, wasm.OpcodeI64Sub: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + isub := builder.AllocateInstruction() + isub.AsIsub(x, y) + builder.InsertInstruction(isub) + value := isub.Return() + state.push(value) + case wasm.OpcodeF32Add, wasm.OpcodeF64Add: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + iadd := builder.AllocateInstruction() + iadd.AsFadd(x, y) + builder.InsertInstruction(iadd) + value := iadd.Return() + state.push(value) + case wasm.OpcodeI32Mul, wasm.OpcodeI64Mul: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + imul := builder.AllocateInstruction() + imul.AsImul(x, y) + builder.InsertInstruction(imul) + value := imul.Return() + state.push(value) + case wasm.OpcodeF32Sub, wasm.OpcodeF64Sub: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + isub := builder.AllocateInstruction() + isub.AsFsub(x, y) + builder.InsertInstruction(isub) + value := isub.Return() + state.push(value) + case wasm.OpcodeF32Mul, wasm.OpcodeF64Mul: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + isub := builder.AllocateInstruction() + isub.AsFmul(x, y) + builder.InsertInstruction(isub) + value := isub.Return() + state.push(value) + case wasm.OpcodeF32Div, wasm.OpcodeF64Div: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + isub := builder.AllocateInstruction() + isub.AsFdiv(x, y) + builder.InsertInstruction(isub) + value := isub.Return() + state.push(value) + case wasm.OpcodeF32Max, wasm.OpcodeF64Max: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + isub := builder.AllocateInstruction() + isub.AsFmax(x, y) + builder.InsertInstruction(isub) + value := isub.Return() + state.push(value) + case wasm.OpcodeF32Min, wasm.OpcodeF64Min: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + isub := builder.AllocateInstruction() + isub.AsFmin(x, y) + builder.InsertInstruction(isub) + value := isub.Return() + state.push(value) + case wasm.OpcodeI64Extend8S: + if state.unreachable { + break + } + c.insertIntegerExtend(true, 8, 64) + case wasm.OpcodeI64Extend16S: + if state.unreachable { + break + } + c.insertIntegerExtend(true, 16, 64) + case wasm.OpcodeI64Extend32S, wasm.OpcodeI64ExtendI32S: + if state.unreachable { + break + } + c.insertIntegerExtend(true, 32, 64) + case wasm.OpcodeI64ExtendI32U: + if state.unreachable { + break + } + c.insertIntegerExtend(false, 32, 64) + case wasm.OpcodeI32Extend8S: + if state.unreachable { + break + } + c.insertIntegerExtend(true, 8, 32) + case wasm.OpcodeI32Extend16S: + if state.unreachable { + break + } + c.insertIntegerExtend(true, 16, 32) + case wasm.OpcodeI32Eqz, wasm.OpcodeI64Eqz: + if state.unreachable { + break + } + x := state.pop() + zero := builder.AllocateInstruction() + if op == wasm.OpcodeI32Eqz { + zero.AsIconst32(0) + } else { + zero.AsIconst64(0) + } + builder.InsertInstruction(zero) + icmp := builder.AllocateInstruction(). + AsIcmp(x, zero.Return(), ssa.IntegerCmpCondEqual). + Insert(builder). 
+ Return() + state.push(icmp) + case wasm.OpcodeI32Eq, wasm.OpcodeI64Eq: + if state.unreachable { + break + } + c.insertIcmp(ssa.IntegerCmpCondEqual) + case wasm.OpcodeI32Ne, wasm.OpcodeI64Ne: + if state.unreachable { + break + } + c.insertIcmp(ssa.IntegerCmpCondNotEqual) + case wasm.OpcodeI32LtS, wasm.OpcodeI64LtS: + if state.unreachable { + break + } + c.insertIcmp(ssa.IntegerCmpCondSignedLessThan) + case wasm.OpcodeI32LtU, wasm.OpcodeI64LtU: + if state.unreachable { + break + } + c.insertIcmp(ssa.IntegerCmpCondUnsignedLessThan) + case wasm.OpcodeI32GtS, wasm.OpcodeI64GtS: + if state.unreachable { + break + } + c.insertIcmp(ssa.IntegerCmpCondSignedGreaterThan) + case wasm.OpcodeI32GtU, wasm.OpcodeI64GtU: + if state.unreachable { + break + } + c.insertIcmp(ssa.IntegerCmpCondUnsignedGreaterThan) + case wasm.OpcodeI32LeS, wasm.OpcodeI64LeS: + if state.unreachable { + break + } + c.insertIcmp(ssa.IntegerCmpCondSignedLessThanOrEqual) + case wasm.OpcodeI32LeU, wasm.OpcodeI64LeU: + if state.unreachable { + break + } + c.insertIcmp(ssa.IntegerCmpCondUnsignedLessThanOrEqual) + case wasm.OpcodeI32GeS, wasm.OpcodeI64GeS: + if state.unreachable { + break + } + c.insertIcmp(ssa.IntegerCmpCondSignedGreaterThanOrEqual) + case wasm.OpcodeI32GeU, wasm.OpcodeI64GeU: + if state.unreachable { + break + } + c.insertIcmp(ssa.IntegerCmpCondUnsignedGreaterThanOrEqual) + + case wasm.OpcodeF32Eq, wasm.OpcodeF64Eq: + if state.unreachable { + break + } + c.insertFcmp(ssa.FloatCmpCondEqual) + case wasm.OpcodeF32Ne, wasm.OpcodeF64Ne: + if state.unreachable { + break + } + c.insertFcmp(ssa.FloatCmpCondNotEqual) + case wasm.OpcodeF32Lt, wasm.OpcodeF64Lt: + if state.unreachable { + break + } + c.insertFcmp(ssa.FloatCmpCondLessThan) + case wasm.OpcodeF32Gt, wasm.OpcodeF64Gt: + if state.unreachable { + break + } + c.insertFcmp(ssa.FloatCmpCondGreaterThan) + case wasm.OpcodeF32Le, wasm.OpcodeF64Le: + if state.unreachable { + break + } + c.insertFcmp(ssa.FloatCmpCondLessThanOrEqual) + case wasm.OpcodeF32Ge, wasm.OpcodeF64Ge: + if state.unreachable { + break + } + c.insertFcmp(ssa.FloatCmpCondGreaterThanOrEqual) + case wasm.OpcodeF32Neg, wasm.OpcodeF64Neg: + if state.unreachable { + break + } + x := state.pop() + v := builder.AllocateInstruction().AsFneg(x).Insert(builder).Return() + state.push(v) + case wasm.OpcodeF32Sqrt, wasm.OpcodeF64Sqrt: + if state.unreachable { + break + } + x := state.pop() + v := builder.AllocateInstruction().AsSqrt(x).Insert(builder).Return() + state.push(v) + case wasm.OpcodeF32Abs, wasm.OpcodeF64Abs: + if state.unreachable { + break + } + x := state.pop() + v := builder.AllocateInstruction().AsFabs(x).Insert(builder).Return() + state.push(v) + case wasm.OpcodeF32Copysign, wasm.OpcodeF64Copysign: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + v := builder.AllocateInstruction().AsFcopysign(x, y).Insert(builder).Return() + state.push(v) + + case wasm.OpcodeF32Ceil, wasm.OpcodeF64Ceil: + if state.unreachable { + break + } + x := state.pop() + v := builder.AllocateInstruction().AsCeil(x).Insert(builder).Return() + state.push(v) + case wasm.OpcodeF32Floor, wasm.OpcodeF64Floor: + if state.unreachable { + break + } + x := state.pop() + v := builder.AllocateInstruction().AsFloor(x).Insert(builder).Return() + state.push(v) + case wasm.OpcodeF32Trunc, wasm.OpcodeF64Trunc: + if state.unreachable { + break + } + x := state.pop() + v := builder.AllocateInstruction().AsTrunc(x).Insert(builder).Return() + state.push(v) + case wasm.OpcodeF32Nearest, wasm.OpcodeF64Nearest: + if 
state.unreachable { + break + } + x := state.pop() + v := builder.AllocateInstruction().AsNearest(x).Insert(builder).Return() + state.push(v) + case wasm.OpcodeI64TruncF64S, wasm.OpcodeI64TruncF32S, + wasm.OpcodeI32TruncF64S, wasm.OpcodeI32TruncF32S, + wasm.OpcodeI64TruncF64U, wasm.OpcodeI64TruncF32U, + wasm.OpcodeI32TruncF64U, wasm.OpcodeI32TruncF32U: + if state.unreachable { + break + } + ret := builder.AllocateInstruction().AsFcvtToInt( + state.pop(), + c.execCtxPtrValue, + op == wasm.OpcodeI64TruncF64S || op == wasm.OpcodeI64TruncF32S || op == wasm.OpcodeI32TruncF32S || op == wasm.OpcodeI32TruncF64S, + op == wasm.OpcodeI64TruncF64S || op == wasm.OpcodeI64TruncF32S || op == wasm.OpcodeI64TruncF64U || op == wasm.OpcodeI64TruncF32U, + false, + ).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeMiscPrefix: + state.pc++ + // A misc opcode is encoded as an unsigned variable 32-bit integer. + miscOpUint, num, err := leb128.LoadUint32(c.wasmFunctionBody[state.pc:]) + if err != nil { + // In normal conditions this should never happen because the function has passed validation. + panic(fmt.Sprintf("failed to read misc opcode: %v", err)) + } + state.pc += int(num - 1) + miscOp := wasm.OpcodeMisc(miscOpUint) + switch miscOp { + case wasm.OpcodeMiscI64TruncSatF64S, wasm.OpcodeMiscI64TruncSatF32S, + wasm.OpcodeMiscI32TruncSatF64S, wasm.OpcodeMiscI32TruncSatF32S, + wasm.OpcodeMiscI64TruncSatF64U, wasm.OpcodeMiscI64TruncSatF32U, + wasm.OpcodeMiscI32TruncSatF64U, wasm.OpcodeMiscI32TruncSatF32U: + if state.unreachable { + break + } + ret := builder.AllocateInstruction().AsFcvtToInt( + state.pop(), + c.execCtxPtrValue, + miscOp == wasm.OpcodeMiscI64TruncSatF64S || miscOp == wasm.OpcodeMiscI64TruncSatF32S || miscOp == wasm.OpcodeMiscI32TruncSatF32S || miscOp == wasm.OpcodeMiscI32TruncSatF64S, + miscOp == wasm.OpcodeMiscI64TruncSatF64S || miscOp == wasm.OpcodeMiscI64TruncSatF32S || miscOp == wasm.OpcodeMiscI64TruncSatF64U || miscOp == wasm.OpcodeMiscI64TruncSatF32U, + true, + ).Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeMiscTableSize: + tableIndex := c.readI32u() + if state.unreachable { + break + } + + // Load the table. + loadTableInstancePtr := builder.AllocateInstruction() + loadTableInstancePtr.AsLoad(c.moduleCtxPtrValue, c.offset.TableOffset(int(tableIndex)).U32(), ssa.TypeI64) + builder.InsertInstruction(loadTableInstancePtr) + tableInstancePtr := loadTableInstancePtr.Return() + + // Load the table's length. + loadTableLen := builder.AllocateInstruction(). + AsLoad(tableInstancePtr, tableInstanceLenOffset, ssa.TypeI32). + Insert(builder) + state.push(loadTableLen.Return()) + + case wasm.OpcodeMiscTableGrow: + tableIndex := c.readI32u() + if state.unreachable { + break + } + + c.storeCallerModuleContext() + + tableIndexVal := builder.AllocateInstruction().AsIconst32(tableIndex).Insert(builder).Return() + + num := state.pop() + r := state.pop() + + tableGrowPtr := builder.AllocateInstruction(). + AsLoad(c.execCtxPtrValue, + wazevoapi.ExecutionContextOffsetTableGrowTrampolineAddress.U32(), + ssa.TypeI64, + ).Insert(builder).Return() + + args := c.allocateVarLengthValues(4, c.execCtxPtrValue, tableIndexVal, num, r) + callGrowRet := builder. + AllocateInstruction(). + AsCallIndirect(tableGrowPtr, &c.tableGrowSig, args). + Insert(builder).Return() + state.push(callGrowRet) + + case wasm.OpcodeMiscTableCopy: + dstTableIndex := c.readI32u() + srcTableIndex := c.readI32u() + if state.unreachable { + break + } + + copySize := builder. 
+ AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + srcOffset := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + dstOffset := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + + // Out of bounds check. + dstTableInstancePtr := c.boundsCheckInTable(dstTableIndex, dstOffset, copySize) + srcTableInstancePtr := c.boundsCheckInTable(srcTableIndex, srcOffset, copySize) + + dstTableBaseAddr := c.loadTableBaseAddr(dstTableInstancePtr) + srcTableBaseAddr := c.loadTableBaseAddr(srcTableInstancePtr) + + three := builder.AllocateInstruction().AsIconst64(3).Insert(builder).Return() + + dstOffsetInBytes := builder.AllocateInstruction().AsIshl(dstOffset, three).Insert(builder).Return() + dstAddr := builder.AllocateInstruction().AsIadd(dstTableBaseAddr, dstOffsetInBytes).Insert(builder).Return() + srcOffsetInBytes := builder.AllocateInstruction().AsIshl(srcOffset, three).Insert(builder).Return() + srcAddr := builder.AllocateInstruction().AsIadd(srcTableBaseAddr, srcOffsetInBytes).Insert(builder).Return() + + copySizeInBytes := builder.AllocateInstruction().AsIshl(copySize, three).Insert(builder).Return() + c.callMemmove(dstAddr, srcAddr, copySizeInBytes) + + case wasm.OpcodeMiscMemoryCopy: + state.pc += 2 // +2 to skip two memory indexes which are fixed to zero. + if state.unreachable { + break + } + + copySize := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + srcOffset := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + dstOffset := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + + // Out of bounds check. + memLen := c.getMemoryLenValue(false) + c.boundsCheckInMemory(memLen, dstOffset, copySize) + c.boundsCheckInMemory(memLen, srcOffset, copySize) + + memBase := c.getMemoryBaseValue(false) + dstAddr := builder.AllocateInstruction().AsIadd(memBase, dstOffset).Insert(builder).Return() + srcAddr := builder.AllocateInstruction().AsIadd(memBase, srcOffset).Insert(builder).Return() + + c.callMemmove(dstAddr, srcAddr, copySize) + + case wasm.OpcodeMiscTableFill: + tableIndex := c.readI32u() + if state.unreachable { + break + } + fillSize := state.pop() + value := state.pop() + offset := state.pop() + + fillSizeExt := builder. + AllocateInstruction().AsUExtend(fillSize, 32, 64).Insert(builder).Return() + offsetExt := builder. + AllocateInstruction().AsUExtend(offset, 32, 64).Insert(builder).Return() + tableInstancePtr := c.boundsCheckInTable(tableIndex, offsetExt, fillSizeExt) + + three := builder.AllocateInstruction().AsIconst64(3).Insert(builder).Return() + offsetInBytes := builder.AllocateInstruction().AsIshl(offsetExt, three).Insert(builder).Return() + fillSizeInBytes := builder.AllocateInstruction().AsIshl(fillSizeExt, three).Insert(builder).Return() + + // Calculate the base address of the table. + tableBaseAddr := c.loadTableBaseAddr(tableInstancePtr) + addr := builder.AllocateInstruction().AsIadd(tableBaseAddr, offsetInBytes).Insert(builder).Return() + + // Prepare the loop and following block. + beforeLoop := builder.AllocateBasicBlock() + loopBlk := builder.AllocateBasicBlock() + loopVar := loopBlk.AddParam(builder, ssa.TypeI64) + followingBlk := builder.AllocateBasicBlock() + + // Uses the copy trick for faster filling buffer like memory.fill, but in this case we copy 8 bytes at a time. 
+ // buf := memoryInst.Buffer[offset : offset+fillSize] + // buf[0:8] = value + // for i := 8; i < fillSize; i *= 2 { Begin with 8 bytes. + // copy(buf[i:], buf[:i]) + // } + + // Insert the jump to the beforeLoop block; If the fillSize is zero, then jump to the following block to skip entire logics. + zero := builder.AllocateInstruction().AsIconst64(0).Insert(builder).Return() + ifFillSizeZero := builder.AllocateInstruction().AsIcmp(fillSizeExt, zero, ssa.IntegerCmpCondEqual). + Insert(builder).Return() + builder.AllocateInstruction().AsBrnz(ifFillSizeZero, ssa.ValuesNil, followingBlk).Insert(builder) + c.insertJumpToBlock(ssa.ValuesNil, beforeLoop) + + // buf[0:8] = value + builder.SetCurrentBlock(beforeLoop) + builder.AllocateInstruction().AsStore(ssa.OpcodeStore, value, addr, 0).Insert(builder) + initValue := builder.AllocateInstruction().AsIconst64(8).Insert(builder).Return() + c.insertJumpToBlock(c.allocateVarLengthValues(1, initValue), loopBlk) + + builder.SetCurrentBlock(loopBlk) + dstAddr := builder.AllocateInstruction().AsIadd(addr, loopVar).Insert(builder).Return() + + // If loopVar*2 > fillSizeInBytes, then count must be fillSizeInBytes-loopVar. + var count ssa.Value + { + loopVarDoubled := builder.AllocateInstruction().AsIadd(loopVar, loopVar).Insert(builder).Return() + loopVarDoubledLargerThanFillSize := builder. + AllocateInstruction().AsIcmp(loopVarDoubled, fillSizeInBytes, ssa.IntegerCmpCondUnsignedGreaterThanOrEqual). + Insert(builder).Return() + diff := builder.AllocateInstruction().AsIsub(fillSizeInBytes, loopVar).Insert(builder).Return() + count = builder.AllocateInstruction().AsSelect(loopVarDoubledLargerThanFillSize, diff, loopVar).Insert(builder).Return() + } + + c.callMemmove(dstAddr, addr, count) + + shiftAmount := builder.AllocateInstruction().AsIconst64(1).Insert(builder).Return() + newLoopVar := builder.AllocateInstruction().AsIshl(loopVar, shiftAmount).Insert(builder).Return() + loopVarLessThanFillSize := builder.AllocateInstruction(). + AsIcmp(newLoopVar, fillSizeInBytes, ssa.IntegerCmpCondUnsignedLessThan).Insert(builder).Return() + + builder.AllocateInstruction(). + AsBrnz(loopVarLessThanFillSize, c.allocateVarLengthValues(1, newLoopVar), loopBlk). + Insert(builder) + + c.insertJumpToBlock(ssa.ValuesNil, followingBlk) + builder.SetCurrentBlock(followingBlk) + + builder.Seal(beforeLoop) + builder.Seal(loopBlk) + builder.Seal(followingBlk) + + case wasm.OpcodeMiscMemoryFill: + state.pc++ // Skip the memory index which is fixed to zero. + if state.unreachable { + break + } + + fillSize := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + value := state.pop() + offset := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + + // Out of bounds check. + c.boundsCheckInMemory(c.getMemoryLenValue(false), offset, fillSize) + + // Calculate the base address: + addr := builder.AllocateInstruction().AsIadd(c.getMemoryBaseValue(false), offset).Insert(builder).Return() + + // Uses the copy trick for faster filling buffer: https://gist.github.com/taylorza/df2f89d5f9ab3ffd06865062a4cf015d + // buf := memoryInst.Buffer[offset : offset+fillSize] + // buf[0] = value + // for i := 1; i < fillSize; i *= 2 { + // copy(buf[i:], buf[:i]) + // } + + // Prepare the loop and following block. 
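+ // (For intuition: the doubling loop below needs only O(log2 fillSize) memmove calls; e.g. a
+ // 32-byte fill copies chunks of 1, 2, 4, 8 and 16 bytes after the initial one-byte store.)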
+ beforeLoop := builder.AllocateBasicBlock() + loopBlk := builder.AllocateBasicBlock() + loopVar := loopBlk.AddParam(builder, ssa.TypeI64) + followingBlk := builder.AllocateBasicBlock() + + // Insert the jump to the beforeLoop block; If the fillSize is zero, then jump to the following block to skip entire logics. + zero := builder.AllocateInstruction().AsIconst64(0).Insert(builder).Return() + ifFillSizeZero := builder.AllocateInstruction().AsIcmp(fillSize, zero, ssa.IntegerCmpCondEqual). + Insert(builder).Return() + builder.AllocateInstruction().AsBrnz(ifFillSizeZero, ssa.ValuesNil, followingBlk).Insert(builder) + c.insertJumpToBlock(ssa.ValuesNil, beforeLoop) + + // buf[0] = value + builder.SetCurrentBlock(beforeLoop) + builder.AllocateInstruction().AsStore(ssa.OpcodeIstore8, value, addr, 0).Insert(builder) + initValue := builder.AllocateInstruction().AsIconst64(1).Insert(builder).Return() + c.insertJumpToBlock(c.allocateVarLengthValues(1, initValue), loopBlk) + + builder.SetCurrentBlock(loopBlk) + dstAddr := builder.AllocateInstruction().AsIadd(addr, loopVar).Insert(builder).Return() + + // If loopVar*2 > fillSizeExt, then count must be fillSizeExt-loopVar. + var count ssa.Value + { + loopVarDoubled := builder.AllocateInstruction().AsIadd(loopVar, loopVar).Insert(builder).Return() + loopVarDoubledLargerThanFillSize := builder. + AllocateInstruction().AsIcmp(loopVarDoubled, fillSize, ssa.IntegerCmpCondUnsignedGreaterThanOrEqual). + Insert(builder).Return() + diff := builder.AllocateInstruction().AsIsub(fillSize, loopVar).Insert(builder).Return() + count = builder.AllocateInstruction().AsSelect(loopVarDoubledLargerThanFillSize, diff, loopVar).Insert(builder).Return() + } + + c.callMemmove(dstAddr, addr, count) + + shiftAmount := builder.AllocateInstruction().AsIconst64(1).Insert(builder).Return() + newLoopVar := builder.AllocateInstruction().AsIshl(loopVar, shiftAmount).Insert(builder).Return() + loopVarLessThanFillSize := builder.AllocateInstruction(). + AsIcmp(newLoopVar, fillSize, ssa.IntegerCmpCondUnsignedLessThan).Insert(builder).Return() + + builder.AllocateInstruction(). + AsBrnz(loopVarLessThanFillSize, c.allocateVarLengthValues(1, newLoopVar), loopBlk). + Insert(builder) + + c.insertJumpToBlock(ssa.ValuesNil, followingBlk) + builder.SetCurrentBlock(followingBlk) + + builder.Seal(beforeLoop) + builder.Seal(loopBlk) + builder.Seal(followingBlk) + + case wasm.OpcodeMiscMemoryInit: + index := c.readI32u() + state.pc++ // Skip the memory index which is fixed to zero. + if state.unreachable { + break + } + + copySize := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + offsetInDataInstance := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + offsetInMemory := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + + dataInstPtr := c.dataOrElementInstanceAddr(index, c.offset.DataInstances1stElement) + + // Bounds check. 
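+ // Both the destination range in linear memory and the source range in the data instance are
+ // validated before any copy, so nothing is written if either range is out of bounds.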
+ c.boundsCheckInMemory(c.getMemoryLenValue(false), offsetInMemory, copySize) + c.boundsCheckInDataOrElementInstance(dataInstPtr, offsetInDataInstance, copySize, wazevoapi.ExitCodeMemoryOutOfBounds) + + dataInstBaseAddr := builder.AllocateInstruction().AsLoad(dataInstPtr, 0, ssa.TypeI64).Insert(builder).Return() + srcAddr := builder.AllocateInstruction().AsIadd(dataInstBaseAddr, offsetInDataInstance).Insert(builder).Return() + + memBase := c.getMemoryBaseValue(false) + dstAddr := builder.AllocateInstruction().AsIadd(memBase, offsetInMemory).Insert(builder).Return() + + c.callMemmove(dstAddr, srcAddr, copySize) + + case wasm.OpcodeMiscTableInit: + elemIndex := c.readI32u() + tableIndex := c.readI32u() + if state.unreachable { + break + } + + copySize := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + offsetInElementInstance := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + offsetInTable := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + + elemInstPtr := c.dataOrElementInstanceAddr(elemIndex, c.offset.ElementInstances1stElement) + + // Bounds check. + tableInstancePtr := c.boundsCheckInTable(tableIndex, offsetInTable, copySize) + c.boundsCheckInDataOrElementInstance(elemInstPtr, offsetInElementInstance, copySize, wazevoapi.ExitCodeTableOutOfBounds) + + three := builder.AllocateInstruction().AsIconst64(3).Insert(builder).Return() + // Calculates the destination address in the table. + tableOffsetInBytes := builder.AllocateInstruction().AsIshl(offsetInTable, three).Insert(builder).Return() + tableBaseAddr := c.loadTableBaseAddr(tableInstancePtr) + dstAddr := builder.AllocateInstruction().AsIadd(tableBaseAddr, tableOffsetInBytes).Insert(builder).Return() + + // Calculates the source address in the element instance. + srcOffsetInBytes := builder.AllocateInstruction().AsIshl(offsetInElementInstance, three).Insert(builder).Return() + elemInstBaseAddr := builder.AllocateInstruction().AsLoad(elemInstPtr, 0, ssa.TypeI64).Insert(builder).Return() + srcAddr := builder.AllocateInstruction().AsIadd(elemInstBaseAddr, srcOffsetInBytes).Insert(builder).Return() + + copySizeInBytes := builder.AllocateInstruction().AsIshl(copySize, three).Insert(builder).Return() + c.callMemmove(dstAddr, srcAddr, copySizeInBytes) + + case wasm.OpcodeMiscElemDrop: + index := c.readI32u() + if state.unreachable { + break + } + + c.dropDataOrElementInstance(index, c.offset.ElementInstances1stElement) + + case wasm.OpcodeMiscDataDrop: + index := c.readI32u() + if state.unreachable { + break + } + c.dropDataOrElementInstance(index, c.offset.DataInstances1stElement) + + default: + panic("Unknown MiscOp " + wasm.MiscInstructionName(miscOp)) + } + + case wasm.OpcodeI32ReinterpretF32: + if state.unreachable { + break + } + reinterpret := builder.AllocateInstruction(). + AsBitcast(state.pop(), ssa.TypeI32). + Insert(builder).Return() + state.push(reinterpret) + + case wasm.OpcodeI64ReinterpretF64: + if state.unreachable { + break + } + reinterpret := builder.AllocateInstruction(). + AsBitcast(state.pop(), ssa.TypeI64). + Insert(builder).Return() + state.push(reinterpret) + + case wasm.OpcodeF32ReinterpretI32: + if state.unreachable { + break + } + reinterpret := builder.AllocateInstruction(). + AsBitcast(state.pop(), ssa.TypeF32). + Insert(builder).Return() + state.push(reinterpret) + + case wasm.OpcodeF64ReinterpretI64: + if state.unreachable { + break + } + reinterpret := builder.AllocateInstruction(). 
+ AsBitcast(state.pop(), ssa.TypeF64). + Insert(builder).Return() + state.push(reinterpret) + + case wasm.OpcodeI32DivS, wasm.OpcodeI64DivS: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + result := builder.AllocateInstruction().AsSDiv(x, y, c.execCtxPtrValue).Insert(builder).Return() + state.push(result) + + case wasm.OpcodeI32DivU, wasm.OpcodeI64DivU: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + result := builder.AllocateInstruction().AsUDiv(x, y, c.execCtxPtrValue).Insert(builder).Return() + state.push(result) + + case wasm.OpcodeI32RemS, wasm.OpcodeI64RemS: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + result := builder.AllocateInstruction().AsSRem(x, y, c.execCtxPtrValue).Insert(builder).Return() + state.push(result) + + case wasm.OpcodeI32RemU, wasm.OpcodeI64RemU: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + result := builder.AllocateInstruction().AsURem(x, y, c.execCtxPtrValue).Insert(builder).Return() + state.push(result) + + case wasm.OpcodeI32And, wasm.OpcodeI64And: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + and := builder.AllocateInstruction() + and.AsBand(x, y) + builder.InsertInstruction(and) + value := and.Return() + state.push(value) + case wasm.OpcodeI32Or, wasm.OpcodeI64Or: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + or := builder.AllocateInstruction() + or.AsBor(x, y) + builder.InsertInstruction(or) + value := or.Return() + state.push(value) + case wasm.OpcodeI32Xor, wasm.OpcodeI64Xor: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + xor := builder.AllocateInstruction() + xor.AsBxor(x, y) + builder.InsertInstruction(xor) + value := xor.Return() + state.push(value) + case wasm.OpcodeI32Shl, wasm.OpcodeI64Shl: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + ishl := builder.AllocateInstruction() + ishl.AsIshl(x, y) + builder.InsertInstruction(ishl) + value := ishl.Return() + state.push(value) + case wasm.OpcodeI32ShrU, wasm.OpcodeI64ShrU: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + ishl := builder.AllocateInstruction() + ishl.AsUshr(x, y) + builder.InsertInstruction(ishl) + value := ishl.Return() + state.push(value) + case wasm.OpcodeI32ShrS, wasm.OpcodeI64ShrS: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + ishl := builder.AllocateInstruction() + ishl.AsSshr(x, y) + builder.InsertInstruction(ishl) + value := ishl.Return() + state.push(value) + case wasm.OpcodeI32Rotl, wasm.OpcodeI64Rotl: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + rotl := builder.AllocateInstruction() + rotl.AsRotl(x, y) + builder.InsertInstruction(rotl) + value := rotl.Return() + state.push(value) + case wasm.OpcodeI32Rotr, wasm.OpcodeI64Rotr: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + rotr := builder.AllocateInstruction() + rotr.AsRotr(x, y) + builder.InsertInstruction(rotr) + value := rotr.Return() + state.push(value) + case wasm.OpcodeI32Clz, wasm.OpcodeI64Clz: + if state.unreachable { + break + } + x := state.pop() + clz := builder.AllocateInstruction() + clz.AsClz(x) + builder.InsertInstruction(clz) + value := clz.Return() + state.push(value) + case wasm.OpcodeI32Ctz, wasm.OpcodeI64Ctz: + if state.unreachable { + break + } + x := state.pop() + ctz := builder.AllocateInstruction() + ctz.AsCtz(x) + builder.InsertInstruction(ctz) + value := ctz.Return() + 
state.push(value) + case wasm.OpcodeI32Popcnt, wasm.OpcodeI64Popcnt: + if state.unreachable { + break + } + x := state.pop() + popcnt := builder.AllocateInstruction() + popcnt.AsPopcnt(x) + builder.InsertInstruction(popcnt) + value := popcnt.Return() + state.push(value) + + case wasm.OpcodeI32WrapI64: + if state.unreachable { + break + } + x := state.pop() + wrap := builder.AllocateInstruction().AsIreduce(x, ssa.TypeI32).Insert(builder).Return() + state.push(wrap) + case wasm.OpcodeGlobalGet: + index := c.readI32u() + if state.unreachable { + break + } + v := c.getWasmGlobalValue(index, false) + state.push(v) + case wasm.OpcodeGlobalSet: + index := c.readI32u() + if state.unreachable { + break + } + v := state.pop() + c.setWasmGlobalValue(index, v) + case wasm.OpcodeLocalGet: + index := c.readI32u() + if state.unreachable { + break + } + variable := c.localVariable(index) + if _, ok := c.m.NonStaticLocals[c.wasmLocalFunctionIndex][index]; ok { + state.push(builder.MustFindValue(variable)) + } else { + // If a local is static, we can simply find it in the entry block which is either a function param + // or a zero value. This fast pass helps to avoid the overhead of searching the entire function plus + // avoid adding unnecessary block arguments. + // TODO: I think this optimization should be done in a SSA pass like passRedundantPhiEliminationOpt, + // but somehow there's some corner cases that it fails to optimize. + state.push(builder.MustFindValueInBlk(variable, c.ssaBuilder.EntryBlock())) + } + case wasm.OpcodeLocalSet: + index := c.readI32u() + if state.unreachable { + break + } + variable := c.localVariable(index) + newValue := state.pop() + builder.DefineVariableInCurrentBB(variable, newValue) + + case wasm.OpcodeLocalTee: + index := c.readI32u() + if state.unreachable { + break + } + variable := c.localVariable(index) + newValue := state.peek() + builder.DefineVariableInCurrentBB(variable, newValue) + + case wasm.OpcodeSelect, wasm.OpcodeTypedSelect: + if op == wasm.OpcodeTypedSelect { + state.pc += 2 // ignores the type which is only needed during validation. + } + + if state.unreachable { + break + } + + cond := state.pop() + v2 := state.pop() + v1 := state.pop() + + sl := builder.AllocateInstruction(). + AsSelect(cond, v1, v2). + Insert(builder). + Return() + state.push(sl) + + case wasm.OpcodeMemorySize: + state.pc++ // skips the memory index. + if state.unreachable { + break + } + + var memSizeInBytes ssa.Value + if c.offset.LocalMemoryBegin < 0 { + memInstPtr := builder.AllocateInstruction(). + AsLoad(c.moduleCtxPtrValue, c.offset.ImportedMemoryBegin.U32(), ssa.TypeI64). + Insert(builder). + Return() + + memSizeInBytes = builder.AllocateInstruction(). + AsLoad(memInstPtr, memoryInstanceBufSizeOffset, ssa.TypeI32). + Insert(builder). + Return() + } else { + memSizeInBytes = builder.AllocateInstruction(). + AsLoad(c.moduleCtxPtrValue, c.offset.LocalMemoryLen().U32(), ssa.TypeI32). + Insert(builder). + Return() + } + + amount := builder.AllocateInstruction() + amount.AsIconst32(uint32(wasm.MemoryPageSizeInBits)) + builder.InsertInstruction(amount) + memSize := builder.AllocateInstruction(). + AsUshr(memSizeInBytes, amount.Return()). + Insert(builder). + Return() + state.push(memSize) + + case wasm.OpcodeMemoryGrow: + state.pc++ // skips the memory index. + if state.unreachable { + break + } + + c.storeCallerModuleContext() + + pages := state.pop() + memoryGrowPtr := builder.AllocateInstruction(). 
+ AsLoad(c.execCtxPtrValue, + wazevoapi.ExecutionContextOffsetMemoryGrowTrampolineAddress.U32(), + ssa.TypeI64, + ).Insert(builder).Return() + + args := c.allocateVarLengthValues(1, c.execCtxPtrValue, pages) + callGrowRet := builder. + AllocateInstruction(). + AsCallIndirect(memoryGrowPtr, &c.memoryGrowSig, args). + Insert(builder).Return() + state.push(callGrowRet) + + // After the memory grow, reload the cached memory base and len. + c.reloadMemoryBaseLen() + + case wasm.OpcodeI32Store, + wasm.OpcodeI64Store, + wasm.OpcodeF32Store, + wasm.OpcodeF64Store, + wasm.OpcodeI32Store8, + wasm.OpcodeI32Store16, + wasm.OpcodeI64Store8, + wasm.OpcodeI64Store16, + wasm.OpcodeI64Store32: + + _, offset := c.readMemArg() + if state.unreachable { + break + } + var opSize uint64 + var opcode ssa.Opcode + switch op { + case wasm.OpcodeI32Store, wasm.OpcodeF32Store: + opcode = ssa.OpcodeStore + opSize = 4 + case wasm.OpcodeI64Store, wasm.OpcodeF64Store: + opcode = ssa.OpcodeStore + opSize = 8 + case wasm.OpcodeI32Store8, wasm.OpcodeI64Store8: + opcode = ssa.OpcodeIstore8 + opSize = 1 + case wasm.OpcodeI32Store16, wasm.OpcodeI64Store16: + opcode = ssa.OpcodeIstore16 + opSize = 2 + case wasm.OpcodeI64Store32: + opcode = ssa.OpcodeIstore32 + opSize = 4 + default: + panic("BUG") + } + + value := state.pop() + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), opSize) + builder.AllocateInstruction(). + AsStore(opcode, value, addr, offset). + Insert(builder) + + case wasm.OpcodeI32Load, + wasm.OpcodeI64Load, + wasm.OpcodeF32Load, + wasm.OpcodeF64Load, + wasm.OpcodeI32Load8S, + wasm.OpcodeI32Load8U, + wasm.OpcodeI32Load16S, + wasm.OpcodeI32Load16U, + wasm.OpcodeI64Load8S, + wasm.OpcodeI64Load8U, + wasm.OpcodeI64Load16S, + wasm.OpcodeI64Load16U, + wasm.OpcodeI64Load32S, + wasm.OpcodeI64Load32U: + _, offset := c.readMemArg() + if state.unreachable { + break + } + + var opSize uint64 + switch op { + case wasm.OpcodeI32Load, wasm.OpcodeF32Load: + opSize = 4 + case wasm.OpcodeI64Load, wasm.OpcodeF64Load: + opSize = 8 + case wasm.OpcodeI32Load8S, wasm.OpcodeI32Load8U: + opSize = 1 + case wasm.OpcodeI32Load16S, wasm.OpcodeI32Load16U: + opSize = 2 + case wasm.OpcodeI64Load8S, wasm.OpcodeI64Load8U: + opSize = 1 + case wasm.OpcodeI64Load16S, wasm.OpcodeI64Load16U: + opSize = 2 + case wasm.OpcodeI64Load32S, wasm.OpcodeI64Load32U: + opSize = 4 + default: + panic("BUG") + } + + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), opSize) + load := builder.AllocateInstruction() + switch op { + case wasm.OpcodeI32Load: + load.AsLoad(addr, offset, ssa.TypeI32) + case wasm.OpcodeI64Load: + load.AsLoad(addr, offset, ssa.TypeI64) + case wasm.OpcodeF32Load: + load.AsLoad(addr, offset, ssa.TypeF32) + case wasm.OpcodeF64Load: + load.AsLoad(addr, offset, ssa.TypeF64) + case wasm.OpcodeI32Load8S: + load.AsExtLoad(ssa.OpcodeSload8, addr, offset, false) + case wasm.OpcodeI32Load8U: + load.AsExtLoad(ssa.OpcodeUload8, addr, offset, false) + case wasm.OpcodeI32Load16S: + load.AsExtLoad(ssa.OpcodeSload16, addr, offset, false) + case wasm.OpcodeI32Load16U: + load.AsExtLoad(ssa.OpcodeUload16, addr, offset, false) + case wasm.OpcodeI64Load8S: + load.AsExtLoad(ssa.OpcodeSload8, addr, offset, true) + case wasm.OpcodeI64Load8U: + load.AsExtLoad(ssa.OpcodeUload8, addr, offset, true) + case wasm.OpcodeI64Load16S: + load.AsExtLoad(ssa.OpcodeSload16, addr, offset, true) + case wasm.OpcodeI64Load16U: + load.AsExtLoad(ssa.OpcodeUload16, addr, offset, true) + case wasm.OpcodeI64Load32S: + 
load.AsExtLoad(ssa.OpcodeSload32, addr, offset, true) + case wasm.OpcodeI64Load32U: + load.AsExtLoad(ssa.OpcodeUload32, addr, offset, true) + default: + panic("BUG") + } + builder.InsertInstruction(load) + state.push(load.Return()) + case wasm.OpcodeBlock: + // Note: we do not need to create a BB for this as that would always have only one predecessor + // which is the current BB, and therefore it's always ok to merge them in any way. + + bt := c.readBlockType() + + if state.unreachable { + state.unreachableDepth++ + break + } + + followingBlk := builder.AllocateBasicBlock() + c.addBlockParamsFromWasmTypes(bt.Results, followingBlk) + + state.ctrlPush(controlFrame{ + kind: controlFrameKindBlock, + originalStackLenWithoutParam: len(state.values) - len(bt.Params), + followingBlock: followingBlk, + blockType: bt, + }) + case wasm.OpcodeLoop: + bt := c.readBlockType() + + if state.unreachable { + state.unreachableDepth++ + break + } + + loopHeader, afterLoopBlock := builder.AllocateBasicBlock(), builder.AllocateBasicBlock() + c.addBlockParamsFromWasmTypes(bt.Params, loopHeader) + c.addBlockParamsFromWasmTypes(bt.Results, afterLoopBlock) + + originalLen := len(state.values) - len(bt.Params) + state.ctrlPush(controlFrame{ + originalStackLenWithoutParam: originalLen, + kind: controlFrameKindLoop, + blk: loopHeader, + followingBlock: afterLoopBlock, + blockType: bt, + }) + + args := c.allocateVarLengthValues(originalLen) + args = args.Append(builder.VarLengthPool(), state.values[originalLen:]...) + + // Insert the jump to the header of loop. + br := builder.AllocateInstruction() + br.AsJump(args, loopHeader) + builder.InsertInstruction(br) + + c.switchTo(originalLen, loopHeader) + + if c.ensureTermination { + checkModuleExitCodePtr := builder.AllocateInstruction(). + AsLoad(c.execCtxPtrValue, + wazevoapi.ExecutionContextOffsetCheckModuleExitCodeTrampolineAddress.U32(), + ssa.TypeI64, + ).Insert(builder).Return() + + args := c.allocateVarLengthValues(1, c.execCtxPtrValue) + builder.AllocateInstruction(). + AsCallIndirect(checkModuleExitCodePtr, &c.checkModuleExitCodeSig, args). + Insert(builder) + } + case wasm.OpcodeIf: + bt := c.readBlockType() + + if state.unreachable { + state.unreachableDepth++ + break + } + + v := state.pop() + thenBlk, elseBlk, followingBlk := builder.AllocateBasicBlock(), builder.AllocateBasicBlock(), builder.AllocateBasicBlock() + + // We do not make the Wasm-level block parameters as SSA-level block params for if-else blocks + // since they won't be PHI and the definition is unique. + + // On the other hand, the following block after if-else-end will likely have + // multiple definitions (one in Then and another in Else blocks). + c.addBlockParamsFromWasmTypes(bt.Results, followingBlk) + + args := c.allocateVarLengthValues(len(bt.Params)) + args = args.Append(builder.VarLengthPool(), state.values[len(state.values)-len(bt.Params):]...) + + // Insert the conditional jump to the Else block. + brz := builder.AllocateInstruction() + brz.AsBrz(v, ssa.ValuesNil, elseBlk) + builder.InsertInstruction(brz) + + // Then, insert the jump to the Then block. + br := builder.AllocateInstruction() + br.AsJump(ssa.ValuesNil, thenBlk) + builder.InsertInstruction(br) + + state.ctrlPush(controlFrame{ + kind: controlFrameKindIfWithoutElse, + originalStackLenWithoutParam: len(state.values) - len(bt.Params), + blk: elseBlk, + followingBlock: followingBlk, + blockType: bt, + clonedArgs: args, + }) + + builder.SetCurrentBlock(thenBlk) + + // Then and Else (if exists) have only one predecessor. 
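+ // Sealing a block tells the SSA builder that its set of predecessors is final, so value
+ // lookups that reach it can be resolved immediately (in the style of on-the-fly SSA
+ // construction). The following block is sealed later, when the matching End is reached.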
+ builder.Seal(thenBlk) + builder.Seal(elseBlk) + case wasm.OpcodeElse: + ifctrl := state.ctrlPeekAt(0) + if unreachable := state.unreachable; unreachable && state.unreachableDepth > 0 { + // If it is currently in unreachable and is a nested if, + // we just remove the entire else block. + break + } + + ifctrl.kind = controlFrameKindIfWithElse + if !state.unreachable { + // If this Then block is currently reachable, we have to insert the branching to the following BB. + followingBlk := ifctrl.followingBlock // == the BB after if-then-else. + args := c.nPeekDup(len(ifctrl.blockType.Results)) + c.insertJumpToBlock(args, followingBlk) + } else { + state.unreachable = false + } + + // Reset the stack so that we can correctly handle the else block. + state.values = state.values[:ifctrl.originalStackLenWithoutParam] + elseBlk := ifctrl.blk + for _, arg := range ifctrl.clonedArgs.View() { + state.push(arg) + } + + builder.SetCurrentBlock(elseBlk) + + case wasm.OpcodeEnd: + if state.unreachableDepth > 0 { + state.unreachableDepth-- + break + } + + ctrl := state.ctrlPop() + followingBlk := ctrl.followingBlock + + unreachable := state.unreachable + if !unreachable { + // Top n-th args will be used as a result of the current control frame. + args := c.nPeekDup(len(ctrl.blockType.Results)) + + // Insert the unconditional branch to the target. + c.insertJumpToBlock(args, followingBlk) + } else { // recover from the unreachable state. + state.unreachable = false + } + + switch ctrl.kind { + case controlFrameKindFunction: + break // This is the very end of function. + case controlFrameKindLoop: + // Loop header block can be reached from any br/br_table contained in the loop, + // so now that we've reached End of it, we can seal it. + builder.Seal(ctrl.blk) + case controlFrameKindIfWithoutElse: + // If this is the end of Then block, we have to emit the empty Else block. + elseBlk := ctrl.blk + builder.SetCurrentBlock(elseBlk) + c.insertJumpToBlock(ctrl.clonedArgs, followingBlk) + } + + builder.Seal(followingBlk) + + // Ready to start translating the following block. + c.switchTo(ctrl.originalStackLenWithoutParam, followingBlk) + + case wasm.OpcodeBr: + labelIndex := c.readI32u() + if state.unreachable { + break + } + + targetBlk, argNum := state.brTargetArgNumFor(labelIndex) + args := c.nPeekDup(argNum) + c.insertJumpToBlock(args, targetBlk) + + state.unreachable = true + + case wasm.OpcodeBrIf: + labelIndex := c.readI32u() + if state.unreachable { + break + } + + v := state.pop() + + targetBlk, argNum := state.brTargetArgNumFor(labelIndex) + args := c.nPeekDup(argNum) + var sealTargetBlk bool + if c.needListener && targetBlk.ReturnBlock() { // In this case, we have to call the listener before returning. + // Save the currently active block. + current := builder.CurrentBlock() + + // Allocate the trampoline block to the return where we call the listener. + targetBlk = builder.AllocateBasicBlock() + builder.SetCurrentBlock(targetBlk) + sealTargetBlk = true + + c.callListenerAfter() + + instr := builder.AllocateInstruction() + instr.AsReturn(args) + builder.InsertInstruction(instr) + + args = ssa.ValuesNil + + // Revert the current block. + builder.SetCurrentBlock(current) + } + + // Insert the conditional jump to the target block. + brnz := builder.AllocateInstruction() + brnz.AsBrnz(v, args, targetBlk) + builder.InsertInstruction(brnz) + + if sealTargetBlk { + builder.Seal(targetBlk) + } + + // Insert the unconditional jump to the Else block which corresponds to after br_if. 
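+ // The resulting CFG for br_if is roughly:
+ //
+ //  (current) --brnz(cond, args)--> target block (branch taken)
+ //  (current) --jump--------------> elseBlk      (fall through; translation continues there)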
+ elseBlk := builder.AllocateBasicBlock()
+ c.insertJumpToBlock(ssa.ValuesNil, elseBlk)
+
+ // Now start translating the instructions after br_if.
+ builder.Seal(elseBlk) // The Else block of br_if has the current block as its only predecessor.
+ builder.SetCurrentBlock(elseBlk)
+
+ case wasm.OpcodeBrTable:
+ labels := state.tmpForBrTable
+ labels = labels[:0]
+ labelCount := c.readI32u()
+ for i := 0; i < int(labelCount); i++ {
+ labels = append(labels, c.readI32u())
+ }
+ labels = append(labels, c.readI32u()) // default label.
+ if state.unreachable {
+ break
+ }
+
+ index := state.pop()
+ if labelCount == 0 { // If this br_table has only the default target, we can just emit an unconditional jump.
+ targetBlk, argNum := state.brTargetArgNumFor(labels[0])
+ args := c.nPeekDup(argNum)
+ c.insertJumpToBlock(args, targetBlk)
+ } else {
+ c.lowerBrTable(labels, index)
+ }
+ state.unreachable = true
+
+ case wasm.OpcodeNop:
+ case wasm.OpcodeReturn:
+ if state.unreachable {
+ break
+ }
+ if c.needListener {
+ c.callListenerAfter()
+ }
+
+ results := c.nPeekDup(c.results())
+ instr := builder.AllocateInstruction()
+
+ instr.AsReturn(results)
+ builder.InsertInstruction(instr)
+ state.unreachable = true
+
+ case wasm.OpcodeUnreachable:
+ if state.unreachable {
+ break
+ }
+ exit := builder.AllocateInstruction()
+ exit.AsExitWithCode(c.execCtxPtrValue, wazevoapi.ExitCodeUnreachable)
+ builder.InsertInstruction(exit)
+ state.unreachable = true
+
+ case wasm.OpcodeCallIndirect:
+ typeIndex := c.readI32u()
+ tableIndex := c.readI32u()
+ if state.unreachable {
+ break
+ }
+ c.lowerCallIndirect(typeIndex, tableIndex)
+
+ case wasm.OpcodeCall:
+ fnIndex := c.readI32u()
+ if state.unreachable {
+ break
+ }
+
+ var typIndex wasm.Index
+ if fnIndex < c.m.ImportFunctionCount {
+ // Before transferring control to the callee, we have to store the current module's moduleContextPtr
+ // into execContext.callerModuleContextPtr in case the callee is a Go function.
+ c.storeCallerModuleContext()
+ var fi int
+ for i := range c.m.ImportSection {
+ imp := &c.m.ImportSection[i]
+ if imp.Type == wasm.ExternTypeFunc {
+ if fi == int(fnIndex) {
+ typIndex = imp.DescFunc
+ break
+ }
+ fi++
+ }
+ }
+ } else {
+ typIndex = c.m.FunctionSection[fnIndex-c.m.ImportFunctionCount]
+ }
+ typ := &c.m.TypeSection[typIndex]
+
+ argN := len(typ.Params)
+ tail := len(state.values) - argN
+ vs := state.values[tail:]
+ state.values = state.values[:tail]
+ args := c.allocateVarLengthValues(2+len(vs), c.execCtxPtrValue)
+
+ sig := c.signatures[typ]
+ call := builder.AllocateInstruction()
+ if fnIndex >= c.m.ImportFunctionCount {
+ args = args.Append(builder.VarLengthPool(), c.moduleCtxPtrValue) // In this case, the callee belongs to the current module itself.
+ args = args.Append(builder.VarLengthPool(), vs...)
+ call.AsCall(FunctionIndexToFuncRef(fnIndex), sig, args)
+ builder.InsertInstruction(call)
+ } else {
+ // In this case, we have to read the address of the imported function from the module context.
+ moduleCtx := c.moduleCtxPtrValue
+ loadFuncPtr, loadModuleCtxPtr := builder.AllocateInstruction(), builder.AllocateInstruction()
+ funcPtrOffset, moduleCtxPtrOffset, _ := c.offset.ImportedFunctionOffset(fnIndex)
+ loadFuncPtr.AsLoad(moduleCtx, funcPtrOffset.U32(), ssa.TypeI64)
+ loadModuleCtxPtr.AsLoad(moduleCtx, moduleCtxPtrOffset.U32(), ssa.TypeI64)
+ builder.InsertInstruction(loadFuncPtr)
+ builder.InsertInstruction(loadModuleCtxPtr)
+
+ args = args.Append(builder.VarLengthPool(), loadModuleCtxPtr.Return())
+ args = args.Append(builder.VarLengthPool(), vs...)
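+ // The imported function is invoked indirectly through the code pointer loaded from this
+ // module's context, and the callee's own module context pointer (loaded above) is passed
+ // right after the execution context so the callee sees its own memory and globals.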
+ call.AsCallIndirect(loadFuncPtr.Return(), sig, args) + builder.InsertInstruction(call) + } + + first, rest := call.Returns() + if first.Valid() { + state.push(first) + } + for _, v := range rest { + state.push(v) + } + + c.reloadAfterCall() + + case wasm.OpcodeDrop: + if state.unreachable { + break + } + _ = state.pop() + case wasm.OpcodeF64ConvertI32S, wasm.OpcodeF64ConvertI64S, wasm.OpcodeF64ConvertI32U, wasm.OpcodeF64ConvertI64U: + if state.unreachable { + break + } + result := builder.AllocateInstruction().AsFcvtFromInt( + state.pop(), + op == wasm.OpcodeF64ConvertI32S || op == wasm.OpcodeF64ConvertI64S, + true, + ).Insert(builder).Return() + state.push(result) + case wasm.OpcodeF32ConvertI32S, wasm.OpcodeF32ConvertI64S, wasm.OpcodeF32ConvertI32U, wasm.OpcodeF32ConvertI64U: + if state.unreachable { + break + } + result := builder.AllocateInstruction().AsFcvtFromInt( + state.pop(), + op == wasm.OpcodeF32ConvertI32S || op == wasm.OpcodeF32ConvertI64S, + false, + ).Insert(builder).Return() + state.push(result) + case wasm.OpcodeF32DemoteF64: + if state.unreachable { + break + } + cvt := builder.AllocateInstruction() + cvt.AsFdemote(state.pop()) + builder.InsertInstruction(cvt) + state.push(cvt.Return()) + case wasm.OpcodeF64PromoteF32: + if state.unreachable { + break + } + cvt := builder.AllocateInstruction() + cvt.AsFpromote(state.pop()) + builder.InsertInstruction(cvt) + state.push(cvt.Return()) + + case wasm.OpcodeVecPrefix: + state.pc++ + vecOp := c.wasmFunctionBody[state.pc] + switch vecOp { + case wasm.OpcodeVecV128Const: + state.pc++ + lo := binary.LittleEndian.Uint64(c.wasmFunctionBody[state.pc:]) + state.pc += 8 + hi := binary.LittleEndian.Uint64(c.wasmFunctionBody[state.pc:]) + state.pc += 7 + if state.unreachable { + break + } + ret := builder.AllocateInstruction().AsVconst(lo, hi).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecV128Load: + _, offset := c.readMemArg() + if state.unreachable { + break + } + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), 16) + load := builder.AllocateInstruction() + load.AsLoad(addr, offset, ssa.TypeV128) + builder.InsertInstruction(load) + state.push(load.Return()) + case wasm.OpcodeVecV128Load8Lane, wasm.OpcodeVecV128Load16Lane, wasm.OpcodeVecV128Load32Lane: + _, offset := c.readMemArg() + state.pc++ + if state.unreachable { + break + } + var lane ssa.VecLane + var loadOp ssa.Opcode + var opSize uint64 + switch vecOp { + case wasm.OpcodeVecV128Load8Lane: + loadOp, lane, opSize = ssa.OpcodeUload8, ssa.VecLaneI8x16, 1 + case wasm.OpcodeVecV128Load16Lane: + loadOp, lane, opSize = ssa.OpcodeUload16, ssa.VecLaneI16x8, 2 + case wasm.OpcodeVecV128Load32Lane: + loadOp, lane, opSize = ssa.OpcodeUload32, ssa.VecLaneI32x4, 4 + } + laneIndex := c.wasmFunctionBody[state.pc] + vector := state.pop() + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), opSize) + load := builder.AllocateInstruction(). + AsExtLoad(loadOp, addr, offset, false). + Insert(builder).Return() + ret := builder.AllocateInstruction(). + AsInsertlane(vector, load, laneIndex, lane). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecV128Load64Lane: + _, offset := c.readMemArg() + state.pc++ + if state.unreachable { + break + } + laneIndex := c.wasmFunctionBody[state.pc] + vector := state.pop() + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), 8) + load := builder.AllocateInstruction(). + AsLoad(addr, offset, ssa.TypeI64). 
+ Insert(builder).Return() + ret := builder.AllocateInstruction(). + AsInsertlane(vector, load, laneIndex, ssa.VecLaneI64x2). + Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeVecV128Load32zero, wasm.OpcodeVecV128Load64zero: + _, offset := c.readMemArg() + if state.unreachable { + break + } + + var scalarType ssa.Type + switch vecOp { + case wasm.OpcodeVecV128Load32zero: + scalarType = ssa.TypeF32 + case wasm.OpcodeVecV128Load64zero: + scalarType = ssa.TypeF64 + } + + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), uint64(scalarType.Size())) + + ret := builder.AllocateInstruction(). + AsVZeroExtLoad(addr, offset, scalarType). + Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeVecV128Load8x8u, wasm.OpcodeVecV128Load8x8s, + wasm.OpcodeVecV128Load16x4u, wasm.OpcodeVecV128Load16x4s, + wasm.OpcodeVecV128Load32x2u, wasm.OpcodeVecV128Load32x2s: + _, offset := c.readMemArg() + if state.unreachable { + break + } + var lane ssa.VecLane + var signed bool + switch vecOp { + case wasm.OpcodeVecV128Load8x8s: + signed = true + fallthrough + case wasm.OpcodeVecV128Load8x8u: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecV128Load16x4s: + signed = true + fallthrough + case wasm.OpcodeVecV128Load16x4u: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecV128Load32x2s: + signed = true + fallthrough + case wasm.OpcodeVecV128Load32x2u: + lane = ssa.VecLaneI32x4 + } + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), 8) + load := builder.AllocateInstruction(). + AsLoad(addr, offset, ssa.TypeF64). + Insert(builder).Return() + ret := builder.AllocateInstruction(). + AsWiden(load, lane, signed, true). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecV128Load8Splat, wasm.OpcodeVecV128Load16Splat, + wasm.OpcodeVecV128Load32Splat, wasm.OpcodeVecV128Load64Splat: + _, offset := c.readMemArg() + if state.unreachable { + break + } + var lane ssa.VecLane + var opSize uint64 + switch vecOp { + case wasm.OpcodeVecV128Load8Splat: + lane, opSize = ssa.VecLaneI8x16, 1 + case wasm.OpcodeVecV128Load16Splat: + lane, opSize = ssa.VecLaneI16x8, 2 + case wasm.OpcodeVecV128Load32Splat: + lane, opSize = ssa.VecLaneI32x4, 4 + case wasm.OpcodeVecV128Load64Splat: + lane, opSize = ssa.VecLaneI64x2, 8 + } + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), opSize) + ret := builder.AllocateInstruction(). + AsLoadSplat(addr, offset, lane). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecV128Store: + _, offset := c.readMemArg() + if state.unreachable { + break + } + value := state.pop() + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), 16) + builder.AllocateInstruction(). + AsStore(ssa.OpcodeStore, value, addr, offset). 
+ Insert(builder) + case wasm.OpcodeVecV128Store8Lane, wasm.OpcodeVecV128Store16Lane, + wasm.OpcodeVecV128Store32Lane, wasm.OpcodeVecV128Store64Lane: + _, offset := c.readMemArg() + state.pc++ + if state.unreachable { + break + } + laneIndex := c.wasmFunctionBody[state.pc] + var storeOp ssa.Opcode + var lane ssa.VecLane + var opSize uint64 + switch vecOp { + case wasm.OpcodeVecV128Store8Lane: + storeOp, lane, opSize = ssa.OpcodeIstore8, ssa.VecLaneI8x16, 1 + case wasm.OpcodeVecV128Store16Lane: + storeOp, lane, opSize = ssa.OpcodeIstore16, ssa.VecLaneI16x8, 2 + case wasm.OpcodeVecV128Store32Lane: + storeOp, lane, opSize = ssa.OpcodeIstore32, ssa.VecLaneI32x4, 4 + case wasm.OpcodeVecV128Store64Lane: + storeOp, lane, opSize = ssa.OpcodeStore, ssa.VecLaneI64x2, 8 + } + vector := state.pop() + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), opSize) + value := builder.AllocateInstruction(). + AsExtractlane(vector, laneIndex, lane, false). + Insert(builder).Return() + builder.AllocateInstruction(). + AsStore(storeOp, value, addr, offset). + Insert(builder) + case wasm.OpcodeVecV128Not: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsVbnot(v1).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecV128And: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVband(v1, v2).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecV128AndNot: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVbandnot(v1, v2).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecV128Or: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVbor(v1, v2).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecV128Xor: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVbxor(v1, v2).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecV128Bitselect: + if state.unreachable { + break + } + c := state.pop() + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVbitselect(c, v1, v2).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecV128AnyTrue: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsVanyTrue(v1).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16AllTrue, wasm.OpcodeVecI16x8AllTrue, wasm.OpcodeVecI32x4AllTrue, wasm.OpcodeVecI64x2AllTrue: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16AllTrue: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8AllTrue: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4AllTrue: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2AllTrue: + lane = ssa.VecLaneI64x2 + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsVallTrue(v1, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16BitMask, wasm.OpcodeVecI16x8BitMask, wasm.OpcodeVecI32x4BitMask, wasm.OpcodeVecI64x2BitMask: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16BitMask: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8BitMask: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4BitMask: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2BitMask: + lane = ssa.VecLaneI64x2 + } + v1 := state.pop() + 
ret := builder.AllocateInstruction().AsVhighBits(v1, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16Abs, wasm.OpcodeVecI16x8Abs, wasm.OpcodeVecI32x4Abs, wasm.OpcodeVecI64x2Abs: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16Abs: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8Abs: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4Abs: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2Abs: + lane = ssa.VecLaneI64x2 + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsVIabs(v1, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16Neg, wasm.OpcodeVecI16x8Neg, wasm.OpcodeVecI32x4Neg, wasm.OpcodeVecI64x2Neg: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16Neg: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8Neg: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4Neg: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2Neg: + lane = ssa.VecLaneI64x2 + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsVIneg(v1, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16Popcnt: + if state.unreachable { + break + } + lane := ssa.VecLaneI8x16 + v1 := state.pop() + + ret := builder.AllocateInstruction().AsVIpopcnt(v1, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16Add, wasm.OpcodeVecI16x8Add, wasm.OpcodeVecI32x4Add, wasm.OpcodeVecI64x2Add: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16Add: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8Add: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4Add: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2Add: + lane = ssa.VecLaneI64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVIadd(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16AddSatS, wasm.OpcodeVecI16x8AddSatS: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16AddSatS: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8AddSatS: + lane = ssa.VecLaneI16x8 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVSaddSat(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16AddSatU, wasm.OpcodeVecI16x8AddSatU: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16AddSatU: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8AddSatU: + lane = ssa.VecLaneI16x8 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVUaddSat(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16SubSatS, wasm.OpcodeVecI16x8SubSatS: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16SubSatS: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8SubSatS: + lane = ssa.VecLaneI16x8 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVSsubSat(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16SubSatU, wasm.OpcodeVecI16x8SubSatU: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16SubSatU: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8SubSatU: + lane = ssa.VecLaneI16x8 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVUsubSat(v1, v2, 
lane).Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeVecI8x16Sub, wasm.OpcodeVecI16x8Sub, wasm.OpcodeVecI32x4Sub, wasm.OpcodeVecI64x2Sub: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16Sub: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8Sub: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4Sub: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2Sub: + lane = ssa.VecLaneI64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVIsub(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16MinS, wasm.OpcodeVecI16x8MinS, wasm.OpcodeVecI32x4MinS: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16MinS: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8MinS: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4MinS: + lane = ssa.VecLaneI32x4 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVImin(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16MinU, wasm.OpcodeVecI16x8MinU, wasm.OpcodeVecI32x4MinU: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16MinU: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8MinU: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4MinU: + lane = ssa.VecLaneI32x4 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVUmin(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16MaxS, wasm.OpcodeVecI16x8MaxS, wasm.OpcodeVecI32x4MaxS: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16MaxS: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8MaxS: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4MaxS: + lane = ssa.VecLaneI32x4 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVImax(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16MaxU, wasm.OpcodeVecI16x8MaxU, wasm.OpcodeVecI32x4MaxU: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16MaxU: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8MaxU: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4MaxU: + lane = ssa.VecLaneI32x4 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVUmax(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16AvgrU, wasm.OpcodeVecI16x8AvgrU: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16AvgrU: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8AvgrU: + lane = ssa.VecLaneI16x8 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVAvgRound(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI16x8Mul, wasm.OpcodeVecI32x4Mul, wasm.OpcodeVecI64x2Mul: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI16x8Mul: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4Mul: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2Mul: + lane = ssa.VecLaneI64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVImul(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI16x8Q15mulrSatS: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := 
builder.AllocateInstruction().AsSqmulRoundSat(v1, v2, ssa.VecLaneI16x8).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16Eq, wasm.OpcodeVecI16x8Eq, wasm.OpcodeVecI32x4Eq, wasm.OpcodeVecI64x2Eq: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16Eq: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8Eq: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4Eq: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2Eq: + lane = ssa.VecLaneI64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVIcmp(v1, v2, ssa.IntegerCmpCondEqual, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16Ne, wasm.OpcodeVecI16x8Ne, wasm.OpcodeVecI32x4Ne, wasm.OpcodeVecI64x2Ne: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16Ne: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8Ne: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4Ne: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2Ne: + lane = ssa.VecLaneI64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVIcmp(v1, v2, ssa.IntegerCmpCondNotEqual, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16LtS, wasm.OpcodeVecI16x8LtS, wasm.OpcodeVecI32x4LtS, wasm.OpcodeVecI64x2LtS: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16LtS: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8LtS: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4LtS: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2LtS: + lane = ssa.VecLaneI64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVIcmp(v1, v2, ssa.IntegerCmpCondSignedLessThan, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16LtU, wasm.OpcodeVecI16x8LtU, wasm.OpcodeVecI32x4LtU: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16LtU: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8LtU: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4LtU: + lane = ssa.VecLaneI32x4 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVIcmp(v1, v2, ssa.IntegerCmpCondUnsignedLessThan, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16LeS, wasm.OpcodeVecI16x8LeS, wasm.OpcodeVecI32x4LeS, wasm.OpcodeVecI64x2LeS: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16LeS: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8LeS: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4LeS: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2LeS: + lane = ssa.VecLaneI64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVIcmp(v1, v2, ssa.IntegerCmpCondSignedLessThanOrEqual, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16LeU, wasm.OpcodeVecI16x8LeU, wasm.OpcodeVecI32x4LeU: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16LeU: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8LeU: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4LeU: + lane = ssa.VecLaneI32x4 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). 
+ AsVIcmp(v1, v2, ssa.IntegerCmpCondUnsignedLessThanOrEqual, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16GtS, wasm.OpcodeVecI16x8GtS, wasm.OpcodeVecI32x4GtS, wasm.OpcodeVecI64x2GtS: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16GtS: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8GtS: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4GtS: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2GtS: + lane = ssa.VecLaneI64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVIcmp(v1, v2, ssa.IntegerCmpCondSignedGreaterThan, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16GtU, wasm.OpcodeVecI16x8GtU, wasm.OpcodeVecI32x4GtU: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16GtU: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8GtU: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4GtU: + lane = ssa.VecLaneI32x4 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVIcmp(v1, v2, ssa.IntegerCmpCondUnsignedGreaterThan, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16GeS, wasm.OpcodeVecI16x8GeS, wasm.OpcodeVecI32x4GeS, wasm.OpcodeVecI64x2GeS: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16GeS: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8GeS: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4GeS: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2GeS: + lane = ssa.VecLaneI64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVIcmp(v1, v2, ssa.IntegerCmpCondSignedGreaterThanOrEqual, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16GeU, wasm.OpcodeVecI16x8GeU, wasm.OpcodeVecI32x4GeU: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16GeU: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8GeU: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4GeU: + lane = ssa.VecLaneI32x4 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). 
+ AsVIcmp(v1, v2, ssa.IntegerCmpCondUnsignedGreaterThanOrEqual, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Max, wasm.OpcodeVecF64x2Max: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Max: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Max: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVFmax(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Abs, wasm.OpcodeVecF64x2Abs: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Abs: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Abs: + lane = ssa.VecLaneF64x2 + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsVFabs(v1, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Min, wasm.OpcodeVecF64x2Min: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Min: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Min: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVFmin(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Neg, wasm.OpcodeVecF64x2Neg: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Neg: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Neg: + lane = ssa.VecLaneF64x2 + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsVFneg(v1, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Sqrt, wasm.OpcodeVecF64x2Sqrt: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Sqrt: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Sqrt: + lane = ssa.VecLaneF64x2 + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsVSqrt(v1, lane).Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeVecF32x4Add, wasm.OpcodeVecF64x2Add: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Add: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Add: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVFadd(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Sub, wasm.OpcodeVecF64x2Sub: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Sub: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Sub: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVFsub(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Mul, wasm.OpcodeVecF64x2Mul: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Mul: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Mul: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVFmul(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Div, wasm.OpcodeVecF64x2Div: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Div: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Div: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVFdiv(v1, v2, 
lane).Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeVecI16x8ExtaddPairwiseI8x16S, wasm.OpcodeVecI16x8ExtaddPairwiseI8x16U: + if state.unreachable { + break + } + v := state.pop() + signed := vecOp == wasm.OpcodeVecI16x8ExtaddPairwiseI8x16S + ret := builder.AllocateInstruction().AsExtIaddPairwise(v, ssa.VecLaneI8x16, signed).Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeVecI32x4ExtaddPairwiseI16x8S, wasm.OpcodeVecI32x4ExtaddPairwiseI16x8U: + if state.unreachable { + break + } + v := state.pop() + signed := vecOp == wasm.OpcodeVecI32x4ExtaddPairwiseI16x8S + ret := builder.AllocateInstruction().AsExtIaddPairwise(v, ssa.VecLaneI16x8, signed).Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeVecI16x8ExtMulLowI8x16S, wasm.OpcodeVecI16x8ExtMulLowI8x16U: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := c.lowerExtMul( + v1, v2, + ssa.VecLaneI8x16, ssa.VecLaneI16x8, + vecOp == wasm.OpcodeVecI16x8ExtMulLowI8x16S, true) + state.push(ret) + + case wasm.OpcodeVecI16x8ExtMulHighI8x16S, wasm.OpcodeVecI16x8ExtMulHighI8x16U: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := c.lowerExtMul( + v1, v2, + ssa.VecLaneI8x16, ssa.VecLaneI16x8, + vecOp == wasm.OpcodeVecI16x8ExtMulHighI8x16S, false) + state.push(ret) + + case wasm.OpcodeVecI32x4ExtMulLowI16x8S, wasm.OpcodeVecI32x4ExtMulLowI16x8U: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := c.lowerExtMul( + v1, v2, + ssa.VecLaneI16x8, ssa.VecLaneI32x4, + vecOp == wasm.OpcodeVecI32x4ExtMulLowI16x8S, true) + state.push(ret) + + case wasm.OpcodeVecI32x4ExtMulHighI16x8S, wasm.OpcodeVecI32x4ExtMulHighI16x8U: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := c.lowerExtMul( + v1, v2, + ssa.VecLaneI16x8, ssa.VecLaneI32x4, + vecOp == wasm.OpcodeVecI32x4ExtMulHighI16x8S, false) + state.push(ret) + case wasm.OpcodeVecI64x2ExtMulLowI32x4S, wasm.OpcodeVecI64x2ExtMulLowI32x4U: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := c.lowerExtMul( + v1, v2, + ssa.VecLaneI32x4, ssa.VecLaneI64x2, + vecOp == wasm.OpcodeVecI64x2ExtMulLowI32x4S, true) + state.push(ret) + + case wasm.OpcodeVecI64x2ExtMulHighI32x4S, wasm.OpcodeVecI64x2ExtMulHighI32x4U: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := c.lowerExtMul( + v1, v2, + ssa.VecLaneI32x4, ssa.VecLaneI64x2, + vecOp == wasm.OpcodeVecI64x2ExtMulHighI32x4S, false) + state.push(ret) + + case wasm.OpcodeVecI32x4DotI16x8S: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + + ret := builder.AllocateInstruction().AsWideningPairwiseDotProductS(v1, v2).Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeVecF32x4Eq, wasm.OpcodeVecF64x2Eq: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Eq: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Eq: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVFcmp(v1, v2, ssa.FloatCmpCondEqual, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Ne, wasm.OpcodeVecF64x2Ne: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Ne: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Ne: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). 
+ AsVFcmp(v1, v2, ssa.FloatCmpCondNotEqual, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Lt, wasm.OpcodeVecF64x2Lt: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Lt: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Lt: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVFcmp(v1, v2, ssa.FloatCmpCondLessThan, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Le, wasm.OpcodeVecF64x2Le: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Le: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Le: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVFcmp(v1, v2, ssa.FloatCmpCondLessThanOrEqual, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Gt, wasm.OpcodeVecF64x2Gt: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Gt: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Gt: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVFcmp(v1, v2, ssa.FloatCmpCondGreaterThan, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Ge, wasm.OpcodeVecF64x2Ge: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Ge: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Ge: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVFcmp(v1, v2, ssa.FloatCmpCondGreaterThanOrEqual, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Ceil, wasm.OpcodeVecF64x2Ceil: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Ceil: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Ceil: + lane = ssa.VecLaneF64x2 + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsVCeil(v1, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Floor, wasm.OpcodeVecF64x2Floor: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Floor: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Floor: + lane = ssa.VecLaneF64x2 + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsVFloor(v1, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Trunc, wasm.OpcodeVecF64x2Trunc: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Trunc: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Trunc: + lane = ssa.VecLaneF64x2 + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsVTrunc(v1, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Nearest, wasm.OpcodeVecF64x2Nearest: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Nearest: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Nearest: + lane = ssa.VecLaneF64x2 + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsVNearest(v1, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Pmin, wasm.OpcodeVecF64x2Pmin: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Pmin: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Pmin: + lane = 
ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVMinPseudo(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Pmax, wasm.OpcodeVecF64x2Pmax: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Pmax: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Pmax: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVMaxPseudo(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI32x4TruncSatF32x4S, wasm.OpcodeVecI32x4TruncSatF32x4U: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVFcvtToIntSat(v1, ssa.VecLaneF32x4, vecOp == wasm.OpcodeVecI32x4TruncSatF32x4S).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI32x4TruncSatF64x2SZero, wasm.OpcodeVecI32x4TruncSatF64x2UZero: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVFcvtToIntSat(v1, ssa.VecLaneF64x2, vecOp == wasm.OpcodeVecI32x4TruncSatF64x2SZero).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4ConvertI32x4S, wasm.OpcodeVecF32x4ConvertI32x4U: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVFcvtFromInt(v1, ssa.VecLaneF32x4, vecOp == wasm.OpcodeVecF32x4ConvertI32x4S).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF64x2ConvertLowI32x4S, wasm.OpcodeVecF64x2ConvertLowI32x4U: + if state.unreachable { + break + } + v1 := state.pop() + if runtime.GOARCH == "arm64" { + // TODO: this is weird. fix. + v1 = builder.AllocateInstruction(). + AsWiden(v1, ssa.VecLaneI32x4, vecOp == wasm.OpcodeVecF64x2ConvertLowI32x4S, true).Insert(builder).Return() + } + ret := builder.AllocateInstruction(). + AsVFcvtFromInt(v1, ssa.VecLaneF64x2, vecOp == wasm.OpcodeVecF64x2ConvertLowI32x4S). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16NarrowI16x8S, wasm.OpcodeVecI8x16NarrowI16x8U: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsNarrow(v1, v2, ssa.VecLaneI16x8, vecOp == wasm.OpcodeVecI8x16NarrowI16x8S). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI16x8NarrowI32x4S, wasm.OpcodeVecI16x8NarrowI32x4U: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsNarrow(v1, v2, ssa.VecLaneI32x4, vecOp == wasm.OpcodeVecI16x8NarrowI32x4S). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI16x8ExtendLowI8x16S, wasm.OpcodeVecI16x8ExtendLowI8x16U: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsWiden(v1, ssa.VecLaneI8x16, vecOp == wasm.OpcodeVecI16x8ExtendLowI8x16S, true). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI16x8ExtendHighI8x16S, wasm.OpcodeVecI16x8ExtendHighI8x16U: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsWiden(v1, ssa.VecLaneI8x16, vecOp == wasm.OpcodeVecI16x8ExtendHighI8x16S, false). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI32x4ExtendLowI16x8S, wasm.OpcodeVecI32x4ExtendLowI16x8U: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsWiden(v1, ssa.VecLaneI16x8, vecOp == wasm.OpcodeVecI32x4ExtendLowI16x8S, true). 
+ Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI32x4ExtendHighI16x8S, wasm.OpcodeVecI32x4ExtendHighI16x8U: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsWiden(v1, ssa.VecLaneI16x8, vecOp == wasm.OpcodeVecI32x4ExtendHighI16x8S, false). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI64x2ExtendLowI32x4S, wasm.OpcodeVecI64x2ExtendLowI32x4U: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsWiden(v1, ssa.VecLaneI32x4, vecOp == wasm.OpcodeVecI64x2ExtendLowI32x4S, true). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI64x2ExtendHighI32x4S, wasm.OpcodeVecI64x2ExtendHighI32x4U: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsWiden(v1, ssa.VecLaneI32x4, vecOp == wasm.OpcodeVecI64x2ExtendHighI32x4S, false). + Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeVecF64x2PromoteLowF32x4Zero: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsFvpromoteLow(v1, ssa.VecLaneF32x4). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4DemoteF64x2Zero: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsFvdemote(v1, ssa.VecLaneF64x2). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16Shl, wasm.OpcodeVecI16x8Shl, wasm.OpcodeVecI32x4Shl, wasm.OpcodeVecI64x2Shl: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16Shl: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8Shl: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4Shl: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2Shl: + lane = ssa.VecLaneI64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVIshl(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16ShrS, wasm.OpcodeVecI16x8ShrS, wasm.OpcodeVecI32x4ShrS, wasm.OpcodeVecI64x2ShrS: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16ShrS: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8ShrS: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4ShrS: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2ShrS: + lane = ssa.VecLaneI64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVSshr(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16ShrU, wasm.OpcodeVecI16x8ShrU, wasm.OpcodeVecI32x4ShrU, wasm.OpcodeVecI64x2ShrU: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16ShrU: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8ShrU: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4ShrU: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2ShrU: + lane = ssa.VecLaneI64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVUshr(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16ExtractLaneS, wasm.OpcodeVecI16x8ExtractLaneS: + state.pc++ + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16ExtractLaneS: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8ExtractLaneS: + lane = ssa.VecLaneI16x8 + } + v1 := state.pop() + index := c.wasmFunctionBody[state.pc] + ext := builder.AllocateInstruction().AsExtractlane(v1, index, lane, 
true).Insert(builder).Return() + state.push(ext) + case wasm.OpcodeVecI8x16ExtractLaneU, wasm.OpcodeVecI16x8ExtractLaneU, + wasm.OpcodeVecI32x4ExtractLane, wasm.OpcodeVecI64x2ExtractLane, + wasm.OpcodeVecF32x4ExtractLane, wasm.OpcodeVecF64x2ExtractLane: + state.pc++ // Skip the immediate value. + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16ExtractLaneU: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8ExtractLaneU: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4ExtractLane: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2ExtractLane: + lane = ssa.VecLaneI64x2 + case wasm.OpcodeVecF32x4ExtractLane: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2ExtractLane: + lane = ssa.VecLaneF64x2 + } + v1 := state.pop() + index := c.wasmFunctionBody[state.pc] + ext := builder.AllocateInstruction().AsExtractlane(v1, index, lane, false).Insert(builder).Return() + state.push(ext) + case wasm.OpcodeVecI8x16ReplaceLane, wasm.OpcodeVecI16x8ReplaceLane, + wasm.OpcodeVecI32x4ReplaceLane, wasm.OpcodeVecI64x2ReplaceLane, + wasm.OpcodeVecF32x4ReplaceLane, wasm.OpcodeVecF64x2ReplaceLane: + state.pc++ + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16ReplaceLane: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8ReplaceLane: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4ReplaceLane: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2ReplaceLane: + lane = ssa.VecLaneI64x2 + case wasm.OpcodeVecF32x4ReplaceLane: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2ReplaceLane: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + index := c.wasmFunctionBody[state.pc] + ret := builder.AllocateInstruction().AsInsertlane(v1, v2, index, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecV128i8x16Shuffle: + state.pc++ + laneIndexes := c.wasmFunctionBody[state.pc : state.pc+16] + state.pc += 15 + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsShuffle(v1, v2, laneIndexes).Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeVecI8x16Swizzle: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsSwizzle(v1, v2, ssa.VecLaneI8x16).Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeVecI8x16Splat, + wasm.OpcodeVecI16x8Splat, + wasm.OpcodeVecI32x4Splat, + wasm.OpcodeVecI64x2Splat, + wasm.OpcodeVecF32x4Splat, + wasm.OpcodeVecF64x2Splat: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16Splat: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8Splat: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4Splat: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2Splat: + lane = ssa.VecLaneI64x2 + case wasm.OpcodeVecF32x4Splat: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Splat: + lane = ssa.VecLaneF64x2 + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsSplat(v1, lane).Insert(builder).Return() + state.push(ret) + + default: + panic("TODO: unsupported vector instruction: " + wasm.VectorInstructionName(vecOp)) + } + case wasm.OpcodeAtomicPrefix: + state.pc++ + atomicOp := c.wasmFunctionBody[state.pc] + switch atomicOp { + case wasm.OpcodeAtomicMemoryWait32, wasm.OpcodeAtomicMemoryWait64: + _, offset := c.readMemArg() + if state.unreachable { + break + } + + c.storeCallerModuleContext() + + var opSize uint64 + var trampoline wazevoapi.Offset + var 
sig *ssa.Signature + switch atomicOp { + case wasm.OpcodeAtomicMemoryWait32: + opSize = 4 + trampoline = wazevoapi.ExecutionContextOffsetMemoryWait32TrampolineAddress + sig = &c.memoryWait32Sig + case wasm.OpcodeAtomicMemoryWait64: + opSize = 8 + trampoline = wazevoapi.ExecutionContextOffsetMemoryWait64TrampolineAddress + sig = &c.memoryWait64Sig + } + + timeout := state.pop() + exp := state.pop() + baseAddr := state.pop() + addr := c.atomicMemOpSetup(baseAddr, uint64(offset), opSize) + + memoryWaitPtr := builder.AllocateInstruction(). + AsLoad(c.execCtxPtrValue, + trampoline.U32(), + ssa.TypeI64, + ).Insert(builder).Return() + + args := c.allocateVarLengthValues(3, c.execCtxPtrValue, timeout, exp, addr) + memoryWaitRet := builder.AllocateInstruction(). + AsCallIndirect(memoryWaitPtr, sig, args). + Insert(builder).Return() + state.push(memoryWaitRet) + case wasm.OpcodeAtomicMemoryNotify: + _, offset := c.readMemArg() + if state.unreachable { + break + } + + c.storeCallerModuleContext() + count := state.pop() + baseAddr := state.pop() + addr := c.atomicMemOpSetup(baseAddr, uint64(offset), 4) + + memoryNotifyPtr := builder.AllocateInstruction(). + AsLoad(c.execCtxPtrValue, + wazevoapi.ExecutionContextOffsetMemoryNotifyTrampolineAddress.U32(), + ssa.TypeI64, + ).Insert(builder).Return() + args := c.allocateVarLengthValues(2, c.execCtxPtrValue, count, addr) + memoryNotifyRet := builder.AllocateInstruction(). + AsCallIndirect(memoryNotifyPtr, &c.memoryNotifySig, args). + Insert(builder).Return() + state.push(memoryNotifyRet) + case wasm.OpcodeAtomicI32Load, wasm.OpcodeAtomicI64Load, wasm.OpcodeAtomicI32Load8U, wasm.OpcodeAtomicI32Load16U, wasm.OpcodeAtomicI64Load8U, wasm.OpcodeAtomicI64Load16U, wasm.OpcodeAtomicI64Load32U: + _, offset := c.readMemArg() + if state.unreachable { + break + } + + baseAddr := state.pop() + + var size uint64 + switch atomicOp { + case wasm.OpcodeAtomicI64Load: + size = 8 + case wasm.OpcodeAtomicI32Load, wasm.OpcodeAtomicI64Load32U: + size = 4 + case wasm.OpcodeAtomicI32Load16U, wasm.OpcodeAtomicI64Load16U: + size = 2 + case wasm.OpcodeAtomicI32Load8U, wasm.OpcodeAtomicI64Load8U: + size = 1 + } + + var typ ssa.Type + switch atomicOp { + case wasm.OpcodeAtomicI64Load, wasm.OpcodeAtomicI64Load32U, wasm.OpcodeAtomicI64Load16U, wasm.OpcodeAtomicI64Load8U: + typ = ssa.TypeI64 + case wasm.OpcodeAtomicI32Load, wasm.OpcodeAtomicI32Load16U, wasm.OpcodeAtomicI32Load8U: + typ = ssa.TypeI32 + } + + addr := c.atomicMemOpSetup(baseAddr, uint64(offset), size) + res := builder.AllocateInstruction().AsAtomicLoad(addr, size, typ).Insert(builder).Return() + state.push(res) + case wasm.OpcodeAtomicI32Store, wasm.OpcodeAtomicI64Store, wasm.OpcodeAtomicI32Store8, wasm.OpcodeAtomicI32Store16, wasm.OpcodeAtomicI64Store8, wasm.OpcodeAtomicI64Store16, wasm.OpcodeAtomicI64Store32: + _, offset := c.readMemArg() + if state.unreachable { + break + } + + val := state.pop() + baseAddr := state.pop() + + var size uint64 + switch atomicOp { + case wasm.OpcodeAtomicI64Store: + size = 8 + case wasm.OpcodeAtomicI32Store, wasm.OpcodeAtomicI64Store32: + size = 4 + case wasm.OpcodeAtomicI32Store16, wasm.OpcodeAtomicI64Store16: + size = 2 + case wasm.OpcodeAtomicI32Store8, wasm.OpcodeAtomicI64Store8: + size = 1 + } + + addr := c.atomicMemOpSetup(baseAddr, uint64(offset), size) + builder.AllocateInstruction().AsAtomicStore(addr, val, size).Insert(builder) + case wasm.OpcodeAtomicI32RmwAdd, wasm.OpcodeAtomicI64RmwAdd, wasm.OpcodeAtomicI32Rmw8AddU, wasm.OpcodeAtomicI32Rmw16AddU, wasm.OpcodeAtomicI64Rmw8AddU, 
wasm.OpcodeAtomicI64Rmw16AddU, wasm.OpcodeAtomicI64Rmw32AddU, + wasm.OpcodeAtomicI32RmwSub, wasm.OpcodeAtomicI64RmwSub, wasm.OpcodeAtomicI32Rmw8SubU, wasm.OpcodeAtomicI32Rmw16SubU, wasm.OpcodeAtomicI64Rmw8SubU, wasm.OpcodeAtomicI64Rmw16SubU, wasm.OpcodeAtomicI64Rmw32SubU, + wasm.OpcodeAtomicI32RmwAnd, wasm.OpcodeAtomicI64RmwAnd, wasm.OpcodeAtomicI32Rmw8AndU, wasm.OpcodeAtomicI32Rmw16AndU, wasm.OpcodeAtomicI64Rmw8AndU, wasm.OpcodeAtomicI64Rmw16AndU, wasm.OpcodeAtomicI64Rmw32AndU, + wasm.OpcodeAtomicI32RmwOr, wasm.OpcodeAtomicI64RmwOr, wasm.OpcodeAtomicI32Rmw8OrU, wasm.OpcodeAtomicI32Rmw16OrU, wasm.OpcodeAtomicI64Rmw8OrU, wasm.OpcodeAtomicI64Rmw16OrU, wasm.OpcodeAtomicI64Rmw32OrU, + wasm.OpcodeAtomicI32RmwXor, wasm.OpcodeAtomicI64RmwXor, wasm.OpcodeAtomicI32Rmw8XorU, wasm.OpcodeAtomicI32Rmw16XorU, wasm.OpcodeAtomicI64Rmw8XorU, wasm.OpcodeAtomicI64Rmw16XorU, wasm.OpcodeAtomicI64Rmw32XorU, + wasm.OpcodeAtomicI32RmwXchg, wasm.OpcodeAtomicI64RmwXchg, wasm.OpcodeAtomicI32Rmw8XchgU, wasm.OpcodeAtomicI32Rmw16XchgU, wasm.OpcodeAtomicI64Rmw8XchgU, wasm.OpcodeAtomicI64Rmw16XchgU, wasm.OpcodeAtomicI64Rmw32XchgU: + _, offset := c.readMemArg() + if state.unreachable { + break + } + + val := state.pop() + baseAddr := state.pop() + + var rmwOp ssa.AtomicRmwOp + var size uint64 + switch atomicOp { + case wasm.OpcodeAtomicI32RmwAdd, wasm.OpcodeAtomicI64RmwAdd, wasm.OpcodeAtomicI32Rmw8AddU, wasm.OpcodeAtomicI32Rmw16AddU, wasm.OpcodeAtomicI64Rmw8AddU, wasm.OpcodeAtomicI64Rmw16AddU, wasm.OpcodeAtomicI64Rmw32AddU: + rmwOp = ssa.AtomicRmwOpAdd + switch atomicOp { + case wasm.OpcodeAtomicI64RmwAdd: + size = 8 + case wasm.OpcodeAtomicI32RmwAdd, wasm.OpcodeAtomicI64Rmw32AddU: + size = 4 + case wasm.OpcodeAtomicI32Rmw16AddU, wasm.OpcodeAtomicI64Rmw16AddU: + size = 2 + case wasm.OpcodeAtomicI32Rmw8AddU, wasm.OpcodeAtomicI64Rmw8AddU: + size = 1 + } + case wasm.OpcodeAtomicI32RmwSub, wasm.OpcodeAtomicI64RmwSub, wasm.OpcodeAtomicI32Rmw8SubU, wasm.OpcodeAtomicI32Rmw16SubU, wasm.OpcodeAtomicI64Rmw8SubU, wasm.OpcodeAtomicI64Rmw16SubU, wasm.OpcodeAtomicI64Rmw32SubU: + rmwOp = ssa.AtomicRmwOpSub + switch atomicOp { + case wasm.OpcodeAtomicI64RmwSub: + size = 8 + case wasm.OpcodeAtomicI32RmwSub, wasm.OpcodeAtomicI64Rmw32SubU: + size = 4 + case wasm.OpcodeAtomicI32Rmw16SubU, wasm.OpcodeAtomicI64Rmw16SubU: + size = 2 + case wasm.OpcodeAtomicI32Rmw8SubU, wasm.OpcodeAtomicI64Rmw8SubU: + size = 1 + } + case wasm.OpcodeAtomicI32RmwAnd, wasm.OpcodeAtomicI64RmwAnd, wasm.OpcodeAtomicI32Rmw8AndU, wasm.OpcodeAtomicI32Rmw16AndU, wasm.OpcodeAtomicI64Rmw8AndU, wasm.OpcodeAtomicI64Rmw16AndU, wasm.OpcodeAtomicI64Rmw32AndU: + rmwOp = ssa.AtomicRmwOpAnd + switch atomicOp { + case wasm.OpcodeAtomicI64RmwAnd: + size = 8 + case wasm.OpcodeAtomicI32RmwAnd, wasm.OpcodeAtomicI64Rmw32AndU: + size = 4 + case wasm.OpcodeAtomicI32Rmw16AndU, wasm.OpcodeAtomicI64Rmw16AndU: + size = 2 + case wasm.OpcodeAtomicI32Rmw8AndU, wasm.OpcodeAtomicI64Rmw8AndU: + size = 1 + } + case wasm.OpcodeAtomicI32RmwOr, wasm.OpcodeAtomicI64RmwOr, wasm.OpcodeAtomicI32Rmw8OrU, wasm.OpcodeAtomicI32Rmw16OrU, wasm.OpcodeAtomicI64Rmw8OrU, wasm.OpcodeAtomicI64Rmw16OrU, wasm.OpcodeAtomicI64Rmw32OrU: + rmwOp = ssa.AtomicRmwOpOr + switch atomicOp { + case wasm.OpcodeAtomicI64RmwOr: + size = 8 + case wasm.OpcodeAtomicI32RmwOr, wasm.OpcodeAtomicI64Rmw32OrU: + size = 4 + case wasm.OpcodeAtomicI32Rmw16OrU, wasm.OpcodeAtomicI64Rmw16OrU: + size = 2 + case wasm.OpcodeAtomicI32Rmw8OrU, wasm.OpcodeAtomicI64Rmw8OrU: + size = 1 + } + case wasm.OpcodeAtomicI32RmwXor, wasm.OpcodeAtomicI64RmwXor, 
wasm.OpcodeAtomicI32Rmw8XorU, wasm.OpcodeAtomicI32Rmw16XorU, wasm.OpcodeAtomicI64Rmw8XorU, wasm.OpcodeAtomicI64Rmw16XorU, wasm.OpcodeAtomicI64Rmw32XorU: + rmwOp = ssa.AtomicRmwOpXor + switch atomicOp { + case wasm.OpcodeAtomicI64RmwXor: + size = 8 + case wasm.OpcodeAtomicI32RmwXor, wasm.OpcodeAtomicI64Rmw32XorU: + size = 4 + case wasm.OpcodeAtomicI32Rmw16XorU, wasm.OpcodeAtomicI64Rmw16XorU: + size = 2 + case wasm.OpcodeAtomicI32Rmw8XorU, wasm.OpcodeAtomicI64Rmw8XorU: + size = 1 + } + case wasm.OpcodeAtomicI32RmwXchg, wasm.OpcodeAtomicI64RmwXchg, wasm.OpcodeAtomicI32Rmw8XchgU, wasm.OpcodeAtomicI32Rmw16XchgU, wasm.OpcodeAtomicI64Rmw8XchgU, wasm.OpcodeAtomicI64Rmw16XchgU, wasm.OpcodeAtomicI64Rmw32XchgU: + rmwOp = ssa.AtomicRmwOpXchg + switch atomicOp { + case wasm.OpcodeAtomicI64RmwXchg: + size = 8 + case wasm.OpcodeAtomicI32RmwXchg, wasm.OpcodeAtomicI64Rmw32XchgU: + size = 4 + case wasm.OpcodeAtomicI32Rmw16XchgU, wasm.OpcodeAtomicI64Rmw16XchgU: + size = 2 + case wasm.OpcodeAtomicI32Rmw8XchgU, wasm.OpcodeAtomicI64Rmw8XchgU: + size = 1 + } + } + + addr := c.atomicMemOpSetup(baseAddr, uint64(offset), size) + res := builder.AllocateInstruction().AsAtomicRmw(rmwOp, addr, val, size).Insert(builder).Return() + state.push(res) + case wasm.OpcodeAtomicI32RmwCmpxchg, wasm.OpcodeAtomicI64RmwCmpxchg, wasm.OpcodeAtomicI32Rmw8CmpxchgU, wasm.OpcodeAtomicI32Rmw16CmpxchgU, wasm.OpcodeAtomicI64Rmw8CmpxchgU, wasm.OpcodeAtomicI64Rmw16CmpxchgU, wasm.OpcodeAtomicI64Rmw32CmpxchgU: + _, offset := c.readMemArg() + if state.unreachable { + break + } + + repl := state.pop() + exp := state.pop() + baseAddr := state.pop() + + var size uint64 + switch atomicOp { + case wasm.OpcodeAtomicI64RmwCmpxchg: + size = 8 + case wasm.OpcodeAtomicI32RmwCmpxchg, wasm.OpcodeAtomicI64Rmw32CmpxchgU: + size = 4 + case wasm.OpcodeAtomicI32Rmw16CmpxchgU, wasm.OpcodeAtomicI64Rmw16CmpxchgU: + size = 2 + case wasm.OpcodeAtomicI32Rmw8CmpxchgU, wasm.OpcodeAtomicI64Rmw8CmpxchgU: + size = 1 + } + addr := c.atomicMemOpSetup(baseAddr, uint64(offset), size) + res := builder.AllocateInstruction().AsAtomicCas(addr, exp, repl, size).Insert(builder).Return() + state.push(res) + case wasm.OpcodeAtomicFence: + order := c.readByte() + if state.unreachable { + break + } + if c.needMemory { + builder.AllocateInstruction().AsFence(order).Insert(builder) + } + default: + panic("TODO: unsupported atomic instruction: " + wasm.AtomicInstructionName(atomicOp)) + } + case wasm.OpcodeRefFunc: + funcIndex := c.readI32u() + if state.unreachable { + break + } + + c.storeCallerModuleContext() + + funcIndexVal := builder.AllocateInstruction().AsIconst32(funcIndex).Insert(builder).Return() + + refFuncPtr := builder.AllocateInstruction(). + AsLoad(c.execCtxPtrValue, + wazevoapi.ExecutionContextOffsetRefFuncTrampolineAddress.U32(), + ssa.TypeI64, + ).Insert(builder).Return() + + args := c.allocateVarLengthValues(2, c.execCtxPtrValue, funcIndexVal) + refFuncRet := builder. + AllocateInstruction(). + AsCallIndirect(refFuncPtr, &c.refFuncSig, args). + Insert(builder).Return() + state.push(refFuncRet) + + case wasm.OpcodeRefNull: + c.loweringState.pc++ // skips the reference type as we treat both of them as i64(0). + if state.unreachable { + break + } + ret := builder.AllocateInstruction().AsIconst64(0).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeRefIsNull: + if state.unreachable { + break + } + r := state.pop() + zero := builder.AllocateInstruction().AsIconst64(0).Insert(builder) + icmp := builder.AllocateInstruction(). 
+ AsIcmp(r, zero.Return(), ssa.IntegerCmpCondEqual). + Insert(builder). + Return() + state.push(icmp) + case wasm.OpcodeTableSet: + tableIndex := c.readI32u() + if state.unreachable { + break + } + r := state.pop() + targetOffsetInTable := state.pop() + + elementAddr := c.lowerAccessTableWithBoundsCheck(tableIndex, targetOffsetInTable) + builder.AllocateInstruction().AsStore(ssa.OpcodeStore, r, elementAddr, 0).Insert(builder) + + case wasm.OpcodeTableGet: + tableIndex := c.readI32u() + if state.unreachable { + break + } + targetOffsetInTable := state.pop() + elementAddr := c.lowerAccessTableWithBoundsCheck(tableIndex, targetOffsetInTable) + loaded := builder.AllocateInstruction().AsLoad(elementAddr, 0, ssa.TypeI64).Insert(builder).Return() + state.push(loaded) + default: + panic("TODO: unsupported in wazevo yet: " + wasm.InstructionName(op)) + } + + if wazevoapi.FrontEndLoggingEnabled { + fmt.Println("--------- Translated " + wasm.InstructionName(op) + " --------") + fmt.Println("state: " + c.loweringState.String()) + fmt.Println(c.formatBuilder()) + fmt.Println("--------------------------") + } + c.loweringState.pc++ +} + +func (c *Compiler) lowerExtMul(v1, v2 ssa.Value, from, to ssa.VecLane, signed, low bool) ssa.Value { + // TODO: The sequence `Widen; Widen; VIMul` can be substituted for a single instruction on some ISAs. + builder := c.ssaBuilder + + v1lo := builder.AllocateInstruction().AsWiden(v1, from, signed, low).Insert(builder).Return() + v2lo := builder.AllocateInstruction().AsWiden(v2, from, signed, low).Insert(builder).Return() + + return builder.AllocateInstruction().AsVImul(v1lo, v2lo, to).Insert(builder).Return() +} + +const ( + tableInstanceBaseAddressOffset = 0 + tableInstanceLenOffset = tableInstanceBaseAddressOffset + 8 +) + +func (c *Compiler) lowerAccessTableWithBoundsCheck(tableIndex uint32, elementOffsetInTable ssa.Value) (elementAddress ssa.Value) { + builder := c.ssaBuilder + + // Load the table. + loadTableInstancePtr := builder.AllocateInstruction() + loadTableInstancePtr.AsLoad(c.moduleCtxPtrValue, c.offset.TableOffset(int(tableIndex)).U32(), ssa.TypeI64) + builder.InsertInstruction(loadTableInstancePtr) + tableInstancePtr := loadTableInstancePtr.Return() + + // Load the table's length. + loadTableLen := builder.AllocateInstruction() + loadTableLen.AsLoad(tableInstancePtr, tableInstanceLenOffset, ssa.TypeI32) + builder.InsertInstruction(loadTableLen) + tableLen := loadTableLen.Return() + + // Compare the length and the target, and trap if out of bounds. + checkOOB := builder.AllocateInstruction() + checkOOB.AsIcmp(elementOffsetInTable, tableLen, ssa.IntegerCmpCondUnsignedGreaterThanOrEqual) + builder.InsertInstruction(checkOOB) + exitIfOOB := builder.AllocateInstruction() + exitIfOOB.AsExitIfTrueWithCode(c.execCtxPtrValue, checkOOB.Return(), wazevoapi.ExitCodeTableOutOfBounds) + builder.InsertInstruction(exitIfOOB) + + // Get the base address of wasm.TableInstance.References. + loadTableBaseAddress := builder.AllocateInstruction() + loadTableBaseAddress.AsLoad(tableInstancePtr, tableInstanceBaseAddressOffset, ssa.TypeI64) + builder.InsertInstruction(loadTableBaseAddress) + tableBase := loadTableBaseAddress.Return() + + // Calculate the address of the target function. First we need to multiply targetOffsetInTable by 8 (pointer size). 
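+ // (Below, the multiplication by 8 is implemented as a left shift by 3, since 2^3 = 8.)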
+ multiplyBy8 := builder.AllocateInstruction() + three := builder.AllocateInstruction() + three.AsIconst64(3) + builder.InsertInstruction(three) + multiplyBy8.AsIshl(elementOffsetInTable, three.Return()) + builder.InsertInstruction(multiplyBy8) + targetOffsetInTableMultipliedBy8 := multiplyBy8.Return() + + // Then add the multiplied value to the base which results in the address of the target function (*wazevo.functionInstance) + calcElementAddressInTable := builder.AllocateInstruction() + calcElementAddressInTable.AsIadd(tableBase, targetOffsetInTableMultipliedBy8) + builder.InsertInstruction(calcElementAddressInTable) + return calcElementAddressInTable.Return() +} + +func (c *Compiler) lowerCallIndirect(typeIndex, tableIndex uint32) { + builder := c.ssaBuilder + state := c.state() + + elementOffsetInTable := state.pop() + functionInstancePtrAddress := c.lowerAccessTableWithBoundsCheck(tableIndex, elementOffsetInTable) + loadFunctionInstancePtr := builder.AllocateInstruction() + loadFunctionInstancePtr.AsLoad(functionInstancePtrAddress, 0, ssa.TypeI64) + builder.InsertInstruction(loadFunctionInstancePtr) + functionInstancePtr := loadFunctionInstancePtr.Return() + + // Check if it is not the null pointer. + zero := builder.AllocateInstruction() + zero.AsIconst64(0) + builder.InsertInstruction(zero) + checkNull := builder.AllocateInstruction() + checkNull.AsIcmp(functionInstancePtr, zero.Return(), ssa.IntegerCmpCondEqual) + builder.InsertInstruction(checkNull) + exitIfNull := builder.AllocateInstruction() + exitIfNull.AsExitIfTrueWithCode(c.execCtxPtrValue, checkNull.Return(), wazevoapi.ExitCodeIndirectCallNullPointer) + builder.InsertInstruction(exitIfNull) + + // We need to do the type check. First, load the target function instance's typeID. + loadTypeID := builder.AllocateInstruction() + loadTypeID.AsLoad(functionInstancePtr, wazevoapi.FunctionInstanceTypeIDOffset, ssa.TypeI32) + builder.InsertInstruction(loadTypeID) + actualTypeID := loadTypeID.Return() + + // Next, we load the expected TypeID: + loadTypeIDsBegin := builder.AllocateInstruction() + loadTypeIDsBegin.AsLoad(c.moduleCtxPtrValue, c.offset.TypeIDs1stElement.U32(), ssa.TypeI64) + builder.InsertInstruction(loadTypeIDsBegin) + typeIDsBegin := loadTypeIDsBegin.Return() + + loadExpectedTypeID := builder.AllocateInstruction() + loadExpectedTypeID.AsLoad(typeIDsBegin, uint32(typeIndex)*4 /* size of wasm.FunctionTypeID */, ssa.TypeI32) + builder.InsertInstruction(loadExpectedTypeID) + expectedTypeID := loadExpectedTypeID.Return() + + // Check if the type ID matches. + checkTypeID := builder.AllocateInstruction() + checkTypeID.AsIcmp(actualTypeID, expectedTypeID, ssa.IntegerCmpCondNotEqual) + builder.InsertInstruction(checkTypeID) + exitIfNotMatch := builder.AllocateInstruction() + exitIfNotMatch.AsExitIfTrueWithCode(c.execCtxPtrValue, checkTypeID.Return(), wazevoapi.ExitCodeIndirectCallTypeMismatch) + builder.InsertInstruction(exitIfNotMatch) + + // Now ready to call the function. Load the executable and moduleContextOpaquePtr from the function instance. 
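+ // The executable is the entry address of the callee's compiled code, and the opaque pointer is passed as the callee's module context argument.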
+ loadExecutablePtr := builder.AllocateInstruction() + loadExecutablePtr.AsLoad(functionInstancePtr, wazevoapi.FunctionInstanceExecutableOffset, ssa.TypeI64) + builder.InsertInstruction(loadExecutablePtr) + executablePtr := loadExecutablePtr.Return() + loadModuleContextOpaquePtr := builder.AllocateInstruction() + loadModuleContextOpaquePtr.AsLoad(functionInstancePtr, wazevoapi.FunctionInstanceModuleContextOpaquePtrOffset, ssa.TypeI64) + builder.InsertInstruction(loadModuleContextOpaquePtr) + moduleContextOpaquePtr := loadModuleContextOpaquePtr.Return() + + typ := &c.m.TypeSection[typeIndex] + tail := len(state.values) - len(typ.Params) + vs := state.values[tail:] + state.values = state.values[:tail] + args := c.allocateVarLengthValues(2+len(vs), c.execCtxPtrValue, moduleContextOpaquePtr) + args = args.Append(builder.VarLengthPool(), vs...) + + // Before transfer the control to the callee, we have to store the current module's moduleContextPtr + // into execContext.callerModuleContextPtr in case when the callee is a Go function. + c.storeCallerModuleContext() + + call := builder.AllocateInstruction() + call.AsCallIndirect(executablePtr, c.signatures[typ], args) + builder.InsertInstruction(call) + + first, rest := call.Returns() + if first.Valid() { + state.push(first) + } + for _, v := range rest { + state.push(v) + } + + c.reloadAfterCall() +} + +// memOpSetup inserts the bounds check and calculates the address of the memory operation (loads/stores). +func (c *Compiler) memOpSetup(baseAddr ssa.Value, constOffset, operationSizeInBytes uint64) (address ssa.Value) { + address = ssa.ValueInvalid + builder := c.ssaBuilder + + baseAddrID := baseAddr.ID() + ceil := constOffset + operationSizeInBytes + if known := c.getKnownSafeBound(baseAddrID); known.valid() { + // We reuse the calculated absolute address even if the bound is not known to be safe. + address = known.absoluteAddr + if ceil <= known.bound { + if !address.Valid() { + // This means that, the bound is known to be safe, but the memory base might have changed. + // So, we re-calculate the address. + memBase := c.getMemoryBaseValue(false) + extBaseAddr := builder.AllocateInstruction(). + AsUExtend(baseAddr, 32, 64). + Insert(builder). + Return() + address = builder.AllocateInstruction(). + AsIadd(memBase, extBaseAddr).Insert(builder).Return() + known.absoluteAddr = address // Update the absolute address for the subsequent memory access. + } + return + } + } + + ceilConst := builder.AllocateInstruction() + ceilConst.AsIconst64(ceil) + builder.InsertInstruction(ceilConst) + + // We calculate the offset in 64-bit space. + extBaseAddr := builder.AllocateInstruction(). + AsUExtend(baseAddr, 32, 64). + Insert(builder). + Return() + + // Note: memLen is already zero extended to 64-bit space at the load time. + memLen := c.getMemoryLenValue(false) + + // baseAddrPlusCeil = baseAddr + ceil + baseAddrPlusCeil := builder.AllocateInstruction() + baseAddrPlusCeil.AsIadd(extBaseAddr, ceilConst.Return()) + builder.InsertInstruction(baseAddrPlusCeil) + + // Check for out of bounds memory access: `memLen >= baseAddrPlusCeil`. + cmp := builder.AllocateInstruction() + cmp.AsIcmp(memLen, baseAddrPlusCeil.Return(), ssa.IntegerCmpCondUnsignedLessThan) + builder.InsertInstruction(cmp) + exitIfNZ := builder.AllocateInstruction() + exitIfNZ.AsExitIfTrueWithCode(c.execCtxPtrValue, cmp.Return(), wazevoapi.ExitCodeMemoryOutOfBounds) + builder.InsertInstruction(exitIfNZ) + + // Load the value from memBase + extBaseAddr. 
+ if address == ssa.ValueInvalid { // Reuse the value if the memBase is already calculated at this point. + memBase := c.getMemoryBaseValue(false) + address = builder.AllocateInstruction(). + AsIadd(memBase, extBaseAddr).Insert(builder).Return() + } + + // Record the bound ceil for this baseAddr is known to be safe for the subsequent memory access in the same block. + c.recordKnownSafeBound(baseAddrID, ceil, address) + return +} + +// atomicMemOpSetup inserts the bounds check and calculates the address of the memory operation (loads/stores), including +// the constant offset and performs an alignment check on the final address. +func (c *Compiler) atomicMemOpSetup(baseAddr ssa.Value, constOffset, operationSizeInBytes uint64) (address ssa.Value) { + builder := c.ssaBuilder + + addrWithoutOffset := c.memOpSetup(baseAddr, constOffset, operationSizeInBytes) + var addr ssa.Value + if constOffset == 0 { + addr = addrWithoutOffset + } else { + offset := builder.AllocateInstruction().AsIconst64(constOffset).Insert(builder).Return() + addr = builder.AllocateInstruction().AsIadd(addrWithoutOffset, offset).Insert(builder).Return() + } + + c.memAlignmentCheck(addr, operationSizeInBytes) + + return addr +} + +func (c *Compiler) memAlignmentCheck(addr ssa.Value, operationSizeInBytes uint64) { + if operationSizeInBytes == 1 { + return // No alignment restrictions when accessing a byte + } + var checkBits uint64 + switch operationSizeInBytes { + case 2: + checkBits = 0b1 + case 4: + checkBits = 0b11 + case 8: + checkBits = 0b111 + } + + builder := c.ssaBuilder + + mask := builder.AllocateInstruction().AsIconst64(checkBits).Insert(builder).Return() + masked := builder.AllocateInstruction().AsBand(addr, mask).Insert(builder).Return() + zero := builder.AllocateInstruction().AsIconst64(0).Insert(builder).Return() + cmp := builder.AllocateInstruction().AsIcmp(masked, zero, ssa.IntegerCmpCondNotEqual).Insert(builder).Return() + builder.AllocateInstruction().AsExitIfTrueWithCode(c.execCtxPtrValue, cmp, wazevoapi.ExitCodeUnalignedAtomic).Insert(builder) +} + +func (c *Compiler) callMemmove(dst, src, size ssa.Value) { + args := c.allocateVarLengthValues(3, dst, src, size) + if size.Type() != ssa.TypeI64 { + panic("TODO: memmove size must be i64") + } + + builder := c.ssaBuilder + memmovePtr := builder.AllocateInstruction(). + AsLoad(c.execCtxPtrValue, + wazevoapi.ExecutionContextOffsetMemmoveAddress.U32(), + ssa.TypeI64, + ).Insert(builder).Return() + builder.AllocateInstruction().AsCallGoRuntimeMemmove(memmovePtr, &c.memmoveSig, args).Insert(builder) +} + +func (c *Compiler) reloadAfterCall() { + // Note that when these are not used in the following instructions, they will be optimized out. + // So in any ways, we define them! + + // After calling any function, memory buffer might have changed. So we need to re-define the variable. + // However, if the memory is shared, we don't need to reload the memory base and length as the base will never change. + if c.needMemory && !c.memoryShared { + c.reloadMemoryBaseLen() + } + + // Also, any mutable Global can change. + for _, index := range c.mutableGlobalVariablesIndexes { + _ = c.getWasmGlobalValue(index, true) + } +} + +func (c *Compiler) reloadMemoryBaseLen() { + _ = c.getMemoryBaseValue(true) + _ = c.getMemoryLenValue(true) + + // This function being called means that the memory base might have changed. 
+ // Therefore, we need to clear the absolute addresses recorded in the known safe bounds + // because we cache the absolute address of the memory access per each base offset. + c.resetAbsoluteAddressInSafeBounds() +} + +func (c *Compiler) setWasmGlobalValue(index wasm.Index, v ssa.Value) { + variable := c.globalVariables[index] + opaqueOffset := c.offset.GlobalInstanceOffset(index) + + builder := c.ssaBuilder + if index < c.m.ImportGlobalCount { + loadGlobalInstPtr := builder.AllocateInstruction() + loadGlobalInstPtr.AsLoad(c.moduleCtxPtrValue, uint32(opaqueOffset), ssa.TypeI64) + builder.InsertInstruction(loadGlobalInstPtr) + + store := builder.AllocateInstruction() + store.AsStore(ssa.OpcodeStore, v, loadGlobalInstPtr.Return(), uint32(0)) + builder.InsertInstruction(store) + + } else { + store := builder.AllocateInstruction() + store.AsStore(ssa.OpcodeStore, v, c.moduleCtxPtrValue, uint32(opaqueOffset)) + builder.InsertInstruction(store) + } + + // The value has changed to `v`, so we record it. + builder.DefineVariableInCurrentBB(variable, v) +} + +func (c *Compiler) getWasmGlobalValue(index wasm.Index, forceLoad bool) ssa.Value { + variable := c.globalVariables[index] + typ := c.globalVariablesTypes[index] + opaqueOffset := c.offset.GlobalInstanceOffset(index) + + builder := c.ssaBuilder + if !forceLoad { + if v := builder.FindValueInLinearPath(variable); v.Valid() { + return v + } + } + + var load *ssa.Instruction + if index < c.m.ImportGlobalCount { + loadGlobalInstPtr := builder.AllocateInstruction() + loadGlobalInstPtr.AsLoad(c.moduleCtxPtrValue, uint32(opaqueOffset), ssa.TypeI64) + builder.InsertInstruction(loadGlobalInstPtr) + load = builder.AllocateInstruction(). + AsLoad(loadGlobalInstPtr.Return(), uint32(0), typ) + } else { + load = builder.AllocateInstruction(). 
+ AsLoad(c.moduleCtxPtrValue, uint32(opaqueOffset), typ) + } + + v := load.Insert(builder).Return() + builder.DefineVariableInCurrentBB(variable, v) + return v +} + +const ( + memoryInstanceBufOffset = 0 + memoryInstanceBufSizeOffset = memoryInstanceBufOffset + 8 +) + +func (c *Compiler) getMemoryBaseValue(forceReload bool) ssa.Value { + builder := c.ssaBuilder + variable := c.memoryBaseVariable + if !forceReload { + if v := builder.FindValueInLinearPath(variable); v.Valid() { + return v + } + } + + var ret ssa.Value + if c.offset.LocalMemoryBegin < 0 { + loadMemInstPtr := builder.AllocateInstruction() + loadMemInstPtr.AsLoad(c.moduleCtxPtrValue, c.offset.ImportedMemoryBegin.U32(), ssa.TypeI64) + builder.InsertInstruction(loadMemInstPtr) + memInstPtr := loadMemInstPtr.Return() + + loadBufPtr := builder.AllocateInstruction() + loadBufPtr.AsLoad(memInstPtr, memoryInstanceBufOffset, ssa.TypeI64) + builder.InsertInstruction(loadBufPtr) + ret = loadBufPtr.Return() + } else { + load := builder.AllocateInstruction() + load.AsLoad(c.moduleCtxPtrValue, c.offset.LocalMemoryBase().U32(), ssa.TypeI64) + builder.InsertInstruction(load) + ret = load.Return() + } + + builder.DefineVariableInCurrentBB(variable, ret) + return ret +} + +func (c *Compiler) getMemoryLenValue(forceReload bool) ssa.Value { + variable := c.memoryLenVariable + builder := c.ssaBuilder + if !forceReload && !c.memoryShared { + if v := builder.FindValueInLinearPath(variable); v.Valid() { + return v + } + } + + var ret ssa.Value + if c.offset.LocalMemoryBegin < 0 { + loadMemInstPtr := builder.AllocateInstruction() + loadMemInstPtr.AsLoad(c.moduleCtxPtrValue, c.offset.ImportedMemoryBegin.U32(), ssa.TypeI64) + builder.InsertInstruction(loadMemInstPtr) + memInstPtr := loadMemInstPtr.Return() + + loadBufSizePtr := builder.AllocateInstruction() + if c.memoryShared { + sizeOffset := builder.AllocateInstruction().AsIconst64(memoryInstanceBufSizeOffset).Insert(builder).Return() + addr := builder.AllocateInstruction().AsIadd(memInstPtr, sizeOffset).Insert(builder).Return() + loadBufSizePtr.AsAtomicLoad(addr, 8, ssa.TypeI64) + } else { + loadBufSizePtr.AsLoad(memInstPtr, memoryInstanceBufSizeOffset, ssa.TypeI64) + } + builder.InsertInstruction(loadBufSizePtr) + + ret = loadBufSizePtr.Return() + } else { + load := builder.AllocateInstruction() + if c.memoryShared { + lenOffset := builder.AllocateInstruction().AsIconst64(c.offset.LocalMemoryLen().U64()).Insert(builder).Return() + addr := builder.AllocateInstruction().AsIadd(c.moduleCtxPtrValue, lenOffset).Insert(builder).Return() + load.AsAtomicLoad(addr, 8, ssa.TypeI64) + } else { + load.AsExtLoad(ssa.OpcodeUload32, c.moduleCtxPtrValue, c.offset.LocalMemoryLen().U32(), true) + } + builder.InsertInstruction(load) + ret = load.Return() + } + + builder.DefineVariableInCurrentBB(variable, ret) + return ret +} + +func (c *Compiler) insertIcmp(cond ssa.IntegerCmpCond) { + state, builder := c.state(), c.ssaBuilder + y, x := state.pop(), state.pop() + cmp := builder.AllocateInstruction() + cmp.AsIcmp(x, y, cond) + builder.InsertInstruction(cmp) + value := cmp.Return() + state.push(value) +} + +func (c *Compiler) insertFcmp(cond ssa.FloatCmpCond) { + state, builder := c.state(), c.ssaBuilder + y, x := state.pop(), state.pop() + cmp := builder.AllocateInstruction() + cmp.AsFcmp(x, y, cond) + builder.InsertInstruction(cmp) + value := cmp.Return() + state.push(value) +} + +// storeCallerModuleContext stores the current module's moduleContextPtr into execContext.callerModuleContextPtr. 
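+// This is needed so that, when the callee is a Go function, the caller's module context can be located via the execution context.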
+func (c *Compiler) storeCallerModuleContext() { + builder := c.ssaBuilder + execCtx := c.execCtxPtrValue + store := builder.AllocateInstruction() + store.AsStore(ssa.OpcodeStore, + c.moduleCtxPtrValue, execCtx, wazevoapi.ExecutionContextOffsetCallerModuleContextPtr.U32()) + builder.InsertInstruction(store) +} + +func (c *Compiler) readByte() byte { + v := c.wasmFunctionBody[c.loweringState.pc+1] + c.loweringState.pc++ + return v +} + +func (c *Compiler) readI32u() uint32 { + v, n, err := leb128.LoadUint32(c.wasmFunctionBody[c.loweringState.pc+1:]) + if err != nil { + panic(err) // shouldn't be reached since compilation comes after validation. + } + c.loweringState.pc += int(n) + return v +} + +func (c *Compiler) readI32s() int32 { + v, n, err := leb128.LoadInt32(c.wasmFunctionBody[c.loweringState.pc+1:]) + if err != nil { + panic(err) // shouldn't be reached since compilation comes after validation. + } + c.loweringState.pc += int(n) + return v +} + +func (c *Compiler) readI64s() int64 { + v, n, err := leb128.LoadInt64(c.wasmFunctionBody[c.loweringState.pc+1:]) + if err != nil { + panic(err) // shouldn't be reached since compilation comes after validation. + } + c.loweringState.pc += int(n) + return v +} + +func (c *Compiler) readF32() float32 { + v := math.Float32frombits(binary.LittleEndian.Uint32(c.wasmFunctionBody[c.loweringState.pc+1:])) + c.loweringState.pc += 4 + return v +} + +func (c *Compiler) readF64() float64 { + v := math.Float64frombits(binary.LittleEndian.Uint64(c.wasmFunctionBody[c.loweringState.pc+1:])) + c.loweringState.pc += 8 + return v +} + +// readBlockType reads the block type from the current position of the bytecode reader. +func (c *Compiler) readBlockType() *wasm.FunctionType { + state := c.state() + + c.br.Reset(c.wasmFunctionBody[state.pc+1:]) + bt, num, err := wasm.DecodeBlockType(c.m.TypeSection, c.br, api.CoreFeaturesV2) + if err != nil { + panic(err) // shouldn't be reached since compilation comes after validation. + } + state.pc += int(num) + + return bt +} + +func (c *Compiler) readMemArg() (align, offset uint32) { + state := c.state() + + align, num, err := leb128.LoadUint32(c.wasmFunctionBody[state.pc+1:]) + if err != nil { + panic(fmt.Errorf("read memory align: %v", err)) + } + + state.pc += int(num) + offset, num, err = leb128.LoadUint32(c.wasmFunctionBody[state.pc+1:]) + if err != nil { + panic(fmt.Errorf("read memory offset: %v", err)) + } + + state.pc += int(num) + return align, offset +} + +// insertJumpToBlock inserts a jump instruction to the given block in the current block. +func (c *Compiler) insertJumpToBlock(args ssa.Values, targetBlk ssa.BasicBlock) { + if targetBlk.ReturnBlock() { + if c.needListener { + c.callListenerAfter() + } + } + + builder := c.ssaBuilder + jmp := builder.AllocateInstruction() + jmp.AsJump(args, targetBlk) + builder.InsertInstruction(jmp) +} + +func (c *Compiler) insertIntegerExtend(signed bool, from, to byte) { + state := c.state() + builder := c.ssaBuilder + v := state.pop() + extend := builder.AllocateInstruction() + if signed { + extend.AsSExtend(v, from, to) + } else { + extend.AsUExtend(v, from, to) + } + builder.InsertInstruction(extend) + value := extend.Return() + state.push(value) +} + +func (c *Compiler) switchTo(originalStackLen int, targetBlk ssa.BasicBlock) { + if targetBlk.Preds() == 0 { + c.loweringState.unreachable = true + } + + // Now we should adjust the stack and start translating the continuation block. 
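+ // Concretely: truncate the value stack to its height at block entry, then re-push the target block's parameters below.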
+ c.loweringState.values = c.loweringState.values[:originalStackLen] + + c.ssaBuilder.SetCurrentBlock(targetBlk) + + // At this point, blocks params consist only of the Wasm-level parameters, + // (since it's added only when we are trying to resolve variable *inside* this block). + for i := 0; i < targetBlk.Params(); i++ { + value := targetBlk.Param(i) + c.loweringState.push(value) + } +} + +// results returns the number of results of the current function. +func (c *Compiler) results() int { + return len(c.wasmFunctionTyp.Results) +} + +func (c *Compiler) lowerBrTable(labels []uint32, index ssa.Value) { + state := c.state() + builder := c.ssaBuilder + + f := state.ctrlPeekAt(int(labels[0])) + var numArgs int + if f.isLoop() { + numArgs = len(f.blockType.Params) + } else { + numArgs = len(f.blockType.Results) + } + + targets := make([]ssa.BasicBlock, len(labels)) + + // We need trampoline blocks since depending on the target block structure, we might end up inserting moves before jumps, + // which cannot be done with br_table. Instead, we can do such per-block moves in the trampoline blocks. + // At the linking phase (very end of the backend), we can remove the unnecessary jumps, and therefore no runtime overhead. + currentBlk := builder.CurrentBlock() + for i, l := range labels { + // Args are always on the top of the stack. Note that we should not share the args slice + // among the jump instructions since the args are modified during passes (e.g. redundant phi elimination). + args := c.nPeekDup(numArgs) + targetBlk, _ := state.brTargetArgNumFor(l) + trampoline := builder.AllocateBasicBlock() + builder.SetCurrentBlock(trampoline) + c.insertJumpToBlock(args, targetBlk) + targets[i] = trampoline + } + builder.SetCurrentBlock(currentBlk) + + // If the target block has no arguments, we can just jump to the target block. + brTable := builder.AllocateInstruction() + brTable.AsBrTable(index, targets) + builder.InsertInstruction(brTable) + + for _, trampoline := range targets { + builder.Seal(trampoline) + } +} + +func (l *loweringState) brTargetArgNumFor(labelIndex uint32) (targetBlk ssa.BasicBlock, argNum int) { + targetFrame := l.ctrlPeekAt(int(labelIndex)) + if targetFrame.isLoop() { + targetBlk, argNum = targetFrame.blk, len(targetFrame.blockType.Params) + } else { + targetBlk, argNum = targetFrame.followingBlock, len(targetFrame.blockType.Results) + } + return +} + +func (c *Compiler) callListenerBefore() { + c.storeCallerModuleContext() + + builder := c.ssaBuilder + beforeListeners1stElement := builder.AllocateInstruction(). + AsLoad(c.moduleCtxPtrValue, + c.offset.BeforeListenerTrampolines1stElement.U32(), + ssa.TypeI64, + ).Insert(builder).Return() + + beforeListenerPtr := builder.AllocateInstruction(). + AsLoad(beforeListeners1stElement, uint32(c.wasmFunctionTypeIndex)*8 /* 8 bytes per index */, ssa.TypeI64).Insert(builder).Return() + + entry := builder.EntryBlock() + ps := entry.Params() + + args := c.allocateVarLengthValues(ps, c.execCtxPtrValue, + builder.AllocateInstruction().AsIconst32(c.wasmLocalFunctionIndex).Insert(builder).Return()) + for i := 2; i < ps; i++ { + args = args.Append(builder.VarLengthPool(), entry.Param(i)) + } + + beforeSig := c.listenerSignatures[c.wasmFunctionTyp][0] + builder.AllocateInstruction(). + AsCallIndirect(beforeListenerPtr, beforeSig, args). + Insert(builder) +} + +func (c *Compiler) callListenerAfter() { + c.storeCallerModuleContext() + + builder := c.ssaBuilder + afterListeners1stElement := builder.AllocateInstruction(). 
+ AsLoad(c.moduleCtxPtrValue, + c.offset.AfterListenerTrampolines1stElement.U32(), + ssa.TypeI64, + ).Insert(builder).Return() + + afterListenerPtr := builder.AllocateInstruction(). + AsLoad(afterListeners1stElement, + uint32(c.wasmFunctionTypeIndex)*8 /* 8 bytes per index */, ssa.TypeI64). + Insert(builder). + Return() + + afterSig := c.listenerSignatures[c.wasmFunctionTyp][1] + args := c.allocateVarLengthValues( + c.results()+2, + c.execCtxPtrValue, + builder.AllocateInstruction().AsIconst32(c.wasmLocalFunctionIndex).Insert(builder).Return(), + ) + + l := c.state() + tail := len(l.values) + args = args.Append(c.ssaBuilder.VarLengthPool(), l.values[tail-c.results():tail]...) + builder.AllocateInstruction(). + AsCallIndirect(afterListenerPtr, afterSig, args). + Insert(builder) +} + +const ( + elementOrDataInstanceLenOffset = 8 + elementOrDataInstanceSize = 24 +) + +// dropInstance inserts instructions to drop the element/data instance specified by the given index. +func (c *Compiler) dropDataOrElementInstance(index uint32, firstItemOffset wazevoapi.Offset) { + builder := c.ssaBuilder + instPtr := c.dataOrElementInstanceAddr(index, firstItemOffset) + + zero := builder.AllocateInstruction().AsIconst64(0).Insert(builder).Return() + + // Clear the instance. + builder.AllocateInstruction().AsStore(ssa.OpcodeStore, zero, instPtr, 0).Insert(builder) + builder.AllocateInstruction().AsStore(ssa.OpcodeStore, zero, instPtr, elementOrDataInstanceLenOffset).Insert(builder) + builder.AllocateInstruction().AsStore(ssa.OpcodeStore, zero, instPtr, elementOrDataInstanceLenOffset+8).Insert(builder) +} + +func (c *Compiler) dataOrElementInstanceAddr(index uint32, firstItemOffset wazevoapi.Offset) ssa.Value { + builder := c.ssaBuilder + + _1stItemPtr := builder. + AllocateInstruction(). + AsLoad(c.moduleCtxPtrValue, firstItemOffset.U32(), ssa.TypeI64). + Insert(builder).Return() + + // Each data/element instance is a slice, so we need to multiply index by 16 to get the offset of the target instance. + index = index * elementOrDataInstanceSize + indexExt := builder.AllocateInstruction().AsIconst64(uint64(index)).Insert(builder).Return() + // Then, add the offset to the address of the instance. + instPtr := builder.AllocateInstruction().AsIadd(_1stItemPtr, indexExt).Insert(builder).Return() + return instPtr +} + +func (c *Compiler) boundsCheckInDataOrElementInstance(instPtr, offsetInInstance, copySize ssa.Value, exitCode wazevoapi.ExitCode) { + builder := c.ssaBuilder + dataInstLen := builder.AllocateInstruction(). + AsLoad(instPtr, elementOrDataInstanceLenOffset, ssa.TypeI64). + Insert(builder).Return() + ceil := builder.AllocateInstruction().AsIadd(offsetInInstance, copySize).Insert(builder).Return() + cmp := builder.AllocateInstruction(). + AsIcmp(dataInstLen, ceil, ssa.IntegerCmpCondUnsignedLessThan). + Insert(builder). + Return() + builder.AllocateInstruction(). + AsExitIfTrueWithCode(c.execCtxPtrValue, cmp, exitCode). + Insert(builder) +} + +func (c *Compiler) boundsCheckInTable(tableIndex uint32, offset, size ssa.Value) (tableInstancePtr ssa.Value) { + builder := c.ssaBuilder + dstCeil := builder.AllocateInstruction().AsIadd(offset, size).Insert(builder).Return() + + // Load the table. + tableInstancePtr = builder.AllocateInstruction(). + AsLoad(c.moduleCtxPtrValue, c.offset.TableOffset(int(tableIndex)).U32(), ssa.TypeI64). + Insert(builder).Return() + + // Load the table's length. + tableLen := builder.AllocateInstruction(). 
+ AsLoad(tableInstancePtr, tableInstanceLenOffset, ssa.TypeI32).Insert(builder).Return() + tableLenExt := builder.AllocateInstruction().AsUExtend(tableLen, 32, 64).Insert(builder).Return() + + // Compare the length and the target, and trap if out of bounds. + checkOOB := builder.AllocateInstruction() + checkOOB.AsIcmp(tableLenExt, dstCeil, ssa.IntegerCmpCondUnsignedLessThan) + builder.InsertInstruction(checkOOB) + exitIfOOB := builder.AllocateInstruction() + exitIfOOB.AsExitIfTrueWithCode(c.execCtxPtrValue, checkOOB.Return(), wazevoapi.ExitCodeTableOutOfBounds) + builder.InsertInstruction(exitIfOOB) + return +} + +func (c *Compiler) loadTableBaseAddr(tableInstancePtr ssa.Value) ssa.Value { + builder := c.ssaBuilder + loadTableBaseAddress := builder. + AllocateInstruction(). + AsLoad(tableInstancePtr, tableInstanceBaseAddressOffset, ssa.TypeI64). + Insert(builder) + return loadTableBaseAddress.Return() +} + +func (c *Compiler) boundsCheckInMemory(memLen, offset, size ssa.Value) { + builder := c.ssaBuilder + ceil := builder.AllocateInstruction().AsIadd(offset, size).Insert(builder).Return() + cmp := builder.AllocateInstruction(). + AsIcmp(memLen, ceil, ssa.IntegerCmpCondUnsignedLessThan). + Insert(builder). + Return() + builder.AllocateInstruction(). + AsExitIfTrueWithCode(c.execCtxPtrValue, cmp, wazevoapi.ExitCodeMemoryOutOfBounds). + Insert(builder) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/misc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/misc.go new file mode 100644 index 000000000..2db2b892c --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/misc.go @@ -0,0 +1,10 @@ +package frontend + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/wasm" +) + +func FunctionIndexToFuncRef(idx wasm.Index) ssa.FuncRef { + return ssa.FuncRef(idx) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/sort_id.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/sort_id.go new file mode 100644 index 000000000..1296706f5 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/sort_id.go @@ -0,0 +1,15 @@ +//go:build go1.21 + +package frontend + +import ( + "slices" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +func sortSSAValueIDs(IDs []ssa.ValueID) { + slices.SortFunc(IDs, func(i, j ssa.ValueID) int { + return int(i) - int(j) + }) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/sort_id_old.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/sort_id_old.go new file mode 100644 index 000000000..2e786a160 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/sort_id_old.go @@ -0,0 +1,17 @@ +//go:build !go1.21 + +// TODO: delete after the floor Go version is 1.21 + +package frontend + +import ( + "sort" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +func sortSSAValueIDs(IDs []ssa.ValueID) { + sort.SliceStable(IDs, func(i, j int) bool { + return int(IDs[i]) < int(IDs[j]) + }) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/hostmodule.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/hostmodule.go new file mode 100644 index 000000000..8da7347a9 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/hostmodule.go @@ -0,0 +1,82 @@ +package wazevo + +import ( + 
"encoding/binary" + "reflect" + "unsafe" + + "github.com/tetratelabs/wazero/experimental" + "github.com/tetratelabs/wazero/internal/wasm" +) + +func buildHostModuleOpaque(m *wasm.Module, listeners []experimental.FunctionListener) moduleContextOpaque { + size := len(m.CodeSection)*16 + 32 + ret := newAlignedOpaque(size) + + binary.LittleEndian.PutUint64(ret[0:], uint64(uintptr(unsafe.Pointer(m)))) + + if len(listeners) > 0 { + sliceHeader := (*reflect.SliceHeader)(unsafe.Pointer(&listeners)) + binary.LittleEndian.PutUint64(ret[8:], uint64(sliceHeader.Data)) + binary.LittleEndian.PutUint64(ret[16:], uint64(sliceHeader.Len)) + binary.LittleEndian.PutUint64(ret[24:], uint64(sliceHeader.Cap)) + } + + offset := 32 + for i := range m.CodeSection { + goFn := m.CodeSection[i].GoFunc + writeIface(goFn, ret[offset:]) + offset += 16 + } + return ret +} + +func hostModuleFromOpaque(opaqueBegin uintptr) *wasm.Module { + var opaqueViewOverSlice []byte + sh := (*reflect.SliceHeader)(unsafe.Pointer(&opaqueViewOverSlice)) + sh.Data = opaqueBegin + sh.Len = 32 + sh.Cap = 32 + return *(**wasm.Module)(unsafe.Pointer(&opaqueViewOverSlice[0])) +} + +func hostModuleListenersSliceFromOpaque(opaqueBegin uintptr) []experimental.FunctionListener { + var opaqueViewOverSlice []byte + sh := (*reflect.SliceHeader)(unsafe.Pointer(&opaqueViewOverSlice)) + sh.Data = opaqueBegin + sh.Len = 32 + sh.Cap = 32 + + b := binary.LittleEndian.Uint64(opaqueViewOverSlice[8:]) + l := binary.LittleEndian.Uint64(opaqueViewOverSlice[16:]) + c := binary.LittleEndian.Uint64(opaqueViewOverSlice[24:]) + var ret []experimental.FunctionListener + sh = (*reflect.SliceHeader)(unsafe.Pointer(&ret)) + sh.Data = uintptr(b) + setSliceLimits(sh, uintptr(l), uintptr(c)) + return ret +} + +func hostModuleGoFuncFromOpaque[T any](index int, opaqueBegin uintptr) T { + offset := uintptr(index*16) + 32 + ptr := opaqueBegin + offset + + var opaqueViewOverFunction []byte + sh := (*reflect.SliceHeader)(unsafe.Pointer(&opaqueViewOverFunction)) + sh.Data = ptr + sh.Len = 16 + sh.Cap = 16 + return readIface(opaqueViewOverFunction).(T) +} + +func writeIface(goFn interface{}, buf []byte) { + goFnIface := *(*[2]uint64)(unsafe.Pointer(&goFn)) + binary.LittleEndian.PutUint64(buf, goFnIface[0]) + binary.LittleEndian.PutUint64(buf[8:], goFnIface[1]) +} + +func readIface(buf []byte) interface{} { + b := binary.LittleEndian.Uint64(buf) + s := binary.LittleEndian.Uint64(buf[8:]) + return *(*interface{})(unsafe.Pointer(&[2]uint64{b, s})) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/isa_amd64.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/isa_amd64.go new file mode 100644 index 000000000..da27cc108 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/isa_amd64.go @@ -0,0 +1,30 @@ +//go:build amd64 + +package wazevo + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64" +) + +func newMachine() backend.Machine { + return amd64.NewBackend() +} + +// unwindStack is a function to unwind the stack, and appends return addresses to `returnAddresses` slice. +// The implementation must be aligned with the ABI/Calling convention. 
+func unwindStack(sp, fp, top uintptr, returnAddresses []uintptr) []uintptr { + return amd64.UnwindStack(sp, fp, top, returnAddresses) +} + +// goCallStackView is a function to get a view of the stack before a Go call, which +// is the view of the stack allocated in CompileGoFunctionTrampoline. +func goCallStackView(stackPointerBeforeGoCall *uint64) []uint64 { + return amd64.GoCallStackView(stackPointerBeforeGoCall) +} + +// adjustClonedStack is a function to adjust the stack after it is grown. +// More precisely, absolute addresses (frame pointers) in the stack must be adjusted. +func adjustClonedStack(oldsp, oldTop, sp, fp, top uintptr) { + amd64.AdjustClonedStack(oldsp, oldTop, sp, fp, top) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/isa_arm64.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/isa_arm64.go new file mode 100644 index 000000000..e7a846548 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/isa_arm64.go @@ -0,0 +1,32 @@ +//go:build arm64 + +package wazevo + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64" +) + +func newMachine() backend.Machine { + return arm64.NewBackend() +} + +// unwindStack is a function to unwind the stack, and appends return addresses to `returnAddresses` slice. +// The implementation must be aligned with the ABI/Calling convention. +func unwindStack(sp, fp, top uintptr, returnAddresses []uintptr) []uintptr { + return arm64.UnwindStack(sp, fp, top, returnAddresses) +} + +// goCallStackView is a function to get a view of the stack before a Go call, which +// is the view of the stack allocated in CompileGoFunctionTrampoline. +func goCallStackView(stackPointerBeforeGoCall *uint64) []uint64 { + return arm64.GoCallStackView(stackPointerBeforeGoCall) +} + +// adjustClonedStack is a function to adjust the stack after it is grown. +// More precisely, absolute addresses (frame pointers) in the stack must be adjusted. +func adjustClonedStack(oldsp, oldTop, sp, fp, top uintptr) { + // TODO: currently, the frame pointers are not used, and saved old sps are relative to the current stack pointer, + // so no need to adjustment on arm64. However, when we make it absolute, which in my opinion is better perf-wise + // at the expense of slightly costly stack growth, we need to adjust the pushed frame pointers. +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/isa_other.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/isa_other.go new file mode 100644 index 000000000..c5afc6314 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/isa_other.go @@ -0,0 +1,29 @@ +//go:build !(amd64 || arm64) + +package wazevo + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" +) + +func newMachine() backend.Machine { + panic("unsupported architecture") +} + +// unwindStack is a function to unwind the stack, and appends return addresses to `returnAddresses` slice. +// The implementation must be aligned with the ABI/Calling convention. +func unwindStack(sp, fp, top uintptr, returnAddresses []uintptr) []uintptr { + panic("unsupported architecture") +} + +// goCallStackView is a function to get a view of the stack before a Go call, which +// is the view of the stack allocated in CompileGoFunctionTrampoline. 
+func goCallStackView(stackPointerBeforeGoCall *uint64) []uint64 { + panic("unsupported architecture") +} + +// adjustClonedStack is a function to adjust the stack after it is grown. +// More precisely, absolute addresses (frame pointers) in the stack must be adjusted. +func adjustClonedStack(oldsp, oldTop, sp, fp, top uintptr) { + panic("unsupported architecture") +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/memmove.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/memmove.go new file mode 100644 index 000000000..889922107 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/memmove.go @@ -0,0 +1,11 @@ +package wazevo + +import ( + "reflect" + "unsafe" +) + +//go:linkname memmove runtime.memmove +func memmove(_, _ unsafe.Pointer, _ uintptr) + +var memmovPtr = reflect.ValueOf(memmove).Pointer() diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/module_engine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/module_engine.go new file mode 100644 index 000000000..ba8f546c0 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/module_engine.go @@ -0,0 +1,344 @@ +package wazevo + +import ( + "encoding/binary" + "unsafe" + + "github.com/tetratelabs/wazero/api" + "github.com/tetratelabs/wazero/experimental" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" + "github.com/tetratelabs/wazero/internal/wasm" + "github.com/tetratelabs/wazero/internal/wasmruntime" +) + +type ( + // moduleEngine implements wasm.ModuleEngine. + moduleEngine struct { + // opaquePtr equals &opaque[0]. + opaquePtr *byte + parent *compiledModule + module *wasm.ModuleInstance + opaque moduleContextOpaque + localFunctionInstances []*functionInstance + importedFunctions []importedFunction + listeners []experimental.FunctionListener + } + + functionInstance struct { + executable *byte + moduleContextOpaquePtr *byte + typeID wasm.FunctionTypeID + indexInModule wasm.Index + } + + importedFunction struct { + me *moduleEngine + indexInModule wasm.Index + } + + // moduleContextOpaque is the opaque byte slice of Module instance specific contents whose size + // is only Wasm-compile-time known, hence dynamic. Its contents are basically the pointers to the module instance, + // specific objects as well as functions. This is sometimes called "VMContext" in other Wasm runtimes. + // + // Internally, the buffer is structured as follows: + // + // type moduleContextOpaque struct { + // moduleInstance *wasm.ModuleInstance + // localMemoryBufferPtr *byte (optional) + // localMemoryLength uint64 (optional) + // importedMemoryInstance *wasm.MemoryInstance (optional) + // importedMemoryOwnerOpaqueCtx *byte (optional) + // importedFunctions [# of importedFunctions]functionInstance + // importedGlobals []ImportedGlobal (optional) + // localGlobals []Global (optional) + // typeIDsBegin &wasm.ModuleInstance.TypeIDs[0] (optional) + // tables []*wasm.TableInstance (optional) + // beforeListenerTrampolines1stElement **byte (optional) + // afterListenerTrampolines1stElement **byte (optional) + // dataInstances1stElement []wasm.DataInstance (optional) + // elementInstances1stElement []wasm.ElementInstance (optional) + // } + // + // type ImportedGlobal struct { + // *Global + // _ uint64 // padding + // } + // + // type Global struct { + // Val, ValHi uint64 + // } + // + // See wazevoapi.NewModuleContextOffsetData for the details of the offsets. 
+ // + // Note that for host modules, the structure is entirely different. See buildHostModuleOpaque. + moduleContextOpaque []byte +) + +func newAlignedOpaque(size int) moduleContextOpaque { + // Check if the size is a multiple of 16. + if size%16 != 0 { + panic("size must be a multiple of 16") + } + buf := make([]byte, size+16) + // Align the buffer to 16 bytes. + rem := uintptr(unsafe.Pointer(&buf[0])) % 16 + buf = buf[16-rem:] + return buf +} + +func putLocalMemory(opaque []byte, offset wazevoapi.Offset, mem *wasm.MemoryInstance) { + s := uint64(len(mem.Buffer)) + var b uint64 + if len(mem.Buffer) > 0 { + b = uint64(uintptr(unsafe.Pointer(&mem.Buffer[0]))) + } + binary.LittleEndian.PutUint64(opaque[offset:], b) + binary.LittleEndian.PutUint64(opaque[offset+8:], s) +} + +func (m *moduleEngine) setupOpaque() { + inst := m.module + offsets := &m.parent.offsets + opaque := m.opaque + + binary.LittleEndian.PutUint64(opaque[offsets.ModuleInstanceOffset:], + uint64(uintptr(unsafe.Pointer(m.module))), + ) + + if lm := offsets.LocalMemoryBegin; lm >= 0 { + putLocalMemory(opaque, lm, inst.MemoryInstance) + } + + // Note: imported memory is resolved in ResolveImportedFunction. + + // Note: imported functions are resolved in ResolveImportedFunction. + + if globalOffset := offsets.GlobalsBegin; globalOffset >= 0 { + for i, g := range inst.Globals { + if i < int(inst.Source.ImportGlobalCount) { + importedME := g.Me.(*moduleEngine) + offset := importedME.parent.offsets.GlobalInstanceOffset(g.Index) + importedMEOpaque := importedME.opaque + binary.LittleEndian.PutUint64(opaque[globalOffset:], + uint64(uintptr(unsafe.Pointer(&importedMEOpaque[offset])))) + } else { + binary.LittleEndian.PutUint64(opaque[globalOffset:], g.Val) + binary.LittleEndian.PutUint64(opaque[globalOffset+8:], g.ValHi) + } + globalOffset += 16 + } + } + + if tableOffset := offsets.TablesBegin; tableOffset >= 0 { + // First we write the first element's address of typeIDs. + if len(inst.TypeIDs) > 0 { + binary.LittleEndian.PutUint64(opaque[offsets.TypeIDs1stElement:], uint64(uintptr(unsafe.Pointer(&inst.TypeIDs[0])))) + } + + // Then we write the table addresses. + for _, table := range inst.Tables { + binary.LittleEndian.PutUint64(opaque[tableOffset:], uint64(uintptr(unsafe.Pointer(table)))) + tableOffset += 8 + } + } + + if beforeListenerOffset := offsets.BeforeListenerTrampolines1stElement; beforeListenerOffset >= 0 { + binary.LittleEndian.PutUint64(opaque[beforeListenerOffset:], uint64(uintptr(unsafe.Pointer(&m.parent.listenerBeforeTrampolines[0])))) + } + if afterListenerOffset := offsets.AfterListenerTrampolines1stElement; afterListenerOffset >= 0 { + binary.LittleEndian.PutUint64(opaque[afterListenerOffset:], uint64(uintptr(unsafe.Pointer(&m.parent.listenerAfterTrampolines[0])))) + } + if len(inst.DataInstances) > 0 { + binary.LittleEndian.PutUint64(opaque[offsets.DataInstances1stElement:], uint64(uintptr(unsafe.Pointer(&inst.DataInstances[0])))) + } + if len(inst.ElementInstances) > 0 { + binary.LittleEndian.PutUint64(opaque[offsets.ElementInstances1stElement:], uint64(uintptr(unsafe.Pointer(&inst.ElementInstances[0])))) + } +} + +// NewFunction implements wasm.ModuleEngine. 
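Aside: newAlignedOpaque above over-allocates by 16 bytes and slices forward so that the opaque context begins on a 16-byte boundary. A minimal standalone sketch of that alignment trick, with invented names and not part of the vendored file:

package main

import (
	"fmt"
	"unsafe"
)

// alignedBuffer returns a size-byte slice whose first element sits on a
// 16-byte boundary, by over-allocating and slicing past the misalignment.
func alignedBuffer(size int) []byte {
	if size%16 != 0 {
		panic("size must be a multiple of 16")
	}
	buf := make([]byte, size+16)
	off := 16 - uintptr(unsafe.Pointer(&buf[0]))%16
	return buf[off : off+uintptr(size)]
}

func main() {
	b := alignedBuffer(64)
	fmt.Println(uintptr(unsafe.Pointer(&b[0]))%16 == 0, len(b)) // true 64
}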
+func (m *moduleEngine) NewFunction(index wasm.Index) api.Function { + if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable { + panic("When PrintMachineCodeHexPerFunctionDisassemblable enabled, functions must not be called") + } + + localIndex := index + if importedFnCount := m.module.Source.ImportFunctionCount; index < importedFnCount { + imported := &m.importedFunctions[index] + return imported.me.NewFunction(imported.indexInModule) + } else { + localIndex -= importedFnCount + } + + src := m.module.Source + typIndex := src.FunctionSection[localIndex] + typ := src.TypeSection[typIndex] + sizeOfParamResultSlice := typ.ResultNumInUint64 + if ps := typ.ParamNumInUint64; ps > sizeOfParamResultSlice { + sizeOfParamResultSlice = ps + } + p := m.parent + offset := p.functionOffsets[localIndex] + + ce := &callEngine{ + indexInModule: index, + executable: &p.executable[offset], + parent: m, + preambleExecutable: &m.parent.entryPreambles[typIndex][0], + sizeOfParamResultSlice: sizeOfParamResultSlice, + requiredParams: typ.ParamNumInUint64, + numberOfResults: typ.ResultNumInUint64, + } + + ce.execCtx.memoryGrowTrampolineAddress = &m.parent.sharedFunctions.memoryGrowExecutable[0] + ce.execCtx.stackGrowCallTrampolineAddress = &m.parent.sharedFunctions.stackGrowExecutable[0] + ce.execCtx.checkModuleExitCodeTrampolineAddress = &m.parent.sharedFunctions.checkModuleExitCode[0] + ce.execCtx.tableGrowTrampolineAddress = &m.parent.sharedFunctions.tableGrowExecutable[0] + ce.execCtx.refFuncTrampolineAddress = &m.parent.sharedFunctions.refFuncExecutable[0] + ce.execCtx.memoryWait32TrampolineAddress = &m.parent.sharedFunctions.memoryWait32Executable[0] + ce.execCtx.memoryWait64TrampolineAddress = &m.parent.sharedFunctions.memoryWait64Executable[0] + ce.execCtx.memoryNotifyTrampolineAddress = &m.parent.sharedFunctions.memoryNotifyExecutable[0] + ce.execCtx.memmoveAddress = memmovPtr + ce.init() + return ce +} + +// GetGlobalValue implements the same method as documented on wasm.ModuleEngine. +func (m *moduleEngine) GetGlobalValue(i wasm.Index) (lo, hi uint64) { + offset := m.parent.offsets.GlobalInstanceOffset(i) + buf := m.opaque[offset:] + if i < m.module.Source.ImportGlobalCount { + panic("GetGlobalValue should not be called for imported globals") + } + return binary.LittleEndian.Uint64(buf), binary.LittleEndian.Uint64(buf[8:]) +} + +// SetGlobalValue implements the same method as documented on wasm.ModuleEngine. +func (m *moduleEngine) SetGlobalValue(i wasm.Index, lo, hi uint64) { + offset := m.parent.offsets.GlobalInstanceOffset(i) + buf := m.opaque[offset:] + if i < m.module.Source.ImportGlobalCount { + panic("GetGlobalValue should not be called for imported globals") + } + binary.LittleEndian.PutUint64(buf, lo) + binary.LittleEndian.PutUint64(buf[8:], hi) +} + +// OwnsGlobals implements the same method as documented on wasm.ModuleEngine. +func (m *moduleEngine) OwnsGlobals() bool { return true } + +// ResolveImportedFunction implements wasm.ModuleEngine. 
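Aside: GetGlobalValue and SetGlobalValue above treat each non-imported global as a 16-byte little-endian pair (Val, ValHi) inside the opaque buffer. A standalone sketch of that encoding, using hypothetical helper names rather than the real engine types:

package main

import (
	"encoding/binary"
	"fmt"
)

// setGlobal writes the 128-bit (lo, hi) pair into the global's 16-byte slot.
func setGlobal(opaque []byte, offset int, lo, hi uint64) {
	binary.LittleEndian.PutUint64(opaque[offset:], lo)
	binary.LittleEndian.PutUint64(opaque[offset+8:], hi)
}

// getGlobal reads the pair back from the same slot.
func getGlobal(opaque []byte, offset int) (lo, hi uint64) {
	return binary.LittleEndian.Uint64(opaque[offset:]), binary.LittleEndian.Uint64(opaque[offset+8:])
}

func main() {
	opaque := make([]byte, 64)
	setGlobal(opaque, 32, 0xdeadbeef, 1)
	fmt.Println(getGlobal(opaque, 32)) // 3735928559 1
}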
+func (m *moduleEngine) ResolveImportedFunction(index, indexInImportedModule wasm.Index, importedModuleEngine wasm.ModuleEngine) { + executableOffset, moduleCtxOffset, typeIDOffset := m.parent.offsets.ImportedFunctionOffset(index) + importedME := importedModuleEngine.(*moduleEngine) + + if int(indexInImportedModule) >= len(importedME.importedFunctions) { + indexInImportedModule -= wasm.Index(len(importedME.importedFunctions)) + } else { + imported := &importedME.importedFunctions[indexInImportedModule] + m.ResolveImportedFunction(index, imported.indexInModule, imported.me) + return // Recursively resolve the imported function. + } + + offset := importedME.parent.functionOffsets[indexInImportedModule] + typeID := getTypeIDOf(indexInImportedModule, importedME.module) + executable := &importedME.parent.executable[offset] + // Write functionInstance. + binary.LittleEndian.PutUint64(m.opaque[executableOffset:], uint64(uintptr(unsafe.Pointer(executable)))) + binary.LittleEndian.PutUint64(m.opaque[moduleCtxOffset:], uint64(uintptr(unsafe.Pointer(importedME.opaquePtr)))) + binary.LittleEndian.PutUint64(m.opaque[typeIDOffset:], uint64(typeID)) + + // Write importedFunction so that it can be used by NewFunction. + m.importedFunctions[index] = importedFunction{me: importedME, indexInModule: indexInImportedModule} +} + +func getTypeIDOf(funcIndex wasm.Index, m *wasm.ModuleInstance) wasm.FunctionTypeID { + source := m.Source + + var typeIndex wasm.Index + if funcIndex >= source.ImportFunctionCount { + funcIndex -= source.ImportFunctionCount + typeIndex = source.FunctionSection[funcIndex] + } else { + var cnt wasm.Index + for i := range source.ImportSection { + if source.ImportSection[i].Type == wasm.ExternTypeFunc { + if cnt == funcIndex { + typeIndex = source.ImportSection[i].DescFunc + break + } + cnt++ + } + } + } + return m.TypeIDs[typeIndex] +} + +// ResolveImportedMemory implements wasm.ModuleEngine. +func (m *moduleEngine) ResolveImportedMemory(importedModuleEngine wasm.ModuleEngine) { + importedME := importedModuleEngine.(*moduleEngine) + inst := importedME.module + + var memInstPtr uint64 + var memOwnerOpaquePtr uint64 + if offs := importedME.parent.offsets; offs.ImportedMemoryBegin >= 0 { + offset := offs.ImportedMemoryBegin + memInstPtr = binary.LittleEndian.Uint64(importedME.opaque[offset:]) + memOwnerOpaquePtr = binary.LittleEndian.Uint64(importedME.opaque[offset+8:]) + } else { + memInstPtr = uint64(uintptr(unsafe.Pointer(inst.MemoryInstance))) + memOwnerOpaquePtr = uint64(uintptr(unsafe.Pointer(importedME.opaquePtr))) + } + offset := m.parent.offsets.ImportedMemoryBegin + binary.LittleEndian.PutUint64(m.opaque[offset:], memInstPtr) + binary.LittleEndian.PutUint64(m.opaque[offset+8:], memOwnerOpaquePtr) +} + +// DoneInstantiation implements wasm.ModuleEngine. +func (m *moduleEngine) DoneInstantiation() { + if !m.module.Source.IsHostModule { + m.setupOpaque() + } +} + +// FunctionInstanceReference implements wasm.ModuleEngine. 
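Aside: getTypeIDOf above handles the Wasm convention that function indexes cover imported functions first and locally-defined functions second. The simplified, self-contained sketch below (hypothetical types, not the real wasm.Module) shows just that index split:

package main

import "fmt"

// module is a toy stand-in: type index per imported function, followed by
// the type index per locally-defined function (the "function section").
type module struct {
	importedFuncTypes []uint32
	functionSection   []uint32
}

// typeIndexOf resolves a function index (imports first, then locals) to its type index.
func typeIndexOf(m *module, funcIndex uint32) uint32 {
	if n := uint32(len(m.importedFuncTypes)); funcIndex < n {
		return m.importedFuncTypes[funcIndex]
	} else {
		return m.functionSection[funcIndex-n]
	}
}

func main() {
	m := &module{importedFuncTypes: []uint32{3}, functionSection: []uint32{0, 1}}
	fmt.Println(typeIndexOf(m, 0), typeIndexOf(m, 2)) // 3 1
}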
+func (m *moduleEngine) FunctionInstanceReference(funcIndex wasm.Index) wasm.Reference { + if funcIndex < m.module.Source.ImportFunctionCount { + begin, _, _ := m.parent.offsets.ImportedFunctionOffset(funcIndex) + return uintptr(unsafe.Pointer(&m.opaque[begin])) + } + localIndex := funcIndex - m.module.Source.ImportFunctionCount + p := m.parent + executable := &p.executable[p.functionOffsets[localIndex]] + typeID := m.module.TypeIDs[m.module.Source.FunctionSection[localIndex]] + + lf := &functionInstance{ + executable: executable, + moduleContextOpaquePtr: m.opaquePtr, + typeID: typeID, + indexInModule: funcIndex, + } + m.localFunctionInstances = append(m.localFunctionInstances, lf) + return uintptr(unsafe.Pointer(lf)) +} + +// LookupFunction implements wasm.ModuleEngine. +func (m *moduleEngine) LookupFunction(t *wasm.TableInstance, typeId wasm.FunctionTypeID, tableOffset wasm.Index) (*wasm.ModuleInstance, wasm.Index) { + if tableOffset >= uint32(len(t.References)) || t.Type != wasm.RefTypeFuncref { + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + } + rawPtr := t.References[tableOffset] + if rawPtr == 0 { + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + } + + tf := wazevoapi.PtrFromUintptr[functionInstance](rawPtr) + if tf.typeID != typeId { + panic(wasmruntime.ErrRuntimeIndirectCallTypeMismatch) + } + return moduleInstanceFromOpaquePtr(tf.moduleContextOpaquePtr), tf.indexInModule +} + +func moduleInstanceFromOpaquePtr(ptr *byte) *wasm.ModuleInstance { + return *(**wasm.ModuleInstance)(unsafe.Pointer(ptr)) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/reflect.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/reflect.go new file mode 100644 index 000000000..6a03fc65c --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/reflect.go @@ -0,0 +1,11 @@ +//go:build !tinygo + +package wazevo + +import "reflect" + +// setSliceLimits sets both Cap and Len for the given reflected slice. +func setSliceLimits(s *reflect.SliceHeader, l, c uintptr) { + s.Len = int(l) + s.Cap = int(c) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/reflect_tinygo.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/reflect_tinygo.go new file mode 100644 index 000000000..eda3e706a --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/reflect_tinygo.go @@ -0,0 +1,11 @@ +//go:build tinygo + +package wazevo + +import "reflect" + +// setSliceLimits sets both Cap and Len for the given reflected slice. +func setSliceLimits(s *reflect.SliceHeader, l, c uintptr) { + s.Len = l + s.Cap = c +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block.go new file mode 100644 index 000000000..10b6b4b62 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block.go @@ -0,0 +1,407 @@ +package ssa + +import ( + "fmt" + "strconv" + "strings" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// BasicBlock represents the Basic Block of an SSA function. +// Each BasicBlock always ends with branching instructions (e.g. Branch, Return, etc.), +// and at most two branches are allowed. If there's two branches, these two are placed together at the end of the block. +// In other words, there's no branching instruction in the middle of the block. +// +// Note: we use the "block argument" variant of SSA, instead of PHI functions. 
See the package level doc comments. +// +// Note: we use "parameter/param" as a placeholder which represents a variant of PHI, and "argument/arg" as an actual +// Value passed to that "parameter/param". +type BasicBlock interface { + // ID returns the unique ID of this block. + ID() BasicBlockID + + // Name returns the unique string ID of this block. e.g. blk0, blk1, ... + Name() string + + // AddParam adds the parameter to the block whose type specified by `t`. + AddParam(b Builder, t Type) Value + + // Params returns the number of parameters to this block. + Params() int + + // Param returns (Variable, Value) which corresponds to the i-th parameter of this block. + // The returned Value is the definition of the param in this block. + Param(i int) Value + + // InsertInstruction inserts an instruction that implements Value into the tail of this block. + InsertInstruction(raw *Instruction) + + // Root returns the root instruction of this block. + Root() *Instruction + + // Tail returns the tail instruction of this block. + Tail() *Instruction + + // EntryBlock returns true if this block represents the function entry. + EntryBlock() bool + + // ReturnBlock returns ture if this block represents the function return. + ReturnBlock() bool + + // FormatHeader returns the debug string of this block, not including instruction. + FormatHeader(b Builder) string + + // Valid is true if this block is still valid even after optimizations. + Valid() bool + + // Sealed is true if this block has been sealed. + Sealed() bool + + // BeginPredIterator returns the first predecessor of this block. + BeginPredIterator() BasicBlock + + // NextPredIterator returns the next predecessor of this block. + NextPredIterator() BasicBlock + + // Preds returns the number of predecessors of this block. + Preds() int + + // Pred returns the i-th predecessor of this block. + Pred(i int) BasicBlock + + // Succs returns the number of successors of this block. + Succs() int + + // Succ returns the i-th successor of this block. + Succ(i int) BasicBlock + + // LoopHeader returns true if this block is a loop header. + LoopHeader() bool + + // LoopNestingForestChildren returns the children of this block in the loop nesting forest. + LoopNestingForestChildren() []BasicBlock +} + +type ( + // basicBlock is a basic block in a SSA-transformed function. + basicBlock struct { + id BasicBlockID + rootInstr, currentInstr *Instruction + params []blockParam + predIter int + preds []basicBlockPredecessorInfo + success []*basicBlock + // singlePred is the alias to preds[0] for fast lookup, and only set after Seal is called. + singlePred *basicBlock + // lastDefinitions maps Variable to its last definition in this block. + lastDefinitions map[Variable]Value + // unknownsValues are used in builder.findValue. The usage is well-described in the paper. + unknownValues []unknownValue + // invalid is true if this block is made invalid during optimizations. + invalid bool + // sealed is true if this is sealed (all the predecessors are known). + sealed bool + // loopHeader is true if this block is a loop header: + // + // > A loop header (sometimes called the entry point of the loop) is a dominator that is the target + // > of a loop-forming back edge. The loop header dominates all blocks in the loop body. + // > A block may be a loop header for more than one loop. A loop may have multiple entry points, + // > in which case it has no "loop header". + // + // See https://en.wikipedia.org/wiki/Control-flow_graph for more details. 
+ // + // This is modified during the subPassLoopDetection pass. + loopHeader bool + + // loopNestingForestChildren holds the children of this block in the loop nesting forest. + // Non-empty if and only if this block is a loop header (i.e. loopHeader=true) + loopNestingForestChildren []BasicBlock + + // reversePostOrder is used to sort all the blocks in the function in reverse post order. + // This is used in builder.LayoutBlocks. + reversePostOrder int + + // child and sibling are the ones in the dominator tree. + child, sibling *basicBlock + } + // BasicBlockID is the unique ID of a basicBlock. + BasicBlockID uint32 + + // blockParam implements Value and represents a parameter to a basicBlock. + blockParam struct { + // value is the Value that corresponds to the parameter in this block, + // and can be considered as an output of PHI instruction in traditional SSA. + value Value + // typ is the type of the parameter. + typ Type + } + + unknownValue struct { + // variable is the variable that this unknownValue represents. + variable Variable + // value is the value that this unknownValue represents. + value Value + } +) + +const basicBlockIDReturnBlock = 0xffffffff + +// Name implements BasicBlock.Name. +func (bb *basicBlock) Name() string { + if bb.id == basicBlockIDReturnBlock { + return "blk_ret" + } else { + return fmt.Sprintf("blk%d", bb.id) + } +} + +// String implements fmt.Stringer for debugging. +func (bid BasicBlockID) String() string { + if bid == basicBlockIDReturnBlock { + return "blk_ret" + } else { + return fmt.Sprintf("blk%d", bid) + } +} + +// ID implements BasicBlock.ID. +func (bb *basicBlock) ID() BasicBlockID { + return bb.id +} + +// basicBlockPredecessorInfo is the information of a predecessor of a basicBlock. +// predecessor is determined by a pair of block and the branch instruction used to jump to the successor. +type basicBlockPredecessorInfo struct { + blk *basicBlock + branch *Instruction +} + +// EntryBlock implements BasicBlock.EntryBlock. +func (bb *basicBlock) EntryBlock() bool { + return bb.id == 0 +} + +// ReturnBlock implements BasicBlock.ReturnBlock. +func (bb *basicBlock) ReturnBlock() bool { + return bb.id == basicBlockIDReturnBlock +} + +// AddParam implements BasicBlock.AddParam. +func (bb *basicBlock) AddParam(b Builder, typ Type) Value { + paramValue := b.allocateValue(typ) + bb.params = append(bb.params, blockParam{typ: typ, value: paramValue}) + return paramValue +} + +// addParamOn adds a parameter to this block whose value is already allocated. +func (bb *basicBlock) addParamOn(typ Type, value Value) { + bb.params = append(bb.params, blockParam{typ: typ, value: value}) +} + +// Params implements BasicBlock.Params. +func (bb *basicBlock) Params() int { + return len(bb.params) +} + +// Param implements BasicBlock.Param. +func (bb *basicBlock) Param(i int) Value { + p := &bb.params[i] + return p.value +} + +// Valid implements BasicBlock.Valid. +func (bb *basicBlock) Valid() bool { + return !bb.invalid +} + +// Sealed implements BasicBlock.Sealed. +func (bb *basicBlock) Sealed() bool { + return bb.sealed +} + +// InsertInstruction implements BasicBlock.InsertInstruction. 
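Aside: as the BasicBlock documentation above notes, this SSA form uses block parameters plus branch arguments instead of phi nodes: a join block declares a parameter via AddParam, and every incoming branch supplies an argument for it. A tiny standalone illustration of that idea, with toy types rather than the ssa package:

package main

import "fmt"

// branch records which block it comes from and the arguments it passes to
// the target block's parameters (the phi operands, in traditional terms).
type branch struct {
	from string
	args []int
}

func main() {
	// joinBlk declares one parameter; each predecessor passes its own value for it.
	preds := []branch{{from: "blk1", args: []int{10}}, {from: "blk2", args: []int{20}}}
	taken := preds[1] // suppose control flow arrives from blk2
	fmt.Printf("param[0] of joinBlk = %d (from %s)\n", taken.args[0], taken.from)
}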
+func (bb *basicBlock) InsertInstruction(next *Instruction) { + current := bb.currentInstr + if current != nil { + current.next = next + next.prev = current + } else { + bb.rootInstr = next + } + bb.currentInstr = next + + switch next.opcode { + case OpcodeJump, OpcodeBrz, OpcodeBrnz: + target := next.blk.(*basicBlock) + target.addPred(bb, next) + case OpcodeBrTable: + for _, _target := range next.targets { + target := _target.(*basicBlock) + target.addPred(bb, next) + } + } +} + +// NumPreds implements BasicBlock.NumPreds. +func (bb *basicBlock) NumPreds() int { + return len(bb.preds) +} + +// BeginPredIterator implements BasicBlock.BeginPredIterator. +func (bb *basicBlock) BeginPredIterator() BasicBlock { + bb.predIter = 0 + return bb.NextPredIterator() +} + +// NextPredIterator implements BasicBlock.NextPredIterator. +func (bb *basicBlock) NextPredIterator() BasicBlock { + if bb.predIter >= len(bb.preds) { + return nil + } + pred := bb.preds[bb.predIter].blk + bb.predIter++ + return pred +} + +// Preds implements BasicBlock.Preds. +func (bb *basicBlock) Preds() int { + return len(bb.preds) +} + +// Pred implements BasicBlock.Pred. +func (bb *basicBlock) Pred(i int) BasicBlock { + return bb.preds[i].blk +} + +// Succs implements BasicBlock.Succs. +func (bb *basicBlock) Succs() int { + return len(bb.success) +} + +// Succ implements BasicBlock.Succ. +func (bb *basicBlock) Succ(i int) BasicBlock { + return bb.success[i] +} + +// Root implements BasicBlock.Root. +func (bb *basicBlock) Root() *Instruction { + return bb.rootInstr +} + +// Tail implements BasicBlock.Tail. +func (bb *basicBlock) Tail() *Instruction { + return bb.currentInstr +} + +// reset resets the basicBlock to its initial state so that it can be reused for another function. +func resetBasicBlock(bb *basicBlock) { + bb.params = bb.params[:0] + bb.rootInstr, bb.currentInstr = nil, nil + bb.preds = bb.preds[:0] + bb.success = bb.success[:0] + bb.invalid, bb.sealed = false, false + bb.singlePred = nil + bb.unknownValues = bb.unknownValues[:0] + bb.lastDefinitions = wazevoapi.ResetMap(bb.lastDefinitions) + bb.reversePostOrder = -1 + bb.loopNestingForestChildren = bb.loopNestingForestChildren[:0] + bb.loopHeader = false + bb.sibling = nil + bb.child = nil +} + +// addPred adds a predecessor to this block specified by the branch instruction. +func (bb *basicBlock) addPred(blk BasicBlock, branch *Instruction) { + if bb.sealed { + panic("BUG: trying to add predecessor to a sealed block: " + bb.Name()) + } + + pred := blk.(*basicBlock) + for i := range bb.preds { + existingPred := &bb.preds[i] + if existingPred.blk == pred && existingPred.branch != branch { + // If the target is already added, then this must come from the same BrTable, + // otherwise such redundant branch should be eliminated by the frontend. (which should be simpler). + panic(fmt.Sprintf("BUG: redundant non BrTable jumps in %s whose targes are the same", bb.Name())) + } + } + + bb.preds = append(bb.preds, basicBlockPredecessorInfo{ + blk: pred, + branch: branch, + }) + + pred.success = append(pred.success, bb) +} + +// FormatHeader implements BasicBlock.FormatHeader. 
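Aside: InsertInstruction and addPred above keep the CFG edges in sync while instructions are appended: inserting a branch records the edge on both the source (success) and the target (preds). The same bookkeeping in miniature, with toy types and not the vendored implementation:

package main

import "fmt"

// block holds only the edge lists needed for this sketch.
type block struct {
	name         string
	preds, succs []*block
}

// addBranch records a CFG edge on both endpoints, mirroring how inserting a
// jump/branch instruction calls addPred on the target block.
func addBranch(from, to *block) {
	to.preds = append(to.preds, from)
	from.succs = append(from.succs, to)
}

func main() {
	a, b := &block{name: "blk0"}, &block{name: "blk1"}
	addBranch(a, b)
	fmt.Println(a.succs[0].name, b.preds[0].name) // blk1 blk0
}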
+func (bb *basicBlock) FormatHeader(b Builder) string { + ps := make([]string, len(bb.params)) + for i, p := range bb.params { + ps[i] = p.value.formatWithType(b) + } + + if len(bb.preds) > 0 { + preds := make([]string, 0, len(bb.preds)) + for _, pred := range bb.preds { + if pred.blk.invalid { + continue + } + preds = append(preds, fmt.Sprintf("blk%d", pred.blk.id)) + + } + return fmt.Sprintf("blk%d: (%s) <-- (%s)", + bb.id, strings.Join(ps, ","), strings.Join(preds, ",")) + } else { + return fmt.Sprintf("blk%d: (%s)", bb.id, strings.Join(ps, ", ")) + } +} + +// validates validates the basicBlock for debugging purpose. +func (bb *basicBlock) validate(b *builder) { + if bb.invalid { + panic("BUG: trying to validate an invalid block: " + bb.Name()) + } + if len(bb.preds) > 0 { + for _, pred := range bb.preds { + if pred.branch.opcode != OpcodeBrTable { + if target := pred.branch.blk; target != bb { + panic(fmt.Sprintf("BUG: '%s' is not branch to %s, but to %s", + pred.branch.Format(b), bb.Name(), target.Name())) + } + } + + var exp int + if bb.ReturnBlock() { + exp = len(b.currentSignature.Results) + } else { + exp = len(bb.params) + } + + if len(pred.branch.vs.View()) != exp { + panic(fmt.Sprintf( + "BUG: len(argument at %s) != len(params at %s): %d != %d: %s", + pred.blk.Name(), bb.Name(), + len(pred.branch.vs.View()), len(bb.params), pred.branch.Format(b), + )) + } + + } + } +} + +// String implements fmt.Stringer for debugging purpose only. +func (bb *basicBlock) String() string { + return strconv.Itoa(int(bb.id)) +} + +// LoopNestingForestChildren implements BasicBlock.LoopNestingForestChildren. +func (bb *basicBlock) LoopNestingForestChildren() []BasicBlock { + return bb.loopNestingForestChildren +} + +// LoopHeader implements BasicBlock.LoopHeader. +func (bb *basicBlock) LoopHeader() bool { + return bb.loopHeader +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block_sort.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block_sort.go new file mode 100644 index 000000000..e1471edc3 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block_sort.go @@ -0,0 +1,34 @@ +//go:build go1.21 + +package ssa + +import ( + "slices" +) + +func sortBlocks(blocks []*basicBlock) { + slices.SortFunc(blocks, func(i, j *basicBlock) int { + jIsReturn := j.ReturnBlock() + iIsReturn := i.ReturnBlock() + if iIsReturn && jIsReturn { + return 0 + } + if jIsReturn { + return 1 + } + if iIsReturn { + return -1 + } + iRoot, jRoot := i.rootInstr, j.rootInstr + if iRoot == nil && jRoot == nil { // For testing. + return 0 + } + if jRoot == nil { + return 1 + } + if iRoot == nil { + return -1 + } + return i.rootInstr.id - j.rootInstr.id + }) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block_sort_old.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block_sort_old.go new file mode 100644 index 000000000..9dc881dae --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block_sort_old.go @@ -0,0 +1,24 @@ +//go:build !go1.21 + +// TODO: delete after the floor Go version is 1.21 + +package ssa + +import "sort" + +func sortBlocks(blocks []*basicBlock) { + sort.SliceStable(blocks, func(i, j int) bool { + iBlk, jBlk := blocks[i], blocks[j] + if jBlk.ReturnBlock() { + return true + } + if iBlk.ReturnBlock() { + return false + } + iRoot, jRoot := iBlk.rootInstr, jBlk.rootInstr + if iRoot == nil || jRoot == nil { // For testing. 
+ return true + } + return iBlk.rootInstr.id < jBlk.rootInstr.id + }) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/builder.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/builder.go new file mode 100644 index 000000000..1fc84d2ea --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/builder.go @@ -0,0 +1,731 @@ +package ssa + +import ( + "fmt" + "sort" + "strings" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// Builder is used to builds SSA consisting of Basic Blocks per function. +type Builder interface { + // Init must be called to reuse this builder for the next function. + Init(typ *Signature) + + // Signature returns the Signature of the currently-compiled function. + Signature() *Signature + + // BlockIDMax returns the maximum value of BasicBlocksID existing in the currently-compiled function. + BlockIDMax() BasicBlockID + + // AllocateBasicBlock creates a basic block in SSA function. + AllocateBasicBlock() BasicBlock + + // CurrentBlock returns the currently handled BasicBlock which is set by the latest call to SetCurrentBlock. + CurrentBlock() BasicBlock + + // EntryBlock returns the entry BasicBlock of the currently-compiled function. + EntryBlock() BasicBlock + + // SetCurrentBlock sets the instruction insertion target to the BasicBlock `b`. + SetCurrentBlock(b BasicBlock) + + // DeclareVariable declares a Variable of the given Type. + DeclareVariable(Type) Variable + + // DefineVariable defines a variable in the `block` with value. + // The defining instruction will be inserted into the `block`. + DefineVariable(variable Variable, value Value, block BasicBlock) + + // DefineVariableInCurrentBB is the same as DefineVariable except the definition is + // inserted into the current BasicBlock. Alias to DefineVariable(x, y, CurrentBlock()). + DefineVariableInCurrentBB(variable Variable, value Value) + + // AllocateInstruction returns a new Instruction. + AllocateInstruction() *Instruction + + // InsertInstruction executes BasicBlock.InsertInstruction for the currently handled basic block. + InsertInstruction(raw *Instruction) + + // allocateValue allocates an unused Value. + allocateValue(typ Type) Value + + // MustFindValue searches the latest definition of the given Variable and returns the result. + MustFindValue(variable Variable) Value + + // MustFindValueInBlk is the same as MustFindValue except it searches the latest definition from the given BasicBlock. + MustFindValueInBlk(variable Variable, blk BasicBlock) Value + + // FindValueInLinearPath tries to find the latest definition of the given Variable in the linear path to the current BasicBlock. + // If it cannot find the definition, or it's not sealed yet, it returns ValueInvalid. + FindValueInLinearPath(variable Variable) Value + + // Seal declares that we've known all the predecessors to this block and were added via AddPred. + // After calling this, AddPred will be forbidden. + Seal(blk BasicBlock) + + // AnnotateValue is for debugging purpose. + AnnotateValue(value Value, annotation string) + + // DeclareSignature appends the *Signature to be referenced by various instructions (e.g. OpcodeCall). + DeclareSignature(signature *Signature) + + // Signatures returns the slice of declared Signatures. + Signatures() []*Signature + + // ResolveSignature returns the Signature which corresponds to SignatureID. 
+ ResolveSignature(id SignatureID) *Signature + + // RunPasses runs various passes on the constructed SSA function. + RunPasses() + + // Format returns the debugging string of the SSA function. + Format() string + + // BlockIteratorBegin initializes the state to iterate over all the valid BasicBlock(s) compiled. + // Combined with BlockIteratorNext, we can use this like: + // + // for blk := builder.BlockIteratorBegin(); blk != nil; blk = builder.BlockIteratorNext() { + // // ... + // } + // + // The returned blocks are ordered in the order of AllocateBasicBlock being called. + BlockIteratorBegin() BasicBlock + + // BlockIteratorNext advances the state for iteration initialized by BlockIteratorBegin. + // Returns nil if there's no unseen BasicBlock. + BlockIteratorNext() BasicBlock + + // ValueRefCounts returns the map of ValueID to its reference count. + // The returned slice must not be modified. + ValueRefCounts() []int + + // BlockIteratorReversePostOrderBegin is almost the same as BlockIteratorBegin except it returns the BasicBlock in the reverse post-order. + // This is available after RunPasses is run. + BlockIteratorReversePostOrderBegin() BasicBlock + + // BlockIteratorReversePostOrderNext is almost the same as BlockIteratorPostOrderNext except it returns the BasicBlock in the reverse post-order. + // This is available after RunPasses is run. + BlockIteratorReversePostOrderNext() BasicBlock + + // ReturnBlock returns the BasicBlock which is used to return from the function. + ReturnBlock() BasicBlock + + // InsertUndefined inserts an undefined instruction at the current position. + InsertUndefined() + + // SetCurrentSourceOffset sets the current source offset. The incoming instruction will be annotated with this offset. + SetCurrentSourceOffset(line SourceOffset) + + // LoopNestingForestRoots returns the roots of the loop nesting forest. + LoopNestingForestRoots() []BasicBlock + + // LowestCommonAncestor returns the lowest common ancestor in the dominator tree of the given BasicBlock(s). + LowestCommonAncestor(blk1, blk2 BasicBlock) BasicBlock + + // Idom returns the immediate dominator of the given BasicBlock. + Idom(blk BasicBlock) BasicBlock + + VarLengthPool() *wazevoapi.VarLengthPool[Value] +} + +// NewBuilder returns a new Builder implementation. +func NewBuilder() Builder { + return &builder{ + instructionsPool: wazevoapi.NewPool[Instruction](resetInstruction), + basicBlocksPool: wazevoapi.NewPool[basicBlock](resetBasicBlock), + varLengthPool: wazevoapi.NewVarLengthPool[Value](), + valueAnnotations: make(map[ValueID]string), + signatures: make(map[SignatureID]*Signature), + blkVisited: make(map[*basicBlock]int), + valueIDAliases: make(map[ValueID]Value), + redundantParameterIndexToValue: make(map[int]Value), + returnBlk: &basicBlock{id: basicBlockIDReturnBlock}, + } +} + +// builder implements Builder interface. +type builder struct { + basicBlocksPool wazevoapi.Pool[basicBlock] + instructionsPool wazevoapi.Pool[Instruction] + varLengthPool wazevoapi.VarLengthPool[Value] + signatures map[SignatureID]*Signature + currentSignature *Signature + + // reversePostOrderedBasicBlocks are the BasicBlock(s) ordered in the reverse post-order after passCalculateImmediateDominators. + reversePostOrderedBasicBlocks []*basicBlock + currentBB *basicBlock + returnBlk *basicBlock + + // variables track the types for Variable with the index regarded Variable. + variables []Type + // nextValueID is used by builder.AllocateValue. 
+ nextValueID ValueID + // nextVariable is used by builder.AllocateVariable. + nextVariable Variable + + valueIDAliases map[ValueID]Value + valueAnnotations map[ValueID]string + + // valueRefCounts is used to lower the SSA in backend, and will be calculated + // by the last SSA-level optimization pass. + valueRefCounts []int + + // dominators stores the immediate dominator of each BasicBlock. + // The index is blockID of the BasicBlock. + dominators []*basicBlock + sparseTree dominatorSparseTree + + // loopNestingForestRoots are the roots of the loop nesting forest. + loopNestingForestRoots []BasicBlock + + // The followings are used for optimization passes/deterministic compilation. + instStack []*Instruction + blkVisited map[*basicBlock]int + valueIDToInstruction []*Instruction + blkStack []*basicBlock + blkStack2 []*basicBlock + ints []int + redundantParameterIndexToValue map[int]Value + + // blockIterCur is used to implement blockIteratorBegin and blockIteratorNext. + blockIterCur int + + // donePreBlockLayoutPasses is true if all the passes before LayoutBlocks are called. + donePreBlockLayoutPasses bool + // doneBlockLayout is true if LayoutBlocks is called. + doneBlockLayout bool + // donePostBlockLayoutPasses is true if all the passes after LayoutBlocks are called. + donePostBlockLayoutPasses bool + + currentSourceOffset SourceOffset +} + +func (b *builder) VarLengthPool() *wazevoapi.VarLengthPool[Value] { + return &b.varLengthPool +} + +// ReturnBlock implements Builder.ReturnBlock. +func (b *builder) ReturnBlock() BasicBlock { + return b.returnBlk +} + +// Init implements Builder.Reset. +func (b *builder) Init(s *Signature) { + b.nextVariable = 0 + b.currentSignature = s + resetBasicBlock(b.returnBlk) + b.instructionsPool.Reset() + b.basicBlocksPool.Reset() + b.varLengthPool.Reset() + b.donePreBlockLayoutPasses = false + b.doneBlockLayout = false + b.donePostBlockLayoutPasses = false + for _, sig := range b.signatures { + sig.used = false + } + + b.ints = b.ints[:0] + b.blkStack = b.blkStack[:0] + b.blkStack2 = b.blkStack2[:0] + b.dominators = b.dominators[:0] + b.loopNestingForestRoots = b.loopNestingForestRoots[:0] + + for i := 0; i < b.basicBlocksPool.Allocated(); i++ { + blk := b.basicBlocksPool.View(i) + delete(b.blkVisited, blk) + } + b.basicBlocksPool.Reset() + + for v := ValueID(0); v < b.nextValueID; v++ { + delete(b.valueAnnotations, v) + delete(b.valueIDAliases, v) + b.valueRefCounts[v] = 0 + b.valueIDToInstruction[v] = nil + } + b.nextValueID = 0 + b.reversePostOrderedBasicBlocks = b.reversePostOrderedBasicBlocks[:0] + b.doneBlockLayout = false + for i := range b.valueRefCounts { + b.valueRefCounts[i] = 0 + } + + b.currentSourceOffset = sourceOffsetUnknown +} + +// Signature implements Builder.Signature. +func (b *builder) Signature() *Signature { + return b.currentSignature +} + +// AnnotateValue implements Builder.AnnotateValue. +func (b *builder) AnnotateValue(value Value, a string) { + b.valueAnnotations[value.ID()] = a +} + +// AllocateInstruction implements Builder.AllocateInstruction. +func (b *builder) AllocateInstruction() *Instruction { + instr := b.instructionsPool.Allocate() + instr.id = b.instructionsPool.Allocated() + return instr +} + +// DeclareSignature implements Builder.AnnotateValue. +func (b *builder) DeclareSignature(s *Signature) { + b.signatures[s.ID] = s + s.used = false +} + +// Signatures implements Builder.Signatures. 
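Aside: builder.Init above resets the builder for the next function without discarding its backing storage: slices are truncated with s[:0], maps are cleared entry by entry, and pools are reset. A small standalone sketch of that allocation-free reuse pattern, with invented names:

package main

import "fmt"

// scratch is a toy reusable workspace.
type scratch struct {
	ints []int
	seen map[string]bool
}

// reset truncates the slice (keeping its capacity) and clears the map in place.
func (s *scratch) reset() {
	s.ints = s.ints[:0]
	for k := range s.seen {
		delete(s.seen, k)
	}
}

func main() {
	s := &scratch{seen: map[string]bool{}}
	s.ints = append(s.ints, 1, 2, 3)
	s.seen["a"] = true
	s.reset()
	fmt.Println(len(s.ints), cap(s.ints) >= 3, len(s.seen)) // 0 true 0
}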
+func (b *builder) Signatures() (ret []*Signature) { + for _, sig := range b.signatures { + ret = append(ret, sig) + } + sort.Slice(ret, func(i, j int) bool { + return ret[i].ID < ret[j].ID + }) + return +} + +// SetCurrentSourceOffset implements Builder.SetCurrentSourceOffset. +func (b *builder) SetCurrentSourceOffset(l SourceOffset) { + b.currentSourceOffset = l +} + +func (b *builder) usedSignatures() (ret []*Signature) { + for _, sig := range b.signatures { + if sig.used { + ret = append(ret, sig) + } + } + sort.Slice(ret, func(i, j int) bool { + return ret[i].ID < ret[j].ID + }) + return +} + +// ResolveSignature implements Builder.ResolveSignature. +func (b *builder) ResolveSignature(id SignatureID) *Signature { + return b.signatures[id] +} + +// AllocateBasicBlock implements Builder.AllocateBasicBlock. +func (b *builder) AllocateBasicBlock() BasicBlock { + return b.allocateBasicBlock() +} + +// allocateBasicBlock allocates a new basicBlock. +func (b *builder) allocateBasicBlock() *basicBlock { + id := BasicBlockID(b.basicBlocksPool.Allocated()) + blk := b.basicBlocksPool.Allocate() + blk.id = id + return blk +} + +// Idom implements Builder.Idom. +func (b *builder) Idom(blk BasicBlock) BasicBlock { + return b.dominators[blk.ID()] +} + +// InsertInstruction implements Builder.InsertInstruction. +func (b *builder) InsertInstruction(instr *Instruction) { + b.currentBB.InsertInstruction(instr) + + if l := b.currentSourceOffset; l.Valid() { + // Emit the source offset info only when the instruction has side effect because + // these are the only instructions that are accessed by stack unwinding. + // This reduces the significant amount of the offset info in the binary. + if instr.sideEffect() != sideEffectNone { + instr.annotateSourceOffset(l) + } + } + + resultTypesFn := instructionReturnTypes[instr.opcode] + if resultTypesFn == nil { + panic("TODO: " + instr.Format(b)) + } + + t1, ts := resultTypesFn(b, instr) + if t1.invalid() { + return + } + + r1 := b.allocateValue(t1) + instr.rValue = r1 + + tsl := len(ts) + if tsl == 0 { + return + } + + rValues := b.varLengthPool.Allocate(tsl) + for i := 0; i < tsl; i++ { + rValues = rValues.Append(&b.varLengthPool, b.allocateValue(ts[i])) + } + instr.rValues = rValues +} + +// DefineVariable implements Builder.DefineVariable. +func (b *builder) DefineVariable(variable Variable, value Value, block BasicBlock) { + if b.variables[variable].invalid() { + panic("BUG: trying to define variable " + variable.String() + " but is not declared yet") + } + + if b.variables[variable] != value.Type() { + panic(fmt.Sprintf("BUG: inconsistent type for variable %d: expected %s but got %s", variable, b.variables[variable], value.Type())) + } + bb := block.(*basicBlock) + bb.lastDefinitions[variable] = value +} + +// DefineVariableInCurrentBB implements Builder.DefineVariableInCurrentBB. +func (b *builder) DefineVariableInCurrentBB(variable Variable, value Value) { + b.DefineVariable(variable, value, b.currentBB) +} + +// SetCurrentBlock implements Builder.SetCurrentBlock. +func (b *builder) SetCurrentBlock(bb BasicBlock) { + b.currentBB = bb.(*basicBlock) +} + +// CurrentBlock implements Builder.CurrentBlock. +func (b *builder) CurrentBlock() BasicBlock { + return b.currentBB +} + +// EntryBlock implements Builder.EntryBlock. +func (b *builder) EntryBlock() BasicBlock { + return b.entryBlk() +} + +// DeclareVariable implements Builder.DeclareVariable. 
+func (b *builder) DeclareVariable(typ Type) Variable { + v := b.allocateVariable() + iv := int(v) + if l := len(b.variables); l <= iv { + b.variables = append(b.variables, make([]Type, 2*(l+1))...) + } + b.variables[v] = typ + return v +} + +// allocateVariable allocates a new variable. +func (b *builder) allocateVariable() (ret Variable) { + ret = b.nextVariable + b.nextVariable++ + return +} + +// allocateValue implements Builder.AllocateValue. +func (b *builder) allocateValue(typ Type) (v Value) { + v = Value(b.nextValueID) + v = v.setType(typ) + b.nextValueID++ + return +} + +// FindValueInLinearPath implements Builder.FindValueInLinearPath. +func (b *builder) FindValueInLinearPath(variable Variable) Value { + return b.findValueInLinearPath(variable, b.currentBB) +} + +func (b *builder) findValueInLinearPath(variable Variable, blk *basicBlock) Value { + if val, ok := blk.lastDefinitions[variable]; ok { + return val + } else if !blk.sealed { + return ValueInvalid + } + + if pred := blk.singlePred; pred != nil { + // If this block is sealed and have only one predecessor, + // we can use the value in that block without ambiguity on definition. + return b.findValueInLinearPath(variable, pred) + } + if len(blk.preds) == 1 { + panic("BUG") + } + return ValueInvalid +} + +func (b *builder) MustFindValueInBlk(variable Variable, blk BasicBlock) Value { + typ := b.definedVariableType(variable) + return b.findValue(typ, variable, blk.(*basicBlock)) +} + +// MustFindValue implements Builder.MustFindValue. +func (b *builder) MustFindValue(variable Variable) Value { + typ := b.definedVariableType(variable) + return b.findValue(typ, variable, b.currentBB) +} + +// findValue recursively tries to find the latest definition of a `variable`. The algorithm is described in +// the section 2 of the paper https://link.springer.com/content/pdf/10.1007/978-3-642-37051-9_6.pdf. +// +// TODO: reimplement this in iterative, not recursive, to avoid stack overflow. +func (b *builder) findValue(typ Type, variable Variable, blk *basicBlock) Value { + if val, ok := blk.lastDefinitions[variable]; ok { + // The value is already defined in this block! + return val + } else if !blk.sealed { // Incomplete CFG as in the paper. + // If this is not sealed, that means it might have additional unknown predecessor later on. + // So we temporarily define the placeholder value here (not add as a parameter yet!), + // and record it as unknown. + // The unknown values are resolved when we call seal this block via BasicBlock.Seal(). + value := b.allocateValue(typ) + if wazevoapi.SSALoggingEnabled { + fmt.Printf("adding unknown value placeholder for %s at %d\n", variable, blk.id) + } + blk.lastDefinitions[variable] = value + blk.unknownValues = append(blk.unknownValues, unknownValue{ + variable: variable, + value: value, + }) + return value + } + + if pred := blk.singlePred; pred != nil { + // If this block is sealed and have only one predecessor, + // we can use the value in that block without ambiguity on definition. + return b.findValue(typ, variable, pred) + } else if len(blk.preds) == 0 { + panic("BUG: value is not defined for " + variable.String()) + } + + // If this block has multiple predecessors, we have to gather the definitions, + // and treat them as an argument to this block. + // + // The first thing is to define a new parameter to this block which may or may not be redundant, but + // later we eliminate trivial params in an optimization pass. 
This must be done before finding the + // definitions in the predecessors so that we can break the cycle. + paramValue := blk.AddParam(b, typ) + b.DefineVariable(variable, paramValue, blk) + + // After the new param is added, we have to manipulate the original branching instructions + // in predecessors so that they would pass the definition of `variable` as the argument to + // the newly added PHI. + for i := range blk.preds { + pred := &blk.preds[i] + value := b.findValue(typ, variable, pred.blk) + pred.branch.addArgumentBranchInst(b, value) + } + return paramValue +} + +// Seal implements Builder.Seal. +func (b *builder) Seal(raw BasicBlock) { + blk := raw.(*basicBlock) + if len(blk.preds) == 1 { + blk.singlePred = blk.preds[0].blk + } + blk.sealed = true + + for _, v := range blk.unknownValues { + variable, phiValue := v.variable, v.value + typ := b.definedVariableType(variable) + blk.addParamOn(typ, phiValue) + for i := range blk.preds { + pred := &blk.preds[i] + predValue := b.findValue(typ, variable, pred.blk) + if !predValue.Valid() { + panic("BUG: value is not defined anywhere in the predecessors in the CFG") + } + pred.branch.addArgumentBranchInst(b, predValue) + } + } +} + +// definedVariableType returns the type of the given variable. If the variable is not defined yet, it panics. +func (b *builder) definedVariableType(variable Variable) Type { + typ := b.variables[variable] + if typ.invalid() { + panic(fmt.Sprintf("%s is not defined yet", variable)) + } + return typ +} + +// Format implements Builder.Format. +func (b *builder) Format() string { + str := strings.Builder{} + usedSigs := b.usedSignatures() + if len(usedSigs) > 0 { + str.WriteByte('\n') + str.WriteString("signatures:\n") + for _, sig := range usedSigs { + str.WriteByte('\t') + str.WriteString(sig.String()) + str.WriteByte('\n') + } + } + + var iterBegin, iterNext func() *basicBlock + if b.doneBlockLayout { + iterBegin, iterNext = b.blockIteratorReversePostOrderBegin, b.blockIteratorReversePostOrderNext + } else { + iterBegin, iterNext = b.blockIteratorBegin, b.blockIteratorNext + } + for bb := iterBegin(); bb != nil; bb = iterNext() { + str.WriteByte('\n') + str.WriteString(bb.FormatHeader(b)) + str.WriteByte('\n') + + for cur := bb.Root(); cur != nil; cur = cur.Next() { + str.WriteByte('\t') + str.WriteString(cur.Format(b)) + str.WriteByte('\n') + } + } + return str.String() +} + +// BlockIteratorNext implements Builder.BlockIteratorNext. +func (b *builder) BlockIteratorNext() BasicBlock { + if blk := b.blockIteratorNext(); blk == nil { + return nil // BasicBlock((*basicBlock)(nil)) != BasicBlock(nil) + } else { + return blk + } +} + +// BlockIteratorNext implements Builder.BlockIteratorNext. +func (b *builder) blockIteratorNext() *basicBlock { + index := b.blockIterCur + for { + if index == b.basicBlocksPool.Allocated() { + return nil + } + ret := b.basicBlocksPool.View(index) + index++ + if !ret.invalid { + b.blockIterCur = index + return ret + } + } +} + +// BlockIteratorBegin implements Builder.BlockIteratorBegin. +func (b *builder) BlockIteratorBegin() BasicBlock { + return b.blockIteratorBegin() +} + +// BlockIteratorBegin implements Builder.BlockIteratorBegin. +func (b *builder) blockIteratorBegin() *basicBlock { + b.blockIterCur = 0 + return b.blockIteratorNext() +} + +// BlockIteratorReversePostOrderBegin implements Builder.BlockIteratorReversePostOrderBegin. 
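Aside: the fast path of findValueInLinearPath/findValue above walks sealed blocks that have exactly one predecessor until a block-local definition is found; only the multi-predecessor case needs the block-parameter insertion described above. A miniature of just that linear walk, with toy types and not the real algorithm:

package main

import "fmt"

// miniBlock models only what the linear-path lookup needs: block-local
// definitions and the single sealed predecessor, if any.
type miniBlock struct {
	defs       map[string]int
	singlePred *miniBlock
}

// findLinear walks single-predecessor chains until a definition is found.
func findLinear(b *miniBlock, variable string) (int, bool) {
	for b != nil {
		if v, ok := b.defs[variable]; ok {
			return v, true
		}
		b = b.singlePred
	}
	return 0, false
}

func main() {
	entry := &miniBlock{defs: map[string]int{"x": 7}}
	body := &miniBlock{defs: map[string]int{}, singlePred: entry}
	fmt.Println(findLinear(body, "x")) // 7 true
}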
+func (b *builder) BlockIteratorReversePostOrderBegin() BasicBlock { + return b.blockIteratorReversePostOrderBegin() +} + +// BlockIteratorBegin implements Builder.BlockIteratorBegin. +func (b *builder) blockIteratorReversePostOrderBegin() *basicBlock { + b.blockIterCur = 0 + return b.blockIteratorReversePostOrderNext() +} + +// BlockIteratorReversePostOrderNext implements Builder.BlockIteratorReversePostOrderNext. +func (b *builder) BlockIteratorReversePostOrderNext() BasicBlock { + if blk := b.blockIteratorReversePostOrderNext(); blk == nil { + return nil // BasicBlock((*basicBlock)(nil)) != BasicBlock(nil) + } else { + return blk + } +} + +// BlockIteratorNext implements Builder.BlockIteratorNext. +func (b *builder) blockIteratorReversePostOrderNext() *basicBlock { + if b.blockIterCur >= len(b.reversePostOrderedBasicBlocks) { + return nil + } else { + ret := b.reversePostOrderedBasicBlocks[b.blockIterCur] + b.blockIterCur++ + return ret + } +} + +// ValueRefCounts implements Builder.ValueRefCounts. +func (b *builder) ValueRefCounts() []int { + return b.valueRefCounts +} + +// alias records the alias of the given values. The alias(es) will be +// eliminated in the optimization pass via resolveArgumentAlias. +func (b *builder) alias(dst, src Value) { + b.valueIDAliases[dst.ID()] = src +} + +// resolveArgumentAlias resolves the alias of the arguments of the given instruction. +func (b *builder) resolveArgumentAlias(instr *Instruction) { + if instr.v.Valid() { + instr.v = b.resolveAlias(instr.v) + } + + if instr.v2.Valid() { + instr.v2 = b.resolveAlias(instr.v2) + } + + if instr.v3.Valid() { + instr.v3 = b.resolveAlias(instr.v3) + } + + view := instr.vs.View() + for i, v := range view { + view[i] = b.resolveAlias(v) + } +} + +// resolveAlias resolves the alias of the given value. +func (b *builder) resolveAlias(v Value) Value { + // Some aliases are chained, so we need to resolve them recursively. + for { + if src, ok := b.valueIDAliases[v.ID()]; ok { + v = src + } else { + break + } + } + return v +} + +// entryBlk returns the entry block of the function. +func (b *builder) entryBlk() *basicBlock { + return b.basicBlocksPool.View(0) +} + +// isDominatedBy returns true if the given block `n` is dominated by the given block `d`. +// Before calling this, the builder must pass by passCalculateImmediateDominators. +func (b *builder) isDominatedBy(n *basicBlock, d *basicBlock) bool { + if len(b.dominators) == 0 { + panic("BUG: passCalculateImmediateDominators must be called before calling isDominatedBy") + } + ent := b.entryBlk() + doms := b.dominators + for n != d && n != ent { + n = doms[n.id] + } + return n == d +} + +// BlockIDMax implements Builder.BlockIDMax. +func (b *builder) BlockIDMax() BasicBlockID { + return BasicBlockID(b.basicBlocksPool.Allocated()) +} + +// InsertUndefined implements Builder.InsertUndefined. +func (b *builder) InsertUndefined() { + instr := b.AllocateInstruction() + instr.opcode = OpcodeUndefined + b.InsertInstruction(instr) +} + +// LoopNestingForestRoots implements Builder.LoopNestingForestRoots. +func (b *builder) LoopNestingForestRoots() []BasicBlock { + return b.loopNestingForestRoots +} + +// LowestCommonAncestor implements Builder.LowestCommonAncestor. 
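Aside: resolveAlias above follows chained aliases (for example v3 aliased to v2 aliased to v1) until it reaches a value with no recorded alias. The same loop, standalone and with plain ints in place of ssa.Value:

package main

import "fmt"

// resolve follows the alias chain until a non-aliased value is reached.
func resolve(aliases map[int]int, v int) int {
	for {
		src, ok := aliases[v]
		if !ok {
			return v
		}
		v = src
	}
}

func main() {
	aliases := map[int]int{3: 2, 2: 1} // v3 -> v2 -> v1
	fmt.Println(resolve(aliases, 3))   // 1
	fmt.Println(resolve(aliases, 5))   // 5 (no alias recorded)
}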
+func (b *builder) LowestCommonAncestor(blk1, blk2 BasicBlock) BasicBlock { + return b.sparseTree.findLCA(blk1.ID(), blk2.ID()) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/cmp.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/cmp.go new file mode 100644 index 000000000..15b62ca8e --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/cmp.go @@ -0,0 +1,107 @@ +package ssa + +// IntegerCmpCond represents a condition for integer comparison. +type IntegerCmpCond byte + +const ( + // IntegerCmpCondInvalid represents an invalid condition. + IntegerCmpCondInvalid IntegerCmpCond = iota + // IntegerCmpCondEqual represents "==". + IntegerCmpCondEqual + // IntegerCmpCondNotEqual represents "!=". + IntegerCmpCondNotEqual + // IntegerCmpCondSignedLessThan represents Signed "<". + IntegerCmpCondSignedLessThan + // IntegerCmpCondSignedGreaterThanOrEqual represents Signed ">=". + IntegerCmpCondSignedGreaterThanOrEqual + // IntegerCmpCondSignedGreaterThan represents Signed ">". + IntegerCmpCondSignedGreaterThan + // IntegerCmpCondSignedLessThanOrEqual represents Signed "<=". + IntegerCmpCondSignedLessThanOrEqual + // IntegerCmpCondUnsignedLessThan represents Unsigned "<". + IntegerCmpCondUnsignedLessThan + // IntegerCmpCondUnsignedGreaterThanOrEqual represents Unsigned ">=". + IntegerCmpCondUnsignedGreaterThanOrEqual + // IntegerCmpCondUnsignedGreaterThan represents Unsigned ">". + IntegerCmpCondUnsignedGreaterThan + // IntegerCmpCondUnsignedLessThanOrEqual represents Unsigned "<=". + IntegerCmpCondUnsignedLessThanOrEqual +) + +// String implements fmt.Stringer. +func (i IntegerCmpCond) String() string { + switch i { + case IntegerCmpCondEqual: + return "eq" + case IntegerCmpCondNotEqual: + return "neq" + case IntegerCmpCondSignedLessThan: + return "lt_s" + case IntegerCmpCondSignedGreaterThanOrEqual: + return "ge_s" + case IntegerCmpCondSignedGreaterThan: + return "gt_s" + case IntegerCmpCondSignedLessThanOrEqual: + return "le_s" + case IntegerCmpCondUnsignedLessThan: + return "lt_u" + case IntegerCmpCondUnsignedGreaterThanOrEqual: + return "ge_u" + case IntegerCmpCondUnsignedGreaterThan: + return "gt_u" + case IntegerCmpCondUnsignedLessThanOrEqual: + return "le_u" + default: + panic("invalid integer comparison condition") + } +} + +// Signed returns true if the condition is signed integer comparison. +func (i IntegerCmpCond) Signed() bool { + switch i { + case IntegerCmpCondSignedLessThan, IntegerCmpCondSignedGreaterThanOrEqual, + IntegerCmpCondSignedGreaterThan, IntegerCmpCondSignedLessThanOrEqual: + return true + default: + return false + } +} + +type FloatCmpCond byte + +const ( + // FloatCmpCondInvalid represents an invalid condition. + FloatCmpCondInvalid FloatCmpCond = iota + // FloatCmpCondEqual represents "==". + FloatCmpCondEqual + // FloatCmpCondNotEqual represents "!=". + FloatCmpCondNotEqual + // FloatCmpCondLessThan represents "<". + FloatCmpCondLessThan + // FloatCmpCondLessThanOrEqual represents "<=". + FloatCmpCondLessThanOrEqual + // FloatCmpCondGreaterThan represents ">". + FloatCmpCondGreaterThan + // FloatCmpCondGreaterThanOrEqual represents ">=". + FloatCmpCondGreaterThanOrEqual +) + +// String implements fmt.Stringer. 
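// The condition set above keeps separate signed (lt_s, le_s, ...) and unsigned (lt_u, le_u, ...)
// orderings because the same bit pattern orders differently under the two interpretations.
// Standalone illustration with an all-ones 32-bit value:
package main

import "fmt"

func main() {
	bits := uint32(0xFFFF_FFFF)
	fmt.Println(int32(bits) < 1) // true:  lt_s reads the pattern as -1
	fmt.Println(bits < 1)        // false: lt_u reads it as 4294967295
}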
+func (f FloatCmpCond) String() string { + switch f { + case FloatCmpCondEqual: + return "eq" + case FloatCmpCondNotEqual: + return "neq" + case FloatCmpCondLessThan: + return "lt" + case FloatCmpCondLessThanOrEqual: + return "le" + case FloatCmpCondGreaterThan: + return "gt" + case FloatCmpCondGreaterThanOrEqual: + return "ge" + default: + panic("invalid float comparison condition") + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/funcref.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/funcref.go new file mode 100644 index 000000000..d9620762a --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/funcref.go @@ -0,0 +1,12 @@ +package ssa + +import "fmt" + +// FuncRef is a unique identifier for a function of the frontend, +// and is used to reference the function in function call. +type FuncRef uint32 + +// String implements fmt.Stringer. +func (r FuncRef) String() string { + return fmt.Sprintf("f%d", r) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/instructions.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/instructions.go new file mode 100644 index 000000000..3e3482efc --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/instructions.go @@ -0,0 +1,2967 @@ +package ssa + +import ( + "fmt" + "math" + "strings" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// Opcode represents a SSA instruction. +type Opcode uint32 + +// Instruction represents an instruction whose opcode is specified by +// Opcode. Since Go doesn't have union type, we use this flattened type +// for all instructions, and therefore each field has different meaning +// depending on Opcode. +type Instruction struct { + // id is the unique ID of this instruction which ascends from 0 following the order of program. + id int + opcode Opcode + u1, u2 uint64 + v Value + v2 Value + v3 Value + vs Values + typ Type + blk BasicBlock + targets []BasicBlock + prev, next *Instruction + + rValue Value + rValues Values + gid InstructionGroupID + sourceOffset SourceOffset + live bool + alreadyLowered bool +} + +// SourceOffset represents the offset of the source of an instruction. +type SourceOffset int64 + +const sourceOffsetUnknown = -1 + +// Valid returns true if this source offset is valid. +func (l SourceOffset) Valid() bool { + return l != sourceOffsetUnknown +} + +func (i *Instruction) annotateSourceOffset(line SourceOffset) { + i.sourceOffset = line +} + +// SourceOffset returns the source offset of this instruction. +func (i *Instruction) SourceOffset() SourceOffset { + return i.sourceOffset +} + +// Opcode returns the opcode of this instruction. +func (i *Instruction) Opcode() Opcode { + return i.opcode +} + +// GroupID returns the InstructionGroupID of this instruction. +func (i *Instruction) GroupID() InstructionGroupID { + return i.gid +} + +// MarkLowered marks this instruction as already lowered. +func (i *Instruction) MarkLowered() { + i.alreadyLowered = true +} + +// Lowered returns true if this instruction is already lowered. +func (i *Instruction) Lowered() bool { + return i.alreadyLowered +} + +// resetInstruction resets this instruction to the initial state. 
+func resetInstruction(i *Instruction) { + *i = Instruction{} + i.v = ValueInvalid + i.v2 = ValueInvalid + i.v3 = ValueInvalid + i.rValue = ValueInvalid + i.typ = typeInvalid + i.vs = ValuesNil + i.sourceOffset = sourceOffsetUnknown +} + +// InstructionGroupID is assigned to each instruction and represents a group of instructions +// where each instruction is interchangeable with others except for the last instruction +// in the group which has side effects. In short, InstructionGroupID is determined by the side effects of instructions. +// That means, if there's an instruction with side effect between two instructions, then these two instructions +// will have different instructionGroupID. Note that each block always ends with branching, which is with side effects, +// therefore, instructions in different blocks always have different InstructionGroupID(s). +// +// The notable application of this is used in lowering SSA-level instruction to a ISA specific instruction, +// where we eagerly try to merge multiple instructions into single operation etc. Such merging cannot be done +// if these instruction have different InstructionGroupID since it will change the semantics of a program. +// +// See passDeadCodeElimination. +type InstructionGroupID uint32 + +// Returns Value(s) produced by this instruction if any. +// The `first` is the first return value, and `rest` is the rest of the values. +func (i *Instruction) Returns() (first Value, rest []Value) { + return i.rValue, i.rValues.View() +} + +// Return returns a Value(s) produced by this instruction if any. +// If there's multiple return values, only the first one is returned. +func (i *Instruction) Return() (first Value) { + return i.rValue +} + +// Args returns the arguments to this instruction. +func (i *Instruction) Args() (v1, v2, v3 Value, vs []Value) { + return i.v, i.v2, i.v3, i.vs.View() +} + +// Arg returns the first argument to this instruction. +func (i *Instruction) Arg() Value { + return i.v +} + +// Arg2 returns the first two arguments to this instruction. +func (i *Instruction) Arg2() (Value, Value) { + return i.v, i.v2 +} + +// ArgWithLane returns the first argument to this instruction, and the lane type. +func (i *Instruction) ArgWithLane() (Value, VecLane) { + return i.v, VecLane(i.u1) +} + +// Arg2WithLane returns the first two arguments to this instruction, and the lane type. +func (i *Instruction) Arg2WithLane() (Value, Value, VecLane) { + return i.v, i.v2, VecLane(i.u1) +} + +// ShuffleData returns the first two arguments to this instruction and 2 uint64s `lo`, `hi`. +// +// Note: Each uint64 encodes a sequence of 8 bytes where each byte encodes a VecLane, +// so that the 128bit integer `hi<<64|lo` packs a slice `[16]VecLane`, +// where `lane[0]` is the least significant byte, and `lane[n]` is shifted to offset `n*8`. +func (i *Instruction) ShuffleData() (v Value, v2 Value, lo uint64, hi uint64) { + return i.v, i.v2, i.u1, i.u2 +} + +// Arg3 returns the first three arguments to this instruction. +func (i *Instruction) Arg3() (Value, Value, Value) { + return i.v, i.v2, i.v3 +} + +// Next returns the next instruction laid out next to itself. +func (i *Instruction) Next() *Instruction { + return i.next +} + +// Prev returns the previous instruction laid out prior to itself. +func (i *Instruction) Prev() *Instruction { + return i.prev +} + +// IsBranching returns true if this instruction is a branching instruction. 
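// The ShuffleData comment above describes how the 16 shuffle lane selectors are packed into two
// uint64s: lane[0] sits in the least significant byte of `lo`, lane[8] in the least significant
// byte of `hi`. A standalone round-trip of that packing:
package main

import "fmt"

func packLanes(lanes [16]byte) (lo, hi uint64) {
	for n := 0; n < 8; n++ {
		lo |= uint64(lanes[n]) << (n * 8)
		hi |= uint64(lanes[n+8]) << (n * 8)
	}
	return
}

func unpackLanes(lo, hi uint64) (lanes [16]byte) {
	for n := 0; n < 8; n++ {
		lanes[n] = byte(lo >> (n * 8))
		lanes[n+8] = byte(hi >> (n * 8))
	}
	return
}

func main() {
	in := [16]byte{0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23}
	lo, hi := packLanes(in)
	fmt.Println(unpackLanes(lo, hi) == in) // true: the encoding is lossless
}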
+func (i *Instruction) IsBranching() bool { + switch i.opcode { + case OpcodeJump, OpcodeBrz, OpcodeBrnz, OpcodeBrTable: + return true + default: + return false + } +} + +// TODO: complete opcode comments. +const ( + OpcodeInvalid Opcode = iota + + // OpcodeUndefined is a placeholder for undefined opcode. This can be used for debugging to intentionally + // cause a crash at certain point. + OpcodeUndefined + + // OpcodeJump takes the list of args to the `block` and unconditionally jumps to it. + OpcodeJump + + // OpcodeBrz branches into `blk` with `args` if the value `c` equals zero: `Brz c, blk, args`. + OpcodeBrz + + // OpcodeBrnz branches into `blk` with `args` if the value `c` is not zero: `Brnz c, blk, args`. + OpcodeBrnz + + // OpcodeBrTable takes the index value `index`, and branches into `labelX`. If the `index` is out of range, + // it branches into the last labelN: `BrTable index, [label1, label2, ... labelN]`. + OpcodeBrTable + + // OpcodeExitWithCode exit the execution immediately. + OpcodeExitWithCode + + // OpcodeExitIfTrueWithCode exits the execution immediately if the value `c` is not zero. + OpcodeExitIfTrueWithCode + + // OpcodeReturn returns from the function: `return rvalues`. + OpcodeReturn + + // OpcodeCall calls a function specified by the symbol FN with arguments `args`: `returnvals = Call FN, args...` + // This is a "near" call, which means the call target is known at compile time, and the target is relatively close + // to this function. If the target cannot be reached by near call, the backend fails to compile. + OpcodeCall + + // OpcodeCallIndirect calls a function specified by `callee` which is a function address: `returnvals = call_indirect SIG, callee, args`. + // Note that this is different from call_indirect in Wasm, which also does type checking, etc. + OpcodeCallIndirect + + // OpcodeSplat performs a vector splat operation: `v = Splat.lane x`. + OpcodeSplat + + // OpcodeSwizzle performs a vector swizzle operation: `v = Swizzle.lane x, y`. + OpcodeSwizzle + + // OpcodeInsertlane inserts a lane value into a vector: `v = InsertLane x, y, Idx`. + OpcodeInsertlane + + // OpcodeExtractlane extracts a lane value from a vector: `v = ExtractLane x, Idx`. + OpcodeExtractlane + + // OpcodeLoad loads a Type value from the [base + offset] address: `v = Load base, offset`. + OpcodeLoad + + // OpcodeStore stores a Type value to the [base + offset] address: `Store v, base, offset`. + OpcodeStore + + // OpcodeUload8 loads the 8-bit value from the [base + offset] address, zero-extended to 64 bits: `v = Uload8 base, offset`. + OpcodeUload8 + + // OpcodeSload8 loads the 8-bit value from the [base + offset] address, sign-extended to 64 bits: `v = Sload8 base, offset`. + OpcodeSload8 + + // OpcodeIstore8 stores the 8-bit value to the [base + offset] address, sign-extended to 64 bits: `Istore8 v, base, offset`. + OpcodeIstore8 + + // OpcodeUload16 loads the 16-bit value from the [base + offset] address, zero-extended to 64 bits: `v = Uload16 base, offset`. + OpcodeUload16 + + // OpcodeSload16 loads the 16-bit value from the [base + offset] address, sign-extended to 64 bits: `v = Sload16 base, offset`. + OpcodeSload16 + + // OpcodeIstore16 stores the 16-bit value to the [base + offset] address, zero-extended to 64 bits: `Istore16 v, base, offset`. + OpcodeIstore16 + + // OpcodeUload32 loads the 32-bit value from the [base + offset] address, zero-extended to 64 bits: `v = Uload32 base, offset`. 
+ OpcodeUload32 + + // OpcodeSload32 loads the 32-bit value from the [base + offset] address, sign-extended to 64 bits: `v = Sload32 base, offset`. + OpcodeSload32 + + // OpcodeIstore32 stores the 32-bit value to the [base + offset] address, zero-extended to 64 bits: `Istore16 v, base, offset`. + OpcodeIstore32 + + // OpcodeLoadSplat represents a load that replicates the loaded value to all lanes `v = LoadSplat.lane p, Offset`. + OpcodeLoadSplat + + // OpcodeVZeroExtLoad loads a scalar single/double precision floating point value from the [p + Offset] address, + // and zero-extend it to the V128 value: `v = VExtLoad p, Offset`. + OpcodeVZeroExtLoad + + // OpcodeIconst represents the integer const. + OpcodeIconst + + // OpcodeF32const represents the single-precision const. + OpcodeF32const + + // OpcodeF64const represents the double-precision const. + OpcodeF64const + + // OpcodeVconst represents the 128bit vector const. + OpcodeVconst + + // OpcodeVbor computes binary or between two 128bit vectors: `v = bor x, y`. + OpcodeVbor + + // OpcodeVbxor computes binary xor between two 128bit vectors: `v = bxor x, y`. + OpcodeVbxor + + // OpcodeVband computes binary and between two 128bit vectors: `v = band x, y`. + OpcodeVband + + // OpcodeVbandnot computes binary and-not between two 128bit vectors: `v = bandnot x, y`. + OpcodeVbandnot + + // OpcodeVbnot negates a 128bit vector: `v = bnot x`. + OpcodeVbnot + + // OpcodeVbitselect uses the bits in the control mask c to select the corresponding bit from x when 1 + // and y when 0: `v = bitselect c, x, y`. + OpcodeVbitselect + + // OpcodeShuffle shuffles two vectors using the given 128-bit immediate: `v = shuffle imm, x, y`. + // For each byte in the immediate, a value i in [0, 15] selects the i-th byte in vector x; + // i in [16, 31] selects the (i-16)-th byte in vector y. + OpcodeShuffle + + // OpcodeSelect chooses between two values based on a condition `c`: `v = Select c, x, y`. + OpcodeSelect + + // OpcodeVanyTrue performs a any true operation: `s = VanyTrue a`. + OpcodeVanyTrue + + // OpcodeVallTrue performs a lane-wise all true operation: `s = VallTrue.lane a`. + OpcodeVallTrue + + // OpcodeVhighBits performs a lane-wise extract of the high bits: `v = VhighBits.lane a`. + OpcodeVhighBits + + // OpcodeIcmp compares two integer values with the given condition: `v = icmp Cond, x, y`. + OpcodeIcmp + + // OpcodeVIcmp compares two integer values with the given condition: `v = vicmp Cond, x, y` on vector. + OpcodeVIcmp + + // OpcodeIcmpImm compares an integer value with the immediate value on the given condition: `v = icmp_imm Cond, x, Y`. + OpcodeIcmpImm + + // OpcodeIadd performs an integer addition: `v = Iadd x, y`. + OpcodeIadd + + // OpcodeVIadd performs an integer addition: `v = VIadd.lane x, y` on vector. + OpcodeVIadd + + // OpcodeVSaddSat performs a signed saturating vector addition: `v = VSaddSat.lane x, y` on vector. + OpcodeVSaddSat + + // OpcodeVUaddSat performs an unsigned saturating vector addition: `v = VUaddSat.lane x, y` on vector. + OpcodeVUaddSat + + // OpcodeIsub performs an integer subtraction: `v = Isub x, y`. + OpcodeIsub + + // OpcodeVIsub performs an integer subtraction: `v = VIsub.lane x, y` on vector. + OpcodeVIsub + + // OpcodeVSsubSat performs a signed saturating vector subtraction: `v = VSsubSat.lane x, y` on vector. + OpcodeVSsubSat + + // OpcodeVUsubSat performs an unsigned saturating vector subtraction: `v = VUsubSat.lane x, y` on vector. 
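// OpcodeVbitselect above is a pure bitwise select: each result bit comes from x where the mask
// bit is 1 and from y where it is 0. The same identity on a 64-bit scalar:
package main

import "fmt"

func bitselect(c, x, y uint64) uint64 {
	return (x & c) | (y &^ c) // &^ is Go's AND-NOT, i.e. y & ^c
}

func main() {
	v := bitselect(0x00000000_FFFFFFFF, 0x1111111111111111, 0x2222222222222222)
	fmt.Printf("%#x\n", v) // 0x2222222211111111: low half from x, high half from y
}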
+ OpcodeVUsubSat + + // OpcodeVImin performs a signed integer min: `v = VImin.lane x, y` on vector. + OpcodeVImin + + // OpcodeVUmin performs an unsigned integer min: `v = VUmin.lane x, y` on vector. + OpcodeVUmin + + // OpcodeVImax performs a signed integer max: `v = VImax.lane x, y` on vector. + OpcodeVImax + + // OpcodeVUmax performs an unsigned integer max: `v = VUmax.lane x, y` on vector. + OpcodeVUmax + + // OpcodeVAvgRound performs an unsigned integer rounding average: `v = VAvgRound.lane x, y` on vector. + OpcodeVAvgRound + + // OpcodeVImul performs an integer multiplication: `v = VImul.lane x, y` on vector. + OpcodeVImul + + // OpcodeVIneg negates the given integer vector value: `v = VIneg x`. + OpcodeVIneg + + // OpcodeVIpopcnt counts the number of 1-bits in the given vector: `v = VIpopcnt x`. + OpcodeVIpopcnt + + // OpcodeVIabs returns the absolute value for the given vector value: `v = VIabs.lane x`. + OpcodeVIabs + + // OpcodeVIshl shifts x left by (y mod lane-width): `v = VIshl.lane x, y` on vector. + OpcodeVIshl + + // OpcodeVUshr shifts x right by (y mod lane-width), unsigned: `v = VUshr.lane x, y` on vector. + OpcodeVUshr + + // OpcodeVSshr shifts x right by (y mod lane-width), signed: `v = VSshr.lane x, y` on vector. + OpcodeVSshr + + // OpcodeVFabs takes the absolute value of a floating point value: `v = VFabs.lane x` on vector. + OpcodeVFabs + + // OpcodeVFmax takes the maximum of two floating point values: `v = VFmax.lane x, y` on vector. + OpcodeVFmax + + // OpcodeVFmin takes the minimum of two floating point values: `v = VFmin.lane x, y` on vector. + OpcodeVFmin + + // OpcodeVFneg negates the given floating point vector value: `v = VFneg x`. + OpcodeVFneg + + // OpcodeVFadd performs a floating point addition: `v = VFadd.lane x, y` on vector. + OpcodeVFadd + + // OpcodeVFsub performs a floating point subtraction: `v = VFsub.lane x, y` on vector. + OpcodeVFsub + + // OpcodeVFmul performs a floating point multiplication: `v = VFmul.lane x, y` on vector. + OpcodeVFmul + + // OpcodeVFdiv performs a floating point division: `v = VFdiv.lane x, y` on vector. + OpcodeVFdiv + + // OpcodeVFcmp compares two float values with the given condition: `v = VFcmp.lane Cond, x, y` on vector. + OpcodeVFcmp + + // OpcodeVCeil takes the ceiling of the given floating point value: `v = ceil.lane x` on vector. + OpcodeVCeil + + // OpcodeVFloor takes the floor of the given floating point value: `v = floor.lane x` on vector. + OpcodeVFloor + + // OpcodeVTrunc takes the truncation of the given floating point value: `v = trunc.lane x` on vector. + OpcodeVTrunc + + // OpcodeVNearest takes the nearest integer of the given floating point value: `v = nearest.lane x` on vector. + OpcodeVNearest + + // OpcodeVMaxPseudo computes the lane-wise maximum value `v = VMaxPseudo.lane x, y` on vector defined as `x < y ? y : x`. + OpcodeVMaxPseudo + + // OpcodeVMinPseudo computes the lane-wise minimum value `v = VMinPseudo.lane x, y` on vector defined as `y < x ? y : x`. + OpcodeVMinPseudo + + // OpcodeVSqrt takes the square root of the given floating point value: `v = VSqrt.lane x` on vector. + OpcodeVSqrt + + // OpcodeVFcvtToUintSat converts a floating point value to an unsigned integer: `v = FcvtToUintSat.lane x` on vector. + OpcodeVFcvtToUintSat + + // OpcodeVFcvtToSintSat converts a floating point value to a signed integer: `v = VFcvtToSintSat.lane x` on vector. + OpcodeVFcvtToSintSat + + // OpcodeVFcvtFromUint converts a floating point value from an unsigned integer: `v = FcvtFromUint.lane x` on vector.
+ // x is always a 32-bit integer lane, and the result is either a 32-bit or 64-bit floating point-sized vector. + OpcodeVFcvtFromUint + + // OpcodeVFcvtFromSint converts a floating point value from a signed integer: `v = VFcvtFromSint.lane x` on vector. + // x is always a 32-bit integer lane, and the result is either a 32-bit or 64-bit floating point-sized vector. + OpcodeVFcvtFromSint + + // OpcodeImul performs an integer multiplication: `v = Imul x, y`. + OpcodeImul + + // OpcodeUdiv performs the unsigned integer division `v = Udiv x, y`. + OpcodeUdiv + + // OpcodeSdiv performs the signed integer division `v = Sdiv x, y`. + OpcodeSdiv + + // OpcodeUrem computes the remainder of the unsigned integer division `v = Urem x, y`. + OpcodeUrem + + // OpcodeSrem computes the remainder of the signed integer division `v = Srem x, y`. + OpcodeSrem + + // OpcodeBand performs a binary and: `v = Band x, y`. + OpcodeBand + + // OpcodeBor performs a binary or: `v = Bor x, y`. + OpcodeBor + + // OpcodeBxor performs a binary xor: `v = Bxor x, y`. + OpcodeBxor + + // OpcodeBnot performs a binary not: `v = Bnot x`. + OpcodeBnot + + // OpcodeRotl rotates the given integer value to the left: `v = Rotl x, y`. + OpcodeRotl + + // OpcodeRotr rotates the given integer value to the right: `v = Rotr x, y`. + OpcodeRotr + + // OpcodeIshl does logical shift left: `v = Ishl x, y`. + OpcodeIshl + + // OpcodeUshr does logical shift right: `v = Ushr x, y`. + OpcodeUshr + + // OpcodeSshr does arithmetic shift right: `v = Sshr x, y`. + OpcodeSshr + + // OpcodeClz counts the number of leading zeros: `v = clz x`. + OpcodeClz + + // OpcodeCtz counts the number of trailing zeros: `v = ctz x`. + OpcodeCtz + + // OpcodePopcnt counts the number of 1-bits: `v = popcnt x`. + OpcodePopcnt + + // OpcodeFcmp compares two floating point values: `v = fcmp Cond, x, y`. + OpcodeFcmp + + // OpcodeFadd performs a floating point addition: / `v = Fadd x, y`. + OpcodeFadd + + // OpcodeFsub performs a floating point subtraction: `v = Fsub x, y`. + OpcodeFsub + + // OpcodeFmul performs a floating point multiplication: `v = Fmul x, y`. + OpcodeFmul + + // OpcodeSqmulRoundSat performs a lane-wise saturating rounding multiplication + // in Q15 format: `v = SqmulRoundSat.lane x,y` on vector. + OpcodeSqmulRoundSat + + // OpcodeFdiv performs a floating point division: `v = Fdiv x, y`. + OpcodeFdiv + + // OpcodeSqrt takes the square root of the given floating point value: `v = sqrt x`. + OpcodeSqrt + + // OpcodeFneg negates the given floating point value: `v = Fneg x`. + OpcodeFneg + + // OpcodeFabs takes the absolute value of the given floating point value: `v = fabs x`. + OpcodeFabs + + // OpcodeFcopysign copies the sign of the second floating point value to the first floating point value: + // `v = Fcopysign x, y`. + OpcodeFcopysign + + // OpcodeFmin takes the minimum of two floating point values: `v = fmin x, y`. + OpcodeFmin + + // OpcodeFmax takes the maximum of two floating point values: `v = fmax x, y`. + OpcodeFmax + + // OpcodeCeil takes the ceiling of the given floating point value: `v = ceil x`. + OpcodeCeil + + // OpcodeFloor takes the floor of the given floating point value: `v = floor x`. + OpcodeFloor + + // OpcodeTrunc takes the truncation of the given floating point value: `v = trunc x`. + OpcodeTrunc + + // OpcodeNearest takes the nearest integer of the given floating point value: `v = nearest x`. + OpcodeNearest + + // OpcodeBitcast is a bitcast operation: `v = bitcast x`. 
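// The scalar bit and float opcodes above (Rotl, Clz, Ctz, Popcnt, Fcopysign, ...) have direct Go
// standard-library counterparts, which is a convenient way to sanity-check expected results:
package main

import (
	"fmt"
	"math"
	"math/bits"
)

func main() {
	x := uint32(0b1010_0000)
	fmt.Println(bits.RotateLeft32(x, 1)) // rotl: 320
	fmt.Println(bits.LeadingZeros32(x))  // clz: 24
	fmt.Println(bits.TrailingZeros32(x)) // ctz: 5
	fmt.Println(bits.OnesCount32(x))     // popcnt: 2
	fmt.Println(math.Copysign(1.5, -2))  // fcopysign: -1.5
}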
+ OpcodeBitcast + + // OpcodeIreduce narrow the given integer: `v = Ireduce x`. + OpcodeIreduce + + // OpcodeSnarrow converts two input vectors x, y into a smaller lane vector by narrowing each lane, signed `v = Snarrow.lane x, y`. + OpcodeSnarrow + + // OpcodeUnarrow converts two input vectors x, y into a smaller lane vector by narrowing each lane, unsigned `v = Unarrow.lane x, y`. + OpcodeUnarrow + + // OpcodeSwidenLow converts low half of the smaller lane vector to a larger lane vector, sign extended: `v = SwidenLow.lane x`. + OpcodeSwidenLow + + // OpcodeSwidenHigh converts high half of the smaller lane vector to a larger lane vector, sign extended: `v = SwidenHigh.lane x`. + OpcodeSwidenHigh + + // OpcodeUwidenLow converts low half of the smaller lane vector to a larger lane vector, zero (unsigned) extended: `v = UwidenLow.lane x`. + OpcodeUwidenLow + + // OpcodeUwidenHigh converts high half of the smaller lane vector to a larger lane vector, zero (unsigned) extended: `v = UwidenHigh.lane x`. + OpcodeUwidenHigh + + // OpcodeExtIaddPairwise is a lane-wise integer extended pairwise addition producing extended results (twice wider results than the inputs): `v = extiadd_pairwise x, y` on vector. + OpcodeExtIaddPairwise + + // OpcodeWideningPairwiseDotProductS is a lane-wise widening pairwise dot product with signed saturation: `v = WideningPairwiseDotProductS x, y` on vector. + // Currently, the only lane is i16, and the result is i32. + OpcodeWideningPairwiseDotProductS + + // OpcodeUExtend zero-extends the given integer: `v = UExtend x, from->to`. + OpcodeUExtend + + // OpcodeSExtend sign-extends the given integer: `v = SExtend x, from->to`. + OpcodeSExtend + + // OpcodeFpromote promotes the given floating point value: `v = Fpromote x`. + OpcodeFpromote + + // OpcodeFvpromoteLow converts the two lower single-precision floating point lanes + // to the two double-precision lanes of the result: `v = FvpromoteLow.lane x` on vector. + OpcodeFvpromoteLow + + // OpcodeFdemote demotes the given float point value: `v = Fdemote x`. + OpcodeFdemote + + // OpcodeFvdemote converts the two double-precision floating point lanes + // to two lower single-precision lanes of the result `v = Fvdemote.lane x`. + OpcodeFvdemote + + // OpcodeFcvtToUint converts a floating point value to an unsigned integer: `v = FcvtToUint x`. + OpcodeFcvtToUint + + // OpcodeFcvtToSint converts a floating point value to a signed integer: `v = FcvtToSint x`. + OpcodeFcvtToSint + + // OpcodeFcvtToUintSat converts a floating point value to an unsigned integer: `v = FcvtToUintSat x` which saturates on overflow. + OpcodeFcvtToUintSat + + // OpcodeFcvtToSintSat converts a floating point value to a signed integer: `v = FcvtToSintSat x` which saturates on overflow. + OpcodeFcvtToSintSat + + // OpcodeFcvtFromUint converts an unsigned integer to a floating point value: `v = FcvtFromUint x`. + OpcodeFcvtFromUint + + // OpcodeFcvtFromSint converts a signed integer to a floating point value: `v = FcvtFromSint x`. + OpcodeFcvtFromSint + + // OpcodeAtomicRmw is atomic read-modify-write operation: `v = atomic_rmw op, p, offset, value`. + OpcodeAtomicRmw + + // OpcodeAtomicCas is atomic compare-and-swap operation. + OpcodeAtomicCas + + // OpcodeAtomicLoad is atomic load operation. + OpcodeAtomicLoad + + // OpcodeAtomicStore is atomic store operation. + OpcodeAtomicStore + + // OpcodeFence is a memory fence operation. + OpcodeFence + + // opcodeEnd marks the end of the opcode list. 
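// FcvtToSintSat/FcvtToUintSat above clamp out-of-range inputs and map NaN to zero instead of
// trapping (the non-saturating variants trap; see the side-effect table below). A standalone
// scalar sketch of the saturating signed conversion to 32 bits:
package main

import (
	"fmt"
	"math"
)

func fcvtToSint32Sat(f float64) int32 {
	switch {
	case math.IsNaN(f):
		return 0
	case f <= math.MinInt32:
		return math.MinInt32
	case f >= math.MaxInt32:
		return math.MaxInt32
	default:
		return int32(f) // in-range values truncate toward zero
	}
}

func main() {
	fmt.Println(fcvtToSint32Sat(1e12))       // 2147483647
	fmt.Println(fcvtToSint32Sat(-1e12))      // -2147483648
	fmt.Println(fcvtToSint32Sat(math.NaN())) // 0
	fmt.Println(fcvtToSint32Sat(-3.9))       // -3
}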
+ opcodeEnd +) + +// AtomicRmwOp represents the atomic read-modify-write operation. +type AtomicRmwOp byte + +const ( + // AtomicRmwOpAdd is an atomic add operation. + AtomicRmwOpAdd AtomicRmwOp = iota + // AtomicRmwOpSub is an atomic sub operation. + AtomicRmwOpSub + // AtomicRmwOpAnd is an atomic and operation. + AtomicRmwOpAnd + // AtomicRmwOpOr is an atomic or operation. + AtomicRmwOpOr + // AtomicRmwOpXor is an atomic xor operation. + AtomicRmwOpXor + // AtomicRmwOpXchg is an atomic swap operation. + AtomicRmwOpXchg +) + +// String implements the fmt.Stringer. +func (op AtomicRmwOp) String() string { + switch op { + case AtomicRmwOpAdd: + return "add" + case AtomicRmwOpSub: + return "sub" + case AtomicRmwOpAnd: + return "and" + case AtomicRmwOpOr: + return "or" + case AtomicRmwOpXor: + return "xor" + case AtomicRmwOpXchg: + return "xchg" + } + panic(fmt.Sprintf("unknown AtomicRmwOp: %d", op)) +} + +// returnTypesFn provides the info to determine the type of instruction. +// t1 is the type of the first result, ts are the types of the remaining results. +type returnTypesFn func(b *builder, instr *Instruction) (t1 Type, ts []Type) + +var ( + returnTypesFnNoReturns returnTypesFn = func(b *builder, instr *Instruction) (t1 Type, ts []Type) { return typeInvalid, nil } + returnTypesFnSingle = func(b *builder, instr *Instruction) (t1 Type, ts []Type) { return instr.typ, nil } + returnTypesFnI32 = func(b *builder, instr *Instruction) (t1 Type, ts []Type) { return TypeI32, nil } + returnTypesFnF32 = func(b *builder, instr *Instruction) (t1 Type, ts []Type) { return TypeF32, nil } + returnTypesFnF64 = func(b *builder, instr *Instruction) (t1 Type, ts []Type) { return TypeF64, nil } + returnTypesFnV128 = func(b *builder, instr *Instruction) (t1 Type, ts []Type) { return TypeV128, nil } +) + +// sideEffect provides the info to determine if an instruction has side effects which +// is used to determine if it can be optimized out, interchanged with others, etc. +type sideEffect byte + +const ( + sideEffectUnknown sideEffect = iota + // sideEffectStrict represents an instruction with side effects, and should be always alive plus cannot be reordered. + sideEffectStrict + // sideEffectTraps represents an instruction that can trap, and should be always alive but can be reordered within the group. + sideEffectTraps + // sideEffectNone represents an instruction without side effects, and can be eliminated if the result is not used, plus can be reordered within the group. + sideEffectNone +) + +// instructionSideEffects provides the info to determine if an instruction has side effects. +// Instructions with side effects must not be eliminated regardless whether the result is used or not. 
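// The three side-effect classes above answer two independent questions for the optimizer: may the
// instruction be deleted when its result is unused, and may it be moved within its instruction
// group? A toy decision helper mirroring that reading (illustrative only, not the wazero API):
package main

import "fmt"

type effect byte

const (
	strict effect = iota // always alive, never reordered (stores, calls, branches, ...)
	traps                // always alive, but may be reordered within the group
	none                 // removable if unused, and may be reordered within the group
)

func removableIfUnused(e effect) bool  { return e == none }
func reorderableInGroup(e effect) bool { return e == none || e == traps }

func main() {
	fmt.Println(removableIfUnused(traps), reorderableInGroup(traps))   // false true
	fmt.Println(removableIfUnused(strict), reorderableInGroup(strict)) // false false
}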
+var instructionSideEffects = [opcodeEnd]sideEffect{ + OpcodeUndefined: sideEffectStrict, + OpcodeJump: sideEffectStrict, + OpcodeIconst: sideEffectNone, + OpcodeCall: sideEffectStrict, + OpcodeCallIndirect: sideEffectStrict, + OpcodeIadd: sideEffectNone, + OpcodeImul: sideEffectNone, + OpcodeIsub: sideEffectNone, + OpcodeIcmp: sideEffectNone, + OpcodeExtractlane: sideEffectNone, + OpcodeInsertlane: sideEffectNone, + OpcodeBand: sideEffectNone, + OpcodeBor: sideEffectNone, + OpcodeBxor: sideEffectNone, + OpcodeRotl: sideEffectNone, + OpcodeRotr: sideEffectNone, + OpcodeFcmp: sideEffectNone, + OpcodeFadd: sideEffectNone, + OpcodeClz: sideEffectNone, + OpcodeCtz: sideEffectNone, + OpcodePopcnt: sideEffectNone, + OpcodeLoad: sideEffectNone, + OpcodeLoadSplat: sideEffectNone, + OpcodeUload8: sideEffectNone, + OpcodeUload16: sideEffectNone, + OpcodeUload32: sideEffectNone, + OpcodeSload8: sideEffectNone, + OpcodeSload16: sideEffectNone, + OpcodeSload32: sideEffectNone, + OpcodeSExtend: sideEffectNone, + OpcodeUExtend: sideEffectNone, + OpcodeSwidenLow: sideEffectNone, + OpcodeUwidenLow: sideEffectNone, + OpcodeSwidenHigh: sideEffectNone, + OpcodeUwidenHigh: sideEffectNone, + OpcodeSnarrow: sideEffectNone, + OpcodeUnarrow: sideEffectNone, + OpcodeSwizzle: sideEffectNone, + OpcodeShuffle: sideEffectNone, + OpcodeSplat: sideEffectNone, + OpcodeFsub: sideEffectNone, + OpcodeF32const: sideEffectNone, + OpcodeF64const: sideEffectNone, + OpcodeIshl: sideEffectNone, + OpcodeSshr: sideEffectNone, + OpcodeUshr: sideEffectNone, + OpcodeStore: sideEffectStrict, + OpcodeIstore8: sideEffectStrict, + OpcodeIstore16: sideEffectStrict, + OpcodeIstore32: sideEffectStrict, + OpcodeExitWithCode: sideEffectStrict, + OpcodeExitIfTrueWithCode: sideEffectStrict, + OpcodeReturn: sideEffectStrict, + OpcodeBrz: sideEffectStrict, + OpcodeBrnz: sideEffectStrict, + OpcodeBrTable: sideEffectStrict, + OpcodeFdiv: sideEffectNone, + OpcodeFmul: sideEffectNone, + OpcodeFmax: sideEffectNone, + OpcodeSqmulRoundSat: sideEffectNone, + OpcodeSelect: sideEffectNone, + OpcodeFmin: sideEffectNone, + OpcodeFneg: sideEffectNone, + OpcodeFcvtToSint: sideEffectTraps, + OpcodeFcvtToUint: sideEffectTraps, + OpcodeFcvtFromSint: sideEffectNone, + OpcodeFcvtFromUint: sideEffectNone, + OpcodeFcvtToSintSat: sideEffectNone, + OpcodeFcvtToUintSat: sideEffectNone, + OpcodeVFcvtFromUint: sideEffectNone, + OpcodeVFcvtFromSint: sideEffectNone, + OpcodeFdemote: sideEffectNone, + OpcodeFvpromoteLow: sideEffectNone, + OpcodeFvdemote: sideEffectNone, + OpcodeFpromote: sideEffectNone, + OpcodeBitcast: sideEffectNone, + OpcodeIreduce: sideEffectNone, + OpcodeSqrt: sideEffectNone, + OpcodeCeil: sideEffectNone, + OpcodeFloor: sideEffectNone, + OpcodeTrunc: sideEffectNone, + OpcodeNearest: sideEffectNone, + OpcodeSdiv: sideEffectTraps, + OpcodeSrem: sideEffectTraps, + OpcodeUdiv: sideEffectTraps, + OpcodeUrem: sideEffectTraps, + OpcodeFabs: sideEffectNone, + OpcodeFcopysign: sideEffectNone, + OpcodeExtIaddPairwise: sideEffectNone, + OpcodeVconst: sideEffectNone, + OpcodeVbor: sideEffectNone, + OpcodeVbxor: sideEffectNone, + OpcodeVband: sideEffectNone, + OpcodeVbandnot: sideEffectNone, + OpcodeVbnot: sideEffectNone, + OpcodeVbitselect: sideEffectNone, + OpcodeVanyTrue: sideEffectNone, + OpcodeVallTrue: sideEffectNone, + OpcodeVhighBits: sideEffectNone, + OpcodeVIadd: sideEffectNone, + OpcodeVSaddSat: sideEffectNone, + OpcodeVUaddSat: sideEffectNone, + OpcodeVIsub: sideEffectNone, + OpcodeVSsubSat: sideEffectNone, + OpcodeVUsubSat: sideEffectNone, + OpcodeVIcmp: 
sideEffectNone, + OpcodeVImin: sideEffectNone, + OpcodeVUmin: sideEffectNone, + OpcodeVImax: sideEffectNone, + OpcodeVUmax: sideEffectNone, + OpcodeVAvgRound: sideEffectNone, + OpcodeVImul: sideEffectNone, + OpcodeVIabs: sideEffectNone, + OpcodeVIneg: sideEffectNone, + OpcodeVIpopcnt: sideEffectNone, + OpcodeVIshl: sideEffectNone, + OpcodeVSshr: sideEffectNone, + OpcodeVUshr: sideEffectNone, + OpcodeVSqrt: sideEffectNone, + OpcodeVFabs: sideEffectNone, + OpcodeVFmin: sideEffectNone, + OpcodeVFmax: sideEffectNone, + OpcodeVFneg: sideEffectNone, + OpcodeVFadd: sideEffectNone, + OpcodeVFsub: sideEffectNone, + OpcodeVFmul: sideEffectNone, + OpcodeVFdiv: sideEffectNone, + OpcodeVFcmp: sideEffectNone, + OpcodeVCeil: sideEffectNone, + OpcodeVFloor: sideEffectNone, + OpcodeVTrunc: sideEffectNone, + OpcodeVNearest: sideEffectNone, + OpcodeVMaxPseudo: sideEffectNone, + OpcodeVMinPseudo: sideEffectNone, + OpcodeVFcvtToUintSat: sideEffectNone, + OpcodeVFcvtToSintSat: sideEffectNone, + OpcodeVZeroExtLoad: sideEffectNone, + OpcodeAtomicRmw: sideEffectStrict, + OpcodeAtomicLoad: sideEffectStrict, + OpcodeAtomicStore: sideEffectStrict, + OpcodeAtomicCas: sideEffectStrict, + OpcodeFence: sideEffectStrict, + OpcodeWideningPairwiseDotProductS: sideEffectNone, +} + +// sideEffect returns true if this instruction has side effects. +func (i *Instruction) sideEffect() sideEffect { + if e := instructionSideEffects[i.opcode]; e == sideEffectUnknown { + panic("BUG: side effect info not registered for " + i.opcode.String()) + } else { + return e + } +} + +// instructionReturnTypes provides the function to determine the return types of an instruction. +var instructionReturnTypes = [opcodeEnd]returnTypesFn{ + OpcodeExtIaddPairwise: returnTypesFnV128, + OpcodeVbor: returnTypesFnV128, + OpcodeVbxor: returnTypesFnV128, + OpcodeVband: returnTypesFnV128, + OpcodeVbnot: returnTypesFnV128, + OpcodeVbandnot: returnTypesFnV128, + OpcodeVbitselect: returnTypesFnV128, + OpcodeVanyTrue: returnTypesFnI32, + OpcodeVallTrue: returnTypesFnI32, + OpcodeVhighBits: returnTypesFnI32, + OpcodeVIadd: returnTypesFnV128, + OpcodeVSaddSat: returnTypesFnV128, + OpcodeVUaddSat: returnTypesFnV128, + OpcodeVIsub: returnTypesFnV128, + OpcodeVSsubSat: returnTypesFnV128, + OpcodeVUsubSat: returnTypesFnV128, + OpcodeVIcmp: returnTypesFnV128, + OpcodeVImin: returnTypesFnV128, + OpcodeVUmin: returnTypesFnV128, + OpcodeVImax: returnTypesFnV128, + OpcodeVUmax: returnTypesFnV128, + OpcodeVImul: returnTypesFnV128, + OpcodeVAvgRound: returnTypesFnV128, + OpcodeVIabs: returnTypesFnV128, + OpcodeVIneg: returnTypesFnV128, + OpcodeVIpopcnt: returnTypesFnV128, + OpcodeVIshl: returnTypesFnV128, + OpcodeVSshr: returnTypesFnV128, + OpcodeVUshr: returnTypesFnV128, + OpcodeExtractlane: returnTypesFnSingle, + OpcodeInsertlane: returnTypesFnV128, + OpcodeBand: returnTypesFnSingle, + OpcodeFcopysign: returnTypesFnSingle, + OpcodeBitcast: returnTypesFnSingle, + OpcodeBor: returnTypesFnSingle, + OpcodeBxor: returnTypesFnSingle, + OpcodeRotl: returnTypesFnSingle, + OpcodeRotr: returnTypesFnSingle, + OpcodeIshl: returnTypesFnSingle, + OpcodeSshr: returnTypesFnSingle, + OpcodeSdiv: returnTypesFnSingle, + OpcodeSrem: returnTypesFnSingle, + OpcodeUdiv: returnTypesFnSingle, + OpcodeUrem: returnTypesFnSingle, + OpcodeUshr: returnTypesFnSingle, + OpcodeJump: returnTypesFnNoReturns, + OpcodeUndefined: returnTypesFnNoReturns, + OpcodeIconst: returnTypesFnSingle, + OpcodeSelect: returnTypesFnSingle, + OpcodeSExtend: returnTypesFnSingle, + OpcodeUExtend: returnTypesFnSingle, + 
OpcodeSwidenLow: returnTypesFnV128, + OpcodeUwidenLow: returnTypesFnV128, + OpcodeSwidenHigh: returnTypesFnV128, + OpcodeUwidenHigh: returnTypesFnV128, + OpcodeSnarrow: returnTypesFnV128, + OpcodeUnarrow: returnTypesFnV128, + OpcodeSwizzle: returnTypesFnSingle, + OpcodeShuffle: returnTypesFnV128, + OpcodeSplat: returnTypesFnV128, + OpcodeIreduce: returnTypesFnSingle, + OpcodeFabs: returnTypesFnSingle, + OpcodeSqrt: returnTypesFnSingle, + OpcodeCeil: returnTypesFnSingle, + OpcodeFloor: returnTypesFnSingle, + OpcodeTrunc: returnTypesFnSingle, + OpcodeNearest: returnTypesFnSingle, + OpcodeCallIndirect: func(b *builder, instr *Instruction) (t1 Type, ts []Type) { + sigID := SignatureID(instr.u1) + sig, ok := b.signatures[sigID] + if !ok { + panic("BUG") + } + switch len(sig.Results) { + case 0: + t1 = typeInvalid + case 1: + t1 = sig.Results[0] + default: + t1, ts = sig.Results[0], sig.Results[1:] + } + return + }, + OpcodeCall: func(b *builder, instr *Instruction) (t1 Type, ts []Type) { + sigID := SignatureID(instr.u2) + sig, ok := b.signatures[sigID] + if !ok { + panic("BUG") + } + switch len(sig.Results) { + case 0: + t1 = typeInvalid + case 1: + t1 = sig.Results[0] + default: + t1, ts = sig.Results[0], sig.Results[1:] + } + return + }, + OpcodeLoad: returnTypesFnSingle, + OpcodeVZeroExtLoad: returnTypesFnV128, + OpcodeLoadSplat: returnTypesFnV128, + OpcodeIadd: returnTypesFnSingle, + OpcodeIsub: returnTypesFnSingle, + OpcodeImul: returnTypesFnSingle, + OpcodeIcmp: returnTypesFnI32, + OpcodeFcmp: returnTypesFnI32, + OpcodeFadd: returnTypesFnSingle, + OpcodeFsub: returnTypesFnSingle, + OpcodeFdiv: returnTypesFnSingle, + OpcodeFmul: returnTypesFnSingle, + OpcodeFmax: returnTypesFnSingle, + OpcodeFmin: returnTypesFnSingle, + OpcodeSqmulRoundSat: returnTypesFnV128, + OpcodeF32const: returnTypesFnF32, + OpcodeF64const: returnTypesFnF64, + OpcodeClz: returnTypesFnSingle, + OpcodeCtz: returnTypesFnSingle, + OpcodePopcnt: returnTypesFnSingle, + OpcodeStore: returnTypesFnNoReturns, + OpcodeIstore8: returnTypesFnNoReturns, + OpcodeIstore16: returnTypesFnNoReturns, + OpcodeIstore32: returnTypesFnNoReturns, + OpcodeExitWithCode: returnTypesFnNoReturns, + OpcodeExitIfTrueWithCode: returnTypesFnNoReturns, + OpcodeReturn: returnTypesFnNoReturns, + OpcodeBrz: returnTypesFnNoReturns, + OpcodeBrnz: returnTypesFnNoReturns, + OpcodeBrTable: returnTypesFnNoReturns, + OpcodeUload8: returnTypesFnSingle, + OpcodeUload16: returnTypesFnSingle, + OpcodeUload32: returnTypesFnSingle, + OpcodeSload8: returnTypesFnSingle, + OpcodeSload16: returnTypesFnSingle, + OpcodeSload32: returnTypesFnSingle, + OpcodeFcvtToSint: returnTypesFnSingle, + OpcodeFcvtToUint: returnTypesFnSingle, + OpcodeFcvtFromSint: returnTypesFnSingle, + OpcodeFcvtFromUint: returnTypesFnSingle, + OpcodeFcvtToSintSat: returnTypesFnSingle, + OpcodeFcvtToUintSat: returnTypesFnSingle, + OpcodeVFcvtFromUint: returnTypesFnV128, + OpcodeVFcvtFromSint: returnTypesFnV128, + OpcodeFneg: returnTypesFnSingle, + OpcodeFdemote: returnTypesFnF32, + OpcodeFvdemote: returnTypesFnV128, + OpcodeFvpromoteLow: returnTypesFnV128, + OpcodeFpromote: returnTypesFnF64, + OpcodeVconst: returnTypesFnV128, + OpcodeVFabs: returnTypesFnV128, + OpcodeVSqrt: returnTypesFnV128, + OpcodeVFmax: returnTypesFnV128, + OpcodeVFmin: returnTypesFnV128, + OpcodeVFneg: returnTypesFnV128, + OpcodeVFadd: returnTypesFnV128, + OpcodeVFsub: returnTypesFnV128, + OpcodeVFmul: returnTypesFnV128, + OpcodeVFdiv: returnTypesFnV128, + OpcodeVFcmp: returnTypesFnV128, + OpcodeVCeil: returnTypesFnV128, + 
OpcodeVFloor: returnTypesFnV128, + OpcodeVTrunc: returnTypesFnV128, + OpcodeVNearest: returnTypesFnV128, + OpcodeVMaxPseudo: returnTypesFnV128, + OpcodeVMinPseudo: returnTypesFnV128, + OpcodeVFcvtToUintSat: returnTypesFnV128, + OpcodeVFcvtToSintSat: returnTypesFnV128, + OpcodeAtomicRmw: returnTypesFnSingle, + OpcodeAtomicLoad: returnTypesFnSingle, + OpcodeAtomicStore: returnTypesFnNoReturns, + OpcodeAtomicCas: returnTypesFnSingle, + OpcodeFence: returnTypesFnNoReturns, + OpcodeWideningPairwiseDotProductS: returnTypesFnV128, +} + +// AsLoad initializes this instruction as a load instruction with OpcodeLoad. +func (i *Instruction) AsLoad(ptr Value, offset uint32, typ Type) *Instruction { + i.opcode = OpcodeLoad + i.v = ptr + i.u1 = uint64(offset) + i.typ = typ + return i +} + +// AsExtLoad initializes this instruction as an extending load instruction with the given load opcode. +func (i *Instruction) AsExtLoad(op Opcode, ptr Value, offset uint32, dst64bit bool) *Instruction { + i.opcode = op + i.v = ptr + i.u1 = uint64(offset) + if dst64bit { + i.typ = TypeI64 + } else { + i.typ = TypeI32 + } + return i +} + +// AsVZeroExtLoad initializes this instruction as a load instruction with OpcodeVZeroExtLoad. +func (i *Instruction) AsVZeroExtLoad(ptr Value, offset uint32, scalarType Type) *Instruction { + i.opcode = OpcodeVZeroExtLoad + i.v = ptr + i.u1 = uint64(offset) + i.u2 = uint64(scalarType) + i.typ = TypeV128 + return i +} + +// VZeroExtLoadData returns the operands for a load instruction. The returned `typ` is the scalar type of the load target. +func (i *Instruction) VZeroExtLoadData() (ptr Value, offset uint32, typ Type) { + return i.v, uint32(i.u1), Type(i.u2) +} + +// AsLoadSplat initializes this instruction as a load instruction with OpcodeLoadSplat. +func (i *Instruction) AsLoadSplat(ptr Value, offset uint32, lane VecLane) *Instruction { + i.opcode = OpcodeLoadSplat + i.v = ptr + i.u1 = uint64(offset) + i.u2 = uint64(lane) + i.typ = TypeV128 + return i +} + +// LoadData returns the operands for a load instruction. +func (i *Instruction) LoadData() (ptr Value, offset uint32, typ Type) { + return i.v, uint32(i.u1), i.typ +} + +// LoadSplatData returns the operands for a load splat instruction. +func (i *Instruction) LoadSplatData() (ptr Value, offset uint32, lane VecLane) { + return i.v, uint32(i.u1), VecLane(i.u2) +} + +// AsStore initializes this instruction as a store instruction with OpcodeStore. +func (i *Instruction) AsStore(storeOp Opcode, value, ptr Value, offset uint32) *Instruction { + i.opcode = storeOp + i.v = value + i.v2 = ptr + + var dstSize uint64 + switch storeOp { + case OpcodeStore: + dstSize = uint64(value.Type().Bits()) + case OpcodeIstore8: + dstSize = 8 + case OpcodeIstore16: + dstSize = 16 + case OpcodeIstore32: + dstSize = 32 + default: + panic("invalid store opcode: " + storeOp.String()) + } + i.u1 = uint64(offset) | dstSize<<32 + return i +} + +// StoreData returns the operands for a store instruction. +func (i *Instruction) StoreData() (value, ptr Value, offset uint32, storeSizeInBits byte) { + return i.v, i.v2, uint32(i.u1), byte(i.u1 >> 32) +} + +// AsIconst64 initializes this instruction as a 64-bit integer constant instruction with OpcodeIconst. +func (i *Instruction) AsIconst64(v uint64) *Instruction { + i.opcode = OpcodeIconst + i.typ = TypeI64 + i.u1 = v + return i +} + +// AsIconst32 initializes this instruction as a 32-bit integer constant instruction with OpcodeIconst.
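// AsStore above packs the 32-bit offset and the store width in bits into the single u1 field
// (offset in the low half, width in the high half), and StoreData unpacks it. A standalone
// round-trip of that encoding:
package main

import "fmt"

func encodeStore(offset uint32, sizeInBits byte) uint64 {
	return uint64(offset) | uint64(sizeInBits)<<32
}

func decodeStore(u1 uint64) (offset uint32, sizeInBits byte) {
	return uint32(u1), byte(u1 >> 32)
}

func main() {
	u1 := encodeStore(16, 8) // e.g. an Istore8 at offset 16
	off, size := decodeStore(u1)
	fmt.Println(off, size) // 16 8
}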
+func (i *Instruction) AsIconst32(v uint32) *Instruction { + i.opcode = OpcodeIconst + i.typ = TypeI32 + i.u1 = uint64(v) + return i +} + +// AsIadd initializes this instruction as an integer addition instruction with OpcodeIadd. +func (i *Instruction) AsIadd(x, y Value) *Instruction { + i.opcode = OpcodeIadd + i.v = x + i.v2 = y + i.typ = x.Type() + return i +} + +// AsVIadd initializes this instruction as an integer addition instruction with OpcodeVIadd on a vector. +func (i *Instruction) AsVIadd(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVIadd + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsWideningPairwiseDotProductS initializes this instruction as a lane-wise widening pairwise dot product with signed saturation instruction +// with OpcodeWideningPairwiseDotProductS on a vector. +func (i *Instruction) AsWideningPairwiseDotProductS(x, y Value) *Instruction { + i.opcode = OpcodeWideningPairwiseDotProductS + i.v = x + i.v2 = y + i.typ = TypeV128 + return i +} + +// AsExtIaddPairwise initializes this instruction as a lane-wise integer extended pairwise addition instruction +// with OpcodeExtIaddPairwise on a vector. +func (i *Instruction) AsExtIaddPairwise(x Value, srcLane VecLane, signed bool) *Instruction { + i.opcode = OpcodeExtIaddPairwise + i.v = x + i.u1 = uint64(srcLane) + if signed { + i.u2 = 1 + } + i.typ = TypeV128 + return i +} + +// ExtIaddPairwiseData returns the operands for a lane-wise integer extended pairwise addition instruction. +func (i *Instruction) ExtIaddPairwiseData() (x Value, srcLane VecLane, signed bool) { + return i.v, VecLane(i.u1), i.u2 != 0 +} + +// AsVSaddSat initializes this instruction as a vector addition with saturation instruction with OpcodeVSaddSat on a vector. +func (i *Instruction) AsVSaddSat(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVSaddSat + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVUaddSat initializes this instruction as a vector addition with saturation instruction with OpcodeVUaddSat on a vector. +func (i *Instruction) AsVUaddSat(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVUaddSat + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVIsub initializes this instruction as an integer subtraction instruction with OpcodeVIsub on a vector. +func (i *Instruction) AsVIsub(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVIsub + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVSsubSat initializes this instruction as a vector subtraction with saturation instruction with OpcodeVSsubSat on a vector. +func (i *Instruction) AsVSsubSat(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVSsubSat + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVUsubSat initializes this instruction as a vector subtraction with saturation instruction with OpcodeVUsubSat on a vector. +func (i *Instruction) AsVUsubSat(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVUsubSat + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVImin initializes this instruction as a signed integer min instruction with OpcodeVImin on a vector. +func (i *Instruction) AsVImin(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVImin + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVUmin initializes this instruction as an unsigned integer min instruction with OpcodeVUmin on a vector.
+func (i *Instruction) AsVUmin(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVUmin + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVImax initializes this instruction as a signed integer max instruction with OpcodeVImax on a vector. +func (i *Instruction) AsVImax(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVImax + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVUmax initializes this instruction as an unsigned integer max instruction with OpcodeVUmax on a vector. +func (i *Instruction) AsVUmax(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVUmax + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVAvgRound initializes this instruction as an unsigned integer avg instruction, truncating to zero with OpcodeVAvgRound on a vector. +func (i *Instruction) AsVAvgRound(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVAvgRound + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVImul initializes this instruction as an integer multiplication with OpcodeVImul on a vector. +func (i *Instruction) AsVImul(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVImul + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsSqmulRoundSat initializes this instruction as a lane-wise saturating rounding multiplication +// in Q15 format with OpcodeSqmulRoundSat on a vector. +func (i *Instruction) AsSqmulRoundSat(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeSqmulRoundSat + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVIabs initializes this instruction as a vector absolute value with OpcodeVIabs. +func (i *Instruction) AsVIabs(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeVIabs + i.v = x + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVIneg initializes this instruction as a vector negation with OpcodeVIneg. +func (i *Instruction) AsVIneg(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeVIneg + i.v = x + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVIpopcnt initializes this instruction as a Population Count instruction with OpcodeVIpopcnt on a vector. +func (i *Instruction) AsVIpopcnt(x Value, lane VecLane) *Instruction { + if lane != VecLaneI8x16 { + panic("Unsupported lane type " + lane.String()) + } + i.opcode = OpcodeVIpopcnt + i.v = x + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVSqrt initializes this instruction as a sqrt instruction with OpcodeVSqrt on a vector. +func (i *Instruction) AsVSqrt(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeVSqrt + i.v = x + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVFabs initializes this instruction as a float abs instruction with OpcodeVFabs on a vector. +func (i *Instruction) AsVFabs(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeVFabs + i.v = x + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVFneg initializes this instruction as a float neg instruction with OpcodeVFneg on a vector. +func (i *Instruction) AsVFneg(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeVFneg + i.v = x + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVFmax initializes this instruction as a float max instruction with OpcodeVFmax on a vector. 
+func (i *Instruction) AsVFmax(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVFmax + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVFmin initializes this instruction as a float min instruction with OpcodeVFmin on a vector. +func (i *Instruction) AsVFmin(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVFmin + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVFadd initializes this instruction as a floating point add instruction with OpcodeVFadd on a vector. +func (i *Instruction) AsVFadd(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVFadd + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVFsub initializes this instruction as a floating point subtraction instruction with OpcodeVFsub on a vector. +func (i *Instruction) AsVFsub(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVFsub + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVFmul initializes this instruction as a floating point multiplication instruction with OpcodeVFmul on a vector. +func (i *Instruction) AsVFmul(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVFmul + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVFdiv initializes this instruction as a floating point division instruction with OpcodeVFdiv on a vector. +func (i *Instruction) AsVFdiv(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVFdiv + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsImul initializes this instruction as an integer multiplication instruction with OpcodeImul. +func (i *Instruction) AsImul(x, y Value) *Instruction { + i.opcode = OpcodeImul + i.v = x + i.v2 = y + i.typ = x.Type() + return i +} + +// Insert inserts this instruction into the Builder via InsertInstruction, and returns it. +func (i *Instruction) Insert(b Builder) *Instruction { + b.InsertInstruction(i) + return i +} + +// AsIsub initializes this instruction as an integer subtraction instruction with OpcodeIsub. +func (i *Instruction) AsIsub(x, y Value) *Instruction { + i.opcode = OpcodeIsub + i.v = x + i.v2 = y + i.typ = x.Type() + return i +} + +// AsIcmp initializes this instruction as an integer comparison instruction with OpcodeIcmp. +func (i *Instruction) AsIcmp(x, y Value, c IntegerCmpCond) *Instruction { + i.opcode = OpcodeIcmp + i.v = x + i.v2 = y + i.u1 = uint64(c) + i.typ = TypeI32 + return i +} + +// AsFcmp initializes this instruction as a floating-point comparison instruction with OpcodeFcmp. +func (i *Instruction) AsFcmp(x, y Value, c FloatCmpCond) { + i.opcode = OpcodeFcmp + i.v = x + i.v2 = y + i.u1 = uint64(c) + i.typ = TypeI32 +} + +// AsVIcmp initializes this instruction as an integer vector comparison instruction with OpcodeVIcmp. +func (i *Instruction) AsVIcmp(x, y Value, c IntegerCmpCond, lane VecLane) *Instruction { + i.opcode = OpcodeVIcmp + i.v = x + i.v2 = y + i.u1 = uint64(c) + i.u2 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVFcmp initializes this instruction as a float comparison instruction with OpcodeVFcmp on vector. +func (i *Instruction) AsVFcmp(x, y Value, c FloatCmpCond, lane VecLane) *Instruction { + i.opcode = OpcodeVFcmp + i.v = x + i.v2 = y + i.u1 = uint64(c) + i.typ = TypeV128 + i.u2 = uint64(lane) + return i +} + +// AsVCeil initializes this instruction as an instruction with OpcodeVCeil.
+func (i *Instruction) AsVCeil(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeVCeil + i.v = x + i.typ = x.Type() + i.u1 = uint64(lane) + return i +} + +// AsVFloor initializes this instruction as an instruction with OpcodeVFloor. +func (i *Instruction) AsVFloor(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeVFloor + i.v = x + i.typ = x.Type() + i.u1 = uint64(lane) + return i +} + +// AsVTrunc initializes this instruction as an instruction with OpcodeVTrunc. +func (i *Instruction) AsVTrunc(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeVTrunc + i.v = x + i.typ = x.Type() + i.u1 = uint64(lane) + return i +} + +// AsVNearest initializes this instruction as an instruction with OpcodeVNearest. +func (i *Instruction) AsVNearest(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeVNearest + i.v = x + i.typ = x.Type() + i.u1 = uint64(lane) + return i +} + +// AsVMaxPseudo initializes this instruction as an instruction with OpcodeVMaxPseudo. +func (i *Instruction) AsVMaxPseudo(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVMaxPseudo + i.typ = x.Type() + i.v = x + i.v2 = y + i.u1 = uint64(lane) + return i +} + +// AsVMinPseudo initializes this instruction as an instruction with OpcodeVMinPseudo. +func (i *Instruction) AsVMinPseudo(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVMinPseudo + i.typ = x.Type() + i.v = x + i.v2 = y + i.u1 = uint64(lane) + return i +} + +// AsSDiv initializes this instruction as a signed integer division instruction with OpcodeSdiv. +func (i *Instruction) AsSDiv(x, y, ctx Value) *Instruction { + i.opcode = OpcodeSdiv + i.v = x + i.v2 = y + i.v3 = ctx + i.typ = x.Type() + return i +} + +// AsUDiv initializes this instruction as an unsigned integer division instruction with OpcodeUdiv. +func (i *Instruction) AsUDiv(x, y, ctx Value) *Instruction { + i.opcode = OpcodeUdiv + i.v = x + i.v2 = y + i.v3 = ctx + i.typ = x.Type() + return i +} + +// AsSRem initializes this instruction as a signed integer remainder instruction with OpcodeSrem. +func (i *Instruction) AsSRem(x, y, ctx Value) *Instruction { + i.opcode = OpcodeSrem + i.v = x + i.v2 = y + i.v3 = ctx + i.typ = x.Type() + return i +} + +// AsURem initializes this instruction as an unsigned integer remainder instruction with OpcodeUrem. +func (i *Instruction) AsURem(x, y, ctx Value) *Instruction { + i.opcode = OpcodeUrem + i.v = x + i.v2 = y + i.v3 = ctx + i.typ = x.Type() + return i +} + +// AsBand initializes this instruction as an integer bitwise and instruction with OpcodeBand. +func (i *Instruction) AsBand(x, amount Value) *Instruction { + i.opcode = OpcodeBand + i.v = x + i.v2 = amount + i.typ = x.Type() + return i +} + +// AsBor initializes this instruction as an integer bitwise or instruction with OpcodeBor. +func (i *Instruction) AsBor(x, amount Value) { + i.opcode = OpcodeBor + i.v = x + i.v2 = amount + i.typ = x.Type() +} + +// AsBxor initializes this instruction as an integer bitwise xor instruction with OpcodeBxor. +func (i *Instruction) AsBxor(x, amount Value) { + i.opcode = OpcodeBxor + i.v = x + i.v2 = amount + i.typ = x.Type() +} + +// AsIshl initializes this instruction as an integer shift left instruction with OpcodeIshl. +func (i *Instruction) AsIshl(x, amount Value) *Instruction { + i.opcode = OpcodeIshl + i.v = x + i.v2 = amount + i.typ = x.Type() + return i +} + +// AsVIshl initializes this instruction as an integer shift left instruction with OpcodeVIshl on vector.
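// AsVMinPseudo/AsVMaxPseudo above build the Wasm-style "pmin"/"pmax" operations: plain
// compare-and-select ternaries, unlike VFmin/VFmax which follow the NaN-propagating min/max
// semantics. A scalar illustration, assuming pmax is `x < y ? y : x`:
package main

import (
	"fmt"
	"math"
)

func pmax(x, y float64) float64 {
	if x < y {
		return y
	}
	return x
}

func main() {
	nan := math.NaN()
	fmt.Println(pmax(1, nan))     // 1: the comparison against NaN is false, so x is kept
	fmt.Println(math.Max(1, nan)) // NaN: IEEE-style max propagates NaN
	negZero := math.Copysign(0, -1)
	fmt.Println(pmax(negZero, 0)) // -0: equal operands keep x; zero signs are not canonicalized
}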
+func (i *Instruction) AsVIshl(x, amount Value, lane VecLane) *Instruction { + i.opcode = OpcodeVIshl + i.v = x + i.v2 = amount + i.u1 = uint64(lane) + i.typ = x.Type() + return i +} + +// AsUshr initializes this instruction as an integer unsigned shift right (logical shift right) instruction with OpcodeUshr. +func (i *Instruction) AsUshr(x, amount Value) *Instruction { + i.opcode = OpcodeUshr + i.v = x + i.v2 = amount + i.typ = x.Type() + return i +} + +// AsVUshr initializes this instruction as an integer unsigned shift right (logical shift right) instruction with OpcodeVUshr on vector. +func (i *Instruction) AsVUshr(x, amount Value, lane VecLane) *Instruction { + i.opcode = OpcodeVUshr + i.v = x + i.v2 = amount + i.u1 = uint64(lane) + i.typ = x.Type() + return i +} + +// AsSshr initializes this instruction as an integer signed shift right (arithmetic shift right) instruction with OpcodeSshr. +func (i *Instruction) AsSshr(x, amount Value) *Instruction { + i.opcode = OpcodeSshr + i.v = x + i.v2 = amount + i.typ = x.Type() + return i +} + +// AsVSshr initializes this instruction as an integer signed shift right (arithmetic shift right) instruction with OpcodeVSshr on vector. +func (i *Instruction) AsVSshr(x, amount Value, lane VecLane) *Instruction { + i.opcode = OpcodeVSshr + i.v = x + i.v2 = amount + i.u1 = uint64(lane) + i.typ = x.Type() + return i +} + +// AsExtractlane initializes this instruction as an extract lane instruction with OpcodeExtractlane on vector. +func (i *Instruction) AsExtractlane(x Value, index byte, lane VecLane, signed bool) *Instruction { + i.opcode = OpcodeExtractlane + i.v = x + // We do not have a field for signedness, but `index` is a byte, + // so we just encode the flag in the high bits of `u1`. + i.u1 = uint64(index) + if signed { + i.u1 = i.u1 | 1<<32 + } + i.u2 = uint64(lane) + switch lane { + case VecLaneI8x16, VecLaneI16x8, VecLaneI32x4: + i.typ = TypeI32 + case VecLaneI64x2: + i.typ = TypeI64 + case VecLaneF32x4: + i.typ = TypeF32 + case VecLaneF64x2: + i.typ = TypeF64 + } + return i +} + +// AsInsertlane initializes this instruction as an insert lane instruction with OpcodeInsertlane on vector. +func (i *Instruction) AsInsertlane(x, y Value, index byte, lane VecLane) *Instruction { + i.opcode = OpcodeInsertlane + i.v = x + i.v2 = y + i.u1 = uint64(index) + i.u2 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsShuffle initializes this instruction as a shuffle instruction with OpcodeShuffle on vector. +func (i *Instruction) AsShuffle(x, y Value, lane []byte) *Instruction { + i.opcode = OpcodeShuffle + i.v = x + i.v2 = y + // Encode the 16 bytes as 8 bytes in u1, and 8 bytes in u2. + i.u1 = uint64(lane[7])<<56 | uint64(lane[6])<<48 | uint64(lane[5])<<40 | uint64(lane[4])<<32 | uint64(lane[3])<<24 | uint64(lane[2])<<16 | uint64(lane[1])<<8 | uint64(lane[0]) + i.u2 = uint64(lane[15])<<56 | uint64(lane[14])<<48 | uint64(lane[13])<<40 | uint64(lane[12])<<32 | uint64(lane[11])<<24 | uint64(lane[10])<<16 | uint64(lane[9])<<8 | uint64(lane[8]) + i.typ = TypeV128 + return i +} + +// AsSwizzle initializes this instruction as an insert lane instruction with OpcodeSwizzle on vector. +func (i *Instruction) AsSwizzle(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeSwizzle + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsSplat initializes this instruction as an insert lane instruction with OpcodeSplat on vector. 
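+//
+// Sketch (illustrative; b and the scalar Value x are assumed to exist, and AllocateInstruction is
+// assumed to be the usual Builder allocation helper): broadcast a 32-bit integer into every lane
+// of a v128 value.
+//
+//	v := b.AllocateInstruction().AsSplat(x, VecLaneI32x4).Insert(b).Return()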
+func (i *Instruction) AsSplat(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeSplat + i.v = x + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsRotl initializes this instruction as a word rotate left instruction with OpcodeRotl. +func (i *Instruction) AsRotl(x, amount Value) { + i.opcode = OpcodeRotl + i.v = x + i.v2 = amount + i.typ = x.Type() +} + +// AsRotr initializes this instruction as a word rotate right instruction with OpcodeRotr. +func (i *Instruction) AsRotr(x, amount Value) { + i.opcode = OpcodeRotr + i.v = x + i.v2 = amount + i.typ = x.Type() +} + +// IcmpData returns the operands and comparison condition of this integer comparison instruction. +func (i *Instruction) IcmpData() (x, y Value, c IntegerCmpCond) { + return i.v, i.v2, IntegerCmpCond(i.u1) +} + +// FcmpData returns the operands and comparison condition of this floating-point comparison instruction. +func (i *Instruction) FcmpData() (x, y Value, c FloatCmpCond) { + return i.v, i.v2, FloatCmpCond(i.u1) +} + +// VIcmpData returns the operands and comparison condition of this integer comparison instruction on vector. +func (i *Instruction) VIcmpData() (x, y Value, c IntegerCmpCond, l VecLane) { + return i.v, i.v2, IntegerCmpCond(i.u1), VecLane(i.u2) +} + +// VFcmpData returns the operands and comparison condition of this float comparison instruction on vector. +func (i *Instruction) VFcmpData() (x, y Value, c FloatCmpCond, l VecLane) { + return i.v, i.v2, FloatCmpCond(i.u1), VecLane(i.u2) +} + +// ExtractlaneData returns the operands and sign flag of Extractlane on vector. +func (i *Instruction) ExtractlaneData() (x Value, index byte, signed bool, l VecLane) { + x = i.v + index = byte(0b00001111 & i.u1) + signed = i.u1>>32 != 0 + l = VecLane(i.u2) + return +} + +// InsertlaneData returns the operands and sign flag of Insertlane on vector. +func (i *Instruction) InsertlaneData() (x, y Value, index byte, l VecLane) { + x = i.v + y = i.v2 + index = byte(i.u1) + l = VecLane(i.u2) + return +} + +// AsFadd initializes this instruction as a floating-point addition instruction with OpcodeFadd. +func (i *Instruction) AsFadd(x, y Value) { + i.opcode = OpcodeFadd + i.v = x + i.v2 = y + i.typ = x.Type() +} + +// AsFsub initializes this instruction as a floating-point subtraction instruction with OpcodeFsub. +func (i *Instruction) AsFsub(x, y Value) { + i.opcode = OpcodeFsub + i.v = x + i.v2 = y + i.typ = x.Type() +} + +// AsFmul initializes this instruction as a floating-point multiplication instruction with OpcodeFmul. +func (i *Instruction) AsFmul(x, y Value) { + i.opcode = OpcodeFmul + i.v = x + i.v2 = y + i.typ = x.Type() +} + +// AsFdiv initializes this instruction as a floating-point division instruction with OpcodeFdiv. +func (i *Instruction) AsFdiv(x, y Value) { + i.opcode = OpcodeFdiv + i.v = x + i.v2 = y + i.typ = x.Type() +} + +// AsFmin initializes this instruction to take the minimum of two floating-points with OpcodeFmin. +func (i *Instruction) AsFmin(x, y Value) { + i.opcode = OpcodeFmin + i.v = x + i.v2 = y + i.typ = x.Type() +} + +// AsFmax initializes this instruction to take the maximum of two floating-points with OpcodeFmax. +func (i *Instruction) AsFmax(x, y Value) { + i.opcode = OpcodeFmax + i.v = x + i.v2 = y + i.typ = x.Type() +} + +// AsF32const initializes this instruction as a 32-bit floating-point constant instruction with OpcodeF32const. 
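+//
+// The 32-bit IEEE-754 bit pattern is stored in u1 via math.Float32bits, so a consumer can recover
+// the value with the inverse conversion (sketch, for an instruction built with AsF32const):
+//
+//	f := math.Float32frombits(uint32(instr.ConstantVal()))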
+func (i *Instruction) AsF32const(f float32) *Instruction { + i.opcode = OpcodeF32const + i.typ = TypeF64 + i.u1 = uint64(math.Float32bits(f)) + return i +} + +// AsF64const initializes this instruction as a 64-bit floating-point constant instruction with OpcodeF64const. +func (i *Instruction) AsF64const(f float64) *Instruction { + i.opcode = OpcodeF64const + i.typ = TypeF64 + i.u1 = math.Float64bits(f) + return i +} + +// AsVconst initializes this instruction as a vector constant instruction with OpcodeVconst. +func (i *Instruction) AsVconst(lo, hi uint64) *Instruction { + i.opcode = OpcodeVconst + i.typ = TypeV128 + i.u1 = lo + i.u2 = hi + return i +} + +// AsVbnot initializes this instruction as a vector negation instruction with OpcodeVbnot. +func (i *Instruction) AsVbnot(v Value) *Instruction { + i.opcode = OpcodeVbnot + i.typ = TypeV128 + i.v = v + return i +} + +// AsVband initializes this instruction as an and vector instruction with OpcodeVband. +func (i *Instruction) AsVband(x, y Value) *Instruction { + i.opcode = OpcodeVband + i.typ = TypeV128 + i.v = x + i.v2 = y + return i +} + +// AsVbor initializes this instruction as an or vector instruction with OpcodeVbor. +func (i *Instruction) AsVbor(x, y Value) *Instruction { + i.opcode = OpcodeVbor + i.typ = TypeV128 + i.v = x + i.v2 = y + return i +} + +// AsVbxor initializes this instruction as a xor vector instruction with OpcodeVbxor. +func (i *Instruction) AsVbxor(x, y Value) *Instruction { + i.opcode = OpcodeVbxor + i.typ = TypeV128 + i.v = x + i.v2 = y + return i +} + +// AsVbandnot initializes this instruction as an and-not vector instruction with OpcodeVbandnot. +func (i *Instruction) AsVbandnot(x, y Value) *Instruction { + i.opcode = OpcodeVbandnot + i.typ = TypeV128 + i.v = x + i.v2 = y + return i +} + +// AsVbitselect initializes this instruction as a bit select vector instruction with OpcodeVbitselect. +func (i *Instruction) AsVbitselect(c, x, y Value) *Instruction { + i.opcode = OpcodeVbitselect + i.typ = TypeV128 + i.v = c + i.v2 = x + i.v3 = y + return i +} + +// AsVanyTrue initializes this instruction as an anyTrue vector instruction with OpcodeVanyTrue. +func (i *Instruction) AsVanyTrue(x Value) *Instruction { + i.opcode = OpcodeVanyTrue + i.typ = TypeI32 + i.v = x + return i +} + +// AsVallTrue initializes this instruction as an allTrue vector instruction with OpcodeVallTrue. +func (i *Instruction) AsVallTrue(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeVallTrue + i.typ = TypeI32 + i.v = x + i.u1 = uint64(lane) + return i +} + +// AsVhighBits initializes this instruction as a highBits vector instruction with OpcodeVhighBits. +func (i *Instruction) AsVhighBits(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeVhighBits + i.typ = TypeI32 + i.v = x + i.u1 = uint64(lane) + return i +} + +// VconstData returns the operands of this vector constant instruction. +func (i *Instruction) VconstData() (lo, hi uint64) { + return i.u1, i.u2 +} + +// AsReturn initializes this instruction as a return instruction with OpcodeReturn. +func (i *Instruction) AsReturn(vs wazevoapi.VarLength[Value]) *Instruction { + i.opcode = OpcodeReturn + i.vs = vs + return i +} + +// AsIreduce initializes this instruction as a reduction instruction with OpcodeIreduce. 
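+//
+// Ireduce narrows an integer value to a smaller integer type, e.g. i64 -> i32. Sketch
+// (illustrative; b and the i64 Value v are assumed to exist, and AllocateInstruction is assumed
+// to be the usual Builder allocation helper):
+//
+//	lo := b.AllocateInstruction().AsIreduce(v, TypeI32).Insert(b).Return()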
+func (i *Instruction) AsIreduce(v Value, dstType Type) *Instruction { + i.opcode = OpcodeIreduce + i.v = v + i.typ = dstType + return i +} + +// AsWiden initializes this instruction as a signed or unsigned widen instruction +// on low half or high half of the given vector with OpcodeSwidenLow, OpcodeUwidenLow, OpcodeSwidenHigh, OpcodeUwidenHigh. +func (i *Instruction) AsWiden(v Value, lane VecLane, signed, low bool) *Instruction { + switch { + case signed && low: + i.opcode = OpcodeSwidenLow + case !signed && low: + i.opcode = OpcodeUwidenLow + case signed && !low: + i.opcode = OpcodeSwidenHigh + case !signed && !low: + i.opcode = OpcodeUwidenHigh + } + i.v = v + i.u1 = uint64(lane) + return i +} + +// AsAtomicLoad initializes this instruction as an atomic load. +// The size is in bytes and must be 1, 2, 4, or 8. +func (i *Instruction) AsAtomicLoad(addr Value, size uint64, typ Type) *Instruction { + i.opcode = OpcodeAtomicLoad + i.u1 = size + i.v = addr + i.typ = typ + return i +} + +// AsAtomicLoad initializes this instruction as an atomic store. +// The size is in bytes and must be 1, 2, 4, or 8. +func (i *Instruction) AsAtomicStore(addr, val Value, size uint64) *Instruction { + i.opcode = OpcodeAtomicStore + i.u1 = size + i.v = addr + i.v2 = val + i.typ = val.Type() + return i +} + +// AsAtomicRmw initializes this instruction as an atomic read-modify-write. +// The size is in bytes and must be 1, 2, 4, or 8. +func (i *Instruction) AsAtomicRmw(op AtomicRmwOp, addr, val Value, size uint64) *Instruction { + i.opcode = OpcodeAtomicRmw + i.u1 = uint64(op) + i.u2 = size + i.v = addr + i.v2 = val + i.typ = val.Type() + return i +} + +// AsAtomicCas initializes this instruction as an atomic compare-and-swap. +// The size is in bytes and must be 1, 2, 4, or 8. +func (i *Instruction) AsAtomicCas(addr, exp, repl Value, size uint64) *Instruction { + i.opcode = OpcodeAtomicCas + i.u1 = size + i.v = addr + i.v2 = exp + i.v3 = repl + i.typ = repl.Type() + return i +} + +// AsFence initializes this instruction as a memory fence. +// A single byte immediate may be used to indicate fence ordering in the future +// but is currently always 0 and ignored. +func (i *Instruction) AsFence(order byte) *Instruction { + i.opcode = OpcodeFence + i.u1 = uint64(order) + return i +} + +// AtomicRmwData returns the data for this atomic read-modify-write instruction. +func (i *Instruction) AtomicRmwData() (op AtomicRmwOp, size uint64) { + return AtomicRmwOp(i.u1), i.u2 +} + +// AtomicTargetSize returns the target memory size of the atomic instruction. +func (i *Instruction) AtomicTargetSize() (size uint64) { + return i.u1 +} + +// ReturnVals returns the return values of OpcodeReturn. +func (i *Instruction) ReturnVals() []Value { + return i.vs.View() +} + +// AsExitWithCode initializes this instruction as a trap instruction with OpcodeExitWithCode. +func (i *Instruction) AsExitWithCode(ctx Value, code wazevoapi.ExitCode) { + i.opcode = OpcodeExitWithCode + i.v = ctx + i.u1 = uint64(code) +} + +// AsExitIfTrueWithCode initializes this instruction as a trap instruction with OpcodeExitIfTrueWithCode. +func (i *Instruction) AsExitIfTrueWithCode(ctx, c Value, code wazevoapi.ExitCode) *Instruction { + i.opcode = OpcodeExitIfTrueWithCode + i.v = ctx + i.v2 = c + i.u1 = uint64(code) + return i +} + +// ExitWithCodeData returns the context and exit code of OpcodeExitWithCode. 
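+//
+// Sketch of backend-side use (illustrative; instr is assumed to be an OpcodeExitWithCode
+// instruction being lowered):
+//
+//	execCtx, code := instr.ExitWithCodeData()
+//	// execCtx and code are then encoded into the generated exit/trap sequence.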
+func (i *Instruction) ExitWithCodeData() (ctx Value, code wazevoapi.ExitCode) { + return i.v, wazevoapi.ExitCode(i.u1) +} + +// ExitIfTrueWithCodeData returns the context and exit code of OpcodeExitWithCode. +func (i *Instruction) ExitIfTrueWithCodeData() (ctx, c Value, code wazevoapi.ExitCode) { + return i.v, i.v2, wazevoapi.ExitCode(i.u1) +} + +// InvertBrx inverts either OpcodeBrz or OpcodeBrnz to the other. +func (i *Instruction) InvertBrx() { + switch i.opcode { + case OpcodeBrz: + i.opcode = OpcodeBrnz + case OpcodeBrnz: + i.opcode = OpcodeBrz + default: + panic("BUG") + } +} + +// BranchData returns the branch data for this instruction necessary for backends. +func (i *Instruction) BranchData() (condVal Value, blockArgs []Value, target BasicBlock) { + switch i.opcode { + case OpcodeJump: + condVal = ValueInvalid + case OpcodeBrz, OpcodeBrnz: + condVal = i.v + default: + panic("BUG") + } + blockArgs = i.vs.View() + target = i.blk + return +} + +// BrTableData returns the branch table data for this instruction necessary for backends. +func (i *Instruction) BrTableData() (index Value, targets []BasicBlock) { + if i.opcode != OpcodeBrTable { + panic("BUG: BrTableData only available for OpcodeBrTable") + } + index = i.v + targets = i.targets + return +} + +// AsJump initializes this instruction as a jump instruction with OpcodeJump. +func (i *Instruction) AsJump(vs Values, target BasicBlock) *Instruction { + i.opcode = OpcodeJump + i.vs = vs + i.blk = target + return i +} + +// IsFallthroughJump returns true if this instruction is a fallthrough jump. +func (i *Instruction) IsFallthroughJump() bool { + if i.opcode != OpcodeJump { + panic("BUG: IsFallthrough only available for OpcodeJump") + } + return i.opcode == OpcodeJump && i.u1 != 0 +} + +// AsFallthroughJump marks this instruction as a fallthrough jump. +func (i *Instruction) AsFallthroughJump() { + if i.opcode != OpcodeJump { + panic("BUG: AsFallthroughJump only available for OpcodeJump") + } + i.u1 = 1 +} + +// AsBrz initializes this instruction as a branch-if-zero instruction with OpcodeBrz. +func (i *Instruction) AsBrz(v Value, args Values, target BasicBlock) { + i.opcode = OpcodeBrz + i.v = v + i.vs = args + i.blk = target +} + +// AsBrnz initializes this instruction as a branch-if-not-zero instruction with OpcodeBrnz. +func (i *Instruction) AsBrnz(v Value, args Values, target BasicBlock) *Instruction { + i.opcode = OpcodeBrnz + i.v = v + i.vs = args + i.blk = target + return i +} + +// AsBrTable initializes this instruction as a branch-table instruction with OpcodeBrTable. +func (i *Instruction) AsBrTable(index Value, targets []BasicBlock) { + i.opcode = OpcodeBrTable + i.v = index + i.targets = targets +} + +// AsCall initializes this instruction as a call instruction with OpcodeCall. +func (i *Instruction) AsCall(ref FuncRef, sig *Signature, args Values) { + i.opcode = OpcodeCall + i.u1 = uint64(ref) + i.vs = args + i.u2 = uint64(sig.ID) + sig.used = true +} + +// CallData returns the call data for this instruction necessary for backends. +func (i *Instruction) CallData() (ref FuncRef, sigID SignatureID, args []Value) { + if i.opcode != OpcodeCall { + panic("BUG: CallData only available for OpcodeCall") + } + ref = FuncRef(i.u1) + sigID = SignatureID(i.u2) + args = i.vs.View() + return +} + +// AsCallIndirect initializes this instruction as a call-indirect instruction with OpcodeCallIndirect. 
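+//
+// Sketch (illustrative; funcPtr, sig and args are assumed to have been prepared by the caller,
+// with args built from the builder's variable-length value pool, and AllocateInstruction assumed
+// to be the usual Builder allocation helper):
+//
+//	call := b.AllocateInstruction()
+//	call.AsCallIndirect(funcPtr, sig, args)
+//	call.Insert(b)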
+func (i *Instruction) AsCallIndirect(funcPtr Value, sig *Signature, args Values) *Instruction { + i.opcode = OpcodeCallIndirect + i.typ = TypeF64 + i.vs = args + i.v = funcPtr + i.u1 = uint64(sig.ID) + sig.used = true + return i +} + +// AsCallGoRuntimeMemmove is the same as AsCallIndirect, but with a special flag set to indicate that it is a call to the Go runtime memmove function. +func (i *Instruction) AsCallGoRuntimeMemmove(funcPtr Value, sig *Signature, args Values) *Instruction { + i.AsCallIndirect(funcPtr, sig, args) + i.u2 = 1 + return i +} + +// CallIndirectData returns the call indirect data for this instruction necessary for backends. +func (i *Instruction) CallIndirectData() (funcPtr Value, sigID SignatureID, args []Value, isGoMemmove bool) { + if i.opcode != OpcodeCallIndirect { + panic("BUG: CallIndirectData only available for OpcodeCallIndirect") + } + funcPtr = i.v + sigID = SignatureID(i.u1) + args = i.vs.View() + isGoMemmove = i.u2 == 1 + return +} + +// AsClz initializes this instruction as a Count Leading Zeroes instruction with OpcodeClz. +func (i *Instruction) AsClz(x Value) { + i.opcode = OpcodeClz + i.v = x + i.typ = x.Type() +} + +// AsCtz initializes this instruction as a Count Trailing Zeroes instruction with OpcodeCtz. +func (i *Instruction) AsCtz(x Value) { + i.opcode = OpcodeCtz + i.v = x + i.typ = x.Type() +} + +// AsPopcnt initializes this instruction as a Population Count instruction with OpcodePopcnt. +func (i *Instruction) AsPopcnt(x Value) { + i.opcode = OpcodePopcnt + i.v = x + i.typ = x.Type() +} + +// AsFneg initializes this instruction as an instruction with OpcodeFneg. +func (i *Instruction) AsFneg(x Value) *Instruction { + i.opcode = OpcodeFneg + i.v = x + i.typ = x.Type() + return i +} + +// AsSqrt initializes this instruction as an instruction with OpcodeSqrt. +func (i *Instruction) AsSqrt(x Value) *Instruction { + i.opcode = OpcodeSqrt + i.v = x + i.typ = x.Type() + return i +} + +// AsFabs initializes this instruction as an instruction with OpcodeFabs. +func (i *Instruction) AsFabs(x Value) *Instruction { + i.opcode = OpcodeFabs + i.v = x + i.typ = x.Type() + return i +} + +// AsFcopysign initializes this instruction as an instruction with OpcodeFcopysign. +func (i *Instruction) AsFcopysign(x, y Value) *Instruction { + i.opcode = OpcodeFcopysign + i.v = x + i.v2 = y + i.typ = x.Type() + return i +} + +// AsCeil initializes this instruction as an instruction with OpcodeCeil. +func (i *Instruction) AsCeil(x Value) *Instruction { + i.opcode = OpcodeCeil + i.v = x + i.typ = x.Type() + return i +} + +// AsFloor initializes this instruction as an instruction with OpcodeFloor. +func (i *Instruction) AsFloor(x Value) *Instruction { + i.opcode = OpcodeFloor + i.v = x + i.typ = x.Type() + return i +} + +// AsTrunc initializes this instruction as an instruction with OpcodeTrunc. +func (i *Instruction) AsTrunc(x Value) *Instruction { + i.opcode = OpcodeTrunc + i.v = x + i.typ = x.Type() + return i +} + +// AsNearest initializes this instruction as an instruction with OpcodeNearest. +func (i *Instruction) AsNearest(x Value) *Instruction { + i.opcode = OpcodeNearest + i.v = x + i.typ = x.Type() + return i +} + +// AsBitcast initializes this instruction as an instruction with OpcodeBitcast. +func (i *Instruction) AsBitcast(x Value, dstType Type) *Instruction { + i.opcode = OpcodeBitcast + i.v = x + i.typ = dstType + return i +} + +// BitcastData returns the operands for a bitcast instruction. 
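+//
+// For an instruction built with AsBitcast(x, dstType), this returns x and dstType: the value
+// whose bits are reinterpreted, and the type they are reinterpreted as.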
+func (i *Instruction) BitcastData() (x Value, dstType Type) { + return i.v, i.typ +} + +// AsFdemote initializes this instruction as an instruction with OpcodeFdemote. +func (i *Instruction) AsFdemote(x Value) { + i.opcode = OpcodeFdemote + i.v = x + i.typ = TypeF32 +} + +// AsFpromote initializes this instruction as an instruction with OpcodeFpromote. +func (i *Instruction) AsFpromote(x Value) { + i.opcode = OpcodeFpromote + i.v = x + i.typ = TypeF64 +} + +// AsFcvtFromInt initializes this instruction as an instruction with either OpcodeFcvtFromUint or OpcodeFcvtFromSint +func (i *Instruction) AsFcvtFromInt(x Value, signed bool, dst64bit bool) *Instruction { + if signed { + i.opcode = OpcodeFcvtFromSint + } else { + i.opcode = OpcodeFcvtFromUint + } + i.v = x + if dst64bit { + i.typ = TypeF64 + } else { + i.typ = TypeF32 + } + return i +} + +// AsFcvtToInt initializes this instruction as an instruction with either OpcodeFcvtToUint or OpcodeFcvtToSint +func (i *Instruction) AsFcvtToInt(x, ctx Value, signed bool, dst64bit bool, sat bool) *Instruction { + switch { + case signed && !sat: + i.opcode = OpcodeFcvtToSint + case !signed && !sat: + i.opcode = OpcodeFcvtToUint + case signed && sat: + i.opcode = OpcodeFcvtToSintSat + case !signed && sat: + i.opcode = OpcodeFcvtToUintSat + } + i.v = x + i.v2 = ctx + if dst64bit { + i.typ = TypeI64 + } else { + i.typ = TypeI32 + } + return i +} + +// AsVFcvtToIntSat initializes this instruction as an instruction with either OpcodeVFcvtToSintSat or OpcodeVFcvtToUintSat +func (i *Instruction) AsVFcvtToIntSat(x Value, lane VecLane, signed bool) *Instruction { + if signed { + i.opcode = OpcodeVFcvtToSintSat + } else { + i.opcode = OpcodeVFcvtToUintSat + } + i.v = x + i.u1 = uint64(lane) + return i +} + +// AsVFcvtFromInt initializes this instruction as an instruction with either OpcodeVFcvtToSintSat or OpcodeVFcvtToUintSat +func (i *Instruction) AsVFcvtFromInt(x Value, lane VecLane, signed bool) *Instruction { + if signed { + i.opcode = OpcodeVFcvtFromSint + } else { + i.opcode = OpcodeVFcvtFromUint + } + i.v = x + i.u1 = uint64(lane) + return i +} + +// AsNarrow initializes this instruction as an instruction with either OpcodeSnarrow or OpcodeUnarrow +func (i *Instruction) AsNarrow(x, y Value, lane VecLane, signed bool) *Instruction { + if signed { + i.opcode = OpcodeSnarrow + } else { + i.opcode = OpcodeUnarrow + } + i.v = x + i.v2 = y + i.u1 = uint64(lane) + return i +} + +// AsFvpromoteLow initializes this instruction as an instruction with OpcodeFvpromoteLow +func (i *Instruction) AsFvpromoteLow(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeFvpromoteLow + i.v = x + i.u1 = uint64(lane) + return i +} + +// AsFvdemote initializes this instruction as an instruction with OpcodeFvdemote +func (i *Instruction) AsFvdemote(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeFvdemote + i.v = x + i.u1 = uint64(lane) + return i +} + +// AsSExtend initializes this instruction as a sign extension instruction with OpcodeSExtend. +func (i *Instruction) AsSExtend(v Value, from, to byte) *Instruction { + i.opcode = OpcodeSExtend + i.v = v + i.u1 = uint64(from)<<8 | uint64(to) + if to == 64 { + i.typ = TypeI64 + } else { + i.typ = TypeI32 + } + return i +} + +// AsUExtend initializes this instruction as an unsigned extension instruction with OpcodeUExtend. 
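+//
+// The source and destination widths are packed into u1 as from<<8|to and can be read back via
+// ExtendData or ExtendFromToBits. Sketch (illustrative; b and the i32 Value v are assumed to
+// exist, and AllocateInstruction is assumed to be the usual Builder allocation helper):
+//
+//	extend := b.AllocateInstruction().AsUExtend(v, 32, 64).Insert(b)
+//	from, to, signed := extend.ExtendData() // 32, 64, false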
+func (i *Instruction) AsUExtend(v Value, from, to byte) *Instruction { + i.opcode = OpcodeUExtend + i.v = v + i.u1 = uint64(from)<<8 | uint64(to) + if to == 64 { + i.typ = TypeI64 + } else { + i.typ = TypeI32 + } + return i +} + +func (i *Instruction) ExtendData() (from, to byte, signed bool) { + if i.opcode != OpcodeSExtend && i.opcode != OpcodeUExtend { + panic("BUG: ExtendData only available for OpcodeSExtend and OpcodeUExtend") + } + from = byte(i.u1 >> 8) + to = byte(i.u1) + signed = i.opcode == OpcodeSExtend + return +} + +// AsSelect initializes this instruction as an unsigned extension instruction with OpcodeSelect. +func (i *Instruction) AsSelect(c, x, y Value) *Instruction { + i.opcode = OpcodeSelect + i.v = c + i.v2 = x + i.v3 = y + i.typ = x.Type() + return i +} + +// SelectData returns the select data for this instruction necessary for backends. +func (i *Instruction) SelectData() (c, x, y Value) { + c = i.v + x = i.v2 + y = i.v3 + return +} + +// ExtendFromToBits returns the from and to bit size for the extension instruction. +func (i *Instruction) ExtendFromToBits() (from, to byte) { + from = byte(i.u1 >> 8) + to = byte(i.u1) + return +} + +// Format returns a string representation of this instruction with the given builder. +// For debugging purposes only. +func (i *Instruction) Format(b Builder) string { + var instSuffix string + switch i.opcode { + case OpcodeExitWithCode: + instSuffix = fmt.Sprintf(" %s, %s", i.v.Format(b), wazevoapi.ExitCode(i.u1)) + case OpcodeExitIfTrueWithCode: + instSuffix = fmt.Sprintf(" %s, %s, %s", i.v2.Format(b), i.v.Format(b), wazevoapi.ExitCode(i.u1)) + case OpcodeIadd, OpcodeIsub, OpcodeImul, OpcodeFadd, OpcodeFsub, OpcodeFmin, OpcodeFmax, OpcodeFdiv, OpcodeFmul: + instSuffix = fmt.Sprintf(" %s, %s", i.v.Format(b), i.v2.Format(b)) + case OpcodeIcmp: + instSuffix = fmt.Sprintf(" %s, %s, %s", IntegerCmpCond(i.u1), i.v.Format(b), i.v2.Format(b)) + case OpcodeFcmp: + instSuffix = fmt.Sprintf(" %s, %s, %s", FloatCmpCond(i.u1), i.v.Format(b), i.v2.Format(b)) + case OpcodeSExtend, OpcodeUExtend: + instSuffix = fmt.Sprintf(" %s, %d->%d", i.v.Format(b), i.u1>>8, i.u1&0xff) + case OpcodeCall, OpcodeCallIndirect: + view := i.vs.View() + vs := make([]string, len(view)) + for idx := range vs { + vs[idx] = view[idx].Format(b) + } + if i.opcode == OpcodeCallIndirect { + instSuffix = fmt.Sprintf(" %s:%s, %s", i.v.Format(b), SignatureID(i.u1), strings.Join(vs, ", ")) + } else { + instSuffix = fmt.Sprintf(" %s:%s, %s", FuncRef(i.u1), SignatureID(i.u2), strings.Join(vs, ", ")) + } + case OpcodeStore, OpcodeIstore8, OpcodeIstore16, OpcodeIstore32: + instSuffix = fmt.Sprintf(" %s, %s, %#x", i.v.Format(b), i.v2.Format(b), uint32(i.u1)) + case OpcodeLoad, OpcodeVZeroExtLoad: + instSuffix = fmt.Sprintf(" %s, %#x", i.v.Format(b), int32(i.u1)) + case OpcodeLoadSplat: + instSuffix = fmt.Sprintf(".%s %s, %#x", VecLane(i.u2), i.v.Format(b), int32(i.u1)) + case OpcodeUload8, OpcodeUload16, OpcodeUload32, OpcodeSload8, OpcodeSload16, OpcodeSload32: + instSuffix = fmt.Sprintf(" %s, %#x", i.v.Format(b), int32(i.u1)) + case OpcodeSelect, OpcodeVbitselect: + instSuffix = fmt.Sprintf(" %s, %s, %s", i.v.Format(b), i.v2.Format(b), i.v3.Format(b)) + case OpcodeIconst: + switch i.typ { + case TypeI32: + instSuffix = fmt.Sprintf("_32 %#x", uint32(i.u1)) + case TypeI64: + instSuffix = fmt.Sprintf("_64 %#x", i.u1) + } + case OpcodeVconst: + instSuffix = fmt.Sprintf(" %016x %016x", i.u1, i.u2) + case OpcodeF32const: + instSuffix = fmt.Sprintf(" %f", math.Float32frombits(uint32(i.u1))) 
+ case OpcodeF64const: + instSuffix = fmt.Sprintf(" %f", math.Float64frombits(i.u1)) + case OpcodeReturn: + view := i.vs.View() + if len(view) == 0 { + break + } + vs := make([]string, len(view)) + for idx := range vs { + vs[idx] = view[idx].Format(b) + } + instSuffix = fmt.Sprintf(" %s", strings.Join(vs, ", ")) + case OpcodeJump: + view := i.vs.View() + vs := make([]string, len(view)+1) + if i.IsFallthroughJump() { + vs[0] = " fallthrough" + } else { + vs[0] = " " + i.blk.(*basicBlock).Name() + } + for idx := range view { + vs[idx+1] = view[idx].Format(b) + } + + instSuffix = strings.Join(vs, ", ") + case OpcodeBrz, OpcodeBrnz: + view := i.vs.View() + vs := make([]string, len(view)+2) + vs[0] = " " + i.v.Format(b) + vs[1] = i.blk.(*basicBlock).Name() + for idx := range view { + vs[idx+2] = view[idx].Format(b) + } + instSuffix = strings.Join(vs, ", ") + case OpcodeBrTable: + // `BrTable index, [label1, label2, ... labelN]` + instSuffix = fmt.Sprintf(" %s", i.v.Format(b)) + instSuffix += ", [" + for i, target := range i.targets { + blk := target.(*basicBlock) + if i == 0 { + instSuffix += blk.Name() + } else { + instSuffix += ", " + blk.Name() + } + } + instSuffix += "]" + case OpcodeBand, OpcodeBor, OpcodeBxor, OpcodeRotr, OpcodeRotl, OpcodeIshl, OpcodeSshr, OpcodeUshr, + OpcodeSdiv, OpcodeUdiv, OpcodeFcopysign, OpcodeSrem, OpcodeUrem, + OpcodeVbnot, OpcodeVbxor, OpcodeVbor, OpcodeVband, OpcodeVbandnot, OpcodeVIcmp, OpcodeVFcmp: + instSuffix = fmt.Sprintf(" %s, %s", i.v.Format(b), i.v2.Format(b)) + case OpcodeUndefined: + case OpcodeClz, OpcodeCtz, OpcodePopcnt, OpcodeFneg, OpcodeFcvtToSint, OpcodeFcvtToUint, OpcodeFcvtFromSint, + OpcodeFcvtFromUint, OpcodeFcvtToSintSat, OpcodeFcvtToUintSat, OpcodeFdemote, OpcodeFpromote, OpcodeIreduce, OpcodeBitcast, OpcodeSqrt, OpcodeFabs, + OpcodeCeil, OpcodeFloor, OpcodeTrunc, OpcodeNearest: + instSuffix = " " + i.v.Format(b) + case OpcodeVIadd, OpcodeExtIaddPairwise, OpcodeVSaddSat, OpcodeVUaddSat, OpcodeVIsub, OpcodeVSsubSat, OpcodeVUsubSat, + OpcodeVImin, OpcodeVUmin, OpcodeVImax, OpcodeVUmax, OpcodeVImul, OpcodeVAvgRound, + OpcodeVFadd, OpcodeVFsub, OpcodeVFmul, OpcodeVFdiv, + OpcodeVIshl, OpcodeVSshr, OpcodeVUshr, + OpcodeVFmin, OpcodeVFmax, OpcodeVMinPseudo, OpcodeVMaxPseudo, + OpcodeSnarrow, OpcodeUnarrow, OpcodeSwizzle, OpcodeSqmulRoundSat: + instSuffix = fmt.Sprintf(".%s %s, %s", VecLane(i.u1), i.v.Format(b), i.v2.Format(b)) + case OpcodeVIabs, OpcodeVIneg, OpcodeVIpopcnt, OpcodeVhighBits, OpcodeVallTrue, OpcodeVanyTrue, + OpcodeVFabs, OpcodeVFneg, OpcodeVSqrt, OpcodeVCeil, OpcodeVFloor, OpcodeVTrunc, OpcodeVNearest, + OpcodeVFcvtToUintSat, OpcodeVFcvtToSintSat, OpcodeVFcvtFromUint, OpcodeVFcvtFromSint, + OpcodeFvpromoteLow, OpcodeFvdemote, OpcodeSwidenLow, OpcodeUwidenLow, OpcodeSwidenHigh, OpcodeUwidenHigh, + OpcodeSplat: + instSuffix = fmt.Sprintf(".%s %s", VecLane(i.u1), i.v.Format(b)) + case OpcodeExtractlane: + var signedness string + if i.u1 != 0 { + signedness = "signed" + } else { + signedness = "unsigned" + } + instSuffix = fmt.Sprintf(".%s %d, %s (%s)", VecLane(i.u2), 0x0000FFFF&i.u1, i.v.Format(b), signedness) + case OpcodeInsertlane: + instSuffix = fmt.Sprintf(".%s %d, %s, %s", VecLane(i.u2), i.u1, i.v.Format(b), i.v2.Format(b)) + case OpcodeShuffle: + lanes := make([]byte, 16) + for idx := 0; idx < 8; idx++ { + lanes[idx] = byte(i.u1 >> (8 * idx)) + } + for idx := 0; idx < 8; idx++ { + lanes[idx+8] = byte(i.u2 >> (8 * idx)) + } + // Prints Shuffle.[0 1 2 3 4 5 6 7 ...] 
v2, v3 + instSuffix = fmt.Sprintf(".%v %s, %s", lanes, i.v.Format(b), i.v2.Format(b)) + case OpcodeAtomicRmw: + instSuffix = fmt.Sprintf(" %s_%d, %s, %s", AtomicRmwOp(i.u1), 8*i.u2, i.v.Format(b), i.v2.Format(b)) + case OpcodeAtomicLoad: + instSuffix = fmt.Sprintf("_%d, %s", 8*i.u1, i.v.Format(b)) + case OpcodeAtomicStore: + instSuffix = fmt.Sprintf("_%d, %s, %s", 8*i.u1, i.v.Format(b), i.v2.Format(b)) + case OpcodeAtomicCas: + instSuffix = fmt.Sprintf("_%d, %s, %s, %s", 8*i.u1, i.v.Format(b), i.v2.Format(b), i.v3.Format(b)) + case OpcodeFence: + instSuffix = fmt.Sprintf(" %d", i.u1) + case OpcodeWideningPairwiseDotProductS: + instSuffix = fmt.Sprintf(" %s, %s", i.v.Format(b), i.v2.Format(b)) + default: + panic(fmt.Sprintf("TODO: format for %s", i.opcode)) + } + + instr := i.opcode.String() + instSuffix + + var rvs []string + if rv := i.rValue; rv.Valid() { + rvs = append(rvs, rv.formatWithType(b)) + } + + for _, v := range i.rValues.View() { + rvs = append(rvs, v.formatWithType(b)) + } + + if len(rvs) > 0 { + return fmt.Sprintf("%s = %s", strings.Join(rvs, ", "), instr) + } else { + return instr + } +} + +// addArgumentBranchInst adds an argument to this instruction. +func (i *Instruction) addArgumentBranchInst(b *builder, v Value) { + switch i.opcode { + case OpcodeJump, OpcodeBrz, OpcodeBrnz: + i.vs = i.vs.Append(&b.varLengthPool, v) + default: + panic("BUG: " + i.opcode.String()) + } +} + +// Constant returns true if this instruction is a constant instruction. +func (i *Instruction) Constant() bool { + switch i.opcode { + case OpcodeIconst, OpcodeF32const, OpcodeF64const: + return true + } + return false +} + +// ConstantVal returns the constant value of this instruction. +// How to interpret the return value depends on the opcode. +func (i *Instruction) ConstantVal() (ret uint64) { + switch i.opcode { + case OpcodeIconst, OpcodeF32const, OpcodeF64const: + ret = i.u1 + default: + panic("TODO") + } + return +} + +// String implements fmt.Stringer. 
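+//
+// For example, OpcodeVIshl.String() returns "VIshl", and OpcodeExitWithCode.String() returns "Exit".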
+func (o Opcode) String() (ret string) { + switch o { + case OpcodeInvalid: + return "invalid" + case OpcodeUndefined: + return "Undefined" + case OpcodeJump: + return "Jump" + case OpcodeBrz: + return "Brz" + case OpcodeBrnz: + return "Brnz" + case OpcodeBrTable: + return "BrTable" + case OpcodeExitWithCode: + return "Exit" + case OpcodeExitIfTrueWithCode: + return "ExitIfTrue" + case OpcodeReturn: + return "Return" + case OpcodeCall: + return "Call" + case OpcodeCallIndirect: + return "CallIndirect" + case OpcodeSplat: + return "Splat" + case OpcodeSwizzle: + return "Swizzle" + case OpcodeInsertlane: + return "Insertlane" + case OpcodeExtractlane: + return "Extractlane" + case OpcodeLoad: + return "Load" + case OpcodeLoadSplat: + return "LoadSplat" + case OpcodeStore: + return "Store" + case OpcodeUload8: + return "Uload8" + case OpcodeSload8: + return "Sload8" + case OpcodeIstore8: + return "Istore8" + case OpcodeUload16: + return "Uload16" + case OpcodeSload16: + return "Sload16" + case OpcodeIstore16: + return "Istore16" + case OpcodeUload32: + return "Uload32" + case OpcodeSload32: + return "Sload32" + case OpcodeIstore32: + return "Istore32" + case OpcodeIconst: + return "Iconst" + case OpcodeF32const: + return "F32const" + case OpcodeF64const: + return "F64const" + case OpcodeVconst: + return "Vconst" + case OpcodeShuffle: + return "Shuffle" + case OpcodeSelect: + return "Select" + case OpcodeVanyTrue: + return "VanyTrue" + case OpcodeVallTrue: + return "VallTrue" + case OpcodeVhighBits: + return "VhighBits" + case OpcodeIcmp: + return "Icmp" + case OpcodeIcmpImm: + return "IcmpImm" + case OpcodeVIcmp: + return "VIcmp" + case OpcodeIadd: + return "Iadd" + case OpcodeIsub: + return "Isub" + case OpcodeImul: + return "Imul" + case OpcodeUdiv: + return "Udiv" + case OpcodeSdiv: + return "Sdiv" + case OpcodeUrem: + return "Urem" + case OpcodeSrem: + return "Srem" + case OpcodeBand: + return "Band" + case OpcodeBor: + return "Bor" + case OpcodeBxor: + return "Bxor" + case OpcodeBnot: + return "Bnot" + case OpcodeRotl: + return "Rotl" + case OpcodeRotr: + return "Rotr" + case OpcodeIshl: + return "Ishl" + case OpcodeUshr: + return "Ushr" + case OpcodeSshr: + return "Sshr" + case OpcodeClz: + return "Clz" + case OpcodeCtz: + return "Ctz" + case OpcodePopcnt: + return "Popcnt" + case OpcodeFcmp: + return "Fcmp" + case OpcodeFadd: + return "Fadd" + case OpcodeFsub: + return "Fsub" + case OpcodeFmul: + return "Fmul" + case OpcodeFdiv: + return "Fdiv" + case OpcodeSqmulRoundSat: + return "SqmulRoundSat" + case OpcodeSqrt: + return "Sqrt" + case OpcodeFneg: + return "Fneg" + case OpcodeFabs: + return "Fabs" + case OpcodeFcopysign: + return "Fcopysign" + case OpcodeFmin: + return "Fmin" + case OpcodeFmax: + return "Fmax" + case OpcodeCeil: + return "Ceil" + case OpcodeFloor: + return "Floor" + case OpcodeTrunc: + return "Trunc" + case OpcodeNearest: + return "Nearest" + case OpcodeBitcast: + return "Bitcast" + case OpcodeIreduce: + return "Ireduce" + case OpcodeSnarrow: + return "Snarrow" + case OpcodeUnarrow: + return "Unarrow" + case OpcodeSwidenLow: + return "SwidenLow" + case OpcodeSwidenHigh: + return "SwidenHigh" + case OpcodeUwidenLow: + return "UwidenLow" + case OpcodeUwidenHigh: + return "UwidenHigh" + case OpcodeExtIaddPairwise: + return "IaddPairwise" + case OpcodeWideningPairwiseDotProductS: + return "WideningPairwiseDotProductS" + case OpcodeUExtend: + return "UExtend" + case OpcodeSExtend: + return "SExtend" + case OpcodeFpromote: + return "Fpromote" + case OpcodeFdemote: + return 
"Fdemote" + case OpcodeFvdemote: + return "Fvdemote" + case OpcodeFcvtToUint: + return "FcvtToUint" + case OpcodeFcvtToSint: + return "FcvtToSint" + case OpcodeFcvtToUintSat: + return "FcvtToUintSat" + case OpcodeFcvtToSintSat: + return "FcvtToSintSat" + case OpcodeFcvtFromUint: + return "FcvtFromUint" + case OpcodeFcvtFromSint: + return "FcvtFromSint" + case OpcodeAtomicRmw: + return "AtomicRmw" + case OpcodeAtomicCas: + return "AtomicCas" + case OpcodeAtomicLoad: + return "AtomicLoad" + case OpcodeAtomicStore: + return "AtomicStore" + case OpcodeFence: + return "Fence" + case OpcodeVbor: + return "Vbor" + case OpcodeVbxor: + return "Vbxor" + case OpcodeVband: + return "Vband" + case OpcodeVbandnot: + return "Vbandnot" + case OpcodeVbnot: + return "Vbnot" + case OpcodeVbitselect: + return "Vbitselect" + case OpcodeVIadd: + return "VIadd" + case OpcodeVSaddSat: + return "VSaddSat" + case OpcodeVUaddSat: + return "VUaddSat" + case OpcodeVSsubSat: + return "VSsubSat" + case OpcodeVUsubSat: + return "VUsubSat" + case OpcodeVAvgRound: + return "OpcodeVAvgRound" + case OpcodeVIsub: + return "VIsub" + case OpcodeVImin: + return "VImin" + case OpcodeVUmin: + return "VUmin" + case OpcodeVImax: + return "VImax" + case OpcodeVUmax: + return "VUmax" + case OpcodeVImul: + return "VImul" + case OpcodeVIabs: + return "VIabs" + case OpcodeVIneg: + return "VIneg" + case OpcodeVIpopcnt: + return "VIpopcnt" + case OpcodeVIshl: + return "VIshl" + case OpcodeVUshr: + return "VUshr" + case OpcodeVSshr: + return "VSshr" + case OpcodeVFabs: + return "VFabs" + case OpcodeVFmax: + return "VFmax" + case OpcodeVFmin: + return "VFmin" + case OpcodeVFneg: + return "VFneg" + case OpcodeVFadd: + return "VFadd" + case OpcodeVFsub: + return "VFsub" + case OpcodeVFmul: + return "VFmul" + case OpcodeVFdiv: + return "VFdiv" + case OpcodeVFcmp: + return "VFcmp" + case OpcodeVCeil: + return "VCeil" + case OpcodeVFloor: + return "VFloor" + case OpcodeVTrunc: + return "VTrunc" + case OpcodeVNearest: + return "VNearest" + case OpcodeVMaxPseudo: + return "VMaxPseudo" + case OpcodeVMinPseudo: + return "VMinPseudo" + case OpcodeVSqrt: + return "VSqrt" + case OpcodeVFcvtToUintSat: + return "VFcvtToUintSat" + case OpcodeVFcvtToSintSat: + return "VFcvtToSintSat" + case OpcodeVFcvtFromUint: + return "VFcvtFromUint" + case OpcodeVFcvtFromSint: + return "VFcvtFromSint" + case OpcodeFvpromoteLow: + return "FvpromoteLow" + case OpcodeVZeroExtLoad: + return "VZeroExtLoad" + } + panic(fmt.Sprintf("unknown opcode %d", o)) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass.go new file mode 100644 index 000000000..a2e986cd1 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass.go @@ -0,0 +1,417 @@ +package ssa + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// RunPasses implements Builder.RunPasses. +// +// The order here matters; some pass depends on the previous ones. +// +// Note that passes suffixed with "Opt" are the optimization passes, meaning that they edit the instructions and blocks +// while the other passes are not, like passEstimateBranchProbabilities does not edit them, but only calculates the additional information. 
+func (b *builder) RunPasses() { + b.runPreBlockLayoutPasses() + b.runBlockLayoutPass() + b.runPostBlockLayoutPasses() + b.runFinalizingPasses() +} + +func (b *builder) runPreBlockLayoutPasses() { + passSortSuccessors(b) + passDeadBlockEliminationOpt(b) + passRedundantPhiEliminationOpt(b) + // The result of passCalculateImmediateDominators will be used by various passes below. + passCalculateImmediateDominators(b) + passNopInstElimination(b) + + // TODO: implement either conversion of irreducible CFG into reducible one, or irreducible CFG detection where we panic. + // WebAssembly program shouldn't result in irreducible CFG, but we should handle it properly in just in case. + // See FixIrreducible pass in LLVM: https://llvm.org/doxygen/FixIrreducible_8cpp_source.html + + // TODO: implement more optimization passes like: + // block coalescing. + // Copy-propagation. + // Constant folding. + // Common subexpression elimination. + // Arithmetic simplifications. + // and more! + + // passDeadCodeEliminationOpt could be more accurate if we do this after other optimizations. + passDeadCodeEliminationOpt(b) + b.donePreBlockLayoutPasses = true +} + +func (b *builder) runBlockLayoutPass() { + if !b.donePreBlockLayoutPasses { + panic("runBlockLayoutPass must be called after all pre passes are done") + } + passLayoutBlocks(b) + b.doneBlockLayout = true +} + +// runPostBlockLayoutPasses runs the post block layout passes. After this point, CFG is somewhat stable, +// but still can be modified before finalizing passes. At this point, critical edges are split by passLayoutBlocks. +func (b *builder) runPostBlockLayoutPasses() { + if !b.doneBlockLayout { + panic("runPostBlockLayoutPasses must be called after block layout pass is done") + } + // TODO: Do more. e.g. tail duplication, loop unrolling, etc. + + b.donePostBlockLayoutPasses = true +} + +// runFinalizingPasses runs the finalizing passes. After this point, CFG should not be modified. +func (b *builder) runFinalizingPasses() { + if !b.donePostBlockLayoutPasses { + panic("runFinalizingPasses must be called after post block layout passes are done") + } + // Critical edges are split, so we fix the loop nesting forest. + passBuildLoopNestingForest(b) + passBuildDominatorTree(b) + // Now that we know the final placement of the blocks, we can explicitly mark the fallthrough jumps. + b.markFallthroughJumps() +} + +// passDeadBlockEliminationOpt searches the unreachable blocks, and sets the basicBlock.invalid flag true if so. +func passDeadBlockEliminationOpt(b *builder) { + entryBlk := b.entryBlk() + b.clearBlkVisited() + b.blkStack = append(b.blkStack, entryBlk) + for len(b.blkStack) > 0 { + reachableBlk := b.blkStack[len(b.blkStack)-1] + b.blkStack = b.blkStack[:len(b.blkStack)-1] + b.blkVisited[reachableBlk] = 0 // the value won't be used in this pass. + + if !reachableBlk.sealed && !reachableBlk.ReturnBlock() { + panic(fmt.Sprintf("%s is not sealed", reachableBlk)) + } + + if wazevoapi.SSAValidationEnabled { + reachableBlk.validate(b) + } + + for _, succ := range reachableBlk.success { + if _, ok := b.blkVisited[succ]; ok { + continue + } + b.blkStack = append(b.blkStack, succ) + } + } + + for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() { + if _, ok := b.blkVisited[blk]; !ok { + blk.invalid = true + } + } +} + +// passRedundantPhiEliminationOpt eliminates the redundant PHIs (in our terminology, parameters of a block). 
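+//
+// For example (illustrative SSA): if every predecessor of blk1 passes the same value v1 as the
+// argument for blk1's parameter v9 (arguments that are v9 itself, e.g. from a loop back-edge,
+// are ignored), then v9 is redundant:
+//
+//	blk0: ... Jump blk1, v1
+//	blk2: ... Jump blk1, v1
+//	blk1(v9: i32): ...
+//
+// The parameter and the corresponding branch arguments are removed, and v9 is aliased to v1.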
+func passRedundantPhiEliminationOpt(b *builder) { + redundantParameterIndexes := b.ints[:0] // reuse the slice from previous iterations. + + // TODO: this might be costly for large programs, but at least, as far as I did the experiment, it's almost the + // same as the single iteration version in terms of the overall compilation time. That *might be* mostly thanks to the fact + // that removing many PHIs results in the reduction of the total instructions, not because of this indefinite iteration is + // relatively small. For example, sqlite speedtest binary results in the large number of redundant PHIs, + // the maximum number of iteration was 22, which seems to be acceptable but not that small either since the + // complexity here is O(BlockNum * Iterations) at the worst case where BlockNum might be the order of thousands. + for { + changed := false + _ = b.blockIteratorBegin() // skip entry block! + // Below, we intentionally use the named iteration variable name, as this comes with inevitable nested for loops! + for blk := b.blockIteratorNext(); blk != nil; blk = b.blockIteratorNext() { + paramNum := len(blk.params) + + for paramIndex := 0; paramIndex < paramNum; paramIndex++ { + phiValue := blk.params[paramIndex].value + redundant := true + + nonSelfReferencingValue := ValueInvalid + for predIndex := range blk.preds { + br := blk.preds[predIndex].branch + // Resolve the alias in the arguments so that we could use the previous iteration's result. + b.resolveArgumentAlias(br) + pred := br.vs.View()[paramIndex] + if pred == phiValue { + // This is self-referencing: PHI from the same PHI. + continue + } + + if !nonSelfReferencingValue.Valid() { + nonSelfReferencingValue = pred + continue + } + + if nonSelfReferencingValue != pred { + redundant = false + break + } + } + + if !nonSelfReferencingValue.Valid() { + // This shouldn't happen, and must be a bug in builder.go. + panic("BUG: params added but only self-referencing") + } + + if redundant { + b.redundantParameterIndexToValue[paramIndex] = nonSelfReferencingValue + redundantParameterIndexes = append(redundantParameterIndexes, paramIndex) + } + } + + if len(b.redundantParameterIndexToValue) == 0 { + continue + } + changed = true + + // Remove the redundant PHIs from the argument list of branching instructions. + for predIndex := range blk.preds { + var cur int + predBlk := blk.preds[predIndex] + branchInst := predBlk.branch + view := branchInst.vs.View() + for argIndex, value := range view { + if _, ok := b.redundantParameterIndexToValue[argIndex]; !ok { + view[cur] = value + cur++ + } + } + branchInst.vs.Cut(cur) + } + + // Still need to have the definition of the value of the PHI (previously as the parameter). + for _, redundantParamIndex := range redundantParameterIndexes { + phiValue := blk.params[redundantParamIndex].value + onlyValue := b.redundantParameterIndexToValue[redundantParamIndex] + // Create an alias in this block from the only phi argument to the phi value. + b.alias(phiValue, onlyValue) + } + + // Finally, Remove the param from the blk. + var cur int + for paramIndex := 0; paramIndex < paramNum; paramIndex++ { + param := blk.params[paramIndex] + if _, ok := b.redundantParameterIndexToValue[paramIndex]; !ok { + blk.params[cur] = param + cur++ + } + } + blk.params = blk.params[:cur] + + // Clears the map for the next iteration. 
+ for _, paramIndex := range redundantParameterIndexes { + delete(b.redundantParameterIndexToValue, paramIndex) + } + redundantParameterIndexes = redundantParameterIndexes[:0] + } + + if !changed { + break + } + } + + // Reuse the slice for the future passes. + b.ints = redundantParameterIndexes +} + +// passDeadCodeEliminationOpt traverses all the instructions, and calculates the reference count of each Value, and +// eliminates all the unnecessary instructions whose ref count is zero. +// The results are stored at builder.valueRefCounts. This also assigns a InstructionGroupID to each Instruction +// during the process. This is the last SSA-level optimization pass and after this, +// the SSA function is ready to be used by backends. +// +// TODO: the algorithm here might not be efficient. Get back to this later. +func passDeadCodeEliminationOpt(b *builder) { + nvid := int(b.nextValueID) + if nvid >= len(b.valueRefCounts) { + b.valueRefCounts = append(b.valueRefCounts, make([]int, b.nextValueID)...) + } + if nvid >= len(b.valueIDToInstruction) { + b.valueIDToInstruction = append(b.valueIDToInstruction, make([]*Instruction, b.nextValueID)...) + } + + // First, we gather all the instructions with side effects. + liveInstructions := b.instStack[:0] + // During the process, we will assign InstructionGroupID to each instruction, which is not + // relevant to dead code elimination, but we need in the backend. + var gid InstructionGroupID + for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() { + for cur := blk.rootInstr; cur != nil; cur = cur.next { + cur.gid = gid + switch cur.sideEffect() { + case sideEffectTraps: + // The trappable should always be alive. + liveInstructions = append(liveInstructions, cur) + case sideEffectStrict: + liveInstructions = append(liveInstructions, cur) + // The strict side effect should create different instruction groups. + gid++ + } + + r1, rs := cur.Returns() + if r1.Valid() { + b.valueIDToInstruction[r1.ID()] = cur + } + for _, r := range rs { + b.valueIDToInstruction[r.ID()] = cur + } + } + } + + // Find all the instructions referenced by live instructions transitively. + for len(liveInstructions) > 0 { + tail := len(liveInstructions) - 1 + live := liveInstructions[tail] + liveInstructions = liveInstructions[:tail] + if live.live { + // If it's already marked alive, this is referenced multiple times, + // so we can skip it. + continue + } + live.live = true + + // Before we walk, we need to resolve the alias first. + b.resolveArgumentAlias(live) + + v1, v2, v3, vs := live.Args() + if v1.Valid() { + producingInst := b.valueIDToInstruction[v1.ID()] + if producingInst != nil { + liveInstructions = append(liveInstructions, producingInst) + } + } + + if v2.Valid() { + producingInst := b.valueIDToInstruction[v2.ID()] + if producingInst != nil { + liveInstructions = append(liveInstructions, producingInst) + } + } + + if v3.Valid() { + producingInst := b.valueIDToInstruction[v3.ID()] + if producingInst != nil { + liveInstructions = append(liveInstructions, producingInst) + } + } + + for _, v := range vs { + producingInst := b.valueIDToInstruction[v.ID()] + if producingInst != nil { + liveInstructions = append(liveInstructions, producingInst) + } + } + } + + // Now that all the live instructions are flagged as live=true, we eliminate all dead instructions. 
+ for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() { + for cur := blk.rootInstr; cur != nil; cur = cur.next { + if !cur.live { + // Remove the instruction from the list. + if prev := cur.prev; prev != nil { + prev.next = cur.next + } else { + blk.rootInstr = cur.next + } + if next := cur.next; next != nil { + next.prev = cur.prev + } + continue + } + + // If the value alive, we can be sure that arguments are used definitely. + // Hence, we can increment the value reference counts. + v1, v2, v3, vs := cur.Args() + if v1.Valid() { + b.incRefCount(v1.ID(), cur) + } + if v2.Valid() { + b.incRefCount(v2.ID(), cur) + } + if v3.Valid() { + b.incRefCount(v3.ID(), cur) + } + for _, v := range vs { + b.incRefCount(v.ID(), cur) + } + } + } + + b.instStack = liveInstructions // we reuse the stack for the next iteration. +} + +func (b *builder) incRefCount(id ValueID, from *Instruction) { + if wazevoapi.SSALoggingEnabled { + fmt.Printf("v%d referenced from %v\n", id, from.Format(b)) + } + b.valueRefCounts[id]++ +} + +// clearBlkVisited clears the b.blkVisited map so that we can reuse it for multiple places. +func (b *builder) clearBlkVisited() { + b.blkStack2 = b.blkStack2[:0] + for key := range b.blkVisited { + b.blkStack2 = append(b.blkStack2, key) + } + for _, blk := range b.blkStack2 { + delete(b.blkVisited, blk) + } + b.blkStack2 = b.blkStack2[:0] +} + +// passNopInstElimination eliminates the instructions which is essentially a no-op. +func passNopInstElimination(b *builder) { + if int(b.nextValueID) >= len(b.valueIDToInstruction) { + b.valueIDToInstruction = append(b.valueIDToInstruction, make([]*Instruction, b.nextValueID)...) + } + + for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() { + for cur := blk.rootInstr; cur != nil; cur = cur.next { + r1, rs := cur.Returns() + if r1.Valid() { + b.valueIDToInstruction[r1.ID()] = cur + } + for _, r := range rs { + b.valueIDToInstruction[r.ID()] = cur + } + } + } + + for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() { + for cur := blk.rootInstr; cur != nil; cur = cur.next { + switch cur.Opcode() { + // TODO: add more logics here. + case OpcodeIshl, OpcodeSshr, OpcodeUshr: + x, amount := cur.Arg2() + definingInst := b.valueIDToInstruction[amount.ID()] + if definingInst == nil { + // If there's no defining instruction, that means the amount is coming from the parameter. + continue + } + if definingInst.Constant() { + v := definingInst.ConstantVal() + + if x.Type().Bits() == 64 { + v = v % 64 + } else { + v = v % 32 + } + if v == 0 { + b.alias(cur.Return(), x) + } + } + } + } + } +} + +// passSortSuccessors sorts the successors of each block in the natural program order. +func passSortSuccessors(b *builder) { + for i := 0; i < b.basicBlocksPool.Allocated(); i++ { + blk := b.basicBlocksPool.View(i) + sortBlocks(blk.success) + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_blk_layouts.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_blk_layouts.go new file mode 100644 index 000000000..9068180a0 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_blk_layouts.go @@ -0,0 +1,335 @@ +package ssa + +import ( + "fmt" + "strings" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// passLayoutBlocks implements Builder.LayoutBlocks. This re-organizes builder.reversePostOrderedBasicBlocks. +// +// TODO: there are tons of room for improvement here. e.g. 
LLVM has BlockPlacementPass using BlockFrequencyInfo, +// BranchProbabilityInfo, and LoopInfo to do a much better job. Also, if we have the profiling instrumentation +// like ball-larus algorithm, then we could do profile-guided optimization. Basically all of them are trying +// to maximize the fall-through opportunities which is most efficient. +// +// Here, fallthrough happens when a block ends with jump instruction whose target is the right next block in the +// builder.reversePostOrderedBasicBlocks. +// +// Currently, we just place blocks using the DFS reverse post-order of the dominator tree with the heuristics: +// 1. a split edge trampoline towards a loop header will be placed as a fallthrough. +// 2. we invert the brz and brnz if it makes the fallthrough more likely. +// +// This heuristic is done in maybeInvertBranches function. +func passLayoutBlocks(b *builder) { + b.clearBlkVisited() + + // We might end up splitting critical edges which adds more basic blocks, + // so we store the currently existing basic blocks in nonSplitBlocks temporarily. + // That way we can iterate over the original basic blocks while appending new ones into reversePostOrderedBasicBlocks. + nonSplitBlocks := b.blkStack[:0] + for i, blk := range b.reversePostOrderedBasicBlocks { + if !blk.Valid() { + continue + } + nonSplitBlocks = append(nonSplitBlocks, blk) + if i != len(b.reversePostOrderedBasicBlocks)-1 { + _ = maybeInvertBranches(blk, b.reversePostOrderedBasicBlocks[i+1]) + } + } + + var trampolines []*basicBlock + + // Reset the order slice since we update on the fly by splitting critical edges. + b.reversePostOrderedBasicBlocks = b.reversePostOrderedBasicBlocks[:0] + uninsertedTrampolines := b.blkStack2[:0] + for _, blk := range nonSplitBlocks { + for i := range blk.preds { + pred := blk.preds[i].blk + if _, ok := b.blkVisited[pred]; ok || !pred.Valid() { + continue + } else if pred.reversePostOrder < blk.reversePostOrder { + // This means the edge is critical, and this pred is the trampoline and yet to be inserted. + // Split edge trampolines must come before the destination in reverse post-order. + b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, pred) + b.blkVisited[pred] = 0 // mark as inserted, the value is not used. + } + } + + // Now that we've already added all the potential trampoline blocks incoming to this block, + // we can add this block itself. + b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, blk) + b.blkVisited[blk] = 0 // mark as inserted, the value is not used. + + if len(blk.success) < 2 { + // There won't be critical edge originating from this block. + continue + } else if blk.currentInstr.opcode == OpcodeBrTable { + // We don't split critical edges here, because at the construction site of BrTable, we already split the edges. + continue + } + + for sidx, succ := range blk.success { + if !succ.ReturnBlock() && // If the successor is a return block, we need to split the edge any way because we need "epilogue" to be inserted. + // Plus if there's no multiple incoming edges to this successor, (pred, succ) is not critical. + len(succ.preds) < 2 { + continue + } + + // Otherwise, we are sure this is a critical edge. To modify the CFG, we need to find the predecessor info + // from the successor. + var predInfo *basicBlockPredecessorInfo + for i := range succ.preds { // This linear search should not be a problem since the number of predecessors should almost always small. 
+ pred := &succ.preds[i] + if pred.blk == blk { + predInfo = pred + break + } + } + + if predInfo == nil { + // This must be a bug in somewhere around branch manipulation. + panic("BUG: predecessor info not found while the successor exists in successors list") + } + + if wazevoapi.SSALoggingEnabled { + fmt.Printf("trying to split edge from %d->%d at %s\n", + blk.ID(), succ.ID(), predInfo.branch.Format(b)) + } + + trampoline := b.splitCriticalEdge(blk, succ, predInfo) + // Update the successors slice because the target is no longer the original `succ`. + blk.success[sidx] = trampoline + + if wazevoapi.SSAValidationEnabled { + trampolines = append(trampolines, trampoline) + } + + if wazevoapi.SSALoggingEnabled { + fmt.Printf("edge split from %d->%d at %s as %d->%d->%d \n", + blk.ID(), succ.ID(), predInfo.branch.Format(b), + blk.ID(), trampoline.ID(), succ.ID()) + } + + fallthroughBranch := blk.currentInstr + if fallthroughBranch.opcode == OpcodeJump && fallthroughBranch.blk == trampoline { + // This can be lowered as fallthrough at the end of the block. + b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, trampoline) + b.blkVisited[trampoline] = 0 // mark as inserted, the value is not used. + } else { + uninsertedTrampolines = append(uninsertedTrampolines, trampoline) + } + } + + for _, trampoline := range uninsertedTrampolines { + if trampoline.success[0].reversePostOrder <= trampoline.reversePostOrder { // "<=", not "<" because the target might be itself. + // This means the critical edge was backward, so we insert after the current block immediately. + b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, trampoline) + b.blkVisited[trampoline] = 0 // mark as inserted, the value is not used. + } // If the target is forward, we can wait to insert until the target is inserted. + } + uninsertedTrampolines = uninsertedTrampolines[:0] // Reuse the stack for the next block. + } + + if wazevoapi.SSALoggingEnabled { + var bs []string + for _, blk := range b.reversePostOrderedBasicBlocks { + bs = append(bs, blk.Name()) + } + fmt.Println("ordered blocks: ", strings.Join(bs, ", ")) + } + + if wazevoapi.SSAValidationEnabled { + for _, trampoline := range trampolines { + if _, ok := b.blkVisited[trampoline]; !ok { + panic("BUG: trampoline block not inserted: " + trampoline.FormatHeader(b)) + } + trampoline.validate(b) + } + } + + // Reuse the stack for the next iteration. + b.blkStack2 = uninsertedTrampolines[:0] +} + +// markFallthroughJumps finds the fallthrough jumps and marks them as such. +func (b *builder) markFallthroughJumps() { + l := len(b.reversePostOrderedBasicBlocks) - 1 + for i, blk := range b.reversePostOrderedBasicBlocks { + if i < l { + cur := blk.currentInstr + if cur.opcode == OpcodeJump && cur.blk == b.reversePostOrderedBasicBlocks[i+1] { + cur.AsFallthroughJump() + } + } + } +} + +// maybeInvertBranches inverts the branch instructions if it is likely possible to the fallthrough more likely with simple heuristics. +// nextInRPO is the next block in the reverse post-order. +// +// Returns true if the branch is inverted for testing purpose. 
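+// No inversion is attempted when the block ends with br_table, or when either of the two branches carries block arguments.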
+func maybeInvertBranches(now *basicBlock, nextInRPO *basicBlock) bool { + fallthroughBranch := now.currentInstr + if fallthroughBranch.opcode == OpcodeBrTable { + return false + } + + condBranch := fallthroughBranch.prev + if condBranch == nil || (condBranch.opcode != OpcodeBrnz && condBranch.opcode != OpcodeBrz) { + return false + } + + if len(fallthroughBranch.vs.View()) != 0 || len(condBranch.vs.View()) != 0 { + // If either one of them has arguments, we don't invert the branches. + return false + } + + // So this block has two branches (a conditional branch followed by an unconditional branch) at the end. + // We can invert the condition of the branch if it makes the fallthrough more likely. + + fallthroughTarget, condTarget := fallthroughBranch.blk.(*basicBlock), condBranch.blk.(*basicBlock) + + if fallthroughTarget.loopHeader { + // First, if the tail's target is loopHeader, we don't need to do anything here, + // because the edge is likely to be critical edge for complex loops (e.g. loop with branches inside it). + // That means, we will split the edge in the end of LayoutBlocks function, and insert the trampoline block + // right after this block, which will be fallthrough in any way. + return false + } else if condTarget.loopHeader { + // On the other hand, if the condBranch's target is loopHeader, we invert the condition of the branch + // so that we could get the fallthrough to the trampoline block. + goto invert + } + + if fallthroughTarget == nextInRPO { + // Also, if the tail's target is the next block in the reverse post-order, we don't need to do anything here, + // because if this is not critical edge, we would end up placing these two blocks adjacent to each other. + // Even if it is the critical edge, we place the trampoline block right after this block, which will be fallthrough in any way. + return false + } else if condTarget == nextInRPO { + // If the condBranch's target is the next block in the reverse post-order, we invert the condition of the branch + // so that we could get the fallthrough to the block. + goto invert + } else { + return false + } + +invert: + for i := range fallthroughTarget.preds { + pred := &fallthroughTarget.preds[i] + if pred.branch == fallthroughBranch { + pred.branch = condBranch + break + } + } + for i := range condTarget.preds { + pred := &condTarget.preds[i] + if pred.branch == condBranch { + pred.branch = fallthroughBranch + break + } + } + + condBranch.InvertBrx() + condBranch.blk = fallthroughTarget + fallthroughBranch.blk = condTarget + if wazevoapi.SSALoggingEnabled { + fmt.Printf("inverting branches at %d->%d and %d->%d\n", + now.ID(), fallthroughTarget.ID(), now.ID(), condTarget.ID()) + } + + return true +} + +// splitCriticalEdge splits the critical edge between the given predecessor (`pred`) and successor (owning `predInfo`). +// +// - `pred` is the source of the critical edge, +// - `succ` is the destination of the critical edge, +// - `predInfo` is the predecessor info in the succ.preds slice which represents the critical edge. +// +// Why splitting critical edges is important? See following links: +// +// - https://en.wikipedia.org/wiki/Control-flow_graph +// - https://nickdesaulniers.github.io/blog/2023/01/27/critical-edge-splitting/ +// +// The returned basic block is the trampoline block which is inserted to split the critical edge. 
+func (b *builder) splitCriticalEdge(pred, succ *basicBlock, predInfo *basicBlockPredecessorInfo) *basicBlock { + // In the following, we convert the following CFG: + // + // pred --(originalBranch)--> succ + // + // to the following CFG: + // + // pred --(newBranch)--> trampoline --(originalBranch)-> succ + // + // where trampoline is a new basic block which is created to split the critical edge. + + trampoline := b.allocateBasicBlock() + if int(trampoline.id) >= len(b.dominators) { + b.dominators = append(b.dominators, make([]*basicBlock, trampoline.id+1)...) + } + b.dominators[trampoline.id] = pred + + originalBranch := predInfo.branch + + // Replace originalBranch with the newBranch. + newBranch := b.AllocateInstruction() + newBranch.opcode = originalBranch.opcode + newBranch.blk = trampoline + switch originalBranch.opcode { + case OpcodeJump: + case OpcodeBrz, OpcodeBrnz: + originalBranch.opcode = OpcodeJump // Trampoline consists of one unconditional branch. + newBranch.v = originalBranch.v + originalBranch.v = ValueInvalid + default: + panic("BUG: critical edge shouldn't be originated from br_table") + } + swapInstruction(pred, originalBranch, newBranch) + + // Replace the original branch with the new branch. + trampoline.rootInstr = originalBranch + trampoline.currentInstr = originalBranch + trampoline.success = append(trampoline.success, succ) // Do not use []*basicBlock{pred} because we might have already allocated the slice. + trampoline.preds = append(trampoline.preds, // same as ^. + basicBlockPredecessorInfo{blk: pred, branch: newBranch}) + b.Seal(trampoline) + + // Update the original branch to point to the trampoline. + predInfo.blk = trampoline + predInfo.branch = originalBranch + + if wazevoapi.SSAValidationEnabled { + trampoline.validate(b) + } + + if len(trampoline.params) > 0 { + panic("trampoline should not have params") + } + + // Assign the same order as the original block so that this will be placed before the actual destination. + trampoline.reversePostOrder = pred.reversePostOrder + return trampoline +} + +// swapInstruction replaces `old` in the block `blk` with `New`. +func swapInstruction(blk *basicBlock, old, New *Instruction) { + if blk.rootInstr == old { + blk.rootInstr = New + next := old.next + New.next = next + next.prev = New + } else { + if blk.currentInstr == old { + blk.currentInstr = New + } + prev := old.prev + prev.next, New.prev = New, prev + if next := old.next; next != nil { + New.next, next.prev = next, New + } + } + old.prev, old.next = nil, nil +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_cfg.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_cfg.go new file mode 100644 index 000000000..50cb9c475 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_cfg.go @@ -0,0 +1,312 @@ +package ssa + +import ( + "fmt" + "math" + "strings" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// passCalculateImmediateDominators calculates immediate dominators for each basic block. +// The result is stored in b.dominators. This make it possible for the following passes to +// use builder.isDominatedBy to check if a block is dominated by another block. +// +// At the last of pass, this function also does the loop detection and sets the basicBlock.loop flag. 
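+// (Concretely, subPassLoopDetection sets basicBlock.loopHeader on every block that dominates one of its own predecessors.)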
+func passCalculateImmediateDominators(b *builder) { + reversePostOrder := b.reversePostOrderedBasicBlocks[:0] + exploreStack := b.blkStack[:0] + b.clearBlkVisited() + + entryBlk := b.entryBlk() + + // Store the reverse postorder from the entrypoint into reversePostOrder slice. + // This calculation of reverse postorder is not described in the paper, + // so we use heuristic to calculate it so that we could potentially handle arbitrary + // complex CFGs under the assumption that success is sorted in program's natural order. + // That means blk.success[i] always appears before blk.success[i+1] in the source program, + // which is a reasonable assumption as long as SSA Builder is properly used. + // + // First we push blocks in postorder iteratively visit successors of the entry block. + exploreStack = append(exploreStack, entryBlk) + const visitStateUnseen, visitStateSeen, visitStateDone = 0, 1, 2 + b.blkVisited[entryBlk] = visitStateSeen + for len(exploreStack) > 0 { + tail := len(exploreStack) - 1 + blk := exploreStack[tail] + exploreStack = exploreStack[:tail] + switch b.blkVisited[blk] { + case visitStateUnseen: + // This is likely a bug in the frontend. + panic("BUG: unsupported CFG") + case visitStateSeen: + // This is the first time to pop this block, and we have to see the successors first. + // So push this block again to the stack. + exploreStack = append(exploreStack, blk) + // And push the successors to the stack if necessary. + for _, succ := range blk.success { + if succ.ReturnBlock() || succ.invalid { + continue + } + if b.blkVisited[succ] == visitStateUnseen { + b.blkVisited[succ] = visitStateSeen + exploreStack = append(exploreStack, succ) + } + } + // Finally, we could pop this block once we pop all of its successors. + b.blkVisited[blk] = visitStateDone + case visitStateDone: + // Note: at this point we push blk in postorder despite its name. + reversePostOrder = append(reversePostOrder, blk) + } + } + // At this point, reversePostOrder has postorder actually, so we reverse it. + for i := len(reversePostOrder)/2 - 1; i >= 0; i-- { + j := len(reversePostOrder) - 1 - i + reversePostOrder[i], reversePostOrder[j] = reversePostOrder[j], reversePostOrder[i] + } + + for i, blk := range reversePostOrder { + blk.reversePostOrder = i + } + + // Reuse the dominators slice if possible from the previous computation of function. + b.dominators = b.dominators[:cap(b.dominators)] + if len(b.dominators) < b.basicBlocksPool.Allocated() { + // Generously reserve space in the slice because the slice will be reused future allocation. + b.dominators = append(b.dominators, make([]*basicBlock, b.basicBlocksPool.Allocated())...) + } + calculateDominators(reversePostOrder, b.dominators) + + // Reuse the slices for the future use. + b.blkStack = exploreStack + + // For the following passes. + b.reversePostOrderedBasicBlocks = reversePostOrder + + // Ready to detect loops! + subPassLoopDetection(b) +} + +// calculateDominators calculates the immediate dominator of each node in the CFG, and store the result in `doms`. +// The algorithm is based on the one described in the paper "A Simple, Fast Dominance Algorithm" +// https://www.cs.rice.edu/~keith/EMBED/dom.pdf which is a faster/simple alternative to the well known Lengauer-Tarjan algorithm. +// +// The following code almost matches the pseudocode in the paper with one exception (see the code comment below). +// +// The result slice `doms` must be pre-allocated with the size larger than the size of dfsBlocks. 
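+// On return, doms[blk.id] holds the immediate dominator of blk; the entry block is mapped to itself.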
+func calculateDominators(reversePostOrderedBlks []*basicBlock, doms []*basicBlock) { + entry, reversePostOrderedBlks := reversePostOrderedBlks[0], reversePostOrderedBlks[1: /* skips entry point */] + for _, blk := range reversePostOrderedBlks { + doms[blk.id] = nil + } + doms[entry.id] = entry + + changed := true + for changed { + changed = false + for _, blk := range reversePostOrderedBlks { + var u *basicBlock + for i := range blk.preds { + pred := blk.preds[i].blk + // Skip if this pred is not reachable yet. Note that this is not described in the paper, + // but it is necessary to handle nested loops etc. + if doms[pred.id] == nil { + continue + } + + if u == nil { + u = pred + continue + } else { + u = intersect(doms, u, pred) + } + } + if doms[blk.id] != u { + doms[blk.id] = u + changed = true + } + } + } +} + +// intersect returns the common dominator of blk1 and blk2. +// +// This is the `intersect` function in the paper. +func intersect(doms []*basicBlock, blk1 *basicBlock, blk2 *basicBlock) *basicBlock { + finger1, finger2 := blk1, blk2 + for finger1 != finger2 { + // Move the 'finger1' upwards to its immediate dominator. + for finger1.reversePostOrder > finger2.reversePostOrder { + finger1 = doms[finger1.id] + } + // Move the 'finger2' upwards to its immediate dominator. + for finger2.reversePostOrder > finger1.reversePostOrder { + finger2 = doms[finger2.id] + } + } + return finger1 +} + +// subPassLoopDetection detects loops in the function using the immediate dominators. +// +// This is run at the last of passCalculateImmediateDominators. +func subPassLoopDetection(b *builder) { + for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() { + for i := range blk.preds { + pred := blk.preds[i].blk + if pred.invalid { + continue + } + if b.isDominatedBy(pred, blk) { + blk.loopHeader = true + } + } + } +} + +// buildLoopNestingForest builds the loop nesting forest for the function. +// This must be called after branch splitting since it relies on the CFG. +func passBuildLoopNestingForest(b *builder) { + ent := b.entryBlk() + doms := b.dominators + for _, blk := range b.reversePostOrderedBasicBlocks { + n := doms[blk.id] + for !n.loopHeader && n != ent { + n = doms[n.id] + } + + if n == ent && blk.loopHeader { + b.loopNestingForestRoots = append(b.loopNestingForestRoots, blk) + } else if n == ent { + } else if n.loopHeader { + n.loopNestingForestChildren = append(n.loopNestingForestChildren, blk) + } + } + + if wazevoapi.SSALoggingEnabled { + for _, root := range b.loopNestingForestRoots { + printLoopNestingForest(root.(*basicBlock), 0) + } + } +} + +func printLoopNestingForest(root *basicBlock, depth int) { + fmt.Println(strings.Repeat("\t", depth), "loop nesting forest root:", root.ID()) + for _, child := range root.loopNestingForestChildren { + fmt.Println(strings.Repeat("\t", depth+1), "child:", child.ID()) + if child.LoopHeader() { + printLoopNestingForest(child.(*basicBlock), depth+2) + } + } +} + +type dominatorSparseTree struct { + time int + euler []*basicBlock + first, depth []int + table [][]int +} + +// passBuildDominatorTree builds the dominator tree for the function, and constructs builder.sparseTree. +func passBuildDominatorTree(b *builder) { + // First we materialize the children of each node in the dominator tree. + idoms := b.dominators + for _, blk := range b.reversePostOrderedBasicBlocks { + parent := idoms[blk.id] + if parent == nil { + panic("BUG") + } else if parent == blk { + // This is the entry block. 
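+ // Its immediate dominator is itself, so it gets no parent (child/sibling links) in the dominator tree.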
+ continue + } + if prev := parent.child; prev == nil { + parent.child = blk + } else { + parent.child = blk + blk.sibling = prev + } + } + + // Reset the state from the previous computation. + n := b.basicBlocksPool.Allocated() + st := &b.sparseTree + st.euler = append(st.euler[:0], make([]*basicBlock, 2*n-1)...) + st.first = append(st.first[:0], make([]int, n)...) + for i := range st.first { + st.first[i] = -1 + } + st.depth = append(st.depth[:0], make([]int, 2*n-1)...) + st.time = 0 + + // Start building the sparse tree. + st.eulerTour(b.entryBlk(), 0) + st.buildSparseTable() +} + +func (dt *dominatorSparseTree) eulerTour(node *basicBlock, height int) { + if wazevoapi.SSALoggingEnabled { + fmt.Println(strings.Repeat("\t", height), "euler tour:", node.ID()) + } + dt.euler[dt.time] = node + dt.depth[dt.time] = height + if dt.first[node.id] == -1 { + dt.first[node.id] = dt.time + } + dt.time++ + + for child := node.child; child != nil; child = child.sibling { + dt.eulerTour(child, height+1) + dt.euler[dt.time] = node // add the current node again after visiting a child + dt.depth[dt.time] = height + dt.time++ + } +} + +// buildSparseTable builds a sparse table for RMQ queries. +func (dt *dominatorSparseTree) buildSparseTable() { + n := len(dt.depth) + k := int(math.Log2(float64(n))) + 1 + table := dt.table + + if n >= len(table) { + table = append(table, make([][]int, n+1)...) + } + for i := range table { + if len(table[i]) < k { + table[i] = append(table[i], make([]int, k)...) + } + table[i][0] = i + } + + for j := 1; 1<<j <= n; j++ { + for i := 0; i+(1<<j)-1 < n; i++ { + if dt.depth[table[i][j-1]] < dt.depth[table[i+(1<<(j-1))][j-1]] { + table[i][j] = table[i][j-1] + } else { + table[i][j] = table[i+(1<<(j-1))][j-1] + } + } + } + dt.table = table +} + +// rmq performs a range minimum query on the sparse table. +func (dt *dominatorSparseTree) rmq(l, r int) int { + table := dt.table + depth := dt.depth + j := int(math.Log2(float64(r - l + 1))) + if depth[table[l][j]] <= depth[table[r-(1<<j)+1][j]] { + return table[l][j] + } + return table[r-(1<<j)+1][j] +} + +// findLCA finds the LCA using the Euler tour and RMQ. +func (dt *dominatorSparseTree) findLCA(u, v BasicBlockID) *basicBlock { + first := dt.first + if first[u] > first[v] { + u, v = v, u + } + return dt.euler[dt.rmq(first[u], first[v])] +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/signature.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/signature.go new file mode 100644 index 000000000..43483395a --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/signature.go @@ -0,0 +1,49 @@ +package ssa + +import ( + "fmt" + "strings" +) + +// Signature is a function prototype. +type Signature struct { + // ID is a unique identifier for this signature used to lookup. + ID SignatureID + // Params and Results are the types of the parameters and results of the function. + Params, Results []Type + + // used is true if this is used by the currently-compiled function. + // Debugging only. + used bool +} + +// String implements fmt.Stringer. 
+func (s *Signature) String() string { + str := strings.Builder{} + str.WriteString(s.ID.String()) + str.WriteString(": ") + if len(s.Params) > 0 { + for _, typ := range s.Params { + str.WriteString(typ.String()) + } + } else { + str.WriteByte('v') + } + str.WriteByte('_') + if len(s.Results) > 0 { + for _, typ := range s.Results { + str.WriteString(typ.String()) + } + } else { + str.WriteByte('v') + } + return str.String() +} + +// SignatureID is an unique identifier used to lookup. +type SignatureID int + +// String implements fmt.Stringer. +func (s SignatureID) String() string { + return fmt.Sprintf("sig%d", s) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/ssa.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/ssa.go new file mode 100644 index 000000000..b477e58bd --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/ssa.go @@ -0,0 +1,14 @@ +// Package ssa is used to construct SSA function. By nature this is free of Wasm specific thing +// and ISA. +// +// We use the "block argument" variant of SSA: https://en.wikipedia.org/wiki/Static_single-assignment_form#Block_arguments +// which is equivalent to the traditional PHI function based one, but more convenient during optimizations. +// However, in this package's source code comment, we might use PHI whenever it seems necessary in order to be aligned with +// existing literatures, e.g. SSA level optimization algorithms are often described using PHI nodes. +// +// The rationale doc for the choice of "block argument" by MLIR of LLVM is worth a read: +// https://mlir.llvm.org/docs/Rationale/Rationale/#block-arguments-vs-phi-nodes +// +// The algorithm to resolve variable definitions used here is based on the paper +// "Simple and Efficient Construction of Static Single Assignment Form": https://link.springer.com/content/pdf/10.1007/978-3-642-37051-9_6.pdf. +package ssa diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/type.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/type.go new file mode 100644 index 000000000..e8c8cd9de --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/type.go @@ -0,0 +1,112 @@ +package ssa + +type Type byte + +const ( + typeInvalid Type = iota + + // TODO: add 8, 16 bit types when it's needed for optimizations. + + // TypeI32 represents an integer type with 32 bits. + TypeI32 + + // TypeI64 represents an integer type with 64 bits. + TypeI64 + + // TypeF32 represents 32-bit floats in the IEEE 754. + TypeF32 + + // TypeF64 represents 64-bit floats in the IEEE 754. + TypeF64 + + // TypeV128 represents 128-bit SIMD vectors. + TypeV128 +) + +// String implements fmt.Stringer. +func (t Type) String() (ret string) { + switch t { + case typeInvalid: + return "invalid" + case TypeI32: + return "i32" + case TypeI64: + return "i64" + case TypeF32: + return "f32" + case TypeF64: + return "f64" + case TypeV128: + return "v128" + default: + panic(int(t)) + } +} + +// IsInt returns true if the type is an integer type. +func (t Type) IsInt() bool { + return t == TypeI32 || t == TypeI64 +} + +// IsFloat returns true if the type is a floating point type. +func (t Type) IsFloat() bool { + return t == TypeF32 || t == TypeF64 +} + +// Bits returns the number of bits required to represent the type. 
+func (t Type) Bits() byte { + switch t { + case TypeI32, TypeF32: + return 32 + case TypeI64, TypeF64: + return 64 + case TypeV128: + return 128 + default: + panic(int(t)) + } +} + +// Size returns the number of bytes required to represent the type. +func (t Type) Size() byte { + return t.Bits() / 8 +} + +func (t Type) invalid() bool { + return t == typeInvalid +} + +// VecLane represents a lane in a SIMD vector. +type VecLane byte + +const ( + VecLaneInvalid VecLane = 1 + iota + VecLaneI8x16 + VecLaneI16x8 + VecLaneI32x4 + VecLaneI64x2 + VecLaneF32x4 + VecLaneF64x2 +) + +// String implements fmt.Stringer. +func (vl VecLane) String() (ret string) { + switch vl { + case VecLaneInvalid: + return "invalid" + case VecLaneI8x16: + return "i8x16" + case VecLaneI16x8: + return "i16x8" + case VecLaneI32x4: + return "i32x4" + case VecLaneI64x2: + return "i64x2" + case VecLaneF32x4: + return "f32x4" + case VecLaneF64x2: + return "f64x2" + default: + panic(int(vl)) + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/vs.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/vs.go new file mode 100644 index 000000000..bcf83cbf8 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/vs.go @@ -0,0 +1,87 @@ +package ssa + +import ( + "fmt" + "math" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// Variable is a unique identifier for a source program's variable and will correspond to +// multiple ssa Value(s). +// +// For example, `Local 1` is a Variable in WebAssembly, and Value(s) will be created for it +// whenever it executes `local.set 1`. +// +// Variable is useful to track the SSA Values of a variable in the source program, and +// can be used to find the corresponding latest SSA Value via Builder.FindValue. +type Variable uint32 + +// String implements fmt.Stringer. +func (v Variable) String() string { + return fmt.Sprintf("var%d", v) +} + +// Value represents an SSA value with a type information. The relationship with Variable is 1: N (including 0), +// that means there might be multiple Variable(s) for a Value. +// +// Higher 32-bit is used to store Type for this value. +type Value uint64 + +// ValueID is the lower 32bit of Value, which is the pure identifier of Value without type info. +type ValueID uint32 + +const ( + valueIDInvalid ValueID = math.MaxUint32 + ValueInvalid Value = Value(valueIDInvalid) +) + +// Format creates a debug string for this Value using the data stored in Builder. +func (v Value) Format(b Builder) string { + if annotation, ok := b.(*builder).valueAnnotations[v.ID()]; ok { + return annotation + } + return fmt.Sprintf("v%d", v.ID()) +} + +func (v Value) formatWithType(b Builder) (ret string) { + if annotation, ok := b.(*builder).valueAnnotations[v.ID()]; ok { + ret = annotation + ":" + v.Type().String() + } else { + ret = fmt.Sprintf("v%d:%s", v.ID(), v.Type()) + } + + if wazevoapi.SSALoggingEnabled { // This is useful to check live value analysis bugs. + if bd := b.(*builder); bd.donePostBlockLayoutPasses { + id := v.ID() + ret += fmt.Sprintf("(ref=%d)", bd.valueRefCounts[id]) + } + } + return ret +} + +// Valid returns true if this value is valid. +func (v Value) Valid() bool { + return v.ID() != valueIDInvalid +} + +// Type returns the Type of this value. +func (v Value) Type() Type { + return Type(v >> 32) +} + +// ID returns the valueID of this value. 
+func (v Value) ID() ValueID { + return ValueID(v) +} + +// setType sets a type to this Value and returns the updated Value. +func (v Value) setType(typ Type) Value { + return v | Value(typ)<<32 +} + +// Values is a slice of Value. Use this instead of []Value to reuse the underlying memory. +type Values = wazevoapi.VarLength[Value] + +// ValuesNil is a nil Values. +var ValuesNil = wazevoapi.NewNilVarLength[Value]() diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/debug_options.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/debug_options.go new file mode 100644 index 000000000..2db61e219 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/debug_options.go @@ -0,0 +1,196 @@ +package wazevoapi + +import ( + "context" + "encoding/hex" + "fmt" + "math/rand" + "os" + "time" +) + +// These consts are used various places in the wazevo implementations. +// Instead of defining them in each file, we define them here so that we can quickly iterate on +// debugging without spending "where do we have debug logging?" time. + +// ----- Debug logging ----- +// These consts must be disabled by default. Enable them only when debugging. + +const ( + FrontEndLoggingEnabled = false + SSALoggingEnabled = false + RegAllocLoggingEnabled = false +) + +// ----- Output prints ----- +// These consts must be disabled by default. Enable them only when debugging. + +const ( + PrintSSA = false + PrintOptimizedSSA = false + PrintSSAToBackendIRLowering = false + PrintRegisterAllocated = false + PrintFinalizedMachineCode = false + PrintMachineCodeHexPerFunction = printMachineCodeHexPerFunctionUnmodified || PrintMachineCodeHexPerFunctionDisassemblable //nolint + printMachineCodeHexPerFunctionUnmodified = false + // PrintMachineCodeHexPerFunctionDisassemblable prints the machine code while modifying the actual result + // to make it disassemblable. This is useful when debugging the final machine code. See the places where this is used for detail. + // When this is enabled, functions must not be called. + PrintMachineCodeHexPerFunctionDisassemblable = false +) + +// printTarget is the function index to print the machine code. This is used for debugging to print the machine code +// of a specific function. +const printTarget = -1 + +// PrintEnabledIndex returns true if the current function index is the print target. +func PrintEnabledIndex(ctx context.Context) bool { + if printTarget == -1 { + return true + } + return GetCurrentFunctionIndex(ctx) == printTarget +} + +// ----- Validations ----- +const ( + // SSAValidationEnabled enables the SSA validation. This is disabled by default since the operation is expensive. + SSAValidationEnabled = false +) + +// ----- Stack Guard Check ----- +const ( + // StackGuardCheckEnabled enables the stack guard check to ensure that our stack bounds check works correctly. + StackGuardCheckEnabled = false + StackGuardCheckGuardPageSize = 8096 +) + +// CheckStackGuardPage checks the given stack guard page is not corrupted. +func CheckStackGuardPage(s []byte) { + for i := 0; i < StackGuardCheckGuardPageSize; i++ { + if s[i] != 0 { + panic( + fmt.Sprintf("BUG: stack guard page is corrupted:\n\tguard_page=%s\n\tstack=%s", + hex.EncodeToString(s[:StackGuardCheckGuardPageSize]), + hex.EncodeToString(s[StackGuardCheckGuardPageSize:]), + )) + } + } +} + +// ----- Deterministic compilation verifier ----- + +const ( + // DeterministicCompilationVerifierEnabled enables the deterministic compilation verifier. 
This is disabled by default + // since the operation is expensive. But when in doubt, enable this to make sure the compilation is deterministic. + DeterministicCompilationVerifierEnabled = false + DeterministicCompilationVerifyingIter = 5 +) + +type ( + verifierState struct { + initialCompilationDone bool + maybeRandomizedIndexes []int + r *rand.Rand + values map[string]string + } + verifierStateContextKey struct{} + currentFunctionNameKey struct{} + currentFunctionIndexKey struct{} +) + +// NewDeterministicCompilationVerifierContext creates a new context with the deterministic compilation verifier used per wasm.Module. +func NewDeterministicCompilationVerifierContext(ctx context.Context, localFunctions int) context.Context { + maybeRandomizedIndexes := make([]int, localFunctions) + for i := range maybeRandomizedIndexes { + maybeRandomizedIndexes[i] = i + } + r := rand.New(rand.NewSource(time.Now().UnixNano())) + return context.WithValue(ctx, verifierStateContextKey{}, &verifierState{ + r: r, maybeRandomizedIndexes: maybeRandomizedIndexes, values: map[string]string{}, + }) +} + +// DeterministicCompilationVerifierRandomizeIndexes randomizes the indexes for the deterministic compilation verifier. +// To get the randomized index, use DeterministicCompilationVerifierGetRandomizedLocalFunctionIndex. +func DeterministicCompilationVerifierRandomizeIndexes(ctx context.Context) { + state := ctx.Value(verifierStateContextKey{}).(*verifierState) + if !state.initialCompilationDone { + // If this is the first attempt, we use the index as-is order. + state.initialCompilationDone = true + return + } + r := state.r + r.Shuffle(len(state.maybeRandomizedIndexes), func(i, j int) { + state.maybeRandomizedIndexes[i], state.maybeRandomizedIndexes[j] = state.maybeRandomizedIndexes[j], state.maybeRandomizedIndexes[i] + }) +} + +// DeterministicCompilationVerifierGetRandomizedLocalFunctionIndex returns the randomized index for the given `index` +// which is assigned by DeterministicCompilationVerifierRandomizeIndexes. +func DeterministicCompilationVerifierGetRandomizedLocalFunctionIndex(ctx context.Context, index int) int { + state := ctx.Value(verifierStateContextKey{}).(*verifierState) + ret := state.maybeRandomizedIndexes[index] + return ret +} + +// VerifyOrSetDeterministicCompilationContextValue verifies that the `newValue` is the same as the previous value for the given `scope` +// and the current function name. If the previous value doesn't exist, it sets the value to the given `newValue`. +// +// If the verification fails, this prints the diff and exits the process. +func VerifyOrSetDeterministicCompilationContextValue(ctx context.Context, scope string, newValue string) { + fn := ctx.Value(currentFunctionNameKey{}).(string) + key := fn + ": " + scope + verifierCtx := ctx.Value(verifierStateContextKey{}).(*verifierState) + oldValue, ok := verifierCtx.values[key] + if !ok { + verifierCtx.values[key] = newValue + return + } + if oldValue != newValue { + fmt.Printf( + `BUG: Deterministic compilation failed for function%s at scope="%s". + +This is mostly due to (but might not be limited to): + * Resetting ssa.Builder, backend.Compiler or frontend.Compiler, etc doens't work as expected, and the compilation has been affected by the previous iterations. + * Using a map with non-deterministic iteration order. 
+ +---------- [old] ---------- +%s + +---------- [new] ---------- +%s +`, + fn, scope, oldValue, newValue, + ) + os.Exit(1) + } +} + +// nolint +const NeedFunctionNameInContext = PrintSSA || + PrintOptimizedSSA || + PrintSSAToBackendIRLowering || + PrintRegisterAllocated || + PrintFinalizedMachineCode || + PrintMachineCodeHexPerFunction || + DeterministicCompilationVerifierEnabled || + PerfMapEnabled + +// SetCurrentFunctionName sets the current function name to the given `functionName`. +func SetCurrentFunctionName(ctx context.Context, index int, functionName string) context.Context { + ctx = context.WithValue(ctx, currentFunctionNameKey{}, functionName) + ctx = context.WithValue(ctx, currentFunctionIndexKey{}, index) + return ctx +} + +// GetCurrentFunctionName returns the current function name. +func GetCurrentFunctionName(ctx context.Context) string { + ret, _ := ctx.Value(currentFunctionNameKey{}).(string) + return ret +} + +// GetCurrentFunctionIndex returns the current function index. +func GetCurrentFunctionIndex(ctx context.Context) int { + ret, _ := ctx.Value(currentFunctionIndexKey{}).(int) + return ret +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/exitcode.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/exitcode.go new file mode 100644 index 000000000..5ad594982 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/exitcode.go @@ -0,0 +1,109 @@ +package wazevoapi + +// ExitCode is an exit code of an execution of a function. +type ExitCode uint32 + +const ( + ExitCodeOK ExitCode = iota + ExitCodeGrowStack + ExitCodeGrowMemory + ExitCodeUnreachable + ExitCodeMemoryOutOfBounds + // ExitCodeCallGoModuleFunction is an exit code for a call to an api.GoModuleFunction. + ExitCodeCallGoModuleFunction + // ExitCodeCallGoFunction is an exit code for a call to an api.GoFunction. + ExitCodeCallGoFunction + ExitCodeTableOutOfBounds + ExitCodeIndirectCallNullPointer + ExitCodeIndirectCallTypeMismatch + ExitCodeIntegerDivisionByZero + ExitCodeIntegerOverflow + ExitCodeInvalidConversionToInteger + ExitCodeCheckModuleExitCode + ExitCodeCallListenerBefore + ExitCodeCallListenerAfter + ExitCodeCallGoModuleFunctionWithListener + ExitCodeCallGoFunctionWithListener + ExitCodeTableGrow + ExitCodeRefFunc + ExitCodeMemoryWait32 + ExitCodeMemoryWait64 + ExitCodeMemoryNotify + ExitCodeUnalignedAtomic + exitCodeMax +) + +const ExitCodeMask = 0xff + +// String implements fmt.Stringer. 
+func (e ExitCode) String() string { + switch e { + case ExitCodeOK: + return "ok" + case ExitCodeGrowStack: + return "grow_stack" + case ExitCodeCallGoModuleFunction: + return "call_go_module_function" + case ExitCodeCallGoFunction: + return "call_go_function" + case ExitCodeUnreachable: + return "unreachable" + case ExitCodeMemoryOutOfBounds: + return "memory_out_of_bounds" + case ExitCodeUnalignedAtomic: + return "unaligned_atomic" + case ExitCodeTableOutOfBounds: + return "table_out_of_bounds" + case ExitCodeIndirectCallNullPointer: + return "indirect_call_null_pointer" + case ExitCodeIndirectCallTypeMismatch: + return "indirect_call_type_mismatch" + case ExitCodeIntegerDivisionByZero: + return "integer_division_by_zero" + case ExitCodeIntegerOverflow: + return "integer_overflow" + case ExitCodeInvalidConversionToInteger: + return "invalid_conversion_to_integer" + case ExitCodeCheckModuleExitCode: + return "check_module_exit_code" + case ExitCodeCallListenerBefore: + return "call_listener_before" + case ExitCodeCallListenerAfter: + return "call_listener_after" + case ExitCodeCallGoModuleFunctionWithListener: + return "call_go_module_function_with_listener" + case ExitCodeCallGoFunctionWithListener: + return "call_go_function_with_listener" + case ExitCodeGrowMemory: + return "grow_memory" + case ExitCodeTableGrow: + return "table_grow" + case ExitCodeRefFunc: + return "ref_func" + case ExitCodeMemoryWait32: + return "memory_wait32" + case ExitCodeMemoryWait64: + return "memory_wait64" + case ExitCodeMemoryNotify: + return "memory_notify" + } + panic("TODO") +} + +func ExitCodeCallGoModuleFunctionWithIndex(index int, withListener bool) ExitCode { + if withListener { + return ExitCodeCallGoModuleFunctionWithListener | ExitCode(index<<8) + } + return ExitCodeCallGoModuleFunction | ExitCode(index<<8) +} + +func ExitCodeCallGoFunctionWithIndex(index int, withListener bool) ExitCode { + if withListener { + return ExitCodeCallGoFunctionWithListener | ExitCode(index<<8) + } + return ExitCodeCallGoFunction | ExitCode(index<<8) +} + +func GoFunctionIndexFromExitCode(exitCode ExitCode) int { + return int(exitCode >> 8) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/offsetdata.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/offsetdata.go new file mode 100644 index 000000000..fe6161b04 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/offsetdata.go @@ -0,0 +1,216 @@ +package wazevoapi + +import ( + "github.com/tetratelabs/wazero/internal/wasm" +) + +const ( + // FunctionInstanceSize is the size of wazevo.functionInstance. 
+ FunctionInstanceSize = 24 + // FunctionInstanceExecutableOffset is an offset of `executable` field in wazevo.functionInstance + FunctionInstanceExecutableOffset = 0 + // FunctionInstanceModuleContextOpaquePtrOffset is an offset of `moduleContextOpaquePtr` field in wazevo.functionInstance + FunctionInstanceModuleContextOpaquePtrOffset = 8 + // FunctionInstanceTypeIDOffset is an offset of `typeID` field in wazevo.functionInstance + FunctionInstanceTypeIDOffset = 16 +) + +const ( + // ExecutionContextOffsetExitCodeOffset is an offset of `exitCode` field in wazevo.executionContext + ExecutionContextOffsetExitCodeOffset Offset = 0 + // ExecutionContextOffsetCallerModuleContextPtr is an offset of `callerModuleContextPtr` field in wazevo.executionContext + ExecutionContextOffsetCallerModuleContextPtr Offset = 8 + // ExecutionContextOffsetOriginalFramePointer is an offset of `originalFramePointer` field in wazevo.executionContext + ExecutionContextOffsetOriginalFramePointer Offset = 16 + // ExecutionContextOffsetOriginalStackPointer is an offset of `originalStackPointer` field in wazevo.executionContext + ExecutionContextOffsetOriginalStackPointer Offset = 24 + // ExecutionContextOffsetGoReturnAddress is an offset of `goReturnAddress` field in wazevo.executionContext + ExecutionContextOffsetGoReturnAddress Offset = 32 + // ExecutionContextOffsetStackBottomPtr is an offset of `stackBottomPtr` field in wazevo.executionContext + ExecutionContextOffsetStackBottomPtr Offset = 40 + // ExecutionContextOffsetGoCallReturnAddress is an offset of `goCallReturnAddress` field in wazevo.executionContext + ExecutionContextOffsetGoCallReturnAddress Offset = 48 + // ExecutionContextOffsetStackPointerBeforeGoCall is an offset of `StackPointerBeforeGoCall` field in wazevo.executionContext + ExecutionContextOffsetStackPointerBeforeGoCall Offset = 56 + // ExecutionContextOffsetStackGrowRequiredSize is an offset of `stackGrowRequiredSize` field in wazevo.executionContext + ExecutionContextOffsetStackGrowRequiredSize Offset = 64 + // ExecutionContextOffsetMemoryGrowTrampolineAddress is an offset of `memoryGrowTrampolineAddress` field in wazevo.executionContext + ExecutionContextOffsetMemoryGrowTrampolineAddress Offset = 72 + // ExecutionContextOffsetStackGrowCallTrampolineAddress is an offset of `stackGrowCallTrampolineAddress` field in wazevo.executionContext. + ExecutionContextOffsetStackGrowCallTrampolineAddress Offset = 80 + // ExecutionContextOffsetCheckModuleExitCodeTrampolineAddress is an offset of `checkModuleExitCodeTrampolineAddress` field in wazevo.executionContext. 
+ ExecutionContextOffsetCheckModuleExitCodeTrampolineAddress Offset = 88 + // ExecutionContextOffsetSavedRegistersBegin is an offset of the first element of `savedRegisters` field in wazevo.executionContext + ExecutionContextOffsetSavedRegistersBegin Offset = 96 + // ExecutionContextOffsetGoFunctionCallCalleeModuleContextOpaque is an offset of `goFunctionCallCalleeModuleContextOpaque` field in wazevo.executionContext + ExecutionContextOffsetGoFunctionCallCalleeModuleContextOpaque Offset = 1120 + // ExecutionContextOffsetTableGrowTrampolineAddress is an offset of `tableGrowTrampolineAddress` field in wazevo.executionContext + ExecutionContextOffsetTableGrowTrampolineAddress Offset = 1128 + // ExecutionContextOffsetRefFuncTrampolineAddress is an offset of `refFuncTrampolineAddress` field in wazevo.executionContext + ExecutionContextOffsetRefFuncTrampolineAddress Offset = 1136 + ExecutionContextOffsetMemmoveAddress Offset = 1144 + ExecutionContextOffsetFramePointerBeforeGoCall Offset = 1152 + ExecutionContextOffsetMemoryWait32TrampolineAddress Offset = 1160 + ExecutionContextOffsetMemoryWait64TrampolineAddress Offset = 1168 + ExecutionContextOffsetMemoryNotifyTrampolineAddress Offset = 1176 +) + +// ModuleContextOffsetData allows the compilers to get the information about offsets to the fields of wazevo.moduleContextOpaque, +// This is unique per module. +type ModuleContextOffsetData struct { + TotalSize int + ModuleInstanceOffset, + LocalMemoryBegin, + ImportedMemoryBegin, + ImportedFunctionsBegin, + GlobalsBegin, + TypeIDs1stElement, + TablesBegin, + BeforeListenerTrampolines1stElement, + AfterListenerTrampolines1stElement, + DataInstances1stElement, + ElementInstances1stElement Offset +} + +// ImportedFunctionOffset returns an offset of the i-th imported function. +// Each item is stored as wazevo.functionInstance whose size matches FunctionInstanceSize. +func (m *ModuleContextOffsetData) ImportedFunctionOffset(i wasm.Index) ( + executableOffset, moduleCtxOffset, typeIDOffset Offset, +) { + base := m.ImportedFunctionsBegin + Offset(i)*FunctionInstanceSize + return base, base + 8, base + 16 +} + +// GlobalInstanceOffset returns an offset of the i-th global instance. +func (m *ModuleContextOffsetData) GlobalInstanceOffset(i wasm.Index) Offset { + return m.GlobalsBegin + Offset(i)*16 +} + +// Offset represents an offset of a field of a struct. +type Offset int32 + +// U32 encodes an Offset as uint32 for convenience. +func (o Offset) U32() uint32 { + return uint32(o) +} + +// I64 encodes an Offset as int64 for convenience. +func (o Offset) I64() int64 { + return int64(o) +} + +// U64 encodes an Offset as int64 for convenience. +func (o Offset) U64() uint64 { + return uint64(o) +} + +// LocalMemoryBase returns an offset of the first byte of the local memory. +func (m *ModuleContextOffsetData) LocalMemoryBase() Offset { + return m.LocalMemoryBegin +} + +// LocalMemoryLen returns an offset of the length of the local memory buffer. +func (m *ModuleContextOffsetData) LocalMemoryLen() Offset { + if l := m.LocalMemoryBegin; l >= 0 { + return l + 8 + } + return -1 +} + +// TableOffset returns an offset of the i-th table instance. +func (m *ModuleContextOffsetData) TableOffset(tableIndex int) Offset { + return m.TablesBegin + Offset(tableIndex)*8 +} + +// NewModuleContextOffsetData creates a ModuleContextOffsetData determining the structure of moduleContextOpaque for the given Module. +// The structure is described in the comment of wazevo.moduleContextOpaque. 
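+// Fields are laid out in this order: module instance pointer, local memory, imported memory, imported functions, globals, type IDs and tables, listener trampolines, data instances, and element instances, aligned to 8 or 16 bytes where required.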
+func NewModuleContextOffsetData(m *wasm.Module, withListener bool) ModuleContextOffsetData { + ret := ModuleContextOffsetData{} + var offset Offset + + ret.ModuleInstanceOffset = 0 + offset += 8 + + if m.MemorySection != nil { + ret.LocalMemoryBegin = offset + // buffer base + memory size. + const localMemorySizeInOpaqueModuleContext = 16 + offset += localMemorySizeInOpaqueModuleContext + } else { + // Indicates that there's no local memory + ret.LocalMemoryBegin = -1 + } + + if m.ImportMemoryCount > 0 { + offset = align8(offset) + // *wasm.MemoryInstance + imported memory's owner (moduleContextOpaque) + const importedMemorySizeInOpaqueModuleContext = 16 + ret.ImportedMemoryBegin = offset + offset += importedMemorySizeInOpaqueModuleContext + } else { + // Indicates that there's no imported memory + ret.ImportedMemoryBegin = -1 + } + + if m.ImportFunctionCount > 0 { + offset = align8(offset) + ret.ImportedFunctionsBegin = offset + // Each function is stored wazevo.functionInstance. + size := int(m.ImportFunctionCount) * FunctionInstanceSize + offset += Offset(size) + } else { + ret.ImportedFunctionsBegin = -1 + } + + if globals := int(m.ImportGlobalCount) + len(m.GlobalSection); globals > 0 { + // Align to 16 bytes for globals, as f32/f64/v128 might be loaded via SIMD instructions. + offset = align16(offset) + ret.GlobalsBegin = offset + // Pointers to *wasm.GlobalInstance. + offset += Offset(globals) * 16 + } else { + ret.GlobalsBegin = -1 + } + + if tables := len(m.TableSection) + int(m.ImportTableCount); tables > 0 { + offset = align8(offset) + ret.TypeIDs1stElement = offset + offset += 8 // First element of TypeIDs. + + ret.TablesBegin = offset + // Pointers to *wasm.TableInstance. + offset += Offset(tables) * 8 + } else { + ret.TypeIDs1stElement = -1 + ret.TablesBegin = -1 + } + + if withListener { + offset = align8(offset) + ret.BeforeListenerTrampolines1stElement = offset + offset += 8 // First element of BeforeListenerTrampolines. + + ret.AfterListenerTrampolines1stElement = offset + offset += 8 // First element of AfterListenerTrampolines. + } else { + ret.BeforeListenerTrampolines1stElement = -1 + ret.AfterListenerTrampolines1stElement = -1 + } + + ret.DataInstances1stElement = offset + offset += 8 // First element of DataInstances. + + ret.ElementInstances1stElement = offset + offset += 8 // First element of ElementInstances. + + ret.TotalSize = int(align16(offset)) + return ret +} + +func align16(o Offset) Offset { + return (o + 15) &^ 15 +} + +func align8(o Offset) Offset { + return (o + 7) &^ 7 +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/perfmap.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/perfmap.go new file mode 100644 index 000000000..642c7f75d --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/perfmap.go @@ -0,0 +1,96 @@ +package wazevoapi + +import ( + "fmt" + "os" + "strconv" + "sync" +) + +var PerfMap *Perfmap + +func init() { + if PerfMapEnabled { + pid := os.Getpid() + filename := "/tmp/perf-" + strconv.Itoa(pid) + ".map" + + fh, err := os.OpenFile(filename, os.O_APPEND|os.O_RDWR|os.O_CREATE, 0o644) + if err != nil { + panic(err) + } + + PerfMap = &Perfmap{fh: fh} + } +} + +// Perfmap holds perfmap entries to be flushed into a perfmap file. 
+type Perfmap struct { + entries []entry + mux sync.Mutex + fh *os.File +} + +type entry struct { + index int + offset int64 + size uint64 + name string +} + +func (f *Perfmap) Lock() { + f.mux.Lock() +} + +func (f *Perfmap) Unlock() { + f.mux.Unlock() +} + +// AddModuleEntry adds a perfmap entry into the perfmap file. +// index is the index of the function in the module, offset is the offset of the function in the module, +// size is the size of the function, and name is the name of the function. +// +// Note that the entries are not flushed into the perfmap file until Flush is called, +// and the entries are module-scoped; Perfmap must be locked until Flush is called. +func (f *Perfmap) AddModuleEntry(index int, offset int64, size uint64, name string) { + e := entry{index: index, offset: offset, size: size, name: name} + if f.entries == nil { + f.entries = []entry{e} + return + } + f.entries = append(f.entries, e) +} + +// Flush writes the perfmap entries into the perfmap file where the entries are adjusted by the given `addr` and `functionOffsets`. +func (f *Perfmap) Flush(addr uintptr, functionOffsets []int) { + defer func() { + _ = f.fh.Sync() + }() + + for _, e := range f.entries { + if _, err := f.fh.WriteString(fmt.Sprintf("%x %s %s\n", + uintptr(e.offset)+addr+uintptr(functionOffsets[e.index]), + strconv.FormatUint(e.size, 16), + e.name, + )); err != nil { + panic(err) + } + } + f.entries = f.entries[:0] +} + +// Clear clears the perfmap entries not yet flushed. +func (f *Perfmap) Clear() { + f.entries = f.entries[:0] +} + +// AddEntry writes a perfmap entry directly into the perfmap file, not using the entries. +func (f *Perfmap) AddEntry(addr uintptr, size uint64, name string) { + _, err := f.fh.WriteString(fmt.Sprintf("%x %s %s\n", + addr, + strconv.FormatUint(size, 16), + name, + )) + if err != nil { + panic(err) + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/perfmap_disabled.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/perfmap_disabled.go new file mode 100644 index 000000000..bcc4e545c --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/perfmap_disabled.go @@ -0,0 +1,5 @@ +//go:build !perfmap + +package wazevoapi + +const PerfMapEnabled = false diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/perfmap_enabled.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/perfmap_enabled.go new file mode 100644 index 000000000..2a39879ec --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/perfmap_enabled.go @@ -0,0 +1,5 @@ +//go:build perfmap + +package wazevoapi + +const PerfMapEnabled = true diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/pool.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/pool.go new file mode 100644 index 000000000..3149fdc9e --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/pool.go @@ -0,0 +1,215 @@ +package wazevoapi + +const poolPageSize = 128 + +// Pool is a pool of T that can be allocated and reset. +// This is useful to avoid unnecessary allocations. +type Pool[T any] struct { + pages []*[poolPageSize]T + resetFn func(*T) + allocated, index int +} + +// NewPool returns a new Pool. +// resetFn is called when a new T is allocated in Pool.Allocate. 
+func NewPool[T any](resetFn func(*T)) Pool[T] { + var ret Pool[T] + ret.resetFn = resetFn + ret.Reset() + return ret +} + +// Allocated returns the number of allocated T currently in the pool. +func (p *Pool[T]) Allocated() int { + return p.allocated +} + +// Allocate allocates a new T from the pool. +func (p *Pool[T]) Allocate() *T { + if p.index == poolPageSize { + if len(p.pages) == cap(p.pages) { + p.pages = append(p.pages, new([poolPageSize]T)) + } else { + i := len(p.pages) + p.pages = p.pages[:i+1] + if p.pages[i] == nil { + p.pages[i] = new([poolPageSize]T) + } + } + p.index = 0 + } + ret := &p.pages[len(p.pages)-1][p.index] + if p.resetFn != nil { + p.resetFn(ret) + } + p.index++ + p.allocated++ + return ret +} + +// View returns the pointer to i-th item from the pool. +func (p *Pool[T]) View(i int) *T { + page, index := i/poolPageSize, i%poolPageSize + return &p.pages[page][index] +} + +// Reset resets the pool. +func (p *Pool[T]) Reset() { + p.pages = p.pages[:0] + p.index = poolPageSize + p.allocated = 0 +} + +// IDedPool is a pool of T that can be allocated and reset, with a way to get T by an ID. +type IDedPool[T any] struct { + pool Pool[T] + idToItems []*T + maxIDEncountered int +} + +// NewIDedPool returns a new IDedPool. +func NewIDedPool[T any](resetFn func(*T)) IDedPool[T] { + return IDedPool[T]{pool: NewPool[T](resetFn)} +} + +// GetOrAllocate returns the T with the given id. +func (p *IDedPool[T]) GetOrAllocate(id int) *T { + if p.maxIDEncountered < id { + p.maxIDEncountered = id + } + if id >= len(p.idToItems) { + p.idToItems = append(p.idToItems, make([]*T, id-len(p.idToItems)+1)...) + } + if p.idToItems[id] == nil { + p.idToItems[id] = p.pool.Allocate() + } + return p.idToItems[id] +} + +// Get returns the T with the given id, or nil if it's not allocated. +func (p *IDedPool[T]) Get(id int) *T { + if id >= len(p.idToItems) { + return nil + } + return p.idToItems[id] +} + +// Reset resets the pool. +func (p *IDedPool[T]) Reset() { + p.pool.Reset() + for i := range p.idToItems { + p.idToItems[i] = nil + } + p.maxIDEncountered = -1 +} + +// MaxIDEncountered returns the maximum id encountered so far. +func (p *IDedPool[T]) MaxIDEncountered() int { + return p.maxIDEncountered +} + +// arraySize is the size of the array used in VarLengthPool's arrayPool. +// This is chosen to be 8, which is empirically a good number among 8, 12, 16 and 20. +const arraySize = 8 + +// VarLengthPool is a pool of VarLength[T] that can be allocated and reset. +type ( + VarLengthPool[T any] struct { + arrayPool Pool[varLengthPoolArray[T]] + slicePool Pool[[]T] + } + // varLengthPoolArray wraps an array and keeps track of the next index to be used to avoid the heap allocation. + varLengthPoolArray[T any] struct { + arr [arraySize]T + next int + } +) + +// VarLength is a variable length array that can be reused via a pool. +type VarLength[T any] struct { + arr *varLengthPoolArray[T] + slc *[]T +} + +// NewVarLengthPool returns a new VarLengthPool. +func NewVarLengthPool[T any]() VarLengthPool[T] { + return VarLengthPool[T]{ + arrayPool: NewPool[varLengthPoolArray[T]](func(v *varLengthPoolArray[T]) { + v.next = 0 + }), + slicePool: NewPool[[]T](func(i *[]T) { + *i = (*i)[:0] + }), + } +} + +// NewNilVarLength returns a new VarLength[T] with a nil backing. +func NewNilVarLength[T any]() VarLength[T] { + return VarLength[T]{} +} + +// Allocate allocates a new VarLength[T] from the pool. 
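+// knownMin is a size hint: requests of at most arraySize elements are backed by a pooled fixed-size array, and larger ones by a pooled slice.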
+func (p *VarLengthPool[T]) Allocate(knownMin int) VarLength[T] { + if knownMin <= arraySize { + arr := p.arrayPool.Allocate() + return VarLength[T]{arr: arr} + } + slc := p.slicePool.Allocate() + return VarLength[T]{slc: slc} +} + +// Reset resets the pool. +func (p *VarLengthPool[T]) Reset() { + p.arrayPool.Reset() + p.slicePool.Reset() +} + +// Append appends items to the backing slice just like the `append` builtin function in Go. +func (i VarLength[T]) Append(p *VarLengthPool[T], items ...T) VarLength[T] { + if i.slc != nil { + *i.slc = append(*i.slc, items...) + return i + } + + if i.arr == nil { + i.arr = p.arrayPool.Allocate() + } + + arr := i.arr + if arr.next+len(items) <= arraySize { + for _, item := range items { + arr.arr[arr.next] = item + arr.next++ + } + } else { + slc := p.slicePool.Allocate() + // Copy the array to the slice. + for ptr := 0; ptr < arr.next; ptr++ { + *slc = append(*slc, arr.arr[ptr]) + } + i.slc = slc + *i.slc = append(*i.slc, items...) + } + return i +} + +// View returns the backing slice. +func (i VarLength[T]) View() []T { + if i.slc != nil { + return *i.slc + } else if i.arr != nil { + arr := i.arr + return arr.arr[:arr.next] + } + return nil +} + +// Cut cuts the backing slice to the given length. +// Precondition: n <= len(i.backing). +func (i VarLength[T]) Cut(n int) { + if i.slc != nil { + *i.slc = (*i.slc)[:n] + } else if i.arr != nil { + i.arr.next = n + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/ptr.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/ptr.go new file mode 100644 index 000000000..f21e1a5d8 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/ptr.go @@ -0,0 +1,15 @@ +package wazevoapi + +import "unsafe" + +// PtrFromUintptr resurrects the original *T from the given uintptr. +// The caller of this function MUST be sure that ptr is valid. +func PtrFromUintptr[T any](ptr uintptr) *T { + // Wraps ptrs as the double pointer in order to avoid the unsafe access as detected by race detector. + // + // For example, if we have (*function)(unsafe.Pointer(ptr)) instead, then the race detector's "checkptr" + // subroutine wanrs as "checkptr: pointer arithmetic result points to invalid allocation" + // https://github.com/golang/go/blob/1ce7fcf139417d618c2730010ede2afb41664211/src/runtime/checkptr.go#L69 + var wrapped *uintptr = &ptr + return *(**T)(unsafe.Pointer(wrapped)) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/queue.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/queue.go new file mode 100644 index 000000000..e3118fa69 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/queue.go @@ -0,0 +1,26 @@ +package wazevoapi + +// Queue is the resettable queue where the underlying slice is reused. 
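+// The zero value is ready to use.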
+type Queue[T any] struct { + index int + Data []T +} + +func (q *Queue[T]) Enqueue(v T) { + q.Data = append(q.Data, v) +} + +func (q *Queue[T]) Dequeue() (ret T) { + ret = q.Data[q.index] + q.index++ + return +} + +func (q *Queue[T]) Empty() bool { + return q.index >= len(q.Data) +} + +func (q *Queue[T]) Reset() { + q.index = 0 + q.Data = q.Data[:0] +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/resetmap.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/resetmap.go new file mode 100644 index 000000000..7177fbb4b --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/resetmap.go @@ -0,0 +1,13 @@ +package wazevoapi + +// ResetMap resets the map to an empty state, or creates a new map if it is nil. +func ResetMap[K comparable, V any](m map[K]V) map[K]V { + if m == nil { + m = make(map[K]V) + } else { + for v := range m { + delete(m, v) + } + } + return m +} |
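The wazevoapi helpers above (Pool, VarLengthPool, Queue, ResetMap) all follow the same reset-and-reuse pattern: allocate scratch data during one compilation, then call Reset so the backing storage is kept for the next one. The sketch below is illustrative only and is not part of the vendored package; the node type and the exampleReuse function are hypothetical, and the code is assumed to live inside package wazevoapi since these APIs are internal.

package wazevoapi

// node is a hypothetical element type used only for this illustration.
type node struct{ id int }

// exampleReuse sketches the intended allocate -> use -> Reset cycle.
func exampleReuse() {
	pool := NewPool[node](func(n *node) { n.id = 0 }) // resetFn runs on every Allocate.
	var q Queue[*node]                                // the zero value is usable.
	vp := NewVarLengthPool[int]()
	seen := map[int]struct{}{}

	for iteration := 0; iteration < 2; iteration++ {
		a, b := pool.Allocate(), pool.Allocate()
		a.id, b.id = 1, 2

		q.Enqueue(a)
		q.Enqueue(b)
		for !q.Empty() {
			n := q.Dequeue()
			seen[n.id] = struct{}{}
		}

		// VarLength keeps small lists in a pooled fixed-size array and only
		// spills to a pooled slice once they outgrow arraySize elements.
		vs := vp.Allocate(2)
		vs = vs.Append(&vp, 10, 20, 30)
		_ = vs.View() // [10 20 30]

		// Reset everything: the underlying arrays, slices, and map storage
		// are retained for the next iteration instead of being reallocated.
		pool.Reset()
		q.Reset()
		vp.Reset()
		seen = ResetMap(seen)
	}
}

As with Pool's own comment, the point of the pattern is to avoid re-allocating per-compilation scratch data for every function being compiled.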