author    Daenney <daenney@users.noreply.github.com>  2024-06-12 14:21:34 +0200
committer GitHub <noreply@github.com>  2024-06-12 13:21:34 +0100
commit    978b4176f1a31a497aaadd33f21659b318832c95 (patch)
tree      8ab36617b993a457af5d2975bedaa63a57031ff3 /vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend
parent    [bugfix] Correct Swagger path for poll voting (#2996) (diff)
download  gotosocial-978b4176f1a31a497aaadd33f21659b318832c95.tar.xz
[chore] Upgrade wasm-sqlite to v0.16.2 (#2997)
Diffstat (limited to 'vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend')
-rw-r--r--  vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go | 28
-rw-r--r--  vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go | 16
-rw-r--r--  vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go | 30
-rw-r--r--  vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go | 29
-rw-r--r--  vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go | 119
-rw-r--r--  vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go | 438
-rw-r--r--  vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go | 136
-rw-r--r--  vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go | 6
-rw-r--r--  vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go | 379
-rw-r--r--  vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go | 59
-rw-r--r--  vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go | 17
-rw-r--r--  vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go | 51
-rw-r--r--  vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go | 10
-rw-r--r--  vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go | 12
-rw-r--r--  vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go | 140
-rw-r--r--  vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go | 44
16 files changed, 775 insertions, 739 deletions
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go
index 81c6a6b62..8e9571b20 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go
@@ -43,7 +43,7 @@ type ExecutableContextT[Instr any] struct {
labelPositionPool wazevoapi.Pool[LabelPosition[Instr]]
NextLabel Label
// LabelPositions maps a label to the instructions of the region which the label represents.
- LabelPositions map[Label]*LabelPosition[Instr]
+ LabelPositions []*LabelPosition[Instr]
OrderedBlockLabels []*LabelPosition[Instr]
// PerBlockHead and PerBlockEnd are the head and tail of the instruction list per currently-compiled ssa.BasicBlock.
@@ -67,7 +67,6 @@ func NewExecutableContextT[Instr any](
setNext: setNext,
setPrev: setPrev,
labelPositionPool: wazevoapi.NewPool[LabelPosition[Instr]](resetLabelPosition[Instr]),
- LabelPositions: make(map[Label]*LabelPosition[Instr]),
NextLabel: LabelInvalid,
}
}
@@ -97,11 +96,7 @@ func (e *ExecutableContextT[Instr]) StartBlock(blk ssa.BasicBlock) {
end := e.allocateNop0()
e.PerBlockHead, e.PerBlockEnd = end, end
- labelPos, ok := e.LabelPositions[l]
- if !ok {
- labelPos = e.AllocateLabelPosition(l)
- e.LabelPositions[l] = labelPos
- }
+ labelPos := e.GetOrAllocateLabelPosition(l)
e.OrderedBlockLabels = append(e.OrderedBlockLabels, labelPos)
labelPos.Begin, labelPos.End = end, end
labelPos.SB = blk
@@ -146,8 +141,8 @@ func (e *ExecutableContextT[T]) FlushPendingInstructions() {
func (e *ExecutableContextT[T]) Reset() {
e.labelPositionPool.Reset()
e.InstructionPool.Reset()
- for l := Label(0); l <= e.NextLabel; l++ {
- delete(e.LabelPositions, l)
+ for i := range e.LabelPositions {
+ e.LabelPositions[i] = nil
}
e.PendingInstructions = e.PendingInstructions[:0]
e.OrderedBlockLabels = e.OrderedBlockLabels[:0]
@@ -163,10 +158,17 @@ func (e *ExecutableContextT[T]) AllocateLabel() Label {
return e.NextLabel
}
-func (e *ExecutableContextT[T]) AllocateLabelPosition(la Label) *LabelPosition[T] {
- l := e.labelPositionPool.Allocate()
- l.L = la
- return l
+func (e *ExecutableContextT[T]) GetOrAllocateLabelPosition(l Label) *LabelPosition[T] {
+ if len(e.LabelPositions) <= int(l) {
+ e.LabelPositions = append(e.LabelPositions, make([]*LabelPosition[T], int(l)+1-len(e.LabelPositions))...)
+ }
+ ret := e.LabelPositions[l]
+ if ret == nil {
+ ret = e.labelPositionPool.Allocate()
+ ret.L = l
+ e.LabelPositions[l] = ret
+ }
+ return ret
}
func (e *ExecutableContextT[T]) GetOrAllocateSSABlockLabel(blk ssa.BasicBlock) Label {
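
Aside: the executable_context.go hunks above swap the map keyed by Label for a slice indexed by the label's integer value, growing it on demand and allocating entries lazily. A minimal, self-contained sketch of that get-or-allocate pattern (simplified types, no wazevoapi.Pool; the names here are illustrative only):

package main

import "fmt"

type Label int

type LabelPosition struct{ L Label }

// positions is a dense slice indexed by Label; a nil entry means the label
// has no position allocated yet.
var positions []*LabelPosition

// getOrAllocate grows the slice just enough to cover l, then lazily creates
// the entry, mirroring GetOrAllocateLabelPosition in the diff.
func getOrAllocate(l Label) *LabelPosition {
	if len(positions) <= int(l) {
		positions = append(positions, make([]*LabelPosition, int(l)+1-len(positions))...)
	}
	ret := positions[l]
	if ret == nil {
		ret = &LabelPosition{L: l}
		positions[l] = ret
	}
	return ret
}

func main() {
	p := getOrAllocate(5)
	fmt.Println(p.L, len(positions)) // 5 6
}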
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go
index 310ad2203..61ae6f406 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go
@@ -1906,8 +1906,10 @@ func (m *machine) InsertMove(dst, src regalloc.VReg, typ ssa.Type) {
func (m *machine) Format() string {
ectx := m.ectx
begins := map[*instruction]backend.Label{}
- for l, pos := range ectx.LabelPositions {
- begins[pos.Begin] = l
+ for _, pos := range ectx.LabelPositions {
+ if pos != nil {
+ begins[pos.Begin] = pos.L
+ }
}
irBlocks := map[backend.Label]ssa.BasicBlockID{}
@@ -1950,7 +1952,10 @@ func (m *machine) encodeWithoutSSA(root *instruction) {
offset := int64(len(*bufPtr))
if cur.kind == nop0 {
l := cur.nop0Label()
- if pos, ok := ectx.LabelPositions[l]; ok {
+ if int(l) >= len(ectx.LabelPositions) {
+ continue
+ }
+ if pos := ectx.LabelPositions[l]; pos != nil {
pos.BinaryOffset = offset
}
}
@@ -2005,7 +2010,7 @@ func (m *machine) Encode(ctx context.Context) (err error) {
switch cur.kind {
case nop0:
l := cur.nop0Label()
- if pos, ok := ectx.LabelPositions[l]; ok {
+ if pos := ectx.LabelPositions[l]; pos != nil {
pos.BinaryOffset = offset
}
case sourceOffsetInfo:
@@ -2165,8 +2170,7 @@ func (m *machine) allocateBrTarget() (nop *instruction, l backend.Label) { //nol
func (m *machine) allocateLabel() *labelPosition {
ectx := m.ectx
l := ectx.AllocateLabel()
- pos := ectx.AllocateLabelPosition(l)
- ectx.LabelPositions[l] = pos
+ pos := ectx.GetOrAllocateLabelPosition(l)
return pos
}
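
Aside: on the consumer side (Format, encodeWithoutSSA, Encode above), the old map lookup with the ok idiom becomes a bounds check plus a nil check against the dense slice. A tiny sketch of that guard, using stand-in types rather than the real backend ones:

package main

import "fmt"

type label int

type labelPosition struct{ binaryOffset int64 }

// lookup returns the position for l, or nil when the label is out of range
// or was never allocated, the two cases the old map collapsed into !ok.
func lookup(positions []*labelPosition, l label) *labelPosition {
	if int(l) >= len(positions) {
		return nil
	}
	return positions[l] // may still be nil for an unallocated label
}

func main() {
	ps := []*labelPosition{nil, {binaryOffset: 8}}
	fmt.Println(lookup(ps, 1).binaryOffset, lookup(ps, 5) == nil) // 8 true
}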
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go
index 6615471c6..4eaa13ce1 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go
@@ -101,13 +101,14 @@ func (m *machine) LowerParams(args []ssa.Value) {
bits := arg.Type.Bits()
// At this point of compilation, we don't yet know how much space exist below the return address.
// So we instruct the address mode to add the `argStackOffset` to the offset at the later phase of compilation.
- amode := addressMode{imm: arg.Offset, rn: spVReg, kind: addressModeKindArgStackSpace}
+ amode := m.amodePool.Allocate()
+ *amode = addressMode{imm: arg.Offset, rn: spVReg, kind: addressModeKindArgStackSpace}
load := m.allocateInstr()
switch arg.Type {
case ssa.TypeI32, ssa.TypeI64:
- load.asULoad(operandNR(reg), amode, bits)
+ load.asULoad(reg, amode, bits)
case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
- load.asFpuLoad(operandNR(reg), amode, bits)
+ load.asFpuLoad(reg, amode, bits)
default:
panic("BUG")
}
@@ -169,7 +170,8 @@ func (m *machine) LowerReturns(rets []ssa.Value) {
// At this point of compilation, we don't yet know how much space exist below the return address.
// So we instruct the address mode to add the `retStackOffset` to the offset at the later phase of compilation.
- amode := addressMode{imm: r.Offset, rn: spVReg, kind: addressModeKindResultStackSpace}
+ amode := m.amodePool.Allocate()
+ *amode = addressMode{imm: r.Offset, rn: spVReg, kind: addressModeKindResultStackSpace}
store := m.allocateInstr()
store.asStore(operandNR(reg), amode, bits)
m.insert(store)
@@ -215,9 +217,9 @@ func (m *machine) callerGenFunctionReturnVReg(a *backend.FunctionABI, retIndex i
ldr := m.allocateInstr()
switch r.Type {
case ssa.TypeI32, ssa.TypeI64:
- ldr.asULoad(operandNR(reg), amode, r.Type.Bits())
+ ldr.asULoad(reg, amode, r.Type.Bits())
case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
- ldr.asFpuLoad(operandNR(reg), amode, r.Type.Bits())
+ ldr.asFpuLoad(reg, amode, r.Type.Bits())
default:
panic("BUG")
}
@@ -225,7 +227,7 @@ func (m *machine) callerGenFunctionReturnVReg(a *backend.FunctionABI, retIndex i
}
}
-func (m *machine) resolveAddressModeForOffsetAndInsert(cur *instruction, offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) (*instruction, addressMode) {
+func (m *machine) resolveAddressModeForOffsetAndInsert(cur *instruction, offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) (*instruction, *addressMode) {
exct := m.executableContext
exct.PendingInstructions = exct.PendingInstructions[:0]
mode := m.resolveAddressModeForOffset(offset, dstBits, rn, allowTmpRegUse)
@@ -235,15 +237,15 @@ func (m *machine) resolveAddressModeForOffsetAndInsert(cur *instruction, offset
return cur, mode
}
-func (m *machine) resolveAddressModeForOffset(offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) addressMode {
+func (m *machine) resolveAddressModeForOffset(offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) *addressMode {
if rn.RegType() != regalloc.RegTypeInt {
panic("BUG: rn should be a pointer: " + formatVRegSized(rn, 64))
}
- var amode addressMode
+ amode := m.amodePool.Allocate()
if offsetFitsInAddressModeKindRegUnsignedImm12(dstBits, offset) {
- amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: rn, imm: offset}
+ *amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: rn, imm: offset}
} else if offsetFitsInAddressModeKindRegSignedImm9(offset) {
- amode = addressMode{kind: addressModeKindRegSignedImm9, rn: rn, imm: offset}
+ *amode = addressMode{kind: addressModeKindRegSignedImm9, rn: rn, imm: offset}
} else {
var indexReg regalloc.VReg
if allowTmpRegUse {
@@ -253,7 +255,7 @@ func (m *machine) resolveAddressModeForOffset(offset int64, dstBits byte, rn reg
indexReg = m.compiler.AllocateVReg(ssa.TypeI64)
m.lowerConstantI64(indexReg, offset)
}
- amode = addressMode{kind: addressModeKindRegReg, rn: rn, rm: indexReg, extOp: extendOpUXTX /* indicates index rm is 64-bit */}
+ *amode = addressMode{kind: addressModeKindRegReg, rn: rn, rm: indexReg, extOp: extendOpUXTX /* indicates index rm is 64-bit */}
}
return amode
}
@@ -315,7 +317,7 @@ func (m *machine) insertAddOrSubStackPointer(rd regalloc.VReg, diff int64, add b
} else {
ao = aluOpSub
}
- alu.asALU(ao, operandNR(rd), operandNR(spVReg), imm12Operand, true)
+ alu.asALU(ao, rd, operandNR(spVReg), imm12Operand, true)
m.insert(alu)
} else {
m.lowerConstantI64(tmpRegVReg, diff)
@@ -326,7 +328,7 @@ func (m *machine) insertAddOrSubStackPointer(rd regalloc.VReg, diff int64, add b
} else {
ao = aluOpSub
}
- alu.asALU(ao, operandNR(rd), operandNR(spVReg), operandNR(tmpRegVReg), true)
+ alu.asALU(ao, rd, operandNR(spVReg), operandNR(tmpRegVReg), true)
m.insert(alu)
}
}
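
Aside: the arm64 changes above stop embedding addressMode by value in each instruction and instead hand out pool-allocated pointers, overwriting the whole struct in place. A rough sketch of the allocate-then-assign shape, using a trivial stand-in pool instead of the real wazevoapi.Pool (a real pool keeps returned pointers stable; this one only illustrates the call pattern):

package main

import "fmt"

type addressMode struct {
	kind byte
	imm  int64
}

// amodePool hands out reusable *addressMode values so hot paths avoid a heap
// allocation per instruction; Reset lets the current backing array be reused.
type amodePool struct{ buf []addressMode }

func (p *amodePool) Allocate() *addressMode {
	p.buf = append(p.buf, addressMode{})
	return &p.buf[len(p.buf)-1]
}

func (p *amodePool) Reset() { p.buf = p.buf[:0] }

func main() {
	var p amodePool
	amode := p.Allocate()
	*amode = addressMode{kind: 1, imm: 16} // overwrite the whole struct, as the hunks do
	fmt.Println(amode.kind, amode.imm)     // 1 16
}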
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go
index 7a9cceb33..f8b5d97ac 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go
@@ -59,25 +59,26 @@ func (m *machine) goEntryPreamblePassArg(cur *instruction, paramSlicePtr regallo
} else {
postIndexImm = 8
}
- loadMode := addressMode{kind: addressModeKindPostIndex, rn: paramSlicePtr, imm: postIndexImm}
+ loadMode := m.amodePool.Allocate()
+ *loadMode = addressMode{kind: addressModeKindPostIndex, rn: paramSlicePtr, imm: postIndexImm}
instr := m.allocateInstr()
switch typ {
case ssa.TypeI32:
- instr.asULoad(loadTargetReg, loadMode, 32)
+ instr.asULoad(loadTargetReg.reg(), loadMode, 32)
case ssa.TypeI64:
- instr.asULoad(loadTargetReg, loadMode, 64)
+ instr.asULoad(loadTargetReg.reg(), loadMode, 64)
case ssa.TypeF32:
- instr.asFpuLoad(loadTargetReg, loadMode, 32)
+ instr.asFpuLoad(loadTargetReg.reg(), loadMode, 32)
case ssa.TypeF64:
- instr.asFpuLoad(loadTargetReg, loadMode, 64)
+ instr.asFpuLoad(loadTargetReg.reg(), loadMode, 64)
case ssa.TypeV128:
- instr.asFpuLoad(loadTargetReg, loadMode, 128)
+ instr.asFpuLoad(loadTargetReg.reg(), loadMode, 128)
}
cur = linkInstr(cur, instr)
if isStackArg {
- var storeMode addressMode
+ var storeMode *addressMode
cur, storeMode = m.resolveAddressModeForOffsetAndInsert(cur, argStartOffsetFromSP+arg.Offset, bits, spVReg, true)
toStack := m.allocateInstr()
toStack.asStore(loadTargetReg, storeMode, bits)
@@ -113,21 +114,22 @@ func (m *machine) goEntryPreamblePassResult(cur *instruction, resultSlicePtr reg
}
if isStackArg {
- var loadMode addressMode
+ var loadMode *addressMode
cur, loadMode = m.resolveAddressModeForOffsetAndInsert(cur, resultStartOffsetFromSP+result.Offset, bits, spVReg, true)
toReg := m.allocateInstr()
switch typ {
case ssa.TypeI32, ssa.TypeI64:
- toReg.asULoad(storeTargetReg, loadMode, bits)
+ toReg.asULoad(storeTargetReg.reg(), loadMode, bits)
case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
- toReg.asFpuLoad(storeTargetReg, loadMode, bits)
+ toReg.asFpuLoad(storeTargetReg.reg(), loadMode, bits)
default:
panic("TODO?")
}
cur = linkInstr(cur, toReg)
}
- mode := addressMode{kind: addressModeKindPostIndex, rn: resultSlicePtr, imm: postIndexImm}
+ mode := m.amodePool.Allocate()
+ *mode = addressMode{kind: addressModeKindPostIndex, rn: resultSlicePtr, imm: postIndexImm}
instr := m.allocateInstr()
instr.asStore(storeTargetReg, mode, bits)
cur = linkInstr(cur, instr)
@@ -214,11 +216,12 @@ func (m *machine) move64(dst, src regalloc.VReg, prev *instruction) *instruction
func (m *machine) loadOrStoreAtExecutionContext(d regalloc.VReg, offset wazevoapi.Offset, store bool, prev *instruction) *instruction {
instr := m.allocateInstr()
- mode := addressMode{kind: addressModeKindRegUnsignedImm12, rn: savedExecutionContextPtr, imm: offset.I64()}
+ mode := m.amodePool.Allocate()
+ *mode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: savedExecutionContextPtr, imm: offset.I64()}
if store {
instr.asStore(operandNR(d), mode, 64)
} else {
- instr.asULoad(operandNR(d), mode, 64)
+ instr.asULoad(d, mode, 64)
}
return linkInstr(prev, instr)
}
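
Aside: goEntryPreamblePassArg and goEntryPreamblePassResult walk the Go-side parameter and result buffers with post-index address modes: read at the cursor, then bump the cursor by the slot size (8 bytes, or 16 for v128). In plain Go terms the traversal amounts to something like this (illustrative only; each slot is shown as a single uint64):

package main

import "fmt"

// postIndexWalk mimics post-index addressing over a packed argument buffer:
// read the value at the cursor, then advance the cursor by the slot count.
func postIndexWalk(buf []uint64, slotsPerArg []int) []uint64 {
	var out []uint64
	cursor := 0
	for _, slots := range slotsPerArg {
		out = append(out, buf[cursor]) // load at the current "address"
		cursor += slots                // post-increment, 8*slots bytes in the real code
	}
	return out
}

func main() {
	buf := []uint64{1, 2, 0, 3} // an i64, a v128 occupying two slots, another i64
	fmt.Println(postIndexWalk(buf, []int{1, 2, 1})) // [1 2 3]
}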
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go
index 466b1f960..99e6bb482 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go
@@ -87,7 +87,8 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *
// Module context is always the second argument.
moduleCtrPtr := x1VReg
store := m.allocateInstr()
- amode := addressMode{kind: addressModeKindRegUnsignedImm12, rn: execCtrPtr, imm: offset}
+ amode := m.amodePool.Allocate()
+ *amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: execCtrPtr, imm: offset}
store.asStore(operandNR(moduleCtrPtr), amode, 64)
cur = linkInstr(cur, store)
}
@@ -120,11 +121,9 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *
} else {
sizeInBits = 64
}
- store.asStore(operandNR(v),
- addressMode{
- kind: addressModeKindPostIndex,
- rn: arg0ret0AddrReg, imm: int64(sizeInBits / 8),
- }, sizeInBits)
+ amode := m.amodePool.Allocate()
+ *amode = addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg, imm: int64(sizeInBits / 8)}
+ store.asStore(operandNR(v), amode, sizeInBits)
cur = linkInstr(cur, store)
}
@@ -139,7 +138,7 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *
frameSizeReg = xzrVReg
sliceSizeReg = xzrVReg
}
- _amode := addressModePreOrPostIndex(spVReg, -16, true)
+ _amode := addressModePreOrPostIndex(m, spVReg, -16, true)
storeP := m.allocateInstr()
storeP.asStorePair64(frameSizeReg, sliceSizeReg, _amode)
cur = linkInstr(cur, storeP)
@@ -165,8 +164,8 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *
cur = m.addsAddOrSubStackPointer(cur, spVReg, frameInfoSize+goCallStackSize, true)
ldr := m.allocateInstr()
// And load the return address.
- ldr.asULoad(operandNR(lrVReg),
- addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64)
+ amode := addressModePreOrPostIndex(m, spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */)
+ ldr.asULoad(lrVReg, amode, 64)
cur = linkInstr(cur, ldr)
originalRet0Reg := x17VReg // Caller save, so we can use it for whatever we want.
@@ -183,23 +182,24 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *
r := &abi.Rets[i]
if r.Kind == backend.ABIArgKindReg {
loadIntoReg := m.allocateInstr()
- mode := addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg}
+ mode := m.amodePool.Allocate()
+ *mode = addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg}
switch r.Type {
case ssa.TypeI32:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
- loadIntoReg.asULoad(operandNR(r.Reg), mode, 32)
+ loadIntoReg.asULoad(r.Reg, mode, 32)
case ssa.TypeI64:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
- loadIntoReg.asULoad(operandNR(r.Reg), mode, 64)
+ loadIntoReg.asULoad(r.Reg, mode, 64)
case ssa.TypeF32:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
- loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 32)
+ loadIntoReg.asFpuLoad(r.Reg, mode, 32)
case ssa.TypeF64:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
- loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 64)
+ loadIntoReg.asFpuLoad(r.Reg, mode, 64)
case ssa.TypeV128:
mode.imm = 16
- loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 128)
+ loadIntoReg.asFpuLoad(r.Reg, mode, 128)
default:
panic("TODO")
}
@@ -208,28 +208,29 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *
// First we need to load the value to a temporary just like ^^.
intTmp, floatTmp := x11VReg, v11VReg
loadIntoTmpReg := m.allocateInstr()
- mode := addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg}
+ mode := m.amodePool.Allocate()
+ *mode = addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg}
var resultReg regalloc.VReg
switch r.Type {
case ssa.TypeI32:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
- loadIntoTmpReg.asULoad(operandNR(intTmp), mode, 32)
+ loadIntoTmpReg.asULoad(intTmp, mode, 32)
resultReg = intTmp
case ssa.TypeI64:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
- loadIntoTmpReg.asULoad(operandNR(intTmp), mode, 64)
+ loadIntoTmpReg.asULoad(intTmp, mode, 64)
resultReg = intTmp
case ssa.TypeF32:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
- loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 32)
+ loadIntoTmpReg.asFpuLoad(floatTmp, mode, 32)
resultReg = floatTmp
case ssa.TypeF64:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
- loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 64)
+ loadIntoTmpReg.asFpuLoad(floatTmp, mode, 64)
resultReg = floatTmp
case ssa.TypeV128:
mode.imm = 16
- loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 128)
+ loadIntoTmpReg.asFpuLoad(floatTmp, mode, 128)
resultReg = floatTmp
default:
panic("TODO")
@@ -258,12 +259,13 @@ func (m *machine) saveRegistersInExecutionContext(cur *instruction, regs []regal
case regalloc.RegTypeFloat:
sizeInBits = 128
}
- store.asStore(operandNR(v),
- addressMode{
- kind: addressModeKindRegUnsignedImm12,
- // Execution context is always the first argument.
- rn: x0VReg, imm: offset,
- }, sizeInBits)
+ mode := m.amodePool.Allocate()
+ *mode = addressMode{
+ kind: addressModeKindRegUnsignedImm12,
+ // Execution context is always the first argument.
+ rn: x0VReg, imm: offset,
+ }
+ store.asStore(operandNR(v), mode, sizeInBits)
store.prev = cur
cur.next = store
cur = store
@@ -276,7 +278,7 @@ func (m *machine) restoreRegistersInExecutionContext(cur *instruction, regs []re
offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64()
for _, v := range regs {
load := m.allocateInstr()
- var as func(dst operand, amode addressMode, sizeInBits byte)
+ var as func(dst regalloc.VReg, amode *addressMode, sizeInBits byte)
var sizeInBits byte
switch v.RegType() {
case regalloc.RegTypeInt:
@@ -286,12 +288,13 @@ func (m *machine) restoreRegistersInExecutionContext(cur *instruction, regs []re
as = load.asFpuLoad
sizeInBits = 128
}
- as(operandNR(v),
- addressMode{
- kind: addressModeKindRegUnsignedImm12,
- // Execution context is always the first argument.
- rn: x0VReg, imm: offset,
- }, sizeInBits)
+ mode := m.amodePool.Allocate()
+ *mode = addressMode{
+ kind: addressModeKindRegUnsignedImm12,
+ // Execution context is always the first argument.
+ rn: x0VReg, imm: offset,
+ }
+ as(v, mode, sizeInBits)
cur = linkInstr(cur, load)
offset += 16 // Imm12 must be aligned 16 for vector regs, so we unconditionally load regs at the offset of multiple of 16.
}
@@ -324,11 +327,9 @@ func (m *machine) setExitCode(cur *instruction, execCtr regalloc.VReg, exitCode
// Set the exit status on the execution context.
setExistStatus := m.allocateInstr()
- setExistStatus.asStore(operandNR(constReg),
- addressMode{
- kind: addressModeKindRegUnsignedImm12,
- rn: execCtr, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(),
- }, 32)
+ mode := m.amodePool.Allocate()
+ *mode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: execCtr, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64()}
+ setExistStatus.asStore(operandNR(constReg), mode, 32)
cur = linkInstr(cur, setExistStatus)
return cur
}
@@ -340,12 +341,13 @@ func (m *machine) storeReturnAddressAndExit(cur *instruction) *instruction {
cur = linkInstr(cur, adr)
storeReturnAddr := m.allocateInstr()
- storeReturnAddr.asStore(operandNR(tmpRegVReg),
- addressMode{
- kind: addressModeKindRegUnsignedImm12,
- // Execution context is always the first argument.
- rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(),
- }, 64)
+ mode := m.amodePool.Allocate()
+ *mode = addressMode{
+ kind: addressModeKindRegUnsignedImm12,
+ // Execution context is always the first argument.
+ rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(),
+ }
+ storeReturnAddr.asStore(operandNR(tmpRegVReg), mode, 64)
cur = linkInstr(cur, storeReturnAddr)
// Exit the execution.
@@ -364,11 +366,12 @@ func (m *machine) saveCurrentStackPointer(cur *instruction, execCtr regalloc.VRe
cur = linkInstr(cur, movSp)
strSp := m.allocateInstr()
- strSp.asStore(operandNR(tmpRegVReg),
- addressMode{
- kind: addressModeKindRegUnsignedImm12,
- rn: execCtr, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(),
- }, 64)
+ mode := m.amodePool.Allocate()
+ *mode = addressMode{
+ kind: addressModeKindRegUnsignedImm12,
+ rn: execCtr, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(),
+ }
+ strSp.asStore(operandNR(tmpRegVReg), mode, 64)
cur = linkInstr(cur, strSp)
return cur
}
@@ -376,27 +379,28 @@ func (m *machine) saveCurrentStackPointer(cur *instruction, execCtr regalloc.VRe
func (m *machine) goFunctionCallLoadStackArg(cur *instruction, originalArg0Reg regalloc.VReg, arg *backend.ABIArg, intVReg, floatVReg regalloc.VReg) (*instruction, regalloc.VReg) {
load := m.allocateInstr()
var result regalloc.VReg
- mode := addressMode{kind: addressModeKindPostIndex, rn: originalArg0Reg}
+ mode := m.amodePool.Allocate()
+ *mode = addressMode{kind: addressModeKindPostIndex, rn: originalArg0Reg}
switch arg.Type {
case ssa.TypeI32:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
- load.asULoad(operandNR(intVReg), mode, 32)
+ load.asULoad(intVReg, mode, 32)
result = intVReg
case ssa.TypeI64:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
- load.asULoad(operandNR(intVReg), mode, 64)
+ load.asULoad(intVReg, mode, 64)
result = intVReg
case ssa.TypeF32:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
- load.asFpuLoad(operandNR(floatVReg), mode, 32)
+ load.asFpuLoad(floatVReg, mode, 32)
result = floatVReg
case ssa.TypeF64:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
- load.asFpuLoad(operandNR(floatVReg), mode, 64)
+ load.asFpuLoad(floatVReg, mode, 64)
result = floatVReg
case ssa.TypeV128:
mode.imm = 16
- load.asFpuLoad(operandNR(floatVReg), mode, 128)
+ load.asFpuLoad(floatVReg, mode, 128)
result = floatVReg
default:
panic("TODO")
@@ -408,7 +412,8 @@ func (m *machine) goFunctionCallLoadStackArg(cur *instruction, originalArg0Reg r
func (m *machine) goFunctionCallStoreStackResult(cur *instruction, originalRet0Reg regalloc.VReg, result *backend.ABIArg, resultVReg regalloc.VReg) *instruction {
store := m.allocateInstr()
- mode := addressMode{kind: addressModeKindPostIndex, rn: originalRet0Reg}
+ mode := m.amodePool.Allocate()
+ *mode = addressMode{kind: addressModeKindPostIndex, rn: originalRet0Reg}
var sizeInBits byte
switch result.Type {
case ssa.TypeI32, ssa.TypeF32:
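
Aside: restoreRegistersInExecutionContext above now declares var as func(dst regalloc.VReg, amode *addressMode, sizeInBits byte) and assigns either load.asULoad or load.asFpuLoad before a single call site, i.e. dispatch through a bound method value. A self-contained sketch of that shape with made-up types:

package main

import "fmt"

type amode struct{ imm int64 }

type instr struct{ desc string }

func (i *instr) asULoad(dst int, a *amode, bits byte) {
	i.desc = fmt.Sprintf("uload x%d [sp+%d] %d-bit", dst, a.imm, bits)
}

func (i *instr) asFpuLoad(dst int, a *amode, bits byte) {
	i.desc = fmt.Sprintf("fpuload v%d [sp+%d] %d-bit", dst, a.imm, bits)
}

// emitLoad picks the constructor once as a bound method value, then calls it
// with the shared arguments, the same shape as the diff's var as func(...).
func emitLoad(isFloat bool, dst int, a *amode) *instr {
	i := &instr{}
	var as func(dst int, a *amode, bits byte)
	sizeInBits := byte(64)
	if isFloat {
		as = i.asFpuLoad
		sizeInBits = 128
	} else {
		as = i.asULoad
	}
	as(dst, a, sizeInBits)
	return i
}

func main() {
	fmt.Println(emitLoad(true, 11, &amode{imm: 32}).desc)  // fpuload v11 [sp+32] 128-bit
	fmt.Println(emitLoad(false, 17, &amode{imm: 16}).desc) // uload x17 [sp+16] 64-bit
}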
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go
index 8aabc5997..7121cb538 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go
@@ -3,10 +3,12 @@ package arm64
import (
"fmt"
"math"
+ "unsafe"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)
type (
@@ -22,9 +24,9 @@ type (
// TODO: optimize the layout later once the impl settles.
instruction struct {
prev, next *instruction
- u1, u2, u3 uint64
- rd, rm, rn, ra operand
- amode addressMode
+ u1, u2 uint64
+ rd regalloc.VReg
+ rm, rn operand
kind instructionKind
addedBeforeRegAlloc bool
}
@@ -174,7 +176,7 @@ func (i *instruction) Defs(regs *[]regalloc.VReg) []regalloc.VReg {
switch defKinds[i.kind] {
case defKindNone:
case defKindRD:
- *regs = append(*regs, i.rd.nr())
+ *regs = append(*regs, i.rd)
case defKindCall:
_, _, retIntRealRegs, retFloatRealRegs, _ := backend.ABIInfoFromUint64(i.u2)
for i := byte(0); i < retIntRealRegs; i++ {
@@ -194,7 +196,7 @@ func (i *instruction) AssignDef(reg regalloc.VReg) {
switch defKinds[i.kind] {
case defKindNone:
case defKindRD:
- i.rd = i.rd.assignReg(reg)
+ i.rd = reg
case defKindCall:
panic("BUG: call instructions shouldn't be assigned")
default:
@@ -329,7 +331,7 @@ func (i *instruction) Uses(regs *[]regalloc.VReg) []regalloc.VReg {
if rm := i.rm.reg(); rm.Valid() {
*regs = append(*regs, rm)
}
- if ra := i.ra.reg(); ra.Valid() {
+ if ra := regalloc.VReg(i.u2); ra.Valid() {
*regs = append(*regs, ra)
}
case useKindRNRN1RM:
@@ -341,18 +343,20 @@ func (i *instruction) Uses(regs *[]regalloc.VReg) []regalloc.VReg {
*regs = append(*regs, rm)
}
case useKindAMode:
- if amodeRN := i.amode.rn; amodeRN.Valid() {
+ amode := i.getAmode()
+ if amodeRN := amode.rn; amodeRN.Valid() {
*regs = append(*regs, amodeRN)
}
- if amodeRM := i.amode.rm; amodeRM.Valid() {
+ if amodeRM := amode.rm; amodeRM.Valid() {
*regs = append(*regs, amodeRM)
}
case useKindRNAMode:
*regs = append(*regs, i.rn.reg())
- if amodeRN := i.amode.rn; amodeRN.Valid() {
+ amode := i.getAmode()
+ if amodeRN := amode.rn; amodeRN.Valid() {
*regs = append(*regs, amodeRN)
}
- if amodeRM := i.amode.rm; amodeRM.Valid() {
+ if amodeRM := amode.rm; amodeRM.Valid() {
*regs = append(*regs, amodeRM)
}
case useKindCond:
@@ -374,7 +378,7 @@ func (i *instruction) Uses(regs *[]regalloc.VReg) []regalloc.VReg {
case useKindRDRewrite:
*regs = append(*regs, i.rn.reg())
*regs = append(*regs, i.rm.reg())
- *regs = append(*regs, i.rd.reg())
+ *regs = append(*regs, i.rd)
default:
panic(fmt.Sprintf("useKind for %v not defined", i))
}
@@ -408,8 +412,8 @@ func (i *instruction) AssignUse(index int, reg regalloc.VReg) {
i.rm = i.rm.assignReg(reg)
}
} else {
- if rd := i.rd.reg(); rd.Valid() {
- i.rd = i.rd.assignReg(reg)
+ if rd := i.rd; rd.Valid() {
+ i.rd = reg
}
}
case useKindRNRN1RM:
@@ -435,32 +439,36 @@ func (i *instruction) AssignUse(index int, reg regalloc.VReg) {
i.rm = i.rm.assignReg(reg)
}
} else {
- if ra := i.ra.reg(); ra.Valid() {
- i.ra = i.ra.assignReg(reg)
+ if ra := regalloc.VReg(i.u2); ra.Valid() {
+ i.u2 = uint64(reg)
}
}
case useKindAMode:
if index == 0 {
- if amodeRN := i.amode.rn; amodeRN.Valid() {
- i.amode.rn = reg
+ amode := i.getAmode()
+ if amodeRN := amode.rn; amodeRN.Valid() {
+ amode.rn = reg
}
} else {
- if amodeRM := i.amode.rm; amodeRM.Valid() {
- i.amode.rm = reg
+ amode := i.getAmode()
+ if amodeRM := amode.rm; amodeRM.Valid() {
+ amode.rm = reg
}
}
case useKindRNAMode:
if index == 0 {
i.rn = i.rn.assignReg(reg)
} else if index == 1 {
- if amodeRN := i.amode.rn; amodeRN.Valid() {
- i.amode.rn = reg
+ amode := i.getAmode()
+ if amodeRN := amode.rn; amodeRN.Valid() {
+ amode.rn = reg
} else {
panic("BUG")
}
} else {
- if amodeRM := i.amode.rm; amodeRM.Valid() {
- i.amode.rm = reg
+ amode := i.getAmode()
+ if amodeRM := amode.rm; amodeRM.Valid() {
+ amode.rm = reg
} else {
panic("BUG")
}
@@ -503,35 +511,35 @@ func (i *instruction) callFuncRef() ssa.FuncRef {
}
// shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false)
-func (i *instruction) asMOVZ(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) {
+func (i *instruction) asMOVZ(dst regalloc.VReg, imm uint64, shift uint32, dst64bit bool) {
i.kind = movZ
- i.rd = operandNR(dst)
+ i.rd = dst
i.u1 = imm
- i.u2 = shift
+ i.u2 = uint64(shift)
if dst64bit {
- i.u3 = 1
+ i.u2 |= 1 << 32
}
}
// shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false)
-func (i *instruction) asMOVK(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) {
+func (i *instruction) asMOVK(dst regalloc.VReg, imm uint64, shift uint32, dst64bit bool) {
i.kind = movK
- i.rd = operandNR(dst)
+ i.rd = dst
i.u1 = imm
- i.u2 = shift
+ i.u2 = uint64(shift)
if dst64bit {
- i.u3 = 1
+ i.u2 |= 1 << 32
}
}
// shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false)
-func (i *instruction) asMOVN(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) {
+func (i *instruction) asMOVN(dst regalloc.VReg, imm uint64, shift uint32, dst64bit bool) {
i.kind = movN
- i.rd = operandNR(dst)
+ i.rd = dst
i.u1 = imm
- i.u2 = shift
+ i.u2 = uint64(shift)
if dst64bit {
- i.u3 = 1
+ i.u2 |= 1 << 32
}
}
@@ -553,21 +561,21 @@ func (i *instruction) asRet() {
i.kind = ret
}
-func (i *instruction) asStorePair64(src1, src2 regalloc.VReg, amode addressMode) {
+func (i *instruction) asStorePair64(src1, src2 regalloc.VReg, amode *addressMode) {
i.kind = storeP64
i.rn = operandNR(src1)
i.rm = operandNR(src2)
- i.amode = amode
+ i.setAmode(amode)
}
-func (i *instruction) asLoadPair64(src1, src2 regalloc.VReg, amode addressMode) {
+func (i *instruction) asLoadPair64(src1, src2 regalloc.VReg, amode *addressMode) {
i.kind = loadP64
i.rn = operandNR(src1)
i.rm = operandNR(src2)
- i.amode = amode
+ i.setAmode(amode)
}
-func (i *instruction) asStore(src operand, amode addressMode, sizeInBits byte) {
+func (i *instruction) asStore(src operand, amode *addressMode, sizeInBits byte) {
switch sizeInBits {
case 8:
i.kind = store8
@@ -589,10 +597,10 @@ func (i *instruction) asStore(src operand, amode addressMode, sizeInBits byte) {
i.kind = fpuStore128
}
i.rn = src
- i.amode = amode
+ i.setAmode(amode)
}
-func (i *instruction) asSLoad(dst operand, amode addressMode, sizeInBits byte) {
+func (i *instruction) asSLoad(dst regalloc.VReg, amode *addressMode, sizeInBits byte) {
switch sizeInBits {
case 8:
i.kind = sLoad8
@@ -604,10 +612,10 @@ func (i *instruction) asSLoad(dst operand, amode addressMode, sizeInBits byte) {
panic("BUG")
}
i.rd = dst
- i.amode = amode
+ i.setAmode(amode)
}
-func (i *instruction) asULoad(dst operand, amode addressMode, sizeInBits byte) {
+func (i *instruction) asULoad(dst regalloc.VReg, amode *addressMode, sizeInBits byte) {
switch sizeInBits {
case 8:
i.kind = uLoad8
@@ -619,10 +627,10 @@ func (i *instruction) asULoad(dst operand, amode addressMode, sizeInBits byte) {
i.kind = uLoad64
}
i.rd = dst
- i.amode = amode
+ i.setAmode(amode)
}
-func (i *instruction) asFpuLoad(dst operand, amode addressMode, sizeInBits byte) {
+func (i *instruction) asFpuLoad(dst regalloc.VReg, amode *addressMode, sizeInBits byte) {
switch sizeInBits {
case 32:
i.kind = fpuLoad32
@@ -632,10 +640,18 @@ func (i *instruction) asFpuLoad(dst operand, amode addressMode, sizeInBits byte)
i.kind = fpuLoad128
}
i.rd = dst
- i.amode = amode
+ i.setAmode(amode)
}
-func (i *instruction) asVecLoad1R(rd, rn operand, arr vecArrangement) {
+func (i *instruction) getAmode() *addressMode {
+ return wazevoapi.PtrFromUintptr[addressMode](uintptr(i.u1))
+}
+
+func (i *instruction) setAmode(a *addressMode) {
+ i.u1 = uint64(uintptr(unsafe.Pointer(a)))
+}
+
+func (i *instruction) asVecLoad1R(rd regalloc.VReg, rn operand, arr vecArrangement) {
// NOTE: currently only has support for no-offset loads, though it is suspicious that
// we would need to support offset load (that is only available for post-index).
i.kind = vecLoad1R
@@ -646,32 +662,32 @@ func (i *instruction) asVecLoad1R(rd, rn operand, arr vecArrangement) {
func (i *instruction) asCSet(rd regalloc.VReg, mask bool, c condFlag) {
i.kind = cSet
- i.rd = operandNR(rd)
+ i.rd = rd
i.u1 = uint64(c)
if mask {
i.u2 = 1
}
}
-func (i *instruction) asCSel(rd, rn, rm operand, c condFlag, _64bit bool) {
+func (i *instruction) asCSel(rd regalloc.VReg, rn, rm operand, c condFlag, _64bit bool) {
i.kind = cSel
i.rd = rd
i.rn = rn
i.rm = rm
i.u1 = uint64(c)
if _64bit {
- i.u3 = 1
+ i.u2 = 1
}
}
-func (i *instruction) asFpuCSel(rd, rn, rm operand, c condFlag, _64bit bool) {
+func (i *instruction) asFpuCSel(rd regalloc.VReg, rn, rm operand, c condFlag, _64bit bool) {
i.kind = fpuCSel
i.rd = rd
i.rn = rn
i.rm = rm
i.u1 = uint64(c)
if _64bit {
- i.u3 = 1
+ i.u2 = 1
}
}
@@ -691,7 +707,7 @@ func (i *instruction) asBrTableSequence(indexReg regalloc.VReg, targetIndex, tar
}
func (i *instruction) brTableSequenceOffsetsResolved() {
- i.u3 = 1 // indicate that the offsets are resolved, for debugging.
+ i.rm.data = 1 // indicate that the offsets are resolved, for debugging.
}
func (i *instruction) brLabel() label {
@@ -701,7 +717,7 @@ func (i *instruction) brLabel() label {
// brOffsetResolved is called when the target label is resolved.
func (i *instruction) brOffsetResolve(offset int64) {
i.u2 = uint64(offset)
- i.u3 = 1 // indicate that the offset is resolved, for debugging.
+ i.rm.data = 1 // indicate that the offset is resolved, for debugging.
}
func (i *instruction) brOffset() int64 {
@@ -714,7 +730,7 @@ func (i *instruction) asCondBr(c cond, target label, is64bit bool) {
i.u1 = c.asUint64()
i.u2 = uint64(target)
if is64bit {
- i.u3 = 1
+ i.u2 |= 1 << 32
}
}
@@ -728,17 +744,17 @@ func (i *instruction) condBrLabel() label {
// condBrOffsetResolve is called when the target label is resolved.
func (i *instruction) condBrOffsetResolve(offset int64) {
- i.rd.data = uint64(offset)
- i.rd.data2 = 1 // indicate that the offset is resolved, for debugging.
+ i.rn.data = uint64(offset)
+ i.rn.data2 = 1 // indicate that the offset is resolved, for debugging.
}
// condBrOffsetResolved returns true if condBrOffsetResolve is already called.
func (i *instruction) condBrOffsetResolved() bool {
- return i.rd.data2 == 1
+ return i.rn.data2 == 1
}
func (i *instruction) condBrOffset() int64 {
- return int64(i.rd.data)
+ return int64(i.rn.data)
}
func (i *instruction) condBrCond() cond {
@@ -746,33 +762,33 @@ func (i *instruction) condBrCond() cond {
}
func (i *instruction) condBr64bit() bool {
- return i.u3 == 1
+ return i.u2&(1<<32) != 0
}
func (i *instruction) asLoadFpuConst32(rd regalloc.VReg, raw uint64) {
i.kind = loadFpuConst32
i.u1 = raw
- i.rd = operandNR(rd)
+ i.rd = rd
}
func (i *instruction) asLoadFpuConst64(rd regalloc.VReg, raw uint64) {
i.kind = loadFpuConst64
i.u1 = raw
- i.rd = operandNR(rd)
+ i.rd = rd
}
func (i *instruction) asLoadFpuConst128(rd regalloc.VReg, lo, hi uint64) {
i.kind = loadFpuConst128
i.u1 = lo
i.u2 = hi
- i.rd = operandNR(rd)
+ i.rd = rd
}
func (i *instruction) asFpuCmp(rn, rm operand, is64bit bool) {
i.kind = fpuCmp
i.rn, i.rm = rn, rm
if is64bit {
- i.u3 = 1
+ i.u1 = 1
}
}
@@ -783,12 +799,12 @@ func (i *instruction) asCCmpImm(rn operand, imm uint64, c condFlag, flag byte, i
i.u1 = uint64(c)
i.u2 = uint64(flag)
if is64bit {
- i.u3 = 1
+ i.u2 |= 1 << 32
}
}
// asALU setups a basic ALU instruction.
-func (i *instruction) asALU(aluOp aluOp, rd, rn, rm operand, dst64bit bool) {
+func (i *instruction) asALU(aluOp aluOp, rd regalloc.VReg, rn, rm operand, dst64bit bool) {
switch rm.kind {
case operandKindNR:
i.kind = aluRRR
@@ -804,22 +820,22 @@ func (i *instruction) asALU(aluOp aluOp, rd, rn, rm operand, dst64bit bool) {
i.u1 = uint64(aluOp)
i.rd, i.rn, i.rm = rd, rn, rm
if dst64bit {
- i.u3 = 1
+ i.u2 |= 1 << 32
}
}
// asALU setups a basic ALU instruction.
-func (i *instruction) asALURRRR(aluOp aluOp, rd, rn, rm, ra operand, dst64bit bool) {
+func (i *instruction) asALURRRR(aluOp aluOp, rd regalloc.VReg, rn, rm operand, ra regalloc.VReg, dst64bit bool) {
i.kind = aluRRRR
i.u1 = uint64(aluOp)
- i.rd, i.rn, i.rm, i.ra = rd, rn, rm, ra
+ i.rd, i.rn, i.rm, i.u2 = rd, rn, rm, uint64(ra)
if dst64bit {
- i.u3 = 1
+ i.u1 |= 1 << 32
}
}
// asALUShift setups a shift based ALU instruction.
-func (i *instruction) asALUShift(aluOp aluOp, rd, rn, rm operand, dst64bit bool) {
+func (i *instruction) asALUShift(aluOp aluOp, rd regalloc.VReg, rn, rm operand, dst64bit bool) {
switch rm.kind {
case operandKindNR:
i.kind = aluRRR // If the shift amount op is a register, then the instruction is encoded as a normal ALU instruction with two register operands.
@@ -831,17 +847,17 @@ func (i *instruction) asALUShift(aluOp aluOp, rd, rn, rm operand, dst64bit bool)
i.u1 = uint64(aluOp)
i.rd, i.rn, i.rm = rd, rn, rm
if dst64bit {
- i.u3 = 1
+ i.u2 |= 1 << 32
}
}
func (i *instruction) asALUBitmaskImm(aluOp aluOp, rd, rn regalloc.VReg, imm uint64, dst64bit bool) {
i.kind = aluRRBitmaskImm
i.u1 = uint64(aluOp)
- i.rn, i.rd = operandNR(rn), operandNR(rd)
+ i.rn, i.rd = operandNR(rn), rd
i.u2 = imm
if dst64bit {
- i.u3 = 1
+ i.u1 |= 1 << 32
}
}
@@ -852,76 +868,76 @@ func (i *instruction) asMovToFPSR(rn regalloc.VReg) {
func (i *instruction) asMovFromFPSR(rd regalloc.VReg) {
i.kind = movFromFPSR
- i.rd = operandNR(rd)
+ i.rd = rd
}
func (i *instruction) asBitRR(bitOp bitOp, rd, rn regalloc.VReg, is64bit bool) {
i.kind = bitRR
- i.rn, i.rd = operandNR(rn), operandNR(rd)
+ i.rn, i.rd = operandNR(rn), rd
i.u1 = uint64(bitOp)
if is64bit {
i.u2 = 1
}
}
-func (i *instruction) asFpuRRR(op fpuBinOp, rd, rn, rm operand, dst64bit bool) {
+func (i *instruction) asFpuRRR(op fpuBinOp, rd regalloc.VReg, rn, rm operand, dst64bit bool) {
i.kind = fpuRRR
i.u1 = uint64(op)
i.rd, i.rn, i.rm = rd, rn, rm
if dst64bit {
- i.u3 = 1
+ i.u2 = 1
}
}
-func (i *instruction) asFpuRR(op fpuUniOp, rd, rn operand, dst64bit bool) {
+func (i *instruction) asFpuRR(op fpuUniOp, rd regalloc.VReg, rn operand, dst64bit bool) {
i.kind = fpuRR
i.u1 = uint64(op)
i.rd, i.rn = rd, rn
if dst64bit {
- i.u3 = 1
+ i.u2 = 1
}
}
func (i *instruction) asExtend(rd, rn regalloc.VReg, fromBits, toBits byte, signed bool) {
i.kind = extend
- i.rn, i.rd = operandNR(rn), operandNR(rd)
+ i.rn, i.rd = operandNR(rn), rd
i.u1 = uint64(fromBits)
i.u2 = uint64(toBits)
if signed {
- i.u3 = 1
+ i.u2 |= 1 << 32
}
}
func (i *instruction) asMove32(rd, rn regalloc.VReg) {
i.kind = mov32
- i.rn, i.rd = operandNR(rn), operandNR(rd)
+ i.rn, i.rd = operandNR(rn), rd
}
func (i *instruction) asMove64(rd, rn regalloc.VReg) *instruction {
i.kind = mov64
- i.rn, i.rd = operandNR(rn), operandNR(rd)
+ i.rn, i.rd = operandNR(rn), rd
return i
}
func (i *instruction) asFpuMov64(rd, rn regalloc.VReg) {
i.kind = fpuMov64
- i.rn, i.rd = operandNR(rn), operandNR(rd)
+ i.rn, i.rd = operandNR(rn), rd
}
func (i *instruction) asFpuMov128(rd, rn regalloc.VReg) *instruction {
i.kind = fpuMov128
- i.rn, i.rd = operandNR(rn), operandNR(rd)
+ i.rn, i.rd = operandNR(rn), rd
return i
}
-func (i *instruction) asMovToVec(rd, rn operand, arr vecArrangement, index vecIndex) {
+func (i *instruction) asMovToVec(rd regalloc.VReg, rn operand, arr vecArrangement, index vecIndex) {
i.kind = movToVec
i.rd = rd
i.rn = rn
i.u1, i.u2 = uint64(arr), uint64(index)
}
-func (i *instruction) asMovFromVec(rd, rn operand, arr vecArrangement, index vecIndex, signed bool) {
+func (i *instruction) asMovFromVec(rd regalloc.VReg, rn operand, arr vecArrangement, index vecIndex, signed bool) {
if signed {
i.kind = movFromVecSigned
} else {
@@ -932,48 +948,48 @@ func (i *instruction) asMovFromVec(rd, rn operand, arr vecArrangement, index vec
i.u1, i.u2 = uint64(arr), uint64(index)
}
-func (i *instruction) asVecDup(rd, rn operand, arr vecArrangement) {
+func (i *instruction) asVecDup(rd regalloc.VReg, rn operand, arr vecArrangement) {
i.kind = vecDup
i.u1 = uint64(arr)
i.rn, i.rd = rn, rd
}
-func (i *instruction) asVecDupElement(rd, rn operand, arr vecArrangement, index vecIndex) {
+func (i *instruction) asVecDupElement(rd regalloc.VReg, rn operand, arr vecArrangement, index vecIndex) {
i.kind = vecDupElement
i.u1 = uint64(arr)
i.rn, i.rd = rn, rd
i.u2 = uint64(index)
}
-func (i *instruction) asVecExtract(rd, rn, rm operand, arr vecArrangement, index uint32) {
+func (i *instruction) asVecExtract(rd regalloc.VReg, rn, rm operand, arr vecArrangement, index uint32) {
i.kind = vecExtract
i.u1 = uint64(arr)
i.rn, i.rm, i.rd = rn, rm, rd
i.u2 = uint64(index)
}
-func (i *instruction) asVecMovElement(rd, rn operand, arr vecArrangement, rdIndex, rnIndex vecIndex) {
+func (i *instruction) asVecMovElement(rd regalloc.VReg, rn operand, arr vecArrangement, rdIndex, rnIndex vecIndex) {
i.kind = vecMovElement
i.u1 = uint64(arr)
- i.u2, i.u3 = uint64(rdIndex), uint64(rnIndex)
+ i.u2 = uint64(rdIndex) | uint64(rnIndex)<<32
i.rn, i.rd = rn, rd
}
-func (i *instruction) asVecMisc(op vecOp, rd, rn operand, arr vecArrangement) {
+func (i *instruction) asVecMisc(op vecOp, rd regalloc.VReg, rn operand, arr vecArrangement) {
i.kind = vecMisc
i.u1 = uint64(op)
i.rn, i.rd = rn, rd
i.u2 = uint64(arr)
}
-func (i *instruction) asVecLanes(op vecOp, rd, rn operand, arr vecArrangement) {
+func (i *instruction) asVecLanes(op vecOp, rd regalloc.VReg, rn operand, arr vecArrangement) {
i.kind = vecLanes
i.u1 = uint64(op)
i.rn, i.rd = rn, rd
i.u2 = uint64(arr)
}
-func (i *instruction) asVecShiftImm(op vecOp, rd, rn, rm operand, arr vecArrangement) *instruction {
+func (i *instruction) asVecShiftImm(op vecOp, rd regalloc.VReg, rn, rm operand, arr vecArrangement) *instruction {
i.kind = vecShiftImm
i.u1 = uint64(op)
i.rn, i.rm, i.rd = rn, rm, rd
@@ -981,7 +997,7 @@ func (i *instruction) asVecShiftImm(op vecOp, rd, rn, rm operand, arr vecArrange
return i
}
-func (i *instruction) asVecTbl(nregs byte, rd, rn, rm operand, arr vecArrangement) {
+func (i *instruction) asVecTbl(nregs byte, rd regalloc.VReg, rn, rm operand, arr vecArrangement) {
switch nregs {
case 0, 1:
i.kind = vecTbl
@@ -1000,14 +1016,14 @@ func (i *instruction) asVecTbl(nregs byte, rd, rn, rm operand, arr vecArrangemen
i.u2 = uint64(arr)
}
-func (i *instruction) asVecPermute(op vecOp, rd, rn, rm operand, arr vecArrangement) {
+func (i *instruction) asVecPermute(op vecOp, rd regalloc.VReg, rn, rm operand, arr vecArrangement) {
i.kind = vecPermute
i.u1 = uint64(op)
i.rn, i.rm, i.rd = rn, rm, rd
i.u2 = uint64(arr)
}
-func (i *instruction) asVecRRR(op vecOp, rd, rn, rm operand, arr vecArrangement) *instruction {
+func (i *instruction) asVecRRR(op vecOp, rd regalloc.VReg, rn, rm operand, arr vecArrangement) *instruction {
i.kind = vecRRR
i.u1 = uint64(op)
i.rn, i.rd, i.rm = rn, rd, rm
@@ -1017,7 +1033,7 @@ func (i *instruction) asVecRRR(op vecOp, rd, rn, rm operand, arr vecArrangement)
// asVecRRRRewrite encodes a vector instruction that rewrites the destination register.
// IMPORTANT: the destination register must be already defined before this instruction.
-func (i *instruction) asVecRRRRewrite(op vecOp, rd, rn, rm operand, arr vecArrangement) {
+func (i *instruction) asVecRRRRewrite(op vecOp, rd regalloc.VReg, rn, rm operand, arr vecArrangement) {
i.kind = vecRRRRewrite
i.u1 = uint64(op)
i.rn, i.rd, i.rm = rn, rd, rm
@@ -1033,8 +1049,8 @@ func (i *instruction) IsCopy() bool {
// String implements fmt.Stringer.
func (i *instruction) String() (str string) {
- is64SizeBitToSize := func(u3 uint64) byte {
- if u3 == 0 {
+ is64SizeBitToSize := func(v uint64) byte {
+ if v == 0 {
return 32
}
return 64
@@ -1049,46 +1065,46 @@ func (i *instruction) String() (str string) {
str = "nop0"
}
case aluRRR:
- size := is64SizeBitToSize(i.u3)
+ size := is64SizeBitToSize(i.u2 >> 32)
str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(),
- formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size),
+ formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size),
i.rm.format(size))
case aluRRRR:
- size := is64SizeBitToSize(i.u3)
+ size := is64SizeBitToSize(i.u1 >> 32)
str = fmt.Sprintf("%s %s, %s, %s, %s", aluOp(i.u1).String(),
- formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size), formatVRegSized(i.ra.nr(), size))
+ formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size), formatVRegSized(regalloc.VReg(i.u2), size))
case aluRRImm12:
- size := is64SizeBitToSize(i.u3)
+ size := is64SizeBitToSize(i.u2 >> 32)
str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(),
- formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), i.rm.format(size))
+ formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), i.rm.format(size))
case aluRRBitmaskImm:
- size := is64SizeBitToSize(i.u3)
- rd, rn := formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size)
+ size := is64SizeBitToSize(i.u1 >> 32)
+ rd, rn := formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size)
if size == 32 {
str = fmt.Sprintf("%s %s, %s, #%#x", aluOp(i.u1).String(), rd, rn, uint32(i.u2))
} else {
str = fmt.Sprintf("%s %s, %s, #%#x", aluOp(i.u1).String(), rd, rn, i.u2)
}
case aluRRImmShift:
- size := is64SizeBitToSize(i.u3)
+ size := is64SizeBitToSize(i.u2 >> 32)
str = fmt.Sprintf("%s %s, %s, %#x",
aluOp(i.u1).String(),
- formatVRegSized(i.rd.nr(), size),
+ formatVRegSized(i.rd, size),
formatVRegSized(i.rn.nr(), size),
i.rm.shiftImm(),
)
case aluRRRShift:
- size := is64SizeBitToSize(i.u3)
+ size := is64SizeBitToSize(i.u2 >> 32)
str = fmt.Sprintf("%s %s, %s, %s",
aluOp(i.u1).String(),
- formatVRegSized(i.rd.nr(), size),
+ formatVRegSized(i.rd, size),
formatVRegSized(i.rn.nr(), size),
i.rm.format(size),
)
case aluRRRExtend:
- size := is64SizeBitToSize(i.u3)
+ size := is64SizeBitToSize(i.u2 >> 32)
str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(),
- formatVRegSized(i.rd.nr(), size),
+ formatVRegSized(i.rd, size),
formatVRegSized(i.rn.nr(), size),
// Regardless of the source size, the register is formatted in 32-bit.
i.rm.format(32),
@@ -1097,57 +1113,57 @@ func (i *instruction) String() (str string) {
size := is64SizeBitToSize(i.u2)
str = fmt.Sprintf("%s %s, %s",
bitOp(i.u1),
- formatVRegSized(i.rd.nr(), size),
+ formatVRegSized(i.rd, size),
formatVRegSized(i.rn.nr(), size),
)
case uLoad8:
- str = fmt.Sprintf("ldrb %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
+ str = fmt.Sprintf("ldrb %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32))
case sLoad8:
- str = fmt.Sprintf("ldrsb %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
+ str = fmt.Sprintf("ldrsb %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32))
case uLoad16:
- str = fmt.Sprintf("ldrh %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
+ str = fmt.Sprintf("ldrh %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32))
case sLoad16:
- str = fmt.Sprintf("ldrsh %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
+ str = fmt.Sprintf("ldrsh %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32))
case uLoad32:
- str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
+ str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32))
case sLoad32:
- str = fmt.Sprintf("ldrs %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
+ str = fmt.Sprintf("ldrs %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32))
case uLoad64:
- str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 64), i.amode.format(64))
+ str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd, 64), i.getAmode().format(64))
case store8:
- str = fmt.Sprintf("strb %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(8))
+ str = fmt.Sprintf("strb %s, %s", formatVRegSized(i.rn.nr(), 32), i.getAmode().format(8))
case store16:
- str = fmt.Sprintf("strh %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(16))
+ str = fmt.Sprintf("strh %s, %s", formatVRegSized(i.rn.nr(), 32), i.getAmode().format(16))
case store32:
- str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(32))
+ str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.getAmode().format(32))
case store64:
- str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.amode.format(64))
+ str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.getAmode().format(64))
case storeP64:
str = fmt.Sprintf("stp %s, %s, %s",
- formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.amode.format(64))
+ formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.getAmode().format(64))
case loadP64:
str = fmt.Sprintf("ldp %s, %s, %s",
- formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.amode.format(64))
+ formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.getAmode().format(64))
case mov64:
str = fmt.Sprintf("mov %s, %s",
- formatVRegSized(i.rd.nr(), 64),
+ formatVRegSized(i.rd, 64),
formatVRegSized(i.rn.nr(), 64))
case mov32:
- str = fmt.Sprintf("mov %s, %s", formatVRegSized(i.rd.nr(), 32), formatVRegSized(i.rn.nr(), 32))
+ str = fmt.Sprintf("mov %s, %s", formatVRegSized(i.rd, 32), formatVRegSized(i.rn.nr(), 32))
case movZ:
- size := is64SizeBitToSize(i.u3)
- str = fmt.Sprintf("movz %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16)
+ size := is64SizeBitToSize(i.u2 >> 32)
+ str = fmt.Sprintf("movz %s, #%#x, lsl %d", formatVRegSized(i.rd, size), uint16(i.u1), uint32(i.u2)*16)
case movN:
- size := is64SizeBitToSize(i.u3)
- str = fmt.Sprintf("movn %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16)
+ size := is64SizeBitToSize(i.u2 >> 32)
+ str = fmt.Sprintf("movn %s, #%#x, lsl %d", formatVRegSized(i.rd, size), uint16(i.u1), uint32(i.u2)*16)
case movK:
- size := is64SizeBitToSize(i.u3)
- str = fmt.Sprintf("movk %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16)
+ size := is64SizeBitToSize(i.u2 >> 32)
+ str = fmt.Sprintf("movk %s, #%#x, lsl %d", formatVRegSized(i.rd, size), uint16(i.u1), uint32(i.u2)*16)
case extend:
fromBits, toBits := byte(i.u1), byte(i.u2)
var signedStr string
- if i.u3 == 1 {
+ if i.u2>>32 == 1 {
signedStr = "s"
} else {
signedStr = "u"
@@ -1161,39 +1177,39 @@ func (i *instruction) String() (str string) {
case 32:
fromStr = "w"
}
- str = fmt.Sprintf("%sxt%s %s, %s", signedStr, fromStr, formatVRegSized(i.rd.nr(), toBits), formatVRegSized(i.rn.nr(), 32))
+ str = fmt.Sprintf("%sxt%s %s, %s", signedStr, fromStr, formatVRegSized(i.rd, toBits), formatVRegSized(i.rn.nr(), 32))
case cSel:
- size := is64SizeBitToSize(i.u3)
+ size := is64SizeBitToSize(i.u2)
str = fmt.Sprintf("csel %s, %s, %s, %s",
- formatVRegSized(i.rd.nr(), size),
+ formatVRegSized(i.rd, size),
formatVRegSized(i.rn.nr(), size),
formatVRegSized(i.rm.nr(), size),
condFlag(i.u1),
)
case cSet:
if i.u2 != 0 {
- str = fmt.Sprintf("csetm %s, %s", formatVRegSized(i.rd.nr(), 64), condFlag(i.u1))
+ str = fmt.Sprintf("csetm %s, %s", formatVRegSized(i.rd, 64), condFlag(i.u1))
} else {
- str = fmt.Sprintf("cset %s, %s", formatVRegSized(i.rd.nr(), 64), condFlag(i.u1))
+ str = fmt.Sprintf("cset %s, %s", formatVRegSized(i.rd, 64), condFlag(i.u1))
}
case cCmpImm:
- size := is64SizeBitToSize(i.u3)
+ size := is64SizeBitToSize(i.u2 >> 32)
str = fmt.Sprintf("ccmp %s, #%#x, #%#x, %s",
formatVRegSized(i.rn.nr(), size), i.rm.data,
i.u2&0b1111,
condFlag(i.u1))
case fpuMov64:
str = fmt.Sprintf("mov %s, %s",
- formatVRegVec(i.rd.nr(), vecArrangement8B, vecIndexNone),
+ formatVRegVec(i.rd, vecArrangement8B, vecIndexNone),
formatVRegVec(i.rn.nr(), vecArrangement8B, vecIndexNone))
case fpuMov128:
str = fmt.Sprintf("mov %s, %s",
- formatVRegVec(i.rd.nr(), vecArrangement16B, vecIndexNone),
+ formatVRegVec(i.rd, vecArrangement16B, vecIndexNone),
formatVRegVec(i.rn.nr(), vecArrangement16B, vecIndexNone))
case fpuMovFromVec:
panic("TODO")
case fpuRR:
- dstSz := is64SizeBitToSize(i.u3)
+ dstSz := is64SizeBitToSize(i.u2)
srcSz := dstSz
op := fpuUniOp(i.u1)
switch op {
@@ -1203,38 +1219,38 @@ func (i *instruction) String() (str string) {
srcSz = 64
}
str = fmt.Sprintf("%s %s, %s", op.String(),
- formatVRegSized(i.rd.nr(), dstSz), formatVRegSized(i.rn.nr(), srcSz))
+ formatVRegSized(i.rd, dstSz), formatVRegSized(i.rn.nr(), srcSz))
case fpuRRR:
- size := is64SizeBitToSize(i.u3)
+ size := is64SizeBitToSize(i.u2)
str = fmt.Sprintf("%s %s, %s, %s", fpuBinOp(i.u1).String(),
- formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size))
+ formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size))
case fpuRRI:
panic("TODO")
case fpuRRRR:
panic("TODO")
case fpuCmp:
- size := is64SizeBitToSize(i.u3)
+ size := is64SizeBitToSize(i.u1)
str = fmt.Sprintf("fcmp %s, %s",
formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size))
case fpuLoad32:
- str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
+ str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32))
case fpuStore32:
- str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(64))
+ str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.getAmode().format(64))
case fpuLoad64:
- str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 64), i.amode.format(64))
+ str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd, 64), i.getAmode().format(64))
case fpuStore64:
- str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.amode.format(64))
+ str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.getAmode().format(64))
case fpuLoad128:
- str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 128), i.amode.format(64))
+ str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd, 128), i.getAmode().format(64))
case fpuStore128:
- str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 128), i.amode.format(64))
+ str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 128), i.getAmode().format(64))
case loadFpuConst32:
- str = fmt.Sprintf("ldr %s, #8; b 8; data.f32 %f", formatVRegSized(i.rd.nr(), 32), math.Float32frombits(uint32(i.u1)))
+ str = fmt.Sprintf("ldr %s, #8; b 8; data.f32 %f", formatVRegSized(i.rd, 32), math.Float32frombits(uint32(i.u1)))
case loadFpuConst64:
- str = fmt.Sprintf("ldr %s, #8; b 16; data.f64 %f", formatVRegSized(i.rd.nr(), 64), math.Float64frombits(i.u1))
+ str = fmt.Sprintf("ldr %s, #8; b 16; data.f64 %f", formatVRegSized(i.rd, 64), math.Float64frombits(i.u1))
case loadFpuConst128:
str = fmt.Sprintf("ldr %s, #8; b 32; data.v128 %016x %016x",
- formatVRegSized(i.rd.nr(), 128), i.u1, i.u2)
+ formatVRegSized(i.rd, 128), i.u1, i.u2)
case fpuToInt:
var op, src, dst string
if signed := i.u1 == 1; signed {
@@ -1242,15 +1258,15 @@ func (i *instruction) String() (str string) {
} else {
op = "fcvtzu"
}
- if src64 := i.u2 == 1; src64 {
+ if src64 := i.u2&1 != 0; src64 {
src = formatVRegWidthVec(i.rn.nr(), vecArrangementD)
} else {
src = formatVRegWidthVec(i.rn.nr(), vecArrangementS)
}
- if dst64 := i.u3 == 1; dst64 {
- dst = formatVRegSized(i.rd.nr(), 64)
+ if dst64 := i.u2&2 != 0; dst64 {
+ dst = formatVRegSized(i.rd, 64)
} else {
- dst = formatVRegSized(i.rd.nr(), 32)
+ dst = formatVRegSized(i.rd, 32)
}
str = fmt.Sprintf("%s %s, %s", op, dst, src)
@@ -1261,21 +1277,21 @@ func (i *instruction) String() (str string) {
} else {
op = "ucvtf"
}
- if src64 := i.u2 == 1; src64 {
+ if src64 := i.u2&1 != 0; src64 {
src = formatVRegSized(i.rn.nr(), 64)
} else {
src = formatVRegSized(i.rn.nr(), 32)
}
- if dst64 := i.u3 == 1; dst64 {
- dst = formatVRegWidthVec(i.rd.nr(), vecArrangementD)
+ if dst64 := i.u2&2 != 0; dst64 {
+ dst = formatVRegWidthVec(i.rd, vecArrangementD)
} else {
- dst = formatVRegWidthVec(i.rd.nr(), vecArrangementS)
+ dst = formatVRegWidthVec(i.rd, vecArrangementS)
}
str = fmt.Sprintf("%s %s, %s", op, dst, src)
case fpuCSel:
- size := is64SizeBitToSize(i.u3)
+ size := is64SizeBitToSize(i.u2)
str = fmt.Sprintf("fcsel %s, %s, %s, %s",
- formatVRegSized(i.rd.nr(), size),
+ formatVRegSized(i.rd, size),
formatVRegSized(i.rn.nr(), size),
formatVRegSized(i.rm.nr(), size),
condFlag(i.u1),
@@ -1291,7 +1307,7 @@ func (i *instruction) String() (str string) {
default:
panic("unsupported arrangement " + arr.String())
}
- str = fmt.Sprintf("ins %s, %s", formatVRegVec(i.rd.nr(), arr, vecIndex(i.u2)), formatVRegSized(i.rn.nr(), size))
+ str = fmt.Sprintf("ins %s, %s", formatVRegVec(i.rd, arr, vecIndex(i.u2)), formatVRegSized(i.rn.nr(), size))
case movFromVec, movFromVecSigned:
var size byte
var opcode string
@@ -1315,23 +1331,23 @@ func (i *instruction) String() (str string) {
default:
panic("unsupported arrangement " + arr.String())
}
- str = fmt.Sprintf("%s %s, %s", opcode, formatVRegSized(i.rd.nr(), size), formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2)))
+ str = fmt.Sprintf("%s %s, %s", opcode, formatVRegSized(i.rd, size), formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2)))
case vecDup:
str = fmt.Sprintf("dup %s, %s",
- formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone),
+ formatVRegVec(i.rd, vecArrangement(i.u1), vecIndexNone),
formatVRegSized(i.rn.nr(), 64),
)
case vecDupElement:
arr := vecArrangement(i.u1)
str = fmt.Sprintf("dup %s, %s",
- formatVRegVec(i.rd.nr(), arr, vecIndexNone),
+ formatVRegVec(i.rd, arr, vecIndexNone),
formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2)),
)
case vecDupFromFpu:
panic("TODO")
case vecExtract:
str = fmt.Sprintf("ext %s, %s, %s, #%d",
- formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone),
+ formatVRegVec(i.rd, vecArrangement(i.u1), vecIndexNone),
formatVRegVec(i.rn.nr(), vecArrangement(i.u1), vecIndexNone),
formatVRegVec(i.rm.nr(), vecArrangement(i.u1), vecIndexNone),
uint32(i.u2),
@@ -1340,15 +1356,15 @@ func (i *instruction) String() (str string) {
panic("TODO")
case vecMovElement:
str = fmt.Sprintf("mov %s, %s",
- formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndex(i.u2)),
- formatVRegVec(i.rn.nr(), vecArrangement(i.u1), vecIndex(i.u3)),
+ formatVRegVec(i.rd, vecArrangement(i.u1), vecIndex(i.u2&0xffffffff)),
+ formatVRegVec(i.rn.nr(), vecArrangement(i.u1), vecIndex(i.u2>>32)),
)
case vecMiscNarrow:
panic("TODO")
case vecRRR, vecRRRRewrite:
str = fmt.Sprintf("%s %s, %s, %s",
vecOp(i.u1),
- formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone),
+ formatVRegVec(i.rd, vecArrangement(i.u2), vecIndexNone),
formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone),
formatVRegVec(i.rm.nr(), vecArrangement(i.u2), vecIndexNone),
)
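The hunks above drop the old u3 field and pack two 32-bit values into the single 64-bit u2 instead: vecMovElement keeps both lane indexes there, and condBr (further down) keeps the target label in the low half and the size bit in the high half. A minimal sketch of that packing scheme; pack32x2/unpack32x2 are illustrative names, not wazero functions:

package main

import "fmt"

// pack32x2 stores lo in the low 32 bits and hi in the high 32 bits of one uint64,
// mirroring how this refactor folds e.g. vecMovElement's two lane indexes into i.u2.
func pack32x2(lo, hi uint32) uint64 { return uint64(lo) | uint64(hi)<<32 }

// unpack32x2 is the inverse, matching the i.u2&0xffffffff and i.u2>>32 reads above.
func unpack32x2(u uint64) (lo, hi uint32) { return uint32(u & 0xffffffff), uint32(u >> 32) }

func main() {
	u := pack32x2(3, 1) // e.g. destination index 3, source index 1
	lo, hi := unpack32x2(u)
	fmt.Println(lo, hi) // 3 1
}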
@@ -1356,12 +1372,12 @@ func (i *instruction) String() (str string) {
vop := vecOp(i.u1)
if vop == vecOpCmeq0 {
str = fmt.Sprintf("cmeq %s, %s, #0",
- formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone),
+ formatVRegVec(i.rd, vecArrangement(i.u2), vecIndexNone),
formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone))
} else {
str = fmt.Sprintf("%s %s, %s",
vop,
- formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone),
+ formatVRegVec(i.rd, vecArrangement(i.u2), vecIndexNone),
formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone))
}
case vecLanes:
@@ -1379,24 +1395,24 @@ func (i *instruction) String() (str string) {
}
str = fmt.Sprintf("%s %s, %s",
vecOp(i.u1),
- formatVRegWidthVec(i.rd.nr(), destArr),
+ formatVRegWidthVec(i.rd, destArr),
formatVRegVec(i.rn.nr(), arr, vecIndexNone))
case vecShiftImm:
arr := vecArrangement(i.u2)
str = fmt.Sprintf("%s %s, %s, #%d",
vecOp(i.u1),
- formatVRegVec(i.rd.nr(), arr, vecIndexNone),
+ formatVRegVec(i.rd, arr, vecIndexNone),
formatVRegVec(i.rn.nr(), arr, vecIndexNone),
i.rm.shiftImm())
case vecTbl:
arr := vecArrangement(i.u2)
str = fmt.Sprintf("tbl %s, { %s }, %s",
- formatVRegVec(i.rd.nr(), arr, vecIndexNone),
+ formatVRegVec(i.rd, arr, vecIndexNone),
formatVRegVec(i.rn.nr(), vecArrangement16B, vecIndexNone),
formatVRegVec(i.rm.nr(), arr, vecIndexNone))
case vecTbl2:
arr := vecArrangement(i.u2)
- rd, rn, rm := i.rd.nr(), i.rn.nr(), i.rm.nr()
+ rd, rn, rm := i.rd, i.rn.nr(), i.rm.nr()
rn1 := regalloc.FromRealReg(rn.RealReg()+1, rn.RegType())
str = fmt.Sprintf("tbl %s, { %s, %s }, %s",
formatVRegVec(rd, arr, vecIndexNone),
@@ -1407,13 +1423,13 @@ func (i *instruction) String() (str string) {
arr := vecArrangement(i.u2)
str = fmt.Sprintf("%s %s, %s, %s",
vecOp(i.u1),
- formatVRegVec(i.rd.nr(), arr, vecIndexNone),
+ formatVRegVec(i.rd, arr, vecIndexNone),
formatVRegVec(i.rn.nr(), arr, vecIndexNone),
formatVRegVec(i.rm.nr(), arr, vecIndexNone))
case movToFPSR:
str = fmt.Sprintf("msr fpsr, %s", formatVRegSized(i.rn.nr(), 64))
case movFromFPSR:
- str = fmt.Sprintf("mrs %s fpsr", formatVRegSized(i.rd.nr(), 64))
+ str = fmt.Sprintf("mrs %s fpsr", formatVRegSized(i.rd, 64))
case call:
str = fmt.Sprintf("bl %s", ssa.FuncRef(i.u1))
case callInd:
@@ -1422,15 +1438,15 @@ func (i *instruction) String() (str string) {
str = "ret"
case br:
target := label(i.u1)
- if i.u3 != 0 {
+ if i.rm.data != 0 {
str = fmt.Sprintf("b #%#x (%s)", i.brOffset(), target.String())
} else {
str = fmt.Sprintf("b %s", target.String())
}
case condBr:
- size := is64SizeBitToSize(i.u3)
+ size := is64SizeBitToSize(i.u2 >> 32)
c := cond(i.u1)
- target := label(i.u2)
+ target := label(i.u2 & 0xffffffff)
switch c.kind() {
case condKindRegisterZero:
if !i.condBrOffsetResolved() {
@@ -1456,7 +1472,7 @@ func (i *instruction) String() (str string) {
}
}
case adr:
- str = fmt.Sprintf("adr %s, #%#x", formatVRegSized(i.rd.nr(), 64), int64(i.u1))
+ str = fmt.Sprintf("adr %s, #%#x", formatVRegSized(i.rd, 64), int64(i.u1))
case brTableSequence:
targetIndex := i.u1
str = fmt.Sprintf("br_table_sequence %s, table_index=%d", formatVRegSized(i.rn.nr(), 64), targetIndex)
@@ -1473,7 +1489,7 @@ func (i *instruction) String() (str string) {
case 1:
m = m + "b"
}
- str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), 64))
+ str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), 64))
case atomicCas:
m := "casal"
size := byte(32)
@@ -1485,7 +1501,7 @@ func (i *instruction) String() (str string) {
case 1:
m = m + "b"
}
- str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rn.nr(), 64))
+ str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rd, size), formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rn.nr(), 64))
case atomicLoad:
m := "ldar"
size := byte(32)
@@ -1497,7 +1513,7 @@ func (i *instruction) String() (str string) {
case 1:
m = m + "b"
}
- str = fmt.Sprintf("%s %s, %s", m, formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), 64))
+ str = fmt.Sprintf("%s %s, %s", m, formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), 64))
case atomicStore:
m := "stlr"
size := byte(32)
@@ -1517,9 +1533,9 @@ func (i *instruction) String() (str string) {
case emitSourceOffsetInfo:
str = fmt.Sprintf("source_offset_info %d", ssa.SourceOffset(i.u1))
case vecLoad1R:
- str = fmt.Sprintf("ld1r {%s}, [%s]", formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone), formatVRegSized(i.rn.nr(), 64))
+ str = fmt.Sprintf("ld1r {%s}, [%s]", formatVRegVec(i.rd, vecArrangement(i.u1), vecIndexNone), formatVRegSized(i.rn.nr(), 64))
case loadConstBlockArg:
- str = fmt.Sprintf("load_const_block_arg %s, %#x", formatVRegSized(i.rd.nr(), 64), i.u1)
+ str = fmt.Sprintf("load_const_block_arg %s, %#x", formatVRegSized(i.rd, 64), i.u1)
default:
panic(i.kind)
}
@@ -1528,26 +1544,26 @@ func (i *instruction) String() (str string) {
func (i *instruction) asAdr(rd regalloc.VReg, offset int64) {
i.kind = adr
- i.rd = operandNR(rd)
+ i.rd = rd
i.u1 = uint64(offset)
}
-func (i *instruction) asAtomicRmw(op atomicRmwOp, rn, rs, rt operand, size uint64) {
+func (i *instruction) asAtomicRmw(op atomicRmwOp, rn, rs, rt regalloc.VReg, size uint64) {
i.kind = atomicRmw
- i.rd, i.rn, i.rm = rt, rn, rs
+ i.rd, i.rn, i.rm = rt, operandNR(rn), operandNR(rs)
i.u1 = uint64(op)
i.u2 = size
}
-func (i *instruction) asAtomicCas(rn, rs, rt operand, size uint64) {
+func (i *instruction) asAtomicCas(rn, rs, rt regalloc.VReg, size uint64) {
i.kind = atomicCas
- i.rm, i.rn, i.rd = rt, rn, rs
+ i.rm, i.rn, i.rd = operandNR(rt), operandNR(rn), rs
i.u2 = size
}
-func (i *instruction) asAtomicLoad(rn, rt operand, size uint64) {
+func (i *instruction) asAtomicLoad(rn, rt regalloc.VReg, size uint64) {
i.kind = atomicLoad
- i.rn, i.rd = rn, rt
+ i.rn, i.rd = operandNR(rn), rt
i.u2 = size
}
@@ -1755,12 +1771,12 @@ func (i *instruction) asLoadConstBlockArg(v uint64, typ ssa.Type, dst regalloc.V
i.kind = loadConstBlockArg
i.u1 = v
i.u2 = uint64(typ)
- i.rd = operandNR(dst)
+ i.rd = dst
return i
}
func (i *instruction) loadConstBlockArgData() (v uint64, typ ssa.Type, dst regalloc.VReg) {
- return i.u1, ssa.Type(i.u2), i.rd.nr()
+ return i.u1, ssa.Type(i.u2), i.rd
}
func (i *instruction) asEmitSourceOffsetInfo(l ssa.SourceOffset) *instruction {
@@ -1778,7 +1794,7 @@ func (i *instruction) asUDF() *instruction {
return i
}
-func (i *instruction) asFpuToInt(rd, rn operand, rdSigned, src64bit, dst64bit bool) {
+func (i *instruction) asFpuToInt(rd regalloc.VReg, rn operand, rdSigned, src64bit, dst64bit bool) {
i.kind = fpuToInt
i.rn = rn
i.rd = rd
@@ -1789,11 +1805,11 @@ func (i *instruction) asFpuToInt(rd, rn operand, rdSigned, src64bit, dst64bit bo
i.u2 = 1
}
if dst64bit {
- i.u3 = 1
+ i.u2 |= 2
}
}
-func (i *instruction) asIntToFpu(rd, rn operand, rnSigned, src64bit, dst64bit bool) {
+func (i *instruction) asIntToFpu(rd regalloc.VReg, rn operand, rnSigned, src64bit, dst64bit bool) {
i.kind = intToFpu
i.rn = rn
i.rd = rd
@@ -1804,7 +1820,7 @@ func (i *instruction) asIntToFpu(rd, rn operand, rnSigned, src64bit, dst64bit bo
i.u2 = 1
}
if dst64bit {
- i.u3 = 1
+ i.u2 |= 2
}
}
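asFpuToInt and asIntToFpu now keep both size flags in u2: bit 0 means the source is 64-bit and bit 1 means the destination is 64-bit, which is exactly what the String and encodeCnvBetweenFloatInt hunks read back as u2&1 and u2&2. A standalone sketch of that flag layout; packCvtFlags is an illustrative helper, not part of the package:

package main

import "fmt"

// packCvtFlags mirrors the u2 layout used by asFpuToInt/asIntToFpu in this diff:
// bit 0 = 64-bit source, bit 1 = 64-bit destination.
func packCvtFlags(src64, dst64 bool) (u2 uint64) {
	if src64 {
		u2 = 1
	}
	if dst64 {
		u2 |= 2
	}
	return
}

func main() {
	u2 := packCvtFlags(true, false)
	fmt.Println(u2&1 != 0, u2&2 != 0) // true false, the same test done when encoding
}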
@@ -1817,7 +1833,7 @@ func (i *instruction) asExitSequence(ctx regalloc.VReg) *instruction {
// aluOp determines the type of ALU operation. Instructions whose kind is one of
// aluRRR, aluRRRR, aluRRImm12, aluRRBitmaskImm, aluRRImmShift, aluRRRShift and aluRRRExtend
// would use this type.
-type aluOp int
+type aluOp uint32
func (a aluOp) String() string {
switch a {
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go
index 227a96474..f0ede2d6a 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go
@@ -44,12 +44,12 @@ func (i *instruction) encode(m *machine) {
case callInd:
c.Emit4Bytes(encodeUnconditionalBranchReg(regNumberInEncoding[i.rn.realReg()], true))
case store8, store16, store32, store64, fpuStore32, fpuStore64, fpuStore128:
- c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rn.realReg()], i.amode))
+ c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rn.realReg()], *i.getAmode()))
case uLoad8, uLoad16, uLoad32, uLoad64, sLoad8, sLoad16, sLoad32, fpuLoad32, fpuLoad64, fpuLoad128:
- c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rd.realReg()], i.amode))
+ c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rd.RealReg()], *i.getAmode()))
case vecLoad1R:
c.Emit4Bytes(encodeVecLoad1R(
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
vecArrangement(i.u1)))
case condBr:
@@ -75,22 +75,22 @@ func (i *instruction) encode(m *machine) {
panic("BUG")
}
case movN:
- c.Emit4Bytes(encodeMoveWideImmediate(0b00, regNumberInEncoding[i.rd.realReg()], i.u1, i.u2, i.u3))
+ c.Emit4Bytes(encodeMoveWideImmediate(0b00, regNumberInEncoding[i.rd.RealReg()], i.u1, uint32(i.u2), uint32(i.u2>>32)))
case movZ:
- c.Emit4Bytes(encodeMoveWideImmediate(0b10, regNumberInEncoding[i.rd.realReg()], i.u1, i.u2, i.u3))
+ c.Emit4Bytes(encodeMoveWideImmediate(0b10, regNumberInEncoding[i.rd.RealReg()], i.u1, uint32(i.u2), uint32(i.u2>>32)))
case movK:
- c.Emit4Bytes(encodeMoveWideImmediate(0b11, regNumberInEncoding[i.rd.realReg()], i.u1, i.u2, i.u3))
+ c.Emit4Bytes(encodeMoveWideImmediate(0b11, regNumberInEncoding[i.rd.RealReg()], i.u1, uint32(i.u2), uint32(i.u2>>32)))
case mov32:
- to, from := i.rd.realReg(), i.rn.realReg()
+ to, from := i.rd.RealReg(), i.rn.realReg()
c.Emit4Bytes(encodeAsMov32(regNumberInEncoding[from], regNumberInEncoding[to]))
case mov64:
- to, from := i.rd.realReg(), i.rn.realReg()
+ to, from := i.rd.RealReg(), i.rn.realReg()
toIsSp := to == sp
fromIsSp := from == sp
c.Emit4Bytes(encodeMov64(regNumberInEncoding[to], regNumberInEncoding[from], toIsSp, fromIsSp))
case loadP64, storeP64:
rt, rt2 := regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()]
- amode := i.amode
+ amode := i.getAmode()
rn := regNumberInEncoding[amode.rn.RealReg()]
var pre bool
switch amode.kind {
@@ -102,21 +102,21 @@ func (i *instruction) encode(m *machine) {
}
c.Emit4Bytes(encodePreOrPostIndexLoadStorePair64(pre, kind == loadP64, rn, rt, rt2, amode.imm))
case loadFpuConst32:
- rd := regNumberInEncoding[i.rd.realReg()]
+ rd := regNumberInEncoding[i.rd.RealReg()]
if i.u1 == 0 {
c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement8B))
} else {
encodeLoadFpuConst32(c, rd, i.u1)
}
case loadFpuConst64:
- rd := regNumberInEncoding[i.rd.realReg()]
+ rd := regNumberInEncoding[i.rd.RealReg()]
if i.u1 == 0 {
c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement8B))
} else {
- encodeLoadFpuConst64(c, regNumberInEncoding[i.rd.realReg()], i.u1)
+ encodeLoadFpuConst64(c, regNumberInEncoding[i.rd.RealReg()], i.u1)
}
case loadFpuConst128:
- rd := regNumberInEncoding[i.rd.realReg()]
+ rd := regNumberInEncoding[i.rd.RealReg()]
lo, hi := i.u1, i.u2
if lo == 0 && hi == 0 {
c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement16B))
@@ -126,35 +126,35 @@ func (i *instruction) encode(m *machine) {
case aluRRRR:
c.Emit4Bytes(encodeAluRRRR(
aluOp(i.u1),
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
regNumberInEncoding[i.rm.realReg()],
- regNumberInEncoding[i.ra.realReg()],
- uint32(i.u3),
+ regNumberInEncoding[regalloc.VReg(i.u2).RealReg()],
+ uint32(i.u1>>32),
))
case aluRRImmShift:
c.Emit4Bytes(encodeAluRRImm(
aluOp(i.u1),
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
uint32(i.rm.shiftImm()),
- uint32(i.u3),
+ uint32(i.u2>>32),
))
case aluRRR:
rn := i.rn.realReg()
c.Emit4Bytes(encodeAluRRR(
aluOp(i.u1),
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[rn],
regNumberInEncoding[i.rm.realReg()],
- i.u3 == 1,
+ i.u2>>32 == 1,
rn == sp,
))
case aluRRRExtend:
rm, exo, to := i.rm.er()
c.Emit4Bytes(encodeAluRRRExtend(
aluOp(i.u1),
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
regNumberInEncoding[rm.RealReg()],
exo,
@@ -164,25 +164,25 @@ func (i *instruction) encode(m *machine) {
r, amt, sop := i.rm.sr()
c.Emit4Bytes(encodeAluRRRShift(
aluOp(i.u1),
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
regNumberInEncoding[r.RealReg()],
uint32(amt),
sop,
- i.u3 == 1,
+ i.u2>>32 == 1,
))
case aluRRBitmaskImm:
c.Emit4Bytes(encodeAluBitmaskImmediate(
aluOp(i.u1),
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
i.u2,
- i.u3 == 1,
+ i.u1>>32 == 1,
))
case bitRR:
c.Emit4Bytes(encodeBitRR(
bitOp(i.u1),
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
uint32(i.u2)),
)
@@ -190,22 +190,22 @@ func (i *instruction) encode(m *machine) {
imm12, shift := i.rm.imm12()
c.Emit4Bytes(encodeAluRRImm12(
aluOp(i.u1),
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
imm12, shift,
- i.u3 == 1,
+ i.u2>>32 == 1,
))
case fpuRRR:
c.Emit4Bytes(encodeFpuRRR(
fpuBinOp(i.u1),
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
regNumberInEncoding[i.rm.realReg()],
- i.u3 == 1,
+ i.u2 == 1,
))
case fpuMov64, fpuMov128:
// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/MOV--vector---Move-vector--an-alias-of-ORR--vector--register--
- rd := regNumberInEncoding[i.rd.realReg()]
+ rd := regNumberInEncoding[i.rd.RealReg()]
rn := regNumberInEncoding[i.rn.realReg()]
var q uint32
if kind == fpuMov128 {
@@ -213,7 +213,7 @@ func (i *instruction) encode(m *machine) {
}
c.Emit4Bytes(q<<30 | 0b1110101<<21 | rn<<16 | 0b000111<<10 | rn<<5 | rd)
case cSet:
- rd := regNumberInEncoding[i.rd.realReg()]
+ rd := regNumberInEncoding[i.rd.RealReg()]
cf := condFlag(i.u1)
if i.u2 == 1 {
// https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/CSETM--Conditional-Set-Mask--an-alias-of-CSINV-
@@ -225,12 +225,12 @@ func (i *instruction) encode(m *machine) {
c.Emit4Bytes(0b1001101010011111<<16 | uint32(cf.invert())<<12 | 0b111111<<5 | rd)
}
case extend:
- c.Emit4Bytes(encodeExtend(i.u3 == 1, byte(i.u1), byte(i.u2), regNumberInEncoding[i.rd.realReg()], regNumberInEncoding[i.rn.realReg()]))
+ c.Emit4Bytes(encodeExtend((i.u2>>32) == 1, byte(i.u1), byte(i.u2), regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()]))
case fpuCmp:
// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/FCMP--Floating-point-quiet-Compare--scalar--?lang=en
rn, rm := regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()]
var ftype uint32
- if i.u3 == 1 {
+ if i.u1 == 1 {
ftype = 0b01 // double precision.
}
c.Emit4Bytes(0b1111<<25 | ftype<<22 | 1<<21 | rm<<16 | 0b1<<13 | rn<<5)
@@ -242,34 +242,34 @@ func (i *instruction) encode(m *machine) {
c.Emit4Bytes(0)
}
case adr:
- c.Emit4Bytes(encodeAdr(regNumberInEncoding[i.rd.realReg()], uint32(i.u1)))
+ c.Emit4Bytes(encodeAdr(regNumberInEncoding[i.rd.RealReg()], uint32(i.u1)))
case cSel:
c.Emit4Bytes(encodeConditionalSelect(
kind,
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
regNumberInEncoding[i.rm.realReg()],
condFlag(i.u1),
- i.u3 == 1,
+ i.u2 == 1,
))
case fpuCSel:
c.Emit4Bytes(encodeFpuCSel(
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
regNumberInEncoding[i.rm.realReg()],
condFlag(i.u1),
- i.u3 == 1,
+ i.u2 == 1,
))
case movToVec:
c.Emit4Bytes(encodeMoveToVec(
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
vecArrangement(byte(i.u1)),
vecIndex(i.u2),
))
case movFromVec, movFromVecSigned:
c.Emit4Bytes(encodeMoveFromVec(
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
vecArrangement(byte(i.u1)),
vecIndex(i.u2),
@@ -277,18 +277,18 @@ func (i *instruction) encode(m *machine) {
))
case vecDup:
c.Emit4Bytes(encodeVecDup(
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
vecArrangement(byte(i.u1))))
case vecDupElement:
c.Emit4Bytes(encodeVecDupElement(
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
vecArrangement(byte(i.u1)),
vecIndex(i.u2)))
case vecExtract:
c.Emit4Bytes(encodeVecExtract(
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
regNumberInEncoding[i.rm.realReg()],
vecArrangement(byte(i.u1)),
@@ -296,35 +296,35 @@ func (i *instruction) encode(m *machine) {
case vecPermute:
c.Emit4Bytes(encodeVecPermute(
vecOp(i.u1),
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
regNumberInEncoding[i.rm.realReg()],
vecArrangement(byte(i.u2))))
case vecMovElement:
c.Emit4Bytes(encodeVecMovElement(
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
vecArrangement(i.u1),
- uint32(i.u2), uint32(i.u3),
+ uint32(i.u2), uint32(i.u2>>32),
))
case vecMisc:
c.Emit4Bytes(encodeAdvancedSIMDTwoMisc(
vecOp(i.u1),
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
vecArrangement(i.u2),
))
case vecLanes:
c.Emit4Bytes(encodeVecLanes(
vecOp(i.u1),
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
vecArrangement(i.u2),
))
case vecShiftImm:
c.Emit4Bytes(encodeVecShiftImm(
vecOp(i.u1),
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
uint32(i.rm.shiftImm()),
vecArrangement(i.u2),
@@ -332,7 +332,7 @@ func (i *instruction) encode(m *machine) {
case vecTbl:
c.Emit4Bytes(encodeVecTbl(
1,
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
regNumberInEncoding[i.rm.realReg()],
vecArrangement(i.u2)),
@@ -340,7 +340,7 @@ func (i *instruction) encode(m *machine) {
case vecTbl2:
c.Emit4Bytes(encodeVecTbl(
2,
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
regNumberInEncoding[i.rm.realReg()],
vecArrangement(i.u2)),
@@ -353,9 +353,9 @@ func (i *instruction) encode(m *machine) {
case fpuRR:
c.Emit4Bytes(encodeFloatDataOneSource(
fpuUniOp(i.u1),
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
- i.u3 == 1,
+ i.u2 == 1,
))
case vecRRR:
if op := vecOp(i.u1); op == vecOpBsl || op == vecOpBit || op == vecOpUmlal {
@@ -365,14 +365,14 @@ func (i *instruction) encode(m *machine) {
case vecRRRRewrite:
c.Emit4Bytes(encodeVecRRR(
vecOp(i.u1),
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
regNumberInEncoding[i.rm.realReg()],
vecArrangement(i.u2),
))
case cCmpImm:
// Conditional compare (immediate) in https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en
- sf := uint32(i.u3 & 0b1)
+ sf := uint32((i.u2 >> 32) & 0b1)
nzcv := uint32(i.u2 & 0b1111)
cond := uint32(condFlag(i.u1))
imm := uint32(i.rm.data & 0b11111)
@@ -381,7 +381,7 @@ func (i *instruction) encode(m *machine) {
sf<<31 | 0b111101001<<22 | imm<<16 | cond<<12 | 0b1<<11 | rn<<5 | nzcv,
)
case movFromFPSR:
- rt := regNumberInEncoding[i.rd.realReg()]
+ rt := regNumberInEncoding[i.rd.RealReg()]
c.Emit4Bytes(encodeSystemRegisterMove(rt, true))
case movToFPSR:
rt := regNumberInEncoding[i.rn.realReg()]
@@ -390,13 +390,13 @@ func (i *instruction) encode(m *machine) {
c.Emit4Bytes(encodeAtomicRmw(
atomicRmwOp(i.u1),
regNumberInEncoding[i.rm.realReg()],
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rn.realReg()],
uint32(i.u2),
))
case atomicCas:
c.Emit4Bytes(encodeAtomicCas(
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
regNumberInEncoding[i.rm.realReg()],
regNumberInEncoding[i.rn.realReg()],
uint32(i.u2),
@@ -404,7 +404,7 @@ func (i *instruction) encode(m *machine) {
case atomicLoad:
c.Emit4Bytes(encodeAtomicLoadStore(
regNumberInEncoding[i.rn.realReg()],
- regNumberInEncoding[i.rd.realReg()],
+ regNumberInEncoding[i.rd.RealReg()],
uint32(i.u2),
1,
))
@@ -810,7 +810,7 @@ func encodeFloatDataOneSource(op fpuUniOp, rd, rn uint32, dst64bit bool) uint32
// encodeCnvBetweenFloatInt encodes as "Conversion between floating-point and integer" in
// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
func encodeCnvBetweenFloatInt(i *instruction) uint32 {
- rd := regNumberInEncoding[i.rd.realReg()]
+ rd := regNumberInEncoding[i.rd.RealReg()]
rn := regNumberInEncoding[i.rn.realReg()]
var opcode uint32
@@ -822,8 +822,8 @@ func encodeCnvBetweenFloatInt(i *instruction) uint32 {
rmode = 0b00
signed := i.u1 == 1
- src64bit := i.u2 == 1
- dst64bit := i.u3 == 1
+ src64bit := i.u2&1 != 0
+ dst64bit := i.u2&2 != 0
if signed {
opcode = 0b010
} else {
@@ -841,8 +841,8 @@ func encodeCnvBetweenFloatInt(i *instruction) uint32 {
rmode = 0b11
signed := i.u1 == 1
- src64bit := i.u2 == 1
- dst64bit := i.u3 == 1
+ src64bit := i.u2&1 != 0
+ dst64bit := i.u2&2 != 0
if signed {
opcode = 0b000
@@ -1787,13 +1787,13 @@ func encodeCBZCBNZ(rt uint32, nz bool, imm19 uint32, _64bit bool) (ret uint32) {
// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en
//
// "shift" must have been divided by 16 at this point.
-func encodeMoveWideImmediate(opc uint32, rd uint32, imm, shift, _64bit uint64) (ret uint32) {
+func encodeMoveWideImmediate(opc uint32, rd uint32, imm uint64, shift, _64bit uint32) (ret uint32) {
ret = rd
ret |= uint32(imm&0xffff) << 5
- ret |= (uint32(shift)) << 21
+ ret |= (shift) << 21
ret |= 0b100101 << 23
ret |= opc << 29
- ret |= uint32(_64bit) << 31
+ ret |= _64bit << 31
return
}
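The packing above follows the A64 move-wide immediate layout: Rd at bit 0, imm16 at bit 5, hw at bit 21, the fixed 100101 pattern at bit 23, opc at bit 29 and sf at bit 31. A self-contained sanity check of that bit arithmetic, assuming opc=0b10 (MOVZ), register x0 and a zero shift; encodeMoveWide below is a local reimplementation for illustration only:

package main

import "fmt"

// encodeMoveWide reproduces the bit packing of encodeMoveWideImmediate above,
// used here only to check one encoding by hand.
func encodeMoveWide(opc, rd uint32, imm uint64, shift, is64 uint32) uint32 {
	ret := rd
	ret |= uint32(imm&0xffff) << 5
	ret |= shift << 21
	ret |= 0b100101 << 23
	ret |= opc << 29
	ret |= is64 << 31
	return ret
}

func main() {
	// movz x0, #0x1234 (64-bit, no shift): opc=0b10, rd=0.
	fmt.Printf("%#x\n", encodeMoveWide(0b10, 0, 0x1234, 0, 1)) // 0xd2824680
}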
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go
index 698b382d4..6c6824fb0 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go
@@ -284,18 +284,18 @@ func (m *machine) load64bitConst(c int64, dst regalloc.VReg) {
func (m *machine) insertMOVZ(dst regalloc.VReg, v uint64, shift int, dst64 bool) {
instr := m.allocateInstr()
- instr.asMOVZ(dst, v, uint64(shift), dst64)
+ instr.asMOVZ(dst, v, uint32(shift), dst64)
m.insert(instr)
}
func (m *machine) insertMOVK(dst regalloc.VReg, v uint64, shift int, dst64 bool) {
instr := m.allocateInstr()
- instr.asMOVK(dst, v, uint64(shift), dst64)
+ instr.asMOVK(dst, v, uint32(shift), dst64)
m.insert(instr)
}
func (m *machine) insertMOVN(dst regalloc.VReg, v uint64, shift int, dst64 bool) {
instr := m.allocateInstr()
- instr.asMOVN(dst, v, uint64(shift), dst64)
+ instr.asMOVN(dst, v, uint32(shift), dst64)
m.insert(instr)
}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go
index 2bb234e8c..048bf3204 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go
@@ -52,11 +52,11 @@ func (m *machine) lowerBrTable(i *ssa.Instruction) {
maxIndexReg := m.compiler.AllocateVReg(ssa.TypeI32)
m.lowerConstantI32(maxIndexReg, int32(len(targets)-1))
subs := m.allocateInstr()
- subs.asALU(aluOpSubS, operandNR(xzrVReg), indexOperand, operandNR(maxIndexReg), false)
+ subs.asALU(aluOpSubS, xzrVReg, indexOperand, operandNR(maxIndexReg), false)
m.insert(subs)
csel := m.allocateInstr()
adjustedIndex := m.compiler.AllocateVReg(ssa.TypeI32)
- csel.asCSel(operandNR(adjustedIndex), operandNR(maxIndexReg), indexOperand, hs, false)
+ csel.asCSel(adjustedIndex, operandNR(maxIndexReg), indexOperand, hs, false)
m.insert(csel)
brSequence := m.allocateInstr()
@@ -249,7 +249,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
rc := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone)
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
- rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ rd := m.compiler.VRegOf(instr.Return())
m.lowerSelectVec(rc, rn, rm, rd)
} else {
m.lowerSelect(c, x, y, instr.Return())
@@ -270,7 +270,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
x, ctx := instr.Arg2()
result := instr.Return()
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
- rd := operandNR(m.compiler.VRegOf(result))
+ rd := m.compiler.VRegOf(result)
ctxVReg := m.compiler.VRegOf(ctx)
m.lowerFpuToInt(rd, rn, ctxVReg, true, x.Type() == ssa.TypeF64,
result.Type().Bits() == 64, op == ssa.OpcodeFcvtToSintSat)
@@ -278,7 +278,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
x, ctx := instr.Arg2()
result := instr.Return()
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
- rd := operandNR(m.compiler.VRegOf(result))
+ rd := m.compiler.VRegOf(result)
ctxVReg := m.compiler.VRegOf(ctx)
m.lowerFpuToInt(rd, rn, ctxVReg, false, x.Type() == ssa.TypeF64,
result.Type().Bits() == 64, op == ssa.OpcodeFcvtToUintSat)
@@ -286,25 +286,25 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
x := instr.Arg()
result := instr.Return()
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
- rd := operandNR(m.compiler.VRegOf(result))
+ rd := m.compiler.VRegOf(result)
m.lowerIntToFpu(rd, rn, true, x.Type() == ssa.TypeI64, result.Type().Bits() == 64)
case ssa.OpcodeFcvtFromUint:
x := instr.Arg()
result := instr.Return()
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
- rd := operandNR(m.compiler.VRegOf(result))
+ rd := m.compiler.VRegOf(result)
m.lowerIntToFpu(rd, rn, false, x.Type() == ssa.TypeI64, result.Type().Bits() == 64)
case ssa.OpcodeFdemote:
v := instr.Arg()
rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone)
- rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ rd := m.compiler.VRegOf(instr.Return())
cnt := m.allocateInstr()
cnt.asFpuRR(fpuUniOpCvt64To32, rd, rn, false)
m.insert(cnt)
case ssa.OpcodeFpromote:
v := instr.Arg()
rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone)
- rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ rd := m.compiler.VRegOf(instr.Return())
cnt := m.allocateInstr()
cnt.asFpuRR(fpuUniOpCvt32To64, rd, rn, true)
m.insert(cnt)
@@ -343,15 +343,15 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
ctxVReg := m.compiler.VRegOf(ctx)
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
- rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ rd := m.compiler.VRegOf(instr.Return())
m.lowerIDiv(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSdiv)
case ssa.OpcodeSrem, ssa.OpcodeUrem:
x, y, ctx := instr.Arg3()
ctxVReg := m.compiler.VRegOf(ctx)
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
- rd := operandNR(m.compiler.VRegOf(instr.Return()))
- m.lowerIRem(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSrem)
+ rd := m.compiler.VRegOf(instr.Return())
+ m.lowerIRem(ctxVReg, rd, rn.nr(), rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSrem)
case ssa.OpcodeVconst:
result := m.compiler.VRegOf(instr.Return())
lo, hi := instr.VconstData()
@@ -362,7 +362,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
x := instr.Arg()
ins := m.allocateInstr()
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
- rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ rd := m.compiler.VRegOf(instr.Return())
ins.asVecMisc(vecOpNot, rd, rn, vecArrangement16B)
m.insert(ins)
case ssa.OpcodeVbxor:
@@ -382,12 +382,12 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
creg := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone)
- tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+ tmp := m.compiler.AllocateVReg(ssa.TypeV128)
// creg is overwritten by BSL, so we need to move it to the result register before the instruction
// in case when it is used somewhere else.
mov := m.allocateInstr()
- mov.asFpuMov128(tmp.nr(), creg.nr())
+ mov.asFpuMov128(tmp, creg.nr())
m.insert(mov)
ins := m.allocateInstr()
@@ -396,7 +396,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
mov2 := m.allocateInstr()
rd := m.compiler.VRegOf(instr.Return())
- mov2.asFpuMov128(rd, tmp.nr())
+ mov2.asFpuMov128(rd, tmp)
m.insert(mov2)
case ssa.OpcodeVanyTrue, ssa.OpcodeVallTrue:
x, lane := instr.ArgWithLane()
@@ -405,12 +405,12 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
arr = ssaLaneToArrangement(lane)
}
rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
- rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ rd := m.compiler.VRegOf(instr.Return())
m.lowerVcheckTrue(op, rm, rd, arr)
case ssa.OpcodeVhighBits:
x, lane := instr.ArgWithLane()
rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
- rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ rd := m.compiler.VRegOf(instr.Return())
arr := ssaLaneToArrangement(lane)
m.lowerVhighBits(rm, rd, arr)
case ssa.OpcodeVIadd:
@@ -441,9 +441,9 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
panic("unsupported lane " + lane.String())
}
- widenLo := m.allocateInstr().asVecShiftImm(widen, tmpLo, vv, operandShiftImm(0), loArr)
- widenHi := m.allocateInstr().asVecShiftImm(widen, tmpHi, vv, operandShiftImm(0), hiArr)
- addp := m.allocateInstr().asVecRRR(vecOpAddp, operandNR(m.compiler.VRegOf(instr.Return())), tmpLo, tmpHi, dstArr)
+ widenLo := m.allocateInstr().asVecShiftImm(widen, tmpLo.nr(), vv, operandShiftImm(0), loArr)
+ widenHi := m.allocateInstr().asVecShiftImm(widen, tmpHi.nr(), vv, operandShiftImm(0), hiArr)
+ addp := m.allocateInstr().asVecRRR(vecOpAddp, m.compiler.VRegOf(instr.Return()), tmpLo, tmpHi, dstArr)
m.insert(widenLo)
m.insert(widenHi)
m.insert(addp)
@@ -493,7 +493,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
arr := ssaLaneToArrangement(lane)
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
- rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ rd := m.compiler.VRegOf(instr.Return())
m.lowerVIMul(rd, rn, rm, arr)
case ssa.OpcodeVIabs:
m.lowerVecMisc(vecOpAbs, instr)
@@ -507,7 +507,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
arr := ssaLaneToArrangement(lane)
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
- rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ rd := m.compiler.VRegOf(instr.Return())
m.lowerVShift(op, rd, rn, rm, arr)
case ssa.OpcodeVSqrt:
m.lowerVecMisc(vecOpFsqrt, instr)
@@ -547,18 +547,18 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
x, lane := instr.ArgWithLane()
arr := ssaLaneToArrangement(lane)
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
- rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ rd := m.compiler.VRegOf(instr.Return())
m.lowerVfpuToInt(rd, rn, arr, op == ssa.OpcodeVFcvtToSintSat)
case ssa.OpcodeVFcvtFromSint, ssa.OpcodeVFcvtFromUint:
x, lane := instr.ArgWithLane()
arr := ssaLaneToArrangement(lane)
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
- rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ rd := m.compiler.VRegOf(instr.Return())
m.lowerVfpuFromInt(rd, rn, arr, op == ssa.OpcodeVFcvtFromSint)
case ssa.OpcodeSwidenLow, ssa.OpcodeUwidenLow:
x, lane := instr.ArgWithLane()
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
- rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ rd := m.compiler.VRegOf(instr.Return())
var arr vecArrangement
switch lane {
@@ -580,7 +580,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
case ssa.OpcodeSwidenHigh, ssa.OpcodeUwidenHigh:
x, lane := instr.ArgWithLane()
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
- rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ rd := m.compiler.VRegOf(instr.Return())
arr := ssaLaneToArrangement(lane)
@@ -607,9 +607,9 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
}
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
- rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ rd := m.compiler.VRegOf(instr.Return())
- tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+ tmp := m.compiler.AllocateVReg(ssa.TypeV128)
loQxtn := m.allocateInstr()
hiQxtn := m.allocateInstr()
@@ -628,7 +628,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
m.insert(hiQxtn)
mov := m.allocateInstr()
- mov.asFpuMov128(rd.nr(), tmp.nr())
+ mov.asFpuMov128(rd, tmp)
m.insert(mov)
case ssa.OpcodeFvpromoteLow:
x, lane := instr.ArgWithLane()
@@ -637,7 +637,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
}
ins := m.allocateInstr()
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
- rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ rd := m.compiler.VRegOf(instr.Return())
ins.asVecMisc(vecOpFcvtl, rd, rn, vecArrangement2S)
m.insert(ins)
case ssa.OpcodeFvdemote:
@@ -647,14 +647,14 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
}
ins := m.allocateInstr()
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
- rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ rd := m.compiler.VRegOf(instr.Return())
ins.asVecMisc(vecOpFcvtn, rd, rn, vecArrangement2S)
m.insert(ins)
case ssa.OpcodeExtractlane:
x, index, signed, lane := instr.ExtractlaneData()
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
- rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ rd := m.compiler.VRegOf(instr.Return())
mov := m.allocateInstr()
switch lane {
@@ -680,12 +680,12 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
x, y, index, lane := instr.InsertlaneData()
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
- rd := operandNR(m.compiler.VRegOf(instr.Return()))
- tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+ rd := m.compiler.VRegOf(instr.Return())
+ tmpReg := m.compiler.AllocateVReg(ssa.TypeV128)
// Initially mov rn to tmp.
mov1 := m.allocateInstr()
- mov1.asFpuMov128(tmpReg.nr(), rn.nr())
+ mov1.asFpuMov128(tmpReg, rn.nr())
m.insert(mov1)
// movToVec and vecMovElement do not clear the remaining bits to zero,
@@ -709,14 +709,14 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
// Finally mov tmp to rd.
mov3 := m.allocateInstr()
- mov3.asFpuMov128(rd.nr(), tmpReg.nr())
+ mov3.asFpuMov128(rd, tmpReg)
m.insert(mov3)
case ssa.OpcodeSwizzle:
x, y, lane := instr.Arg2WithLane()
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
- rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ rd := m.compiler.VRegOf(instr.Return())
arr := ssaLaneToArrangement(lane)
@@ -729,14 +729,14 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
x, y, lane1, lane2 := instr.ShuffleData()
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
- rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ rd := m.compiler.VRegOf(instr.Return())
m.lowerShuffle(rd, rn, rm, lane1, lane2)
case ssa.OpcodeSplat:
x, lane := instr.ArgWithLane()
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
- rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ rd := m.compiler.VRegOf(instr.Return())
dup := m.allocateInstr()
switch lane {
@@ -760,12 +760,12 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
xx, yy := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone),
m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
tmp, tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)), operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
- m.insert(m.allocateInstr().asVecRRR(vecOpSmull, tmp, xx, yy, vecArrangement8H))
- m.insert(m.allocateInstr().asVecRRR(vecOpSmull2, tmp2, xx, yy, vecArrangement8H))
- m.insert(m.allocateInstr().asVecRRR(vecOpAddp, tmp, tmp, tmp2, vecArrangement4S))
+ m.insert(m.allocateInstr().asVecRRR(vecOpSmull, tmp.nr(), xx, yy, vecArrangement8H))
+ m.insert(m.allocateInstr().asVecRRR(vecOpSmull2, tmp2.nr(), xx, yy, vecArrangement8H))
+ m.insert(m.allocateInstr().asVecRRR(vecOpAddp, tmp.nr(), tmp, tmp2, vecArrangement4S))
- rd := operandNR(m.compiler.VRegOf(instr.Return()))
- m.insert(m.allocateInstr().asFpuMov128(rd.nr(), tmp.nr()))
+ rd := m.compiler.VRegOf(instr.Return())
+ m.insert(m.allocateInstr().asFpuMov128(rd, tmp.nr()))
case ssa.OpcodeLoadSplat:
ptr, offset, lane := instr.LoadSplatData()
@@ -794,7 +794,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
m.executableContext.FlushPendingInstructions()
}
-func (m *machine) lowerShuffle(rd, rn, rm operand, lane1, lane2 uint64) {
+func (m *machine) lowerShuffle(rd regalloc.VReg, rn, rm operand, lane1, lane2 uint64) {
// `tbl2` requires 2 consecutive registers, so we arbitrarily pick v29, v30.
vReg, wReg := v29VReg, v30VReg
@@ -822,7 +822,7 @@ func (m *machine) lowerShuffle(rd, rn, rm operand, lane1, lane2 uint64) {
m.insert(tbl2)
}
-func (m *machine) lowerVShift(op ssa.Opcode, rd, rn, rm operand, arr vecArrangement) {
+func (m *machine) lowerVShift(op ssa.Opcode, rd regalloc.VReg, rn, rm operand, arr vecArrangement) {
var modulo byte
switch arr {
case vecArrangement16B:
@@ -847,13 +847,13 @@ func (m *machine) lowerVShift(op ssa.Opcode, rd, rn, rm operand, arr vecArrangem
if op != ssa.OpcodeVIshl {
// Negate the amount to make this as right shift.
neg := m.allocateInstr()
- neg.asALU(aluOpSub, rtmp, operandNR(xzrVReg), rtmp, true)
+ neg.asALU(aluOpSub, rtmp.nr(), operandNR(xzrVReg), rtmp, true)
m.insert(neg)
}
// Copy the shift amount into a vector register as sshl/ushl requires it to be there.
dup := m.allocateInstr()
- dup.asVecDup(vtmp, rtmp, arr)
+ dup.asVecDup(vtmp.nr(), rtmp, arr)
m.insert(dup)
if op == ssa.OpcodeVIshl || op == ssa.OpcodeVSshr {
@@ -867,7 +867,7 @@ func (m *machine) lowerVShift(op ssa.Opcode, rd, rn, rm operand, arr vecArrangem
}
}
-func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangement) {
+func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm operand, rd regalloc.VReg, arr vecArrangement) {
tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
// Special case VallTrue for i64x2.
@@ -878,11 +878,11 @@ func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangem
// cset dst, eq
ins := m.allocateInstr()
- ins.asVecMisc(vecOpCmeq0, tmp, rm, vecArrangement2D)
+ ins.asVecMisc(vecOpCmeq0, tmp.nr(), rm, vecArrangement2D)
m.insert(ins)
addp := m.allocateInstr()
- addp.asVecRRR(vecOpAddp, tmp, tmp, tmp, vecArrangement2D)
+ addp.asVecRRR(vecOpAddp, tmp.nr(), tmp, tmp, vecArrangement2D)
m.insert(addp)
fcmp := m.allocateInstr()
@@ -890,7 +890,7 @@ func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangem
m.insert(fcmp)
cset := m.allocateInstr()
- cset.asCSet(rd.nr(), false, eq)
+ cset.asCSet(rd, false, eq)
m.insert(cset)
return
@@ -900,10 +900,10 @@ func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangem
ins := m.allocateInstr()
if op == ssa.OpcodeVanyTrue {
// umaxp v4?.16b, v2?.16b, v2?.16b
- ins.asVecRRR(vecOpUmaxp, tmp, rm, rm, vecArrangement16B)
+ ins.asVecRRR(vecOpUmaxp, tmp.nr(), rm, rm, vecArrangement16B)
} else {
// uminv d4?, v2?.4s
- ins.asVecLanes(vecOpUminv, tmp, rm, arr)
+ ins.asVecLanes(vecOpUminv, tmp.nr(), rm, arr)
}
m.insert(ins)
@@ -917,15 +917,15 @@ func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangem
m.insert(movv)
fc := m.allocateInstr()
- fc.asCCmpImm(rd, uint64(0), al, 0, true)
+ fc.asCCmpImm(operandNR(rd), uint64(0), al, 0, true)
m.insert(fc)
cset := m.allocateInstr()
- cset.asCSet(rd.nr(), false, ne)
+ cset.asCSet(rd, false, ne)
m.insert(cset)
}
-func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) {
+func (m *machine) lowerVhighBits(rm operand, rd regalloc.VReg, arr vecArrangement) {
r0 := operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
v0 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
v1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
@@ -947,7 +947,7 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) {
// Right arithmetic shift on the original vector and store the result into v1. So we have:
// v1[i] = 0xff if vi<0, 0 otherwise.
sshr := m.allocateInstr()
- sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(7), vecArrangement16B)
+ sshr.asVecShiftImm(vecOpSshr, v1.nr(), rm, operandShiftImm(7), vecArrangement16B)
m.insert(sshr)
// Load the bit mask into r0.
@@ -958,7 +958,7 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) {
// dup r0 to v0.
dup := m.allocateInstr()
- dup.asVecDup(v0, r0, vecArrangement2D)
+ dup.asVecDup(v0.nr(), r0, vecArrangement2D)
m.insert(dup)
// Lane-wise logical AND with the bit mask, meaning that we have
@@ -967,23 +967,23 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) {
// Below, we use the following notation:
// wi := (1 << i) if vi<0, 0 otherwise.
and := m.allocateInstr()
- and.asVecRRR(vecOpAnd, v1, v1, v0, vecArrangement16B)
+ and.asVecRRR(vecOpAnd, v1.nr(), v1, v0, vecArrangement16B)
m.insert(and)
// Swap the lower and higher 8 byte elements, and write it into v0, meaning that we have
// v0[i] = w(i+8) if i < 8, w(i-8) otherwise.
ext := m.allocateInstr()
- ext.asVecExtract(v0, v1, v1, vecArrangement16B, uint32(8))
+ ext.asVecExtract(v0.nr(), v1, v1, vecArrangement16B, uint32(8))
m.insert(ext)
// v = [w0, w8, ..., w7, w15]
zip1 := m.allocateInstr()
- zip1.asVecPermute(vecOpZip1, v0, v1, v0, vecArrangement16B)
+ zip1.asVecPermute(vecOpZip1, v0.nr(), v1, v0, vecArrangement16B)
m.insert(zip1)
// v.h[0] = w0 + ... + w15
addv := m.allocateInstr()
- addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H)
+ addv.asVecLanes(vecOpAddv, v0.nr(), v0, vecArrangement8H)
m.insert(addv)
// Extract the v.h[0] as the result.
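The comments in this 16B branch describe the usual vector bitmask trick: keep each lane's sign, AND it with a per-lane weight (1 << i), then sum all lanes so v.h[0] ends up holding the mask. A scalar sketch of the same computation (the 8H and 4S branches below follow the same pattern with wider lanes); highBits16B is illustrative only:

package main

import "fmt"

// highBits16B computes what the sshr/and/ext/zip1/addv sequence above produces:
// bit i of the result is set iff lane i of the 16 x i8 vector is negative.
func highBits16B(lanes [16]int8) (mask uint16) {
	for i, v := range lanes {
		if v < 0 {
			mask |= 1 << i // wi := (1 << i) if vi < 0, 0 otherwise
		}
	}
	return
}

func main() {
	var v [16]int8
	v[0], v[3], v[15] = -1, -128, -7
	fmt.Printf("%#x\n", highBits16B(v)) // 0x8009
}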
@@ -1006,7 +1006,7 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) {
// Right arithmetic shift on the original vector and store the result into v1. So we have:
// v[i] = 0xffff if vi<0, 0 otherwise.
sshr := m.allocateInstr()
- sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(15), vecArrangement8H)
+ sshr.asVecShiftImm(vecOpSshr, v1.nr(), rm, operandShiftImm(15), vecArrangement8H)
m.insert(sshr)
// Load the bit mask into r0.
@@ -1014,26 +1014,26 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) {
// dup r0 to vector v0.
dup := m.allocateInstr()
- dup.asVecDup(v0, r0, vecArrangement2D)
+ dup.asVecDup(v0.nr(), r0, vecArrangement2D)
m.insert(dup)
lsl := m.allocateInstr()
- lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(4), true)
+ lsl.asALUShift(aluOpLsl, r0.nr(), r0, operandShiftImm(4), true)
m.insert(lsl)
movv := m.allocateInstr()
- movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1))
+ movv.asMovToVec(v0.nr(), r0, vecArrangementD, vecIndex(1))
m.insert(movv)
// Lane-wise logical AND with the bitmask, meaning that we have
// v[i] = (1 << i) if vi<0, 0 otherwise for i=0..3
// = (1 << (i+4)) if vi<0, 0 otherwise for i=3..7
and := m.allocateInstr()
- and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B)
+ and.asVecRRR(vecOpAnd, v0.nr(), v1, v0, vecArrangement16B)
m.insert(and)
addv := m.allocateInstr()
- addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H)
+ addv.asVecLanes(vecOpAddv, v0.nr(), v0, vecArrangement8H)
m.insert(addv)
movfv := m.allocateInstr()
@@ -1055,7 +1055,7 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) {
// Right arithmetic shift on the original vector and store the result into v1. So we have:
// v[i] = 0xffffffff if vi<0, 0 otherwise.
sshr := m.allocateInstr()
- sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(31), vecArrangement4S)
+ sshr.asVecShiftImm(vecOpSshr, v1.nr(), rm, operandShiftImm(31), vecArrangement4S)
m.insert(sshr)
// Load the bit mask into r0.
@@ -1063,26 +1063,26 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) {
// dup r0 to vector v0.
dup := m.allocateInstr()
- dup.asVecDup(v0, r0, vecArrangement2D)
+ dup.asVecDup(v0.nr(), r0, vecArrangement2D)
m.insert(dup)
lsl := m.allocateInstr()
- lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(2), true)
+ lsl.asALUShift(aluOpLsl, r0.nr(), r0, operandShiftImm(2), true)
m.insert(lsl)
movv := m.allocateInstr()
- movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1))
+ movv.asMovToVec(v0.nr(), r0, vecArrangementD, vecIndex(1))
m.insert(movv)
// Lane-wise logical AND with the bitmask, meaning that we have
// v[i] = (1 << i) if vi<0, 0 otherwise for i in [0, 1]
// = (1 << (i+4)) if vi<0, 0 otherwise for i in [2, 3]
and := m.allocateInstr()
- and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B)
+ and.asVecRRR(vecOpAnd, v0.nr(), v1, v0, vecArrangement16B)
m.insert(and)
addv := m.allocateInstr()
- addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement4S)
+ addv.asVecLanes(vecOpAddv, v0.nr(), v0, vecArrangement4S)
m.insert(addv)
movfv := m.allocateInstr()
@@ -1102,21 +1102,21 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) {
// Move the higher 64-bit int into r0.
movv1 := m.allocateInstr()
- movv1.asMovFromVec(r0, rm, vecArrangementD, vecIndex(1), false)
+ movv1.asMovFromVec(r0.nr(), rm, vecArrangementD, vecIndex(1), false)
m.insert(movv1)
// Move the sign bit into the least significant bit.
lsr1 := m.allocateInstr()
- lsr1.asALUShift(aluOpLsr, r0, r0, operandShiftImm(63), true)
+ lsr1.asALUShift(aluOpLsr, r0.nr(), r0, operandShiftImm(63), true)
m.insert(lsr1)
lsr2 := m.allocateInstr()
- lsr2.asALUShift(aluOpLsr, rd, rd, operandShiftImm(63), true)
+ lsr2.asALUShift(aluOpLsr, rd, operandNR(rd), operandShiftImm(63), true)
m.insert(lsr2)
// rd = (r0<<1) | rd
lsl := m.allocateInstr()
- lsl.asALU(aluOpAdd, rd, rd, operandSR(r0.nr(), 1, shiftOpLSL), false)
+ lsl.asALU(aluOpAdd, rd, operandNR(rd), operandSR(r0.nr(), 1, shiftOpLSL), false)
m.insert(lsl)
default:
panic("Unsupported " + arr.String())
@@ -1128,7 +1128,7 @@ func (m *machine) lowerVecMisc(op vecOp, instr *ssa.Instruction) {
arr := ssaLaneToArrangement(lane)
ins := m.allocateInstr()
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
- rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ rd := m.compiler.VRegOf(instr.Return())
ins.asVecMisc(op, rd, rn, arr)
m.insert(ins)
}
@@ -1137,22 +1137,22 @@ func (m *machine) lowerVecRRR(op vecOp, x, y, ret ssa.Value, arr vecArrangement)
ins := m.allocateInstr()
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
- rd := operandNR(m.compiler.VRegOf(ret))
+ rd := m.compiler.VRegOf(ret)
ins.asVecRRR(op, rd, rn, rm, arr)
m.insert(ins)
}
-func (m *machine) lowerVIMul(rd, rn, rm operand, arr vecArrangement) {
+func (m *machine) lowerVIMul(rd regalloc.VReg, rn, rm operand, arr vecArrangement) {
if arr != vecArrangement2D {
mul := m.allocateInstr()
mul.asVecRRR(vecOpMul, rd, rn, rm, arr)
m.insert(mul)
} else {
- tmp1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
- tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
- tmp3 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+ tmp1 := m.compiler.AllocateVReg(ssa.TypeV128)
+ tmp2 := m.compiler.AllocateVReg(ssa.TypeV128)
+ tmp3 := m.compiler.AllocateVReg(ssa.TypeV128)
- tmpRes := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+ tmpRes := m.compiler.AllocateVReg(ssa.TypeV128)
// Following the algorithm in https://chromium-review.googlesource.com/c/v8/v8/+/1781696
rev64 := m.allocateInstr()
@@ -1160,7 +1160,7 @@ func (m *machine) lowerVIMul(rd, rn, rm operand, arr vecArrangement) {
m.insert(rev64)
mul := m.allocateInstr()
- mul.asVecRRR(vecOpMul, tmp2, tmp2, rn, vecArrangement4S)
+ mul.asVecRRR(vecOpMul, tmp2, operandNR(tmp2), rn, vecArrangement4S)
m.insert(mul)
xtn1 := m.allocateInstr()
@@ -1168,7 +1168,7 @@ func (m *machine) lowerVIMul(rd, rn, rm operand, arr vecArrangement) {
m.insert(xtn1)
addp := m.allocateInstr()
- addp.asVecRRR(vecOpAddp, tmp2, tmp2, tmp2, vecArrangement4S)
+ addp.asVecRRR(vecOpAddp, tmp2, operandNR(tmp2), operandNR(tmp2), vecArrangement4S)
m.insert(addp)
xtn2 := m.allocateInstr()
@@ -1179,15 +1179,15 @@ func (m *machine) lowerVIMul(rd, rn, rm operand, arr vecArrangement) {
// In short, in UMLAL instruction, the result register is also one of the source register, and
// the value on the result register is significant.
shll := m.allocateInstr()
- shll.asVecMisc(vecOpShll, tmpRes, tmp2, vecArrangement2S)
+ shll.asVecMisc(vecOpShll, tmpRes, operandNR(tmp2), vecArrangement2S)
m.insert(shll)
umlal := m.allocateInstr()
- umlal.asVecRRRRewrite(vecOpUmlal, tmpRes, tmp3, tmp1, vecArrangement2S)
+ umlal.asVecRRRRewrite(vecOpUmlal, tmpRes, operandNR(tmp3), operandNR(tmp1), vecArrangement2S)
m.insert(umlal)
mov := m.allocateInstr()
- mov.asFpuMov128(rd.nr(), tmpRes.nr())
+ mov.asFpuMov128(rd, tmpRes)
m.insert(mov)
}
}
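For the 2D arrangement there is no vector MUL, so the code above builds each 64-bit product from 32-bit halves following the V8 sequence referenced in the comment (REV64/MUL/XTN/ADDP/SHLL/UMLAL). A scalar sketch of the per-lane identity those instructions implement; mulLow64 is illustrative, not wazero code:

package main

import "fmt"

// mulLow64 computes the low 64 bits of a*b from 32-bit halves, the identity
// behind the vector sequence above: lo64(a*b) = aLo*bLo + ((aLo*bHi + aHi*bLo) << 32),
// with everything taken mod 2^64.
func mulLow64(a, b uint64) uint64 {
	aLo, aHi := a&0xffffffff, a>>32
	bLo, bHi := b&0xffffffff, b>>32
	cross := (aLo*bHi + aHi*bLo) << 32 // wrapping is fine: only the low 32 bits of the sum survive the shift
	return aLo*bLo + cross
}

func main() {
	a, b := uint64(0x1234567890abcdef), uint64(0xfedcba0987654321)
	fmt.Println(mulLow64(a, b) == a*b) // true
}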
@@ -1203,7 +1203,7 @@ func (m *machine) lowerVMinMaxPseudo(instr *ssa.Instruction, max bool) {
// BSL modifies the destination register, so we need to use a temporary register so that
// the actual definition of the destination register happens *after* the BSL instruction.
// That way, we can force the spill instruction to be inserted after the BSL instruction.
- tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+ tmp := m.compiler.AllocateVReg(ssa.TypeV128)
fcmgt := m.allocateInstr()
if max {
@@ -1220,17 +1220,17 @@ func (m *machine) lowerVMinMaxPseudo(instr *ssa.Instruction, max bool) {
res := operandNR(m.compiler.VRegOf(instr.Return()))
mov2 := m.allocateInstr()
- mov2.asFpuMov128(res.nr(), tmp.nr())
+ mov2.asFpuMov128(res.nr(), tmp)
m.insert(mov2)
}
-func (m *machine) lowerIRem(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) {
+func (m *machine) lowerIRem(execCtxVReg regalloc.VReg, rd, rn regalloc.VReg, rm operand, _64bit, signed bool) {
div := m.allocateInstr()
if signed {
- div.asALU(aluOpSDiv, rd, rn, rm, _64bit)
+ div.asALU(aluOpSDiv, rd, operandNR(rn), rm, _64bit)
} else {
- div.asALU(aluOpUDiv, rd, rn, rm, _64bit)
+ div.asALU(aluOpUDiv, rd, operandNR(rn), rm, _64bit)
}
m.insert(div)
@@ -1239,11 +1239,11 @@ func (m *machine) lowerIRem(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bi
// rd = rn-rd*rm by MSUB instruction.
msub := m.allocateInstr()
- msub.asALURRRR(aluOpMSub, rd, rd, rm, rn, _64bit)
+ msub.asALURRRR(aluOpMSub, rd, operandNR(rd), rm, rn, _64bit)
m.insert(msub)
}
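As the comment above says, the remainder comes out of MSUB as rd = rn - rd*rm, with rd already holding the quotient from the preceding division. The same identity in plain Go, as a sketch for reference (remViaMsub is not a wazero helper):

package main

import "fmt"

// remViaMsub mirrors the div+msub pair above: compute the quotient first, then
// remainder = dividend - quotient*divisor, which is exactly what MSUB produces.
func remViaMsub(rn, rm int64) int64 {
	quot := rn / rm     // sdiv rd, rn, rm (truncates toward zero, like SDIV)
	return rn - quot*rm // msub rd, rd, rm, rn
}

func main() {
	fmt.Println(remViaMsub(-7, 3), -7%3) // -1 -1
}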
-func (m *machine) lowerIDiv(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) {
+func (m *machine) lowerIDiv(execCtxVReg, rd regalloc.VReg, rn, rm operand, _64bit, signed bool) {
div := m.allocateInstr()
if signed {
@@ -1260,7 +1260,7 @@ func (m *machine) lowerIDiv(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bi
// We need to check the signed overflow which happens iff "math.MinInt{32,64} / -1"
minusOneCheck := m.allocateInstr()
// Sets eq condition if rm == -1.
- minusOneCheck.asALU(aluOpAddS, operandNR(xzrVReg), rm, operandImm12(1, 0), _64bit)
+ minusOneCheck.asALU(aluOpAddS, xzrVReg, rm, operandImm12(1, 0), _64bit)
m.insert(minusOneCheck)
ccmp := m.allocateInstr()
@@ -1290,20 +1290,20 @@ func (m *machine) exitIfNot(execCtxVReg regalloc.VReg, c cond, cond64bit bool, c
func (m *machine) lowerFcopysign(x, y, ret ssa.Value) {
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
- var tmpI, tmpF operand
+ var tmpI, tmpF regalloc.VReg
_64 := x.Type() == ssa.TypeF64
if _64 {
- tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
- tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
+ tmpF = m.compiler.AllocateVReg(ssa.TypeF64)
+ tmpI = m.compiler.AllocateVReg(ssa.TypeI64)
} else {
- tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF32))
- tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI32))
+ tmpF = m.compiler.AllocateVReg(ssa.TypeF32)
+ tmpI = m.compiler.AllocateVReg(ssa.TypeI32)
}
rd := m.compiler.VRegOf(ret)
- m.lowerFcopysignImpl(operandNR(rd), rn, rm, tmpI, tmpF, _64)
+ m.lowerFcopysignImpl(rd, rn, rm, tmpI, tmpF, _64)
}
-func (m *machine) lowerFcopysignImpl(rd, rn, rm, tmpI, tmpF operand, _64bit bool) {
+func (m *machine) lowerFcopysignImpl(rd regalloc.VReg, rn, rm operand, tmpI, tmpF regalloc.VReg, _64bit bool) {
// This is exactly the same code emitted by GCC for "__builtin_copysign":
//
// mov x0, -9223372036854775808
@@ -1313,26 +1313,26 @@ func (m *machine) lowerFcopysignImpl(rd, rn, rm, tmpI, tmpF operand, _64bit bool
setMSB := m.allocateInstr()
if _64bit {
- m.lowerConstantI64(tmpI.nr(), math.MinInt64)
- setMSB.asMovToVec(tmpF, tmpI, vecArrangementD, vecIndex(0))
+ m.lowerConstantI64(tmpI, math.MinInt64)
+ setMSB.asMovToVec(tmpF, operandNR(tmpI), vecArrangementD, vecIndex(0))
} else {
- m.lowerConstantI32(tmpI.nr(), math.MinInt32)
- setMSB.asMovToVec(tmpF, tmpI, vecArrangementS, vecIndex(0))
+ m.lowerConstantI32(tmpI, math.MinInt32)
+ setMSB.asMovToVec(tmpF, operandNR(tmpI), vecArrangementS, vecIndex(0))
}
m.insert(setMSB)
- tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
+ tmpReg := m.compiler.AllocateVReg(ssa.TypeF64)
mov := m.allocateInstr()
- mov.asFpuMov64(tmpReg.nr(), rn.nr())
+ mov.asFpuMov64(tmpReg, rn.nr())
m.insert(mov)
vbit := m.allocateInstr()
- vbit.asVecRRRRewrite(vecOpBit, tmpReg, rm, tmpF, vecArrangement8B)
+ vbit.asVecRRRRewrite(vecOpBit, tmpReg, rm, operandNR(tmpF), vecArrangement8B)
m.insert(vbit)
movDst := m.allocateInstr()
- movDst.asFpuMov64(rd.nr(), tmpReg.nr())
+ movDst.asFpuMov64(rd, tmpReg)
m.insert(movDst)
}
@@ -1340,7 +1340,7 @@ func (m *machine) lowerBitcast(instr *ssa.Instruction) {
v, dstType := instr.BitcastData()
srcType := v.Type()
rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone)
- rd := operandNR(m.compiler.VRegOf(instr.Return()))
+ rd := m.compiler.VRegOf(instr.Return())
srcInt := srcType.IsInt()
dstInt := dstType.IsInt()
switch {
@@ -1371,14 +1371,14 @@ func (m *machine) lowerBitcast(instr *ssa.Instruction) {
func (m *machine) lowerFpuUniOp(op fpuUniOp, in, out ssa.Value) {
rn := m.getOperand_NR(m.compiler.ValueDefinition(in), extModeNone)
- rd := operandNR(m.compiler.VRegOf(out))
+ rd := m.compiler.VRegOf(out)
neg := m.allocateInstr()
neg.asFpuRR(op, rd, rn, in.Type().Bits() == 64)
m.insert(neg)
}
-func (m *machine) lowerFpuToInt(rd, rn operand, ctx regalloc.VReg, signed, src64bit, dst64bit, nonTrapping bool) {
+func (m *machine) lowerFpuToInt(rd regalloc.VReg, rn operand, ctx regalloc.VReg, signed, src64bit, dst64bit, nonTrapping bool) {
if !nonTrapping {
// First of all, we have to clear the FPU flags.
flagClear := m.allocateInstr()
@@ -1405,7 +1405,7 @@ func (m *machine) lowerFpuToInt(rd, rn operand, ctx regalloc.VReg, signed, src64
// Check if the conversion was undefined by comparing the status with 1.
// See https://developer.arm.com/documentation/ddi0595/2020-12/AArch64-Registers/FPSR--Floating-point-Status-Register
alu := m.allocateInstr()
- alu.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpReg), operandImm12(1, 0), true)
+ alu.asALU(aluOpSubS, xzrVReg, operandNR(tmpReg), operandImm12(1, 0), true)
m.insert(alu)
// If it is not undefined, we can return the result.
@@ -1429,7 +1429,7 @@ func (m *machine) lowerFpuToInt(rd, rn operand, ctx regalloc.VReg, signed, src64
}
}
-func (m *machine) lowerIntToFpu(rd, rn operand, signed, src64bit, dst64bit bool) {
+func (m *machine) lowerIntToFpu(rd regalloc.VReg, rn operand, signed, src64bit, dst64bit bool) {
cvt := m.allocateInstr()
cvt.asIntToFpu(rd, rn, signed, src64bit, dst64bit)
m.insert(cvt)
@@ -1456,7 +1456,7 @@ func (m *machine) lowerFpuBinOp(si *ssa.Instruction) {
xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y)
rn := m.getOperand_NR(xDef, extModeNone)
rm := m.getOperand_NR(yDef, extModeNone)
- rd := operandNR(m.compiler.VRegOf(si.Return()))
+ rd := m.compiler.VRegOf(si.Return())
instr.asFpuRRR(op, rd, rn, rm, x.Type().Bits() == 64)
m.insert(instr)
}
@@ -1482,7 +1482,7 @@ func (m *machine) lowerSubOrAdd(si *ssa.Instruction, add bool) {
case !add && yNegated: // rn+rm = x-(-y) = x-y
aop = aluOpAdd
}
- rd := operandNR(m.compiler.VRegOf(si.Return()))
+ rd := m.compiler.VRegOf(si.Return())
alu := m.allocateInstr()
alu.asALU(aop, rd, rn, rm, x.Type().Bits() == 64)
m.insert(alu)
@@ -1527,7 +1527,7 @@ func (m *machine) lowerIcmp(si *ssa.Instruction) {
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext)
rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), ext)
alu := m.allocateInstr()
- alu.asALU(aluOpSubS, operandNR(xzrVReg), rn, rm, in64bit)
+ alu.asALU(aluOpSubS, xzrVReg, rn, rm, in64bit)
m.insert(alu)
cset := m.allocateInstr()
@@ -1542,7 +1542,7 @@ func (m *machine) lowerVIcmp(si *ssa.Instruction) {
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
- rd := operandNR(m.compiler.VRegOf(si.Return()))
+ rd := m.compiler.VRegOf(si.Return())
switch flag {
case eq:
@@ -1554,7 +1554,7 @@ func (m *machine) lowerVIcmp(si *ssa.Instruction) {
cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr)
m.insert(cmp)
not := m.allocateInstr()
- not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B)
+ not.asVecMisc(vecOpNot, rd, operandNR(rd), vecArrangement16B)
m.insert(not)
case ge:
cmp := m.allocateInstr()
@@ -1598,7 +1598,7 @@ func (m *machine) lowerVFcmp(si *ssa.Instruction) {
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
- rd := operandNR(m.compiler.VRegOf(si.Return()))
+ rd := m.compiler.VRegOf(si.Return())
switch flag {
case eq:
@@ -1610,7 +1610,7 @@ func (m *machine) lowerVFcmp(si *ssa.Instruction) {
cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr)
m.insert(cmp)
not := m.allocateInstr()
- not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B)
+ not.asVecMisc(vecOpNot, rd, operandNR(rd), vecArrangement16B)
m.insert(not)
case ge:
cmp := m.allocateInstr()
@@ -1631,7 +1631,7 @@ func (m *machine) lowerVFcmp(si *ssa.Instruction) {
}
}
-func (m *machine) lowerVfpuToInt(rd, rn operand, arr vecArrangement, signed bool) {
+func (m *machine) lowerVfpuToInt(rd regalloc.VReg, rn operand, arr vecArrangement, signed bool) {
cvt := m.allocateInstr()
if signed {
cvt.asVecMisc(vecOpFcvtzs, rd, rn, arr)
@@ -1643,15 +1643,15 @@ func (m *machine) lowerVfpuToInt(rd, rn operand, arr vecArrangement, signed bool
if arr == vecArrangement2D {
narrow := m.allocateInstr()
if signed {
- narrow.asVecMisc(vecOpSqxtn, rd, rd, vecArrangement2S)
+ narrow.asVecMisc(vecOpSqxtn, rd, operandNR(rd), vecArrangement2S)
} else {
- narrow.asVecMisc(vecOpUqxtn, rd, rd, vecArrangement2S)
+ narrow.asVecMisc(vecOpUqxtn, rd, operandNR(rd), vecArrangement2S)
}
m.insert(narrow)
}
}
-func (m *machine) lowerVfpuFromInt(rd, rn operand, arr vecArrangement, signed bool) {
+func (m *machine) lowerVfpuFromInt(rd regalloc.VReg, rn operand, arr vecArrangement, signed bool) {
cvt := m.allocateInstr()
if signed {
cvt.asVecMisc(vecOpScvtf, rd, rn, arr)
@@ -1665,7 +1665,7 @@ func (m *machine) lowerShifts(si *ssa.Instruction, ext extMode, aluOp aluOp) {
x, amount := si.Arg2()
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext)
rm := m.getOperand_ShiftImm_NR(m.compiler.ValueDefinition(amount), ext, x.Type().Bits())
- rd := operandNR(m.compiler.VRegOf(si.Return()))
+ rd := m.compiler.VRegOf(si.Return())
alu := m.allocateInstr()
alu.asALUShift(aluOp, rd, rn, rm, x.Type().Bits() == 64)
@@ -1678,11 +1678,11 @@ func (m *machine) lowerBitwiseAluOp(si *ssa.Instruction, op aluOp, ignoreResult
xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y)
rn := m.getOperand_NR(xDef, extModeNone)
- var rd operand
+ var rd regalloc.VReg
if ignoreResult {
- rd = operandNR(xzrVReg)
+ rd = xzrVReg
} else {
- rd = operandNR(m.compiler.VRegOf(si.Return()))
+ rd = m.compiler.VRegOf(si.Return())
}
_64 := x.Type().Bits() == 64
@@ -1691,7 +1691,7 @@ func (m *machine) lowerBitwiseAluOp(si *ssa.Instruction, op aluOp, ignoreResult
c := instr.ConstantVal()
if isBitMaskImmediate(c, _64) {
// Constant bit wise operations can be lowered to a single instruction.
- alu.asALUBitmaskImm(op, rd.nr(), rn.nr(), c, _64)
+ alu.asALUBitmaskImm(op, rd, rn.nr(), c, _64)
m.insert(alu)
return
}
@@ -1709,25 +1709,25 @@ func (m *machine) lowerRotl(si *ssa.Instruction) {
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
- var tmp operand
+ var tmp regalloc.VReg
if _64 {
- tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
+ tmp = m.compiler.AllocateVReg(ssa.TypeI64)
} else {
- tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI32))
+ tmp = m.compiler.AllocateVReg(ssa.TypeI32)
}
- rd := operandNR(m.compiler.VRegOf(r))
+ rd := m.compiler.VRegOf(r)
// Encode rotl as neg + rotr: neg is a sub against the zero-reg.
m.lowerRotlImpl(rd, rn, rm, tmp, _64)
}
-func (m *machine) lowerRotlImpl(rd, rn, rm, tmp operand, is64bit bool) {
+func (m *machine) lowerRotlImpl(rd regalloc.VReg, rn, rm operand, tmp regalloc.VReg, is64bit bool) {
// Encode rotl as neg + rotr: neg is a sub against the zero-reg.
neg := m.allocateInstr()
neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), rm, is64bit)
m.insert(neg)
alu := m.allocateInstr()
- alu.asALU(aluOpRotR, rd, rn, tmp, is64bit)
+ alu.asALU(aluOpRotR, rd, rn, operandNR(tmp), is64bit)
m.insert(alu)
}
@@ -1737,7 +1737,7 @@ func (m *machine) lowerRotr(si *ssa.Instruction) {
xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y)
rn := m.getOperand_NR(xDef, extModeNone)
rm := m.getOperand_NR(yDef, extModeNone)
- rd := operandNR(m.compiler.VRegOf(si.Return()))
+ rd := m.compiler.VRegOf(si.Return())
alu := m.allocateInstr()
alu.asALU(aluOpRotR, rd, rn, rm, si.Return().Type().Bits() == 64)
@@ -1797,7 +1797,7 @@ func (m *machine) lowerImul(x, y, result ssa.Value) {
// TODO: if this comes before Add/Sub, we could merge it by putting it into the place of xzrVReg.
mul := m.allocateInstr()
- mul.asALURRRR(aluOpMAdd, operandNR(rd), rn, rm, operandNR(xzrVReg), x.Type().Bits() == 64)
+ mul.asALURRRR(aluOpMAdd, rd, rn, rm, xzrVReg, x.Type().Bits() == 64)
m.insert(mul)
}
@@ -1849,22 +1849,22 @@ func (m *machine) lowerPopcnt(x, result ssa.Value) {
// mov x5, v0.d[0] ;; finally we mov the result back to a GPR
//
- rd := operandNR(m.compiler.VRegOf(result))
+ rd := m.compiler.VRegOf(result)
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
rf1 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
ins := m.allocateInstr()
- ins.asMovToVec(rf1, rn, vecArrangementD, vecIndex(0))
+ ins.asMovToVec(rf1.nr(), rn, vecArrangementD, vecIndex(0))
m.insert(ins)
rf2 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
cnt := m.allocateInstr()
- cnt.asVecMisc(vecOpCnt, rf2, rf1, vecArrangement16B)
+ cnt.asVecMisc(vecOpCnt, rf2.nr(), rf1, vecArrangement16B)
m.insert(cnt)
rf3 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64))
uaddlv := m.allocateInstr()
- uaddlv.asVecLanes(vecOpUaddlv, rf3, rf2, vecArrangement8B)
+ uaddlv.asVecLanes(vecOpUaddlv, rf3.nr(), rf2, vecArrangement8B)
m.insert(uaddlv)
mov := m.allocateInstr()
@@ -1879,32 +1879,35 @@ func (m *machine) lowerExitWithCode(execCtxVReg regalloc.VReg, code wazevoapi.Ex
loadExitCodeConst.asMOVZ(tmpReg1, uint64(code), 0, true)
setExitCode := m.allocateInstr()
- setExitCode.asStore(operandNR(tmpReg1),
- addressMode{
- kind: addressModeKindRegUnsignedImm12,
- rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(),
- }, 32)
+ mode := m.amodePool.Allocate()
+ *mode = addressMode{
+ kind: addressModeKindRegUnsignedImm12,
+ rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(),
+ }
+ setExitCode.asStore(operandNR(tmpReg1), mode, 32)
// In order to unwind the stack, we also need to push the current stack pointer:
tmp2 := m.compiler.AllocateVReg(ssa.TypeI64)
movSpToTmp := m.allocateInstr()
movSpToTmp.asMove64(tmp2, spVReg)
strSpToExecCtx := m.allocateInstr()
- strSpToExecCtx.asStore(operandNR(tmp2),
- addressMode{
- kind: addressModeKindRegUnsignedImm12,
- rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(),
- }, 64)
+ mode2 := m.amodePool.Allocate()
+ *mode2 = addressMode{
+ kind: addressModeKindRegUnsignedImm12,
+ rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(),
+ }
+ strSpToExecCtx.asStore(operandNR(tmp2), mode2, 64)
// Also the address of this exit.
tmp3 := m.compiler.AllocateVReg(ssa.TypeI64)
currentAddrToTmp := m.allocateInstr()
currentAddrToTmp.asAdr(tmp3, 0)
storeCurrentAddrToExecCtx := m.allocateInstr()
- storeCurrentAddrToExecCtx.asStore(operandNR(tmp3),
- addressMode{
- kind: addressModeKindRegUnsignedImm12,
- rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(),
- }, 64)
+ mode3 := m.amodePool.Allocate()
+ *mode3 = addressMode{
+ kind: addressModeKindRegUnsignedImm12,
+ rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(),
+ }
+ storeCurrentAddrToExecCtx.asStore(operandNR(tmp3), mode3, 64)
exitSeq := m.allocateInstr()
exitSeq.asExitSequence(execCtxVReg)
@@ -1937,7 +1940,7 @@ func (m *machine) lowerIcmpToFlag(x, y ssa.Value, signed bool) {
alu.asALU(
aluOpSubS,
// We don't need the result, just need to set flags.
- operandNR(xzrVReg),
+ xzrVReg,
rn,
rm,
x.Type().Bits() == 64,
@@ -2012,7 +2015,7 @@ func (m *machine) lowerSelect(c, x, y, result ssa.Value) {
alu.asALU(
aluOpSubS,
// We don't need the result, just need to set flags.
- operandNR(xzrVReg),
+ xzrVReg,
rn,
operandNR(xzrVReg),
c.Type().Bits() == 64,
@@ -2024,7 +2027,7 @@ func (m *machine) lowerSelect(c, x, y, result ssa.Value) {
rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone)
- rd := operandNR(m.compiler.VRegOf(result))
+ rd := m.compiler.VRegOf(result)
switch x.Type() {
case ssa.TypeI32, ssa.TypeI64:
// csel rd, rn, rm, cc
@@ -2041,10 +2044,10 @@ func (m *machine) lowerSelect(c, x, y, result ssa.Value) {
}
}
-func (m *machine) lowerSelectVec(rc, rn, rm, rd operand) {
+func (m *machine) lowerSelectVec(rc, rn, rm operand, rd regalloc.VReg) {
// First check if `rc` is zero or not.
checkZero := m.allocateInstr()
- checkZero.asALU(aluOpSubS, operandNR(xzrVReg), rc, operandNR(xzrVReg), false)
+ checkZero.asALU(aluOpSubS, xzrVReg, rc, operandNR(xzrVReg), false)
m.insert(checkZero)
// Then use CSETM to set all bits to one if `rc` is zero.
@@ -2054,7 +2057,7 @@ func (m *machine) lowerSelectVec(rc, rn, rm, rd operand) {
m.insert(cset)
// Then move the bits to the result vector register.
- tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128))
+ tmp2 := m.compiler.AllocateVReg(ssa.TypeV128)
dup := m.allocateInstr()
dup.asVecDup(tmp2, operandNR(allOnesOrZero), vecArrangement2D)
m.insert(dup)
@@ -2067,7 +2070,7 @@ func (m *machine) lowerSelectVec(rc, rn, rm, rd operand) {
// Finally, move the result to the destination register.
mov2 := m.allocateInstr()
- mov2.asFpuMov128(rd.nr(), tmp2.nr())
+ mov2.asFpuMov128(rd, tmp2)
m.insert(mov2)
}
@@ -2099,28 +2102,28 @@ func (m *machine) lowerAtomicRmw(si *ssa.Instruction) {
addr, val := si.Arg2()
addrDef, valDef := m.compiler.ValueDefinition(addr), m.compiler.ValueDefinition(val)
rn := m.getOperand_NR(addrDef, extModeNone)
- rt := operandNR(m.compiler.VRegOf(si.Return()))
+ rt := m.compiler.VRegOf(si.Return())
rs := m.getOperand_NR(valDef, extModeNone)
_64 := si.Return().Type().Bits() == 64
- var tmp operand
+ var tmp regalloc.VReg
if _64 {
- tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI64))
+ tmp = m.compiler.AllocateVReg(ssa.TypeI64)
} else {
- tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI32))
+ tmp = m.compiler.AllocateVReg(ssa.TypeI32)
}
- m.lowerAtomicRmwImpl(op, rn, rs, rt, tmp, size, negateArg, flipArg, _64)
+ m.lowerAtomicRmwImpl(op, rn.nr(), rs.nr(), rt, tmp, size, negateArg, flipArg, _64)
}
-func (m *machine) lowerAtomicRmwImpl(op atomicRmwOp, rn, rs, rt, tmp operand, size uint64, negateArg, flipArg, dst64bit bool) {
+func (m *machine) lowerAtomicRmwImpl(op atomicRmwOp, rn, rs, rt, tmp regalloc.VReg, size uint64, negateArg, flipArg, dst64bit bool) {
switch {
case negateArg:
neg := m.allocateInstr()
- neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), rs, dst64bit)
+ neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), operandNR(rs), dst64bit)
m.insert(neg)
case flipArg:
flip := m.allocateInstr()
- flip.asALU(aluOpOrn, tmp, operandNR(xzrVReg), rs, dst64bit)
+ flip.asALU(aluOpOrn, tmp, operandNR(xzrVReg), operandNR(rs), dst64bit)
m.insert(flip)
default:
tmp = rs
@@ -2139,32 +2142,32 @@ func (m *machine) lowerAtomicCas(si *ssa.Instruction) {
rn := m.getOperand_NR(addrDef, extModeNone)
rt := m.getOperand_NR(replDef, extModeNone)
rs := m.getOperand_NR(expDef, extModeNone)
- tmp := operandNR(m.compiler.AllocateVReg(si.Return().Type()))
+ tmp := m.compiler.AllocateVReg(si.Return().Type())
_64 := si.Return().Type().Bits() == 64
// rs is overwritten by CAS, so we need to move it to the result register before the instruction
// in case it is used somewhere else.
mov := m.allocateInstr()
if _64 {
- mov.asMove64(tmp.nr(), rs.nr())
+ mov.asMove64(tmp, rs.nr())
} else {
- mov.asMove32(tmp.nr(), rs.nr())
+ mov.asMove32(tmp, rs.nr())
}
m.insert(mov)
- m.lowerAtomicCasImpl(rn, tmp, rt, size)
+ m.lowerAtomicCasImpl(rn.nr(), tmp, rt.nr(), size)
mov2 := m.allocateInstr()
rd := m.compiler.VRegOf(si.Return())
if _64 {
- mov2.asMove64(rd, tmp.nr())
+ mov2.asMove64(rd, tmp)
} else {
- mov2.asMove32(rd, tmp.nr())
+ mov2.asMove32(rd, tmp)
}
m.insert(mov2)
}
-func (m *machine) lowerAtomicCasImpl(rn, rs, rt operand, size uint64) {
+func (m *machine) lowerAtomicCasImpl(rn, rs, rt regalloc.VReg, size uint64) {
cas := m.allocateInstr()
cas.asAtomicCas(rn, rs, rt, size)
m.insert(cas)
@@ -2176,12 +2179,12 @@ func (m *machine) lowerAtomicLoad(si *ssa.Instruction) {
addrDef := m.compiler.ValueDefinition(addr)
rn := m.getOperand_NR(addrDef, extModeNone)
- rt := operandNR(m.compiler.VRegOf(si.Return()))
+ rt := m.compiler.VRegOf(si.Return())
- m.lowerAtomicLoadImpl(rn, rt, size)
+ m.lowerAtomicLoadImpl(rn.nr(), rt, size)
}
-func (m *machine) lowerAtomicLoadImpl(rn, rt operand, size uint64) {
+func (m *machine) lowerAtomicLoadImpl(rn, rt regalloc.VReg, size uint64) {
ld := m.allocateInstr()
ld.asAtomicLoad(rn, rt, size)
m.insert(ld)
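
The recurring change through lower_instr.go above is a narrowing of destination parameters: where the old code passed the destination as an operand, the new code passes a bare regalloc.VReg and wraps a register with operandNR only where a genuine source operand is required. A minimal sketch of that shape, using toy register/operand types rather than wazero's real ones:

package main

import "fmt"

// Toy stand-ins for the backend's register and operand types; the real wazero
// definitions are richer, so this only illustrates the shape of the refactor.
type vReg uint32

type operandKind byte

const (
	operandKindNR operandKind = iota // plain register, no shift/extend/immediate
	operandKindImm12
)

type operand struct {
	kind operandKind
	reg  vReg
	imm  uint64
}

func operandNR(r vReg) operand { return operand{kind: operandKindNR, reg: r} }

// Old shape: the destination was an operand even though only a plain register is legal there.
func emitDivOld(rd, rn, rm operand) string {
	return fmt.Sprintf("sdiv x%d, x%d, x%d", rd.reg, rn.reg, rm.reg)
}

// New shape: the destination (and pure register sources) are bare vRegs;
// only true source operands keep the wrapper.
func emitDivNew(rd, rn vReg, rm operand) string {
	return fmt.Sprintf("sdiv x%d, x%d, x%d", rd, rn, rm.reg)
}

func main() {
	rd, rn, rm := vReg(0), vReg(1), vReg(2)
	fmt.Println(emitDivOld(operandNR(rd), operandNR(rn), operandNR(rm)))
	fmt.Println(emitDivNew(rd, rn, operandNR(rm)))
}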
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go
index 4842eaa38..fd0760d72 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go
@@ -24,6 +24,14 @@ type (
addressModeKind byte
)
+func resetAddressMode(a *addressMode) {
+ a.kind = 0
+ a.rn = 0
+ a.rm = 0
+ a.extOp = 0
+ a.imm = 0
+}
+
const (
// addressModeKindRegExtended takes a base register and an index register. The index register is sign/zero-extended,
// and then scaled by bits(type)/8.
@@ -140,15 +148,17 @@ func (a addressMode) format(dstSizeBits byte) (ret string) {
return
}
-func addressModePreOrPostIndex(rn regalloc.VReg, imm int64, preIndex bool) addressMode {
+func addressModePreOrPostIndex(m *machine, rn regalloc.VReg, imm int64, preIndex bool) *addressMode {
if !offsetFitsInAddressModeKindRegSignedImm9(imm) {
panic(fmt.Sprintf("BUG: offset %#x does not fit in addressModeKindRegSignedImm9", imm))
}
+ mode := m.amodePool.Allocate()
if preIndex {
- return addressMode{kind: addressModeKindPreIndex, rn: rn, imm: imm}
+ *mode = addressMode{kind: addressModeKindPreIndex, rn: rn, imm: imm}
} else {
- return addressMode{kind: addressModeKindPostIndex, rn: rn, imm: imm}
+ *mode = addressMode{kind: addressModeKindPostIndex, rn: rn, imm: imm}
}
+ return mode
}
func offsetFitsInAddressModeKindRegUnsignedImm12(dstSizeInBits byte, offset int64) bool {
@@ -207,9 +217,9 @@ func (m *machine) lowerExtLoad(op ssa.Opcode, ptr ssa.Value, offset uint32, ret
amode := m.lowerToAddressMode(ptr, offset, size)
load := m.allocateInstr()
if signed {
- load.asSLoad(operandNR(ret), amode, size)
+ load.asSLoad(ret, amode, size)
} else {
- load.asULoad(operandNR(ret), amode, size)
+ load.asULoad(ret, amode, size)
}
m.insert(load)
}
@@ -221,11 +231,11 @@ func (m *machine) lowerLoad(ptr ssa.Value, offset uint32, typ ssa.Type, ret ssa.
load := m.allocateInstr()
switch typ {
case ssa.TypeI32, ssa.TypeI64:
- load.asULoad(operandNR(dst), amode, typ.Bits())
+ load.asULoad(dst, amode, typ.Bits())
case ssa.TypeF32, ssa.TypeF64:
- load.asFpuLoad(operandNR(dst), amode, typ.Bits())
+ load.asFpuLoad(dst, amode, typ.Bits())
case ssa.TypeV128:
- load.asFpuLoad(operandNR(dst), amode, 128)
+ load.asFpuLoad(dst, amode, 128)
default:
panic("TODO")
}
@@ -239,7 +249,7 @@ func (m *machine) lowerLoadSplat(ptr ssa.Value, offset uint32, lane ssa.VecLane,
m.lowerConstantI64(offsetReg, int64(offset))
addedBase := m.addReg64ToReg64(base, offsetReg)
- rd := operandNR(m.compiler.VRegOf(ret))
+ rd := m.compiler.VRegOf(ret)
ld1r := m.allocateInstr()
ld1r.asVecLoad1R(rd, operandNR(addedBase), ssaLaneToArrangement(lane))
@@ -258,7 +268,7 @@ func (m *machine) lowerStore(si *ssa.Instruction) {
}
// lowerToAddressMode converts a pointer to an addressMode that can be used as an operand for load/store instructions.
-func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32, size byte) (amode addressMode) {
+func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32, size byte) (amode *addressMode) {
// TODO: currently the instruction selection logic doesn't support addressModeKindRegScaledExtended and
// addressModeKindRegScaled since collectAddends doesn't take ssa.OpcodeIshl into account. This should be fixed
// to support more efficient address resolution.
@@ -272,32 +282,33 @@ func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32, size byte
// During the construction, this might emit additional instructions.
//
// Extracted as a separate function for easy testing.
-func (m *machine) lowerToAddressModeFromAddends(a32s *wazevoapi.Queue[addend32], a64s *wazevoapi.Queue[regalloc.VReg], size byte, offset int64) (amode addressMode) {
+func (m *machine) lowerToAddressModeFromAddends(a32s *wazevoapi.Queue[addend32], a64s *wazevoapi.Queue[regalloc.VReg], size byte, offset int64) (amode *addressMode) {
+ amode = m.amodePool.Allocate()
switch a64sExist, a32sExist := !a64s.Empty(), !a32s.Empty(); {
case a64sExist && a32sExist:
var base regalloc.VReg
base = a64s.Dequeue()
var a32 addend32
a32 = a32s.Dequeue()
- amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: a32.r, extOp: a32.ext}
+ *amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: a32.r, extOp: a32.ext}
case a64sExist && offsetFitsInAddressModeKindRegUnsignedImm12(size, offset):
var base regalloc.VReg
base = a64s.Dequeue()
- amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: offset}
+ *amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: offset}
offset = 0
case a64sExist && offsetFitsInAddressModeKindRegSignedImm9(offset):
var base regalloc.VReg
base = a64s.Dequeue()
- amode = addressMode{kind: addressModeKindRegSignedImm9, rn: base, imm: offset}
+ *amode = addressMode{kind: addressModeKindRegSignedImm9, rn: base, imm: offset}
offset = 0
case a64sExist:
var base regalloc.VReg
base = a64s.Dequeue()
if !a64s.Empty() {
index := a64s.Dequeue()
- amode = addressMode{kind: addressModeKindRegReg, rn: base, rm: index, extOp: extendOpUXTX /* indicates index reg is 64-bit */}
+ *amode = addressMode{kind: addressModeKindRegReg, rn: base, rm: index, extOp: extendOpUXTX /* indicates index reg is 64-bit */}
} else {
- amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0}
+ *amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0}
}
case a32sExist:
base32 := a32s.Dequeue()
@@ -314,14 +325,14 @@ func (m *machine) lowerToAddressModeFromAddends(a32s *wazevoapi.Queue[addend32],
if !a32s.Empty() {
index := a32s.Dequeue()
- amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: index.r, extOp: index.ext}
+ *amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: index.r, extOp: index.ext}
} else {
- amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0}
+ *amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0}
}
default: // Only static offsets.
tmpReg := m.compiler.AllocateVReg(ssa.TypeI64)
m.lowerConstantI64(tmpReg, offset)
- amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpReg, imm: 0}
+ *amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpReg, imm: 0}
offset = 0
}
@@ -411,13 +422,13 @@ func (m *machine) addConstToReg64(r regalloc.VReg, c int64) (rd regalloc.VReg) {
rd = m.compiler.AllocateVReg(ssa.TypeI64)
alu := m.allocateInstr()
if imm12Op, ok := asImm12Operand(uint64(c)); ok {
- alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), imm12Op, true)
+ alu.asALU(aluOpAdd, rd, operandNR(r), imm12Op, true)
} else if imm12Op, ok = asImm12Operand(uint64(-c)); ok {
- alu.asALU(aluOpSub, operandNR(rd), operandNR(r), imm12Op, true)
+ alu.asALU(aluOpSub, rd, operandNR(r), imm12Op, true)
} else {
tmp := m.compiler.AllocateVReg(ssa.TypeI64)
m.load64bitConst(c, tmp)
- alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), operandNR(tmp), true)
+ alu.asALU(aluOpAdd, rd, operandNR(r), operandNR(tmp), true)
}
m.insert(alu)
return
@@ -426,7 +437,7 @@ func (m *machine) addConstToReg64(r regalloc.VReg, c int64) (rd regalloc.VReg) {
func (m *machine) addReg64ToReg64(rn, rm regalloc.VReg) (rd regalloc.VReg) {
rd = m.compiler.AllocateVReg(ssa.TypeI64)
alu := m.allocateInstr()
- alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandNR(rm), true)
+ alu.asALU(aluOpAdd, rd, operandNR(rn), operandNR(rm), true)
m.insert(alu)
return
}
@@ -434,7 +445,7 @@ func (m *machine) addReg64ToReg64(rn, rm regalloc.VReg) (rd regalloc.VReg) {
func (m *machine) addRegToReg64Ext(rn, rm regalloc.VReg, ext extendOp) (rd regalloc.VReg) {
rd = m.compiler.AllocateVReg(ssa.TypeI64)
alu := m.allocateInstr()
- alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandER(rm, ext, 64), true)
+ alu.asALU(aluOpAdd, rd, operandNR(rn), operandER(rm, ext, 64), true)
m.insert(alu)
return
}
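
lower_mem.go now hands out address modes as pointers taken from a per-machine pool (m.amodePool) and clears them via resetAddressMode, so instructions share *addressMode values that are recycled when the machine is reset. A minimal, self-contained sketch of such a reset-able pool; the real wazevoapi.Pool differs in its internals, and the names here are only illustrative:

package main

import "fmt"

// pool is a toy reset-able allocator in the spirit of amodePool above.
type pool[T any] struct {
	items []*T
	next  int
	reset func(*T)
}

func newPool[T any](reset func(*T)) pool[T] {
	return pool[T]{reset: reset}
}

// Allocate returns a reset *T, reusing storage handed out in earlier rounds.
func (p *pool[T]) Allocate() *T {
	if p.next == len(p.items) {
		p.items = append(p.items, new(T))
	}
	v := p.items[p.next]
	p.next++
	p.reset(v)
	return v
}

// Reset makes all previously allocated items available again without freeing them.
func (p *pool[T]) Reset() { p.next = 0 }

type addressMode struct {
	kind byte
	rn   uint32
	imm  int64
}

func resetAddressMode(a *addressMode) { *a = addressMode{} }

func main() {
	p := newPool[addressMode](resetAddressMode)
	m := p.Allocate()
	*m = addressMode{kind: 1, rn: 31, imm: 16}
	fmt.Printf("%+v\n", *m)
	p.Reset() // per-function compilation boundary: storage is reused, not reallocated
	fmt.Printf("%+v\n", *p.Allocate())
}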
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go
index b435d9ba9..5f584f928 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go
@@ -21,6 +21,8 @@ type (
regAlloc regalloc.Allocator
regAllocFn *backend.RegAllocFunction[*instruction, *machine]
+ amodePool wazevoapi.Pool[addressMode]
+
// addendsWorkQueue is used during address lowering, defined here for reuse.
addendsWorkQueue wazevoapi.Queue[ssa.Value]
addends32 wazevoapi.Queue[addend32]
@@ -105,6 +107,7 @@ func NewBackend() backend.Machine {
spillSlots: make(map[regalloc.VRegID]int64),
executableContext: newExecutableContext(),
regAlloc: regalloc.NewAllocator(regInfo),
+ amodePool: wazevoapi.NewPool[addressMode](resetAddressMode),
}
return m
}
@@ -149,6 +152,7 @@ func (m *machine) Reset() {
m.maxRequiredStackSizeForCalls = 0
m.executableContext.Reset()
m.jmpTableTargets = m.jmpTableTargets[:0]
+ m.amodePool.Reset()
}
// SetCurrentABI implements backend.Machine SetCurrentABI.
@@ -183,9 +187,8 @@ func (m *machine) allocateBrTarget() (nop *instruction, l label) {
l = ectx.AllocateLabel()
nop = m.allocateInstr()
nop.asNop0WithLabel(l)
- pos := ectx.AllocateLabelPosition(l)
+ pos := ectx.GetOrAllocateLabelPosition(l)
pos.Begin, pos.End = nop, nop
- ectx.LabelPositions[l] = pos
return
}
@@ -209,7 +212,7 @@ func (m *machine) allocateNop() *instruction {
}
func (m *machine) resolveAddressingMode(arg0offset, ret0offset int64, i *instruction) {
- amode := &i.amode
+ amode := i.getAmode()
switch amode.kind {
case addressModeKindResultStackSpace:
amode.imm += ret0offset
@@ -281,7 +284,7 @@ func (m *machine) resolveRelativeAddresses(ctx context.Context) {
switch cur.kind {
case nop0:
l := cur.nop0Label()
- if pos, ok := ectx.LabelPositions[l]; ok {
+ if pos := ectx.LabelPositions[l]; pos != nil {
pos.BinaryOffset = offset + size
}
case condBr:
@@ -428,8 +431,10 @@ func (m *machine) insertConditionalJumpTrampoline(cbr *instruction, currentBlk *
func (m *machine) Format() string {
ectx := m.executableContext
begins := map[*instruction]label{}
- for l, pos := range ectx.LabelPositions {
- begins[pos.Begin] = l
+ for _, pos := range ectx.LabelPositions {
+ if pos != nil {
+ begins[pos.Begin] = pos.L
+ }
}
irBlocks := map[label]ssa.BasicBlockID{}
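
In machine.go the map from label to position becomes a slice indexed directly by the label, with nil marking absent entries and a GetOrAllocateLabelPosition-style helper growing it on demand. A small sketch of that dense-index lookup with toy types (the helper name and fields here are illustrative, not wazero's):

package main

import "fmt"

type label int

type labelPosition struct {
	l            label
	binaryOffset int64
}

// positions is indexed directly by label; missing entries are nil.
type positions []*labelPosition

// getOrAllocate grows the slice as needed and returns the entry for l,
// replacing the old map lookup-or-insert.
func (ps *positions) getOrAllocate(l label) *labelPosition {
	if int(l) >= len(*ps) {
		grown := make(positions, int(l)+1)
		copy(grown, *ps)
		*ps = grown
	}
	if (*ps)[l] == nil {
		(*ps)[l] = &labelPosition{l: l}
	}
	return (*ps)[l]
}

func main() {
	var ps positions
	ps.getOrAllocate(3).binaryOffset = 0x40
	for _, pos := range ps { // iteration now skips nil holes instead of ranging a map
		if pos != nil {
			fmt.Printf("L%d at %#x\n", pos.l, pos.binaryOffset)
		}
	}
}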
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go
index 466fac464..d9032f921 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go
@@ -70,7 +70,7 @@ func (m *machine) setupPrologue() {
// +-----------------+ <----- SP
// (low address)
//
- _amode := addressModePreOrPostIndex(spVReg,
+ _amode := addressModePreOrPostIndex(m, spVReg,
-16, // stack pointer must be 16-byte aligned.
true, // Decrement before store.
)
@@ -159,7 +159,7 @@ func (m *machine) createReturnAddrAndSizeOfArgRetSlot(cur *instruction) *instruc
sizeOfArgRetReg = tmpRegVReg
subSp := m.allocateInstr()
- subSp.asALU(aluOpSub, operandNR(spVReg), operandNR(spVReg), operandNR(sizeOfArgRetReg), true)
+ subSp.asALU(aluOpSub, spVReg, operandNR(spVReg), operandNR(sizeOfArgRetReg), true)
cur = linkInstr(cur, subSp)
} else {
sizeOfArgRetReg = xzrVReg
@@ -168,7 +168,7 @@ func (m *machine) createReturnAddrAndSizeOfArgRetSlot(cur *instruction) *instruc
// Saves the return address (lr) and the size_of_arg_ret below the SP.
// size_of_arg_ret is used for stack unwinding.
pstr := m.allocateInstr()
- amode := addressModePreOrPostIndex(spVReg, -16, true /* decrement before store */)
+ amode := addressModePreOrPostIndex(m, spVReg, -16, true /* decrement before store */)
pstr.asStorePair64(lrVReg, sizeOfArgRetReg, amode)
cur = linkInstr(cur, pstr)
return cur
@@ -182,7 +182,7 @@ func (m *machine) createFrameSizeSlot(cur *instruction, s int64) *instruction {
} else {
frameSizeReg = xzrVReg
}
- _amode := addressModePreOrPostIndex(spVReg,
+ _amode := addressModePreOrPostIndex(m, spVReg,
-16, // stack pointer must be 16-byte aligned.
true, // Decrement before store.
)
@@ -213,7 +213,7 @@ func (m *machine) postRegAlloc() {
m.executableContext.PendingInstructions = m.executableContext.PendingInstructions[:0]
default:
// Removes the redundant copy instruction.
- if cur.IsCopy() && cur.rn.realReg() == cur.rd.realReg() {
+ if cur.IsCopy() && cur.rn.realReg() == cur.rd.RealReg() {
prev, next := cur.prev, cur.next
// Remove the copy instruction.
prev.next = next
@@ -286,16 +286,16 @@ func (m *machine) setupEpilogueAfter(cur *instruction) {
for i := range m.clobberedRegs {
vr := m.clobberedRegs[l-i] // reverse order to restore.
load := m.allocateInstr()
- amode := addressModePreOrPostIndex(spVReg,
+ amode := addressModePreOrPostIndex(m, spVReg,
16, // stack pointer must be 16-byte aligned.
false, // Increment after store.
)
// TODO: pair loads to reduce the number of instructions.
switch regTypeToRegisterSizeInBits(vr.RegType()) {
case 64: // save int reg.
- load.asULoad(operandNR(vr), amode, 64)
+ load.asULoad(vr, amode, 64)
case 128: // save vector reg.
- load.asFpuLoad(operandNR(vr), amode, 128)
+ load.asFpuLoad(vr, amode, 128)
}
cur = linkInstr(cur, load)
}
@@ -317,8 +317,8 @@ func (m *machine) setupEpilogueAfter(cur *instruction) {
// SP----> +-----------------+
ldr := m.allocateInstr()
- ldr.asULoad(operandNR(lrVReg),
- addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64)
+ ldr.asULoad(lrVReg,
+ addressModePreOrPostIndex(m, spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64)
cur = linkInstr(cur, ldr)
if s := int64(m.currentABI.AlignedArgResultStackSlotSize()); s > 0 {
@@ -351,14 +351,14 @@ func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instructi
if immm12op, ok := asImm12Operand(uint64(requiredStackSize)); ok {
// sub tmp, sp, #requiredStackSize
sub := m.allocateInstr()
- sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), immm12op, true)
+ sub.asALU(aluOpSub, tmpRegVReg, operandNR(spVReg), immm12op, true)
cur = linkInstr(cur, sub)
} else {
// This case, we first load the requiredStackSize into the temporary register,
cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize)
// Then subtract it.
sub := m.allocateInstr()
- sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), operandNR(tmpRegVReg), true)
+ sub.asALU(aluOpSub, tmpRegVReg, operandNR(spVReg), operandNR(tmpRegVReg), true)
cur = linkInstr(cur, sub)
}
@@ -366,16 +366,18 @@ func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instructi
// ldr tmp2, [executionContext #StackBottomPtr]
ldr := m.allocateInstr()
- ldr.asULoad(operandNR(tmp2), addressMode{
+ amode := m.amodePool.Allocate()
+ *amode = addressMode{
kind: addressModeKindRegUnsignedImm12,
rn: x0VReg, // execution context is always the first argument.
imm: wazevoapi.ExecutionContextOffsetStackBottomPtr.I64(),
- }, 64)
+ }
+ ldr.asULoad(tmp2, amode, 64)
cur = linkInstr(cur, ldr)
// subs xzr, tmp, tmp2
subs := m.allocateInstr()
- subs.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpRegVReg), operandNR(tmp2), true)
+ subs.asALU(aluOpSubS, xzrVReg, operandNR(tmpRegVReg), operandNR(tmp2), true)
cur = linkInstr(cur, subs)
// b.ge #imm
@@ -388,22 +390,25 @@ func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instructi
// First load the requiredStackSize into the temporary register,
cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize)
setRequiredStackSize := m.allocateInstr()
- setRequiredStackSize.asStore(operandNR(tmpRegVReg),
- addressMode{
- kind: addressModeKindRegUnsignedImm12,
- // Execution context is always the first argument.
- rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.I64(),
- }, 64)
+ amode := m.amodePool.Allocate()
+ *amode = addressMode{
+ kind: addressModeKindRegUnsignedImm12,
+ // Execution context is always the first argument.
+ rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.I64(),
+ }
+ setRequiredStackSize.asStore(operandNR(tmpRegVReg), amode, 64)
cur = linkInstr(cur, setRequiredStackSize)
}
ldrAddress := m.allocateInstr()
- ldrAddress.asULoad(operandNR(tmpRegVReg), addressMode{
+ amode2 := m.amodePool.Allocate()
+ *amode2 = addressMode{
kind: addressModeKindRegUnsignedImm12,
rn: x0VReg, // execution context is always the first argument
imm: wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.I64(),
- }, 64)
+ }
+ ldrAddress.asULoad(tmpRegVReg, amode2, 64)
cur = linkInstr(cur, ldrAddress)
// Then jumps to the stack grow call sequence's address, meaning
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go
index 1c8793b73..c7eb92cc2 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go
@@ -91,7 +91,7 @@ func (m *machine) InsertStoreRegisterAt(v regalloc.VReg, instr *instruction, aft
}
offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size())
- var amode addressMode
+ var amode *addressMode
cur, amode = m.resolveAddressModeForOffsetAndInsert(cur, offsetFromSP, typ.Bits(), spVReg, true)
store := m.allocateInstr()
store.asStore(operandNR(v), amode, typ.Bits())
@@ -116,16 +116,16 @@ func (m *machine) InsertReloadRegisterAt(v regalloc.VReg, instr *instruction, af
}
offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size())
- var amode addressMode
+ var amode *addressMode
cur, amode = m.resolveAddressModeForOffsetAndInsert(cur, offsetFromSP, typ.Bits(), spVReg, true)
load := m.allocateInstr()
switch typ {
case ssa.TypeI32, ssa.TypeI64:
- load.asULoad(operandNR(v), amode, typ.Bits())
+ load.asULoad(v, amode, typ.Bits())
case ssa.TypeF32, ssa.TypeF64:
- load.asFpuLoad(operandNR(v), amode, typ.Bits())
+ load.asFpuLoad(v, amode, typ.Bits())
case ssa.TypeV128:
- load.asFpuLoad(operandNR(v), amode, 128)
+ load.asFpuLoad(v, amode, 128)
default:
panic("TODO")
}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go
index 3f36c84e5..655370786 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go
@@ -35,7 +35,7 @@ type (
iter int
reversePostOrderBlocks []RegAllocBlock[I, m]
// labelToRegAllocBlockIndex maps label to the index of reversePostOrderBlocks.
- labelToRegAllocBlockIndex map[Label]int
+ labelToRegAllocBlockIndex [] /* Label to */ int
loopNestingForestRoots []ssa.BasicBlock
}
@@ -56,10 +56,9 @@ type (
// NewRegAllocFunction returns a new RegAllocFunction.
func NewRegAllocFunction[I regalloc.InstrConstraint, M RegAllocFunctionMachine[I]](m M, ssb ssa.Builder, c Compiler) *RegAllocFunction[I, M] {
return &RegAllocFunction[I, M]{
- m: m,
- ssb: ssb,
- c: c,
- labelToRegAllocBlockIndex: make(map[Label]int),
+ m: m,
+ ssb: ssb,
+ c: c,
}
}
@@ -74,6 +73,9 @@ func (f *RegAllocFunction[I, M]) AddBlock(sb ssa.BasicBlock, l Label, begin, end
end: end,
id: int(sb.ID()),
})
+ if len(f.labelToRegAllocBlockIndex) <= int(l) {
+ f.labelToRegAllocBlockIndex = append(f.labelToRegAllocBlockIndex, make([]int, int(l)-len(f.labelToRegAllocBlockIndex)+1)...)
+ }
f.labelToRegAllocBlockIndex[l] = i
}
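
The backend's regalloc.go applies the same map-to-slice move to labelToRegAllocBlockIndex, growing the slice with a single append of zero values whenever a label exceeds the current length. A tiny sketch of that growth idiom, assuming labels are small dense integers:

package main

import "fmt"

// ensureLen grows s with zero values until index i is addressable, the same
// append(make(...)) idiom the diff uses for labelToRegAllocBlockIndex.
func ensureLen(s []int, i int) []int {
	if len(s) <= i {
		s = append(s, make([]int, i-len(s)+1)...)
	}
	return s
}

func main() {
	var labelToBlock []int
	labelToBlock = ensureLen(labelToBlock, 5)
	labelToBlock[5] = 2
	fmt.Println(labelToBlock) // [0 0 0 0 0 2]
}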
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go
index b4450d56f..eacb6a7ef 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go
@@ -60,9 +60,8 @@ type (
phiDefInstListPool wazevoapi.Pool[phiDefInstList]
// Followings are re-used during various places.
- blks []Block
- reals []RealReg
- currentOccupants regInUseSet
+ blks []Block
+ reals []RealReg
// Following two fields are updated while iterating the blocks in the reverse postorder.
state state
@@ -755,7 +754,8 @@ func (a *Allocator) allocBlock(f Function, blk Block) {
killSet := a.reals[:0]
// Gather the set of registers that will be used in the current instruction.
- for _, use := range instr.Uses(&a.vs) {
+ uses := instr.Uses(&a.vs)
+ for _, use := range uses {
if use.IsRealReg() {
r := use.RealReg()
currentUsedSet = currentUsedSet.add(r)
@@ -770,7 +770,7 @@ func (a *Allocator) allocBlock(f Function, blk Block) {
}
}
- for i, use := range instr.Uses(&a.vs) {
+ for i, use := range uses {
if !use.IsRealReg() {
vs := s.getVRegState(use.ID())
killed := vs.lastUse == pc
@@ -944,8 +944,7 @@ func (a *Allocator) allocBlock(f Function, blk Block) {
func (a *Allocator) releaseCallerSavedRegs(addrReg RealReg) {
s := &a.state
- for i := 0; i < 64; i++ {
- allocated := RealReg(i)
+ for allocated := RealReg(0); allocated < 64; allocated++ {
if allocated == addrReg { // If this is the call indirect, we should not touch the addr register.
continue
}
@@ -974,11 +973,10 @@ func (a *Allocator) fixMergeState(f Function, blk Block) {
bID := blk.ID()
blkSt := a.getOrAllocateBlockState(bID)
desiredOccupants := &blkSt.startRegs
- aliveOnRegVRegs := make(map[VReg]RealReg)
- for i := 0; i < 64; i++ {
- r := RealReg(i)
- if v := blkSt.startRegs.get(r); v.Valid() {
- aliveOnRegVRegs[v] = r
+ var desiredOccupantsSet RegSet
+ for i, v := range desiredOccupants {
+ if v != VRegInvalid {
+ desiredOccupantsSet = desiredOccupantsSet.add(RealReg(i))
}
}
@@ -987,56 +985,38 @@ func (a *Allocator) fixMergeState(f Function, blk Block) {
}
s.currentBlockID = bID
- a.updateLiveInVRState(a.getOrAllocateBlockState(bID))
+ a.updateLiveInVRState(blkSt)
- currentOccupants := &a.currentOccupants
for i := 0; i < preds; i++ {
- currentOccupants.reset()
if i == blkSt.startFromPredIndex {
continue
}
- currentOccupantsRev := make(map[VReg]RealReg)
pred := blk.Pred(i)
predSt := a.getOrAllocateBlockState(pred.ID())
- for ii := 0; ii < 64; ii++ {
- r := RealReg(ii)
- if v := predSt.endRegs.get(r); v.Valid() {
- if _, ok := aliveOnRegVRegs[v]; !ok {
- continue
- }
- currentOccupants.add(r, v)
- currentOccupantsRev[v] = r
- }
- }
s.resetAt(predSt)
// Finds the free registers if any.
intTmp, floatTmp := VRegInvalid, VRegInvalid
if intFree := s.findAllocatable(
- a.regInfo.AllocatableRegisters[RegTypeInt], desiredOccupants.set,
+ a.regInfo.AllocatableRegisters[RegTypeInt], desiredOccupantsSet,
); intFree != RealRegInvalid {
intTmp = FromRealReg(intFree, RegTypeInt)
}
if floatFree := s.findAllocatable(
- a.regInfo.AllocatableRegisters[RegTypeFloat], desiredOccupants.set,
+ a.regInfo.AllocatableRegisters[RegTypeFloat], desiredOccupantsSet,
); floatFree != RealRegInvalid {
floatTmp = FromRealReg(floatFree, RegTypeFloat)
}
- if wazevoapi.RegAllocLoggingEnabled {
- fmt.Println("\t", pred.ID(), ":", currentOccupants.format(a.regInfo))
- }
-
- for ii := 0; ii < 64; ii++ {
- r := RealReg(ii)
+ for r := RealReg(0); r < 64; r++ {
desiredVReg := desiredOccupants.get(r)
if !desiredVReg.Valid() {
continue
}
- currentVReg := currentOccupants.get(r)
+ currentVReg := s.regsInUse.get(r)
if desiredVReg.ID() == currentVReg.ID() {
continue
}
@@ -1048,86 +1028,95 @@ func (a *Allocator) fixMergeState(f Function, blk Block) {
} else {
tmpRealReg = floatTmp
}
- a.reconcileEdge(f, r, pred, currentOccupants, currentOccupantsRev, currentVReg, desiredVReg, tmpRealReg, typ)
+ a.reconcileEdge(f, r, pred, currentVReg, desiredVReg, tmpRealReg, typ)
}
}
}
+// reconcileEdge reconciles the register state between the current block and the predecessor for the real register `r`.
+//
+// - currentVReg is the current VReg value that sits on the register `r`. This can be VRegInvalid if the register is not used at the end of the predecessor.
+// - desiredVReg is the desired VReg value that should be on the register `r`.
+// - freeReg is the temporary register that can be used to swap the values, which may or may not be used.
+// - typ is the register type of `r`.
func (a *Allocator) reconcileEdge(f Function,
r RealReg,
pred Block,
- currentOccupants *regInUseSet,
- currentOccupantsRev map[VReg]RealReg,
currentVReg, desiredVReg VReg,
freeReg VReg,
typ RegType,
) {
+ // There are four cases to consider:
+ // 1. currentVReg is valid, but desiredVReg is on the stack.
+ // 2. Both currentVReg and desiredVReg are valid.
+ // 3. desiredVReg is on a different register than `r`, and currentVReg is not valid.
+ // 4. desiredVReg is on the stack, and currentVReg is not valid.
+
s := &a.state
if currentVReg.Valid() {
- // Both are on reg.
- er, ok := currentOccupantsRev[desiredVReg]
- if !ok {
+ desiredState := s.getVRegState(desiredVReg.ID())
+ er := desiredState.r
+ if er == RealRegInvalid {
+ // Case 1: currentVReg is valid, but desiredVReg is on the stack.
if wazevoapi.RegAllocLoggingEnabled {
fmt.Printf("\t\tv%d is desired to be on %s, but currently on the stack\n",
desiredVReg.ID(), a.regInfo.RealRegName(r),
)
}
- // This case is that the desired value is on the stack, but currentVReg is on the target register.
- // We need to move the current value to the stack, and reload the desired value.
+ // We need to move the current value to the stack, and reload the desired value into the register.
// TODO: we can do better here.
f.StoreRegisterBefore(currentVReg.SetRealReg(r), pred.LastInstrForInsertion())
- delete(currentOccupantsRev, currentVReg)
+ s.releaseRealReg(r)
s.getVRegState(desiredVReg.ID()).recordReload(f, pred)
f.ReloadRegisterBefore(desiredVReg.SetRealReg(r), pred.LastInstrForInsertion())
- currentOccupants.add(r, desiredVReg)
- currentOccupantsRev[desiredVReg] = r
+ s.useRealReg(r, desiredVReg)
return
- }
-
- if wazevoapi.RegAllocLoggingEnabled {
- fmt.Printf("\t\tv%d is desired to be on %s, but currently on %s\n",
- desiredVReg.ID(), a.regInfo.RealRegName(r), a.regInfo.RealRegName(er),
+ } else {
+ // Case 2: Both currentVReg and desiredVReg are valid.
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Printf("\t\tv%d is desired to be on %s, but currently on %s\n",
+ desiredVReg.ID(), a.regInfo.RealRegName(r), a.regInfo.RealRegName(er),
+ )
+ }
+ // In this case, we need to swap the current and desired values.
+ f.SwapBefore(
+ currentVReg.SetRealReg(r),
+ desiredVReg.SetRealReg(er),
+ freeReg,
+ pred.LastInstrForInsertion(),
)
- }
- f.SwapBefore(
- currentVReg.SetRealReg(r),
- desiredVReg.SetRealReg(er),
- freeReg,
- pred.LastInstrForInsertion(),
- )
- s.allocatedRegSet = s.allocatedRegSet.add(freeReg.RealReg())
- currentOccupantsRev[desiredVReg] = r
- currentOccupantsRev[currentVReg] = er
- currentOccupants.add(r, desiredVReg)
- currentOccupants.add(er, currentVReg)
- if wazevoapi.RegAllocLoggingEnabled {
- fmt.Printf("\t\tv%d previously on %s moved to %s\n", currentVReg.ID(), a.regInfo.RealRegName(r), a.regInfo.RealRegName(er))
+ s.allocatedRegSet = s.allocatedRegSet.add(freeReg.RealReg())
+ s.releaseRealReg(r)
+ s.releaseRealReg(er)
+ s.useRealReg(r, desiredVReg)
+ s.useRealReg(er, currentVReg)
+ if wazevoapi.RegAllocLoggingEnabled {
+ fmt.Printf("\t\tv%d previously on %s moved to %s\n", currentVReg.ID(), a.regInfo.RealRegName(r), a.regInfo.RealRegName(er))
+ }
}
} else {
- // Desired is on reg, but currently the target register is not used.
if wazevoapi.RegAllocLoggingEnabled {
fmt.Printf("\t\tv%d is desired to be on %s, current not used\n",
desiredVReg.ID(), a.regInfo.RealRegName(r),
)
}
- if currentReg, ok := currentOccupantsRev[desiredVReg]; ok {
+ if currentReg := s.getVRegState(desiredVReg.ID()).r; currentReg != RealRegInvalid {
+ // Case 3: Desired is on a different register than `r` and currentReg is not valid.
+ // We simply need to move the desired value to the register.
f.InsertMoveBefore(
FromRealReg(r, typ),
desiredVReg.SetRealReg(currentReg),
pred.LastInstrForInsertion(),
)
- currentOccupants.remove(currentReg)
+ s.releaseRealReg(currentReg)
} else {
+ // Case 4: currentVReg is not valid, and desiredVReg is on the stack.
+ // We simply need to reload the desired value into the register.
s.getVRegState(desiredVReg.ID()).recordReload(f, pred)
f.ReloadRegisterBefore(desiredVReg.SetRealReg(r), pred.LastInstrForInsertion())
}
- currentOccupantsRev[desiredVReg] = r
- currentOccupants.add(r, desiredVReg)
- }
-
- if wazevoapi.RegAllocLoggingEnabled {
- fmt.Println("\t", pred.ID(), ":", currentOccupants.format(a.regInfo))
+ s.useRealReg(r, desiredVReg)
}
}
@@ -1169,8 +1158,7 @@ func (a *Allocator) scheduleSpill(f Function, vs *vrState) {
}
for pos != definingBlk {
st := a.getOrAllocateBlockState(pos.ID())
- for ii := 0; ii < 64; ii++ {
- rr := RealReg(ii)
+ for rr := RealReg(0); rr < 64; rr++ {
if st.startRegs.get(rr) == v {
r = rr
// Already in the register, so we can place the spill at the beginning of the block.
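
The regalloc.go rewrite drops the per-edge scratch maps (currentOccupants/currentOccupantsRev) and lets reconcileEdge read the allocator's own state to pick one of the four documented cases. A compact sketch of that decision on toy state, which only prints the primitive it would emit rather than mutating anything:

package main

import "fmt"

type vReg int

const vRegInvalid vReg = -1

// edgeState is a toy stand-in for the allocator state the new reconcileEdge
// consults directly (register occupancy and per-vreg register assignments).
type edgeState struct {
	inReg map[int]vReg // real register -> virtual register currently held there
	regOf map[vReg]int // virtual register -> real register, or -1 if spilled
}

// reconcile prints which primitive (store+reload, swap, move, reload) would be
// emitted on the edge so that real register r ends up holding desired.
func (s *edgeState) reconcile(r int, desired vReg) {
	current := s.inReg[r]
	switch {
	case current == desired:
		// Already in place.
	case current != vRegInvalid && s.regOf[desired] == -1:
		fmt.Printf("case 1: store v%d, then reload v%d into r%d\n", current, desired, r)
	case current != vRegInvalid:
		fmt.Printf("case 2: swap r%d (v%d) with r%d (v%d)\n", r, current, s.regOf[desired], desired)
	case s.regOf[desired] != -1:
		fmt.Printf("case 3: move v%d from r%d into r%d\n", desired, s.regOf[desired], r)
	default:
		fmt.Printf("case 4: reload v%d from the stack into r%d\n", desired, r)
	}
}

func main() {
	s := &edgeState{
		inReg: map[int]vReg{0: 7, 1: vRegInvalid},
		regOf: map[vReg]int{7: 0, 9: -1, 11: 2},
	}
	s.reconcile(0, 9)  // desired v9 is spilled while r0 holds v7
	s.reconcile(1, 11) // r1 is free and v11 sits on r2
}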
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go
index e9bf60661..04a8e8f4d 100644
--- a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go
@@ -46,23 +46,24 @@ func (rs RegSet) Range(f func(allocatedRealReg RealReg)) {
}
}
-type regInUseSet struct {
- set RegSet
- vrs [64]VReg
+type regInUseSet [64]VReg
+
+func newRegInUseSet() regInUseSet {
+ var ret regInUseSet
+ ret.reset()
+ return ret
}
func (rs *regInUseSet) reset() {
- rs.set = 0
- for i := range rs.vrs {
- rs.vrs[i] = VRegInvalid
+ for i := range rs {
+ rs[i] = VRegInvalid
}
}
func (rs *regInUseSet) format(info *RegisterInfo) string { //nolint:unused
var ret []string
- for i := 0; i < 64; i++ {
- if rs.set&(1<<uint(i)) != 0 {
- vr := rs.vrs[i]
+ for i, vr := range rs {
+ if vr != VRegInvalid {
ret = append(ret, fmt.Sprintf("(%s->v%d)", info.RealRegName(RealReg(i)), vr.ID()))
}
}
@@ -70,39 +71,28 @@ func (rs *regInUseSet) format(info *RegisterInfo) string { //nolint:unused
}
func (rs *regInUseSet) has(r RealReg) bool {
- if r >= 64 {
- return false
- }
- return rs.set&(1<<uint(r)) != 0
+ return r < 64 && rs[r] != VRegInvalid
}
func (rs *regInUseSet) get(r RealReg) VReg {
- if r >= 64 {
- return VRegInvalid
- }
- return rs.vrs[r]
+ return rs[r]
}
func (rs *regInUseSet) remove(r RealReg) {
- if r >= 64 {
- return
- }
- rs.set &= ^(1 << uint(r))
- rs.vrs[r] = VRegInvalid
+ rs[r] = VRegInvalid
}
func (rs *regInUseSet) add(r RealReg, vr VReg) {
if r >= 64 {
return
}
- rs.set |= 1 << uint(r)
- rs.vrs[r] = vr
+ rs[r] = vr
}
func (rs *regInUseSet) range_(f func(allocatedRealReg RealReg, vr VReg)) {
- for i := 0; i < 64; i++ {
- if rs.set&(1<<uint(i)) != 0 {
- f(RealReg(i), rs.vrs[i])
+ for i, vr := range rs {
+ if vr != VRegInvalid {
+ f(RealReg(i), vr)
}
}
}
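
Finally, regset.go collapses regInUseSet from a bitset plus a [64]VReg array into the array alone, using VRegInvalid as the empty marker. A short sketch of that representation with toy types; the sentinel value and method set here are illustrative only:

package main

import "fmt"

type vReg uint64

// vRegInvalid is a toy sentinel; wazero's real VRegInvalid is defined differently.
const vRegInvalid vReg = 0

// regInUseSet mirrors the new array-backed layout: one slot per real register,
// with the sentinel marking "free" instead of a companion bitmask.
type regInUseSet [64]vReg

func (rs *regInUseSet) reset() {
	for i := range rs {
		rs[i] = vRegInvalid
	}
}

func (rs *regInUseSet) has(r int) bool { return r < 64 && rs[r] != vRegInvalid }

func (rs *regInUseSet) add(r int, v vReg) {
	if r < 64 {
		rs[r] = v
	}
}

func (rs *regInUseSet) remove(r int) { rs[r] = vRegInvalid }

func main() {
	var rs regInUseSet
	rs.reset()
	rs.add(3, 42)
	rs.add(9, 7)
	rs.remove(9)
	for i, v := range rs { // membership is just a sentinel comparison per slot
		if v != vRegInvalid {
			fmt.Printf("x%d -> v%d\n", i, v)
		}
	}
	fmt.Println(rs.has(3), rs.has(4)) // true false
}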