Diffstat (limited to 'vendor/github.com/tetratelabs/wazero/internal/engine')
99 files changed, 51250 insertions, 0 deletions
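Editor's note: the bulk of this change is the new interpreter compiler (compiler.go below), which lowers a raw Wasm function body into interpreterir operations and decodes almost every instruction immediate as an unsigned LEB128 varint via leb128.LoadUint32 / leb128.DecodeUint32, using the returned byte count to advance c.pc. Purely as a self-contained illustration of that varint format — decodeUint32 here is a hypothetical re-implementation, not wazero's internal/leb128 package — the following sketch returns the same (value, byte-count, error) shape the compiler relies on:

package main

import (
	"errors"
	"fmt"
)

// decodeUint32 is an illustrative ULEB128 decoder (not wazero's): it returns
// the decoded value and the number of bytes consumed, mirroring the
// (value, n, err) shape that leb128.LoadUint32 provides to the compiler.
func decodeUint32(buf []byte) (uint32, uint64, error) {
	var result uint32
	var shift uint
	for i, b := range buf {
		if i >= 5 { // a uint32 needs at most five LEB128 bytes
			return 0, 0, errors.New("leb128: value exceeds 32 bits")
		}
		result |= uint32(b&0x7f) << shift
		if b&0x80 == 0 { // continuation bit clear: this was the last byte
			return result, uint64(i + 1), nil
		}
		shift += 7
	}
	return 0, 0, errors.New("leb128: truncated input")
}

func main() {
	v, n, err := decodeUint32([]byte{0xE5, 0x8E, 0x26}) // classic example: 624485
	fmt.Println(v, n, err)                              // 624485 3 <nil>
}

In the diff below, that byte count is exactly what the compiler adds to c.pc after reading each immediate.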
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/compiler.go b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/compiler.go new file mode 100644 index 000000000..56dfac620 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/compiler.go @@ -0,0 +1,3634 @@ +package interpreter + +import ( + "bytes" + "encoding/binary" + "fmt" + "math" + "strings" + + "github.com/tetratelabs/wazero/api" + "github.com/tetratelabs/wazero/internal/leb128" + "github.com/tetratelabs/wazero/internal/wasm" +) + +type controlFrameKind byte + +const ( + controlFrameKindBlockWithContinuationLabel controlFrameKind = iota + controlFrameKindBlockWithoutContinuationLabel + controlFrameKindFunction + controlFrameKindLoop + controlFrameKindIfWithElse + controlFrameKindIfWithoutElse +) + +type ( + controlFrame struct { + frameID uint32 + // originalStackLen holds the number of values on the stack + // when Start executing this control frame minus params for the block. + originalStackLenWithoutParam int + blockType *wasm.FunctionType + kind controlFrameKind + } + controlFrames struct{ frames []controlFrame } +) + +func (c *controlFrame) ensureContinuation() { + // Make sure that if the frame is block and doesn't have continuation, + // change the Kind so we can emit the continuation block + // later when we reach the End instruction of this frame. + if c.kind == controlFrameKindBlockWithoutContinuationLabel { + c.kind = controlFrameKindBlockWithContinuationLabel + } +} + +func (c *controlFrame) asLabel() label { + switch c.kind { + case controlFrameKindBlockWithContinuationLabel, + controlFrameKindBlockWithoutContinuationLabel: + return newLabel(labelKindContinuation, c.frameID) + case controlFrameKindLoop: + return newLabel(labelKindHeader, c.frameID) + case controlFrameKindFunction: + return newLabel(labelKindReturn, 0) + case controlFrameKindIfWithElse, + controlFrameKindIfWithoutElse: + return newLabel(labelKindContinuation, c.frameID) + } + panic(fmt.Sprintf("unreachable: a bug in interpreterir implementation: %v", c.kind)) +} + +func (c *controlFrames) functionFrame() *controlFrame { + // No need to check stack bound + // as we can assume that all the operations + // are valid thanks to validateFunction + // at module validation phase. + return &c.frames[0] +} + +func (c *controlFrames) get(n int) *controlFrame { + // No need to check stack bound + // as we can assume that all the operations + // are valid thanks to validateFunction + // at module validation phase. + return &c.frames[len(c.frames)-n-1] +} + +func (c *controlFrames) top() *controlFrame { + // No need to check stack bound + // as we can assume that all the operations + // are valid thanks to validateFunction + // at module validation phase. + return &c.frames[len(c.frames)-1] +} + +func (c *controlFrames) empty() bool { + return len(c.frames) == 0 +} + +func (c *controlFrames) pop() (frame *controlFrame) { + // No need to check stack bound + // as we can assume that all the operations + // are valid thanks to validateFunction + // at module validation phase. + frame = c.top() + c.frames = c.frames[:len(c.frames)-1] + return +} + +func (c *controlFrames) push(frame controlFrame) { + c.frames = append(c.frames, frame) +} + +func (c *compiler) initializeStack() { + // Reuse the existing slice. 
+ c.localIndexToStackHeightInUint64 = c.localIndexToStackHeightInUint64[:0] + var current int + for _, lt := range c.sig.Params { + c.localIndexToStackHeightInUint64 = append(c.localIndexToStackHeightInUint64, current) + if lt == wasm.ValueTypeV128 { + current++ + } + current++ + } + + if c.callFrameStackSizeInUint64 > 0 { + // We reserve the stack slots for result values below the return call frame slots. + if diff := c.sig.ResultNumInUint64 - c.sig.ParamNumInUint64; diff > 0 { + current += diff + } + } + + // Non-func param locals Start after the return call frame. + current += c.callFrameStackSizeInUint64 + + for _, lt := range c.localTypes { + c.localIndexToStackHeightInUint64 = append(c.localIndexToStackHeightInUint64, current) + if lt == wasm.ValueTypeV128 { + current++ + } + current++ + } + + // Push function arguments. + for _, t := range c.sig.Params { + c.stackPush(wasmValueTypeTounsignedType(t)) + } + + if c.callFrameStackSizeInUint64 > 0 { + // Reserve the stack slots for results. + for i := 0; i < c.sig.ResultNumInUint64-c.sig.ParamNumInUint64; i++ { + c.stackPush(unsignedTypeI64) + } + + // Reserve the stack slots for call frame. + for i := 0; i < c.callFrameStackSizeInUint64; i++ { + c.stackPush(unsignedTypeI64) + } + } +} + +// compiler is in charge of lowering raw Wasm function body to get compilationResult. +// This is created per *wasm.Module and reused for all functions in it to reduce memory allocations. +type compiler struct { + module *wasm.Module + enabledFeatures api.CoreFeatures + callFrameStackSizeInUint64 int + stack []unsignedType + currentFrameID uint32 + controlFrames controlFrames + unreachableState struct { + on bool + depth int + } + pc, currentOpPC uint64 + result compilationResult + + // body holds the code for the function's body where Wasm instructions are stored. + body []byte + // sig is the function type of the target function. + sig *wasm.FunctionType + // localTypes holds the target function locals' value types except function params. + localTypes []wasm.ValueType + // localIndexToStackHeightInUint64 maps the local index (starting with function params) to the stack height + // where the local is places. This is the necessary mapping for functions who contain vector type locals. + localIndexToStackHeightInUint64 []int + + // types hold all the function types in the module where the targe function exists. + types []wasm.FunctionType + // funcs holds the type indexes for all declared functions in the module where the target function exists. + funcs []uint32 + // globals holds the global types for all declared globals in the module where the target function exists. + globals []wasm.GlobalType + + // needSourceOffset is true if this module requires DWARF based stack trace. + needSourceOffset bool + // bodyOffsetInCodeSection is the offset of the body of this function in the original Wasm binary's code section. + bodyOffsetInCodeSection uint64 + + ensureTermination bool + // Pre-allocated bytes.Reader to be used in various places. + br *bytes.Reader + funcTypeToSigs funcTypeToIRSignatures + + next int +} + +//lint:ignore U1000 for debugging only. +func (c *compiler) stackDump() string { + strs := make([]string, 0, len(c.stack)) + for _, s := range c.stack { + strs = append(strs, s.String()) + } + return "[" + strings.Join(strs, ", ") + "]" +} + +func (c *compiler) markUnreachable() { + c.unreachableState.on = true +} + +func (c *compiler) resetUnreachable() { + c.unreachableState.on = false +} + +// memoryType is the type of memory in a compiled module. 
+type memoryType byte + +const ( + // memoryTypeNone indicates there is no memory. + memoryTypeNone memoryType = iota + // memoryTypeStandard indicates there is a non-shared memory. + memoryTypeStandard + // memoryTypeShared indicates there is a shared memory. + memoryTypeShared +) + +type compilationResult struct { + // Operations holds interpreterir operations compiled from Wasm instructions in a Wasm function. + Operations []unionOperation + + // IROperationSourceOffsetsInWasmBinary is index-correlated with Operation and maps each operation to the corresponding source instruction's + // offset in the original WebAssembly binary. + // Non nil only when the given Wasm module has the DWARF section. + IROperationSourceOffsetsInWasmBinary []uint64 + + // LabelCallers maps label to the number of callers to that label. + // Here "callers" means that the call-sites which jumps to the label with br, br_if or br_table + // instructions. + // + // Note: zero possible and allowed in wasm. e.g. + // + // (block + // (br 0) + // (block i32.const 1111) + // ) + // + // This example the label corresponding to `(block i32.const 1111)` is never be reached at runtime because `br 0` exits the function before we reach there + LabelCallers map[label]uint32 + // UsesMemory is true if this function might use memory. + UsesMemory bool + + // The following fields are per-module values, not per-function. + + // Globals holds all the declarations of globals in the module from which this function is compiled. + Globals []wasm.GlobalType + // Functions holds all the declarations of function in the module from which this function is compiled, including itself. + Functions []wasm.Index + // Types holds all the types in the module from which this function is compiled. + Types []wasm.FunctionType + // Memory indicates the type of memory of the module. + Memory memoryType + // HasTable is true if the module from which this function is compiled has table declaration. + HasTable bool + // HasDataInstances is true if the module has data instances which might be used by memory.init or data.drop instructions. + HasDataInstances bool + // HasDataInstances is true if the module has element instances which might be used by table.init or elem.drop instructions. + HasElementInstances bool +} + +// newCompiler returns the new *compiler for the given parameters. +// Use compiler.Next function to get compilation result per function. 
+func newCompiler(enabledFeatures api.CoreFeatures, callFrameStackSizeInUint64 int, module *wasm.Module, ensureTermination bool) (*compiler, error) { + functions, globals, mem, tables, err := module.AllDeclarations() + if err != nil { + return nil, err + } + + hasTable, hasDataInstances, hasElementInstances := len(tables) > 0, + len(module.DataSection) > 0, len(module.ElementSection) > 0 + + var mt memoryType + switch { + case mem == nil: + mt = memoryTypeNone + case mem.IsShared: + mt = memoryTypeShared + default: + mt = memoryTypeStandard + } + + types := module.TypeSection + + c := &compiler{ + module: module, + enabledFeatures: enabledFeatures, + controlFrames: controlFrames{}, + callFrameStackSizeInUint64: callFrameStackSizeInUint64, + result: compilationResult{ + Globals: globals, + Functions: functions, + Types: types, + Memory: mt, + HasTable: hasTable, + HasDataInstances: hasDataInstances, + HasElementInstances: hasElementInstances, + LabelCallers: map[label]uint32{}, + }, + globals: globals, + funcs: functions, + types: types, + ensureTermination: ensureTermination, + br: bytes.NewReader(nil), + funcTypeToSigs: funcTypeToIRSignatures{ + indirectCalls: make([]*signature, len(types)), + directCalls: make([]*signature, len(types)), + wasmTypes: types, + }, + needSourceOffset: module.DWARFLines != nil, + } + return c, nil +} + +// Next returns the next compilationResult for this compiler. +func (c *compiler) Next() (*compilationResult, error) { + funcIndex := c.next + code := &c.module.CodeSection[funcIndex] + sig := &c.types[c.module.FunctionSection[funcIndex]] + + // Reset the previous result. + c.result.Operations = c.result.Operations[:0] + c.result.IROperationSourceOffsetsInWasmBinary = c.result.IROperationSourceOffsetsInWasmBinary[:0] + c.result.UsesMemory = false + // Clears the existing entries in LabelCallers. + for frameID := uint32(0); frameID <= c.currentFrameID; frameID++ { + for k := labelKind(0); k < labelKindNum; k++ { + delete(c.result.LabelCallers, newLabel(k, frameID)) + } + } + // Reset the previous states. + c.pc = 0 + c.currentOpPC = 0 + c.currentFrameID = 0 + c.unreachableState.on, c.unreachableState.depth = false, 0 + + if err := c.compile(sig, code.Body, code.LocalTypes, code.BodyOffsetInCodeSection); err != nil { + return nil, err + } + c.next++ + return &c.result, nil +} + +// Compile lowers given function instance into interpreterir operations +// so that the resulting operations can be consumed by the interpreter +// or the compiler compilation engine. +func (c *compiler) compile(sig *wasm.FunctionType, body []byte, localTypes []wasm.ValueType, bodyOffsetInCodeSection uint64) error { + // Set function specific fields. + c.body = body + c.localTypes = localTypes + c.sig = sig + c.bodyOffsetInCodeSection = bodyOffsetInCodeSection + + // Reuses the underlying slices. + c.stack = c.stack[:0] + c.controlFrames.frames = c.controlFrames.frames[:0] + + c.initializeStack() + + // Emit const expressions for locals. + // Note that here we don't take function arguments + // into account, meaning that callers must push + // arguments before entering into the function body. + for _, t := range c.localTypes { + c.emitDefaultValue(t) + } + + // Insert the function control frame. + c.controlFrames.push(controlFrame{ + frameID: c.nextFrameID(), + blockType: c.sig, + kind: controlFrameKindFunction, + }) + + // Now, enter the function body. 
+ for !c.controlFrames.empty() && c.pc < uint64(len(c.body)) { + if err := c.handleInstruction(); err != nil { + return fmt.Errorf("handling instruction: %w", err) + } + } + return nil +} + +// Translate the current Wasm instruction to interpreterir's operations, +// and emit the results into c.results. +func (c *compiler) handleInstruction() error { + op := c.body[c.pc] + c.currentOpPC = c.pc + if false { + var instName string + if op == wasm.OpcodeVecPrefix { + instName = wasm.VectorInstructionName(c.body[c.pc+1]) + } else if op == wasm.OpcodeAtomicPrefix { + instName = wasm.AtomicInstructionName(c.body[c.pc+1]) + } else if op == wasm.OpcodeMiscPrefix { + instName = wasm.MiscInstructionName(c.body[c.pc+1]) + } else { + instName = wasm.InstructionName(op) + } + fmt.Printf("handling %s, unreachable_state(on=%v,depth=%d), stack=%v\n", + instName, c.unreachableState.on, c.unreachableState.depth, c.stack, + ) + } + + var peekValueType unsignedType + if len(c.stack) > 0 { + peekValueType = c.stackPeek() + } + + // Modify the stack according the current instruction. + // Note that some instructions will read "index" in + // applyToStack and advance c.pc inside the function. + index, err := c.applyToStack(op) + if err != nil { + return fmt.Errorf("apply stack failed for %s: %w", wasm.InstructionName(op), err) + } + // Now we handle each instruction, and + // emit the corresponding interpreterir operations to the results. +operatorSwitch: + switch op { + case wasm.OpcodeUnreachable: + c.emit(newOperationUnreachable()) + c.markUnreachable() + case wasm.OpcodeNop: + // Nop is noop! + case wasm.OpcodeBlock: + c.br.Reset(c.body[c.pc+1:]) + bt, num, err := wasm.DecodeBlockType(c.types, c.br, c.enabledFeatures) + if err != nil { + return fmt.Errorf("reading block type for block instruction: %w", err) + } + c.pc += num + + if c.unreachableState.on { + // If it is currently in unreachable, + // just remove the entire block. + c.unreachableState.depth++ + break operatorSwitch + } + + // Create a new frame -- entering this block. + frame := controlFrame{ + frameID: c.nextFrameID(), + originalStackLenWithoutParam: len(c.stack) - len(bt.Params), + kind: controlFrameKindBlockWithoutContinuationLabel, + blockType: bt, + } + c.controlFrames.push(frame) + + case wasm.OpcodeLoop: + c.br.Reset(c.body[c.pc+1:]) + bt, num, err := wasm.DecodeBlockType(c.types, c.br, c.enabledFeatures) + if err != nil { + return fmt.Errorf("reading block type for loop instruction: %w", err) + } + c.pc += num + + if c.unreachableState.on { + // If it is currently in unreachable, + // just remove the entire block. + c.unreachableState.depth++ + break operatorSwitch + } + + // Create a new frame -- entering loop. + frame := controlFrame{ + frameID: c.nextFrameID(), + originalStackLenWithoutParam: len(c.stack) - len(bt.Params), + kind: controlFrameKindLoop, + blockType: bt, + } + c.controlFrames.push(frame) + + // Prep labels for inside and the continuation of this loop. + loopLabel := newLabel(labelKindHeader, frame.frameID) + c.result.LabelCallers[loopLabel]++ + + // Emit the branch operation to enter inside the loop. + c.emit(newOperationBr(loopLabel)) + c.emit(newOperationLabel(loopLabel)) + + // Insert the exit code check on the loop header, which is the only necessary point in the function body + // to prevent infinite loop. + // + // Note that this is a little aggressive: this checks the exit code regardless the loop header is actually + // the loop. 
In other words, this checks even when no br/br_if/br_table instructions jumping to this loop + // exist. However, in reality, that shouldn't be an issue since such "noop" loop header will highly likely be + // optimized out by almost all guest language compilers which have the control flow optimization passes. + if c.ensureTermination { + c.emit(newOperationBuiltinFunctionCheckExitCode()) + } + case wasm.OpcodeIf: + c.br.Reset(c.body[c.pc+1:]) + bt, num, err := wasm.DecodeBlockType(c.types, c.br, c.enabledFeatures) + if err != nil { + return fmt.Errorf("reading block type for if instruction: %w", err) + } + c.pc += num + + if c.unreachableState.on { + // If it is currently in unreachable, + // just remove the entire block. + c.unreachableState.depth++ + break operatorSwitch + } + + // Create a new frame -- entering if. + frame := controlFrame{ + frameID: c.nextFrameID(), + originalStackLenWithoutParam: len(c.stack) - len(bt.Params), + // Note this will be set to controlFrameKindIfWithElse + // when else opcode found later. + kind: controlFrameKindIfWithoutElse, + blockType: bt, + } + c.controlFrames.push(frame) + + // Prep labels for if and else of this if. + thenLabel := newLabel(labelKindHeader, frame.frameID) + elseLabel := newLabel(labelKindElse, frame.frameID) + c.result.LabelCallers[thenLabel]++ + c.result.LabelCallers[elseLabel]++ + + // Emit the branch operation to enter the then block. + c.emit(newOperationBrIf(thenLabel, elseLabel, nopinclusiveRange)) + c.emit(newOperationLabel(thenLabel)) + case wasm.OpcodeElse: + frame := c.controlFrames.top() + if c.unreachableState.on && c.unreachableState.depth > 0 { + // If it is currently in unreachable, and the nested if, + // just remove the entire else block. + break operatorSwitch + } else if c.unreachableState.on { + // If it is currently in unreachable, and the non-nested if, + // reset the stack so we can correctly handle the else block. + top := c.controlFrames.top() + c.stack = c.stack[:top.originalStackLenWithoutParam] + top.kind = controlFrameKindIfWithElse + + // Re-push the parameters to the if block so that else block can use them. + for _, t := range frame.blockType.Params { + c.stackPush(wasmValueTypeTounsignedType(t)) + } + + // We are no longer unreachable in else frame, + // so emit the correct label, and reset the unreachable state. + elseLabel := newLabel(labelKindElse, frame.frameID) + c.resetUnreachable() + c.emit( + newOperationLabel(elseLabel), + ) + break operatorSwitch + } + + // Change the Kind of this If block, indicating that + // the if has else block. + frame.kind = controlFrameKindIfWithElse + + // We need to reset the stack so that + // the values pushed inside the then block + // do not affect the else block. + dropOp := newOperationDrop(c.getFrameDropRange(frame, false)) + + // Reset the stack manipulated by the then block, and re-push the block param types to the stack. + + c.stack = c.stack[:frame.originalStackLenWithoutParam] + for _, t := range frame.blockType.Params { + c.stackPush(wasmValueTypeTounsignedType(t)) + } + + // Prep labels for else and the continuation of this if block. + elseLabel := newLabel(labelKindElse, frame.frameID) + continuationLabel := newLabel(labelKindContinuation, frame.frameID) + c.result.LabelCallers[continuationLabel]++ + + // Emit the instructions for exiting the if loop, + // and then the initiation of else block. + c.emit(dropOp) + // Jump to the continuation of this block. + c.emit(newOperationBr(continuationLabel)) + // Initiate the else block. 
+ c.emit(newOperationLabel(elseLabel)) + case wasm.OpcodeEnd: + if c.unreachableState.on && c.unreachableState.depth > 0 { + c.unreachableState.depth-- + break operatorSwitch + } else if c.unreachableState.on { + c.resetUnreachable() + + frame := c.controlFrames.pop() + if c.controlFrames.empty() { + return nil + } + + c.stack = c.stack[:frame.originalStackLenWithoutParam] + for _, t := range frame.blockType.Results { + c.stackPush(wasmValueTypeTounsignedType(t)) + } + + continuationLabel := newLabel(labelKindContinuation, frame.frameID) + if frame.kind == controlFrameKindIfWithoutElse { + // Emit the else label. + elseLabel := newLabel(labelKindElse, frame.frameID) + c.result.LabelCallers[continuationLabel]++ + c.emit(newOperationLabel(elseLabel)) + c.emit(newOperationBr(continuationLabel)) + c.emit(newOperationLabel(continuationLabel)) + } else { + c.emit( + newOperationLabel(continuationLabel), + ) + } + + break operatorSwitch + } + + frame := c.controlFrames.pop() + + // We need to reset the stack so that + // the values pushed inside the block. + dropOp := newOperationDrop(c.getFrameDropRange(frame, true)) + c.stack = c.stack[:frame.originalStackLenWithoutParam] + + // Push the result types onto the stack. + for _, t := range frame.blockType.Results { + c.stackPush(wasmValueTypeTounsignedType(t)) + } + + // Emit the instructions according to the Kind of the current control frame. + switch frame.kind { + case controlFrameKindFunction: + if !c.controlFrames.empty() { + // Should never happen. If so, there's a bug in the translation. + panic("bug: found more function control frames") + } + // Return from function. + c.emit(dropOp) + c.emit(newOperationBr(newLabel(labelKindReturn, 0))) + case controlFrameKindIfWithoutElse: + // This case we have to emit "empty" else label. + elseLabel := newLabel(labelKindElse, frame.frameID) + continuationLabel := newLabel(labelKindContinuation, frame.frameID) + c.result.LabelCallers[continuationLabel] += 2 + c.emit(dropOp) + c.emit(newOperationBr(continuationLabel)) + // Emit the else which soon branches into the continuation. + c.emit(newOperationLabel(elseLabel)) + c.emit(newOperationBr(continuationLabel)) + // Initiate the continuation. + c.emit(newOperationLabel(continuationLabel)) + case controlFrameKindBlockWithContinuationLabel, + controlFrameKindIfWithElse: + continuationLabel := newLabel(labelKindContinuation, frame.frameID) + c.result.LabelCallers[continuationLabel]++ + c.emit(dropOp) + c.emit(newOperationBr(continuationLabel)) + c.emit(newOperationLabel(continuationLabel)) + case controlFrameKindLoop, controlFrameKindBlockWithoutContinuationLabel: + c.emit( + dropOp, + ) + default: + // Should never happen. If so, there's a bug in the translation. + panic(fmt.Errorf("bug: invalid control frame Kind: 0x%x", frame.kind)) + } + + case wasm.OpcodeBr: + targetIndex, n, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("read the target for br_if: %w", err) + } + c.pc += n + + if c.unreachableState.on { + // If it is currently in unreachable, br is no-op. + break operatorSwitch + } + + targetFrame := c.controlFrames.get(int(targetIndex)) + targetFrame.ensureContinuation() + dropOp := newOperationDrop(c.getFrameDropRange(targetFrame, false)) + targetID := targetFrame.asLabel() + c.result.LabelCallers[targetID]++ + c.emit(dropOp) + c.emit(newOperationBr(targetID)) + // Br operation is stack-polymorphic, and mark the state as unreachable. 
+ // That means subsequent instructions in the current control frame are "unreachable" + // and can be safely removed. + c.markUnreachable() + case wasm.OpcodeBrIf: + targetIndex, n, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("read the target for br_if: %w", err) + } + c.pc += n + + if c.unreachableState.on { + // If it is currently in unreachable, br-if is no-op. + break operatorSwitch + } + + targetFrame := c.controlFrames.get(int(targetIndex)) + targetFrame.ensureContinuation() + drop := c.getFrameDropRange(targetFrame, false) + target := targetFrame.asLabel() + c.result.LabelCallers[target]++ + + continuationLabel := newLabel(labelKindHeader, c.nextFrameID()) + c.result.LabelCallers[continuationLabel]++ + c.emit(newOperationBrIf(target, continuationLabel, drop)) + // Start emitting else block operations. + c.emit(newOperationLabel(continuationLabel)) + case wasm.OpcodeBrTable: + c.br.Reset(c.body[c.pc+1:]) + r := c.br + numTargets, n, err := leb128.DecodeUint32(r) + if err != nil { + return fmt.Errorf("error reading number of targets in br_table: %w", err) + } + c.pc += n + + if c.unreachableState.on { + // If it is currently in unreachable, br_table is no-op. + // But before proceeding to the next instruction, we must advance the pc + // according to the number of br_table targets. + for i := uint32(0); i <= numTargets; i++ { // inclusive as we also need to read the index of default target. + _, n, err := leb128.DecodeUint32(r) + if err != nil { + return fmt.Errorf("error reading target %d in br_table: %w", i, err) + } + c.pc += n + } + break operatorSwitch + } + + // Read the branch targets. + s := numTargets * 2 + targetLabels := make([]uint64, 2+s) // (label, inclusiveRange) * (default+numTargets) + for i := uint32(0); i < s; i += 2 { + l, n, err := leb128.DecodeUint32(r) + if err != nil { + return fmt.Errorf("error reading target %d in br_table: %w", i, err) + } + c.pc += n + targetFrame := c.controlFrames.get(int(l)) + targetFrame.ensureContinuation() + drop := c.getFrameDropRange(targetFrame, false) + targetLabel := targetFrame.asLabel() + targetLabels[i] = uint64(targetLabel) + targetLabels[i+1] = drop.AsU64() + c.result.LabelCallers[targetLabel]++ + } + + // Prep default target control frame. + l, n, err := leb128.DecodeUint32(r) + if err != nil { + return fmt.Errorf("error reading default target of br_table: %w", err) + } + c.pc += n + defaultTargetFrame := c.controlFrames.get(int(l)) + defaultTargetFrame.ensureContinuation() + defaultTargetDrop := c.getFrameDropRange(defaultTargetFrame, false) + defaultLabel := defaultTargetFrame.asLabel() + c.result.LabelCallers[defaultLabel]++ + targetLabels[s] = uint64(defaultLabel) + targetLabels[s+1] = defaultTargetDrop.AsU64() + c.emit(newOperationBrTable(targetLabels)) + + // br_table operation is stack-polymorphic, and mark the state as unreachable. + // That means subsequent instructions in the current control frame are "unreachable" + // and can be safely removed. + c.markUnreachable() + case wasm.OpcodeReturn: + functionFrame := c.controlFrames.functionFrame() + dropOp := newOperationDrop(c.getFrameDropRange(functionFrame, false)) + + // Cleanup the stack and then jmp to function frame's continuation (meaning return). + c.emit(dropOp) + c.emit(newOperationBr(functionFrame.asLabel())) + + // Return operation is stack-polymorphic, and mark the state as unreachable. + // That means subsequent instructions in the current control frame are "unreachable" + // and can be safely removed. 
+ c.markUnreachable() + case wasm.OpcodeCall: + c.emit( + newOperationCall(index), + ) + case wasm.OpcodeCallIndirect: + typeIndex := index + tableIndex, n, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("read target for br_table: %w", err) + } + c.pc += n + c.emit( + newOperationCallIndirect(typeIndex, tableIndex), + ) + case wasm.OpcodeDrop: + r := inclusiveRange{Start: 0, End: 0} + if peekValueType == unsignedTypeV128 { + // inclusiveRange is the range in uint64 representation, so dropping a vector value on top + // should be translated as drop [0..1] inclusively. + r.End++ + } + c.emit(newOperationDrop(r)) + case wasm.OpcodeSelect: + // If it is on the unreachable state, ignore the instruction. + if c.unreachableState.on { + break operatorSwitch + } + isTargetVector := c.stackPeek() == unsignedTypeV128 + c.emit( + newOperationSelect(isTargetVector), + ) + case wasm.OpcodeTypedSelect: + // Skips two bytes: vector size fixed to 1, and the value type for select. + c.pc += 2 + // If it is on the unreachable state, ignore the instruction. + if c.unreachableState.on { + break operatorSwitch + } + // Typed select is semantically equivalent to select at runtime. + isTargetVector := c.stackPeek() == unsignedTypeV128 + c.emit( + newOperationSelect(isTargetVector), + ) + case wasm.OpcodeLocalGet: + depth := c.localDepth(index) + if isVector := c.localType(index) == wasm.ValueTypeV128; !isVector { + c.emit( + // -1 because we already manipulated the stack before + // called localDepth ^^. + newOperationPick(depth-1, isVector), + ) + } else { + c.emit( + // -2 because we already manipulated the stack before + // called localDepth ^^. + newOperationPick(depth-2, isVector), + ) + } + case wasm.OpcodeLocalSet: + depth := c.localDepth(index) + + isVector := c.localType(index) == wasm.ValueTypeV128 + if isVector { + c.emit( + // +2 because we already popped the operands for this operation from the c.stack before + // called localDepth ^^, + newOperationSet(depth+2, isVector), + ) + } else { + c.emit( + // +1 because we already popped the operands for this operation from the c.stack before + // called localDepth ^^, + newOperationSet(depth+1, isVector), + ) + } + case wasm.OpcodeLocalTee: + depth := c.localDepth(index) + isVector := c.localType(index) == wasm.ValueTypeV128 + if isVector { + c.emit(newOperationPick(1, isVector)) + c.emit(newOperationSet(depth+2, isVector)) + } else { + c.emit( + newOperationPick(0, isVector)) + c.emit(newOperationSet(depth+1, isVector)) + } + case wasm.OpcodeGlobalGet: + c.emit( + newOperationGlobalGet(index), + ) + case wasm.OpcodeGlobalSet: + c.emit( + newOperationGlobalSet(index), + ) + case wasm.OpcodeI32Load: + imm, err := c.readMemoryArg(wasm.OpcodeI32LoadName) + if err != nil { + return err + } + c.emit(newOperationLoad(unsignedTypeI32, imm)) + case wasm.OpcodeI64Load: + imm, err := c.readMemoryArg(wasm.OpcodeI64LoadName) + if err != nil { + return err + } + c.emit(newOperationLoad(unsignedTypeI64, imm)) + case wasm.OpcodeF32Load: + imm, err := c.readMemoryArg(wasm.OpcodeF32LoadName) + if err != nil { + return err + } + c.emit(newOperationLoad(unsignedTypeF32, imm)) + case wasm.OpcodeF64Load: + imm, err := c.readMemoryArg(wasm.OpcodeF64LoadName) + if err != nil { + return err + } + c.emit(newOperationLoad(unsignedTypeF64, imm)) + case wasm.OpcodeI32Load8S: + imm, err := c.readMemoryArg(wasm.OpcodeI32Load8SName) + if err != nil { + return err + } + c.emit(newOperationLoad8(signedInt32, imm)) + case wasm.OpcodeI32Load8U: + imm, err := 
c.readMemoryArg(wasm.OpcodeI32Load8UName) + if err != nil { + return err + } + c.emit(newOperationLoad8(signedUint32, imm)) + case wasm.OpcodeI32Load16S: + imm, err := c.readMemoryArg(wasm.OpcodeI32Load16SName) + if err != nil { + return err + } + c.emit(newOperationLoad16(signedInt32, imm)) + case wasm.OpcodeI32Load16U: + imm, err := c.readMemoryArg(wasm.OpcodeI32Load16UName) + if err != nil { + return err + } + c.emit(newOperationLoad16(signedUint32, imm)) + case wasm.OpcodeI64Load8S: + imm, err := c.readMemoryArg(wasm.OpcodeI64Load8SName) + if err != nil { + return err + } + c.emit(newOperationLoad8(signedInt64, imm)) + case wasm.OpcodeI64Load8U: + imm, err := c.readMemoryArg(wasm.OpcodeI64Load8UName) + if err != nil { + return err + } + c.emit(newOperationLoad8(signedUint64, imm)) + case wasm.OpcodeI64Load16S: + imm, err := c.readMemoryArg(wasm.OpcodeI64Load16SName) + if err != nil { + return err + } + c.emit(newOperationLoad16(signedInt64, imm)) + case wasm.OpcodeI64Load16U: + imm, err := c.readMemoryArg(wasm.OpcodeI64Load16UName) + if err != nil { + return err + } + c.emit(newOperationLoad16(signedUint64, imm)) + case wasm.OpcodeI64Load32S: + imm, err := c.readMemoryArg(wasm.OpcodeI64Load32SName) + if err != nil { + return err + } + c.emit(newOperationLoad32(true, imm)) + case wasm.OpcodeI64Load32U: + imm, err := c.readMemoryArg(wasm.OpcodeI64Load32UName) + if err != nil { + return err + } + c.emit(newOperationLoad32(false, imm)) + case wasm.OpcodeI32Store: + imm, err := c.readMemoryArg(wasm.OpcodeI32StoreName) + if err != nil { + return err + } + c.emit( + newOperationStore(unsignedTypeI32, imm), + ) + case wasm.OpcodeI64Store: + imm, err := c.readMemoryArg(wasm.OpcodeI64StoreName) + if err != nil { + return err + } + c.emit( + newOperationStore(unsignedTypeI64, imm), + ) + case wasm.OpcodeF32Store: + imm, err := c.readMemoryArg(wasm.OpcodeF32StoreName) + if err != nil { + return err + } + c.emit( + newOperationStore(unsignedTypeF32, imm), + ) + case wasm.OpcodeF64Store: + imm, err := c.readMemoryArg(wasm.OpcodeF64StoreName) + if err != nil { + return err + } + c.emit( + newOperationStore(unsignedTypeF64, imm), + ) + case wasm.OpcodeI32Store8: + imm, err := c.readMemoryArg(wasm.OpcodeI32Store8Name) + if err != nil { + return err + } + c.emit( + newOperationStore8(imm), + ) + case wasm.OpcodeI32Store16: + imm, err := c.readMemoryArg(wasm.OpcodeI32Store16Name) + if err != nil { + return err + } + c.emit( + newOperationStore16(imm), + ) + case wasm.OpcodeI64Store8: + imm, err := c.readMemoryArg(wasm.OpcodeI64Store8Name) + if err != nil { + return err + } + c.emit( + newOperationStore8(imm), + ) + case wasm.OpcodeI64Store16: + imm, err := c.readMemoryArg(wasm.OpcodeI64Store16Name) + if err != nil { + return err + } + c.emit( + newOperationStore16(imm), + ) + case wasm.OpcodeI64Store32: + imm, err := c.readMemoryArg(wasm.OpcodeI64Store32Name) + if err != nil { + return err + } + c.emit( + newOperationStore32(imm), + ) + case wasm.OpcodeMemorySize: + c.result.UsesMemory = true + c.pc++ // Skip the reserved one byte. + c.emit( + newOperationMemorySize(), + ) + case wasm.OpcodeMemoryGrow: + c.result.UsesMemory = true + c.pc++ // Skip the reserved one byte. 
+ c.emit( + newOperationMemoryGrow(), + ) + case wasm.OpcodeI32Const: + val, num, err := leb128.LoadInt32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("reading i32.const value: %v", err) + } + c.pc += num + c.emit( + newOperationConstI32(uint32(val)), + ) + case wasm.OpcodeI64Const: + val, num, err := leb128.LoadInt64(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("reading i64.const value: %v", err) + } + c.pc += num + c.emit( + newOperationConstI64(uint64(val)), + ) + case wasm.OpcodeF32Const: + v := math.Float32frombits(binary.LittleEndian.Uint32(c.body[c.pc+1:])) + c.pc += 4 + c.emit( + newOperationConstF32(v), + ) + case wasm.OpcodeF64Const: + v := math.Float64frombits(binary.LittleEndian.Uint64(c.body[c.pc+1:])) + c.pc += 8 + c.emit( + newOperationConstF64(v), + ) + case wasm.OpcodeI32Eqz: + c.emit( + newOperationEqz(unsignedInt32), + ) + case wasm.OpcodeI32Eq: + c.emit( + newOperationEq(unsignedTypeI32), + ) + case wasm.OpcodeI32Ne: + c.emit( + newOperationNe(unsignedTypeI32), + ) + case wasm.OpcodeI32LtS: + c.emit( + newOperationLt(signedTypeInt32), + ) + case wasm.OpcodeI32LtU: + c.emit( + newOperationLt(signedTypeUint32), + ) + case wasm.OpcodeI32GtS: + c.emit( + newOperationGt(signedTypeInt32), + ) + case wasm.OpcodeI32GtU: + c.emit( + newOperationGt(signedTypeUint32), + ) + case wasm.OpcodeI32LeS: + c.emit( + newOperationLe(signedTypeInt32), + ) + case wasm.OpcodeI32LeU: + c.emit( + newOperationLe(signedTypeUint32), + ) + case wasm.OpcodeI32GeS: + c.emit( + newOperationGe(signedTypeInt32), + ) + case wasm.OpcodeI32GeU: + c.emit( + newOperationGe(signedTypeUint32), + ) + case wasm.OpcodeI64Eqz: + c.emit( + newOperationEqz(unsignedInt64), + ) + case wasm.OpcodeI64Eq: + c.emit( + newOperationEq(unsignedTypeI64), + ) + case wasm.OpcodeI64Ne: + c.emit( + newOperationNe(unsignedTypeI64), + ) + case wasm.OpcodeI64LtS: + c.emit( + newOperationLt(signedTypeInt64), + ) + case wasm.OpcodeI64LtU: + c.emit( + newOperationLt(signedTypeUint64), + ) + case wasm.OpcodeI64GtS: + c.emit( + newOperationGt(signedTypeInt64), + ) + case wasm.OpcodeI64GtU: + c.emit( + newOperationGt(signedTypeUint64), + ) + case wasm.OpcodeI64LeS: + c.emit( + newOperationLe(signedTypeInt64), + ) + case wasm.OpcodeI64LeU: + c.emit( + newOperationLe(signedTypeUint64), + ) + case wasm.OpcodeI64GeS: + c.emit( + newOperationGe(signedTypeInt64), + ) + case wasm.OpcodeI64GeU: + c.emit( + newOperationGe(signedTypeUint64), + ) + case wasm.OpcodeF32Eq: + c.emit( + newOperationEq(unsignedTypeF32), + ) + case wasm.OpcodeF32Ne: + c.emit( + newOperationNe(unsignedTypeF32), + ) + case wasm.OpcodeF32Lt: + c.emit( + newOperationLt(signedTypeFloat32), + ) + case wasm.OpcodeF32Gt: + c.emit( + newOperationGt(signedTypeFloat32), + ) + case wasm.OpcodeF32Le: + c.emit( + newOperationLe(signedTypeFloat32), + ) + case wasm.OpcodeF32Ge: + c.emit( + newOperationGe(signedTypeFloat32), + ) + case wasm.OpcodeF64Eq: + c.emit( + newOperationEq(unsignedTypeF64), + ) + case wasm.OpcodeF64Ne: + c.emit( + newOperationNe(unsignedTypeF64), + ) + case wasm.OpcodeF64Lt: + c.emit( + newOperationLt(signedTypeFloat64), + ) + case wasm.OpcodeF64Gt: + c.emit( + newOperationGt(signedTypeFloat64), + ) + case wasm.OpcodeF64Le: + c.emit( + newOperationLe(signedTypeFloat64), + ) + case wasm.OpcodeF64Ge: + c.emit( + newOperationGe(signedTypeFloat64), + ) + case wasm.OpcodeI32Clz: + c.emit( + newOperationClz(unsignedInt32), + ) + case wasm.OpcodeI32Ctz: + c.emit( + newOperationCtz(unsignedInt32), + ) + case wasm.OpcodeI32Popcnt: + c.emit( + 
newOperationPopcnt(unsignedInt32), + ) + case wasm.OpcodeI32Add: + c.emit( + newOperationAdd(unsignedTypeI32), + ) + case wasm.OpcodeI32Sub: + c.emit( + newOperationSub(unsignedTypeI32), + ) + case wasm.OpcodeI32Mul: + c.emit( + newOperationMul(unsignedTypeI32), + ) + case wasm.OpcodeI32DivS: + c.emit( + newOperationDiv(signedTypeInt32), + ) + case wasm.OpcodeI32DivU: + c.emit( + newOperationDiv(signedTypeUint32), + ) + case wasm.OpcodeI32RemS: + c.emit( + newOperationRem(signedInt32), + ) + case wasm.OpcodeI32RemU: + c.emit( + newOperationRem(signedUint32), + ) + case wasm.OpcodeI32And: + c.emit( + newOperationAnd(unsignedInt32), + ) + case wasm.OpcodeI32Or: + c.emit( + newOperationOr(unsignedInt32), + ) + case wasm.OpcodeI32Xor: + c.emit( + newOperationXor(unsignedInt64), + ) + case wasm.OpcodeI32Shl: + c.emit( + newOperationShl(unsignedInt32), + ) + case wasm.OpcodeI32ShrS: + c.emit( + newOperationShr(signedInt32), + ) + case wasm.OpcodeI32ShrU: + c.emit( + newOperationShr(signedUint32), + ) + case wasm.OpcodeI32Rotl: + c.emit( + newOperationRotl(unsignedInt32), + ) + case wasm.OpcodeI32Rotr: + c.emit( + newOperationRotr(unsignedInt32), + ) + case wasm.OpcodeI64Clz: + c.emit( + newOperationClz(unsignedInt64), + ) + case wasm.OpcodeI64Ctz: + c.emit( + newOperationCtz(unsignedInt64), + ) + case wasm.OpcodeI64Popcnt: + c.emit( + newOperationPopcnt(unsignedInt64), + ) + case wasm.OpcodeI64Add: + c.emit( + newOperationAdd(unsignedTypeI64), + ) + case wasm.OpcodeI64Sub: + c.emit( + newOperationSub(unsignedTypeI64), + ) + case wasm.OpcodeI64Mul: + c.emit( + newOperationMul(unsignedTypeI64), + ) + case wasm.OpcodeI64DivS: + c.emit( + newOperationDiv(signedTypeInt64), + ) + case wasm.OpcodeI64DivU: + c.emit( + newOperationDiv(signedTypeUint64), + ) + case wasm.OpcodeI64RemS: + c.emit( + newOperationRem(signedInt64), + ) + case wasm.OpcodeI64RemU: + c.emit( + newOperationRem(signedUint64), + ) + case wasm.OpcodeI64And: + c.emit( + newOperationAnd(unsignedInt64), + ) + case wasm.OpcodeI64Or: + c.emit( + newOperationOr(unsignedInt64), + ) + case wasm.OpcodeI64Xor: + c.emit( + newOperationXor(unsignedInt64), + ) + case wasm.OpcodeI64Shl: + c.emit( + newOperationShl(unsignedInt64), + ) + case wasm.OpcodeI64ShrS: + c.emit( + newOperationShr(signedInt64), + ) + case wasm.OpcodeI64ShrU: + c.emit( + newOperationShr(signedUint64), + ) + case wasm.OpcodeI64Rotl: + c.emit( + newOperationRotl(unsignedInt64), + ) + case wasm.OpcodeI64Rotr: + c.emit( + newOperationRotr(unsignedInt64), + ) + case wasm.OpcodeF32Abs: + c.emit( + newOperationAbs(f32), + ) + case wasm.OpcodeF32Neg: + c.emit( + newOperationNeg(f32), + ) + case wasm.OpcodeF32Ceil: + c.emit( + newOperationCeil(f32), + ) + case wasm.OpcodeF32Floor: + c.emit( + newOperationFloor(f32), + ) + case wasm.OpcodeF32Trunc: + c.emit( + newOperationTrunc(f32), + ) + case wasm.OpcodeF32Nearest: + c.emit( + newOperationNearest(f32), + ) + case wasm.OpcodeF32Sqrt: + c.emit( + newOperationSqrt(f32), + ) + case wasm.OpcodeF32Add: + c.emit( + newOperationAdd(unsignedTypeF32), + ) + case wasm.OpcodeF32Sub: + c.emit( + newOperationSub(unsignedTypeF32), + ) + case wasm.OpcodeF32Mul: + c.emit( + newOperationMul(unsignedTypeF32), + ) + case wasm.OpcodeF32Div: + c.emit( + newOperationDiv(signedTypeFloat32), + ) + case wasm.OpcodeF32Min: + c.emit( + newOperationMin(f32), + ) + case wasm.OpcodeF32Max: + c.emit( + newOperationMax(f32), + ) + case wasm.OpcodeF32Copysign: + c.emit( + newOperationCopysign(f32), + ) + case wasm.OpcodeF64Abs: + c.emit( + newOperationAbs(f64), + ) + 
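Editor's note: the long run of one-line cases above and below only tags each numeric opcode with its operand type (for example newOperationAbs(f32) or newOperationDiv(signedTypeUint32)); the arithmetic itself happens later in the interpreter, which is not part of this file. As a hypothetical sketch only — the real executor lives elsewhere in this package — this is roughly what consuming such an operation could look like, assuming (not shown in this diff) that f32 values sit on an operand stack as their IEEE-754 bit patterns widened to uint64:

package main

import (
	"fmt"
	"math"
)

// valueStack is a hypothetical operand stack holding raw 64-bit patterns;
// the layout is an assumption made for this illustration.
type valueStack []uint64

func (s *valueStack) push(v uint64) { *s = append(*s, v) }
func (s *valueStack) pop() uint64 {
	v := (*s)[len(*s)-1]
	*s = (*s)[:len(*s)-1]
	return v
}

// execAbsF32 sketches what executing an operation produced by
// newOperationAbs(f32) might amount to: pop raw bits, take |x|, push bits back.
func execAbsF32(s *valueStack) {
	x := math.Float32frombits(uint32(s.pop()))
	s.push(uint64(math.Float32bits(float32(math.Abs(float64(x))))))
}

func main() {
	var s valueStack
	s.push(uint64(math.Float32bits(-1.5))) // operand left by a prior f32.const
	execAbsF32(&s)
	fmt.Println(math.Float32frombits(uint32(s.pop()))) // 1.5
}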
case wasm.OpcodeF64Neg: + c.emit( + newOperationNeg(f64), + ) + case wasm.OpcodeF64Ceil: + c.emit( + newOperationCeil(f64), + ) + case wasm.OpcodeF64Floor: + c.emit( + newOperationFloor(f64), + ) + case wasm.OpcodeF64Trunc: + c.emit( + newOperationTrunc(f64), + ) + case wasm.OpcodeF64Nearest: + c.emit( + newOperationNearest(f64), + ) + case wasm.OpcodeF64Sqrt: + c.emit( + newOperationSqrt(f64), + ) + case wasm.OpcodeF64Add: + c.emit( + newOperationAdd(unsignedTypeF64), + ) + case wasm.OpcodeF64Sub: + c.emit( + newOperationSub(unsignedTypeF64), + ) + case wasm.OpcodeF64Mul: + c.emit( + newOperationMul(unsignedTypeF64), + ) + case wasm.OpcodeF64Div: + c.emit( + newOperationDiv(signedTypeFloat64), + ) + case wasm.OpcodeF64Min: + c.emit( + newOperationMin(f64), + ) + case wasm.OpcodeF64Max: + c.emit( + newOperationMax(f64), + ) + case wasm.OpcodeF64Copysign: + c.emit( + newOperationCopysign(f64), + ) + case wasm.OpcodeI32WrapI64: + c.emit( + newOperationI32WrapFromI64(), + ) + case wasm.OpcodeI32TruncF32S: + c.emit( + newOperationITruncFromF(f32, signedInt32, false), + ) + case wasm.OpcodeI32TruncF32U: + c.emit( + newOperationITruncFromF(f32, signedUint32, false), + ) + case wasm.OpcodeI32TruncF64S: + c.emit( + newOperationITruncFromF(f64, signedInt32, false), + ) + case wasm.OpcodeI32TruncF64U: + c.emit( + newOperationITruncFromF(f64, signedUint32, false), + ) + case wasm.OpcodeI64ExtendI32S: + c.emit( + newOperationExtend(true), + ) + case wasm.OpcodeI64ExtendI32U: + c.emit( + newOperationExtend(false), + ) + case wasm.OpcodeI64TruncF32S: + c.emit( + newOperationITruncFromF(f32, signedInt64, false), + ) + case wasm.OpcodeI64TruncF32U: + c.emit( + newOperationITruncFromF(f32, signedUint64, false), + ) + case wasm.OpcodeI64TruncF64S: + c.emit( + newOperationITruncFromF(f64, signedInt64, false), + ) + case wasm.OpcodeI64TruncF64U: + c.emit( + newOperationITruncFromF(f64, signedUint64, false), + ) + case wasm.OpcodeF32ConvertI32S: + c.emit( + newOperationFConvertFromI(signedInt32, f32), + ) + case wasm.OpcodeF32ConvertI32U: + c.emit( + newOperationFConvertFromI(signedUint32, f32), + ) + case wasm.OpcodeF32ConvertI64S: + c.emit( + newOperationFConvertFromI(signedInt64, f32), + ) + case wasm.OpcodeF32ConvertI64U: + c.emit( + newOperationFConvertFromI(signedUint64, f32), + ) + case wasm.OpcodeF32DemoteF64: + c.emit( + newOperationF32DemoteFromF64(), + ) + case wasm.OpcodeF64ConvertI32S: + c.emit( + newOperationFConvertFromI(signedInt32, f64), + ) + case wasm.OpcodeF64ConvertI32U: + c.emit( + newOperationFConvertFromI(signedUint32, f64), + ) + case wasm.OpcodeF64ConvertI64S: + c.emit( + newOperationFConvertFromI(signedInt64, f64), + ) + case wasm.OpcodeF64ConvertI64U: + c.emit( + newOperationFConvertFromI(signedUint64, f64), + ) + case wasm.OpcodeF64PromoteF32: + c.emit( + newOperationF64PromoteFromF32(), + ) + case wasm.OpcodeI32ReinterpretF32: + c.emit( + newOperationI32ReinterpretFromF32(), + ) + case wasm.OpcodeI64ReinterpretF64: + c.emit( + newOperationI64ReinterpretFromF64(), + ) + case wasm.OpcodeF32ReinterpretI32: + c.emit( + newOperationF32ReinterpretFromI32(), + ) + case wasm.OpcodeF64ReinterpretI64: + c.emit( + newOperationF64ReinterpretFromI64(), + ) + case wasm.OpcodeI32Extend8S: + c.emit( + newOperationSignExtend32From8(), + ) + case wasm.OpcodeI32Extend16S: + c.emit( + newOperationSignExtend32From16(), + ) + case wasm.OpcodeI64Extend8S: + c.emit( + newOperationSignExtend64From8(), + ) + case wasm.OpcodeI64Extend16S: + c.emit( + newOperationSignExtend64From16(), + ) + case 
wasm.OpcodeI64Extend32S: + c.emit( + newOperationSignExtend64From32(), + ) + case wasm.OpcodeRefFunc: + c.pc++ + index, num, err := leb128.LoadUint32(c.body[c.pc:]) + if err != nil { + return fmt.Errorf("failed to read function index for ref.func: %v", err) + } + c.pc += num - 1 + c.emit( + newOperationRefFunc(index), + ) + case wasm.OpcodeRefNull: + c.pc++ // Skip the type of reftype as every ref value is opaque pointer. + c.emit( + newOperationConstI64(0), + ) + case wasm.OpcodeRefIsNull: + // Simply compare the opaque pointer (i64) with zero. + c.emit( + newOperationEqz(unsignedInt64), + ) + case wasm.OpcodeTableGet: + c.pc++ + tableIndex, num, err := leb128.LoadUint32(c.body[c.pc:]) + if err != nil { + return fmt.Errorf("failed to read function index for table.get: %v", err) + } + c.pc += num - 1 + c.emit( + newOperationTableGet(tableIndex), + ) + case wasm.OpcodeTableSet: + c.pc++ + tableIndex, num, err := leb128.LoadUint32(c.body[c.pc:]) + if err != nil { + return fmt.Errorf("failed to read function index for table.set: %v", err) + } + c.pc += num - 1 + c.emit( + newOperationTableSet(tableIndex), + ) + case wasm.OpcodeMiscPrefix: + c.pc++ + // A misc opcode is encoded as an unsigned variable 32-bit integer. + miscOp, num, err := leb128.LoadUint32(c.body[c.pc:]) + if err != nil { + return fmt.Errorf("failed to read misc opcode: %v", err) + } + c.pc += num - 1 + switch byte(miscOp) { + case wasm.OpcodeMiscI32TruncSatF32S: + c.emit( + newOperationITruncFromF(f32, signedInt32, true), + ) + case wasm.OpcodeMiscI32TruncSatF32U: + c.emit( + newOperationITruncFromF(f32, signedUint32, true), + ) + case wasm.OpcodeMiscI32TruncSatF64S: + c.emit( + newOperationITruncFromF(f64, signedInt32, true), + ) + case wasm.OpcodeMiscI32TruncSatF64U: + c.emit( + newOperationITruncFromF(f64, signedUint32, true), + ) + case wasm.OpcodeMiscI64TruncSatF32S: + c.emit( + newOperationITruncFromF(f32, signedInt64, true), + ) + case wasm.OpcodeMiscI64TruncSatF32U: + c.emit( + newOperationITruncFromF(f32, signedUint64, true), + ) + case wasm.OpcodeMiscI64TruncSatF64S: + c.emit( + newOperationITruncFromF(f64, signedInt64, true), + ) + case wasm.OpcodeMiscI64TruncSatF64U: + c.emit( + newOperationITruncFromF(f64, signedUint64, true), + ) + case wasm.OpcodeMiscMemoryInit: + c.result.UsesMemory = true + dataIndex, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("reading i32.const value: %v", err) + } + c.pc += num + 1 // +1 to skip the memory index which is fixed to zero. + c.emit( + newOperationMemoryInit(dataIndex), + ) + case wasm.OpcodeMiscDataDrop: + dataIndex, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("reading i32.const value: %v", err) + } + c.pc += num + c.emit( + newOperationDataDrop(dataIndex), + ) + case wasm.OpcodeMiscMemoryCopy: + c.result.UsesMemory = true + c.pc += 2 // +2 to skip two memory indexes which are fixed to zero. + c.emit( + newOperationMemoryCopy(), + ) + case wasm.OpcodeMiscMemoryFill: + c.result.UsesMemory = true + c.pc += 1 // +1 to skip the memory index which is fixed to zero. + c.emit( + newOperationMemoryFill(), + ) + case wasm.OpcodeMiscTableInit: + elemIndex, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("reading i32.const value: %v", err) + } + c.pc += num + // Read table index which is fixed to zero currently. 
+ tableIndex, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("reading i32.const value: %v", err) + } + c.pc += num + c.emit( + newOperationTableInit(elemIndex, tableIndex), + ) + case wasm.OpcodeMiscElemDrop: + elemIndex, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("reading i32.const value: %v", err) + } + c.pc += num + c.emit( + newOperationElemDrop(elemIndex), + ) + case wasm.OpcodeMiscTableCopy: + // Read the source table inde.g. + dst, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("reading i32.const value: %v", err) + } + c.pc += num + // Read the destination table inde.g. + src, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("reading i32.const value: %v", err) + } + c.pc += num + c.emit( + newOperationTableCopy(src, dst), + ) + case wasm.OpcodeMiscTableGrow: + // Read the source table inde.g. + tableIndex, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("reading i32.const value: %v", err) + } + c.pc += num + c.emit( + newOperationTableGrow(tableIndex), + ) + case wasm.OpcodeMiscTableSize: + // Read the source table inde.g. + tableIndex, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("reading i32.const value: %v", err) + } + c.pc += num + c.emit( + newOperationTableSize(tableIndex), + ) + case wasm.OpcodeMiscTableFill: + // Read the source table index. + tableIndex, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return fmt.Errorf("reading i32.const value: %v", err) + } + c.pc += num + c.emit( + newOperationTableFill(tableIndex), + ) + default: + return fmt.Errorf("unsupported misc instruction in interpreterir: 0x%x", op) + } + case wasm.OpcodeVecPrefix: + c.pc++ + switch vecOp := c.body[c.pc]; vecOp { + case wasm.OpcodeVecV128Const: + c.pc++ + lo := binary.LittleEndian.Uint64(c.body[c.pc : c.pc+8]) + c.pc += 8 + hi := binary.LittleEndian.Uint64(c.body[c.pc : c.pc+8]) + c.emit( + newOperationV128Const(lo, hi), + ) + c.pc += 7 + case wasm.OpcodeVecV128Load: + arg, err := c.readMemoryArg(wasm.OpcodeI32LoadName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType128, arg), + ) + case wasm.OpcodeVecV128Load8x8s: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load8x8SName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType8x8s, arg), + ) + case wasm.OpcodeVecV128Load8x8u: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load8x8UName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType8x8u, arg), + ) + case wasm.OpcodeVecV128Load16x4s: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load16x4SName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType16x4s, arg), + ) + case wasm.OpcodeVecV128Load16x4u: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load16x4UName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType16x4u, arg), + ) + case wasm.OpcodeVecV128Load32x2s: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load32x2SName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType32x2s, arg), + ) + case wasm.OpcodeVecV128Load32x2u: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load32x2UName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType32x2u, arg), + ) + case wasm.OpcodeVecV128Load8Splat: + arg, err := 
c.readMemoryArg(wasm.OpcodeVecV128Load8SplatName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType8Splat, arg), + ) + case wasm.OpcodeVecV128Load16Splat: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load16SplatName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType16Splat, arg), + ) + case wasm.OpcodeVecV128Load32Splat: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load32SplatName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType32Splat, arg), + ) + case wasm.OpcodeVecV128Load64Splat: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load64SplatName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType64Splat, arg), + ) + case wasm.OpcodeVecV128Load32zero: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load32zeroName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType32zero, arg), + ) + case wasm.OpcodeVecV128Load64zero: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load64zeroName) + if err != nil { + return err + } + c.emit( + newOperationV128Load(v128LoadType64zero, arg), + ) + case wasm.OpcodeVecV128Load8Lane: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load8LaneName) + if err != nil { + return err + } + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128LoadLane(laneIndex, 8, arg), + ) + case wasm.OpcodeVecV128Load16Lane: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load16LaneName) + if err != nil { + return err + } + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128LoadLane(laneIndex, 16, arg), + ) + case wasm.OpcodeVecV128Load32Lane: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load32LaneName) + if err != nil { + return err + } + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128LoadLane(laneIndex, 32, arg), + ) + case wasm.OpcodeVecV128Load64Lane: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load64LaneName) + if err != nil { + return err + } + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128LoadLane(laneIndex, 64, arg), + ) + case wasm.OpcodeVecV128Store: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128StoreName) + if err != nil { + return err + } + c.emit( + newOperationV128Store(arg), + ) + case wasm.OpcodeVecV128Store8Lane: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Store8LaneName) + if err != nil { + return err + } + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128StoreLane(laneIndex, 8, arg), + ) + case wasm.OpcodeVecV128Store16Lane: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Store16LaneName) + if err != nil { + return err + } + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128StoreLane(laneIndex, 16, arg), + ) + case wasm.OpcodeVecV128Store32Lane: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Store32LaneName) + if err != nil { + return err + } + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128StoreLane(laneIndex, 32, arg), + ) + case wasm.OpcodeVecV128Store64Lane: + arg, err := c.readMemoryArg(wasm.OpcodeVecV128Store64LaneName) + if err != nil { + return err + } + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128StoreLane(laneIndex, 64, arg), + ) + case wasm.OpcodeVecI8x16ExtractLaneS: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ExtractLane(laneIndex, true, shapeI8x16), + ) + case wasm.OpcodeVecI8x16ExtractLaneU: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ExtractLane(laneIndex, false, shapeI8x16), + ) + case wasm.OpcodeVecI16x8ExtractLaneS: + 
c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ExtractLane(laneIndex, true, shapeI16x8), + ) + case wasm.OpcodeVecI16x8ExtractLaneU: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ExtractLane(laneIndex, false, shapeI16x8), + ) + case wasm.OpcodeVecI32x4ExtractLane: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ExtractLane(laneIndex, false, shapeI32x4), + ) + case wasm.OpcodeVecI64x2ExtractLane: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ExtractLane(laneIndex, false, shapeI64x2), + ) + case wasm.OpcodeVecF32x4ExtractLane: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ExtractLane(laneIndex, false, shapeF32x4), + ) + case wasm.OpcodeVecF64x2ExtractLane: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ExtractLane(laneIndex, false, shapeF64x2), + ) + case wasm.OpcodeVecI8x16ReplaceLane: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ReplaceLane(laneIndex, shapeI8x16), + ) + case wasm.OpcodeVecI16x8ReplaceLane: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ReplaceLane(laneIndex, shapeI16x8), + ) + case wasm.OpcodeVecI32x4ReplaceLane: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ReplaceLane(laneIndex, shapeI32x4), + ) + case wasm.OpcodeVecI64x2ReplaceLane: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ReplaceLane(laneIndex, shapeI64x2), + ) + case wasm.OpcodeVecF32x4ReplaceLane: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ReplaceLane(laneIndex, shapeF32x4), + ) + case wasm.OpcodeVecF64x2ReplaceLane: + c.pc++ + laneIndex := c.body[c.pc] + c.emit( + newOperationV128ReplaceLane(laneIndex, shapeF64x2), + ) + case wasm.OpcodeVecI8x16Splat: + c.emit( + newOperationV128Splat(shapeI8x16), + ) + case wasm.OpcodeVecI16x8Splat: + c.emit( + newOperationV128Splat(shapeI16x8), + ) + case wasm.OpcodeVecI32x4Splat: + c.emit( + newOperationV128Splat(shapeI32x4), + ) + case wasm.OpcodeVecI64x2Splat: + c.emit( + newOperationV128Splat(shapeI64x2), + ) + case wasm.OpcodeVecF32x4Splat: + c.emit( + newOperationV128Splat(shapeF32x4), + ) + case wasm.OpcodeVecF64x2Splat: + c.emit( + newOperationV128Splat(shapeF64x2), + ) + case wasm.OpcodeVecI8x16Swizzle: + c.emit( + newOperationV128Swizzle(), + ) + case wasm.OpcodeVecV128i8x16Shuffle: + c.pc++ + lanes := make([]uint64, 16) + for i := uint64(0); i < 16; i++ { + lanes[i] = uint64(c.body[c.pc+i]) + } + op := newOperationV128Shuffle(lanes) + c.emit(op) + c.pc += 15 + case wasm.OpcodeVecV128AnyTrue: + c.emit( + newOperationV128AnyTrue(), + ) + case wasm.OpcodeVecI8x16AllTrue: + c.emit( + newOperationV128AllTrue(shapeI8x16), + ) + case wasm.OpcodeVecI16x8AllTrue: + c.emit( + newOperationV128AllTrue(shapeI16x8), + ) + case wasm.OpcodeVecI32x4AllTrue: + c.emit( + newOperationV128AllTrue(shapeI32x4), + ) + case wasm.OpcodeVecI64x2AllTrue: + c.emit( + newOperationV128AllTrue(shapeI64x2), + ) + case wasm.OpcodeVecI8x16BitMask: + c.emit( + newOperationV128BitMask(shapeI8x16), + ) + case wasm.OpcodeVecI16x8BitMask: + c.emit( + newOperationV128BitMask(shapeI16x8), + ) + case wasm.OpcodeVecI32x4BitMask: + c.emit( + newOperationV128BitMask(shapeI32x4), + ) + case wasm.OpcodeVecI64x2BitMask: + c.emit( + newOperationV128BitMask(shapeI64x2), + ) + case wasm.OpcodeVecV128And: + c.emit( + newOperationV128And(), + ) + case wasm.OpcodeVecV128Not: + c.emit( + newOperationV128Not(), + ) + case wasm.OpcodeVecV128Or: + c.emit( + newOperationV128Or(), + ) + case wasm.OpcodeVecV128Xor: + 
c.emit( + newOperationV128Xor(), + ) + case wasm.OpcodeVecV128Bitselect: + c.emit( + newOperationV128Bitselect(), + ) + case wasm.OpcodeVecV128AndNot: + c.emit( + newOperationV128AndNot(), + ) + case wasm.OpcodeVecI8x16Shl: + c.emit( + newOperationV128Shl(shapeI8x16), + ) + case wasm.OpcodeVecI8x16ShrS: + c.emit( + newOperationV128Shr(shapeI8x16, true), + ) + case wasm.OpcodeVecI8x16ShrU: + c.emit( + newOperationV128Shr(shapeI8x16, false), + ) + case wasm.OpcodeVecI16x8Shl: + c.emit( + newOperationV128Shl(shapeI16x8), + ) + case wasm.OpcodeVecI16x8ShrS: + c.emit( + newOperationV128Shr(shapeI16x8, true), + ) + case wasm.OpcodeVecI16x8ShrU: + c.emit( + newOperationV128Shr(shapeI16x8, false), + ) + case wasm.OpcodeVecI32x4Shl: + c.emit( + newOperationV128Shl(shapeI32x4), + ) + case wasm.OpcodeVecI32x4ShrS: + c.emit( + newOperationV128Shr(shapeI32x4, true), + ) + case wasm.OpcodeVecI32x4ShrU: + c.emit( + newOperationV128Shr(shapeI32x4, false), + ) + case wasm.OpcodeVecI64x2Shl: + c.emit( + newOperationV128Shl(shapeI64x2), + ) + case wasm.OpcodeVecI64x2ShrS: + c.emit( + newOperationV128Shr(shapeI64x2, true), + ) + case wasm.OpcodeVecI64x2ShrU: + c.emit( + newOperationV128Shr(shapeI64x2, false), + ) + case wasm.OpcodeVecI8x16Eq: + c.emit( + newOperationV128Cmp(v128CmpTypeI8x16Eq), + ) + case wasm.OpcodeVecI8x16Ne: + c.emit( + newOperationV128Cmp(v128CmpTypeI8x16Ne), + ) + case wasm.OpcodeVecI8x16LtS: + c.emit( + newOperationV128Cmp(v128CmpTypeI8x16LtS), + ) + case wasm.OpcodeVecI8x16LtU: + c.emit( + newOperationV128Cmp(v128CmpTypeI8x16LtU), + ) + case wasm.OpcodeVecI8x16GtS: + c.emit( + newOperationV128Cmp(v128CmpTypeI8x16GtS), + ) + case wasm.OpcodeVecI8x16GtU: + c.emit( + newOperationV128Cmp(v128CmpTypeI8x16GtU), + ) + case wasm.OpcodeVecI8x16LeS: + c.emit( + newOperationV128Cmp(v128CmpTypeI8x16LeS), + ) + case wasm.OpcodeVecI8x16LeU: + c.emit( + newOperationV128Cmp(v128CmpTypeI8x16LeU), + ) + case wasm.OpcodeVecI8x16GeS: + c.emit( + newOperationV128Cmp(v128CmpTypeI8x16GeS), + ) + case wasm.OpcodeVecI8x16GeU: + c.emit( + newOperationV128Cmp(v128CmpTypeI8x16GeU), + ) + case wasm.OpcodeVecI16x8Eq: + c.emit( + newOperationV128Cmp(v128CmpTypeI16x8Eq), + ) + case wasm.OpcodeVecI16x8Ne: + c.emit( + newOperationV128Cmp(v128CmpTypeI16x8Ne), + ) + case wasm.OpcodeVecI16x8LtS: + c.emit( + newOperationV128Cmp(v128CmpTypeI16x8LtS), + ) + case wasm.OpcodeVecI16x8LtU: + c.emit( + newOperationV128Cmp(v128CmpTypeI16x8LtU), + ) + case wasm.OpcodeVecI16x8GtS: + c.emit( + newOperationV128Cmp(v128CmpTypeI16x8GtS), + ) + case wasm.OpcodeVecI16x8GtU: + c.emit( + newOperationV128Cmp(v128CmpTypeI16x8GtU), + ) + case wasm.OpcodeVecI16x8LeS: + c.emit( + newOperationV128Cmp(v128CmpTypeI16x8LeS), + ) + case wasm.OpcodeVecI16x8LeU: + c.emit( + newOperationV128Cmp(v128CmpTypeI16x8LeU), + ) + case wasm.OpcodeVecI16x8GeS: + c.emit( + newOperationV128Cmp(v128CmpTypeI16x8GeS), + ) + case wasm.OpcodeVecI16x8GeU: + c.emit( + newOperationV128Cmp(v128CmpTypeI16x8GeU), + ) + case wasm.OpcodeVecI32x4Eq: + c.emit( + newOperationV128Cmp(v128CmpTypeI32x4Eq), + ) + case wasm.OpcodeVecI32x4Ne: + c.emit( + newOperationV128Cmp(v128CmpTypeI32x4Ne), + ) + case wasm.OpcodeVecI32x4LtS: + c.emit( + newOperationV128Cmp(v128CmpTypeI32x4LtS), + ) + case wasm.OpcodeVecI32x4LtU: + c.emit( + newOperationV128Cmp(v128CmpTypeI32x4LtU), + ) + case wasm.OpcodeVecI32x4GtS: + c.emit( + newOperationV128Cmp(v128CmpTypeI32x4GtS), + ) + case wasm.OpcodeVecI32x4GtU: + c.emit( + newOperationV128Cmp(v128CmpTypeI32x4GtU), + ) + case wasm.OpcodeVecI32x4LeS: + 
c.emit( + newOperationV128Cmp(v128CmpTypeI32x4LeS), + ) + case wasm.OpcodeVecI32x4LeU: + c.emit( + newOperationV128Cmp(v128CmpTypeI32x4LeU), + ) + case wasm.OpcodeVecI32x4GeS: + c.emit( + newOperationV128Cmp(v128CmpTypeI32x4GeS), + ) + case wasm.OpcodeVecI32x4GeU: + c.emit( + newOperationV128Cmp(v128CmpTypeI32x4GeU), + ) + case wasm.OpcodeVecI64x2Eq: + c.emit( + newOperationV128Cmp(v128CmpTypeI64x2Eq), + ) + case wasm.OpcodeVecI64x2Ne: + c.emit( + newOperationV128Cmp(v128CmpTypeI64x2Ne), + ) + case wasm.OpcodeVecI64x2LtS: + c.emit( + newOperationV128Cmp(v128CmpTypeI64x2LtS), + ) + case wasm.OpcodeVecI64x2GtS: + c.emit( + newOperationV128Cmp(v128CmpTypeI64x2GtS), + ) + case wasm.OpcodeVecI64x2LeS: + c.emit( + newOperationV128Cmp(v128CmpTypeI64x2LeS), + ) + case wasm.OpcodeVecI64x2GeS: + c.emit( + newOperationV128Cmp(v128CmpTypeI64x2GeS), + ) + case wasm.OpcodeVecF32x4Eq: + c.emit( + newOperationV128Cmp(v128CmpTypeF32x4Eq), + ) + case wasm.OpcodeVecF32x4Ne: + c.emit( + newOperationV128Cmp(v128CmpTypeF32x4Ne), + ) + case wasm.OpcodeVecF32x4Lt: + c.emit( + newOperationV128Cmp(v128CmpTypeF32x4Lt), + ) + case wasm.OpcodeVecF32x4Gt: + c.emit( + newOperationV128Cmp(v128CmpTypeF32x4Gt), + ) + case wasm.OpcodeVecF32x4Le: + c.emit( + newOperationV128Cmp(v128CmpTypeF32x4Le), + ) + case wasm.OpcodeVecF32x4Ge: + c.emit( + newOperationV128Cmp(v128CmpTypeF32x4Ge), + ) + case wasm.OpcodeVecF64x2Eq: + c.emit( + newOperationV128Cmp(v128CmpTypeF64x2Eq), + ) + case wasm.OpcodeVecF64x2Ne: + c.emit( + newOperationV128Cmp(v128CmpTypeF64x2Ne), + ) + case wasm.OpcodeVecF64x2Lt: + c.emit( + newOperationV128Cmp(v128CmpTypeF64x2Lt), + ) + case wasm.OpcodeVecF64x2Gt: + c.emit( + newOperationV128Cmp(v128CmpTypeF64x2Gt), + ) + case wasm.OpcodeVecF64x2Le: + c.emit( + newOperationV128Cmp(v128CmpTypeF64x2Le), + ) + case wasm.OpcodeVecF64x2Ge: + c.emit( + newOperationV128Cmp(v128CmpTypeF64x2Ge), + ) + case wasm.OpcodeVecI8x16Neg: + c.emit( + newOperationV128Neg(shapeI8x16), + ) + case wasm.OpcodeVecI16x8Neg: + c.emit( + newOperationV128Neg(shapeI16x8), + ) + case wasm.OpcodeVecI32x4Neg: + c.emit( + newOperationV128Neg(shapeI32x4), + ) + case wasm.OpcodeVecI64x2Neg: + c.emit( + newOperationV128Neg(shapeI64x2), + ) + case wasm.OpcodeVecF32x4Neg: + c.emit( + newOperationV128Neg(shapeF32x4), + ) + case wasm.OpcodeVecF64x2Neg: + c.emit( + newOperationV128Neg(shapeF64x2), + ) + case wasm.OpcodeVecI8x16Add: + c.emit( + newOperationV128Add(shapeI8x16), + ) + case wasm.OpcodeVecI16x8Add: + c.emit( + newOperationV128Add(shapeI16x8), + ) + case wasm.OpcodeVecI32x4Add: + c.emit( + newOperationV128Add(shapeI32x4), + ) + case wasm.OpcodeVecI64x2Add: + c.emit( + newOperationV128Add(shapeI64x2), + ) + case wasm.OpcodeVecF32x4Add: + c.emit( + newOperationV128Add(shapeF32x4), + ) + case wasm.OpcodeVecF64x2Add: + c.emit( + newOperationV128Add(shapeF64x2), + ) + case wasm.OpcodeVecI8x16Sub: + c.emit( + newOperationV128Sub(shapeI8x16), + ) + case wasm.OpcodeVecI16x8Sub: + c.emit( + newOperationV128Sub(shapeI16x8), + ) + case wasm.OpcodeVecI32x4Sub: + c.emit( + newOperationV128Sub(shapeI32x4), + ) + case wasm.OpcodeVecI64x2Sub: + c.emit( + newOperationV128Sub(shapeI64x2), + ) + case wasm.OpcodeVecF32x4Sub: + c.emit( + newOperationV128Sub(shapeF32x4), + ) + case wasm.OpcodeVecF64x2Sub: + c.emit( + newOperationV128Sub(shapeF64x2), + ) + case wasm.OpcodeVecI8x16AddSatS: + c.emit( + newOperationV128AddSat(shapeI8x16, true), + ) + case wasm.OpcodeVecI8x16AddSatU: + c.emit( + newOperationV128AddSat(shapeI8x16, false), + ) + case 
wasm.OpcodeVecI16x8AddSatS: + c.emit( + newOperationV128AddSat(shapeI16x8, true), + ) + case wasm.OpcodeVecI16x8AddSatU: + c.emit( + newOperationV128AddSat(shapeI16x8, false), + ) + case wasm.OpcodeVecI8x16SubSatS: + c.emit( + newOperationV128SubSat(shapeI8x16, true), + ) + case wasm.OpcodeVecI8x16SubSatU: + c.emit( + newOperationV128SubSat(shapeI8x16, false), + ) + case wasm.OpcodeVecI16x8SubSatS: + c.emit( + newOperationV128SubSat(shapeI16x8, true), + ) + case wasm.OpcodeVecI16x8SubSatU: + c.emit( + newOperationV128SubSat(shapeI16x8, false), + ) + case wasm.OpcodeVecI16x8Mul: + c.emit( + newOperationV128Mul(shapeI16x8), + ) + case wasm.OpcodeVecI32x4Mul: + c.emit( + newOperationV128Mul(shapeI32x4), + ) + case wasm.OpcodeVecI64x2Mul: + c.emit( + newOperationV128Mul(shapeI64x2), + ) + case wasm.OpcodeVecF32x4Mul: + c.emit( + newOperationV128Mul(shapeF32x4), + ) + case wasm.OpcodeVecF64x2Mul: + c.emit( + newOperationV128Mul(shapeF64x2), + ) + case wasm.OpcodeVecF32x4Sqrt: + c.emit( + newOperationV128Sqrt(shapeF32x4), + ) + case wasm.OpcodeVecF64x2Sqrt: + c.emit( + newOperationV128Sqrt(shapeF64x2), + ) + case wasm.OpcodeVecF32x4Div: + c.emit( + newOperationV128Div(shapeF32x4), + ) + case wasm.OpcodeVecF64x2Div: + c.emit( + newOperationV128Div(shapeF64x2), + ) + case wasm.OpcodeVecI8x16Abs: + c.emit( + newOperationV128Abs(shapeI8x16), + ) + case wasm.OpcodeVecI8x16Popcnt: + c.emit( + newOperationV128Popcnt(shapeI8x16), + ) + case wasm.OpcodeVecI16x8Abs: + c.emit( + newOperationV128Abs(shapeI16x8), + ) + case wasm.OpcodeVecI32x4Abs: + c.emit( + newOperationV128Abs(shapeI32x4), + ) + case wasm.OpcodeVecI64x2Abs: + c.emit( + newOperationV128Abs(shapeI64x2), + ) + case wasm.OpcodeVecF32x4Abs: + c.emit( + newOperationV128Abs(shapeF32x4), + ) + case wasm.OpcodeVecF64x2Abs: + c.emit( + newOperationV128Abs(shapeF64x2), + ) + case wasm.OpcodeVecI8x16MinS: + c.emit( + newOperationV128Min(shapeI8x16, true), + ) + case wasm.OpcodeVecI8x16MinU: + c.emit( + newOperationV128Min(shapeI8x16, false), + ) + case wasm.OpcodeVecI8x16MaxS: + c.emit( + newOperationV128Max(shapeI8x16, true), + ) + case wasm.OpcodeVecI8x16MaxU: + c.emit( + newOperationV128Max(shapeI8x16, false), + ) + case wasm.OpcodeVecI8x16AvgrU: + c.emit( + newOperationV128AvgrU(shapeI8x16), + ) + case wasm.OpcodeVecI16x8MinS: + c.emit( + newOperationV128Min(shapeI16x8, true), + ) + case wasm.OpcodeVecI16x8MinU: + c.emit( + newOperationV128Min(shapeI16x8, false), + ) + case wasm.OpcodeVecI16x8MaxS: + c.emit( + newOperationV128Max(shapeI16x8, true), + ) + case wasm.OpcodeVecI16x8MaxU: + c.emit( + newOperationV128Max(shapeI16x8, false), + ) + case wasm.OpcodeVecI16x8AvgrU: + c.emit( + newOperationV128AvgrU(shapeI16x8), + ) + case wasm.OpcodeVecI32x4MinS: + c.emit( + newOperationV128Min(shapeI32x4, true), + ) + case wasm.OpcodeVecI32x4MinU: + c.emit( + newOperationV128Min(shapeI32x4, false), + ) + case wasm.OpcodeVecI32x4MaxS: + c.emit( + newOperationV128Max(shapeI32x4, true), + ) + case wasm.OpcodeVecI32x4MaxU: + c.emit( + newOperationV128Max(shapeI32x4, false), + ) + case wasm.OpcodeVecF32x4Min: + c.emit( + newOperationV128Min(shapeF32x4, false), + ) + case wasm.OpcodeVecF32x4Max: + c.emit( + newOperationV128Max(shapeF32x4, false), + ) + case wasm.OpcodeVecF64x2Min: + c.emit( + newOperationV128Min(shapeF64x2, false), + ) + case wasm.OpcodeVecF64x2Max: + c.emit( + newOperationV128Max(shapeF64x2, false), + ) + case wasm.OpcodeVecF32x4Pmin: + c.emit( + newOperationV128Pmin(shapeF32x4), + ) + case wasm.OpcodeVecF32x4Pmax: + c.emit( + 
newOperationV128Pmax(shapeF32x4), + ) + case wasm.OpcodeVecF64x2Pmin: + c.emit( + newOperationV128Pmin(shapeF64x2), + ) + case wasm.OpcodeVecF64x2Pmax: + c.emit( + newOperationV128Pmax(shapeF64x2), + ) + case wasm.OpcodeVecF32x4Ceil: + c.emit( + newOperationV128Ceil(shapeF32x4), + ) + case wasm.OpcodeVecF32x4Floor: + c.emit( + newOperationV128Floor(shapeF32x4), + ) + case wasm.OpcodeVecF32x4Trunc: + c.emit( + newOperationV128Trunc(shapeF32x4), + ) + case wasm.OpcodeVecF32x4Nearest: + c.emit( + newOperationV128Nearest(shapeF32x4), + ) + case wasm.OpcodeVecF64x2Ceil: + c.emit( + newOperationV128Ceil(shapeF64x2), + ) + case wasm.OpcodeVecF64x2Floor: + c.emit( + newOperationV128Floor(shapeF64x2), + ) + case wasm.OpcodeVecF64x2Trunc: + c.emit( + newOperationV128Trunc(shapeF64x2), + ) + case wasm.OpcodeVecF64x2Nearest: + c.emit( + newOperationV128Nearest(shapeF64x2), + ) + case wasm.OpcodeVecI16x8ExtendLowI8x16S: + c.emit( + newOperationV128Extend(shapeI8x16, true, true), + ) + case wasm.OpcodeVecI16x8ExtendHighI8x16S: + c.emit( + newOperationV128Extend(shapeI8x16, true, false), + ) + case wasm.OpcodeVecI16x8ExtendLowI8x16U: + c.emit( + newOperationV128Extend(shapeI8x16, false, true), + ) + case wasm.OpcodeVecI16x8ExtendHighI8x16U: + c.emit( + newOperationV128Extend(shapeI8x16, false, false), + ) + case wasm.OpcodeVecI32x4ExtendLowI16x8S: + c.emit( + newOperationV128Extend(shapeI16x8, true, true), + ) + case wasm.OpcodeVecI32x4ExtendHighI16x8S: + c.emit( + newOperationV128Extend(shapeI16x8, true, false), + ) + case wasm.OpcodeVecI32x4ExtendLowI16x8U: + c.emit( + newOperationV128Extend(shapeI16x8, false, true), + ) + case wasm.OpcodeVecI32x4ExtendHighI16x8U: + c.emit( + newOperationV128Extend(shapeI16x8, false, false), + ) + case wasm.OpcodeVecI64x2ExtendLowI32x4S: + c.emit( + newOperationV128Extend(shapeI32x4, true, true), + ) + case wasm.OpcodeVecI64x2ExtendHighI32x4S: + c.emit( + newOperationV128Extend(shapeI32x4, true, false), + ) + case wasm.OpcodeVecI64x2ExtendLowI32x4U: + c.emit( + newOperationV128Extend(shapeI32x4, false, true), + ) + case wasm.OpcodeVecI64x2ExtendHighI32x4U: + c.emit( + newOperationV128Extend(shapeI32x4, false, false), + ) + case wasm.OpcodeVecI16x8Q15mulrSatS: + c.emit( + newOperationV128Q15mulrSatS(), + ) + case wasm.OpcodeVecI16x8ExtMulLowI8x16S: + c.emit( + newOperationV128ExtMul(shapeI8x16, true, true), + ) + case wasm.OpcodeVecI16x8ExtMulHighI8x16S: + c.emit( + newOperationV128ExtMul(shapeI8x16, true, false), + ) + case wasm.OpcodeVecI16x8ExtMulLowI8x16U: + c.emit( + newOperationV128ExtMul(shapeI8x16, false, true), + ) + case wasm.OpcodeVecI16x8ExtMulHighI8x16U: + c.emit( + newOperationV128ExtMul(shapeI8x16, false, false), + ) + case wasm.OpcodeVecI32x4ExtMulLowI16x8S: + c.emit( + newOperationV128ExtMul(shapeI16x8, true, true), + ) + case wasm.OpcodeVecI32x4ExtMulHighI16x8S: + c.emit( + newOperationV128ExtMul(shapeI16x8, true, false), + ) + case wasm.OpcodeVecI32x4ExtMulLowI16x8U: + c.emit( + newOperationV128ExtMul(shapeI16x8, false, true), + ) + case wasm.OpcodeVecI32x4ExtMulHighI16x8U: + c.emit( + newOperationV128ExtMul(shapeI16x8, false, false), + ) + case wasm.OpcodeVecI64x2ExtMulLowI32x4S: + c.emit( + newOperationV128ExtMul(shapeI32x4, true, true), + ) + case wasm.OpcodeVecI64x2ExtMulHighI32x4S: + c.emit( + newOperationV128ExtMul(shapeI32x4, true, false), + ) + case wasm.OpcodeVecI64x2ExtMulLowI32x4U: + c.emit( + newOperationV128ExtMul(shapeI32x4, false, true), + ) + case wasm.OpcodeVecI64x2ExtMulHighI32x4U: + c.emit( + newOperationV128ExtMul(shapeI32x4, 
false, false), + ) + case wasm.OpcodeVecI16x8ExtaddPairwiseI8x16S: + c.emit( + newOperationV128ExtAddPairwise(shapeI8x16, true), + ) + case wasm.OpcodeVecI16x8ExtaddPairwiseI8x16U: + c.emit( + newOperationV128ExtAddPairwise(shapeI8x16, false), + ) + case wasm.OpcodeVecI32x4ExtaddPairwiseI16x8S: + c.emit( + newOperationV128ExtAddPairwise(shapeI16x8, true), + ) + case wasm.OpcodeVecI32x4ExtaddPairwiseI16x8U: + c.emit( + newOperationV128ExtAddPairwise(shapeI16x8, false), + ) + case wasm.OpcodeVecF64x2PromoteLowF32x4Zero: + c.emit( + newOperationV128FloatPromote(), + ) + case wasm.OpcodeVecF32x4DemoteF64x2Zero: + c.emit( + newOperationV128FloatDemote(), + ) + case wasm.OpcodeVecF32x4ConvertI32x4S: + c.emit( + newOperationV128FConvertFromI(shapeF32x4, true), + ) + case wasm.OpcodeVecF32x4ConvertI32x4U: + c.emit( + newOperationV128FConvertFromI(shapeF32x4, false), + ) + case wasm.OpcodeVecF64x2ConvertLowI32x4S: + c.emit( + newOperationV128FConvertFromI(shapeF64x2, true), + ) + case wasm.OpcodeVecF64x2ConvertLowI32x4U: + c.emit( + newOperationV128FConvertFromI(shapeF64x2, false), + ) + case wasm.OpcodeVecI32x4DotI16x8S: + c.emit( + newOperationV128Dot(), + ) + case wasm.OpcodeVecI8x16NarrowI16x8S: + c.emit( + newOperationV128Narrow(shapeI16x8, true), + ) + case wasm.OpcodeVecI8x16NarrowI16x8U: + c.emit( + newOperationV128Narrow(shapeI16x8, false), + ) + case wasm.OpcodeVecI16x8NarrowI32x4S: + c.emit( + newOperationV128Narrow(shapeI32x4, true), + ) + case wasm.OpcodeVecI16x8NarrowI32x4U: + c.emit( + newOperationV128Narrow(shapeI32x4, false), + ) + case wasm.OpcodeVecI32x4TruncSatF32x4S: + c.emit( + newOperationV128ITruncSatFromF(shapeF32x4, true), + ) + case wasm.OpcodeVecI32x4TruncSatF32x4U: + c.emit( + newOperationV128ITruncSatFromF(shapeF32x4, false), + ) + case wasm.OpcodeVecI32x4TruncSatF64x2SZero: + c.emit( + newOperationV128ITruncSatFromF(shapeF64x2, true), + ) + case wasm.OpcodeVecI32x4TruncSatF64x2UZero: + c.emit( + newOperationV128ITruncSatFromF(shapeF64x2, false), + ) + default: + return fmt.Errorf("unsupported vector instruction in interpreterir: %s", wasm.VectorInstructionName(vecOp)) + } + case wasm.OpcodeAtomicPrefix: + c.pc++ + atomicOp := c.body[c.pc] + switch atomicOp { + case wasm.OpcodeAtomicMemoryWait32: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicMemoryWait32Name) + if err != nil { + return err + } + c.emit( + newOperationAtomicMemoryWait(unsignedTypeI32, imm), + ) + case wasm.OpcodeAtomicMemoryWait64: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicMemoryWait64Name) + if err != nil { + return err + } + c.emit( + newOperationAtomicMemoryWait(unsignedTypeI64, imm), + ) + case wasm.OpcodeAtomicMemoryNotify: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicMemoryNotifyName) + if err != nil { + return err + } + c.emit( + newOperationAtomicMemoryNotify(imm), + ) + case wasm.OpcodeAtomicFence: + // Skip immediate value + c.pc++ + _ = c.body[c.pc] + c.emit( + newOperationAtomicFence(), + ) + case wasm.OpcodeAtomicI32Load: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32LoadName) + if err != nil { + return err + } + c.emit( + newOperationAtomicLoad(unsignedTypeI32, imm), + ) + case wasm.OpcodeAtomicI64Load: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64LoadName) + if err != nil { + return err + } + c.emit( + newOperationAtomicLoad(unsignedTypeI64, imm), + ) + case wasm.OpcodeAtomicI32Load8U: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Load8UName) + if err != nil { + return err + } + c.emit( + newOperationAtomicLoad8(unsignedTypeI32, imm), + ) + case 
wasm.OpcodeAtomicI32Load16U: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Load16UName) + if err != nil { + return err + } + c.emit( + newOperationAtomicLoad16(unsignedTypeI32, imm), + ) + case wasm.OpcodeAtomicI64Load8U: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Load8UName) + if err != nil { + return err + } + c.emit( + newOperationAtomicLoad8(unsignedTypeI64, imm), + ) + case wasm.OpcodeAtomicI64Load16U: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Load16UName) + if err != nil { + return err + } + c.emit( + newOperationAtomicLoad16(unsignedTypeI64, imm), + ) + case wasm.OpcodeAtomicI64Load32U: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Load32UName) + if err != nil { + return err + } + c.emit( + newOperationAtomicLoad(unsignedTypeI32, imm), + ) + case wasm.OpcodeAtomicI32Store: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32StoreName) + if err != nil { + return err + } + c.emit( + newOperationAtomicStore(unsignedTypeI32, imm), + ) + case wasm.OpcodeAtomicI32Store8: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Store8Name) + if err != nil { + return err + } + c.emit( + newOperationAtomicStore8(unsignedTypeI32, imm), + ) + case wasm.OpcodeAtomicI32Store16: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Store16Name) + if err != nil { + return err + } + c.emit( + newOperationAtomicStore16(unsignedTypeI32, imm), + ) + case wasm.OpcodeAtomicI64Store: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64StoreName) + if err != nil { + return err + } + c.emit( + newOperationAtomicStore(unsignedTypeI64, imm), + ) + case wasm.OpcodeAtomicI64Store8: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Store8Name) + if err != nil { + return err + } + c.emit( + newOperationAtomicStore8(unsignedTypeI64, imm), + ) + case wasm.OpcodeAtomicI64Store16: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Store16Name) + if err != nil { + return err + } + c.emit( + newOperationAtomicStore16(unsignedTypeI64, imm), + ) + case wasm.OpcodeAtomicI64Store32: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Store32Name) + if err != nil { + return err + } + c.emit( + newOperationAtomicStore(unsignedTypeI32, imm), + ) + case wasm.OpcodeAtomicI32RmwAdd: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32RmwAddName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI32, imm, atomicArithmeticOpAdd), + ) + case wasm.OpcodeAtomicI64RmwAdd: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64RmwAddName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI64, imm, atomicArithmeticOpAdd), + ) + case wasm.OpcodeAtomicI32Rmw8AddU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw8AddUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8(unsignedTypeI32, imm, atomicArithmeticOpAdd), + ) + case wasm.OpcodeAtomicI64Rmw8AddU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw8AddUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8(unsignedTypeI64, imm, atomicArithmeticOpAdd), + ) + case wasm.OpcodeAtomicI32Rmw16AddU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw16AddUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16(unsignedTypeI32, imm, atomicArithmeticOpAdd), + ) + case wasm.OpcodeAtomicI64Rmw16AddU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw16AddUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16(unsignedTypeI64, imm, atomicArithmeticOpAdd), + ) + case wasm.OpcodeAtomicI64Rmw32AddU: + imm, err := 
c.readMemoryArg(wasm.OpcodeAtomicI64Rmw32AddUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI32, imm, atomicArithmeticOpAdd), + ) + case wasm.OpcodeAtomicI32RmwSub: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32RmwSubName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI32, imm, atomicArithmeticOpSub), + ) + case wasm.OpcodeAtomicI64RmwSub: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64RmwSubName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI64, imm, atomicArithmeticOpSub), + ) + case wasm.OpcodeAtomicI32Rmw8SubU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw8SubUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8(unsignedTypeI32, imm, atomicArithmeticOpSub), + ) + case wasm.OpcodeAtomicI64Rmw8SubU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw8SubUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8(unsignedTypeI64, imm, atomicArithmeticOpSub), + ) + case wasm.OpcodeAtomicI32Rmw16SubU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw16SubUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16(unsignedTypeI32, imm, atomicArithmeticOpSub), + ) + case wasm.OpcodeAtomicI64Rmw16SubU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw16SubUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16(unsignedTypeI64, imm, atomicArithmeticOpSub), + ) + case wasm.OpcodeAtomicI64Rmw32SubU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw32SubUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI32, imm, atomicArithmeticOpSub), + ) + case wasm.OpcodeAtomicI32RmwAnd: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32RmwAndName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI32, imm, atomicArithmeticOpAnd), + ) + case wasm.OpcodeAtomicI64RmwAnd: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64RmwAndName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI64, imm, atomicArithmeticOpAnd), + ) + case wasm.OpcodeAtomicI32Rmw8AndU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw8AndUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8(unsignedTypeI32, imm, atomicArithmeticOpAnd), + ) + case wasm.OpcodeAtomicI64Rmw8AndU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw8AndUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8(unsignedTypeI64, imm, atomicArithmeticOpAnd), + ) + case wasm.OpcodeAtomicI32Rmw16AndU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw16AndUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16(unsignedTypeI32, imm, atomicArithmeticOpAnd), + ) + case wasm.OpcodeAtomicI64Rmw16AndU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw16AndUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16(unsignedTypeI64, imm, atomicArithmeticOpAnd), + ) + case wasm.OpcodeAtomicI64Rmw32AndU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw32AndUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI32, imm, atomicArithmeticOpAnd), + ) + case wasm.OpcodeAtomicI32RmwOr: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32RmwOrName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI32, imm, atomicArithmeticOpOr), + ) + case wasm.OpcodeAtomicI64RmwOr: + imm, err := 
c.readMemoryArg(wasm.OpcodeAtomicI64RmwOrName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI64, imm, atomicArithmeticOpOr), + ) + case wasm.OpcodeAtomicI32Rmw8OrU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw8OrUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8(unsignedTypeI32, imm, atomicArithmeticOpOr), + ) + case wasm.OpcodeAtomicI64Rmw8OrU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw8OrUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8(unsignedTypeI64, imm, atomicArithmeticOpOr), + ) + case wasm.OpcodeAtomicI32Rmw16OrU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw16OrUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16(unsignedTypeI32, imm, atomicArithmeticOpOr), + ) + case wasm.OpcodeAtomicI64Rmw16OrU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw16OrUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16(unsignedTypeI64, imm, atomicArithmeticOpOr), + ) + case wasm.OpcodeAtomicI64Rmw32OrU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw32OrUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI32, imm, atomicArithmeticOpOr), + ) + case wasm.OpcodeAtomicI32RmwXor: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32RmwXorName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI32, imm, atomicArithmeticOpXor), + ) + case wasm.OpcodeAtomicI64RmwXor: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64RmwXorName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI64, imm, atomicArithmeticOpXor), + ) + case wasm.OpcodeAtomicI32Rmw8XorU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw8XorUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8(unsignedTypeI32, imm, atomicArithmeticOpXor), + ) + case wasm.OpcodeAtomicI64Rmw8XorU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw8XorUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8(unsignedTypeI64, imm, atomicArithmeticOpXor), + ) + case wasm.OpcodeAtomicI32Rmw16XorU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw16XorUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16(unsignedTypeI32, imm, atomicArithmeticOpXor), + ) + case wasm.OpcodeAtomicI64Rmw16XorU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw16XorUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16(unsignedTypeI64, imm, atomicArithmeticOpXor), + ) + case wasm.OpcodeAtomicI64Rmw32XorU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw32XorUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI32, imm, atomicArithmeticOpXor), + ) + case wasm.OpcodeAtomicI32RmwXchg: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32RmwXchgName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI32, imm, atomicArithmeticOpNop), + ) + case wasm.OpcodeAtomicI64RmwXchg: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64RmwXchgName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI64, imm, atomicArithmeticOpNop), + ) + case wasm.OpcodeAtomicI32Rmw8XchgU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw8XchgUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8(unsignedTypeI32, imm, atomicArithmeticOpNop), + ) + case wasm.OpcodeAtomicI64Rmw8XchgU: + imm, err := 
c.readMemoryArg(wasm.OpcodeAtomicI64Rmw8XchgUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8(unsignedTypeI64, imm, atomicArithmeticOpNop), + ) + case wasm.OpcodeAtomicI32Rmw16XchgU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw16XchgUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16(unsignedTypeI32, imm, atomicArithmeticOpNop), + ) + case wasm.OpcodeAtomicI64Rmw16XchgU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw16XchgUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16(unsignedTypeI64, imm, atomicArithmeticOpNop), + ) + case wasm.OpcodeAtomicI64Rmw32XchgU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw32XchgUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW(unsignedTypeI32, imm, atomicArithmeticOpNop), + ) + case wasm.OpcodeAtomicI32RmwCmpxchg: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32RmwCmpxchgName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMWCmpxchg(unsignedTypeI32, imm), + ) + case wasm.OpcodeAtomicI64RmwCmpxchg: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64RmwCmpxchgName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMWCmpxchg(unsignedTypeI64, imm), + ) + case wasm.OpcodeAtomicI32Rmw8CmpxchgU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw8CmpxchgUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8Cmpxchg(unsignedTypeI32, imm), + ) + case wasm.OpcodeAtomicI64Rmw8CmpxchgU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw8CmpxchgUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW8Cmpxchg(unsignedTypeI64, imm), + ) + case wasm.OpcodeAtomicI32Rmw16CmpxchgU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI32Rmw16CmpxchgUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16Cmpxchg(unsignedTypeI32, imm), + ) + case wasm.OpcodeAtomicI64Rmw16CmpxchgU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw16CmpxchgUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMW16Cmpxchg(unsignedTypeI64, imm), + ) + case wasm.OpcodeAtomicI64Rmw32CmpxchgU: + imm, err := c.readMemoryArg(wasm.OpcodeAtomicI64Rmw32CmpxchgUName) + if err != nil { + return err + } + c.emit( + newOperationAtomicRMWCmpxchg(unsignedTypeI32, imm), + ) + default: + return fmt.Errorf("unsupported atomic instruction in interpreterir: %s", wasm.AtomicInstructionName(atomicOp)) + } + default: + return fmt.Errorf("unsupported instruction in interpreterir: 0x%x", op) + } + + // Move the program counter to point to the next instruction. + c.pc++ + return nil +} + +func (c *compiler) nextFrameID() (id uint32) { + id = c.currentFrameID + 1 + c.currentFrameID++ + return +} + +func (c *compiler) applyToStack(opcode wasm.Opcode) (index uint32, err error) { + switch opcode { + case + // These are the opcodes that is coupled with "index" immediate + // and it DOES affect the signature of opcode. + wasm.OpcodeCall, + wasm.OpcodeCallIndirect, + wasm.OpcodeLocalGet, + wasm.OpcodeLocalSet, + wasm.OpcodeLocalTee, + wasm.OpcodeGlobalGet, + wasm.OpcodeGlobalSet: + // Assumes that we are at the opcode now so skip it before read immediates. + v, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return 0, fmt.Errorf("reading immediates: %w", err) + } + c.pc += num + index = v + default: + // Note that other opcodes are free of index + // as it doesn't affect the signature of opt code. 
+		// In other words, the "index" argument of wasmOpcodeSignature
+		// is ignored for those opcodes.
+	}
+
+	if c.unreachableState.on {
+		return 0, nil
+	}
+
+	// Retrieve the signature of the opcode.
+	s, err := c.wasmOpcodeSignature(opcode, index)
+	if err != nil {
+		return 0, err
+	}
+
+	// Manipulate the stack according to the signature.
+	// Note that the following algorithm assumes that
+	// the unknown type is unique in the signature,
+	// and is determined by the actual type on the stack.
+	// The determined type is stored in typeParam below.
+	var typeParam unsignedType
+	var typeParamFound bool
+	for i := range s.in {
+		want := s.in[len(s.in)-1-i]
+		actual := c.stackPop()
+		if want == unsignedTypeUnknown && typeParamFound {
+			want = typeParam
+		} else if want == unsignedTypeUnknown {
+			want = actual
+			typeParam = want
+			typeParamFound = true
+		}
+		if want != actual {
+			return 0, fmt.Errorf("input signature mismatch: want %s but have %s", want, actual)
+		}
+	}
+
+	for _, target := range s.out {
+		if target == unsignedTypeUnknown && !typeParamFound {
+			return 0, fmt.Errorf("cannot determine type of unknown result")
+		} else if target == unsignedTypeUnknown {
+			c.stackPush(typeParam)
+		} else {
+			c.stackPush(target)
+		}
+	}
+
+	return index, nil
+}
+
+func (c *compiler) stackPeek() (ret unsignedType) {
+	ret = c.stack[len(c.stack)-1]
+	return
+}
+
+func (c *compiler) stackPop() (ret unsignedType) {
+	// No need to check the stack bound
+	// as we can assume that all the operations
+	// are valid thanks to validateFunction
+	// at the module validation phase.
+	ret = c.stack[len(c.stack)-1]
+	c.stack = c.stack[:len(c.stack)-1]
+	return
+}
+
+func (c *compiler) stackPush(ts unsignedType) {
+	c.stack = append(c.stack, ts)
+}
+
+// emit adds the operation into the result.
+func (c *compiler) emit(op unionOperation) {
+	if !c.unreachableState.on {
+		switch op.Kind {
+		case operationKindDrop:
+			// If the drop range is nil,
+			// we can remove the operation entirely:
+			// that happens when the drop is unnecessary,
+			// i.e. when there's no need to adjust the stack before a jump.
+			if int64(op.U1) == -1 {
+				return
+			}
+		}
+		c.result.Operations = append(c.result.Operations, op)
+		if c.needSourceOffset {
+			c.result.IROperationSourceOffsetsInWasmBinary = append(c.result.IROperationSourceOffsetsInWasmBinary,
+				c.currentOpPC+c.bodyOffsetInCodeSection)
+		}
+	}
+}
+
+// emitDefaultValue emits a constant expression holding the zero value of the given type.
+func (c *compiler) emitDefaultValue(t wasm.ValueType) {
+	switch t {
+	case wasm.ValueTypeI32:
+		c.stackPush(unsignedTypeI32)
+		c.emit(newOperationConstI32(0))
+	case wasm.ValueTypeI64, wasm.ValueTypeExternref, wasm.ValueTypeFuncref:
+		c.stackPush(unsignedTypeI64)
+		c.emit(newOperationConstI64(0))
+	case wasm.ValueTypeF32:
+		c.stackPush(unsignedTypeF32)
+		c.emit(newOperationConstF32(0))
+	case wasm.ValueTypeF64:
+		c.stackPush(unsignedTypeF64)
+		c.emit(newOperationConstF64(0))
+	case wasm.ValueTypeV128:
+		c.stackPush(unsignedTypeV128)
+		c.emit(newOperationV128Const(0, 0))
+	}
+}
+
+// localDepth returns the depth (counted from the top of the stack)
+// of the n-th local.
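To make the height bookkeeping concrete, here is a rough standalone sketch (not part of this package) of how local indexes map to uint64 stack heights: every local takes one slot except v128 locals, which take two, exactly as initializeStack records in localIndexToStackHeightInUint64. localDepth below then converts such a height into a depth measured from the top of the stack. The local types used here are made up for illustration.

package main

import "fmt"

func main() {
	// Hypothetical locals: an i32, a v128, then an f64.
	localIsV128 := []bool{false, true, false}

	heights := make([]int, 0, len(localIsV128))
	current := 0
	for _, isV128 := range localIsV128 {
		heights = append(heights, current)
		if isV128 {
			current++ // a v128 local occupies an extra uint64 slot
		}
		current++
	}
	fmt.Println(heights) // [0 1 3]
}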
+func (c *compiler) localDepth(index wasm.Index) int { + height := c.localIndexToStackHeightInUint64[index] + return c.stackLenInUint64(len(c.stack)) - 1 - int(height) +} + +func (c *compiler) localType(index wasm.Index) (t wasm.ValueType) { + if params := uint32(len(c.sig.Params)); index < params { + t = c.sig.Params[index] + } else { + t = c.localTypes[index-params] + } + return +} + +// getFrameDropRange returns the range (starting from top of the stack) that spans across the (uint64) stack. The range is +// supposed to be dropped from the stack when the given frame exists or branch into it. +// +// * frame is the control frame which the call-site is trying to branch into or exit. +// * isEnd true if the call-site is handling wasm.OpcodeEnd. +func (c *compiler) getFrameDropRange(frame *controlFrame, isEnd bool) inclusiveRange { + var start int + if !isEnd && frame.kind == controlFrameKindLoop { + // If this is not End and the call-site is trying to branch into the Loop control frame, + // we have to Start executing from the beginning of the loop block. + // Therefore, we have to pass the inputs to the frame. + start = frame.blockType.ParamNumInUint64 + } else { + start = frame.blockType.ResultNumInUint64 + } + var end int + if frame.kind == controlFrameKindFunction { + // On the function return, we eliminate all the contents on the stack + // including locals (existing below of frame.originalStackLen) + end = c.stackLenInUint64(len(c.stack)) - 1 + } else { + end = c.stackLenInUint64(len(c.stack)) - 1 - c.stackLenInUint64(frame.originalStackLenWithoutParam) + } + if start <= end { + return inclusiveRange{Start: int32(start), End: int32(end)} + } else { + return nopinclusiveRange + } +} + +func (c *compiler) stackLenInUint64(ceil int) (ret int) { + for i := 0; i < ceil; i++ { + if c.stack[i] == unsignedTypeV128 { + ret += 2 + } else { + ret++ + } + } + return +} + +func (c *compiler) readMemoryArg(tag string) (memoryArg, error) { + c.result.UsesMemory = true + alignment, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return memoryArg{}, fmt.Errorf("reading alignment for %s: %w", tag, err) + } + c.pc += num + offset, num, err := leb128.LoadUint32(c.body[c.pc+1:]) + if err != nil { + return memoryArg{}, fmt.Errorf("reading offset for %s: %w", tag, err) + } + c.pc += num + return memoryArg{Offset: offset, Alignment: alignment}, nil +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/format.go b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/format.go new file mode 100644 index 000000000..8af1d94b0 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/format.go @@ -0,0 +1,22 @@ +package interpreter + +import ( + "bytes" +) + +func format(ops []unionOperation) string { + buf := bytes.NewBuffer(nil) + + _, _ = buf.WriteString(".entrypoint\n") + for i := range ops { + op := &ops[i] + str := op.String() + isLabel := op.Kind == operationKindLabel + if !isLabel { + const indent = "\t" + str = indent + str + } + _, _ = buf.WriteString(str + "\n") + } + return buf.String() +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/interpreter.go b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/interpreter.go new file mode 100644 index 000000000..a89ddc457 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/interpreter.go @@ -0,0 +1,4583 @@ +package interpreter + +import ( + "context" + "encoding/binary" + "errors" + "fmt" + "math" + 
"math/bits" + "sync" + "unsafe" + + "github.com/tetratelabs/wazero/api" + "github.com/tetratelabs/wazero/experimental" + "github.com/tetratelabs/wazero/internal/expctxkeys" + "github.com/tetratelabs/wazero/internal/filecache" + "github.com/tetratelabs/wazero/internal/internalapi" + "github.com/tetratelabs/wazero/internal/moremath" + "github.com/tetratelabs/wazero/internal/wasm" + "github.com/tetratelabs/wazero/internal/wasmdebug" + "github.com/tetratelabs/wazero/internal/wasmruntime" +) + +// callStackCeiling is the maximum WebAssembly call frame stack height. This allows wazero to raise +// wasm.ErrCallStackOverflow instead of overflowing the Go runtime. +// +// The default value should suffice for most use cases. Those wishing to change this can via `go build -ldflags`. +var callStackCeiling = 2000 + +// engine is an interpreter implementation of wasm.Engine +type engine struct { + enabledFeatures api.CoreFeatures + compiledFunctions map[wasm.ModuleID][]compiledFunction // guarded by mutex. + mux sync.RWMutex +} + +func NewEngine(_ context.Context, enabledFeatures api.CoreFeatures, _ filecache.Cache) wasm.Engine { + return &engine{ + enabledFeatures: enabledFeatures, + compiledFunctions: map[wasm.ModuleID][]compiledFunction{}, + } +} + +// Close implements the same method as documented on wasm.Engine. +func (e *engine) Close() (err error) { + return +} + +// CompiledModuleCount implements the same method as documented on wasm.Engine. +func (e *engine) CompiledModuleCount() uint32 { + return uint32(len(e.compiledFunctions)) +} + +// DeleteCompiledModule implements the same method as documented on wasm.Engine. +func (e *engine) DeleteCompiledModule(m *wasm.Module) { + e.deleteCompiledFunctions(m) +} + +func (e *engine) deleteCompiledFunctions(module *wasm.Module) { + e.mux.Lock() + defer e.mux.Unlock() + delete(e.compiledFunctions, module.ID) +} + +func (e *engine) addCompiledFunctions(module *wasm.Module, fs []compiledFunction) { + e.mux.Lock() + defer e.mux.Unlock() + e.compiledFunctions[module.ID] = fs +} + +func (e *engine) getCompiledFunctions(module *wasm.Module) (fs []compiledFunction, ok bool) { + e.mux.RLock() + defer e.mux.RUnlock() + fs, ok = e.compiledFunctions[module.ID] + return +} + +// moduleEngine implements wasm.ModuleEngine +type moduleEngine struct { + // codes are the compiled functions in a module instances. + // The index is module instance-scoped. + functions []function + + // parentEngine holds *engine from which this module engine is created from. + parentEngine *engine +} + +// GetGlobalValue implements the same method as documented on wasm.ModuleEngine. +func (e *moduleEngine) GetGlobalValue(wasm.Index) (lo, hi uint64) { + panic("BUG: GetGlobalValue should never be called on interpreter mode") +} + +// SetGlobalValue implements the same method as documented on wasm.ModuleEngine. +func (e *moduleEngine) SetGlobalValue(idx wasm.Index, lo, hi uint64) { + panic("BUG: SetGlobalValue should never be called on interpreter mode") +} + +// OwnsGlobals implements the same method as documented on wasm.ModuleEngine. +func (e *moduleEngine) OwnsGlobals() bool { return false } + +// callEngine holds context per moduleEngine.Call, and shared across all the +// function calls originating from the same moduleEngine.Call execution. +// +// This implements api.Function. +type callEngine struct { + internalapi.WazeroOnlyType + + // stack contains the operands. + // Note that all the values are represented as uint64. + stack []uint64 + + // frames are the function call stack. 
+	frames []*callFrame
+
+	// f is the initial function for this call engine.
+	f *function
+
+	// stackIterator is reused by Listeners to walk the frames and the stack.
+	stackIterator stackIterator
+}
+
+func (e *moduleEngine) newCallEngine(compiled *function) *callEngine {
+	return &callEngine{f: compiled}
+}
+
+func (ce *callEngine) pushValue(v uint64) {
+	ce.stack = append(ce.stack, v)
+}
+
+func (ce *callEngine) pushValues(v []uint64) {
+	ce.stack = append(ce.stack, v...)
+}
+
+func (ce *callEngine) popValue() (v uint64) {
+	// No need to check the stack bound
+	// as we can assume that all the operations
+	// are valid thanks to validateFunction
+	// at the module validation phase
+	// and the interpreterir translation
+	// before compilation.
+	stackTopIndex := len(ce.stack) - 1
+	v = ce.stack[stackTopIndex]
+	ce.stack = ce.stack[:stackTopIndex]
+	return
+}
+
+func (ce *callEngine) popValues(v []uint64) {
+	stackTopIndex := len(ce.stack) - len(v)
+	copy(v, ce.stack[stackTopIndex:])
+	ce.stack = ce.stack[:stackTopIndex]
+}
+
+// peekValues peeks the top `count` values from the stack and returns them without popping.
+func (ce *callEngine) peekValues(count int) []uint64 {
+	if count == 0 {
+		return nil
+	}
+	stackLen := len(ce.stack)
+	return ce.stack[stackLen-count : stackLen]
+}
+
+func (ce *callEngine) drop(raw uint64) {
+	r := inclusiveRangeFromU64(raw)
+	if r.Start == -1 {
+		return
+	} else if r.Start == 0 {
+		ce.stack = ce.stack[:int32(len(ce.stack))-1-r.End]
+	} else {
+		newStack := ce.stack[:int32(len(ce.stack))-1-r.End]
+		newStack = append(newStack, ce.stack[int32(len(ce.stack))-r.Start:]...)
+		ce.stack = newStack
+	}
+}
+
+func (ce *callEngine) pushFrame(frame *callFrame) {
+	if callStackCeiling <= len(ce.frames) {
+		panic(wasmruntime.ErrRuntimeStackOverflow)
+	}
+	ce.frames = append(ce.frames, frame)
+}
+
+func (ce *callEngine) popFrame() (frame *callFrame) {
+	// No need to check the stack bound as we can assume that all the operations are valid thanks to validateFunction at
+	// the module validation phase and the interpreterir translation before compilation.
+	oneLess := len(ce.frames) - 1
+	frame = ce.frames[oneLess]
+	ce.frames = ce.frames[:oneLess]
+	return
+}
+
+type callFrame struct {
+	// pc is the program counter representing the current position in code.body.
+	pc uint64
+	// f is the compiled function used in this function frame.
+	f *function
+	// base is the index in the value stack at which this frame starts, used to determine
+	// the count of values on the stack that belong to this frame.
+	base int
+}
+
+type compiledFunction struct {
+	source              *wasm.Module
+	body                []unionOperation
+	listener            experimental.FunctionListener
+	offsetsInWasmBinary []uint64
+	hostFn              interface{}
+	ensureTermination   bool
+	index               wasm.Index
+}
+
+type function struct {
+	funcType       *wasm.FunctionType
+	moduleInstance *wasm.ModuleInstance
+	typeID         wasm.FunctionTypeID
+	parent         *compiledFunction
+}
+
+// functionFromUintptr resurrects the original *function from the given uintptr
+// which comes from either the funcref table or the OpcodeRefFunc instruction.
+func functionFromUintptr(ptr uintptr) *function {
+	// Wraps ptr in a double pointer in order to avoid the unsafe access as detected by the race detector.
+ // + // For example, if we have (*function)(unsafe.Pointer(ptr)) instead, then the race detector's "checkptr" + // subroutine wanrs as "checkptr: pointer arithmetic result points to invalid allocation" + // https://github.com/golang/go/blob/1ce7fcf139417d618c2730010ede2afb41664211/src/runtime/checkptr.go#L69 + var wrapped *uintptr = &ptr + return *(**function)(unsafe.Pointer(wrapped)) +} + +type snapshot struct { + stack []uint64 + frames []*callFrame + pc uint64 + + ret []uint64 + + ce *callEngine +} + +// Snapshot implements the same method as documented on experimental.Snapshotter. +func (ce *callEngine) Snapshot() experimental.Snapshot { + stack := make([]uint64, len(ce.stack)) + copy(stack, ce.stack) + + frames := make([]*callFrame, len(ce.frames)) + copy(frames, ce.frames) + + return &snapshot{ + stack: stack, + frames: frames, + ce: ce, + } +} + +// Restore implements the same method as documented on experimental.Snapshot. +func (s *snapshot) Restore(ret []uint64) { + s.ret = ret + panic(s) +} + +func (s *snapshot) doRestore() { + ce := s.ce + + ce.stack = s.stack + ce.frames = s.frames + ce.frames[len(ce.frames)-1].pc = s.pc + + copy(ce.stack[len(ce.stack)-len(s.ret):], s.ret) +} + +// Error implements the same method on error. +func (s *snapshot) Error() string { + return "unhandled snapshot restore, this generally indicates restore was called from a different " + + "exported function invocation than snapshot" +} + +// stackIterator implements experimental.StackIterator. +type stackIterator struct { + stack []uint64 + frames []*callFrame + started bool + fn *function + pc uint64 +} + +func (si *stackIterator) reset(stack []uint64, frames []*callFrame, f *function) { + si.fn = f + si.pc = 0 + si.stack = stack + si.frames = frames + si.started = false +} + +func (si *stackIterator) clear() { + si.stack = nil + si.frames = nil + si.started = false + si.fn = nil +} + +// Next implements the same method as documented on experimental.StackIterator. +func (si *stackIterator) Next() bool { + if !si.started { + si.started = true + return true + } + + if len(si.frames) == 0 { + return false + } + + frame := si.frames[len(si.frames)-1] + si.stack = si.stack[:frame.base] + si.fn = frame.f + si.pc = frame.pc + si.frames = si.frames[:len(si.frames)-1] + return true +} + +// Function implements the same method as documented on +// experimental.StackIterator. +func (si *stackIterator) Function() experimental.InternalFunction { + return internalFunction{si.fn} +} + +// ProgramCounter implements the same method as documented on +// experimental.StackIterator. +func (si *stackIterator) ProgramCounter() experimental.ProgramCounter { + return experimental.ProgramCounter(si.pc) +} + +// internalFunction implements experimental.InternalFunction. +type internalFunction struct{ *function } + +// Definition implements the same method as documented on +// experimental.InternalFunction. +func (f internalFunction) Definition() api.FunctionDefinition { + return f.definition() +} + +// SourceOffsetForPC implements the same method as documented on +// experimental.InternalFunction. +func (f internalFunction) SourceOffsetForPC(pc experimental.ProgramCounter) uint64 { + offsetsMap := f.parent.offsetsInWasmBinary + if uint64(pc) < uint64(len(offsetsMap)) { + return offsetsMap[pc] + } + return 0 +} + +// interpreter mode doesn't maintain call frames in the stack, so pass the zero size to the IR. +const callFrameStackSize = 0 + +// CompileModule implements the same method as documented on wasm.Engine. 
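For orientation, this is roughly how an embedder reaches CompileModule through wazero's public API when the interpreter engine is selected. The module path is hypothetical and error handling is kept minimal; it is a sketch, not a prescribed setup.

package main

import (
	"context"
	"log"
	"os"

	"github.com/tetratelabs/wazero"
)

func main() {
	ctx := context.Background()

	// Selecting the interpreter routes compilation through this engine.
	r := wazero.NewRuntimeWithConfig(ctx, wazero.NewRuntimeConfigInterpreter())
	defer r.Close(ctx)

	wasmBin, err := os.ReadFile("module.wasm") // hypothetical module path
	if err != nil {
		log.Fatal(err)
	}

	// Non-host functions are lowered to interpreterir once and cached per module.
	if _, err := r.CompileModule(ctx, wasmBin); err != nil {
		log.Fatal(err)
	}
}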
+func (e *engine) CompileModule(_ context.Context, module *wasm.Module, listeners []experimental.FunctionListener, ensureTermination bool) error { + if _, ok := e.getCompiledFunctions(module); ok { // cache hit! + return nil + } + + funcs := make([]compiledFunction, len(module.FunctionSection)) + irCompiler, err := newCompiler(e.enabledFeatures, callFrameStackSize, module, ensureTermination) + if err != nil { + return err + } + imported := module.ImportFunctionCount + for i := range module.CodeSection { + var lsn experimental.FunctionListener + if i < len(listeners) { + lsn = listeners[i] + } + + compiled := &funcs[i] + // If this is the host function, there's nothing to do as the runtime representation of + // host function in interpreter is its Go function itself as opposed to Wasm functions, + // which need to be compiled down to + if codeSeg := &module.CodeSection[i]; codeSeg.GoFunc != nil { + compiled.hostFn = codeSeg.GoFunc + } else { + ir, err := irCompiler.Next() + if err != nil { + return err + } + err = e.lowerIR(ir, compiled) + if err != nil { + def := module.FunctionDefinition(uint32(i) + module.ImportFunctionCount) + return fmt.Errorf("failed to lower func[%s] to interpreterir: %w", def.DebugName(), err) + } + } + compiled.source = module + compiled.ensureTermination = ensureTermination + compiled.listener = lsn + compiled.index = imported + uint32(i) + } + e.addCompiledFunctions(module, funcs) + return nil +} + +// NewModuleEngine implements the same method as documented on wasm.Engine. +func (e *engine) NewModuleEngine(module *wasm.Module, instance *wasm.ModuleInstance) (wasm.ModuleEngine, error) { + me := &moduleEngine{ + parentEngine: e, + functions: make([]function, len(module.FunctionSection)+int(module.ImportFunctionCount)), + } + + codes, ok := e.getCompiledFunctions(module) + if !ok { + return nil, errors.New("source module must be compiled before instantiation") + } + + for i := range codes { + c := &codes[i] + offset := i + int(module.ImportFunctionCount) + typeIndex := module.FunctionSection[i] + me.functions[offset] = function{ + moduleInstance: instance, + typeID: instance.TypeIDs[typeIndex], + funcType: &module.TypeSection[typeIndex], + parent: c, + } + } + return me, nil +} + +// lowerIR lowers the interpreterir operations to engine friendly struct. +func (e *engine) lowerIR(ir *compilationResult, ret *compiledFunction) error { + // Copy the body from the result. + ret.body = make([]unionOperation, len(ir.Operations)) + copy(ret.body, ir.Operations) + // Also copy the offsets if necessary. + if offsets := ir.IROperationSourceOffsetsInWasmBinary; len(offsets) > 0 { + ret.offsetsInWasmBinary = make([]uint64, len(offsets)) + copy(ret.offsetsInWasmBinary, offsets) + } + + labelAddressResolutions := [labelKindNum][]uint64{} + + // First, we iterate all labels, and resolve the address. + for i := range ret.body { + op := &ret.body[i] + switch op.Kind { + case operationKindLabel: + label := label(op.U1) + address := uint64(i) + + kind, fid := label.Kind(), label.FrameID() + frameToAddresses := labelAddressResolutions[label.Kind()] + // Expand the slice if necessary. + if diff := fid - len(frameToAddresses) + 1; diff > 0 { + for j := 0; j < diff; j++ { + frameToAddresses = append(frameToAddresses, 0) + } + } + frameToAddresses[fid] = address + labelAddressResolutions[kind] = frameToAddresses + } + } + + // Then resolve the label as the index to the body. 
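The two passes in lowerIR can be pictured with a toy body: first record the body index of every label, then rewrite each branch so its target is that index. This standalone sketch uses simplified op and label types, not the ones defined in this file.

package main

import "fmt"

type toyOp struct {
	isLabel bool
	labelID int    // which label this defines (labels) or jumps to (branches)
	target  uint64 // resolved body index, filled in for branches
}

func main() {
	body := []toyOp{
		{labelID: 1},                // a branch to label #1
		{isLabel: true, labelID: 0}, // label #0 lives at index 1
		{isLabel: true, labelID: 1}, // label #1 lives at index 2
		{labelID: 0},                // a branch to label #0
	}

	// Pass 1: record where each label lands in the body.
	addr := map[int]uint64{}
	for i, op := range body {
		if op.isLabel {
			addr[op.labelID] = uint64(i)
		}
	}
	// Pass 2: rewrite every branch target to that index.
	for i := range body {
		if !body[i].isLabel {
			body[i].target = addr[body[i].labelID]
		}
	}
	fmt.Println(body[0].target, body[3].target) // 2 1
}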
+ for i := range ret.body { + op := &ret.body[i] + switch op.Kind { + case operationKindBr: + e.setLabelAddress(&op.U1, label(op.U1), labelAddressResolutions) + case operationKindBrIf: + e.setLabelAddress(&op.U1, label(op.U1), labelAddressResolutions) + e.setLabelAddress(&op.U2, label(op.U2), labelAddressResolutions) + case operationKindBrTable: + for j := 0; j < len(op.Us); j += 2 { + target := op.Us[j] + e.setLabelAddress(&op.Us[j], label(target), labelAddressResolutions) + } + } + } + return nil +} + +func (e *engine) setLabelAddress(op *uint64, label label, labelAddressResolutions [labelKindNum][]uint64) { + if label.IsReturnTarget() { + // Jmp to the end of the possible binary. + *op = math.MaxUint64 + } else { + *op = labelAddressResolutions[label.Kind()][label.FrameID()] + } +} + +// ResolveImportedFunction implements wasm.ModuleEngine. +func (e *moduleEngine) ResolveImportedFunction(index, indexInImportedModule wasm.Index, importedModuleEngine wasm.ModuleEngine) { + imported := importedModuleEngine.(*moduleEngine) + e.functions[index] = imported.functions[indexInImportedModule] +} + +// ResolveImportedMemory implements wasm.ModuleEngine. +func (e *moduleEngine) ResolveImportedMemory(wasm.ModuleEngine) {} + +// DoneInstantiation implements wasm.ModuleEngine. +func (e *moduleEngine) DoneInstantiation() {} + +// FunctionInstanceReference implements the same method as documented on wasm.ModuleEngine. +func (e *moduleEngine) FunctionInstanceReference(funcIndex wasm.Index) wasm.Reference { + return uintptr(unsafe.Pointer(&e.functions[funcIndex])) +} + +// NewFunction implements the same method as documented on wasm.ModuleEngine. +func (e *moduleEngine) NewFunction(index wasm.Index) (ce api.Function) { + // Note: The input parameters are pre-validated, so a compiled function is only absent on close. Updates to + // code on close aren't locked, neither is this read. + compiled := &e.functions[index] + return e.newCallEngine(compiled) +} + +// LookupFunction implements the same method as documented on wasm.ModuleEngine. +func (e *moduleEngine) LookupFunction(t *wasm.TableInstance, typeId wasm.FunctionTypeID, tableOffset wasm.Index) (*wasm.ModuleInstance, wasm.Index) { + if tableOffset >= uint32(len(t.References)) { + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + } + rawPtr := t.References[tableOffset] + if rawPtr == 0 { + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + } + + tf := functionFromUintptr(rawPtr) + if tf.typeID != typeId { + panic(wasmruntime.ErrRuntimeIndirectCallTypeMismatch) + } + return tf.moduleInstance, tf.parent.index +} + +// Definition implements the same method as documented on api.Function. +func (ce *callEngine) Definition() api.FunctionDefinition { + return ce.f.definition() +} + +func (f *function) definition() api.FunctionDefinition { + compiled := f.parent + return compiled.source.FunctionDefinition(compiled.index) +} + +// Call implements the same method as documented on api.Function. +func (ce *callEngine) Call(ctx context.Context, params ...uint64) (results []uint64, err error) { + ft := ce.f.funcType + if n := ft.ParamNumInUint64; n != len(params) { + return nil, fmt.Errorf("expected %d params, but passed %d", n, len(params)) + } + return ce.call(ctx, params, nil) +} + +// CallWithStack implements the same method as documented on api.Function. 
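In practice the entry points above are reached through api.Function: parameters and results cross the boundary as raw uint64 values, one slot per i32/i64/f32/f64. A minimal sketch, assuming a hypothetical module.wasm that exports add(i32, i32) -> i32:

package main

import (
	"context"
	"fmt"
	"log"
	"os"

	"github.com/tetratelabs/wazero"
)

func main() {
	ctx := context.Background()
	r := wazero.NewRuntimeWithConfig(ctx, wazero.NewRuntimeConfigInterpreter())
	defer r.Close(ctx)

	wasmBin, err := os.ReadFile("module.wasm") // hypothetical module exporting "add"
	if err != nil {
		log.Fatal(err)
	}
	mod, err := r.Instantiate(ctx, wasmBin)
	if err != nil {
		log.Fatal(err)
	}

	// Call copies the results out of the value stack, so the returned slice is safe to keep.
	results, err := mod.ExportedFunction("add").Call(ctx, 2, 3)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(results[0]) // 5
}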
+func (ce *callEngine) CallWithStack(ctx context.Context, stack []uint64) error { + params, results, err := wasm.SplitCallStack(ce.f.funcType, stack) + if err != nil { + return err + } + _, err = ce.call(ctx, params, results) + return err +} + +func (ce *callEngine) call(ctx context.Context, params, results []uint64) (_ []uint64, err error) { + m := ce.f.moduleInstance + if ce.f.parent.ensureTermination { + select { + case <-ctx.Done(): + // If the provided context is already done, close the call context + // and return the error. + m.CloseWithCtxErr(ctx) + return nil, m.FailIfClosed() + default: + } + } + + if ctx.Value(expctxkeys.EnableSnapshotterKey{}) != nil { + ctx = context.WithValue(ctx, expctxkeys.SnapshotterKey{}, ce) + } + + defer func() { + // If the module closed during the call, and the call didn't err for another reason, set an ExitError. + if err == nil { + err = m.FailIfClosed() + } + // TODO: ^^ Will not fail if the function was imported from a closed module. + + if v := recover(); v != nil { + err = ce.recoverOnCall(ctx, m, v) + } + }() + + ce.pushValues(params) + + if ce.f.parent.ensureTermination { + done := m.CloseModuleOnCanceledOrTimeout(ctx) + defer done() + } + + ce.callFunction(ctx, m, ce.f) + + // This returns a safe copy of the results, instead of a slice view. If we + // returned a re-slice, the caller could accidentally or purposefully + // corrupt the stack of subsequent calls. + ft := ce.f.funcType + if results == nil && ft.ResultNumInUint64 > 0 { + results = make([]uint64, ft.ResultNumInUint64) + } + ce.popValues(results) + return results, nil +} + +// functionListenerInvocation captures arguments needed to perform function +// listener invocations when unwinding the call stack. +type functionListenerInvocation struct { + experimental.FunctionListener + def api.FunctionDefinition +} + +// recoverOnCall takes the recovered value `recoverOnCall`, and wraps it +// with the call frame stack traces. Also, reset the state of callEngine +// so that it can be used for the subsequent calls. +func (ce *callEngine) recoverOnCall(ctx context.Context, m *wasm.ModuleInstance, v interface{}) (err error) { + if s, ok := v.(*snapshot); ok { + // A snapshot that wasn't handled was created by a different call engine possibly from a nested wasm invocation, + // let it propagate up to be handled by the caller. + panic(s) + } + + builder := wasmdebug.NewErrorBuilder() + frameCount := len(ce.frames) + functionListeners := make([]functionListenerInvocation, 0, 16) + + if frameCount > wasmdebug.MaxFrames { + frameCount = wasmdebug.MaxFrames + } + for i := 0; i < frameCount; i++ { + frame := ce.popFrame() + f := frame.f + def := f.definition() + var sources []string + if parent := frame.f.parent; parent.body != nil && len(parent.offsetsInWasmBinary) > 0 { + sources = parent.source.DWARFLines.Line(parent.offsetsInWasmBinary[frame.pc]) + } + builder.AddFrame(def.DebugName(), def.ParamTypes(), def.ResultTypes(), sources) + if f.parent.listener != nil { + functionListeners = append(functionListeners, functionListenerInvocation{ + FunctionListener: f.parent.listener, + def: f.definition(), + }) + } + } + + err = builder.FromRecovered(v) + for i := range functionListeners { + functionListeners[i].Abort(ctx, m, functionListeners[i].def, err) + } + + // Allows the reuse of CallEngine. 
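The recovery path around this point can be summarized as: traps inside the interpreter loop are raised as panics, and the exported-call boundary turns them into an ordinary error before resetting the engine for reuse. A generic sketch of that pattern, with a made-up trap value rather than this package's error types:

package main

import (
	"errors"
	"fmt"
)

// callWithRecover mirrors the defer/recover used around ce.call: a panic raised
// by a trap becomes an ordinary error for the caller.
func callWithRecover(body func()) (err error) {
	defer func() {
		if v := recover(); v != nil {
			err = fmt.Errorf("wasm runtime error: %v", v)
		}
	}()
	body()
	return nil
}

func main() {
	err := callWithRecover(func() {
		panic(errors.New("unreachable executed")) // stand-in for a runtime trap
	})
	fmt.Println(err)
}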
+ ce.stack, ce.frames = ce.stack[:0], ce.frames[:0] + return +} + +func (ce *callEngine) callFunction(ctx context.Context, m *wasm.ModuleInstance, f *function) { + if f.parent.hostFn != nil { + ce.callGoFuncWithStack(ctx, m, f) + } else if lsn := f.parent.listener; lsn != nil { + ce.callNativeFuncWithListener(ctx, m, f, lsn) + } else { + ce.callNativeFunc(ctx, m, f) + } +} + +func (ce *callEngine) callGoFunc(ctx context.Context, m *wasm.ModuleInstance, f *function, stack []uint64) { + typ := f.funcType + lsn := f.parent.listener + if lsn != nil { + params := stack[:typ.ParamNumInUint64] + ce.stackIterator.reset(ce.stack, ce.frames, f) + lsn.Before(ctx, m, f.definition(), params, &ce.stackIterator) + ce.stackIterator.clear() + } + frame := &callFrame{f: f, base: len(ce.stack)} + ce.pushFrame(frame) + + fn := f.parent.hostFn + switch fn := fn.(type) { + case api.GoModuleFunction: + fn.Call(ctx, m, stack) + case api.GoFunction: + fn.Call(ctx, stack) + } + + ce.popFrame() + if lsn != nil { + // TODO: This doesn't get the error due to use of panic to propagate them. + results := stack[:typ.ResultNumInUint64] + lsn.After(ctx, m, f.definition(), results) + } +} + +func (ce *callEngine) callNativeFunc(ctx context.Context, m *wasm.ModuleInstance, f *function) { + frame := &callFrame{f: f, base: len(ce.stack)} + moduleInst := f.moduleInstance + functions := moduleInst.Engine.(*moduleEngine).functions + memoryInst := moduleInst.MemoryInstance + globals := moduleInst.Globals + tables := moduleInst.Tables + typeIDs := moduleInst.TypeIDs + dataInstances := moduleInst.DataInstances + elementInstances := moduleInst.ElementInstances + ce.pushFrame(frame) + body := frame.f.parent.body + bodyLen := uint64(len(body)) + for frame.pc < bodyLen { + op := &body[frame.pc] + // TODO: add description of each operation/case + // on, for example, how many args are used, + // how the stack is modified, etc. + switch op.Kind { + case operationKindBuiltinFunctionCheckExitCode: + if err := m.FailIfClosed(); err != nil { + panic(err) + } + frame.pc++ + case operationKindUnreachable: + panic(wasmruntime.ErrRuntimeUnreachable) + case operationKindBr: + frame.pc = op.U1 + case operationKindBrIf: + if ce.popValue() > 0 { + ce.drop(op.U3) + frame.pc = op.U1 + } else { + frame.pc = op.U2 + } + case operationKindBrTable: + v := ce.popValue() + defaultAt := uint64(len(op.Us))/2 - 1 + if v > defaultAt { + v = defaultAt + } + v *= 2 + ce.drop(op.Us[v+1]) + frame.pc = op.Us[v] + case operationKindCall: + func() { + if ctx.Value(expctxkeys.EnableSnapshotterKey{}) != nil { + defer func() { + if r := recover(); r != nil { + if s, ok := r.(*snapshot); ok && s.ce == ce { + s.doRestore() + frame = ce.frames[len(ce.frames)-1] + body = frame.f.parent.body + bodyLen = uint64(len(body)) + } else { + panic(r) + } + } + }() + } + ce.callFunction(ctx, f.moduleInstance, &functions[op.U1]) + }() + frame.pc++ + case operationKindCallIndirect: + offset := ce.popValue() + table := tables[op.U2] + if offset >= uint64(len(table.References)) { + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + } + rawPtr := table.References[offset] + if rawPtr == 0 { + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + } + + tf := functionFromUintptr(rawPtr) + if tf.typeID != typeIDs[op.U1] { + panic(wasmruntime.ErrRuntimeIndirectCallTypeMismatch) + } + + ce.callFunction(ctx, f.moduleInstance, tf) + frame.pc++ + case operationKindDrop: + ce.drop(op.U1) + frame.pc++ + case operationKindSelect: + c := ce.popValue() + if op.B3 { // Target is vector. 
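+				// A v128 operand spans two value-stack slots (low 64 bits below the high 64 bits),
+				// so both halves are popped and pushed as a pair.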
+ x2Hi, x2Lo := ce.popValue(), ce.popValue() + if c == 0 { + _, _ = ce.popValue(), ce.popValue() // discard the x1's lo and hi bits. + ce.pushValue(x2Lo) + ce.pushValue(x2Hi) + } + } else { + v2 := ce.popValue() + if c == 0 { + _ = ce.popValue() + ce.pushValue(v2) + } + } + frame.pc++ + case operationKindPick: + index := len(ce.stack) - 1 - int(op.U1) + ce.pushValue(ce.stack[index]) + if op.B3 { // V128 value target. + ce.pushValue(ce.stack[index+1]) + } + frame.pc++ + case operationKindSet: + if op.B3 { // V128 value target. + lowIndex := len(ce.stack) - 1 - int(op.U1) + highIndex := lowIndex + 1 + hi, lo := ce.popValue(), ce.popValue() + ce.stack[lowIndex], ce.stack[highIndex] = lo, hi + } else { + index := len(ce.stack) - 1 - int(op.U1) + ce.stack[index] = ce.popValue() + } + frame.pc++ + case operationKindGlobalGet: + g := globals[op.U1] + ce.pushValue(g.Val) + if g.Type.ValType == wasm.ValueTypeV128 { + ce.pushValue(g.ValHi) + } + frame.pc++ + case operationKindGlobalSet: + g := globals[op.U1] + if g.Type.ValType == wasm.ValueTypeV128 { + g.ValHi = ce.popValue() + } + g.Val = ce.popValue() + frame.pc++ + case operationKindLoad: + offset := ce.popMemoryOffset(op) + switch unsignedType(op.B1) { + case unsignedTypeI32, unsignedTypeF32: + if val, ok := memoryInst.ReadUint32Le(offset); !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } else { + ce.pushValue(uint64(val)) + } + case unsignedTypeI64, unsignedTypeF64: + if val, ok := memoryInst.ReadUint64Le(offset); !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } else { + ce.pushValue(val) + } + } + frame.pc++ + case operationKindLoad8: + val, ok := memoryInst.ReadByte(ce.popMemoryOffset(op)) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + + switch signedInt(op.B1) { + case signedInt32: + ce.pushValue(uint64(uint32(int8(val)))) + case signedInt64: + ce.pushValue(uint64(int8(val))) + case signedUint32, signedUint64: + ce.pushValue(uint64(val)) + } + frame.pc++ + case operationKindLoad16: + + val, ok := memoryInst.ReadUint16Le(ce.popMemoryOffset(op)) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + + switch signedInt(op.B1) { + case signedInt32: + ce.pushValue(uint64(uint32(int16(val)))) + case signedInt64: + ce.pushValue(uint64(int16(val))) + case signedUint32, signedUint64: + ce.pushValue(uint64(val)) + } + frame.pc++ + case operationKindLoad32: + val, ok := memoryInst.ReadUint32Le(ce.popMemoryOffset(op)) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + + if op.B1 == 1 { // Signed + ce.pushValue(uint64(int32(val))) + } else { + ce.pushValue(uint64(val)) + } + frame.pc++ + case operationKindStore: + val := ce.popValue() + offset := ce.popMemoryOffset(op) + switch unsignedType(op.B1) { + case unsignedTypeI32, unsignedTypeF32: + if !memoryInst.WriteUint32Le(offset, uint32(val)) { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + case unsignedTypeI64, unsignedTypeF64: + if !memoryInst.WriteUint64Le(offset, val) { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + } + frame.pc++ + case operationKindStore8: + val := byte(ce.popValue()) + offset := ce.popMemoryOffset(op) + if !memoryInst.WriteByte(offset, val) { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + frame.pc++ + case operationKindStore16: + val := uint16(ce.popValue()) + offset := ce.popMemoryOffset(op) + if !memoryInst.WriteUint16Le(offset, val) { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + frame.pc++ + case operationKindStore32: + val := 
uint32(ce.popValue()) + offset := ce.popMemoryOffset(op) + if !memoryInst.WriteUint32Le(offset, val) { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + frame.pc++ + case operationKindMemorySize: + ce.pushValue(uint64(memoryInst.Pages())) + frame.pc++ + case operationKindMemoryGrow: + n := ce.popValue() + if res, ok := memoryInst.Grow(uint32(n)); !ok { + ce.pushValue(uint64(0xffffffff)) // = -1 in signed 32-bit integer. + } else { + ce.pushValue(uint64(res)) + } + frame.pc++ + case operationKindConstI32, operationKindConstI64, + operationKindConstF32, operationKindConstF64: + ce.pushValue(op.U1) + frame.pc++ + case operationKindEq: + var b bool + switch unsignedType(op.B1) { + case unsignedTypeI32: + v2, v1 := ce.popValue(), ce.popValue() + b = uint32(v1) == uint32(v2) + case unsignedTypeI64: + v2, v1 := ce.popValue(), ce.popValue() + b = v1 == v2 + case unsignedTypeF32: + v2, v1 := ce.popValue(), ce.popValue() + b = math.Float32frombits(uint32(v2)) == math.Float32frombits(uint32(v1)) + case unsignedTypeF64: + v2, v1 := ce.popValue(), ce.popValue() + b = math.Float64frombits(v2) == math.Float64frombits(v1) + } + if b { + ce.pushValue(1) + } else { + ce.pushValue(0) + } + frame.pc++ + case operationKindNe: + var b bool + switch unsignedType(op.B1) { + case unsignedTypeI32, unsignedTypeI64: + v2, v1 := ce.popValue(), ce.popValue() + b = v1 != v2 + case unsignedTypeF32: + v2, v1 := ce.popValue(), ce.popValue() + b = math.Float32frombits(uint32(v2)) != math.Float32frombits(uint32(v1)) + case unsignedTypeF64: + v2, v1 := ce.popValue(), ce.popValue() + b = math.Float64frombits(v2) != math.Float64frombits(v1) + } + if b { + ce.pushValue(1) + } else { + ce.pushValue(0) + } + frame.pc++ + case operationKindEqz: + if ce.popValue() == 0 { + ce.pushValue(1) + } else { + ce.pushValue(0) + } + frame.pc++ + case operationKindLt: + v2 := ce.popValue() + v1 := ce.popValue() + var b bool + switch signedType(op.B1) { + case signedTypeInt32: + b = int32(v1) < int32(v2) + case signedTypeInt64: + b = int64(v1) < int64(v2) + case signedTypeUint32, signedTypeUint64: + b = v1 < v2 + case signedTypeFloat32: + b = math.Float32frombits(uint32(v1)) < math.Float32frombits(uint32(v2)) + case signedTypeFloat64: + b = math.Float64frombits(v1) < math.Float64frombits(v2) + } + if b { + ce.pushValue(1) + } else { + ce.pushValue(0) + } + frame.pc++ + case operationKindGt: + v2 := ce.popValue() + v1 := ce.popValue() + var b bool + switch signedType(op.B1) { + case signedTypeInt32: + b = int32(v1) > int32(v2) + case signedTypeInt64: + b = int64(v1) > int64(v2) + case signedTypeUint32, signedTypeUint64: + b = v1 > v2 + case signedTypeFloat32: + b = math.Float32frombits(uint32(v1)) > math.Float32frombits(uint32(v2)) + case signedTypeFloat64: + b = math.Float64frombits(v1) > math.Float64frombits(v2) + } + if b { + ce.pushValue(1) + } else { + ce.pushValue(0) + } + frame.pc++ + case operationKindLe: + v2 := ce.popValue() + v1 := ce.popValue() + var b bool + switch signedType(op.B1) { + case signedTypeInt32: + b = int32(v1) <= int32(v2) + case signedTypeInt64: + b = int64(v1) <= int64(v2) + case signedTypeUint32, signedTypeUint64: + b = v1 <= v2 + case signedTypeFloat32: + b = math.Float32frombits(uint32(v1)) <= math.Float32frombits(uint32(v2)) + case signedTypeFloat64: + b = math.Float64frombits(v1) <= math.Float64frombits(v2) + } + if b { + ce.pushValue(1) + } else { + ce.pushValue(0) + } + frame.pc++ + case operationKindGe: + v2 := ce.popValue() + v1 := ce.popValue() + var b bool + switch signedType(op.B1) { + case 
signedTypeInt32: + b = int32(v1) >= int32(v2) + case signedTypeInt64: + b = int64(v1) >= int64(v2) + case signedTypeUint32, signedTypeUint64: + b = v1 >= v2 + case signedTypeFloat32: + b = math.Float32frombits(uint32(v1)) >= math.Float32frombits(uint32(v2)) + case signedTypeFloat64: + b = math.Float64frombits(v1) >= math.Float64frombits(v2) + } + if b { + ce.pushValue(1) + } else { + ce.pushValue(0) + } + frame.pc++ + case operationKindAdd: + v2 := ce.popValue() + v1 := ce.popValue() + switch unsignedType(op.B1) { + case unsignedTypeI32: + v := uint32(v1) + uint32(v2) + ce.pushValue(uint64(v)) + case unsignedTypeI64: + ce.pushValue(v1 + v2) + case unsignedTypeF32: + ce.pushValue(addFloat32bits(uint32(v1), uint32(v2))) + case unsignedTypeF64: + v := math.Float64frombits(v1) + math.Float64frombits(v2) + ce.pushValue(math.Float64bits(v)) + } + frame.pc++ + case operationKindSub: + v2 := ce.popValue() + v1 := ce.popValue() + switch unsignedType(op.B1) { + case unsignedTypeI32: + ce.pushValue(uint64(uint32(v1) - uint32(v2))) + case unsignedTypeI64: + ce.pushValue(v1 - v2) + case unsignedTypeF32: + ce.pushValue(subFloat32bits(uint32(v1), uint32(v2))) + case unsignedTypeF64: + v := math.Float64frombits(v1) - math.Float64frombits(v2) + ce.pushValue(math.Float64bits(v)) + } + frame.pc++ + case operationKindMul: + v2 := ce.popValue() + v1 := ce.popValue() + switch unsignedType(op.B1) { + case unsignedTypeI32: + ce.pushValue(uint64(uint32(v1) * uint32(v2))) + case unsignedTypeI64: + ce.pushValue(v1 * v2) + case unsignedTypeF32: + ce.pushValue(mulFloat32bits(uint32(v1), uint32(v2))) + case unsignedTypeF64: + v := math.Float64frombits(v2) * math.Float64frombits(v1) + ce.pushValue(math.Float64bits(v)) + } + frame.pc++ + case operationKindClz: + v := ce.popValue() + if op.B1 == 0 { + // unsignedInt32 + ce.pushValue(uint64(bits.LeadingZeros32(uint32(v)))) + } else { + // unsignedInt64 + ce.pushValue(uint64(bits.LeadingZeros64(v))) + } + frame.pc++ + case operationKindCtz: + v := ce.popValue() + if op.B1 == 0 { + // unsignedInt32 + ce.pushValue(uint64(bits.TrailingZeros32(uint32(v)))) + } else { + // unsignedInt64 + ce.pushValue(uint64(bits.TrailingZeros64(v))) + } + frame.pc++ + case operationKindPopcnt: + v := ce.popValue() + if op.B1 == 0 { + // unsignedInt32 + ce.pushValue(uint64(bits.OnesCount32(uint32(v)))) + } else { + // unsignedInt64 + ce.pushValue(uint64(bits.OnesCount64(v))) + } + frame.pc++ + case operationKindDiv: + // If an integer, check we won't divide by zero. 
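+			// Signed integer division additionally traps on MinInt32/-1 and MinInt64/-1,
+			// which would overflow; float division never traps.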
+ t := signedType(op.B1) + v2, v1 := ce.popValue(), ce.popValue() + switch t { + case signedTypeFloat32, signedTypeFloat64: // not integers + default: + if v2 == 0 { + panic(wasmruntime.ErrRuntimeIntegerDivideByZero) + } + } + + switch t { + case signedTypeInt32: + d := int32(v2) + n := int32(v1) + if n == math.MinInt32 && d == -1 { + panic(wasmruntime.ErrRuntimeIntegerOverflow) + } + ce.pushValue(uint64(uint32(n / d))) + case signedTypeInt64: + d := int64(v2) + n := int64(v1) + if n == math.MinInt64 && d == -1 { + panic(wasmruntime.ErrRuntimeIntegerOverflow) + } + ce.pushValue(uint64(n / d)) + case signedTypeUint32: + d := uint32(v2) + n := uint32(v1) + ce.pushValue(uint64(n / d)) + case signedTypeUint64: + d := v2 + n := v1 + ce.pushValue(n / d) + case signedTypeFloat32: + ce.pushValue(divFloat32bits(uint32(v1), uint32(v2))) + case signedTypeFloat64: + ce.pushValue(math.Float64bits(math.Float64frombits(v1) / math.Float64frombits(v2))) + } + frame.pc++ + case operationKindRem: + v2, v1 := ce.popValue(), ce.popValue() + if v2 == 0 { + panic(wasmruntime.ErrRuntimeIntegerDivideByZero) + } + switch signedInt(op.B1) { + case signedInt32: + d := int32(v2) + n := int32(v1) + ce.pushValue(uint64(uint32(n % d))) + case signedInt64: + d := int64(v2) + n := int64(v1) + ce.pushValue(uint64(n % d)) + case signedUint32: + d := uint32(v2) + n := uint32(v1) + ce.pushValue(uint64(n % d)) + case signedUint64: + d := v2 + n := v1 + ce.pushValue(n % d) + } + frame.pc++ + case operationKindAnd: + v2 := ce.popValue() + v1 := ce.popValue() + if op.B1 == 0 { + // unsignedInt32 + ce.pushValue(uint64(uint32(v2) & uint32(v1))) + } else { + // unsignedInt64 + ce.pushValue(uint64(v2 & v1)) + } + frame.pc++ + case operationKindOr: + v2 := ce.popValue() + v1 := ce.popValue() + if op.B1 == 0 { + // unsignedInt32 + ce.pushValue(uint64(uint32(v2) | uint32(v1))) + } else { + // unsignedInt64 + ce.pushValue(uint64(v2 | v1)) + } + frame.pc++ + case operationKindXor: + v2 := ce.popValue() + v1 := ce.popValue() + if op.B1 == 0 { + // unsignedInt32 + ce.pushValue(uint64(uint32(v2) ^ uint32(v1))) + } else { + // unsignedInt64 + ce.pushValue(uint64(v2 ^ v1)) + } + frame.pc++ + case operationKindShl: + v2 := ce.popValue() + v1 := ce.popValue() + if op.B1 == 0 { + // unsignedInt32 + ce.pushValue(uint64(uint32(v1) << (uint32(v2) % 32))) + } else { + // unsignedInt64 + ce.pushValue(v1 << (v2 % 64)) + } + frame.pc++ + case operationKindShr: + v2 := ce.popValue() + v1 := ce.popValue() + switch signedInt(op.B1) { + case signedInt32: + ce.pushValue(uint64(uint32(int32(v1) >> (uint32(v2) % 32)))) + case signedInt64: + ce.pushValue(uint64(int64(v1) >> (v2 % 64))) + case signedUint32: + ce.pushValue(uint64(uint32(v1) >> (uint32(v2) % 32))) + case signedUint64: + ce.pushValue(v1 >> (v2 % 64)) + } + frame.pc++ + case operationKindRotl: + v2 := ce.popValue() + v1 := ce.popValue() + if op.B1 == 0 { + // unsignedInt32 + ce.pushValue(uint64(bits.RotateLeft32(uint32(v1), int(v2)))) + } else { + // unsignedInt64 + ce.pushValue(uint64(bits.RotateLeft64(v1, int(v2)))) + } + frame.pc++ + case operationKindRotr: + v2 := ce.popValue() + v1 := ce.popValue() + if op.B1 == 0 { + // unsignedInt32 + ce.pushValue(uint64(bits.RotateLeft32(uint32(v1), -int(v2)))) + } else { + // unsignedInt64 + ce.pushValue(uint64(bits.RotateLeft64(v1, -int(v2)))) + } + frame.pc++ + case operationKindAbs: + if op.B1 == 0 { + // float32 + const mask uint32 = 1 << 31 + ce.pushValue(uint64(uint32(ce.popValue()) &^ mask)) + } else { + // float64 + const mask uint64 = 1 << 63 + 
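+				// Clearing the IEEE 754 sign bit on the raw bits gives the absolute value
+				// without a float conversion round-trip.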
ce.pushValue(ce.popValue() &^ mask) + } + frame.pc++ + case operationKindNeg: + if op.B1 == 0 { + // float32 + v := -math.Float32frombits(uint32(ce.popValue())) + ce.pushValue(uint64(math.Float32bits(v))) + } else { + // float64 + v := -math.Float64frombits(ce.popValue()) + ce.pushValue(math.Float64bits(v)) + } + frame.pc++ + case operationKindCeil: + if op.B1 == 0 { + // float32 + v := moremath.WasmCompatCeilF32(math.Float32frombits(uint32(ce.popValue()))) + ce.pushValue(uint64(math.Float32bits(v))) + } else { + // float64 + v := moremath.WasmCompatCeilF64(math.Float64frombits(ce.popValue())) + ce.pushValue(math.Float64bits(v)) + } + frame.pc++ + case operationKindFloor: + if op.B1 == 0 { + // float32 + v := moremath.WasmCompatFloorF32(math.Float32frombits(uint32(ce.popValue()))) + ce.pushValue(uint64(math.Float32bits(v))) + } else { + // float64 + v := moremath.WasmCompatFloorF64(math.Float64frombits(ce.popValue())) + ce.pushValue(math.Float64bits(v)) + } + frame.pc++ + case operationKindTrunc: + if op.B1 == 0 { + // float32 + v := moremath.WasmCompatTruncF32(math.Float32frombits(uint32(ce.popValue()))) + ce.pushValue(uint64(math.Float32bits(v))) + } else { + // float64 + v := moremath.WasmCompatTruncF64(math.Float64frombits(ce.popValue())) + ce.pushValue(math.Float64bits(v)) + } + frame.pc++ + case operationKindNearest: + if op.B1 == 0 { + // float32 + f := math.Float32frombits(uint32(ce.popValue())) + ce.pushValue(uint64(math.Float32bits(moremath.WasmCompatNearestF32(f)))) + } else { + // float64 + f := math.Float64frombits(ce.popValue()) + ce.pushValue(math.Float64bits(moremath.WasmCompatNearestF64(f))) + } + frame.pc++ + case operationKindSqrt: + if op.B1 == 0 { + // float32 + v := math.Sqrt(float64(math.Float32frombits(uint32(ce.popValue())))) + ce.pushValue(uint64(math.Float32bits(float32(v)))) + } else { + // float64 + v := math.Sqrt(math.Float64frombits(ce.popValue())) + ce.pushValue(math.Float64bits(v)) + } + frame.pc++ + case operationKindMin: + if op.B1 == 0 { + // float32 + ce.pushValue(wasmCompatMin32bits(uint32(ce.popValue()), uint32(ce.popValue()))) + } else { + v2 := math.Float64frombits(ce.popValue()) + v1 := math.Float64frombits(ce.popValue()) + ce.pushValue(math.Float64bits(moremath.WasmCompatMin64(v1, v2))) + } + frame.pc++ + case operationKindMax: + if op.B1 == 0 { + ce.pushValue(wasmCompatMax32bits(uint32(ce.popValue()), uint32(ce.popValue()))) + } else { + // float64 + v2 := math.Float64frombits(ce.popValue()) + v1 := math.Float64frombits(ce.popValue()) + ce.pushValue(math.Float64bits(moremath.WasmCompatMax64(v1, v2))) + } + frame.pc++ + case operationKindCopysign: + if op.B1 == 0 { + // float32 + v2 := uint32(ce.popValue()) + v1 := uint32(ce.popValue()) + const signbit = 1 << 31 + ce.pushValue(uint64(v1&^signbit | v2&signbit)) + } else { + // float64 + v2 := ce.popValue() + v1 := ce.popValue() + const signbit = 1 << 63 + ce.pushValue(v1&^signbit | v2&signbit) + } + frame.pc++ + case operationKindI32WrapFromI64: + ce.pushValue(uint64(uint32(ce.popValue()))) + frame.pc++ + case operationKindITruncFromF: + if op.B1 == 0 { + // float32 + switch signedInt(op.B2) { + case signedInt32: + v := math.Trunc(float64(math.Float32frombits(uint32(ce.popValue())))) + if math.IsNaN(v) { // NaN cannot be compared with themselves, so we have to use IsNaN + if op.B3 { + // non-trapping conversion must cast nan to zero. 
+ v = 0 + } else { + panic(wasmruntime.ErrRuntimeInvalidConversionToInteger) + } + } else if v < math.MinInt32 || v > math.MaxInt32 { + if op.B3 { + // non-trapping conversion must "saturate" the value for overflowing sources. + if v < 0 { + v = math.MinInt32 + } else { + v = math.MaxInt32 + } + } else { + panic(wasmruntime.ErrRuntimeIntegerOverflow) + } + } + ce.pushValue(uint64(uint32(int32(v)))) + case signedInt64: + v := math.Trunc(float64(math.Float32frombits(uint32(ce.popValue())))) + res := int64(v) + if math.IsNaN(v) { // NaN cannot be compared with themselves, so we have to use IsNaN + if op.B3 { + // non-trapping conversion must cast nan to zero. + res = 0 + } else { + panic(wasmruntime.ErrRuntimeInvalidConversionToInteger) + } + } else if v < math.MinInt64 || v >= math.MaxInt64 { + // Note: math.MaxInt64 is rounded up to math.MaxInt64+1 in 64-bit float representation, + // and that's why we use '>=' not '>' to check overflow. + if op.B3 { + // non-trapping conversion must "saturate" the value for overflowing sources. + if v < 0 { + res = math.MinInt64 + } else { + res = math.MaxInt64 + } + } else { + panic(wasmruntime.ErrRuntimeIntegerOverflow) + } + } + ce.pushValue(uint64(res)) + case signedUint32: + v := math.Trunc(float64(math.Float32frombits(uint32(ce.popValue())))) + if math.IsNaN(v) { // NaN cannot be compared with themselves, so we have to use IsNaN + if op.B3 { + // non-trapping conversion must cast nan to zero. + v = 0 + } else { + panic(wasmruntime.ErrRuntimeInvalidConversionToInteger) + } + } else if v < 0 || v > math.MaxUint32 { + if op.B3 { + // non-trapping conversion must "saturate" the value for overflowing source. + if v < 0 { + v = 0 + } else { + v = math.MaxUint32 + } + } else { + panic(wasmruntime.ErrRuntimeIntegerOverflow) + } + } + ce.pushValue(uint64(uint32(v))) + case signedUint64: + v := math.Trunc(float64(math.Float32frombits(uint32(ce.popValue())))) + res := uint64(v) + if math.IsNaN(v) { // NaN cannot be compared with themselves, so we have to use IsNaN + if op.B3 { + // non-trapping conversion must cast nan to zero. + res = 0 + } else { + panic(wasmruntime.ErrRuntimeInvalidConversionToInteger) + } + } else if v < 0 || v >= math.MaxUint64 { + // Note: math.MaxUint64 is rounded up to math.MaxUint64+1 in 64-bit float representation, + // and that's why we use '>=' not '>' to check overflow. + if op.B3 { + // non-trapping conversion must "saturate" the value for overflowing source. + if v < 0 { + res = 0 + } else { + res = math.MaxUint64 + } + } else { + panic(wasmruntime.ErrRuntimeIntegerOverflow) + } + } + ce.pushValue(res) + } + } else { + // float64 + switch signedInt(op.B2) { + case signedInt32: + v := math.Trunc(math.Float64frombits(ce.popValue())) + if math.IsNaN(v) { // NaN cannot be compared with themselves, so we have to use IsNaN + if op.B3 { + // non-trapping conversion must cast nan to zero. + v = 0 + } else { + panic(wasmruntime.ErrRuntimeInvalidConversionToInteger) + } + } else if v < math.MinInt32 || v > math.MaxInt32 { + if op.B3 { + // non-trapping conversion must "saturate" the value for overflowing source. + if v < 0 { + v = math.MinInt32 + } else { + v = math.MaxInt32 + } + } else { + panic(wasmruntime.ErrRuntimeIntegerOverflow) + } + } + ce.pushValue(uint64(uint32(int32(v)))) + case signedInt64: + v := math.Trunc(math.Float64frombits(ce.popValue())) + res := int64(v) + if math.IsNaN(v) { // NaN cannot be compared with themselves, so we have to use IsNaN + if op.B3 { + // non-trapping conversion must cast nan to zero. 
+ res = 0 + } else { + panic(wasmruntime.ErrRuntimeInvalidConversionToInteger) + } + } else if v < math.MinInt64 || v >= math.MaxInt64 { + // Note: math.MaxInt64 is rounded up to math.MaxInt64+1 in 64-bit float representation, + // and that's why we use '>=' not '>' to check overflow. + if op.B3 { + // non-trapping conversion must "saturate" the value for overflowing source. + if v < 0 { + res = math.MinInt64 + } else { + res = math.MaxInt64 + } + } else { + panic(wasmruntime.ErrRuntimeIntegerOverflow) + } + } + ce.pushValue(uint64(res)) + case signedUint32: + v := math.Trunc(math.Float64frombits(ce.popValue())) + if math.IsNaN(v) { // NaN cannot be compared with themselves, so we have to use IsNaN + if op.B3 { + // non-trapping conversion must cast nan to zero. + v = 0 + } else { + panic(wasmruntime.ErrRuntimeInvalidConversionToInteger) + } + } else if v < 0 || v > math.MaxUint32 { + if op.B3 { + // non-trapping conversion must "saturate" the value for overflowing source. + if v < 0 { + v = 0 + } else { + v = math.MaxUint32 + } + } else { + panic(wasmruntime.ErrRuntimeIntegerOverflow) + } + } + ce.pushValue(uint64(uint32(v))) + case signedUint64: + v := math.Trunc(math.Float64frombits(ce.popValue())) + res := uint64(v) + if math.IsNaN(v) { // NaN cannot be compared with themselves, so we have to use IsNaN + if op.B3 { + // non-trapping conversion must cast nan to zero. + res = 0 + } else { + panic(wasmruntime.ErrRuntimeInvalidConversionToInteger) + } + } else if v < 0 || v >= math.MaxUint64 { + // Note: math.MaxUint64 is rounded up to math.MaxUint64+1 in 64-bit float representation, + // and that's why we use '>=' not '>' to check overflow. + if op.B3 { + // non-trapping conversion must "saturate" the value for overflowing source. + if v < 0 { + res = 0 + } else { + res = math.MaxUint64 + } + } else { + panic(wasmruntime.ErrRuntimeIntegerOverflow) + } + } + ce.pushValue(res) + } + } + frame.pc++ + case operationKindFConvertFromI: + switch signedInt(op.B1) { + case signedInt32: + if op.B2 == 0 { + // float32 + v := float32(int32(ce.popValue())) + ce.pushValue(uint64(math.Float32bits(v))) + } else { + // float64 + v := float64(int32(ce.popValue())) + ce.pushValue(math.Float64bits(v)) + } + case signedInt64: + if op.B2 == 0 { + // float32 + v := float32(int64(ce.popValue())) + ce.pushValue(uint64(math.Float32bits(v))) + } else { + // float64 + v := float64(int64(ce.popValue())) + ce.pushValue(math.Float64bits(v)) + } + case signedUint32: + if op.B2 == 0 { + // float32 + v := float32(uint32(ce.popValue())) + ce.pushValue(uint64(math.Float32bits(v))) + } else { + // float64 + v := float64(uint32(ce.popValue())) + ce.pushValue(math.Float64bits(v)) + } + case signedUint64: + if op.B2 == 0 { + // float32 + v := float32(ce.popValue()) + ce.pushValue(uint64(math.Float32bits(v))) + } else { + // float64 + v := float64(ce.popValue()) + ce.pushValue(math.Float64bits(v)) + } + } + frame.pc++ + case operationKindF32DemoteFromF64: + v := float32(math.Float64frombits(ce.popValue())) + ce.pushValue(uint64(math.Float32bits(v))) + frame.pc++ + case operationKindF64PromoteFromF32: + v := float64(math.Float32frombits(uint32(ce.popValue()))) + ce.pushValue(math.Float64bits(v)) + frame.pc++ + case operationKindExtend: + if op.B1 == 1 { + // Signed. 
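+				// i64.extend_i32_s: reinterpret the low 32 bits as int32, then sign-extend to 64 bits.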
+ v := int64(int32(ce.popValue())) + ce.pushValue(uint64(v)) + } else { + v := uint64(uint32(ce.popValue())) + ce.pushValue(v) + } + frame.pc++ + case operationKindSignExtend32From8: + v := uint32(int8(ce.popValue())) + ce.pushValue(uint64(v)) + frame.pc++ + case operationKindSignExtend32From16: + v := uint32(int16(ce.popValue())) + ce.pushValue(uint64(v)) + frame.pc++ + case operationKindSignExtend64From8: + v := int64(int8(ce.popValue())) + ce.pushValue(uint64(v)) + frame.pc++ + case operationKindSignExtend64From16: + v := int64(int16(ce.popValue())) + ce.pushValue(uint64(v)) + frame.pc++ + case operationKindSignExtend64From32: + v := int64(int32(ce.popValue())) + ce.pushValue(uint64(v)) + frame.pc++ + case operationKindMemoryInit: + dataInstance := dataInstances[op.U1] + copySize := ce.popValue() + inDataOffset := ce.popValue() + inMemoryOffset := ce.popValue() + if inDataOffset+copySize > uint64(len(dataInstance)) || + inMemoryOffset+copySize > uint64(len(memoryInst.Buffer)) { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } else if copySize != 0 { + copy(memoryInst.Buffer[inMemoryOffset:inMemoryOffset+copySize], dataInstance[inDataOffset:]) + } + frame.pc++ + case operationKindDataDrop: + dataInstances[op.U1] = nil + frame.pc++ + case operationKindMemoryCopy: + memLen := uint64(len(memoryInst.Buffer)) + copySize := ce.popValue() + sourceOffset := ce.popValue() + destinationOffset := ce.popValue() + if sourceOffset+copySize > memLen || destinationOffset+copySize > memLen { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } else if copySize != 0 { + copy(memoryInst.Buffer[destinationOffset:], + memoryInst.Buffer[sourceOffset:sourceOffset+copySize]) + } + frame.pc++ + case operationKindMemoryFill: + fillSize := ce.popValue() + value := byte(ce.popValue()) + offset := ce.popValue() + if fillSize+offset > uint64(len(memoryInst.Buffer)) { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } else if fillSize != 0 { + // Uses the copy trick for faster filling buffer. 
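+				// Each pass doubles the initialized prefix (1, 2, 4, ... bytes), so an
+				// n-byte fill needs only O(log n) calls to copy. See: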
+ // https://gist.github.com/taylorza/df2f89d5f9ab3ffd06865062a4cf015d + buf := memoryInst.Buffer[offset : offset+fillSize] + buf[0] = value + for i := 1; i < len(buf); i *= 2 { + copy(buf[i:], buf[:i]) + } + } + frame.pc++ + case operationKindTableInit: + elementInstance := elementInstances[op.U1] + copySize := ce.popValue() + inElementOffset := ce.popValue() + inTableOffset := ce.popValue() + table := tables[op.U2] + if inElementOffset+copySize > uint64(len(elementInstance)) || + inTableOffset+copySize > uint64(len(table.References)) { + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + } else if copySize != 0 { + copy(table.References[inTableOffset:inTableOffset+copySize], elementInstance[inElementOffset:]) + } + frame.pc++ + case operationKindElemDrop: + elementInstances[op.U1] = nil + frame.pc++ + case operationKindTableCopy: + srcTable, dstTable := tables[op.U1].References, tables[op.U2].References + copySize := ce.popValue() + sourceOffset := ce.popValue() + destinationOffset := ce.popValue() + if sourceOffset+copySize > uint64(len(srcTable)) || destinationOffset+copySize > uint64(len(dstTable)) { + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + } else if copySize != 0 { + copy(dstTable[destinationOffset:], srcTable[sourceOffset:sourceOffset+copySize]) + } + frame.pc++ + case operationKindRefFunc: + ce.pushValue(uint64(uintptr(unsafe.Pointer(&functions[op.U1])))) + frame.pc++ + case operationKindTableGet: + table := tables[op.U1] + + offset := ce.popValue() + if offset >= uint64(len(table.References)) { + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + } + + ce.pushValue(uint64(table.References[offset])) + frame.pc++ + case operationKindTableSet: + table := tables[op.U1] + ref := ce.popValue() + + offset := ce.popValue() + if offset >= uint64(len(table.References)) { + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + } + + table.References[offset] = uintptr(ref) // externrefs are opaque uint64. + frame.pc++ + case operationKindTableSize: + table := tables[op.U1] + ce.pushValue(uint64(len(table.References))) + frame.pc++ + case operationKindTableGrow: + table := tables[op.U1] + num, ref := ce.popValue(), ce.popValue() + ret := table.Grow(uint32(num), uintptr(ref)) + ce.pushValue(uint64(ret)) + frame.pc++ + case operationKindTableFill: + table := tables[op.U1] + num := ce.popValue() + ref := uintptr(ce.popValue()) + offset := ce.popValue() + if num+offset > uint64(len(table.References)) { + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + } else if num > 0 { + // Uses the copy trick for faster filling the region with the value. 
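+				// As with memory.fill above, the initialized region doubles each pass,
+				// so the loop is logarithmic in num. See: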
+ // https://gist.github.com/taylorza/df2f89d5f9ab3ffd06865062a4cf015d + targetRegion := table.References[offset : offset+num] + targetRegion[0] = ref + for i := 1; i < len(targetRegion); i *= 2 { + copy(targetRegion[i:], targetRegion[:i]) + } + } + frame.pc++ + case operationKindV128Const: + lo, hi := op.U1, op.U2 + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128Add: + yHigh, yLow := ce.popValue(), ce.popValue() + xHigh, xLow := ce.popValue(), ce.popValue() + switch op.B1 { + case shapeI8x16: + ce.pushValue( + uint64(uint8(xLow>>8)+uint8(yLow>>8))<<8 | uint64(uint8(xLow)+uint8(yLow)) | + uint64(uint8(xLow>>24)+uint8(yLow>>24))<<24 | uint64(uint8(xLow>>16)+uint8(yLow>>16))<<16 | + uint64(uint8(xLow>>40)+uint8(yLow>>40))<<40 | uint64(uint8(xLow>>32)+uint8(yLow>>32))<<32 | + uint64(uint8(xLow>>56)+uint8(yLow>>56))<<56 | uint64(uint8(xLow>>48)+uint8(yLow>>48))<<48, + ) + ce.pushValue( + uint64(uint8(xHigh>>8)+uint8(yHigh>>8))<<8 | uint64(uint8(xHigh)+uint8(yHigh)) | + uint64(uint8(xHigh>>24)+uint8(yHigh>>24))<<24 | uint64(uint8(xHigh>>16)+uint8(yHigh>>16))<<16 | + uint64(uint8(xHigh>>40)+uint8(yHigh>>40))<<40 | uint64(uint8(xHigh>>32)+uint8(yHigh>>32))<<32 | + uint64(uint8(xHigh>>56)+uint8(yHigh>>56))<<56 | uint64(uint8(xHigh>>48)+uint8(yHigh>>48))<<48, + ) + case shapeI16x8: + ce.pushValue( + uint64(uint16(xLow>>16+yLow>>16))<<16 | uint64(uint16(xLow)+uint16(yLow)) | + uint64(uint16(xLow>>48+yLow>>48))<<48 | uint64(uint16(xLow>>32+yLow>>32))<<32, + ) + ce.pushValue( + uint64(uint16(xHigh>>16)+uint16(yHigh>>16))<<16 | uint64(uint16(xHigh)+uint16(yHigh)) | + uint64(uint16(xHigh>>48)+uint16(yHigh>>48))<<48 | uint64(uint16(xHigh>>32)+uint16(yHigh>>32))<<32, + ) + case shapeI32x4: + ce.pushValue(uint64(uint32(xLow>>32)+uint32(yLow>>32))<<32 | uint64(uint32(xLow)+uint32(yLow))) + ce.pushValue(uint64(uint32(xHigh>>32)+uint32(yHigh>>32))<<32 | uint64(uint32(xHigh)+uint32(yHigh))) + case shapeI64x2: + ce.pushValue(xLow + yLow) + ce.pushValue(xHigh + yHigh) + case shapeF32x4: + ce.pushValue( + addFloat32bits(uint32(xLow), uint32(yLow)) | addFloat32bits(uint32(xLow>>32), uint32(yLow>>32))<<32, + ) + ce.pushValue( + addFloat32bits(uint32(xHigh), uint32(yHigh)) | addFloat32bits(uint32(xHigh>>32), uint32(yHigh>>32))<<32, + ) + case shapeF64x2: + ce.pushValue(math.Float64bits(math.Float64frombits(xLow) + math.Float64frombits(yLow))) + ce.pushValue(math.Float64bits(math.Float64frombits(xHigh) + math.Float64frombits(yHigh))) + } + frame.pc++ + case operationKindV128Sub: + yHigh, yLow := ce.popValue(), ce.popValue() + xHigh, xLow := ce.popValue(), ce.popValue() + switch op.B1 { + case shapeI8x16: + ce.pushValue( + uint64(uint8(xLow>>8)-uint8(yLow>>8))<<8 | uint64(uint8(xLow)-uint8(yLow)) | + uint64(uint8(xLow>>24)-uint8(yLow>>24))<<24 | uint64(uint8(xLow>>16)-uint8(yLow>>16))<<16 | + uint64(uint8(xLow>>40)-uint8(yLow>>40))<<40 | uint64(uint8(xLow>>32)-uint8(yLow>>32))<<32 | + uint64(uint8(xLow>>56)-uint8(yLow>>56))<<56 | uint64(uint8(xLow>>48)-uint8(yLow>>48))<<48, + ) + ce.pushValue( + uint64(uint8(xHigh>>8)-uint8(yHigh>>8))<<8 | uint64(uint8(xHigh)-uint8(yHigh)) | + uint64(uint8(xHigh>>24)-uint8(yHigh>>24))<<24 | uint64(uint8(xHigh>>16)-uint8(yHigh>>16))<<16 | + uint64(uint8(xHigh>>40)-uint8(yHigh>>40))<<40 | uint64(uint8(xHigh>>32)-uint8(yHigh>>32))<<32 | + uint64(uint8(xHigh>>56)-uint8(yHigh>>56))<<56 | uint64(uint8(xHigh>>48)-uint8(yHigh>>48))<<48, + ) + case shapeI16x8: + ce.pushValue( + uint64(uint16(xLow>>16)-uint16(yLow>>16))<<16 | uint64(uint16(xLow)-uint16(yLow)) | + 
uint64(uint16(xLow>>48)-uint16(yLow>>48))<<48 | uint64(uint16(xLow>>32)-uint16(yLow>>32))<<32, + ) + ce.pushValue( + uint64(uint16(xHigh>>16)-uint16(yHigh>>16))<<16 | uint64(uint16(xHigh)-uint16(yHigh)) | + uint64(uint16(xHigh>>48)-uint16(yHigh>>48))<<48 | uint64(uint16(xHigh>>32)-uint16(yHigh>>32))<<32, + ) + case shapeI32x4: + ce.pushValue(uint64(uint32(xLow>>32-yLow>>32))<<32 | uint64(uint32(xLow)-uint32(yLow))) + ce.pushValue(uint64(uint32(xHigh>>32-yHigh>>32))<<32 | uint64(uint32(xHigh)-uint32(yHigh))) + case shapeI64x2: + ce.pushValue(xLow - yLow) + ce.pushValue(xHigh - yHigh) + case shapeF32x4: + ce.pushValue( + subFloat32bits(uint32(xLow), uint32(yLow)) | subFloat32bits(uint32(xLow>>32), uint32(yLow>>32))<<32, + ) + ce.pushValue( + subFloat32bits(uint32(xHigh), uint32(yHigh)) | subFloat32bits(uint32(xHigh>>32), uint32(yHigh>>32))<<32, + ) + case shapeF64x2: + ce.pushValue(math.Float64bits(math.Float64frombits(xLow) - math.Float64frombits(yLow))) + ce.pushValue(math.Float64bits(math.Float64frombits(xHigh) - math.Float64frombits(yHigh))) + } + frame.pc++ + case operationKindV128Load: + offset := ce.popMemoryOffset(op) + switch op.B1 { + case v128LoadType128: + lo, ok := memoryInst.ReadUint64Le(offset) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(lo) + hi, ok := memoryInst.ReadUint64Le(offset + 8) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(hi) + case v128LoadType8x8s: + data, ok := memoryInst.Read(offset, 8) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue( + uint64(uint16(int8(data[3])))<<48 | uint64(uint16(int8(data[2])))<<32 | uint64(uint16(int8(data[1])))<<16 | uint64(uint16(int8(data[0]))), + ) + ce.pushValue( + uint64(uint16(int8(data[7])))<<48 | uint64(uint16(int8(data[6])))<<32 | uint64(uint16(int8(data[5])))<<16 | uint64(uint16(int8(data[4]))), + ) + case v128LoadType8x8u: + data, ok := memoryInst.Read(offset, 8) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue( + uint64(data[3])<<48 | uint64(data[2])<<32 | uint64(data[1])<<16 | uint64(data[0]), + ) + ce.pushValue( + uint64(data[7])<<48 | uint64(data[6])<<32 | uint64(data[5])<<16 | uint64(data[4]), + ) + case v128LoadType16x4s: + data, ok := memoryInst.Read(offset, 8) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue( + uint64(int16(binary.LittleEndian.Uint16(data[2:])))<<32 | + uint64(uint32(int16(binary.LittleEndian.Uint16(data)))), + ) + ce.pushValue( + uint64(uint32(int16(binary.LittleEndian.Uint16(data[6:]))))<<32 | + uint64(uint32(int16(binary.LittleEndian.Uint16(data[4:])))), + ) + case v128LoadType16x4u: + data, ok := memoryInst.Read(offset, 8) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue( + uint64(binary.LittleEndian.Uint16(data[2:]))<<32 | uint64(binary.LittleEndian.Uint16(data)), + ) + ce.pushValue( + uint64(binary.LittleEndian.Uint16(data[6:]))<<32 | uint64(binary.LittleEndian.Uint16(data[4:])), + ) + case v128LoadType32x2s: + data, ok := memoryInst.Read(offset, 8) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(uint64(int32(binary.LittleEndian.Uint32(data)))) + ce.pushValue(uint64(int32(binary.LittleEndian.Uint32(data[4:])))) + case v128LoadType32x2u: + data, ok := memoryInst.Read(offset, 8) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(uint64(binary.LittleEndian.Uint32(data))) + 
ce.pushValue(uint64(binary.LittleEndian.Uint32(data[4:]))) + case v128LoadType8Splat: + v, ok := memoryInst.ReadByte(offset) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + v8 := uint64(v)<<56 | uint64(v)<<48 | uint64(v)<<40 | uint64(v)<<32 | + uint64(v)<<24 | uint64(v)<<16 | uint64(v)<<8 | uint64(v) + ce.pushValue(v8) + ce.pushValue(v8) + case v128LoadType16Splat: + v, ok := memoryInst.ReadUint16Le(offset) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + v4 := uint64(v)<<48 | uint64(v)<<32 | uint64(v)<<16 | uint64(v) + ce.pushValue(v4) + ce.pushValue(v4) + case v128LoadType32Splat: + v, ok := memoryInst.ReadUint32Le(offset) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + vv := uint64(v)<<32 | uint64(v) + ce.pushValue(vv) + ce.pushValue(vv) + case v128LoadType64Splat: + lo, ok := memoryInst.ReadUint64Le(offset) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(lo) + ce.pushValue(lo) + case v128LoadType32zero: + lo, ok := memoryInst.ReadUint32Le(offset) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(uint64(lo)) + ce.pushValue(0) + case v128LoadType64zero: + lo, ok := memoryInst.ReadUint64Le(offset) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(lo) + ce.pushValue(0) + } + frame.pc++ + case operationKindV128LoadLane: + hi, lo := ce.popValue(), ce.popValue() + offset := ce.popMemoryOffset(op) + switch op.B1 { + case 8: + b, ok := memoryInst.ReadByte(offset) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + if op.B2 < 8 { + s := op.B2 << 3 + lo = (lo & ^(0xff << s)) | uint64(b)<<s + } else { + s := (op.B2 - 8) << 3 + hi = (hi & ^(0xff << s)) | uint64(b)<<s + } + case 16: + b, ok := memoryInst.ReadUint16Le(offset) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + if op.B2 < 4 { + s := op.B2 << 4 + lo = (lo & ^(0xff_ff << s)) | uint64(b)<<s + } else { + s := (op.B2 - 4) << 4 + hi = (hi & ^(0xff_ff << s)) | uint64(b)<<s + } + case 32: + b, ok := memoryInst.ReadUint32Le(offset) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + if op.B2 < 2 { + s := op.B2 << 5 + lo = (lo & ^(0xff_ff_ff_ff << s)) | uint64(b)<<s + } else { + s := (op.B2 - 2) << 5 + hi = (hi & ^(0xff_ff_ff_ff << s)) | uint64(b)<<s + } + case 64: + b, ok := memoryInst.ReadUint64Le(offset) + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + if op.B2 == 0 { + lo = b + } else { + hi = b + } + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128Store: + hi, lo := ce.popValue(), ce.popValue() + offset := ce.popMemoryOffset(op) + // Write the upper bytes first to trigger an early error if the memory access is out of bounds. + // Otherwise, the lower bytes might be written to memory, but the upper bytes might not. 
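+			// The offset+8 bound is computed in uint64 so that a 32-bit wrap-around
+			// cannot slip past the check below.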
+ if uint64(offset)+8 > math.MaxUint32 { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + if ok := memoryInst.WriteUint64Le(offset+8, hi); !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + if ok := memoryInst.WriteUint64Le(offset, lo); !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + frame.pc++ + case operationKindV128StoreLane: + hi, lo := ce.popValue(), ce.popValue() + offset := ce.popMemoryOffset(op) + var ok bool + switch op.B1 { + case 8: + if op.B2 < 8 { + ok = memoryInst.WriteByte(offset, byte(lo>>(op.B2*8))) + } else { + ok = memoryInst.WriteByte(offset, byte(hi>>((op.B2-8)*8))) + } + case 16: + if op.B2 < 4 { + ok = memoryInst.WriteUint16Le(offset, uint16(lo>>(op.B2*16))) + } else { + ok = memoryInst.WriteUint16Le(offset, uint16(hi>>((op.B2-4)*16))) + } + case 32: + if op.B2 < 2 { + ok = memoryInst.WriteUint32Le(offset, uint32(lo>>(op.B2*32))) + } else { + ok = memoryInst.WriteUint32Le(offset, uint32(hi>>((op.B2-2)*32))) + } + case 64: + if op.B2 == 0 { + ok = memoryInst.WriteUint64Le(offset, lo) + } else { + ok = memoryInst.WriteUint64Le(offset, hi) + } + } + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + frame.pc++ + case operationKindV128ReplaceLane: + v := ce.popValue() + hi, lo := ce.popValue(), ce.popValue() + switch op.B1 { + case shapeI8x16: + if op.B2 < 8 { + s := op.B2 << 3 + lo = (lo & ^(0xff << s)) | uint64(byte(v))<<s + } else { + s := (op.B2 - 8) << 3 + hi = (hi & ^(0xff << s)) | uint64(byte(v))<<s + } + case shapeI16x8: + if op.B2 < 4 { + s := op.B2 << 4 + lo = (lo & ^(0xff_ff << s)) | uint64(uint16(v))<<s + } else { + s := (op.B2 - 4) << 4 + hi = (hi & ^(0xff_ff << s)) | uint64(uint16(v))<<s + } + case shapeI32x4, shapeF32x4: + if op.B2 < 2 { + s := op.B2 << 5 + lo = (lo & ^(0xff_ff_ff_ff << s)) | uint64(uint32(v))<<s + } else { + s := (op.B2 - 2) << 5 + hi = (hi & ^(0xff_ff_ff_ff << s)) | uint64(uint32(v))<<s + } + case shapeI64x2, shapeF64x2: + if op.B2 == 0 { + lo = v + } else { + hi = v + } + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128ExtractLane: + hi, lo := ce.popValue(), ce.popValue() + var v uint64 + switch op.B1 { + case shapeI8x16: + var u8 byte + if op.B2 < 8 { + u8 = byte(lo >> (op.B2 * 8)) + } else { + u8 = byte(hi >> ((op.B2 - 8) * 8)) + } + if op.B3 { + // sign-extend. + v = uint64(uint32(int8(u8))) + } else { + v = uint64(u8) + } + case shapeI16x8: + var u16 uint16 + if op.B2 < 4 { + u16 = uint16(lo >> (op.B2 * 16)) + } else { + u16 = uint16(hi >> ((op.B2 - 4) * 16)) + } + if op.B3 { + // sign-extend. 
+ v = uint64(uint32(int16(u16))) + } else { + v = uint64(u16) + } + case shapeI32x4, shapeF32x4: + if op.B2 < 2 { + v = uint64(uint32(lo >> (op.B2 * 32))) + } else { + v = uint64(uint32(hi >> ((op.B2 - 2) * 32))) + } + case shapeI64x2, shapeF64x2: + if op.B2 == 0 { + v = lo + } else { + v = hi + } + } + ce.pushValue(v) + frame.pc++ + case operationKindV128Splat: + v := ce.popValue() + var hi, lo uint64 + switch op.B1 { + case shapeI8x16: + v8 := uint64(byte(v))<<56 | uint64(byte(v))<<48 | uint64(byte(v))<<40 | uint64(byte(v))<<32 | + uint64(byte(v))<<24 | uint64(byte(v))<<16 | uint64(byte(v))<<8 | uint64(byte(v)) + hi, lo = v8, v8 + case shapeI16x8: + v4 := uint64(uint16(v))<<48 | uint64(uint16(v))<<32 | uint64(uint16(v))<<16 | uint64(uint16(v)) + hi, lo = v4, v4 + case shapeI32x4, shapeF32x4: + v2 := uint64(uint32(v))<<32 | uint64(uint32(v)) + lo, hi = v2, v2 + case shapeI64x2, shapeF64x2: + lo, hi = v, v + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128Swizzle: + idxHi, idxLo := ce.popValue(), ce.popValue() + baseHi, baseLo := ce.popValue(), ce.popValue() + var newVal [16]byte + for i := 0; i < 16; i++ { + var id byte + if i < 8 { + id = byte(idxLo >> (i * 8)) + } else { + id = byte(idxHi >> ((i - 8) * 8)) + } + if id < 8 { + newVal[i] = byte(baseLo >> (id * 8)) + } else if id < 16 { + newVal[i] = byte(baseHi >> ((id - 8) * 8)) + } + } + ce.pushValue(binary.LittleEndian.Uint64(newVal[:8])) + ce.pushValue(binary.LittleEndian.Uint64(newVal[8:])) + frame.pc++ + case operationKindV128Shuffle: + xHi, xLo, yHi, yLo := ce.popValue(), ce.popValue(), ce.popValue(), ce.popValue() + var newVal [16]byte + for i, l := range op.Us { + if l < 8 { + newVal[i] = byte(yLo >> (l * 8)) + } else if l < 16 { + newVal[i] = byte(yHi >> ((l - 8) * 8)) + } else if l < 24 { + newVal[i] = byte(xLo >> ((l - 16) * 8)) + } else if l < 32 { + newVal[i] = byte(xHi >> ((l - 24) * 8)) + } + } + ce.pushValue(binary.LittleEndian.Uint64(newVal[:8])) + ce.pushValue(binary.LittleEndian.Uint64(newVal[8:])) + frame.pc++ + case operationKindV128AnyTrue: + hi, lo := ce.popValue(), ce.popValue() + if hi != 0 || lo != 0 { + ce.pushValue(1) + } else { + ce.pushValue(0) + } + frame.pc++ + case operationKindV128AllTrue: + hi, lo := ce.popValue(), ce.popValue() + var ret bool + switch op.B1 { + case shapeI8x16: + ret = (uint8(lo) != 0) && (uint8(lo>>8) != 0) && (uint8(lo>>16) != 0) && (uint8(lo>>24) != 0) && + (uint8(lo>>32) != 0) && (uint8(lo>>40) != 0) && (uint8(lo>>48) != 0) && (uint8(lo>>56) != 0) && + (uint8(hi) != 0) && (uint8(hi>>8) != 0) && (uint8(hi>>16) != 0) && (uint8(hi>>24) != 0) && + (uint8(hi>>32) != 0) && (uint8(hi>>40) != 0) && (uint8(hi>>48) != 0) && (uint8(hi>>56) != 0) + case shapeI16x8: + ret = (uint16(lo) != 0) && (uint16(lo>>16) != 0) && (uint16(lo>>32) != 0) && (uint16(lo>>48) != 0) && + (uint16(hi) != 0) && (uint16(hi>>16) != 0) && (uint16(hi>>32) != 0) && (uint16(hi>>48) != 0) + case shapeI32x4: + ret = (uint32(lo) != 0) && (uint32(lo>>32) != 0) && + (uint32(hi) != 0) && (uint32(hi>>32) != 0) + case shapeI64x2: + ret = (lo != 0) && + (hi != 0) + } + if ret { + ce.pushValue(1) + } else { + ce.pushValue(0) + } + frame.pc++ + case operationKindV128BitMask: + // https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#bitmask-extraction + hi, lo := ce.popValue(), ce.popValue() + var res uint64 + switch op.B1 { + case shapeI8x16: + for i := 0; i < 8; i++ { + if int8(lo>>(i*8)) < 0 { + res |= 1 << i + } + } + for i := 0; i < 8; i++ { + if int8(hi>>(i*8)) < 0 { + res 
|= 1 << (i + 8) + } + } + case shapeI16x8: + for i := 0; i < 4; i++ { + if int16(lo>>(i*16)) < 0 { + res |= 1 << i + } + } + for i := 0; i < 4; i++ { + if int16(hi>>(i*16)) < 0 { + res |= 1 << (i + 4) + } + } + case shapeI32x4: + for i := 0; i < 2; i++ { + if int32(lo>>(i*32)) < 0 { + res |= 1 << i + } + } + for i := 0; i < 2; i++ { + if int32(hi>>(i*32)) < 0 { + res |= 1 << (i + 2) + } + } + case shapeI64x2: + if int64(lo) < 0 { + res |= 0b01 + } + if int(hi) < 0 { + res |= 0b10 + } + } + ce.pushValue(res) + frame.pc++ + case operationKindV128And: + x2Hi, x2Lo := ce.popValue(), ce.popValue() + x1Hi, x1Lo := ce.popValue(), ce.popValue() + ce.pushValue(x1Lo & x2Lo) + ce.pushValue(x1Hi & x2Hi) + frame.pc++ + case operationKindV128Not: + hi, lo := ce.popValue(), ce.popValue() + ce.pushValue(^lo) + ce.pushValue(^hi) + frame.pc++ + case operationKindV128Or: + x2Hi, x2Lo := ce.popValue(), ce.popValue() + x1Hi, x1Lo := ce.popValue(), ce.popValue() + ce.pushValue(x1Lo | x2Lo) + ce.pushValue(x1Hi | x2Hi) + frame.pc++ + case operationKindV128Xor: + x2Hi, x2Lo := ce.popValue(), ce.popValue() + x1Hi, x1Lo := ce.popValue(), ce.popValue() + ce.pushValue(x1Lo ^ x2Lo) + ce.pushValue(x1Hi ^ x2Hi) + frame.pc++ + case operationKindV128Bitselect: + // https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#bitwise-select + cHi, cLo := ce.popValue(), ce.popValue() + x2Hi, x2Lo := ce.popValue(), ce.popValue() + x1Hi, x1Lo := ce.popValue(), ce.popValue() + // v128.or(v128.and(v1, c), v128.and(v2, v128.not(c))) + ce.pushValue((x1Lo & cLo) | (x2Lo & (^cLo))) + ce.pushValue((x1Hi & cHi) | (x2Hi & (^cHi))) + frame.pc++ + case operationKindV128AndNot: + x2Hi, x2Lo := ce.popValue(), ce.popValue() + x1Hi, x1Lo := ce.popValue(), ce.popValue() + ce.pushValue(x1Lo & (^x2Lo)) + ce.pushValue(x1Hi & (^x2Hi)) + frame.pc++ + case operationKindV128Shl: + s := ce.popValue() + hi, lo := ce.popValue(), ce.popValue() + switch op.B1 { + case shapeI8x16: + s = s % 8 + lo = uint64(uint8(lo<<s)) | + uint64(uint8((lo>>8)<<s))<<8 | + uint64(uint8((lo>>16)<<s))<<16 | + uint64(uint8((lo>>24)<<s))<<24 | + uint64(uint8((lo>>32)<<s))<<32 | + uint64(uint8((lo>>40)<<s))<<40 | + uint64(uint8((lo>>48)<<s))<<48 | + uint64(uint8((lo>>56)<<s))<<56 + hi = uint64(uint8(hi<<s)) | + uint64(uint8((hi>>8)<<s))<<8 | + uint64(uint8((hi>>16)<<s))<<16 | + uint64(uint8((hi>>24)<<s))<<24 | + uint64(uint8((hi>>32)<<s))<<32 | + uint64(uint8((hi>>40)<<s))<<40 | + uint64(uint8((hi>>48)<<s))<<48 | + uint64(uint8((hi>>56)<<s))<<56 + case shapeI16x8: + s = s % 16 + lo = uint64(uint16(lo<<s)) | + uint64(uint16((lo>>16)<<s))<<16 | + uint64(uint16((lo>>32)<<s))<<32 | + uint64(uint16((lo>>48)<<s))<<48 + hi = uint64(uint16(hi<<s)) | + uint64(uint16((hi>>16)<<s))<<16 | + uint64(uint16((hi>>32)<<s))<<32 | + uint64(uint16((hi>>48)<<s))<<48 + case shapeI32x4: + s = s % 32 + lo = uint64(uint32(lo<<s)) | uint64(uint32((lo>>32)<<s))<<32 + hi = uint64(uint32(hi<<s)) | uint64(uint32((hi>>32)<<s))<<32 + case shapeI64x2: + s = s % 64 + lo = lo << s + hi = hi << s + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128Shr: + s := ce.popValue() + hi, lo := ce.popValue(), ce.popValue() + switch op.B1 { + case shapeI8x16: + s = s % 8 + if op.B3 { // signed + lo = uint64(uint8(int8(lo)>>s)) | + uint64(uint8(int8(lo>>8)>>s))<<8 | + uint64(uint8(int8(lo>>16)>>s))<<16 | + uint64(uint8(int8(lo>>24)>>s))<<24 | + uint64(uint8(int8(lo>>32)>>s))<<32 | + uint64(uint8(int8(lo>>40)>>s))<<40 | + uint64(uint8(int8(lo>>48)>>s))<<48 | + 
uint64(uint8(int8(lo>>56)>>s))<<56 + hi = uint64(uint8(int8(hi)>>s)) | + uint64(uint8(int8(hi>>8)>>s))<<8 | + uint64(uint8(int8(hi>>16)>>s))<<16 | + uint64(uint8(int8(hi>>24)>>s))<<24 | + uint64(uint8(int8(hi>>32)>>s))<<32 | + uint64(uint8(int8(hi>>40)>>s))<<40 | + uint64(uint8(int8(hi>>48)>>s))<<48 | + uint64(uint8(int8(hi>>56)>>s))<<56 + } else { + lo = uint64(uint8(lo)>>s) | + uint64(uint8(lo>>8)>>s)<<8 | + uint64(uint8(lo>>16)>>s)<<16 | + uint64(uint8(lo>>24)>>s)<<24 | + uint64(uint8(lo>>32)>>s)<<32 | + uint64(uint8(lo>>40)>>s)<<40 | + uint64(uint8(lo>>48)>>s)<<48 | + uint64(uint8(lo>>56)>>s)<<56 + hi = uint64(uint8(hi)>>s) | + uint64(uint8(hi>>8)>>s)<<8 | + uint64(uint8(hi>>16)>>s)<<16 | + uint64(uint8(hi>>24)>>s)<<24 | + uint64(uint8(hi>>32)>>s)<<32 | + uint64(uint8(hi>>40)>>s)<<40 | + uint64(uint8(hi>>48)>>s)<<48 | + uint64(uint8(hi>>56)>>s)<<56 + } + case shapeI16x8: + s = s % 16 + if op.B3 { // signed + lo = uint64(uint16(int16(lo)>>s)) | + uint64(uint16(int16(lo>>16)>>s))<<16 | + uint64(uint16(int16(lo>>32)>>s))<<32 | + uint64(uint16(int16(lo>>48)>>s))<<48 + hi = uint64(uint16(int16(hi)>>s)) | + uint64(uint16(int16(hi>>16)>>s))<<16 | + uint64(uint16(int16(hi>>32)>>s))<<32 | + uint64(uint16(int16(hi>>48)>>s))<<48 + } else { + lo = uint64(uint16(lo)>>s) | + uint64(uint16(lo>>16)>>s)<<16 | + uint64(uint16(lo>>32)>>s)<<32 | + uint64(uint16(lo>>48)>>s)<<48 + hi = uint64(uint16(hi)>>s) | + uint64(uint16(hi>>16)>>s)<<16 | + uint64(uint16(hi>>32)>>s)<<32 | + uint64(uint16(hi>>48)>>s)<<48 + } + case shapeI32x4: + s = s % 32 + if op.B3 { + lo = uint64(uint32(int32(lo)>>s)) | uint64(uint32(int32(lo>>32)>>s))<<32 + hi = uint64(uint32(int32(hi)>>s)) | uint64(uint32(int32(hi>>32)>>s))<<32 + } else { + lo = uint64(uint32(lo)>>s) | uint64(uint32(lo>>32)>>s)<<32 + hi = uint64(uint32(hi)>>s) | uint64(uint32(hi>>32)>>s)<<32 + } + case shapeI64x2: + s = s % 64 + if op.B3 { // signed + lo = uint64(int64(lo) >> s) + hi = uint64(int64(hi) >> s) + } else { + lo = lo >> s + hi = hi >> s + } + + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128Cmp: + x2Hi, x2Lo := ce.popValue(), ce.popValue() + x1Hi, x1Lo := ce.popValue(), ce.popValue() + var result []bool + switch op.B1 { + case v128CmpTypeI8x16Eq: + result = []bool{ + byte(x1Lo>>0) == byte(x2Lo>>0), byte(x1Lo>>8) == byte(x2Lo>>8), + byte(x1Lo>>16) == byte(x2Lo>>16), byte(x1Lo>>24) == byte(x2Lo>>24), + byte(x1Lo>>32) == byte(x2Lo>>32), byte(x1Lo>>40) == byte(x2Lo>>40), + byte(x1Lo>>48) == byte(x2Lo>>48), byte(x1Lo>>56) == byte(x2Lo>>56), + byte(x1Hi>>0) == byte(x2Hi>>0), byte(x1Hi>>8) == byte(x2Hi>>8), + byte(x1Hi>>16) == byte(x2Hi>>16), byte(x1Hi>>24) == byte(x2Hi>>24), + byte(x1Hi>>32) == byte(x2Hi>>32), byte(x1Hi>>40) == byte(x2Hi>>40), + byte(x1Hi>>48) == byte(x2Hi>>48), byte(x1Hi>>56) == byte(x2Hi>>56), + } + case v128CmpTypeI8x16Ne: + result = []bool{ + byte(x1Lo>>0) != byte(x2Lo>>0), byte(x1Lo>>8) != byte(x2Lo>>8), + byte(x1Lo>>16) != byte(x2Lo>>16), byte(x1Lo>>24) != byte(x2Lo>>24), + byte(x1Lo>>32) != byte(x2Lo>>32), byte(x1Lo>>40) != byte(x2Lo>>40), + byte(x1Lo>>48) != byte(x2Lo>>48), byte(x1Lo>>56) != byte(x2Lo>>56), + byte(x1Hi>>0) != byte(x2Hi>>0), byte(x1Hi>>8) != byte(x2Hi>>8), + byte(x1Hi>>16) != byte(x2Hi>>16), byte(x1Hi>>24) != byte(x2Hi>>24), + byte(x1Hi>>32) != byte(x2Hi>>32), byte(x1Hi>>40) != byte(x2Hi>>40), + byte(x1Hi>>48) != byte(x2Hi>>48), byte(x1Hi>>56) != byte(x2Hi>>56), + } + case v128CmpTypeI8x16LtS: + result = []bool{ + int8(x1Lo>>0) < int8(x2Lo>>0), int8(x1Lo>>8) < int8(x2Lo>>8), + 
int8(x1Lo>>16) < int8(x2Lo>>16), int8(x1Lo>>24) < int8(x2Lo>>24), + int8(x1Lo>>32) < int8(x2Lo>>32), int8(x1Lo>>40) < int8(x2Lo>>40), + int8(x1Lo>>48) < int8(x2Lo>>48), int8(x1Lo>>56) < int8(x2Lo>>56), + int8(x1Hi>>0) < int8(x2Hi>>0), int8(x1Hi>>8) < int8(x2Hi>>8), + int8(x1Hi>>16) < int8(x2Hi>>16), int8(x1Hi>>24) < int8(x2Hi>>24), + int8(x1Hi>>32) < int8(x2Hi>>32), int8(x1Hi>>40) < int8(x2Hi>>40), + int8(x1Hi>>48) < int8(x2Hi>>48), int8(x1Hi>>56) < int8(x2Hi>>56), + } + case v128CmpTypeI8x16LtU: + result = []bool{ + byte(x1Lo>>0) < byte(x2Lo>>0), byte(x1Lo>>8) < byte(x2Lo>>8), + byte(x1Lo>>16) < byte(x2Lo>>16), byte(x1Lo>>24) < byte(x2Lo>>24), + byte(x1Lo>>32) < byte(x2Lo>>32), byte(x1Lo>>40) < byte(x2Lo>>40), + byte(x1Lo>>48) < byte(x2Lo>>48), byte(x1Lo>>56) < byte(x2Lo>>56), + byte(x1Hi>>0) < byte(x2Hi>>0), byte(x1Hi>>8) < byte(x2Hi>>8), + byte(x1Hi>>16) < byte(x2Hi>>16), byte(x1Hi>>24) < byte(x2Hi>>24), + byte(x1Hi>>32) < byte(x2Hi>>32), byte(x1Hi>>40) < byte(x2Hi>>40), + byte(x1Hi>>48) < byte(x2Hi>>48), byte(x1Hi>>56) < byte(x2Hi>>56), + } + case v128CmpTypeI8x16GtS: + result = []bool{ + int8(x1Lo>>0) > int8(x2Lo>>0), int8(x1Lo>>8) > int8(x2Lo>>8), + int8(x1Lo>>16) > int8(x2Lo>>16), int8(x1Lo>>24) > int8(x2Lo>>24), + int8(x1Lo>>32) > int8(x2Lo>>32), int8(x1Lo>>40) > int8(x2Lo>>40), + int8(x1Lo>>48) > int8(x2Lo>>48), int8(x1Lo>>56) > int8(x2Lo>>56), + int8(x1Hi>>0) > int8(x2Hi>>0), int8(x1Hi>>8) > int8(x2Hi>>8), + int8(x1Hi>>16) > int8(x2Hi>>16), int8(x1Hi>>24) > int8(x2Hi>>24), + int8(x1Hi>>32) > int8(x2Hi>>32), int8(x1Hi>>40) > int8(x2Hi>>40), + int8(x1Hi>>48) > int8(x2Hi>>48), int8(x1Hi>>56) > int8(x2Hi>>56), + } + case v128CmpTypeI8x16GtU: + result = []bool{ + byte(x1Lo>>0) > byte(x2Lo>>0), byte(x1Lo>>8) > byte(x2Lo>>8), + byte(x1Lo>>16) > byte(x2Lo>>16), byte(x1Lo>>24) > byte(x2Lo>>24), + byte(x1Lo>>32) > byte(x2Lo>>32), byte(x1Lo>>40) > byte(x2Lo>>40), + byte(x1Lo>>48) > byte(x2Lo>>48), byte(x1Lo>>56) > byte(x2Lo>>56), + byte(x1Hi>>0) > byte(x2Hi>>0), byte(x1Hi>>8) > byte(x2Hi>>8), + byte(x1Hi>>16) > byte(x2Hi>>16), byte(x1Hi>>24) > byte(x2Hi>>24), + byte(x1Hi>>32) > byte(x2Hi>>32), byte(x1Hi>>40) > byte(x2Hi>>40), + byte(x1Hi>>48) > byte(x2Hi>>48), byte(x1Hi>>56) > byte(x2Hi>>56), + } + case v128CmpTypeI8x16LeS: + result = []bool{ + int8(x1Lo>>0) <= int8(x2Lo>>0), int8(x1Lo>>8) <= int8(x2Lo>>8), + int8(x1Lo>>16) <= int8(x2Lo>>16), int8(x1Lo>>24) <= int8(x2Lo>>24), + int8(x1Lo>>32) <= int8(x2Lo>>32), int8(x1Lo>>40) <= int8(x2Lo>>40), + int8(x1Lo>>48) <= int8(x2Lo>>48), int8(x1Lo>>56) <= int8(x2Lo>>56), + int8(x1Hi>>0) <= int8(x2Hi>>0), int8(x1Hi>>8) <= int8(x2Hi>>8), + int8(x1Hi>>16) <= int8(x2Hi>>16), int8(x1Hi>>24) <= int8(x2Hi>>24), + int8(x1Hi>>32) <= int8(x2Hi>>32), int8(x1Hi>>40) <= int8(x2Hi>>40), + int8(x1Hi>>48) <= int8(x2Hi>>48), int8(x1Hi>>56) <= int8(x2Hi>>56), + } + case v128CmpTypeI8x16LeU: + result = []bool{ + byte(x1Lo>>0) <= byte(x2Lo>>0), byte(x1Lo>>8) <= byte(x2Lo>>8), + byte(x1Lo>>16) <= byte(x2Lo>>16), byte(x1Lo>>24) <= byte(x2Lo>>24), + byte(x1Lo>>32) <= byte(x2Lo>>32), byte(x1Lo>>40) <= byte(x2Lo>>40), + byte(x1Lo>>48) <= byte(x2Lo>>48), byte(x1Lo>>56) <= byte(x2Lo>>56), + byte(x1Hi>>0) <= byte(x2Hi>>0), byte(x1Hi>>8) <= byte(x2Hi>>8), + byte(x1Hi>>16) <= byte(x2Hi>>16), byte(x1Hi>>24) <= byte(x2Hi>>24), + byte(x1Hi>>32) <= byte(x2Hi>>32), byte(x1Hi>>40) <= byte(x2Hi>>40), + byte(x1Hi>>48) <= byte(x2Hi>>48), byte(x1Hi>>56) <= byte(x2Hi>>56), + } + case v128CmpTypeI8x16GeS: + result = []bool{ + int8(x1Lo>>0) >= int8(x2Lo>>0), int8(x1Lo>>8) >= int8(x2Lo>>8), 
+ int8(x1Lo>>16) >= int8(x2Lo>>16), int8(x1Lo>>24) >= int8(x2Lo>>24), + int8(x1Lo>>32) >= int8(x2Lo>>32), int8(x1Lo>>40) >= int8(x2Lo>>40), + int8(x1Lo>>48) >= int8(x2Lo>>48), int8(x1Lo>>56) >= int8(x2Lo>>56), + int8(x1Hi>>0) >= int8(x2Hi>>0), int8(x1Hi>>8) >= int8(x2Hi>>8), + int8(x1Hi>>16) >= int8(x2Hi>>16), int8(x1Hi>>24) >= int8(x2Hi>>24), + int8(x1Hi>>32) >= int8(x2Hi>>32), int8(x1Hi>>40) >= int8(x2Hi>>40), + int8(x1Hi>>48) >= int8(x2Hi>>48), int8(x1Hi>>56) >= int8(x2Hi>>56), + } + case v128CmpTypeI8x16GeU: + result = []bool{ + byte(x1Lo>>0) >= byte(x2Lo>>0), byte(x1Lo>>8) >= byte(x2Lo>>8), + byte(x1Lo>>16) >= byte(x2Lo>>16), byte(x1Lo>>24) >= byte(x2Lo>>24), + byte(x1Lo>>32) >= byte(x2Lo>>32), byte(x1Lo>>40) >= byte(x2Lo>>40), + byte(x1Lo>>48) >= byte(x2Lo>>48), byte(x1Lo>>56) >= byte(x2Lo>>56), + byte(x1Hi>>0) >= byte(x2Hi>>0), byte(x1Hi>>8) >= byte(x2Hi>>8), + byte(x1Hi>>16) >= byte(x2Hi>>16), byte(x1Hi>>24) >= byte(x2Hi>>24), + byte(x1Hi>>32) >= byte(x2Hi>>32), byte(x1Hi>>40) >= byte(x2Hi>>40), + byte(x1Hi>>48) >= byte(x2Hi>>48), byte(x1Hi>>56) >= byte(x2Hi>>56), + } + case v128CmpTypeI16x8Eq: + result = []bool{ + uint16(x1Lo>>0) == uint16(x2Lo>>0), uint16(x1Lo>>16) == uint16(x2Lo>>16), + uint16(x1Lo>>32) == uint16(x2Lo>>32), uint16(x1Lo>>48) == uint16(x2Lo>>48), + uint16(x1Hi>>0) == uint16(x2Hi>>0), uint16(x1Hi>>16) == uint16(x2Hi>>16), + uint16(x1Hi>>32) == uint16(x2Hi>>32), uint16(x1Hi>>48) == uint16(x2Hi>>48), + } + case v128CmpTypeI16x8Ne: + result = []bool{ + uint16(x1Lo>>0) != uint16(x2Lo>>0), uint16(x1Lo>>16) != uint16(x2Lo>>16), + uint16(x1Lo>>32) != uint16(x2Lo>>32), uint16(x1Lo>>48) != uint16(x2Lo>>48), + uint16(x1Hi>>0) != uint16(x2Hi>>0), uint16(x1Hi>>16) != uint16(x2Hi>>16), + uint16(x1Hi>>32) != uint16(x2Hi>>32), uint16(x1Hi>>48) != uint16(x2Hi>>48), + } + case v128CmpTypeI16x8LtS: + result = []bool{ + int16(x1Lo>>0) < int16(x2Lo>>0), int16(x1Lo>>16) < int16(x2Lo>>16), + int16(x1Lo>>32) < int16(x2Lo>>32), int16(x1Lo>>48) < int16(x2Lo>>48), + int16(x1Hi>>0) < int16(x2Hi>>0), int16(x1Hi>>16) < int16(x2Hi>>16), + int16(x1Hi>>32) < int16(x2Hi>>32), int16(x1Hi>>48) < int16(x2Hi>>48), + } + case v128CmpTypeI16x8LtU: + result = []bool{ + uint16(x1Lo>>0) < uint16(x2Lo>>0), uint16(x1Lo>>16) < uint16(x2Lo>>16), + uint16(x1Lo>>32) < uint16(x2Lo>>32), uint16(x1Lo>>48) < uint16(x2Lo>>48), + uint16(x1Hi>>0) < uint16(x2Hi>>0), uint16(x1Hi>>16) < uint16(x2Hi>>16), + uint16(x1Hi>>32) < uint16(x2Hi>>32), uint16(x1Hi>>48) < uint16(x2Hi>>48), + } + case v128CmpTypeI16x8GtS: + result = []bool{ + int16(x1Lo>>0) > int16(x2Lo>>0), int16(x1Lo>>16) > int16(x2Lo>>16), + int16(x1Lo>>32) > int16(x2Lo>>32), int16(x1Lo>>48) > int16(x2Lo>>48), + int16(x1Hi>>0) > int16(x2Hi>>0), int16(x1Hi>>16) > int16(x2Hi>>16), + int16(x1Hi>>32) > int16(x2Hi>>32), int16(x1Hi>>48) > int16(x2Hi>>48), + } + case v128CmpTypeI16x8GtU: + result = []bool{ + uint16(x1Lo>>0) > uint16(x2Lo>>0), uint16(x1Lo>>16) > uint16(x2Lo>>16), + uint16(x1Lo>>32) > uint16(x2Lo>>32), uint16(x1Lo>>48) > uint16(x2Lo>>48), + uint16(x1Hi>>0) > uint16(x2Hi>>0), uint16(x1Hi>>16) > uint16(x2Hi>>16), + uint16(x1Hi>>32) > uint16(x2Hi>>32), uint16(x1Hi>>48) > uint16(x2Hi>>48), + } + case v128CmpTypeI16x8LeS: + result = []bool{ + int16(x1Lo>>0) <= int16(x2Lo>>0), int16(x1Lo>>16) <= int16(x2Lo>>16), + int16(x1Lo>>32) <= int16(x2Lo>>32), int16(x1Lo>>48) <= int16(x2Lo>>48), + int16(x1Hi>>0) <= int16(x2Hi>>0), int16(x1Hi>>16) <= int16(x2Hi>>16), + int16(x1Hi>>32) <= int16(x2Hi>>32), int16(x1Hi>>48) <= int16(x2Hi>>48), + } + case v128CmpTypeI16x8LeU: + 
result = []bool{ + uint16(x1Lo>>0) <= uint16(x2Lo>>0), uint16(x1Lo>>16) <= uint16(x2Lo>>16), + uint16(x1Lo>>32) <= uint16(x2Lo>>32), uint16(x1Lo>>48) <= uint16(x2Lo>>48), + uint16(x1Hi>>0) <= uint16(x2Hi>>0), uint16(x1Hi>>16) <= uint16(x2Hi>>16), + uint16(x1Hi>>32) <= uint16(x2Hi>>32), uint16(x1Hi>>48) <= uint16(x2Hi>>48), + } + case v128CmpTypeI16x8GeS: + result = []bool{ + int16(x1Lo>>0) >= int16(x2Lo>>0), int16(x1Lo>>16) >= int16(x2Lo>>16), + int16(x1Lo>>32) >= int16(x2Lo>>32), int16(x1Lo>>48) >= int16(x2Lo>>48), + int16(x1Hi>>0) >= int16(x2Hi>>0), int16(x1Hi>>16) >= int16(x2Hi>>16), + int16(x1Hi>>32) >= int16(x2Hi>>32), int16(x1Hi>>48) >= int16(x2Hi>>48), + } + case v128CmpTypeI16x8GeU: + result = []bool{ + uint16(x1Lo>>0) >= uint16(x2Lo>>0), uint16(x1Lo>>16) >= uint16(x2Lo>>16), + uint16(x1Lo>>32) >= uint16(x2Lo>>32), uint16(x1Lo>>48) >= uint16(x2Lo>>48), + uint16(x1Hi>>0) >= uint16(x2Hi>>0), uint16(x1Hi>>16) >= uint16(x2Hi>>16), + uint16(x1Hi>>32) >= uint16(x2Hi>>32), uint16(x1Hi>>48) >= uint16(x2Hi>>48), + } + case v128CmpTypeI32x4Eq: + result = []bool{ + uint32(x1Lo>>0) == uint32(x2Lo>>0), uint32(x1Lo>>32) == uint32(x2Lo>>32), + uint32(x1Hi>>0) == uint32(x2Hi>>0), uint32(x1Hi>>32) == uint32(x2Hi>>32), + } + case v128CmpTypeI32x4Ne: + result = []bool{ + uint32(x1Lo>>0) != uint32(x2Lo>>0), uint32(x1Lo>>32) != uint32(x2Lo>>32), + uint32(x1Hi>>0) != uint32(x2Hi>>0), uint32(x1Hi>>32) != uint32(x2Hi>>32), + } + case v128CmpTypeI32x4LtS: + result = []bool{ + int32(x1Lo>>0) < int32(x2Lo>>0), int32(x1Lo>>32) < int32(x2Lo>>32), + int32(x1Hi>>0) < int32(x2Hi>>0), int32(x1Hi>>32) < int32(x2Hi>>32), + } + case v128CmpTypeI32x4LtU: + result = []bool{ + uint32(x1Lo>>0) < uint32(x2Lo>>0), uint32(x1Lo>>32) < uint32(x2Lo>>32), + uint32(x1Hi>>0) < uint32(x2Hi>>0), uint32(x1Hi>>32) < uint32(x2Hi>>32), + } + case v128CmpTypeI32x4GtS: + result = []bool{ + int32(x1Lo>>0) > int32(x2Lo>>0), int32(x1Lo>>32) > int32(x2Lo>>32), + int32(x1Hi>>0) > int32(x2Hi>>0), int32(x1Hi>>32) > int32(x2Hi>>32), + } + case v128CmpTypeI32x4GtU: + result = []bool{ + uint32(x1Lo>>0) > uint32(x2Lo>>0), uint32(x1Lo>>32) > uint32(x2Lo>>32), + uint32(x1Hi>>0) > uint32(x2Hi>>0), uint32(x1Hi>>32) > uint32(x2Hi>>32), + } + case v128CmpTypeI32x4LeS: + result = []bool{ + int32(x1Lo>>0) <= int32(x2Lo>>0), int32(x1Lo>>32) <= int32(x2Lo>>32), + int32(x1Hi>>0) <= int32(x2Hi>>0), int32(x1Hi>>32) <= int32(x2Hi>>32), + } + case v128CmpTypeI32x4LeU: + result = []bool{ + uint32(x1Lo>>0) <= uint32(x2Lo>>0), uint32(x1Lo>>32) <= uint32(x2Lo>>32), + uint32(x1Hi>>0) <= uint32(x2Hi>>0), uint32(x1Hi>>32) <= uint32(x2Hi>>32), + } + case v128CmpTypeI32x4GeS: + result = []bool{ + int32(x1Lo>>0) >= int32(x2Lo>>0), int32(x1Lo>>32) >= int32(x2Lo>>32), + int32(x1Hi>>0) >= int32(x2Hi>>0), int32(x1Hi>>32) >= int32(x2Hi>>32), + } + case v128CmpTypeI32x4GeU: + result = []bool{ + uint32(x1Lo>>0) >= uint32(x2Lo>>0), uint32(x1Lo>>32) >= uint32(x2Lo>>32), + uint32(x1Hi>>0) >= uint32(x2Hi>>0), uint32(x1Hi>>32) >= uint32(x2Hi>>32), + } + case v128CmpTypeI64x2Eq: + result = []bool{x1Lo == x2Lo, x1Hi == x2Hi} + case v128CmpTypeI64x2Ne: + result = []bool{x1Lo != x2Lo, x1Hi != x2Hi} + case v128CmpTypeI64x2LtS: + result = []bool{int64(x1Lo) < int64(x2Lo), int64(x1Hi) < int64(x2Hi)} + case v128CmpTypeI64x2GtS: + result = []bool{int64(x1Lo) > int64(x2Lo), int64(x1Hi) > int64(x2Hi)} + case v128CmpTypeI64x2LeS: + result = []bool{int64(x1Lo) <= int64(x2Lo), int64(x1Hi) <= int64(x2Hi)} + case v128CmpTypeI64x2GeS: + result = []bool{int64(x1Lo) >= int64(x2Lo), int64(x1Hi) >= 
int64(x2Hi)} + case v128CmpTypeF32x4Eq: + result = []bool{ + math.Float32frombits(uint32(x1Lo>>0)) == math.Float32frombits(uint32(x2Lo>>0)), + math.Float32frombits(uint32(x1Lo>>32)) == math.Float32frombits(uint32(x2Lo>>32)), + math.Float32frombits(uint32(x1Hi>>0)) == math.Float32frombits(uint32(x2Hi>>0)), + math.Float32frombits(uint32(x1Hi>>32)) == math.Float32frombits(uint32(x2Hi>>32)), + } + case v128CmpTypeF32x4Ne: + result = []bool{ + math.Float32frombits(uint32(x1Lo>>0)) != math.Float32frombits(uint32(x2Lo>>0)), + math.Float32frombits(uint32(x1Lo>>32)) != math.Float32frombits(uint32(x2Lo>>32)), + math.Float32frombits(uint32(x1Hi>>0)) != math.Float32frombits(uint32(x2Hi>>0)), + math.Float32frombits(uint32(x1Hi>>32)) != math.Float32frombits(uint32(x2Hi>>32)), + } + case v128CmpTypeF32x4Lt: + result = []bool{ + math.Float32frombits(uint32(x1Lo>>0)) < math.Float32frombits(uint32(x2Lo>>0)), + math.Float32frombits(uint32(x1Lo>>32)) < math.Float32frombits(uint32(x2Lo>>32)), + math.Float32frombits(uint32(x1Hi>>0)) < math.Float32frombits(uint32(x2Hi>>0)), + math.Float32frombits(uint32(x1Hi>>32)) < math.Float32frombits(uint32(x2Hi>>32)), + } + case v128CmpTypeF32x4Gt: + result = []bool{ + math.Float32frombits(uint32(x1Lo>>0)) > math.Float32frombits(uint32(x2Lo>>0)), + math.Float32frombits(uint32(x1Lo>>32)) > math.Float32frombits(uint32(x2Lo>>32)), + math.Float32frombits(uint32(x1Hi>>0)) > math.Float32frombits(uint32(x2Hi>>0)), + math.Float32frombits(uint32(x1Hi>>32)) > math.Float32frombits(uint32(x2Hi>>32)), + } + case v128CmpTypeF32x4Le: + result = []bool{ + math.Float32frombits(uint32(x1Lo>>0)) <= math.Float32frombits(uint32(x2Lo>>0)), + math.Float32frombits(uint32(x1Lo>>32)) <= math.Float32frombits(uint32(x2Lo>>32)), + math.Float32frombits(uint32(x1Hi>>0)) <= math.Float32frombits(uint32(x2Hi>>0)), + math.Float32frombits(uint32(x1Hi>>32)) <= math.Float32frombits(uint32(x2Hi>>32)), + } + case v128CmpTypeF32x4Ge: + result = []bool{ + math.Float32frombits(uint32(x1Lo>>0)) >= math.Float32frombits(uint32(x2Lo>>0)), + math.Float32frombits(uint32(x1Lo>>32)) >= math.Float32frombits(uint32(x2Lo>>32)), + math.Float32frombits(uint32(x1Hi>>0)) >= math.Float32frombits(uint32(x2Hi>>0)), + math.Float32frombits(uint32(x1Hi>>32)) >= math.Float32frombits(uint32(x2Hi>>32)), + } + case v128CmpTypeF64x2Eq: + result = []bool{ + math.Float64frombits(x1Lo) == math.Float64frombits(x2Lo), + math.Float64frombits(x1Hi) == math.Float64frombits(x2Hi), + } + case v128CmpTypeF64x2Ne: + result = []bool{ + math.Float64frombits(x1Lo) != math.Float64frombits(x2Lo), + math.Float64frombits(x1Hi) != math.Float64frombits(x2Hi), + } + case v128CmpTypeF64x2Lt: + result = []bool{ + math.Float64frombits(x1Lo) < math.Float64frombits(x2Lo), + math.Float64frombits(x1Hi) < math.Float64frombits(x2Hi), + } + case v128CmpTypeF64x2Gt: + result = []bool{ + math.Float64frombits(x1Lo) > math.Float64frombits(x2Lo), + math.Float64frombits(x1Hi) > math.Float64frombits(x2Hi), + } + case v128CmpTypeF64x2Le: + result = []bool{ + math.Float64frombits(x1Lo) <= math.Float64frombits(x2Lo), + math.Float64frombits(x1Hi) <= math.Float64frombits(x2Hi), + } + case v128CmpTypeF64x2Ge: + result = []bool{ + math.Float64frombits(x1Lo) >= math.Float64frombits(x2Lo), + math.Float64frombits(x1Hi) >= math.Float64frombits(x2Hi), + } + } + + var retLo, retHi uint64 + laneNum := len(result) + switch laneNum { + case 16: + for i, b := range result { + if b { + if i < 8 { + retLo |= 0xff << (i * 8) + } else { + retHi |= 0xff << ((i - 8) * 8) + } + } + } + case 8: + for i, b 
:= range result { + if b { + if i < 4 { + retLo |= 0xffff << (i * 16) + } else { + retHi |= 0xffff << ((i - 4) * 16) + } + } + } + case 4: + for i, b := range result { + if b { + if i < 2 { + retLo |= 0xffff_ffff << (i * 32) + } else { + retHi |= 0xffff_ffff << ((i - 2) * 32) + } + } + } + case 2: + if result[0] { + retLo = ^uint64(0) + } + if result[1] { + retHi = ^uint64(0) + } + } + + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128AddSat: + x2hi, x2Lo := ce.popValue(), ce.popValue() + x1hi, x1Lo := ce.popValue(), ce.popValue() + + var retLo, retHi uint64 + + // Lane-wise addition while saturating the overflowing values. + // https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#saturating-integer-addition + switch op.B1 { + case shapeI8x16: + for i := 0; i < 16; i++ { + var v, w byte + if i < 8 { + v, w = byte(x1Lo>>(i*8)), byte(x2Lo>>(i*8)) + } else { + v, w = byte(x1hi>>((i-8)*8)), byte(x2hi>>((i-8)*8)) + } + + var uv uint64 + if op.B3 { // signed + if subbed := int64(int8(v)) + int64(int8(w)); subbed < math.MinInt8 { + uv = uint64(byte(0x80)) + } else if subbed > math.MaxInt8 { + uv = uint64(byte(0x7f)) + } else { + uv = uint64(byte(int8(subbed))) + } + } else { + if subbed := int64(v) + int64(w); subbed < 0 { + uv = uint64(byte(0)) + } else if subbed > math.MaxUint8 { + uv = uint64(byte(0xff)) + } else { + uv = uint64(byte(subbed)) + } + } + + if i < 8 { // first 8 lanes are on lower 64bits. + retLo |= uv << (i * 8) + } else { + retHi |= uv << ((i - 8) * 8) + } + } + case shapeI16x8: + for i := 0; i < 8; i++ { + var v, w uint16 + if i < 4 { + v, w = uint16(x1Lo>>(i*16)), uint16(x2Lo>>(i*16)) + } else { + v, w = uint16(x1hi>>((i-4)*16)), uint16(x2hi>>((i-4)*16)) + } + + var uv uint64 + if op.B3 { // signed + if added := int64(int16(v)) + int64(int16(w)); added < math.MinInt16 { + uv = uint64(uint16(0x8000)) + } else if added > math.MaxInt16 { + uv = uint64(uint16(0x7fff)) + } else { + uv = uint64(uint16(int16(added))) + } + } else { + if added := int64(v) + int64(w); added < 0 { + uv = uint64(uint16(0)) + } else if added > math.MaxUint16 { + uv = uint64(uint16(0xffff)) + } else { + uv = uint64(uint16(added)) + } + } + + if i < 4 { // first 4 lanes are on lower 64bits. + retLo |= uv << (i * 16) + } else { + retHi |= uv << ((i - 4) * 16) + } + } + } + + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128SubSat: + x2hi, x2Lo := ce.popValue(), ce.popValue() + x1hi, x1Lo := ce.popValue(), ce.popValue() + + var retLo, retHi uint64 + + // Lane-wise subtraction while saturating the overflowing values. 
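+ // As with the saturating addition above, each lane is widened to int64, the operation is applied,
+ // and the result is clamped back to the lane's range, e.g. a signed i8 lane clamps to [-128, 127]
+ // and an unsigned i8 lane to [0, 255].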
+ // https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#saturating-integer-subtraction + switch op.B1 { + case shapeI8x16: + for i := 0; i < 16; i++ { + var v, w byte + if i < 8 { + v, w = byte(x1Lo>>(i*8)), byte(x2Lo>>(i*8)) + } else { + v, w = byte(x1hi>>((i-8)*8)), byte(x2hi>>((i-8)*8)) + } + + var uv uint64 + if op.B3 { // signed + if subbed := int64(int8(v)) - int64(int8(w)); subbed < math.MinInt8 { + uv = uint64(byte(0x80)) + } else if subbed > math.MaxInt8 { + uv = uint64(byte(0x7f)) + } else { + uv = uint64(byte(int8(subbed))) + } + } else { + if subbed := int64(v) - int64(w); subbed < 0 { + uv = uint64(byte(0)) + } else if subbed > math.MaxUint8 { + uv = uint64(byte(0xff)) + } else { + uv = uint64(byte(subbed)) + } + } + + if i < 8 { + retLo |= uv << (i * 8) + } else { + retHi |= uv << ((i - 8) * 8) + } + } + case shapeI16x8: + for i := 0; i < 8; i++ { + var v, w uint16 + if i < 4 { + v, w = uint16(x1Lo>>(i*16)), uint16(x2Lo>>(i*16)) + } else { + v, w = uint16(x1hi>>((i-4)*16)), uint16(x2hi>>((i-4)*16)) + } + + var uv uint64 + if op.B3 { // signed + if subbed := int64(int16(v)) - int64(int16(w)); subbed < math.MinInt16 { + uv = uint64(uint16(0x8000)) + } else if subbed > math.MaxInt16 { + uv = uint64(uint16(0x7fff)) + } else { + uv = uint64(uint16(int16(subbed))) + } + } else { + if subbed := int64(v) - int64(w); subbed < 0 { + uv = uint64(uint16(0)) + } else if subbed > math.MaxUint16 { + uv = uint64(uint16(0xffff)) + } else { + uv = uint64(uint16(subbed)) + } + } + + if i < 4 { + retLo |= uv << (i * 16) + } else { + retHi |= uv << ((i - 4) * 16) + } + } + } + + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128Mul: + x2hi, x2lo := ce.popValue(), ce.popValue() + x1hi, x1lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + switch op.B1 { + case shapeI16x8: + retHi = uint64(uint16(x1hi)*uint16(x2hi)) | (uint64(uint16(x1hi>>16)*uint16(x2hi>>16)) << 16) | + (uint64(uint16(x1hi>>32)*uint16(x2hi>>32)) << 32) | (uint64(uint16(x1hi>>48)*uint16(x2hi>>48)) << 48) + retLo = uint64(uint16(x1lo)*uint16(x2lo)) | (uint64(uint16(x1lo>>16)*uint16(x2lo>>16)) << 16) | + (uint64(uint16(x1lo>>32)*uint16(x2lo>>32)) << 32) | (uint64(uint16(x1lo>>48)*uint16(x2lo>>48)) << 48) + case shapeI32x4: + retHi = uint64(uint32(x1hi)*uint32(x2hi)) | (uint64(uint32(x1hi>>32)*uint32(x2hi>>32)) << 32) + retLo = uint64(uint32(x1lo)*uint32(x2lo)) | (uint64(uint32(x1lo>>32)*uint32(x2lo>>32)) << 32) + case shapeI64x2: + retHi = x1hi * x2hi + retLo = x1lo * x2lo + case shapeF32x4: + retHi = mulFloat32bits(uint32(x1hi), uint32(x2hi)) | mulFloat32bits(uint32(x1hi>>32), uint32(x2hi>>32))<<32 + retLo = mulFloat32bits(uint32(x1lo), uint32(x2lo)) | mulFloat32bits(uint32(x1lo>>32), uint32(x2lo>>32))<<32 + case shapeF64x2: + retHi = math.Float64bits(math.Float64frombits(x1hi) * math.Float64frombits(x2hi)) + retLo = math.Float64bits(math.Float64frombits(x1lo) * math.Float64frombits(x2lo)) + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128Div: + x2hi, x2lo := ce.popValue(), ce.popValue() + x1hi, x1lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + if op.B1 == shapeF64x2 { + retHi = math.Float64bits(math.Float64frombits(x1hi) / math.Float64frombits(x2hi)) + retLo = math.Float64bits(math.Float64frombits(x1lo) / math.Float64frombits(x2lo)) + } else { + retHi = divFloat32bits(uint32(x1hi), uint32(x2hi)) | divFloat32bits(uint32(x1hi>>32), uint32(x2hi>>32))<<32 + retLo = divFloat32bits(uint32(x1lo), uint32(x2lo)) | 
divFloat32bits(uint32(x1lo>>32), uint32(x2lo>>32))<<32 + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128Neg: + hi, lo := ce.popValue(), ce.popValue() + switch op.B1 { + case shapeI8x16: + lo = uint64(-byte(lo)) | (uint64(-byte(lo>>8)) << 8) | + (uint64(-byte(lo>>16)) << 16) | (uint64(-byte(lo>>24)) << 24) | + (uint64(-byte(lo>>32)) << 32) | (uint64(-byte(lo>>40)) << 40) | + (uint64(-byte(lo>>48)) << 48) | (uint64(-byte(lo>>56)) << 56) + hi = uint64(-byte(hi)) | (uint64(-byte(hi>>8)) << 8) | + (uint64(-byte(hi>>16)) << 16) | (uint64(-byte(hi>>24)) << 24) | + (uint64(-byte(hi>>32)) << 32) | (uint64(-byte(hi>>40)) << 40) | + (uint64(-byte(hi>>48)) << 48) | (uint64(-byte(hi>>56)) << 56) + case shapeI16x8: + hi = uint64(-uint16(hi)) | (uint64(-uint16(hi>>16)) << 16) | + (uint64(-uint16(hi>>32)) << 32) | (uint64(-uint16(hi>>48)) << 48) + lo = uint64(-uint16(lo)) | (uint64(-uint16(lo>>16)) << 16) | + (uint64(-uint16(lo>>32)) << 32) | (uint64(-uint16(lo>>48)) << 48) + case shapeI32x4: + hi = uint64(-uint32(hi)) | (uint64(-uint32(hi>>32)) << 32) + lo = uint64(-uint32(lo)) | (uint64(-uint32(lo>>32)) << 32) + case shapeI64x2: + hi = -hi + lo = -lo + case shapeF32x4: + hi = uint64(math.Float32bits(-math.Float32frombits(uint32(hi)))) | + (uint64(math.Float32bits(-math.Float32frombits(uint32(hi>>32)))) << 32) + lo = uint64(math.Float32bits(-math.Float32frombits(uint32(lo)))) | + (uint64(math.Float32bits(-math.Float32frombits(uint32(lo>>32)))) << 32) + case shapeF64x2: + hi = math.Float64bits(-math.Float64frombits(hi)) + lo = math.Float64bits(-math.Float64frombits(lo)) + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128Sqrt: + hi, lo := ce.popValue(), ce.popValue() + if op.B1 == shapeF64x2 { + hi = math.Float64bits(math.Sqrt(math.Float64frombits(hi))) + lo = math.Float64bits(math.Sqrt(math.Float64frombits(lo))) + } else { + hi = uint64(math.Float32bits(float32(math.Sqrt(float64(math.Float32frombits(uint32(hi))))))) | + (uint64(math.Float32bits(float32(math.Sqrt(float64(math.Float32frombits(uint32(hi>>32))))))) << 32) + lo = uint64(math.Float32bits(float32(math.Sqrt(float64(math.Float32frombits(uint32(lo))))))) | + (uint64(math.Float32bits(float32(math.Sqrt(float64(math.Float32frombits(uint32(lo>>32))))))) << 32) + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128Abs: + hi, lo := ce.popValue(), ce.popValue() + switch op.B1 { + case shapeI8x16: + lo = uint64(i8Abs(byte(lo))) | (uint64(i8Abs(byte(lo>>8))) << 8) | + (uint64(i8Abs(byte(lo>>16))) << 16) | (uint64(i8Abs(byte(lo>>24))) << 24) | + (uint64(i8Abs(byte(lo>>32))) << 32) | (uint64(i8Abs(byte(lo>>40))) << 40) | + (uint64(i8Abs(byte(lo>>48))) << 48) | (uint64(i8Abs(byte(lo>>56))) << 56) + hi = uint64(i8Abs(byte(hi))) | (uint64(i8Abs(byte(hi>>8))) << 8) | + (uint64(i8Abs(byte(hi>>16))) << 16) | (uint64(i8Abs(byte(hi>>24))) << 24) | + (uint64(i8Abs(byte(hi>>32))) << 32) | (uint64(i8Abs(byte(hi>>40))) << 40) | + (uint64(i8Abs(byte(hi>>48))) << 48) | (uint64(i8Abs(byte(hi>>56))) << 56) + case shapeI16x8: + hi = uint64(i16Abs(uint16(hi))) | (uint64(i16Abs(uint16(hi>>16))) << 16) | + (uint64(i16Abs(uint16(hi>>32))) << 32) | (uint64(i16Abs(uint16(hi>>48))) << 48) + lo = uint64(i16Abs(uint16(lo))) | (uint64(i16Abs(uint16(lo>>16))) << 16) | + (uint64(i16Abs(uint16(lo>>32))) << 32) | (uint64(i16Abs(uint16(lo>>48))) << 48) + case shapeI32x4: + hi = uint64(i32Abs(uint32(hi))) | (uint64(i32Abs(uint32(hi>>32))) << 32) + lo = uint64(i32Abs(uint32(lo))) | 
(uint64(i32Abs(uint32(lo>>32))) << 32) + case shapeI64x2: + if int64(hi) < 0 { + hi = -hi + } + if int64(lo) < 0 { + lo = -lo + } + case shapeF32x4: + hi = hi &^ (1<<31 | 1<<63) + lo = lo &^ (1<<31 | 1<<63) + case shapeF64x2: + hi = hi &^ (1 << 63) + lo = lo &^ (1 << 63) + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128Popcnt: + hi, lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + for i := 0; i < 16; i++ { + var v byte + if i < 8 { + v = byte(lo >> (i * 8)) + } else { + v = byte(hi >> ((i - 8) * 8)) + } + + var cnt uint64 + for i := 0; i < 8; i++ { + if (v>>i)&0b1 != 0 { + cnt++ + } + } + + if i < 8 { + retLo |= cnt << (i * 8) + } else { + retHi |= cnt << ((i - 8) * 8) + } + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128Min: + x2hi, x2lo := ce.popValue(), ce.popValue() + x1hi, x1lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + switch op.B1 { + case shapeI8x16: + if op.B3 { // signed + retLo = uint64(i8MinS(uint8(x1lo>>8), uint8(x2lo>>8)))<<8 | uint64(i8MinS(uint8(x1lo), uint8(x2lo))) | + uint64(i8MinS(uint8(x1lo>>24), uint8(x2lo>>24)))<<24 | uint64(i8MinS(uint8(x1lo>>16), uint8(x2lo>>16)))<<16 | + uint64(i8MinS(uint8(x1lo>>40), uint8(x2lo>>40)))<<40 | uint64(i8MinS(uint8(x1lo>>32), uint8(x2lo>>32)))<<32 | + uint64(i8MinS(uint8(x1lo>>56), uint8(x2lo>>56)))<<56 | uint64(i8MinS(uint8(x1lo>>48), uint8(x2lo>>48)))<<48 + retHi = uint64(i8MinS(uint8(x1hi>>8), uint8(x2hi>>8)))<<8 | uint64(i8MinS(uint8(x1hi), uint8(x2hi))) | + uint64(i8MinS(uint8(x1hi>>24), uint8(x2hi>>24)))<<24 | uint64(i8MinS(uint8(x1hi>>16), uint8(x2hi>>16)))<<16 | + uint64(i8MinS(uint8(x1hi>>40), uint8(x2hi>>40)))<<40 | uint64(i8MinS(uint8(x1hi>>32), uint8(x2hi>>32)))<<32 | + uint64(i8MinS(uint8(x1hi>>56), uint8(x2hi>>56)))<<56 | uint64(i8MinS(uint8(x1hi>>48), uint8(x2hi>>48)))<<48 + } else { + retLo = uint64(i8MinU(uint8(x1lo>>8), uint8(x2lo>>8)))<<8 | uint64(i8MinU(uint8(x1lo), uint8(x2lo))) | + uint64(i8MinU(uint8(x1lo>>24), uint8(x2lo>>24)))<<24 | uint64(i8MinU(uint8(x1lo>>16), uint8(x2lo>>16)))<<16 | + uint64(i8MinU(uint8(x1lo>>40), uint8(x2lo>>40)))<<40 | uint64(i8MinU(uint8(x1lo>>32), uint8(x2lo>>32)))<<32 | + uint64(i8MinU(uint8(x1lo>>56), uint8(x2lo>>56)))<<56 | uint64(i8MinU(uint8(x1lo>>48), uint8(x2lo>>48)))<<48 + retHi = uint64(i8MinU(uint8(x1hi>>8), uint8(x2hi>>8)))<<8 | uint64(i8MinU(uint8(x1hi), uint8(x2hi))) | + uint64(i8MinU(uint8(x1hi>>24), uint8(x2hi>>24)))<<24 | uint64(i8MinU(uint8(x1hi>>16), uint8(x2hi>>16)))<<16 | + uint64(i8MinU(uint8(x1hi>>40), uint8(x2hi>>40)))<<40 | uint64(i8MinU(uint8(x1hi>>32), uint8(x2hi>>32)))<<32 | + uint64(i8MinU(uint8(x1hi>>56), uint8(x2hi>>56)))<<56 | uint64(i8MinU(uint8(x1hi>>48), uint8(x2hi>>48)))<<48 + } + case shapeI16x8: + if op.B3 { // signed + retLo = uint64(i16MinS(uint16(x1lo), uint16(x2lo))) | + uint64(i16MinS(uint16(x1lo>>16), uint16(x2lo>>16)))<<16 | + uint64(i16MinS(uint16(x1lo>>32), uint16(x2lo>>32)))<<32 | + uint64(i16MinS(uint16(x1lo>>48), uint16(x2lo>>48)))<<48 + retHi = uint64(i16MinS(uint16(x1hi), uint16(x2hi))) | + uint64(i16MinS(uint16(x1hi>>16), uint16(x2hi>>16)))<<16 | + uint64(i16MinS(uint16(x1hi>>32), uint16(x2hi>>32)))<<32 | + uint64(i16MinS(uint16(x1hi>>48), uint16(x2hi>>48)))<<48 + } else { + retLo = uint64(i16MinU(uint16(x1lo), uint16(x2lo))) | + uint64(i16MinU(uint16(x1lo>>16), uint16(x2lo>>16)))<<16 | + uint64(i16MinU(uint16(x1lo>>32), uint16(x2lo>>32)))<<32 | + uint64(i16MinU(uint16(x1lo>>48), uint16(x2lo>>48)))<<48 + retHi = 
uint64(i16MinU(uint16(x1hi), uint16(x2hi))) | + uint64(i16MinU(uint16(x1hi>>16), uint16(x2hi>>16)))<<16 | + uint64(i16MinU(uint16(x1hi>>32), uint16(x2hi>>32)))<<32 | + uint64(i16MinU(uint16(x1hi>>48), uint16(x2hi>>48)))<<48 + } + case shapeI32x4: + if op.B3 { // signed + retLo = uint64(i32MinS(uint32(x1lo), uint32(x2lo))) | + uint64(i32MinS(uint32(x1lo>>32), uint32(x2lo>>32)))<<32 + retHi = uint64(i32MinS(uint32(x1hi), uint32(x2hi))) | + uint64(i32MinS(uint32(x1hi>>32), uint32(x2hi>>32)))<<32 + } else { + retLo = uint64(i32MinU(uint32(x1lo), uint32(x2lo))) | + uint64(i32MinU(uint32(x1lo>>32), uint32(x2lo>>32)))<<32 + retHi = uint64(i32MinU(uint32(x1hi), uint32(x2hi))) | + uint64(i32MinU(uint32(x1hi>>32), uint32(x2hi>>32)))<<32 + } + case shapeF32x4: + retHi = wasmCompatMin32bits(uint32(x1hi), uint32(x2hi)) | + wasmCompatMin32bits(uint32(x1hi>>32), uint32(x2hi>>32))<<32 + retLo = wasmCompatMin32bits(uint32(x1lo), uint32(x2lo)) | + wasmCompatMin32bits(uint32(x1lo>>32), uint32(x2lo>>32))<<32 + case shapeF64x2: + retHi = math.Float64bits(moremath.WasmCompatMin64( + math.Float64frombits(x1hi), + math.Float64frombits(x2hi), + )) + retLo = math.Float64bits(moremath.WasmCompatMin64( + math.Float64frombits(x1lo), + math.Float64frombits(x2lo), + )) + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128Max: + x2hi, x2lo := ce.popValue(), ce.popValue() + x1hi, x1lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + switch op.B1 { + case shapeI8x16: + if op.B3 { // signed + retLo = uint64(i8MaxS(uint8(x1lo>>8), uint8(x2lo>>8)))<<8 | uint64(i8MaxS(uint8(x1lo), uint8(x2lo))) | + uint64(i8MaxS(uint8(x1lo>>24), uint8(x2lo>>24)))<<24 | uint64(i8MaxS(uint8(x1lo>>16), uint8(x2lo>>16)))<<16 | + uint64(i8MaxS(uint8(x1lo>>40), uint8(x2lo>>40)))<<40 | uint64(i8MaxS(uint8(x1lo>>32), uint8(x2lo>>32)))<<32 | + uint64(i8MaxS(uint8(x1lo>>56), uint8(x2lo>>56)))<<56 | uint64(i8MaxS(uint8(x1lo>>48), uint8(x2lo>>48)))<<48 + retHi = uint64(i8MaxS(uint8(x1hi>>8), uint8(x2hi>>8)))<<8 | uint64(i8MaxS(uint8(x1hi), uint8(x2hi))) | + uint64(i8MaxS(uint8(x1hi>>24), uint8(x2hi>>24)))<<24 | uint64(i8MaxS(uint8(x1hi>>16), uint8(x2hi>>16)))<<16 | + uint64(i8MaxS(uint8(x1hi>>40), uint8(x2hi>>40)))<<40 | uint64(i8MaxS(uint8(x1hi>>32), uint8(x2hi>>32)))<<32 | + uint64(i8MaxS(uint8(x1hi>>56), uint8(x2hi>>56)))<<56 | uint64(i8MaxS(uint8(x1hi>>48), uint8(x2hi>>48)))<<48 + } else { + retLo = uint64(i8MaxU(uint8(x1lo>>8), uint8(x2lo>>8)))<<8 | uint64(i8MaxU(uint8(x1lo), uint8(x2lo))) | + uint64(i8MaxU(uint8(x1lo>>24), uint8(x2lo>>24)))<<24 | uint64(i8MaxU(uint8(x1lo>>16), uint8(x2lo>>16)))<<16 | + uint64(i8MaxU(uint8(x1lo>>40), uint8(x2lo>>40)))<<40 | uint64(i8MaxU(uint8(x1lo>>32), uint8(x2lo>>32)))<<32 | + uint64(i8MaxU(uint8(x1lo>>56), uint8(x2lo>>56)))<<56 | uint64(i8MaxU(uint8(x1lo>>48), uint8(x2lo>>48)))<<48 + retHi = uint64(i8MaxU(uint8(x1hi>>8), uint8(x2hi>>8)))<<8 | uint64(i8MaxU(uint8(x1hi), uint8(x2hi))) | + uint64(i8MaxU(uint8(x1hi>>24), uint8(x2hi>>24)))<<24 | uint64(i8MaxU(uint8(x1hi>>16), uint8(x2hi>>16)))<<16 | + uint64(i8MaxU(uint8(x1hi>>40), uint8(x2hi>>40)))<<40 | uint64(i8MaxU(uint8(x1hi>>32), uint8(x2hi>>32)))<<32 | + uint64(i8MaxU(uint8(x1hi>>56), uint8(x2hi>>56)))<<56 | uint64(i8MaxU(uint8(x1hi>>48), uint8(x2hi>>48)))<<48 + } + case shapeI16x8: + if op.B3 { // signed + retLo = uint64(i16MaxS(uint16(x1lo), uint16(x2lo))) | + uint64(i16MaxS(uint16(x1lo>>16), uint16(x2lo>>16)))<<16 | + uint64(i16MaxS(uint16(x1lo>>32), uint16(x2lo>>32)))<<32 | + uint64(i16MaxS(uint16(x1lo>>48), 
uint16(x2lo>>48)))<<48 + retHi = uint64(i16MaxS(uint16(x1hi), uint16(x2hi))) | + uint64(i16MaxS(uint16(x1hi>>16), uint16(x2hi>>16)))<<16 | + uint64(i16MaxS(uint16(x1hi>>32), uint16(x2hi>>32)))<<32 | + uint64(i16MaxS(uint16(x1hi>>48), uint16(x2hi>>48)))<<48 + } else { + retLo = uint64(i16MaxU(uint16(x1lo), uint16(x2lo))) | + uint64(i16MaxU(uint16(x1lo>>16), uint16(x2lo>>16)))<<16 | + uint64(i16MaxU(uint16(x1lo>>32), uint16(x2lo>>32)))<<32 | + uint64(i16MaxU(uint16(x1lo>>48), uint16(x2lo>>48)))<<48 + retHi = uint64(i16MaxU(uint16(x1hi), uint16(x2hi))) | + uint64(i16MaxU(uint16(x1hi>>16), uint16(x2hi>>16)))<<16 | + uint64(i16MaxU(uint16(x1hi>>32), uint16(x2hi>>32)))<<32 | + uint64(i16MaxU(uint16(x1hi>>48), uint16(x2hi>>48)))<<48 + } + case shapeI32x4: + if op.B3 { // signed + retLo = uint64(i32MaxS(uint32(x1lo), uint32(x2lo))) | + uint64(i32MaxS(uint32(x1lo>>32), uint32(x2lo>>32)))<<32 + retHi = uint64(i32MaxS(uint32(x1hi), uint32(x2hi))) | + uint64(i32MaxS(uint32(x1hi>>32), uint32(x2hi>>32)))<<32 + } else { + retLo = uint64(i32MaxU(uint32(x1lo), uint32(x2lo))) | + uint64(i32MaxU(uint32(x1lo>>32), uint32(x2lo>>32)))<<32 + retHi = uint64(i32MaxU(uint32(x1hi), uint32(x2hi))) | + uint64(i32MaxU(uint32(x1hi>>32), uint32(x2hi>>32)))<<32 + } + case shapeF32x4: + retHi = wasmCompatMax32bits(uint32(x1hi), uint32(x2hi)) | + wasmCompatMax32bits(uint32(x1hi>>32), uint32(x2hi>>32))<<32 + retLo = wasmCompatMax32bits(uint32(x1lo), uint32(x2lo)) | + wasmCompatMax32bits(uint32(x1lo>>32), uint32(x2lo>>32))<<32 + case shapeF64x2: + retHi = math.Float64bits(moremath.WasmCompatMax64( + math.Float64frombits(x1hi), + math.Float64frombits(x2hi), + )) + retLo = math.Float64bits(moremath.WasmCompatMax64( + math.Float64frombits(x1lo), + math.Float64frombits(x2lo), + )) + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128AvgrU: + x2hi, x2lo := ce.popValue(), ce.popValue() + x1hi, x1lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + switch op.B1 { + case shapeI8x16: + retLo = uint64(i8RoundingAverage(uint8(x1lo>>8), uint8(x2lo>>8)))<<8 | uint64(i8RoundingAverage(uint8(x1lo), uint8(x2lo))) | + uint64(i8RoundingAverage(uint8(x1lo>>24), uint8(x2lo>>24)))<<24 | uint64(i8RoundingAverage(uint8(x1lo>>16), uint8(x2lo>>16)))<<16 | + uint64(i8RoundingAverage(uint8(x1lo>>40), uint8(x2lo>>40)))<<40 | uint64(i8RoundingAverage(uint8(x1lo>>32), uint8(x2lo>>32)))<<32 | + uint64(i8RoundingAverage(uint8(x1lo>>56), uint8(x2lo>>56)))<<56 | uint64(i8RoundingAverage(uint8(x1lo>>48), uint8(x2lo>>48)))<<48 + retHi = uint64(i8RoundingAverage(uint8(x1hi>>8), uint8(x2hi>>8)))<<8 | uint64(i8RoundingAverage(uint8(x1hi), uint8(x2hi))) | + uint64(i8RoundingAverage(uint8(x1hi>>24), uint8(x2hi>>24)))<<24 | uint64(i8RoundingAverage(uint8(x1hi>>16), uint8(x2hi>>16)))<<16 | + uint64(i8RoundingAverage(uint8(x1hi>>40), uint8(x2hi>>40)))<<40 | uint64(i8RoundingAverage(uint8(x1hi>>32), uint8(x2hi>>32)))<<32 | + uint64(i8RoundingAverage(uint8(x1hi>>56), uint8(x2hi>>56)))<<56 | uint64(i8RoundingAverage(uint8(x1hi>>48), uint8(x2hi>>48)))<<48 + case shapeI16x8: + retLo = uint64(i16RoundingAverage(uint16(x1lo), uint16(x2lo))) | + uint64(i16RoundingAverage(uint16(x1lo>>16), uint16(x2lo>>16)))<<16 | + uint64(i16RoundingAverage(uint16(x1lo>>32), uint16(x2lo>>32)))<<32 | + uint64(i16RoundingAverage(uint16(x1lo>>48), uint16(x2lo>>48)))<<48 + retHi = uint64(i16RoundingAverage(uint16(x1hi), uint16(x2hi))) | + uint64(i16RoundingAverage(uint16(x1hi>>16), uint16(x2hi>>16)))<<16 | + uint64(i16RoundingAverage(uint16(x1hi>>32), 
uint16(x2hi>>32)))<<32 | + uint64(i16RoundingAverage(uint16(x1hi>>48), uint16(x2hi>>48)))<<48 + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128Pmin: + x2hi, x2lo := ce.popValue(), ce.popValue() + x1hi, x1lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + if op.B1 == shapeF32x4 { + if flt32(math.Float32frombits(uint32(x2lo)), math.Float32frombits(uint32(x1lo))) { + retLo = x2lo & 0x00000000_ffffffff + } else { + retLo = x1lo & 0x00000000_ffffffff + } + if flt32(math.Float32frombits(uint32(x2lo>>32)), math.Float32frombits(uint32(x1lo>>32))) { + retLo |= x2lo & 0xffffffff_00000000 + } else { + retLo |= x1lo & 0xffffffff_00000000 + } + if flt32(math.Float32frombits(uint32(x2hi)), math.Float32frombits(uint32(x1hi))) { + retHi = x2hi & 0x00000000_ffffffff + } else { + retHi = x1hi & 0x00000000_ffffffff + } + if flt32(math.Float32frombits(uint32(x2hi>>32)), math.Float32frombits(uint32(x1hi>>32))) { + retHi |= x2hi & 0xffffffff_00000000 + } else { + retHi |= x1hi & 0xffffffff_00000000 + } + } else { + if flt64(math.Float64frombits(x2lo), math.Float64frombits(x1lo)) { + retLo = x2lo + } else { + retLo = x1lo + } + if flt64(math.Float64frombits(x2hi), math.Float64frombits(x1hi)) { + retHi = x2hi + } else { + retHi = x1hi + } + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128Pmax: + x2hi, x2lo := ce.popValue(), ce.popValue() + x1hi, x1lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + if op.B1 == shapeF32x4 { + if flt32(math.Float32frombits(uint32(x1lo)), math.Float32frombits(uint32(x2lo))) { + retLo = x2lo & 0x00000000_ffffffff + } else { + retLo = x1lo & 0x00000000_ffffffff + } + if flt32(math.Float32frombits(uint32(x1lo>>32)), math.Float32frombits(uint32(x2lo>>32))) { + retLo |= x2lo & 0xffffffff_00000000 + } else { + retLo |= x1lo & 0xffffffff_00000000 + } + if flt32(math.Float32frombits(uint32(x1hi)), math.Float32frombits(uint32(x2hi))) { + retHi = x2hi & 0x00000000_ffffffff + } else { + retHi = x1hi & 0x00000000_ffffffff + } + if flt32(math.Float32frombits(uint32(x1hi>>32)), math.Float32frombits(uint32(x2hi>>32))) { + retHi |= x2hi & 0xffffffff_00000000 + } else { + retHi |= x1hi & 0xffffffff_00000000 + } + } else { + if flt64(math.Float64frombits(x1lo), math.Float64frombits(x2lo)) { + retLo = x2lo + } else { + retLo = x1lo + } + if flt64(math.Float64frombits(x1hi), math.Float64frombits(x2hi)) { + retHi = x2hi + } else { + retHi = x1hi + } + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128Ceil: + hi, lo := ce.popValue(), ce.popValue() + if op.B1 == shapeF32x4 { + lo = uint64(math.Float32bits(moremath.WasmCompatCeilF32(math.Float32frombits(uint32(lo))))) | + (uint64(math.Float32bits(moremath.WasmCompatCeilF32(math.Float32frombits(uint32(lo>>32))))) << 32) + hi = uint64(math.Float32bits(moremath.WasmCompatCeilF32(math.Float32frombits(uint32(hi))))) | + (uint64(math.Float32bits(moremath.WasmCompatCeilF32(math.Float32frombits(uint32(hi>>32))))) << 32) + } else { + lo = math.Float64bits(moremath.WasmCompatCeilF64(math.Float64frombits(lo))) + hi = math.Float64bits(moremath.WasmCompatCeilF64(math.Float64frombits(hi))) + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128Floor: + hi, lo := ce.popValue(), ce.popValue() + if op.B1 == shapeF32x4 { + lo = uint64(math.Float32bits(moremath.WasmCompatFloorF32(math.Float32frombits(uint32(lo))))) | + (uint64(math.Float32bits(moremath.WasmCompatFloorF32(math.Float32frombits(uint32(lo>>32))))) << 32) + hi = 
uint64(math.Float32bits(moremath.WasmCompatFloorF32(math.Float32frombits(uint32(hi))))) | + (uint64(math.Float32bits(moremath.WasmCompatFloorF32(math.Float32frombits(uint32(hi>>32))))) << 32) + } else { + lo = math.Float64bits(moremath.WasmCompatFloorF64(math.Float64frombits(lo))) + hi = math.Float64bits(moremath.WasmCompatFloorF64(math.Float64frombits(hi))) + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128Trunc: + hi, lo := ce.popValue(), ce.popValue() + if op.B1 == shapeF32x4 { + lo = uint64(math.Float32bits(moremath.WasmCompatTruncF32(math.Float32frombits(uint32(lo))))) | + (uint64(math.Float32bits(moremath.WasmCompatTruncF32(math.Float32frombits(uint32(lo>>32))))) << 32) + hi = uint64(math.Float32bits(moremath.WasmCompatTruncF32(math.Float32frombits(uint32(hi))))) | + (uint64(math.Float32bits(moremath.WasmCompatTruncF32(math.Float32frombits(uint32(hi>>32))))) << 32) + } else { + lo = math.Float64bits(moremath.WasmCompatTruncF64(math.Float64frombits(lo))) + hi = math.Float64bits(moremath.WasmCompatTruncF64(math.Float64frombits(hi))) + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128Nearest: + hi, lo := ce.popValue(), ce.popValue() + if op.B1 == shapeF32x4 { + lo = uint64(math.Float32bits(moremath.WasmCompatNearestF32(math.Float32frombits(uint32(lo))))) | + (uint64(math.Float32bits(moremath.WasmCompatNearestF32(math.Float32frombits(uint32(lo>>32))))) << 32) + hi = uint64(math.Float32bits(moremath.WasmCompatNearestF32(math.Float32frombits(uint32(hi))))) | + (uint64(math.Float32bits(moremath.WasmCompatNearestF32(math.Float32frombits(uint32(hi>>32))))) << 32) + } else { + lo = math.Float64bits(moremath.WasmCompatNearestF64(math.Float64frombits(lo))) + hi = math.Float64bits(moremath.WasmCompatNearestF64(math.Float64frombits(hi))) + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case operationKindV128Extend: + hi, lo := ce.popValue(), ce.popValue() + var origin uint64 + if op.B3 { // use lower 64 bits + origin = lo + } else { + origin = hi + } + + signed := op.B2 == 1 + + var retHi, retLo uint64 + switch op.B1 { + case shapeI8x16: + for i := 0; i < 8; i++ { + v8 := byte(origin >> (i * 8)) + + var v16 uint16 + if signed { + v16 = uint16(int8(v8)) + } else { + v16 = uint16(v8) + } + + if i < 4 { + retLo |= uint64(v16) << (i * 16) + } else { + retHi |= uint64(v16) << ((i - 4) * 16) + } + } + case shapeI16x8: + for i := 0; i < 4; i++ { + v16 := uint16(origin >> (i * 16)) + + var v32 uint32 + if signed { + v32 = uint32(int16(v16)) + } else { + v32 = uint32(v16) + } + + if i < 2 { + retLo |= uint64(v32) << (i * 32) + } else { + retHi |= uint64(v32) << ((i - 2) * 32) + } + } + case shapeI32x4: + v32Lo := uint32(origin) + v32Hi := uint32(origin >> 32) + if signed { + retLo = uint64(int32(v32Lo)) + retHi = uint64(int32(v32Hi)) + } else { + retLo = uint64(v32Lo) + retHi = uint64(v32Hi) + } + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128ExtMul: + x2Hi, x2Lo := ce.popValue(), ce.popValue() + x1Hi, x1Lo := ce.popValue(), ce.popValue() + var x1, x2 uint64 + if op.B3 { // use lower 64 bits + x1, x2 = x1Lo, x2Lo + } else { + x1, x2 = x1Hi, x2Hi + } + + signed := op.B2 == 1 + + var retLo, retHi uint64 + switch op.B1 { + case shapeI8x16: + for i := 0; i < 8; i++ { + v1, v2 := byte(x1>>(i*8)), byte(x2>>(i*8)) + + var v16 uint16 + if signed { + v16 = uint16(int16(int8(v1)) * int16(int8(v2))) + } else { + v16 = uint16(v1) * uint16(v2) + } + + if i < 4 { + retLo |= uint64(v16) << (i * 16) + } else { + retHi |= 
uint64(v16) << ((i - 4) * 16) + } + } + case shapeI16x8: + for i := 0; i < 4; i++ { + v1, v2 := uint16(x1>>(i*16)), uint16(x2>>(i*16)) + + var v32 uint32 + if signed { + v32 = uint32(int32(int16(v1)) * int32(int16(v2))) + } else { + v32 = uint32(v1) * uint32(v2) + } + + if i < 2 { + retLo |= uint64(v32) << (i * 32) + } else { + retHi |= uint64(v32) << ((i - 2) * 32) + } + } + case shapeI32x4: + v1Lo, v2Lo := uint32(x1), uint32(x2) + v1Hi, v2Hi := uint32(x1>>32), uint32(x2>>32) + if signed { + retLo = uint64(int64(int32(v1Lo)) * int64(int32(v2Lo))) + retHi = uint64(int64(int32(v1Hi)) * int64(int32(v2Hi))) + } else { + retLo = uint64(v1Lo) * uint64(v2Lo) + retHi = uint64(v1Hi) * uint64(v2Hi) + } + } + + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128Q15mulrSatS: + x2hi, x2Lo := ce.popValue(), ce.popValue() + x1hi, x1Lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + for i := 0; i < 8; i++ { + var v, w int16 + if i < 4 { + v, w = int16(uint16(x1Lo>>(i*16))), int16(uint16(x2Lo>>(i*16))) + } else { + v, w = int16(uint16(x1hi>>((i-4)*16))), int16(uint16(x2hi>>((i-4)*16))) + } + + var uv uint64 + // https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#saturating-integer-q-format-rounding-multiplication + if calc := ((int32(v) * int32(w)) + 0x4000) >> 15; calc < math.MinInt16 { + uv = uint64(uint16(0x8000)) + } else if calc > math.MaxInt16 { + uv = uint64(uint16(0x7fff)) + } else { + uv = uint64(uint16(int16(calc))) + } + + if i < 4 { + retLo |= uv << (i * 16) + } else { + retHi |= uv << ((i - 4) * 16) + } + } + + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128ExtAddPairwise: + hi, lo := ce.popValue(), ce.popValue() + + signed := op.B3 + + var retLo, retHi uint64 + switch op.B1 { + case shapeI8x16: + for i := 0; i < 8; i++ { + var v1, v2 byte + if i < 4 { + v1, v2 = byte(lo>>((i*2)*8)), byte(lo>>((i*2+1)*8)) + } else { + v1, v2 = byte(hi>>(((i-4)*2)*8)), byte(hi>>(((i-4)*2+1)*8)) + } + + var v16 uint16 + if signed { + v16 = uint16(int16(int8(v1)) + int16(int8(v2))) + } else { + v16 = uint16(v1) + uint16(v2) + } + + if i < 4 { + retLo |= uint64(v16) << (i * 16) + } else { + retHi |= uint64(v16) << ((i - 4) * 16) + } + } + case shapeI16x8: + for i := 0; i < 4; i++ { + var v1, v2 uint16 + if i < 2 { + v1, v2 = uint16(lo>>((i*2)*16)), uint16(lo>>((i*2+1)*16)) + } else { + v1, v2 = uint16(hi>>(((i-2)*2)*16)), uint16(hi>>(((i-2)*2+1)*16)) + } + + var v32 uint32 + if signed { + v32 = uint32(int32(int16(v1)) + int32(int16(v2))) + } else { + v32 = uint32(v1) + uint32(v2) + } + + if i < 2 { + retLo |= uint64(v32) << (i * 32) + } else { + retHi |= uint64(v32) << ((i - 2) * 32) + } + } + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128FloatPromote: + _, toPromote := ce.popValue(), ce.popValue() + ce.pushValue(math.Float64bits(float64(math.Float32frombits(uint32(toPromote))))) + ce.pushValue(math.Float64bits(float64(math.Float32frombits(uint32(toPromote >> 32))))) + frame.pc++ + case operationKindV128FloatDemote: + hi, lo := ce.popValue(), ce.popValue() + ce.pushValue( + uint64(math.Float32bits(float32(math.Float64frombits(lo)))) | + (uint64(math.Float32bits(float32(math.Float64frombits(hi)))) << 32), + ) + ce.pushValue(0) + frame.pc++ + case operationKindV128FConvertFromI: + hi, lo := ce.popValue(), ce.popValue() + v1, v2, v3, v4 := uint32(lo), uint32(lo>>32), uint32(hi), uint32(hi>>32) + signed := op.B3 + + var retLo, retHi uint64 + switch op.B1 { // Destination shape. 
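+ // A f32x4 destination converts all four i32 lanes (f32x4.convert_i32x4_{s,u}); a f64x2
+ // destination converts only the two low lanes v1 and v2 (f64x2.convert_low_i32x4_{s,u})
+ // and ignores v3 and v4.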
+ case shapeF32x4: // f32x4 from signed/unsigned i32x4 + if signed { + retLo = uint64(math.Float32bits(float32(int32(v1)))) | + (uint64(math.Float32bits(float32(int32(v2)))) << 32) + retHi = uint64(math.Float32bits(float32(int32(v3)))) | + (uint64(math.Float32bits(float32(int32(v4)))) << 32) + } else { + retLo = uint64(math.Float32bits(float32(v1))) | + (uint64(math.Float32bits(float32(v2))) << 32) + retHi = uint64(math.Float32bits(float32(v3))) | + (uint64(math.Float32bits(float32(v4))) << 32) + } + case shapeF64x2: // f64x2 from signed/unsigned i32x4 + if signed { + retLo, retHi = math.Float64bits(float64(int32(v1))), math.Float64bits(float64(int32(v2))) + } else { + retLo, retHi = math.Float64bits(float64(v1)), math.Float64bits(float64(v2)) + } + } + + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128Narrow: + x2Hi, x2Lo := ce.popValue(), ce.popValue() + x1Hi, x1Lo := ce.popValue(), ce.popValue() + signed := op.B3 + + var retLo, retHi uint64 + switch op.B1 { + case shapeI16x8: // signed/unsigned i16x8 to i8x16 + for i := 0; i < 8; i++ { + var v16 uint16 + if i < 4 { + v16 = uint16(x1Lo >> (i * 16)) + } else { + v16 = uint16(x1Hi >> ((i - 4) * 16)) + } + + var v byte + if signed { + if s := int16(v16); s > math.MaxInt8 { + v = math.MaxInt8 + } else if s < math.MinInt8 { + s = math.MinInt8 + v = byte(s) + } else { + v = byte(v16) + } + } else { + if s := int16(v16); s > math.MaxUint8 { + v = math.MaxUint8 + } else if s < 0 { + v = 0 + } else { + v = byte(v16) + } + } + retLo |= uint64(v) << (i * 8) + } + for i := 0; i < 8; i++ { + var v16 uint16 + if i < 4 { + v16 = uint16(x2Lo >> (i * 16)) + } else { + v16 = uint16(x2Hi >> ((i - 4) * 16)) + } + + var v byte + if signed { + if s := int16(v16); s > math.MaxInt8 { + v = math.MaxInt8 + } else if s < math.MinInt8 { + s = math.MinInt8 + v = byte(s) + } else { + v = byte(v16) + } + } else { + if s := int16(v16); s > math.MaxUint8 { + v = math.MaxUint8 + } else if s < 0 { + v = 0 + } else { + v = byte(v16) + } + } + retHi |= uint64(v) << (i * 8) + } + case shapeI32x4: // signed/unsigned i32x4 to i16x8 + for i := 0; i < 4; i++ { + var v32 uint32 + if i < 2 { + v32 = uint32(x1Lo >> (i * 32)) + } else { + v32 = uint32(x1Hi >> ((i - 2) * 32)) + } + + var v uint16 + if signed { + if s := int32(v32); s > math.MaxInt16 { + v = math.MaxInt16 + } else if s < math.MinInt16 { + s = math.MinInt16 + v = uint16(s) + } else { + v = uint16(v32) + } + } else { + if s := int32(v32); s > math.MaxUint16 { + v = math.MaxUint16 + } else if s < 0 { + v = 0 + } else { + v = uint16(v32) + } + } + retLo |= uint64(v) << (i * 16) + } + + for i := 0; i < 4; i++ { + var v32 uint32 + if i < 2 { + v32 = uint32(x2Lo >> (i * 32)) + } else { + v32 = uint32(x2Hi >> ((i - 2) * 32)) + } + + var v uint16 + if signed { + if s := int32(v32); s > math.MaxInt16 { + v = math.MaxInt16 + } else if s < math.MinInt16 { + s = math.MinInt16 + v = uint16(s) + } else { + v = uint16(v32) + } + } else { + if s := int32(v32); s > math.MaxUint16 { + v = math.MaxUint16 + } else if s < 0 { + v = 0 + } else { + v = uint16(v32) + } + } + retHi |= uint64(v) << (i * 16) + } + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindV128Dot: + x2Hi, x2Lo := ce.popValue(), ce.popValue() + x1Hi, x1Lo := ce.popValue(), ce.popValue() + ce.pushValue( + uint64(uint32(int32(int16(x1Lo>>0))*int32(int16(x2Lo>>0))+int32(int16(x1Lo>>16))*int32(int16(x2Lo>>16)))) | + 
(uint64(uint32(int32(int16(x1Lo>>32))*int32(int16(x2Lo>>32))+int32(int16(x1Lo>>48))*int32(int16(x2Lo>>48)))) << 32), + ) + ce.pushValue( + uint64(uint32(int32(int16(x1Hi>>0))*int32(int16(x2Hi>>0))+int32(int16(x1Hi>>16))*int32(int16(x2Hi>>16)))) | + (uint64(uint32(int32(int16(x1Hi>>32))*int32(int16(x2Hi>>32))+int32(int16(x1Hi>>48))*int32(int16(x2Hi>>48)))) << 32), + ) + frame.pc++ + case operationKindV128ITruncSatFromF: + hi, lo := ce.popValue(), ce.popValue() + signed := op.B3 + var retLo, retHi uint64 + + switch op.B1 { + case shapeF32x4: // f32x4 to i32x4 + for i, f64 := range [4]float64{ + math.Trunc(float64(math.Float32frombits(uint32(lo)))), + math.Trunc(float64(math.Float32frombits(uint32(lo >> 32)))), + math.Trunc(float64(math.Float32frombits(uint32(hi)))), + math.Trunc(float64(math.Float32frombits(uint32(hi >> 32)))), + } { + + var v uint32 + if math.IsNaN(f64) { + v = 0 + } else if signed { + if f64 < math.MinInt32 { + f64 = math.MinInt32 + } else if f64 > math.MaxInt32 { + f64 = math.MaxInt32 + } + v = uint32(int32(f64)) + } else { + if f64 < 0 { + f64 = 0 + } else if f64 > math.MaxUint32 { + f64 = math.MaxUint32 + } + v = uint32(f64) + } + + if i < 2 { + retLo |= uint64(v) << (i * 32) + } else { + retHi |= uint64(v) << ((i - 2) * 32) + } + } + + case shapeF64x2: // f64x2 to i32x4 + for i, f := range [2]float64{ + math.Trunc(math.Float64frombits(lo)), + math.Trunc(math.Float64frombits(hi)), + } { + var v uint32 + if math.IsNaN(f) { + v = 0 + } else if signed { + if f < math.MinInt32 { + f = math.MinInt32 + } else if f > math.MaxInt32 { + f = math.MaxInt32 + } + v = uint32(int32(f)) + } else { + if f < 0 { + f = 0 + } else if f > math.MaxUint32 { + f = math.MaxUint32 + } + v = uint32(f) + } + + retLo |= uint64(v) << (i * 32) + } + } + + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case operationKindAtomicMemoryWait: + timeout := int64(ce.popValue()) + exp := ce.popValue() + offset := ce.popMemoryOffset(op) + // Runtime instead of validation error because the spec intends to allow binaries to include + // such instructions as long as they are not executed. 
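+ // The reader callback handed to Wait32/Wait64 below loads the current value while holding the
+ // memory mutex; per the threads proposal the pushed result is 0 (woken), 1 (loaded value did not
+ // equal the expected value) or 2 (timed out).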
+ if !memoryInst.Shared { + panic(wasmruntime.ErrRuntimeExpectedSharedMemory) + } + + switch unsignedType(op.B1) { + case unsignedTypeI32: + if offset%4 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + if int(offset) > len(memoryInst.Buffer)-4 { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(memoryInst.Wait32(offset, uint32(exp), timeout, func(mem *wasm.MemoryInstance, offset uint32) uint32 { + mem.Mux.Lock() + defer mem.Mux.Unlock() + value, _ := mem.ReadUint32Le(offset) + return value + })) + case unsignedTypeI64: + if offset%8 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + if int(offset) > len(memoryInst.Buffer)-8 { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(memoryInst.Wait64(offset, exp, timeout, func(mem *wasm.MemoryInstance, offset uint32) uint64 { + mem.Mux.Lock() + defer mem.Mux.Unlock() + value, _ := mem.ReadUint64Le(offset) + return value + })) + } + frame.pc++ + case operationKindAtomicMemoryNotify: + count := ce.popValue() + offset := ce.popMemoryOffset(op) + if offset%4 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + // Just a bounds check + if offset >= memoryInst.Size() { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + res := memoryInst.Notify(offset, uint32(count)) + ce.pushValue(uint64(res)) + frame.pc++ + case operationKindAtomicFence: + // Memory not required for fence only + if memoryInst != nil { + // An empty critical section can be used as a synchronization primitive, which is what + // fence is. Probably, there are no spectests or defined behavior to confirm this yet. + memoryInst.Mux.Lock() + memoryInst.Mux.Unlock() //nolint:staticcheck + } + frame.pc++ + case operationKindAtomicLoad: + offset := ce.popMemoryOffset(op) + switch unsignedType(op.B1) { + case unsignedTypeI32: + if offset%4 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + memoryInst.Mux.Lock() + val, ok := memoryInst.ReadUint32Le(offset) + memoryInst.Mux.Unlock() + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(uint64(val)) + case unsignedTypeI64: + if offset%8 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + memoryInst.Mux.Lock() + val, ok := memoryInst.ReadUint64Le(offset) + memoryInst.Mux.Unlock() + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(val) + } + frame.pc++ + case operationKindAtomicLoad8: + offset := ce.popMemoryOffset(op) + memoryInst.Mux.Lock() + val, ok := memoryInst.ReadByte(offset) + memoryInst.Mux.Unlock() + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(uint64(val)) + frame.pc++ + case operationKindAtomicLoad16: + offset := ce.popMemoryOffset(op) + if offset%2 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + memoryInst.Mux.Lock() + val, ok := memoryInst.ReadUint16Le(offset) + memoryInst.Mux.Unlock() + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + ce.pushValue(uint64(val)) + frame.pc++ + case operationKindAtomicStore: + val := ce.popValue() + offset := ce.popMemoryOffset(op) + switch unsignedType(op.B1) { + case unsignedTypeI32: + if offset%4 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + memoryInst.Mux.Lock() + ok := memoryInst.WriteUint32Le(offset, uint32(val)) + memoryInst.Mux.Unlock() + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + case unsignedTypeI64: + if offset%8 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + memoryInst.Mux.Lock() + ok := 
memoryInst.WriteUint64Le(offset, val) + memoryInst.Mux.Unlock() + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + } + frame.pc++ + case operationKindAtomicStore8: + val := byte(ce.popValue()) + offset := ce.popMemoryOffset(op) + memoryInst.Mux.Lock() + ok := memoryInst.WriteByte(offset, val) + memoryInst.Mux.Unlock() + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + frame.pc++ + case operationKindAtomicStore16: + val := uint16(ce.popValue()) + offset := ce.popMemoryOffset(op) + if offset%2 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + memoryInst.Mux.Lock() + ok := memoryInst.WriteUint16Le(offset, val) + memoryInst.Mux.Unlock() + if !ok { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + frame.pc++ + case operationKindAtomicRMW: + val := ce.popValue() + offset := ce.popMemoryOffset(op) + switch unsignedType(op.B1) { + case unsignedTypeI32: + if offset%4 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + memoryInst.Mux.Lock() + old, ok := memoryInst.ReadUint32Le(offset) + if !ok { + memoryInst.Mux.Unlock() + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + var newVal uint32 + switch atomicArithmeticOp(op.B2) { + case atomicArithmeticOpAdd: + newVal = old + uint32(val) + case atomicArithmeticOpSub: + newVal = old - uint32(val) + case atomicArithmeticOpAnd: + newVal = old & uint32(val) + case atomicArithmeticOpOr: + newVal = old | uint32(val) + case atomicArithmeticOpXor: + newVal = old ^ uint32(val) + case atomicArithmeticOpNop: + newVal = uint32(val) + } + memoryInst.WriteUint32Le(offset, newVal) + memoryInst.Mux.Unlock() + ce.pushValue(uint64(old)) + case unsignedTypeI64: + if offset%8 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + memoryInst.Mux.Lock() + old, ok := memoryInst.ReadUint64Le(offset) + if !ok { + memoryInst.Mux.Unlock() + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + var newVal uint64 + switch atomicArithmeticOp(op.B2) { + case atomicArithmeticOpAdd: + newVal = old + val + case atomicArithmeticOpSub: + newVal = old - val + case atomicArithmeticOpAnd: + newVal = old & val + case atomicArithmeticOpOr: + newVal = old | val + case atomicArithmeticOpXor: + newVal = old ^ val + case atomicArithmeticOpNop: + newVal = val + } + memoryInst.WriteUint64Le(offset, newVal) + memoryInst.Mux.Unlock() + ce.pushValue(old) + } + frame.pc++ + case operationKindAtomicRMW8: + val := ce.popValue() + offset := ce.popMemoryOffset(op) + memoryInst.Mux.Lock() + old, ok := memoryInst.ReadByte(offset) + if !ok { + memoryInst.Mux.Unlock() + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + arg := byte(val) + var newVal byte + switch atomicArithmeticOp(op.B2) { + case atomicArithmeticOpAdd: + newVal = old + arg + case atomicArithmeticOpSub: + newVal = old - arg + case atomicArithmeticOpAnd: + newVal = old & arg + case atomicArithmeticOpOr: + newVal = old | arg + case atomicArithmeticOpXor: + newVal = old ^ arg + case atomicArithmeticOpNop: + newVal = arg + } + memoryInst.WriteByte(offset, newVal) + memoryInst.Mux.Unlock() + ce.pushValue(uint64(old)) + frame.pc++ + case operationKindAtomicRMW16: + val := ce.popValue() + offset := ce.popMemoryOffset(op) + if offset%2 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + memoryInst.Mux.Lock() + old, ok := memoryInst.ReadUint16Le(offset) + if !ok { + memoryInst.Mux.Unlock() + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + arg := uint16(val) + var newVal uint16 + switch atomicArithmeticOp(op.B2) { + case 
atomicArithmeticOpAdd: + newVal = old + arg + case atomicArithmeticOpSub: + newVal = old - arg + case atomicArithmeticOpAnd: + newVal = old & arg + case atomicArithmeticOpOr: + newVal = old | arg + case atomicArithmeticOpXor: + newVal = old ^ arg + case atomicArithmeticOpNop: + newVal = arg + } + memoryInst.WriteUint16Le(offset, newVal) + memoryInst.Mux.Unlock() + ce.pushValue(uint64(old)) + frame.pc++ + case operationKindAtomicRMWCmpxchg: + rep := ce.popValue() + exp := ce.popValue() + offset := ce.popMemoryOffset(op) + switch unsignedType(op.B1) { + case unsignedTypeI32: + if offset%4 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + memoryInst.Mux.Lock() + old, ok := memoryInst.ReadUint32Le(offset) + if !ok { + memoryInst.Mux.Unlock() + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + if old == uint32(exp) { + memoryInst.WriteUint32Le(offset, uint32(rep)) + } + memoryInst.Mux.Unlock() + ce.pushValue(uint64(old)) + case unsignedTypeI64: + if offset%8 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + memoryInst.Mux.Lock() + old, ok := memoryInst.ReadUint64Le(offset) + if !ok { + memoryInst.Mux.Unlock() + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + if old == exp { + memoryInst.WriteUint64Le(offset, rep) + } + memoryInst.Mux.Unlock() + ce.pushValue(old) + } + frame.pc++ + case operationKindAtomicRMW8Cmpxchg: + rep := byte(ce.popValue()) + exp := byte(ce.popValue()) + offset := ce.popMemoryOffset(op) + memoryInst.Mux.Lock() + old, ok := memoryInst.ReadByte(offset) + if !ok { + memoryInst.Mux.Unlock() + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + if old == exp { + memoryInst.WriteByte(offset, rep) + } + memoryInst.Mux.Unlock() + ce.pushValue(uint64(old)) + frame.pc++ + case operationKindAtomicRMW16Cmpxchg: + rep := uint16(ce.popValue()) + exp := uint16(ce.popValue()) + offset := ce.popMemoryOffset(op) + if offset%2 != 0 { + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + } + memoryInst.Mux.Lock() + old, ok := memoryInst.ReadUint16Le(offset) + if !ok { + memoryInst.Mux.Unlock() + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + if old == exp { + memoryInst.WriteUint16Le(offset, rep) + } + memoryInst.Mux.Unlock() + ce.pushValue(uint64(old)) + frame.pc++ + default: + frame.pc++ + } + } + ce.popFrame() +} + +func wasmCompatMax32bits(v1, v2 uint32) uint64 { + return uint64(math.Float32bits(moremath.WasmCompatMax32( + math.Float32frombits(v1), + math.Float32frombits(v2), + ))) +} + +func wasmCompatMin32bits(v1, v2 uint32) uint64 { + return uint64(math.Float32bits(moremath.WasmCompatMin32( + math.Float32frombits(v1), + math.Float32frombits(v2), + ))) +} + +func addFloat32bits(v1, v2 uint32) uint64 { + return uint64(math.Float32bits(math.Float32frombits(v1) + math.Float32frombits(v2))) +} + +func subFloat32bits(v1, v2 uint32) uint64 { + return uint64(math.Float32bits(math.Float32frombits(v1) - math.Float32frombits(v2))) +} + +func mulFloat32bits(v1, v2 uint32) uint64 { + return uint64(math.Float32bits(math.Float32frombits(v1) * math.Float32frombits(v2))) +} + +func divFloat32bits(v1, v2 uint32) uint64 { + return uint64(math.Float32bits(math.Float32frombits(v1) / math.Float32frombits(v2))) +} + +// https://www.w3.org/TR/2022/WD-wasm-core-2-20220419/exec/numerics.html#xref-exec-numerics-op-flt-mathrm-flt-n-z-1-z-2 +func flt32(z1, z2 float32) bool { + if z1 != z1 || z2 != z2 { + return false + } else if z1 == z2 { + return false + } else if math.IsInf(float64(z1), 1) { + return false + } else if math.IsInf(float64(z1), -1) { + return 
true + } else if math.IsInf(float64(z2), 1) { + return true + } else if math.IsInf(float64(z2), -1) { + return false + } + return z1 < z2 +} + +// https://www.w3.org/TR/2022/WD-wasm-core-2-20220419/exec/numerics.html#xref-exec-numerics-op-flt-mathrm-flt-n-z-1-z-2 +func flt64(z1, z2 float64) bool { + if z1 != z1 || z2 != z2 { + return false + } else if z1 == z2 { + return false + } else if math.IsInf(z1, 1) { + return false + } else if math.IsInf(z1, -1) { + return true + } else if math.IsInf(z2, 1) { + return true + } else if math.IsInf(z2, -1) { + return false + } + return z1 < z2 +} + +func i8RoundingAverage(v1, v2 byte) byte { + // https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#lane-wise-integer-rounding-average + return byte((uint16(v1) + uint16(v2) + uint16(1)) / 2) +} + +func i16RoundingAverage(v1, v2 uint16) uint16 { + // https://github.com/WebAssembly/spec/blob/wg-2.0.draft1/proposals/simd/SIMD.md#lane-wise-integer-rounding-average + return uint16((uint32(v1) + uint32(v2) + 1) / 2) +} + +func i8Abs(v byte) byte { + if i := int8(v); i < 0 { + return byte(-i) + } else { + return byte(i) + } +} + +func i8MaxU(v1, v2 byte) byte { + if v1 < v2 { + return v2 + } else { + return v1 + } +} + +func i8MinU(v1, v2 byte) byte { + if v1 > v2 { + return v2 + } else { + return v1 + } +} + +func i8MaxS(v1, v2 byte) byte { + if int8(v1) < int8(v2) { + return v2 + } else { + return v1 + } +} + +func i8MinS(v1, v2 byte) byte { + if int8(v1) > int8(v2) { + return v2 + } else { + return v1 + } +} + +func i16MaxU(v1, v2 uint16) uint16 { + if v1 < v2 { + return v2 + } else { + return v1 + } +} + +func i16MinU(v1, v2 uint16) uint16 { + if v1 > v2 { + return v2 + } else { + return v1 + } +} + +func i16MaxS(v1, v2 uint16) uint16 { + if int16(v1) < int16(v2) { + return v2 + } else { + return v1 + } +} + +func i16MinS(v1, v2 uint16) uint16 { + if int16(v1) > int16(v2) { + return v2 + } else { + return v1 + } +} + +func i32MaxU(v1, v2 uint32) uint32 { + if v1 < v2 { + return v2 + } else { + return v1 + } +} + +func i32MinU(v1, v2 uint32) uint32 { + if v1 > v2 { + return v2 + } else { + return v1 + } +} + +func i32MaxS(v1, v2 uint32) uint32 { + if int32(v1) < int32(v2) { + return v2 + } else { + return v1 + } +} + +func i32MinS(v1, v2 uint32) uint32 { + if int32(v1) > int32(v2) { + return v2 + } else { + return v1 + } +} + +func i16Abs(v uint16) uint16 { + if i := int16(v); i < 0 { + return uint16(-i) + } else { + return uint16(i) + } +} + +func i32Abs(v uint32) uint32 { + if i := int32(v); i < 0 { + return uint32(-i) + } else { + return uint32(i) + } +} + +func (ce *callEngine) callNativeFuncWithListener(ctx context.Context, m *wasm.ModuleInstance, f *function, fnl experimental.FunctionListener) context.Context { + def, typ := f.definition(), f.funcType + + ce.stackIterator.reset(ce.stack, ce.frames, f) + fnl.Before(ctx, m, def, ce.peekValues(typ.ParamNumInUint64), &ce.stackIterator) + ce.stackIterator.clear() + ce.callNativeFunc(ctx, m, f) + fnl.After(ctx, m, def, ce.peekValues(typ.ResultNumInUint64)) + return ctx +} + +// popMemoryOffset takes a memory offset off the stack for use in load and store instructions. +// As the top of stack value is 64-bit, this ensures it is in range before returning it. 
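+// A small worked example of this effective-address check (the concrete values are
+// illustrative, not taken from any particular module): with op.U2 = 0x2000 (the static
+// memarg offset) and a dynamic address of 0xfffff000 popped from the stack, the sum is
+// 0x1_0000_1000, which exceeds math.MaxUint32, so the access traps with
+// wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess before any memory read or write happens.
+// Smaller sums are returned as a uint32 and then bounds-checked against the memory length
+// by the instruction that consumes the offset.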
+func (ce *callEngine) popMemoryOffset(op *unionOperation) uint32 { + offset := op.U2 + ce.popValue() + if offset > math.MaxUint32 { + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + } + return uint32(offset) +} + +func (ce *callEngine) callGoFuncWithStack(ctx context.Context, m *wasm.ModuleInstance, f *function) { + typ := f.funcType + paramLen := typ.ParamNumInUint64 + resultLen := typ.ResultNumInUint64 + stackLen := paramLen + + // In the interpreter engine, ce.stack may only have capacity to store + // parameters. Grow when there are more results than parameters. + if growLen := resultLen - paramLen; growLen > 0 { + for i := 0; i < growLen; i++ { + ce.stack = append(ce.stack, 0) + } + stackLen += growLen + } + + // Pass the stack elements to the go function. + stack := ce.stack[len(ce.stack)-stackLen:] + ce.callGoFunc(ctx, m, f, stack) + + // Shrink the stack when there were more parameters than results. + if shrinkLen := paramLen - resultLen; shrinkLen > 0 { + ce.stack = ce.stack[0 : len(ce.stack)-shrinkLen] + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/operations.go b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/operations.go new file mode 100644 index 000000000..3087a718f --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/operations.go @@ -0,0 +1,2812 @@ +package interpreter + +import ( + "fmt" + "math" + "strings" +) + +// unsignedInt represents unsigned 32-bit or 64-bit integers. +type unsignedInt byte + +const ( + unsignedInt32 unsignedInt = iota + unsignedInt64 +) + +// String implements fmt.Stringer. +func (s unsignedInt) String() (ret string) { + switch s { + case unsignedInt32: + ret = "i32" + case unsignedInt64: + ret = "i64" + } + return +} + +// signedInt represents signed or unsigned integers. +type signedInt byte + +const ( + signedInt32 signedInt = iota + signedInt64 + signedUint32 + signedUint64 +) + +// String implements fmt.Stringer. +func (s signedInt) String() (ret string) { + switch s { + case signedUint32: + ret = "u32" + case signedUint64: + ret = "u64" + case signedInt32: + ret = "s32" + case signedInt64: + ret = "s64" + } + return +} + +// float represents the scalar double or single precision floating points. +type float byte + +const ( + f32 float = iota + f64 +) + +// String implements fmt.Stringer. +func (s float) String() (ret string) { + switch s { + case f32: + ret = "f32" + case f64: + ret = "f64" + } + return +} + +// unsignedType is the union of unsignedInt, float and V128 vector type. +type unsignedType byte + +const ( + unsignedTypeI32 unsignedType = iota + unsignedTypeI64 + unsignedTypeF32 + unsignedTypeF64 + unsignedTypeV128 + unsignedTypeUnknown +) + +// String implements fmt.Stringer. +func (s unsignedType) String() (ret string) { + switch s { + case unsignedTypeI32: + ret = "i32" + case unsignedTypeI64: + ret = "i64" + case unsignedTypeF32: + ret = "f32" + case unsignedTypeF64: + ret = "f64" + case unsignedTypeV128: + ret = "v128" + case unsignedTypeUnknown: + ret = "unknown" + } + return +} + +// signedType is the union of signedInt and float types. +type signedType byte + +const ( + signedTypeInt32 signedType = iota + signedTypeUint32 + signedTypeInt64 + signedTypeUint64 + signedTypeFloat32 + signedTypeFloat64 +) + +// String implements fmt.Stringer. 
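+// As a point of reference for how these enums are used by the constructors later in this
+// file: sign-agnostic instructions such as i32.add lower with the unsignedType/unsignedInt
+// variants (e.g. unsignedTypeI32), while sign-sensitive ones such as i32.div_s and
+// i32.div_u lower with signedType values (signedTypeInt32 and signedTypeUint32
+// respectively), since the engines must know which division semantics to apply.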
+func (s signedType) String() (ret string) { + switch s { + case signedTypeInt32: + ret = "s32" + case signedTypeUint32: + ret = "u32" + case signedTypeInt64: + ret = "s64" + case signedTypeUint64: + ret = "u64" + case signedTypeFloat32: + ret = "f32" + case signedTypeFloat64: + ret = "f64" + } + return +} + +// operationKind is the Kind of each implementation of Operation interface. +type operationKind uint16 + +// String implements fmt.Stringer. +func (o operationKind) String() (ret string) { + switch o { + case operationKindUnreachable: + ret = "Unreachable" + case operationKindLabel: + ret = "label" + case operationKindBr: + ret = "Br" + case operationKindBrIf: + ret = "BrIf" + case operationKindBrTable: + ret = "BrTable" + case operationKindCall: + ret = "Call" + case operationKindCallIndirect: + ret = "CallIndirect" + case operationKindDrop: + ret = "Drop" + case operationKindSelect: + ret = "Select" + case operationKindPick: + ret = "Pick" + case operationKindSet: + ret = "Swap" + case operationKindGlobalGet: + ret = "GlobalGet" + case operationKindGlobalSet: + ret = "GlobalSet" + case operationKindLoad: + ret = "Load" + case operationKindLoad8: + ret = "Load8" + case operationKindLoad16: + ret = "Load16" + case operationKindLoad32: + ret = "Load32" + case operationKindStore: + ret = "Store" + case operationKindStore8: + ret = "Store8" + case operationKindStore16: + ret = "Store16" + case operationKindStore32: + ret = "Store32" + case operationKindMemorySize: + ret = "MemorySize" + case operationKindMemoryGrow: + ret = "MemoryGrow" + case operationKindConstI32: + ret = "ConstI32" + case operationKindConstI64: + ret = "ConstI64" + case operationKindConstF32: + ret = "ConstF32" + case operationKindConstF64: + ret = "ConstF64" + case operationKindEq: + ret = "Eq" + case operationKindNe: + ret = "Ne" + case operationKindEqz: + ret = "Eqz" + case operationKindLt: + ret = "Lt" + case operationKindGt: + ret = "Gt" + case operationKindLe: + ret = "Le" + case operationKindGe: + ret = "Ge" + case operationKindAdd: + ret = "Add" + case operationKindSub: + ret = "Sub" + case operationKindMul: + ret = "Mul" + case operationKindClz: + ret = "Clz" + case operationKindCtz: + ret = "Ctz" + case operationKindPopcnt: + ret = "Popcnt" + case operationKindDiv: + ret = "Div" + case operationKindRem: + ret = "Rem" + case operationKindAnd: + ret = "And" + case operationKindOr: + ret = "Or" + case operationKindXor: + ret = "Xor" + case operationKindShl: + ret = "Shl" + case operationKindShr: + ret = "Shr" + case operationKindRotl: + ret = "Rotl" + case operationKindRotr: + ret = "Rotr" + case operationKindAbs: + ret = "Abs" + case operationKindNeg: + ret = "Neg" + case operationKindCeil: + ret = "Ceil" + case operationKindFloor: + ret = "Floor" + case operationKindTrunc: + ret = "Trunc" + case operationKindNearest: + ret = "Nearest" + case operationKindSqrt: + ret = "Sqrt" + case operationKindMin: + ret = "Min" + case operationKindMax: + ret = "Max" + case operationKindCopysign: + ret = "Copysign" + case operationKindI32WrapFromI64: + ret = "I32WrapFromI64" + case operationKindITruncFromF: + ret = "ITruncFromF" + case operationKindFConvertFromI: + ret = "FConvertFromI" + case operationKindF32DemoteFromF64: + ret = "F32DemoteFromF64" + case operationKindF64PromoteFromF32: + ret = "F64PromoteFromF32" + case operationKindI32ReinterpretFromF32: + ret = "I32ReinterpretFromF32" + case operationKindI64ReinterpretFromF64: + ret = "I64ReinterpretFromF64" + case operationKindF32ReinterpretFromI32: + ret = 
"F32ReinterpretFromI32" + case operationKindF64ReinterpretFromI64: + ret = "F64ReinterpretFromI64" + case operationKindExtend: + ret = "Extend" + case operationKindMemoryInit: + ret = "MemoryInit" + case operationKindDataDrop: + ret = "DataDrop" + case operationKindMemoryCopy: + ret = "MemoryCopy" + case operationKindMemoryFill: + ret = "MemoryFill" + case operationKindTableInit: + ret = "TableInit" + case operationKindElemDrop: + ret = "ElemDrop" + case operationKindTableCopy: + ret = "TableCopy" + case operationKindRefFunc: + ret = "RefFunc" + case operationKindTableGet: + ret = "TableGet" + case operationKindTableSet: + ret = "TableSet" + case operationKindTableSize: + ret = "TableSize" + case operationKindTableGrow: + ret = "TableGrow" + case operationKindTableFill: + ret = "TableFill" + case operationKindV128Const: + ret = "ConstV128" + case operationKindV128Add: + ret = "V128Add" + case operationKindV128Sub: + ret = "V128Sub" + case operationKindV128Load: + ret = "V128Load" + case operationKindV128LoadLane: + ret = "V128LoadLane" + case operationKindV128Store: + ret = "V128Store" + case operationKindV128StoreLane: + ret = "V128StoreLane" + case operationKindV128ExtractLane: + ret = "V128ExtractLane" + case operationKindV128ReplaceLane: + ret = "V128ReplaceLane" + case operationKindV128Splat: + ret = "V128Splat" + case operationKindV128Shuffle: + ret = "V128Shuffle" + case operationKindV128Swizzle: + ret = "V128Swizzle" + case operationKindV128AnyTrue: + ret = "V128AnyTrue" + case operationKindV128AllTrue: + ret = "V128AllTrue" + case operationKindV128And: + ret = "V128And" + case operationKindV128Not: + ret = "V128Not" + case operationKindV128Or: + ret = "V128Or" + case operationKindV128Xor: + ret = "V128Xor" + case operationKindV128Bitselect: + ret = "V128Bitselect" + case operationKindV128AndNot: + ret = "V128AndNot" + case operationKindV128BitMask: + ret = "V128BitMask" + case operationKindV128Shl: + ret = "V128Shl" + case operationKindV128Shr: + ret = "V128Shr" + case operationKindV128Cmp: + ret = "V128Cmp" + case operationKindSignExtend32From8: + ret = "SignExtend32From8" + case operationKindSignExtend32From16: + ret = "SignExtend32From16" + case operationKindSignExtend64From8: + ret = "SignExtend64From8" + case operationKindSignExtend64From16: + ret = "SignExtend64From16" + case operationKindSignExtend64From32: + ret = "SignExtend64From32" + case operationKindV128AddSat: + ret = "V128AddSat" + case operationKindV128SubSat: + ret = "V128SubSat" + case operationKindV128Mul: + ret = "V128Mul" + case operationKindV128Div: + ret = "V128Div" + case operationKindV128Neg: + ret = "V128Neg" + case operationKindV128Sqrt: + ret = "V128Sqrt" + case operationKindV128Abs: + ret = "V128Abs" + case operationKindV128Popcnt: + ret = "V128Popcnt" + case operationKindV128Min: + ret = "V128Min" + case operationKindV128Max: + ret = "V128Max" + case operationKindV128AvgrU: + ret = "V128AvgrU" + case operationKindV128Ceil: + ret = "V128Ceil" + case operationKindV128Floor: + ret = "V128Floor" + case operationKindV128Trunc: + ret = "V128Trunc" + case operationKindV128Nearest: + ret = "V128Nearest" + case operationKindV128Pmin: + ret = "V128Pmin" + case operationKindV128Pmax: + ret = "V128Pmax" + case operationKindV128Extend: + ret = "V128Extend" + case operationKindV128ExtMul: + ret = "V128ExtMul" + case operationKindV128Q15mulrSatS: + ret = "V128Q15mulrSatS" + case operationKindV128ExtAddPairwise: + ret = "V128ExtAddPairwise" + case operationKindV128FloatPromote: + ret = "V128FloatPromote" + case 
operationKindV128FloatDemote: + ret = "V128FloatDemote" + case operationKindV128FConvertFromI: + ret = "V128FConvertFromI" + case operationKindV128Dot: + ret = "V128Dot" + case operationKindV128Narrow: + ret = "V128Narrow" + case operationKindV128ITruncSatFromF: + ret = "V128ITruncSatFromF" + case operationKindBuiltinFunctionCheckExitCode: + ret = "BuiltinFunctionCheckExitCode" + case operationKindAtomicMemoryWait: + ret = "operationKindAtomicMemoryWait" + case operationKindAtomicMemoryNotify: + ret = "operationKindAtomicMemoryNotify" + case operationKindAtomicFence: + ret = "operationKindAtomicFence" + case operationKindAtomicLoad: + ret = "operationKindAtomicLoad" + case operationKindAtomicLoad8: + ret = "operationKindAtomicLoad8" + case operationKindAtomicLoad16: + ret = "operationKindAtomicLoad16" + case operationKindAtomicStore: + ret = "operationKindAtomicStore" + case operationKindAtomicStore8: + ret = "operationKindAtomicStore8" + case operationKindAtomicStore16: + ret = "operationKindAtomicStore16" + case operationKindAtomicRMW: + ret = "operationKindAtomicRMW" + case operationKindAtomicRMW8: + ret = "operationKindAtomicRMW8" + case operationKindAtomicRMW16: + ret = "operationKindAtomicRMW16" + case operationKindAtomicRMWCmpxchg: + ret = "operationKindAtomicRMWCmpxchg" + case operationKindAtomicRMW8Cmpxchg: + ret = "operationKindAtomicRMW8Cmpxchg" + case operationKindAtomicRMW16Cmpxchg: + ret = "operationKindAtomicRMW16Cmpxchg" + default: + panic(fmt.Errorf("unknown operation %d", o)) + } + return +} + +const ( + // operationKindUnreachable is the Kind for NewOperationUnreachable. + operationKindUnreachable operationKind = iota + // operationKindLabel is the Kind for NewOperationLabel. + operationKindLabel + // operationKindBr is the Kind for NewOperationBr. + operationKindBr + // operationKindBrIf is the Kind for NewOperationBrIf. + operationKindBrIf + // operationKindBrTable is the Kind for NewOperationBrTable. + operationKindBrTable + // operationKindCall is the Kind for NewOperationCall. + operationKindCall + // operationKindCallIndirect is the Kind for NewOperationCallIndirect. + operationKindCallIndirect + // operationKindDrop is the Kind for NewOperationDrop. + operationKindDrop + // operationKindSelect is the Kind for NewOperationSelect. + operationKindSelect + // operationKindPick is the Kind for NewOperationPick. + operationKindPick + // operationKindSet is the Kind for NewOperationSet. + operationKindSet + // operationKindGlobalGet is the Kind for NewOperationGlobalGet. + operationKindGlobalGet + // operationKindGlobalSet is the Kind for NewOperationGlobalSet. + operationKindGlobalSet + // operationKindLoad is the Kind for NewOperationLoad. + operationKindLoad + // operationKindLoad8 is the Kind for NewOperationLoad8. + operationKindLoad8 + // operationKindLoad16 is the Kind for NewOperationLoad16. + operationKindLoad16 + // operationKindLoad32 is the Kind for NewOperationLoad32. + operationKindLoad32 + // operationKindStore is the Kind for NewOperationStore. + operationKindStore + // operationKindStore8 is the Kind for NewOperationStore8. + operationKindStore8 + // operationKindStore16 is the Kind for NewOperationStore16. + operationKindStore16 + // operationKindStore32 is the Kind for NewOperationStore32. + operationKindStore32 + // operationKindMemorySize is the Kind for NewOperationMemorySize. + operationKindMemorySize + // operationKindMemoryGrow is the Kind for NewOperationMemoryGrow. 
+ operationKindMemoryGrow + // operationKindConstI32 is the Kind for NewOperationConstI32. + operationKindConstI32 + // operationKindConstI64 is the Kind for NewOperationConstI64. + operationKindConstI64 + // operationKindConstF32 is the Kind for NewOperationConstF32. + operationKindConstF32 + // operationKindConstF64 is the Kind for NewOperationConstF64. + operationKindConstF64 + // operationKindEq is the Kind for NewOperationEq. + operationKindEq + // operationKindNe is the Kind for NewOperationNe. + operationKindNe + // operationKindEqz is the Kind for NewOperationEqz. + operationKindEqz + // operationKindLt is the Kind for NewOperationLt. + operationKindLt + // operationKindGt is the Kind for NewOperationGt. + operationKindGt + // operationKindLe is the Kind for NewOperationLe. + operationKindLe + // operationKindGe is the Kind for NewOperationGe. + operationKindGe + // operationKindAdd is the Kind for NewOperationAdd. + operationKindAdd + // operationKindSub is the Kind for NewOperationSub. + operationKindSub + // operationKindMul is the Kind for NewOperationMul. + operationKindMul + // operationKindClz is the Kind for NewOperationClz. + operationKindClz + // operationKindCtz is the Kind for NewOperationCtz. + operationKindCtz + // operationKindPopcnt is the Kind for NewOperationPopcnt. + operationKindPopcnt + // operationKindDiv is the Kind for NewOperationDiv. + operationKindDiv + // operationKindRem is the Kind for NewOperationRem. + operationKindRem + // operationKindAnd is the Kind for NewOperationAnd. + operationKindAnd + // operationKindOr is the Kind for NewOperationOr. + operationKindOr + // operationKindXor is the Kind for NewOperationXor. + operationKindXor + // operationKindShl is the Kind for NewOperationShl. + operationKindShl + // operationKindShr is the Kind for NewOperationShr. + operationKindShr + // operationKindRotl is the Kind for NewOperationRotl. + operationKindRotl + // operationKindRotr is the Kind for NewOperationRotr. + operationKindRotr + // operationKindAbs is the Kind for NewOperationAbs. + operationKindAbs + // operationKindNeg is the Kind for NewOperationNeg. + operationKindNeg + // operationKindCeil is the Kind for NewOperationCeil. + operationKindCeil + // operationKindFloor is the Kind for NewOperationFloor. + operationKindFloor + // operationKindTrunc is the Kind for NewOperationTrunc. + operationKindTrunc + // operationKindNearest is the Kind for NewOperationNearest. + operationKindNearest + // operationKindSqrt is the Kind for NewOperationSqrt. + operationKindSqrt + // operationKindMin is the Kind for NewOperationMin. + operationKindMin + // operationKindMax is the Kind for NewOperationMax. + operationKindMax + // operationKindCopysign is the Kind for NewOperationCopysign. + operationKindCopysign + // operationKindI32WrapFromI64 is the Kind for NewOperationI32WrapFromI64. + operationKindI32WrapFromI64 + // operationKindITruncFromF is the Kind for NewOperationITruncFromF. + operationKindITruncFromF + // operationKindFConvertFromI is the Kind for NewOperationFConvertFromI. + operationKindFConvertFromI + // operationKindF32DemoteFromF64 is the Kind for NewOperationF32DemoteFromF64. + operationKindF32DemoteFromF64 + // operationKindF64PromoteFromF32 is the Kind for NewOperationF64PromoteFromF32. + operationKindF64PromoteFromF32 + // operationKindI32ReinterpretFromF32 is the Kind for NewOperationI32ReinterpretFromF32. + operationKindI32ReinterpretFromF32 + // operationKindI64ReinterpretFromF64 is the Kind for NewOperationI64ReinterpretFromF64. 
+ operationKindI64ReinterpretFromF64 + // operationKindF32ReinterpretFromI32 is the Kind for NewOperationF32ReinterpretFromI32. + operationKindF32ReinterpretFromI32 + // operationKindF64ReinterpretFromI64 is the Kind for NewOperationF64ReinterpretFromI64. + operationKindF64ReinterpretFromI64 + // operationKindExtend is the Kind for NewOperationExtend. + operationKindExtend + // operationKindSignExtend32From8 is the Kind for NewOperationSignExtend32From8. + operationKindSignExtend32From8 + // operationKindSignExtend32From16 is the Kind for NewOperationSignExtend32From16. + operationKindSignExtend32From16 + // operationKindSignExtend64From8 is the Kind for NewOperationSignExtend64From8. + operationKindSignExtend64From8 + // operationKindSignExtend64From16 is the Kind for NewOperationSignExtend64From16. + operationKindSignExtend64From16 + // operationKindSignExtend64From32 is the Kind for NewOperationSignExtend64From32. + operationKindSignExtend64From32 + // operationKindMemoryInit is the Kind for NewOperationMemoryInit. + operationKindMemoryInit + // operationKindDataDrop is the Kind for NewOperationDataDrop. + operationKindDataDrop + // operationKindMemoryCopy is the Kind for NewOperationMemoryCopy. + operationKindMemoryCopy + // operationKindMemoryFill is the Kind for NewOperationMemoryFill. + operationKindMemoryFill + // operationKindTableInit is the Kind for NewOperationTableInit. + operationKindTableInit + // operationKindElemDrop is the Kind for NewOperationElemDrop. + operationKindElemDrop + // operationKindTableCopy is the Kind for NewOperationTableCopy. + operationKindTableCopy + // operationKindRefFunc is the Kind for NewOperationRefFunc. + operationKindRefFunc + // operationKindTableGet is the Kind for NewOperationTableGet. + operationKindTableGet + // operationKindTableSet is the Kind for NewOperationTableSet. + operationKindTableSet + // operationKindTableSize is the Kind for NewOperationTableSize. + operationKindTableSize + // operationKindTableGrow is the Kind for NewOperationTableGrow. + operationKindTableGrow + // operationKindTableFill is the Kind for NewOperationTableFill. + operationKindTableFill + + // Vector value related instructions are prefixed by V128. + + // operationKindV128Const is the Kind for NewOperationV128Const. + operationKindV128Const + // operationKindV128Add is the Kind for NewOperationV128Add. + operationKindV128Add + // operationKindV128Sub is the Kind for NewOperationV128Sub. + operationKindV128Sub + // operationKindV128Load is the Kind for NewOperationV128Load. + operationKindV128Load + // operationKindV128LoadLane is the Kind for NewOperationV128LoadLane. + operationKindV128LoadLane + // operationKindV128Store is the Kind for NewOperationV128Store. + operationKindV128Store + // operationKindV128StoreLane is the Kind for NewOperationV128StoreLane. + operationKindV128StoreLane + // operationKindV128ExtractLane is the Kind for NewOperationV128ExtractLane. + operationKindV128ExtractLane + // operationKindV128ReplaceLane is the Kind for NewOperationV128ReplaceLane. + operationKindV128ReplaceLane + // operationKindV128Splat is the Kind for NewOperationV128Splat. + operationKindV128Splat + // operationKindV128Shuffle is the Kind for NewOperationV128Shuffle. + operationKindV128Shuffle + // operationKindV128Swizzle is the Kind for NewOperationV128Swizzle. + operationKindV128Swizzle + // operationKindV128AnyTrue is the Kind for NewOperationV128AnyTrue. + operationKindV128AnyTrue + // operationKindV128AllTrue is the Kind for NewOperationV128AllTrue. 
+ operationKindV128AllTrue + // operationKindV128BitMask is the Kind for NewOperationV128BitMask. + operationKindV128BitMask + // operationKindV128And is the Kind for NewOperationV128And. + operationKindV128And + // operationKindV128Not is the Kind for NewOperationV128Not. + operationKindV128Not + // operationKindV128Or is the Kind for NewOperationV128Or. + operationKindV128Or + // operationKindV128Xor is the Kind for NewOperationV128Xor. + operationKindV128Xor + // operationKindV128Bitselect is the Kind for NewOperationV128Bitselect. + operationKindV128Bitselect + // operationKindV128AndNot is the Kind for NewOperationV128AndNot. + operationKindV128AndNot + // operationKindV128Shl is the Kind for NewOperationV128Shl. + operationKindV128Shl + // operationKindV128Shr is the Kind for NewOperationV128Shr. + operationKindV128Shr + // operationKindV128Cmp is the Kind for NewOperationV128Cmp. + operationKindV128Cmp + // operationKindV128AddSat is the Kind for NewOperationV128AddSat. + operationKindV128AddSat + // operationKindV128SubSat is the Kind for NewOperationV128SubSat. + operationKindV128SubSat + // operationKindV128Mul is the Kind for NewOperationV128Mul. + operationKindV128Mul + // operationKindV128Div is the Kind for NewOperationV128Div. + operationKindV128Div + // operationKindV128Neg is the Kind for NewOperationV128Neg. + operationKindV128Neg + // operationKindV128Sqrt is the Kind for NewOperationV128Sqrt. + operationKindV128Sqrt + // operationKindV128Abs is the Kind for NewOperationV128Abs. + operationKindV128Abs + // operationKindV128Popcnt is the Kind for NewOperationV128Popcnt. + operationKindV128Popcnt + // operationKindV128Min is the Kind for NewOperationV128Min. + operationKindV128Min + // operationKindV128Max is the Kind for NewOperationV128Max. + operationKindV128Max + // operationKindV128AvgrU is the Kind for NewOperationV128AvgrU. + operationKindV128AvgrU + // operationKindV128Pmin is the Kind for NewOperationV128Pmin. + operationKindV128Pmin + // operationKindV128Pmax is the Kind for NewOperationV128Pmax. + operationKindV128Pmax + // operationKindV128Ceil is the Kind for NewOperationV128Ceil. + operationKindV128Ceil + // operationKindV128Floor is the Kind for NewOperationV128Floor. + operationKindV128Floor + // operationKindV128Trunc is the Kind for NewOperationV128Trunc. + operationKindV128Trunc + // operationKindV128Nearest is the Kind for NewOperationV128Nearest. + operationKindV128Nearest + // operationKindV128Extend is the Kind for NewOperationV128Extend. + operationKindV128Extend + // operationKindV128ExtMul is the Kind for NewOperationV128ExtMul. + operationKindV128ExtMul + // operationKindV128Q15mulrSatS is the Kind for NewOperationV128Q15mulrSatS. + operationKindV128Q15mulrSatS + // operationKindV128ExtAddPairwise is the Kind for NewOperationV128ExtAddPairwise. + operationKindV128ExtAddPairwise + // operationKindV128FloatPromote is the Kind for NewOperationV128FloatPromote. + operationKindV128FloatPromote + // operationKindV128FloatDemote is the Kind for NewOperationV128FloatDemote. + operationKindV128FloatDemote + // operationKindV128FConvertFromI is the Kind for NewOperationV128FConvertFromI. + operationKindV128FConvertFromI + // operationKindV128Dot is the Kind for NewOperationV128Dot. + operationKindV128Dot + // operationKindV128Narrow is the Kind for NewOperationV128Narrow. + operationKindV128Narrow + // operationKindV128ITruncSatFromF is the Kind for NewOperationV128ITruncSatFromF. 
+ operationKindV128ITruncSatFromF + + // operationKindBuiltinFunctionCheckExitCode is the Kind for NewOperationBuiltinFunctionCheckExitCode. + operationKindBuiltinFunctionCheckExitCode + + // operationKindAtomicMemoryWait is the kind for NewOperationAtomicMemoryWait. + operationKindAtomicMemoryWait + // operationKindAtomicMemoryNotify is the kind for NewOperationAtomicMemoryNotify. + operationKindAtomicMemoryNotify + // operationKindAtomicFence is the kind for NewOperationAtomicFence. + operationKindAtomicFence + // operationKindAtomicLoad is the kind for NewOperationAtomicLoad. + operationKindAtomicLoad + // operationKindAtomicLoad8 is the kind for NewOperationAtomicLoad8. + operationKindAtomicLoad8 + // operationKindAtomicLoad16 is the kind for NewOperationAtomicLoad16. + operationKindAtomicLoad16 + // operationKindAtomicStore is the kind for NewOperationAtomicStore. + operationKindAtomicStore + // operationKindAtomicStore8 is the kind for NewOperationAtomicStore8. + operationKindAtomicStore8 + // operationKindAtomicStore16 is the kind for NewOperationAtomicStore16. + operationKindAtomicStore16 + + // operationKindAtomicRMW is the kind for NewOperationAtomicRMW. + operationKindAtomicRMW + // operationKindAtomicRMW8 is the kind for NewOperationAtomicRMW8. + operationKindAtomicRMW8 + // operationKindAtomicRMW16 is the kind for NewOperationAtomicRMW16. + operationKindAtomicRMW16 + + // operationKindAtomicRMWCmpxchg is the kind for NewOperationAtomicRMWCmpxchg. + operationKindAtomicRMWCmpxchg + // operationKindAtomicRMW8Cmpxchg is the kind for NewOperationAtomicRMW8Cmpxchg. + operationKindAtomicRMW8Cmpxchg + // operationKindAtomicRMW16Cmpxchg is the kind for NewOperationAtomicRMW16Cmpxchg. + operationKindAtomicRMW16Cmpxchg + + // operationKindEnd is always placed at the bottom of this iota definition to be used in the test. + operationKindEnd +) + +// NewOperationBuiltinFunctionCheckExitCode is a constructor for unionOperation with Kind operationKindBuiltinFunctionCheckExitCode. +// +// OperationBuiltinFunctionCheckExitCode corresponds to the instruction to check the api.Module is already closed due to +// context.DeadlineExceeded, context.Canceled, or the explicit call of CloseWithExitCode on api.Module. +func newOperationBuiltinFunctionCheckExitCode() unionOperation { + return unionOperation{Kind: operationKindBuiltinFunctionCheckExitCode} +} + +// label is the unique identifier for each block in a single function in interpreterir +// where "block" consists of multiple operations, and must End with branching operations +// (e.g. operationKindBr or operationKindBrIf). +type label uint64 + +// Kind returns the labelKind encoded in this label. +func (l label) Kind() labelKind { + return labelKind(uint32(l)) +} + +// FrameID returns the frame id encoded in this label. +func (l label) FrameID() int { + return int(uint32(l >> 32)) +} + +// NewLabel is a constructor for a label. +func newLabel(kind labelKind, frameID uint32) label { + return label(kind) | label(frameID)<<32 +} + +// String implements fmt.Stringer. +func (l label) String() (ret string) { + frameID := l.FrameID() + switch l.Kind() { + case labelKindHeader: + ret = fmt.Sprintf(".L%d", frameID) + case labelKindElse: + ret = fmt.Sprintf(".L%d_else", frameID) + case labelKindContinuation: + ret = fmt.Sprintf(".L%d_cont", frameID) + case labelKindReturn: + return ".return" + } + return +} + +func (l label) IsReturnTarget() bool { + return l.Kind() == labelKindReturn +} + +// labelKind is the Kind of the label. 
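+// For example, assuming the labelKind iota order declared below (header=0, else=1,
+// continuation=2, return=3), newLabel(labelKindContinuation, 3) encodes to
+// 0x0000_0003_0000_0002: Kind() recovers labelKindContinuation from the low 32 bits,
+// FrameID() recovers frame id 3 from the high 32 bits, and String() renders ".L3_cont".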
+type labelKind = byte + +const ( + // labelKindHeader is the header for various blocks. For example, the "then" block of + // wasm.OpcodeIfName in Wasm has the label of this Kind. + labelKindHeader labelKind = iota + // labelKindElse is the Kind of label for "else" block of wasm.OpcodeIfName in Wasm. + labelKindElse + // labelKindContinuation is the Kind of label which is the continuation of blocks. + // For example, for wasm text like + // (func + // .... + // (if (local.get 0) (then (nop)) (else (nop))) + // return + // ) + // we have the continuation block (of if-block) corresponding to "return" opcode. + labelKindContinuation + labelKindReturn + labelKindNum +) + +// unionOperation implements Operation and is the compilation (engine.lowerIR) result of a interpreterir.Operation. +// +// Not all operations result in a unionOperation, e.g. interpreterir.OperationI32ReinterpretFromF32, and some operations are +// more complex than others, e.g. interpreterir.NewOperationBrTable. +// +// Note: This is a form of union type as it can store fields needed for any operation. Hence, most fields are opaque and +// only relevant when in context of its kind. +type unionOperation struct { + // Kind determines how to interpret the other fields in this struct. + Kind operationKind + B1, B2 byte + B3 bool + U1, U2 uint64 + U3 uint64 + Us []uint64 +} + +// String implements fmt.Stringer. +func (o unionOperation) String() string { + switch o.Kind { + case operationKindUnreachable, + operationKindSelect, + operationKindMemorySize, + operationKindMemoryGrow, + operationKindI32WrapFromI64, + operationKindF32DemoteFromF64, + operationKindF64PromoteFromF32, + operationKindI32ReinterpretFromF32, + operationKindI64ReinterpretFromF64, + operationKindF32ReinterpretFromI32, + operationKindF64ReinterpretFromI64, + operationKindSignExtend32From8, + operationKindSignExtend32From16, + operationKindSignExtend64From8, + operationKindSignExtend64From16, + operationKindSignExtend64From32, + operationKindMemoryInit, + operationKindDataDrop, + operationKindMemoryCopy, + operationKindMemoryFill, + operationKindTableInit, + operationKindElemDrop, + operationKindTableCopy, + operationKindRefFunc, + operationKindTableGet, + operationKindTableSet, + operationKindTableSize, + operationKindTableGrow, + operationKindTableFill, + operationKindBuiltinFunctionCheckExitCode: + return o.Kind.String() + + case operationKindCall, + operationKindGlobalGet, + operationKindGlobalSet: + return fmt.Sprintf("%s %d", o.Kind, o.B1) + + case operationKindLabel: + return label(o.U1).String() + + case operationKindBr: + return fmt.Sprintf("%s %s", o.Kind, label(o.U1).String()) + + case operationKindBrIf: + thenTarget := label(o.U1) + elseTarget := label(o.U2) + return fmt.Sprintf("%s %s, %s", o.Kind, thenTarget, elseTarget) + + case operationKindBrTable: + var targets []string + var defaultLabel label + if len(o.Us) > 0 { + targets = make([]string, len(o.Us)-1) + for i, t := range o.Us[1:] { + targets[i] = label(t).String() + } + defaultLabel = label(o.Us[0]) + } + return fmt.Sprintf("%s [%s] %s", o.Kind, strings.Join(targets, ","), defaultLabel) + + case operationKindCallIndirect: + return fmt.Sprintf("%s: type=%d, table=%d", o.Kind, o.U1, o.U2) + + case operationKindDrop: + start := int64(o.U1) + end := int64(o.U2) + return fmt.Sprintf("%s %d..%d", o.Kind, start, end) + + case operationKindPick, operationKindSet: + return fmt.Sprintf("%s %d (is_vector=%v)", o.Kind, o.U1, o.B3) + + case operationKindLoad, operationKindStore: + return 
fmt.Sprintf("%s.%s (align=%d, offset=%d)", unsignedType(o.B1), o.Kind, o.U1, o.U2) + + case operationKindLoad8, + operationKindLoad16: + return fmt.Sprintf("%s.%s (align=%d, offset=%d)", signedType(o.B1), o.Kind, o.U1, o.U2) + + case operationKindStore8, + operationKindStore16, + operationKindStore32: + return fmt.Sprintf("%s (align=%d, offset=%d)", o.Kind, o.U1, o.U2) + + case operationKindLoad32: + var t string + if o.B1 == 1 { + t = "i64" + } else { + t = "u64" + } + return fmt.Sprintf("%s.%s (align=%d, offset=%d)", t, o.Kind, o.U1, o.U2) + + case operationKindEq, + operationKindNe, + operationKindAdd, + operationKindSub, + operationKindMul: + return fmt.Sprintf("%s.%s", unsignedType(o.B1), o.Kind) + + case operationKindEqz, + operationKindClz, + operationKindCtz, + operationKindPopcnt, + operationKindAnd, + operationKindOr, + operationKindXor, + operationKindShl, + operationKindRotl, + operationKindRotr: + return fmt.Sprintf("%s.%s", unsignedInt(o.B1), o.Kind) + + case operationKindRem, operationKindShr: + return fmt.Sprintf("%s.%s", signedInt(o.B1), o.Kind) + + case operationKindLt, + operationKindGt, + operationKindLe, + operationKindGe, + operationKindDiv: + return fmt.Sprintf("%s.%s", signedType(o.B1), o.Kind) + + case operationKindAbs, + operationKindNeg, + operationKindCeil, + operationKindFloor, + operationKindTrunc, + operationKindNearest, + operationKindSqrt, + operationKindMin, + operationKindMax, + operationKindCopysign: + return fmt.Sprintf("%s.%s", float(o.B1), o.Kind) + + case operationKindConstI32, + operationKindConstI64: + return fmt.Sprintf("%s %#x", o.Kind, o.U1) + + case operationKindConstF32: + return fmt.Sprintf("%s %f", o.Kind, math.Float32frombits(uint32(o.U1))) + case operationKindConstF64: + return fmt.Sprintf("%s %f", o.Kind, math.Float64frombits(o.U1)) + + case operationKindITruncFromF: + return fmt.Sprintf("%s.%s.%s (non_trapping=%v)", signedInt(o.B2), o.Kind, float(o.B1), o.B3) + case operationKindFConvertFromI: + return fmt.Sprintf("%s.%s.%s", float(o.B2), o.Kind, signedInt(o.B1)) + case operationKindExtend: + var in, out string + if o.B3 { + in = "i32" + out = "i64" + } else { + in = "u32" + out = "u64" + } + return fmt.Sprintf("%s.%s.%s", out, o.Kind, in) + + case operationKindV128Const: + return fmt.Sprintf("%s [%#x, %#x]", o.Kind, o.U1, o.U2) + case operationKindV128Add, + operationKindV128Sub: + return fmt.Sprintf("%s (shape=%s)", o.Kind, shapeName(o.B1)) + case operationKindV128Load, + operationKindV128LoadLane, + operationKindV128Store, + operationKindV128StoreLane, + operationKindV128ExtractLane, + operationKindV128ReplaceLane, + operationKindV128Splat, + operationKindV128Shuffle, + operationKindV128Swizzle, + operationKindV128AnyTrue, + operationKindV128AllTrue, + operationKindV128BitMask, + operationKindV128And, + operationKindV128Not, + operationKindV128Or, + operationKindV128Xor, + operationKindV128Bitselect, + operationKindV128AndNot, + operationKindV128Shl, + operationKindV128Shr, + operationKindV128Cmp, + operationKindV128AddSat, + operationKindV128SubSat, + operationKindV128Mul, + operationKindV128Div, + operationKindV128Neg, + operationKindV128Sqrt, + operationKindV128Abs, + operationKindV128Popcnt, + operationKindV128Min, + operationKindV128Max, + operationKindV128AvgrU, + operationKindV128Pmin, + operationKindV128Pmax, + operationKindV128Ceil, + operationKindV128Floor, + operationKindV128Trunc, + operationKindV128Nearest, + operationKindV128Extend, + operationKindV128ExtMul, + operationKindV128Q15mulrSatS, + 
operationKindV128ExtAddPairwise, + operationKindV128FloatPromote, + operationKindV128FloatDemote, + operationKindV128FConvertFromI, + operationKindV128Dot, + operationKindV128Narrow: + return o.Kind.String() + + case operationKindV128ITruncSatFromF: + if o.B3 { + return fmt.Sprintf("%s.%sS", o.Kind, shapeName(o.B1)) + } else { + return fmt.Sprintf("%s.%sU", o.Kind, shapeName(o.B1)) + } + + case operationKindAtomicMemoryWait, + operationKindAtomicMemoryNotify, + operationKindAtomicFence, + operationKindAtomicLoad, + operationKindAtomicLoad8, + operationKindAtomicLoad16, + operationKindAtomicStore, + operationKindAtomicStore8, + operationKindAtomicStore16, + operationKindAtomicRMW, + operationKindAtomicRMW8, + operationKindAtomicRMW16, + operationKindAtomicRMWCmpxchg, + operationKindAtomicRMW8Cmpxchg, + operationKindAtomicRMW16Cmpxchg: + return o.Kind.String() + + default: + panic(fmt.Sprintf("TODO: %v", o.Kind)) + } +} + +// NewOperationUnreachable is a constructor for unionOperation with operationKindUnreachable +// +// This corresponds to wasm.OpcodeUnreachable. +// +// The engines are expected to exit the execution with wasmruntime.ErrRuntimeUnreachable error. +func newOperationUnreachable() unionOperation { + return unionOperation{Kind: operationKindUnreachable} +} + +// NewOperationLabel is a constructor for unionOperation with operationKindLabel. +// +// This is used to inform the engines of the beginning of a label. +func newOperationLabel(label label) unionOperation { + return unionOperation{Kind: operationKindLabel, U1: uint64(label)} +} + +// NewOperationBr is a constructor for unionOperation with operationKindBr. +// +// The engines are expected to branch into U1 label. +func newOperationBr(target label) unionOperation { + return unionOperation{Kind: operationKindBr, U1: uint64(target)} +} + +// NewOperationBrIf is a constructor for unionOperation with operationKindBrIf. +// +// The engines are expected to pop a value and branch into U1 label if the value equals 1. +// Otherwise, the code branches into U2 label. +func newOperationBrIf(thenTarget, elseTarget label, thenDrop inclusiveRange) unionOperation { + return unionOperation{ + Kind: operationKindBrIf, + U1: uint64(thenTarget), + U2: uint64(elseTarget), + U3: thenDrop.AsU64(), + } +} + +// NewOperationBrTable is a constructor for unionOperation with operationKindBrTable. +// +// This corresponds to wasm.OpcodeBrTableName except that the label +// here means the interpreterir level, not the ones of Wasm. +// +// The engines are expected to do the br_table operation based on the default (Us[len(Us)-1], Us[len(Us)-2]) and +// targets (Us[:len(Us)-1], Rs[:len(Us)-1]). More precisely, this pops a value from the stack (called "index") +// and decides which branch we go into next based on the value. +// +// For example, assume we have operations like {default: L_DEFAULT, targets: [L0, L1, L2]}. +// If "index" >= len(defaults), then branch into the L_DEFAULT label. +// Otherwise, we enter label of targets[index]. +func newOperationBrTable(targetLabelsAndRanges []uint64) unionOperation { + return unionOperation{ + Kind: operationKindBrTable, + Us: targetLabelsAndRanges, + } +} + +// NewOperationCall is a constructor for unionOperation with operationKindCall. +// +// This corresponds to wasm.OpcodeCallName, and engines are expected to +// enter into a function whose index equals OperationCall.FunctionIndex. 
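+// For illustration (the index below is hypothetical): lowering a Wasm `call 3`
+// instruction produces newOperationCall(3), i.e. U1 = 3, where 3 is an index into the
+// module's function index space (imported functions first, then module-defined ones), as
+// defined by the Wasm specification.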
+func newOperationCall(functionIndex uint32) unionOperation { + return unionOperation{Kind: operationKindCall, U1: uint64(functionIndex)} +} + +// NewOperationCallIndirect implements Operation. +// +// This corresponds to wasm.OpcodeCallIndirectName, and engines are expected to +// consume the one value from the top of stack (called "offset"), +// and make a function call against the function whose function address equals +// Tables[OperationCallIndirect.TableIndex][offset]. +// +// Note: This is called indirect function call in the sense that the target function is indirectly +// determined by the current state (top value) of the stack. +// Therefore, two checks are performed at runtime before entering the target function: +// 1) whether "offset" exceeds the length of table Tables[OperationCallIndirect.TableIndex]. +// 2) whether the type of the function table[offset] matches the function type specified by OperationCallIndirect.TypeIndex. +func newOperationCallIndirect(typeIndex, tableIndex uint32) unionOperation { + return unionOperation{Kind: operationKindCallIndirect, U1: uint64(typeIndex), U2: uint64(tableIndex)} +} + +// inclusiveRange is the range which spans across the value stack starting from the top to the bottom, and +// both boundary are included in the range. +type inclusiveRange struct { + Start, End int32 +} + +// AsU64 is be used to convert inclusiveRange to uint64 so that it can be stored in unionOperation. +func (i inclusiveRange) AsU64() uint64 { + return uint64(uint32(i.Start))<<32 | uint64(uint32(i.End)) +} + +// inclusiveRangeFromU64 retrieves inclusiveRange from the given uint64 which is stored in unionOperation. +func inclusiveRangeFromU64(v uint64) inclusiveRange { + return inclusiveRange{ + Start: int32(uint32(v >> 32)), + End: int32(uint32(v)), + } +} + +// nopinclusiveRange is inclusiveRange which corresponds to no-operation. +var nopinclusiveRange = inclusiveRange{Start: -1, End: -1} + +// NewOperationDrop is a constructor for unionOperation with operationKindDrop. +// +// The engines are expected to discard the values selected by NewOperationDrop.Depth which +// starts from the top of the stack to the bottom. +// +// depth spans across the uint64 value stack at runtime to be dropped by this operation. +func newOperationDrop(depth inclusiveRange) unionOperation { + return unionOperation{Kind: operationKindDrop, U1: depth.AsU64()} +} + +// NewOperationSelect is a constructor for unionOperation with operationKindSelect. +// +// This corresponds to wasm.OpcodeSelect. +// +// The engines are expected to pop three values, say [..., x2, x1, c], then if the value "c" equals zero, +// "x1" is pushed back onto the stack and, otherwise "x2" is pushed back. +// +// isTargetVector true if the selection target value's type is wasm.ValueTypeV128. +func newOperationSelect(isTargetVector bool) unionOperation { + return unionOperation{Kind: operationKindSelect, B3: isTargetVector} +} + +// NewOperationPick is a constructor for unionOperation with operationKindPick. +// +// The engines are expected to copy a value pointed by depth, and push the +// copied value onto the top of the stack. +// +// depth is the location of the pick target in the uint64 value stack at runtime. +// If isTargetVector=true, this points to the location of the lower 64-bits of the vector. 
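+// For example (illustrative stack contents), assuming depth is counted down from the top
+// of the value stack: with a uint64 stack [..., x2, x1, x0] where x0 is the top,
+// newOperationPick(1, false) copies x1 and leaves [..., x2, x1, x0, x1]. When
+// isTargetVector=true, depth addresses the lower 64 bits of the v128 value so that both
+// 64-bit halves can be copied.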
+func newOperationPick(depth int, isTargetVector bool) unionOperation { + return unionOperation{Kind: operationKindPick, U1: uint64(depth), B3: isTargetVector} +} + +// NewOperationSet is a constructor for unionOperation with operationKindSet. +// +// The engines are expected to set the top value of the stack to the location specified by +// depth. +// +// depth is the location of the set target in the uint64 value stack at runtime. +// If isTargetVector=true, this points the location of the lower 64-bits of the vector. +func newOperationSet(depth int, isTargetVector bool) unionOperation { + return unionOperation{Kind: operationKindSet, U1: uint64(depth), B3: isTargetVector} +} + +// NewOperationGlobalGet is a constructor for unionOperation with operationKindGlobalGet. +// +// The engines are expected to read the global value specified by OperationGlobalGet.Index, +// and push the copy of the value onto the stack. +// +// See wasm.OpcodeGlobalGet. +func newOperationGlobalGet(index uint32) unionOperation { + return unionOperation{Kind: operationKindGlobalGet, U1: uint64(index)} +} + +// NewOperationGlobalSet is a constructor for unionOperation with operationKindGlobalSet. +// +// The engines are expected to consume the value from the top of the stack, +// and write the value into the global specified by OperationGlobalSet.Index. +// +// See wasm.OpcodeGlobalSet. +func newOperationGlobalSet(index uint32) unionOperation { + return unionOperation{Kind: operationKindGlobalSet, U1: uint64(index)} +} + +// memoryArg is the "memarg" to all memory instructions. +// +// See https://www.w3.org/TR/2019/REC-wasm-core-1-20191205/#memory-instructions%E2%91%A0 +type memoryArg struct { + // Alignment the expected alignment (expressed as the exponent of a power of 2). Default to the natural alignment. + // + // "Natural alignment" is defined here as the smallest power of two that can hold the size of the value type. Ex + // wasm.ValueTypeI64 is encoded in 8 little-endian bytes. 2^3 = 8, so the natural alignment is three. + Alignment uint32 + + // Offset is the address offset added to the instruction's dynamic address operand, yielding a 33-bit effective + // address that is the zero-based index at which the memory is accessed. Default to zero. + Offset uint32 +} + +// NewOperationLoad is a constructor for unionOperation with operationKindLoad. +// +// This corresponds to wasm.OpcodeI32LoadName wasm.OpcodeI64LoadName wasm.OpcodeF32LoadName and wasm.OpcodeF64LoadName. +// +// The engines are expected to check the boundary of memory length, and exit the execution if this exceeds the boundary, +// otherwise load the corresponding value following the semantics of the corresponding WebAssembly instruction. +func newOperationLoad(unsignedType unsignedType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindLoad, B1: byte(unsignedType), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationLoad8 is a constructor for unionOperation with operationKindLoad8. +// +// This corresponds to wasm.OpcodeI32Load8SName wasm.OpcodeI32Load8UName wasm.OpcodeI64Load8SName wasm.OpcodeI64Load8UName. +// +// The engines are expected to check the boundary of memory length, and exit the execution if this exceeds the boundary, +// otherwise load the corresponding value following the semantics of the corresponding WebAssembly instruction. 
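+// For example (the memarg below is illustrative), lowering i32.load8_s with a static
+// offset of 8 could look like
+//
+// newOperationLoad8(signedInt32, memoryArg{Alignment: 0, Offset: 8})
+//
+// and the signedInt parameter tells the engines whether to sign-extend the loaded byte
+// (0xff becomes 0xffff_ffff, i.e. -1) or zero-extend it (0xff becomes 0x0000_00ff, i.e. 255).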
+func newOperationLoad8(signedInt signedInt, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindLoad8, B1: byte(signedInt), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationLoad16 is a constructor for unionOperation with operationKindLoad16. +// +// This corresponds to wasm.OpcodeI32Load16SName wasm.OpcodeI32Load16UName wasm.OpcodeI64Load16SName wasm.OpcodeI64Load16UName. +// +// The engines are expected to check the boundary of memory length, and exit the execution if this exceeds the boundary, +// otherwise load the corresponding value following the semantics of the corresponding WebAssembly instruction. +func newOperationLoad16(signedInt signedInt, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindLoad16, B1: byte(signedInt), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationLoad32 is a constructor for unionOperation with operationKindLoad32. +// +// This corresponds to wasm.OpcodeI64Load32SName wasm.OpcodeI64Load32UName. +// +// The engines are expected to check the boundary of memory length, and exit the execution if this exceeds the boundary, +// otherwise load the corresponding value following the semantics of the corresponding WebAssembly instruction. +func newOperationLoad32(signed bool, arg memoryArg) unionOperation { + sigB := byte(0) + if signed { + sigB = 1 + } + return unionOperation{Kind: operationKindLoad32, B1: sigB, U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationStore is a constructor for unionOperation with operationKindStore. +// +// # This corresponds to wasm.OpcodeI32StoreName wasm.OpcodeI64StoreName wasm.OpcodeF32StoreName wasm.OpcodeF64StoreName +// +// The engines are expected to check the boundary of memory length, and exit the execution if this exceeds the boundary, +// otherwise store the corresponding value following the semantics of the corresponding WebAssembly instruction. +func newOperationStore(unsignedType unsignedType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindStore, B1: byte(unsignedType), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationStore8 is a constructor for unionOperation with operationKindStore8. +// +// # This corresponds to wasm.OpcodeI32Store8Name wasm.OpcodeI64Store8Name +// +// The engines are expected to check the boundary of memory length, and exit the execution if this exceeds the boundary, +// otherwise store the corresponding value following the semantics of the corresponding WebAssembly instruction. +func newOperationStore8(arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindStore8, U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationStore16 is a constructor for unionOperation with operationKindStore16. +// +// # This corresponds to wasm.OpcodeI32Store16Name wasm.OpcodeI64Store16Name +// +// The engines are expected to check the boundary of memory length, and exit the execution if this exceeds the boundary, +// otherwise store the corresponding value following the semantics of the corresponding WebAssembly instruction. +func newOperationStore16(arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindStore16, U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationStore32 is a constructor for unionOperation with operationKindStore32. 
+// +// # This corresponds to wasm.OpcodeI64Store32Name +// +// The engines are expected to check the boundary of memory length, and exit the execution if this exceeds the boundary, +// otherwise store the corresponding value following the semantics of the corresponding WebAssembly instruction. +func newOperationStore32(arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindStore32, U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationMemorySize is a constructor for unionOperation with operationKindMemorySize. +// +// This corresponds to wasm.OpcodeMemorySize. +// +// The engines are expected to push the current page size of the memory onto the stack. +func newOperationMemorySize() unionOperation { + return unionOperation{Kind: operationKindMemorySize} +} + +// NewOperationMemoryGrow is a constructor for unionOperation with operationKindMemoryGrow. +// +// This corresponds to wasm.OpcodeMemoryGrow. +// +// The engines are expected to pop one value from the top of the stack, then +// execute wasm.MemoryInstance Grow with the value, and push the previous +// page size of the memory onto the stack. +func newOperationMemoryGrow() unionOperation { + return unionOperation{Kind: operationKindMemoryGrow} +} + +// NewOperationConstI32 is a constructor for unionOperation with OperationConstI32. +// +// This corresponds to wasm.OpcodeI32Const. +func newOperationConstI32(value uint32) unionOperation { + return unionOperation{Kind: operationKindConstI32, U1: uint64(value)} +} + +// NewOperationConstI64 is a constructor for unionOperation with OperationConstI64. +// +// This corresponds to wasm.OpcodeI64Const. +func newOperationConstI64(value uint64) unionOperation { + return unionOperation{Kind: operationKindConstI64, U1: value} +} + +// NewOperationConstF32 is a constructor for unionOperation with OperationConstF32. +// +// This corresponds to wasm.OpcodeF32Const. +func newOperationConstF32(value float32) unionOperation { + return unionOperation{Kind: operationKindConstF32, U1: uint64(math.Float32bits(value))} +} + +// NewOperationConstF64 is a constructor for unionOperation with OperationConstF64. +// +// This corresponds to wasm.OpcodeF64Const. +func newOperationConstF64(value float64) unionOperation { + return unionOperation{Kind: operationKindConstF64, U1: math.Float64bits(value)} +} + +// NewOperationEq is a constructor for unionOperation with operationKindEq. +// +// This corresponds to wasm.OpcodeI32EqName wasm.OpcodeI64EqName wasm.OpcodeF32EqName wasm.OpcodeF64EqName +func newOperationEq(b unsignedType) unionOperation { + return unionOperation{Kind: operationKindEq, B1: byte(b)} +} + +// NewOperationNe is a constructor for unionOperation with operationKindNe. +// +// This corresponds to wasm.OpcodeI32NeName wasm.OpcodeI64NeName wasm.OpcodeF32NeName wasm.OpcodeF64NeName +func newOperationNe(b unsignedType) unionOperation { + return unionOperation{Kind: operationKindNe, B1: byte(b)} +} + +// NewOperationEqz is a constructor for unionOperation with operationKindEqz. +// +// This corresponds to wasm.OpcodeI32EqzName wasm.OpcodeI64EqzName +func newOperationEqz(b unsignedInt) unionOperation { + return unionOperation{Kind: operationKindEqz, B1: byte(b)} +} + +// NewOperationLt is a constructor for unionOperation with operationKindLt. 
+// +// This corresponds to wasm.OpcodeI32LtS wasm.OpcodeI32LtU wasm.OpcodeI64LtS wasm.OpcodeI64LtU wasm.OpcodeF32Lt wasm.OpcodeF64Lt +func newOperationLt(b signedType) unionOperation { + return unionOperation{Kind: operationKindLt, B1: byte(b)} +} + +// NewOperationGt is a constructor for unionOperation with operationKindGt. +// +// This corresponds to wasm.OpcodeI32GtS wasm.OpcodeI32GtU wasm.OpcodeI64GtS wasm.OpcodeI64GtU wasm.OpcodeF32Gt wasm.OpcodeF64Gt +func newOperationGt(b signedType) unionOperation { + return unionOperation{Kind: operationKindGt, B1: byte(b)} +} + +// NewOperationLe is a constructor for unionOperation with operationKindLe. +// +// This corresponds to wasm.OpcodeI32LeS wasm.OpcodeI32LeU wasm.OpcodeI64LeS wasm.OpcodeI64LeU wasm.OpcodeF32Le wasm.OpcodeF64Le +func newOperationLe(b signedType) unionOperation { + return unionOperation{Kind: operationKindLe, B1: byte(b)} +} + +// NewOperationGe is a constructor for unionOperation with operationKindGe. +// +// This corresponds to wasm.OpcodeI32GeS wasm.OpcodeI32GeU wasm.OpcodeI64GeS wasm.OpcodeI64GeU wasm.OpcodeF32Ge wasm.OpcodeF64Ge +func newOperationGe(b signedType) unionOperation { + return unionOperation{Kind: operationKindGe, B1: byte(b)} +} + +// NewOperationAdd is a constructor for unionOperation with operationKindAdd. +// +// This corresponds to wasm.OpcodeI32AddName wasm.OpcodeI64AddName wasm.OpcodeF32AddName wasm.OpcodeF64AddName. +func newOperationAdd(b unsignedType) unionOperation { + return unionOperation{Kind: operationKindAdd, B1: byte(b)} +} + +// NewOperationSub is a constructor for unionOperation with operationKindSub. +// +// This corresponds to wasm.OpcodeI32SubName wasm.OpcodeI64SubName wasm.OpcodeF32SubName wasm.OpcodeF64SubName. +func newOperationSub(b unsignedType) unionOperation { + return unionOperation{Kind: operationKindSub, B1: byte(b)} +} + +// NewOperationMul is a constructor for unionOperation with operationKindMul. +// +// This corresponds to wasm.OpcodeI32MulName wasm.OpcodeI64MulName wasm.OpcodeF32MulName wasm.OpcodeF64MulName. +func newOperationMul(b unsignedType) unionOperation { + return unionOperation{Kind: operationKindMul, B1: byte(b)} +} + +// NewOperationClz is a constructor for unionOperation with operationKindClz. +// +// This corresponds to wasm.OpcodeI32ClzName wasm.OpcodeI64ClzName. +// +// The engines are expected to count the leading zeros in the +// current top of the stack, and push the count result. +// For example, stack of [..., 0x00_ff_ff_ff] results in [..., 8]. +// See wasm.OpcodeI32Clz wasm.OpcodeI64Clz +func newOperationClz(b unsignedInt) unionOperation { + return unionOperation{Kind: operationKindClz, B1: byte(b)} +} + +// NewOperationCtz is a constructor for unionOperation with operationKindCtz. +// +// This corresponds to wasm.OpcodeI32CtzName wasm.OpcodeI64CtzName. +// +// The engines are expected to count the trailing zeros in the +// current top of the stack, and push the count result. +// For example, stack of [..., 0xff_ff_ff_00] results in [..., 8]. +func newOperationCtz(b unsignedInt) unionOperation { + return unionOperation{Kind: operationKindCtz, B1: byte(b)} +} + +// NewOperationPopcnt is a constructor for unionOperation with operationKindPopcnt. +// +// This corresponds to wasm.OpcodeI32PopcntName wasm.OpcodeI64PopcntName.
+// +// The engines are expected to count up the number of set bits in the +// current top of the stack, and push the count result. +// For example, stack of [..., 0b00_00_00_11] results in [..., 2]. +func newOperationPopcnt(b unsignedInt) unionOperation { + return unionOperation{Kind: operationKindPopcnt, B1: byte(b)} +} + +// NewOperationDiv is a constructor for unionOperation with operationKindDiv. +// +// This corresponds to wasm.OpcodeI32DivS wasm.OpcodeI32DivU wasm.OpcodeI64DivS +// +// wasm.OpcodeI64DivU wasm.OpcodeF32Div wasm.OpcodeF64Div. +func newOperationDiv(b signedType) unionOperation { + return unionOperation{Kind: operationKindDiv, B1: byte(b)} +} + +// NewOperationRem is a constructor for unionOperation with operationKindRem. +// +// This corresponds to wasm.OpcodeI32RemS wasm.OpcodeI32RemU wasm.OpcodeI64RemS wasm.OpcodeI64RemU. +// +// The engines are expected to perform division on the top +// two values of integer type on the stack and puts the remainder of the result +// onto the stack. For example, stack [..., 10, 3] results in [..., 1] where +// the quotient is discarded. +// NewOperationRem is the constructor for OperationRem +func newOperationRem(b signedInt) unionOperation { + return unionOperation{Kind: operationKindRem, B1: byte(b)} +} + +// NewOperationAnd is a constructor for unionOperation with operationKindAnd. +// +// # This corresponds to wasm.OpcodeI32AndName wasm.OpcodeI64AndName +// +// The engines are expected to perform "And" operation on +// top two values on the stack, and pushes the result. +func newOperationAnd(b unsignedInt) unionOperation { + return unionOperation{Kind: operationKindAnd, B1: byte(b)} +} + +// NewOperationOr is a constructor for unionOperation with operationKindOr. +// +// # This corresponds to wasm.OpcodeI32OrName wasm.OpcodeI64OrName +// +// The engines are expected to perform "Or" operation on +// top two values on the stack, and pushes the result. +func newOperationOr(b unsignedInt) unionOperation { + return unionOperation{Kind: operationKindOr, B1: byte(b)} +} + +// NewOperationXor is a constructor for unionOperation with operationKindXor. +// +// # This corresponds to wasm.OpcodeI32XorName wasm.OpcodeI64XorName +// +// The engines are expected to perform "Xor" operation on +// top two values on the stack, and pushes the result. +func newOperationXor(b unsignedInt) unionOperation { + return unionOperation{Kind: operationKindXor, B1: byte(b)} +} + +// NewOperationShl is a constructor for unionOperation with operationKindShl. +// +// # This corresponds to wasm.OpcodeI32ShlName wasm.OpcodeI64ShlName +// +// The engines are expected to perform "Shl" operation on +// top two values on the stack, and pushes the result. +func newOperationShl(b unsignedInt) unionOperation { + return unionOperation{Kind: operationKindShl, B1: byte(b)} +} + +// NewOperationShr is a constructor for unionOperation with operationKindShr. +// +// # This corresponds to wasm.OpcodeI32ShrSName wasm.OpcodeI32ShrUName wasm.OpcodeI64ShrSName wasm.OpcodeI64ShrUName +// +// If OperationShr.Type is signed integer, then, the engines are expected to perform arithmetic right shift on the two +// top values on the stack, otherwise do the logical right shift. +func newOperationShr(b signedInt) unionOperation { + return unionOperation{Kind: operationKindShr, B1: byte(b)} +} + +// NewOperationRotl is a constructor for unionOperation with operationKindRotl. 
+//
+// # This corresponds to wasm.OpcodeI32RotlName wasm.OpcodeI64RotlName
+//
+// The engines are expected to perform "Rotl" operation on
+// top two values on the stack, and pushes the result.
+func newOperationRotl(b unsignedInt) unionOperation {
+	return unionOperation{Kind: operationKindRotl, B1: byte(b)}
+}
+
+// NewOperationRotr is a constructor for unionOperation with operationKindRotr.
+//
+// # This corresponds to wasm.OpcodeI32RotrName wasm.OpcodeI64RotrName
+//
+// The engines are expected to perform "Rotr" operation on
+// top two values on the stack, and pushes the result.
+func newOperationRotr(b unsignedInt) unionOperation {
+	return unionOperation{Kind: operationKindRotr, B1: byte(b)}
+}
+
+// NewOperationAbs is a constructor for unionOperation with operationKindAbs.
+//
+// This corresponds to wasm.OpcodeF32Abs wasm.OpcodeF64Abs
+func newOperationAbs(b float) unionOperation {
+	return unionOperation{Kind: operationKindAbs, B1: byte(b)}
+}
+
+// NewOperationNeg is a constructor for unionOperation with operationKindNeg.
+//
+// This corresponds to wasm.OpcodeF32Neg wasm.OpcodeF64Neg
+func newOperationNeg(b float) unionOperation {
+	return unionOperation{Kind: operationKindNeg, B1: byte(b)}
+}
+
+// NewOperationCeil is a constructor for unionOperation with operationKindCeil.
+//
+// This corresponds to wasm.OpcodeF32CeilName wasm.OpcodeF64CeilName
+func newOperationCeil(b float) unionOperation {
+	return unionOperation{Kind: operationKindCeil, B1: byte(b)}
+}
+
+// NewOperationFloor is a constructor for unionOperation with operationKindFloor.
+//
+// This corresponds to wasm.OpcodeF32FloorName wasm.OpcodeF64FloorName
+func newOperationFloor(b float) unionOperation {
+	return unionOperation{Kind: operationKindFloor, B1: byte(b)}
+}
+
+// NewOperationTrunc is a constructor for unionOperation with operationKindTrunc.
+//
+// This corresponds to wasm.OpcodeF32TruncName wasm.OpcodeF64TruncName
+func newOperationTrunc(b float) unionOperation {
+	return unionOperation{Kind: operationKindTrunc, B1: byte(b)}
+}
+
+// NewOperationNearest is a constructor for unionOperation with operationKindNearest.
+//
+// # This corresponds to wasm.OpcodeF32NearestName wasm.OpcodeF64NearestName
+//
+// Note: this is *not* equivalent to math.Round and instead has the same
+// semantics as LLVM's rint intrinsic. See https://llvm.org/docs/LangRef.html#llvm-rint-intrinsic.
+// For example, math.Round(-4.5) produces -5 while we want to produce -4.
+func newOperationNearest(b float) unionOperation {
+	return unionOperation{Kind: operationKindNearest, B1: byte(b)}
+}
+
+// NewOperationSqrt is a constructor for unionOperation with operationKindSqrt.
+//
+// This corresponds to wasm.OpcodeF32SqrtName wasm.OpcodeF64SqrtName
+func newOperationSqrt(b float) unionOperation {
+	return unionOperation{Kind: operationKindSqrt, B1: byte(b)}
+}
+
+// NewOperationMin is a constructor for unionOperation with operationKindMin.
+//
+// # This corresponds to wasm.OpcodeF32MinName wasm.OpcodeF64MinName
+//
+// The engines are expected to pop two values from the stack, and push back the minimum of
+// these two values onto the stack. For example, stack [..., 100.1, 1.9] results in [..., 1.9].
+//
+// Note: WebAssembly specifies that min/max must always return NaN if one of the values is NaN,
+// which is different behavior from math.Min.
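+//
+// For a single lane or scalar, the required NaN handling can be sketched in Go
+// as follows (illustration only, not the engine's actual implementation):
+//
+//	func wasmMin(x, y float64) float64 {
+//		if math.IsNaN(x) || math.IsNaN(y) {
+//			return math.NaN() // NaN always propagates, per the WebAssembly spec.
+//		}
+//		return math.Min(x, y)
+//	}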
+func newOperationMin(b float) unionOperation {
+	return unionOperation{Kind: operationKindMin, B1: byte(b)}
+}
+
+// NewOperationMax is a constructor for unionOperation with operationKindMax.
+//
+// # This corresponds to wasm.OpcodeF32MaxName wasm.OpcodeF64MaxName
+//
+// The engines are expected to pop two values from the stack, and push back the maximum of
+// these two values onto the stack. For example, stack [..., 100.1, 1.9] results in [..., 100.1].
+//
+// Note: WebAssembly specifies that min/max must always return NaN if one of the values is NaN,
+// which is different behavior from math.Max.
+func newOperationMax(b float) unionOperation {
+	return unionOperation{Kind: operationKindMax, B1: byte(b)}
+}
+
+// NewOperationCopysign is a constructor for unionOperation with operationKindCopysign.
+//
+// # This corresponds to wasm.OpcodeF32CopysignName wasm.OpcodeF64CopysignName
+//
+// The engines are expected to pop two float values from the stack, and copy the signbit of
+// the first-popped value to the last one.
+// For example, stack [..., 1.213, -5.0] results in [..., -1.213].
+func newOperationCopysign(b float) unionOperation {
+	return unionOperation{Kind: operationKindCopysign, B1: byte(b)}
+}
+
+// NewOperationI32WrapFromI64 is a constructor for unionOperation with operationKindI32WrapFromI64.
+//
+// This corresponds to wasm.OpcodeI32WrapI64 and equivalent to uint64(uint32(v)) in Go.
+//
+// The engines are expected to replace the 64-bit int on top of the stack
+// with the corresponding 32-bit integer.
+func newOperationI32WrapFromI64() unionOperation {
+	return unionOperation{Kind: operationKindI32WrapFromI64}
+}
+
+// NewOperationITruncFromF is a constructor for unionOperation with operationKindITruncFromF.
+//
+// This corresponds to
+//
+//	wasm.OpcodeI32TruncF32SName wasm.OpcodeI32TruncF32UName wasm.OpcodeI32TruncF64SName
+//	wasm.OpcodeI32TruncF64UName wasm.OpcodeI64TruncF32SName wasm.OpcodeI64TruncF32UName wasm.OpcodeI64TruncF64SName
+//	wasm.OpcodeI64TruncF64UName wasm.OpcodeI32TruncSatF32SName wasm.OpcodeI32TruncSatF32UName
+//	wasm.OpcodeI32TruncSatF64SName wasm.OpcodeI32TruncSatF64UName wasm.OpcodeI64TruncSatF32SName
+//	wasm.OpcodeI64TruncSatF32UName wasm.OpcodeI64TruncSatF64SName wasm.OpcodeI64TruncSatF64UName
+//
+// See [1] and [2] for when we encounter undefined behavior in the WebAssembly specification if nonTrapping == false.
+// To summarize, if the source float value is NaN or doesn't fit in the destination range of integers (incl. ±Inf),
+// then the runtime behavior is undefined. In wazero, the engines are expected to exit the execution in these undefined cases with
+// wasmruntime.ErrRuntimeInvalidConversionToInteger error.
+//
+// [1] https://www.w3.org/TR/2019/REC-wasm-core-1-20191205/#-hrefop-trunc-umathrmtruncmathsfu_m-n-z for unsigned integers.
+// [2] https://www.w3.org/TR/2019/REC-wasm-core-1-20191205/#-hrefop-trunc-smathrmtruncmathsfs_m-n-z for signed integers.
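+//
+// For the signed i32 <- f64 case, the non-trapping ("saturating") variant can be
+// sketched in Go as follows (illustration only; the helper name is hypothetical
+// and not part of this package):
+//
+//	func truncSatI32FromF64(f float64) int32 {
+//		switch {
+//		case math.IsNaN(f):
+//			return 0
+//		case f < math.MinInt32:
+//			return math.MinInt32
+//		case f > math.MaxInt32:
+//			return math.MaxInt32
+//		default:
+//			return int32(f) // Go's conversion truncates toward zero.
+//		}
+//	}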
+// +// nonTrapping true if this conversion is "nontrapping" in the sense of the +// https://github.com/WebAssembly/spec/blob/ce4b6c4d47eb06098cc7ab2e81f24748da822f20/proposals/nontrapping-float-to-int-conversion/Overview.md +func newOperationITruncFromF(inputType float, outputType signedInt, nonTrapping bool) unionOperation { + return unionOperation{ + Kind: operationKindITruncFromF, + B1: byte(inputType), + B2: byte(outputType), + B3: nonTrapping, + } +} + +// NewOperationFConvertFromI is a constructor for unionOperation with operationKindFConvertFromI. +// +// This corresponds to +// +// wasm.OpcodeF32ConvertI32SName wasm.OpcodeF32ConvertI32UName wasm.OpcodeF32ConvertI64SName wasm.OpcodeF32ConvertI64UName +// wasm.OpcodeF64ConvertI32SName wasm.OpcodeF64ConvertI32UName wasm.OpcodeF64ConvertI64SName wasm.OpcodeF64ConvertI64UName +// +// and equivalent to float32(uint32(x)), float32(int32(x)), etc in Go. +func newOperationFConvertFromI(inputType signedInt, outputType float) unionOperation { + return unionOperation{ + Kind: operationKindFConvertFromI, + B1: byte(inputType), + B2: byte(outputType), + } +} + +// NewOperationF32DemoteFromF64 is a constructor for unionOperation with operationKindF32DemoteFromF64. +// +// This corresponds to wasm.OpcodeF32DemoteF64 and is equivalent float32(float64(v)). +func newOperationF32DemoteFromF64() unionOperation { + return unionOperation{Kind: operationKindF32DemoteFromF64} +} + +// NewOperationF64PromoteFromF32 is a constructor for unionOperation with operationKindF64PromoteFromF32. +// +// This corresponds to wasm.OpcodeF64PromoteF32 and is equivalent float64(float32(v)). +func newOperationF64PromoteFromF32() unionOperation { + return unionOperation{Kind: operationKindF64PromoteFromF32} +} + +// NewOperationI32ReinterpretFromF32 is a constructor for unionOperation with operationKindI32ReinterpretFromF32. +// +// This corresponds to wasm.OpcodeI32ReinterpretF32Name. +func newOperationI32ReinterpretFromF32() unionOperation { + return unionOperation{Kind: operationKindI32ReinterpretFromF32} +} + +// NewOperationI64ReinterpretFromF64 is a constructor for unionOperation with operationKindI64ReinterpretFromF64. +// +// This corresponds to wasm.OpcodeI64ReinterpretF64Name. +func newOperationI64ReinterpretFromF64() unionOperation { + return unionOperation{Kind: operationKindI64ReinterpretFromF64} +} + +// NewOperationF32ReinterpretFromI32 is a constructor for unionOperation with operationKindF32ReinterpretFromI32. +// +// This corresponds to wasm.OpcodeF32ReinterpretI32Name. +func newOperationF32ReinterpretFromI32() unionOperation { + return unionOperation{Kind: operationKindF32ReinterpretFromI32} +} + +// NewOperationF64ReinterpretFromI64 is a constructor for unionOperation with operationKindF64ReinterpretFromI64. +// +// This corresponds to wasm.OpcodeF64ReinterpretI64Name. +func newOperationF64ReinterpretFromI64() unionOperation { + return unionOperation{Kind: operationKindF64ReinterpretFromI64} +} + +// NewOperationExtend is a constructor for unionOperation with operationKindExtend. +// +// # This corresponds to wasm.OpcodeI64ExtendI32SName wasm.OpcodeI64ExtendI32UName +// +// The engines are expected to extend the 32-bit signed or unsigned int on top of the stack +// as a 64-bit integer of corresponding signedness. For unsigned case, this is just reinterpreting the +// underlying bit pattern as 64-bit integer. For signed case, this is sign-extension which preserves the +// original integer's sign. 
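+//
+// As an illustration only (assuming the i32 operand lives in the low 32 bits of
+// a uint64 stack slot, as in this interpreter's value stack):
+//
+//	v := uint64(0x0000_0000_ffff_ffff)        // the i32 value -1 on the stack
+//	signed := uint64(int64(int32(uint32(v)))) // 0xffff_ffff_ffff_ffff, i.e. int64(-1)
+//	unsigned := uint64(uint32(v))             // 0x0000_0000_ffff_ffff, bits unchanged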
+func newOperationExtend(signed bool) unionOperation {
+	op := unionOperation{Kind: operationKindExtend}
+	if signed {
+		op.B1 = 1
+	}
+	return op
+}
+
+// NewOperationSignExtend32From8 is a constructor for unionOperation with operationKindSignExtend32From8.
+//
+// This corresponds to wasm.OpcodeI32Extend8SName.
+//
+// The engines are expected to sign-extend the first 8-bits of the 32-bit int as a signed 32-bit int.
+func newOperationSignExtend32From8() unionOperation {
+	return unionOperation{Kind: operationKindSignExtend32From8}
+}
+
+// NewOperationSignExtend32From16 is a constructor for unionOperation with operationKindSignExtend32From16.
+//
+// This corresponds to wasm.OpcodeI32Extend16SName.
+//
+// The engines are expected to sign-extend the first 16-bits of the 32-bit int as a signed 32-bit int.
+func newOperationSignExtend32From16() unionOperation {
+	return unionOperation{Kind: operationKindSignExtend32From16}
+}
+
+// NewOperationSignExtend64From8 is a constructor for unionOperation with operationKindSignExtend64From8.
+//
+// This corresponds to wasm.OpcodeI64Extend8SName.
+//
+// The engines are expected to sign-extend the first 8-bits of the 64-bit int as a signed 64-bit int.
+func newOperationSignExtend64From8() unionOperation {
+	return unionOperation{Kind: operationKindSignExtend64From8}
+}
+
+// NewOperationSignExtend64From16 is a constructor for unionOperation with operationKindSignExtend64From16.
+//
+// This corresponds to wasm.OpcodeI64Extend16SName.
+//
+// The engines are expected to sign-extend the first 16-bits of the 64-bit int as a signed 64-bit int.
+func newOperationSignExtend64From16() unionOperation {
+	return unionOperation{Kind: operationKindSignExtend64From16}
+}
+
+// NewOperationSignExtend64From32 is a constructor for unionOperation with operationKindSignExtend64From32.
+//
+// This corresponds to wasm.OpcodeI64Extend32SName.
+//
+// The engines are expected to sign-extend the first 32-bits of the 64-bit int as a signed 64-bit int.
+func newOperationSignExtend64From32() unionOperation {
+	return unionOperation{Kind: operationKindSignExtend64From32}
+}
+
+// NewOperationMemoryInit is a constructor for unionOperation with operationKindMemoryInit.
+//
+// This corresponds to wasm.OpcodeMemoryInitName.
+//
+// dataIndex is the index of the data instance in ModuleInstance.DataInstances
+// by which this operation instantiates a part of the memory.
+func newOperationMemoryInit(dataIndex uint32) unionOperation {
+	return unionOperation{Kind: operationKindMemoryInit, U1: uint64(dataIndex)}
+}
+
+// NewOperationDataDrop is a constructor for unionOperation with operationKindDataDrop.
+//
+// This corresponds to wasm.OpcodeDataDropName.
+//
+// dataIndex is the index of the data instance in ModuleInstance.DataInstances
+// which this operation drops.
+func newOperationDataDrop(dataIndex uint32) unionOperation {
+	return unionOperation{Kind: operationKindDataDrop, U1: uint64(dataIndex)}
+}
+
+// NewOperationMemoryCopy is a constructor for unionOperation with operationKindMemoryCopy.
+//
+// This corresponds to wasm.OpcodeMemoryCopyName.
+func newOperationMemoryCopy() unionOperation {
+	return unionOperation{Kind: operationKindMemoryCopy}
+}
+
+// NewOperationMemoryFill is a constructor for unionOperation with operationKindMemoryFill.
+func newOperationMemoryFill() unionOperation {
+	return unionOperation{Kind: operationKindMemoryFill}
+}
+
+// NewOperationTableInit is a constructor for unionOperation with operationKindTableInit.
+//
+// This corresponds to wasm.OpcodeTableInitName.
+// +// elemIndex is the index of the element by which this operation initializes a part of the table. +// tableIndex is the index of the table on which this operation initialize by the target element. +func newOperationTableInit(elemIndex, tableIndex uint32) unionOperation { + return unionOperation{Kind: operationKindTableInit, U1: uint64(elemIndex), U2: uint64(tableIndex)} +} + +// NewOperationElemDrop is a constructor for unionOperation with operationKindElemDrop. +// +// This corresponds to wasm.OpcodeElemDropName. +// +// elemIndex is the index of the element which this operation drops. +func newOperationElemDrop(elemIndex uint32) unionOperation { + return unionOperation{Kind: operationKindElemDrop, U1: uint64(elemIndex)} +} + +// NewOperationTableCopy implements Operation. +// +// This corresponds to wasm.OpcodeTableCopyName. +func newOperationTableCopy(srcTableIndex, dstTableIndex uint32) unionOperation { + return unionOperation{Kind: operationKindTableCopy, U1: uint64(srcTableIndex), U2: uint64(dstTableIndex)} +} + +// NewOperationRefFunc constructor for unionOperation with operationKindRefFunc. +// +// This corresponds to wasm.OpcodeRefFuncName, and engines are expected to +// push the opaque pointer value of engine specific func for the given FunctionIndex. +// +// Note: in wazero, we express any reference types (funcref or externref) as opaque pointers which is uint64. +// Therefore, the engine implementations emit instructions to push the address of *function onto the stack. +func newOperationRefFunc(functionIndex uint32) unionOperation { + return unionOperation{Kind: operationKindRefFunc, U1: uint64(functionIndex)} +} + +// NewOperationTableGet constructor for unionOperation with operationKindTableGet. +// +// This corresponds to wasm.OpcodeTableGetName. +func newOperationTableGet(tableIndex uint32) unionOperation { + return unionOperation{Kind: operationKindTableGet, U1: uint64(tableIndex)} +} + +// NewOperationTableSet constructor for unionOperation with operationKindTableSet. +// +// This corresponds to wasm.OpcodeTableSetName. +func newOperationTableSet(tableIndex uint32) unionOperation { + return unionOperation{Kind: operationKindTableSet, U1: uint64(tableIndex)} +} + +// NewOperationTableSize constructor for unionOperation with operationKindTableSize. +// +// This corresponds to wasm.OpcodeTableSizeName. +func newOperationTableSize(tableIndex uint32) unionOperation { + return unionOperation{Kind: operationKindTableSize, U1: uint64(tableIndex)} +} + +// NewOperationTableGrow constructor for unionOperation with operationKindTableGrow. +// +// This corresponds to wasm.OpcodeTableGrowName. +func newOperationTableGrow(tableIndex uint32) unionOperation { + return unionOperation{Kind: operationKindTableGrow, U1: uint64(tableIndex)} +} + +// NewOperationTableFill constructor for unionOperation with operationKindTableFill. +// +// This corresponds to wasm.OpcodeTableFillName. +func newOperationTableFill(tableIndex uint32) unionOperation { + return unionOperation{Kind: operationKindTableFill, U1: uint64(tableIndex)} +} + +// NewOperationV128Const constructor for unionOperation with operationKindV128Const +func newOperationV128Const(lo, hi uint64) unionOperation { + return unionOperation{Kind: operationKindV128Const, U1: lo, U2: hi} +} + +// shape corresponds to a shape of v128 values. 
+// https://webassembly.github.io/spec/core/syntax/instructions.html#syntax-shape +type shape = byte + +const ( + shapeI8x16 shape = iota + shapeI16x8 + shapeI32x4 + shapeI64x2 + shapeF32x4 + shapeF64x2 +) + +func shapeName(s shape) (ret string) { + switch s { + case shapeI8x16: + ret = "I8x16" + case shapeI16x8: + ret = "I16x8" + case shapeI32x4: + ret = "I32x4" + case shapeI64x2: + ret = "I64x2" + case shapeF32x4: + ret = "F32x4" + case shapeF64x2: + ret = "F64x2" + } + return +} + +// NewOperationV128Add constructor for unionOperation with operationKindV128Add. +// +// This corresponds to wasm.OpcodeVecI8x16AddName wasm.OpcodeVecI16x8AddName wasm.OpcodeVecI32x4AddName +// +// wasm.OpcodeVecI64x2AddName wasm.OpcodeVecF32x4AddName wasm.OpcodeVecF64x2AddName +func newOperationV128Add(shape shape) unionOperation { + return unionOperation{Kind: operationKindV128Add, B1: shape} +} + +// NewOperationV128Sub constructor for unionOperation with operationKindV128Sub. +// +// This corresponds to wasm.OpcodeVecI8x16SubName wasm.OpcodeVecI16x8SubName wasm.OpcodeVecI32x4SubName +// +// wasm.OpcodeVecI64x2SubName wasm.OpcodeVecF32x4SubName wasm.OpcodeVecF64x2SubName +func newOperationV128Sub(shape shape) unionOperation { + return unionOperation{Kind: operationKindV128Sub, B1: shape} +} + +// v128LoadType represents a type of wasm.OpcodeVecV128Load* instructions. +type v128LoadType = byte + +const ( + // v128LoadType128 corresponds to wasm.OpcodeVecV128LoadName. + v128LoadType128 v128LoadType = iota + // v128LoadType8x8s corresponds to wasm.OpcodeVecV128Load8x8SName. + v128LoadType8x8s + // v128LoadType8x8u corresponds to wasm.OpcodeVecV128Load8x8UName. + v128LoadType8x8u + // v128LoadType16x4s corresponds to wasm.OpcodeVecV128Load16x4SName + v128LoadType16x4s + // v128LoadType16x4u corresponds to wasm.OpcodeVecV128Load16x4UName + v128LoadType16x4u + // v128LoadType32x2s corresponds to wasm.OpcodeVecV128Load32x2SName + v128LoadType32x2s + // v128LoadType32x2u corresponds to wasm.OpcodeVecV128Load32x2UName + v128LoadType32x2u + // v128LoadType8Splat corresponds to wasm.OpcodeVecV128Load8SplatName + v128LoadType8Splat + // v128LoadType16Splat corresponds to wasm.OpcodeVecV128Load16SplatName + v128LoadType16Splat + // v128LoadType32Splat corresponds to wasm.OpcodeVecV128Load32SplatName + v128LoadType32Splat + // v128LoadType64Splat corresponds to wasm.OpcodeVecV128Load64SplatName + v128LoadType64Splat + // v128LoadType32zero corresponds to wasm.OpcodeVecV128Load32zeroName + v128LoadType32zero + // v128LoadType64zero corresponds to wasm.OpcodeVecV128Load64zeroName + v128LoadType64zero +) + +// NewOperationV128Load is a constructor for unionOperation with operationKindV128Load. +// +// This corresponds to +// +// wasm.OpcodeVecV128LoadName wasm.OpcodeVecV128Load8x8SName wasm.OpcodeVecV128Load8x8UName +// wasm.OpcodeVecV128Load16x4SName wasm.OpcodeVecV128Load16x4UName wasm.OpcodeVecV128Load32x2SName +// wasm.OpcodeVecV128Load32x2UName wasm.OpcodeVecV128Load8SplatName wasm.OpcodeVecV128Load16SplatName +// wasm.OpcodeVecV128Load32SplatName wasm.OpcodeVecV128Load64SplatName wasm.OpcodeVecV128Load32zeroName +// wasm.OpcodeVecV128Load64zeroName +func newOperationV128Load(loadType v128LoadType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindV128Load, B1: loadType, U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationV128LoadLane is a constructor for unionOperation with operationKindV128LoadLane. 
+//
+// This corresponds to wasm.OpcodeVecV128Load8LaneName wasm.OpcodeVecV128Load16LaneName
+//
+//	wasm.OpcodeVecV128Load32LaneName wasm.OpcodeVecV128Load64LaneName.
+//
+// laneIndex is >=0 && <(128/LaneSize).
+// laneSize is either 8, 16, 32, or 64.
+func newOperationV128LoadLane(laneIndex, laneSize byte, arg memoryArg) unionOperation {
+	return unionOperation{Kind: operationKindV128LoadLane, B1: laneSize, B2: laneIndex, U1: uint64(arg.Alignment), U2: uint64(arg.Offset)}
+}
+
+// NewOperationV128Store is a constructor for unionOperation with operationKindV128Store.
+//
+// This corresponds to wasm.OpcodeVecV128StoreName.
+func newOperationV128Store(arg memoryArg) unionOperation {
+	return unionOperation{
+		Kind: operationKindV128Store,
+		U1:   uint64(arg.Alignment),
+		U2:   uint64(arg.Offset),
+	}
+}
+
+// NewOperationV128StoreLane is a constructor for unionOperation with operationKindV128StoreLane.
+//
+// This corresponds to wasm.OpcodeVecV128Store8LaneName wasm.OpcodeVecV128Store16LaneName
+//
+//	wasm.OpcodeVecV128Store32LaneName wasm.OpcodeVecV128Store64LaneName.
+//
+// laneIndex is >=0 && <(128/LaneSize).
+// laneSize is either 8, 16, 32, or 64.
+func newOperationV128StoreLane(laneIndex byte, laneSize byte, arg memoryArg) unionOperation {
+	return unionOperation{
+		Kind: operationKindV128StoreLane,
+		B1:   laneSize,
+		B2:   laneIndex,
+		U1:   uint64(arg.Alignment),
+		U2:   uint64(arg.Offset),
+	}
+}
+
+// NewOperationV128ExtractLane is a constructor for unionOperation with operationKindV128ExtractLane.
+//
+// This corresponds to
+//
+//	wasm.OpcodeVecI8x16ExtractLaneSName wasm.OpcodeVecI8x16ExtractLaneUName
+//	wasm.OpcodeVecI16x8ExtractLaneSName wasm.OpcodeVecI16x8ExtractLaneUName
+//	wasm.OpcodeVecI32x4ExtractLaneName wasm.OpcodeVecI64x2ExtractLaneName
+//	wasm.OpcodeVecF32x4ExtractLaneName wasm.OpcodeVecF64x2ExtractLaneName.
+//
+// laneIndex is >=0 && <M where shape = NxM.
+// signed is used when shape is either i8x16 or i16x8 to specify whether to sign-extend or not.
+func newOperationV128ExtractLane(laneIndex byte, signed bool, shape shape) unionOperation {
+	return unionOperation{
+		Kind: operationKindV128ExtractLane,
+		B1:   shape,
+		B2:   laneIndex,
+		B3:   signed,
+	}
+}
+
+// NewOperationV128ReplaceLane is a constructor for unionOperation with operationKindV128ReplaceLane.
+//
+// This corresponds to
+//
+//	wasm.OpcodeVecI8x16ReplaceLaneName wasm.OpcodeVecI16x8ReplaceLaneName
+//	wasm.OpcodeVecI32x4ReplaceLaneName wasm.OpcodeVecI64x2ReplaceLaneName
+//	wasm.OpcodeVecF32x4ReplaceLaneName wasm.OpcodeVecF64x2ReplaceLaneName.
+//
+// laneIndex is >=0 && <M where shape = NxM.
+func newOperationV128ReplaceLane(laneIndex byte, shape shape) unionOperation {
+	return unionOperation{Kind: operationKindV128ReplaceLane, B1: shape, B2: laneIndex}
+}
+
+// NewOperationV128Splat is a constructor for unionOperation with operationKindV128Splat.
+//
+// This corresponds to
+//
+//	wasm.OpcodeVecI8x16SplatName wasm.OpcodeVecI16x8SplatName
+//	wasm.OpcodeVecI32x4SplatName wasm.OpcodeVecI64x2SplatName
+//	wasm.OpcodeVecF32x4SplatName wasm.OpcodeVecF64x2SplatName.
+func newOperationV128Splat(shape shape) unionOperation {
+	return unionOperation{Kind: operationKindV128Splat, B1: shape}
+}
+
+// NewOperationV128Shuffle is a constructor for unionOperation with operationKindV128Shuffle.
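+//
+// Each of the 16 lane indices selects one byte from the concatenation of the two
+// input vectors: indices 0-15 pick from the first operand, 16-31 from the second.
+// A per-byte sketch in Go (illustration only):
+//
+//	var a, b, dst [16]byte
+//	for i, l := range lanes { // lanes holds 16 indices in the range [0, 32)
+//		if l < 16 {
+//			dst[i] = a[l]
+//		} else {
+//			dst[i] = b[l-16]
+//		}
+//	}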
+func newOperationV128Shuffle(lanes []uint64) unionOperation { + return unionOperation{Kind: operationKindV128Shuffle, Us: lanes} +} + +// NewOperationV128Swizzle is a constructor for unionOperation with operationKindV128Swizzle. +// +// This corresponds to wasm.OpcodeVecI8x16SwizzleName. +func newOperationV128Swizzle() unionOperation { + return unionOperation{Kind: operationKindV128Swizzle} +} + +// NewOperationV128AnyTrue is a constructor for unionOperation with operationKindV128AnyTrue. +// +// This corresponds to wasm.OpcodeVecV128AnyTrueName. +func newOperationV128AnyTrue() unionOperation { + return unionOperation{Kind: operationKindV128AnyTrue} +} + +// NewOperationV128AllTrue is a constructor for unionOperation with operationKindV128AllTrue. +// +// This corresponds to +// +// wasm.OpcodeVecI8x16AllTrueName wasm.OpcodeVecI16x8AllTrueName +// wasm.OpcodeVecI32x4AllTrueName wasm.OpcodeVecI64x2AllTrueName. +func newOperationV128AllTrue(shape shape) unionOperation { + return unionOperation{Kind: operationKindV128AllTrue, B1: shape} +} + +// NewOperationV128BitMask is a constructor for unionOperation with operationKindV128BitMask. +// +// This corresponds to +// +// wasm.OpcodeVecI8x16BitMaskName wasm.OpcodeVecI16x8BitMaskName +// wasm.OpcodeVecI32x4BitMaskName wasm.OpcodeVecI64x2BitMaskName. +func newOperationV128BitMask(shape shape) unionOperation { + return unionOperation{Kind: operationKindV128BitMask, B1: shape} +} + +// NewOperationV128And is a constructor for unionOperation with operationKindV128And. +// +// This corresponds to wasm.OpcodeVecV128And. +func newOperationV128And() unionOperation { + return unionOperation{Kind: operationKindV128And} +} + +// NewOperationV128Not is a constructor for unionOperation with operationKindV128Not. +// +// This corresponds to wasm.OpcodeVecV128Not. +func newOperationV128Not() unionOperation { + return unionOperation{Kind: operationKindV128Not} +} + +// NewOperationV128Or is a constructor for unionOperation with operationKindV128Or. +// +// This corresponds to wasm.OpcodeVecV128Or. +func newOperationV128Or() unionOperation { + return unionOperation{Kind: operationKindV128Or} +} + +// NewOperationV128Xor is a constructor for unionOperation with operationKindV128Xor. +// +// This corresponds to wasm.OpcodeVecV128Xor. +func newOperationV128Xor() unionOperation { + return unionOperation{Kind: operationKindV128Xor} +} + +// NewOperationV128Bitselect is a constructor for unionOperation with operationKindV128Bitselect. +// +// This corresponds to wasm.OpcodeVecV128Bitselect. +func newOperationV128Bitselect() unionOperation { + return unionOperation{Kind: operationKindV128Bitselect} +} + +// NewOperationV128AndNot is a constructor for unionOperation with operationKindV128AndNot. +// +// This corresponds to wasm.OpcodeVecV128AndNot. +func newOperationV128AndNot() unionOperation { + return unionOperation{Kind: operationKindV128AndNot} +} + +// NewOperationV128Shl is a constructor for unionOperation with operationKindV128Shl. +// +// This corresponds to +// +// wasm.OpcodeVecI8x16ShlName wasm.OpcodeVecI16x8ShlName +// wasm.OpcodeVecI32x4ShlName wasm.OpcodeVecI64x2ShlName +func newOperationV128Shl(shape shape) unionOperation { + return unionOperation{Kind: operationKindV128Shl, B1: shape} +} + +// NewOperationV128Shr is a constructor for unionOperation with operationKindV128Shr. 
+// +// This corresponds to +// +// wasm.OpcodeVecI8x16ShrSName wasm.OpcodeVecI8x16ShrUName wasm.OpcodeVecI16x8ShrSName +// wasm.OpcodeVecI16x8ShrUName wasm.OpcodeVecI32x4ShrSName wasm.OpcodeVecI32x4ShrUName. +// wasm.OpcodeVecI64x2ShrSName wasm.OpcodeVecI64x2ShrUName. +func newOperationV128Shr(shape shape, signed bool) unionOperation { + return unionOperation{Kind: operationKindV128Shr, B1: shape, B3: signed} +} + +// NewOperationV128Cmp is a constructor for unionOperation with operationKindV128Cmp. +// +// This corresponds to +// +// wasm.OpcodeVecI8x16EqName, wasm.OpcodeVecI8x16NeName, wasm.OpcodeVecI8x16LtSName, wasm.OpcodeVecI8x16LtUName, wasm.OpcodeVecI8x16GtSName, +// wasm.OpcodeVecI8x16GtUName, wasm.OpcodeVecI8x16LeSName, wasm.OpcodeVecI8x16LeUName, wasm.OpcodeVecI8x16GeSName, wasm.OpcodeVecI8x16GeUName, +// wasm.OpcodeVecI16x8EqName, wasm.OpcodeVecI16x8NeName, wasm.OpcodeVecI16x8LtSName, wasm.OpcodeVecI16x8LtUName, wasm.OpcodeVecI16x8GtSName, +// wasm.OpcodeVecI16x8GtUName, wasm.OpcodeVecI16x8LeSName, wasm.OpcodeVecI16x8LeUName, wasm.OpcodeVecI16x8GeSName, wasm.OpcodeVecI16x8GeUName, +// wasm.OpcodeVecI32x4EqName, wasm.OpcodeVecI32x4NeName, wasm.OpcodeVecI32x4LtSName, wasm.OpcodeVecI32x4LtUName, wasm.OpcodeVecI32x4GtSName, +// wasm.OpcodeVecI32x4GtUName, wasm.OpcodeVecI32x4LeSName, wasm.OpcodeVecI32x4LeUName, wasm.OpcodeVecI32x4GeSName, wasm.OpcodeVecI32x4GeUName, +// wasm.OpcodeVecI64x2EqName, wasm.OpcodeVecI64x2NeName, wasm.OpcodeVecI64x2LtSName, wasm.OpcodeVecI64x2GtSName, wasm.OpcodeVecI64x2LeSName, +// wasm.OpcodeVecI64x2GeSName, wasm.OpcodeVecF32x4EqName, wasm.OpcodeVecF32x4NeName, wasm.OpcodeVecF32x4LtName, wasm.OpcodeVecF32x4GtName, +// wasm.OpcodeVecF32x4LeName, wasm.OpcodeVecF32x4GeName, wasm.OpcodeVecF64x2EqName, wasm.OpcodeVecF64x2NeName, wasm.OpcodeVecF64x2LtName, +// wasm.OpcodeVecF64x2GtName, wasm.OpcodeVecF64x2LeName, wasm.OpcodeVecF64x2GeName +func newOperationV128Cmp(cmpType v128CmpType) unionOperation { + return unionOperation{Kind: operationKindV128Cmp, B1: cmpType} +} + +// v128CmpType represents a type of vector comparison operation. +type v128CmpType = byte + +const ( + // v128CmpTypeI8x16Eq corresponds to wasm.OpcodeVecI8x16EqName. + v128CmpTypeI8x16Eq v128CmpType = iota + // v128CmpTypeI8x16Ne corresponds to wasm.OpcodeVecI8x16NeName. + v128CmpTypeI8x16Ne + // v128CmpTypeI8x16LtS corresponds to wasm.OpcodeVecI8x16LtSName. + v128CmpTypeI8x16LtS + // v128CmpTypeI8x16LtU corresponds to wasm.OpcodeVecI8x16LtUName. + v128CmpTypeI8x16LtU + // v128CmpTypeI8x16GtS corresponds to wasm.OpcodeVecI8x16GtSName. + v128CmpTypeI8x16GtS + // v128CmpTypeI8x16GtU corresponds to wasm.OpcodeVecI8x16GtUName. + v128CmpTypeI8x16GtU + // v128CmpTypeI8x16LeS corresponds to wasm.OpcodeVecI8x16LeSName. + v128CmpTypeI8x16LeS + // v128CmpTypeI8x16LeU corresponds to wasm.OpcodeVecI8x16LeUName. + v128CmpTypeI8x16LeU + // v128CmpTypeI8x16GeS corresponds to wasm.OpcodeVecI8x16GeSName. + v128CmpTypeI8x16GeS + // v128CmpTypeI8x16GeU corresponds to wasm.OpcodeVecI8x16GeUName. + v128CmpTypeI8x16GeU + // v128CmpTypeI16x8Eq corresponds to wasm.OpcodeVecI16x8EqName. + v128CmpTypeI16x8Eq + // v128CmpTypeI16x8Ne corresponds to wasm.OpcodeVecI16x8NeName. + v128CmpTypeI16x8Ne + // v128CmpTypeI16x8LtS corresponds to wasm.OpcodeVecI16x8LtSName. + v128CmpTypeI16x8LtS + // v128CmpTypeI16x8LtU corresponds to wasm.OpcodeVecI16x8LtUName. + v128CmpTypeI16x8LtU + // v128CmpTypeI16x8GtS corresponds to wasm.OpcodeVecI16x8GtSName. 
+ v128CmpTypeI16x8GtS + // v128CmpTypeI16x8GtU corresponds to wasm.OpcodeVecI16x8GtUName. + v128CmpTypeI16x8GtU + // v128CmpTypeI16x8LeS corresponds to wasm.OpcodeVecI16x8LeSName. + v128CmpTypeI16x8LeS + // v128CmpTypeI16x8LeU corresponds to wasm.OpcodeVecI16x8LeUName. + v128CmpTypeI16x8LeU + // v128CmpTypeI16x8GeS corresponds to wasm.OpcodeVecI16x8GeSName. + v128CmpTypeI16x8GeS + // v128CmpTypeI16x8GeU corresponds to wasm.OpcodeVecI16x8GeUName. + v128CmpTypeI16x8GeU + // v128CmpTypeI32x4Eq corresponds to wasm.OpcodeVecI32x4EqName. + v128CmpTypeI32x4Eq + // v128CmpTypeI32x4Ne corresponds to wasm.OpcodeVecI32x4NeName. + v128CmpTypeI32x4Ne + // v128CmpTypeI32x4LtS corresponds to wasm.OpcodeVecI32x4LtSName. + v128CmpTypeI32x4LtS + // v128CmpTypeI32x4LtU corresponds to wasm.OpcodeVecI32x4LtUName. + v128CmpTypeI32x4LtU + // v128CmpTypeI32x4GtS corresponds to wasm.OpcodeVecI32x4GtSName. + v128CmpTypeI32x4GtS + // v128CmpTypeI32x4GtU corresponds to wasm.OpcodeVecI32x4GtUName. + v128CmpTypeI32x4GtU + // v128CmpTypeI32x4LeS corresponds to wasm.OpcodeVecI32x4LeSName. + v128CmpTypeI32x4LeS + // v128CmpTypeI32x4LeU corresponds to wasm.OpcodeVecI32x4LeUName. + v128CmpTypeI32x4LeU + // v128CmpTypeI32x4GeS corresponds to wasm.OpcodeVecI32x4GeSName. + v128CmpTypeI32x4GeS + // v128CmpTypeI32x4GeU corresponds to wasm.OpcodeVecI32x4GeUName. + v128CmpTypeI32x4GeU + // v128CmpTypeI64x2Eq corresponds to wasm.OpcodeVecI64x2EqName. + v128CmpTypeI64x2Eq + // v128CmpTypeI64x2Ne corresponds to wasm.OpcodeVecI64x2NeName. + v128CmpTypeI64x2Ne + // v128CmpTypeI64x2LtS corresponds to wasm.OpcodeVecI64x2LtSName. + v128CmpTypeI64x2LtS + // v128CmpTypeI64x2GtS corresponds to wasm.OpcodeVecI64x2GtSName. + v128CmpTypeI64x2GtS + // v128CmpTypeI64x2LeS corresponds to wasm.OpcodeVecI64x2LeSName. + v128CmpTypeI64x2LeS + // v128CmpTypeI64x2GeS corresponds to wasm.OpcodeVecI64x2GeSName. + v128CmpTypeI64x2GeS + // v128CmpTypeF32x4Eq corresponds to wasm.OpcodeVecF32x4EqName. + v128CmpTypeF32x4Eq + // v128CmpTypeF32x4Ne corresponds to wasm.OpcodeVecF32x4NeName. + v128CmpTypeF32x4Ne + // v128CmpTypeF32x4Lt corresponds to wasm.OpcodeVecF32x4LtName. + v128CmpTypeF32x4Lt + // v128CmpTypeF32x4Gt corresponds to wasm.OpcodeVecF32x4GtName. + v128CmpTypeF32x4Gt + // v128CmpTypeF32x4Le corresponds to wasm.OpcodeVecF32x4LeName. + v128CmpTypeF32x4Le + // v128CmpTypeF32x4Ge corresponds to wasm.OpcodeVecF32x4GeName. + v128CmpTypeF32x4Ge + // v128CmpTypeF64x2Eq corresponds to wasm.OpcodeVecF64x2EqName. + v128CmpTypeF64x2Eq + // v128CmpTypeF64x2Ne corresponds to wasm.OpcodeVecF64x2NeName. + v128CmpTypeF64x2Ne + // v128CmpTypeF64x2Lt corresponds to wasm.OpcodeVecF64x2LtName. + v128CmpTypeF64x2Lt + // v128CmpTypeF64x2Gt corresponds to wasm.OpcodeVecF64x2GtName. + v128CmpTypeF64x2Gt + // v128CmpTypeF64x2Le corresponds to wasm.OpcodeVecF64x2LeName. + v128CmpTypeF64x2Le + // v128CmpTypeF64x2Ge corresponds to wasm.OpcodeVecF64x2GeName. + v128CmpTypeF64x2Ge +) + +// NewOperationV128AddSat is a constructor for unionOperation with operationKindV128AddSat. +// +// This corresponds to wasm.OpcodeVecI8x16AddSatUName wasm.OpcodeVecI8x16AddSatSName +// +// wasm.OpcodeVecI16x8AddSatUName wasm.OpcodeVecI16x8AddSatSName +// +// shape is either shapeI8x16 or shapeI16x8. +func newOperationV128AddSat(shape shape, signed bool) unionOperation { + return unionOperation{Kind: operationKindV128AddSat, B1: shape, B3: signed} +} + +// NewOperationV128SubSat is a constructor for unionOperation with operationKindV128SubSat. 
+//
+// This corresponds to wasm.OpcodeVecI8x16SubSatUName wasm.OpcodeVecI8x16SubSatSName
+//
+//	wasm.OpcodeVecI16x8SubSatUName wasm.OpcodeVecI16x8SubSatSName
+//
+// shape is either shapeI8x16 or shapeI16x8.
+func newOperationV128SubSat(shape shape, signed bool) unionOperation {
+	return unionOperation{Kind: operationKindV128SubSat, B1: shape, B3: signed}
+}
+
+// NewOperationV128Mul is a constructor for unionOperation with operationKindV128Mul
+//
+// This corresponds to wasm.OpcodeVecF32x4MulName wasm.OpcodeVecF64x2MulName
+//
+//	wasm.OpcodeVecI16x8MulName wasm.OpcodeVecI32x4MulName wasm.OpcodeVecI64x2MulName.
+//	shape is either shapeI16x8, shapeI32x4, shapeI64x2, shapeF32x4 or shapeF64x2.
+func newOperationV128Mul(shape shape) unionOperation {
+	return unionOperation{Kind: operationKindV128Mul, B1: shape}
+}
+
+// NewOperationV128Div is a constructor for unionOperation with operationKindV128Div.
+//
+// This corresponds to wasm.OpcodeVecF32x4DivName wasm.OpcodeVecF64x2DivName.
+// shape is either shapeF32x4 or shapeF64x2.
+func newOperationV128Div(shape shape) unionOperation {
+	return unionOperation{Kind: operationKindV128Div, B1: shape}
+}
+
+// NewOperationV128Neg is a constructor for unionOperation with operationKindV128Neg.
+//
+// This corresponds to wasm.OpcodeVecI8x16NegName wasm.OpcodeVecI16x8NegName wasm.OpcodeVecI32x4NegName
+//
+//	wasm.OpcodeVecI64x2NegName wasm.OpcodeVecF32x4NegName wasm.OpcodeVecF64x2NegName.
+func newOperationV128Neg(shape shape) unionOperation {
+	return unionOperation{Kind: operationKindV128Neg, B1: shape}
+}
+
+// NewOperationV128Sqrt is a constructor for unionOperation with operationKindV128Sqrt.
+//
+// shape is either shapeF32x4 or shapeF64x2.
+// This corresponds to wasm.OpcodeVecF32x4SqrtName wasm.OpcodeVecF64x2SqrtName.
+func newOperationV128Sqrt(shape shape) unionOperation {
+	return unionOperation{Kind: operationKindV128Sqrt, B1: shape}
+}
+
+// NewOperationV128Abs is a constructor for unionOperation with operationKindV128Abs.
+//
+// This corresponds to wasm.OpcodeVecI8x16AbsName wasm.OpcodeVecI16x8AbsName wasm.OpcodeVecI32x4AbsName
+//
+//	wasm.OpcodeVecI64x2AbsName wasm.OpcodeVecF32x4AbsName wasm.OpcodeVecF64x2AbsName.
+func newOperationV128Abs(shape shape) unionOperation {
+	return unionOperation{Kind: operationKindV128Abs, B1: shape}
+}
+
+// NewOperationV128Popcnt is a constructor for unionOperation with operationKindV128Popcnt.
+//
+// This corresponds to wasm.OpcodeVecI8x16PopcntName.
+func newOperationV128Popcnt(shape shape) unionOperation {
+	return unionOperation{Kind: operationKindV128Popcnt, B1: shape}
+}
+
+// NewOperationV128Min is a constructor for unionOperation with operationKindV128Min.
+//
+// This corresponds to
+//
+//	wasm.OpcodeVecI8x16MinSName wasm.OpcodeVecI8x16MinUName wasm.OpcodeVecI16x8MinSName wasm.OpcodeVecI16x8MinUName
+//	wasm.OpcodeVecI32x4MinSName wasm.OpcodeVecI32x4MinUName
+//	wasm.OpcodeVecF32x4MinName wasm.OpcodeVecF64x2MinName
+func newOperationV128Min(shape shape, signed bool) unionOperation {
+	return unionOperation{Kind: operationKindV128Min, B1: shape, B3: signed}
+}
+
+// NewOperationV128Max is a constructor for unionOperation with operationKindV128Max.
+// +// This corresponds to +// +// wasm.OpcodeVecI8x16MaxSName wasm.OpcodeVecI8x16MaxUName wasm.OpcodeVecI16x8MaxSName wasm.OpcodeVecI16x8MaxUName +// wasm.OpcodeVecI32x4MaxSName wasm.OpcodeVecI32x4MaxUName wasm.OpcodeVecI16x8MaxSName wasm.OpcodeVecI16x8MaxUName +// wasm.OpcodeVecF32x4MaxName wasm.OpcodeVecF64x2MaxName. +func newOperationV128Max(shape shape, signed bool) unionOperation { + return unionOperation{Kind: operationKindV128Max, B1: shape, B3: signed} +} + +// NewOperationV128AvgrU is a constructor for unionOperation with operationKindV128AvgrU. +// +// This corresponds to wasm.OpcodeVecI8x16AvgrUName. +func newOperationV128AvgrU(shape shape) unionOperation { + return unionOperation{Kind: operationKindV128AvgrU, B1: shape} +} + +// NewOperationV128Pmin is a constructor for unionOperation with operationKindV128Pmin. +// +// This corresponds to wasm.OpcodeVecF32x4PminName wasm.OpcodeVecF64x2PminName. +func newOperationV128Pmin(shape shape) unionOperation { + return unionOperation{Kind: operationKindV128Pmin, B1: shape} +} + +// NewOperationV128Pmax is a constructor for unionOperation with operationKindV128Pmax. +// +// This corresponds to wasm.OpcodeVecF32x4PmaxName wasm.OpcodeVecF64x2PmaxName. +func newOperationV128Pmax(shape shape) unionOperation { + return unionOperation{Kind: operationKindV128Pmax, B1: shape} +} + +// NewOperationV128Ceil is a constructor for unionOperation with operationKindV128Ceil. +// +// This corresponds to wasm.OpcodeVecF32x4CeilName wasm.OpcodeVecF64x2CeilName +func newOperationV128Ceil(shape shape) unionOperation { + return unionOperation{Kind: operationKindV128Ceil, B1: shape} +} + +// NewOperationV128Floor is a constructor for unionOperation with operationKindV128Floor. +// +// This corresponds to wasm.OpcodeVecF32x4FloorName wasm.OpcodeVecF64x2FloorName +func newOperationV128Floor(shape shape) unionOperation { + return unionOperation{Kind: operationKindV128Floor, B1: shape} +} + +// NewOperationV128Trunc is a constructor for unionOperation with operationKindV128Trunc. +// +// This corresponds to wasm.OpcodeVecF32x4TruncName wasm.OpcodeVecF64x2TruncName +func newOperationV128Trunc(shape shape) unionOperation { + return unionOperation{Kind: operationKindV128Trunc, B1: shape} +} + +// NewOperationV128Nearest is a constructor for unionOperation with operationKindV128Nearest. +// +// This corresponds to wasm.OpcodeVecF32x4NearestName wasm.OpcodeVecF64x2NearestName +func newOperationV128Nearest(shape shape) unionOperation { + return unionOperation{Kind: operationKindV128Nearest, B1: shape} +} + +// NewOperationV128Extend is a constructor for unionOperation with operationKindV128Extend. +// +// This corresponds to +// +// wasm.OpcodeVecI16x8ExtendLowI8x16SName wasm.OpcodeVecI16x8ExtendHighI8x16SName +// wasm.OpcodeVecI16x8ExtendLowI8x16UName wasm.OpcodeVecI16x8ExtendHighI8x16UName +// wasm.OpcodeVecI32x4ExtendLowI16x8SName wasm.OpcodeVecI32x4ExtendHighI16x8SName +// wasm.OpcodeVecI32x4ExtendLowI16x8UName wasm.OpcodeVecI32x4ExtendHighI16x8UName +// wasm.OpcodeVecI64x2ExtendLowI32x4SName wasm.OpcodeVecI64x2ExtendHighI32x4SName +// wasm.OpcodeVecI64x2ExtendLowI32x4UName wasm.OpcodeVecI64x2ExtendHighI32x4UName +// +// originshape is the shape of the original lanes for extension which is +// either shapeI8x16, shapeI16x8, or shapeI32x4. +// useLow true if it uses the lower half of vector for extension. 
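+//
+// For example, sign-extending the low half of an i8x16 vector into i16x8 can be
+// sketched per lane in Go as (illustration only):
+//
+//	var src [16]int8
+//	var dst [8]int16
+//	for i := 0; i < 8; i++ {
+//		dst[i] = int16(src[i]) // the high-half variant would read src[i+8]
+//	}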
+func newOperationV128Extend(originshape shape, signed bool, useLow bool) unionOperation {
+	op := unionOperation{Kind: operationKindV128Extend}
+	op.B1 = originshape
+	if signed {
+		op.B2 = 1
+	}
+	op.B3 = useLow
+	return op
+}
+
+// NewOperationV128ExtMul is a constructor for unionOperation with operationKindV128ExtMul.
+//
+// This corresponds to
+//
+//	wasm.OpcodeVecI16x8ExtMulLowI8x16SName wasm.OpcodeVecI16x8ExtMulLowI8x16UName
+//	wasm.OpcodeVecI16x8ExtMulHighI8x16SName wasm.OpcodeVecI16x8ExtMulHighI8x16UName
+//	wasm.OpcodeVecI32x4ExtMulLowI16x8SName wasm.OpcodeVecI32x4ExtMulLowI16x8UName
+//	wasm.OpcodeVecI32x4ExtMulHighI16x8SName wasm.OpcodeVecI32x4ExtMulHighI16x8UName
+//	wasm.OpcodeVecI64x2ExtMulLowI32x4SName wasm.OpcodeVecI64x2ExtMulLowI32x4UName
+//	wasm.OpcodeVecI64x2ExtMulHighI32x4SName wasm.OpcodeVecI64x2ExtMulHighI32x4UName.
+//
+// originshape is the shape of the original lanes for extension which is
+// either shapeI8x16, shapeI16x8, or shapeI32x4.
+// useLow true if it uses the lower half of vector for extension.
+func newOperationV128ExtMul(originshape shape, signed bool, useLow bool) unionOperation {
+	op := unionOperation{Kind: operationKindV128ExtMul}
+	op.B1 = originshape
+	if signed {
+		op.B2 = 1
+	}
+	op.B3 = useLow
+	return op
+}
+
+// NewOperationV128Q15mulrSatS is a constructor for unionOperation with operationKindV128Q15mulrSatS.
+//
+// This corresponds to wasm.OpcodeVecI16x8Q15mulrSatSName
+func newOperationV128Q15mulrSatS() unionOperation {
+	return unionOperation{Kind: operationKindV128Q15mulrSatS}
+}
+
+// NewOperationV128ExtAddPairwise is a constructor for unionOperation with operationKindV128ExtAddPairwise.
+//
+// This corresponds to
+//
+//	wasm.OpcodeVecI16x8ExtaddPairwiseI8x16SName wasm.OpcodeVecI16x8ExtaddPairwiseI8x16UName
+//	wasm.OpcodeVecI32x4ExtaddPairwiseI16x8SName wasm.OpcodeVecI32x4ExtaddPairwiseI16x8UName.
+//
+// originshape is the shape of the original lanes for extension which is
+// either shapeI8x16, or shapeI16x8.
+func newOperationV128ExtAddPairwise(originshape shape, signed bool) unionOperation {
+	return unionOperation{Kind: operationKindV128ExtAddPairwise, B1: originshape, B3: signed}
+}
+
+// NewOperationV128FloatPromote is a constructor for unionOperation with operationKindV128FloatPromote.
+//
+// This corresponds to wasm.OpcodeVecF64x2PromoteLowF32x4ZeroName
+// This discards the higher 64-bit of a vector, and promotes two
+// 32-bit floats in the lower 64-bit as two 64-bit floats.
+func newOperationV128FloatPromote() unionOperation {
+	return unionOperation{Kind: operationKindV128FloatPromote}
+}
+
+// NewOperationV128FloatDemote is a constructor for unionOperation with operationKindV128FloatDemote.
+//
+// This corresponds to wasm.OpcodeVecF32x4DemoteF64x2ZeroName.
+func newOperationV128FloatDemote() unionOperation {
+	return unionOperation{Kind: operationKindV128FloatDemote}
+}
+
+// NewOperationV128FConvertFromI is a constructor for unionOperation with operationKindV128FConvertFromI.
+//
+// This corresponds to
+//
+//	wasm.OpcodeVecF32x4ConvertI32x4SName wasm.OpcodeVecF32x4ConvertI32x4UName
+//	wasm.OpcodeVecF64x2ConvertLowI32x4SName wasm.OpcodeVecF64x2ConvertLowI32x4UName.
+//
+// destinationshape is the shape of the destination lanes for conversion which is
+// either shapeF32x4, or shapeF64x2.
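+//
+// For example, wasm.OpcodeVecF64x2ConvertLowI32x4SName converts the two low i32
+// lanes to two f64 lanes; a per-lane sketch in Go (illustration only):
+//
+//	var src [4]int32
+//	var dst [2]float64
+//	dst[0], dst[1] = float64(src[0]), float64(src[1])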
+func newOperationV128FConvertFromI(destinationshape shape, signed bool) unionOperation { + return unionOperation{Kind: operationKindV128FConvertFromI, B1: destinationshape, B3: signed} +} + +// NewOperationV128Dot is a constructor for unionOperation with operationKindV128Dot. +// +// This corresponds to wasm.OpcodeVecI32x4DotI16x8SName +func newOperationV128Dot() unionOperation { + return unionOperation{Kind: operationKindV128Dot} +} + +// NewOperationV128Narrow is a constructor for unionOperation with operationKindV128Narrow. +// +// This corresponds to +// +// wasm.OpcodeVecI8x16NarrowI16x8SName wasm.OpcodeVecI8x16NarrowI16x8UName +// wasm.OpcodeVecI16x8NarrowI32x4SName wasm.OpcodeVecI16x8NarrowI32x4UName. +// +// originshape is the shape of the original lanes for narrowing which is +// either shapeI16x8, or shapeI32x4. +func newOperationV128Narrow(originshape shape, signed bool) unionOperation { + return unionOperation{Kind: operationKindV128Narrow, B1: originshape, B3: signed} +} + +// NewOperationV128ITruncSatFromF is a constructor for unionOperation with operationKindV128ITruncSatFromF. +// +// This corresponds to +// +// wasm.OpcodeVecI32x4TruncSatF64x2UZeroName wasm.OpcodeVecI32x4TruncSatF64x2SZeroName +// wasm.OpcodeVecI32x4TruncSatF32x4UName wasm.OpcodeVecI32x4TruncSatF32x4SName. +// +// originshape is the shape of the original lanes for truncation which is +// either shapeF32x4, or shapeF64x2. +func newOperationV128ITruncSatFromF(originshape shape, signed bool) unionOperation { + return unionOperation{Kind: operationKindV128ITruncSatFromF, B1: originshape, B3: signed} +} + +// atomicArithmeticOp is the type for the operation kind of atomic arithmetic operations. +type atomicArithmeticOp byte + +const ( + // atomicArithmeticOpAdd is the kind for an add operation. + atomicArithmeticOpAdd atomicArithmeticOp = iota + // atomicArithmeticOpSub is the kind for a sub operation. + atomicArithmeticOpSub + // atomicArithmeticOpAnd is the kind for a bitwise and operation. + atomicArithmeticOpAnd + // atomicArithmeticOpOr is the kind for a bitwise or operation. + atomicArithmeticOpOr + // atomicArithmeticOpXor is the kind for a bitwise xor operation. + atomicArithmeticOpXor + // atomicArithmeticOpNop is the kind for a nop operation. + atomicArithmeticOpNop +) + +// NewOperationAtomicMemoryWait is a constructor for unionOperation with operationKindAtomicMemoryWait. +// +// This corresponds to +// +// wasm.OpcodeAtomicWait32Name wasm.OpcodeAtomicWait64Name +func newOperationAtomicMemoryWait(unsignedType unsignedType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindAtomicMemoryWait, B1: byte(unsignedType), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicMemoryNotify is a constructor for unionOperation with operationKindAtomicMemoryNotify. +// +// This corresponds to +// +// wasm.OpcodeAtomicNotifyName +func newOperationAtomicMemoryNotify(arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindAtomicMemoryNotify, U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicFence is a constructor for unionOperation with operationKindAtomicFence. +// +// This corresponds to +// +// wasm.OpcodeAtomicFenceName +func newOperationAtomicFence() unionOperation { + return unionOperation{Kind: operationKindAtomicFence} +} + +// NewOperationAtomicLoad is a constructor for unionOperation with operationKindAtomicLoad. 
+// +// This corresponds to +// +// wasm.OpcodeAtomicI32LoadName wasm.OpcodeAtomicI64LoadName +func newOperationAtomicLoad(unsignedType unsignedType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindAtomicLoad, B1: byte(unsignedType), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicLoad8 is a constructor for unionOperation with operationKindAtomicLoad8. +// +// This corresponds to +// +// wasm.OpcodeAtomicI32Load8UName wasm.OpcodeAtomicI64Load8UName +func newOperationAtomicLoad8(unsignedType unsignedType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindAtomicLoad8, B1: byte(unsignedType), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicLoad16 is a constructor for unionOperation with operationKindAtomicLoad16. +// +// This corresponds to +// +// wasm.OpcodeAtomicI32Load16UName wasm.OpcodeAtomicI64Load16UName +func newOperationAtomicLoad16(unsignedType unsignedType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindAtomicLoad16, B1: byte(unsignedType), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicStore is a constructor for unionOperation with operationKindAtomicStore. +// +// This corresponds to +// +// wasm.OpcodeAtomicI32StoreName wasm.OpcodeAtomicI64StoreName +func newOperationAtomicStore(unsignedType unsignedType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindAtomicStore, B1: byte(unsignedType), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicStore8 is a constructor for unionOperation with operationKindAtomicStore8. +// +// This corresponds to +// +// wasm.OpcodeAtomicI32Store8UName wasm.OpcodeAtomicI64Store8UName +func newOperationAtomicStore8(unsignedType unsignedType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindAtomicStore8, B1: byte(unsignedType), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicStore16 is a constructor for unionOperation with operationKindAtomicStore16. +// +// This corresponds to +// +// wasm.OpcodeAtomicI32Store16UName wasm.OpcodeAtomicI64Store16UName +func newOperationAtomicStore16(unsignedType unsignedType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindAtomicStore16, B1: byte(unsignedType), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicRMW is a constructor for unionOperation with operationKindAtomicRMW. +// +// This corresponds to +// +// wasm.OpcodeAtomicI32RMWAddName wasm.OpcodeAtomicI64RmwAddName +// wasm.OpcodeAtomicI32RMWSubName wasm.OpcodeAtomicI64RmwSubName +// wasm.OpcodeAtomicI32RMWAndName wasm.OpcodeAtomicI64RmwAndName +// wasm.OpcodeAtomicI32RMWOrName wasm.OpcodeAtomicI64RmwOrName +// wasm.OpcodeAtomicI32RMWXorName wasm.OpcodeAtomicI64RmwXorName +func newOperationAtomicRMW(unsignedType unsignedType, arg memoryArg, op atomicArithmeticOp) unionOperation { + return unionOperation{Kind: operationKindAtomicRMW, B1: byte(unsignedType), B2: byte(op), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicRMW8 is a constructor for unionOperation with operationKindAtomicRMW8. 
+// +// This corresponds to +// +// wasm.OpcodeAtomicI32RMW8AddUName wasm.OpcodeAtomicI64Rmw8AddUName +// wasm.OpcodeAtomicI32RMW8SubUName wasm.OpcodeAtomicI64Rmw8SubUName +// wasm.OpcodeAtomicI32RMW8AndUName wasm.OpcodeAtomicI64Rmw8AndUName +// wasm.OpcodeAtomicI32RMW8OrUName wasm.OpcodeAtomicI64Rmw8OrUName +// wasm.OpcodeAtomicI32RMW8XorUName wasm.OpcodeAtomicI64Rmw8XorUName +func newOperationAtomicRMW8(unsignedType unsignedType, arg memoryArg, op atomicArithmeticOp) unionOperation { + return unionOperation{Kind: operationKindAtomicRMW8, B1: byte(unsignedType), B2: byte(op), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicRMW16 is a constructor for unionOperation with operationKindAtomicRMW16. +// +// This corresponds to +// +// wasm.OpcodeAtomicI32RMW16AddUName wasm.OpcodeAtomicI64Rmw16AddUName +// wasm.OpcodeAtomicI32RMW16SubUName wasm.OpcodeAtomicI64Rmw16SubUName +// wasm.OpcodeAtomicI32RMW16AndUName wasm.OpcodeAtomicI64Rmw16AndUName +// wasm.OpcodeAtomicI32RMW16OrUName wasm.OpcodeAtomicI64Rmw16OrUName +// wasm.OpcodeAtomicI32RMW16XorUName wasm.OpcodeAtomicI64Rmw16XorUName +func newOperationAtomicRMW16(unsignedType unsignedType, arg memoryArg, op atomicArithmeticOp) unionOperation { + return unionOperation{Kind: operationKindAtomicRMW16, B1: byte(unsignedType), B2: byte(op), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicRMWCmpxchg is a constructor for unionOperation with operationKindAtomicRMWCmpxchg. +// +// This corresponds to +// +// wasm.OpcodeAtomicI32RMWCmpxchgName wasm.OpcodeAtomicI64RmwCmpxchgName +func newOperationAtomicRMWCmpxchg(unsignedType unsignedType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindAtomicRMWCmpxchg, B1: byte(unsignedType), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicRMW8Cmpxchg is a constructor for unionOperation with operationKindAtomicRMW8Cmpxchg. +// +// This corresponds to +// +// wasm.OpcodeAtomicI32RMW8CmpxchgUName wasm.OpcodeAtomicI64Rmw8CmpxchgUName +func newOperationAtomicRMW8Cmpxchg(unsignedType unsignedType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindAtomicRMW8Cmpxchg, B1: byte(unsignedType), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} + +// NewOperationAtomicRMW16Cmpxchg is a constructor for unionOperation with operationKindAtomicRMW16Cmpxchg. +// +// This corresponds to +// +// wasm.OpcodeAtomicI32RMW16CmpxchgUName wasm.OpcodeAtomicI64Rmw16CmpxchgUName +func newOperationAtomicRMW16Cmpxchg(unsignedType unsignedType, arg memoryArg) unionOperation { + return unionOperation{Kind: operationKindAtomicRMW16Cmpxchg, B1: byte(unsignedType), U1: uint64(arg.Alignment), U2: uint64(arg.Offset)} +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/signature.go b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/signature.go new file mode 100644 index 000000000..7b9d5602d --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/interpreter/signature.go @@ -0,0 +1,767 @@ +package interpreter + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/wasm" +) + +// signature represents how a Wasm opcode +// manipulates the value stacks in terms of value types. 
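+//
+// For example, wasm.OpcodeI32Add pops two i32 values and pushes one i32, which the
+// table below expresses as signature_I32I32_I32. A hypothetical lookup might read
+// (illustration only, not this package's actual mapping code):
+//
+//	switch op {
+//	case wasm.OpcodeI32Add:
+//		return signature_I32I32_I32, nil
+//	case wasm.OpcodeF64Sqrt:
+//		return signature_F64_F64, nil
+//	}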
+type signature struct { + in, out []unsignedType +} + +var ( + signature_None_None = &signature{} + signature_Unknown_None = &signature{ + in: []unsignedType{unsignedTypeUnknown}, + } + signature_None_I32 = &signature{ + out: []unsignedType{unsignedTypeI32}, + } + signature_None_I64 = &signature{ + out: []unsignedType{unsignedTypeI64}, + } + signature_None_V128 = &signature{ + out: []unsignedType{unsignedTypeV128}, + } + signature_None_F32 = &signature{ + out: []unsignedType{unsignedTypeF32}, + } + signature_None_F64 = &signature{ + out: []unsignedType{unsignedTypeF64}, + } + signature_I32_None = &signature{ + in: []unsignedType{unsignedTypeI32}, + } + signature_I64_None = &signature{ + in: []unsignedType{unsignedTypeI64}, + } + signature_F32_None = &signature{ + in: []unsignedType{unsignedTypeF32}, + } + signature_F64_None = &signature{ + in: []unsignedType{unsignedTypeF64}, + } + signature_V128_None = &signature{ + in: []unsignedType{unsignedTypeV128}, + } + signature_I32_I32 = &signature{ + in: []unsignedType{unsignedTypeI32}, + out: []unsignedType{unsignedTypeI32}, + } + signature_I32_I64 = &signature{ + in: []unsignedType{unsignedTypeI32}, + out: []unsignedType{unsignedTypeI64}, + } + signature_I64_I64 = &signature{ + in: []unsignedType{unsignedTypeI64}, + out: []unsignedType{unsignedTypeI64}, + } + signature_I32_F32 = &signature{ + in: []unsignedType{unsignedTypeI32}, + out: []unsignedType{unsignedTypeF32}, + } + signature_I32_F64 = &signature{ + in: []unsignedType{unsignedTypeI32}, + out: []unsignedType{unsignedTypeF64}, + } + signature_I64_I32 = &signature{ + in: []unsignedType{unsignedTypeI64}, + out: []unsignedType{unsignedTypeI32}, + } + signature_I64_F32 = &signature{ + in: []unsignedType{unsignedTypeI64}, + out: []unsignedType{unsignedTypeF32}, + } + signature_I64_F64 = &signature{ + in: []unsignedType{unsignedTypeI64}, + out: []unsignedType{unsignedTypeF64}, + } + signature_F32_I32 = &signature{ + in: []unsignedType{unsignedTypeF32}, + out: []unsignedType{unsignedTypeI32}, + } + signature_F32_I64 = &signature{ + in: []unsignedType{unsignedTypeF32}, + out: []unsignedType{unsignedTypeI64}, + } + signature_F32_F64 = &signature{ + in: []unsignedType{unsignedTypeF32}, + out: []unsignedType{unsignedTypeF64}, + } + signature_F32_F32 = &signature{ + in: []unsignedType{unsignedTypeF32}, + out: []unsignedType{unsignedTypeF32}, + } + signature_F64_I32 = &signature{ + in: []unsignedType{unsignedTypeF64}, + out: []unsignedType{unsignedTypeI32}, + } + signature_F64_F32 = &signature{ + in: []unsignedType{unsignedTypeF64}, + out: []unsignedType{unsignedTypeF32}, + } + signature_F64_I64 = &signature{ + in: []unsignedType{unsignedTypeF64}, + out: []unsignedType{unsignedTypeI64}, + } + signature_F64_F64 = &signature{ + in: []unsignedType{unsignedTypeF64}, + out: []unsignedType{unsignedTypeF64}, + } + signature_I32I32_None = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeI32}, + } + + signature_I32I32_I32 = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeI32}, + out: []unsignedType{unsignedTypeI32}, + } + signature_I32I64_None = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeI64}, + } + signature_I32F32_None = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeF32}, + } + signature_I32F64_None = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeF64}, + } + signature_I64I32_I32 = &signature{ + in: []unsignedType{unsignedTypeI64, unsignedTypeI32}, + out: []unsignedType{unsignedTypeI32}, + } + signature_I64I64_I32 = &signature{ + 
in: []unsignedType{unsignedTypeI64, unsignedTypeI64}, + out: []unsignedType{unsignedTypeI32}, + } + signature_I64I64_I64 = &signature{ + in: []unsignedType{unsignedTypeI64, unsignedTypeI64}, + out: []unsignedType{unsignedTypeI64}, + } + signature_F32F32_I32 = &signature{ + in: []unsignedType{unsignedTypeF32, unsignedTypeF32}, + out: []unsignedType{unsignedTypeI32}, + } + signature_F32F32_F32 = &signature{ + in: []unsignedType{unsignedTypeF32, unsignedTypeF32}, + out: []unsignedType{unsignedTypeF32}, + } + signature_F64F64_I32 = &signature{ + in: []unsignedType{unsignedTypeF64, unsignedTypeF64}, + out: []unsignedType{unsignedTypeI32}, + } + signature_F64F64_F64 = &signature{ + in: []unsignedType{unsignedTypeF64, unsignedTypeF64}, + out: []unsignedType{unsignedTypeF64}, + } + signature_I32I32I32_None = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeI32, unsignedTypeI32}, + } + signature_I32I64I32_None = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeI64, unsignedTypeI32}, + } + signature_UnknownUnknownI32_Unknown = &signature{ + in: []unsignedType{unsignedTypeUnknown, unsignedTypeUnknown, unsignedTypeI32}, + out: []unsignedType{unsignedTypeUnknown}, + } + signature_V128V128_V128 = &signature{ + in: []unsignedType{unsignedTypeV128, unsignedTypeV128}, + out: []unsignedType{unsignedTypeV128}, + } + signature_V128V128V128_V32 = &signature{ + in: []unsignedType{unsignedTypeV128, unsignedTypeV128, unsignedTypeV128}, + out: []unsignedType{unsignedTypeV128}, + } + signature_I32_V128 = &signature{ + in: []unsignedType{unsignedTypeI32}, + out: []unsignedType{unsignedTypeV128}, + } + signature_I32V128_None = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeV128}, + } + signature_I32V128_V128 = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeV128}, + out: []unsignedType{unsignedTypeV128}, + } + signature_V128I32_V128 = &signature{ + in: []unsignedType{unsignedTypeV128, unsignedTypeI32}, + out: []unsignedType{unsignedTypeV128}, + } + signature_V128I64_V128 = &signature{ + in: []unsignedType{unsignedTypeV128, unsignedTypeI64}, + out: []unsignedType{unsignedTypeV128}, + } + signature_V128F32_V128 = &signature{ + in: []unsignedType{unsignedTypeV128, unsignedTypeF32}, + out: []unsignedType{unsignedTypeV128}, + } + signature_V128F64_V128 = &signature{ + in: []unsignedType{unsignedTypeV128, unsignedTypeF64}, + out: []unsignedType{unsignedTypeV128}, + } + signature_V128_I32 = &signature{ + in: []unsignedType{unsignedTypeV128}, + out: []unsignedType{unsignedTypeI32}, + } + signature_V128_I64 = &signature{ + in: []unsignedType{unsignedTypeV128}, + out: []unsignedType{unsignedTypeI64}, + } + signature_V128_F32 = &signature{ + in: []unsignedType{unsignedTypeV128}, + out: []unsignedType{unsignedTypeF32}, + } + signature_V128_F64 = &signature{ + in: []unsignedType{unsignedTypeV128}, + out: []unsignedType{unsignedTypeF64}, + } + signature_V128_V128 = &signature{ + in: []unsignedType{unsignedTypeV128}, + out: []unsignedType{unsignedTypeV128}, + } + signature_I64_V128 = &signature{ + in: []unsignedType{unsignedTypeI64}, + out: []unsignedType{unsignedTypeV128}, + } + signature_F32_V128 = &signature{ + in: []unsignedType{unsignedTypeF32}, + out: []unsignedType{unsignedTypeV128}, + } + signature_F64_V128 = &signature{ + in: []unsignedType{unsignedTypeF64}, + out: []unsignedType{unsignedTypeV128}, + } + signature_I32I64_I64 = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeI64}, + out: []unsignedType{unsignedTypeI64}, + } + signature_I32I32I64_I32 = 
&signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeI32, unsignedTypeI64}, + out: []unsignedType{unsignedTypeI32}, + } + signature_I32I64I64_I32 = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeI64, unsignedTypeI64}, + out: []unsignedType{unsignedTypeI32}, + } + signature_I32I32I32_I32 = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeI32, unsignedTypeI32}, + out: []unsignedType{unsignedTypeI32}, + } + signature_I32I64I64_I64 = &signature{ + in: []unsignedType{unsignedTypeI32, unsignedTypeI64, unsignedTypeI64}, + out: []unsignedType{unsignedTypeI64}, + } +) + +// wasmOpcodeSignature returns the signature of given Wasm opcode. +// Note that some of opcodes' signature vary depending on +// the function instance (for example, local types). +// "index" parameter is not used by most of opcodes. +// The returned signature is used for stack validation when lowering Wasm's opcodes to interpreterir. +func (c *compiler) wasmOpcodeSignature(op wasm.Opcode, index uint32) (*signature, error) { + switch op { + case wasm.OpcodeUnreachable, wasm.OpcodeNop, wasm.OpcodeBlock, wasm.OpcodeLoop: + return signature_None_None, nil + case wasm.OpcodeIf: + return signature_I32_None, nil + case wasm.OpcodeElse, wasm.OpcodeEnd, wasm.OpcodeBr: + return signature_None_None, nil + case wasm.OpcodeBrIf, wasm.OpcodeBrTable: + return signature_I32_None, nil + case wasm.OpcodeReturn: + return signature_None_None, nil + case wasm.OpcodeCall: + return c.funcTypeToSigs.get(c.funcs[index], false /* direct */), nil + case wasm.OpcodeCallIndirect: + return c.funcTypeToSigs.get(index, true /* call_indirect */), nil + case wasm.OpcodeDrop: + return signature_Unknown_None, nil + case wasm.OpcodeSelect, wasm.OpcodeTypedSelect: + return signature_UnknownUnknownI32_Unknown, nil + case wasm.OpcodeLocalGet: + inputLen := uint32(len(c.sig.Params)) + if l := uint32(len(c.localTypes)) + inputLen; index >= l { + return nil, fmt.Errorf("invalid local index for local.get %d >= %d", index, l) + } + var t wasm.ValueType + if index < inputLen { + t = c.sig.Params[index] + } else { + t = c.localTypes[index-inputLen] + } + return wasmValueTypeToUnsignedOutSignature(t), nil + case wasm.OpcodeLocalSet: + inputLen := uint32(len(c.sig.Params)) + if l := uint32(len(c.localTypes)) + inputLen; index >= l { + return nil, fmt.Errorf("invalid local index for local.get %d >= %d", index, l) + } + var t wasm.ValueType + if index < inputLen { + t = c.sig.Params[index] + } else { + t = c.localTypes[index-inputLen] + } + return wasmValueTypeToUnsignedInSignature(t), nil + case wasm.OpcodeLocalTee: + inputLen := uint32(len(c.sig.Params)) + if l := uint32(len(c.localTypes)) + inputLen; index >= l { + return nil, fmt.Errorf("invalid local index for local.get %d >= %d", index, l) + } + var t wasm.ValueType + if index < inputLen { + t = c.sig.Params[index] + } else { + t = c.localTypes[index-inputLen] + } + return wasmValueTypeToUnsignedInOutSignature(t), nil + case wasm.OpcodeGlobalGet: + if len(c.globals) <= int(index) { + return nil, fmt.Errorf("invalid global index for global.get %d >= %d", index, len(c.globals)) + } + return wasmValueTypeToUnsignedOutSignature(c.globals[index].ValType), nil + case wasm.OpcodeGlobalSet: + if len(c.globals) <= int(index) { + return nil, fmt.Errorf("invalid global index for global.get %d >= %d", index, len(c.globals)) + } + return wasmValueTypeToUnsignedInSignature(c.globals[index].ValType), nil + case wasm.OpcodeI32Load: + return signature_I32_I32, nil + case wasm.OpcodeI64Load: + return 
signature_I32_I64, nil + case wasm.OpcodeF32Load: + return signature_I32_F32, nil + case wasm.OpcodeF64Load: + return signature_I32_F64, nil + case wasm.OpcodeI32Load8S, wasm.OpcodeI32Load8U, wasm.OpcodeI32Load16S, wasm.OpcodeI32Load16U: + return signature_I32_I32, nil + case wasm.OpcodeI64Load8S, wasm.OpcodeI64Load8U, wasm.OpcodeI64Load16S, wasm.OpcodeI64Load16U, + wasm.OpcodeI64Load32S, wasm.OpcodeI64Load32U: + return signature_I32_I64, nil + case wasm.OpcodeI32Store: + return signature_I32I32_None, nil + case wasm.OpcodeI64Store: + return signature_I32I64_None, nil + case wasm.OpcodeF32Store: + return signature_I32F32_None, nil + case wasm.OpcodeF64Store: + return signature_I32F64_None, nil + case wasm.OpcodeI32Store8: + return signature_I32I32_None, nil + case wasm.OpcodeI32Store16: + return signature_I32I32_None, nil + case wasm.OpcodeI64Store8: + return signature_I32I64_None, nil + case wasm.OpcodeI64Store16: + return signature_I32I64_None, nil + case wasm.OpcodeI64Store32: + return signature_I32I64_None, nil + case wasm.OpcodeMemorySize: + return signature_None_I32, nil + case wasm.OpcodeMemoryGrow: + return signature_I32_I32, nil + case wasm.OpcodeI32Const: + return signature_None_I32, nil + case wasm.OpcodeI64Const: + return signature_None_I64, nil + case wasm.OpcodeF32Const: + return signature_None_F32, nil + case wasm.OpcodeF64Const: + return signature_None_F64, nil + case wasm.OpcodeI32Eqz: + return signature_I32_I32, nil + case wasm.OpcodeI32Eq, wasm.OpcodeI32Ne, wasm.OpcodeI32LtS, + wasm.OpcodeI32LtU, wasm.OpcodeI32GtS, wasm.OpcodeI32GtU, + wasm.OpcodeI32LeS, wasm.OpcodeI32LeU, wasm.OpcodeI32GeS, + wasm.OpcodeI32GeU: + return signature_I32I32_I32, nil + case wasm.OpcodeI64Eqz: + return signature_I64_I32, nil + case wasm.OpcodeI64Eq, wasm.OpcodeI64Ne, wasm.OpcodeI64LtS, + wasm.OpcodeI64LtU, wasm.OpcodeI64GtS, wasm.OpcodeI64GtU, + wasm.OpcodeI64LeS, wasm.OpcodeI64LeU, wasm.OpcodeI64GeS, + wasm.OpcodeI64GeU: + return signature_I64I64_I32, nil + case wasm.OpcodeF32Eq, wasm.OpcodeF32Ne, wasm.OpcodeF32Lt, + wasm.OpcodeF32Gt, wasm.OpcodeF32Le, wasm.OpcodeF32Ge: + return signature_F32F32_I32, nil + case wasm.OpcodeF64Eq, wasm.OpcodeF64Ne, wasm.OpcodeF64Lt, + wasm.OpcodeF64Gt, wasm.OpcodeF64Le, wasm.OpcodeF64Ge: + return signature_F64F64_I32, nil + case wasm.OpcodeI32Clz, wasm.OpcodeI32Ctz, wasm.OpcodeI32Popcnt: + return signature_I32_I32, nil + case wasm.OpcodeI32Add, wasm.OpcodeI32Sub, wasm.OpcodeI32Mul, + wasm.OpcodeI32DivS, wasm.OpcodeI32DivU, wasm.OpcodeI32RemS, + wasm.OpcodeI32RemU, wasm.OpcodeI32And, wasm.OpcodeI32Or, + wasm.OpcodeI32Xor, wasm.OpcodeI32Shl, wasm.OpcodeI32ShrS, + wasm.OpcodeI32ShrU, wasm.OpcodeI32Rotl, wasm.OpcodeI32Rotr: + return signature_I32I32_I32, nil + case wasm.OpcodeI64Clz, wasm.OpcodeI64Ctz, wasm.OpcodeI64Popcnt: + return signature_I64_I64, nil + case wasm.OpcodeI64Add, wasm.OpcodeI64Sub, wasm.OpcodeI64Mul, + wasm.OpcodeI64DivS, wasm.OpcodeI64DivU, wasm.OpcodeI64RemS, + wasm.OpcodeI64RemU, wasm.OpcodeI64And, wasm.OpcodeI64Or, + wasm.OpcodeI64Xor, wasm.OpcodeI64Shl, wasm.OpcodeI64ShrS, + wasm.OpcodeI64ShrU, wasm.OpcodeI64Rotl, wasm.OpcodeI64Rotr: + return signature_I64I64_I64, nil + case wasm.OpcodeF32Abs, wasm.OpcodeF32Neg, wasm.OpcodeF32Ceil, + wasm.OpcodeF32Floor, wasm.OpcodeF32Trunc, wasm.OpcodeF32Nearest, + wasm.OpcodeF32Sqrt: + return signature_F32_F32, nil + case wasm.OpcodeF32Add, wasm.OpcodeF32Sub, wasm.OpcodeF32Mul, + wasm.OpcodeF32Div, wasm.OpcodeF32Min, wasm.OpcodeF32Max, + wasm.OpcodeF32Copysign: + return signature_F32F32_F32, nil + case 
wasm.OpcodeF64Abs, wasm.OpcodeF64Neg, wasm.OpcodeF64Ceil, + wasm.OpcodeF64Floor, wasm.OpcodeF64Trunc, wasm.OpcodeF64Nearest, + wasm.OpcodeF64Sqrt: + return signature_F64_F64, nil + case wasm.OpcodeF64Add, wasm.OpcodeF64Sub, wasm.OpcodeF64Mul, + wasm.OpcodeF64Div, wasm.OpcodeF64Min, wasm.OpcodeF64Max, + wasm.OpcodeF64Copysign: + return signature_F64F64_F64, nil + case wasm.OpcodeI32WrapI64: + return signature_I64_I32, nil + case wasm.OpcodeI32TruncF32S, wasm.OpcodeI32TruncF32U: + return signature_F32_I32, nil + case wasm.OpcodeI32TruncF64S, wasm.OpcodeI32TruncF64U: + return signature_F64_I32, nil + case wasm.OpcodeI64ExtendI32S, wasm.OpcodeI64ExtendI32U: + return signature_I32_I64, nil + case wasm.OpcodeI64TruncF32S, wasm.OpcodeI64TruncF32U: + return signature_F32_I64, nil + case wasm.OpcodeI64TruncF64S, wasm.OpcodeI64TruncF64U: + return signature_F64_I64, nil + case wasm.OpcodeF32ConvertI32S, wasm.OpcodeF32ConvertI32U: + return signature_I32_F32, nil + case wasm.OpcodeF32ConvertI64S, wasm.OpcodeF32ConvertI64U: + return signature_I64_F32, nil + case wasm.OpcodeF32DemoteF64: + return signature_F64_F32, nil + case wasm.OpcodeF64ConvertI32S, wasm.OpcodeF64ConvertI32U: + return signature_I32_F64, nil + case wasm.OpcodeF64ConvertI64S, wasm.OpcodeF64ConvertI64U: + return signature_I64_F64, nil + case wasm.OpcodeF64PromoteF32: + return signature_F32_F64, nil + case wasm.OpcodeI32ReinterpretF32: + return signature_F32_I32, nil + case wasm.OpcodeI64ReinterpretF64: + return signature_F64_I64, nil + case wasm.OpcodeF32ReinterpretI32: + return signature_I32_F32, nil + case wasm.OpcodeF64ReinterpretI64: + return signature_I64_F64, nil + case wasm.OpcodeI32Extend8S, wasm.OpcodeI32Extend16S: + return signature_I32_I32, nil + case wasm.OpcodeI64Extend8S, wasm.OpcodeI64Extend16S, wasm.OpcodeI64Extend32S: + return signature_I64_I64, nil + case wasm.OpcodeTableGet: + // table.get takes table's offset and pushes the ref type value of opaque pointer as i64 value onto the stack. + return signature_I32_I64, nil + case wasm.OpcodeTableSet: + // table.set takes table's offset and the ref type value of opaque pointer as i64 value. + return signature_I32I64_None, nil + case wasm.OpcodeRefFunc: + // ref.func is translated as pushing the compiled function's opaque pointer (uint64) at interpreterir layer. + return signature_None_I64, nil + case wasm.OpcodeRefIsNull: + // ref.is_null is translated as checking if the uint64 on the top of the stack (opaque pointer) is zero or not. + return signature_I64_I32, nil + case wasm.OpcodeRefNull: + // ref.null is translated as i64.const 0. 
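As the comments above state, reference-typed values are opaque 64-bit pointers at this layer: ref.null becomes an i64 constant 0 and ref.is_null a zero check that yields an i32. A tiny illustrative sketch of that lowering:

```go
package main

import "fmt"

// Illustrative only: refs modeled as opaque uint64 "pointers".
func refNull() uint64 { return 0 } // ref.null lowers to an i64 constant 0.

// ref.is_null lowers to an i64 == 0 check producing an i32 (0 or 1).
func refIsNull(ref uint64) uint32 {
	if ref == 0 {
		return 1
	}
	return 0
}

func main() {
	fmt.Println(refIsNull(refNull()), refIsNull(0xdeadbeef)) // 1 0
}
```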
+ return signature_None_I64, nil + case wasm.OpcodeMiscPrefix: + switch miscOp := c.body[c.pc+1]; miscOp { + case wasm.OpcodeMiscI32TruncSatF32S, wasm.OpcodeMiscI32TruncSatF32U: + return signature_F32_I32, nil + case wasm.OpcodeMiscI32TruncSatF64S, wasm.OpcodeMiscI32TruncSatF64U: + return signature_F64_I32, nil + case wasm.OpcodeMiscI64TruncSatF32S, wasm.OpcodeMiscI64TruncSatF32U: + return signature_F32_I64, nil + case wasm.OpcodeMiscI64TruncSatF64S, wasm.OpcodeMiscI64TruncSatF64U: + return signature_F64_I64, nil + case wasm.OpcodeMiscMemoryInit, wasm.OpcodeMiscMemoryCopy, wasm.OpcodeMiscMemoryFill, + wasm.OpcodeMiscTableInit, wasm.OpcodeMiscTableCopy: + return signature_I32I32I32_None, nil + case wasm.OpcodeMiscDataDrop, wasm.OpcodeMiscElemDrop: + return signature_None_None, nil + case wasm.OpcodeMiscTableGrow: + return signature_I64I32_I32, nil + case wasm.OpcodeMiscTableSize: + return signature_None_I32, nil + case wasm.OpcodeMiscTableFill: + return signature_I32I64I32_None, nil + default: + return nil, fmt.Errorf("unsupported misc instruction in interpreterir: 0x%x", op) + } + case wasm.OpcodeVecPrefix: + switch vecOp := c.body[c.pc+1]; vecOp { + case wasm.OpcodeVecV128Const: + return signature_None_V128, nil + case wasm.OpcodeVecV128Load, wasm.OpcodeVecV128Load8x8s, wasm.OpcodeVecV128Load8x8u, + wasm.OpcodeVecV128Load16x4s, wasm.OpcodeVecV128Load16x4u, wasm.OpcodeVecV128Load32x2s, + wasm.OpcodeVecV128Load32x2u, wasm.OpcodeVecV128Load8Splat, wasm.OpcodeVecV128Load16Splat, + wasm.OpcodeVecV128Load32Splat, wasm.OpcodeVecV128Load64Splat, wasm.OpcodeVecV128Load32zero, + wasm.OpcodeVecV128Load64zero: + return signature_I32_V128, nil + case wasm.OpcodeVecV128Load8Lane, wasm.OpcodeVecV128Load16Lane, + wasm.OpcodeVecV128Load32Lane, wasm.OpcodeVecV128Load64Lane: + return signature_I32V128_V128, nil + case wasm.OpcodeVecV128Store, + wasm.OpcodeVecV128Store8Lane, + wasm.OpcodeVecV128Store16Lane, + wasm.OpcodeVecV128Store32Lane, + wasm.OpcodeVecV128Store64Lane: + return signature_I32V128_None, nil + case wasm.OpcodeVecI8x16ExtractLaneS, + wasm.OpcodeVecI8x16ExtractLaneU, + wasm.OpcodeVecI16x8ExtractLaneS, + wasm.OpcodeVecI16x8ExtractLaneU, + wasm.OpcodeVecI32x4ExtractLane: + return signature_V128_I32, nil + case wasm.OpcodeVecI64x2ExtractLane: + return signature_V128_I64, nil + case wasm.OpcodeVecF32x4ExtractLane: + return signature_V128_F32, nil + case wasm.OpcodeVecF64x2ExtractLane: + return signature_V128_F64, nil + case wasm.OpcodeVecI8x16ReplaceLane, wasm.OpcodeVecI16x8ReplaceLane, wasm.OpcodeVecI32x4ReplaceLane, + wasm.OpcodeVecI8x16Shl, wasm.OpcodeVecI8x16ShrS, wasm.OpcodeVecI8x16ShrU, + wasm.OpcodeVecI16x8Shl, wasm.OpcodeVecI16x8ShrS, wasm.OpcodeVecI16x8ShrU, + wasm.OpcodeVecI32x4Shl, wasm.OpcodeVecI32x4ShrS, wasm.OpcodeVecI32x4ShrU, + wasm.OpcodeVecI64x2Shl, wasm.OpcodeVecI64x2ShrS, wasm.OpcodeVecI64x2ShrU: + return signature_V128I32_V128, nil + case wasm.OpcodeVecI64x2ReplaceLane: + return signature_V128I64_V128, nil + case wasm.OpcodeVecF32x4ReplaceLane: + return signature_V128F32_V128, nil + case wasm.OpcodeVecF64x2ReplaceLane: + return signature_V128F64_V128, nil + case wasm.OpcodeVecI8x16Splat, + wasm.OpcodeVecI16x8Splat, + wasm.OpcodeVecI32x4Splat: + return signature_I32_V128, nil + case wasm.OpcodeVecI64x2Splat: + return signature_I64_V128, nil + case wasm.OpcodeVecF32x4Splat: + return signature_F32_V128, nil + case wasm.OpcodeVecF64x2Splat: + return signature_F64_V128, nil + case wasm.OpcodeVecV128i8x16Shuffle, wasm.OpcodeVecI8x16Swizzle, wasm.OpcodeVecV128And, 
wasm.OpcodeVecV128Or, wasm.OpcodeVecV128Xor, wasm.OpcodeVecV128AndNot: + return signature_V128V128_V128, nil + case wasm.OpcodeVecI8x16AllTrue, wasm.OpcodeVecI16x8AllTrue, wasm.OpcodeVecI32x4AllTrue, wasm.OpcodeVecI64x2AllTrue, + wasm.OpcodeVecV128AnyTrue, + wasm.OpcodeVecI8x16BitMask, wasm.OpcodeVecI16x8BitMask, wasm.OpcodeVecI32x4BitMask, wasm.OpcodeVecI64x2BitMask: + return signature_V128_I32, nil + case wasm.OpcodeVecV128Not, wasm.OpcodeVecI8x16Neg, wasm.OpcodeVecI16x8Neg, wasm.OpcodeVecI32x4Neg, wasm.OpcodeVecI64x2Neg, + wasm.OpcodeVecF32x4Neg, wasm.OpcodeVecF64x2Neg, wasm.OpcodeVecF32x4Sqrt, wasm.OpcodeVecF64x2Sqrt, + wasm.OpcodeVecI8x16Abs, wasm.OpcodeVecI8x16Popcnt, wasm.OpcodeVecI16x8Abs, wasm.OpcodeVecI32x4Abs, wasm.OpcodeVecI64x2Abs, + wasm.OpcodeVecF32x4Abs, wasm.OpcodeVecF64x2Abs, + wasm.OpcodeVecF32x4Ceil, wasm.OpcodeVecF32x4Floor, wasm.OpcodeVecF32x4Trunc, wasm.OpcodeVecF32x4Nearest, + wasm.OpcodeVecF64x2Ceil, wasm.OpcodeVecF64x2Floor, wasm.OpcodeVecF64x2Trunc, wasm.OpcodeVecF64x2Nearest, + wasm.OpcodeVecI16x8ExtendLowI8x16S, wasm.OpcodeVecI16x8ExtendHighI8x16S, wasm.OpcodeVecI16x8ExtendLowI8x16U, wasm.OpcodeVecI16x8ExtendHighI8x16U, + wasm.OpcodeVecI32x4ExtendLowI16x8S, wasm.OpcodeVecI32x4ExtendHighI16x8S, wasm.OpcodeVecI32x4ExtendLowI16x8U, wasm.OpcodeVecI32x4ExtendHighI16x8U, + wasm.OpcodeVecI64x2ExtendLowI32x4S, wasm.OpcodeVecI64x2ExtendHighI32x4S, wasm.OpcodeVecI64x2ExtendLowI32x4U, wasm.OpcodeVecI64x2ExtendHighI32x4U, + wasm.OpcodeVecI16x8ExtaddPairwiseI8x16S, wasm.OpcodeVecI16x8ExtaddPairwiseI8x16U, wasm.OpcodeVecI32x4ExtaddPairwiseI16x8S, wasm.OpcodeVecI32x4ExtaddPairwiseI16x8U, + wasm.OpcodeVecF64x2PromoteLowF32x4Zero, wasm.OpcodeVecF32x4DemoteF64x2Zero, + wasm.OpcodeVecF32x4ConvertI32x4S, wasm.OpcodeVecF32x4ConvertI32x4U, + wasm.OpcodeVecF64x2ConvertLowI32x4S, wasm.OpcodeVecF64x2ConvertLowI32x4U, + wasm.OpcodeVecI32x4TruncSatF32x4S, wasm.OpcodeVecI32x4TruncSatF32x4U, + wasm.OpcodeVecI32x4TruncSatF64x2SZero, wasm.OpcodeVecI32x4TruncSatF64x2UZero: + return signature_V128_V128, nil + case wasm.OpcodeVecV128Bitselect: + return signature_V128V128V128_V32, nil + case wasm.OpcodeVecI8x16Eq, wasm.OpcodeVecI8x16Ne, wasm.OpcodeVecI8x16LtS, wasm.OpcodeVecI8x16LtU, wasm.OpcodeVecI8x16GtS, + wasm.OpcodeVecI8x16GtU, wasm.OpcodeVecI8x16LeS, wasm.OpcodeVecI8x16LeU, wasm.OpcodeVecI8x16GeS, wasm.OpcodeVecI8x16GeU, + wasm.OpcodeVecI16x8Eq, wasm.OpcodeVecI16x8Ne, wasm.OpcodeVecI16x8LtS, wasm.OpcodeVecI16x8LtU, wasm.OpcodeVecI16x8GtS, + wasm.OpcodeVecI16x8GtU, wasm.OpcodeVecI16x8LeS, wasm.OpcodeVecI16x8LeU, wasm.OpcodeVecI16x8GeS, wasm.OpcodeVecI16x8GeU, + wasm.OpcodeVecI32x4Eq, wasm.OpcodeVecI32x4Ne, wasm.OpcodeVecI32x4LtS, wasm.OpcodeVecI32x4LtU, wasm.OpcodeVecI32x4GtS, + wasm.OpcodeVecI32x4GtU, wasm.OpcodeVecI32x4LeS, wasm.OpcodeVecI32x4LeU, wasm.OpcodeVecI32x4GeS, wasm.OpcodeVecI32x4GeU, + wasm.OpcodeVecI64x2Eq, wasm.OpcodeVecI64x2Ne, wasm.OpcodeVecI64x2LtS, wasm.OpcodeVecI64x2GtS, wasm.OpcodeVecI64x2LeS, + wasm.OpcodeVecI64x2GeS, wasm.OpcodeVecF32x4Eq, wasm.OpcodeVecF32x4Ne, wasm.OpcodeVecF32x4Lt, wasm.OpcodeVecF32x4Gt, + wasm.OpcodeVecF32x4Le, wasm.OpcodeVecF32x4Ge, wasm.OpcodeVecF64x2Eq, wasm.OpcodeVecF64x2Ne, wasm.OpcodeVecF64x2Lt, + wasm.OpcodeVecF64x2Gt, wasm.OpcodeVecF64x2Le, wasm.OpcodeVecF64x2Ge, + wasm.OpcodeVecI8x16Add, wasm.OpcodeVecI8x16AddSatS, wasm.OpcodeVecI8x16AddSatU, wasm.OpcodeVecI8x16Sub, + wasm.OpcodeVecI8x16SubSatS, wasm.OpcodeVecI8x16SubSatU, + wasm.OpcodeVecI16x8Add, wasm.OpcodeVecI16x8AddSatS, wasm.OpcodeVecI16x8AddSatU, wasm.OpcodeVecI16x8Sub, + 
wasm.OpcodeVecI16x8SubSatS, wasm.OpcodeVecI16x8SubSatU, wasm.OpcodeVecI16x8Mul, + wasm.OpcodeVecI32x4Add, wasm.OpcodeVecI32x4Sub, wasm.OpcodeVecI32x4Mul, + wasm.OpcodeVecI64x2Add, wasm.OpcodeVecI64x2Sub, wasm.OpcodeVecI64x2Mul, + wasm.OpcodeVecF32x4Add, wasm.OpcodeVecF32x4Sub, wasm.OpcodeVecF32x4Mul, wasm.OpcodeVecF32x4Div, + wasm.OpcodeVecF64x2Add, wasm.OpcodeVecF64x2Sub, wasm.OpcodeVecF64x2Mul, wasm.OpcodeVecF64x2Div, + wasm.OpcodeVecI8x16MinS, wasm.OpcodeVecI8x16MinU, wasm.OpcodeVecI8x16MaxS, wasm.OpcodeVecI8x16MaxU, wasm.OpcodeVecI8x16AvgrU, + wasm.OpcodeVecI16x8MinS, wasm.OpcodeVecI16x8MinU, wasm.OpcodeVecI16x8MaxS, wasm.OpcodeVecI16x8MaxU, wasm.OpcodeVecI16x8AvgrU, + wasm.OpcodeVecI32x4MinS, wasm.OpcodeVecI32x4MinU, wasm.OpcodeVecI32x4MaxS, wasm.OpcodeVecI32x4MaxU, + wasm.OpcodeVecF32x4Min, wasm.OpcodeVecF32x4Max, wasm.OpcodeVecF64x2Min, wasm.OpcodeVecF64x2Max, + wasm.OpcodeVecF32x4Pmin, wasm.OpcodeVecF32x4Pmax, wasm.OpcodeVecF64x2Pmin, wasm.OpcodeVecF64x2Pmax, + wasm.OpcodeVecI16x8Q15mulrSatS, + wasm.OpcodeVecI16x8ExtMulLowI8x16S, wasm.OpcodeVecI16x8ExtMulHighI8x16S, wasm.OpcodeVecI16x8ExtMulLowI8x16U, wasm.OpcodeVecI16x8ExtMulHighI8x16U, + wasm.OpcodeVecI32x4ExtMulLowI16x8S, wasm.OpcodeVecI32x4ExtMulHighI16x8S, wasm.OpcodeVecI32x4ExtMulLowI16x8U, wasm.OpcodeVecI32x4ExtMulHighI16x8U, + wasm.OpcodeVecI64x2ExtMulLowI32x4S, wasm.OpcodeVecI64x2ExtMulHighI32x4S, wasm.OpcodeVecI64x2ExtMulLowI32x4U, wasm.OpcodeVecI64x2ExtMulHighI32x4U, + wasm.OpcodeVecI32x4DotI16x8S, + wasm.OpcodeVecI8x16NarrowI16x8S, wasm.OpcodeVecI8x16NarrowI16x8U, wasm.OpcodeVecI16x8NarrowI32x4S, wasm.OpcodeVecI16x8NarrowI32x4U: + return signature_V128V128_V128, nil + default: + return nil, fmt.Errorf("unsupported vector instruction in interpreterir: %s", wasm.VectorInstructionName(vecOp)) + } + case wasm.OpcodeAtomicPrefix: + switch atomicOp := c.body[c.pc+1]; atomicOp { + case wasm.OpcodeAtomicMemoryNotify: + return signature_I32I32_I32, nil + case wasm.OpcodeAtomicMemoryWait32: + return signature_I32I32I64_I32, nil + case wasm.OpcodeAtomicMemoryWait64: + return signature_I32I64I64_I32, nil + case wasm.OpcodeAtomicFence: + return signature_None_None, nil + case wasm.OpcodeAtomicI32Load, wasm.OpcodeAtomicI32Load8U, wasm.OpcodeAtomicI32Load16U: + return signature_I32_I32, nil + case wasm.OpcodeAtomicI64Load, wasm.OpcodeAtomicI64Load8U, wasm.OpcodeAtomicI64Load16U, wasm.OpcodeAtomicI64Load32U: + return signature_I32_I64, nil + case wasm.OpcodeAtomicI32Store, wasm.OpcodeAtomicI32Store8, wasm.OpcodeAtomicI32Store16: + return signature_I32I32_None, nil + case wasm.OpcodeAtomicI64Store, wasm.OpcodeAtomicI64Store8, wasm.OpcodeAtomicI64Store16, wasm.OpcodeAtomicI64Store32: + return signature_I32I64_None, nil + case wasm.OpcodeAtomicI32RmwAdd, wasm.OpcodeAtomicI32RmwSub, wasm.OpcodeAtomicI32RmwAnd, wasm.OpcodeAtomicI32RmwOr, wasm.OpcodeAtomicI32RmwXor, wasm.OpcodeAtomicI32RmwXchg, + wasm.OpcodeAtomicI32Rmw8AddU, wasm.OpcodeAtomicI32Rmw8SubU, wasm.OpcodeAtomicI32Rmw8AndU, wasm.OpcodeAtomicI32Rmw8OrU, wasm.OpcodeAtomicI32Rmw8XorU, wasm.OpcodeAtomicI32Rmw8XchgU, + wasm.OpcodeAtomicI32Rmw16AddU, wasm.OpcodeAtomicI32Rmw16SubU, wasm.OpcodeAtomicI32Rmw16AndU, wasm.OpcodeAtomicI32Rmw16OrU, wasm.OpcodeAtomicI32Rmw16XorU, wasm.OpcodeAtomicI32Rmw16XchgU: + return signature_I32I32_I32, nil + case wasm.OpcodeAtomicI64RmwAdd, wasm.OpcodeAtomicI64RmwSub, wasm.OpcodeAtomicI64RmwAnd, wasm.OpcodeAtomicI64RmwOr, wasm.OpcodeAtomicI64RmwXor, wasm.OpcodeAtomicI64RmwXchg, + wasm.OpcodeAtomicI64Rmw8AddU, wasm.OpcodeAtomicI64Rmw8SubU, 
wasm.OpcodeAtomicI64Rmw8AndU, wasm.OpcodeAtomicI64Rmw8OrU, wasm.OpcodeAtomicI64Rmw8XorU, wasm.OpcodeAtomicI64Rmw8XchgU, + wasm.OpcodeAtomicI64Rmw16AddU, wasm.OpcodeAtomicI64Rmw16SubU, wasm.OpcodeAtomicI64Rmw16AndU, wasm.OpcodeAtomicI64Rmw16OrU, wasm.OpcodeAtomicI64Rmw16XorU, wasm.OpcodeAtomicI64Rmw16XchgU, + wasm.OpcodeAtomicI64Rmw32AddU, wasm.OpcodeAtomicI64Rmw32SubU, wasm.OpcodeAtomicI64Rmw32AndU, wasm.OpcodeAtomicI64Rmw32OrU, wasm.OpcodeAtomicI64Rmw32XorU, wasm.OpcodeAtomicI64Rmw32XchgU: + return signature_I32I64_I64, nil + case wasm.OpcodeAtomicI32RmwCmpxchg, wasm.OpcodeAtomicI32Rmw8CmpxchgU, wasm.OpcodeAtomicI32Rmw16CmpxchgU: + return signature_I32I32I32_I32, nil + case wasm.OpcodeAtomicI64RmwCmpxchg, wasm.OpcodeAtomicI64Rmw8CmpxchgU, wasm.OpcodeAtomicI64Rmw16CmpxchgU, wasm.OpcodeAtomicI64Rmw32CmpxchgU: + return signature_I32I64I64_I64, nil + default: + return nil, fmt.Errorf("unsupported atomic instruction in interpreterir: %s", wasm.AtomicInstructionName(atomicOp)) + } + default: + return nil, fmt.Errorf("unsupported instruction in interpreterir: 0x%x", op) + } +} + +// funcTypeToIRSignatures is the central cache for a module to get the *signature +// for function calls. +type funcTypeToIRSignatures struct { + directCalls []*signature + indirectCalls []*signature + wasmTypes []wasm.FunctionType +} + +// get returns the *signature for the direct or indirect function call against functions whose type is at `typeIndex`. +func (f *funcTypeToIRSignatures) get(typeIndex wasm.Index, indirect bool) *signature { + var sig *signature + if indirect { + sig = f.indirectCalls[typeIndex] + } else { + sig = f.directCalls[typeIndex] + } + if sig != nil { + return sig + } + + tp := &f.wasmTypes[typeIndex] + if indirect { + sig = &signature{ + in: make([]unsignedType, 0, len(tp.Params)+1), // +1 to reserve space for call indirect index. + out: make([]unsignedType, 0, len(tp.Results)), + } + } else { + sig = &signature{ + in: make([]unsignedType, 0, len(tp.Params)), + out: make([]unsignedType, 0, len(tp.Results)), + } + } + + for _, vt := range tp.Params { + sig.in = append(sig.in, wasmValueTypeTounsignedType(vt)) + } + for _, vt := range tp.Results { + sig.out = append(sig.out, wasmValueTypeTounsignedType(vt)) + } + + if indirect { + sig.in = append(sig.in, unsignedTypeI32) + f.indirectCalls[typeIndex] = sig + } else { + f.directCalls[typeIndex] = sig + } + return sig +} + +func wasmValueTypeTounsignedType(vt wasm.ValueType) unsignedType { + switch vt { + case wasm.ValueTypeI32: + return unsignedTypeI32 + case wasm.ValueTypeI64, + // From interpreterir layer, ref type values are opaque 64-bit pointers. + wasm.ValueTypeExternref, wasm.ValueTypeFuncref: + return unsignedTypeI64 + case wasm.ValueTypeF32: + return unsignedTypeF32 + case wasm.ValueTypeF64: + return unsignedTypeF64 + case wasm.ValueTypeV128: + return unsignedTypeV128 + } + panic("unreachable") +} + +func wasmValueTypeToUnsignedOutSignature(vt wasm.ValueType) *signature { + switch vt { + case wasm.ValueTypeI32: + return signature_None_I32 + case wasm.ValueTypeI64, + // From interpreterir layer, ref type values are opaque 64-bit pointers. 
+ wasm.ValueTypeExternref, wasm.ValueTypeFuncref: + return signature_None_I64 + case wasm.ValueTypeF32: + return signature_None_F32 + case wasm.ValueTypeF64: + return signature_None_F64 + case wasm.ValueTypeV128: + return signature_None_V128 + } + panic("unreachable") +} + +func wasmValueTypeToUnsignedInSignature(vt wasm.ValueType) *signature { + switch vt { + case wasm.ValueTypeI32: + return signature_I32_None + case wasm.ValueTypeI64, + // From interpreterir layer, ref type values are opaque 64-bit pointers. + wasm.ValueTypeExternref, wasm.ValueTypeFuncref: + return signature_I64_None + case wasm.ValueTypeF32: + return signature_F32_None + case wasm.ValueTypeF64: + return signature_F64_None + case wasm.ValueTypeV128: + return signature_V128_None + } + panic("unreachable") +} + +func wasmValueTypeToUnsignedInOutSignature(vt wasm.ValueType) *signature { + switch vt { + case wasm.ValueTypeI32: + return signature_I32_I32 + case wasm.ValueTypeI64, + // At interpreterir layer, ref type values are opaque 64-bit pointers. + wasm.ValueTypeExternref, wasm.ValueTypeFuncref: + return signature_I64_I64 + case wasm.ValueTypeF32: + return signature_F32_F32 + case wasm.ValueTypeF64: + return signature_F64_F64 + case wasm.ValueTypeV128: + return signature_V128_V128 + } + panic("unreachable") +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/abi.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/abi.go new file mode 100644 index 000000000..cf91c6b7a --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/abi.go @@ -0,0 +1,170 @@ +package backend + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +type ( + // FunctionABI represents the ABI information for a function which corresponds to a ssa.Signature. + FunctionABI struct { + Initialized bool + + Args, Rets []ABIArg + ArgStackSize, RetStackSize int64 + + ArgIntRealRegs byte + ArgFloatRealRegs byte + RetIntRealRegs byte + RetFloatRealRegs byte + } + + // ABIArg represents either argument or return value's location. + ABIArg struct { + // Index is the index of the argument. + Index int + // Kind is the kind of the argument. + Kind ABIArgKind + // Reg is valid if Kind == ABIArgKindReg. + // This VReg must be based on RealReg. + Reg regalloc.VReg + // Offset is valid if Kind == ABIArgKindStack. + // This is the offset from the beginning of either arg or ret stack slot. + Offset int64 + // Type is the type of the argument. + Type ssa.Type + } + + // ABIArgKind is the kind of ABI argument. + ABIArgKind byte +) + +const ( + // ABIArgKindReg represents an argument passed in a register. + ABIArgKindReg = iota + // ABIArgKindStack represents an argument passed in the stack. + ABIArgKindStack +) + +// String implements fmt.Stringer. +func (a *ABIArg) String() string { + return fmt.Sprintf("args[%d]: %s", a.Index, a.Kind) +} + +// String implements fmt.Stringer. +func (a ABIArgKind) String() string { + switch a { + case ABIArgKindReg: + return "reg" + case ABIArgKindStack: + return "stack" + default: + panic("BUG") + } +} + +// Init initializes the abiImpl for the given signature. 
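The Init/setABIArgs code that follows assigns each parameter or result to the next free integer or floating-point register and, once those run out, to an 8-byte (16-byte for vectors) stack slot. The sketch below illustrates that register-first classification with hypothetical types; it is not the vendored algorithm itself:

```go
package main

import "fmt"

// loc is a hypothetical stand-in for an ABI argument location: either a
// register number or a stack offset.
type loc struct {
	inReg  bool
	reg    int   // register number when inReg
	offset int64 // stack offset otherwise
}

func classify(isFloat []bool, intRegs, floatRegs int) []loc {
	locs := make([]loc, len(isFloat))
	var nextInt, nextFloat int
	var stackOffset int64
	for i, f := range isFloat {
		switch {
		case !f && nextInt < intRegs:
			locs[i] = loc{inReg: true, reg: nextInt}
			nextInt++
		case f && nextFloat < floatRegs:
			locs[i] = loc{inReg: true, reg: nextFloat}
			nextFloat++
		default:
			locs[i] = loc{offset: stackOffset}
			stackOffset += 8 // one 8-byte slot per spilled scalar
		}
	}
	return locs
}

func main() {
	// Three integer values with only two int registers available: the third spills.
	fmt.Println(classify([]bool{false, false, false}, 2, 4))
}
```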
+func (a *FunctionABI) Init(sig *ssa.Signature, argResultInts, argResultFloats []regalloc.RealReg) { + if len(a.Rets) < len(sig.Results) { + a.Rets = make([]ABIArg, len(sig.Results)) + } + a.Rets = a.Rets[:len(sig.Results)] + a.RetStackSize = a.setABIArgs(a.Rets, sig.Results, argResultInts, argResultFloats) + if argsNum := len(sig.Params); len(a.Args) < argsNum { + a.Args = make([]ABIArg, argsNum) + } + a.Args = a.Args[:len(sig.Params)] + a.ArgStackSize = a.setABIArgs(a.Args, sig.Params, argResultInts, argResultFloats) + + // Gather the real registers usages in arg/return. + a.ArgIntRealRegs, a.ArgFloatRealRegs = 0, 0 + a.RetIntRealRegs, a.RetFloatRealRegs = 0, 0 + for i := range a.Rets { + r := &a.Rets[i] + if r.Kind == ABIArgKindReg { + if r.Type.IsInt() { + a.RetIntRealRegs++ + } else { + a.RetFloatRealRegs++ + } + } + } + for i := range a.Args { + arg := &a.Args[i] + if arg.Kind == ABIArgKindReg { + if arg.Type.IsInt() { + a.ArgIntRealRegs++ + } else { + a.ArgFloatRealRegs++ + } + } + } + + a.Initialized = true +} + +// setABIArgs sets the ABI arguments in the given slice. This assumes that len(s) >= len(types) +// where if len(s) > len(types), the last elements of s is for the multi-return slot. +func (a *FunctionABI) setABIArgs(s []ABIArg, types []ssa.Type, ints, floats []regalloc.RealReg) (stackSize int64) { + il, fl := len(ints), len(floats) + + var stackOffset int64 + intParamIndex, floatParamIndex := 0, 0 + for i, typ := range types { + arg := &s[i] + arg.Index = i + arg.Type = typ + if typ.IsInt() { + if intParamIndex >= il { + arg.Kind = ABIArgKindStack + const slotSize = 8 // Align 8 bytes. + arg.Offset = stackOffset + stackOffset += slotSize + } else { + arg.Kind = ABIArgKindReg + arg.Reg = regalloc.FromRealReg(ints[intParamIndex], regalloc.RegTypeInt) + intParamIndex++ + } + } else { + if floatParamIndex >= fl { + arg.Kind = ABIArgKindStack + slotSize := int64(8) // Align at least 8 bytes. + if typ.Bits() == 128 { // Vector. + slotSize = 16 + } + arg.Offset = stackOffset + stackOffset += slotSize + } else { + arg.Kind = ABIArgKindReg + arg.Reg = regalloc.FromRealReg(floats[floatParamIndex], regalloc.RegTypeFloat) + floatParamIndex++ + } + } + } + return stackOffset +} + +func (a *FunctionABI) AlignedArgResultStackSlotSize() uint32 { + stackSlotSize := a.RetStackSize + a.ArgStackSize + // Align stackSlotSize to 16 bytes. + stackSlotSize = (stackSlotSize + 15) &^ 15 + // Check overflow 32-bit. + if stackSlotSize > 0xFFFFFFFF { + panic("ABI stack slot size overflow") + } + return uint32(stackSlotSize) +} + +func (a *FunctionABI) ABIInfoAsUint64() uint64 { + return uint64(a.ArgIntRealRegs)<<56 | + uint64(a.ArgFloatRealRegs)<<48 | + uint64(a.RetIntRealRegs)<<40 | + uint64(a.RetFloatRealRegs)<<32 | + uint64(a.AlignedArgResultStackSlotSize()) +} + +func ABIInfoFromUint64(info uint64) (argIntRealRegs, argFloatRealRegs, retIntRealRegs, retFloatRealRegs byte, stackSlotSize uint32) { + return byte(info >> 56), byte(info >> 48), byte(info >> 40), byte(info >> 32), uint32(info) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/backend.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/backend.go new file mode 100644 index 000000000..dd67da43e --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/backend.go @@ -0,0 +1,3 @@ +// Package backend must be free of Wasm-specific concept. In other words, +// this package must not import internal/wasm package. 
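ABIInfoAsUint64 above packs the four one-byte register counts and the 32-bit aligned stack-slot size into a single uint64, and ABIInfoFromUint64 unpacks it. A quick, self-contained round-trip check of the same shift layout:

```go
package main

import "fmt"

// pack/unpack mirror the shift layout shown above: four counts in the high
// bytes, the aligned stack-slot size in the low 32 bits.
func pack(argInt, argFloat, retInt, retFloat byte, stackSlotSize uint32) uint64 {
	return uint64(argInt)<<56 | uint64(argFloat)<<48 |
		uint64(retInt)<<40 | uint64(retFloat)<<32 | uint64(stackSlotSize)
}

func unpack(info uint64) (argInt, argFloat, retInt, retFloat byte, stackSlotSize uint32) {
	return byte(info >> 56), byte(info >> 48), byte(info >> 40), byte(info >> 32), uint32(info)
}

func main() {
	info := pack(2, 1, 1, 0, 32)
	fmt.Println(unpack(info)) // 2 1 1 0 32
}
```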
+package backend diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler.go new file mode 100644 index 000000000..59bbfe02d --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler.go @@ -0,0 +1,417 @@ +package backend + +import ( + "context" + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// NewCompiler returns a new Compiler that can generate a machine code. +func NewCompiler(ctx context.Context, mach Machine, builder ssa.Builder) Compiler { + return newCompiler(ctx, mach, builder) +} + +func newCompiler(_ context.Context, mach Machine, builder ssa.Builder) *compiler { + argResultInts, argResultFloats := mach.ArgsResultsRegs() + c := &compiler{ + mach: mach, ssaBuilder: builder, + nextVRegID: regalloc.VRegIDNonReservedBegin, + argResultInts: argResultInts, + argResultFloats: argResultFloats, + } + mach.SetCompiler(c) + return c +} + +// Compiler is the backend of wazevo which takes ssa.Builder and Machine, +// use the information there to emit the final machine code. +type Compiler interface { + // SSABuilder returns the ssa.Builder used by this compiler. + SSABuilder() ssa.Builder + + // Compile executes the following steps: + // 1. Lower() + // 2. RegAlloc() + // 3. Finalize() + // 4. Encode() + // + // Each step can be called individually for testing purpose, therefore they are exposed in this interface too. + // + // The returned byte slices are the machine code and the relocation information for the machine code. + // The caller is responsible for copying them immediately since the compiler may reuse the buffer. + Compile(ctx context.Context) (_ []byte, _ []RelocationInfo, _ error) + + // Lower lowers the given ssa.Instruction to the machine-specific instructions. + Lower() + + // RegAlloc performs the register allocation after Lower is called. + RegAlloc() + + // Finalize performs the finalization of the compilation, including machine code emission. + // This must be called after RegAlloc. + Finalize(ctx context.Context) error + + // Buf returns the buffer of the encoded machine code. This is only used for testing purpose. + Buf() []byte + + BufPtr() *[]byte + + // Format returns the debug string of the current state of the compiler. + Format() string + + // Init initializes the internal state of the compiler for the next compilation. + Init() + + // AllocateVReg allocates a new virtual register of the given type. + AllocateVReg(typ ssa.Type) regalloc.VReg + + // ValueDefinition returns the definition of the given value. + ValueDefinition(ssa.Value) *SSAValueDefinition + + // VRegOf returns the virtual register of the given ssa.Value. + VRegOf(value ssa.Value) regalloc.VReg + + // TypeOf returns the ssa.Type of the given virtual register. + TypeOf(regalloc.VReg) ssa.Type + + // MatchInstr returns true if the given definition is from an instruction with the given opcode, the current group ID, + // and a refcount of 1. That means, the instruction can be merged/swapped within the current instruction group. + MatchInstr(def *SSAValueDefinition, opcode ssa.Opcode) bool + + // MatchInstrOneOf is the same as MatchInstr but for multiple opcodes. If it matches one of ssa.Opcode, + // this returns the opcode. Otherwise, this returns ssa.OpcodeInvalid. 
+ // + // Note: caller should be careful to avoid excessive allocation on opcodes slice. + MatchInstrOneOf(def *SSAValueDefinition, opcodes []ssa.Opcode) ssa.Opcode + + // AddRelocationInfo appends the relocation information for the function reference at the current buffer offset. + AddRelocationInfo(funcRef ssa.FuncRef) + + // AddSourceOffsetInfo appends the source offset information for the given offset. + AddSourceOffsetInfo(executableOffset int64, sourceOffset ssa.SourceOffset) + + // SourceOffsetInfo returns the source offset information for the current buffer offset. + SourceOffsetInfo() []SourceOffsetInfo + + // EmitByte appends a byte to the buffer. Used during the code emission. + EmitByte(b byte) + + // Emit4Bytes appends 4 bytes to the buffer. Used during the code emission. + Emit4Bytes(b uint32) + + // Emit8Bytes appends 8 bytes to the buffer. Used during the code emission. + Emit8Bytes(b uint64) + + // GetFunctionABI returns the ABI information for the given signature. + GetFunctionABI(sig *ssa.Signature) *FunctionABI +} + +// RelocationInfo represents the relocation information for a call instruction. +type RelocationInfo struct { + // Offset represents the offset from the beginning of the machine code of either a function or the entire module. + Offset int64 + // Target is the target function of the call instruction. + FuncRef ssa.FuncRef +} + +// compiler implements Compiler. +type compiler struct { + mach Machine + currentGID ssa.InstructionGroupID + ssaBuilder ssa.Builder + // nextVRegID is the next virtual register ID to be allocated. + nextVRegID regalloc.VRegID + // ssaValueToVRegs maps ssa.ValueID to regalloc.VReg. + ssaValueToVRegs [] /* VRegID to */ regalloc.VReg + // ssaValueDefinitions maps ssa.ValueID to its definition. + ssaValueDefinitions []SSAValueDefinition + // ssaValueRefCounts is a cached list obtained by ssa.Builder.ValueRefCounts(). + ssaValueRefCounts []int + // returnVRegs is the list of virtual registers that store the return values. + returnVRegs []regalloc.VReg + varEdges [][2]regalloc.VReg + varEdgeTypes []ssa.Type + constEdges []struct { + cInst *ssa.Instruction + dst regalloc.VReg + } + vRegSet []bool + vRegIDs []regalloc.VRegID + tempRegs []regalloc.VReg + tmpVals []ssa.Value + ssaTypeOfVRegID [] /* VRegID to */ ssa.Type + buf []byte + relocations []RelocationInfo + sourceOffsets []SourceOffsetInfo + // abis maps ssa.SignatureID to the ABI implementation. + abis []FunctionABI + argResultInts, argResultFloats []regalloc.RealReg +} + +// SourceOffsetInfo is a data to associate the source offset with the executable offset. +type SourceOffsetInfo struct { + // SourceOffset is the source offset in the original source code. + SourceOffset ssa.SourceOffset + // ExecutableOffset is the offset in the compiled executable. + ExecutableOffset int64 +} + +// Compile implements Compiler.Compile. 
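The Compiler interface documented above notes that the byte slice returned by Compile aliases an internal buffer that may be reused, so callers must copy it immediately. The sketch below illustrates that ownership rule with a stand-in function type rather than the real interface:

```go
package main

import (
	"context"
	"fmt"
)

// compileFn is a minimal stand-in for the Compile method shown above: it
// returns a slice that aliases an internal, reused buffer.
type compileFn func(context.Context) ([]byte, error)

func compileAndCopy(ctx context.Context, compile compileFn) ([]byte, error) {
	body, err := compile(ctx)
	if err != nil {
		return nil, err
	}
	// Detach from the reusable buffer before the next compilation overwrites it.
	return append([]byte(nil), body...), nil
}

func main() {
	internal := []byte{0x90, 0x90}
	compile := func(context.Context) ([]byte, error) { return internal, nil }
	out, _ := compileAndCopy(context.Background(), compile)
	internal[0] = 0xCC // simulate buffer reuse by a later compilation
	fmt.Println(out)   // [144 144] — the copy is unaffected
}
```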
+func (c *compiler) Compile(ctx context.Context) ([]byte, []RelocationInfo, error) { + c.Lower() + if wazevoapi.PrintSSAToBackendIRLowering && wazevoapi.PrintEnabledIndex(ctx) { + fmt.Printf("[[[after lowering for %s ]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), c.Format()) + } + if wazevoapi.DeterministicCompilationVerifierEnabled { + wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "After lowering to ISA specific IR", c.Format()) + } + c.RegAlloc() + if wazevoapi.PrintRegisterAllocated && wazevoapi.PrintEnabledIndex(ctx) { + fmt.Printf("[[[after regalloc for %s]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), c.Format()) + } + if wazevoapi.DeterministicCompilationVerifierEnabled { + wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "After Register Allocation", c.Format()) + } + if err := c.Finalize(ctx); err != nil { + return nil, nil, err + } + if wazevoapi.PrintFinalizedMachineCode && wazevoapi.PrintEnabledIndex(ctx) { + fmt.Printf("[[[after finalize for %s]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), c.Format()) + } + if wazevoapi.DeterministicCompilationVerifierEnabled { + wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "After Finalization", c.Format()) + } + return c.buf, c.relocations, nil +} + +// RegAlloc implements Compiler.RegAlloc. +func (c *compiler) RegAlloc() { + c.mach.RegAlloc() +} + +// Finalize implements Compiler.Finalize. +func (c *compiler) Finalize(ctx context.Context) error { + c.mach.PostRegAlloc() + return c.mach.Encode(ctx) +} + +// setCurrentGroupID sets the current instruction group ID. +func (c *compiler) setCurrentGroupID(gid ssa.InstructionGroupID) { + c.currentGID = gid +} + +// assignVirtualRegisters assigns a virtual register to each ssa.ValueID Valid in the ssa.Builder. +func (c *compiler) assignVirtualRegisters() { + builder := c.ssaBuilder + refCounts := builder.ValueRefCounts() + c.ssaValueRefCounts = refCounts + + need := len(refCounts) + if need >= len(c.ssaValueToVRegs) { + c.ssaValueToVRegs = append(c.ssaValueToVRegs, make([]regalloc.VReg, need+1)...) + } + if need >= len(c.ssaValueDefinitions) { + c.ssaValueDefinitions = append(c.ssaValueDefinitions, make([]SSAValueDefinition, need+1)...) + } + + for blk := builder.BlockIteratorReversePostOrderBegin(); blk != nil; blk = builder.BlockIteratorReversePostOrderNext() { + // First we assign a virtual register to each parameter. + for i := 0; i < blk.Params(); i++ { + p := blk.Param(i) + pid := p.ID() + typ := p.Type() + vreg := c.AllocateVReg(typ) + c.ssaValueToVRegs[pid] = vreg + c.ssaValueDefinitions[pid] = SSAValueDefinition{BlockParamValue: p, BlkParamVReg: vreg} + c.ssaTypeOfVRegID[vreg.ID()] = p.Type() + } + + // Assigns each value to a virtual register produced by instructions. 
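assignVirtualRegisters above, like several other methods in this file, grows its ID-indexed slices on demand with append(make(...)) so the backing arrays can be reused across compilations. A small self-contained version of that idiom:

```go
package main

import "fmt"

// ensureLen grows s so that index id is addressable, reusing the existing
// backing array where possible — the same append(make(...)) idiom used above.
func ensureLen(s []int, id int) []int {
	if id < len(s) {
		return s
	}
	return append(s, make([]int, id+1-len(s))...)
}

func main() {
	var table []int
	table = ensureLen(table, 5)
	table[5] = 42
	fmt.Println(len(table), table[5]) // 6 42
}
```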
+ for cur := blk.Root(); cur != nil; cur = cur.Next() { + r, rs := cur.Returns() + var N int + if r.Valid() { + id := r.ID() + ssaTyp := r.Type() + typ := r.Type() + vReg := c.AllocateVReg(typ) + c.ssaValueToVRegs[id] = vReg + c.ssaValueDefinitions[id] = SSAValueDefinition{ + Instr: cur, + N: 0, + RefCount: refCounts[id], + } + c.ssaTypeOfVRegID[vReg.ID()] = ssaTyp + N++ + } + for _, r := range rs { + id := r.ID() + ssaTyp := r.Type() + vReg := c.AllocateVReg(ssaTyp) + c.ssaValueToVRegs[id] = vReg + c.ssaValueDefinitions[id] = SSAValueDefinition{ + Instr: cur, + N: N, + RefCount: refCounts[id], + } + c.ssaTypeOfVRegID[vReg.ID()] = ssaTyp + N++ + } + } + } + + for i, retBlk := 0, builder.ReturnBlock(); i < retBlk.Params(); i++ { + typ := retBlk.Param(i).Type() + vReg := c.AllocateVReg(typ) + c.returnVRegs = append(c.returnVRegs, vReg) + c.ssaTypeOfVRegID[vReg.ID()] = typ + } +} + +// AllocateVReg implements Compiler.AllocateVReg. +func (c *compiler) AllocateVReg(typ ssa.Type) regalloc.VReg { + regType := regalloc.RegTypeOf(typ) + r := regalloc.VReg(c.nextVRegID).SetRegType(regType) + + id := r.ID() + if int(id) >= len(c.ssaTypeOfVRegID) { + c.ssaTypeOfVRegID = append(c.ssaTypeOfVRegID, make([]ssa.Type, id+1)...) + } + c.ssaTypeOfVRegID[id] = typ + c.nextVRegID++ + return r +} + +// Init implements Compiler.Init. +func (c *compiler) Init() { + c.currentGID = 0 + c.nextVRegID = regalloc.VRegIDNonReservedBegin + c.returnVRegs = c.returnVRegs[:0] + c.mach.Reset() + c.varEdges = c.varEdges[:0] + c.constEdges = c.constEdges[:0] + c.buf = c.buf[:0] + c.sourceOffsets = c.sourceOffsets[:0] + c.relocations = c.relocations[:0] +} + +// ValueDefinition implements Compiler.ValueDefinition. +func (c *compiler) ValueDefinition(value ssa.Value) *SSAValueDefinition { + return &c.ssaValueDefinitions[value.ID()] +} + +// VRegOf implements Compiler.VRegOf. +func (c *compiler) VRegOf(value ssa.Value) regalloc.VReg { + return c.ssaValueToVRegs[value.ID()] +} + +// Format implements Compiler.Format. +func (c *compiler) Format() string { + return c.mach.Format() +} + +// TypeOf implements Compiler.Format. +func (c *compiler) TypeOf(v regalloc.VReg) ssa.Type { + return c.ssaTypeOfVRegID[v.ID()] +} + +// MatchInstr implements Compiler.MatchInstr. +func (c *compiler) MatchInstr(def *SSAValueDefinition, opcode ssa.Opcode) bool { + instr := def.Instr + return def.IsFromInstr() && + instr.Opcode() == opcode && + instr.GroupID() == c.currentGID && + def.RefCount < 2 +} + +// MatchInstrOneOf implements Compiler.MatchInstrOneOf. +func (c *compiler) MatchInstrOneOf(def *SSAValueDefinition, opcodes []ssa.Opcode) ssa.Opcode { + instr := def.Instr + if !def.IsFromInstr() { + return ssa.OpcodeInvalid + } + + if instr.GroupID() != c.currentGID { + return ssa.OpcodeInvalid + } + + if def.RefCount >= 2 { + return ssa.OpcodeInvalid + } + + opcode := instr.Opcode() + for _, op := range opcodes { + if opcode == op { + return opcode + } + } + return ssa.OpcodeInvalid +} + +// SSABuilder implements Compiler .SSABuilder. +func (c *compiler) SSABuilder() ssa.Builder { + return c.ssaBuilder +} + +// AddSourceOffsetInfo implements Compiler.AddSourceOffsetInfo. +func (c *compiler) AddSourceOffsetInfo(executableOffset int64, sourceOffset ssa.SourceOffset) { + c.sourceOffsets = append(c.sourceOffsets, SourceOffsetInfo{ + SourceOffset: sourceOffset, + ExecutableOffset: executableOffset, + }) +} + +// SourceOffsetInfo implements Compiler.SourceOffsetInfo. 
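MatchInstr above only treats a producer as fusable when it comes from an instruction with the requested opcode, belongs to the current instruction group, and has fewer than two references, so no other user depends on the value staying materialized. A self-contained illustration of that predicate with hypothetical types:

```go
package main

import "fmt"

// def is a hypothetical stand-in for a value definition.
type def struct {
	fromInstr bool
	opcode    int
	groupID   int
	refCount  int
}

// canMerge mirrors the MatchInstr conditions: same opcode, same group, and at
// most one user, so folding the producer into its consumer is safe.
func canMerge(d def, wantOpcode, currentGroup int) bool {
	return d.fromInstr && d.opcode == wantOpcode && d.groupID == currentGroup && d.refCount < 2
}

func main() {
	fmt.Println(canMerge(def{true, 7, 3, 1}, 7, 3)) // true
	fmt.Println(canMerge(def{true, 7, 3, 2}, 7, 3)) // false: value has another user
}
```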
+func (c *compiler) SourceOffsetInfo() []SourceOffsetInfo { + return c.sourceOffsets +} + +// AddRelocationInfo implements Compiler.AddRelocationInfo. +func (c *compiler) AddRelocationInfo(funcRef ssa.FuncRef) { + c.relocations = append(c.relocations, RelocationInfo{ + Offset: int64(len(c.buf)), + FuncRef: funcRef, + }) +} + +// Emit8Bytes implements Compiler.Emit8Bytes. +func (c *compiler) Emit8Bytes(b uint64) { + c.buf = append(c.buf, byte(b), byte(b>>8), byte(b>>16), byte(b>>24), byte(b>>32), byte(b>>40), byte(b>>48), byte(b>>56)) +} + +// Emit4Bytes implements Compiler.Emit4Bytes. +func (c *compiler) Emit4Bytes(b uint32) { + c.buf = append(c.buf, byte(b), byte(b>>8), byte(b>>16), byte(b>>24)) +} + +// EmitByte implements Compiler.EmitByte. +func (c *compiler) EmitByte(b byte) { + c.buf = append(c.buf, b) +} + +// Buf implements Compiler.Buf. +func (c *compiler) Buf() []byte { + return c.buf +} + +// BufPtr implements Compiler.BufPtr. +func (c *compiler) BufPtr() *[]byte { + return &c.buf +} + +func (c *compiler) GetFunctionABI(sig *ssa.Signature) *FunctionABI { + if int(sig.ID) >= len(c.abis) { + c.abis = append(c.abis, make([]FunctionABI, int(sig.ID)+1)...) + } + + abi := &c.abis[sig.ID] + if abi.Initialized { + return abi + } + + abi.Init(sig, c.argResultInts, c.argResultFloats) + return abi +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler_lower.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler_lower.go new file mode 100644 index 000000000..80e65668a --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/compiler_lower.go @@ -0,0 +1,226 @@ +package backend + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// Lower implements Compiler.Lower. +func (c *compiler) Lower() { + c.assignVirtualRegisters() + c.mach.SetCurrentABI(c.GetFunctionABI(c.ssaBuilder.Signature())) + c.mach.ExecutableContext().StartLoweringFunction(c.ssaBuilder.BlockIDMax()) + c.lowerBlocks() +} + +// lowerBlocks lowers each block in the ssa.Builder. +func (c *compiler) lowerBlocks() { + builder := c.ssaBuilder + for blk := builder.BlockIteratorReversePostOrderBegin(); blk != nil; blk = builder.BlockIteratorReversePostOrderNext() { + c.lowerBlock(blk) + } + + ectx := c.mach.ExecutableContext() + // After lowering all blocks, we need to link adjacent blocks to layout one single instruction list. + var prev ssa.BasicBlock + for next := builder.BlockIteratorReversePostOrderBegin(); next != nil; next = builder.BlockIteratorReversePostOrderNext() { + if prev != nil { + ectx.LinkAdjacentBlocks(prev, next) + } + prev = next + } +} + +func (c *compiler) lowerBlock(blk ssa.BasicBlock) { + mach := c.mach + ectx := mach.ExecutableContext() + ectx.StartBlock(blk) + + // We traverse the instructions in reverse order because we might want to lower multiple + // instructions together. + cur := blk.Tail() + + // First gather the branching instructions at the end of the blocks. + var br0, br1 *ssa.Instruction + if cur.IsBranching() { + br0 = cur + cur = cur.Prev() + if cur != nil && cur.IsBranching() { + br1 = cur + cur = cur.Prev() + } + } + + if br0 != nil { + c.lowerBranches(br0, br1) + } + + if br1 != nil && br0 == nil { + panic("BUG? when a block has conditional branch but doesn't end with an unconditional branch?") + } + + // Now start lowering the non-branching instructions. 
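Emit4Bytes and Emit8Bytes above append the value one byte at a time, least-significant byte first. A quick check that this matches standard little-endian encoding:

```go
package main

import (
	"encoding/binary"
	"fmt"
)

// emit4 appends b little-endian, byte by byte, like Emit4Bytes above.
func emit4(buf []byte, b uint32) []byte {
	return append(buf, byte(b), byte(b>>8), byte(b>>16), byte(b>>24))
}

func main() {
	got := emit4(nil, 0x11223344)
	want := make([]byte, 4)
	binary.LittleEndian.PutUint32(want, 0x11223344)
	fmt.Printf("% x\n% x\n", got, want) // both: 44 33 22 11
}
```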
+ for ; cur != nil; cur = cur.Prev() { + c.setCurrentGroupID(cur.GroupID()) + if cur.Lowered() { + continue + } + + switch cur.Opcode() { + case ssa.OpcodeReturn: + rets := cur.ReturnVals() + if len(rets) > 0 { + c.mach.LowerReturns(rets) + } + c.mach.InsertReturn() + default: + mach.LowerInstr(cur) + } + ectx.FlushPendingInstructions() + } + + // Finally, if this is the entry block, we have to insert copies of arguments from the real location to the VReg. + if blk.EntryBlock() { + c.lowerFunctionArguments(blk) + } + + ectx.EndBlock() +} + +// lowerBranches is called right after StartBlock and before any LowerInstr call if +// there are branches to the given block. br0 is the very end of the block and b1 is the before the br0 if it exists. +// At least br0 is not nil, but br1 can be nil if there's no branching before br0. +// +// See ssa.Instruction IsBranching, and the comment on ssa.BasicBlock. +func (c *compiler) lowerBranches(br0, br1 *ssa.Instruction) { + ectx := c.mach.ExecutableContext() + + c.setCurrentGroupID(br0.GroupID()) + c.mach.LowerSingleBranch(br0) + ectx.FlushPendingInstructions() + if br1 != nil { + c.setCurrentGroupID(br1.GroupID()) + c.mach.LowerConditionalBranch(br1) + ectx.FlushPendingInstructions() + } + + if br0.Opcode() == ssa.OpcodeJump { + _, args, target := br0.BranchData() + argExists := len(args) != 0 + if argExists && br1 != nil { + panic("BUG: critical edge split failed") + } + if argExists && target.ReturnBlock() { + if len(args) > 0 { + c.mach.LowerReturns(args) + } + } else if argExists { + c.lowerBlockArguments(args, target) + } + } + ectx.FlushPendingInstructions() +} + +func (c *compiler) lowerFunctionArguments(entry ssa.BasicBlock) { + ectx := c.mach.ExecutableContext() + + c.tmpVals = c.tmpVals[:0] + for i := 0; i < entry.Params(); i++ { + p := entry.Param(i) + if c.ssaValueRefCounts[p.ID()] > 0 { + c.tmpVals = append(c.tmpVals, p) + } else { + // If the argument is not used, we can just pass an invalid value. + c.tmpVals = append(c.tmpVals, ssa.ValueInvalid) + } + } + c.mach.LowerParams(c.tmpVals) + ectx.FlushPendingInstructions() +} + +// lowerBlockArguments lowers how to pass arguments to the given successor block. +func (c *compiler) lowerBlockArguments(args []ssa.Value, succ ssa.BasicBlock) { + if len(args) != succ.Params() { + panic("BUG: mismatched number of arguments") + } + + c.varEdges = c.varEdges[:0] + c.varEdgeTypes = c.varEdgeTypes[:0] + c.constEdges = c.constEdges[:0] + for i := 0; i < len(args); i++ { + dst := succ.Param(i) + src := args[i] + + dstReg := c.VRegOf(dst) + srcDef := c.ssaValueDefinitions[src.ID()] + if srcDef.IsFromInstr() && srcDef.Instr.Constant() { + c.constEdges = append(c.constEdges, struct { + cInst *ssa.Instruction + dst regalloc.VReg + }{cInst: srcDef.Instr, dst: dstReg}) + } else { + srcReg := c.VRegOf(src) + // Even when the src=dst, insert the move so that we can keep such registers keep-alive. + c.varEdges = append(c.varEdges, [2]regalloc.VReg{srcReg, dstReg}) + c.varEdgeTypes = append(c.varEdgeTypes, src.Type()) + } + } + + // Check if there's an overlap among the dsts and srcs in varEdges. + c.vRegIDs = c.vRegIDs[:0] + for _, edge := range c.varEdges { + src := edge[0].ID() + if int(src) >= len(c.vRegSet) { + c.vRegSet = append(c.vRegSet, make([]bool, src+1)...) + } + c.vRegSet[src] = true + c.vRegIDs = append(c.vRegIDs, src) + } + separated := true + for _, edge := range c.varEdges { + dst := edge[1].ID() + if int(dst) >= len(c.vRegSet) { + c.vRegSet = append(c.vRegSet, make([]bool, dst+1)...) 
+ } else { + if c.vRegSet[dst] { + separated = false + break + } + } + } + for _, id := range c.vRegIDs { + c.vRegSet[id] = false // reset for the next use. + } + + if separated { + // If there's no overlap, we can simply move the source to destination. + for i, edge := range c.varEdges { + src, dst := edge[0], edge[1] + c.mach.InsertMove(dst, src, c.varEdgeTypes[i]) + } + } else { + // Otherwise, we allocate a temporary registers and move the source to the temporary register, + // + // First move all of them to temporary registers. + c.tempRegs = c.tempRegs[:0] + for i, edge := range c.varEdges { + src := edge[0] + typ := c.varEdgeTypes[i] + temp := c.AllocateVReg(typ) + c.tempRegs = append(c.tempRegs, temp) + c.mach.InsertMove(temp, src, typ) + } + // Then move the temporary registers to the destination. + for i, edge := range c.varEdges { + temp := c.tempRegs[i] + dst := edge[1] + c.mach.InsertMove(dst, temp, c.varEdgeTypes[i]) + } + } + + // Finally, move the constants. + for _, edge := range c.constEdges { + cInst, dst := edge.cInst, edge.dst + c.mach.InsertLoadConstantBlockArg(cInst, dst) + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go new file mode 100644 index 000000000..81c6a6b62 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/executable_context.go @@ -0,0 +1,219 @@ +package backend + +import ( + "fmt" + "math" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +type ExecutableContext interface { + // StartLoweringFunction is called when the lowering of the given function is started. + // maximumBlockID is the maximum value of ssa.BasicBlockID existing in the function. + StartLoweringFunction(maximumBlockID ssa.BasicBlockID) + + // LinkAdjacentBlocks is called after finished lowering all blocks in order to create one single instruction list. + LinkAdjacentBlocks(prev, next ssa.BasicBlock) + + // StartBlock is called when the compilation of the given block is started. + // The order of this being called is the reverse post order of the ssa.BasicBlock(s) as we iterate with + // ssa.Builder BlockIteratorReversePostOrderBegin and BlockIteratorReversePostOrderEnd. + StartBlock(ssa.BasicBlock) + + // EndBlock is called when the compilation of the current block is finished. + EndBlock() + + // FlushPendingInstructions flushes the pending instructions to the buffer. + // This will be called after the lowering of each SSA Instruction. + FlushPendingInstructions() +} + +type ExecutableContextT[Instr any] struct { + CurrentSSABlk ssa.BasicBlock + + // InstrPool is the InstructionPool of instructions. + InstructionPool wazevoapi.Pool[Instr] + asNop func(*Instr) + setNext func(*Instr, *Instr) + setPrev func(*Instr, *Instr) + + // RootInstr is the root instruction of the executable. + RootInstr *Instr + labelPositionPool wazevoapi.Pool[LabelPosition[Instr]] + NextLabel Label + // LabelPositions maps a label to the instructions of the region which the label represents. + LabelPositions map[Label]*LabelPosition[Instr] + OrderedBlockLabels []*LabelPosition[Instr] + + // PerBlockHead and PerBlockEnd are the head and tail of the instruction list per currently-compiled ssa.BasicBlock. + PerBlockHead, PerBlockEnd *Instr + // PendingInstructions are the instructions which are not yet emitted into the instruction list. 
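+ // They are flushed to the head of the per-block list by FlushPendingInstructions;
+ // the reverse iteration there preserves their original order.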
+ PendingInstructions []*Instr + + // SsaBlockIDToLabels maps an SSA block ID to the label. + SsaBlockIDToLabels []Label +} + +func NewExecutableContextT[Instr any]( + resetInstruction func(*Instr), + setNext func(*Instr, *Instr), + setPrev func(*Instr, *Instr), + asNop func(*Instr), +) *ExecutableContextT[Instr] { + return &ExecutableContextT[Instr]{ + InstructionPool: wazevoapi.NewPool[Instr](resetInstruction), + asNop: asNop, + setNext: setNext, + setPrev: setPrev, + labelPositionPool: wazevoapi.NewPool[LabelPosition[Instr]](resetLabelPosition[Instr]), + LabelPositions: make(map[Label]*LabelPosition[Instr]), + NextLabel: LabelInvalid, + } +} + +func resetLabelPosition[T any](l *LabelPosition[T]) { + *l = LabelPosition[T]{} +} + +// StartLoweringFunction implements ExecutableContext. +func (e *ExecutableContextT[Instr]) StartLoweringFunction(max ssa.BasicBlockID) { + imax := int(max) + if len(e.SsaBlockIDToLabels) <= imax { + // Eagerly allocate labels for the blocks since the underlying slice will be used for the next iteration. + e.SsaBlockIDToLabels = append(e.SsaBlockIDToLabels, make([]Label, imax+1)...) + } +} + +func (e *ExecutableContextT[Instr]) StartBlock(blk ssa.BasicBlock) { + e.CurrentSSABlk = blk + + l := e.SsaBlockIDToLabels[e.CurrentSSABlk.ID()] + if l == LabelInvalid { + l = e.AllocateLabel() + e.SsaBlockIDToLabels[blk.ID()] = l + } + + end := e.allocateNop0() + e.PerBlockHead, e.PerBlockEnd = end, end + + labelPos, ok := e.LabelPositions[l] + if !ok { + labelPos = e.AllocateLabelPosition(l) + e.LabelPositions[l] = labelPos + } + e.OrderedBlockLabels = append(e.OrderedBlockLabels, labelPos) + labelPos.Begin, labelPos.End = end, end + labelPos.SB = blk +} + +// EndBlock implements ExecutableContext. +func (e *ExecutableContextT[T]) EndBlock() { + // Insert nop0 as the head of the block for convenience to simplify the logic of inserting instructions. + e.insertAtPerBlockHead(e.allocateNop0()) + + l := e.SsaBlockIDToLabels[e.CurrentSSABlk.ID()] + e.LabelPositions[l].Begin = e.PerBlockHead + + if e.CurrentSSABlk.EntryBlock() { + e.RootInstr = e.PerBlockHead + } +} + +func (e *ExecutableContextT[T]) insertAtPerBlockHead(i *T) { + if e.PerBlockHead == nil { + e.PerBlockHead = i + e.PerBlockEnd = i + return + } + e.setNext(i, e.PerBlockHead) + e.setPrev(e.PerBlockHead, i) + e.PerBlockHead = i +} + +// FlushPendingInstructions implements ExecutableContext. +func (e *ExecutableContextT[T]) FlushPendingInstructions() { + l := len(e.PendingInstructions) + if l == 0 { + return + } + for i := l - 1; i >= 0; i-- { // reverse because we lower instructions in reverse order. + e.insertAtPerBlockHead(e.PendingInstructions[i]) + } + e.PendingInstructions = e.PendingInstructions[:0] +} + +func (e *ExecutableContextT[T]) Reset() { + e.labelPositionPool.Reset() + e.InstructionPool.Reset() + for l := Label(0); l <= e.NextLabel; l++ { + delete(e.LabelPositions, l) + } + e.PendingInstructions = e.PendingInstructions[:0] + e.OrderedBlockLabels = e.OrderedBlockLabels[:0] + e.RootInstr = nil + e.SsaBlockIDToLabels = e.SsaBlockIDToLabels[:0] + e.PerBlockHead, e.PerBlockEnd = nil, nil + e.NextLabel = LabelInvalid +} + +// AllocateLabel allocates an unused label. 
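+// NextLabel starts at LabelInvalid (0) and is incremented before being returned,
+// so LabelInvalid itself is never handed out.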
+func (e *ExecutableContextT[T]) AllocateLabel() Label { + e.NextLabel++ + return e.NextLabel +} + +func (e *ExecutableContextT[T]) AllocateLabelPosition(la Label) *LabelPosition[T] { + l := e.labelPositionPool.Allocate() + l.L = la + return l +} + +func (e *ExecutableContextT[T]) GetOrAllocateSSABlockLabel(blk ssa.BasicBlock) Label { + if blk.ReturnBlock() { + return LabelReturn + } + l := e.SsaBlockIDToLabels[blk.ID()] + if l == LabelInvalid { + l = e.AllocateLabel() + e.SsaBlockIDToLabels[blk.ID()] = l + } + return l +} + +func (e *ExecutableContextT[T]) allocateNop0() *T { + i := e.InstructionPool.Allocate() + e.asNop(i) + return i +} + +// LinkAdjacentBlocks implements backend.Machine. +func (e *ExecutableContextT[T]) LinkAdjacentBlocks(prev, next ssa.BasicBlock) { + prevLabelPos := e.LabelPositions[e.GetOrAllocateSSABlockLabel(prev)] + nextLabelPos := e.LabelPositions[e.GetOrAllocateSSABlockLabel(next)] + e.setNext(prevLabelPos.End, nextLabelPos.Begin) +} + +// LabelPosition represents the regions of the generated code which the label represents. +type LabelPosition[Instr any] struct { + SB ssa.BasicBlock + L Label + Begin, End *Instr + BinaryOffset int64 +} + +// Label represents a position in the generated code which is either +// a real instruction or the constant InstructionPool (e.g. jump tables). +// +// This is exactly the same as the traditional "label" in assembly code. +type Label uint32 + +const ( + LabelInvalid Label = 0 + LabelReturn Label = math.MaxUint32 +) + +// String implements backend.Machine. +func (l Label) String() string { + return fmt.Sprintf("L%d", l) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/go_call.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/go_call.go new file mode 100644 index 000000000..6fe6d7b3c --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/go_call.go @@ -0,0 +1,33 @@ +package backend + +import "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + +// GoFunctionCallRequiredStackSize returns the size of the stack required for the Go function call. +// argBegin is the index of the first argument in the signature which is not either execution context or module context. +func GoFunctionCallRequiredStackSize(sig *ssa.Signature, argBegin int) (ret, retUnaligned int64) { + var paramNeededInBytes, resultNeededInBytes int64 + for _, p := range sig.Params[argBegin:] { + s := int64(p.Size()) + if s < 8 { + s = 8 // We use uint64 for all basic types, except SIMD v128. + } + paramNeededInBytes += s + } + for _, r := range sig.Results { + s := int64(r.Size()) + if s < 8 { + s = 8 // We use uint64 for all basic types, except SIMD v128. + } + resultNeededInBytes += s + } + + if paramNeededInBytes > resultNeededInBytes { + ret = paramNeededInBytes + } else { + ret = resultNeededInBytes + } + retUnaligned = ret + // Align to 16 bytes. 
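+ // (x + 15) &^ 15 rounds x up to the next multiple of 16, e.g. 24 -> 32, 32 -> 32.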
+ ret = (ret + 15) &^ 15 + return +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi.go new file mode 100644 index 000000000..130f8c621 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi.go @@ -0,0 +1,186 @@ +package amd64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// For the details of the ABI, see: +// https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/cmd/compile/abi-internal.md#amd64-architecture + +var ( + intArgResultRegs = []regalloc.RealReg{rax, rbx, rcx, rdi, rsi, r8, r9, r10, r11} + floatArgResultRegs = []regalloc.RealReg{xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7} +) + +var regInfo = ®alloc.RegisterInfo{ + AllocatableRegisters: [regalloc.NumRegType][]regalloc.RealReg{ + regalloc.RegTypeInt: { + rax, rcx, rdx, rbx, rsi, rdi, r8, r9, r10, r11, r12, r13, r14, r15, + }, + regalloc.RegTypeFloat: { + xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, + }, + }, + CalleeSavedRegisters: regalloc.NewRegSet( + rdx, r12, r13, r14, r15, + xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, + ), + CallerSavedRegisters: regalloc.NewRegSet( + rax, rcx, rbx, rsi, rdi, r8, r9, r10, r11, + xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, + ), + RealRegToVReg: []regalloc.VReg{ + rax: raxVReg, rcx: rcxVReg, rdx: rdxVReg, rbx: rbxVReg, rsp: rspVReg, rbp: rbpVReg, rsi: rsiVReg, rdi: rdiVReg, + r8: r8VReg, r9: r9VReg, r10: r10VReg, r11: r11VReg, r12: r12VReg, r13: r13VReg, r14: r14VReg, r15: r15VReg, + xmm0: xmm0VReg, xmm1: xmm1VReg, xmm2: xmm2VReg, xmm3: xmm3VReg, xmm4: xmm4VReg, xmm5: xmm5VReg, xmm6: xmm6VReg, + xmm7: xmm7VReg, xmm8: xmm8VReg, xmm9: xmm9VReg, xmm10: xmm10VReg, xmm11: xmm11VReg, xmm12: xmm12VReg, + xmm13: xmm13VReg, xmm14: xmm14VReg, xmm15: xmm15VReg, + }, + RealRegName: func(r regalloc.RealReg) string { return regNames[r] }, + RealRegType: func(r regalloc.RealReg) regalloc.RegType { + if r < xmm0 { + return regalloc.RegTypeInt + } + return regalloc.RegTypeFloat + }, +} + +// ArgsResultsRegs implements backend.Machine. +func (m *machine) ArgsResultsRegs() (argResultInts, argResultFloats []regalloc.RealReg) { + return intArgResultRegs, floatArgResultRegs +} + +// LowerParams implements backend.Machine. +func (m *machine) LowerParams(args []ssa.Value) { + a := m.currentABI + + for i, ssaArg := range args { + if !ssaArg.Valid() { + continue + } + reg := m.c.VRegOf(ssaArg) + arg := &a.Args[i] + if arg.Kind == backend.ABIArgKindReg { + m.InsertMove(reg, arg.Reg, arg.Type) + } else { + // + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | ReturnAddress | + // | Caller_RBP | + // +-----------------+ <-- RBP + // | ........... | + // | clobbered M | + // | ............ | + // | clobbered 0 | + // | spill slot N | + // | ........... | + // | spill slot 0 | + // RSP--> +-----------------+ + // (low address) + + // Load the value from the arg stack slot above the current RBP. 
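+ // The +16 skips the ReturnAddress and Caller_RBP slots shown in the
+ // diagram above.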
+ load := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmRBPReg(uint32(arg.Offset + 16))) + switch arg.Type { + case ssa.TypeI32: + load.asMovzxRmR(extModeLQ, mem, reg) + case ssa.TypeI64: + load.asMov64MR(mem, reg) + case ssa.TypeF32: + load.asXmmUnaryRmR(sseOpcodeMovss, mem, reg) + case ssa.TypeF64: + load.asXmmUnaryRmR(sseOpcodeMovsd, mem, reg) + case ssa.TypeV128: + load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, reg) + default: + panic("BUG") + } + m.insert(load) + } + } +} + +// LowerReturns implements backend.Machine. +func (m *machine) LowerReturns(rets []ssa.Value) { + // Load the XMM registers first as it might need a temporary register to inline + // constant return. + a := m.currentABI + for i, ret := range rets { + r := &a.Rets[i] + if !r.Type.IsInt() { + m.LowerReturn(ret, r) + } + } + // Then load the GPR registers. + for i, ret := range rets { + r := &a.Rets[i] + if r.Type.IsInt() { + m.LowerReturn(ret, r) + } + } +} + +func (m *machine) LowerReturn(ret ssa.Value, r *backend.ABIArg) { + reg := m.c.VRegOf(ret) + if def := m.c.ValueDefinition(ret); def.IsFromInstr() { + // Constant instructions are inlined. + if inst := def.Instr; inst.Constant() { + m.insertLoadConstant(inst, reg) + } + } + if r.Kind == backend.ABIArgKindReg { + m.InsertMove(r.Reg, reg, ret.Type()) + } else { + // + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | ReturnAddress | + // | Caller_RBP | + // +-----------------+ <-- RBP + // | ........... | + // | clobbered M | + // | ............ | + // | clobbered 0 | + // | spill slot N | + // | ........... | + // | spill slot 0 | + // RSP--> +-----------------+ + // (low address) + + // Store the value to the return stack slot above the current RBP. + store := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmRBPReg(uint32(m.currentABI.ArgStackSize + 16 + r.Offset))) + switch r.Type { + case ssa.TypeI32: + store.asMovRM(reg, mem, 4) + case ssa.TypeI64: + store.asMovRM(reg, mem, 8) + case ssa.TypeF32: + store.asXmmMovRM(sseOpcodeMovss, reg, mem) + case ssa.TypeF64: + store.asXmmMovRM(sseOpcodeMovsd, reg, mem) + case ssa.TypeV128: + store.asXmmMovRM(sseOpcodeMovdqu, reg, mem) + } + m.insert(store) + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.go new file mode 100644 index 000000000..cbf1cfdc5 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.go @@ -0,0 +1,9 @@ +package amd64 + +// entrypoint enters the machine code generated by this backend which begins with the preamble generated by functionABI.EmitGoEntryPreamble below. +// This implements wazevo.entrypoint, and see the comments there for detail. +func entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultPtr *uint64, goAllocatedStackSlicePtr uintptr) + +// afterGoFunctionCallEntrypoint enters the machine code after growing the stack. +// This implements wazevo.afterGoFunctionCallEntrypoint, and see the comments there for detail. 
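+// Both functions are implemented in assembly in abi_entry_amd64.s below.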
+func afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.s b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.s new file mode 100644 index 000000000..e9cb131d1 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_amd64.s @@ -0,0 +1,29 @@ +#include "funcdata.h" +#include "textflag.h" + +// entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultPtr *uint64, goAllocatedStackSlicePtr uintptr +TEXT ·entrypoint(SB), NOSPLIT|NOFRAME, $0-48 + MOVQ preambleExecutable+0(FP), R11 + MOVQ functionExectuable+8(FP), R14 + MOVQ executionContextPtr+16(FP), AX // First argument is passed in AX. + MOVQ moduleContextPtr+24(FP), BX // Second argument is passed in BX. + MOVQ paramResultSlicePtr+32(FP), R12 + MOVQ goAllocatedStackSlicePtr+40(FP), R13 + JMP R11 + +// afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr) +TEXT ·afterGoFunctionCallEntrypoint(SB), NOSPLIT|NOFRAME, $0-32 + MOVQ executable+0(FP), CX + MOVQ executionContextPtr+8(FP), AX // First argument is passed in AX. + + // Save the stack pointer and frame pointer. + MOVQ BP, 16(AX) // 16 == ExecutionContextOffsetOriginalFramePointer + MOVQ SP, 24(AX) // 24 == ExecutionContextOffsetOriginalStackPointer + + // Then set the stack pointer and frame pointer to the values we got from the Go runtime. + MOVQ framePointer+24(FP), BP + + // WARNING: do not update SP before BP, because the Go translates (FP) as (SP) + 8. + MOVQ stackPointer+16(FP), SP + + JMP CX diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_preamble.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_preamble.go new file mode 100644 index 000000000..882d06c06 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_entry_preamble.go @@ -0,0 +1,248 @@ +package amd64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +var ( + executionContextPtrReg = raxVReg + + // Followings are callee saved registers. They can be used freely in the entry preamble + // since the preamble is called via Go assembly function which has stack-based ABI. + + // savedExecutionContextPtr also must be a callee-saved reg so that they can be used in the prologue and epilogue. + savedExecutionContextPtr = rdxVReg + // paramResultSlicePtr must match with entrypoint function in abi_entry_amd64.s. + paramResultSlicePtr = r12VReg + // goAllocatedStackPtr must match with entrypoint function in abi_entry_amd64.s. + goAllocatedStackPtr = r13VReg + // functionExecutable must match with entrypoint function in abi_entry_amd64.s. + functionExecutable = r14VReg + tmpIntReg = r15VReg + tmpXmmReg = xmm15VReg +) + +// CompileEntryPreamble implements backend.Machine. 
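+// The generated preamble copies the parameters from the Go-side parameter/result slice
+// into their ABI locations, calls the actual function, writes the results back into
+// the same slice, and finally restores the original RSP/RBP.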
+func (m *machine) CompileEntryPreamble(sig *ssa.Signature) []byte { + root := m.compileEntryPreamble(sig) + m.encodeWithoutSSA(root) + buf := m.c.Buf() + return buf +} + +func (m *machine) compileEntryPreamble(sig *ssa.Signature) *instruction { + abi := backend.FunctionABI{} + abi.Init(sig, intArgResultRegs, floatArgResultRegs) + + root := m.allocateNop() + + //// ----------------------------------- prologue ----------------------------------- //// + + // First, we save executionContextPtrReg into a callee-saved register so that it can be used in epilogue as well. + // mov %executionContextPtrReg, %savedExecutionContextPtr + cur := m.move64(executionContextPtrReg, savedExecutionContextPtr, root) + + // Next is to save the original RBP and RSP into the execution context. + cur = m.saveOriginalRSPRBP(cur) + + // Now set the RSP to the Go-allocated stack pointer. + // mov %goAllocatedStackPtr, %rsp + cur = m.move64(goAllocatedStackPtr, rspVReg, cur) + + if stackSlotSize := abi.AlignedArgResultStackSlotSize(); stackSlotSize > 0 { + // Allocate stack slots for the arguments and return values. + // sub $stackSlotSize, %rsp + spDec := m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(uint32(stackSlotSize)), rspVReg, true) + cur = linkInstr(cur, spDec) + } + + var offset uint32 + for i := range abi.Args { + if i < 2 { + // module context ptr and execution context ptr are passed in rax and rbx by the Go assembly function. + continue + } + arg := &abi.Args[i] + cur = m.goEntryPreamblePassArg(cur, paramResultSlicePtr, offset, arg) + if arg.Type == ssa.TypeV128 { + offset += 16 + } else { + offset += 8 + } + } + + // Zero out RBP so that the unwind/stack growth code can correctly detect the end of the stack. + zerosRbp := m.allocateInstr().asAluRmiR(aluRmiROpcodeXor, newOperandReg(rbpVReg), rbpVReg, true) + cur = linkInstr(cur, zerosRbp) + + // Now ready to call the real function. Note that at this point stack pointer is already set to the Go-allocated, + // which is aligned to 16 bytes. + call := m.allocateInstr().asCallIndirect(newOperandReg(functionExecutable), &abi) + cur = linkInstr(cur, call) + + //// ----------------------------------- epilogue ----------------------------------- //// + + // Read the results from regs and the stack, and set them correctly into the paramResultSlicePtr. + offset = 0 + for i := range abi.Rets { + r := &abi.Rets[i] + cur = m.goEntryPreamblePassResult(cur, paramResultSlicePtr, offset, r, uint32(abi.ArgStackSize)) + if r.Type == ssa.TypeV128 { + offset += 16 + } else { + offset += 8 + } + } + + // Finally, restore the original RBP and RSP. + cur = m.restoreOriginalRSPRBP(cur) + + ret := m.allocateInstr().asRet() + linkInstr(cur, ret) + return root +} + +// saveOriginalRSPRBP saves the original RSP and RBP into the execution context. +func (m *machine) saveOriginalRSPRBP(cur *instruction) *instruction { + // mov %rbp, wazevoapi.ExecutionContextOffsetOriginalFramePointer(%executionContextPtrReg) + // mov %rsp, wazevoapi.ExecutionContextOffsetOriginalStackPointer(%executionContextPtrReg) + cur = m.loadOrStore64AtExecutionCtx(executionContextPtrReg, wazevoapi.ExecutionContextOffsetOriginalFramePointer, rbpVReg, true, cur) + cur = m.loadOrStore64AtExecutionCtx(executionContextPtrReg, wazevoapi.ExecutionContextOffsetOriginalStackPointer, rspVReg, true, cur) + return cur +} + +// restoreOriginalRSPRBP restores the original RSP and RBP from the execution context. 
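+// It reads them via savedExecutionContextPtr rather than RAX, since RAX is
+// caller-saved and may have been clobbered by the call.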
+func (m *machine) restoreOriginalRSPRBP(cur *instruction) *instruction { + // mov wazevoapi.ExecutionContextOffsetOriginalFramePointer(%executionContextPtrReg), %rbp + // mov wazevoapi.ExecutionContextOffsetOriginalStackPointer(%executionContextPtrReg), %rsp + cur = m.loadOrStore64AtExecutionCtx(savedExecutionContextPtr, wazevoapi.ExecutionContextOffsetOriginalFramePointer, rbpVReg, false, cur) + cur = m.loadOrStore64AtExecutionCtx(savedExecutionContextPtr, wazevoapi.ExecutionContextOffsetOriginalStackPointer, rspVReg, false, cur) + return cur +} + +func (m *machine) move64(src, dst regalloc.VReg, prev *instruction) *instruction { + mov := m.allocateInstr().asMovRR(src, dst, true) + return linkInstr(prev, mov) +} + +func (m *machine) loadOrStore64AtExecutionCtx(execCtx regalloc.VReg, offset wazevoapi.Offset, r regalloc.VReg, store bool, prev *instruction) *instruction { + mem := newOperandMem(m.newAmodeImmReg(offset.U32(), execCtx)) + instr := m.allocateInstr() + if store { + instr.asMovRM(r, mem, 8) + } else { + instr.asMov64MR(mem, r) + } + return linkInstr(prev, instr) +} + +// This is for debugging. +func (m *machine) linkUD2(cur *instruction) *instruction { //nolint + return linkInstr(cur, m.allocateInstr().asUD2()) +} + +func (m *machine) goEntryPreamblePassArg(cur *instruction, paramSlicePtr regalloc.VReg, offsetInParamSlice uint32, arg *backend.ABIArg) *instruction { + var dst regalloc.VReg + argTyp := arg.Type + if arg.Kind == backend.ABIArgKindStack { + // Caller saved registers ca + switch argTyp { + case ssa.TypeI32, ssa.TypeI64: + dst = tmpIntReg + case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: + dst = tmpXmmReg + default: + panic("BUG") + } + } else { + dst = arg.Reg + } + + load := m.allocateInstr() + a := newOperandMem(m.newAmodeImmReg(offsetInParamSlice, paramSlicePtr)) + switch arg.Type { + case ssa.TypeI32: + load.asMovzxRmR(extModeLQ, a, dst) + case ssa.TypeI64: + load.asMov64MR(a, dst) + case ssa.TypeF32: + load.asXmmUnaryRmR(sseOpcodeMovss, a, dst) + case ssa.TypeF64: + load.asXmmUnaryRmR(sseOpcodeMovsd, a, dst) + case ssa.TypeV128: + load.asXmmUnaryRmR(sseOpcodeMovdqu, a, dst) + } + + cur = linkInstr(cur, load) + if arg.Kind == backend.ABIArgKindStack { + // Store back to the stack. + store := m.allocateInstr() + a := newOperandMem(m.newAmodeImmReg(uint32(arg.Offset), rspVReg)) + switch arg.Type { + case ssa.TypeI32: + store.asMovRM(dst, a, 4) + case ssa.TypeI64: + store.asMovRM(dst, a, 8) + case ssa.TypeF32: + store.asXmmMovRM(sseOpcodeMovss, dst, a) + case ssa.TypeF64: + store.asXmmMovRM(sseOpcodeMovsd, dst, a) + case ssa.TypeV128: + store.asXmmMovRM(sseOpcodeMovdqu, dst, a) + } + cur = linkInstr(cur, store) + } + return cur +} + +func (m *machine) goEntryPreamblePassResult(cur *instruction, resultSlicePtr regalloc.VReg, offsetInResultSlice uint32, result *backend.ABIArg, resultStackSlotBeginOffset uint32) *instruction { + var r regalloc.VReg + if result.Kind == backend.ABIArgKindStack { + // Load the value to the temporary. 
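+ // Stack-based results are first loaded into tmpIntReg/tmpXmmReg and then
+ // stored into the result slice below.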
+ load := m.allocateInstr() + offset := resultStackSlotBeginOffset + uint32(result.Offset) + a := newOperandMem(m.newAmodeImmReg(offset, rspVReg)) + switch result.Type { + case ssa.TypeI32: + r = tmpIntReg + load.asMovzxRmR(extModeLQ, a, r) + case ssa.TypeI64: + r = tmpIntReg + load.asMov64MR(a, r) + case ssa.TypeF32: + r = tmpXmmReg + load.asXmmUnaryRmR(sseOpcodeMovss, a, r) + case ssa.TypeF64: + r = tmpXmmReg + load.asXmmUnaryRmR(sseOpcodeMovsd, a, r) + case ssa.TypeV128: + r = tmpXmmReg + load.asXmmUnaryRmR(sseOpcodeMovdqu, a, r) + default: + panic("BUG") + } + cur = linkInstr(cur, load) + } else { + r = result.Reg + } + + store := m.allocateInstr() + a := newOperandMem(m.newAmodeImmReg(offsetInResultSlice, resultSlicePtr)) + switch result.Type { + case ssa.TypeI32: + store.asMovRM(r, a, 4) + case ssa.TypeI64: + store.asMovRM(r, a, 8) + case ssa.TypeF32: + store.asXmmMovRM(sseOpcodeMovss, r, a) + case ssa.TypeF64: + store.asXmmMovRM(sseOpcodeMovsd, r, a) + case ssa.TypeV128: + store.asXmmMovRM(sseOpcodeMovdqu, r, a) + } + + return linkInstr(cur, store) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_go_call.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_go_call.go new file mode 100644 index 000000000..751050aff --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/abi_go_call.go @@ -0,0 +1,443 @@ +package amd64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +var calleeSavedVRegs = []regalloc.VReg{ + rdxVReg, r12VReg, r13VReg, r14VReg, r15VReg, + xmm8VReg, xmm9VReg, xmm10VReg, xmm11VReg, xmm12VReg, xmm13VReg, xmm14VReg, xmm15VReg, +} + +// CompileGoFunctionTrampoline implements backend.Machine. +func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte { + ectx := m.ectx + argBegin := 1 // Skips exec context by default. + if needModuleContextPtr { + argBegin++ + } + + abi := &backend.FunctionABI{} + abi.Init(sig, intArgResultRegs, floatArgResultRegs) + m.currentABI = abi + + cur := m.allocateNop() + ectx.RootInstr = cur + + // Execution context is always the first argument. + execCtrPtr := raxVReg + + // First we update RBP and RSP just like the normal prologue. + // + // (high address) (high address) + // RBP ----> +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | ====> | ....... | + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | Return Addr | | Return Addr | + // RSP ----> +-----------------+ | Caller_RBP | + // (low address) +-----------------+ <----- RSP, RBP + // + cur = m.setupRBPRSP(cur) + + goSliceSizeAligned, goSliceSizeAlignedUnaligned := backend.GoFunctionCallRequiredStackSize(sig, argBegin) + cur = m.insertStackBoundsCheck(goSliceSizeAligned+8 /* size of the Go slice */, cur) + + // Save the callee saved registers. + cur = m.saveRegistersInExecutionContext(cur, execCtrPtr, calleeSavedVRegs) + + if needModuleContextPtr { + moduleCtrPtr := rbxVReg // Module context is always the second argument. 
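+ // Stash the module context pointer in the execution context
+ // (GoFunctionCallCalleeModuleContextOpaque) so the Go side of the call can retrieve it.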
+ mem := m.newAmodeImmReg( + wazevoapi.ExecutionContextOffsetGoFunctionCallCalleeModuleContextOpaque.U32(), + execCtrPtr) + store := m.allocateInstr().asMovRM(moduleCtrPtr, newOperandMem(mem), 8) + cur = linkInstr(cur, store) + } + + // Now let's advance the RSP to the stack slot for the arguments. + // + // (high address) (high address) + // +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | =======> | ....... | + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | Return Addr | | Return Addr | + // | Caller_RBP | | Caller_RBP | + // RBP,RSP --> +-----------------+ +-----------------+ <----- RBP + // (low address) | arg[N]/ret[M] | + // | .......... | + // | arg[1]/ret[1] | + // | arg[0]/ret[0] | + // +-----------------+ <----- RSP + // (low address) + // + // where the region of "arg[0]/ret[0] ... arg[N]/ret[M]" is the stack used by the Go functions, + // therefore will be accessed as the usual []uint64. So that's where we need to pass/receive + // the arguments/return values to/from Go function. + cur = m.addRSP(-int32(goSliceSizeAligned), cur) + + // Next, we need to store all the arguments to the stack in the typical Wasm stack style. + var offsetInGoSlice int32 + for i := range abi.Args[argBegin:] { + arg := &abi.Args[argBegin+i] + var v regalloc.VReg + if arg.Kind == backend.ABIArgKindReg { + v = arg.Reg + } else { + // We have saved callee saved registers, so we can use them. + if arg.Type.IsInt() { + v = r15VReg + } else { + v = xmm15VReg + } + mem := newOperandMem(m.newAmodeImmReg(uint32(arg.Offset+16 /* to skip caller_rbp and ret_addr */), rbpVReg)) + load := m.allocateInstr() + switch arg.Type { + case ssa.TypeI32: + load.asMovzxRmR(extModeLQ, mem, v) + case ssa.TypeI64: + load.asMov64MR(mem, v) + case ssa.TypeF32: + load.asXmmUnaryRmR(sseOpcodeMovss, mem, v) + case ssa.TypeF64: + load.asXmmUnaryRmR(sseOpcodeMovsd, mem, v) + case ssa.TypeV128: + load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v) + default: + panic("BUG") + } + cur = linkInstr(cur, load) + } + + store := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmReg(uint32(offsetInGoSlice), rspVReg)) + switch arg.Type { + case ssa.TypeI32: + store.asMovRM(v, mem, 4) + offsetInGoSlice += 8 // always uint64 rep. + case ssa.TypeI64: + store.asMovRM(v, mem, 8) + offsetInGoSlice += 8 + case ssa.TypeF32: + store.asXmmMovRM(sseOpcodeMovss, v, mem) + offsetInGoSlice += 8 // always uint64 rep. + case ssa.TypeF64: + store.asXmmMovRM(sseOpcodeMovsd, v, mem) + offsetInGoSlice += 8 + case ssa.TypeV128: + store.asXmmMovRM(sseOpcodeMovdqu, v, mem) + offsetInGoSlice += 16 + default: + panic("BUG") + } + cur = linkInstr(cur, store) + } + + // Finally we push the size of the slice to the stack so the stack looks like: + // + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | Return Addr | + // | Caller_RBP | + // +-----------------+ <----- RBP + // | arg[N]/ret[M] | + // | .......... | + // | arg[1]/ret[1] | + // | arg[0]/ret[0] | + // | slice size | + // +-----------------+ <----- RSP + // (low address) + // + // push $sliceSize + cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandImm32(uint32(goSliceSizeAlignedUnaligned)))) + + // Load the exitCode to the register. + exitCodeReg := r12VReg // Callee saved which is already saved. 
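+ // Materialize the exit code in r12, then record it together with the current
+ // RSP/RBP in the execution context before exiting to the Go runtime.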
+ cur = linkInstr(cur, m.allocateInstr().asImm(exitCodeReg, uint64(exitCode), false)) + + saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtrPtr, exitCodeReg) + cur = linkInstr(cur, setExitCode) + cur = linkInstr(cur, saveRsp) + cur = linkInstr(cur, saveRbp) + + // Ready to exit the execution. + cur = m.storeReturnAddressAndExit(cur, execCtrPtr) + + // We don't need the slice size anymore, so pop it. + cur = m.addRSP(8, cur) + + // Ready to set up the results. + offsetInGoSlice = 0 + // To avoid overwriting with the execution context pointer by the result, we need to track the offset, + // and defer the restoration of the result to the end of this function. + var argOverlapWithExecCtxOffset int32 = -1 + for i := range abi.Rets { + r := &abi.Rets[i] + var v regalloc.VReg + isRegResult := r.Kind == backend.ABIArgKindReg + if isRegResult { + v = r.Reg + if v.RealReg() == execCtrPtr.RealReg() { + argOverlapWithExecCtxOffset = offsetInGoSlice + offsetInGoSlice += 8 // always uint64 rep. + continue + } + } else { + if r.Type.IsInt() { + v = r15VReg + } else { + v = xmm15VReg + } + } + + load := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmReg(uint32(offsetInGoSlice), rspVReg)) + switch r.Type { + case ssa.TypeI32: + load.asMovzxRmR(extModeLQ, mem, v) + offsetInGoSlice += 8 // always uint64 rep. + case ssa.TypeI64: + load.asMov64MR(mem, v) + offsetInGoSlice += 8 + case ssa.TypeF32: + load.asXmmUnaryRmR(sseOpcodeMovss, mem, v) + offsetInGoSlice += 8 // always uint64 rep. + case ssa.TypeF64: + load.asXmmUnaryRmR(sseOpcodeMovsd, mem, v) + offsetInGoSlice += 8 + case ssa.TypeV128: + load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v) + offsetInGoSlice += 16 + default: + panic("BUG") + } + cur = linkInstr(cur, load) + + if !isRegResult { + // We need to store it back to the result slot above rbp. + store := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmReg(uint32(abi.ArgStackSize+r.Offset+16 /* to skip caller_rbp and ret_addr */), rbpVReg)) + switch r.Type { + case ssa.TypeI32: + store.asMovRM(v, mem, 4) + case ssa.TypeI64: + store.asMovRM(v, mem, 8) + case ssa.TypeF32: + store.asXmmMovRM(sseOpcodeMovss, v, mem) + case ssa.TypeF64: + store.asXmmMovRM(sseOpcodeMovsd, v, mem) + case ssa.TypeV128: + store.asXmmMovRM(sseOpcodeMovdqu, v, mem) + default: + panic("BUG") + } + cur = linkInstr(cur, store) + } + } + + // Before return, we need to restore the callee saved registers. + cur = m.restoreRegistersInExecutionContext(cur, execCtrPtr, calleeSavedVRegs) + + if argOverlapWithExecCtxOffset >= 0 { + // At this point execCtt is not used anymore, so we can finally store the + // result to the register which overlaps with the execution context pointer. + mem := newOperandMem(m.newAmodeImmReg(uint32(argOverlapWithExecCtxOffset), rspVReg)) + load := m.allocateInstr().asMov64MR(mem, execCtrPtr) + cur = linkInstr(cur, load) + } + + // Finally ready to return. 
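+ // revertRBPRSP undoes setupRBPRSP from the prologue so that ret returns to the caller.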
+ cur = m.revertRBPRSP(cur) + linkInstr(cur, m.allocateInstr().asRet()) + + m.encodeWithoutSSA(ectx.RootInstr) + return m.c.Buf() +} + +func (m *machine) saveRegistersInExecutionContext(cur *instruction, execCtx regalloc.VReg, regs []regalloc.VReg) *instruction { + offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64() + for _, v := range regs { + store := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmReg(uint32(offset), execCtx)) + switch v.RegType() { + case regalloc.RegTypeInt: + store.asMovRM(v, mem, 8) + case regalloc.RegTypeFloat: + store.asXmmMovRM(sseOpcodeMovdqu, v, mem) + default: + panic("BUG") + } + cur = linkInstr(cur, store) + offset += 16 // See execution context struct. Each register is 16 bytes-aligned unconditionally. + } + return cur +} + +func (m *machine) restoreRegistersInExecutionContext(cur *instruction, execCtx regalloc.VReg, regs []regalloc.VReg) *instruction { + offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64() + for _, v := range regs { + load := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmReg(uint32(offset), execCtx)) + switch v.RegType() { + case regalloc.RegTypeInt: + load.asMov64MR(mem, v) + case regalloc.RegTypeFloat: + load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v) + default: + panic("BUG") + } + cur = linkInstr(cur, load) + offset += 16 // See execution context struct. Each register is 16 bytes-aligned unconditionally. + } + return cur +} + +func (m *machine) storeReturnAddressAndExit(cur *instruction, execCtx regalloc.VReg) *instruction { + readRip := m.allocateInstr() + cur = linkInstr(cur, readRip) + + ripReg := r12VReg // Callee saved which is already saved. + saveRip := m.allocateInstr().asMovRM( + ripReg, + newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetGoCallReturnAddress.U32(), execCtx)), + 8, + ) + cur = linkInstr(cur, saveRip) + + exit := m.allocateExitSeq(execCtx) + cur = linkInstr(cur, exit) + + nop, l := m.allocateBrTarget() + cur = linkInstr(cur, nop) + readRip.asLEA(newOperandLabel(l), ripReg) + return cur +} + +// saveRequiredRegs is the set of registers that must be saved/restored during growing stack when there's insufficient +// stack space left. Basically this is the all allocatable registers except for RSP and RBP, and RAX which contains the +// execution context pointer. ExecCtx pointer is always the first argument so we don't need to save it. +var stackGrowSaveVRegs = []regalloc.VReg{ + rdxVReg, r12VReg, r13VReg, r14VReg, r15VReg, + rcxVReg, rbxVReg, rsiVReg, rdiVReg, r8VReg, r9VReg, r10VReg, r11VReg, + xmm8VReg, xmm9VReg, xmm10VReg, xmm11VReg, xmm12VReg, xmm13VReg, xmm14VReg, xmm15VReg, + xmm0VReg, xmm1VReg, xmm2VReg, xmm3VReg, xmm4VReg, xmm5VReg, xmm6VReg, xmm7VReg, +} + +// CompileStackGrowCallSequence implements backend.Machine. +func (m *machine) CompileStackGrowCallSequence() []byte { + ectx := m.ectx + + cur := m.allocateNop() + ectx.RootInstr = cur + + cur = m.setupRBPRSP(cur) + + // Execution context is always the first argument. + execCtrPtr := raxVReg + + // Save the callee saved and argument registers. + cur = m.saveRegistersInExecutionContext(cur, execCtrPtr, stackGrowSaveVRegs) + + // Load the exitCode to the register. + exitCodeReg := r12VReg // Already saved. 
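+ // Same exit protocol as CompileGoFunctionTrampoline, but with the fixed
+ // ExitCodeGrowStack exit code.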
+ cur = linkInstr(cur, m.allocateInstr().asImm(exitCodeReg, uint64(wazevoapi.ExitCodeGrowStack), false)) + + saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtrPtr, exitCodeReg) + cur = linkInstr(cur, setExitCode) + cur = linkInstr(cur, saveRsp) + cur = linkInstr(cur, saveRbp) + + // Ready to exit the execution. + cur = m.storeReturnAddressAndExit(cur, execCtrPtr) + + // After the exit, restore the saved registers. + cur = m.restoreRegistersInExecutionContext(cur, execCtrPtr, stackGrowSaveVRegs) + + // Finally ready to return. + cur = m.revertRBPRSP(cur) + linkInstr(cur, m.allocateInstr().asRet()) + + m.encodeWithoutSSA(ectx.RootInstr) + return m.c.Buf() +} + +// insertStackBoundsCheck will insert the instructions after `cur` to check the +// stack bounds, and if there's no sufficient spaces required for the function, +// exit the execution and try growing it in Go world. +func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instruction) *instruction { + // add $requiredStackSize, %rsp ;; Temporarily update the sp. + // cmp ExecutionContextOffsetStackBottomPtr(%rax), %rsp ;; Compare the stack bottom and the sp. + // ja .ok + // sub $requiredStackSize, %rsp ;; Reverse the temporary update. + // pushq r15 ;; save the temporary. + // mov $requiredStackSize, %r15 + // mov %15, ExecutionContextOffsetStackGrowRequiredSize(%rax) ;; Set the required size in the execution context. + // popq r15 ;; restore the temporary. + // callq *ExecutionContextOffsetStackGrowCallTrampolineAddress(%rax) ;; Call the Go function to grow the stack. + // jmp .cont + // .ok: + // sub $requiredStackSize, %rsp ;; Reverse the temporary update. + // .cont: + cur = m.addRSP(-int32(requiredStackSize), cur) + cur = linkInstr(cur, m.allocateInstr().asCmpRmiR(true, + newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackBottomPtr.U32(), raxVReg)), + rspVReg, true)) + + ja := m.allocateInstr() + cur = linkInstr(cur, ja) + + cur = m.addRSP(int32(requiredStackSize), cur) + + // Save the temporary. + + cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(r15VReg))) + // Load the required size to the temporary. + cur = linkInstr(cur, m.allocateInstr().asImm(r15VReg, uint64(requiredStackSize), true)) + // Set the required size in the execution context. + cur = linkInstr(cur, m.allocateInstr().asMovRM(r15VReg, + newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.U32(), raxVReg)), 8)) + // Restore the temporary. + cur = linkInstr(cur, m.allocateInstr().asPop64(r15VReg)) + // Call the Go function to grow the stack. + cur = linkInstr(cur, m.allocateInstr().asCallIndirect(newOperandMem(m.newAmodeImmReg( + wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.U32(), raxVReg)), nil)) + // Jump to the continuation. + jmpToCont := m.allocateInstr() + cur = linkInstr(cur, jmpToCont) + + // .ok: + okInstr, ok := m.allocateBrTarget() + cur = linkInstr(cur, okInstr) + ja.asJmpIf(condNBE, newOperandLabel(ok)) + // On the ok path, we only need to reverse the temporary update. 
+ cur = m.addRSP(int32(requiredStackSize), cur) + + // .cont: + contInstr, cont := m.allocateBrTarget() + cur = linkInstr(cur, contInstr) + jmpToCont.asJmp(newOperandLabel(cont)) + + return cur +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/cond.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/cond.go new file mode 100644 index 000000000..75cbeab75 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/cond.go @@ -0,0 +1,168 @@ +package amd64 + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +type cond byte + +const ( + // condO represents (overflow) condition. + condO cond = iota + // condNO represents (no overflow) condition. + condNO + // condB represents (< unsigned) condition. + condB + // condNB represents (>= unsigned) condition. + condNB + // condZ represents (zero) condition. + condZ + // condNZ represents (not-zero) condition. + condNZ + // condBE represents (<= unsigned) condition. + condBE + // condNBE represents (> unsigned) condition. + condNBE + // condS represents (negative) condition. + condS + // condNS represents (not-negative) condition. + condNS + // condP represents (parity) condition. + condP + // condNP represents (not parity) condition. + condNP + // condL represents (< signed) condition. + condL + // condNL represents (>= signed) condition. + condNL + // condLE represents (<= signed) condition. + condLE + // condNLE represents (> signed) condition. + condNLE + + condInvalid +) + +func (c cond) String() string { + switch c { + case condO: + return "o" + case condNO: + return "no" + case condB: + return "b" + case condNB: + return "nb" + case condZ: + return "z" + case condNZ: + return "nz" + case condBE: + return "be" + case condNBE: + return "nbe" + case condS: + return "s" + case condNS: + return "ns" + case condL: + return "l" + case condNL: + return "nl" + case condLE: + return "le" + case condNLE: + return "nle" + case condP: + return "p" + case condNP: + return "np" + default: + panic("unreachable") + } +} + +func condFromSSAIntCmpCond(origin ssa.IntegerCmpCond) cond { + switch origin { + case ssa.IntegerCmpCondEqual: + return condZ + case ssa.IntegerCmpCondNotEqual: + return condNZ + case ssa.IntegerCmpCondSignedLessThan: + return condL + case ssa.IntegerCmpCondSignedGreaterThanOrEqual: + return condNL + case ssa.IntegerCmpCondSignedGreaterThan: + return condNLE + case ssa.IntegerCmpCondSignedLessThanOrEqual: + return condLE + case ssa.IntegerCmpCondUnsignedLessThan: + return condB + case ssa.IntegerCmpCondUnsignedGreaterThanOrEqual: + return condNB + case ssa.IntegerCmpCondUnsignedGreaterThan: + return condNBE + case ssa.IntegerCmpCondUnsignedLessThanOrEqual: + return condBE + default: + panic("unreachable") + } +} + +func condFromSSAFloatCmpCond(origin ssa.FloatCmpCond) cond { + switch origin { + case ssa.FloatCmpCondGreaterThanOrEqual: + return condNB + case ssa.FloatCmpCondGreaterThan: + return condNBE + case ssa.FloatCmpCondEqual, ssa.FloatCmpCondNotEqual, ssa.FloatCmpCondLessThan, ssa.FloatCmpCondLessThanOrEqual: + panic(fmt.Sprintf("cond %s must be treated as a special case", origin)) + default: + panic("unreachable") + } +} + +func (c cond) encoding() byte { + return byte(c) +} + +func (c cond) invert() cond { + switch c { + case condO: + return condNO + case condNO: + return condO + case condB: + return condNB + case condNB: + return condB + case condZ: + return condNZ + case condNZ: + 
return condZ + case condBE: + return condNBE + case condNBE: + return condBE + case condS: + return condNS + case condNS: + return condS + case condP: + return condNP + case condNP: + return condP + case condL: + return condNL + case condNL: + return condL + case condLE: + return condNLE + case condNLE: + return condLE + default: + panic("unreachable") + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/ext.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/ext.go new file mode 100644 index 000000000..5e731e822 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/ext.go @@ -0,0 +1,35 @@ +package amd64 + +// extMode represents the mode of extension in movzx/movsx. +type extMode byte + +const ( + // extModeBL represents Byte -> Longword. + extModeBL extMode = iota + // extModeBQ represents Byte -> Quadword. + extModeBQ + // extModeWL represents Word -> Longword. + extModeWL + // extModeWQ represents Word -> Quadword. + extModeWQ + // extModeLQ represents Longword -> Quadword. + extModeLQ +) + +// String implements fmt.Stringer. +func (e extMode) String() string { + switch e { + case extModeBL: + return "bl" + case extModeBQ: + return "bq" + case extModeWL: + return "wl" + case extModeWQ: + return "wq" + case extModeLQ: + return "lq" + default: + panic("BUG: invalid ext mode") + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr.go new file mode 100644 index 000000000..d27e79c0e --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr.go @@ -0,0 +1,2472 @@ +package amd64 + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +type instruction struct { + prev, next *instruction + op1, op2 operand + u1, u2 uint64 + b1 bool + addedBeforeRegAlloc bool + kind instructionKind +} + +// Next implements regalloc.Instr. +func (i *instruction) Next() regalloc.Instr { + return i.next +} + +// Prev implements regalloc.Instr. +func (i *instruction) Prev() regalloc.Instr { + return i.prev +} + +// IsCall implements regalloc.Instr. +func (i *instruction) IsCall() bool { return i.kind == call } + +// IsIndirectCall implements regalloc.Instr. +func (i *instruction) IsIndirectCall() bool { return i.kind == callIndirect } + +// IsReturn implements regalloc.Instr. +func (i *instruction) IsReturn() bool { return i.kind == ret } + +// AddedBeforeRegAlloc implements regalloc.Instr. +func (i *instruction) AddedBeforeRegAlloc() bool { return i.addedBeforeRegAlloc } + +// String implements regalloc.Instr. 
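+// The output is intended for debugging/tracing and loosely follows AT&T operand
+// order (source first, destination last).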
+func (i *instruction) String() string { + switch i.kind { + case nop0: + return "nop" + case sourceOffsetInfo: + return fmt.Sprintf("source_offset_info %d", i.u1) + case ret: + return "ret" + case imm: + if i.b1 { + return fmt.Sprintf("movabsq $%d, %s", int64(i.u1), i.op2.format(true)) + } else { + return fmt.Sprintf("movl $%d, %s", int32(i.u1), i.op2.format(false)) + } + case aluRmiR: + return fmt.Sprintf("%s %s, %s", aluRmiROpcode(i.u1), i.op1.format(i.b1), i.op2.format(i.b1)) + case movRR: + if i.b1 { + return fmt.Sprintf("movq %s, %s", i.op1.format(true), i.op2.format(true)) + } else { + return fmt.Sprintf("movl %s, %s", i.op1.format(false), i.op2.format(false)) + } + case xmmRmR: + return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(false), i.op2.format(false)) + case gprToXmm: + return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(i.b1), i.op2.format(i.b1)) + case xmmUnaryRmR: + return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(false), i.op2.format(false)) + case xmmUnaryRmRImm: + return fmt.Sprintf("%s $%d, %s, %s", sseOpcode(i.u1), roundingMode(i.u2), i.op1.format(false), i.op2.format(false)) + case unaryRmR: + var suffix string + if i.b1 { + suffix = "q" + } else { + suffix = "l" + } + return fmt.Sprintf("%s%s %s, %s", unaryRmROpcode(i.u1), suffix, i.op1.format(i.b1), i.op2.format(i.b1)) + case not: + var op string + if i.b1 { + op = "notq" + } else { + op = "notl" + } + return fmt.Sprintf("%s %s", op, i.op1.format(i.b1)) + case neg: + var op string + if i.b1 { + op = "negq" + } else { + op = "negl" + } + return fmt.Sprintf("%s %s", op, i.op1.format(i.b1)) + case div: + var prefix string + var op string + if i.b1 { + op = "divq" + } else { + op = "divl" + } + if i.u1 != 0 { + prefix = "i" + } + return fmt.Sprintf("%s%s %s", prefix, op, i.op1.format(i.b1)) + case mulHi: + signed, _64 := i.u1 != 0, i.b1 + var op string + switch { + case signed && _64: + op = "imulq" + case !signed && _64: + op = "mulq" + case signed && !_64: + op = "imull" + case !signed && !_64: + op = "mull" + } + return fmt.Sprintf("%s %s", op, i.op1.format(i.b1)) + case signExtendData: + var op string + if i.b1 { + op = "cqo" + } else { + op = "cdq" + } + return op + case movzxRmR: + return fmt.Sprintf("movzx.%s %s, %s", extMode(i.u1), i.op1.format(true), i.op2.format(true)) + case mov64MR: + return fmt.Sprintf("movq %s, %s", i.op1.format(true), i.op2.format(true)) + case lea: + return fmt.Sprintf("lea %s, %s", i.op1.format(true), i.op2.format(true)) + case movsxRmR: + return fmt.Sprintf("movsx.%s %s, %s", extMode(i.u1), i.op1.format(true), i.op2.format(true)) + case movRM: + var suffix string + switch i.u1 { + case 1: + suffix = "b" + case 2: + suffix = "w" + case 4: + suffix = "l" + case 8: + suffix = "q" + } + return fmt.Sprintf("mov.%s %s, %s", suffix, i.op1.format(true), i.op2.format(true)) + case shiftR: + var suffix string + if i.b1 { + suffix = "q" + } else { + suffix = "l" + } + return fmt.Sprintf("%s%s %s, %s", shiftROp(i.u1), suffix, i.op1.format(false), i.op2.format(i.b1)) + case xmmRmiReg: + return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(true), i.op2.format(true)) + case cmpRmiR: + var op, suffix string + if i.u1 != 0 { + op = "cmp" + } else { + op = "test" + } + if i.b1 { + suffix = "q" + } else { + suffix = "l" + } + if op == "test" && i.op1.kind == operandKindMem { + // Print consistently with AT&T syntax. 
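+ // test only sets flags and is symmetric in its operands, so swapping
+ // them here affects the printed form only.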
+ return fmt.Sprintf("%s%s %s, %s", op, suffix, i.op2.format(i.b1), i.op1.format(i.b1)) + } + return fmt.Sprintf("%s%s %s, %s", op, suffix, i.op1.format(i.b1), i.op2.format(i.b1)) + case setcc: + return fmt.Sprintf("set%s %s", cond(i.u1), i.op2.format(true)) + case cmove: + var suffix string + if i.b1 { + suffix = "q" + } else { + suffix = "l" + } + return fmt.Sprintf("cmov%s%s %s, %s", cond(i.u1), suffix, i.op1.format(i.b1), i.op2.format(i.b1)) + case push64: + return fmt.Sprintf("pushq %s", i.op1.format(true)) + case pop64: + return fmt.Sprintf("popq %s", i.op1.format(true)) + case xmmMovRM: + return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(true), i.op2.format(true)) + case xmmLoadConst: + panic("TODO") + case xmmToGpr: + return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(i.b1), i.op2.format(i.b1)) + case cvtUint64ToFloatSeq: + panic("TODO") + case cvtFloatToSintSeq: + panic("TODO") + case cvtFloatToUintSeq: + panic("TODO") + case xmmMinMaxSeq: + panic("TODO") + case xmmCmpRmR: + return fmt.Sprintf("%s %s, %s", sseOpcode(i.u1), i.op1.format(false), i.op2.format(false)) + case xmmRmRImm: + op := sseOpcode(i.u1) + r1, r2 := i.op1.format(op == sseOpcodePextrq || op == sseOpcodePinsrq), + i.op2.format(op == sseOpcodePextrq || op == sseOpcodePinsrq) + return fmt.Sprintf("%s $%d, %s, %s", op, i.u2, r1, r2) + case jmp: + return fmt.Sprintf("jmp %s", i.op1.format(true)) + case jmpIf: + return fmt.Sprintf("j%s %s", cond(i.u1), i.op1.format(true)) + case jmpTableIsland: + return fmt.Sprintf("jump_table_island: jmp_table_index=%d", i.u1) + case exitSequence: + return fmt.Sprintf("exit_sequence %s", i.op1.format(true)) + case ud2: + return "ud2" + case call: + return fmt.Sprintf("call %s", ssa.FuncRef(i.u1)) + case callIndirect: + return fmt.Sprintf("callq *%s", i.op1.format(true)) + case xchg: + var suffix string + switch i.u1 { + case 1: + suffix = "b" + case 2: + suffix = "w" + case 4: + suffix = "l" + case 8: + suffix = "q" + } + return fmt.Sprintf("xchg.%s %s, %s", suffix, i.op1.format(true), i.op2.format(true)) + case zeros: + return fmt.Sprintf("xor %s, %s", i.op2.format(true), i.op2.format(true)) + case fcvtToSintSequence: + execCtx, src, tmpGp, tmpGp2, tmpXmm, src64, dst64, sat := i.fcvtToSintSequenceData() + return fmt.Sprintf( + "fcvtToSintSequence execCtx=%s, src=%s, tmpGp=%s, tmpGp2=%s, tmpXmm=%s, src64=%v, dst64=%v, sat=%v", + formatVRegSized(execCtx, true), + formatVRegSized(src, true), + formatVRegSized(tmpGp, true), + formatVRegSized(tmpGp2, true), + formatVRegSized(tmpXmm, true), src64, dst64, sat) + case fcvtToUintSequence: + execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2, src64, dst64, sat := i.fcvtToUintSequenceData() + return fmt.Sprintf( + "fcvtToUintSequence execCtx=%s, src=%s, tmpGp=%s, tmpGp2=%s, tmpXmm=%s, tmpXmm2=%s, src64=%v, dst64=%v, sat=%v", + formatVRegSized(execCtx, true), + formatVRegSized(src, true), + formatVRegSized(tmpGp, true), + formatVRegSized(tmpGp2, true), + formatVRegSized(tmpXmm, true), + formatVRegSized(tmpXmm2, true), src64, dst64, sat) + case idivRemSequence: + execCtx, divisor, tmpGp, isDiv, signed, _64 := i.idivRemSequenceData() + return fmt.Sprintf("idivRemSequence execCtx=%s, divisor=%s, tmpGp=%s, isDiv=%v, signed=%v, _64=%v", + formatVRegSized(execCtx, true), formatVRegSized(divisor, _64), formatVRegSized(tmpGp, _64), isDiv, signed, _64) + case defineUninitializedReg: + return fmt.Sprintf("defineUninitializedReg %s", i.op2.format(true)) + case xmmCMov: + return fmt.Sprintf("xmmcmov%s %s, %s", cond(i.u1), i.op1.format(true), 
i.op2.format(true)) + case blendvpd: + return fmt.Sprintf("blendvpd %s, %s, %%xmm0", i.op1.format(false), i.op2.format(false)) + case mfence: + return "mfence" + case lockcmpxchg: + var suffix string + switch i.u1 { + case 1: + suffix = "b" + case 2: + suffix = "w" + case 4: + suffix = "l" + case 8: + suffix = "q" + } + return fmt.Sprintf("lock cmpxchg.%s %s, %s", suffix, i.op1.format(true), i.op2.format(true)) + case lockxadd: + var suffix string + switch i.u1 { + case 1: + suffix = "b" + case 2: + suffix = "w" + case 4: + suffix = "l" + case 8: + suffix = "q" + } + return fmt.Sprintf("lock xadd.%s %s, %s", suffix, i.op1.format(true), i.op2.format(true)) + + case nopUseReg: + return fmt.Sprintf("nop_use_reg %s", i.op1.format(true)) + + default: + panic(fmt.Sprintf("BUG: %d", int(i.kind))) + } +} + +// Defs implements regalloc.Instr. +func (i *instruction) Defs(regs *[]regalloc.VReg) []regalloc.VReg { + *regs = (*regs)[:0] + switch dk := defKinds[i.kind]; dk { + case defKindNone: + case defKindOp2: + *regs = append(*regs, i.op2.reg()) + case defKindCall: + _, _, retIntRealRegs, retFloatRealRegs, _ := backend.ABIInfoFromUint64(i.u2) + for i := byte(0); i < retIntRealRegs; i++ { + *regs = append(*regs, regInfo.RealRegToVReg[intArgResultRegs[i]]) + } + for i := byte(0); i < retFloatRealRegs; i++ { + *regs = append(*regs, regInfo.RealRegToVReg[floatArgResultRegs[i]]) + } + case defKindDivRem: + _, _, _, isDiv, _, _ := i.idivRemSequenceData() + if isDiv { + *regs = append(*regs, raxVReg) + } else { + *regs = append(*regs, rdxVReg) + } + default: + panic(fmt.Sprintf("BUG: invalid defKind \"%s\" for %s", dk, i)) + } + return *regs +} + +// Uses implements regalloc.Instr. +func (i *instruction) Uses(regs *[]regalloc.VReg) []regalloc.VReg { + *regs = (*regs)[:0] + switch uk := useKinds[i.kind]; uk { + case useKindNone: + case useKindOp1Op2Reg, useKindOp1RegOp2: + opAny, opReg := &i.op1, &i.op2 + if uk == useKindOp1RegOp2 { + opAny, opReg = opReg, opAny + } + // The destination operand (op2) can be only reg, + // the source operand (op1) can be imm32, reg or mem. 
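+ // For a mem operand, its base/index registers are reported as uses as well.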
+ switch opAny.kind { + case operandKindReg: + *regs = append(*regs, opAny.reg()) + case operandKindMem: + opAny.addressMode().uses(regs) + case operandKindImm32: + default: + panic(fmt.Sprintf("BUG: invalid operand: %s", i)) + } + if opReg.kind != operandKindReg { + panic(fmt.Sprintf("BUG: invalid operand: %s", i)) + } + *regs = append(*regs, opReg.reg()) + case useKindOp1: + op := i.op1 + switch op.kind { + case operandKindReg: + *regs = append(*regs, op.reg()) + case operandKindMem: + op.addressMode().uses(regs) + case operandKindImm32, operandKindLabel: + default: + panic(fmt.Sprintf("BUG: invalid operand: %s", i)) + } + case useKindCallInd: + op := i.op1 + switch op.kind { + case operandKindReg: + *regs = append(*regs, op.reg()) + case operandKindMem: + op.addressMode().uses(regs) + default: + panic(fmt.Sprintf("BUG: invalid operand: %s", i)) + } + fallthrough + case useKindCall: + argIntRealRegs, argFloatRealRegs, _, _, _ := backend.ABIInfoFromUint64(i.u2) + for i := byte(0); i < argIntRealRegs; i++ { + *regs = append(*regs, regInfo.RealRegToVReg[intArgResultRegs[i]]) + } + for i := byte(0); i < argFloatRealRegs; i++ { + *regs = append(*regs, regInfo.RealRegToVReg[floatArgResultRegs[i]]) + } + case useKindFcvtToSintSequence: + execCtx, src, tmpGp, tmpGp2, tmpXmm, _, _, _ := i.fcvtToSintSequenceData() + *regs = append(*regs, execCtx, src, tmpGp, tmpGp2, tmpXmm) + case useKindFcvtToUintSequence: + execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2, _, _, _ := i.fcvtToUintSequenceData() + *regs = append(*regs, execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2) + case useKindDivRem: + execCtx, divisor, tmpGp, _, _, _ := i.idivRemSequenceData() + // idiv uses rax and rdx as implicit operands. + *regs = append(*regs, raxVReg, rdxVReg, execCtx, divisor, tmpGp) + case useKindBlendvpd: + *regs = append(*regs, xmm0VReg) + + opAny, opReg := &i.op1, &i.op2 + switch opAny.kind { + case operandKindReg: + *regs = append(*regs, opAny.reg()) + case operandKindMem: + opAny.addressMode().uses(regs) + default: + panic(fmt.Sprintf("BUG: invalid operand: %s", i)) + } + if opReg.kind != operandKindReg { + panic(fmt.Sprintf("BUG: invalid operand: %s", i)) + } + *regs = append(*regs, opReg.reg()) + + case useKindRaxOp1RegOp2: + opReg, opAny := &i.op1, &i.op2 + *regs = append(*regs, raxVReg, opReg.reg()) + switch opAny.kind { + case operandKindReg: + *regs = append(*regs, opAny.reg()) + case operandKindMem: + opAny.addressMode().uses(regs) + default: + panic(fmt.Sprintf("BUG: invalid operand: %s", i)) + } + if opReg.kind != operandKindReg { + panic(fmt.Sprintf("BUG: invalid operand: %s", i)) + } + + default: + panic(fmt.Sprintf("BUG: invalid useKind %s for %s", uk, i)) + } + return *regs +} + +// AssignUse implements regalloc.Instr. 
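The Defs/Uses pair above is table-driven: defKinds and useKinds (declared near the end of this file) map every instructionKind to a fixed shape, so the register allocator never needs a per-kind type switch, and for call/callIndirect the ABI summary packed into u2 is unpacked via backend.ABIInfoFromUint64 to recover the real argument and result registers. A minimal sketch of the intended calling pattern, written as if it lived in this package (instruction is unexported); the walker itself is illustrative and not part of wazero:

	// collectDefsUses walks a doubly-linked instruction list and reports each
	// instruction's defined and used virtual registers. A single scratch slice
	// is reused so the loop performs no per-instruction allocation.
	func collectDefsUses(head *instruction, visit func(isDef bool, v regalloc.VReg)) {
		var scratch []regalloc.VReg
		for cur := head; cur != nil; cur = cur.next {
			for _, d := range cur.Defs(&scratch) {
				visit(true, d)
			}
			for _, u := range cur.Uses(&scratch) {
				visit(false, u)
			}
		}
	}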
+func (i *instruction) AssignUse(index int, v regalloc.VReg) { + switch uk := useKinds[i.kind]; uk { + case useKindNone: + case useKindCallInd: + if index != 0 { + panic("BUG") + } + op := &i.op1 + switch op.kind { + case operandKindReg: + op.setReg(v) + case operandKindMem: + op.addressMode().assignUses(index, v) + default: + panic("BUG") + } + case useKindOp1Op2Reg, useKindOp1RegOp2: + op, opMustBeReg := &i.op1, &i.op2 + if uk == useKindOp1RegOp2 { + op, opMustBeReg = opMustBeReg, op + } + switch op.kind { + case operandKindReg: + if index == 0 { + op.setReg(v) + } else if index == 1 { + opMustBeReg.setReg(v) + } else { + panic("BUG") + } + case operandKindMem: + nregs := op.addressMode().nregs() + if index < nregs { + op.addressMode().assignUses(index, v) + } else if index == nregs { + opMustBeReg.setReg(v) + } else { + panic("BUG") + } + case operandKindImm32: + if index == 0 { + opMustBeReg.setReg(v) + } else { + panic("BUG") + } + default: + panic(fmt.Sprintf("BUG: invalid operand pair: %s", i)) + } + case useKindOp1: + op := &i.op1 + switch op.kind { + case operandKindReg: + if index != 0 { + panic("BUG") + } + op.setReg(v) + case operandKindMem: + op.addressMode().assignUses(index, v) + default: + panic(fmt.Sprintf("BUG: invalid operand: %s", i)) + } + case useKindFcvtToSintSequence: + switch index { + case 0: + i.op1.addressMode().base = v + case 1: + i.op1.addressMode().index = v + case 2: + i.op2.addressMode().base = v + case 3: + i.op2.addressMode().index = v + case 4: + i.u1 = uint64(v) + default: + panic("BUG") + } + case useKindFcvtToUintSequence: + switch index { + case 0: + i.op1.addressMode().base = v + case 1: + i.op1.addressMode().index = v + case 2: + i.op2.addressMode().base = v + case 3: + i.op2.addressMode().index = v + case 4: + i.u1 = uint64(v) + case 5: + i.u2 = uint64(v) + default: + panic("BUG") + } + case useKindDivRem: + switch index { + case 0: + if v != raxVReg { + panic("BUG") + } + case 1: + if v != rdxVReg { + panic("BUG") + } + case 2: + i.op1.setReg(v) + case 3: + i.op2.setReg(v) + case 4: + i.u1 = uint64(v) + default: + panic("BUG") + } + case useKindBlendvpd: + op, opMustBeReg := &i.op1, &i.op2 + if index == 0 { + if v.RealReg() != xmm0 { + panic("BUG") + } + } else { + switch op.kind { + case operandKindReg: + switch index { + case 1: + op.setReg(v) + case 2: + opMustBeReg.setReg(v) + default: + panic("BUG") + } + case operandKindMem: + nregs := op.addressMode().nregs() + index-- + if index < nregs { + op.addressMode().assignUses(index, v) + } else if index == nregs { + opMustBeReg.setReg(v) + } else { + panic("BUG") + } + default: + panic(fmt.Sprintf("BUG: invalid operand pair: %s", i)) + } + } + + case useKindRaxOp1RegOp2: + switch index { + case 0: + if v.RealReg() != rax { + panic("BUG") + } + case 1: + i.op1.setReg(v) + default: + op := &i.op2 + switch op.kind { + case operandKindReg: + switch index { + case 1: + op.setReg(v) + case 2: + op.setReg(v) + default: + panic("BUG") + } + case operandKindMem: + nregs := op.addressMode().nregs() + index -= 2 + if index < nregs { + op.addressMode().assignUses(index, v) + } else if index == nregs { + op.setReg(v) + } else { + panic("BUG") + } + default: + panic(fmt.Sprintf("BUG: invalid operand pair: %s", i)) + } + } + default: + panic(fmt.Sprintf("BUG: invalid useKind %s for %s", uk, i)) + } +} + +// AssignDef implements regalloc.Instr. 
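AssignUse above and AssignDef just below are the write-back half of the regalloc.Instr contract: once the allocator has chosen concrete registers, it hands them back one position at a time, in the same index order in which Uses reported them (address-mode registers first, then the register-only operand). A hedged sketch of that call pattern; the helper and its arguments are invented for illustration:

	// applyAllocation rewrites an instruction in place with the allocator's
	// choices. useRegs must be ordered exactly as Uses() returned the uses.
	func applyAllocation(i *instruction, useRegs []regalloc.VReg, defReg regalloc.VReg, hasDef bool) {
		for idx, v := range useRegs {
			i.AssignUse(idx, v)
		}
		if hasDef {
			i.AssignDef(defReg)
		}
	}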
+func (i *instruction) AssignDef(reg regalloc.VReg) { + switch dk := defKinds[i.kind]; dk { + case defKindNone: + case defKindOp2: + i.op2.setReg(reg) + default: + panic(fmt.Sprintf("BUG: invalid defKind \"%s\" for %s", dk, i)) + } +} + +// IsCopy implements regalloc.Instr. +func (i *instruction) IsCopy() bool { + k := i.kind + if k == movRR { + return true + } + if k == xmmUnaryRmR { + if i.op1.kind == operandKindReg { + sse := sseOpcode(i.u1) + return sse == sseOpcodeMovss || sse == sseOpcodeMovsd || sse == sseOpcodeMovdqu + } + } + return false +} + +func resetInstruction(i *instruction) { + *i = instruction{} +} + +func setNext(i *instruction, next *instruction) { + i.next = next +} + +func setPrev(i *instruction, prev *instruction) { + i.prev = prev +} + +func asNop(i *instruction) { + i.kind = nop0 +} + +func (i *instruction) asNop0WithLabel(label backend.Label) *instruction { //nolint + i.kind = nop0 + i.u1 = uint64(label) + return i +} + +func (i *instruction) nop0Label() backend.Label { + return backend.Label(i.u1) +} + +type instructionKind byte + +const ( + nop0 instructionKind = iota + 1 + + // Integer arithmetic/bit-twiddling: (add sub and or xor mul, etc.) (32 64) (reg addr imm) reg + aluRmiR + + // Instructions on GPR that only read src and defines dst (dst is not modified): bsr, etc. + unaryRmR + + // Bitwise not + not + + // Integer negation + neg + + // Integer quotient and remainder: (div idiv) $rax $rdx (reg addr) + div + + // The high bits (RDX) of a (un)signed multiply: RDX:RAX := RAX * rhs. + mulHi + + // Do a sign-extend based on the sign of the value in rax into rdx: (cwd cdq cqo) + // or al into ah: (cbw) + signExtendData + + // Constant materialization: (imm32 imm64) reg. + // Either: movl $imm32, %reg32 or movabsq $imm64, %reg64. + imm + + // GPR to GPR move: mov (64 32) reg reg. + movRR + + // movzxRmR is zero-extended loads or move (R to R), except for 64 bits: movz (bl bq wl wq lq) addr reg. + // Note that the lq variant doesn't really exist since the default zero-extend rule makes it + // unnecessary. For that case we emit the equivalent "movl AM, reg32". + movzxRmR + + // mov64MR is a plain 64-bit integer load, since movzxRmR can't represent that. + mov64MR + + // Loads the memory address of addr into dst. + lea + + // Sign-extended loads and moves: movs (bl bq wl wq lq) addr reg. + movsxRmR + + // Integer stores: mov (b w l q) reg addr. + movRM + + // Arithmetic shifts: (shl shr sar) (b w l q) imm reg. + shiftR + + // Arithmetic SIMD shifts. + xmmRmiReg + + // Integer comparisons/tests: cmp or test (b w l q) (reg addr imm) reg. + cmpRmiR + + // Materializes the requested condition code in the destination reg. + setcc + + // Integer conditional move. + // Overwrites the destination register. + cmove + + // pushq (reg addr imm) + push64 + + // popq reg + pop64 + + // XMM (scalar or vector) binary op: (add sub and or xor mul adc? sbb?) (32 64) (reg addr) reg + xmmRmR + + // XMM (scalar or vector) unary op: mov between XMM registers (32 64) (reg addr) reg. + // + // This differs from xmmRmR in that the dst register of xmmUnaryRmR is not used in the + // computation of the instruction dst value and so does not have to be a previously valid + // value. This is characteristic of mov instructions. + xmmUnaryRmR + + // XMM (scalar or vector) unary op with immediate: roundss, roundsd, etc. 
+ // + // This differs from XMM_RM_R_IMM in that the dst register of + // XmmUnaryRmRImm is not used in the computation of the instruction dst + // value and so does not have to be a previously valid value. + xmmUnaryRmRImm + + // XMM (scalar or vector) unary op (from xmm to mem): stores, movd, movq + xmmMovRM + + // XMM (vector) unary op (to move a constant value into an xmm register): movups + xmmLoadConst + + // XMM (scalar) unary op (from xmm to integer reg): movd, movq, cvtts{s,d}2si + xmmToGpr + + // XMM (scalar) unary op (from integer to float reg): movd, movq, cvtsi2s{s,d} + gprToXmm + + // Converts an unsigned int64 to a float32/float64. + cvtUint64ToFloatSeq + + // Converts a scalar xmm to a signed int32/int64. + cvtFloatToSintSeq + + // Converts a scalar xmm to an unsigned int32/int64. + cvtFloatToUintSeq + + // A sequence to compute min/max with the proper NaN semantics for xmm registers. + xmmMinMaxSeq + + // Float comparisons/tests: cmp (b w l q) (reg addr imm) reg. + xmmCmpRmR + + // A binary XMM instruction with an 8-bit immediate: e.g. cmp (ps pd) imm (reg addr) reg + xmmRmRImm + + // Direct call: call simm32. + // Note that the offset is the relative to the *current RIP*, which points to the first byte of the next instruction. + call + + // Indirect call: callq (reg mem). + callIndirect + + // Return. + ret + + // Jump: jmp (reg, mem, imm32 or label) + jmp + + // Jump conditionally: jcond cond label. + jmpIf + + // jmpTableIsland is to emit the jump table. + jmpTableIsland + + // exitSequence exits the execution and go back to the Go world. + exitSequence + + // An instruction that will always trigger the illegal instruction exception. + ud2 + + // xchg is described in https://www.felixcloutier.com/x86/xchg. + // This instruction uses two operands, where one of them can be a memory address, and swaps their values. + // If the dst is a memory address, the execution is atomic. + xchg + + // lockcmpxchg is the cmpxchg instruction https://www.felixcloutier.com/x86/cmpxchg with a lock prefix. + lockcmpxchg + + // zeros puts zeros into the destination register. This is implemented as xor reg, reg for + // either integer or XMM registers. The reason why we have this instruction instead of using aluRmiR + // is that it requires the already-defined registers. From reg alloc's perspective, this defines + // the destination register and takes no inputs. + zeros + + // sourceOffsetInfo is a dummy instruction to emit source offset info. + // The existence of this instruction does not affect the execution. + sourceOffsetInfo + + // defineUninitializedReg is a no-op instruction that defines a register without a defining instruction. + defineUninitializedReg + + // fcvtToSintSequence is a sequence of instructions to convert a float to a signed integer. + fcvtToSintSequence + + // fcvtToUintSequence is a sequence of instructions to convert a float to an unsigned integer. + fcvtToUintSequence + + // xmmCMov is a conditional move instruction for XMM registers. Lowered after register allocation. + xmmCMov + + // idivRemSequence is a sequence of instructions to compute both the quotient and remainder of a division. + idivRemSequence + + // blendvpd is https://www.felixcloutier.com/x86/blendvpd. + blendvpd + + // mfence is https://www.felixcloutier.com/x86/mfence + mfence + + // lockxadd is xadd https://www.felixcloutier.com/x86/xadd with a lock prefix. + lockxadd + + // nopUseReg is a meta instruction that uses one register and does nothing. 
+ nopUseReg + + instrMax +) + +func (i *instruction) asMFence() *instruction { + i.kind = mfence + return i +} + +func (i *instruction) asNopUseReg(r regalloc.VReg) *instruction { + i.kind = nopUseReg + i.op1 = newOperandReg(r) + return i +} + +func (i *instruction) asIdivRemSequence(execCtx, divisor, tmpGp regalloc.VReg, isDiv, signed, _64 bool) *instruction { + i.kind = idivRemSequence + i.op1 = newOperandReg(execCtx) + i.op2 = newOperandReg(divisor) + i.u1 = uint64(tmpGp) + if isDiv { + i.u2 |= 1 + } + if signed { + i.u2 |= 2 + } + if _64 { + i.u2 |= 4 + } + return i +} + +func (i *instruction) idivRemSequenceData() ( + execCtx, divisor, tmpGp regalloc.VReg, isDiv, signed, _64 bool, +) { + if i.kind != idivRemSequence { + panic("BUG") + } + return i.op1.reg(), i.op2.reg(), regalloc.VReg(i.u1), i.u2&1 != 0, i.u2&2 != 0, i.u2&4 != 0 +} + +func (i *instruction) asXmmCMov(cc cond, x operand, rd regalloc.VReg, size byte) *instruction { + i.kind = xmmCMov + i.op1 = x + i.op2 = newOperandReg(rd) + i.u1 = uint64(cc) + i.u2 = uint64(size) + return i +} + +func (i *instruction) asDefineUninitializedReg(r regalloc.VReg) *instruction { + i.kind = defineUninitializedReg + i.op2 = newOperandReg(r) + return i +} + +func (m *machine) allocateFcvtToUintSequence( + execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2 regalloc.VReg, + src64, dst64, sat bool, +) *instruction { + i := m.allocateInstr() + i.kind = fcvtToUintSequence + op1a := m.amodePool.Allocate() + op2a := m.amodePool.Allocate() + i.op1 = newOperandMem(op1a) + i.op2 = newOperandMem(op2a) + if src64 { + op1a.imm32 = 1 + } else { + op1a.imm32 = 0 + } + if dst64 { + op1a.imm32 |= 2 + } + if sat { + op1a.imm32 |= 4 + } + + op1a.base = execCtx + op1a.index = src + op2a.base = tmpGp + op2a.index = tmpGp2 + i.u1 = uint64(tmpXmm) + i.u2 = uint64(tmpXmm2) + return i +} + +func (i *instruction) fcvtToUintSequenceData() ( + execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2 regalloc.VReg, src64, dst64, sat bool, +) { + if i.kind != fcvtToUintSequence { + panic("BUG") + } + op1a := i.op1.addressMode() + op2a := i.op2.addressMode() + return op1a.base, op1a.index, op2a.base, op2a.index, regalloc.VReg(i.u1), regalloc.VReg(i.u2), + op1a.imm32&1 != 0, op1a.imm32&2 != 0, op1a.imm32&4 != 0 +} + +func (m *machine) allocateFcvtToSintSequence( + execCtx, src, tmpGp, tmpGp2, tmpXmm regalloc.VReg, + src64, dst64, sat bool, +) *instruction { + i := m.allocateInstr() + i.kind = fcvtToSintSequence + op1a := m.amodePool.Allocate() + op2a := m.amodePool.Allocate() + i.op1 = newOperandMem(op1a) + i.op2 = newOperandMem(op2a) + op1a.base = execCtx + op1a.index = src + op2a.base = tmpGp + op2a.index = tmpGp2 + i.u1 = uint64(tmpXmm) + if src64 { + i.u2 = 1 + } else { + i.u2 = 0 + } + if dst64 { + i.u2 |= 2 + } + if sat { + i.u2 |= 4 + } + return i +} + +func (i *instruction) fcvtToSintSequenceData() ( + execCtx, src, tmpGp, tmpGp2, tmpXmm regalloc.VReg, src64, dst64, sat bool, +) { + if i.kind != fcvtToSintSequence { + panic("BUG") + } + op1a := i.op1.addressMode() + op2a := i.op2.addressMode() + return op1a.base, op1a.index, op2a.base, op2a.index, regalloc.VReg(i.u1), + i.u2&1 != 0, i.u2&2 != 0, i.u2&4 != 0 +} + +func (k instructionKind) String() string { + switch k { + case nop0: + return "nop" + case ret: + return "ret" + case imm: + return "imm" + case aluRmiR: + return "aluRmiR" + case movRR: + return "movRR" + case xmmRmR: + return "xmmRmR" + case gprToXmm: + return "gprToXmm" + case xmmUnaryRmR: + return "xmmUnaryRmR" + case xmmUnaryRmRImm: + return "xmmUnaryRmRImm" + case 
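The sequence pseudo-instructions above keep the instruction struct small by bit-packing their flags: asIdivRemSequence folds isDiv/signed/_64 into bits 0-2 of u2, and the fcvt sequences do the same through either u2 or the spare imm32 of a borrowed address mode. A tiny round-trip sketch, with placeholder register arguments, showing that the accessor recovers exactly what was packed:

	// idivFlagsRoundTrip packs and immediately unpacks the idivRemSequence flags.
	func idivFlagsRoundTrip(execCtx, divisor, tmp regalloc.VReg) {
		i := &instruction{}
		i.asIdivRemSequence(execCtx, divisor, tmp, true /*isDiv*/, false /*signed*/, true /*_64*/)
		_, _, _, isDiv, signed, is64 := i.idivRemSequenceData()
		fmt.Println(isDiv, signed, is64) // prints: true false true
	}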
unaryRmR: + return "unaryRmR" + case not: + return "not" + case neg: + return "neg" + case div: + return "div" + case mulHi: + return "mulHi" + case signExtendData: + return "signExtendData" + case movzxRmR: + return "movzxRmR" + case mov64MR: + return "mov64MR" + case lea: + return "lea" + case movsxRmR: + return "movsxRmR" + case movRM: + return "movRM" + case shiftR: + return "shiftR" + case xmmRmiReg: + return "xmmRmiReg" + case cmpRmiR: + return "cmpRmiR" + case setcc: + return "setcc" + case cmove: + return "cmove" + case push64: + return "push64" + case pop64: + return "pop64" + case xmmMovRM: + return "xmmMovRM" + case xmmLoadConst: + return "xmmLoadConst" + case xmmToGpr: + return "xmmToGpr" + case cvtUint64ToFloatSeq: + return "cvtUint64ToFloatSeq" + case cvtFloatToSintSeq: + return "cvtFloatToSintSeq" + case cvtFloatToUintSeq: + return "cvtFloatToUintSeq" + case xmmMinMaxSeq: + return "xmmMinMaxSeq" + case xmmCmpRmR: + return "xmmCmpRmR" + case xmmRmRImm: + return "xmmRmRImm" + case jmpIf: + return "jmpIf" + case jmp: + return "jmp" + case jmpTableIsland: + return "jmpTableIsland" + case exitSequence: + return "exit_sequence" + case ud2: + return "ud2" + case xchg: + return "xchg" + case zeros: + return "zeros" + case fcvtToSintSequence: + return "fcvtToSintSequence" + case fcvtToUintSequence: + return "fcvtToUintSequence" + case xmmCMov: + return "xmmCMov" + case idivRemSequence: + return "idivRemSequence" + case mfence: + return "mfence" + case lockcmpxchg: + return "lockcmpxchg" + case lockxadd: + return "lockxadd" + default: + panic("BUG") + } +} + +type aluRmiROpcode byte + +const ( + aluRmiROpcodeAdd aluRmiROpcode = iota + 1 + aluRmiROpcodeSub + aluRmiROpcodeAnd + aluRmiROpcodeOr + aluRmiROpcodeXor + aluRmiROpcodeMul +) + +func (a aluRmiROpcode) String() string { + switch a { + case aluRmiROpcodeAdd: + return "add" + case aluRmiROpcodeSub: + return "sub" + case aluRmiROpcodeAnd: + return "and" + case aluRmiROpcodeOr: + return "or" + case aluRmiROpcodeXor: + return "xor" + case aluRmiROpcodeMul: + return "imul" + default: + panic("BUG") + } +} + +func (i *instruction) asJmpIf(cond cond, target operand) *instruction { + i.kind = jmpIf + i.u1 = uint64(cond) + i.op1 = target + return i +} + +// asJmpTableSequence is used to emit the jump table. +// targetSliceIndex is the index of the target slice in machine.jmpTableTargets. 
+func (i *instruction) asJmpTableSequence(targetSliceIndex int, targetCount int) *instruction { + i.kind = jmpTableIsland + i.u1 = uint64(targetSliceIndex) + i.u2 = uint64(targetCount) + return i +} + +func (i *instruction) asJmp(target operand) *instruction { + i.kind = jmp + i.op1 = target + return i +} + +func (i *instruction) jmpLabel() backend.Label { + switch i.kind { + case jmp, jmpIf, lea, xmmUnaryRmR: + return i.op1.label() + default: + panic("BUG") + } +} + +func (i *instruction) asLEA(target operand, rd regalloc.VReg) *instruction { + i.kind = lea + i.op1 = target + i.op2 = newOperandReg(rd) + return i +} + +func (i *instruction) asCall(ref ssa.FuncRef, abi *backend.FunctionABI) *instruction { + i.kind = call + i.u1 = uint64(ref) + if abi != nil { + i.u2 = abi.ABIInfoAsUint64() + } + return i +} + +func (i *instruction) asCallIndirect(ptr operand, abi *backend.FunctionABI) *instruction { + if ptr.kind != operandKindReg && ptr.kind != operandKindMem { + panic("BUG") + } + i.kind = callIndirect + i.op1 = ptr + if abi != nil { + i.u2 = abi.ABIInfoAsUint64() + } + return i +} + +func (i *instruction) asRet() *instruction { + i.kind = ret + return i +} + +func (i *instruction) asImm(dst regalloc.VReg, value uint64, _64 bool) *instruction { + i.kind = imm + i.op2 = newOperandReg(dst) + i.u1 = value + i.b1 = _64 + return i +} + +func (i *instruction) asAluRmiR(op aluRmiROpcode, rm operand, rd regalloc.VReg, _64 bool) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem && rm.kind != operandKindImm32 { + panic("BUG") + } + i.kind = aluRmiR + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + i.b1 = _64 + return i +} + +func (i *instruction) asZeros(dst regalloc.VReg) *instruction { + i.kind = zeros + i.op2 = newOperandReg(dst) + return i +} + +func (i *instruction) asBlendvpd(rm operand, rd regalloc.VReg) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = blendvpd + i.op1 = rm + i.op2 = newOperandReg(rd) + return i +} + +func (i *instruction) asXmmRmR(op sseOpcode, rm operand, rd regalloc.VReg) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = xmmRmR + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + return i +} + +func (i *instruction) asXmmRmRImm(op sseOpcode, imm uint8, rm operand, rd regalloc.VReg) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = xmmRmRImm + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + i.u2 = uint64(imm) + return i +} + +func (i *instruction) asGprToXmm(op sseOpcode, rm operand, rd regalloc.VReg, _64 bool) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = gprToXmm + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + i.b1 = _64 + return i +} + +func (i *instruction) asEmitSourceOffsetInfo(l ssa.SourceOffset) *instruction { + i.kind = sourceOffsetInfo + i.u1 = uint64(l) + return i +} + +func (i *instruction) sourceOffsetInfo() ssa.SourceOffset { + return ssa.SourceOffset(i.u1) +} + +func (i *instruction) asXmmToGpr(op sseOpcode, rm, rd regalloc.VReg, _64 bool) *instruction { + i.kind = xmmToGpr + i.op1 = newOperandReg(rm) + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + i.b1 = _64 + return i +} + +func (i *instruction) asMovRM(rm regalloc.VReg, rd operand, size byte) *instruction { + if rd.kind != operandKindMem { + panic("BUG") + } + i.kind = movRM + i.op1 = 
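The as* constructors above all follow one convention: they overwrite a zeroed instruction in place and return it, so lowering code can chain allocation and initialization in a single expression (in the real backend the instruction comes from the machine's pool via m.allocateInstr; the struct literal below only keeps the sketch standalone). For example, a 64-bit register-register add plus a register clear could be built as:

	// lowerAddAndZero builds "dst += src" and "tmp = 0" as two instructions.
	func lowerAddAndZero(dst, src, tmp regalloc.VReg) (add, zero *instruction) {
		add = (&instruction{}).asAluRmiR(aluRmiROpcodeAdd, newOperandReg(src), dst, true /*64-bit*/)
		zero = (&instruction{}).asZeros(tmp)
		return
	}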
newOperandReg(rm) + i.op2 = rd + i.u1 = uint64(size) + return i +} + +func (i *instruction) asMovsxRmR(ext extMode, src operand, rd regalloc.VReg) *instruction { + if src.kind != operandKindReg && src.kind != operandKindMem { + panic("BUG") + } + i.kind = movsxRmR + i.op1 = src + i.op2 = newOperandReg(rd) + i.u1 = uint64(ext) + return i +} + +func (i *instruction) asMovzxRmR(ext extMode, src operand, rd regalloc.VReg) *instruction { + if src.kind != operandKindReg && src.kind != operandKindMem { + panic("BUG") + } + i.kind = movzxRmR + i.op1 = src + i.op2 = newOperandReg(rd) + i.u1 = uint64(ext) + return i +} + +func (i *instruction) asSignExtendData(_64 bool) *instruction { + i.kind = signExtendData + i.b1 = _64 + return i +} + +func (i *instruction) asUD2() *instruction { + i.kind = ud2 + return i +} + +func (i *instruction) asDiv(rn operand, signed bool, _64 bool) *instruction { + i.kind = div + i.op1 = rn + i.b1 = _64 + if signed { + i.u1 = 1 + } + return i +} + +func (i *instruction) asMov64MR(rm operand, rd regalloc.VReg) *instruction { + if rm.kind != operandKindMem { + panic("BUG") + } + i.kind = mov64MR + i.op1 = rm + i.op2 = newOperandReg(rd) + return i +} + +func (i *instruction) asMovRR(rm, rd regalloc.VReg, _64 bool) *instruction { + i.kind = movRR + i.op1 = newOperandReg(rm) + i.op2 = newOperandReg(rd) + i.b1 = _64 + return i +} + +func (i *instruction) asNot(rm operand, _64 bool) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = not + i.op1 = rm + i.b1 = _64 + return i +} + +func (i *instruction) asNeg(rm operand, _64 bool) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = neg + i.op1 = rm + i.b1 = _64 + return i +} + +func (i *instruction) asMulHi(rm operand, signed, _64 bool) *instruction { + if rm.kind != operandKindReg && (rm.kind != operandKindMem) { + panic("BUG") + } + i.kind = mulHi + i.op1 = rm + i.b1 = _64 + if signed { + i.u1 = 1 + } + return i +} + +func (i *instruction) asUnaryRmR(op unaryRmROpcode, rm operand, rd regalloc.VReg, _64 bool) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = unaryRmR + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + i.b1 = _64 + return i +} + +func (i *instruction) asShiftR(op shiftROp, amount operand, rd regalloc.VReg, _64 bool) *instruction { + if amount.kind != operandKindReg && amount.kind != operandKindImm32 { + panic("BUG") + } + i.kind = shiftR + i.op1 = amount + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + i.b1 = _64 + return i +} + +func (i *instruction) asXmmRmiReg(op sseOpcode, rm operand, rd regalloc.VReg) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindImm32 && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = xmmRmiReg + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + return i +} + +func (i *instruction) asCmpRmiR(cmp bool, rm operand, rn regalloc.VReg, _64 bool) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindImm32 && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = cmpRmiR + i.op1 = rm + i.op2 = newOperandReg(rn) + if cmp { + i.u1 = 1 + } + i.b1 = _64 + return i +} + +func (i *instruction) asSetcc(c cond, rd regalloc.VReg) *instruction { + i.kind = setcc + i.op2 = newOperandReg(rd) + i.u1 = uint64(c) + return i +} + +func (i *instruction) asCmove(c cond, rm operand, rd regalloc.VReg, _64 bool) *instruction { + i.kind = cmove + i.op1 = rm + i.op2 = 
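asCmpRmiR and asSetcc above pair up into the usual compare-then-materialize lowering: the first emits a cmp (or a test, when its bool argument is false) that sets EFLAGS, and the second copies the chosen condition into a byte register. A sketch of that pairing; condNZ is assumed to be one of this package's cond constants and does not appear in this excerpt:

	// lowerNotEqual sets the low byte of dst to (x != y); callers typically
	// zero dst beforehand so the upper bits are well defined.
	func lowerNotEqual(x, y, dst regalloc.VReg) (cmp, set *instruction) {
		cmp = (&instruction{}).asCmpRmiR(true /*cmp, not test*/, newOperandReg(y), x, true /*64-bit*/)
		set = (&instruction{}).asSetcc(condNZ, dst)
		return
	}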
newOperandReg(rd) + i.u1 = uint64(c) + i.b1 = _64 + return i +} + +func (m *machine) allocateExitSeq(execCtx regalloc.VReg) *instruction { + i := m.allocateInstr() + i.kind = exitSequence + i.op1 = newOperandReg(execCtx) + // Allocate the address mode that will be used in encoding the exit sequence. + i.op2 = newOperandMem(m.amodePool.Allocate()) + return i +} + +func (i *instruction) asXmmUnaryRmR(op sseOpcode, rm operand, rd regalloc.VReg) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = xmmUnaryRmR + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + return i +} + +func (i *instruction) asXmmUnaryRmRImm(op sseOpcode, imm byte, rm operand, rd regalloc.VReg) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = xmmUnaryRmRImm + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + i.u2 = uint64(imm) + return i +} + +func (i *instruction) asXmmCmpRmR(op sseOpcode, rm operand, rd regalloc.VReg) *instruction { + if rm.kind != operandKindReg && rm.kind != operandKindMem { + panic("BUG") + } + i.kind = xmmCmpRmR + i.op1 = rm + i.op2 = newOperandReg(rd) + i.u1 = uint64(op) + return i +} + +func (i *instruction) asXmmMovRM(op sseOpcode, rm regalloc.VReg, rd operand) *instruction { + if rd.kind != operandKindMem { + panic("BUG") + } + i.kind = xmmMovRM + i.op1 = newOperandReg(rm) + i.op2 = rd + i.u1 = uint64(op) + return i +} + +func (i *instruction) asPop64(rm regalloc.VReg) *instruction { + i.kind = pop64 + i.op1 = newOperandReg(rm) + return i +} + +func (i *instruction) asPush64(op operand) *instruction { + if op.kind != operandKindReg && op.kind != operandKindMem && op.kind != operandKindImm32 { + panic("BUG") + } + i.kind = push64 + i.op1 = op + return i +} + +func (i *instruction) asXCHG(rm regalloc.VReg, rd operand, size byte) *instruction { + i.kind = xchg + i.op1 = newOperandReg(rm) + i.op2 = rd + i.u1 = uint64(size) + return i +} + +func (i *instruction) asLockCmpXCHG(rm regalloc.VReg, rd *amode, size byte) *instruction { + i.kind = lockcmpxchg + i.op1 = newOperandReg(rm) + i.op2 = newOperandMem(rd) + i.u1 = uint64(size) + return i +} + +func (i *instruction) asLockXAdd(rm regalloc.VReg, rd *amode, size byte) *instruction { + i.kind = lockxadd + i.op1 = newOperandReg(rm) + i.op2 = newOperandMem(rd) + i.u1 = uint64(size) + return i +} + +type unaryRmROpcode byte + +const ( + unaryRmROpcodeBsr unaryRmROpcode = iota + unaryRmROpcodeBsf + unaryRmROpcodeLzcnt + unaryRmROpcodeTzcnt + unaryRmROpcodePopcnt +) + +func (u unaryRmROpcode) String() string { + switch u { + case unaryRmROpcodeBsr: + return "bsr" + case unaryRmROpcodeBsf: + return "bsf" + case unaryRmROpcodeLzcnt: + return "lzcnt" + case unaryRmROpcodeTzcnt: + return "tzcnt" + case unaryRmROpcodePopcnt: + return "popcnt" + default: + panic("BUG") + } +} + +type shiftROp byte + +const ( + shiftROpRotateLeft shiftROp = 0 + shiftROpRotateRight shiftROp = 1 + shiftROpShiftLeft shiftROp = 4 + shiftROpShiftRightLogical shiftROp = 5 + shiftROpShiftRightArithmetic shiftROp = 7 +) + +func (s shiftROp) String() string { + switch s { + case shiftROpRotateLeft: + return "rol" + case shiftROpRotateRight: + return "ror" + case shiftROpShiftLeft: + return "shl" + case shiftROpShiftRightLogical: + return "shr" + case shiftROpShiftRightArithmetic: + return "sar" + default: + panic("BUG") + } +} + +type sseOpcode byte + +const ( + sseOpcodeInvalid sseOpcode = iota + sseOpcodeAddps + sseOpcodeAddpd + sseOpcodeAddss + 
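The shiftROp values a few lines above are deliberately non-contiguous: 0, 1, 4, 5 and 7 are exactly the /digit opcode-extension numbers the x86 shift/rotate group uses in the ModRM reg field for rol, ror, shl, shr and sar, so an encoder can feed the constant straight into ModRM. For instance, a constant left shift built with the asShiftR constructor above, say

	i.asShiftR(shiftROpShiftLeft, amount, dst, true)

can be emitted as the C1 /4 ib form with shiftROpShiftLeft (4) serving directly as the /digit.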
sseOpcodeAddsd + sseOpcodeAndps + sseOpcodeAndpd + sseOpcodeAndnps + sseOpcodeAndnpd + sseOpcodeBlendvps + sseOpcodeBlendvpd + sseOpcodeComiss + sseOpcodeComisd + sseOpcodeCmpps + sseOpcodeCmppd + sseOpcodeCmpss + sseOpcodeCmpsd + sseOpcodeCvtdq2ps + sseOpcodeCvtdq2pd + sseOpcodeCvtsd2ss + sseOpcodeCvtsd2si + sseOpcodeCvtsi2ss + sseOpcodeCvtsi2sd + sseOpcodeCvtss2si + sseOpcodeCvtss2sd + sseOpcodeCvttps2dq + sseOpcodeCvttss2si + sseOpcodeCvttsd2si + sseOpcodeDivps + sseOpcodeDivpd + sseOpcodeDivss + sseOpcodeDivsd + sseOpcodeInsertps + sseOpcodeMaxps + sseOpcodeMaxpd + sseOpcodeMaxss + sseOpcodeMaxsd + sseOpcodeMinps + sseOpcodeMinpd + sseOpcodeMinss + sseOpcodeMinsd + sseOpcodeMovaps + sseOpcodeMovapd + sseOpcodeMovd + sseOpcodeMovdqa + sseOpcodeMovdqu + sseOpcodeMovlhps + sseOpcodeMovmskps + sseOpcodeMovmskpd + sseOpcodeMovq + sseOpcodeMovss + sseOpcodeMovsd + sseOpcodeMovups + sseOpcodeMovupd + sseOpcodeMulps + sseOpcodeMulpd + sseOpcodeMulss + sseOpcodeMulsd + sseOpcodeOrps + sseOpcodeOrpd + sseOpcodePabsb + sseOpcodePabsw + sseOpcodePabsd + sseOpcodePackssdw + sseOpcodePacksswb + sseOpcodePackusdw + sseOpcodePackuswb + sseOpcodePaddb + sseOpcodePaddd + sseOpcodePaddq + sseOpcodePaddw + sseOpcodePaddsb + sseOpcodePaddsw + sseOpcodePaddusb + sseOpcodePaddusw + sseOpcodePalignr + sseOpcodePand + sseOpcodePandn + sseOpcodePavgb + sseOpcodePavgw + sseOpcodePcmpeqb + sseOpcodePcmpeqw + sseOpcodePcmpeqd + sseOpcodePcmpeqq + sseOpcodePcmpgtb + sseOpcodePcmpgtw + sseOpcodePcmpgtd + sseOpcodePcmpgtq + sseOpcodePextrb + sseOpcodePextrw + sseOpcodePextrd + sseOpcodePextrq + sseOpcodePinsrb + sseOpcodePinsrw + sseOpcodePinsrd + sseOpcodePinsrq + sseOpcodePmaddwd + sseOpcodePmaxsb + sseOpcodePmaxsw + sseOpcodePmaxsd + sseOpcodePmaxub + sseOpcodePmaxuw + sseOpcodePmaxud + sseOpcodePminsb + sseOpcodePminsw + sseOpcodePminsd + sseOpcodePminub + sseOpcodePminuw + sseOpcodePminud + sseOpcodePmovmskb + sseOpcodePmovsxbd + sseOpcodePmovsxbw + sseOpcodePmovsxbq + sseOpcodePmovsxwd + sseOpcodePmovsxwq + sseOpcodePmovsxdq + sseOpcodePmovzxbd + sseOpcodePmovzxbw + sseOpcodePmovzxbq + sseOpcodePmovzxwd + sseOpcodePmovzxwq + sseOpcodePmovzxdq + sseOpcodePmulld + sseOpcodePmullw + sseOpcodePmuludq + sseOpcodePor + sseOpcodePshufb + sseOpcodePshufd + sseOpcodePsllw + sseOpcodePslld + sseOpcodePsllq + sseOpcodePsraw + sseOpcodePsrad + sseOpcodePsrlw + sseOpcodePsrld + sseOpcodePsrlq + sseOpcodePsubb + sseOpcodePsubd + sseOpcodePsubq + sseOpcodePsubw + sseOpcodePsubsb + sseOpcodePsubsw + sseOpcodePsubusb + sseOpcodePsubusw + sseOpcodePtest + sseOpcodePunpckhbw + sseOpcodePunpcklbw + sseOpcodePxor + sseOpcodeRcpss + sseOpcodeRoundps + sseOpcodeRoundpd + sseOpcodeRoundss + sseOpcodeRoundsd + sseOpcodeRsqrtss + sseOpcodeSqrtps + sseOpcodeSqrtpd + sseOpcodeSqrtss + sseOpcodeSqrtsd + sseOpcodeSubps + sseOpcodeSubpd + sseOpcodeSubss + sseOpcodeSubsd + sseOpcodeUcomiss + sseOpcodeUcomisd + sseOpcodeXorps + sseOpcodeXorpd + sseOpcodePmulhrsw + sseOpcodeUnpcklps + sseOpcodeCvtps2pd + sseOpcodeCvtpd2ps + sseOpcodeCvttpd2dq + sseOpcodeShufps + sseOpcodePmaddubsw +) + +func (s sseOpcode) String() string { + switch s { + case sseOpcodeInvalid: + return "invalid" + case sseOpcodeAddps: + return "addps" + case sseOpcodeAddpd: + return "addpd" + case sseOpcodeAddss: + return "addss" + case sseOpcodeAddsd: + return "addsd" + case sseOpcodeAndps: + return "andps" + case sseOpcodeAndpd: + return "andpd" + case sseOpcodeAndnps: + return "andnps" + case sseOpcodeAndnpd: + return "andnpd" + case sseOpcodeBlendvps: + return "blendvps" + 
case sseOpcodeBlendvpd: + return "blendvpd" + case sseOpcodeComiss: + return "comiss" + case sseOpcodeComisd: + return "comisd" + case sseOpcodeCmpps: + return "cmpps" + case sseOpcodeCmppd: + return "cmppd" + case sseOpcodeCmpss: + return "cmpss" + case sseOpcodeCmpsd: + return "cmpsd" + case sseOpcodeCvtdq2ps: + return "cvtdq2ps" + case sseOpcodeCvtdq2pd: + return "cvtdq2pd" + case sseOpcodeCvtsd2ss: + return "cvtsd2ss" + case sseOpcodeCvtsd2si: + return "cvtsd2si" + case sseOpcodeCvtsi2ss: + return "cvtsi2ss" + case sseOpcodeCvtsi2sd: + return "cvtsi2sd" + case sseOpcodeCvtss2si: + return "cvtss2si" + case sseOpcodeCvtss2sd: + return "cvtss2sd" + case sseOpcodeCvttps2dq: + return "cvttps2dq" + case sseOpcodeCvttss2si: + return "cvttss2si" + case sseOpcodeCvttsd2si: + return "cvttsd2si" + case sseOpcodeDivps: + return "divps" + case sseOpcodeDivpd: + return "divpd" + case sseOpcodeDivss: + return "divss" + case sseOpcodeDivsd: + return "divsd" + case sseOpcodeInsertps: + return "insertps" + case sseOpcodeMaxps: + return "maxps" + case sseOpcodeMaxpd: + return "maxpd" + case sseOpcodeMaxss: + return "maxss" + case sseOpcodeMaxsd: + return "maxsd" + case sseOpcodeMinps: + return "minps" + case sseOpcodeMinpd: + return "minpd" + case sseOpcodeMinss: + return "minss" + case sseOpcodeMinsd: + return "minsd" + case sseOpcodeMovaps: + return "movaps" + case sseOpcodeMovapd: + return "movapd" + case sseOpcodeMovd: + return "movd" + case sseOpcodeMovdqa: + return "movdqa" + case sseOpcodeMovdqu: + return "movdqu" + case sseOpcodeMovlhps: + return "movlhps" + case sseOpcodeMovmskps: + return "movmskps" + case sseOpcodeMovmskpd: + return "movmskpd" + case sseOpcodeMovq: + return "movq" + case sseOpcodeMovss: + return "movss" + case sseOpcodeMovsd: + return "movsd" + case sseOpcodeMovups: + return "movups" + case sseOpcodeMovupd: + return "movupd" + case sseOpcodeMulps: + return "mulps" + case sseOpcodeMulpd: + return "mulpd" + case sseOpcodeMulss: + return "mulss" + case sseOpcodeMulsd: + return "mulsd" + case sseOpcodeOrps: + return "orps" + case sseOpcodeOrpd: + return "orpd" + case sseOpcodePabsb: + return "pabsb" + case sseOpcodePabsw: + return "pabsw" + case sseOpcodePabsd: + return "pabsd" + case sseOpcodePackssdw: + return "packssdw" + case sseOpcodePacksswb: + return "packsswb" + case sseOpcodePackusdw: + return "packusdw" + case sseOpcodePackuswb: + return "packuswb" + case sseOpcodePaddb: + return "paddb" + case sseOpcodePaddd: + return "paddd" + case sseOpcodePaddq: + return "paddq" + case sseOpcodePaddw: + return "paddw" + case sseOpcodePaddsb: + return "paddsb" + case sseOpcodePaddsw: + return "paddsw" + case sseOpcodePaddusb: + return "paddusb" + case sseOpcodePaddusw: + return "paddusw" + case sseOpcodePalignr: + return "palignr" + case sseOpcodePand: + return "pand" + case sseOpcodePandn: + return "pandn" + case sseOpcodePavgb: + return "pavgb" + case sseOpcodePavgw: + return "pavgw" + case sseOpcodePcmpeqb: + return "pcmpeqb" + case sseOpcodePcmpeqw: + return "pcmpeqw" + case sseOpcodePcmpeqd: + return "pcmpeqd" + case sseOpcodePcmpeqq: + return "pcmpeqq" + case sseOpcodePcmpgtb: + return "pcmpgtb" + case sseOpcodePcmpgtw: + return "pcmpgtw" + case sseOpcodePcmpgtd: + return "pcmpgtd" + case sseOpcodePcmpgtq: + return "pcmpgtq" + case sseOpcodePextrb: + return "pextrb" + case sseOpcodePextrw: + return "pextrw" + case sseOpcodePextrd: + return "pextrd" + case sseOpcodePextrq: + return "pextrq" + case sseOpcodePinsrb: + return "pinsrb" + case sseOpcodePinsrw: + return "pinsrw" + case 
sseOpcodePinsrd: + return "pinsrd" + case sseOpcodePinsrq: + return "pinsrq" + case sseOpcodePmaddwd: + return "pmaddwd" + case sseOpcodePmaxsb: + return "pmaxsb" + case sseOpcodePmaxsw: + return "pmaxsw" + case sseOpcodePmaxsd: + return "pmaxsd" + case sseOpcodePmaxub: + return "pmaxub" + case sseOpcodePmaxuw: + return "pmaxuw" + case sseOpcodePmaxud: + return "pmaxud" + case sseOpcodePminsb: + return "pminsb" + case sseOpcodePminsw: + return "pminsw" + case sseOpcodePminsd: + return "pminsd" + case sseOpcodePminub: + return "pminub" + case sseOpcodePminuw: + return "pminuw" + case sseOpcodePminud: + return "pminud" + case sseOpcodePmovmskb: + return "pmovmskb" + case sseOpcodePmovsxbd: + return "pmovsxbd" + case sseOpcodePmovsxbw: + return "pmovsxbw" + case sseOpcodePmovsxbq: + return "pmovsxbq" + case sseOpcodePmovsxwd: + return "pmovsxwd" + case sseOpcodePmovsxwq: + return "pmovsxwq" + case sseOpcodePmovsxdq: + return "pmovsxdq" + case sseOpcodePmovzxbd: + return "pmovzxbd" + case sseOpcodePmovzxbw: + return "pmovzxbw" + case sseOpcodePmovzxbq: + return "pmovzxbq" + case sseOpcodePmovzxwd: + return "pmovzxwd" + case sseOpcodePmovzxwq: + return "pmovzxwq" + case sseOpcodePmovzxdq: + return "pmovzxdq" + case sseOpcodePmulld: + return "pmulld" + case sseOpcodePmullw: + return "pmullw" + case sseOpcodePmuludq: + return "pmuludq" + case sseOpcodePor: + return "por" + case sseOpcodePshufb: + return "pshufb" + case sseOpcodePshufd: + return "pshufd" + case sseOpcodePsllw: + return "psllw" + case sseOpcodePslld: + return "pslld" + case sseOpcodePsllq: + return "psllq" + case sseOpcodePsraw: + return "psraw" + case sseOpcodePsrad: + return "psrad" + case sseOpcodePsrlw: + return "psrlw" + case sseOpcodePsrld: + return "psrld" + case sseOpcodePsrlq: + return "psrlq" + case sseOpcodePsubb: + return "psubb" + case sseOpcodePsubd: + return "psubd" + case sseOpcodePsubq: + return "psubq" + case sseOpcodePsubw: + return "psubw" + case sseOpcodePsubsb: + return "psubsb" + case sseOpcodePsubsw: + return "psubsw" + case sseOpcodePsubusb: + return "psubusb" + case sseOpcodePsubusw: + return "psubusw" + case sseOpcodePtest: + return "ptest" + case sseOpcodePunpckhbw: + return "punpckhbw" + case sseOpcodePunpcklbw: + return "punpcklbw" + case sseOpcodePxor: + return "pxor" + case sseOpcodeRcpss: + return "rcpss" + case sseOpcodeRoundps: + return "roundps" + case sseOpcodeRoundpd: + return "roundpd" + case sseOpcodeRoundss: + return "roundss" + case sseOpcodeRoundsd: + return "roundsd" + case sseOpcodeRsqrtss: + return "rsqrtss" + case sseOpcodeSqrtps: + return "sqrtps" + case sseOpcodeSqrtpd: + return "sqrtpd" + case sseOpcodeSqrtss: + return "sqrtss" + case sseOpcodeSqrtsd: + return "sqrtsd" + case sseOpcodeSubps: + return "subps" + case sseOpcodeSubpd: + return "subpd" + case sseOpcodeSubss: + return "subss" + case sseOpcodeSubsd: + return "subsd" + case sseOpcodeUcomiss: + return "ucomiss" + case sseOpcodeUcomisd: + return "ucomisd" + case sseOpcodeXorps: + return "xorps" + case sseOpcodeXorpd: + return "xorpd" + case sseOpcodePmulhrsw: + return "pmulhrsw" + case sseOpcodeUnpcklps: + return "unpcklps" + case sseOpcodeCvtps2pd: + return "cvtps2pd" + case sseOpcodeCvtpd2ps: + return "cvtpd2ps" + case sseOpcodeCvttpd2dq: + return "cvttpd2dq" + case sseOpcodeShufps: + return "shufps" + case sseOpcodePmaddubsw: + return "pmaddubsw" + default: + panic("BUG") + } +} + +type roundingMode uint8 + +const ( + roundingModeNearest roundingMode = iota + roundingModeDown + roundingModeUp + roundingModeZero +) + +func 
(r roundingMode) String() string { + switch r { + case roundingModeNearest: + return "nearest" + case roundingModeDown: + return "down" + case roundingModeUp: + return "up" + case roundingModeZero: + return "zero" + default: + panic("BUG") + } +} + +// cmpPred is the immediate value for a comparison operation in xmmRmRImm. +type cmpPred uint8 + +const ( + // cmpPredEQ_OQ is Equal (ordered, non-signaling) + cmpPredEQ_OQ cmpPred = iota + // cmpPredLT_OS is Less-than (ordered, signaling) + cmpPredLT_OS + // cmpPredLE_OS is Less-than-or-equal (ordered, signaling) + cmpPredLE_OS + // cmpPredUNORD_Q is Unordered (non-signaling) + cmpPredUNORD_Q + // cmpPredNEQ_UQ is Not-equal (unordered, non-signaling) + cmpPredNEQ_UQ + // cmpPredNLT_US is Not-less-than (unordered, signaling) + cmpPredNLT_US + // cmpPredNLE_US is Not-less-than-or-equal (unordered, signaling) + cmpPredNLE_US + // cmpPredORD_Q is Ordered (non-signaling) + cmpPredORD_Q + // cmpPredEQ_UQ is Equal (unordered, non-signaling) + cmpPredEQ_UQ + // cmpPredNGE_US is Not-greater-than-or-equal (unordered, signaling) + cmpPredNGE_US + // cmpPredNGT_US is Not-greater-than (unordered, signaling) + cmpPredNGT_US + // cmpPredFALSE_OQ is False (ordered, non-signaling) + cmpPredFALSE_OQ + // cmpPredNEQ_OQ is Not-equal (ordered, non-signaling) + cmpPredNEQ_OQ + // cmpPredGE_OS is Greater-than-or-equal (ordered, signaling) + cmpPredGE_OS + // cmpPredGT_OS is Greater-than (ordered, signaling) + cmpPredGT_OS + // cmpPredTRUE_UQ is True (unordered, non-signaling) + cmpPredTRUE_UQ + // Equal (ordered, signaling) + cmpPredEQ_OS + // Less-than (ordered, nonsignaling) + cmpPredLT_OQ + // Less-than-or-equal (ordered, nonsignaling) + cmpPredLE_OQ + // Unordered (signaling) + cmpPredUNORD_S + // Not-equal (unordered, signaling) + cmpPredNEQ_US + // Not-less-than (unordered, nonsignaling) + cmpPredNLT_UQ + // Not-less-than-or-equal (unordered, nonsignaling) + cmpPredNLE_UQ + // Ordered (signaling) + cmpPredORD_S + // Equal (unordered, signaling) + cmpPredEQ_US + // Not-greater-than-or-equal (unordered, non-signaling) + cmpPredNGE_UQ + // Not-greater-than (unordered, nonsignaling) + cmpPredNGT_UQ + // False (ordered, signaling) + cmpPredFALSE_OS + // Not-equal (ordered, signaling) + cmpPredNEQ_OS + // Greater-than-or-equal (ordered, nonsignaling) + cmpPredGE_OQ + // Greater-than (ordered, nonsignaling) + cmpPredGT_OQ + // True (unordered, signaling) + cmpPredTRUE_US +) + +func (r cmpPred) String() string { + switch r { + case cmpPredEQ_OQ: + return "eq_oq" + case cmpPredLT_OS: + return "lt_os" + case cmpPredLE_OS: + return "le_os" + case cmpPredUNORD_Q: + return "unord_q" + case cmpPredNEQ_UQ: + return "neq_uq" + case cmpPredNLT_US: + return "nlt_us" + case cmpPredNLE_US: + return "nle_us" + case cmpPredORD_Q: + return "ord_q" + case cmpPredEQ_UQ: + return "eq_uq" + case cmpPredNGE_US: + return "nge_us" + case cmpPredNGT_US: + return "ngt_us" + case cmpPredFALSE_OQ: + return "false_oq" + case cmpPredNEQ_OQ: + return "neq_oq" + case cmpPredGE_OS: + return "ge_os" + case cmpPredGT_OS: + return "gt_os" + case cmpPredTRUE_UQ: + return "true_uq" + case cmpPredEQ_OS: + return "eq_os" + case cmpPredLT_OQ: + return "lt_oq" + case cmpPredLE_OQ: + return "le_oq" + case cmpPredUNORD_S: + return "unord_s" + case cmpPredNEQ_US: + return "neq_us" + case cmpPredNLT_UQ: + return "nlt_uq" + case cmpPredNLE_UQ: + return "nle_uq" + case cmpPredORD_S: + return "ord_s" + case cmpPredEQ_US: + return "eq_us" + case cmpPredNGE_UQ: + return "nge_uq" + case cmpPredNGT_UQ: + return 
"ngt_uq" + case cmpPredFALSE_OS: + return "false_os" + case cmpPredNEQ_OS: + return "neq_os" + case cmpPredGE_OQ: + return "ge_oq" + case cmpPredGT_OQ: + return "gt_oq" + case cmpPredTRUE_US: + return "true_us" + default: + panic("BUG") + } +} + +func linkInstr(prev, next *instruction) *instruction { + prev.next = next + next.prev = prev + return next +} + +type defKind byte + +const ( + defKindNone defKind = iota + 1 + defKindOp2 + defKindCall + defKindDivRem +) + +var defKinds = [instrMax]defKind{ + nop0: defKindNone, + ret: defKindNone, + movRR: defKindOp2, + movRM: defKindNone, + xmmMovRM: defKindNone, + aluRmiR: defKindNone, + shiftR: defKindNone, + imm: defKindOp2, + unaryRmR: defKindOp2, + xmmRmiReg: defKindNone, + xmmUnaryRmR: defKindOp2, + xmmUnaryRmRImm: defKindOp2, + xmmCmpRmR: defKindNone, + xmmRmR: defKindNone, + xmmRmRImm: defKindNone, + mov64MR: defKindOp2, + movsxRmR: defKindOp2, + movzxRmR: defKindOp2, + gprToXmm: defKindOp2, + xmmToGpr: defKindOp2, + cmove: defKindNone, + call: defKindCall, + callIndirect: defKindCall, + ud2: defKindNone, + jmp: defKindNone, + jmpIf: defKindNone, + jmpTableIsland: defKindNone, + cmpRmiR: defKindNone, + exitSequence: defKindNone, + lea: defKindOp2, + setcc: defKindOp2, + zeros: defKindOp2, + sourceOffsetInfo: defKindNone, + fcvtToSintSequence: defKindNone, + defineUninitializedReg: defKindOp2, + fcvtToUintSequence: defKindNone, + xmmCMov: defKindOp2, + idivRemSequence: defKindDivRem, + blendvpd: defKindNone, + mfence: defKindNone, + xchg: defKindNone, + lockcmpxchg: defKindNone, + lockxadd: defKindNone, + neg: defKindNone, + nopUseReg: defKindNone, +} + +// String implements fmt.Stringer. +func (d defKind) String() string { + switch d { + case defKindNone: + return "none" + case defKindOp2: + return "op2" + case defKindCall: + return "call" + case defKindDivRem: + return "divrem" + default: + return "invalid" + } +} + +type useKind byte + +const ( + useKindNone useKind = iota + 1 + useKindOp1 + // useKindOp1Op2Reg is Op1 can be any operand, Op2 must be a register. + useKindOp1Op2Reg + // useKindOp1RegOp2 is Op1 must be a register, Op2 can be any operand. + useKindOp1RegOp2 + // useKindRaxOp1RegOp2 is Op1 must be a register, Op2 can be any operand, and RAX is used. 
+ useKindRaxOp1RegOp2 + useKindDivRem + useKindBlendvpd + useKindCall + useKindCallInd + useKindFcvtToSintSequence + useKindFcvtToUintSequence +) + +var useKinds = [instrMax]useKind{ + nop0: useKindNone, + ret: useKindNone, + movRR: useKindOp1, + movRM: useKindOp1RegOp2, + xmmMovRM: useKindOp1RegOp2, + cmove: useKindOp1Op2Reg, + aluRmiR: useKindOp1Op2Reg, + shiftR: useKindOp1Op2Reg, + imm: useKindNone, + unaryRmR: useKindOp1, + xmmRmiReg: useKindOp1Op2Reg, + xmmUnaryRmR: useKindOp1, + xmmUnaryRmRImm: useKindOp1, + xmmCmpRmR: useKindOp1Op2Reg, + xmmRmR: useKindOp1Op2Reg, + xmmRmRImm: useKindOp1Op2Reg, + mov64MR: useKindOp1, + movzxRmR: useKindOp1, + movsxRmR: useKindOp1, + gprToXmm: useKindOp1, + xmmToGpr: useKindOp1, + call: useKindCall, + callIndirect: useKindCallInd, + ud2: useKindNone, + jmpIf: useKindOp1, + jmp: useKindOp1, + cmpRmiR: useKindOp1Op2Reg, + exitSequence: useKindOp1, + lea: useKindOp1, + jmpTableIsland: useKindNone, + setcc: useKindNone, + zeros: useKindNone, + sourceOffsetInfo: useKindNone, + fcvtToSintSequence: useKindFcvtToSintSequence, + defineUninitializedReg: useKindNone, + fcvtToUintSequence: useKindFcvtToUintSequence, + xmmCMov: useKindOp1, + idivRemSequence: useKindDivRem, + blendvpd: useKindBlendvpd, + mfence: useKindNone, + xchg: useKindOp1RegOp2, + lockcmpxchg: useKindRaxOp1RegOp2, + lockxadd: useKindOp1RegOp2, + neg: useKindOp1, + nopUseReg: useKindOp1, +} + +func (u useKind) String() string { + switch u { + case useKindNone: + return "none" + case useKindOp1: + return "op1" + case useKindOp1Op2Reg: + return "op1op2Reg" + case useKindOp1RegOp2: + return "op1RegOp2" + case useKindCall: + return "call" + case useKindCallInd: + return "callInd" + default: + return "invalid" + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr_encoding.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr_encoding.go new file mode 100644 index 000000000..6637b428c --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/instr_encoding.go @@ -0,0 +1,1683 @@ +package amd64 + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +func (i *instruction) encode(c backend.Compiler) (needsLabelResolution bool) { + switch kind := i.kind; kind { + case nop0, sourceOffsetInfo, defineUninitializedReg, fcvtToSintSequence, fcvtToUintSequence, nopUseReg: + case ret: + encodeRet(c) + case imm: + dst := regEncodings[i.op2.reg().RealReg()] + con := i.u1 + if i.b1 { // 64 bit. + if lower32willSignExtendTo64(con) { + // Sign extend mov(imm32). 
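			// (Illustration: a constant such as 0xFFFF_FFFF_8000_0000 passes the
			// lower32willSignExtendTo64 check, so only its low four bytes are
			// emitted and the CPU's sign extension recreates the full value,
			// whereas 0x0000_0001_0000_0000 fails it and takes the 10-byte
			// movabsq path in the else branch below.)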
+ encodeRegReg(c, + legacyPrefixesNone, + 0xc7, 1, + 0, + dst, + rexInfo(0).setW(), + ) + c.Emit4Bytes(uint32(con)) + } else { + c.EmitByte(rexEncodingW | dst.rexBit()) + c.EmitByte(0xb8 | dst.encoding()) + c.Emit8Bytes(con) + } + } else { + if dst.rexBit() > 0 { + c.EmitByte(rexEncodingDefault | 0x1) + } + c.EmitByte(0xb8 | dst.encoding()) + c.Emit4Bytes(uint32(con)) + } + + case aluRmiR: + var rex rexInfo + if i.b1 { + rex = rex.setW() + } else { + rex = rex.clearW() + } + + dst := regEncodings[i.op2.reg().RealReg()] + + aluOp := aluRmiROpcode(i.u1) + if aluOp == aluRmiROpcodeMul { + op1 := i.op1 + const regMemOpc, regMemOpcNum = 0x0FAF, 2 + switch op1.kind { + case operandKindReg: + src := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, legacyPrefixesNone, regMemOpc, regMemOpcNum, dst, src, rex) + case operandKindMem: + m := i.op1.addressMode() + encodeRegMem(c, legacyPrefixesNone, regMemOpc, regMemOpcNum, dst, m, rex) + case operandKindImm32: + imm8 := lower8willSignExtendTo32(op1.imm32()) + var opc uint32 + if imm8 { + opc = 0x6b + } else { + opc = 0x69 + } + encodeRegReg(c, legacyPrefixesNone, opc, 1, dst, dst, rex) + if imm8 { + c.EmitByte(byte(op1.imm32())) + } else { + c.Emit4Bytes(op1.imm32()) + } + default: + panic("BUG: invalid operand kind") + } + } else { + const opcodeNum = 1 + var opcR, opcM, subOpcImm uint32 + switch aluOp { + case aluRmiROpcodeAdd: + opcR, opcM, subOpcImm = 0x01, 0x03, 0x0 + case aluRmiROpcodeSub: + opcR, opcM, subOpcImm = 0x29, 0x2b, 0x5 + case aluRmiROpcodeAnd: + opcR, opcM, subOpcImm = 0x21, 0x23, 0x4 + case aluRmiROpcodeOr: + opcR, opcM, subOpcImm = 0x09, 0x0b, 0x1 + case aluRmiROpcodeXor: + opcR, opcM, subOpcImm = 0x31, 0x33, 0x6 + default: + panic("BUG: invalid aluRmiROpcode") + } + + op1 := i.op1 + switch op1.kind { + case operandKindReg: + src := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, legacyPrefixesNone, opcR, opcodeNum, src, dst, rex) + case operandKindMem: + m := i.op1.addressMode() + encodeRegMem(c, legacyPrefixesNone, opcM, opcodeNum, dst, m, rex) + case operandKindImm32: + imm8 := lower8willSignExtendTo32(op1.imm32()) + var opc uint32 + if imm8 { + opc = 0x83 + } else { + opc = 0x81 + } + encodeRegReg(c, legacyPrefixesNone, opc, opcodeNum, regEnc(subOpcImm), dst, rex) + if imm8 { + c.EmitByte(byte(op1.imm32())) + } else { + c.Emit4Bytes(op1.imm32()) + } + default: + panic("BUG: invalid operand kind") + } + } + + case movRR: + src := regEncodings[i.op1.reg().RealReg()] + dst := regEncodings[i.op2.reg().RealReg()] + var rex rexInfo + if i.b1 { + rex = rex.setW() + } else { + rex = rex.clearW() + } + encodeRegReg(c, legacyPrefixesNone, 0x89, 1, src, dst, rex) + + case xmmRmR, blendvpd: + op := sseOpcode(i.u1) + var legPrex legacyPrefixes + var opcode uint32 + var opcodeNum uint32 + switch op { + case sseOpcodeAddps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F58, 2 + case sseOpcodeAddpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F58, 2 + case sseOpcodeAddss: + legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F58, 2 + case sseOpcodeAddsd: + legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F58, 2 + case sseOpcodeAndps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F54, 2 + case sseOpcodeAndpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F54, 2 + case sseOpcodeAndnps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F55, 2 + case sseOpcodeAndnpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F55, 2 + case sseOpcodeBlendvps: + legPrex, opcode, opcodeNum = 
legacyPrefixes0x66, 0x0F3814, 3 + case sseOpcodeBlendvpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3815, 3 + case sseOpcodeDivps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F5E, 2 + case sseOpcodeDivpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F5E, 2 + case sseOpcodeDivss: + legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5E, 2 + case sseOpcodeDivsd: + legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F5E, 2 + case sseOpcodeMaxps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F5F, 2 + case sseOpcodeMaxpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F5F, 2 + case sseOpcodeMaxss: + legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5F, 2 + case sseOpcodeMaxsd: + legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F5F, 2 + case sseOpcodeMinps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F5D, 2 + case sseOpcodeMinpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F5D, 2 + case sseOpcodeMinss: + legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5D, 2 + case sseOpcodeMinsd: + legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F5D, 2 + case sseOpcodeMovlhps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F16, 2 + case sseOpcodeMovsd: + legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F10, 2 + case sseOpcodeMulps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F59, 2 + case sseOpcodeMulpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F59, 2 + case sseOpcodeMulss: + legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F59, 2 + case sseOpcodeMulsd: + legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F59, 2 + case sseOpcodeOrpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F56, 2 + case sseOpcodeOrps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F56, 2 + case sseOpcodePackssdw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F6B, 2 + case sseOpcodePacksswb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F63, 2 + case sseOpcodePackusdw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F382B, 3 + case sseOpcodePackuswb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F67, 2 + case sseOpcodePaddb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FFC, 2 + case sseOpcodePaddd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FFE, 2 + case sseOpcodePaddq: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FD4, 2 + case sseOpcodePaddw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FFD, 2 + case sseOpcodePaddsb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FEC, 2 + case sseOpcodePaddsw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FED, 2 + case sseOpcodePaddusb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDC, 2 + case sseOpcodePaddusw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDD, 2 + case sseOpcodePand: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDB, 2 + case sseOpcodePandn: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDF, 2 + case sseOpcodePavgb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FE0, 2 + case sseOpcodePavgw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FE3, 2 + case sseOpcodePcmpeqb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F74, 2 + case sseOpcodePcmpeqw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F75, 2 + case sseOpcodePcmpeqd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F76, 2 + case sseOpcodePcmpeqq: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3829, 3 + case sseOpcodePcmpgtb: + legPrex, opcode, 
opcodeNum = legacyPrefixes0x66, 0x0F64, 2 + case sseOpcodePcmpgtw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F65, 2 + case sseOpcodePcmpgtd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F66, 2 + case sseOpcodePcmpgtq: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3837, 3 + case sseOpcodePmaddwd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FF5, 2 + case sseOpcodePmaxsb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383C, 3 + case sseOpcodePmaxsw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FEE, 2 + case sseOpcodePmaxsd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383D, 3 + case sseOpcodePmaxub: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDE, 2 + case sseOpcodePmaxuw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383E, 3 + case sseOpcodePmaxud: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383F, 3 + case sseOpcodePminsb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3838, 3 + case sseOpcodePminsw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FEA, 2 + case sseOpcodePminsd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3839, 3 + case sseOpcodePminub: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FDA, 2 + case sseOpcodePminuw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383A, 3 + case sseOpcodePminud: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F383B, 3 + case sseOpcodePmulld: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3840, 3 + case sseOpcodePmullw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FD5, 2 + case sseOpcodePmuludq: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FF4, 2 + case sseOpcodePor: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FEB, 2 + case sseOpcodePshufb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3800, 3 + case sseOpcodePsubb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FF8, 2 + case sseOpcodePsubd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FFA, 2 + case sseOpcodePsubq: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FFB, 2 + case sseOpcodePsubw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FF9, 2 + case sseOpcodePsubsb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FE8, 2 + case sseOpcodePsubsw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FE9, 2 + case sseOpcodePsubusb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FD8, 2 + case sseOpcodePsubusw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FD9, 2 + case sseOpcodePunpckhbw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F68, 2 + case sseOpcodePunpcklbw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F60, 2 + case sseOpcodePxor: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FEF, 2 + case sseOpcodeSubps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F5C, 2 + case sseOpcodeSubpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F5C, 2 + case sseOpcodeSubss: + legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5C, 2 + case sseOpcodeSubsd: + legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F5C, 2 + case sseOpcodeXorps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F57, 2 + case sseOpcodeXorpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F57, 2 + case sseOpcodePmulhrsw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F380B, 3 + case sseOpcodeUnpcklps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0F14, 2 + case sseOpcodePmaddubsw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 
0x0F3804, 3 + default: + if kind == blendvpd { + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3815, 3 + } else { + panic(fmt.Sprintf("Unsupported sseOpcode: %s", op)) + } + } + + dst := regEncodings[i.op2.reg().RealReg()] + + rex := rexInfo(0).clearW() + op1 := i.op1 + if op1.kind == operandKindReg { + src := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, legPrex, opcode, opcodeNum, dst, src, rex) + } else if i.op1.kind == operandKindMem { + m := i.op1.addressMode() + encodeRegMem(c, legPrex, opcode, opcodeNum, dst, m, rex) + } else { + panic("BUG: invalid operand kind") + } + + case gprToXmm: + var legPrefix legacyPrefixes + var opcode uint32 + const opcodeNum = 2 + switch sseOpcode(i.u1) { + case sseOpcodeMovd, sseOpcodeMovq: + legPrefix, opcode = legacyPrefixes0x66, 0x0f6e + case sseOpcodeCvtsi2ss: + legPrefix, opcode = legacyPrefixes0xF3, 0x0f2a + case sseOpcodeCvtsi2sd: + legPrefix, opcode = legacyPrefixes0xF2, 0x0f2a + default: + panic(fmt.Sprintf("Unsupported sseOpcode: %s", sseOpcode(i.u1))) + } + + var rex rexInfo + if i.b1 { + rex = rex.setW() + } else { + rex = rex.clearW() + } + dst := regEncodings[i.op2.reg().RealReg()] + + op1 := i.op1 + if op1.kind == operandKindReg { + src := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, legPrefix, opcode, opcodeNum, dst, src, rex) + } else if i.op1.kind == operandKindMem { + m := i.op1.addressMode() + encodeRegMem(c, legPrefix, opcode, opcodeNum, dst, m, rex) + } else { + panic("BUG: invalid operand kind") + } + + case xmmUnaryRmR: + var prefix legacyPrefixes + var opcode uint32 + var opcodeNum uint32 + op := sseOpcode(i.u1) + switch op { + case sseOpcodeCvtss2sd: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5A, 2 + case sseOpcodeCvtsd2ss: + prefix, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F5A, 2 + case sseOpcodeMovaps: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F28, 2 + case sseOpcodeMovapd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F28, 2 + case sseOpcodeMovdqa: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F6F, 2 + case sseOpcodeMovdqu: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F6F, 2 + case sseOpcodeMovsd: + prefix, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F10, 2 + case sseOpcodeMovss: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F10, 2 + case sseOpcodeMovups: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F10, 2 + case sseOpcodeMovupd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F10, 2 + case sseOpcodePabsb: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F381C, 3 + case sseOpcodePabsw: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F381D, 3 + case sseOpcodePabsd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F381E, 3 + case sseOpcodePmovsxbd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3821, 3 + case sseOpcodePmovsxbw: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3820, 3 + case sseOpcodePmovsxbq: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3822, 3 + case sseOpcodePmovsxwd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3823, 3 + case sseOpcodePmovsxwq: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3824, 3 + case sseOpcodePmovsxdq: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3825, 3 + case sseOpcodePmovzxbd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3831, 3 + case sseOpcodePmovzxbw: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3830, 3 + case sseOpcodePmovzxbq: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3832, 3 + case sseOpcodePmovzxwd: + 
prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3833, 3 + case sseOpcodePmovzxwq: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3834, 3 + case sseOpcodePmovzxdq: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3835, 3 + case sseOpcodeSqrtps: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F51, 2 + case sseOpcodeSqrtpd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F51, 2 + case sseOpcodeSqrtss: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F51, 2 + case sseOpcodeSqrtsd: + prefix, opcode, opcodeNum = legacyPrefixes0xF2, 0x0F51, 2 + case sseOpcodeXorps: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F57, 2 + case sseOpcodeXorpd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F57, 2 + case sseOpcodeCvtdq2ps: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F5B, 2 + case sseOpcodeCvtdq2pd: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0FE6, 2 + case sseOpcodeCvtps2pd: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0F5A, 2 + case sseOpcodeCvtpd2ps: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0F5A, 2 + case sseOpcodeCvttps2dq: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0F5B, 2 + case sseOpcodeCvttpd2dq: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0FE6, 2 + default: + panic(fmt.Sprintf("Unsupported sseOpcode: %s", op)) + } + + dst := regEncodings[i.op2.reg().RealReg()] + + rex := rexInfo(0).clearW() + op1 := i.op1 + if op1.kind == operandKindReg { + src := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, prefix, opcode, opcodeNum, dst, src, rex) + } else if i.op1.kind == operandKindMem { + m := i.op1.addressMode() + needsLabelResolution = encodeRegMem(c, prefix, opcode, opcodeNum, dst, m, rex) + } else { + panic("BUG: invalid operand kind") + } + + case xmmUnaryRmRImm: + var prefix legacyPrefixes + var opcode uint32 + var opcodeNum uint32 + op := sseOpcode(i.u1) + switch op { + case sseOpcodeRoundps: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f3a08, 3 + case sseOpcodeRoundss: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f3a0a, 3 + case sseOpcodeRoundpd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f3a09, 3 + case sseOpcodeRoundsd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f3a0b, 3 + } + rex := rexInfo(0).clearW() + dst := regEncodings[i.op2.reg().RealReg()] + op1 := i.op1 + if op1.kind == operandKindReg { + src := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, prefix, opcode, opcodeNum, dst, src, rex) + } else if i.op1.kind == operandKindMem { + m := i.op1.addressMode() + encodeRegMem(c, prefix, opcode, opcodeNum, dst, m, rex) + } else { + panic("BUG: invalid operand kind") + } + + c.EmitByte(byte(i.u2)) + + case unaryRmR: + var prefix legacyPrefixes + var opcode uint32 + var opcodeNum uint32 + op := unaryRmROpcode(i.u1) + // We assume size is either 32 or 64. + switch op { + case unaryRmROpcodeBsr: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0fbd, 2 + case unaryRmROpcodeBsf: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0fbc, 2 + case unaryRmROpcodeLzcnt: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0fbd, 2 + case unaryRmROpcodeTzcnt: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0fbc, 2 + case unaryRmROpcodePopcnt: + prefix, opcode, opcodeNum = legacyPrefixes0xF3, 0x0fb8, 2 + default: + panic(fmt.Sprintf("Unsupported unaryRmROpcode: %s", op)) + } + + dst := regEncodings[i.op2.reg().RealReg()] + + rex := rexInfo(0) + if i.b1 { // 64 bit. 
+ rex = rexInfo(0).setW() + } else { + rex = rexInfo(0).clearW() + } + op1 := i.op1 + if op1.kind == operandKindReg { + src := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, prefix, opcode, opcodeNum, dst, src, rex) + } else if i.op1.kind == operandKindMem { + m := i.op1.addressMode() + encodeRegMem(c, prefix, opcode, opcodeNum, dst, m, rex) + } else { + panic("BUG: invalid operand kind") + } + + case not: + var prefix legacyPrefixes + src := regEncodings[i.op1.reg().RealReg()] + rex := rexInfo(0) + if i.b1 { // 64 bit. + rex = rexInfo(0).setW() + } else { + rex = rexInfo(0).clearW() + } + subopcode := uint8(2) + encodeEncEnc(c, prefix, 0xf7, 1, subopcode, uint8(src), rex) + + case neg: + var prefix legacyPrefixes + src := regEncodings[i.op1.reg().RealReg()] + rex := rexInfo(0) + if i.b1 { // 64 bit. + rex = rexInfo(0).setW() + } else { + rex = rexInfo(0).clearW() + } + subopcode := uint8(3) + encodeEncEnc(c, prefix, 0xf7, 1, subopcode, uint8(src), rex) + + case div: + rex := rexInfo(0) + if i.b1 { // 64 bit. + rex = rexInfo(0).setW() + } else { + rex = rexInfo(0).clearW() + } + var subopcode uint8 + if i.u1 != 0 { // Signed. + subopcode = 7 + } else { + subopcode = 6 + } + + divisor := i.op1 + if divisor.kind == operandKindReg { + src := regEncodings[divisor.reg().RealReg()] + encodeEncEnc(c, legacyPrefixesNone, 0xf7, 1, subopcode, uint8(src), rex) + } else if divisor.kind == operandKindMem { + m := divisor.addressMode() + encodeEncMem(c, legacyPrefixesNone, 0xf7, 1, subopcode, m, rex) + } else { + panic("BUG: invalid operand kind") + } + + case mulHi: + var prefix legacyPrefixes + rex := rexInfo(0) + if i.b1 { // 64 bit. + rex = rexInfo(0).setW() + } else { + rex = rexInfo(0).clearW() + } + + signed := i.u1 != 0 + var subopcode uint8 + if signed { + subopcode = 5 + } else { + subopcode = 4 + } + + // src1 is implicitly rax, + // dst_lo is implicitly rax, + // dst_hi is implicitly rdx. + src2 := i.op1 + if src2.kind == operandKindReg { + src := regEncodings[src2.reg().RealReg()] + encodeEncEnc(c, prefix, 0xf7, 1, subopcode, uint8(src), rex) + } else if src2.kind == operandKindMem { + m := src2.addressMode() + encodeEncMem(c, prefix, 0xf7, 1, subopcode, m, rex) + } else { + panic("BUG: invalid operand kind") + } + + case signExtendData: + if i.b1 { // 64 bit. 
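+ // 0x48 is a REX prefix with only the W bit set, so 0x48 0x99 encodes CQO
+ // (sign-extend RAX into RDX:RAX); a bare 0x99 encodes CDQ (EAX into EDX:EAX).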
+ c.EmitByte(0x48) + c.EmitByte(0x99) + } else { + c.EmitByte(0x99) + } + case movzxRmR, movsxRmR: + signed := i.kind == movsxRmR + + ext := extMode(i.u1) + var opcode uint32 + var opcodeNum uint32 + var rex rexInfo + switch ext { + case extModeBL: + if signed { + opcode, opcodeNum, rex = 0x0fbe, 2, rex.clearW() + } else { + opcode, opcodeNum, rex = 0x0fb6, 2, rex.clearW() + } + case extModeBQ: + if signed { + opcode, opcodeNum, rex = 0x0fbe, 2, rex.setW() + } else { + opcode, opcodeNum, rex = 0x0fb6, 2, rex.setW() + } + case extModeWL: + if signed { + opcode, opcodeNum, rex = 0x0fbf, 2, rex.clearW() + } else { + opcode, opcodeNum, rex = 0x0fb7, 2, rex.clearW() + } + case extModeWQ: + if signed { + opcode, opcodeNum, rex = 0x0fbf, 2, rex.setW() + } else { + opcode, opcodeNum, rex = 0x0fb7, 2, rex.setW() + } + case extModeLQ: + if signed { + opcode, opcodeNum, rex = 0x63, 1, rex.setW() + } else { + opcode, opcodeNum, rex = 0x8b, 1, rex.clearW() + } + default: + panic("BUG: invalid extMode") + } + + op := i.op1 + dst := regEncodings[i.op2.reg().RealReg()] + switch op.kind { + case operandKindReg: + src := regEncodings[op.reg().RealReg()] + if ext == extModeBL || ext == extModeBQ { + // Some destinations must be encoded with REX.R = 1. + if e := src.encoding(); e >= 4 && e <= 7 { + rex = rex.always() + } + } + encodeRegReg(c, legacyPrefixesNone, opcode, opcodeNum, dst, src, rex) + case operandKindMem: + m := op.addressMode() + encodeRegMem(c, legacyPrefixesNone, opcode, opcodeNum, dst, m, rex) + default: + panic("BUG: invalid operand kind") + } + + case mov64MR: + m := i.op1.addressMode() + encodeLoad64(c, m, i.op2.reg().RealReg()) + + case lea: + needsLabelResolution = true + dst := regEncodings[i.op2.reg().RealReg()] + rex := rexInfo(0).setW() + const opcode, opcodeNum = 0x8d, 1 + switch i.op1.kind { + case operandKindMem: + a := i.op1.addressMode() + encodeRegMem(c, legacyPrefixesNone, opcode, opcodeNum, dst, a, rex) + case operandKindLabel: + rex.encode(c, regRexBit(byte(dst)), 0) + c.EmitByte(byte((opcode) & 0xff)) + + // Indicate "LEAQ [RIP + 32bit displacement]. + // https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing + c.EmitByte(encodeModRM(0b00, dst.encoding(), 0b101)) + + // This will be resolved later, so we just emit a placeholder (0xffffffff for testing). + c.Emit4Bytes(0xffffffff) + default: + panic("BUG: invalid operand kind") + } + + case movRM: + m := i.op2.addressMode() + src := regEncodings[i.op1.reg().RealReg()] + + var rex rexInfo + switch i.u1 { + case 1: + if e := src.encoding(); e >= 4 && e <= 7 { + rex = rex.always() + } + encodeRegMem(c, legacyPrefixesNone, 0x88, 1, src, m, rex.clearW()) + case 2: + encodeRegMem(c, legacyPrefixes0x66, 0x89, 1, src, m, rex.clearW()) + case 4: + encodeRegMem(c, legacyPrefixesNone, 0x89, 1, src, m, rex.clearW()) + case 8: + encodeRegMem(c, legacyPrefixesNone, 0x89, 1, src, m, rex.setW()) + default: + panic(fmt.Sprintf("BUG: invalid size %d: %s", i.u1, i.String())) + } + + case shiftR: + src := regEncodings[i.op2.reg().RealReg()] + amount := i.op1 + + var opcode uint32 + var prefix legacyPrefixes + rex := rexInfo(0) + if i.b1 { // 64 bit. 
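+ // REX.W picks the 64-bit form of the shift; the shift kind itself is encoded as the
+ // ModRM /digit, which is why i.u1 is passed as the reg operand to encodeEncEnc below.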
+ rex = rexInfo(0).setW() + } else { + rex = rexInfo(0).clearW() + } + + switch amount.kind { + case operandKindReg: + if amount.reg() != rcxVReg { + panic("BUG: invalid reg operand: must be rcx") + } + opcode, prefix = 0xd3, legacyPrefixesNone + encodeEncEnc(c, prefix, opcode, 1, uint8(i.u1), uint8(src), rex) + case operandKindImm32: + opcode, prefix = 0xc1, legacyPrefixesNone + encodeEncEnc(c, prefix, opcode, 1, uint8(i.u1), uint8(src), rex) + c.EmitByte(byte(amount.imm32())) + default: + panic("BUG: invalid operand kind") + } + case xmmRmiReg: + const legPrefix = legacyPrefixes0x66 + rex := rexInfo(0).clearW() + dst := regEncodings[i.op2.reg().RealReg()] + + var opcode uint32 + var regDigit uint8 + + op := sseOpcode(i.u1) + op1 := i.op1 + if i.op1.kind == operandKindImm32 { + switch op { + case sseOpcodePsllw: + opcode, regDigit = 0x0f71, 6 + case sseOpcodePslld: + opcode, regDigit = 0x0f72, 6 + case sseOpcodePsllq: + opcode, regDigit = 0x0f73, 6 + case sseOpcodePsraw: + opcode, regDigit = 0x0f71, 4 + case sseOpcodePsrad: + opcode, regDigit = 0x0f72, 4 + case sseOpcodePsrlw: + opcode, regDigit = 0x0f71, 2 + case sseOpcodePsrld: + opcode, regDigit = 0x0f72, 2 + case sseOpcodePsrlq: + opcode, regDigit = 0x0f73, 2 + default: + panic("invalid opcode") + } + + encodeEncEnc(c, legPrefix, opcode, 2, regDigit, uint8(dst), rex) + imm32 := op1.imm32() + if imm32 > 0xff&imm32 { + panic("immediate value does not fit 1 byte") + } + c.EmitByte(uint8(imm32)) + } else { + switch op { + case sseOpcodePsllw: + opcode = 0x0ff1 + case sseOpcodePslld: + opcode = 0x0ff2 + case sseOpcodePsllq: + opcode = 0x0ff3 + case sseOpcodePsraw: + opcode = 0x0fe1 + case sseOpcodePsrad: + opcode = 0x0fe2 + case sseOpcodePsrlw: + opcode = 0x0fd1 + case sseOpcodePsrld: + opcode = 0x0fd2 + case sseOpcodePsrlq: + opcode = 0x0fd3 + default: + panic("invalid opcode") + } + + if op1.kind == operandKindReg { + reg := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, legPrefix, opcode, 2, dst, reg, rex) + } else if op1.kind == operandKindMem { + m := op1.addressMode() + encodeRegMem(c, legPrefix, opcode, 2, dst, m, rex) + } else { + panic("BUG: invalid operand kind") + } + } + + case cmpRmiR: + var opcode uint32 + isCmp := i.u1 != 0 + rex := rexInfo(0) + _64 := i.b1 + if _64 { // 64 bit. + rex = rex.setW() + } else { + rex = rex.clearW() + } + dst := regEncodings[i.op2.reg().RealReg()] + op1 := i.op1 + switch op1.kind { + case operandKindReg: + reg := regEncodings[op1.reg().RealReg()] + if isCmp { + opcode = 0x39 + } else { + opcode = 0x85 + } + // Here we swap the encoding of the operands for CMP to be consistent with the output of LLVM/GCC. 
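+ // With opcode 0x39 (CMP r/m, r), op1 lands in ModRM.reg and dst in ModRM.rm,
+ // which is the reg-reg form GCC/LLVM emit.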
+ encodeRegReg(c, legacyPrefixesNone, opcode, 1, reg, dst, rex) + + case operandKindMem: + if isCmp { + opcode = 0x3b + } else { + opcode = 0x85 + } + m := op1.addressMode() + encodeRegMem(c, legacyPrefixesNone, opcode, 1, dst, m, rex) + + case operandKindImm32: + imm32 := op1.imm32() + useImm8 := isCmp && lower8willSignExtendTo32(imm32) + var subopcode uint8 + + switch { + case isCmp && useImm8: + opcode, subopcode = 0x83, 7 + case isCmp && !useImm8: + opcode, subopcode = 0x81, 7 + default: + opcode, subopcode = 0xf7, 0 + } + encodeEncEnc(c, legacyPrefixesNone, opcode, 1, subopcode, uint8(dst), rex) + if useImm8 { + c.EmitByte(uint8(imm32)) + } else { + c.Emit4Bytes(imm32) + } + + default: + panic("BUG: invalid operand kind") + } + case setcc: + cc := cond(i.u1) + dst := regEncodings[i.op2.reg().RealReg()] + rex := rexInfo(0).clearW().always() + opcode := uint32(0x0f90) + uint32(cc) + encodeEncEnc(c, legacyPrefixesNone, opcode, 2, 0, uint8(dst), rex) + case cmove: + cc := cond(i.u1) + dst := regEncodings[i.op2.reg().RealReg()] + rex := rexInfo(0) + if i.b1 { // 64 bit. + rex = rex.setW() + } else { + rex = rex.clearW() + } + opcode := uint32(0x0f40) + uint32(cc) + src := i.op1 + switch src.kind { + case operandKindReg: + srcReg := regEncodings[src.reg().RealReg()] + encodeRegReg(c, legacyPrefixesNone, opcode, 2, dst, srcReg, rex) + case operandKindMem: + m := src.addressMode() + encodeRegMem(c, legacyPrefixesNone, opcode, 2, dst, m, rex) + default: + panic("BUG: invalid operand kind") + } + case push64: + op := i.op1 + + switch op.kind { + case operandKindReg: + dst := regEncodings[op.reg().RealReg()] + if dst.rexBit() > 0 { + c.EmitByte(rexEncodingDefault | 0x1) + } + c.EmitByte(0x50 | dst.encoding()) + case operandKindMem: + m := op.addressMode() + encodeRegMem( + c, legacyPrefixesNone, 0xff, 1, regEnc(6), m, rexInfo(0).clearW(), + ) + case operandKindImm32: + c.EmitByte(0x68) + c.Emit4Bytes(op.imm32()) + default: + panic("BUG: invalid operand kind") + } + + case pop64: + dst := regEncodings[i.op1.reg().RealReg()] + if dst.rexBit() > 0 { + c.EmitByte(rexEncodingDefault | 0x1) + } + c.EmitByte(0x58 | dst.encoding()) + + case xmmMovRM: + var legPrefix legacyPrefixes + var opcode uint32 + const opcodeNum = 2 + switch sseOpcode(i.u1) { + case sseOpcodeMovaps: + legPrefix, opcode = legacyPrefixesNone, 0x0f29 + case sseOpcodeMovapd: + legPrefix, opcode = legacyPrefixes0x66, 0x0f29 + case sseOpcodeMovdqa: + legPrefix, opcode = legacyPrefixes0x66, 0x0f7f + case sseOpcodeMovdqu: + legPrefix, opcode = legacyPrefixes0xF3, 0x0f7f + case sseOpcodeMovss: + legPrefix, opcode = legacyPrefixes0xF3, 0x0f11 + case sseOpcodeMovsd: + legPrefix, opcode = legacyPrefixes0xF2, 0x0f11 + case sseOpcodeMovups: + legPrefix, opcode = legacyPrefixesNone, 0x0f11 + case sseOpcodeMovupd: + legPrefix, opcode = legacyPrefixes0x66, 0x0f11 + default: + panic(fmt.Sprintf("Unsupported sseOpcode: %s", sseOpcode(i.u1))) + } + + dst := regEncodings[i.op1.reg().RealReg()] + encodeRegMem(c, legPrefix, opcode, opcodeNum, dst, i.op2.addressMode(), rexInfo(0).clearW()) + case xmmLoadConst: + panic("TODO") + case xmmToGpr: + var legPrefix legacyPrefixes + var opcode uint32 + var argSwap bool + const opcodeNum = 2 + switch sseOpcode(i.u1) { + case sseOpcodeMovd, sseOpcodeMovq: + legPrefix, opcode, argSwap = legacyPrefixes0x66, 0x0f7e, false + case sseOpcodeMovmskps: + legPrefix, opcode, argSwap = legacyPrefixesNone, 0x0f50, true + case sseOpcodeMovmskpd: + legPrefix, opcode, argSwap = legacyPrefixes0x66, 0x0f50, true + case 
sseOpcodePmovmskb: + legPrefix, opcode, argSwap = legacyPrefixes0x66, 0x0fd7, true + case sseOpcodeCvttss2si: + legPrefix, opcode, argSwap = legacyPrefixes0xF3, 0x0f2c, true + case sseOpcodeCvttsd2si: + legPrefix, opcode, argSwap = legacyPrefixes0xF2, 0x0f2c, true + default: + panic(fmt.Sprintf("Unsupported sseOpcode: %s", sseOpcode(i.u1))) + } + + var rex rexInfo + if i.b1 { + rex = rex.setW() + } else { + rex = rex.clearW() + } + src := regEncodings[i.op1.reg().RealReg()] + dst := regEncodings[i.op2.reg().RealReg()] + if argSwap { + src, dst = dst, src + } + encodeRegReg(c, legPrefix, opcode, opcodeNum, src, dst, rex) + + case cvtUint64ToFloatSeq: + panic("TODO") + case cvtFloatToSintSeq: + panic("TODO") + case cvtFloatToUintSeq: + panic("TODO") + case xmmMinMaxSeq: + panic("TODO") + case xmmCmpRmR: + var prefix legacyPrefixes + var opcode uint32 + var opcodeNum uint32 + rex := rexInfo(0) + _64 := i.b1 + if _64 { // 64 bit. + rex = rex.setW() + } else { + rex = rex.clearW() + } + + op := sseOpcode(i.u1) + switch op { + case sseOpcodePtest: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f3817, 3 + case sseOpcodeUcomisd: + prefix, opcode, opcodeNum = legacyPrefixes0x66, 0x0f2e, 2 + case sseOpcodeUcomiss: + prefix, opcode, opcodeNum = legacyPrefixesNone, 0x0f2e, 2 + default: + panic(fmt.Sprintf("Unsupported sseOpcode: %s", op)) + } + + dst := regEncodings[i.op2.reg().RealReg()] + op1 := i.op1 + switch op1.kind { + case operandKindReg: + reg := regEncodings[op1.reg().RealReg()] + encodeRegReg(c, prefix, opcode, opcodeNum, dst, reg, rex) + + case operandKindMem: + m := op1.addressMode() + encodeRegMem(c, prefix, opcode, opcodeNum, dst, m, rex) + + default: + panic("BUG: invalid operand kind") + } + case xmmRmRImm: + op := sseOpcode(i.u1) + var legPrex legacyPrefixes + var opcode uint32 + var opcodeNum uint32 + var swap bool + switch op { + case sseOpcodeCmpps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0FC2, 2 + case sseOpcodeCmppd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FC2, 2 + case sseOpcodeCmpss: + legPrex, opcode, opcodeNum = legacyPrefixes0xF3, 0x0FC2, 2 + case sseOpcodeCmpsd: + legPrex, opcode, opcodeNum = legacyPrefixes0xF2, 0x0FC2, 2 + case sseOpcodeInsertps: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A21, 3 + case sseOpcodePalignr: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A0F, 3 + case sseOpcodePinsrb: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A20, 3 + case sseOpcodePinsrw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FC4, 2 + case sseOpcodePinsrd, sseOpcodePinsrq: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A22, 3 + case sseOpcodePextrb: + swap = true + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A14, 3 + case sseOpcodePextrw: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0FC5, 2 + case sseOpcodePextrd, sseOpcodePextrq: + swap = true + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A16, 3 + case sseOpcodePshufd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F70, 2 + case sseOpcodeRoundps: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A08, 3 + case sseOpcodeRoundpd: + legPrex, opcode, opcodeNum = legacyPrefixes0x66, 0x0F3A09, 3 + case sseOpcodeShufps: + legPrex, opcode, opcodeNum = legacyPrefixesNone, 0x0FC6, 2 + default: + panic(fmt.Sprintf("Unsupported sseOpcode: %s", op)) + } + + dst := regEncodings[i.op2.reg().RealReg()] + + var rex rexInfo + if op == sseOpcodePextrq || op == sseOpcodePinsrq { + rex = rexInfo(0).setW() + } else { + rex = 
rexInfo(0).clearW() + } + op1 := i.op1 + if op1.kind == operandKindReg { + src := regEncodings[op1.reg().RealReg()] + if swap { + src, dst = dst, src + } + encodeRegReg(c, legPrex, opcode, opcodeNum, dst, src, rex) + } else if i.op1.kind == operandKindMem { + if swap { + panic("BUG: this is not possible to encode") + } + m := i.op1.addressMode() + encodeRegMem(c, legPrex, opcode, opcodeNum, dst, m, rex) + } else { + panic("BUG: invalid operand kind") + } + + c.EmitByte(byte(i.u2)) + + case jmp: + const ( + regMemOpcode = 0xff + regMemOpcodeNum = 1 + regMemSubOpcode = 4 + ) + op := i.op1 + switch op.kind { + case operandKindLabel: + needsLabelResolution = true + fallthrough + case operandKindImm32: + c.EmitByte(0xe9) + c.Emit4Bytes(op.imm32()) + case operandKindMem: + m := op.addressMode() + encodeRegMem(c, + legacyPrefixesNone, + regMemOpcode, regMemOpcodeNum, + regMemSubOpcode, m, rexInfo(0).clearW(), + ) + case operandKindReg: + r := op.reg().RealReg() + encodeRegReg( + c, + legacyPrefixesNone, + regMemOpcode, regMemOpcodeNum, + regMemSubOpcode, + regEncodings[r], rexInfo(0).clearW(), + ) + default: + panic("BUG: invalid operand kind") + } + + case jmpIf: + op := i.op1 + switch op.kind { + case operandKindLabel: + needsLabelResolution = true + fallthrough + case operandKindImm32: + c.EmitByte(0x0f) + c.EmitByte(0x80 | cond(i.u1).encoding()) + c.Emit4Bytes(op.imm32()) + default: + panic("BUG: invalid operand kind") + } + + case jmpTableIsland: + needsLabelResolution = true + for tc := uint64(0); tc < i.u2; tc++ { + c.Emit8Bytes(0) + } + + case exitSequence: + execCtx := i.op1.reg() + allocatedAmode := i.op2.addressMode() + + // Restore the RBP, RSP, and return to the Go code: + *allocatedAmode = amode{ + kindWithShift: uint32(amodeImmReg), base: execCtx, + imm32: wazevoapi.ExecutionContextOffsetOriginalFramePointer.U32(), + } + encodeLoad64(c, allocatedAmode, rbp) + allocatedAmode.imm32 = wazevoapi.ExecutionContextOffsetOriginalStackPointer.U32() + encodeLoad64(c, allocatedAmode, rsp) + encodeRet(c) + + case ud2: + c.EmitByte(0x0f) + c.EmitByte(0x0b) + + case call: + c.EmitByte(0xe8) + // Meaning that the call target is a function value, and requires relocation. + c.AddRelocationInfo(ssa.FuncRef(i.u1)) + // Note that this is zero as a placeholder for the call target if it's a function value. + c.Emit4Bytes(uint32(i.u2)) + + case callIndirect: + op := i.op1 + + const opcodeNum = 1 + const opcode = 0xff + rex := rexInfo(0).clearW() + switch op.kind { + case operandKindReg: + dst := regEncodings[op.reg().RealReg()] + encodeRegReg(c, + legacyPrefixesNone, + opcode, opcodeNum, + regEnc(2), + dst, + rex, + ) + case operandKindMem: + m := op.addressMode() + encodeRegMem(c, + legacyPrefixesNone, + opcode, opcodeNum, + regEnc(2), + m, + rex, + ) + default: + panic("BUG: invalid operand kind") + } + + case xchg: + src, dst := regEncodings[i.op1.reg().RealReg()], i.op2 + size := i.u1 + + var rex rexInfo + var opcode uint32 + lp := legacyPrefixesNone + switch size { + case 8: + opcode = 0x87 + rex = rexInfo(0).setW() + case 4: + opcode = 0x87 + rex = rexInfo(0).clearW() + case 2: + lp = legacyPrefixes0x66 + opcode = 0x87 + rex = rexInfo(0).clearW() + case 1: + opcode = 0x86 + if i.op2.kind == operandKindReg { + panic("TODO?: xchg on two 1-byte registers") + } + // Some destinations must be encoded with REX.R = 1. 
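+ // Without any REX prefix, encodings 4-7 select AH/CH/DH/BH; forcing an (otherwise
+ // empty) REX prefix makes them select SPL/BPL/SIL/DIL instead.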
+ if e := src.encoding(); e >= 4 && e <= 7 { + rex = rexInfo(0).always() + } + default: + panic(fmt.Sprintf("BUG: invalid size %d: %s", size, i.String())) + } + + switch dst.kind { + case operandKindMem: + m := dst.addressMode() + encodeRegMem(c, lp, opcode, 1, src, m, rex) + case operandKindReg: + r := dst.reg().RealReg() + encodeRegReg(c, lp, opcode, 1, src, regEncodings[r], rex) + default: + panic("BUG: invalid operand kind") + } + + case lockcmpxchg: + src, dst := regEncodings[i.op1.reg().RealReg()], i.op2 + size := i.u1 + + var rex rexInfo + var opcode uint32 + lp := legacyPrefixes0xF0 // Lock prefix. + switch size { + case 8: + opcode = 0x0FB1 + rex = rexInfo(0).setW() + case 4: + opcode = 0x0FB1 + rex = rexInfo(0).clearW() + case 2: + lp = legacyPrefixes0x660xF0 // Legacy prefix + Lock prefix. + opcode = 0x0FB1 + rex = rexInfo(0).clearW() + case 1: + opcode = 0x0FB0 + // Some destinations must be encoded with REX.R = 1. + if e := src.encoding(); e >= 4 && e <= 7 { + rex = rexInfo(0).always() + } + default: + panic(fmt.Sprintf("BUG: invalid size %d: %s", size, i.String())) + } + + switch dst.kind { + case operandKindMem: + m := dst.addressMode() + encodeRegMem(c, lp, opcode, 2, src, m, rex) + default: + panic("BUG: invalid operand kind") + } + + case lockxadd: + src, dst := regEncodings[i.op1.reg().RealReg()], i.op2 + size := i.u1 + + var rex rexInfo + var opcode uint32 + lp := legacyPrefixes0xF0 // Lock prefix. + switch size { + case 8: + opcode = 0x0FC1 + rex = rexInfo(0).setW() + case 4: + opcode = 0x0FC1 + rex = rexInfo(0).clearW() + case 2: + lp = legacyPrefixes0x660xF0 // Legacy prefix + Lock prefix. + opcode = 0x0FC1 + rex = rexInfo(0).clearW() + case 1: + opcode = 0x0FC0 + // Some destinations must be encoded with REX.R = 1. + if e := src.encoding(); e >= 4 && e <= 7 { + rex = rexInfo(0).always() + } + default: + panic(fmt.Sprintf("BUG: invalid size %d: %s", size, i.String())) + } + + switch dst.kind { + case operandKindMem: + m := dst.addressMode() + encodeRegMem(c, lp, opcode, 2, src, m, rex) + default: + panic("BUG: invalid operand kind") + } + + case zeros: + r := i.op2.reg() + if r.RegType() == regalloc.RegTypeInt { + i.asAluRmiR(aluRmiROpcodeXor, newOperandReg(r), r, true) + } else { + i.asXmmRmR(sseOpcodePxor, newOperandReg(r), r) + } + i.encode(c) + + case mfence: + // https://www.felixcloutier.com/x86/mfence + c.EmitByte(0x0f) + c.EmitByte(0xae) + c.EmitByte(0xf0) + + default: + panic(fmt.Sprintf("TODO: %v", i.kind)) + } + return +} + +func encodeLoad64(c backend.Compiler, m *amode, rd regalloc.RealReg) { + dst := regEncodings[rd] + encodeRegMem(c, legacyPrefixesNone, 0x8b, 1, dst, m, rexInfo(0).setW()) +} + +func encodeRet(c backend.Compiler) { + c.EmitByte(0xc3) +} + +func encodeEncEnc( + c backend.Compiler, + legPrefixes legacyPrefixes, + opcodes uint32, + opcodeNum uint32, + r uint8, + rm uint8, + rex rexInfo, +) { + legPrefixes.encode(c) + rex.encode(c, r>>3, rm>>3) + + for opcodeNum > 0 { + opcodeNum-- + c.EmitByte(byte((opcodes >> (opcodeNum << 3)) & 0xff)) + } + c.EmitByte(encodeModRM(3, r&7, rm&7)) +} + +func encodeRegReg( + c backend.Compiler, + legPrefixes legacyPrefixes, + opcodes uint32, + opcodeNum uint32, + r regEnc, + rm regEnc, + rex rexInfo, +) { + encodeEncEnc(c, legPrefixes, opcodes, opcodeNum, uint8(r), uint8(rm), rex) +} + +func encodeModRM(mod byte, reg byte, rm byte) byte { + return mod<<6 | reg<<3 | rm +} + +func encodeSIB(shift byte, encIndex byte, encBase byte) byte { + return shift<<6 | encIndex<<3 | encBase +} + +func encodeRegMem( + c 
backend.Compiler, legPrefixes legacyPrefixes, opcodes uint32, opcodeNum uint32, r regEnc, m *amode, rex rexInfo, +) (needsLabelResolution bool) { + needsLabelResolution = encodeEncMem(c, legPrefixes, opcodes, opcodeNum, uint8(r), m, rex) + return +} + +func encodeEncMem( + c backend.Compiler, legPrefixes legacyPrefixes, opcodes uint32, opcodeNum uint32, r uint8, m *amode, rex rexInfo, +) (needsLabelResolution bool) { + legPrefixes.encode(c) + + const ( + modNoDisplacement = 0b00 + modShortDisplacement = 0b01 + modLongDisplacement = 0b10 + + useSBI = 4 // the encoding of rsp or r12 register. + ) + + switch m.kind() { + case amodeImmReg, amodeImmRBP: + base := m.base.RealReg() + baseEnc := regEncodings[base] + + rex.encode(c, regRexBit(r), baseEnc.rexBit()) + + for opcodeNum > 0 { + opcodeNum-- + c.EmitByte(byte((opcodes >> (opcodeNum << 3)) & 0xff)) + } + + // SIB byte is the last byte of the memory encoding before the displacement + const sibByte = 0x24 // == encodeSIB(0, 4, 4) + + immZero, baseRbp, baseR13 := m.imm32 == 0, base == rbp, base == r13 + short := lower8willSignExtendTo32(m.imm32) + rspOrR12 := base == rsp || base == r12 + + if immZero && !baseRbp && !baseR13 { // rbp or r13 can't be used as base for without displacement encoding. + c.EmitByte(encodeModRM(modNoDisplacement, regEncoding(r), baseEnc.encoding())) + if rspOrR12 { + c.EmitByte(sibByte) + } + } else if short { // Note: this includes the case where m.imm32 == 0 && base == rbp || base == r13. + c.EmitByte(encodeModRM(modShortDisplacement, regEncoding(r), baseEnc.encoding())) + if rspOrR12 { + c.EmitByte(sibByte) + } + c.EmitByte(byte(m.imm32)) + } else { + c.EmitByte(encodeModRM(modLongDisplacement, regEncoding(r), baseEnc.encoding())) + if rspOrR12 { + c.EmitByte(sibByte) + } + c.Emit4Bytes(m.imm32) + } + + case amodeRegRegShift: + base := m.base.RealReg() + baseEnc := regEncodings[base] + index := m.index.RealReg() + indexEnc := regEncodings[index] + + if index == rsp { + panic("BUG: rsp can't be used as index of addressing mode") + } + + rex.encodeForIndex(c, regEnc(r), indexEnc, baseEnc) + + for opcodeNum > 0 { + opcodeNum-- + c.EmitByte(byte((opcodes >> (opcodeNum << 3)) & 0xff)) + } + + immZero, baseRbp, baseR13 := m.imm32 == 0, base == rbp, base == r13 + if immZero && !baseRbp && !baseR13 { // rbp or r13 can't be used as base for without displacement encoding. (curious why? because it's interpreted as RIP relative addressing). + c.EmitByte(encodeModRM(modNoDisplacement, regEncoding(r), useSBI)) + c.EmitByte(encodeSIB(m.shift(), indexEnc.encoding(), baseEnc.encoding())) + } else if lower8willSignExtendTo32(m.imm32) { + c.EmitByte(encodeModRM(modShortDisplacement, regEncoding(r), useSBI)) + c.EmitByte(encodeSIB(m.shift(), indexEnc.encoding(), baseEnc.encoding())) + c.EmitByte(byte(m.imm32)) + } else { + c.EmitByte(encodeModRM(modLongDisplacement, regEncoding(r), useSBI)) + c.EmitByte(encodeSIB(m.shift(), indexEnc.encoding(), baseEnc.encoding())) + c.Emit4Bytes(m.imm32) + } + + case amodeRipRel: + rex.encode(c, regRexBit(r), 0) + for opcodeNum > 0 { + opcodeNum-- + c.EmitByte(byte((opcodes >> (opcodeNum << 3)) & 0xff)) + } + + // Indicate "LEAQ [RIP + 32bit displacement]. + // https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing + c.EmitByte(encodeModRM(0b00, regEncoding(r), 0b101)) + + // This will be resolved later, so we just emit a placeholder. 
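+ // e.g. an x86-64 `lea rax, [rip+disp32]` is 48 8D 05 xx xx xx xx; the trailing four
+ // bytes are the RIP-relative displacement patched once the label offset is known.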
+ needsLabelResolution = true + c.Emit4Bytes(0) + + default: + panic("BUG: invalid addressing mode") + } + return +} + +const ( + rexEncodingDefault byte = 0x40 + rexEncodingW = rexEncodingDefault | 0x08 +) + +// rexInfo is a bit set to indicate: +// +// 0x01: W bit must be cleared. +// 0x02: REX prefix must be emitted. +type rexInfo byte + +func (ri rexInfo) setW() rexInfo { + return ri | 0x01 +} + +func (ri rexInfo) clearW() rexInfo { + return ri & 0x02 +} + +func (ri rexInfo) always() rexInfo { + return ri | 0x02 +} + +func (ri rexInfo) notAlways() rexInfo { //nolint + return ri & 0x01 +} + +func (ri rexInfo) encode(c backend.Compiler, r uint8, b uint8) { + var w byte = 0 + if ri&0x01 != 0 { + w = 0x01 + } + rex := rexEncodingDefault | w<<3 | r<<2 | b + if rex != rexEncodingDefault || ri&0x02 != 0 { + c.EmitByte(rex) + } +} + +func (ri rexInfo) encodeForIndex(c backend.Compiler, encR regEnc, encIndex regEnc, encBase regEnc) { + var w byte = 0 + if ri&0x01 != 0 { + w = 0x01 + } + r := encR.rexBit() + x := encIndex.rexBit() + b := encBase.rexBit() + rex := byte(0x40) | w<<3 | r<<2 | x<<1 | b + if rex != 0x40 || ri&0x02 != 0 { + c.EmitByte(rex) + } +} + +type regEnc byte + +func (r regEnc) rexBit() byte { + return regRexBit(byte(r)) +} + +func (r regEnc) encoding() byte { + return regEncoding(byte(r)) +} + +func regRexBit(r byte) byte { + return r >> 3 +} + +func regEncoding(r byte) byte { + return r & 0x07 +} + +var regEncodings = [...]regEnc{ + rax: 0b000, + rcx: 0b001, + rdx: 0b010, + rbx: 0b011, + rsp: 0b100, + rbp: 0b101, + rsi: 0b110, + rdi: 0b111, + r8: 0b1000, + r9: 0b1001, + r10: 0b1010, + r11: 0b1011, + r12: 0b1100, + r13: 0b1101, + r14: 0b1110, + r15: 0b1111, + xmm0: 0b000, + xmm1: 0b001, + xmm2: 0b010, + xmm3: 0b011, + xmm4: 0b100, + xmm5: 0b101, + xmm6: 0b110, + xmm7: 0b111, + xmm8: 0b1000, + xmm9: 0b1001, + xmm10: 0b1010, + xmm11: 0b1011, + xmm12: 0b1100, + xmm13: 0b1101, + xmm14: 0b1110, + xmm15: 0b1111, +} + +type legacyPrefixes byte + +const ( + legacyPrefixesNone legacyPrefixes = iota + legacyPrefixes0x66 + legacyPrefixes0xF0 + legacyPrefixes0x660xF0 + legacyPrefixes0xF2 + legacyPrefixes0xF3 +) + +func (p legacyPrefixes) encode(c backend.Compiler) { + switch p { + case legacyPrefixesNone: + case legacyPrefixes0x66: + c.EmitByte(0x66) + case legacyPrefixes0xF0: + c.EmitByte(0xf0) + case legacyPrefixes0x660xF0: + c.EmitByte(0x66) + c.EmitByte(0xf0) + case legacyPrefixes0xF2: + c.EmitByte(0xf2) + case legacyPrefixes0xF3: + c.EmitByte(0xf3) + default: + panic("BUG: invalid legacy prefix") + } +} + +func lower32willSignExtendTo64(x uint64) bool { + xs := int64(x) + return xs == int64(uint64(int32(xs))) +} + +func lower8willSignExtendTo32(x uint32) bool { + xs := int32(x) + return xs == ((xs << 24) >> 24) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_constant.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_constant.go new file mode 100644 index 000000000..55d05ef63 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_constant.go @@ -0,0 +1,71 @@ +package amd64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// lowerConstant allocates a new VReg and inserts the instruction to load the constant value. 
+func (m *machine) lowerConstant(instr *ssa.Instruction) (vr regalloc.VReg) { + val := instr.Return() + valType := val.Type() + + vr = m.c.AllocateVReg(valType) + m.insertLoadConstant(instr, vr) + return +} + +// InsertLoadConstantBlockArg implements backend.Machine. +func (m *machine) InsertLoadConstantBlockArg(instr *ssa.Instruction, vr regalloc.VReg) { + m.insertLoadConstant(instr, vr) +} + +func (m *machine) insertLoadConstant(instr *ssa.Instruction, vr regalloc.VReg) { + val := instr.Return() + valType := val.Type() + v := instr.ConstantVal() + + bits := valType.Bits() + if bits < 64 { // Clear the redundant bits just in case it's unexpectedly sign-extended, etc. + v = v & ((1 << valType.Bits()) - 1) + } + + switch valType { + case ssa.TypeF32, ssa.TypeF64: + m.lowerFconst(vr, v, bits == 64) + case ssa.TypeI32, ssa.TypeI64: + m.lowerIconst(vr, v, bits == 64) + default: + panic("BUG") + } +} + +func (m *machine) lowerFconst(dst regalloc.VReg, c uint64, _64 bool) { + if c == 0 { + xor := m.allocateInstr().asZeros(dst) + m.insert(xor) + } else { + var tmpType ssa.Type + if _64 { + tmpType = ssa.TypeI64 + } else { + tmpType = ssa.TypeI32 + } + tmpInt := m.c.AllocateVReg(tmpType) + loadToGP := m.allocateInstr().asImm(tmpInt, c, _64) + m.insert(loadToGP) + + movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpInt), dst, _64) + m.insert(movToXmm) + } +} + +func (m *machine) lowerIconst(dst regalloc.VReg, c uint64, _64 bool) { + i := m.allocateInstr() + if c == 0 { + i.asZeros(dst) + } else { + i.asImm(dst, c, _64) + } + m.insert(i) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_mem.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_mem.go new file mode 100644 index 000000000..bee673d25 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/lower_mem.go @@ -0,0 +1,187 @@ +package amd64 + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +var addendsMatchOpcodes = [...]ssa.Opcode{ssa.OpcodeUExtend, ssa.OpcodeSExtend, ssa.OpcodeIadd, ssa.OpcodeIconst, ssa.OpcodeIshl} + +type addend struct { + r regalloc.VReg + off int64 + shift byte +} + +func (a addend) String() string { + return fmt.Sprintf("addend{r=%s, off=%d, shift=%d}", a.r, a.off, a.shift) +} + +// lowerToAddressMode converts a pointer to an addressMode that can be used as an operand for load/store instructions. +func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32) (am *amode) { + def := m.c.ValueDefinition(ptr) + + if offsetBase&0x80000000 != 0 { + // Special casing the huge base offset whose MSB is set. In x64, the immediate is always + // sign-extended, but our IR semantics requires the offset base is always unsigned. + // Note that this should be extremely rare or even this shouldn't hit in the real application, + // therefore we don't need to optimize this case in my opinion. 
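+ // e.g. offsetBase = 0x8000_0000 does not fit the signed 32-bit displacement
+ // (it would sign-extend to -2GiB), so fold it into a register and use displacement 0.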
+ + a := m.lowerAddend(def) + off64 := a.off + int64(offsetBase) + offsetBaseReg := m.c.AllocateVReg(ssa.TypeI64) + m.lowerIconst(offsetBaseReg, uint64(off64), true) + if a.r != regalloc.VRegInvalid { + return m.newAmodeRegRegShift(0, offsetBaseReg, a.r, a.shift) + } else { + return m.newAmodeImmReg(0, offsetBaseReg) + } + } + + if op := m.c.MatchInstrOneOf(def, addendsMatchOpcodes[:]); op == ssa.OpcodeIadd { + add := def.Instr + x, y := add.Arg2() + xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) + ax := m.lowerAddend(xDef) + ay := m.lowerAddend(yDef) + add.MarkLowered() + return m.lowerAddendsToAmode(ax, ay, offsetBase) + } else { + // If it is not an Iadd, then we lower the one addend. + a := m.lowerAddend(def) + // off is always 0 if r is valid. + if a.r != regalloc.VRegInvalid { + if a.shift != 0 { + tmpReg := m.c.AllocateVReg(ssa.TypeI64) + m.lowerIconst(tmpReg, 0, true) + return m.newAmodeRegRegShift(offsetBase, tmpReg, a.r, a.shift) + } + return m.newAmodeImmReg(offsetBase, a.r) + } else { + off64 := a.off + int64(offsetBase) + tmpReg := m.c.AllocateVReg(ssa.TypeI64) + m.lowerIconst(tmpReg, uint64(off64), true) + return m.newAmodeImmReg(0, tmpReg) + } + } +} + +func (m *machine) lowerAddendsToAmode(x, y addend, offBase uint32) *amode { + if x.r != regalloc.VRegInvalid && x.off != 0 || y.r != regalloc.VRegInvalid && y.off != 0 { + panic("invalid input") + } + + u64 := uint64(x.off+y.off) + uint64(offBase) + if u64 != 0 { + if _, ok := asImm32(u64, false); !ok { + tmpReg := m.c.AllocateVReg(ssa.TypeI64) + m.lowerIconst(tmpReg, u64, true) + // Blank u64 as it has been already lowered. + u64 = 0 + + if x.r == regalloc.VRegInvalid { + x.r = tmpReg + } else if y.r == regalloc.VRegInvalid { + y.r = tmpReg + } else { + // We already know that either rx or ry is invalid, + // so we overwrite it with the temporary register. + panic("BUG") + } + } + } + + u32 := uint32(u64) + switch { + // We assume rx, ry are valid iff offx, offy are 0. + case x.r != regalloc.VRegInvalid && y.r != regalloc.VRegInvalid: + switch { + case x.shift != 0 && y.shift != 0: + // Cannot absorb two shifted registers, must lower one to a shift instruction. + shifted := m.allocateInstr() + shifted.asShiftR(shiftROpShiftLeft, newOperandImm32(uint32(x.shift)), x.r, true) + m.insert(shifted) + + return m.newAmodeRegRegShift(u32, x.r, y.r, y.shift) + case x.shift != 0 && y.shift == 0: + // Swap base and index. + x, y = y, x + fallthrough + default: + return m.newAmodeRegRegShift(u32, x.r, y.r, y.shift) + } + case x.r == regalloc.VRegInvalid && y.r != regalloc.VRegInvalid: + x, y = y, x + fallthrough + case x.r != regalloc.VRegInvalid && y.r == regalloc.VRegInvalid: + if x.shift != 0 { + zero := m.c.AllocateVReg(ssa.TypeI64) + m.lowerIconst(zero, 0, true) + return m.newAmodeRegRegShift(u32, zero, x.r, x.shift) + } + return m.newAmodeImmReg(u32, x.r) + default: // Both are invalid: use the offset. + tmpReg := m.c.AllocateVReg(ssa.TypeI64) + m.lowerIconst(tmpReg, u64, true) + return m.newAmodeImmReg(0, tmpReg) + } +} + +func (m *machine) lowerAddend(x *backend.SSAValueDefinition) addend { + if x.IsFromBlockParam() { + return addend{x.BlkParamVReg, 0, 0} + } + // Ensure the addend is not referenced in multiple places; we will discard nested Iadds. 
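+ // (MatchInstrOneOf only matches a single-use defining instruction, so values referenced
+ // elsewhere fall through to getOperand_Reg below.)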
+ op := m.c.MatchInstrOneOf(x, addendsMatchOpcodes[:]) + if op != ssa.OpcodeInvalid && op != ssa.OpcodeIadd { + return m.lowerAddendFromInstr(x.Instr) + } + p := m.getOperand_Reg(x) + return addend{p.reg(), 0, 0} +} + +// lowerAddendFromInstr takes an instruction returns a Vreg and an offset that can be used in an address mode. +// The Vreg is regalloc.VRegInvalid if the addend cannot be lowered to a register. +// The offset is 0 if the addend can be lowered to a register. +func (m *machine) lowerAddendFromInstr(instr *ssa.Instruction) addend { + instr.MarkLowered() + switch op := instr.Opcode(); op { + case ssa.OpcodeIconst: + u64 := instr.ConstantVal() + if instr.Return().Type().Bits() == 32 { + return addend{regalloc.VRegInvalid, int64(int32(u64)), 0} // sign-extend. + } else { + return addend{regalloc.VRegInvalid, int64(u64), 0} + } + case ssa.OpcodeUExtend, ssa.OpcodeSExtend: + input := instr.Arg() + inputDef := m.c.ValueDefinition(input) + if input.Type().Bits() != 32 { + panic("BUG: invalid input type " + input.Type().String()) + } + constInst := inputDef.IsFromInstr() && inputDef.Instr.Constant() + switch { + case constInst && op == ssa.OpcodeSExtend: + return addend{regalloc.VRegInvalid, int64(uint32(inputDef.Instr.ConstantVal())), 0} + case constInst && op == ssa.OpcodeUExtend: + return addend{regalloc.VRegInvalid, int64(int32(inputDef.Instr.ConstantVal())), 0} // sign-extend! + default: + r := m.getOperand_Reg(inputDef) + return addend{r.reg(), 0, 0} + } + case ssa.OpcodeIshl: + // If the addend is a shift, we can only handle it if the shift amount is a constant. + x, amount := instr.Arg2() + amountDef := m.c.ValueDefinition(amount) + if amountDef.IsFromInstr() && amountDef.Instr.Constant() && amountDef.Instr.ConstantVal() <= 3 { + r := m.getOperand_Reg(m.c.ValueDefinition(x)) + return addend{r.reg(), 0, uint8(amountDef.Instr.ConstantVal())} + } + r := m.getOperand_Reg(m.c.ValueDefinition(x)) + return addend{r.reg(), 0, 0} + } + panic("BUG: invalid opcode") +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go new file mode 100644 index 000000000..310ad2203 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine.go @@ -0,0 +1,3611 @@ +package amd64 + +import ( + "context" + "encoding/binary" + "fmt" + "math" + "strings" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" + "github.com/tetratelabs/wazero/internal/platform" +) + +// NewBackend returns a new backend for arm64. 
+func NewBackend() backend.Machine { + ectx := backend.NewExecutableContextT[instruction]( + resetInstruction, + setNext, + setPrev, + asNop, + ) + return &machine{ + ectx: ectx, + cpuFeatures: platform.CpuFeatures, + regAlloc: regalloc.NewAllocator(regInfo), + spillSlots: map[regalloc.VRegID]int64{}, + amodePool: wazevoapi.NewPool[amode](nil), + constSwizzleMaskConstIndex: -1, + constSqmulRoundSatIndex: -1, + constI8x16SHLMaskTableIndex: -1, + constI8x16LogicalSHRMaskTableIndex: -1, + constF64x2CvtFromIMaskIndex: -1, + constTwop52Index: -1, + constI32sMaxOnF64x2Index: -1, + constI32uMaxOnF64x2Index: -1, + constAllOnesI8x16Index: -1, + constAllOnesI16x8Index: -1, + constExtAddPairwiseI16x8uMask1Index: -1, + constExtAddPairwiseI16x8uMask2Index: -1, + } +} + +type ( + // machine implements backend.Machine for amd64. + machine struct { + c backend.Compiler + ectx *backend.ExecutableContextT[instruction] + stackBoundsCheckDisabled bool + + amodePool wazevoapi.Pool[amode] + + cpuFeatures platform.CpuFeatureFlags + + regAlloc regalloc.Allocator + regAllocFn *backend.RegAllocFunction[*instruction, *machine] + regAllocStarted bool + + spillSlotSize int64 + spillSlots map[regalloc.VRegID]int64 + currentABI *backend.FunctionABI + clobberedRegs []regalloc.VReg + + maxRequiredStackSizeForCalls int64 + + labelResolutionPends []labelResolutionPend + + jmpTableTargets [][]uint32 + consts []_const + + constSwizzleMaskConstIndex, constSqmulRoundSatIndex, + constI8x16SHLMaskTableIndex, constI8x16LogicalSHRMaskTableIndex, + constF64x2CvtFromIMaskIndex, constTwop52Index, + constI32sMaxOnF64x2Index, constI32uMaxOnF64x2Index, + constAllOnesI8x16Index, constAllOnesI16x8Index, + constExtAddPairwiseI16x8uMask1Index, constExtAddPairwiseI16x8uMask2Index int + } + + _const struct { + lo, hi uint64 + _var []byte + label *labelPosition + } + + labelResolutionPend struct { + instr *instruction + instrOffset int64 + // imm32Offset is the offset of the last 4 bytes of the instruction. + imm32Offset int64 + } + + labelPosition = backend.LabelPosition[instruction] +) + +func (m *machine) getOrAllocateConstLabel(i *int, _var []byte) backend.Label { + index := *i + if index == -1 { + label := m.allocateLabel() + index = len(m.consts) + m.consts = append(m.consts, _const{ + _var: _var, + label: label, + }) + *i = index + } + return m.consts[index].label.L +} + +// Reset implements backend.Machine. +func (m *machine) Reset() { + m.consts = m.consts[:0] + m.clobberedRegs = m.clobberedRegs[:0] + for key := range m.spillSlots { + m.clobberedRegs = append(m.clobberedRegs, regalloc.VReg(key)) + } + for _, key := range m.clobberedRegs { + delete(m.spillSlots, regalloc.VRegID(key)) + } + + m.stackBoundsCheckDisabled = false + m.ectx.Reset() + + m.regAllocFn.Reset() + m.regAlloc.Reset() + m.regAllocStarted = false + m.clobberedRegs = m.clobberedRegs[:0] + + m.spillSlotSize = 0 + m.maxRequiredStackSizeForCalls = 0 + + m.amodePool.Reset() + m.jmpTableTargets = m.jmpTableTargets[:0] + m.constSwizzleMaskConstIndex = -1 + m.constSqmulRoundSatIndex = -1 + m.constI8x16SHLMaskTableIndex = -1 + m.constI8x16LogicalSHRMaskTableIndex = -1 + m.constF64x2CvtFromIMaskIndex = -1 + m.constTwop52Index = -1 + m.constI32sMaxOnF64x2Index = -1 + m.constI32uMaxOnF64x2Index = -1 + m.constAllOnesI8x16Index = -1 + m.constAllOnesI16x8Index = -1 + m.constExtAddPairwiseI16x8uMask1Index = -1 + m.constExtAddPairwiseI16x8uMask2Index = -1 +} + +// ExecutableContext implements backend.Machine. 
+func (m *machine) ExecutableContext() backend.ExecutableContext { return m.ectx } + +// DisableStackCheck implements backend.Machine. +func (m *machine) DisableStackCheck() { m.stackBoundsCheckDisabled = true } + +// SetCompiler implements backend.Machine. +func (m *machine) SetCompiler(c backend.Compiler) { + m.c = c + m.regAllocFn = backend.NewRegAllocFunction[*instruction, *machine](m, c.SSABuilder(), c) +} + +// SetCurrentABI implements backend.Machine. +func (m *machine) SetCurrentABI(abi *backend.FunctionABI) { + m.currentABI = abi +} + +// RegAlloc implements backend.Machine. +func (m *machine) RegAlloc() { + rf := m.regAllocFn + for _, pos := range m.ectx.OrderedBlockLabels { + rf.AddBlock(pos.SB, pos.L, pos.Begin, pos.End) + } + + m.regAllocStarted = true + m.regAlloc.DoAllocation(rf) + // Now that we know the final spill slot size, we must align spillSlotSize to 16 bytes. + m.spillSlotSize = (m.spillSlotSize + 15) &^ 15 +} + +// InsertReturn implements backend.Machine. +func (m *machine) InsertReturn() { + i := m.allocateInstr().asRet() + m.insert(i) +} + +// LowerSingleBranch implements backend.Machine. +func (m *machine) LowerSingleBranch(b *ssa.Instruction) { + ectx := m.ectx + switch b.Opcode() { + case ssa.OpcodeJump: + _, _, targetBlk := b.BranchData() + if b.IsFallthroughJump() { + return + } + jmp := m.allocateInstr() + target := ectx.GetOrAllocateSSABlockLabel(targetBlk) + if target == backend.LabelReturn { + jmp.asRet() + } else { + jmp.asJmp(newOperandLabel(target)) + } + m.insert(jmp) + case ssa.OpcodeBrTable: + index, target := b.BrTableData() + m.lowerBrTable(index, target) + default: + panic("BUG: unexpected branch opcode" + b.Opcode().String()) + } +} + +func (m *machine) addJmpTableTarget(targets []ssa.BasicBlock) (index int) { + // TODO: reuse the slice! + labels := make([]uint32, len(targets)) + for j, target := range targets { + labels[j] = uint32(m.ectx.GetOrAllocateSSABlockLabel(target)) + } + index = len(m.jmpTableTargets) + m.jmpTableTargets = append(m.jmpTableTargets, labels) + return +} + +var condBranchMatches = [...]ssa.Opcode{ssa.OpcodeIcmp, ssa.OpcodeFcmp} + +func (m *machine) lowerBrTable(index ssa.Value, targets []ssa.BasicBlock) { + _v := m.getOperand_Reg(m.c.ValueDefinition(index)) + v := m.copyToTmp(_v.reg()) + + // First, we need to do the bounds check. + maxIndex := m.c.AllocateVReg(ssa.TypeI32) + m.lowerIconst(maxIndex, uint64(len(targets)-1), false) + cmp := m.allocateInstr().asCmpRmiR(true, newOperandReg(maxIndex), v, false) + m.insert(cmp) + + // Then do the conditional move maxIndex to v if v > maxIndex. + cmov := m.allocateInstr().asCmove(condNB, newOperandReg(maxIndex), v, false) + m.insert(cmov) + + // Now that v has the correct index. Load the address of the jump table into the addr. + addr := m.c.AllocateVReg(ssa.TypeI64) + leaJmpTableAddr := m.allocateInstr() + m.insert(leaJmpTableAddr) + + // Then add the target's offset into jmpTableAddr. + loadTargetOffsetFromJmpTable := m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, + // Shift by 3 because each entry is 8 bytes. + newOperandMem(m.newAmodeRegRegShift(0, addr, v, 3)), addr, true) + m.insert(loadTargetOffsetFromJmpTable) + + // Now ready to jump. 
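+ // addr now holds the jump-table base plus the selected 8-byte entry, i.e. the target
+ // address once the table entries are resolved.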
+ jmp := m.allocateInstr().asJmp(newOperandReg(addr)) + m.insert(jmp) + + jmpTableBegin, jmpTableBeginLabel := m.allocateBrTarget() + m.insert(jmpTableBegin) + leaJmpTableAddr.asLEA(newOperandLabel(jmpTableBeginLabel), addr) + + jmpTable := m.allocateInstr() + targetSliceIndex := m.addJmpTableTarget(targets) + jmpTable.asJmpTableSequence(targetSliceIndex, len(targets)) + m.insert(jmpTable) +} + +// LowerConditionalBranch implements backend.Machine. +func (m *machine) LowerConditionalBranch(b *ssa.Instruction) { + exctx := m.ectx + cval, args, targetBlk := b.BranchData() + if len(args) > 0 { + panic(fmt.Sprintf( + "conditional branch shouldn't have args; likely a bug in critical edge splitting: from %s to %s", + exctx.CurrentSSABlk, + targetBlk, + )) + } + + target := exctx.GetOrAllocateSSABlockLabel(targetBlk) + cvalDef := m.c.ValueDefinition(cval) + + switch m.c.MatchInstrOneOf(cvalDef, condBranchMatches[:]) { + case ssa.OpcodeIcmp: + cvalInstr := cvalDef.Instr + x, y, c := cvalInstr.IcmpData() + + cc := condFromSSAIntCmpCond(c) + if b.Opcode() == ssa.OpcodeBrz { + cc = cc.invert() + } + + // First, perform the comparison and set the flag. + xd, yd := m.c.ValueDefinition(x), m.c.ValueDefinition(y) + if !m.tryLowerBandToFlag(xd, yd) { + m.lowerIcmpToFlag(xd, yd, x.Type() == ssa.TypeI64) + } + + // Then perform the conditional branch. + m.insert(m.allocateInstr().asJmpIf(cc, newOperandLabel(target))) + cvalDef.Instr.MarkLowered() + case ssa.OpcodeFcmp: + cvalInstr := cvalDef.Instr + + f1, f2, and := m.lowerFcmpToFlags(cvalInstr) + isBrz := b.Opcode() == ssa.OpcodeBrz + if isBrz { + f1 = f1.invert() + } + if f2 == condInvalid { + m.insert(m.allocateInstr().asJmpIf(f1, newOperandLabel(target))) + } else { + if isBrz { + f2 = f2.invert() + and = !and + } + jmp1, jmp2 := m.allocateInstr(), m.allocateInstr() + m.insert(jmp1) + m.insert(jmp2) + notTaken, notTakenLabel := m.allocateBrTarget() + m.insert(notTaken) + if and { + jmp1.asJmpIf(f1.invert(), newOperandLabel(notTakenLabel)) + jmp2.asJmpIf(f2, newOperandLabel(target)) + } else { + jmp1.asJmpIf(f1, newOperandLabel(target)) + jmp2.asJmpIf(f2, newOperandLabel(target)) + } + } + + cvalDef.Instr.MarkLowered() + default: + v := m.getOperand_Reg(cvalDef) + + var cc cond + if b.Opcode() == ssa.OpcodeBrz { + cc = condZ + } else { + cc = condNZ + } + + // Perform test %v, %v to set the flag. + cmp := m.allocateInstr().asCmpRmiR(false, v, v.reg(), false) + m.insert(cmp) + m.insert(m.allocateInstr().asJmpIf(cc, newOperandLabel(target))) + } +} + +// LowerInstr implements backend.Machine. +func (m *machine) LowerInstr(instr *ssa.Instruction) { + if l := instr.SourceOffset(); l.Valid() { + info := m.allocateInstr().asEmitSourceOffsetInfo(l) + m.insert(info) + } + + switch op := instr.Opcode(); op { + case ssa.OpcodeBrz, ssa.OpcodeBrnz, ssa.OpcodeJump, ssa.OpcodeBrTable: + panic("BUG: branching instructions are handled by LowerBranches") + case ssa.OpcodeReturn: + panic("BUG: return must be handled by backend.Compiler") + case ssa.OpcodeIconst, ssa.OpcodeF32const, ssa.OpcodeF64const: // Constant instructions are inlined. 
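+ // Constants are materialized at each use site via lowerConstant/insertLoadConstant
+ // (see lower_constant.go), so no code is emitted here.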
+ case ssa.OpcodeCall, ssa.OpcodeCallIndirect: + m.lowerCall(instr) + case ssa.OpcodeStore, ssa.OpcodeIstore8, ssa.OpcodeIstore16, ssa.OpcodeIstore32: + m.lowerStore(instr) + case ssa.OpcodeIadd: + m.lowerAluRmiROp(instr, aluRmiROpcodeAdd) + case ssa.OpcodeIsub: + m.lowerAluRmiROp(instr, aluRmiROpcodeSub) + case ssa.OpcodeImul: + m.lowerAluRmiROp(instr, aluRmiROpcodeMul) + case ssa.OpcodeSdiv, ssa.OpcodeUdiv, ssa.OpcodeSrem, ssa.OpcodeUrem: + isDiv := op == ssa.OpcodeSdiv || op == ssa.OpcodeUdiv + isSigned := op == ssa.OpcodeSdiv || op == ssa.OpcodeSrem + m.lowerIDivRem(instr, isDiv, isSigned) + case ssa.OpcodeBand: + m.lowerAluRmiROp(instr, aluRmiROpcodeAnd) + case ssa.OpcodeBor: + m.lowerAluRmiROp(instr, aluRmiROpcodeOr) + case ssa.OpcodeBxor: + m.lowerAluRmiROp(instr, aluRmiROpcodeXor) + case ssa.OpcodeIshl: + m.lowerShiftR(instr, shiftROpShiftLeft) + case ssa.OpcodeSshr: + m.lowerShiftR(instr, shiftROpShiftRightArithmetic) + case ssa.OpcodeUshr: + m.lowerShiftR(instr, shiftROpShiftRightLogical) + case ssa.OpcodeRotl: + m.lowerShiftR(instr, shiftROpRotateLeft) + case ssa.OpcodeRotr: + m.lowerShiftR(instr, shiftROpRotateRight) + case ssa.OpcodeClz: + m.lowerClz(instr) + case ssa.OpcodeCtz: + m.lowerCtz(instr) + case ssa.OpcodePopcnt: + m.lowerUnaryRmR(instr, unaryRmROpcodePopcnt) + case ssa.OpcodeFadd, ssa.OpcodeFsub, ssa.OpcodeFmul, ssa.OpcodeFdiv: + m.lowerXmmRmR(instr) + case ssa.OpcodeFabs: + m.lowerFabsFneg(instr) + case ssa.OpcodeFneg: + m.lowerFabsFneg(instr) + case ssa.OpcodeCeil: + m.lowerRound(instr, roundingModeUp) + case ssa.OpcodeFloor: + m.lowerRound(instr, roundingModeDown) + case ssa.OpcodeTrunc: + m.lowerRound(instr, roundingModeZero) + case ssa.OpcodeNearest: + m.lowerRound(instr, roundingModeNearest) + case ssa.OpcodeFmin, ssa.OpcodeFmax: + m.lowerFminFmax(instr) + case ssa.OpcodeFcopysign: + m.lowerFcopysign(instr) + case ssa.OpcodeBitcast: + m.lowerBitcast(instr) + case ssa.OpcodeSqrt: + m.lowerSqrt(instr) + case ssa.OpcodeFpromote: + v := instr.Arg() + rn := m.getOperand_Reg(m.c.ValueDefinition(v)) + rd := m.c.VRegOf(instr.Return()) + cnt := m.allocateInstr() + cnt.asXmmUnaryRmR(sseOpcodeCvtss2sd, rn, rd) + m.insert(cnt) + case ssa.OpcodeFdemote: + v := instr.Arg() + rn := m.getOperand_Reg(m.c.ValueDefinition(v)) + rd := m.c.VRegOf(instr.Return()) + cnt := m.allocateInstr() + cnt.asXmmUnaryRmR(sseOpcodeCvtsd2ss, rn, rd) + m.insert(cnt) + case ssa.OpcodeFcvtToSint, ssa.OpcodeFcvtToSintSat: + x, ctx := instr.Arg2() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + ctxVReg := m.c.VRegOf(ctx) + m.lowerFcvtToSint(ctxVReg, rn.reg(), rd, x.Type() == ssa.TypeF64, + instr.Return().Type().Bits() == 64, op == ssa.OpcodeFcvtToSintSat) + case ssa.OpcodeFcvtToUint, ssa.OpcodeFcvtToUintSat: + x, ctx := instr.Arg2() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + ctxVReg := m.c.VRegOf(ctx) + m.lowerFcvtToUint(ctxVReg, rn.reg(), rd, x.Type() == ssa.TypeF64, + instr.Return().Type().Bits() == 64, op == ssa.OpcodeFcvtToUintSat) + case ssa.OpcodeFcvtFromSint: + x := instr.Arg() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := newOperandReg(m.c.VRegOf(instr.Return())) + m.lowerFcvtFromSint(rn, rd, + x.Type() == ssa.TypeI64, instr.Return().Type().Bits() == 64) + case ssa.OpcodeFcvtFromUint: + x := instr.Arg() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := newOperandReg(m.c.VRegOf(instr.Return())) + m.lowerFcvtFromUint(rn, rd, x.Type() == ssa.TypeI64, + instr.Return().Type().Bits() == 64) + case 
ssa.OpcodeVanyTrue: + m.lowerVanyTrue(instr) + case ssa.OpcodeVallTrue: + m.lowerVallTrue(instr) + case ssa.OpcodeVhighBits: + m.lowerVhighBits(instr) + case ssa.OpcodeVbnot: + m.lowerVbnot(instr) + case ssa.OpcodeVband: + x, y := instr.Arg2() + m.lowerVbBinOp(sseOpcodePand, x, y, instr.Return()) + case ssa.OpcodeVbor: + x, y := instr.Arg2() + m.lowerVbBinOp(sseOpcodePor, x, y, instr.Return()) + case ssa.OpcodeVbxor: + x, y := instr.Arg2() + m.lowerVbBinOp(sseOpcodePxor, x, y, instr.Return()) + case ssa.OpcodeVbandnot: + m.lowerVbandnot(instr, sseOpcodePandn) + case ssa.OpcodeVbitselect: + m.lowerVbitselect(instr) + case ssa.OpcodeVIadd: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePaddb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePaddw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePaddd + case ssa.VecLaneI64x2: + vecOp = sseOpcodePaddq + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVSaddSat: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePaddsb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePaddsw + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVUaddSat: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePaddusb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePaddusw + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVIsub: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePsubb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePsubw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePsubd + case ssa.VecLaneI64x2: + vecOp = sseOpcodePsubq + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVSsubSat: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePsubsb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePsubsw + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVUsubSat: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePsubusb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePsubusw + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVImul: + m.lowerVImul(instr) + case ssa.OpcodeVIneg: + x, lane := instr.ArgWithLane() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePsubb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePsubw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePsubd + case ssa.VecLaneI64x2: + vecOp = sseOpcodePsubq + default: + panic("BUG") + } + + tmp := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asZeros(tmp)) + + i := m.allocateInstr() + i.asXmmRmR(vecOp, rn, tmp) + m.insert(i) + + m.copyTo(tmp, rd) + case ssa.OpcodeVFadd: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneF32x4: + vecOp = sseOpcodeAddps + case ssa.VecLaneF64x2: + vecOp = sseOpcodeAddpd + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVFsub: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneF32x4: + vecOp = sseOpcodeSubps + case ssa.VecLaneF64x2: + vecOp = sseOpcodeSubpd + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVFdiv: + x, y, lane := instr.Arg2WithLane() + var 
vecOp sseOpcode + switch lane { + case ssa.VecLaneF32x4: + vecOp = sseOpcodeDivps + case ssa.VecLaneF64x2: + vecOp = sseOpcodeDivpd + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVFmul: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneF32x4: + vecOp = sseOpcodeMulps + case ssa.VecLaneF64x2: + vecOp = sseOpcodeMulpd + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVFneg: + x, lane := instr.ArgWithLane() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + + tmp := m.c.AllocateVReg(ssa.TypeV128) + + var shiftOp, xorOp sseOpcode + var shiftAmt uint32 + switch lane { + case ssa.VecLaneF32x4: + shiftOp, shiftAmt, xorOp = sseOpcodePslld, 31, sseOpcodeXorps + case ssa.VecLaneF64x2: + shiftOp, shiftAmt, xorOp = sseOpcodePsllq, 63, sseOpcodeXorpd + } + + zero := m.allocateInstr() + zero.asZeros(tmp) + m.insert(zero) + + // Set all bits on tmp by CMPPD with arg=0 (== pseudo CMPEQPD instruction). + // See https://www.felixcloutier.com/x86/cmpps + // + // Note: if we do not clear all the bits ^ with XORPS, this might end up not setting ones on some lane + // if the lane is NaN. + cmp := m.allocateInstr() + cmp.asXmmRmRImm(sseOpcodeCmppd, uint8(cmpPredEQ_UQ), newOperandReg(tmp), tmp) + m.insert(cmp) + + // Do the left shift on each lane to set only the most significant bit in each. + i := m.allocateInstr() + i.asXmmRmiReg(shiftOp, newOperandImm32(shiftAmt), tmp) + m.insert(i) + + // Get the negated result by XOR on each lane with tmp. + i = m.allocateInstr() + i.asXmmRmR(xorOp, rn, tmp) + m.insert(i) + + m.copyTo(tmp, rd) + + case ssa.OpcodeVSqrt: + x, lane := instr.ArgWithLane() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + + var vecOp sseOpcode + switch lane { + case ssa.VecLaneF32x4: + vecOp = sseOpcodeSqrtps + case ssa.VecLaneF64x2: + vecOp = sseOpcodeSqrtpd + } + i := m.allocateInstr() + i.asXmmUnaryRmR(vecOp, rn, rd) + m.insert(i) + + case ssa.OpcodeVImin: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePminsb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePminsw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePminsd + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVUmin: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePminub + case ssa.VecLaneI16x8: + vecOp = sseOpcodePminuw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePminud + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVImax: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePmaxsb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePmaxsw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePmaxsd + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVUmax: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePmaxub + case ssa.VecLaneI16x8: + vecOp = sseOpcodePmaxuw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePmaxud + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVAvgRound: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePavgb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePavgw + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + + case ssa.OpcodeVIcmp: + x, y, c, lane := instr.VIcmpData() + 
m.lowerVIcmp(x, y, c, instr.Return(), lane) + + case ssa.OpcodeVFcmp: + x, y, c, lane := instr.VFcmpData() + m.lowerVFcmp(x, y, c, instr.Return(), lane) + + case ssa.OpcodeExtractlane: + x, index, signed, lane := instr.ExtractlaneData() + m.lowerExtractLane(x, index, signed, instr.Return(), lane) + + case ssa.OpcodeInsertlane: + x, y, index, lane := instr.InsertlaneData() + m.lowerInsertLane(x, y, index, instr.Return(), lane) + + case ssa.OpcodeSwizzle: + x, y, _ := instr.Arg2WithLane() + m.lowerSwizzle(x, y, instr.Return()) + + case ssa.OpcodeShuffle: + x, y, lo, hi := instr.ShuffleData() + m.lowerShuffle(x, y, lo, hi, instr.Return()) + + case ssa.OpcodeSplat: + x, lane := instr.ArgWithLane() + m.lowerSplat(x, instr.Return(), lane) + + case ssa.OpcodeSqmulRoundSat: + x, y := instr.Arg2() + m.lowerSqmulRoundSat(x, y, instr.Return()) + + case ssa.OpcodeVZeroExtLoad: + ptr, offset, typ := instr.VZeroExtLoadData() + var sseOp sseOpcode + // Both movss and movsd clears the higher bits of the destination register upt 128 bits. + // https://www.felixcloutier.com/x86/movss + // https://www.felixcloutier.com/x86/movsd + if typ == ssa.TypeF32 { + sseOp = sseOpcodeMovss + } else { + sseOp = sseOpcodeMovsd + } + mem := m.lowerToAddressMode(ptr, offset) + dst := m.c.VRegOf(instr.Return()) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, newOperandMem(mem), dst)) + + case ssa.OpcodeVMinPseudo: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneF32x4: + vecOp = sseOpcodeMinps + case ssa.VecLaneF64x2: + vecOp = sseOpcodeMinpd + default: + panic("BUG: unexpected lane type") + } + m.lowerVbBinOpUnaligned(vecOp, y, x, instr.Return()) + + case ssa.OpcodeVMaxPseudo: + x, y, lane := instr.Arg2WithLane() + var vecOp sseOpcode + switch lane { + case ssa.VecLaneF32x4: + vecOp = sseOpcodeMaxps + case ssa.VecLaneF64x2: + vecOp = sseOpcodeMaxpd + default: + panic("BUG: unexpected lane type") + } + m.lowerVbBinOpUnaligned(vecOp, y, x, instr.Return()) + + case ssa.OpcodeVIshl: + x, y, lane := instr.Arg2WithLane() + m.lowerVIshl(x, y, instr.Return(), lane) + + case ssa.OpcodeVSshr: + x, y, lane := instr.Arg2WithLane() + m.lowerVSshr(x, y, instr.Return(), lane) + + case ssa.OpcodeVUshr: + x, y, lane := instr.Arg2WithLane() + m.lowerVUshr(x, y, instr.Return(), lane) + + case ssa.OpcodeVCeil: + x, lane := instr.ArgWithLane() + m.lowerVRound(x, instr.Return(), 0x2, lane == ssa.VecLaneF64x2) + + case ssa.OpcodeVFloor: + x, lane := instr.ArgWithLane() + m.lowerVRound(x, instr.Return(), 0x1, lane == ssa.VecLaneF64x2) + + case ssa.OpcodeVTrunc: + x, lane := instr.ArgWithLane() + m.lowerVRound(x, instr.Return(), 0x3, lane == ssa.VecLaneF64x2) + + case ssa.OpcodeVNearest: + x, lane := instr.ArgWithLane() + m.lowerVRound(x, instr.Return(), 0x0, lane == ssa.VecLaneF64x2) + + case ssa.OpcodeExtIaddPairwise: + x, lane, signed := instr.ExtIaddPairwiseData() + m.lowerExtIaddPairwise(x, instr.Return(), lane, signed) + + case ssa.OpcodeUwidenLow, ssa.OpcodeSwidenLow: + x, lane := instr.ArgWithLane() + m.lowerWidenLow(x, instr.Return(), lane, op == ssa.OpcodeSwidenLow) + + case ssa.OpcodeUwidenHigh, ssa.OpcodeSwidenHigh: + x, lane := instr.ArgWithLane() + m.lowerWidenHigh(x, instr.Return(), lane, op == ssa.OpcodeSwidenHigh) + + case ssa.OpcodeLoadSplat: + ptr, offset, lane := instr.LoadSplatData() + m.lowerLoadSplat(ptr, offset, instr.Return(), lane) + + case ssa.OpcodeVFcvtFromUint, ssa.OpcodeVFcvtFromSint: + x, lane := instr.ArgWithLane() + m.lowerVFcvtFromInt(x, instr.Return(), lane, op == 
ssa.OpcodeVFcvtFromSint) + + case ssa.OpcodeVFcvtToSintSat, ssa.OpcodeVFcvtToUintSat: + x, lane := instr.ArgWithLane() + m.lowerVFcvtToIntSat(x, instr.Return(), lane, op == ssa.OpcodeVFcvtToSintSat) + + case ssa.OpcodeSnarrow, ssa.OpcodeUnarrow: + x, y, lane := instr.Arg2WithLane() + m.lowerNarrow(x, y, instr.Return(), lane, op == ssa.OpcodeSnarrow) + + case ssa.OpcodeFvpromoteLow: + x := instr.Arg() + src := m.getOperand_Reg(m.c.ValueDefinition(x)) + dst := m.c.VRegOf(instr.Return()) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtps2pd, src, dst)) + + case ssa.OpcodeFvdemote: + x := instr.Arg() + src := m.getOperand_Reg(m.c.ValueDefinition(x)) + dst := m.c.VRegOf(instr.Return()) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtpd2ps, src, dst)) + + case ssa.OpcodeWideningPairwiseDotProductS: + x, y := instr.Arg2() + m.lowerWideningPairwiseDotProductS(x, y, instr.Return()) + + case ssa.OpcodeVIabs: + m.lowerVIabs(instr) + case ssa.OpcodeVIpopcnt: + m.lowerVIpopcnt(instr) + case ssa.OpcodeVFmin: + m.lowerVFmin(instr) + case ssa.OpcodeVFmax: + m.lowerVFmax(instr) + case ssa.OpcodeVFabs: + m.lowerVFabs(instr) + case ssa.OpcodeUndefined: + m.insert(m.allocateInstr().asUD2()) + case ssa.OpcodeExitWithCode: + execCtx, code := instr.ExitWithCodeData() + m.lowerExitWithCode(m.c.VRegOf(execCtx), code) + case ssa.OpcodeExitIfTrueWithCode: + execCtx, c, code := instr.ExitIfTrueWithCodeData() + m.lowerExitIfTrueWithCode(m.c.VRegOf(execCtx), c, code) + case ssa.OpcodeLoad: + ptr, offset, typ := instr.LoadData() + dst := m.c.VRegOf(instr.Return()) + m.lowerLoad(ptr, offset, typ, dst) + case ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32, ssa.OpcodeSload8, ssa.OpcodeSload16, ssa.OpcodeSload32: + ptr, offset, _ := instr.LoadData() + ret := m.c.VRegOf(instr.Return()) + m.lowerExtLoad(op, ptr, offset, ret) + case ssa.OpcodeVconst: + result := m.c.VRegOf(instr.Return()) + lo, hi := instr.VconstData() + m.lowerVconst(result, lo, hi) + case ssa.OpcodeSExtend, ssa.OpcodeUExtend: + from, to, signed := instr.ExtendData() + m.lowerExtend(instr.Arg(), instr.Return(), from, to, signed) + case ssa.OpcodeIcmp: + m.lowerIcmp(instr) + case ssa.OpcodeFcmp: + m.lowerFcmp(instr) + case ssa.OpcodeSelect: + cval, x, y := instr.SelectData() + m.lowerSelect(x, y, cval, instr.Return()) + case ssa.OpcodeIreduce: + rn := m.getOperand_Mem_Reg(m.c.ValueDefinition(instr.Arg())) + retVal := instr.Return() + rd := m.c.VRegOf(retVal) + + if retVal.Type() != ssa.TypeI32 { + panic("TODO?: Ireduce to non-i32") + } + m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, rn, rd)) + + case ssa.OpcodeAtomicLoad: + ptr := instr.Arg() + size := instr.AtomicTargetSize() + dst := m.c.VRegOf(instr.Return()) + + // At this point, the ptr is ensured to be aligned, so using a normal load is atomic. 
+ // https://github.com/golang/go/blob/adead1a93f472affa97c494ef19f2f492ee6f34a/src/runtime/internal/atomic/atomic_amd64.go#L30 + mem := newOperandMem(m.lowerToAddressMode(ptr, 0)) + load := m.allocateInstr() + switch size { + case 8: + load.asMov64MR(mem, dst) + case 4: + load.asMovzxRmR(extModeLQ, mem, dst) + case 2: + load.asMovzxRmR(extModeWQ, mem, dst) + case 1: + load.asMovzxRmR(extModeBQ, mem, dst) + default: + panic("BUG") + } + m.insert(load) + + case ssa.OpcodeFence: + m.insert(m.allocateInstr().asMFence()) + + case ssa.OpcodeAtomicStore: + ptr, _val := instr.Arg2() + size := instr.AtomicTargetSize() + + val := m.getOperand_Reg(m.c.ValueDefinition(_val)) + // The content on the val register will be overwritten by xchg, so we need to copy it to a temporary register. + copied := m.copyToTmp(val.reg()) + + mem := newOperandMem(m.lowerToAddressMode(ptr, 0)) + store := m.allocateInstr().asXCHG(copied, mem, byte(size)) + m.insert(store) + + case ssa.OpcodeAtomicCas: + addr, exp, repl := instr.Arg3() + size := instr.AtomicTargetSize() + m.lowerAtomicCas(addr, exp, repl, size, instr.Return()) + + case ssa.OpcodeAtomicRmw: + addr, val := instr.Arg2() + atomicOp, size := instr.AtomicRmwData() + m.lowerAtomicRmw(atomicOp, addr, val, size, instr.Return()) + + default: + panic("TODO: lowering " + op.String()) + } +} + +func (m *machine) lowerAtomicRmw(op ssa.AtomicRmwOp, addr, val ssa.Value, size uint64, ret ssa.Value) { + mem := m.lowerToAddressMode(addr, 0) + _val := m.getOperand_Reg(m.c.ValueDefinition(val)) + + switch op { + case ssa.AtomicRmwOpAdd, ssa.AtomicRmwOpSub: + valCopied := m.copyToTmp(_val.reg()) + if op == ssa.AtomicRmwOpSub { + // Negate the value. + m.insert(m.allocateInstr().asNeg(newOperandReg(valCopied), true)) + } + m.insert(m.allocateInstr().asLockXAdd(valCopied, mem, byte(size))) + m.clearHigherBitsForAtomic(valCopied, size, ret.Type()) + m.copyTo(valCopied, m.c.VRegOf(ret)) + + case ssa.AtomicRmwOpAnd, ssa.AtomicRmwOpOr, ssa.AtomicRmwOpXor: + accumulator := raxVReg + // Reserve rax for the accumulator to make regalloc happy. + // Note: do this initialization before defining valCopied, because it might be the same register and + // if that happens, the unnecessary load/store will be performed inside the loop. + // This can be mitigated in any way once the register allocator is clever enough. + m.insert(m.allocateInstr().asDefineUninitializedReg(accumulator)) + + // Copy the value to a temporary register. + valCopied := m.copyToTmp(_val.reg()) + m.clearHigherBitsForAtomic(valCopied, size, ret.Type()) + + memOp := newOperandMem(mem) + tmp := m.c.AllocateVReg(ssa.TypeI64) + beginLoop, beginLoopLabel := m.allocateBrTarget() + { + m.insert(beginLoop) + // Reset the value on tmp by the original value. + m.copyTo(valCopied, tmp) + // Load the current value at the memory location into accumulator. + switch size { + case 1: + m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, memOp, accumulator)) + case 2: + m.insert(m.allocateInstr().asMovzxRmR(extModeWQ, memOp, accumulator)) + case 4: + m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, memOp, accumulator)) + case 8: + m.insert(m.allocateInstr().asMov64MR(memOp, accumulator)) + default: + panic("BUG") + } + // Then perform the logical operation on the accumulator and the value on tmp. 
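+ // (The result is computed into tmp, not into the accumulator: LOCK CMPXCHG below
+ // expects the previously loaded memory value in RAX and the new value in its
+ // register operand.)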
+ switch op { + case ssa.AtomicRmwOpAnd: + m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, newOperandReg(accumulator), tmp, true)) + case ssa.AtomicRmwOpOr: + m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeOr, newOperandReg(accumulator), tmp, true)) + case ssa.AtomicRmwOpXor: + m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeXor, newOperandReg(accumulator), tmp, true)) + default: + panic("BUG") + } + // Finally, try compare-exchange the value at the memory location with the tmp. + m.insert(m.allocateInstr().asLockCmpXCHG(tmp, memOp.addressMode(), byte(size))) + // If it succeeds, ZF will be set, and we can break the loop. + m.insert(m.allocateInstr().asJmpIf(condNZ, newOperandLabel(beginLoopLabel))) + } + + // valCopied must be alive at the end of the loop. + m.insert(m.allocateInstr().asNopUseReg(valCopied)) + + // At this point, accumulator contains the result. + m.clearHigherBitsForAtomic(accumulator, size, ret.Type()) + m.copyTo(accumulator, m.c.VRegOf(ret)) + + case ssa.AtomicRmwOpXchg: + valCopied := m.copyToTmp(_val.reg()) + + m.insert(m.allocateInstr().asXCHG(valCopied, newOperandMem(mem), byte(size))) + m.clearHigherBitsForAtomic(valCopied, size, ret.Type()) + m.copyTo(valCopied, m.c.VRegOf(ret)) + + default: + panic("BUG") + } +} + +func (m *machine) lowerAtomicCas(addr, exp, repl ssa.Value, size uint64, ret ssa.Value) { + mem := m.lowerToAddressMode(addr, 0) + expOp := m.getOperand_Reg(m.c.ValueDefinition(exp)) + replOp := m.getOperand_Reg(m.c.ValueDefinition(repl)) + + accumulator := raxVReg + m.copyTo(expOp.reg(), accumulator) + m.insert(m.allocateInstr().asLockCmpXCHG(replOp.reg(), mem, byte(size))) + m.clearHigherBitsForAtomic(accumulator, size, ret.Type()) + m.copyTo(accumulator, m.c.VRegOf(ret)) +} + +func (m *machine) clearHigherBitsForAtomic(r regalloc.VReg, valSize uint64, resultType ssa.Type) { + switch resultType { + case ssa.TypeI32: + switch valSize { + case 1: + m.insert(m.allocateInstr().asMovzxRmR(extModeBL, newOperandReg(r), r)) + case 2: + m.insert(m.allocateInstr().asMovzxRmR(extModeWL, newOperandReg(r), r)) + } + case ssa.TypeI64: + switch valSize { + case 1: + m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(r), r)) + case 2: + m.insert(m.allocateInstr().asMovzxRmR(extModeWQ, newOperandReg(r), r)) + case 4: + m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, newOperandReg(r), r)) + } + } +} + +func (m *machine) lowerFcmp(instr *ssa.Instruction) { + f1, f2, and := m.lowerFcmpToFlags(instr) + rd := m.c.VRegOf(instr.Return()) + if f2 == condInvalid { + tmp := m.c.AllocateVReg(ssa.TypeI32) + m.insert(m.allocateInstr().asSetcc(f1, tmp)) + // On amd64, setcc only sets the first byte of the register, so we need to zero extend it to match + // the semantics of Icmp that sets either 0 or 1. 
+ m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(tmp), rd)) + } else { + tmp1, tmp2 := m.c.AllocateVReg(ssa.TypeI32), m.c.AllocateVReg(ssa.TypeI32) + m.insert(m.allocateInstr().asSetcc(f1, tmp1)) + m.insert(m.allocateInstr().asSetcc(f2, tmp2)) + var op aluRmiROpcode + if and { + op = aluRmiROpcodeAnd + } else { + op = aluRmiROpcodeOr + } + m.insert(m.allocateInstr().asAluRmiR(op, newOperandReg(tmp1), tmp2, false)) + m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(tmp2), rd)) + } +} + +func (m *machine) lowerIcmp(instr *ssa.Instruction) { + x, y, c := instr.IcmpData() + m.lowerIcmpToFlag(m.c.ValueDefinition(x), m.c.ValueDefinition(y), x.Type() == ssa.TypeI64) + rd := m.c.VRegOf(instr.Return()) + tmp := m.c.AllocateVReg(ssa.TypeI32) + m.insert(m.allocateInstr().asSetcc(condFromSSAIntCmpCond(c), tmp)) + // On amd64, setcc only sets the first byte of the register, so we need to zero extend it to match + // the semantics of Icmp that sets either 0 or 1. + m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(tmp), rd)) +} + +func (m *machine) lowerSelect(x, y, cval, ret ssa.Value) { + xo, yo := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)), m.getOperand_Reg(m.c.ValueDefinition(y)) + rd := m.c.VRegOf(ret) + + var cond cond + cvalDef := m.c.ValueDefinition(cval) + switch m.c.MatchInstrOneOf(cvalDef, condBranchMatches[:]) { + case ssa.OpcodeIcmp: + icmp := cvalDef.Instr + xc, yc, cc := icmp.IcmpData() + m.lowerIcmpToFlag(m.c.ValueDefinition(xc), m.c.ValueDefinition(yc), xc.Type() == ssa.TypeI64) + cond = condFromSSAIntCmpCond(cc) + icmp.Lowered() + default: // TODO: match ssa.OpcodeFcmp for optimization, but seems a bit complex. + cv := m.getOperand_Reg(cvalDef) + test := m.allocateInstr().asCmpRmiR(false, cv, cv.reg(), false) + m.insert(test) + cond = condNZ + } + + if typ := x.Type(); typ.IsInt() { + _64 := typ.Bits() == 64 + mov := m.allocateInstr() + tmp := m.c.AllocateVReg(typ) + switch yo.kind { + case operandKindReg: + mov.asMovRR(yo.reg(), tmp, _64) + case operandKindMem: + if _64 { + mov.asMov64MR(yo, tmp) + } else { + mov.asMovzxRmR(extModeLQ, yo, tmp) + } + default: + panic("BUG") + } + m.insert(mov) + cmov := m.allocateInstr().asCmove(cond, xo, tmp, _64) + m.insert(cmov) + m.insert(m.allocateInstr().asMovRR(tmp, rd, _64)) + } else { + mov := m.allocateInstr() + tmp := m.c.AllocateVReg(typ) + switch typ { + case ssa.TypeF32: + mov.asXmmUnaryRmR(sseOpcodeMovss, yo, tmp) + case ssa.TypeF64: + mov.asXmmUnaryRmR(sseOpcodeMovsd, yo, tmp) + case ssa.TypeV128: + mov.asXmmUnaryRmR(sseOpcodeMovdqu, yo, tmp) + default: + panic("BUG") + } + m.insert(mov) + + cmov := m.allocateInstr().asXmmCMov(cond, xo, tmp, typ.Size()) + m.insert(cmov) + + m.copyTo(tmp, rd) + } +} + +func (m *machine) lowerXmmCmovAfterRegAlloc(i *instruction) { + x := i.op1 + rd := i.op2.reg() + cond := cond(i.u1) + + jcc := m.allocateInstr() + m.insert(jcc) + + mov := m.allocateInstr() + switch i.u2 { + case 4: + mov.asXmmUnaryRmR(sseOpcodeMovss, x, rd) + case 8: + mov.asXmmUnaryRmR(sseOpcodeMovsd, x, rd) + case 16: + mov.asXmmUnaryRmR(sseOpcodeMovdqu, x, rd) + default: + panic("BUG") + } + m.insert(mov) + + nop, end := m.allocateBrTarget() + m.insert(nop) + jcc.asJmpIf(cond.invert(), newOperandLabel(end)) +} + +func (m *machine) lowerExtend(_arg, ret ssa.Value, from, to byte, signed bool) { + rd0 := m.c.VRegOf(ret) + arg := m.getOperand_Mem_Reg(m.c.ValueDefinition(_arg)) + + rd := m.c.AllocateVReg(ret.Type()) + + ext := m.allocateInstr() + switch { + case from == 8 && to == 16 && signed: + 
ext.asMovsxRmR(extModeBQ, arg, rd) + case from == 8 && to == 16 && !signed: + ext.asMovzxRmR(extModeBL, arg, rd) + case from == 8 && to == 32 && signed: + ext.asMovsxRmR(extModeBL, arg, rd) + case from == 8 && to == 32 && !signed: + ext.asMovzxRmR(extModeBQ, arg, rd) + case from == 8 && to == 64 && signed: + ext.asMovsxRmR(extModeBQ, arg, rd) + case from == 8 && to == 64 && !signed: + ext.asMovzxRmR(extModeBQ, arg, rd) + case from == 16 && to == 32 && signed: + ext.asMovsxRmR(extModeWL, arg, rd) + case from == 16 && to == 32 && !signed: + ext.asMovzxRmR(extModeWL, arg, rd) + case from == 16 && to == 64 && signed: + ext.asMovsxRmR(extModeWQ, arg, rd) + case from == 16 && to == 64 && !signed: + ext.asMovzxRmR(extModeWQ, arg, rd) + case from == 32 && to == 64 && signed: + ext.asMovsxRmR(extModeLQ, arg, rd) + case from == 32 && to == 64 && !signed: + ext.asMovzxRmR(extModeLQ, arg, rd) + default: + panic(fmt.Sprintf("BUG: unhandled extend: from=%d, to=%d, signed=%t", from, to, signed)) + } + m.insert(ext) + + m.copyTo(rd, rd0) +} + +func (m *machine) lowerVconst(dst regalloc.VReg, lo, hi uint64) { + if lo == 0 && hi == 0 { + m.insert(m.allocateInstr().asZeros(dst)) + return + } + + load := m.allocateInstr() + constLabel := m.allocateLabel() + m.consts = append(m.consts, _const{label: constLabel, lo: lo, hi: hi}) + load.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(constLabel.L)), dst) + m.insert(load) +} + +func (m *machine) lowerCtz(instr *ssa.Instruction) { + if m.cpuFeatures.HasExtra(platform.CpuExtraFeatureAmd64ABM) { + m.lowerUnaryRmR(instr, unaryRmROpcodeTzcnt) + } else { + // On processors that do not support TZCNT, the BSF instruction is + // executed instead. The key difference between TZCNT and BSF + // instruction is that if source operand is zero, the content of + // destination operand is undefined. + // https://www.felixcloutier.com/x86/tzcnt.html + + x := instr.Arg() + if !x.Type().IsInt() { + panic("BUG?") + } + _64 := x.Type().Bits() == 64 + + xDef := m.c.ValueDefinition(x) + tmp := m.c.AllocateVReg(x.Type()) + rm := m.getOperand_Reg(xDef) + + // First, we have to check if the target is non-zero. + test := m.allocateInstr() + test.asCmpRmiR(false, rm, rm.reg(), _64) + m.insert(test) + + jmpNz := m.allocateInstr() + m.insert(jmpNz) + + // If the value is zero, we just push the const value. + m.lowerIconst(tmp, uint64(x.Type().Bits()), _64) + + // Now jump right after the non-zero case. + jmpAtEnd := m.allocateInstr() + m.insert(jmpAtEnd) + + // jmpNz target label is set here. + nop, nz := m.allocateBrTarget() + jmpNz.asJmpIf(condNZ, newOperandLabel(nz)) + m.insert(nop) + + // Emit the non-zero case. + bsr := m.allocateInstr() + bsr.asUnaryRmR(unaryRmROpcodeBsf, rm, tmp, _64) + m.insert(bsr) + + // jmpAtEnd target label is set here. + nopEnd, end := m.allocateBrTarget() + jmpAtEnd.asJmp(newOperandLabel(end)) + m.insert(nopEnd) + + m.copyTo(tmp, m.c.VRegOf(instr.Return())) + } +} + +func (m *machine) lowerClz(instr *ssa.Instruction) { + if m.cpuFeatures.HasExtra(platform.CpuExtraFeatureAmd64ABM) { + m.lowerUnaryRmR(instr, unaryRmROpcodeLzcnt) + } else { + // On processors that do not support LZCNT, we combine BSR (calculating + // most significant set bit) with XOR. This logic is described in + // "Replace Raw Assembly Code with Builtin Intrinsics" section in: + // https://developer.apple.com/documentation/apple-silicon/addressing-architectural-differences-in-your-macos-code. 
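+ // For example, for the 32-bit input 0x10, BSR yields 4 (the index of the highest
+ // set bit), and 4 XOR 31 = 27, which is exactly the leading-zero count.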
+ + x := instr.Arg() + if !x.Type().IsInt() { + panic("BUG?") + } + _64 := x.Type().Bits() == 64 + + xDef := m.c.ValueDefinition(x) + rm := m.getOperand_Reg(xDef) + tmp := m.c.AllocateVReg(x.Type()) + + // First, we have to check if the rm is non-zero as BSR is undefined + // on zero. See https://www.felixcloutier.com/x86/bsr. + test := m.allocateInstr() + test.asCmpRmiR(false, rm, rm.reg(), _64) + m.insert(test) + + jmpNz := m.allocateInstr() + m.insert(jmpNz) + + // If the value is zero, we just push the const value. + m.lowerIconst(tmp, uint64(x.Type().Bits()), _64) + + // Now jump right after the non-zero case. + jmpAtEnd := m.allocateInstr() + m.insert(jmpAtEnd) + + // jmpNz target label is set here. + nop, nz := m.allocateBrTarget() + jmpNz.asJmpIf(condNZ, newOperandLabel(nz)) + m.insert(nop) + + // Emit the non-zero case. + bsr := m.allocateInstr() + bsr.asUnaryRmR(unaryRmROpcodeBsr, rm, tmp, _64) + m.insert(bsr) + + // Now we XOR the value with the bit length minus one. + xor := m.allocateInstr() + xor.asAluRmiR(aluRmiROpcodeXor, newOperandImm32(uint32(x.Type().Bits()-1)), tmp, _64) + m.insert(xor) + + // jmpAtEnd target label is set here. + nopEnd, end := m.allocateBrTarget() + jmpAtEnd.asJmp(newOperandLabel(end)) + m.insert(nopEnd) + + m.copyTo(tmp, m.c.VRegOf(instr.Return())) + } +} + +func (m *machine) lowerUnaryRmR(si *ssa.Instruction, op unaryRmROpcode) { + x := si.Arg() + if !x.Type().IsInt() { + panic("BUG?") + } + _64 := x.Type().Bits() == 64 + + xDef := m.c.ValueDefinition(x) + rm := m.getOperand_Mem_Reg(xDef) + rd := m.c.VRegOf(si.Return()) + + instr := m.allocateInstr() + instr.asUnaryRmR(op, rm, rd, _64) + m.insert(instr) +} + +func (m *machine) lowerLoad(ptr ssa.Value, offset uint32, typ ssa.Type, dst regalloc.VReg) { + mem := newOperandMem(m.lowerToAddressMode(ptr, offset)) + load := m.allocateInstr() + switch typ { + case ssa.TypeI32: + load.asMovzxRmR(extModeLQ, mem, dst) + case ssa.TypeI64: + load.asMov64MR(mem, dst) + case ssa.TypeF32: + load.asXmmUnaryRmR(sseOpcodeMovss, mem, dst) + case ssa.TypeF64: + load.asXmmUnaryRmR(sseOpcodeMovsd, mem, dst) + case ssa.TypeV128: + load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, dst) + default: + panic("BUG") + } + m.insert(load) +} + +func (m *machine) lowerExtLoad(op ssa.Opcode, ptr ssa.Value, offset uint32, dst regalloc.VReg) { + mem := newOperandMem(m.lowerToAddressMode(ptr, offset)) + load := m.allocateInstr() + switch op { + case ssa.OpcodeUload8: + load.asMovzxRmR(extModeBQ, mem, dst) + case ssa.OpcodeUload16: + load.asMovzxRmR(extModeWQ, mem, dst) + case ssa.OpcodeUload32: + load.asMovzxRmR(extModeLQ, mem, dst) + case ssa.OpcodeSload8: + load.asMovsxRmR(extModeBQ, mem, dst) + case ssa.OpcodeSload16: + load.asMovsxRmR(extModeWQ, mem, dst) + case ssa.OpcodeSload32: + load.asMovsxRmR(extModeLQ, mem, dst) + default: + panic("BUG") + } + m.insert(load) +} + +func (m *machine) lowerExitIfTrueWithCode(execCtx regalloc.VReg, cond ssa.Value, code wazevoapi.ExitCode) { + condDef := m.c.ValueDefinition(cond) + if !m.c.MatchInstr(condDef, ssa.OpcodeIcmp) { + panic("TODO: ExitIfTrue must come after Icmp at the moment: " + condDef.Instr.Opcode().String()) + } + cvalInstr := condDef.Instr + cvalInstr.MarkLowered() + + // We need to copy the execution context to a temp register, because if it's spilled, + // it might end up being reloaded inside the exiting branch. 
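+ // Below, the comparison sets the flags and the conditional jump is emitted with
+ // the inverted condition, so that the exit sequence is skipped (by jumping to its
+ // continuation label) whenever the original condition does not hold.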
+ execCtxTmp := m.copyToTmp(execCtx) + + x, y, c := cvalInstr.IcmpData() + xx, yy := m.c.ValueDefinition(x), m.c.ValueDefinition(y) + if !m.tryLowerBandToFlag(xx, yy) { + m.lowerIcmpToFlag(xx, yy, x.Type() == ssa.TypeI64) + } + + jmpIf := m.allocateInstr() + m.insert(jmpIf) + l := m.lowerExitWithCode(execCtxTmp, code) + jmpIf.asJmpIf(condFromSSAIntCmpCond(c).invert(), newOperandLabel(l)) +} + +func (m *machine) tryLowerBandToFlag(x, y *backend.SSAValueDefinition) (ok bool) { + var target *backend.SSAValueDefinition + if x.IsFromInstr() && x.Instr.Constant() && x.Instr.ConstantVal() == 0 { + if m.c.MatchInstr(y, ssa.OpcodeBand) { + target = y + } + } + + if y.IsFromInstr() && y.Instr.Constant() && y.Instr.ConstantVal() == 0 { + if m.c.MatchInstr(x, ssa.OpcodeBand) { + target = x + } + } + + if target == nil { + return false + } + + bandInstr := target.Instr + bandX, bandY := bandInstr.Arg2() + + xx := m.getOperand_Reg(m.c.ValueDefinition(bandX)) + yy := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(bandY)) + test := m.allocateInstr().asCmpRmiR(false, yy, xx.reg(), bandX.Type() == ssa.TypeI64) + m.insert(test) + bandInstr.MarkLowered() + return true +} + +func (m *machine) allocateExitInstructions(execCtx, exitCodeReg regalloc.VReg) (saveRsp, saveRbp, setExitCode *instruction) { + saveRsp = m.allocateInstr().asMovRM( + rspVReg, + newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.U32(), execCtx)), + 8, + ) + + saveRbp = m.allocateInstr().asMovRM( + rbpVReg, + newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetFramePointerBeforeGoCall.U32(), execCtx)), + 8, + ) + setExitCode = m.allocateInstr().asMovRM( + exitCodeReg, + newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetExitCodeOffset.U32(), execCtx)), + 4, + ) + return +} + +func (m *machine) lowerExitWithCode(execCtx regalloc.VReg, code wazevoapi.ExitCode) (afterLabel backend.Label) { + exitCodeReg := rbpVReg + saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtx, exitCodeReg) + + // Set save RSP, RBP, and write exit code. + m.insert(saveRsp) + m.insert(saveRbp) + m.lowerIconst(exitCodeReg, uint64(code), false) + m.insert(setExitCode) + + ripReg := rbpVReg + + // Next is to save the current address for stack unwinding. + nop, currentAddrLabel := m.allocateBrTarget() + m.insert(nop) + readRip := m.allocateInstr().asLEA(newOperandLabel(currentAddrLabel), ripReg) + m.insert(readRip) + saveRip := m.allocateInstr().asMovRM( + ripReg, + newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetGoCallReturnAddress.U32(), execCtx)), + 8, + ) + m.insert(saveRip) + + // Finally exit. + exitSq := m.allocateExitSeq(execCtx) + m.insert(exitSq) + + // Return the label for continuation. + continuation, afterLabel := m.allocateBrTarget() + m.insert(continuation) + return afterLabel +} + +func (m *machine) lowerAluRmiROp(si *ssa.Instruction, op aluRmiROpcode) { + x, y := si.Arg2() + if !x.Type().IsInt() { + panic("BUG?") + } + + _64 := x.Type().Bits() == 64 + + xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) + + // TODO: commutative args can be swapped if one of them is an immediate. + rn := m.getOperand_Reg(xDef) + rm := m.getOperand_Mem_Imm32_Reg(yDef) + rd := m.c.VRegOf(si.Return()) + + // rn is being overwritten, so we first copy its value to a temp register, + // in case it is referenced again later. 
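+ // (x86 ALU instructions are two-operand: the destination register is also a
+ // source, so the operation is performed on a fresh temporary instead of on rn.)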
+ tmp := m.copyToTmp(rn.reg()) + + alu := m.allocateInstr() + alu.asAluRmiR(op, rm, tmp, _64) + m.insert(alu) + + // tmp now contains the result, we copy it to the dest register. + m.copyTo(tmp, rd) +} + +func (m *machine) lowerShiftR(si *ssa.Instruction, op shiftROp) { + x, amt := si.Arg2() + if !x.Type().IsInt() { + panic("BUG?") + } + _64 := x.Type().Bits() == 64 + + xDef, amtDef := m.c.ValueDefinition(x), m.c.ValueDefinition(amt) + + opAmt := m.getOperand_Imm32_Reg(amtDef) + rx := m.getOperand_Reg(xDef) + rd := m.c.VRegOf(si.Return()) + + // rx is being overwritten, so we first copy its value to a temp register, + // in case it is referenced again later. + tmpDst := m.copyToTmp(rx.reg()) + + if opAmt.kind == operandKindReg { + // If opAmt is a register we must copy its value to rcx, + // because shiftR encoding mandates that the shift amount is in rcx. + m.copyTo(opAmt.reg(), rcxVReg) + + alu := m.allocateInstr() + alu.asShiftR(op, newOperandReg(rcxVReg), tmpDst, _64) + m.insert(alu) + + } else { + alu := m.allocateInstr() + alu.asShiftR(op, opAmt, tmpDst, _64) + m.insert(alu) + } + + // tmp now contains the result, we copy it to the dest register. + m.copyTo(tmpDst, rd) +} + +func (m *machine) lowerXmmRmR(instr *ssa.Instruction) { + x, y := instr.Arg2() + if !x.Type().IsFloat() { + panic("BUG?") + } + _64 := x.Type().Bits() == 64 + + var op sseOpcode + if _64 { + switch instr.Opcode() { + case ssa.OpcodeFadd: + op = sseOpcodeAddsd + case ssa.OpcodeFsub: + op = sseOpcodeSubsd + case ssa.OpcodeFmul: + op = sseOpcodeMulsd + case ssa.OpcodeFdiv: + op = sseOpcodeDivsd + default: + panic("BUG") + } + } else { + switch instr.Opcode() { + case ssa.OpcodeFadd: + op = sseOpcodeAddss + case ssa.OpcodeFsub: + op = sseOpcodeSubss + case ssa.OpcodeFmul: + op = sseOpcodeMulss + case ssa.OpcodeFdiv: + op = sseOpcodeDivss + default: + panic("BUG") + } + } + + xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) + rn := m.getOperand_Reg(yDef) + rm := m.getOperand_Reg(xDef) + rd := m.c.VRegOf(instr.Return()) + + // rm is being overwritten, so we first copy its value to a temp register, + // in case it is referenced again later. 
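+ // (tmp starts as a copy of x and the SSE instruction computes tmp = tmp <op> y,
+ // which keeps the operand order correct for the non-commutative Fsub and Fdiv.)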
+ tmp := m.copyToTmp(rm.reg()) + + xmm := m.allocateInstr().asXmmRmR(op, rn, tmp) + m.insert(xmm) + + m.copyTo(tmp, rd) +} + +func (m *machine) lowerSqrt(instr *ssa.Instruction) { + x := instr.Arg() + if !x.Type().IsFloat() { + panic("BUG") + } + _64 := x.Type().Bits() == 64 + var op sseOpcode + if _64 { + op = sseOpcodeSqrtsd + } else { + op = sseOpcodeSqrtss + } + + xDef := m.c.ValueDefinition(x) + rm := m.getOperand_Mem_Reg(xDef) + rd := m.c.VRegOf(instr.Return()) + + xmm := m.allocateInstr().asXmmUnaryRmR(op, rm, rd) + m.insert(xmm) +} + +func (m *machine) lowerFabsFneg(instr *ssa.Instruction) { + x := instr.Arg() + if !x.Type().IsFloat() { + panic("BUG") + } + _64 := x.Type().Bits() == 64 + var op sseOpcode + var mask uint64 + if _64 { + switch instr.Opcode() { + case ssa.OpcodeFabs: + mask, op = 0x7fffffffffffffff, sseOpcodeAndpd + case ssa.OpcodeFneg: + mask, op = 0x8000000000000000, sseOpcodeXorpd + } + } else { + switch instr.Opcode() { + case ssa.OpcodeFabs: + mask, op = 0x7fffffff, sseOpcodeAndps + case ssa.OpcodeFneg: + mask, op = 0x80000000, sseOpcodeXorps + } + } + + tmp := m.c.AllocateVReg(x.Type()) + + xDef := m.c.ValueDefinition(x) + rm := m.getOperand_Reg(xDef) + rd := m.c.VRegOf(instr.Return()) + + m.lowerFconst(tmp, mask, _64) + + xmm := m.allocateInstr().asXmmRmR(op, rm, tmp) + m.insert(xmm) + + m.copyTo(tmp, rd) +} + +func (m *machine) lowerStore(si *ssa.Instruction) { + value, ptr, offset, storeSizeInBits := si.StoreData() + rm := m.getOperand_Reg(m.c.ValueDefinition(value)) + mem := newOperandMem(m.lowerToAddressMode(ptr, offset)) + + store := m.allocateInstr() + switch value.Type() { + case ssa.TypeI32: + store.asMovRM(rm.reg(), mem, storeSizeInBits/8) + case ssa.TypeI64: + store.asMovRM(rm.reg(), mem, storeSizeInBits/8) + case ssa.TypeF32: + store.asXmmMovRM(sseOpcodeMovss, rm.reg(), mem) + case ssa.TypeF64: + store.asXmmMovRM(sseOpcodeMovsd, rm.reg(), mem) + case ssa.TypeV128: + store.asXmmMovRM(sseOpcodeMovdqu, rm.reg(), mem) + default: + panic("BUG") + } + m.insert(store) +} + +func (m *machine) lowerCall(si *ssa.Instruction) { + isDirectCall := si.Opcode() == ssa.OpcodeCall + var indirectCalleePtr ssa.Value + var directCallee ssa.FuncRef + var sigID ssa.SignatureID + var args []ssa.Value + var isMemmove bool + if isDirectCall { + directCallee, sigID, args = si.CallData() + } else { + indirectCalleePtr, sigID, args, isMemmove = si.CallIndirectData() + } + calleeABI := m.c.GetFunctionABI(m.c.SSABuilder().ResolveSignature(sigID)) + + stackSlotSize := int64(calleeABI.AlignedArgResultStackSlotSize()) + if m.maxRequiredStackSizeForCalls < stackSlotSize+16 { + m.maxRequiredStackSizeForCalls = stackSlotSize + 16 // 16 == return address + RBP. + } + + // Note: See machine.SetupPrologue for the stack layout. + // The stack pointer decrease/increase will be inserted later in the compilation. + + for i, arg := range args { + reg := m.c.VRegOf(arg) + def := m.c.ValueDefinition(arg) + m.callerGenVRegToFunctionArg(calleeABI, i, reg, def, stackSlotSize) + } + + if isMemmove { + // Go's memmove *might* use all xmm0-xmm15, so we need to release them. 
+ // https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/cmd/compile/abi-internal.md#architecture-specifics + // https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/runtime/memmove_amd64.s#L271-L286 + for i := regalloc.RealReg(0); i < 16; i++ { + m.insert(m.allocateInstr().asDefineUninitializedReg(regInfo.RealRegToVReg[xmm0+i])) + } + } + + if isDirectCall { + call := m.allocateInstr().asCall(directCallee, calleeABI) + m.insert(call) + } else { + ptrOp := m.getOperand_Mem_Reg(m.c.ValueDefinition(indirectCalleePtr)) + callInd := m.allocateInstr().asCallIndirect(ptrOp, calleeABI) + m.insert(callInd) + } + + if isMemmove { + for i := regalloc.RealReg(0); i < 16; i++ { + m.insert(m.allocateInstr().asNopUseReg(regInfo.RealRegToVReg[xmm0+i])) + } + } + + var index int + r1, rs := si.Returns() + if r1.Valid() { + m.callerGenFunctionReturnVReg(calleeABI, 0, m.c.VRegOf(r1), stackSlotSize) + index++ + } + + for _, r := range rs { + m.callerGenFunctionReturnVReg(calleeABI, index, m.c.VRegOf(r), stackSlotSize) + index++ + } +} + +// callerGenVRegToFunctionArg is the opposite of GenFunctionArgToVReg, which is used to generate the +// caller side of the function call. +func (m *machine) callerGenVRegToFunctionArg(a *backend.FunctionABI, argIndex int, reg regalloc.VReg, def *backend.SSAValueDefinition, stackSlotSize int64) { + arg := &a.Args[argIndex] + if def != nil && def.IsFromInstr() { + // Constant instructions are inlined. + if inst := def.Instr; inst.Constant() { + m.insertLoadConstant(inst, reg) + } + } + if arg.Kind == backend.ABIArgKindReg { + m.InsertMove(arg.Reg, reg, arg.Type) + } else { + store := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmReg( + // -stackSlotSize because the stack pointer is not yet decreased. + uint32(arg.Offset-stackSlotSize), rspVReg)) + switch arg.Type { + case ssa.TypeI32: + store.asMovRM(reg, mem, 4) + case ssa.TypeI64: + store.asMovRM(reg, mem, 8) + case ssa.TypeF32: + store.asXmmMovRM(sseOpcodeMovss, reg, mem) + case ssa.TypeF64: + store.asXmmMovRM(sseOpcodeMovsd, reg, mem) + case ssa.TypeV128: + store.asXmmMovRM(sseOpcodeMovdqu, reg, mem) + default: + panic("BUG") + } + m.insert(store) + } +} + +func (m *machine) callerGenFunctionReturnVReg(a *backend.FunctionABI, retIndex int, reg regalloc.VReg, stackSlotSize int64) { + r := &a.Rets[retIndex] + if r.Kind == backend.ABIArgKindReg { + m.InsertMove(reg, r.Reg, r.Type) + } else { + load := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmReg( + // -stackSlotSize because the stack pointer is not yet decreased. + uint32(a.ArgStackSize+r.Offset-stackSlotSize), rspVReg)) + switch r.Type { + case ssa.TypeI32: + load.asMovzxRmR(extModeLQ, mem, reg) + case ssa.TypeI64: + load.asMov64MR(mem, reg) + case ssa.TypeF32: + load.asXmmUnaryRmR(sseOpcodeMovss, mem, reg) + case ssa.TypeF64: + load.asXmmUnaryRmR(sseOpcodeMovsd, mem, reg) + case ssa.TypeV128: + load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, reg) + default: + panic("BUG") + } + m.insert(load) + } +} + +// InsertMove implements backend.Machine. 
+func (m *machine) InsertMove(dst, src regalloc.VReg, typ ssa.Type) { + switch typ { + case ssa.TypeI32, ssa.TypeI64: + i := m.allocateInstr().asMovRR(src, dst, typ.Bits() == 64) + m.insert(i) + case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: + var op sseOpcode + switch typ { + case ssa.TypeF32: + op = sseOpcodeMovss + case ssa.TypeF64: + op = sseOpcodeMovsd + case ssa.TypeV128: + op = sseOpcodeMovdqa + } + i := m.allocateInstr().asXmmUnaryRmR(op, newOperandReg(src), dst) + m.insert(i) + default: + panic("BUG") + } +} + +// Format implements backend.Machine. +func (m *machine) Format() string { + ectx := m.ectx + begins := map[*instruction]backend.Label{} + for l, pos := range ectx.LabelPositions { + begins[pos.Begin] = l + } + + irBlocks := map[backend.Label]ssa.BasicBlockID{} + for i, l := range ectx.SsaBlockIDToLabels { + irBlocks[l] = ssa.BasicBlockID(i) + } + + var lines []string + for cur := ectx.RootInstr; cur != nil; cur = cur.next { + if l, ok := begins[cur]; ok { + var labelStr string + if blkID, ok := irBlocks[l]; ok { + labelStr = fmt.Sprintf("%s (SSA Block: %s):", l, blkID) + } else { + labelStr = fmt.Sprintf("%s:", l) + } + lines = append(lines, labelStr) + } + if cur.kind == nop0 { + continue + } + lines = append(lines, "\t"+cur.String()) + } + for _, vc := range m.consts { + if vc._var == nil { + lines = append(lines, fmt.Sprintf("%s: const [%d %d]", vc.label.L, vc.lo, vc.hi)) + } else { + lines = append(lines, fmt.Sprintf("%s: const %#x", vc.label.L, vc._var)) + } + } + return "\n" + strings.Join(lines, "\n") + "\n" +} + +func (m *machine) encodeWithoutSSA(root *instruction) { + m.labelResolutionPends = m.labelResolutionPends[:0] + ectx := m.ectx + + bufPtr := m.c.BufPtr() + for cur := root; cur != nil; cur = cur.next { + offset := int64(len(*bufPtr)) + if cur.kind == nop0 { + l := cur.nop0Label() + if pos, ok := ectx.LabelPositions[l]; ok { + pos.BinaryOffset = offset + } + } + + needLabelResolution := cur.encode(m.c) + if needLabelResolution { + m.labelResolutionPends = append(m.labelResolutionPends, + labelResolutionPend{instr: cur, imm32Offset: int64(len(*bufPtr)) - 4}, + ) + } + } + + for i := range m.labelResolutionPends { + p := &m.labelResolutionPends[i] + switch p.instr.kind { + case jmp, jmpIf, lea: + target := p.instr.jmpLabel() + targetOffset := ectx.LabelPositions[target].BinaryOffset + imm32Offset := p.imm32Offset + jmpOffset := int32(targetOffset - (p.imm32Offset + 4)) // +4 because RIP points to the next instruction. + binary.LittleEndian.PutUint32((*bufPtr)[imm32Offset:], uint32(jmpOffset)) + default: + panic("BUG") + } + } +} + +// Encode implements backend.Machine Encode. 
+func (m *machine) Encode(ctx context.Context) (err error) { + ectx := m.ectx + bufPtr := m.c.BufPtr() + + var fn string + var fnIndex int + var labelToSSABlockID map[backend.Label]ssa.BasicBlockID + if wazevoapi.PerfMapEnabled { + fn = wazevoapi.GetCurrentFunctionName(ctx) + labelToSSABlockID = make(map[backend.Label]ssa.BasicBlockID) + for i, l := range ectx.SsaBlockIDToLabels { + labelToSSABlockID[l] = ssa.BasicBlockID(i) + } + fnIndex = wazevoapi.GetCurrentFunctionIndex(ctx) + } + + m.labelResolutionPends = m.labelResolutionPends[:0] + for _, pos := range ectx.OrderedBlockLabels { + offset := int64(len(*bufPtr)) + pos.BinaryOffset = offset + for cur := pos.Begin; cur != pos.End.next; cur = cur.next { + offset := int64(len(*bufPtr)) + + switch cur.kind { + case nop0: + l := cur.nop0Label() + if pos, ok := ectx.LabelPositions[l]; ok { + pos.BinaryOffset = offset + } + case sourceOffsetInfo: + m.c.AddSourceOffsetInfo(offset, cur.sourceOffsetInfo()) + } + + needLabelResolution := cur.encode(m.c) + if needLabelResolution { + m.labelResolutionPends = append(m.labelResolutionPends, + labelResolutionPend{instr: cur, instrOffset: offset, imm32Offset: int64(len(*bufPtr)) - 4}, + ) + } + } + + if wazevoapi.PerfMapEnabled { + l := pos.L + var labelStr string + if blkID, ok := labelToSSABlockID[l]; ok { + labelStr = fmt.Sprintf("%s::SSA_Block[%s]", l, blkID) + } else { + labelStr = l.String() + } + size := int64(len(*bufPtr)) - offset + wazevoapi.PerfMap.AddModuleEntry(fnIndex, offset, uint64(size), fmt.Sprintf("%s:::::%s", fn, labelStr)) + } + } + + for i := range m.consts { + offset := int64(len(*bufPtr)) + vc := &m.consts[i] + vc.label.BinaryOffset = offset + if vc._var == nil { + lo, hi := vc.lo, vc.hi + m.c.Emit8Bytes(lo) + m.c.Emit8Bytes(hi) + } else { + for _, b := range vc._var { + m.c.EmitByte(b) + } + } + } + + buf := *bufPtr + for i := range m.labelResolutionPends { + p := &m.labelResolutionPends[i] + switch p.instr.kind { + case jmp, jmpIf, lea, xmmUnaryRmR: + target := p.instr.jmpLabel() + targetOffset := ectx.LabelPositions[target].BinaryOffset + imm32Offset := p.imm32Offset + jmpOffset := int32(targetOffset - (p.imm32Offset + 4)) // +4 because RIP points to the next instruction. + binary.LittleEndian.PutUint32(buf[imm32Offset:], uint32(jmpOffset)) + case jmpTableIsland: + tableBegin := p.instrOffset + // Each entry is the offset from the beginning of the jmpTableIsland instruction in 8 bytes. + targets := m.jmpTableTargets[p.instr.u1] + for i, l := range targets { + targetOffset := ectx.LabelPositions[backend.Label(l)].BinaryOffset + jmpOffset := targetOffset - tableBegin + binary.LittleEndian.PutUint64(buf[tableBegin+int64(i)*8:], uint64(jmpOffset)) + } + default: + panic("BUG") + } + } + return +} + +// ResolveRelocations implements backend.Machine. +func (m *machine) ResolveRelocations(refToBinaryOffset []int, binary []byte, relocations []backend.RelocationInfo, _ []int) { + for _, r := range relocations { + offset := r.Offset + calleeFnOffset := refToBinaryOffset[r.FuncRef] + // offset is the offset of the last 4 bytes of the call instruction. + callInstrOffsetBytes := binary[offset : offset+4] + diff := int64(calleeFnOffset) - (offset + 4) // +4 because we want the offset of the next instruction (In x64, RIP always points to the next instruction). 
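+ // Patch the 4-byte relative offset of the call in place, least significant byte
+ // first, since x86-64 immediates are little-endian.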
+ callInstrOffsetBytes[0] = byte(diff) + callInstrOffsetBytes[1] = byte(diff >> 8) + callInstrOffsetBytes[2] = byte(diff >> 16) + callInstrOffsetBytes[3] = byte(diff >> 24) + } +} + +// CallTrampolineIslandInfo implements backend.Machine CallTrampolineIslandInfo. +func (m *machine) CallTrampolineIslandInfo(_ int) (_, _ int, _ error) { return } + +func (m *machine) lowerIcmpToFlag(xd, yd *backend.SSAValueDefinition, _64 bool) { + x := m.getOperand_Reg(xd) + y := m.getOperand_Mem_Imm32_Reg(yd) + cmp := m.allocateInstr().asCmpRmiR(true, y, x.reg(), _64) + m.insert(cmp) +} + +func (m *machine) lowerFcmpToFlags(instr *ssa.Instruction) (f1, f2 cond, and bool) { + x, y, c := instr.FcmpData() + switch c { + case ssa.FloatCmpCondEqual: + f1, f2 = condNP, condZ + and = true + case ssa.FloatCmpCondNotEqual: + f1, f2 = condP, condNZ + case ssa.FloatCmpCondLessThan: + f1 = condFromSSAFloatCmpCond(ssa.FloatCmpCondGreaterThan) + f2 = condInvalid + x, y = y, x + case ssa.FloatCmpCondLessThanOrEqual: + f1 = condFromSSAFloatCmpCond(ssa.FloatCmpCondGreaterThanOrEqual) + f2 = condInvalid + x, y = y, x + default: + f1 = condFromSSAFloatCmpCond(c) + f2 = condInvalid + } + + var opc sseOpcode + if x.Type() == ssa.TypeF32 { + opc = sseOpcodeUcomiss + } else { + opc = sseOpcodeUcomisd + } + + xr := m.getOperand_Reg(m.c.ValueDefinition(x)) + yr := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + m.insert(m.allocateInstr().asXmmCmpRmR(opc, yr, xr.reg())) + return +} + +// allocateInstr allocates an instruction. +func (m *machine) allocateInstr() *instruction { + instr := m.ectx.InstructionPool.Allocate() + if !m.regAllocStarted { + instr.addedBeforeRegAlloc = true + } + return instr +} + +func (m *machine) allocateNop() *instruction { + instr := m.allocateInstr() + instr.kind = nop0 + return instr +} + +func (m *machine) insert(i *instruction) { + ectx := m.ectx + ectx.PendingInstructions = append(ectx.PendingInstructions, i) +} + +func (m *machine) allocateBrTarget() (nop *instruction, l backend.Label) { //nolint + pos := m.allocateLabel() + l = pos.L + nop = m.allocateInstr() + nop.asNop0WithLabel(l) + pos.Begin, pos.End = nop, nop + return +} + +func (m *machine) allocateLabel() *labelPosition { + ectx := m.ectx + l := ectx.AllocateLabel() + pos := ectx.AllocateLabelPosition(l) + ectx.LabelPositions[l] = pos + return pos +} + +func (m *machine) getVRegSpillSlotOffsetFromSP(id regalloc.VRegID, size byte) int64 { + offset, ok := m.spillSlots[id] + if !ok { + offset = m.spillSlotSize + m.spillSlots[id] = offset + m.spillSlotSize += int64(size) + } + return offset +} + +func (m *machine) copyTo(src regalloc.VReg, dst regalloc.VReg) { + mov := m.allocateInstr() + if src.RegType() == regalloc.RegTypeInt { + mov.asMovRR(src, dst, true) + } else { + mov.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(src), dst) + } + m.insert(mov) +} + +func (m *machine) copyToTmp(v regalloc.VReg) regalloc.VReg { + typ := m.c.TypeOf(v) + tmp := m.c.AllocateVReg(typ) + m.copyTo(v, tmp) + return tmp +} + +func (m *machine) requiredStackSize() int64 { + return m.maxRequiredStackSizeForCalls + + m.frameSize() + + 16 + // Need for stack checking. + 16 // return address and the caller RBP. 
+} + +func (m *machine) frameSize() int64 { + s := m.clobberedRegSlotSize() + m.spillSlotSize + if s&0xf != 0 { + panic(fmt.Errorf("BUG: frame size %d is not 16-byte aligned", s)) + } + return s +} + +func (m *machine) clobberedRegSlotSize() int64 { + return int64(len(m.clobberedRegs) * 16) +} + +func (m *machine) lowerIDivRem(si *ssa.Instruction, isDiv bool, signed bool) { + x, y, execCtx := si.Arg3() + + dividend := m.getOperand_Reg(m.c.ValueDefinition(x)) + divisor := m.getOperand_Reg(m.c.ValueDefinition(y)) + ctxVReg := m.c.VRegOf(execCtx) + tmpGp := m.c.AllocateVReg(si.Return().Type()) + + m.copyTo(dividend.reg(), raxVReg) + m.insert(m.allocateInstr().asDefineUninitializedReg(rdxVReg)) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp)) + seq := m.allocateInstr().asIdivRemSequence(ctxVReg, divisor.reg(), tmpGp, isDiv, signed, x.Type().Bits() == 64) + m.insert(seq) + rd := m.c.VRegOf(si.Return()) + if isDiv { + m.copyTo(raxVReg, rd) + } else { + m.copyTo(rdxVReg, rd) + } +} + +func (m *machine) lowerIDivRemSequenceAfterRegAlloc(i *instruction) { + execCtx, divisor, tmpGp, isDiv, signed, _64 := i.idivRemSequenceData() + + dividend := raxVReg + + // Ensure yr is not zero. + test := m.allocateInstr() + test.asCmpRmiR(false, newOperandReg(divisor), divisor, _64) + m.insert(test) + + jnz := m.allocateInstr() + m.insert(jnz) + + nz := m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerDivisionByZero) + + // If not zero, we can proceed with the division. + jnz.asJmpIf(condNZ, newOperandLabel(nz)) + + var ifRemNeg1 *instruction + if signed { + var neg1 uint64 + if _64 { + neg1 = 0xffffffffffffffff + } else { + neg1 = 0xffffffff + } + m.lowerIconst(tmpGp, neg1, _64) + + if isDiv { + // For signed division, we have to have branches for "math.MinInt{32,64} / -1" + // case which results in the floating point exception via division error as + // the resulting value exceeds the maximum of signed int. + + // First, we check if the divisor is -1. + cmp := m.allocateInstr() + cmp.asCmpRmiR(true, newOperandReg(tmpGp), divisor, _64) + m.insert(cmp) + + ifNotNeg1 := m.allocateInstr() + m.insert(ifNotNeg1) + + var minInt uint64 + if _64 { + minInt = 0x8000000000000000 + } else { + minInt = 0x80000000 + } + m.lowerIconst(tmpGp, minInt, _64) + + // Next we check if the quotient is the most negative value for the signed integer, i.e. + // if we are trying to do (math.MinInt32 / -1) or (math.MinInt64 / -1) respectively. + cmp2 := m.allocateInstr() + cmp2.asCmpRmiR(true, newOperandReg(tmpGp), dividend, _64) + m.insert(cmp2) + + ifNotMinInt := m.allocateInstr() + m.insert(ifNotMinInt) + + // Trap if we are trying to do (math.MinInt32 / -1) or (math.MinInt64 / -1), + // as that is the overflow in division as the result becomes 2^31 which is larger than + // the maximum of signed 32-bit int (2^31-1). + end := m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) + ifNotNeg1.asJmpIf(condNZ, newOperandLabel(end)) + ifNotMinInt.asJmpIf(condNZ, newOperandLabel(end)) + } else { + // If it is remainder, zeros DX register and compare the divisor to -1. + xor := m.allocateInstr().asZeros(rdxVReg) + m.insert(xor) + + // We check if the divisor is -1. + cmp := m.allocateInstr() + cmp.asCmpRmiR(true, newOperandReg(tmpGp), divisor, _64) + m.insert(cmp) + + ifRemNeg1 = m.allocateInstr() + m.insert(ifRemNeg1) + } + + // Sign-extend DX register to have 2*x.Type().Bits() dividend over DX and AX registers. 
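+ // On x86-64 this is CDQ (32-bit) or CQO (64-bit): the sign bit of EAX/RAX is
+ // replicated into EDX/RDX so that the signed division sees a double-width dividend.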
+ sed := m.allocateInstr() + sed.asSignExtendData(_64) + m.insert(sed) + } else { + // Zeros DX register to have 2*x.Type().Bits() dividend over DX and AX registers. + zeros := m.allocateInstr().asZeros(rdxVReg) + m.insert(zeros) + } + + div := m.allocateInstr() + div.asDiv(newOperandReg(divisor), signed, _64) + m.insert(div) + + nop, end := m.allocateBrTarget() + m.insert(nop) + // If we are compiling a Rem instruction, when the divisor is -1 we land at the end of the function. + if ifRemNeg1 != nil { + ifRemNeg1.asJmpIf(condZ, newOperandLabel(end)) + } +} + +func (m *machine) lowerRound(instr *ssa.Instruction, imm roundingMode) { + x := instr.Arg() + if !x.Type().IsFloat() { + panic("BUG?") + } + var op sseOpcode + if x.Type().Bits() == 64 { + op = sseOpcodeRoundsd + } else { + op = sseOpcodeRoundss + } + + xDef := m.c.ValueDefinition(x) + rm := m.getOperand_Mem_Reg(xDef) + rd := m.c.VRegOf(instr.Return()) + + xmm := m.allocateInstr().asXmmUnaryRmRImm(op, uint8(imm), rm, rd) + m.insert(xmm) +} + +func (m *machine) lowerFminFmax(instr *ssa.Instruction) { + x, y := instr.Arg2() + if !x.Type().IsFloat() { + panic("BUG?") + } + + _64 := x.Type().Bits() == 64 + isMin := instr.Opcode() == ssa.OpcodeFmin + var minMaxOp sseOpcode + + switch { + case _64 && isMin: + minMaxOp = sseOpcodeMinpd + case _64 && !isMin: + minMaxOp = sseOpcodeMaxpd + case !_64 && isMin: + minMaxOp = sseOpcodeMinps + case !_64 && !isMin: + minMaxOp = sseOpcodeMaxps + } + + xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) + rm := m.getOperand_Reg(xDef) + // We cannot ensure that y is aligned to 16 bytes, so we have to use it on reg. + rn := m.getOperand_Reg(yDef) + rd := m.c.VRegOf(instr.Return()) + + tmp := m.copyToTmp(rm.reg()) + + // Check if this is (either x1 or x2 is NaN) or (x1 equals x2) case. + cmp := m.allocateInstr() + if _64 { + cmp.asXmmCmpRmR(sseOpcodeUcomisd, rn, tmp) + } else { + cmp.asXmmCmpRmR(sseOpcodeUcomiss, rn, tmp) + } + m.insert(cmp) + + // At this point, we have the three cases of conditional flags below + // (See https://www.felixcloutier.com/x86/ucomiss#operation for detail.) + // + // 1) Two values are NaN-free and different: All flags are cleared. + // 2) Two values are NaN-free and equal: Only ZF flags is set. + // 3) One of Two values is NaN: ZF, PF and CF flags are set. + + // Jump instruction to handle 1) case by checking the ZF flag + // as ZF is only set for 2) and 3) cases. + nanFreeOrDiffJump := m.allocateInstr() + m.insert(nanFreeOrDiffJump) + + // Start handling 2) and 3). + + // Jump if one of two values is NaN by checking the parity flag (PF). + ifIsNan := m.allocateInstr() + m.insert(ifIsNan) + + // Start handling 2) NaN-free and equal. + + // Before we exit this case, we have to ensure that positive zero (or negative zero for min instruction) is + // returned if two values are positive and negative zeros. + var op sseOpcode + switch { + case !_64 && isMin: + op = sseOpcodeOrps + case _64 && isMin: + op = sseOpcodeOrpd + case !_64 && !isMin: + op = sseOpcodeAndps + case _64 && !isMin: + op = sseOpcodeAndpd + } + orAnd := m.allocateInstr() + orAnd.asXmmRmR(op, rn, tmp) + m.insert(orAnd) + + // Done, jump to end. + sameExitJump := m.allocateInstr() + m.insert(sameExitJump) + + // Start handling 3) either is NaN. + isNanTarget, isNan := m.allocateBrTarget() + m.insert(isNanTarget) + ifIsNan.asJmpIf(condP, newOperandLabel(isNan)) + + // We emit the ADD instruction to produce the NaN in tmp. 
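+ // (Per IEEE 754, an addition with a NaN operand produces a quiet NaN, so tmp ends
+ // up holding NaN regardless of which of the two inputs was the NaN.)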
+ add := m.allocateInstr() + if _64 { + add.asXmmRmR(sseOpcodeAddsd, rn, tmp) + } else { + add.asXmmRmR(sseOpcodeAddss, rn, tmp) + } + m.insert(add) + + // Exit from the NaN case branch. + nanExitJmp := m.allocateInstr() + m.insert(nanExitJmp) + + // Start handling 1). + doMinMaxTarget, doMinMax := m.allocateBrTarget() + m.insert(doMinMaxTarget) + nanFreeOrDiffJump.asJmpIf(condNZ, newOperandLabel(doMinMax)) + + // Now handle the NaN-free and different values case. + minMax := m.allocateInstr() + minMax.asXmmRmR(minMaxOp, rn, tmp) + m.insert(minMax) + + endNop, end := m.allocateBrTarget() + m.insert(endNop) + nanExitJmp.asJmp(newOperandLabel(end)) + sameExitJump.asJmp(newOperandLabel(end)) + + m.copyTo(tmp, rd) +} + +func (m *machine) lowerFcopysign(instr *ssa.Instruction) { + x, y := instr.Arg2() + if !x.Type().IsFloat() { + panic("BUG") + } + + _64 := x.Type().Bits() == 64 + + xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y) + rm := m.getOperand_Reg(xDef) + rn := m.getOperand_Reg(yDef) + rd := m.c.VRegOf(instr.Return()) + + // Clear the non-sign bits of src via AND with the mask. + var opAnd, opOr sseOpcode + var signMask uint64 + if _64 { + signMask, opAnd, opOr = 0x8000000000000000, sseOpcodeAndpd, sseOpcodeOrpd + } else { + signMask, opAnd, opOr = 0x80000000, sseOpcodeAndps, sseOpcodeOrps + } + + signBitReg := m.c.AllocateVReg(x.Type()) + m.lowerFconst(signBitReg, signMask, _64) + nonSignBitReg := m.c.AllocateVReg(x.Type()) + m.lowerFconst(nonSignBitReg, ^signMask, _64) + + // Extract the sign bits of rn. + and := m.allocateInstr().asXmmRmR(opAnd, rn, signBitReg) + m.insert(and) + + // Clear the sign bit of dst via AND with the non-sign bit mask. + xor := m.allocateInstr().asXmmRmR(opAnd, rm, nonSignBitReg) + m.insert(xor) + + // Copy the sign bits of src to dst via OR. 
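+	// Together with the two ANDs above (note that the instruction held in the
+	// variable named xor is in fact an AND with the inverted mask), this
+	// implements the usual bit-level copysign formula. Illustrative scalar
+	// sketch for the 64-bit case:
+	//
+	//	func copysign64bits(x, y uint64) uint64 {
+	//		const signMask = 0x8000000000000000
+	//		return (x &^ signMask) | (y & signMask)
+	//	}
+	//
+	// where x supplies the magnitude and y supplies the sign bit.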
+ or := m.allocateInstr().asXmmRmR(opOr, newOperandReg(signBitReg), nonSignBitReg) + m.insert(or) + + m.copyTo(nonSignBitReg, rd) +} + +func (m *machine) lowerBitcast(instr *ssa.Instruction) { + x, dstTyp := instr.BitcastData() + srcTyp := x.Type() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + switch { + case srcTyp == ssa.TypeF32 && dstTyp == ssa.TypeI32: + cvt := m.allocateInstr().asXmmToGpr(sseOpcodeMovd, rn.reg(), rd, false) + m.insert(cvt) + case srcTyp == ssa.TypeI32 && dstTyp == ssa.TypeF32: + cvt := m.allocateInstr().asGprToXmm(sseOpcodeMovd, rn, rd, false) + m.insert(cvt) + case srcTyp == ssa.TypeF64 && dstTyp == ssa.TypeI64: + cvt := m.allocateInstr().asXmmToGpr(sseOpcodeMovq, rn.reg(), rd, true) + m.insert(cvt) + case srcTyp == ssa.TypeI64 && dstTyp == ssa.TypeF64: + cvt := m.allocateInstr().asGprToXmm(sseOpcodeMovq, rn, rd, true) + m.insert(cvt) + default: + panic(fmt.Sprintf("invalid bitcast from %s to %s", srcTyp, dstTyp)) + } +} + +func (m *machine) lowerFcvtToSint(ctxVReg, rn, rd regalloc.VReg, src64, dst64, sat bool) { + var tmpXmm regalloc.VReg + if dst64 { + tmpXmm = m.c.AllocateVReg(ssa.TypeF64) + } else { + tmpXmm = m.c.AllocateVReg(ssa.TypeF32) + } + + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpXmm)) + tmpGp, tmpGp2 := m.c.AllocateVReg(ssa.TypeI64), m.c.AllocateVReg(ssa.TypeI64) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp)) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp2)) + + m.insert(m.allocateFcvtToSintSequence(ctxVReg, rn, tmpGp, tmpGp2, tmpXmm, src64, dst64, sat)) + m.copyTo(tmpGp, rd) +} + +func (m *machine) lowerFcvtToSintSequenceAfterRegalloc(i *instruction) { + execCtx, src, tmpGp, tmpGp2, tmpXmm, src64, dst64, sat := i.fcvtToSintSequenceData() + var cmpOp, truncOp sseOpcode + if src64 { + cmpOp, truncOp = sseOpcodeUcomisd, sseOpcodeCvttsd2si + } else { + cmpOp, truncOp = sseOpcodeUcomiss, sseOpcodeCvttss2si + } + + trunc := m.allocateInstr() + trunc.asXmmToGpr(truncOp, src, tmpGp, dst64) + m.insert(trunc) + + // Check if the dst operand was INT_MIN, by checking it against 1. + cmp1 := m.allocateInstr() + cmp1.asCmpRmiR(true, newOperandImm32(1), tmpGp, dst64) + m.insert(cmp1) + + // If no overflow, then we are done. + doneTarget, done := m.allocateBrTarget() + ifNoOverflow := m.allocateInstr() + ifNoOverflow.asJmpIf(condNO, newOperandLabel(done)) + m.insert(ifNoOverflow) + + // Now, check for NaN. + cmpNan := m.allocateInstr() + cmpNan.asXmmCmpRmR(cmpOp, newOperandReg(src), src) + m.insert(cmpNan) + + // We allocate the "non-nan target" here, but we will insert it later. + notNanTarget, notNaN := m.allocateBrTarget() + ifNotNan := m.allocateInstr() + ifNotNan.asJmpIf(condNP, newOperandLabel(notNaN)) + m.insert(ifNotNan) + + if sat { + // If NaN and saturating, return 0. + zeroDst := m.allocateInstr().asZeros(tmpGp) + m.insert(zeroDst) + + jmpEnd := m.allocateInstr() + jmpEnd.asJmp(newOperandLabel(done)) + m.insert(jmpEnd) + + // Otherwise: + m.insert(notNanTarget) + + // Zero-out the tmp register. + zero := m.allocateInstr().asZeros(tmpXmm) + m.insert(zero) + + cmpXmm := m.allocateInstr().asXmmCmpRmR(cmpOp, newOperandReg(tmpXmm), src) + m.insert(cmpXmm) + + // if >= jump to end. + jmpEnd2 := m.allocateInstr() + jmpEnd2.asJmpIf(condB, newOperandLabel(done)) + m.insert(jmpEnd2) + + // Otherwise, saturate to INT_MAX. + if dst64 { + m.lowerIconst(tmpGp, math.MaxInt64, dst64) + } else { + m.lowerIconst(tmpGp, math.MaxInt32, dst64) + } + + } else { + + // If non-sat, NaN, trap. 
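+		// (The Wasm spec requires the non-saturating trunc instructions to trap
+		// on a NaN input; the saturating variants handled above return 0
+		// instead.)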
+ m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeInvalidConversionToInteger) + + // Otherwise, we will jump here. + m.insert(notNanTarget) + + // jump over trap if src larger than threshold + condAboveThreshold := condNB + + // The magic constants are various combination of minInt for int[32|64] represented as float[32|64]. + var minInt uint64 + switch { + case src64 && dst64: + minInt = 0xc3e0000000000000 + case src64 && !dst64: + condAboveThreshold = condNBE + minInt = 0xC1E0_0000_0020_0000 + case !src64 && dst64: + minInt = 0xDF00_0000 + case !src64 && !dst64: + minInt = 0xCF00_0000 + } + + loadToGP := m.allocateInstr().asImm(tmpGp2, minInt, src64) + m.insert(loadToGP) + + movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp2), tmpXmm, src64) + m.insert(movToXmm) + + cmpXmm := m.allocateInstr().asXmmCmpRmR(cmpOp, newOperandReg(tmpXmm), src) + m.insert(cmpXmm) + + jmpIfLarger := m.allocateInstr() + checkPositiveTarget, checkPositive := m.allocateBrTarget() + jmpIfLarger.asJmpIf(condAboveThreshold, newOperandLabel(checkPositive)) + m.insert(jmpIfLarger) + + m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) + + // If positive, it was a real overflow. + m.insert(checkPositiveTarget) + + // Zero out the temp register. + xorpd := m.allocateInstr() + xorpd.asXmmRmR(sseOpcodeXorpd, newOperandReg(tmpXmm), tmpXmm) + m.insert(xorpd) + + pos := m.allocateInstr() + pos.asXmmCmpRmR(cmpOp, newOperandReg(src), tmpXmm) + m.insert(pos) + + // If >= jump to end. + jmp := m.allocateInstr().asJmpIf(condNB, newOperandLabel(done)) + m.insert(jmp) + m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) + } + + m.insert(doneTarget) +} + +func (m *machine) lowerFcvtToUint(ctxVReg, rn, rd regalloc.VReg, src64, dst64, sat bool) { + tmpXmm, tmpXmm2 := m.c.AllocateVReg(ssa.TypeF64), m.c.AllocateVReg(ssa.TypeF64) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpXmm)) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpXmm2)) + tmpGp, tmpGp2 := m.c.AllocateVReg(ssa.TypeI64), m.c.AllocateVReg(ssa.TypeI64) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp)) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp2)) + + m.insert(m.allocateFcvtToUintSequence( + ctxVReg, rn, tmpGp, tmpGp2, tmpXmm, tmpXmm2, src64, dst64, sat, + )) + m.copyTo(tmpGp, rd) +} + +func (m *machine) lowerFcvtToUintSequenceAfterRegalloc(i *instruction) { + execCtx, src, tmpGp, tmpGp2, tmpXmm, tmpXmm2, src64, dst64, sat := i.fcvtToUintSequenceData() + + var subOp, cmpOp, truncOp sseOpcode + if src64 { + subOp, cmpOp, truncOp = sseOpcodeSubsd, sseOpcodeUcomisd, sseOpcodeCvttsd2si + } else { + subOp, cmpOp, truncOp = sseOpcodeSubss, sseOpcodeUcomiss, sseOpcodeCvttss2si + } + + doneTarget, done := m.allocateBrTarget() + + switch { + case src64 && dst64: + loadToGP := m.allocateInstr().asImm(tmpGp, 0x43e0000000000000, true) + m.insert(loadToGP) + movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, true) + m.insert(movToXmm) + case src64 && !dst64: + loadToGP := m.allocateInstr().asImm(tmpGp, 0x41e0000000000000, true) + m.insert(loadToGP) + movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, true) + m.insert(movToXmm) + case !src64 && dst64: + loadToGP := m.allocateInstr().asImm(tmpGp, 0x5f000000, false) + m.insert(loadToGP) + movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, false) + m.insert(movToXmm) + case !src64 && !dst64: + loadToGP := m.allocateInstr().asImm(tmpGp, 
0x4f000000, false) + m.insert(loadToGP) + movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpGp), tmpXmm, false) + m.insert(movToXmm) + } + + cmp := m.allocateInstr() + cmp.asXmmCmpRmR(cmpOp, newOperandReg(tmpXmm), src) + m.insert(cmp) + + // If above `tmp` ("large threshold"), jump to `ifAboveThreshold` + ifAboveThresholdTarget, ifAboveThreshold := m.allocateBrTarget() + jmpIfAboveThreshold := m.allocateInstr() + jmpIfAboveThreshold.asJmpIf(condNB, newOperandLabel(ifAboveThreshold)) + m.insert(jmpIfAboveThreshold) + + ifNotNaNTarget, ifNotNaN := m.allocateBrTarget() + jmpIfNotNaN := m.allocateInstr() + jmpIfNotNaN.asJmpIf(condNP, newOperandLabel(ifNotNaN)) + m.insert(jmpIfNotNaN) + + // If NaN, handle the error condition. + if sat { + // On NaN, saturating, we just return 0. + zeros := m.allocateInstr().asZeros(tmpGp) + m.insert(zeros) + + jmpEnd := m.allocateInstr() + jmpEnd.asJmp(newOperandLabel(done)) + m.insert(jmpEnd) + } else { + // On NaN, non-saturating, we trap. + m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeInvalidConversionToInteger) + } + + // If not NaN, land here. + m.insert(ifNotNaNTarget) + + // Truncation happens here. + + trunc := m.allocateInstr() + trunc.asXmmToGpr(truncOp, src, tmpGp, dst64) + m.insert(trunc) + + // Check if the result is negative. + cmpNeg := m.allocateInstr() + cmpNeg.asCmpRmiR(true, newOperandImm32(0), tmpGp, dst64) + m.insert(cmpNeg) + + // If non-neg, jump to end. + jmpIfNonNeg := m.allocateInstr() + jmpIfNonNeg.asJmpIf(condNL, newOperandLabel(done)) + m.insert(jmpIfNonNeg) + + if sat { + // If the input was "small" (< 2**(width -1)), the only way to get an integer + // overflow is because the input was too small: saturate to the min value, i.e. 0. + zeros := m.allocateInstr().asZeros(tmpGp) + m.insert(zeros) + + jmpEnd := m.allocateInstr() + jmpEnd.asJmp(newOperandLabel(done)) + m.insert(jmpEnd) + } else { + // If not saturating, trap. + m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) + } + + // If above the threshold, land here. + m.insert(ifAboveThresholdTarget) + + // tmpDiff := threshold - rn. + copySrc := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(src), tmpXmm2) + m.insert(copySrc) + + sub := m.allocateInstr() + sub.asXmmRmR(subOp, newOperandReg(tmpXmm), tmpXmm2) // must be -0x8000000000000000 + m.insert(sub) + + trunc2 := m.allocateInstr() + trunc2.asXmmToGpr(truncOp, tmpXmm2, tmpGp, dst64) + m.insert(trunc2) + + // Check if the result is negative. + cmpNeg2 := m.allocateInstr().asCmpRmiR(true, newOperandImm32(0), tmpGp, dst64) + m.insert(cmpNeg2) + + ifNextLargeTarget, ifNextLarge := m.allocateBrTarget() + jmpIfNextLarge := m.allocateInstr() + jmpIfNextLarge.asJmpIf(condNL, newOperandLabel(ifNextLarge)) + m.insert(jmpIfNextLarge) + + if sat { + // The input was "large" (>= maxInt), so the only way to get an integer + // overflow is because the input was too large: saturate to the max value. + var maxInt uint64 + if dst64 { + maxInt = math.MaxUint64 + } else { + maxInt = math.MaxUint32 + } + m.lowerIconst(tmpGp, maxInt, dst64) + + jmpToEnd := m.allocateInstr() + jmpToEnd.asJmp(newOperandLabel(done)) + m.insert(jmpToEnd) + } else { + // If not saturating, trap. 
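+		// (Reaching here means the truncation of src-2^(N-1) still produced a
+		// negative result, i.e. the input is at least 2^N for an N-bit
+		// destination and cannot be represented, so the non-saturating
+		// conversion must trap with an integer overflow.)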
+ m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) + } + + m.insert(ifNextLargeTarget) + + var op operand + if dst64 { + m.lowerIconst(tmpGp2, 0x8000000000000000, true) + op = newOperandReg(tmpGp2) + } else { + op = newOperandImm32(0x80000000) + } + + add := m.allocateInstr() + add.asAluRmiR(aluRmiROpcodeAdd, op, tmpGp, dst64) + m.insert(add) + + m.insert(doneTarget) +} + +func (m *machine) lowerFcvtFromSint(rn, rd operand, src64, dst64 bool) { + var op sseOpcode + if dst64 { + op = sseOpcodeCvtsi2sd + } else { + op = sseOpcodeCvtsi2ss + } + + trunc := m.allocateInstr() + trunc.asGprToXmm(op, rn, rd.reg(), src64) + m.insert(trunc) +} + +func (m *machine) lowerFcvtFromUint(rn, rd operand, src64, dst64 bool) { + var op sseOpcode + if dst64 { + op = sseOpcodeCvtsi2sd + } else { + op = sseOpcodeCvtsi2ss + } + + // Src is 32 bit, then we just perform the conversion with 64 bit width. + // + // See the following link for why we use 64bit conversion for unsigned 32bit integer sources: + // https://stackoverflow.com/questions/41495498/fpu-operations-generated-by-gcc-during-casting-integer-to-float. + // + // Here's the summary: + // >> CVTSI2SS is indeed designed for converting a signed integer to a scalar single-precision float, + // >> not an unsigned integer like you have here. So what gives? Well, a 64-bit processor has 64-bit wide + // >> registers available, so the unsigned 32-bit input values can be stored as signed 64-bit intermediate values, + // >> which allows CVTSI2SS to be used after all. + // + if !src64 { + // Before we convert, we have to clear the higher 32-bits of the 64-bit register + // to get the correct result. + tmp := m.c.AllocateVReg(ssa.TypeI32) + m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, rn, tmp)) + m.insert(m.allocateInstr().asGprToXmm(op, newOperandReg(tmp), rd.reg(), true)) + return + } + + // If uint64, we have to do a bit more work. + endTarget, end := m.allocateBrTarget() + + var tmpXmm regalloc.VReg + if dst64 { + tmpXmm = m.c.AllocateVReg(ssa.TypeF64) + } else { + tmpXmm = m.c.AllocateVReg(ssa.TypeF32) + } + + // Check if the most significant bit (sign bit) is set. + test := m.allocateInstr() + test.asCmpRmiR(false, rn, rn.reg(), src64) + m.insert(test) + + // Jump if the sign bit is set. + ifSignTarget, ifSign := m.allocateBrTarget() + jmpIfNeg := m.allocateInstr() + jmpIfNeg.asJmpIf(condS, newOperandLabel(ifSign)) + m.insert(jmpIfNeg) + + // If the sign bit is not set, we could fit the unsigned int into float32/float64. + // So, we convert it to float and emit jump instruction to exit from this branch. + cvt := m.allocateInstr() + cvt.asGprToXmm(op, rn, tmpXmm, src64) + m.insert(cvt) + + // We are done, jump to end. + jmpEnd := m.allocateInstr() + jmpEnd.asJmp(newOperandLabel(end)) + m.insert(jmpEnd) + + // Now handling the case where sign-bit is set. 
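+	// The value no longer fits into a signed 64-bit integer, so the usual
+	// trick is applied: halve it, OR the dropped low bit back in as a sticky
+	// bit so the final rounding stays correct, convert the halved value, and
+	// then double the result. Roughly (illustrative sketch):
+	//
+	//	half := (v >> 1) | (v & 1)
+	//	f := float64(int64(half))
+	//	result := f + f
+	//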
+ // We emit the following sequences: + // mov %rn, %tmp + // shr 1, %tmp + // mov %rn, %tmp2 + // and 1, %tmp2 + // or %tmp2, %tmp + // cvtsi2ss %tmp, %xmm0 + // addsd %xmm0, %xmm0 + m.insert(ifSignTarget) + + tmp := m.copyToTmp(rn.reg()) + shr := m.allocateInstr() + shr.asShiftR(shiftROpShiftRightLogical, newOperandImm32(1), tmp, src64) + m.insert(shr) + + tmp2 := m.copyToTmp(rn.reg()) + and := m.allocateInstr() + and.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), tmp2, src64) + m.insert(and) + + or := m.allocateInstr() + or.asAluRmiR(aluRmiROpcodeOr, newOperandReg(tmp2), tmp, src64) + m.insert(or) + + cvt2 := m.allocateInstr() + cvt2.asGprToXmm(op, newOperandReg(tmp), tmpXmm, src64) + m.insert(cvt2) + + addsd := m.allocateInstr() + if dst64 { + addsd.asXmmRmR(sseOpcodeAddsd, newOperandReg(tmpXmm), tmpXmm) + } else { + addsd.asXmmRmR(sseOpcodeAddss, newOperandReg(tmpXmm), tmpXmm) + } + m.insert(addsd) + + m.insert(endTarget) + m.copyTo(tmpXmm, rd.reg()) +} + +func (m *machine) lowerVanyTrue(instr *ssa.Instruction) { + x := instr.Arg() + rm := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + + tmp := m.c.AllocateVReg(ssa.TypeI32) + + cmp := m.allocateInstr() + cmp.asXmmCmpRmR(sseOpcodePtest, rm, rm.reg()) + m.insert(cmp) + + setcc := m.allocateInstr() + setcc.asSetcc(condNZ, tmp) + m.insert(setcc) + + // Clear the irrelevant bits. + and := m.allocateInstr() + and.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), tmp, false) + m.insert(and) + + m.copyTo(tmp, rd) +} + +func (m *machine) lowerVallTrue(instr *ssa.Instruction) { + x, lane := instr.ArgWithLane() + var op sseOpcode + switch lane { + case ssa.VecLaneI8x16: + op = sseOpcodePcmpeqb + case ssa.VecLaneI16x8: + op = sseOpcodePcmpeqw + case ssa.VecLaneI32x4: + op = sseOpcodePcmpeqd + case ssa.VecLaneI64x2: + op = sseOpcodePcmpeqq + } + rm := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + + tmp := m.c.AllocateVReg(ssa.TypeV128) + + zeros := m.allocateInstr() + zeros.asZeros(tmp) + m.insert(zeros) + + pcmp := m.allocateInstr() + pcmp.asXmmRmR(op, rm, tmp) + m.insert(pcmp) + + test := m.allocateInstr() + test.asXmmCmpRmR(sseOpcodePtest, newOperandReg(tmp), tmp) + m.insert(test) + + tmp2 := m.c.AllocateVReg(ssa.TypeI32) + + setcc := m.allocateInstr() + setcc.asSetcc(condZ, tmp2) + m.insert(setcc) + + // Clear the irrelevant bits. 
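+	// (SETcc writes only the low 8 bits of its destination register, so the
+	// upper bits of tmp2 may still hold stale data; masking with 1 yields a
+	// clean 0/1 value.)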
+ and := m.allocateInstr() + and.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), tmp2, false) + m.insert(and) + + m.copyTo(tmp2, rd) +} + +func (m *machine) lowerVhighBits(instr *ssa.Instruction) { + x, lane := instr.ArgWithLane() + rm := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + switch lane { + case ssa.VecLaneI8x16: + mov := m.allocateInstr() + mov.asXmmToGpr(sseOpcodePmovmskb, rm.reg(), rd, false) + m.insert(mov) + + case ssa.VecLaneI16x8: + // When we have: + // R1 = [R1(w1), R1(w2), R1(w3), R1(w4), R1(w5), R1(w6), R1(w7), R1(v8)] + // R2 = [R2(w1), R2(w2), R2(w3), R2(v4), R2(w5), R2(w6), R2(w7), R2(v8)] + // where RX(wn) is n-th signed word (16-bit) of RX register, + // + // "PACKSSWB R1, R2" produces + // R1 = [ + // byte_sat(R1(w1)), byte_sat(R1(w2)), byte_sat(R1(w3)), byte_sat(R1(w4)), + // byte_sat(R1(w5)), byte_sat(R1(w6)), byte_sat(R1(w7)), byte_sat(R1(w8)), + // byte_sat(R2(w1)), byte_sat(R2(w2)), byte_sat(R2(w3)), byte_sat(R2(w4)), + // byte_sat(R2(w5)), byte_sat(R2(w6)), byte_sat(R2(w7)), byte_sat(R2(w8)), + // ] + // where R1 is the destination register, and + // byte_sat(w) = int8(w) if w fits as signed 8-bit, + // 0x80 if w is less than 0x80 + // 0x7F if w is greater than 0x7f + // + // See https://www.felixcloutier.com/x86/packsswb:packssdw for detail. + // + // Therefore, v.register ends up having i-th and (i+8)-th bit set if i-th lane is negative (for i in 0..8). + tmp := m.copyToTmp(rm.reg()) + res := m.c.AllocateVReg(ssa.TypeI32) + + pak := m.allocateInstr() + pak.asXmmRmR(sseOpcodePacksswb, rm, tmp) + m.insert(pak) + + mov := m.allocateInstr() + mov.asXmmToGpr(sseOpcodePmovmskb, tmp, res, false) + m.insert(mov) + + // Clear the higher bits than 8. + shr := m.allocateInstr() + shr.asShiftR(shiftROpShiftRightLogical, newOperandImm32(8), res, false) + m.insert(shr) + + m.copyTo(res, rd) + + case ssa.VecLaneI32x4: + mov := m.allocateInstr() + mov.asXmmToGpr(sseOpcodeMovmskps, rm.reg(), rd, true) + m.insert(mov) + + case ssa.VecLaneI64x2: + mov := m.allocateInstr() + mov.asXmmToGpr(sseOpcodeMovmskpd, rm.reg(), rd, true) + m.insert(mov) + } +} + +func (m *machine) lowerVbnot(instr *ssa.Instruction) { + x := instr.Arg() + xDef := m.c.ValueDefinition(x) + rm := m.getOperand_Reg(xDef) + rd := m.c.VRegOf(instr.Return()) + + tmp := m.copyToTmp(rm.reg()) + tmp2 := m.c.AllocateVReg(ssa.TypeV128) + + // Ensure tmp2 is considered defined by regalloc. + m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2)) + + // Set all bits on tmp register. + pak := m.allocateInstr() + pak.asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp2), tmp2) + m.insert(pak) + + // Then XOR with tmp to reverse all bits on v.register. 
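+	// (There is no packed NOT instruction, so NOT x is computed as
+	// x XOR all-ones; the PCMPEQD of tmp2 with itself above is a
+	// register-only way to materialize the all-ones constant.)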
+ xor := m.allocateInstr() + xor.asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp) + m.insert(xor) + + m.copyTo(tmp, rd) +} + +func (m *machine) lowerSplat(x, ret ssa.Value, lane ssa.VecLane) { + tmpDst := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst)) + + switch lane { + case ssa.VecLaneI8x16: + tmp := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmp)) + xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, 0, xx, tmpDst)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpDst)) + case ssa.VecLaneI16x8: + xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 0, xx, tmpDst)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 1, xx, tmpDst)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst)) + case ssa.VecLaneI32x4: + xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, 0, xx, tmpDst)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst)) + case ssa.VecLaneI64x2: + xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, xx, tmpDst)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, xx, tmpDst)) + case ssa.VecLaneF32x4: + xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeInsertps, 0, xx, tmpDst)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst)) + case ssa.VecLaneF64x2: + xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, xx, tmpDst)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMovlhps, xx, tmpDst)) + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + m.copyTo(tmpDst, m.c.VRegOf(ret)) +} + +func (m *machine) lowerShuffle(x, y ssa.Value, lo, hi uint64, ret ssa.Value) { + var xMask, yMask [2]uint64 + for i := 0; i < 8; i++ { + loLane := byte(lo >> (i * 8)) + if loLane < 16 { + xMask[0] |= uint64(loLane) << (i * 8) + yMask[0] |= uint64(0x80) << (i * 8) + } else { + xMask[0] |= uint64(0x80) << (i * 8) + yMask[0] |= uint64(loLane-16) << (i * 8) + } + hiLane := byte(hi >> (i * 8)) + if hiLane < 16 { + xMask[1] |= uint64(hiLane) << (i * 8) + yMask[1] |= uint64(0x80) << (i * 8) + } else { + xMask[1] |= uint64(0x80) << (i * 8) + yMask[1] |= uint64(hiLane-16) << (i * 8) + } + } + + xmaskLabel := m.allocateLabel() + m.consts = append(m.consts, _const{lo: xMask[0], hi: xMask[1], label: xmaskLabel}) + ymaskLabel := m.allocateLabel() + m.consts = append(m.consts, _const{lo: yMask[0], hi: yMask[1], label: ymaskLabel}) + + xx, yy := m.getOperand_Reg(m.c.ValueDefinition(x)), m.getOperand_Reg(m.c.ValueDefinition(y)) + tmpX, tmpY := m.copyToTmp(xx.reg()), m.copyToTmp(yy.reg()) + + // Apply mask to X. + tmp := m.c.AllocateVReg(ssa.TypeV128) + loadMaskLo := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(xmaskLabel.L)), tmp) + m.insert(loadMaskLo) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpX)) + + // Apply mask to Y. 
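+	// (As for X: PSHUFB replaces each destination byte with the source byte
+	// selected by the corresponding mask byte, or with zero when the mask byte
+	// has its high bit (0x80) set. The two shuffled vectors therefore have
+	// disjoint non-zero lanes and can simply be ORed together below.)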
+ loadMaskHi := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(ymaskLabel.L)), tmp) + m.insert(loadMaskHi) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmp), tmpY)) + + // Combine the results. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeOrps, newOperandReg(tmpX), tmpY)) + + m.copyTo(tmpY, m.c.VRegOf(ret)) +} + +func (m *machine) lowerVbBinOpUnaligned(op sseOpcode, x, y, ret ssa.Value) { + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rm := m.getOperand_Reg(m.c.ValueDefinition(y)) + rd := m.c.VRegOf(ret) + + tmp := m.copyToTmp(rn.reg()) + + binOp := m.allocateInstr() + binOp.asXmmRmR(op, rm, tmp) + m.insert(binOp) + + m.copyTo(tmp, rd) +} + +func (m *machine) lowerVbBinOp(op sseOpcode, x, y, ret ssa.Value) { + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rm := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + rd := m.c.VRegOf(ret) + + tmp := m.copyToTmp(rn.reg()) + + binOp := m.allocateInstr() + binOp.asXmmRmR(op, rm, tmp) + m.insert(binOp) + + m.copyTo(tmp, rd) +} + +func (m *machine) lowerVFcmp(x, y ssa.Value, c ssa.FloatCmpCond, ret ssa.Value, lane ssa.VecLane) { + var cmpOp sseOpcode + switch lane { + case ssa.VecLaneF32x4: + cmpOp = sseOpcodeCmpps + case ssa.VecLaneF64x2: + cmpOp = sseOpcodeCmppd + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + xx, yy := m.c.ValueDefinition(x), m.c.ValueDefinition(y) + var cmpImm cmpPred + switch c { + case ssa.FloatCmpCondGreaterThan: + yy, xx = xx, yy + cmpImm = cmpPredLT_OS + case ssa.FloatCmpCondGreaterThanOrEqual: + yy, xx = xx, yy + cmpImm = cmpPredLE_OS + case ssa.FloatCmpCondEqual: + cmpImm = cmpPredEQ_OQ + case ssa.FloatCmpCondNotEqual: + cmpImm = cmpPredNEQ_UQ + case ssa.FloatCmpCondLessThan: + cmpImm = cmpPredLT_OS + case ssa.FloatCmpCondLessThanOrEqual: + cmpImm = cmpPredLE_OS + default: + panic(fmt.Sprintf("invalid float comparison condition: %s", c)) + } + + tmp := m.c.AllocateVReg(ssa.TypeV128) + xxx := m.getOperand_Mem_Reg(xx) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, xxx, tmp)) + + rm := m.getOperand_Mem_Reg(yy) + m.insert(m.allocateInstr().asXmmRmRImm(cmpOp, byte(cmpImm), rm, tmp)) + + m.copyTo(tmp, m.c.VRegOf(ret)) +} + +func (m *machine) lowerVIcmp(x, y ssa.Value, c ssa.IntegerCmpCond, ret ssa.Value, lane ssa.VecLane) { + var eq, gt, maxu, minu, mins sseOpcode + switch lane { + case ssa.VecLaneI8x16: + eq, gt, maxu, minu, mins = sseOpcodePcmpeqb, sseOpcodePcmpgtb, sseOpcodePmaxub, sseOpcodePminub, sseOpcodePminsb + case ssa.VecLaneI16x8: + eq, gt, maxu, minu, mins = sseOpcodePcmpeqw, sseOpcodePcmpgtw, sseOpcodePmaxuw, sseOpcodePminuw, sseOpcodePminsw + case ssa.VecLaneI32x4: + eq, gt, maxu, minu, mins = sseOpcodePcmpeqd, sseOpcodePcmpgtd, sseOpcodePmaxud, sseOpcodePminud, sseOpcodePminsd + case ssa.VecLaneI64x2: + eq, gt = sseOpcodePcmpeqq, sseOpcodePcmpgtq + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + tmp := m.c.AllocateVReg(ssa.TypeV128) + var op operand + switch c { + case ssa.IntegerCmpCondSignedLessThanOrEqual: + if lane == ssa.VecLaneI64x2 { + x := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + // Copy x to tmp. + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, x, tmp)) + op = m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + } else { + y := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + // Copy y to tmp. 
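+			// (There is no packed less-or-equal comparison; for these lanes
+			// x <= y is computed below as pcmpeq(min_s(x, y), x), with y held
+			// in tmp and x used as the second operand.)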
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, y, tmp)) + op = m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + } + case ssa.IntegerCmpCondSignedGreaterThanOrEqual: + if lane == ssa.VecLaneI64x2 { + y := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + // Copy y to tmp. + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, y, tmp)) + op = m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + } else { + x := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + // Copy x to tmp. + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, x, tmp)) + op = m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + } + case ssa.IntegerCmpCondSignedLessThan, ssa.IntegerCmpCondUnsignedLessThan, ssa.IntegerCmpCondUnsignedLessThanOrEqual: + y := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + // Copy y to tmp. + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, y, tmp)) + op = m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + default: + x := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + // Copy x to tmp. + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, x, tmp)) + op = m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + } + + switch c { + case ssa.IntegerCmpCondEqual: + m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) + case ssa.IntegerCmpCondNotEqual: + // First we compare for equality. + m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) + // Then flip the bits. To do so, we set all bits on tmp2. + tmp2 := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2)) + m.insert(m.allocateInstr().asXmmRmR(eq, newOperandReg(tmp2), tmp2)) + // And then xor with tmp. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp)) + case ssa.IntegerCmpCondSignedGreaterThan, ssa.IntegerCmpCondSignedLessThan: + m.insert(m.allocateInstr().asXmmRmR(gt, op, tmp)) + case ssa.IntegerCmpCondSignedGreaterThanOrEqual, ssa.IntegerCmpCondSignedLessThanOrEqual: + if lane == ssa.VecLaneI64x2 { + m.insert(m.allocateInstr().asXmmRmR(gt, op, tmp)) + // Then flip the bits. To do so, we set all bits on tmp2. + tmp2 := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2)) + m.insert(m.allocateInstr().asXmmRmR(eq, newOperandReg(tmp2), tmp2)) + // And then xor with tmp. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp)) + } else { + // First take min of x and y. + m.insert(m.allocateInstr().asXmmRmR(mins, op, tmp)) + // Then compare for equality. + m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) + } + case ssa.IntegerCmpCondUnsignedGreaterThan, ssa.IntegerCmpCondUnsignedLessThan: + // First maxu of x and y. + m.insert(m.allocateInstr().asXmmRmR(maxu, op, tmp)) + // Then compare for equality. + m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) + // Then flip the bits. To do so, we set all bits on tmp2. + tmp2 := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmp2)) + m.insert(m.allocateInstr().asXmmRmR(eq, newOperandReg(tmp2), tmp2)) + // And then xor with tmp. 
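+		// (This XOR with the all-ones tmp2 is the concluding NOT: SSE has no
+		// packed unsigned comparison, so the strict form a > b is derived as
+		// NOT(max_u(a, b) == b), with the operand roles swapped above for the
+		// less-than variant.)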
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp2), tmp)) + case ssa.IntegerCmpCondUnsignedGreaterThanOrEqual, ssa.IntegerCmpCondUnsignedLessThanOrEqual: + m.insert(m.allocateInstr().asXmmRmR(minu, op, tmp)) + m.insert(m.allocateInstr().asXmmRmR(eq, op, tmp)) + default: + panic("BUG") + } + + m.copyTo(tmp, m.c.VRegOf(ret)) +} + +func (m *machine) lowerVbandnot(instr *ssa.Instruction, op sseOpcode) { + x, y := instr.Arg2() + xDef := m.c.ValueDefinition(x) + yDef := m.c.ValueDefinition(y) + rm, rn := m.getOperand_Reg(xDef), m.getOperand_Reg(yDef) + rd := m.c.VRegOf(instr.Return()) + + tmp := m.copyToTmp(rn.reg()) + + // pandn between rn, rm. + pand := m.allocateInstr() + pand.asXmmRmR(sseOpcodePandn, rm, tmp) + m.insert(pand) + + m.copyTo(tmp, rd) +} + +func (m *machine) lowerVbitselect(instr *ssa.Instruction) { + c, x, y := instr.SelectData() + xDef := m.c.ValueDefinition(x) + yDef := m.c.ValueDefinition(y) + rm, rn := m.getOperand_Reg(xDef), m.getOperand_Reg(yDef) + creg := m.getOperand_Reg(m.c.ValueDefinition(c)) + rd := m.c.VRegOf(instr.Return()) + + tmpC := m.copyToTmp(creg.reg()) + tmpX := m.copyToTmp(rm.reg()) + + // And between c, x (overwrites x). + pand := m.allocateInstr() + pand.asXmmRmR(sseOpcodePand, creg, tmpX) + m.insert(pand) + + // Andn between y, c (overwrites c). + pandn := m.allocateInstr() + pandn.asXmmRmR(sseOpcodePandn, rn, tmpC) + m.insert(pandn) + + por := m.allocateInstr() + por.asXmmRmR(sseOpcodePor, newOperandReg(tmpC), tmpX) + m.insert(por) + + m.copyTo(tmpX, rd) +} + +func (m *machine) lowerVFmin(instr *ssa.Instruction) { + x, y, lane := instr.Arg2WithLane() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rm := m.getOperand_Reg(m.c.ValueDefinition(y)) + rd := m.c.VRegOf(instr.Return()) + + var min, cmp, andn, or, srl /* shift right logical */ sseOpcode + var shiftNumToInverseNaN uint32 + if lane == ssa.VecLaneF32x4 { + min, cmp, andn, or, srl, shiftNumToInverseNaN = sseOpcodeMinps, sseOpcodeCmpps, sseOpcodeAndnps, sseOpcodeOrps, sseOpcodePsrld, 0xa + } else { + min, cmp, andn, or, srl, shiftNumToInverseNaN = sseOpcodeMinpd, sseOpcodeCmppd, sseOpcodeAndnpd, sseOpcodeOrpd, sseOpcodePsrlq, 0xd + } + + tmp1 := m.copyToTmp(rn.reg()) + tmp2 := m.copyToTmp(rm.reg()) + + // tmp1=min(rn, rm) + minIns1 := m.allocateInstr() + minIns1.asXmmRmR(min, rn, tmp2) + m.insert(minIns1) + + // tmp2=min(rm, rn) + minIns2 := m.allocateInstr() + minIns2.asXmmRmR(min, rm, tmp1) + m.insert(minIns2) + + // tmp3:=tmp1=min(rn, rm) + tmp3 := m.copyToTmp(tmp1) + + // tmp1 = -0 if (rn == -0 || rm == -0) && rn != NaN && rm !=NaN + // NaN if rn == NaN || rm == NaN + // min(rm, rm) otherwise + orIns := m.allocateInstr() + orIns.asXmmRmR(or, newOperandReg(tmp2), tmp1) + m.insert(orIns) + + // tmp3 is originally min(rn,rm). 
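+	// The remaining instructions fix up the two cases that MINPS/MINPD alone
+	// do not handle for Wasm: the OR above already forces the sign bit when
+	// the two lanes are +0 and -0 (so -0 is returned), and the UNORD compare
+	// below marks the NaN lanes so they can be turned into a quiet NaN at the
+	// end.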
+ // tmp3 = 0^ (set all bits) if rn == NaN || rm == NaN + // 0 otherwise + cmpIns := m.allocateInstr() + cmpIns.asXmmRmRImm(cmp, uint8(cmpPredUNORD_Q), newOperandReg(tmp2), tmp3) + m.insert(cmpIns) + + // tmp1 = -0 if (rn == -0 || rm == -0) && rn != NaN && rm !=NaN + // ^0 if rn == NaN || rm == NaN + // min(v1, v2) otherwise + orIns2 := m.allocateInstr() + orIns2.asXmmRmR(or, newOperandReg(tmp3), tmp1) + m.insert(orIns2) + + // tmp3 = set all bits on the mantissa bits + // 0 otherwise + shift := m.allocateInstr() + shift.asXmmRmiReg(srl, newOperandImm32(shiftNumToInverseNaN), tmp3) + m.insert(shift) + + // tmp3 = tmp1 and !tmp3 + // = -0 if (rn == -0 || rm == -0) && rn != NaN && rm !=NaN + // set all bits on exponential and sign bit (== NaN) if rn == NaN || rm == NaN + // min(rn, rm) otherwise + andnIns := m.allocateInstr() + andnIns.asXmmRmR(andn, newOperandReg(tmp1), tmp3) + m.insert(andnIns) + + m.copyTo(tmp3, rd) +} + +func (m *machine) lowerVFmax(instr *ssa.Instruction) { + x, y, lane := instr.Arg2WithLane() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rm := m.getOperand_Reg(m.c.ValueDefinition(y)) + rd := m.c.VRegOf(instr.Return()) + + var max, cmp, andn, or, xor, sub, srl /* shift right logical */ sseOpcode + var shiftNumToInverseNaN uint32 + if lane == ssa.VecLaneF32x4 { + max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = sseOpcodeMaxps, sseOpcodeCmpps, sseOpcodeAndnps, sseOpcodeOrps, sseOpcodeXorps, sseOpcodeSubps, sseOpcodePsrld, 0xa + } else { + max, cmp, andn, or, xor, sub, srl, shiftNumToInverseNaN = sseOpcodeMaxpd, sseOpcodeCmppd, sseOpcodeAndnpd, sseOpcodeOrpd, sseOpcodeXorpd, sseOpcodeSubpd, sseOpcodePsrlq, 0xd + } + + tmp0 := m.copyToTmp(rm.reg()) + tmp1 := m.copyToTmp(rn.reg()) + + // tmp0=max(rn, rm) + maxIns1 := m.allocateInstr() + maxIns1.asXmmRmR(max, rn, tmp0) + m.insert(maxIns1) + + // tmp1=max(rm, rn) + maxIns2 := m.allocateInstr() + maxIns2.asXmmRmR(max, rm, tmp1) + m.insert(maxIns2) + + // tmp2=max(rm, rn) + tmp2 := m.copyToTmp(tmp1) + + // tmp2 = -0 if (rn == -0 && rm == 0) || (rn == 0 && rm == -0) + // 0 if (rn == 0 && rm == 0) + // -0 if (rn == -0 && rm == -0) + // v1^v2 if rn == NaN || rm == NaN + // 0 otherwise + xorInstr := m.allocateInstr() + xorInstr.asXmmRmR(xor, newOperandReg(tmp0), tmp2) + m.insert(xorInstr) + // tmp1 = -0 if (rn == -0 && rm == 0) || (rn == 0 && rm == -0) + // 0 if (rn == 0 && rm == 0) + // -0 if (rn == -0 && rm == -0) + // NaN if rn == NaN || rm == NaN + // max(v1, v2) otherwise + orInstr := m.allocateInstr() + orInstr.asXmmRmR(or, newOperandReg(tmp2), tmp1) + m.insert(orInstr) + + tmp3 := m.copyToTmp(tmp1) + + // tmp3 = 0 if (rn == -0 && rm == 0) || (rn == 0 && rm == -0) || (rn == 0 && rm == 0) + // -0 if (rn == -0 && rm == -0) + // NaN if rn == NaN || rm == NaN + // max(v1, v2) otherwise + // + // Note: -0 - (-0) = 0 (!= -0) in floating point operation. 
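+	// The subtraction below relies on exactly that: in the mixed +0/-0 case
+	// tmp3 becomes +0, which is what Wasm fmax must return, while lanes where
+	// both inputs are -0 keep -0 and ordinary lanes are effectively unchanged;
+	// NaN lanes are repaired afterwards.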
+ subIns := m.allocateInstr() + subIns.asXmmRmR(sub, newOperandReg(tmp2), tmp3) + m.insert(subIns) + + // tmp1 = 0^ if rn == NaN || rm == NaN + cmpIns := m.allocateInstr() + cmpIns.asXmmRmRImm(cmp, uint8(cmpPredUNORD_Q), newOperandReg(tmp1), tmp1) + m.insert(cmpIns) + + // tmp1 = set all bits on the mantissa bits + // 0 otherwise + shift := m.allocateInstr() + shift.asXmmRmiReg(srl, newOperandImm32(shiftNumToInverseNaN), tmp1) + m.insert(shift) + + andnIns := m.allocateInstr() + andnIns.asXmmRmR(andn, newOperandReg(tmp3), tmp1) + m.insert(andnIns) + + m.copyTo(tmp1, rd) +} + +func (m *machine) lowerVFabs(instr *ssa.Instruction) { + x, lane := instr.ArgWithLane() + rm := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + + tmp := m.c.AllocateVReg(ssa.TypeV128) + + def := m.allocateInstr() + def.asDefineUninitializedReg(tmp) + m.insert(def) + + // Set all bits on tmp. + pcmp := m.allocateInstr() + pcmp.asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp), tmp) + m.insert(pcmp) + + switch lane { + case ssa.VecLaneF32x4: + // Shift right packed single floats by 1 to clear the sign bits. + shift := m.allocateInstr() + shift.asXmmRmiReg(sseOpcodePsrld, newOperandImm32(1), tmp) + m.insert(shift) + // Clear the sign bit of rm. + andp := m.allocateInstr() + andp.asXmmRmR(sseOpcodeAndpd, rm, tmp) + m.insert(andp) + case ssa.VecLaneF64x2: + // Shift right packed single floats by 1 to clear the sign bits. + shift := m.allocateInstr() + shift.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(1), tmp) + m.insert(shift) + // Clear the sign bit of rm. + andp := m.allocateInstr() + andp.asXmmRmR(sseOpcodeAndps, rm, tmp) + m.insert(andp) + } + + m.copyTo(tmp, rd) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_pro_epi_logue.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_pro_epi_logue.go new file mode 100644 index 000000000..8fa974c66 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_pro_epi_logue.go @@ -0,0 +1,304 @@ +package amd64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" +) + +// PostRegAlloc implements backend.Machine. +func (m *machine) PostRegAlloc() { + m.setupPrologue() + m.postRegAlloc() +} + +func (m *machine) setupPrologue() { + cur := m.ectx.RootInstr + prevInitInst := cur.next + + // At this point, we have the stack layout as follows: + // + // (high address) + // +-----------------+ <----- RBP (somewhere in the middle of the stack) + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | Return Addr | + // RSP ----> +-----------------+ + // (low address) + + // First, we push the RBP, and update the RBP to the current RSP. + // + // (high address) (high address) + // RBP ----> +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | ====> | ....... 
| + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | Return Addr | | Return Addr | + // RSP ----> +-----------------+ | Caller_RBP | + // (low address) +-----------------+ <----- RSP, RBP + // + cur = m.setupRBPRSP(cur) + + if !m.stackBoundsCheckDisabled { + cur = m.insertStackBoundsCheck(m.requiredStackSize(), cur) + } + + // + // (high address) + // +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | | ....... | + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | xxxxx | | xxxxx | + // | Return Addr | | Return Addr | + // | Caller_RBP | ====> | Caller_RBP | + // RBP,RSP->+-----------------+ +-----------------+ <----- RBP + // (low address) | clobbered M | + // | clobbered 1 | + // | ........... | + // | clobbered 0 | + // +-----------------+ <----- RSP + // + if regs := m.clobberedRegs; len(regs) > 0 { + for i := range regs { + r := regs[len(regs)-1-i] // Reverse order. + if r.RegType() == regalloc.RegTypeInt { + cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(r))) + } else { + // Push the XMM register is not supported by the PUSH instruction. + cur = m.addRSP(-16, cur) + push := m.allocateInstr().asXmmMovRM( + sseOpcodeMovdqu, r, newOperandMem(m.newAmodeImmReg(0, rspVReg)), + ) + cur = linkInstr(cur, push) + } + } + } + + if size := m.spillSlotSize; size > 0 { + // Simply decrease the RSP to allocate the spill slots. + // sub $size, %rsp + cur = linkInstr(cur, m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(uint32(size)), rspVReg, true)) + + // At this point, we have the stack layout as follows: + // + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | ReturnAddress | + // | Caller_RBP | + // +-----------------+ <--- RBP + // | clobbered M | + // | ............ | + // | clobbered 1 | + // | clobbered 0 | + // | spill slot N | + // | ............ | + // | spill slot 0 | + // +-----------------+ <--- RSP + // (low address) + } + + linkInstr(cur, prevInitInst) +} + +// postRegAlloc does multiple things while walking through the instructions: +// 1. Inserts the epilogue code. +// 2. Removes the redundant copy instruction. +// 3. Inserts the dec/inc RSP instruction right before/after the call instruction. +// 4. Lowering that is supposed to be done after regalloc. 
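+//
+// Each pseudo-instruction case below follows the same pattern: it resets
+// ectx.PendingInstructions, lowers the placeholder into it, and then splices
+// the pending instructions into the linked list in place of the placeholder.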
+func (m *machine) postRegAlloc() { + ectx := m.ectx + for cur := ectx.RootInstr; cur != nil; cur = cur.next { + switch k := cur.kind; k { + case ret: + m.setupEpilogueAfter(cur.prev) + continue + case fcvtToSintSequence, fcvtToUintSequence: + m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0] + if k == fcvtToSintSequence { + m.lowerFcvtToSintSequenceAfterRegalloc(cur) + } else { + m.lowerFcvtToUintSequenceAfterRegalloc(cur) + } + prev := cur.prev + next := cur.next + cur := prev + for _, instr := range m.ectx.PendingInstructions { + cur = linkInstr(cur, instr) + } + linkInstr(cur, next) + continue + case xmmCMov: + m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0] + m.lowerXmmCmovAfterRegAlloc(cur) + prev := cur.prev + next := cur.next + cur := prev + for _, instr := range m.ectx.PendingInstructions { + cur = linkInstr(cur, instr) + } + linkInstr(cur, next) + continue + case idivRemSequence: + m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0] + m.lowerIDivRemSequenceAfterRegAlloc(cur) + prev := cur.prev + next := cur.next + cur := prev + for _, instr := range m.ectx.PendingInstructions { + cur = linkInstr(cur, instr) + } + linkInstr(cur, next) + continue + case call, callIndirect: + // At this point, reg alloc is done, therefore we can safely insert dec/inc RPS instruction + // right before/after the call instruction. If this is done before reg alloc, the stack slot + // can point to the wrong location and therefore results in a wrong value. + call := cur + next := call.next + _, _, _, _, size := backend.ABIInfoFromUint64(call.u2) + if size > 0 { + dec := m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(size), rspVReg, true) + linkInstr(call.prev, dec) + linkInstr(dec, call) + inc := m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(size), rspVReg, true) + linkInstr(call, inc) + linkInstr(inc, next) + } + continue + } + + // Removes the redundant copy instruction. + if cur.IsCopy() && cur.op1.reg().RealReg() == cur.op2.reg().RealReg() { + prev, next := cur.prev, cur.next + // Remove the copy instruction. + prev.next = next + if next != nil { + next.prev = prev + } + } + } +} + +func (m *machine) setupEpilogueAfter(cur *instruction) { + prevNext := cur.next + + // At this point, we have the stack layout as follows: + // + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | ReturnAddress | + // | Caller_RBP | + // +-----------------+ <--- RBP + // | clobbered M | + // | ............ | + // | clobbered 1 | + // | clobbered 0 | + // | spill slot N | + // | ............ | + // | spill slot 0 | + // +-----------------+ <--- RSP + // (low address) + + if size := m.spillSlotSize; size > 0 { + // Simply increase the RSP to free the spill slots. + // add $size, %rsp + cur = linkInstr(cur, m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(uint32(size)), rspVReg, true)) + } + + // + // (high address) + // +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | | ....... | + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | ReturnAddress | | ReturnAddress | + // | Caller_RBP | | Caller_RBP | + // RBP ---> +-----------------+ ========> +-----------------+ <---- RSP, RBP + // | clobbered M | + // | ............ 
| + // | clobbered 1 | + // | clobbered 0 | + // RSP ---> +-----------------+ + // (low address) + // + if regs := m.clobberedRegs; len(regs) > 0 { + for _, r := range regs { + if r.RegType() == regalloc.RegTypeInt { + cur = linkInstr(cur, m.allocateInstr().asPop64(r)) + } else { + // Pop the XMM register is not supported by the POP instruction. + pop := m.allocateInstr().asXmmUnaryRmR( + sseOpcodeMovdqu, newOperandMem(m.newAmodeImmReg(0, rspVReg)), r, + ) + cur = linkInstr(cur, pop) + cur = m.addRSP(16, cur) + } + } + } + + // Now roll back the RSP to RBP, and pop the caller's RBP. + cur = m.revertRBPRSP(cur) + + linkInstr(cur, prevNext) +} + +func (m *machine) addRSP(offset int32, cur *instruction) *instruction { + if offset == 0 { + return cur + } + opcode := aluRmiROpcodeAdd + if offset < 0 { + opcode = aluRmiROpcodeSub + offset = -offset + } + return linkInstr(cur, m.allocateInstr().asAluRmiR(opcode, newOperandImm32(uint32(offset)), rspVReg, true)) +} + +func (m *machine) setupRBPRSP(cur *instruction) *instruction { + cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(rbpVReg))) + cur = linkInstr(cur, m.allocateInstr().asMovRR(rspVReg, rbpVReg, true)) + return cur +} + +func (m *machine) revertRBPRSP(cur *instruction) *instruction { + cur = linkInstr(cur, m.allocateInstr().asMovRR(rbpVReg, rspVReg, true)) + cur = linkInstr(cur, m.allocateInstr().asPop64(rbpVReg)) + return cur +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_regalloc.go new file mode 100644 index 000000000..0bb28ee9e --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_regalloc.go @@ -0,0 +1,153 @@ +package amd64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// InsertMoveBefore implements backend.RegAllocFunctionMachine. +func (m *machine) InsertMoveBefore(dst, src regalloc.VReg, instr *instruction) { + typ := src.RegType() + if typ != dst.RegType() { + panic("BUG: src and dst must have the same type") + } + + mov := m.allocateInstr() + if typ == regalloc.RegTypeInt { + mov.asMovRR(src, dst, true) + } else { + mov.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(src), dst) + } + + cur := instr.prev + prevNext := cur.next + cur = linkInstr(cur, mov) + linkInstr(cur, prevNext) +} + +// InsertStoreRegisterAt implements backend.RegAllocFunctionMachine. 
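+//
+// It stores the value of v, which must be backed by a real register, into its
+// spill slot (addressed relative to RSP) immediately before or after instr,
+// depending on the `after` flag.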
+func (m *machine) InsertStoreRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction { + if !v.IsRealReg() { + panic("BUG: VReg must be backed by real reg to be stored") + } + + typ := m.c.TypeOf(v) + + var prevNext, cur *instruction + if after { + cur, prevNext = instr, instr.next + } else { + cur, prevNext = instr.prev, instr + } + + offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size()) + store := m.allocateInstr() + mem := newOperandMem(m.newAmodeImmReg(uint32(offsetFromSP), rspVReg)) + switch typ { + case ssa.TypeI32: + store.asMovRM(v, mem, 4) + case ssa.TypeI64: + store.asMovRM(v, mem, 8) + case ssa.TypeF32: + store.asXmmMovRM(sseOpcodeMovss, v, mem) + case ssa.TypeF64: + store.asXmmMovRM(sseOpcodeMovsd, v, mem) + case ssa.TypeV128: + store.asXmmMovRM(sseOpcodeMovdqu, v, mem) + } + + cur = linkInstr(cur, store) + return linkInstr(cur, prevNext) +} + +// InsertReloadRegisterAt implements backend.RegAllocFunctionMachine. +func (m *machine) InsertReloadRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction { + if !v.IsRealReg() { + panic("BUG: VReg must be backed by real reg to be stored") + } + + typ := m.c.TypeOf(v) + var prevNext, cur *instruction + if after { + cur, prevNext = instr, instr.next + } else { + cur, prevNext = instr.prev, instr + } + + // Load the value to the temporary. + load := m.allocateInstr() + offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size()) + a := newOperandMem(m.newAmodeImmReg(uint32(offsetFromSP), rspVReg)) + switch typ { + case ssa.TypeI32: + load.asMovzxRmR(extModeLQ, a, v) + case ssa.TypeI64: + load.asMov64MR(a, v) + case ssa.TypeF32: + load.asXmmUnaryRmR(sseOpcodeMovss, a, v) + case ssa.TypeF64: + load.asXmmUnaryRmR(sseOpcodeMovsd, a, v) + case ssa.TypeV128: + load.asXmmUnaryRmR(sseOpcodeMovdqu, a, v) + default: + panic("BUG") + } + + cur = linkInstr(cur, load) + return linkInstr(cur, prevNext) +} + +// ClobberedRegisters implements backend.RegAllocFunctionMachine. +func (m *machine) ClobberedRegisters(regs []regalloc.VReg) { + m.clobberedRegs = append(m.clobberedRegs[:0], regs...) +} + +// Swap implements backend.RegAllocFunctionMachine. +func (m *machine) Swap(cur *instruction, x1, x2, tmp regalloc.VReg) { + if x1.RegType() == regalloc.RegTypeInt { + prevNext := cur.next + xc := m.allocateInstr().asXCHG(x1, newOperandReg(x2), 8) + cur = linkInstr(cur, xc) + linkInstr(cur, prevNext) + } else { + if tmp.Valid() { + prevNext := cur.next + m.InsertMoveBefore(tmp, x1, prevNext) + m.InsertMoveBefore(x1, x2, prevNext) + m.InsertMoveBefore(x2, tmp, prevNext) + } else { + prevNext := cur.next + r2 := x2.RealReg() + // Temporarily spill x1 to stack. + cur = m.InsertStoreRegisterAt(x1, cur, true).prev + // Then move x2 to x1. + cur = linkInstr(cur, m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqa, newOperandReg(x2), x1)) + linkInstr(cur, prevNext) + // Then reload the original value on x1 from stack to r2. + m.InsertReloadRegisterAt(x1.SetRealReg(r2), cur, true) + } + } +} + +// LastInstrForInsertion implements backend.RegAllocFunctionMachine. +func (m *machine) LastInstrForInsertion(begin, end *instruction) *instruction { + cur := end + for cur.kind == nop0 { + cur = cur.prev + if cur == begin { + return end + } + } + switch cur.kind { + case jmp: + return cur + default: + return end + } +} + +// SSABlockLabel implements backend.RegAllocFunctionMachine. 
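+//
+// It returns the backend label assigned to the machine basic block lowered
+// from the given SSA basic block.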
+func (m *machine) SSABlockLabel(id ssa.BasicBlockID) backend.Label { + return m.ectx.SsaBlockIDToLabels[id] +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_vec.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_vec.go new file mode 100644 index 000000000..539a8b754 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/machine_vec.go @@ -0,0 +1,992 @@ +package amd64 + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +var swizzleMask = [16]byte{ + 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, + 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, +} + +func (m *machine) lowerSwizzle(x, y ssa.Value, ret ssa.Value) { + masklabel := m.getOrAllocateConstLabel(&m.constSwizzleMaskConstIndex, swizzleMask[:]) + + // Load mask to maskReg. + maskReg := m.c.AllocateVReg(ssa.TypeV128) + loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(masklabel)), maskReg) + m.insert(loadMask) + + // Copy x and y to tmp registers. + xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + tmpDst := m.copyToTmp(xx.reg()) + yy := m.getOperand_Reg(m.c.ValueDefinition(y)) + tmpX := m.copyToTmp(yy.reg()) + + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddusb, newOperandReg(maskReg), tmpX)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmpX), tmpDst)) + + // Copy the result to the destination register. + m.copyTo(tmpDst, m.c.VRegOf(ret)) +} + +func (m *machine) lowerInsertLane(x, y ssa.Value, index byte, ret ssa.Value, lane ssa.VecLane) { + // Copy x to tmp. + tmpDst := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, m.getOperand_Mem_Reg(m.c.ValueDefinition(x)), tmpDst)) + + yy := m.getOperand_Reg(m.c.ValueDefinition(y)) + switch lane { + case ssa.VecLaneI8x16: + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, index, yy, tmpDst)) + case ssa.VecLaneI16x8: + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, index, yy, tmpDst)) + case ssa.VecLaneI32x4: + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, index, yy, tmpDst)) + case ssa.VecLaneI64x2: + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, index, yy, tmpDst)) + case ssa.VecLaneF32x4: + // In INSERTPS instruction, the destination index is encoded at 4 and 5 bits of the argument. + // See https://www.felixcloutier.com/x86/insertps + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeInsertps, index<<4, yy, tmpDst)) + case ssa.VecLaneF64x2: + if index == 0 { + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, yy, tmpDst)) + } else { + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMovlhps, yy, tmpDst)) + } + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + m.copyTo(tmpDst, m.c.VRegOf(ret)) +} + +func (m *machine) lowerExtractLane(x ssa.Value, index byte, signed bool, ret ssa.Value, lane ssa.VecLane) { + // Pextr variants are used to extract a lane from a vector register. 
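+	// PEXTRB/PEXTRW/PEXTRD/PEXTRQ write the selected lane into a general
+	// purpose register; for the narrow integer lanes the MOVSX/MOVZX that
+	// follows widens the result with the signedness requested by
+	// extract_lane_s / extract_lane_u.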
+ xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + + tmpDst := m.c.AllocateVReg(ret.Type()) + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst)) + switch lane { + case ssa.VecLaneI8x16: + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrb, index, xx, tmpDst)) + if signed { + m.insert(m.allocateInstr().asMovsxRmR(extModeBL, newOperandReg(tmpDst), tmpDst)) + } else { + m.insert(m.allocateInstr().asMovzxRmR(extModeBL, newOperandReg(tmpDst), tmpDst)) + } + case ssa.VecLaneI16x8: + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrw, index, xx, tmpDst)) + if signed { + m.insert(m.allocateInstr().asMovsxRmR(extModeWL, newOperandReg(tmpDst), tmpDst)) + } else { + m.insert(m.allocateInstr().asMovzxRmR(extModeWL, newOperandReg(tmpDst), tmpDst)) + } + case ssa.VecLaneI32x4: + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrd, index, xx, tmpDst)) + case ssa.VecLaneI64x2: + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, index, xx, tmpDst)) + case ssa.VecLaneF32x4: + if index == 0 { + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovss, xx, tmpDst)) + } else { + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, index, xx, tmpDst)) + } + case ssa.VecLaneF64x2: + if index == 0 { + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, xx, tmpDst)) + } else { + m.copyTo(xx.reg(), tmpDst) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0b00_00_11_10, newOperandReg(tmpDst), tmpDst)) + } + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + m.copyTo(tmpDst, m.c.VRegOf(ret)) +} + +var sqmulRoundSat = [16]byte{ + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, +} + +func (m *machine) lowerSqmulRoundSat(x, y, ret ssa.Value) { + // See https://github.com/WebAssembly/simd/pull/365 for the following logic. + maskLabel := m.getOrAllocateConstLabel(&m.constSqmulRoundSatIndex, sqmulRoundSat[:]) + + tmp := m.c.AllocateVReg(ssa.TypeV128) + loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), tmp) + m.insert(loadMask) + + xx, yy := m.getOperand_Reg(m.c.ValueDefinition(x)), m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + tmpX := m.copyToTmp(xx.reg()) + + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmulhrsw, yy, tmpX)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmpX), tmp)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmpX)) + + m.copyTo(tmpX, m.c.VRegOf(ret)) +} + +func (m *machine) lowerVUshr(x, y, ret ssa.Value, lane ssa.VecLane) { + switch lane { + case ssa.VecLaneI8x16: + m.lowerVUshri8x16(x, y, ret) + case ssa.VecLaneI16x8, ssa.VecLaneI32x4, ssa.VecLaneI64x2: + m.lowerShr(x, y, ret, lane, false) + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } +} + +// i8x16LogicalSHRMaskTable is necessary for emulating non-existent packed bytes logical right shifts on amd64. +// The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits. +var i8x16LogicalSHRMaskTable = [8 * 16]byte{ // (the number of possible shift amount 0, 1, ..., 7.) * 16 bytes. 
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // for 1 shift + 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, // for 2 shift + 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, // for 3 shift + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, // for 4 shift + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, // for 5 shift + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, // for 6 shift + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // for 7 shift +} + +func (m *machine) lowerVUshri8x16(x, y, ret ssa.Value) { + tmpGpReg := m.c.AllocateVReg(ssa.TypeI32) + // Load the modulo 8 mask to tmpReg. + m.lowerIconst(tmpGpReg, 0x7, false) + // Take the modulo 8 of the shift amount. + shiftAmt := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)) + m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, shiftAmt, tmpGpReg, false)) + + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xx := m.copyToTmp(_xx.reg()) + + vecTmp := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), vecTmp, false)) + m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrlw, newOperandReg(vecTmp), xx)) + + maskTableLabel := m.getOrAllocateConstLabel(&m.constI8x16LogicalSHRMaskTableIndex, i8x16LogicalSHRMaskTable[:]) + base := m.c.AllocateVReg(ssa.TypeI64) + lea := m.allocateInstr().asLEA(newOperandLabel(maskTableLabel), base) + m.insert(lea) + + // Shift tmpGpReg by 4 to multiply the shift amount by 16. + m.insert(m.allocateInstr().asShiftR(shiftROpShiftLeft, newOperandImm32(4), tmpGpReg, false)) + + mem := m.newAmodeRegRegShift(0, base, tmpGpReg, 0) + loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(mem), vecTmp) + m.insert(loadMask) + + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePand, newOperandReg(vecTmp), xx)) + m.copyTo(xx, m.c.VRegOf(ret)) +} + +func (m *machine) lowerVSshr(x, y, ret ssa.Value, lane ssa.VecLane) { + switch lane { + case ssa.VecLaneI8x16: + m.lowerVSshri8x16(x, y, ret) + case ssa.VecLaneI16x8, ssa.VecLaneI32x4: + m.lowerShr(x, y, ret, lane, true) + case ssa.VecLaneI64x2: + m.lowerVSshri64x2(x, y, ret) + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } +} + +func (m *machine) lowerVSshri8x16(x, y, ret ssa.Value) { + shiftAmtReg := m.c.AllocateVReg(ssa.TypeI32) + // Load the modulo 8 mask to tmpReg. + m.lowerIconst(shiftAmtReg, 0x7, false) + // Take the modulo 8 of the shift amount. + shiftAmt := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)) + m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, shiftAmt, shiftAmtReg, false)) + + // Copy the x value to two temporary registers. 
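+ // There is no packed arithmetic shift for bytes on amd64, so each byte is widened into
+ // a word (duplicated into both halves via PUNPCKLBW/PUNPCKHBW below), shifted with PSRAW
+ // by the amount plus 8, and re-packed with PACKSSWB. For example, for b = 0x90 (-112)
+ // and shift amount 2, the widened word 0x9090 shifted arithmetically by 10 gives 0xffe4
+ // (-28), which is exactly -112 >> 2.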
+ _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xx := m.copyToTmp(_xx.reg()) + vecTmp := m.c.AllocateVReg(ssa.TypeV128) + m.copyTo(xx, vecTmp) + + // Assuming that we have + // xx = [b1, ..., b16] + // vecTmp = [b1, ..., b16] + // at this point, then we use PUNPCKLBW and PUNPCKHBW to produce: + // xx = [b1, b1, b2, b2, ..., b8, b8] + // vecTmp = [b9, b9, b10, b10, ..., b16, b16] + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePunpcklbw, newOperandReg(xx), xx)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePunpckhbw, newOperandReg(vecTmp), vecTmp)) + + // Adding 8 to the shift amount, and then move the amount to vecTmp2. + vecTmp2 := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(8), shiftAmtReg, false)) + m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(shiftAmtReg), vecTmp2, false)) + + // Perform the word packed arithmetic right shifts on vreg and vecTmp. + // This changes these two registers as: + // xx = [xxx, b1 >> s, xxx, b2 >> s, ..., xxx, b8 >> s] + // vecTmp = [xxx, b9 >> s, xxx, b10 >> s, ..., xxx, b16 >> s] + // where xxx is 1 or 0 depending on each byte's sign, and ">>" is the arithmetic shift on a byte. + m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsraw, newOperandReg(vecTmp2), xx)) + m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsraw, newOperandReg(vecTmp2), vecTmp)) + + // Finally, we can get the result by packing these two word vectors. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePacksswb, newOperandReg(vecTmp), xx)) + + m.copyTo(xx, m.c.VRegOf(ret)) +} + +func (m *machine) lowerVSshri64x2(x, y, ret ssa.Value) { + // Load the shift amount to RCX. + shiftAmt := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, shiftAmt, rcxVReg)) + + tmpGp := m.c.AllocateVReg(ssa.TypeI64) + + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xxReg := m.copyToTmp(_xx.reg()) + + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, 0, newOperandReg(xxReg), tmpGp)) + m.insert(m.allocateInstr().asShiftR(shiftROpShiftRightArithmetic, newOperandReg(rcxVReg), tmpGp, true)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, newOperandReg(tmpGp), xxReg)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, 1, newOperandReg(xxReg), tmpGp)) + m.insert(m.allocateInstr().asShiftR(shiftROpShiftRightArithmetic, newOperandReg(rcxVReg), tmpGp, true)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, newOperandReg(tmpGp), xxReg)) + + m.copyTo(xxReg, m.c.VRegOf(ret)) +} + +func (m *machine) lowerShr(x, y, ret ssa.Value, lane ssa.VecLane, signed bool) { + var modulo uint64 + var shiftOp sseOpcode + switch lane { + case ssa.VecLaneI16x8: + modulo = 0xf + if signed { + shiftOp = sseOpcodePsraw + } else { + shiftOp = sseOpcodePsrlw + } + case ssa.VecLaneI32x4: + modulo = 0x1f + if signed { + shiftOp = sseOpcodePsrad + } else { + shiftOp = sseOpcodePsrld + } + case ssa.VecLaneI64x2: + modulo = 0x3f + if signed { + panic("BUG") + } + shiftOp = sseOpcodePsrlq + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xx := m.copyToTmp(_xx.reg()) + + tmpGpReg := m.c.AllocateVReg(ssa.TypeI32) + // Load the modulo 8 mask to tmpReg. + m.lowerIconst(tmpGpReg, modulo, false) + // Take the modulo 8 of the shift amount. 
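+ // The masking matters because the x86 packed shifts saturate rather than wrap: a count
+ // of lane-width or more zeroes (or, for the arithmetic forms, sign-fills) every lane,
+ // whereas Wasm defines the shift count modulo the lane width.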
+ m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, + m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)), tmpGpReg, false)) + // And move it to a xmm register. + tmpVec := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), tmpVec, false)) + + // Then do the actual shift. + m.insert(m.allocateInstr().asXmmRmiReg(shiftOp, newOperandReg(tmpVec), xx)) + + m.copyTo(xx, m.c.VRegOf(ret)) +} + +func (m *machine) lowerVIshl(x, y, ret ssa.Value, lane ssa.VecLane) { + var modulo uint64 + var shiftOp sseOpcode + var isI8x16 bool + switch lane { + case ssa.VecLaneI8x16: + isI8x16 = true + modulo = 0x7 + shiftOp = sseOpcodePsllw + case ssa.VecLaneI16x8: + modulo = 0xf + shiftOp = sseOpcodePsllw + case ssa.VecLaneI32x4: + modulo = 0x1f + shiftOp = sseOpcodePslld + case ssa.VecLaneI64x2: + modulo = 0x3f + shiftOp = sseOpcodePsllq + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xx := m.copyToTmp(_xx.reg()) + + tmpGpReg := m.c.AllocateVReg(ssa.TypeI32) + // Load the modulo 8 mask to tmpReg. + m.lowerIconst(tmpGpReg, modulo, false) + // Take the modulo 8 of the shift amount. + m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, + m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)), tmpGpReg, false)) + // And move it to a xmm register. + tmpVec := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), tmpVec, false)) + + // Then do the actual shift. + m.insert(m.allocateInstr().asXmmRmiReg(shiftOp, newOperandReg(tmpVec), xx)) + + if isI8x16 { + maskTableLabel := m.getOrAllocateConstLabel(&m.constI8x16SHLMaskTableIndex, i8x16SHLMaskTable[:]) + base := m.c.AllocateVReg(ssa.TypeI64) + lea := m.allocateInstr().asLEA(newOperandLabel(maskTableLabel), base) + m.insert(lea) + + // Shift tmpGpReg by 4 to multiply the shift amount by 16. + m.insert(m.allocateInstr().asShiftR(shiftROpShiftLeft, newOperandImm32(4), tmpGpReg, false)) + + mem := m.newAmodeRegRegShift(0, base, tmpGpReg, 0) + loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(mem), tmpVec) + m.insert(loadMask) + + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePand, newOperandReg(tmpVec), xx)) + } + + m.copyTo(xx, m.c.VRegOf(ret)) +} + +// i8x16SHLMaskTable is necessary for emulating non-existent packed bytes left shifts on amd64. +// The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits. +var i8x16SHLMaskTable = [8 * 16]byte{ // (the number of possible shift amount 0, 1, ..., 7.) * 16 bytes. 
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, // for 1 shift + 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, // for 2 shift + 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, // for 3 shift + 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // for 4 shift + 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, // for 5 shift + 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, // for 6 shift + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, // for 7 shift +} + +func (m *machine) lowerVRound(x, ret ssa.Value, imm byte, _64 bool) { + xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + var round sseOpcode + if _64 { + round = sseOpcodeRoundpd + } else { + round = sseOpcodeRoundps + } + m.insert(m.allocateInstr().asXmmUnaryRmRImm(round, imm, xx, m.c.VRegOf(ret))) +} + +var ( + allOnesI8x16 = [16]byte{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1} + allOnesI16x8 = [16]byte{0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0} + extAddPairwiseI16x8uMask1 = [16]byte{0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80} + extAddPairwiseI16x8uMask2 = [16]byte{0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00} +) + +func (m *machine) lowerExtIaddPairwise(x, ret ssa.Value, srcLane ssa.VecLane, signed bool) { + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xx := m.copyToTmp(_xx.reg()) + switch srcLane { + case ssa.VecLaneI8x16: + allOneReg := m.c.AllocateVReg(ssa.TypeV128) + mask := m.getOrAllocateConstLabel(&m.constAllOnesI8x16Index, allOnesI8x16[:]) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), allOneReg)) + + var resultReg regalloc.VReg + if signed { + resultReg = allOneReg + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddubsw, newOperandReg(xx), resultReg)) + } else { + // Interpreter tmp (all ones) as signed byte meaning that all the multiply-add is unsigned. + resultReg = xx + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddubsw, newOperandReg(allOneReg), resultReg)) + } + m.copyTo(resultReg, m.c.VRegOf(ret)) + + case ssa.VecLaneI16x8: + if signed { + allOnesReg := m.c.AllocateVReg(ssa.TypeV128) + mask := m.getOrAllocateConstLabel(&m.constAllOnesI16x8Index, allOnesI16x8[:]) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), allOnesReg)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, newOperandReg(allOnesReg), xx)) + m.copyTo(xx, m.c.VRegOf(ret)) + } else { + maskReg := m.c.AllocateVReg(ssa.TypeV128) + mask := m.getOrAllocateConstLabel(&m.constExtAddPairwiseI16x8uMask1Index, extAddPairwiseI16x8uMask1[:]) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg)) + + // Flip the sign bits on xx. 
+ // + // Assuming that xx = [w1, ..., w8], now we have, + // xx[i] = int8(-w1) for i = 0...8 + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(maskReg), xx)) + + mask = m.getOrAllocateConstLabel(&m.constAllOnesI16x8Index, allOnesI16x8[:]) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg)) + + // For i = 0,..4 (as this results in i32x4 lanes), now we have + // xx[i] = int32(-wn + -w(n+1)) = int32(-(wn + w(n+1))) + // c.assembler.CompileRegisterToRegister(amd64.PMADDWD, tmp, vr) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, newOperandReg(maskReg), xx)) + + mask = m.getOrAllocateConstLabel(&m.constExtAddPairwiseI16x8uMask2Index, extAddPairwiseI16x8uMask2[:]) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg)) + + // vr[i] = int32(-(wn + w(n+1))) + int32(math.MaxInt16+1) = int32((wn + w(n+1))) = uint32(wn + w(n+1)). + // c.assembler.CompileRegisterToRegister(amd64.PADDD, tmp, vr) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddd, newOperandReg(maskReg), xx)) + + m.copyTo(xx, m.c.VRegOf(ret)) + } + default: + panic(fmt.Sprintf("invalid lane type: %s", srcLane)) + } +} + +func (m *machine) lowerWidenLow(x, ret ssa.Value, lane ssa.VecLane, signed bool) { + var sseOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + if signed { + sseOp = sseOpcodePmovsxbw + } else { + sseOp = sseOpcodePmovzxbw + } + case ssa.VecLaneI16x8: + if signed { + sseOp = sseOpcodePmovsxwd + } else { + sseOp = sseOpcodePmovzxwd + } + case ssa.VecLaneI32x4: + if signed { + sseOp = sseOpcodePmovsxdq + } else { + sseOp = sseOpcodePmovzxdq + } + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x)) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, xx, m.c.VRegOf(ret))) +} + +func (m *machine) lowerWidenHigh(x, ret ssa.Value, lane ssa.VecLane, signed bool) { + tmp := m.c.AllocateVReg(ssa.TypeV128) + xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + m.copyTo(xx.reg(), tmp) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePalignr, 8, newOperandReg(tmp), tmp)) + + var sseOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + if signed { + sseOp = sseOpcodePmovsxbw + } else { + sseOp = sseOpcodePmovzxbw + } + case ssa.VecLaneI16x8: + if signed { + sseOp = sseOpcodePmovsxwd + } else { + sseOp = sseOpcodePmovzxwd + } + case ssa.VecLaneI32x4: + if signed { + sseOp = sseOpcodePmovsxdq + } else { + sseOp = sseOpcodePmovzxdq + } + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, newOperandReg(tmp), m.c.VRegOf(ret))) +} + +func (m *machine) lowerLoadSplat(ptr ssa.Value, offset uint32, ret ssa.Value, lane ssa.VecLane) { + tmpDst, tmpGp := m.c.AllocateVReg(ssa.TypeV128), m.c.AllocateVReg(ssa.TypeI64) + am := newOperandMem(m.lowerToAddressMode(ptr, offset)) + + m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst)) + switch lane { + case ssa.VecLaneI8x16: + m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, am, tmpGp)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, 0, newOperandReg(tmpGp), tmpDst)) + tmpZeroVec := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asZeros(tmpZeroVec)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmpZeroVec), tmpDst)) + case ssa.VecLaneI16x8: + m.insert(m.allocateInstr().asMovzxRmR(extModeWQ, am, tmpGp)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 0, 
newOperandReg(tmpGp), tmpDst))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 1, newOperandReg(tmpGp), tmpDst))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst))
+ case ssa.VecLaneI32x4:
+ m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, am, tmpGp))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, 0, newOperandReg(tmpGp), tmpDst))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst))
+ case ssa.VecLaneI64x2:
+ m.insert(m.allocateInstr().asMov64MR(am, tmpGp))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, newOperandReg(tmpGp), tmpDst))
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, newOperandReg(tmpGp), tmpDst))
+ default:
+ panic(fmt.Sprintf("invalid lane type: %s", lane))
+ }
+
+ m.copyTo(tmpDst, m.c.VRegOf(ret))
+}
+
+var f64x2CvtFromIMask = [16]byte{
+ 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+}
+
+func (m *machine) lowerVFcvtFromInt(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
+ switch lane {
+ case ssa.VecLaneF32x4:
+ if signed {
+ xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, xx, m.c.VRegOf(ret)))
+ } else {
+ xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+ // Copy the value to two temporary registers.
+ tmp := m.copyToTmp(xx.reg())
+ tmp2 := m.copyToTmp(xx.reg())
+
+ // Clear the upper bits of each 32-bit element so that tmp keeps only the low bits, which convert to float32 exactly.
+ m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePslld, newOperandImm32(0xa), tmp))
+ m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(0xa), tmp))
+
+ // Subtract tmp (the low bits) from tmp2, so that tmp2 keeps only the upper bits.
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodePsubd, newOperandReg(tmp), tmp2))
+
+ // Convert the low bits held in tmp.
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp), tmp))
+
+ // Logically shift tmp2 right by one (halving it) and convert, so tmp2 holds half of the upper-bits contribution.
+ m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(1), tmp2))
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp2), tmp2))
+
+ // Double it to recover the full upper-bits contribution.
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddps, newOperandReg(tmp2), tmp2))
+
+ // Get the conversion result by adding tmp (the converted low bits) into tmp2.
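+ // Both partial conversions above are exact (each part has few enough significant bits
+ // to fit a float32 mantissa), so this final ADDPS is the only step that can round,
+ // yielding the correctly rounded unsigned conversion.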
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddps, newOperandReg(tmp), tmp2))
+
+ m.copyTo(tmp2, m.c.VRegOf(ret))
+ }
+ case ssa.VecLaneF64x2:
+ if signed {
+ xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2pd, xx, m.c.VRegOf(ret)))
+ } else {
+ maskReg := m.c.AllocateVReg(ssa.TypeV128)
+ maskLabel := m.getOrAllocateConstLabel(&m.constF64x2CvtFromIMaskIndex, f64x2CvtFromIMask[:])
+ // maskReg = [0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), maskReg))
+
+ _xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+ xx := m.copyToTmp(_xx.reg())
+
+ // Given that we have xx = [d1, d2, d3, d4], this results in
+ //	xx = [d1, [0x00, 0x00, 0x30, 0x43], d2, [0x00, 0x00, 0x30, 0x43]]
+ //	   = [float64(uint32(d1)) + 0x1.0p52, float64(uint32(d2)) + 0x1.0p52]
+ //     ^See https://stackoverflow.com/questions/13269523/can-all-32-bit-ints-be-exactly-represented-as-a-double
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeUnpcklps, newOperandReg(maskReg), xx))
+
+ // maskReg = [float64(0x1.0p52), float64(0x1.0p52)]
+ maskLabel = m.getOrAllocateConstLabel(&m.constTwop52Index, twop52[:])
+ m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), maskReg))
+
+ // Now, we get the result as
+ //	xx = [float64(uint32(d1)), float64(uint32(d2))]
+ // because the following equality always holds:
+ //	float64(0x1.0p52 + float64(uint32(x))) - float64(0x1.0p52 + float64(uint32(y))) = float64(uint32(x)) - float64(uint32(y))
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeSubpd, newOperandReg(maskReg), xx))
+
+ m.copyTo(xx, m.c.VRegOf(ret))
+ }
+ default:
+ panic(fmt.Sprintf("invalid lane type: %s", lane))
+ }
+}
+
+var (
+ // i32sMaxOnF64x2 holds math.MaxInt32(=2147483647.0) on two f64 lanes.
+ i32sMaxOnF64x2 = [16]byte{
+ 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
+ 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
+ }
+
+ // i32uMaxOnF64x2 holds math.MaxUint32(=4294967295.0) on two f64 lanes.
+ i32uMaxOnF64x2 = [16]byte{
+ 0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
+ 0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
+ }
+
+ // twop52 holds two float64(0x1.0p52) on two f64 lanes. 0x1.0p52 is special in the sense that,
+ // with this exponent, the low bits of the mantissa hold an exact uint32 value, so after
+ // arithmetic such as addition or subtraction the resulting float64 carries that 32-bit
+ // integer bit-for-bit in its mantissa.
+ //
+ // Note: the name twop52 is common across various compiler ecosystems.
+ // E.g. https://github.com/llvm/llvm-project/blob/92ab024f81e5b64e258b7c3baaf213c7c26fcf40/compiler-rt/lib/builtins/floatdidf.c#L28
+ // E.g. https://opensource.apple.com/source/clang/clang-425.0.24/src/projects/compiler-rt/lib/floatdidf.c.auto.html
+ twop52 = [16]byte{
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
+ }
+)
+
+func (m *machine) lowerVFcvtToIntSat(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
+ _xx := m.getOperand_Reg(m.c.ValueDefinition(x))
+ xx := m.copyToTmp(_xx.reg())
+
+ switch lane {
+ case ssa.VecLaneF32x4:
+ if signed {
+ tmp := m.copyToTmp(xx)
+
+ // Assuming we have xx = [v1, v2, v3, v4].
+ // + // Set all bits if lane is not NaN on tmp. + // tmp[i] = 0xffffffff if vi != NaN + // = 0 if vi == NaN + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmpps, uint8(cmpPredEQ_OQ), newOperandReg(tmp), tmp)) + + // Clear NaN lanes on xx, meaning that + // xx[i] = vi if vi != NaN + // 0 if vi == NaN + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(tmp), xx)) + + // tmp[i] = ^vi if vi != NaN + // = 0xffffffff if vi == NaN + // which means that tmp[i] & 0x80000000 != 0 if and only if vi is negative. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeXorps, newOperandReg(xx), tmp)) + + // xx[i] = int32(vi) if vi != NaN and xx is not overflowing. + // = 0x80000000 if vi != NaN and xx is overflowing (See https://www.felixcloutier.com/x86/cvttps2dq) + // = 0 if vi == NaN + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(xx), xx)) + + // Below, we have to convert 0x80000000 into 0x7FFFFFFF for positive overflowing lane. + // + // tmp[i] = 0x80000000 if vi is positive + // = any satisfying any&0x80000000 = 0 if vi is negative or zero. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(xx), tmp)) + + // Arithmetic right shifting tmp by 31, meaning that we have + // tmp[i] = 0xffffffff if vi is positive, 0 otherwise. + m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrad, newOperandImm32(0x1f), tmp)) + + // Flipping 0x80000000 if vi is positive, otherwise keep intact. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), xx)) + } else { + tmp := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asZeros(tmp)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMaxps, newOperandReg(tmp), xx)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp), tmp)) + m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(0x1), tmp)) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp), tmp)) + tmp2 := m.copyToTmp(xx) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(xx), xx)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeSubps, newOperandReg(tmp), tmp2)) + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmpps, uint8(cmpPredLE_OS), newOperandReg(tmp2), tmp)) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(tmp2), tmp2)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp2)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaxsd, newOperandReg(tmp), tmp2)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddd, newOperandReg(tmp2), xx)) + } + + case ssa.VecLaneF64x2: + tmp2 := m.c.AllocateVReg(ssa.TypeV128) + if signed { + tmp := m.copyToTmp(xx) + + // Set all bits for non-NaN lanes, zeros otherwise. + // I.e. tmp[i] = 0xffffffff_ffffffff if vi != NaN, 0 otherwise. + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmppd, uint8(cmpPredEQ_OQ), newOperandReg(tmp), tmp)) + + maskLabel := m.getOrAllocateConstLabel(&m.constI32sMaxOnF64x2Index, i32sMaxOnF64x2[:]) + // Load the 2147483647 into tmp2's each lane. + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), tmp2)) + + // tmp[i] = 2147483647 if vi != NaN, 0 otherwise. 
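+ // 2147483647.0 is used because CVTTPD2DQ returns the 0x80000000 "integer indefinite"
+ // for anything outside the int32 range: clamping positive lanes to it below gives the
+ // positive saturation required by Wasm, while out-of-range negative lanes already
+ // produce 0x80000000 == math.MinInt32, the saturated value on that side.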
+ m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(tmp2), tmp)) + + // MINPD returns the source register's value as-is, so we have + // xx[i] = vi if vi != NaN + // = 0 if vi == NaN + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMinpd, newOperandReg(tmp), xx)) + + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttpd2dq, newOperandReg(xx), xx)) + } else { + tmp := m.c.AllocateVReg(ssa.TypeV128) + m.insert(m.allocateInstr().asZeros(tmp)) + + // xx[i] = vi if vi != NaN && vi > 0 + // = 0 if vi == NaN || vi <= 0 + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMaxpd, newOperandReg(tmp), xx)) + + // tmp2[i] = float64(math.MaxUint32) = math.MaxUint32 + maskIndex := m.getOrAllocateConstLabel(&m.constI32uMaxOnF64x2Index, i32uMaxOnF64x2[:]) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskIndex)), tmp2)) + + // xx[i] = vi if vi != NaN && vi > 0 && vi <= math.MaxUint32 + // = 0 otherwise + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMinpd, newOperandReg(tmp2), xx)) + + // Round the floating points into integer. + m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeRoundpd, 0x3, newOperandReg(xx), xx)) + + // tmp2[i] = float64(0x1.0p52) + maskIndex = m.getOrAllocateConstLabel(&m.constTwop52Index, twop52[:]) + m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskIndex)), tmp2)) + + // xx[i] = float64(0x1.0p52) + float64(uint32(vi)) if vi != NaN && vi > 0 && vi <= math.MaxUint32 + // = 0 otherwise + // + // This means that xx[i] holds exactly the same bit of uint32(vi) in its lower 32-bits. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddpd, newOperandReg(tmp2), xx)) + + // At this point, we have + // xx = [uint32(v0), float64(0x1.0p52), uint32(v1), float64(0x1.0p52)] + // tmp = [0, 0, 0, 0] + // as 32x4 lanes. Therefore, SHUFPS with 0b00_00_10_00 results in + // xx = [xx[00], xx[10], tmp[00], tmp[00]] = [xx[00], xx[10], 0, 0] + // meaning that for i = 0 and 1, we have + // xx[i] = uint32(vi) if vi != NaN && vi > 0 && vi <= math.MaxUint32 + // = 0 otherwise. 
+ m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeShufps, 0b00_00_10_00, newOperandReg(tmp), xx)) + } + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + + m.copyTo(xx, m.c.VRegOf(ret)) +} + +func (m *machine) lowerNarrow(x, y, ret ssa.Value, lane ssa.VecLane, signed bool) { + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xx := m.copyToTmp(_xx.reg()) + yy := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + + var sseOp sseOpcode + switch lane { + case ssa.VecLaneI16x8: + if signed { + sseOp = sseOpcodePacksswb + } else { + sseOp = sseOpcodePackuswb + } + case ssa.VecLaneI32x4: + if signed { + sseOp = sseOpcodePackssdw + } else { + sseOp = sseOpcodePackusdw + } + default: + panic(fmt.Sprintf("invalid lane type: %s", lane)) + } + m.insert(m.allocateInstr().asXmmRmR(sseOp, yy, xx)) + m.copyTo(xx, m.c.VRegOf(ret)) +} + +func (m *machine) lowerWideningPairwiseDotProductS(x, y, ret ssa.Value) { + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + xx := m.copyToTmp(_xx.reg()) + yy := m.getOperand_Mem_Reg(m.c.ValueDefinition(y)) + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, yy, xx)) + m.copyTo(xx, m.c.VRegOf(ret)) +} + +func (m *machine) lowerVIabs(instr *ssa.Instruction) { + x, lane := instr.ArgWithLane() + rd := m.c.VRegOf(instr.Return()) + + if lane == ssa.VecLaneI64x2 { + _xx := m.getOperand_Reg(m.c.ValueDefinition(x)) + + blendReg := xmm0VReg + m.insert(m.allocateInstr().asDefineUninitializedReg(blendReg)) + + tmp := m.copyToTmp(_xx.reg()) + xx := m.copyToTmp(_xx.reg()) + + // Clear all bits on blendReg. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(blendReg), blendReg)) + // Subtract xx from blendMaskReg. + m.insert(m.allocateInstr().asXmmRmR(sseOpcodePsubq, newOperandReg(xx), blendReg)) + // Copy the subtracted value ^^ back into tmp. + m.copyTo(blendReg, xx) + + m.insert(m.allocateInstr().asBlendvpd(newOperandReg(tmp), xx)) + + m.copyTo(xx, rd) + } else { + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI8x16: + vecOp = sseOpcodePabsb + case ssa.VecLaneI16x8: + vecOp = sseOpcodePabsw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePabsd + } + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + + i := m.allocateInstr() + i.asXmmUnaryRmR(vecOp, rn, rd) + m.insert(i) + } +} + +func (m *machine) lowerVIpopcnt(instr *ssa.Instruction) { + x := instr.Arg() + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rd := m.c.VRegOf(instr.Return()) + + tmp1 := m.c.AllocateVReg(ssa.TypeV128) + m.lowerVconst(tmp1, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f) + + // Copy input into tmp2. + tmp2 := m.copyToTmp(rn.reg()) + + // Given that we have: + // rm = [b1, ..., b16] where bn = hn:ln and hn and ln are higher and lower 4-bits of bn. + // + // Take PAND on tmp1 and tmp2, so that we mask out all the higher bits. + // tmp2 = [l1, ..., l16]. + pand := m.allocateInstr() + pand.asXmmRmR(sseOpcodePand, newOperandReg(tmp1), tmp2) + m.insert(pand) + + // Do logical (packed word) right shift by 4 on rm and PAND against the mask (tmp1); meaning that we have + // tmp3 = [h1, ...., h16]. 
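+ // Each nibble is then used as a PSHUFB index into the 16-entry popcount table loaded
+ // below, and the two per-nibble counts are summed per byte. For example, for the byte
+ // 0xb5, the low nibble 0x5 contributes 2 and the high nibble 0xb contributes 3, so the
+ // resulting lane is 5.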
+ tmp3 := m.copyToTmp(rn.reg()) + psrlw := m.allocateInstr() + psrlw.asXmmRmiReg(sseOpcodePsrlw, newOperandImm32(4), tmp3) + m.insert(psrlw) + + pand2 := m.allocateInstr() + pand2.asXmmRmR(sseOpcodePand, newOperandReg(tmp1), tmp3) + m.insert(pand2) + + // Read the popcntTable into tmp4, and we have + // tmp4 = [0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04] + tmp4 := m.c.AllocateVReg(ssa.TypeV128) + m.lowerVconst(tmp4, 0x03_02_02_01_02_01_01_00, 0x04_03_03_02_03_02_02_01) + + // Make a copy for later. + tmp5 := m.copyToTmp(tmp4) + + // tmp4 = [popcnt(l1), ..., popcnt(l16)]. + pshufb := m.allocateInstr() + pshufb.asXmmRmR(sseOpcodePshufb, newOperandReg(tmp2), tmp4) + m.insert(pshufb) + + pshufb2 := m.allocateInstr() + pshufb2.asXmmRmR(sseOpcodePshufb, newOperandReg(tmp3), tmp5) + m.insert(pshufb2) + + // tmp4 + tmp5 is the result. + paddb := m.allocateInstr() + paddb.asXmmRmR(sseOpcodePaddb, newOperandReg(tmp4), tmp5) + m.insert(paddb) + + m.copyTo(tmp5, rd) +} + +func (m *machine) lowerVImul(instr *ssa.Instruction) { + x, y, lane := instr.Arg2WithLane() + rd := m.c.VRegOf(instr.Return()) + if lane == ssa.VecLaneI64x2 { + rn := m.getOperand_Reg(m.c.ValueDefinition(x)) + rm := m.getOperand_Reg(m.c.ValueDefinition(y)) + // Assuming that we have + // rm = [p1, p2] = [p1_lo, p1_hi, p2_lo, p2_high] + // rn = [q1, q2] = [q1_lo, q1_hi, q2_lo, q2_high] + // where pN and qN are 64-bit (quad word) lane, and pN_lo, pN_hi, qN_lo and qN_hi are 32-bit (double word) lane. + + // Copy rn into tmp1. + tmp1 := m.copyToTmp(rn.reg()) + + // And do the logical right shift by 32-bit on tmp1, which makes tmp1 = [0, p1_high, 0, p2_high] + shift := m.allocateInstr() + shift.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(32), tmp1) + m.insert(shift) + + // Execute "pmuludq rm,tmp1", which makes tmp1 = [p1_high*q1_lo, p2_high*q2_lo] where each lane is 64-bit. + mul := m.allocateInstr() + mul.asXmmRmR(sseOpcodePmuludq, rm, tmp1) + m.insert(mul) + + // Copy rm value into tmp2. + tmp2 := m.copyToTmp(rm.reg()) + + // And do the logical right shift by 32-bit on tmp2, which makes tmp2 = [0, q1_high, 0, q2_high] + shift2 := m.allocateInstr() + shift2.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(32), tmp2) + m.insert(shift2) + + // Execute "pmuludq rm,tmp2", which makes tmp2 = [p1_lo*q1_high, p2_lo*q2_high] where each lane is 64-bit. + mul2 := m.allocateInstr() + mul2.asXmmRmR(sseOpcodePmuludq, rn, tmp2) + m.insert(mul2) + + // Adds tmp1 and tmp2 and do the logical left shift by 32-bit, + // which makes tmp1 = [(p1_lo*q1_high+p1_high*q1_lo)<<32, (p2_lo*q2_high+p2_high*q2_lo)<<32] + add := m.allocateInstr() + add.asXmmRmR(sseOpcodePaddq, newOperandReg(tmp2), tmp1) + m.insert(add) + + shift3 := m.allocateInstr() + shift3.asXmmRmiReg(sseOpcodePsllq, newOperandImm32(32), tmp1) + m.insert(shift3) + + // Copy rm value into tmp3. + tmp3 := m.copyToTmp(rm.reg()) + + // "pmuludq rm,tmp3" makes tmp3 = [p1_lo*q1_lo, p2_lo*q2_lo] where each lane is 64-bit. 
+ mul3 := m.allocateInstr() + mul3.asXmmRmR(sseOpcodePmuludq, rn, tmp3) + m.insert(mul3) + + // Finally, we get the result by computing tmp1 + tmp3, + // which makes tmp1 = [(p1_lo*q1_high+p1_high*q1_lo)<<32+p1_lo*q1_lo, (p2_lo*q2_high+p2_high*q2_lo)<<32+p2_lo*q2_lo] + add2 := m.allocateInstr() + add2.asXmmRmR(sseOpcodePaddq, newOperandReg(tmp3), tmp1) + m.insert(add2) + + m.copyTo(tmp1, rd) + + } else { + var vecOp sseOpcode + switch lane { + case ssa.VecLaneI16x8: + vecOp = sseOpcodePmullw + case ssa.VecLaneI32x4: + vecOp = sseOpcodePmulld + default: + panic("unsupported: " + lane.String()) + } + m.lowerVbBinOp(vecOp, x, y, instr.Return()) + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/operands.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/operands.go new file mode 100644 index 000000000..c6fcb8673 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/operands.go @@ -0,0 +1,346 @@ +package amd64 + +import ( + "fmt" + "unsafe" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +type operand struct { + kind operandKind + data uint64 +} + +type operandKind byte + +const ( + // operandKindReg is an operand which is an integer Register. + operandKindReg operandKind = iota + 1 + + // operandKindMem is a value in Memory. + // 32, 64, or 128 bit value. + operandKindMem + + // operandKindImm32 is a signed-32-bit integer immediate value. + operandKindImm32 + + // operandKindLabel is a label. + operandKindLabel +) + +// String implements fmt.Stringer. +func (o operandKind) String() string { + switch o { + case operandKindReg: + return "reg" + case operandKindMem: + return "mem" + case operandKindImm32: + return "imm32" + case operandKindLabel: + return "label" + default: + panic("BUG: invalid operand kind") + } +} + +// format returns the string representation of the operand. +// _64 is only for the case where the operand is a register, and it's integer. 
+func (o *operand) format(_64 bool) string { + switch o.kind { + case operandKindReg: + return formatVRegSized(o.reg(), _64) + case operandKindMem: + return o.addressMode().String() + case operandKindImm32: + return fmt.Sprintf("$%d", int32(o.imm32())) + case operandKindLabel: + return backend.Label(o.imm32()).String() + default: + panic(fmt.Sprintf("BUG: invalid operand: %s", o.kind)) + } +} + +//go:inline +func (o *operand) reg() regalloc.VReg { + return regalloc.VReg(o.data) +} + +//go:inline +func (o *operand) setReg(r regalloc.VReg) { + o.data = uint64(r) +} + +//go:inline +func (o *operand) addressMode() *amode { + return wazevoapi.PtrFromUintptr[amode](uintptr(o.data)) +} + +//go:inline +func (o *operand) imm32() uint32 { + return uint32(o.data) +} + +func (o *operand) label() backend.Label { + switch o.kind { + case operandKindLabel: + return backend.Label(o.data) + case operandKindMem: + mem := o.addressMode() + if mem.kind() != amodeRipRel { + panic("BUG: invalid label") + } + return backend.Label(mem.imm32) + default: + panic("BUG: invalid operand kind") + } +} + +func newOperandLabel(label backend.Label) operand { + return operand{kind: operandKindLabel, data: uint64(label)} +} + +func newOperandReg(r regalloc.VReg) operand { + return operand{kind: operandKindReg, data: uint64(r)} +} + +func newOperandImm32(imm32 uint32) operand { + return operand{kind: operandKindImm32, data: uint64(imm32)} +} + +func newOperandMem(amode *amode) operand { + return operand{kind: operandKindMem, data: uint64(uintptr(unsafe.Pointer(amode)))} +} + +// amode is a memory operand (addressing mode). +type amode struct { + kindWithShift uint32 + imm32 uint32 + base regalloc.VReg + + // For amodeRegRegShift: + index regalloc.VReg +} + +type amodeKind byte + +const ( + // amodeRegRegShift calculates sign-extend-32-to-64(Immediate) + base + amodeImmReg amodeKind = iota + 1 + + // amodeImmRBP is the same as amodeImmReg, but the base register is fixed to RBP. + // The only differece is that it doesn't tell the register allocator to use RBP which is distracting for the + // register allocator. + amodeImmRBP + + // amodeRegRegShift calculates sign-extend-32-to-64(Immediate) + base + (Register2 << Shift) + amodeRegRegShift + + // amodeRipRel is a RIP-relative addressing mode specified by the label. + amodeRipRel + + // TODO: there are other addressing modes such as the one without base register. 
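+ // For reference, an amodeRegRegShift with imm32 = 8, base = rax, index = rcx and
+ // shift = 1 addresses 8 + rax + (rcx << 1), and String() renders it as "8(%rax,%rcx,2)".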
+) + +func (a *amode) kind() amodeKind { + return amodeKind(a.kindWithShift & 0xff) +} + +func (a *amode) shift() byte { + return byte(a.kindWithShift >> 8) +} + +func (a *amode) uses(rs *[]regalloc.VReg) { + switch a.kind() { + case amodeImmReg: + *rs = append(*rs, a.base) + case amodeRegRegShift: + *rs = append(*rs, a.base, a.index) + case amodeImmRBP, amodeRipRel: + default: + panic("BUG: invalid amode kind") + } +} + +func (a *amode) nregs() int { + switch a.kind() { + case amodeImmReg: + return 1 + case amodeRegRegShift: + return 2 + case amodeImmRBP, amodeRipRel: + return 0 + default: + panic("BUG: invalid amode kind") + } +} + +func (a *amode) assignUses(i int, reg regalloc.VReg) { + switch a.kind() { + case amodeImmReg: + if i == 0 { + a.base = reg + } else { + panic("BUG: invalid amode assignment") + } + case amodeRegRegShift: + if i == 0 { + a.base = reg + } else if i == 1 { + a.index = reg + } else { + panic("BUG: invalid amode assignment") + } + default: + panic("BUG: invalid amode assignment") + } +} + +func (m *machine) newAmodeImmReg(imm32 uint32, base regalloc.VReg) *amode { + ret := m.amodePool.Allocate() + *ret = amode{kindWithShift: uint32(amodeImmReg), imm32: imm32, base: base} + return ret +} + +func (m *machine) newAmodeImmRBPReg(imm32 uint32) *amode { + ret := m.amodePool.Allocate() + *ret = amode{kindWithShift: uint32(amodeImmRBP), imm32: imm32, base: rbpVReg} + return ret +} + +func (m *machine) newAmodeRegRegShift(imm32 uint32, base, index regalloc.VReg, shift byte) *amode { + if shift > 3 { + panic(fmt.Sprintf("BUG: invalid shift (must be 3>=): %d", shift)) + } + ret := m.amodePool.Allocate() + *ret = amode{kindWithShift: uint32(amodeRegRegShift) | uint32(shift)<<8, imm32: imm32, base: base, index: index} + return ret +} + +func (m *machine) newAmodeRipRel(label backend.Label) *amode { + ret := m.amodePool.Allocate() + *ret = amode{kindWithShift: uint32(amodeRipRel), imm32: uint32(label)} + return ret +} + +// String implements fmt.Stringer. +func (a *amode) String() string { + switch a.kind() { + case amodeImmReg, amodeImmRBP: + if a.imm32 == 0 { + return fmt.Sprintf("(%s)", formatVRegSized(a.base, true)) + } + return fmt.Sprintf("%d(%s)", int32(a.imm32), formatVRegSized(a.base, true)) + case amodeRegRegShift: + shift := 1 << a.shift() + if a.imm32 == 0 { + return fmt.Sprintf( + "(%s,%s,%d)", + formatVRegSized(a.base, true), formatVRegSized(a.index, true), shift) + } + return fmt.Sprintf( + "%d(%s,%s,%d)", + int32(a.imm32), formatVRegSized(a.base, true), formatVRegSized(a.index, true), shift) + case amodeRipRel: + return fmt.Sprintf("%s(%%rip)", backend.Label(a.imm32)) + default: + panic("BUG: invalid amode kind") + } +} + +func (m *machine) getOperand_Mem_Reg(def *backend.SSAValueDefinition) (op operand) { + if def.IsFromBlockParam() { + return newOperandReg(def.BlkParamVReg) + } + + if def.SSAValue().Type() == ssa.TypeV128 { + // SIMD instructions require strict memory alignment, so we don't support the memory operand for V128 at the moment. 
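+ // (Unaligned full-register loads such as MOVDQU are fine, but folding a load into a
+ // legacy SSE instruction as a 128-bit memory operand generally requires 16-byte
+ // alignment, which Wasm linear memory does not guarantee.)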
+ return m.getOperand_Reg(def) + } + + if m.c.MatchInstr(def, ssa.OpcodeLoad) { + instr := def.Instr + ptr, offset, _ := instr.LoadData() + op = newOperandMem(m.lowerToAddressMode(ptr, offset)) + instr.MarkLowered() + return op + } + return m.getOperand_Reg(def) +} + +func (m *machine) getOperand_Mem_Imm32_Reg(def *backend.SSAValueDefinition) (op operand) { + if def.IsFromBlockParam() { + return newOperandReg(def.BlkParamVReg) + } + + if m.c.MatchInstr(def, ssa.OpcodeLoad) { + instr := def.Instr + ptr, offset, _ := instr.LoadData() + op = newOperandMem(m.lowerToAddressMode(ptr, offset)) + instr.MarkLowered() + return op + } + return m.getOperand_Imm32_Reg(def) +} + +func (m *machine) getOperand_Imm32_Reg(def *backend.SSAValueDefinition) (op operand) { + if def.IsFromBlockParam() { + return newOperandReg(def.BlkParamVReg) + } + + instr := def.Instr + if instr.Constant() { + // If the operation is 64-bit, x64 sign-extends the 32-bit immediate value. + // Therefore, we need to check if the immediate value is within the 32-bit range and if the sign bit is set, + // we should not use the immediate value. + if op, ok := asImm32Operand(instr.ConstantVal(), instr.Return().Type() == ssa.TypeI32); ok { + instr.MarkLowered() + return op + } + } + return m.getOperand_Reg(def) +} + +func asImm32Operand(val uint64, allowSignExt bool) (operand, bool) { + if imm32, ok := asImm32(val, allowSignExt); ok { + return newOperandImm32(imm32), true + } + return operand{}, false +} + +func asImm32(val uint64, allowSignExt bool) (uint32, bool) { + u32val := uint32(val) + if uint64(u32val) != val { + return 0, false + } + if !allowSignExt && u32val&0x80000000 != 0 { + return 0, false + } + return u32val, true +} + +func (m *machine) getOperand_Reg(def *backend.SSAValueDefinition) (op operand) { + var v regalloc.VReg + if def.IsFromBlockParam() { + v = def.BlkParamVReg + } else { + instr := def.Instr + if instr.Constant() { + // We inline all the constant instructions so that we could reduce the register usage. + v = m.lowerConstant(instr) + instr.MarkLowered() + } else { + if n := def.N; n == 0 { + v = m.c.VRegOf(instr.Return()) + } else { + _, rs := instr.Returns() + v = m.c.VRegOf(rs[n-1]) + } + } + } + return newOperandReg(v) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect.go new file mode 100644 index 000000000..5219837e3 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect.go @@ -0,0 +1,11 @@ +//go:build !tinygo + +package amd64 + +import "reflect" + +// setSliceLimits sets both Cap and Len for the given reflected slice. +func setSliceLimits(s *reflect.SliceHeader, limit uintptr) { + s.Len = int(limit) + s.Cap = int(limit) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect_tinygo.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect_tinygo.go new file mode 100644 index 000000000..df4cf46ec --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reflect_tinygo.go @@ -0,0 +1,11 @@ +//go:build tinygo + +package amd64 + +import "reflect" + +// setSliceLimits sets both Cap and Len for the given reflected slice. 
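+// The limit is assigned without the int conversion used in reflect.go because TinyGo's
+// reflect.SliceHeader declares Len and Cap as uintptr rather than int.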
+func setSliceLimits(s *reflect.SliceHeader, limit uintptr) {
+ s.Len = limit
+ s.Cap = limit
+}
diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reg.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reg.go
new file mode 100644
index 000000000..4aec856fa
--- /dev/null
+++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/reg.go
@@ -0,0 +1,181 @@
+package amd64
+
+import (
+ "fmt"
+
+ "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
+)
+
+// Amd64-specific registers.
+const (
+ // rax is a gp register.
+ rax = regalloc.RealRegInvalid + 1 + iota
+ // rcx is a gp register.
+ rcx
+ // rdx is a gp register.
+ rdx
+ // rbx is a gp register.
+ rbx
+ // rsp is a gp register.
+ rsp
+ // rbp is a gp register.
+ rbp
+ // rsi is a gp register.
+ rsi
+ // rdi is a gp register.
+ rdi
+ // r8 is a gp register.
+ r8
+ // r9 is a gp register.
+ r9
+ // r10 is a gp register.
+ r10
+ // r11 is a gp register.
+ r11
+ // r12 is a gp register.
+ r12
+ // r13 is a gp register.
+ r13
+ // r14 is a gp register.
+ r14
+ // r15 is a gp register.
+ r15
+
+ // xmm0 is a vector register.
+ xmm0
+ // xmm1 is a vector register.
+ xmm1
+ // xmm2 is a vector register.
+ xmm2
+ // xmm3 is a vector register.
+ xmm3
+ // xmm4 is a vector register.
+ xmm4
+ // xmm5 is a vector register.
+ xmm5
+ // xmm6 is a vector register.
+ xmm6
+ // xmm7 is a vector register.
+ xmm7
+ // xmm8 is a vector register.
+ xmm8
+ // xmm9 is a vector register.
+ xmm9
+ // xmm10 is a vector register.
+ xmm10
+ // xmm11 is a vector register.
+ xmm11
+ // xmm12 is a vector register.
+ xmm12
+ // xmm13 is a vector register.
+ xmm13
+ // xmm14 is a vector register.
+ xmm14
+ // xmm15 is a vector register.
+ xmm15 +) + +var ( + raxVReg = regalloc.FromRealReg(rax, regalloc.RegTypeInt) + rcxVReg = regalloc.FromRealReg(rcx, regalloc.RegTypeInt) + rdxVReg = regalloc.FromRealReg(rdx, regalloc.RegTypeInt) + rbxVReg = regalloc.FromRealReg(rbx, regalloc.RegTypeInt) + rspVReg = regalloc.FromRealReg(rsp, regalloc.RegTypeInt) + rbpVReg = regalloc.FromRealReg(rbp, regalloc.RegTypeInt) + rsiVReg = regalloc.FromRealReg(rsi, regalloc.RegTypeInt) + rdiVReg = regalloc.FromRealReg(rdi, regalloc.RegTypeInt) + r8VReg = regalloc.FromRealReg(r8, regalloc.RegTypeInt) + r9VReg = regalloc.FromRealReg(r9, regalloc.RegTypeInt) + r10VReg = regalloc.FromRealReg(r10, regalloc.RegTypeInt) + r11VReg = regalloc.FromRealReg(r11, regalloc.RegTypeInt) + r12VReg = regalloc.FromRealReg(r12, regalloc.RegTypeInt) + r13VReg = regalloc.FromRealReg(r13, regalloc.RegTypeInt) + r14VReg = regalloc.FromRealReg(r14, regalloc.RegTypeInt) + r15VReg = regalloc.FromRealReg(r15, regalloc.RegTypeInt) + + xmm0VReg = regalloc.FromRealReg(xmm0, regalloc.RegTypeFloat) + xmm1VReg = regalloc.FromRealReg(xmm1, regalloc.RegTypeFloat) + xmm2VReg = regalloc.FromRealReg(xmm2, regalloc.RegTypeFloat) + xmm3VReg = regalloc.FromRealReg(xmm3, regalloc.RegTypeFloat) + xmm4VReg = regalloc.FromRealReg(xmm4, regalloc.RegTypeFloat) + xmm5VReg = regalloc.FromRealReg(xmm5, regalloc.RegTypeFloat) + xmm6VReg = regalloc.FromRealReg(xmm6, regalloc.RegTypeFloat) + xmm7VReg = regalloc.FromRealReg(xmm7, regalloc.RegTypeFloat) + xmm8VReg = regalloc.FromRealReg(xmm8, regalloc.RegTypeFloat) + xmm9VReg = regalloc.FromRealReg(xmm9, regalloc.RegTypeFloat) + xmm10VReg = regalloc.FromRealReg(xmm10, regalloc.RegTypeFloat) + xmm11VReg = regalloc.FromRealReg(xmm11, regalloc.RegTypeFloat) + xmm12VReg = regalloc.FromRealReg(xmm12, regalloc.RegTypeFloat) + xmm13VReg = regalloc.FromRealReg(xmm13, regalloc.RegTypeFloat) + xmm14VReg = regalloc.FromRealReg(xmm14, regalloc.RegTypeFloat) + xmm15VReg = regalloc.FromRealReg(xmm15, regalloc.RegTypeFloat) +) + +var regNames = [...]string{ + rax: "rax", + rcx: "rcx", + rdx: "rdx", + rbx: "rbx", + rsp: "rsp", + rbp: "rbp", + rsi: "rsi", + rdi: "rdi", + r8: "r8", + r9: "r9", + r10: "r10", + r11: "r11", + r12: "r12", + r13: "r13", + r14: "r14", + r15: "r15", + xmm0: "xmm0", + xmm1: "xmm1", + xmm2: "xmm2", + xmm3: "xmm3", + xmm4: "xmm4", + xmm5: "xmm5", + xmm6: "xmm6", + xmm7: "xmm7", + xmm8: "xmm8", + xmm9: "xmm9", + xmm10: "xmm10", + xmm11: "xmm11", + xmm12: "xmm12", + xmm13: "xmm13", + xmm14: "xmm14", + xmm15: "xmm15", +} + +func formatVRegSized(r regalloc.VReg, _64 bool) string { + if r.IsRealReg() { + if r.RegType() == regalloc.RegTypeInt { + rr := r.RealReg() + orig := regNames[rr] + if rr <= rdi { + if _64 { + return "%" + orig + } else { + return "%e" + orig[1:] + } + } else { + if _64 { + return "%" + orig + } else { + return "%" + orig + "d" + } + } + } else { + return "%" + regNames[r.RealReg()] + } + } else { + if r.RegType() == regalloc.RegTypeInt { + if _64 { + return fmt.Sprintf("%%r%d?", r.ID()) + } else { + return fmt.Sprintf("%%r%dd?", r.ID()) + } + } else { + return fmt.Sprintf("%%xmm%d?", r.ID()) + } + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/stack.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/stack.go new file mode 100644 index 000000000..05ba5f027 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64/stack.go @@ -0,0 +1,128 @@ +package amd64 + +import ( + "encoding/binary" + "reflect" + "unsafe" + 
+ "github.com/tetratelabs/wazero/internal/wasmdebug" +) + +func stackView(rbp, top uintptr) []byte { + var stackBuf []byte + { + // TODO: use unsafe.Slice after floor version is set to Go 1.20. + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&stackBuf)) + hdr.Data = rbp + setSliceLimits(hdr, top-rbp) + } + return stackBuf +} + +// UnwindStack implements wazevo.unwindStack. +func UnwindStack(_, rbp, top uintptr, returnAddresses []uintptr) []uintptr { + stackBuf := stackView(rbp, top) + + for i := uint64(0); i < uint64(len(stackBuf)); { + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | ReturnAddress | + // | Caller_RBP | + // +-----------------+ <---- Caller_RBP + // | ........... | + // | clobbered M | + // | ............ | + // | clobbered 0 | + // | spill slot N | + // | ............ | + // | spill slot 0 | + // | ReturnAddress | + // | Caller_RBP | + // +-----------------+ <---- RBP + // (low address) + + callerRBP := binary.LittleEndian.Uint64(stackBuf[i:]) + retAddr := binary.LittleEndian.Uint64(stackBuf[i+8:]) + returnAddresses = append(returnAddresses, uintptr(retAddr)) + i = callerRBP - uint64(rbp) + if len(returnAddresses) == wasmdebug.MaxFrames { + break + } + } + return returnAddresses +} + +// GoCallStackView implements wazevo.goCallStackView. +func GoCallStackView(stackPointerBeforeGoCall *uint64) []uint64 { + // (high address) + // +-----------------+ <----+ + // | xxxxxxxxxxx | | ;; optional unused space to make it 16-byte aligned. + // ^ | arg[N]/ret[M] | | + // sliceSize | | ............ | | SizeInBytes/8 + // | | arg[1]/ret[1] | | + // v | arg[0]/ret[0] | <----+ + // | SizeInBytes | + // +-----------------+ <---- stackPointerBeforeGoCall + // (low address) + data := unsafe.Pointer(uintptr(unsafe.Pointer(stackPointerBeforeGoCall)) + 8) + size := *stackPointerBeforeGoCall / 8 + return unsafe.Slice((*uint64)(data), int(size)) +} + +func AdjustClonedStack(oldRsp, oldTop, rsp, rbp, top uintptr) { + diff := uint64(rsp - oldRsp) + + newBuf := stackView(rbp, top) + for i := uint64(0); i < uint64(len(newBuf)); { + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | ReturnAddress | + // | Caller_RBP | + // +-----------------+ <---- Caller_RBP + // | ........... | + // | clobbered M | + // | ............ | + // | clobbered 0 | + // | spill slot N | + // | ............ | + // | spill slot 0 | + // | ReturnAddress | + // | Caller_RBP | + // +-----------------+ <---- RBP + // (low address) + + callerRBP := binary.LittleEndian.Uint64(newBuf[i:]) + if callerRBP == 0 { + // End of stack. 
+ break + } + if i64 := int64(callerRBP); i64 < int64(oldRsp) || i64 >= int64(oldTop) { + panic("BUG: callerRBP is out of range") + } + if int(callerRBP) < 0 { + panic("BUG: callerRBP is negative") + } + adjustedCallerRBP := callerRBP + diff + if int(adjustedCallerRBP) < 0 { + panic("BUG: adjustedCallerRBP is negative") + } + binary.LittleEndian.PutUint64(newBuf[i:], adjustedCallerRBP) + i = adjustedCallerRBP - uint64(rbp) + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go new file mode 100644 index 000000000..6615471c6 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi.go @@ -0,0 +1,332 @@ +package arm64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// References: +// * https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/cmd/compile/abi-internal.md#arm64-architecture +// * https://developer.arm.com/documentation/102374/0101/Procedure-Call-Standard + +var ( + intParamResultRegs = []regalloc.RealReg{x0, x1, x2, x3, x4, x5, x6, x7} + floatParamResultRegs = []regalloc.RealReg{v0, v1, v2, v3, v4, v5, v6, v7} +) + +var regInfo = ®alloc.RegisterInfo{ + AllocatableRegisters: [regalloc.NumRegType][]regalloc.RealReg{ + // We don't allocate: + // - x18: Reserved by the macOS: https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms#Respect-the-purpose-of-specific-CPU-registers + // - x28: Reserved by Go runtime. + // - x27(=tmpReg): because of the reason described on tmpReg. + regalloc.RegTypeInt: { + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x19, x20, x21, x22, x23, x24, x25, + x26, x29, x30, + // These are the argument/return registers. Less preferred in the allocation. + x7, x6, x5, x4, x3, x2, x1, x0, + }, + regalloc.RegTypeFloat: { + v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, + // These are the argument/return registers. Less preferred in the allocation. 
+ v7, v6, v5, v4, v3, v2, v1, v0, + }, + }, + CalleeSavedRegisters: regalloc.NewRegSet( + x19, x20, x21, x22, x23, x24, x25, x26, x28, + v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, + ), + CallerSavedRegisters: regalloc.NewRegSet( + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x29, x30, + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, + ), + RealRegToVReg: []regalloc.VReg{ + x0: x0VReg, x1: x1VReg, x2: x2VReg, x3: x3VReg, x4: x4VReg, x5: x5VReg, x6: x6VReg, x7: x7VReg, x8: x8VReg, x9: x9VReg, x10: x10VReg, x11: x11VReg, x12: x12VReg, x13: x13VReg, x14: x14VReg, x15: x15VReg, x16: x16VReg, x17: x17VReg, x18: x18VReg, x19: x19VReg, x20: x20VReg, x21: x21VReg, x22: x22VReg, x23: x23VReg, x24: x24VReg, x25: x25VReg, x26: x26VReg, x27: x27VReg, x28: x28VReg, x29: x29VReg, x30: x30VReg, + v0: v0VReg, v1: v1VReg, v2: v2VReg, v3: v3VReg, v4: v4VReg, v5: v5VReg, v6: v6VReg, v7: v7VReg, v8: v8VReg, v9: v9VReg, v10: v10VReg, v11: v11VReg, v12: v12VReg, v13: v13VReg, v14: v14VReg, v15: v15VReg, v16: v16VReg, v17: v17VReg, v18: v18VReg, v19: v19VReg, v20: v20VReg, v21: v21VReg, v22: v22VReg, v23: v23VReg, v24: v24VReg, v25: v25VReg, v26: v26VReg, v27: v27VReg, v28: v28VReg, v29: v29VReg, v30: v30VReg, v31: v31VReg, + }, + RealRegName: func(r regalloc.RealReg) string { return regNames[r] }, + RealRegType: func(r regalloc.RealReg) regalloc.RegType { + if r < v0 { + return regalloc.RegTypeInt + } + return regalloc.RegTypeFloat + }, +} + +// ArgsResultsRegs implements backend.Machine. +func (m *machine) ArgsResultsRegs() (argResultInts, argResultFloats []regalloc.RealReg) { + return intParamResultRegs, floatParamResultRegs +} + +// LowerParams implements backend.FunctionABI. +func (m *machine) LowerParams(args []ssa.Value) { + a := m.currentABI + + for i, ssaArg := range args { + if !ssaArg.Valid() { + continue + } + reg := m.compiler.VRegOf(ssaArg) + arg := &a.Args[i] + if arg.Kind == backend.ABIArgKindReg { + m.InsertMove(reg, arg.Reg, arg.Type) + } else { + // TODO: we could use pair load if there's consecutive loads for the same type. + // + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | <-| + // | ReturnAddress | | + // +-----------------+ | + // | ........... | | + // | clobbered M | | argStackOffset: is unknown at this point of compilation. + // | ............ | | + // | clobbered 0 | | + // | spill slot N | | + // | ........... | | + // | spill slot 0 | | + // SP---> +-----------------+ <-+ + // (low address) + + bits := arg.Type.Bits() + // At this point of compilation, we don't yet know how much space exist below the return address. + // So we instruct the address mode to add the `argStackOffset` to the offset at the later phase of compilation. + amode := addressMode{imm: arg.Offset, rn: spVReg, kind: addressModeKindArgStackSpace} + load := m.allocateInstr() + switch arg.Type { + case ssa.TypeI32, ssa.TypeI64: + load.asULoad(operandNR(reg), amode, bits) + case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: + load.asFpuLoad(operandNR(reg), amode, bits) + default: + panic("BUG") + } + m.insert(load) + m.unresolvedAddressModes = append(m.unresolvedAddressModes, load) + } + } +} + +// LowerReturns lowers the given returns. 
+func (m *machine) LowerReturns(rets []ssa.Value) { + a := m.currentABI + + l := len(rets) - 1 + for i := range rets { + // Reverse order in order to avoid overwriting the stack returns existing in the return registers. + ret := rets[l-i] + r := &a.Rets[l-i] + reg := m.compiler.VRegOf(ret) + if def := m.compiler.ValueDefinition(ret); def.IsFromInstr() { + // Constant instructions are inlined. + if inst := def.Instr; inst.Constant() { + val := inst.Return() + valType := val.Type() + v := inst.ConstantVal() + m.insertLoadConstant(v, valType, reg) + } + } + if r.Kind == backend.ABIArgKindReg { + m.InsertMove(r.Reg, reg, ret.Type()) + } else { + // TODO: we could use pair store if there's consecutive stores for the same type. + // + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | <-+ + // | arg X | | + // | ....... | | + // | arg 1 | | + // | arg 0 | | + // | ReturnAddress | | + // +-----------------+ | + // | ........... | | + // | spill slot M | | retStackOffset: is unknown at this point of compilation. + // | ............ | | + // | spill slot 2 | | + // | spill slot 1 | | + // | clobbered 0 | | + // | clobbered 1 | | + // | ........... | | + // | clobbered N | | + // SP---> +-----------------+ <-+ + // (low address) + + bits := r.Type.Bits() + + // At this point of compilation, we don't yet know how much space exist below the return address. + // So we instruct the address mode to add the `retStackOffset` to the offset at the later phase of compilation. + amode := addressMode{imm: r.Offset, rn: spVReg, kind: addressModeKindResultStackSpace} + store := m.allocateInstr() + store.asStore(operandNR(reg), amode, bits) + m.insert(store) + m.unresolvedAddressModes = append(m.unresolvedAddressModes, store) + } + } +} + +// callerGenVRegToFunctionArg is the opposite of GenFunctionArgToVReg, which is used to generate the +// caller side of the function call. +func (m *machine) callerGenVRegToFunctionArg(a *backend.FunctionABI, argIndex int, reg regalloc.VReg, def *backend.SSAValueDefinition, slotBegin int64) { + arg := &a.Args[argIndex] + if def != nil && def.IsFromInstr() { + // Constant instructions are inlined. + if inst := def.Instr; inst.Constant() { + val := inst.Return() + valType := val.Type() + v := inst.ConstantVal() + m.insertLoadConstant(v, valType, reg) + } + } + if arg.Kind == backend.ABIArgKindReg { + m.InsertMove(arg.Reg, reg, arg.Type) + } else { + // TODO: we could use pair store if there's consecutive stores for the same type. + // + // Note that at this point, stack pointer is already adjusted. + bits := arg.Type.Bits() + amode := m.resolveAddressModeForOffset(arg.Offset-slotBegin, bits, spVReg, false) + store := m.allocateInstr() + store.asStore(operandNR(reg), amode, bits) + m.insert(store) + } +} + +func (m *machine) callerGenFunctionReturnVReg(a *backend.FunctionABI, retIndex int, reg regalloc.VReg, slotBegin int64) { + r := &a.Rets[retIndex] + if r.Kind == backend.ABIArgKindReg { + m.InsertMove(reg, r.Reg, r.Type) + } else { + // TODO: we could use pair load if there's consecutive loads for the same type. 
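+		// Stack results live just above the argument area of the reserved arg/ret slot, hence ArgStackSize is added to the result's own offset.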
+ amode := m.resolveAddressModeForOffset(a.ArgStackSize+r.Offset-slotBegin, r.Type.Bits(), spVReg, false) + ldr := m.allocateInstr() + switch r.Type { + case ssa.TypeI32, ssa.TypeI64: + ldr.asULoad(operandNR(reg), amode, r.Type.Bits()) + case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: + ldr.asFpuLoad(operandNR(reg), amode, r.Type.Bits()) + default: + panic("BUG") + } + m.insert(ldr) + } +} + +func (m *machine) resolveAddressModeForOffsetAndInsert(cur *instruction, offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) (*instruction, addressMode) { + exct := m.executableContext + exct.PendingInstructions = exct.PendingInstructions[:0] + mode := m.resolveAddressModeForOffset(offset, dstBits, rn, allowTmpRegUse) + for _, instr := range exct.PendingInstructions { + cur = linkInstr(cur, instr) + } + return cur, mode +} + +func (m *machine) resolveAddressModeForOffset(offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) addressMode { + if rn.RegType() != regalloc.RegTypeInt { + panic("BUG: rn should be a pointer: " + formatVRegSized(rn, 64)) + } + var amode addressMode + if offsetFitsInAddressModeKindRegUnsignedImm12(dstBits, offset) { + amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: rn, imm: offset} + } else if offsetFitsInAddressModeKindRegSignedImm9(offset) { + amode = addressMode{kind: addressModeKindRegSignedImm9, rn: rn, imm: offset} + } else { + var indexReg regalloc.VReg + if allowTmpRegUse { + m.lowerConstantI64(tmpRegVReg, offset) + indexReg = tmpRegVReg + } else { + indexReg = m.compiler.AllocateVReg(ssa.TypeI64) + m.lowerConstantI64(indexReg, offset) + } + amode = addressMode{kind: addressModeKindRegReg, rn: rn, rm: indexReg, extOp: extendOpUXTX /* indicates index rm is 64-bit */} + } + return amode +} + +func (m *machine) lowerCall(si *ssa.Instruction) { + isDirectCall := si.Opcode() == ssa.OpcodeCall + var indirectCalleePtr ssa.Value + var directCallee ssa.FuncRef + var sigID ssa.SignatureID + var args []ssa.Value + if isDirectCall { + directCallee, sigID, args = si.CallData() + } else { + indirectCalleePtr, sigID, args, _ /* on arm64, the calling convention is compatible with the Go runtime */ = si.CallIndirectData() + } + calleeABI := m.compiler.GetFunctionABI(m.compiler.SSABuilder().ResolveSignature(sigID)) + + stackSlotSize := int64(calleeABI.AlignedArgResultStackSlotSize()) + if m.maxRequiredStackSizeForCalls < stackSlotSize+16 { + m.maxRequiredStackSizeForCalls = stackSlotSize + 16 // return address frame. 
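+		// maxRequiredStackSizeForCalls tracks the largest arg/ret slot (plus the 16-byte return-address frame) needed by any call site in this function.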
+ } + + for i, arg := range args { + reg := m.compiler.VRegOf(arg) + def := m.compiler.ValueDefinition(arg) + m.callerGenVRegToFunctionArg(calleeABI, i, reg, def, stackSlotSize) + } + + if isDirectCall { + call := m.allocateInstr() + call.asCall(directCallee, calleeABI) + m.insert(call) + } else { + ptr := m.compiler.VRegOf(indirectCalleePtr) + callInd := m.allocateInstr() + callInd.asCallIndirect(ptr, calleeABI) + m.insert(callInd) + } + + var index int + r1, rs := si.Returns() + if r1.Valid() { + m.callerGenFunctionReturnVReg(calleeABI, 0, m.compiler.VRegOf(r1), stackSlotSize) + index++ + } + + for _, r := range rs { + m.callerGenFunctionReturnVReg(calleeABI, index, m.compiler.VRegOf(r), stackSlotSize) + index++ + } +} + +func (m *machine) insertAddOrSubStackPointer(rd regalloc.VReg, diff int64, add bool) { + if imm12Operand, ok := asImm12Operand(uint64(diff)); ok { + alu := m.allocateInstr() + var ao aluOp + if add { + ao = aluOpAdd + } else { + ao = aluOpSub + } + alu.asALU(ao, operandNR(rd), operandNR(spVReg), imm12Operand, true) + m.insert(alu) + } else { + m.lowerConstantI64(tmpRegVReg, diff) + alu := m.allocateInstr() + var ao aluOp + if add { + ao = aluOpAdd + } else { + ao = aluOpSub + } + alu.asALU(ao, operandNR(rd), operandNR(spVReg), operandNR(tmpRegVReg), true) + m.insert(alu) + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.go new file mode 100644 index 000000000..5f0c613df --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.go @@ -0,0 +1,9 @@ +package arm64 + +// entrypoint enters the machine code generated by this backend which begins with the preamble generated by functionABI.EmitGoEntryPreamble below. +// This implements wazevo.entrypoint, and see the comments there for detail. +func entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultPtr *uint64, goAllocatedStackSlicePtr uintptr) + +// afterGoFunctionCallEntrypoint enters the machine code after growing the stack. +// This implements wazevo.afterGoFunctionCallEntrypoint, and see the comments there for detail. +func afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.s b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.s new file mode 100644 index 000000000..0b579f852 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_arm64.s @@ -0,0 +1,29 @@ +//go:build arm64 + +#include "funcdata.h" +#include "textflag.h" + +// See the comments on EmitGoEntryPreamble for what this function is supposed to do. +TEXT ·entrypoint(SB), NOSPLIT|NOFRAME, $0-48 + MOVD preambleExecutable+0(FP), R27 + MOVD functionExectuable+8(FP), R24 + MOVD executionContextPtr+16(FP), R0 + MOVD moduleContextPtr+24(FP), R1 + MOVD paramResultSlicePtr+32(FP), R19 + MOVD goAllocatedStackSlicePtr+40(FP), R26 + JMP (R27) + +TEXT ·afterGoFunctionCallEntrypoint(SB), NOSPLIT|NOFRAME, $0-32 + MOVD goCallReturnAddress+0(FP), R20 + MOVD executionContextPtr+8(FP), R0 + MOVD stackPointer+16(FP), R19 + + // Save the current FP(R29), SP and LR(R30) into the wazevo.executionContext (stored in R0). 
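+	// Offsets 16, 24, and 32 correspond to ExecutionContextOffsets.OriginalFramePointer, OriginalStackPointer, and GoReturnAddress, respectively.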
+ MOVD R29, 16(R0) // Store FP(R29) into [RO, #ExecutionContextOffsets.OriginalFramePointer] + MOVD RSP, R27 // Move SP to R27 (temporary register) since SP cannot be stored directly in str instructions. + MOVD R27, 24(R0) // Store R27 into [RO, #ExecutionContextOffsets.OriginalFramePointer] + MOVD R30, 32(R0) // Store R30 into [R0, #ExecutionContextOffsets.GoReturnAddress] + + // Load the new stack pointer (which sits somewhere in Go-allocated stack) into SP. + MOVD R19, RSP + JMP (R20) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go new file mode 100644 index 000000000..7a9cceb33 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go @@ -0,0 +1,230 @@ +package arm64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// CompileEntryPreamble implements backend.Machine. This assumes `entrypoint` function (in abi_go_entry_arm64.s) passes: +// +// 1. First (execution context ptr) and Second arguments are already passed in x0, and x1. +// 2. param/result slice ptr in x19; the pointer to []uint64{} which is used to pass arguments and accept return values. +// 3. Go-allocated stack slice ptr in x26. +// 4. Function executable in x24. +// +// also SP and FP are correct Go-runtime-based values, and LR is the return address to the Go-side caller. +func (m *machine) CompileEntryPreamble(signature *ssa.Signature) []byte { + root := m.constructEntryPreamble(signature) + m.encode(root) + return m.compiler.Buf() +} + +var ( + executionContextPtrReg = x0VReg + // callee-saved regs so that they can be used in the prologue and epilogue. + paramResultSlicePtr = x19VReg + savedExecutionContextPtr = x20VReg + // goAllocatedStackPtr is not used in the epilogue. + goAllocatedStackPtr = x26VReg + // paramResultSliceCopied is not used in the epilogue. + paramResultSliceCopied = x25VReg + // tmpRegVReg is not used in the epilogue. + functionExecutable = x24VReg +) + +func (m *machine) goEntryPreamblePassArg(cur *instruction, paramSlicePtr regalloc.VReg, arg *backend.ABIArg, argStartOffsetFromSP int64) *instruction { + typ := arg.Type + bits := typ.Bits() + isStackArg := arg.Kind == backend.ABIArgKindStack + + var loadTargetReg operand + if !isStackArg { + loadTargetReg = operandNR(arg.Reg) + } else { + switch typ { + case ssa.TypeI32, ssa.TypeI64: + loadTargetReg = operandNR(x15VReg) + case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: + loadTargetReg = operandNR(v15VReg) + default: + panic("TODO?") + } + } + + var postIndexImm int64 + if typ == ssa.TypeV128 { + postIndexImm = 16 // v128 is represented as 2x64-bit in Go slice. 
+ } else { + postIndexImm = 8 + } + loadMode := addressMode{kind: addressModeKindPostIndex, rn: paramSlicePtr, imm: postIndexImm} + + instr := m.allocateInstr() + switch typ { + case ssa.TypeI32: + instr.asULoad(loadTargetReg, loadMode, 32) + case ssa.TypeI64: + instr.asULoad(loadTargetReg, loadMode, 64) + case ssa.TypeF32: + instr.asFpuLoad(loadTargetReg, loadMode, 32) + case ssa.TypeF64: + instr.asFpuLoad(loadTargetReg, loadMode, 64) + case ssa.TypeV128: + instr.asFpuLoad(loadTargetReg, loadMode, 128) + } + cur = linkInstr(cur, instr) + + if isStackArg { + var storeMode addressMode + cur, storeMode = m.resolveAddressModeForOffsetAndInsert(cur, argStartOffsetFromSP+arg.Offset, bits, spVReg, true) + toStack := m.allocateInstr() + toStack.asStore(loadTargetReg, storeMode, bits) + cur = linkInstr(cur, toStack) + } + return cur +} + +func (m *machine) goEntryPreamblePassResult(cur *instruction, resultSlicePtr regalloc.VReg, result *backend.ABIArg, resultStartOffsetFromSP int64) *instruction { + isStackArg := result.Kind == backend.ABIArgKindStack + typ := result.Type + bits := typ.Bits() + + var storeTargetReg operand + if !isStackArg { + storeTargetReg = operandNR(result.Reg) + } else { + switch typ { + case ssa.TypeI32, ssa.TypeI64: + storeTargetReg = operandNR(x15VReg) + case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: + storeTargetReg = operandNR(v15VReg) + default: + panic("TODO?") + } + } + + var postIndexImm int64 + if typ == ssa.TypeV128 { + postIndexImm = 16 // v128 is represented as 2x64-bit in Go slice. + } else { + postIndexImm = 8 + } + + if isStackArg { + var loadMode addressMode + cur, loadMode = m.resolveAddressModeForOffsetAndInsert(cur, resultStartOffsetFromSP+result.Offset, bits, spVReg, true) + toReg := m.allocateInstr() + switch typ { + case ssa.TypeI32, ssa.TypeI64: + toReg.asULoad(storeTargetReg, loadMode, bits) + case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: + toReg.asFpuLoad(storeTargetReg, loadMode, bits) + default: + panic("TODO?") + } + cur = linkInstr(cur, toReg) + } + + mode := addressMode{kind: addressModeKindPostIndex, rn: resultSlicePtr, imm: postIndexImm} + instr := m.allocateInstr() + instr.asStore(storeTargetReg, mode, bits) + cur = linkInstr(cur, instr) + return cur +} + +func (m *machine) constructEntryPreamble(sig *ssa.Signature) (root *instruction) { + abi := backend.FunctionABI{} + abi.Init(sig, intParamResultRegs, floatParamResultRegs) + + root = m.allocateNop() + + //// ----------------------------------- prologue ----------------------------------- //// + + // First, we save executionContextPtrReg into a callee-saved register so that it can be used in epilogue as well. + // mov savedExecutionContextPtr, x0 + cur := m.move64(savedExecutionContextPtr, executionContextPtrReg, root) + + // Next, save the current FP, SP and LR into the wazevo.executionContext: + // str fp, [savedExecutionContextPtr, #OriginalFramePointer] + // mov tmp, sp ;; sp cannot be str'ed directly. 
+ // str sp, [savedExecutionContextPtr, #OriginalStackPointer] + // str lr, [savedExecutionContextPtr, #GoReturnAddress] + cur = m.loadOrStoreAtExecutionContext(fpVReg, wazevoapi.ExecutionContextOffsetOriginalFramePointer, true, cur) + cur = m.move64(tmpRegVReg, spVReg, cur) + cur = m.loadOrStoreAtExecutionContext(tmpRegVReg, wazevoapi.ExecutionContextOffsetOriginalStackPointer, true, cur) + cur = m.loadOrStoreAtExecutionContext(lrVReg, wazevoapi.ExecutionContextOffsetGoReturnAddress, true, cur) + + // Then, move the Go-allocated stack pointer to SP: + // mov sp, goAllocatedStackPtr + cur = m.move64(spVReg, goAllocatedStackPtr, cur) + + prReg := paramResultSlicePtr + if len(abi.Args) > 2 && len(abi.Rets) > 0 { + // paramResultSlicePtr is modified during the execution of goEntryPreamblePassArg, + // so copy it to another reg. + cur = m.move64(paramResultSliceCopied, paramResultSlicePtr, cur) + prReg = paramResultSliceCopied + } + + stackSlotSize := int64(abi.AlignedArgResultStackSlotSize()) + for i := range abi.Args { + if i < 2 { + // module context ptr and execution context ptr are passed in x0 and x1 by the Go assembly function. + continue + } + arg := &abi.Args[i] + cur = m.goEntryPreamblePassArg(cur, prReg, arg, -stackSlotSize) + } + + // Call the real function. + bl := m.allocateInstr() + bl.asCallIndirect(functionExecutable, &abi) + cur = linkInstr(cur, bl) + + ///// ----------------------------------- epilogue ----------------------------------- ///// + + // Store the register results into paramResultSlicePtr. + for i := range abi.Rets { + cur = m.goEntryPreamblePassResult(cur, paramResultSlicePtr, &abi.Rets[i], abi.ArgStackSize-stackSlotSize) + } + + // Finally, restore the FP, SP and LR, and return to the Go code. + // ldr fp, [savedExecutionContextPtr, #OriginalFramePointer] + // ldr tmp, [savedExecutionContextPtr, #OriginalStackPointer] + // mov sp, tmp ;; sp cannot be str'ed directly. 
+ // ldr lr, [savedExecutionContextPtr, #GoReturnAddress] + // ret ;; --> return to the Go code + cur = m.loadOrStoreAtExecutionContext(fpVReg, wazevoapi.ExecutionContextOffsetOriginalFramePointer, false, cur) + cur = m.loadOrStoreAtExecutionContext(tmpRegVReg, wazevoapi.ExecutionContextOffsetOriginalStackPointer, false, cur) + cur = m.move64(spVReg, tmpRegVReg, cur) + cur = m.loadOrStoreAtExecutionContext(lrVReg, wazevoapi.ExecutionContextOffsetGoReturnAddress, false, cur) + retInst := m.allocateInstr() + retInst.asRet() + linkInstr(cur, retInst) + return +} + +func (m *machine) move64(dst, src regalloc.VReg, prev *instruction) *instruction { + instr := m.allocateInstr() + instr.asMove64(dst, src) + return linkInstr(prev, instr) +} + +func (m *machine) loadOrStoreAtExecutionContext(d regalloc.VReg, offset wazevoapi.Offset, store bool, prev *instruction) *instruction { + instr := m.allocateInstr() + mode := addressMode{kind: addressModeKindRegUnsignedImm12, rn: savedExecutionContextPtr, imm: offset.I64()} + if store { + instr.asStore(operandNR(d), mode, 64) + } else { + instr.asULoad(operandNR(d), mode, 64) + } + return linkInstr(prev, instr) +} + +func linkInstr(prev, next *instruction) *instruction { + prev.next = next + next.prev = prev + return next +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go new file mode 100644 index 000000000..466b1f960 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go @@ -0,0 +1,428 @@ +package arm64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +var calleeSavedRegistersSorted = []regalloc.VReg{ + x19VReg, x20VReg, x21VReg, x22VReg, x23VReg, x24VReg, x25VReg, x26VReg, x28VReg, + v18VReg, v19VReg, v20VReg, v21VReg, v22VReg, v23VReg, v24VReg, v25VReg, v26VReg, v27VReg, v28VReg, v29VReg, v30VReg, v31VReg, +} + +// CompileGoFunctionTrampoline implements backend.Machine. +func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte { + exct := m.executableContext + argBegin := 1 // Skips exec context by default. + if needModuleContextPtr { + argBegin++ + } + + abi := &backend.FunctionABI{} + abi.Init(sig, intParamResultRegs, floatParamResultRegs) + m.currentABI = abi + + cur := m.allocateInstr() + cur.asNop0() + exct.RootInstr = cur + + // Execution context is always the first argument. + execCtrPtr := x0VReg + + // In the following, we create the following stack layout: + // + // (high address) + // SP ------> +-----------------+ <----+ + // | ....... | | + // | ret Y | | + // | ....... | | + // | ret 0 | | + // | arg X | | size_of_arg_ret + // | ....... | | + // | arg 1 | | + // | arg 0 | <----+ <-------- originalArg0Reg + // | size_of_arg_ret | + // | ReturnAddress | + // +-----------------+ <----+ + // | xxxx | | ;; might be padded to make it 16-byte aligned. + // +--->| arg[N]/ret[M] | | + // sliceSize| | ............ 
| | goCallStackSize + // | | arg[1]/ret[1] | | + // +--->| arg[0]/ret[0] | <----+ <-------- arg0ret0AddrReg + // | sliceSize | + // | frame_size | + // +-----------------+ + // (low address) + // + // where the region of "arg[0]/ret[0] ... arg[N]/ret[M]" is the stack used by the Go functions, + // therefore will be accessed as the usual []uint64. So that's where we need to pass/receive + // the arguments/return values. + + // First of all, to update the SP, and create "ReturnAddress + size_of_arg_ret". + cur = m.createReturnAddrAndSizeOfArgRetSlot(cur) + + const frameInfoSize = 16 // == frame_size + sliceSize. + + // Next, we should allocate the stack for the Go function call if necessary. + goCallStackSize, sliceSizeInBytes := backend.GoFunctionCallRequiredStackSize(sig, argBegin) + cur = m.insertStackBoundsCheck(goCallStackSize+frameInfoSize, cur) + + originalArg0Reg := x17VReg // Caller save, so we can use it for whatever we want. + if m.currentABI.AlignedArgResultStackSlotSize() > 0 { + // At this point, SP points to `ReturnAddress`, so add 16 to get the original arg 0 slot. + cur = m.addsAddOrSubStackPointer(cur, originalArg0Reg, frameInfoSize, true) + } + + // Save the callee saved registers. + cur = m.saveRegistersInExecutionContext(cur, calleeSavedRegistersSorted) + + if needModuleContextPtr { + offset := wazevoapi.ExecutionContextOffsetGoFunctionCallCalleeModuleContextOpaque.I64() + if !offsetFitsInAddressModeKindRegUnsignedImm12(64, offset) { + panic("BUG: too large or un-aligned offset for goFunctionCallCalleeModuleContextOpaque in execution context") + } + + // Module context is always the second argument. + moduleCtrPtr := x1VReg + store := m.allocateInstr() + amode := addressMode{kind: addressModeKindRegUnsignedImm12, rn: execCtrPtr, imm: offset} + store.asStore(operandNR(moduleCtrPtr), amode, 64) + cur = linkInstr(cur, store) + } + + // Advances the stack pointer. + cur = m.addsAddOrSubStackPointer(cur, spVReg, goCallStackSize, false) + + // Copy the pointer to x15VReg. + arg0ret0AddrReg := x15VReg // Caller save, so we can use it for whatever we want. + copySp := m.allocateInstr() + copySp.asMove64(arg0ret0AddrReg, spVReg) + cur = linkInstr(cur, copySp) + + // Next, we need to store all the arguments to the stack in the typical Wasm stack style. + for i := range abi.Args[argBegin:] { + arg := &abi.Args[argBegin+i] + store := m.allocateInstr() + var v regalloc.VReg + if arg.Kind == backend.ABIArgKindReg { + v = arg.Reg + } else { + cur, v = m.goFunctionCallLoadStackArg(cur, originalArg0Reg, arg, + // Caller save, so we can use it for whatever we want. + x11VReg, v11VReg) + } + + var sizeInBits byte + if arg.Type == ssa.TypeV128 { + sizeInBits = 128 + } else { + sizeInBits = 64 + } + store.asStore(operandNR(v), + addressMode{ + kind: addressModeKindPostIndex, + rn: arg0ret0AddrReg, imm: int64(sizeInBits / 8), + }, sizeInBits) + cur = linkInstr(cur, store) + } + + // Finally, now that we've advanced SP to arg[0]/ret[0], we allocate `frame_size + sliceSize`. 
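+	// These two values form the 16-byte (frameInfoSize) header pushed just below arg[0]/ret[0]; see the stack layout diagram above.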
+ var frameSizeReg, sliceSizeReg regalloc.VReg + if goCallStackSize > 0 { + cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, goCallStackSize) + frameSizeReg = tmpRegVReg + cur = m.lowerConstantI64AndInsert(cur, x16VReg, sliceSizeInBytes/8) + sliceSizeReg = x16VReg + } else { + frameSizeReg = xzrVReg + sliceSizeReg = xzrVReg + } + _amode := addressModePreOrPostIndex(spVReg, -16, true) + storeP := m.allocateInstr() + storeP.asStorePair64(frameSizeReg, sliceSizeReg, _amode) + cur = linkInstr(cur, storeP) + + // Set the exit status on the execution context. + cur = m.setExitCode(cur, x0VReg, exitCode) + + // Save the current stack pointer. + cur = m.saveCurrentStackPointer(cur, x0VReg) + + // Exit the execution. + cur = m.storeReturnAddressAndExit(cur) + + // After the call, we need to restore the callee saved registers. + cur = m.restoreRegistersInExecutionContext(cur, calleeSavedRegistersSorted) + + // Get the pointer to the arg[0]/ret[0]: We need to skip `frame_size + sliceSize`. + if len(abi.Rets) > 0 { + cur = m.addsAddOrSubStackPointer(cur, arg0ret0AddrReg, frameInfoSize, true) + } + + // Advances the SP so that it points to `ReturnAddress`. + cur = m.addsAddOrSubStackPointer(cur, spVReg, frameInfoSize+goCallStackSize, true) + ldr := m.allocateInstr() + // And load the return address. + ldr.asULoad(operandNR(lrVReg), + addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64) + cur = linkInstr(cur, ldr) + + originalRet0Reg := x17VReg // Caller save, so we can use it for whatever we want. + if m.currentABI.RetStackSize > 0 { + cur = m.addsAddOrSubStackPointer(cur, originalRet0Reg, m.currentABI.ArgStackSize, true) + } + + // Make the SP point to the original address (above the result slot). + if s := int64(m.currentABI.AlignedArgResultStackSlotSize()); s > 0 { + cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true) + } + + for i := range abi.Rets { + r := &abi.Rets[i] + if r.Kind == backend.ABIArgKindReg { + loadIntoReg := m.allocateInstr() + mode := addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg} + switch r.Type { + case ssa.TypeI32: + mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. + loadIntoReg.asULoad(operandNR(r.Reg), mode, 32) + case ssa.TypeI64: + mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. + loadIntoReg.asULoad(operandNR(r.Reg), mode, 64) + case ssa.TypeF32: + mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. + loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 32) + case ssa.TypeF64: + mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. + loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 64) + case ssa.TypeV128: + mode.imm = 16 + loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 128) + default: + panic("TODO") + } + cur = linkInstr(cur, loadIntoReg) + } else { + // First we need to load the value to a temporary just like ^^. + intTmp, floatTmp := x11VReg, v11VReg + loadIntoTmpReg := m.allocateInstr() + mode := addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg} + var resultReg regalloc.VReg + switch r.Type { + case ssa.TypeI32: + mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. + loadIntoTmpReg.asULoad(operandNR(intTmp), mode, 32) + resultReg = intTmp + case ssa.TypeI64: + mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. 
+ loadIntoTmpReg.asULoad(operandNR(intTmp), mode, 64) + resultReg = intTmp + case ssa.TypeF32: + mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. + loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 32) + resultReg = floatTmp + case ssa.TypeF64: + mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. + loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 64) + resultReg = floatTmp + case ssa.TypeV128: + mode.imm = 16 + loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 128) + resultReg = floatTmp + default: + panic("TODO") + } + cur = linkInstr(cur, loadIntoTmpReg) + cur = m.goFunctionCallStoreStackResult(cur, originalRet0Reg, r, resultReg) + } + } + + ret := m.allocateInstr() + ret.asRet() + linkInstr(cur, ret) + + m.encode(m.executableContext.RootInstr) + return m.compiler.Buf() +} + +func (m *machine) saveRegistersInExecutionContext(cur *instruction, regs []regalloc.VReg) *instruction { + offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64() + for _, v := range regs { + store := m.allocateInstr() + var sizeInBits byte + switch v.RegType() { + case regalloc.RegTypeInt: + sizeInBits = 64 + case regalloc.RegTypeFloat: + sizeInBits = 128 + } + store.asStore(operandNR(v), + addressMode{ + kind: addressModeKindRegUnsignedImm12, + // Execution context is always the first argument. + rn: x0VReg, imm: offset, + }, sizeInBits) + store.prev = cur + cur.next = store + cur = store + offset += 16 // Imm12 must be aligned 16 for vector regs, so we unconditionally store regs at the offset of multiple of 16. + } + return cur +} + +func (m *machine) restoreRegistersInExecutionContext(cur *instruction, regs []regalloc.VReg) *instruction { + offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64() + for _, v := range regs { + load := m.allocateInstr() + var as func(dst operand, amode addressMode, sizeInBits byte) + var sizeInBits byte + switch v.RegType() { + case regalloc.RegTypeInt: + as = load.asULoad + sizeInBits = 64 + case regalloc.RegTypeFloat: + as = load.asFpuLoad + sizeInBits = 128 + } + as(operandNR(v), + addressMode{ + kind: addressModeKindRegUnsignedImm12, + // Execution context is always the first argument. + rn: x0VReg, imm: offset, + }, sizeInBits) + cur = linkInstr(cur, load) + offset += 16 // Imm12 must be aligned 16 for vector regs, so we unconditionally load regs at the offset of multiple of 16. + } + return cur +} + +func (m *machine) lowerConstantI64AndInsert(cur *instruction, dst regalloc.VReg, v int64) *instruction { + exct := m.executableContext + exct.PendingInstructions = exct.PendingInstructions[:0] + m.lowerConstantI64(dst, v) + for _, instr := range exct.PendingInstructions { + cur = linkInstr(cur, instr) + } + return cur +} + +func (m *machine) lowerConstantI32AndInsert(cur *instruction, dst regalloc.VReg, v int32) *instruction { + exct := m.executableContext + exct.PendingInstructions = exct.PendingInstructions[:0] + m.lowerConstantI32(dst, v) + for _, instr := range exct.PendingInstructions { + cur = linkInstr(cur, instr) + } + return cur +} + +func (m *machine) setExitCode(cur *instruction, execCtr regalloc.VReg, exitCode wazevoapi.ExitCode) *instruction { + constReg := x17VReg // caller-saved, so we can use it. + cur = m.lowerConstantI32AndInsert(cur, constReg, int32(exitCode)) + + // Set the exit status on the execution context. 
+ setExistStatus := m.allocateInstr() + setExistStatus.asStore(operandNR(constReg), + addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: execCtr, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(), + }, 32) + cur = linkInstr(cur, setExistStatus) + return cur +} + +func (m *machine) storeReturnAddressAndExit(cur *instruction) *instruction { + // Read the return address into tmp, and store it in the execution context. + adr := m.allocateInstr() + adr.asAdr(tmpRegVReg, exitSequenceSize+8) + cur = linkInstr(cur, adr) + + storeReturnAddr := m.allocateInstr() + storeReturnAddr.asStore(operandNR(tmpRegVReg), + addressMode{ + kind: addressModeKindRegUnsignedImm12, + // Execution context is always the first argument. + rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(), + }, 64) + cur = linkInstr(cur, storeReturnAddr) + + // Exit the execution. + trapSeq := m.allocateInstr() + trapSeq.asExitSequence(x0VReg) + cur = linkInstr(cur, trapSeq) + return cur +} + +func (m *machine) saveCurrentStackPointer(cur *instruction, execCtr regalloc.VReg) *instruction { + // Save the current stack pointer: + // mov tmp, sp, + // str tmp, [exec_ctx, #stackPointerBeforeGoCall] + movSp := m.allocateInstr() + movSp.asMove64(tmpRegVReg, spVReg) + cur = linkInstr(cur, movSp) + + strSp := m.allocateInstr() + strSp.asStore(operandNR(tmpRegVReg), + addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: execCtr, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(), + }, 64) + cur = linkInstr(cur, strSp) + return cur +} + +func (m *machine) goFunctionCallLoadStackArg(cur *instruction, originalArg0Reg regalloc.VReg, arg *backend.ABIArg, intVReg, floatVReg regalloc.VReg) (*instruction, regalloc.VReg) { + load := m.allocateInstr() + var result regalloc.VReg + mode := addressMode{kind: addressModeKindPostIndex, rn: originalArg0Reg} + switch arg.Type { + case ssa.TypeI32: + mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. + load.asULoad(operandNR(intVReg), mode, 32) + result = intVReg + case ssa.TypeI64: + mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. + load.asULoad(operandNR(intVReg), mode, 64) + result = intVReg + case ssa.TypeF32: + mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. + load.asFpuLoad(operandNR(floatVReg), mode, 32) + result = floatVReg + case ssa.TypeF64: + mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. 
+ load.asFpuLoad(operandNR(floatVReg), mode, 64) + result = floatVReg + case ssa.TypeV128: + mode.imm = 16 + load.asFpuLoad(operandNR(floatVReg), mode, 128) + result = floatVReg + default: + panic("TODO") + } + + cur = linkInstr(cur, load) + return cur, result +} + +func (m *machine) goFunctionCallStoreStackResult(cur *instruction, originalRet0Reg regalloc.VReg, result *backend.ABIArg, resultVReg regalloc.VReg) *instruction { + store := m.allocateInstr() + mode := addressMode{kind: addressModeKindPostIndex, rn: originalRet0Reg} + var sizeInBits byte + switch result.Type { + case ssa.TypeI32, ssa.TypeF32: + mode.imm = 8 + sizeInBits = 32 + case ssa.TypeI64, ssa.TypeF64: + mode.imm = 8 + sizeInBits = 64 + case ssa.TypeV128: + mode.imm = 16 + sizeInBits = 128 + default: + panic("TODO") + } + store.asStore(operandNR(resultVReg), mode, sizeInBits) + return linkInstr(cur, store) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/cond.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/cond.go new file mode 100644 index 000000000..6f6cdd1b2 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/cond.go @@ -0,0 +1,215 @@ +package arm64 + +import ( + "strconv" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +type ( + cond uint64 + condKind byte +) + +const ( + // condKindRegisterZero represents a condition which checks if the register is zero. + // This indicates that the instruction must be encoded as CBZ: + // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/CBZ--Compare-and-Branch-on-Zero- + condKindRegisterZero condKind = iota + // condKindRegisterNotZero indicates that the instruction must be encoded as CBNZ: + // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/CBNZ--Compare-and-Branch-on-Nonzero- + condKindRegisterNotZero + // condKindCondFlagSet indicates that the instruction must be encoded as B.cond: + // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/B-cond--Branch-conditionally- + condKindCondFlagSet +) + +// kind returns the kind of condition which is stored in the first two bits. +func (c cond) kind() condKind { + return condKind(c & 0b11) +} + +func (c cond) asUint64() uint64 { + return uint64(c) +} + +// register returns the register for register conditions. +// This panics if the condition is not a register condition (condKindRegisterZero or condKindRegisterNotZero). +func (c cond) register() regalloc.VReg { + if c.kind() != condKindRegisterZero && c.kind() != condKindRegisterNotZero { + panic("condition is not a register") + } + return regalloc.VReg(c >> 2) +} + +func registerAsRegZeroCond(r regalloc.VReg) cond { + return cond(r)<<2 | cond(condKindRegisterZero) +} + +func registerAsRegNotZeroCond(r regalloc.VReg) cond { + return cond(r)<<2 | cond(condKindRegisterNotZero) +} + +func (c cond) flag() condFlag { + if c.kind() != condKindCondFlagSet { + panic("condition is not a flag") + } + return condFlag(c >> 2) +} + +func (c condFlag) asCond() cond { + return cond(c)<<2 | cond(condKindCondFlagSet) +} + +// condFlag represents a condition flag for conditional branches. +// The value matches the encoding of condition flags in the ARM64 instruction set. 
+// https://developer.arm.com/documentation/den0024/a/The-A64-instruction-set/Data-processing-instructions/Conditional-instructions +type condFlag uint8 + +const ( + eq condFlag = iota // eq represents "equal" + ne // ne represents "not equal" + hs // hs represents "higher or same" + lo // lo represents "lower" + mi // mi represents "minus or negative result" + pl // pl represents "plus or positive result" + vs // vs represents "overflow set" + vc // vc represents "overflow clear" + hi // hi represents "higher" + ls // ls represents "lower or same" + ge // ge represents "greater or equal" + lt // lt represents "less than" + gt // gt represents "greater than" + le // le represents "less than or equal" + al // al represents "always" + nv // nv represents "never" +) + +// invert returns the inverted condition. +func (c condFlag) invert() condFlag { + switch c { + case eq: + return ne + case ne: + return eq + case hs: + return lo + case lo: + return hs + case mi: + return pl + case pl: + return mi + case vs: + return vc + case vc: + return vs + case hi: + return ls + case ls: + return hi + case ge: + return lt + case lt: + return ge + case gt: + return le + case le: + return gt + case al: + return nv + case nv: + return al + default: + panic(c) + } +} + +// String implements fmt.Stringer. +func (c condFlag) String() string { + switch c { + case eq: + return "eq" + case ne: + return "ne" + case hs: + return "hs" + case lo: + return "lo" + case mi: + return "mi" + case pl: + return "pl" + case vs: + return "vs" + case vc: + return "vc" + case hi: + return "hi" + case ls: + return "ls" + case ge: + return "ge" + case lt: + return "lt" + case gt: + return "gt" + case le: + return "le" + case al: + return "al" + case nv: + return "nv" + default: + panic(strconv.Itoa(int(c))) + } +} + +// condFlagFromSSAIntegerCmpCond returns the condition flag for the given ssa.IntegerCmpCond. +func condFlagFromSSAIntegerCmpCond(c ssa.IntegerCmpCond) condFlag { + switch c { + case ssa.IntegerCmpCondEqual: + return eq + case ssa.IntegerCmpCondNotEqual: + return ne + case ssa.IntegerCmpCondSignedLessThan: + return lt + case ssa.IntegerCmpCondSignedGreaterThanOrEqual: + return ge + case ssa.IntegerCmpCondSignedGreaterThan: + return gt + case ssa.IntegerCmpCondSignedLessThanOrEqual: + return le + case ssa.IntegerCmpCondUnsignedLessThan: + return lo + case ssa.IntegerCmpCondUnsignedGreaterThanOrEqual: + return hs + case ssa.IntegerCmpCondUnsignedGreaterThan: + return hi + case ssa.IntegerCmpCondUnsignedLessThanOrEqual: + return ls + default: + panic(c) + } +} + +// condFlagFromSSAFloatCmpCond returns the condition flag for the given ssa.FloatCmpCond. 
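+// An unordered comparison (either operand NaN) sets NZCV to 0b0011 on arm64, so the flags chosen
+// here (eq, ne, mi, ls, gt, ge) all evaluate to false for NaN operands except ne, matching Wasm
+// float comparison semantics.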
+func condFlagFromSSAFloatCmpCond(c ssa.FloatCmpCond) condFlag { + switch c { + case ssa.FloatCmpCondEqual: + return eq + case ssa.FloatCmpCondNotEqual: + return ne + case ssa.FloatCmpCondLessThan: + return mi + case ssa.FloatCmpCondLessThanOrEqual: + return ls + case ssa.FloatCmpCondGreaterThan: + return gt + case ssa.FloatCmpCondGreaterThanOrEqual: + return ge + default: + panic(c) + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go new file mode 100644 index 000000000..8aabc5997 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr.go @@ -0,0 +1,2545 @@ +package arm64 + +import ( + "fmt" + "math" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +type ( + // instruction represents either a real instruction in arm64, or the meta instructions + // that are convenient for code generation. For example, inline constants are also treated + // as instructions. + // + // Basically, each instruction knows how to get encoded in binaries. Hence, the final output of compilation + // can be considered equivalent to the sequence of such instructions. + // + // Each field is interpreted depending on the kind. + // + // TODO: optimize the layout later once the impl settles. + instruction struct { + prev, next *instruction + u1, u2, u3 uint64 + rd, rm, rn, ra operand + amode addressMode + kind instructionKind + addedBeforeRegAlloc bool + } + + // instructionKind represents the kind of instruction. + // This controls how the instruction struct is interpreted. + instructionKind byte +) + +func asNop0(i *instruction) { + i.kind = nop0 +} + +func setNext(i, next *instruction) { + i.next = next +} + +func setPrev(i, prev *instruction) { + i.prev = prev +} + +// IsCall implements regalloc.Instr IsCall. +func (i *instruction) IsCall() bool { + return i.kind == call +} + +// IsIndirectCall implements regalloc.Instr IsIndirectCall. +func (i *instruction) IsIndirectCall() bool { + return i.kind == callInd +} + +// IsReturn implements regalloc.Instr IsReturn. +func (i *instruction) IsReturn() bool { + return i.kind == ret +} + +// Next implements regalloc.Instr Next. +func (i *instruction) Next() regalloc.Instr { + return i.next +} + +// Prev implements regalloc.Instr Prev. +func (i *instruction) Prev() regalloc.Instr { + return i.prev +} + +// AddedBeforeRegAlloc implements regalloc.Instr AddedBeforeRegAlloc. 
+func (i *instruction) AddedBeforeRegAlloc() bool { + return i.addedBeforeRegAlloc +} + +type defKind byte + +const ( + defKindNone defKind = iota + 1 + defKindRD + defKindCall +) + +var defKinds = [numInstructionKinds]defKind{ + adr: defKindRD, + aluRRR: defKindRD, + aluRRRR: defKindRD, + aluRRImm12: defKindRD, + aluRRBitmaskImm: defKindRD, + aluRRRShift: defKindRD, + aluRRImmShift: defKindRD, + aluRRRExtend: defKindRD, + bitRR: defKindRD, + movZ: defKindRD, + movK: defKindRD, + movN: defKindRD, + mov32: defKindRD, + mov64: defKindRD, + fpuMov64: defKindRD, + fpuMov128: defKindRD, + fpuRR: defKindRD, + fpuRRR: defKindRD, + nop0: defKindNone, + call: defKindCall, + callInd: defKindCall, + ret: defKindNone, + store8: defKindNone, + store16: defKindNone, + store32: defKindNone, + store64: defKindNone, + exitSequence: defKindNone, + condBr: defKindNone, + br: defKindNone, + brTableSequence: defKindNone, + cSet: defKindRD, + extend: defKindRD, + fpuCmp: defKindNone, + uLoad8: defKindRD, + uLoad16: defKindRD, + uLoad32: defKindRD, + sLoad8: defKindRD, + sLoad16: defKindRD, + sLoad32: defKindRD, + uLoad64: defKindRD, + fpuLoad32: defKindRD, + fpuLoad64: defKindRD, + fpuLoad128: defKindRD, + vecLoad1R: defKindRD, + loadFpuConst32: defKindRD, + loadFpuConst64: defKindRD, + loadFpuConst128: defKindRD, + fpuStore32: defKindNone, + fpuStore64: defKindNone, + fpuStore128: defKindNone, + udf: defKindNone, + cSel: defKindRD, + fpuCSel: defKindRD, + movToVec: defKindRD, + movFromVec: defKindRD, + movFromVecSigned: defKindRD, + vecDup: defKindRD, + vecDupElement: defKindRD, + vecExtract: defKindRD, + vecMisc: defKindRD, + vecMovElement: defKindRD, + vecLanes: defKindRD, + vecShiftImm: defKindRD, + vecTbl: defKindRD, + vecTbl2: defKindRD, + vecPermute: defKindRD, + vecRRR: defKindRD, + vecRRRRewrite: defKindNone, + fpuToInt: defKindRD, + intToFpu: defKindRD, + cCmpImm: defKindNone, + movToFPSR: defKindNone, + movFromFPSR: defKindRD, + emitSourceOffsetInfo: defKindNone, + atomicRmw: defKindRD, + atomicCas: defKindNone, + atomicLoad: defKindRD, + atomicStore: defKindNone, + dmb: defKindNone, + loadConstBlockArg: defKindRD, +} + +// Defs returns the list of regalloc.VReg that are defined by the instruction. +// In order to reduce the number of allocations, the caller can pass the slice to be used. +func (i *instruction) Defs(regs *[]regalloc.VReg) []regalloc.VReg { + *regs = (*regs)[:0] + switch defKinds[i.kind] { + case defKindNone: + case defKindRD: + *regs = append(*regs, i.rd.nr()) + case defKindCall: + _, _, retIntRealRegs, retFloatRealRegs, _ := backend.ABIInfoFromUint64(i.u2) + for i := byte(0); i < retIntRealRegs; i++ { + *regs = append(*regs, regInfo.RealRegToVReg[intParamResultRegs[i]]) + } + for i := byte(0); i < retFloatRealRegs; i++ { + *regs = append(*regs, regInfo.RealRegToVReg[floatParamResultRegs[i]]) + } + default: + panic(fmt.Sprintf("defKind for %v not defined", i)) + } + return *regs +} + +// AssignDef implements regalloc.Instr AssignDef. 
+func (i *instruction) AssignDef(reg regalloc.VReg) { + switch defKinds[i.kind] { + case defKindNone: + case defKindRD: + i.rd = i.rd.assignReg(reg) + case defKindCall: + panic("BUG: call instructions shouldn't be assigned") + default: + panic(fmt.Sprintf("defKind for %v not defined", i)) + } +} + +type useKind byte + +const ( + useKindNone useKind = iota + 1 + useKindRN + useKindRNRM + useKindRNRMRA + useKindRNRN1RM + useKindCall + useKindCallInd + useKindAMode + useKindRNAMode + useKindCond + // useKindRDRewrite indicates an instruction where RD is used both as a source and destination. + // A temporary register for RD must be allocated explicitly with the source copied to this + // register before the instruction and the value copied from this register to the instruction + // return register. + useKindRDRewrite +) + +var useKinds = [numInstructionKinds]useKind{ + udf: useKindNone, + aluRRR: useKindRNRM, + aluRRRR: useKindRNRMRA, + aluRRImm12: useKindRN, + aluRRBitmaskImm: useKindRN, + aluRRRShift: useKindRNRM, + aluRRImmShift: useKindRN, + aluRRRExtend: useKindRNRM, + bitRR: useKindRN, + movZ: useKindNone, + movK: useKindNone, + movN: useKindNone, + mov32: useKindRN, + mov64: useKindRN, + fpuMov64: useKindRN, + fpuMov128: useKindRN, + fpuRR: useKindRN, + fpuRRR: useKindRNRM, + nop0: useKindNone, + call: useKindCall, + callInd: useKindCallInd, + ret: useKindNone, + store8: useKindRNAMode, + store16: useKindRNAMode, + store32: useKindRNAMode, + store64: useKindRNAMode, + exitSequence: useKindRN, + condBr: useKindCond, + br: useKindNone, + brTableSequence: useKindRN, + cSet: useKindNone, + extend: useKindRN, + fpuCmp: useKindRNRM, + uLoad8: useKindAMode, + uLoad16: useKindAMode, + uLoad32: useKindAMode, + sLoad8: useKindAMode, + sLoad16: useKindAMode, + sLoad32: useKindAMode, + uLoad64: useKindAMode, + fpuLoad32: useKindAMode, + fpuLoad64: useKindAMode, + fpuLoad128: useKindAMode, + fpuStore32: useKindRNAMode, + fpuStore64: useKindRNAMode, + fpuStore128: useKindRNAMode, + loadFpuConst32: useKindNone, + loadFpuConst64: useKindNone, + loadFpuConst128: useKindNone, + vecLoad1R: useKindRN, + cSel: useKindRNRM, + fpuCSel: useKindRNRM, + movToVec: useKindRN, + movFromVec: useKindRN, + movFromVecSigned: useKindRN, + vecDup: useKindRN, + vecDupElement: useKindRN, + vecExtract: useKindRNRM, + cCmpImm: useKindRN, + vecMisc: useKindRN, + vecMovElement: useKindRN, + vecLanes: useKindRN, + vecShiftImm: useKindRN, + vecTbl: useKindRNRM, + vecTbl2: useKindRNRN1RM, + vecRRR: useKindRNRM, + vecRRRRewrite: useKindRDRewrite, + vecPermute: useKindRNRM, + fpuToInt: useKindRN, + intToFpu: useKindRN, + movToFPSR: useKindRN, + movFromFPSR: useKindNone, + adr: useKindNone, + emitSourceOffsetInfo: useKindNone, + atomicRmw: useKindRNRM, + atomicCas: useKindRDRewrite, + atomicLoad: useKindRN, + atomicStore: useKindRNRM, + loadConstBlockArg: useKindNone, + dmb: useKindNone, +} + +// Uses returns the list of regalloc.VReg that are used by the instruction. +// In order to reduce the number of allocations, the caller can pass the slice to be used. 
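+// The contents of *regs are overwritten on every call, and the returned slice aliases it.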
+func (i *instruction) Uses(regs *[]regalloc.VReg) []regalloc.VReg { + *regs = (*regs)[:0] + switch useKinds[i.kind] { + case useKindNone: + case useKindRN: + if rn := i.rn.reg(); rn.Valid() { + *regs = append(*regs, rn) + } + case useKindRNRM: + if rn := i.rn.reg(); rn.Valid() { + *regs = append(*regs, rn) + } + if rm := i.rm.reg(); rm.Valid() { + *regs = append(*regs, rm) + } + case useKindRNRMRA: + if rn := i.rn.reg(); rn.Valid() { + *regs = append(*regs, rn) + } + if rm := i.rm.reg(); rm.Valid() { + *regs = append(*regs, rm) + } + if ra := i.ra.reg(); ra.Valid() { + *regs = append(*regs, ra) + } + case useKindRNRN1RM: + if rn := i.rn.reg(); rn.Valid() && rn.IsRealReg() { + rn1 := regalloc.FromRealReg(rn.RealReg()+1, rn.RegType()) + *regs = append(*regs, rn, rn1) + } + if rm := i.rm.reg(); rm.Valid() { + *regs = append(*regs, rm) + } + case useKindAMode: + if amodeRN := i.amode.rn; amodeRN.Valid() { + *regs = append(*regs, amodeRN) + } + if amodeRM := i.amode.rm; amodeRM.Valid() { + *regs = append(*regs, amodeRM) + } + case useKindRNAMode: + *regs = append(*regs, i.rn.reg()) + if amodeRN := i.amode.rn; amodeRN.Valid() { + *regs = append(*regs, amodeRN) + } + if amodeRM := i.amode.rm; amodeRM.Valid() { + *regs = append(*regs, amodeRM) + } + case useKindCond: + cnd := cond(i.u1) + if cnd.kind() != condKindCondFlagSet { + *regs = append(*regs, cnd.register()) + } + case useKindCallInd: + *regs = append(*regs, i.rn.nr()) + fallthrough + case useKindCall: + argIntRealRegs, argFloatRealRegs, _, _, _ := backend.ABIInfoFromUint64(i.u2) + for i := byte(0); i < argIntRealRegs; i++ { + *regs = append(*regs, regInfo.RealRegToVReg[intParamResultRegs[i]]) + } + for i := byte(0); i < argFloatRealRegs; i++ { + *regs = append(*regs, regInfo.RealRegToVReg[floatParamResultRegs[i]]) + } + case useKindRDRewrite: + *regs = append(*regs, i.rn.reg()) + *regs = append(*regs, i.rm.reg()) + *regs = append(*regs, i.rd.reg()) + default: + panic(fmt.Sprintf("useKind for %v not defined", i)) + } + return *regs +} + +func (i *instruction) AssignUse(index int, reg regalloc.VReg) { + switch useKinds[i.kind] { + case useKindNone: + case useKindRN: + if rn := i.rn.reg(); rn.Valid() { + i.rn = i.rn.assignReg(reg) + } + case useKindRNRM: + if index == 0 { + if rn := i.rn.reg(); rn.Valid() { + i.rn = i.rn.assignReg(reg) + } + } else { + if rm := i.rm.reg(); rm.Valid() { + i.rm = i.rm.assignReg(reg) + } + } + case useKindRDRewrite: + if index == 0 { + if rn := i.rn.reg(); rn.Valid() { + i.rn = i.rn.assignReg(reg) + } + } else if index == 1 { + if rm := i.rm.reg(); rm.Valid() { + i.rm = i.rm.assignReg(reg) + } + } else { + if rd := i.rd.reg(); rd.Valid() { + i.rd = i.rd.assignReg(reg) + } + } + case useKindRNRN1RM: + if index == 0 { + if rn := i.rn.reg(); rn.Valid() { + i.rn = i.rn.assignReg(reg) + } + if rn1 := i.rn.reg() + 1; rn1.Valid() { + i.rm = i.rm.assignReg(reg + 1) + } + } else { + if rm := i.rm.reg(); rm.Valid() { + i.rm = i.rm.assignReg(reg) + } + } + case useKindRNRMRA: + if index == 0 { + if rn := i.rn.reg(); rn.Valid() { + i.rn = i.rn.assignReg(reg) + } + } else if index == 1 { + if rm := i.rm.reg(); rm.Valid() { + i.rm = i.rm.assignReg(reg) + } + } else { + if ra := i.ra.reg(); ra.Valid() { + i.ra = i.ra.assignReg(reg) + } + } + case useKindAMode: + if index == 0 { + if amodeRN := i.amode.rn; amodeRN.Valid() { + i.amode.rn = reg + } + } else { + if amodeRM := i.amode.rm; amodeRM.Valid() { + i.amode.rm = reg + } + } + case useKindRNAMode: + if index == 0 { + i.rn = i.rn.assignReg(reg) + } else if index == 1 { + 
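+			// Use index 1 is the address-mode base register (amode.rn); index 2, handled below, is the index register (amode.rm).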
if amodeRN := i.amode.rn; amodeRN.Valid() { + i.amode.rn = reg + } else { + panic("BUG") + } + } else { + if amodeRM := i.amode.rm; amodeRM.Valid() { + i.amode.rm = reg + } else { + panic("BUG") + } + } + case useKindCond: + c := cond(i.u1) + switch c.kind() { + case condKindRegisterZero: + i.u1 = uint64(registerAsRegZeroCond(reg)) + case condKindRegisterNotZero: + i.u1 = uint64(registerAsRegNotZeroCond(reg)) + } + case useKindCall: + panic("BUG: call instructions shouldn't be assigned") + case useKindCallInd: + i.rn = i.rn.assignReg(reg) + default: + panic(fmt.Sprintf("useKind for %v not defined", i)) + } +} + +func (i *instruction) asCall(ref ssa.FuncRef, abi *backend.FunctionABI) { + i.kind = call + i.u1 = uint64(ref) + if abi != nil { + i.u2 = abi.ABIInfoAsUint64() + } +} + +func (i *instruction) asCallIndirect(ptr regalloc.VReg, abi *backend.FunctionABI) { + i.kind = callInd + i.rn = operandNR(ptr) + if abi != nil { + i.u2 = abi.ABIInfoAsUint64() + } +} + +func (i *instruction) callFuncRef() ssa.FuncRef { + return ssa.FuncRef(i.u1) +} + +// shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false) +func (i *instruction) asMOVZ(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) { + i.kind = movZ + i.rd = operandNR(dst) + i.u1 = imm + i.u2 = shift + if dst64bit { + i.u3 = 1 + } +} + +// shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false) +func (i *instruction) asMOVK(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) { + i.kind = movK + i.rd = operandNR(dst) + i.u1 = imm + i.u2 = shift + if dst64bit { + i.u3 = 1 + } +} + +// shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false) +func (i *instruction) asMOVN(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) { + i.kind = movN + i.rd = operandNR(dst) + i.u1 = imm + i.u2 = shift + if dst64bit { + i.u3 = 1 + } +} + +func (i *instruction) asNop0() *instruction { + i.kind = nop0 + return i +} + +func (i *instruction) asNop0WithLabel(l label) { + i.kind = nop0 + i.u1 = uint64(l) +} + +func (i *instruction) nop0Label() label { + return label(i.u1) +} + +func (i *instruction) asRet() { + i.kind = ret +} + +func (i *instruction) asStorePair64(src1, src2 regalloc.VReg, amode addressMode) { + i.kind = storeP64 + i.rn = operandNR(src1) + i.rm = operandNR(src2) + i.amode = amode +} + +func (i *instruction) asLoadPair64(src1, src2 regalloc.VReg, amode addressMode) { + i.kind = loadP64 + i.rn = operandNR(src1) + i.rm = operandNR(src2) + i.amode = amode +} + +func (i *instruction) asStore(src operand, amode addressMode, sizeInBits byte) { + switch sizeInBits { + case 8: + i.kind = store8 + case 16: + i.kind = store16 + case 32: + if src.reg().RegType() == regalloc.RegTypeInt { + i.kind = store32 + } else { + i.kind = fpuStore32 + } + case 64: + if src.reg().RegType() == regalloc.RegTypeInt { + i.kind = store64 + } else { + i.kind = fpuStore64 + } + case 128: + i.kind = fpuStore128 + } + i.rn = src + i.amode = amode +} + +func (i *instruction) asSLoad(dst operand, amode addressMode, sizeInBits byte) { + switch sizeInBits { + case 8: + i.kind = sLoad8 + case 16: + i.kind = sLoad16 + case 32: + i.kind = sLoad32 + default: + panic("BUG") + } + i.rd = dst + i.amode = amode +} + +func (i *instruction) asULoad(dst operand, amode addressMode, sizeInBits byte) { + switch sizeInBits { + case 8: + i.kind = uLoad8 + case 16: + i.kind = uLoad16 + case 32: + i.kind = uLoad32 + case 64: + 
i.kind = uLoad64 + } + i.rd = dst + i.amode = amode +} + +func (i *instruction) asFpuLoad(dst operand, amode addressMode, sizeInBits byte) { + switch sizeInBits { + case 32: + i.kind = fpuLoad32 + case 64: + i.kind = fpuLoad64 + case 128: + i.kind = fpuLoad128 + } + i.rd = dst + i.amode = amode +} + +func (i *instruction) asVecLoad1R(rd, rn operand, arr vecArrangement) { + // NOTE: currently only has support for no-offset loads, though it is suspicious that + // we would need to support offset load (that is only available for post-index). + i.kind = vecLoad1R + i.rd = rd + i.rn = rn + i.u1 = uint64(arr) +} + +func (i *instruction) asCSet(rd regalloc.VReg, mask bool, c condFlag) { + i.kind = cSet + i.rd = operandNR(rd) + i.u1 = uint64(c) + if mask { + i.u2 = 1 + } +} + +func (i *instruction) asCSel(rd, rn, rm operand, c condFlag, _64bit bool) { + i.kind = cSel + i.rd = rd + i.rn = rn + i.rm = rm + i.u1 = uint64(c) + if _64bit { + i.u3 = 1 + } +} + +func (i *instruction) asFpuCSel(rd, rn, rm operand, c condFlag, _64bit bool) { + i.kind = fpuCSel + i.rd = rd + i.rn = rn + i.rm = rm + i.u1 = uint64(c) + if _64bit { + i.u3 = 1 + } +} + +func (i *instruction) asBr(target label) { + if target == labelReturn { + panic("BUG: call site should special case for returnLabel") + } + i.kind = br + i.u1 = uint64(target) +} + +func (i *instruction) asBrTableSequence(indexReg regalloc.VReg, targetIndex, targetCounts int) { + i.kind = brTableSequence + i.rn = operandNR(indexReg) + i.u1 = uint64(targetIndex) + i.u2 = uint64(targetCounts) +} + +func (i *instruction) brTableSequenceOffsetsResolved() { + i.u3 = 1 // indicate that the offsets are resolved, for debugging. +} + +func (i *instruction) brLabel() label { + return label(i.u1) +} + +// brOffsetResolved is called when the target label is resolved. +func (i *instruction) brOffsetResolve(offset int64) { + i.u2 = uint64(offset) + i.u3 = 1 // indicate that the offset is resolved, for debugging. +} + +func (i *instruction) brOffset() int64 { + return int64(i.u2) +} + +// asCondBr encodes a conditional branch instruction. is64bit is only needed when cond is not flag. +func (i *instruction) asCondBr(c cond, target label, is64bit bool) { + i.kind = condBr + i.u1 = c.asUint64() + i.u2 = uint64(target) + if is64bit { + i.u3 = 1 + } +} + +func (i *instruction) setCondBrTargets(target label) { + i.u2 = uint64(target) +} + +func (i *instruction) condBrLabel() label { + return label(i.u2) +} + +// condBrOffsetResolve is called when the target label is resolved. +func (i *instruction) condBrOffsetResolve(offset int64) { + i.rd.data = uint64(offset) + i.rd.data2 = 1 // indicate that the offset is resolved, for debugging. +} + +// condBrOffsetResolved returns true if condBrOffsetResolve is already called. 
+func (i *instruction) condBrOffsetResolved() bool { + return i.rd.data2 == 1 +} + +func (i *instruction) condBrOffset() int64 { + return int64(i.rd.data) +} + +func (i *instruction) condBrCond() cond { + return cond(i.u1) +} + +func (i *instruction) condBr64bit() bool { + return i.u3 == 1 +} + +func (i *instruction) asLoadFpuConst32(rd regalloc.VReg, raw uint64) { + i.kind = loadFpuConst32 + i.u1 = raw + i.rd = operandNR(rd) +} + +func (i *instruction) asLoadFpuConst64(rd regalloc.VReg, raw uint64) { + i.kind = loadFpuConst64 + i.u1 = raw + i.rd = operandNR(rd) +} + +func (i *instruction) asLoadFpuConst128(rd regalloc.VReg, lo, hi uint64) { + i.kind = loadFpuConst128 + i.u1 = lo + i.u2 = hi + i.rd = operandNR(rd) +} + +func (i *instruction) asFpuCmp(rn, rm operand, is64bit bool) { + i.kind = fpuCmp + i.rn, i.rm = rn, rm + if is64bit { + i.u3 = 1 + } +} + +func (i *instruction) asCCmpImm(rn operand, imm uint64, c condFlag, flag byte, is64bit bool) { + i.kind = cCmpImm + i.rn = rn + i.rm.data = imm + i.u1 = uint64(c) + i.u2 = uint64(flag) + if is64bit { + i.u3 = 1 + } +} + +// asALU setups a basic ALU instruction. +func (i *instruction) asALU(aluOp aluOp, rd, rn, rm operand, dst64bit bool) { + switch rm.kind { + case operandKindNR: + i.kind = aluRRR + case operandKindSR: + i.kind = aluRRRShift + case operandKindER: + i.kind = aluRRRExtend + case operandKindImm12: + i.kind = aluRRImm12 + default: + panic("BUG") + } + i.u1 = uint64(aluOp) + i.rd, i.rn, i.rm = rd, rn, rm + if dst64bit { + i.u3 = 1 + } +} + +// asALU setups a basic ALU instruction. +func (i *instruction) asALURRRR(aluOp aluOp, rd, rn, rm, ra operand, dst64bit bool) { + i.kind = aluRRRR + i.u1 = uint64(aluOp) + i.rd, i.rn, i.rm, i.ra = rd, rn, rm, ra + if dst64bit { + i.u3 = 1 + } +} + +// asALUShift setups a shift based ALU instruction. +func (i *instruction) asALUShift(aluOp aluOp, rd, rn, rm operand, dst64bit bool) { + switch rm.kind { + case operandKindNR: + i.kind = aluRRR // If the shift amount op is a register, then the instruction is encoded as a normal ALU instruction with two register operands. 
+ case operandKindShiftImm: + i.kind = aluRRImmShift + default: + panic("BUG") + } + i.u1 = uint64(aluOp) + i.rd, i.rn, i.rm = rd, rn, rm + if dst64bit { + i.u3 = 1 + } +} + +func (i *instruction) asALUBitmaskImm(aluOp aluOp, rd, rn regalloc.VReg, imm uint64, dst64bit bool) { + i.kind = aluRRBitmaskImm + i.u1 = uint64(aluOp) + i.rn, i.rd = operandNR(rn), operandNR(rd) + i.u2 = imm + if dst64bit { + i.u3 = 1 + } +} + +func (i *instruction) asMovToFPSR(rn regalloc.VReg) { + i.kind = movToFPSR + i.rn = operandNR(rn) +} + +func (i *instruction) asMovFromFPSR(rd regalloc.VReg) { + i.kind = movFromFPSR + i.rd = operandNR(rd) +} + +func (i *instruction) asBitRR(bitOp bitOp, rd, rn regalloc.VReg, is64bit bool) { + i.kind = bitRR + i.rn, i.rd = operandNR(rn), operandNR(rd) + i.u1 = uint64(bitOp) + if is64bit { + i.u2 = 1 + } +} + +func (i *instruction) asFpuRRR(op fpuBinOp, rd, rn, rm operand, dst64bit bool) { + i.kind = fpuRRR + i.u1 = uint64(op) + i.rd, i.rn, i.rm = rd, rn, rm + if dst64bit { + i.u3 = 1 + } +} + +func (i *instruction) asFpuRR(op fpuUniOp, rd, rn operand, dst64bit bool) { + i.kind = fpuRR + i.u1 = uint64(op) + i.rd, i.rn = rd, rn + if dst64bit { + i.u3 = 1 + } +} + +func (i *instruction) asExtend(rd, rn regalloc.VReg, fromBits, toBits byte, signed bool) { + i.kind = extend + i.rn, i.rd = operandNR(rn), operandNR(rd) + i.u1 = uint64(fromBits) + i.u2 = uint64(toBits) + if signed { + i.u3 = 1 + } +} + +func (i *instruction) asMove32(rd, rn regalloc.VReg) { + i.kind = mov32 + i.rn, i.rd = operandNR(rn), operandNR(rd) +} + +func (i *instruction) asMove64(rd, rn regalloc.VReg) *instruction { + i.kind = mov64 + i.rn, i.rd = operandNR(rn), operandNR(rd) + return i +} + +func (i *instruction) asFpuMov64(rd, rn regalloc.VReg) { + i.kind = fpuMov64 + i.rn, i.rd = operandNR(rn), operandNR(rd) +} + +func (i *instruction) asFpuMov128(rd, rn regalloc.VReg) *instruction { + i.kind = fpuMov128 + i.rn, i.rd = operandNR(rn), operandNR(rd) + return i +} + +func (i *instruction) asMovToVec(rd, rn operand, arr vecArrangement, index vecIndex) { + i.kind = movToVec + i.rd = rd + i.rn = rn + i.u1, i.u2 = uint64(arr), uint64(index) +} + +func (i *instruction) asMovFromVec(rd, rn operand, arr vecArrangement, index vecIndex, signed bool) { + if signed { + i.kind = movFromVecSigned + } else { + i.kind = movFromVec + } + i.rd = rd + i.rn = rn + i.u1, i.u2 = uint64(arr), uint64(index) +} + +func (i *instruction) asVecDup(rd, rn operand, arr vecArrangement) { + i.kind = vecDup + i.u1 = uint64(arr) + i.rn, i.rd = rn, rd +} + +func (i *instruction) asVecDupElement(rd, rn operand, arr vecArrangement, index vecIndex) { + i.kind = vecDupElement + i.u1 = uint64(arr) + i.rn, i.rd = rn, rd + i.u2 = uint64(index) +} + +func (i *instruction) asVecExtract(rd, rn, rm operand, arr vecArrangement, index uint32) { + i.kind = vecExtract + i.u1 = uint64(arr) + i.rn, i.rm, i.rd = rn, rm, rd + i.u2 = uint64(index) +} + +func (i *instruction) asVecMovElement(rd, rn operand, arr vecArrangement, rdIndex, rnIndex vecIndex) { + i.kind = vecMovElement + i.u1 = uint64(arr) + i.u2, i.u3 = uint64(rdIndex), uint64(rnIndex) + i.rn, i.rd = rn, rd +} + +func (i *instruction) asVecMisc(op vecOp, rd, rn operand, arr vecArrangement) { + i.kind = vecMisc + i.u1 = uint64(op) + i.rn, i.rd = rn, rd + i.u2 = uint64(arr) +} + +func (i *instruction) asVecLanes(op vecOp, rd, rn operand, arr vecArrangement) { + i.kind = vecLanes + i.u1 = uint64(op) + i.rn, i.rd = rn, rd + i.u2 = uint64(arr) +} + +func (i *instruction) asVecShiftImm(op vecOp, rd, rn, rm 
operand, arr vecArrangement) *instruction { + i.kind = vecShiftImm + i.u1 = uint64(op) + i.rn, i.rm, i.rd = rn, rm, rd + i.u2 = uint64(arr) + return i +} + +func (i *instruction) asVecTbl(nregs byte, rd, rn, rm operand, arr vecArrangement) { + switch nregs { + case 0, 1: + i.kind = vecTbl + case 2: + i.kind = vecTbl2 + if !rn.reg().IsRealReg() { + panic("rn is not a RealReg") + } + if rn.realReg() == v31 { + panic("rn cannot be v31") + } + default: + panic(fmt.Sprintf("unsupported number of registers %d", nregs)) + } + i.rn, i.rm, i.rd = rn, rm, rd + i.u2 = uint64(arr) +} + +func (i *instruction) asVecPermute(op vecOp, rd, rn, rm operand, arr vecArrangement) { + i.kind = vecPermute + i.u1 = uint64(op) + i.rn, i.rm, i.rd = rn, rm, rd + i.u2 = uint64(arr) +} + +func (i *instruction) asVecRRR(op vecOp, rd, rn, rm operand, arr vecArrangement) *instruction { + i.kind = vecRRR + i.u1 = uint64(op) + i.rn, i.rd, i.rm = rn, rd, rm + i.u2 = uint64(arr) + return i +} + +// asVecRRRRewrite encodes a vector instruction that rewrites the destination register. +// IMPORTANT: the destination register must be already defined before this instruction. +func (i *instruction) asVecRRRRewrite(op vecOp, rd, rn, rm operand, arr vecArrangement) { + i.kind = vecRRRRewrite + i.u1 = uint64(op) + i.rn, i.rd, i.rm = rn, rd, rm + i.u2 = uint64(arr) +} + +func (i *instruction) IsCopy() bool { + op := i.kind + // We do not include mov32 as it is not a copy instruction in the sense that it does not preserve the upper 32 bits, + // and it is only used in the translation of IReduce, not the actual copy indeed. + return op == mov64 || op == fpuMov64 || op == fpuMov128 +} + +// String implements fmt.Stringer. +func (i *instruction) String() (str string) { + is64SizeBitToSize := func(u3 uint64) byte { + if u3 == 0 { + return 32 + } + return 64 + } + + switch i.kind { + case nop0: + if i.u1 != 0 { + l := label(i.u1) + str = fmt.Sprintf("%s:", l) + } else { + str = "nop0" + } + case aluRRR: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(), + formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), + i.rm.format(size)) + case aluRRRR: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("%s %s, %s, %s, %s", aluOp(i.u1).String(), + formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size), formatVRegSized(i.ra.nr(), size)) + case aluRRImm12: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(), + formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), i.rm.format(size)) + case aluRRBitmaskImm: + size := is64SizeBitToSize(i.u3) + rd, rn := formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size) + if size == 32 { + str = fmt.Sprintf("%s %s, %s, #%#x", aluOp(i.u1).String(), rd, rn, uint32(i.u2)) + } else { + str = fmt.Sprintf("%s %s, %s, #%#x", aluOp(i.u1).String(), rd, rn, i.u2) + } + case aluRRImmShift: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("%s %s, %s, %#x", + aluOp(i.u1).String(), + formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rn.nr(), size), + i.rm.shiftImm(), + ) + case aluRRRShift: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("%s %s, %s, %s", + aluOp(i.u1).String(), + formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rn.nr(), size), + i.rm.format(size), + ) + case aluRRRExtend: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(), + formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rn.nr(), 
size), + // Regardless of the source size, the register is formatted in 32-bit. + i.rm.format(32), + ) + case bitRR: + size := is64SizeBitToSize(i.u2) + str = fmt.Sprintf("%s %s, %s", + bitOp(i.u1), + formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rn.nr(), size), + ) + case uLoad8: + str = fmt.Sprintf("ldrb %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + case sLoad8: + str = fmt.Sprintf("ldrsb %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + case uLoad16: + str = fmt.Sprintf("ldrh %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + case sLoad16: + str = fmt.Sprintf("ldrsh %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + case uLoad32: + str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + case sLoad32: + str = fmt.Sprintf("ldrs %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + case uLoad64: + str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 64), i.amode.format(64)) + case store8: + str = fmt.Sprintf("strb %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(8)) + case store16: + str = fmt.Sprintf("strh %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(16)) + case store32: + str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(32)) + case store64: + str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.amode.format(64)) + case storeP64: + str = fmt.Sprintf("stp %s, %s, %s", + formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.amode.format(64)) + case loadP64: + str = fmt.Sprintf("ldp %s, %s, %s", + formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.amode.format(64)) + case mov64: + str = fmt.Sprintf("mov %s, %s", + formatVRegSized(i.rd.nr(), 64), + formatVRegSized(i.rn.nr(), 64)) + case mov32: + str = fmt.Sprintf("mov %s, %s", formatVRegSized(i.rd.nr(), 32), formatVRegSized(i.rn.nr(), 32)) + case movZ: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("movz %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16) + case movN: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("movn %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16) + case movK: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("movk %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16) + case extend: + fromBits, toBits := byte(i.u1), byte(i.u2) + + var signedStr string + if i.u3 == 1 { + signedStr = "s" + } else { + signedStr = "u" + } + var fromStr string + switch fromBits { + case 8: + fromStr = "b" + case 16: + fromStr = "h" + case 32: + fromStr = "w" + } + str = fmt.Sprintf("%sxt%s %s, %s", signedStr, fromStr, formatVRegSized(i.rd.nr(), toBits), formatVRegSized(i.rn.nr(), 32)) + case cSel: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("csel %s, %s, %s, %s", + formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rn.nr(), size), + formatVRegSized(i.rm.nr(), size), + condFlag(i.u1), + ) + case cSet: + if i.u2 != 0 { + str = fmt.Sprintf("csetm %s, %s", formatVRegSized(i.rd.nr(), 64), condFlag(i.u1)) + } else { + str = fmt.Sprintf("cset %s, %s", formatVRegSized(i.rd.nr(), 64), condFlag(i.u1)) + } + case cCmpImm: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("ccmp %s, #%#x, #%#x, %s", + formatVRegSized(i.rn.nr(), size), i.rm.data, + i.u2&0b1111, + condFlag(i.u1)) + case fpuMov64: + str = fmt.Sprintf("mov %s, %s", + formatVRegVec(i.rd.nr(), vecArrangement8B, vecIndexNone), + formatVRegVec(i.rn.nr(), vecArrangement8B, vecIndexNone)) + case 
fpuMov128: + str = fmt.Sprintf("mov %s, %s", + formatVRegVec(i.rd.nr(), vecArrangement16B, vecIndexNone), + formatVRegVec(i.rn.nr(), vecArrangement16B, vecIndexNone)) + case fpuMovFromVec: + panic("TODO") + case fpuRR: + dstSz := is64SizeBitToSize(i.u3) + srcSz := dstSz + op := fpuUniOp(i.u1) + switch op { + case fpuUniOpCvt32To64: + srcSz = 32 + case fpuUniOpCvt64To32: + srcSz = 64 + } + str = fmt.Sprintf("%s %s, %s", op.String(), + formatVRegSized(i.rd.nr(), dstSz), formatVRegSized(i.rn.nr(), srcSz)) + case fpuRRR: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("%s %s, %s, %s", fpuBinOp(i.u1).String(), + formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size)) + case fpuRRI: + panic("TODO") + case fpuRRRR: + panic("TODO") + case fpuCmp: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("fcmp %s, %s", + formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size)) + case fpuLoad32: + str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + case fpuStore32: + str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(64)) + case fpuLoad64: + str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 64), i.amode.format(64)) + case fpuStore64: + str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.amode.format(64)) + case fpuLoad128: + str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 128), i.amode.format(64)) + case fpuStore128: + str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 128), i.amode.format(64)) + case loadFpuConst32: + str = fmt.Sprintf("ldr %s, #8; b 8; data.f32 %f", formatVRegSized(i.rd.nr(), 32), math.Float32frombits(uint32(i.u1))) + case loadFpuConst64: + str = fmt.Sprintf("ldr %s, #8; b 16; data.f64 %f", formatVRegSized(i.rd.nr(), 64), math.Float64frombits(i.u1)) + case loadFpuConst128: + str = fmt.Sprintf("ldr %s, #8; b 32; data.v128 %016x %016x", + formatVRegSized(i.rd.nr(), 128), i.u1, i.u2) + case fpuToInt: + var op, src, dst string + if signed := i.u1 == 1; signed { + op = "fcvtzs" + } else { + op = "fcvtzu" + } + if src64 := i.u2 == 1; src64 { + src = formatVRegWidthVec(i.rn.nr(), vecArrangementD) + } else { + src = formatVRegWidthVec(i.rn.nr(), vecArrangementS) + } + if dst64 := i.u3 == 1; dst64 { + dst = formatVRegSized(i.rd.nr(), 64) + } else { + dst = formatVRegSized(i.rd.nr(), 32) + } + str = fmt.Sprintf("%s %s, %s", op, dst, src) + + case intToFpu: + var op, src, dst string + if signed := i.u1 == 1; signed { + op = "scvtf" + } else { + op = "ucvtf" + } + if src64 := i.u2 == 1; src64 { + src = formatVRegSized(i.rn.nr(), 64) + } else { + src = formatVRegSized(i.rn.nr(), 32) + } + if dst64 := i.u3 == 1; dst64 { + dst = formatVRegWidthVec(i.rd.nr(), vecArrangementD) + } else { + dst = formatVRegWidthVec(i.rd.nr(), vecArrangementS) + } + str = fmt.Sprintf("%s %s, %s", op, dst, src) + case fpuCSel: + size := is64SizeBitToSize(i.u3) + str = fmt.Sprintf("fcsel %s, %s, %s, %s", + formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rn.nr(), size), + formatVRegSized(i.rm.nr(), size), + condFlag(i.u1), + ) + case movToVec: + var size byte + arr := vecArrangement(i.u1) + switch arr { + case vecArrangementB, vecArrangementH, vecArrangementS: + size = 32 + case vecArrangementD: + size = 64 + default: + panic("unsupported arrangement " + arr.String()) + } + str = fmt.Sprintf("ins %s, %s", formatVRegVec(i.rd.nr(), arr, vecIndex(i.u2)), formatVRegSized(i.rn.nr(), size)) + case movFromVec, movFromVecSigned: + var size byte + var opcode 
string + arr := vecArrangement(i.u1) + signed := i.kind == movFromVecSigned + switch arr { + case vecArrangementB, vecArrangementH, vecArrangementS: + size = 32 + if signed { + opcode = "smov" + } else { + opcode = "umov" + } + case vecArrangementD: + size = 64 + if signed { + opcode = "smov" + } else { + opcode = "mov" + } + default: + panic("unsupported arrangement " + arr.String()) + } + str = fmt.Sprintf("%s %s, %s", opcode, formatVRegSized(i.rd.nr(), size), formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2))) + case vecDup: + str = fmt.Sprintf("dup %s, %s", + formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone), + formatVRegSized(i.rn.nr(), 64), + ) + case vecDupElement: + arr := vecArrangement(i.u1) + str = fmt.Sprintf("dup %s, %s", + formatVRegVec(i.rd.nr(), arr, vecIndexNone), + formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2)), + ) + case vecDupFromFpu: + panic("TODO") + case vecExtract: + str = fmt.Sprintf("ext %s, %s, %s, #%d", + formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone), + formatVRegVec(i.rn.nr(), vecArrangement(i.u1), vecIndexNone), + formatVRegVec(i.rm.nr(), vecArrangement(i.u1), vecIndexNone), + uint32(i.u2), + ) + case vecExtend: + panic("TODO") + case vecMovElement: + str = fmt.Sprintf("mov %s, %s", + formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndex(i.u2)), + formatVRegVec(i.rn.nr(), vecArrangement(i.u1), vecIndex(i.u3)), + ) + case vecMiscNarrow: + panic("TODO") + case vecRRR, vecRRRRewrite: + str = fmt.Sprintf("%s %s, %s, %s", + vecOp(i.u1), + formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone), + formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone), + formatVRegVec(i.rm.nr(), vecArrangement(i.u2), vecIndexNone), + ) + case vecMisc: + vop := vecOp(i.u1) + if vop == vecOpCmeq0 { + str = fmt.Sprintf("cmeq %s, %s, #0", + formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone), + formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone)) + } else { + str = fmt.Sprintf("%s %s, %s", + vop, + formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone), + formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone)) + } + case vecLanes: + arr := vecArrangement(i.u2) + var destArr vecArrangement + switch arr { + case vecArrangement8B, vecArrangement16B: + destArr = vecArrangementH + case vecArrangement4H, vecArrangement8H: + destArr = vecArrangementS + case vecArrangement4S: + destArr = vecArrangementD + default: + panic("invalid arrangement " + arr.String()) + } + str = fmt.Sprintf("%s %s, %s", + vecOp(i.u1), + formatVRegWidthVec(i.rd.nr(), destArr), + formatVRegVec(i.rn.nr(), arr, vecIndexNone)) + case vecShiftImm: + arr := vecArrangement(i.u2) + str = fmt.Sprintf("%s %s, %s, #%d", + vecOp(i.u1), + formatVRegVec(i.rd.nr(), arr, vecIndexNone), + formatVRegVec(i.rn.nr(), arr, vecIndexNone), + i.rm.shiftImm()) + case vecTbl: + arr := vecArrangement(i.u2) + str = fmt.Sprintf("tbl %s, { %s }, %s", + formatVRegVec(i.rd.nr(), arr, vecIndexNone), + formatVRegVec(i.rn.nr(), vecArrangement16B, vecIndexNone), + formatVRegVec(i.rm.nr(), arr, vecIndexNone)) + case vecTbl2: + arr := vecArrangement(i.u2) + rd, rn, rm := i.rd.nr(), i.rn.nr(), i.rm.nr() + rn1 := regalloc.FromRealReg(rn.RealReg()+1, rn.RegType()) + str = fmt.Sprintf("tbl %s, { %s, %s }, %s", + formatVRegVec(rd, arr, vecIndexNone), + formatVRegVec(rn, vecArrangement16B, vecIndexNone), + formatVRegVec(rn1, vecArrangement16B, vecIndexNone), + formatVRegVec(rm, arr, vecIndexNone)) + case vecPermute: + arr := vecArrangement(i.u2) + str = fmt.Sprintf("%s %s, %s, %s", + vecOp(i.u1), + 
formatVRegVec(i.rd.nr(), arr, vecIndexNone), + formatVRegVec(i.rn.nr(), arr, vecIndexNone), + formatVRegVec(i.rm.nr(), arr, vecIndexNone)) + case movToFPSR: + str = fmt.Sprintf("msr fpsr, %s", formatVRegSized(i.rn.nr(), 64)) + case movFromFPSR: + str = fmt.Sprintf("mrs %s fpsr", formatVRegSized(i.rd.nr(), 64)) + case call: + str = fmt.Sprintf("bl %s", ssa.FuncRef(i.u1)) + case callInd: + str = fmt.Sprintf("bl %s", formatVRegSized(i.rn.nr(), 64)) + case ret: + str = "ret" + case br: + target := label(i.u1) + if i.u3 != 0 { + str = fmt.Sprintf("b #%#x (%s)", i.brOffset(), target.String()) + } else { + str = fmt.Sprintf("b %s", target.String()) + } + case condBr: + size := is64SizeBitToSize(i.u3) + c := cond(i.u1) + target := label(i.u2) + switch c.kind() { + case condKindRegisterZero: + if !i.condBrOffsetResolved() { + str = fmt.Sprintf("cbz %s, (%s)", formatVRegSized(c.register(), size), target.String()) + } else { + str = fmt.Sprintf("cbz %s, #%#x %s", formatVRegSized(c.register(), size), i.condBrOffset(), target.String()) + } + case condKindRegisterNotZero: + if offset := i.condBrOffset(); offset != 0 { + str = fmt.Sprintf("cbnz %s, #%#x (%s)", formatVRegSized(c.register(), size), offset, target.String()) + } else { + str = fmt.Sprintf("cbnz %s, %s", formatVRegSized(c.register(), size), target.String()) + } + case condKindCondFlagSet: + if offset := i.condBrOffset(); offset != 0 { + if target == labelInvalid { + str = fmt.Sprintf("b.%s #%#x", c.flag(), offset) + } else { + str = fmt.Sprintf("b.%s #%#x, (%s)", c.flag(), offset, target.String()) + } + } else { + str = fmt.Sprintf("b.%s %s", c.flag(), target.String()) + } + } + case adr: + str = fmt.Sprintf("adr %s, #%#x", formatVRegSized(i.rd.nr(), 64), int64(i.u1)) + case brTableSequence: + targetIndex := i.u1 + str = fmt.Sprintf("br_table_sequence %s, table_index=%d", formatVRegSized(i.rn.nr(), 64), targetIndex) + case exitSequence: + str = fmt.Sprintf("exit_sequence %s", formatVRegSized(i.rn.nr(), 64)) + case atomicRmw: + m := atomicRmwOp(i.u1).String() + size := byte(32) + switch i.u2 { + case 8: + size = 64 + case 2: + m = m + "h" + case 1: + m = m + "b" + } + str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), 64)) + case atomicCas: + m := "casal" + size := byte(32) + switch i.u2 { + case 8: + size = 64 + case 2: + m = m + "h" + case 1: + m = m + "b" + } + str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rn.nr(), 64)) + case atomicLoad: + m := "ldar" + size := byte(32) + switch i.u2 { + case 8: + size = 64 + case 2: + m = m + "h" + case 1: + m = m + "b" + } + str = fmt.Sprintf("%s %s, %s", m, formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), 64)) + case atomicStore: + m := "stlr" + size := byte(32) + switch i.u2 { + case 8: + size = 64 + case 2: + m = m + "h" + case 1: + m = m + "b" + } + str = fmt.Sprintf("%s %s, %s", m, formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rn.nr(), 64)) + case dmb: + str = "dmb" + case udf: + str = "udf" + case emitSourceOffsetInfo: + str = fmt.Sprintf("source_offset_info %d", ssa.SourceOffset(i.u1)) + case vecLoad1R: + str = fmt.Sprintf("ld1r {%s}, [%s]", formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone), formatVRegSized(i.rn.nr(), 64)) + case loadConstBlockArg: + str = fmt.Sprintf("load_const_block_arg %s, %#x", formatVRegSized(i.rd.nr(), 64), i.u1) + default: + panic(i.kind) + } + return +} + +func (i *instruction) asAdr(rd 
regalloc.VReg, offset int64) { + i.kind = adr + i.rd = operandNR(rd) + i.u1 = uint64(offset) +} + +func (i *instruction) asAtomicRmw(op atomicRmwOp, rn, rs, rt operand, size uint64) { + i.kind = atomicRmw + i.rd, i.rn, i.rm = rt, rn, rs + i.u1 = uint64(op) + i.u2 = size +} + +func (i *instruction) asAtomicCas(rn, rs, rt operand, size uint64) { + i.kind = atomicCas + i.rm, i.rn, i.rd = rt, rn, rs + i.u2 = size +} + +func (i *instruction) asAtomicLoad(rn, rt operand, size uint64) { + i.kind = atomicLoad + i.rn, i.rd = rn, rt + i.u2 = size +} + +func (i *instruction) asAtomicStore(rn, rt operand, size uint64) { + i.kind = atomicStore + i.rn, i.rm = rn, rt + i.u2 = size +} + +func (i *instruction) asDMB() { + i.kind = dmb +} + +// TODO: delete unnecessary things. +const ( + // nop0 represents a no-op of zero size. + nop0 instructionKind = iota + 1 + // aluRRR represents an ALU operation with two register sources and a register destination. + aluRRR + // aluRRRR represents an ALU operation with three register sources and a register destination. + aluRRRR + // aluRRImm12 represents an ALU operation with a register source and an immediate-12 source, with a register destination. + aluRRImm12 + // aluRRBitmaskImm represents an ALU operation with a register source and a bitmask immediate, with a register destination. + aluRRBitmaskImm + // aluRRImmShift represents an ALU operation with a register source and an immediate-shifted source, with a register destination. + aluRRImmShift + // aluRRRShift represents an ALU operation with two register sources, one of which can be shifted, with a register destination. + aluRRRShift + // aluRRRExtend represents an ALU operation with two register sources, one of which can be extended, with a register destination. + aluRRRExtend + // bitRR represents a bit op instruction with a single register source. + bitRR + // uLoad8 represents an unsigned 8-bit load. + uLoad8 + // sLoad8 represents a signed 8-bit load into 64-bit register. + sLoad8 + // uLoad16 represents an unsigned 16-bit load into 64-bit register. + uLoad16 + // sLoad16 represents a signed 16-bit load into 64-bit register. + sLoad16 + // uLoad32 represents an unsigned 32-bit load into 64-bit register. + uLoad32 + // sLoad32 represents a signed 32-bit load into 64-bit register. + sLoad32 + // uLoad64 represents a 64-bit load. + uLoad64 + // store8 represents an 8-bit store. + store8 + // store16 represents a 16-bit store. + store16 + // store32 represents a 32-bit store. + store32 + // store64 represents a 64-bit store. + store64 + // storeP64 represents a store of a pair of registers. + storeP64 + // loadP64 represents a load of a pair of registers. + loadP64 + // mov64 represents a MOV instruction. These are encoded as ORR's but we keep them separate for better handling. + mov64 + // mov32 represents a 32-bit MOV. This zeroes the top 32 bits of the destination. + mov32 + // movZ represents a MOVZ with a 16-bit immediate. + movZ + // movN represents a MOVN with a 16-bit immediate. + movN + // movK represents a MOVK with a 16-bit immediate. + movK + // extend represents a sign- or zero-extend operation. + extend + // cSel represents a conditional-select operation. + cSel + // cSet represents a conditional-set operation. + cSet + // cCmpImm represents a conditional comparison with an immediate. + cCmpImm + // fpuMov64 represents a FPU move. Distinct from a vector-register move; moving just 64 bits appears to be significantly faster. + fpuMov64 + // fpuMov128 represents a vector register move. 
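+	//
+	// Illustrative aside, not part of the original source (i0..i3 and dst are
+	// hypothetical): a 64-bit constant with no compact encoding is typically
+	// materialized with one movZ followed by up to three movK instructions, 16 bits
+	// at a time; the shift argument of asMOVZ/asMOVK above is given in units of 16
+	// bits. For example, loading 0x123456789ABCDEF0 into dst:
+	//
+	//	i0.asMOVZ(dst, 0xDEF0, 0, true) // movz dst, #0xdef0, lsl 0   (clears the remaining bits)
+	//	i1.asMOVK(dst, 0x9ABC, 1, true) // movk dst, #0x9abc, lsl 16
+	//	i2.asMOVK(dst, 0x5678, 2, true) // movk dst, #0x5678, lsl 32
+	//	i3.asMOVK(dst, 0x1234, 3, true) // movk dst, #0x1234, lsl 48
+	//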
+ fpuMov128 + // fpuMovFromVec represents a move to scalar from a vector element. + fpuMovFromVec + // fpuRR represents a 1-op FPU instruction. + fpuRR + // fpuRRR represents a 2-op FPU instruction. + fpuRRR + // fpuRRI represents a 2-op FPU instruction with immediate value. + fpuRRI + // fpuRRRR represents a 3-op FPU instruction. + fpuRRRR + // fpuCmp represents a FPU comparison, either 32 or 64 bit. + fpuCmp + // fpuLoad32 represents a floating-point load, single-precision (32 bit). + fpuLoad32 + // fpuStore32 represents a floating-point store, single-precision (32 bit). + fpuStore32 + // fpuLoad64 represents a floating-point load, double-precision (64 bit). + fpuLoad64 + // fpuStore64 represents a floating-point store, double-precision (64 bit). + fpuStore64 + // fpuLoad128 represents a floating-point/vector load, 128 bit. + fpuLoad128 + // fpuStore128 represents a floating-point/vector store, 128 bit. + fpuStore128 + // loadFpuConst32 represents a load of a 32-bit floating-point constant. + loadFpuConst32 + // loadFpuConst64 represents a load of a 64-bit floating-point constant. + loadFpuConst64 + // loadFpuConst128 represents a load of a 128-bit floating-point constant. + loadFpuConst128 + // vecLoad1R represents a load of a one single-element structure that replicates to all lanes of a vector. + vecLoad1R + // fpuToInt represents a conversion from FP to integer. + fpuToInt + // intToFpu represents a conversion from integer to FP. + intToFpu + // fpuCSel represents a 32/64-bit FP conditional select. + fpuCSel + // movToVec represents a move to a vector element from a GPR. + movToVec + // movFromVec represents an unsigned move from a vector element to a GPR. + movFromVec + // movFromVecSigned represents a signed move from a vector element to a GPR. + movFromVecSigned + // vecDup represents a duplication of general-purpose register to vector. + vecDup + // vecDupElement represents a duplication of a vector element to vector or scalar. + vecDupElement + // vecDupFromFpu represents a duplication of scalar to vector. + vecDupFromFpu + // vecExtract represents a vector extraction operation. + vecExtract + // vecExtend represents a vector extension operation. + vecExtend + // vecMovElement represents a move vector element to another vector element operation. + vecMovElement + // vecMiscNarrow represents a vector narrowing operation. + vecMiscNarrow + // vecRRR represents a vector ALU operation. + vecRRR + // vecRRRRewrite is exactly the same as vecRRR except that this rewrites the destination register. + // For example, BSL instruction rewrites the destination register, and the existing value influences the result. + // Therefore, the "destination" register in vecRRRRewrite will be treated as "use" which makes the register outlive + // the instruction while this instruction doesn't have "def" in the context of register allocation. + vecRRRRewrite + // vecMisc represents a vector two register miscellaneous instruction. + vecMisc + // vecLanes represents a vector instruction across lanes. + vecLanes + // vecShiftImm represents a SIMD scalar shift by immediate instruction. + vecShiftImm + // vecTbl represents a table vector lookup - single register table. + vecTbl + // vecTbl2 represents a table vector lookup - two register table. + vecTbl2 + // vecPermute represents a vector permute instruction. + vecPermute + // movToNZCV represents a move to the FPSR. + movToFPSR + // movFromNZCV represents a move from the FPSR. + movFromFPSR + // call represents a machine call instruction. 
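+	//
+	// Illustrative aside, not part of the original source (fnRef, ptr and i are
+	// hypothetical): a direct call carries an ssa.FuncRef whose target address is only
+	// patched in by the relocation pass at encoding time, while an indirect call
+	// branches and links through a register operand.
+	//
+	//	i.asCall(fnRef, nil)       // direct call; encoded as a placeholder plus relocation info
+	//	i.asCallIndirect(ptr, nil) // branch-with-link through the register holding the code pointer
+	//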
+ call + // callInd represents a machine indirect-call instruction. + callInd + // ret represents a machine return instruction. + ret + // br represents an unconditional branch. + br + // condBr represents a conditional branch. + condBr + // adr represents a compute the address (using a PC-relative offset) of a memory location. + adr + // brTableSequence represents a jump-table sequence. + brTableSequence + // exitSequence consists of multiple instructions, and exits the execution immediately. + // See encodeExitSequence. + exitSequence + // atomicRmw represents an atomic read-modify-write operation with two register sources and a register destination. + atomicRmw + // atomicCas represents an atomic compare-and-swap operation with three register sources. The value is loaded to + // the source register containing the comparison value. + atomicCas + // atomicLoad represents an atomic load with one source register and a register destination. + atomicLoad + // atomicStore represents an atomic store with two source registers and no destination. + atomicStore + // dmb represents the data memory barrier instruction in inner-shareable (ish) mode. + dmb + // UDF is the undefined instruction. For debugging only. + udf + // loadConstBlockArg represents a load of a constant block argument. + loadConstBlockArg + + // emitSourceOffsetInfo is a dummy instruction to emit source offset info. + // The existence of this instruction does not affect the execution. + emitSourceOffsetInfo + + // ------------------- do not define below this line ------------------- + numInstructionKinds +) + +func (i *instruction) asLoadConstBlockArg(v uint64, typ ssa.Type, dst regalloc.VReg) *instruction { + i.kind = loadConstBlockArg + i.u1 = v + i.u2 = uint64(typ) + i.rd = operandNR(dst) + return i +} + +func (i *instruction) loadConstBlockArgData() (v uint64, typ ssa.Type, dst regalloc.VReg) { + return i.u1, ssa.Type(i.u2), i.rd.nr() +} + +func (i *instruction) asEmitSourceOffsetInfo(l ssa.SourceOffset) *instruction { + i.kind = emitSourceOffsetInfo + i.u1 = uint64(l) + return i +} + +func (i *instruction) sourceOffsetInfo() ssa.SourceOffset { + return ssa.SourceOffset(i.u1) +} + +func (i *instruction) asUDF() *instruction { + i.kind = udf + return i +} + +func (i *instruction) asFpuToInt(rd, rn operand, rdSigned, src64bit, dst64bit bool) { + i.kind = fpuToInt + i.rn = rn + i.rd = rd + if rdSigned { + i.u1 = 1 + } + if src64bit { + i.u2 = 1 + } + if dst64bit { + i.u3 = 1 + } +} + +func (i *instruction) asIntToFpu(rd, rn operand, rnSigned, src64bit, dst64bit bool) { + i.kind = intToFpu + i.rn = rn + i.rd = rd + if rnSigned { + i.u1 = 1 + } + if src64bit { + i.u2 = 1 + } + if dst64bit { + i.u3 = 1 + } +} + +func (i *instruction) asExitSequence(ctx regalloc.VReg) *instruction { + i.kind = exitSequence + i.rn = operandNR(ctx) + return i +} + +// aluOp determines the type of ALU operation. Instructions whose kind is one of +// aluRRR, aluRRRR, aluRRImm12, aluRRBitmaskImm, aluRRImmShift, aluRRRShift and aluRRRExtend +// would use this type. 
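+//
+// Illustrative aside, not part of the original source (the operands are hypothetical):
+// the concrete instruction kind is not chosen by the caller but derived by asALU from
+// the form of the rm operand, so a single aluOp such as aluOpAdd can lower to several
+// encodings:
+//
+//	i.asALU(aluOpAdd, rd, rn, operandNR(rm), true) // plain register form -> kind aluRRR
+//	// an rm operand in imm12, shifted-register or extended-register form instead
+//	// selects aluRRImm12, aluRRRShift or aluRRRExtend respectively (see asALU above).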
+type aluOp int + +func (a aluOp) String() string { + switch a { + case aluOpAdd: + return "add" + case aluOpSub: + return "sub" + case aluOpOrr: + return "orr" + case aluOpOrn: + return "orn" + case aluOpAnd: + return "and" + case aluOpAnds: + return "ands" + case aluOpBic: + return "bic" + case aluOpEor: + return "eor" + case aluOpAddS: + return "adds" + case aluOpSubS: + return "subs" + case aluOpSMulH: + return "sMulH" + case aluOpUMulH: + return "uMulH" + case aluOpSDiv: + return "sdiv" + case aluOpUDiv: + return "udiv" + case aluOpRotR: + return "ror" + case aluOpLsr: + return "lsr" + case aluOpAsr: + return "asr" + case aluOpLsl: + return "lsl" + case aluOpMAdd: + return "madd" + case aluOpMSub: + return "msub" + } + panic(int(a)) +} + +const ( + // 32/64-bit Add. + aluOpAdd aluOp = iota + // 32/64-bit Subtract. + aluOpSub + // 32/64-bit Bitwise OR. + aluOpOrr + // 32/64-bit Bitwise OR NOT. + aluOpOrn + // 32/64-bit Bitwise AND. + aluOpAnd + // 32/64-bit Bitwise ANDS. + aluOpAnds + // 32/64-bit Bitwise AND NOT. + aluOpBic + // 32/64-bit Bitwise XOR (Exclusive OR). + aluOpEor + // 32/64-bit Add setting flags. + aluOpAddS + // 32/64-bit Subtract setting flags. + aluOpSubS + // Signed multiply, high-word result. + aluOpSMulH + // Unsigned multiply, high-word result. + aluOpUMulH + // 64-bit Signed divide. + aluOpSDiv + // 64-bit Unsigned divide. + aluOpUDiv + // 32/64-bit Rotate right. + aluOpRotR + // 32/64-bit Logical shift right. + aluOpLsr + // 32/64-bit Arithmetic shift right. + aluOpAsr + // 32/64-bit Logical shift left. + aluOpLsl /// Multiply-add + + // MAdd and MSub are only applicable for aluRRRR. + aluOpMAdd + aluOpMSub +) + +// vecOp determines the type of vector operation. Instructions whose kind is one of +// vecOpCnt would use this type. +type vecOp int + +// String implements fmt.Stringer. 
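+//
+// Illustrative aside, not part of the original source (rd, rn, rm, ra are hypothetical
+// operands): aluOpMAdd and aluOpMSub above are only used with the aluRRRR kind, built
+// via asALURRRR. In AArch64, madd computes ra + rn*rm and msub computes ra - rn*rm; a
+// plain multiply is a madd with ra set to the zero register.
+//
+//	i.asALURRRR(aluOpMAdd, rd, rn, rm, ra, true) // madd rd, rn, rm, ra (64-bit): rd = ra + rn*rm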
+func (b vecOp) String() string { + switch b { + case vecOpCnt: + return "cnt" + case vecOpCmeq: + return "cmeq" + case vecOpCmgt: + return "cmgt" + case vecOpCmhi: + return "cmhi" + case vecOpCmge: + return "cmge" + case vecOpCmhs: + return "cmhs" + case vecOpFcmeq: + return "fcmeq" + case vecOpFcmgt: + return "fcmgt" + case vecOpFcmge: + return "fcmge" + case vecOpCmeq0: + return "cmeq0" + case vecOpUaddlv: + return "uaddlv" + case vecOpBit: + return "bit" + case vecOpBic: + return "bic" + case vecOpBsl: + return "bsl" + case vecOpNot: + return "not" + case vecOpAnd: + return "and" + case vecOpOrr: + return "orr" + case vecOpEOR: + return "eor" + case vecOpFadd: + return "fadd" + case vecOpAdd: + return "add" + case vecOpAddp: + return "addp" + case vecOpAddv: + return "addv" + case vecOpSub: + return "sub" + case vecOpFsub: + return "fsub" + case vecOpSmin: + return "smin" + case vecOpUmin: + return "umin" + case vecOpUminv: + return "uminv" + case vecOpSmax: + return "smax" + case vecOpUmax: + return "umax" + case vecOpUmaxp: + return "umaxp" + case vecOpUrhadd: + return "urhadd" + case vecOpFmul: + return "fmul" + case vecOpSqrdmulh: + return "sqrdmulh" + case vecOpMul: + return "mul" + case vecOpUmlal: + return "umlal" + case vecOpFdiv: + return "fdiv" + case vecOpFsqrt: + return "fsqrt" + case vecOpAbs: + return "abs" + case vecOpFabs: + return "fabs" + case vecOpNeg: + return "neg" + case vecOpFneg: + return "fneg" + case vecOpFrintp: + return "frintp" + case vecOpFrintm: + return "frintm" + case vecOpFrintn: + return "frintn" + case vecOpFrintz: + return "frintz" + case vecOpFcvtl: + return "fcvtl" + case vecOpFcvtn: + return "fcvtn" + case vecOpFcvtzu: + return "fcvtzu" + case vecOpFcvtzs: + return "fcvtzs" + case vecOpScvtf: + return "scvtf" + case vecOpUcvtf: + return "ucvtf" + case vecOpSqxtn: + return "sqxtn" + case vecOpUqxtn: + return "uqxtn" + case vecOpSqxtun: + return "sqxtun" + case vecOpRev64: + return "rev64" + case vecOpXtn: + return "xtn" + case vecOpShll: + return "shll" + case vecOpSshl: + return "sshl" + case vecOpSshll: + return "sshll" + case vecOpUshl: + return "ushl" + case vecOpUshll: + return "ushll" + case vecOpSshr: + return "sshr" + case vecOpZip1: + return "zip1" + case vecOpFmin: + return "fmin" + case vecOpFmax: + return "fmax" + case vecOpSmull: + return "smull" + case vecOpSmull2: + return "smull2" + } + panic(int(b)) +} + +const ( + vecOpCnt vecOp = iota + vecOpCmeq0 + vecOpCmeq + vecOpCmgt + vecOpCmhi + vecOpCmge + vecOpCmhs + vecOpFcmeq + vecOpFcmgt + vecOpFcmge + vecOpUaddlv + vecOpBit + vecOpBic + vecOpBsl + vecOpNot + vecOpAnd + vecOpOrr + vecOpEOR + vecOpAdd + vecOpFadd + vecOpAddv + vecOpSqadd + vecOpUqadd + vecOpAddp + vecOpSub + vecOpFsub + vecOpSqsub + vecOpUqsub + vecOpSmin + vecOpUmin + vecOpUminv + vecOpFmin + vecOpSmax + vecOpUmax + vecOpUmaxp + vecOpFmax + vecOpUrhadd + vecOpMul + vecOpFmul + vecOpSqrdmulh + vecOpUmlal + vecOpFdiv + vecOpFsqrt + vecOpAbs + vecOpFabs + vecOpNeg + vecOpFneg + vecOpFrintm + vecOpFrintn + vecOpFrintp + vecOpFrintz + vecOpFcvtl + vecOpFcvtn + vecOpFcvtzs + vecOpFcvtzu + vecOpScvtf + vecOpUcvtf + vecOpSqxtn + vecOpSqxtun + vecOpUqxtn + vecOpRev64 + vecOpXtn + vecOpShll + vecOpSshl + vecOpSshll + vecOpUshl + vecOpUshll + vecOpSshr + vecOpZip1 + vecOpSmull + vecOpSmull2 +) + +// bitOp determines the type of bitwise operation. Instructions whose kind is one of +// bitOpRbit and bitOpClz would use this type. +type bitOp int + +// String implements fmt.Stringer. 
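+//
+// Illustrative aside, not part of the original source (the operands are hypothetical):
+// a lane-wise vector operation pairs a vecOp with a vecArrangement, and String() above
+// renders the arrangement as the lane suffix, e.g. something like "add v0.4s, v1.4s, v2.4s":
+//
+//	i.asVecRRR(vecOpAdd, rd, rn, rm, vecArrangement4S) // 4 x 32-bit integer add
+//	// ops that also read the destination (bsl, bit, umlal) must be built with
+//	// asVecRRRRewrite instead, so the register allocator treats rd as a use.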
+func (b bitOp) String() string { + switch b { + case bitOpRbit: + return "rbit" + case bitOpClz: + return "clz" + } + panic(int(b)) +} + +const ( + // 32/64-bit Rbit. + bitOpRbit bitOp = iota + // 32/64-bit Clz. + bitOpClz +) + +// fpuUniOp represents a unary floating-point unit (FPU) operation. +type fpuUniOp byte + +const ( + fpuUniOpNeg fpuUniOp = iota + fpuUniOpCvt32To64 + fpuUniOpCvt64To32 + fpuUniOpSqrt + fpuUniOpRoundPlus + fpuUniOpRoundMinus + fpuUniOpRoundZero + fpuUniOpRoundNearest + fpuUniOpAbs +) + +// String implements the fmt.Stringer. +func (f fpuUniOp) String() string { + switch f { + case fpuUniOpNeg: + return "fneg" + case fpuUniOpCvt32To64: + return "fcvt" + case fpuUniOpCvt64To32: + return "fcvt" + case fpuUniOpSqrt: + return "fsqrt" + case fpuUniOpRoundPlus: + return "frintp" + case fpuUniOpRoundMinus: + return "frintm" + case fpuUniOpRoundZero: + return "frintz" + case fpuUniOpRoundNearest: + return "frintn" + case fpuUniOpAbs: + return "fabs" + } + panic(int(f)) +} + +// fpuBinOp represents a binary floating-point unit (FPU) operation. +type fpuBinOp byte + +const ( + fpuBinOpAdd = iota + fpuBinOpSub + fpuBinOpMul + fpuBinOpDiv + fpuBinOpMax + fpuBinOpMin +) + +// String implements the fmt.Stringer. +func (f fpuBinOp) String() string { + switch f { + case fpuBinOpAdd: + return "fadd" + case fpuBinOpSub: + return "fsub" + case fpuBinOpMul: + return "fmul" + case fpuBinOpDiv: + return "fdiv" + case fpuBinOpMax: + return "fmax" + case fpuBinOpMin: + return "fmin" + } + panic(int(f)) +} + +// extMode represents the mode of a register operand extension. +// For example, aluRRRExtend instructions need this info to determine the extensions. +type extMode byte + +const ( + extModeNone extMode = iota + // extModeZeroExtend64 suggests a zero-extension to 32 bits if the original bit size is less than 32. + extModeZeroExtend32 + // extModeSignExtend64 stands for a sign-extension to 32 bits if the original bit size is less than 32. + extModeSignExtend32 + // extModeZeroExtend64 suggests a zero-extension to 64 bits if the original bit size is less than 64. + extModeZeroExtend64 + // extModeSignExtend64 stands for a sign-extension to 64 bits if the original bit size is less than 64. + extModeSignExtend64 +) + +func (e extMode) bits() byte { + switch e { + case extModeZeroExtend32, extModeSignExtend32: + return 32 + case extModeZeroExtend64, extModeSignExtend64: + return 64 + default: + return 0 + } +} + +func (e extMode) signed() bool { + switch e { + case extModeSignExtend32, extModeSignExtend64: + return true + default: + return false + } +} + +func extModeOf(t ssa.Type, signed bool) extMode { + switch t.Bits() { + case 32: + if signed { + return extModeSignExtend32 + } + return extModeZeroExtend32 + case 64: + if signed { + return extModeSignExtend64 + } + return extModeZeroExtend64 + default: + panic("TODO? do we need narrower than 32 bits?") + } +} + +type extendOp byte + +const ( + extendOpUXTB extendOp = 0b000 + extendOpUXTH extendOp = 0b001 + extendOpUXTW extendOp = 0b010 + // extendOpUXTX does nothing, but convenient symbol that officially exists. See: + // https://stackoverflow.com/questions/72041372/what-do-the-uxtx-and-sxtx-extensions-mean-for-32-bit-aarch64-adds-instruct + extendOpUXTX extendOp = 0b011 + extendOpSXTB extendOp = 0b100 + extendOpSXTH extendOp = 0b101 + extendOpSXTW extendOp = 0b110 + // extendOpSXTX does nothing, but convenient symbol that officially exists. 
See: + // https://stackoverflow.com/questions/72041372/what-do-the-uxtx-and-sxtx-extensions-mean-for-32-bit-aarch64-adds-instruct + extendOpSXTX extendOp = 0b111 + extendOpNone extendOp = 0xff +) + +func (e extendOp) srcBits() byte { + switch e { + case extendOpUXTB, extendOpSXTB: + return 8 + case extendOpUXTH, extendOpSXTH: + return 16 + case extendOpUXTW, extendOpSXTW: + return 32 + case extendOpUXTX, extendOpSXTX: + return 64 + } + panic(int(e)) +} + +func (e extendOp) String() string { + switch e { + case extendOpUXTB: + return "UXTB" + case extendOpUXTH: + return "UXTH" + case extendOpUXTW: + return "UXTW" + case extendOpUXTX: + return "UXTX" + case extendOpSXTB: + return "SXTB" + case extendOpSXTH: + return "SXTH" + case extendOpSXTW: + return "SXTW" + case extendOpSXTX: + return "SXTX" + } + panic(int(e)) +} + +func extendOpFrom(signed bool, from byte) extendOp { + switch from { + case 8: + if signed { + return extendOpSXTB + } + return extendOpUXTB + case 16: + if signed { + return extendOpSXTH + } + return extendOpUXTH + case 32: + if signed { + return extendOpSXTW + } + return extendOpUXTW + case 64: + if signed { + return extendOpSXTX + } + return extendOpUXTX + } + panic("invalid extendOpFrom") +} + +type shiftOp byte + +const ( + shiftOpLSL shiftOp = 0b00 + shiftOpLSR shiftOp = 0b01 + shiftOpASR shiftOp = 0b10 + shiftOpROR shiftOp = 0b11 +) + +func (s shiftOp) String() string { + switch s { + case shiftOpLSL: + return "lsl" + case shiftOpLSR: + return "lsr" + case shiftOpASR: + return "asr" + case shiftOpROR: + return "ror" + } + panic(int(s)) +} + +const exitSequenceSize = 6 * 4 // 6 instructions as in encodeExitSequence. + +// size returns the size of the instruction in encoded bytes. +func (i *instruction) size() int64 { + switch i.kind { + case exitSequence: + return exitSequenceSize // 5 instructions as in encodeExitSequence. + case nop0, loadConstBlockArg: + return 0 + case emitSourceOffsetInfo: + return 0 + case loadFpuConst32: + if i.u1 == 0 { + return 4 // zero loading can be encoded as a single instruction. + } + return 4 + 4 + 4 + case loadFpuConst64: + if i.u1 == 0 { + return 4 // zero loading can be encoded as a single instruction. + } + return 4 + 4 + 8 + case loadFpuConst128: + if i.u1 == 0 && i.u2 == 0 { + return 4 // zero loading can be encoded as a single instruction. + } + return 4 + 4 + 16 + case brTableSequence: + return 4*4 + int64(i.u2)*4 + default: + return 4 + } +} + +// vecArrangement is the arrangement of data within a vector register. +type vecArrangement byte + +const ( + // vecArrangementNone is an arrangement indicating no data is stored. + vecArrangementNone vecArrangement = iota + // vecArrangement8B is an arrangement of 8 bytes (64-bit vector) + vecArrangement8B + // vecArrangement16B is an arrangement of 16 bytes (128-bit vector) + vecArrangement16B + // vecArrangement4H is an arrangement of 4 half precisions (64-bit vector) + vecArrangement4H + // vecArrangement8H is an arrangement of 8 half precisions (128-bit vector) + vecArrangement8H + // vecArrangement2S is an arrangement of 2 single precisions (64-bit vector) + vecArrangement2S + // vecArrangement4S is an arrangement of 4 single precisions (128-bit vector) + vecArrangement4S + // vecArrangement1D is an arrangement of 1 double precision (64-bit vector) + vecArrangement1D + // vecArrangement2D is an arrangement of 2 double precisions (128-bit vector) + vecArrangement2D + + // Assign each vector size specifier to a vector arrangement ID. 
+ // Instructions can only have an arrangement or a size specifier, but not both, so it + // simplifies the internal representation of vector instructions by being able to + // store either into the same field. + + // vecArrangementB is a size specifier of byte + vecArrangementB + // vecArrangementH is a size specifier of word (16-bit) + vecArrangementH + // vecArrangementS is a size specifier of double word (32-bit) + vecArrangementS + // vecArrangementD is a size specifier of quad word (64-bit) + vecArrangementD + // vecArrangementQ is a size specifier of the entire vector (128-bit) + vecArrangementQ +) + +// String implements fmt.Stringer +func (v vecArrangement) String() (ret string) { + switch v { + case vecArrangement8B: + ret = "8B" + case vecArrangement16B: + ret = "16B" + case vecArrangement4H: + ret = "4H" + case vecArrangement8H: + ret = "8H" + case vecArrangement2S: + ret = "2S" + case vecArrangement4S: + ret = "4S" + case vecArrangement1D: + ret = "1D" + case vecArrangement2D: + ret = "2D" + case vecArrangementB: + ret = "B" + case vecArrangementH: + ret = "H" + case vecArrangementS: + ret = "S" + case vecArrangementD: + ret = "D" + case vecArrangementQ: + ret = "Q" + case vecArrangementNone: + ret = "none" + default: + panic(v) + } + return +} + +// vecIndex is the index of an element of a vector register +type vecIndex byte + +// vecIndexNone indicates no vector index specified. +const vecIndexNone = ^vecIndex(0) + +func ssaLaneToArrangement(lane ssa.VecLane) vecArrangement { + switch lane { + case ssa.VecLaneI8x16: + return vecArrangement16B + case ssa.VecLaneI16x8: + return vecArrangement8H + case ssa.VecLaneI32x4: + return vecArrangement4S + case ssa.VecLaneI64x2: + return vecArrangement2D + case ssa.VecLaneF32x4: + return vecArrangement4S + case ssa.VecLaneF64x2: + return vecArrangement2D + default: + panic(lane) + } +} + +// atomicRmwOp is the type of atomic read-modify-write operation. +type atomicRmwOp byte + +const ( + // atomicRmwOpAdd is an atomic add operation. + atomicRmwOpAdd atomicRmwOp = iota + // atomicRmwOpClr is an atomic clear operation, i.e. AND NOT. + atomicRmwOpClr + // atomicRmwOpSet is an atomic set operation, i.e. OR. + atomicRmwOpSet + // atomicRmwOpEor is an atomic exclusive OR operation. + atomicRmwOpEor + // atomicRmwOpSwp is an atomic swap operation. + atomicRmwOpSwp +) + +// String implements fmt.Stringer +func (a atomicRmwOp) String() string { + switch a { + case atomicRmwOpAdd: + return "ldaddal" + case atomicRmwOpClr: + return "ldclral" + case atomicRmwOpSet: + return "ldsetal" + case atomicRmwOpEor: + return "ldeoral" + case atomicRmwOpSwp: + return "swpal" + } + panic(fmt.Sprintf("unknown atomicRmwOp: %d", a)) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go new file mode 100644 index 000000000..227a96474 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go @@ -0,0 +1,2351 @@ +package arm64 + +import ( + "context" + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// Encode implements backend.Machine Encode. 
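+//
+// Illustrative note, not part of the original source: encoding is a two-step process.
+// The machine first lays every instruction out using size() (4 bytes for most kinds,
+// longer for constant loads, branch tables and the exit sequence) so that branch
+// offsets can be resolved, and only then walks the instruction list emitting machine
+// code words, roughly:
+//
+//	m.resolveRelativeAddresses(ctx)          // assign byte offsets using size()
+//	m.encode(m.executableContext.RootInstr)  // walk the linked list and emit words
+//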
+func (m *machine) Encode(ctx context.Context) error { + m.resolveRelativeAddresses(ctx) + m.encode(m.executableContext.RootInstr) + if l := len(m.compiler.Buf()); l > maxFunctionExecutableSize { + return fmt.Errorf("function size exceeds the limit: %d > %d", l, maxFunctionExecutableSize) + } + return nil +} + +func (m *machine) encode(root *instruction) { + for cur := root; cur != nil; cur = cur.next { + cur.encode(m) + } +} + +func (i *instruction) encode(m *machine) { + c := m.compiler + switch kind := i.kind; kind { + case nop0, emitSourceOffsetInfo, loadConstBlockArg: + case exitSequence: + encodeExitSequence(c, i.rn.reg()) + case ret: + // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/RET--Return-from-subroutine-?lang=en + c.Emit4Bytes(encodeRet()) + case br: + imm := i.brOffset() + c.Emit4Bytes(encodeUnconditionalBranch(false, imm)) + case call: + // We still don't know the exact address of the function to call, so we emit a placeholder. + c.AddRelocationInfo(i.callFuncRef()) + c.Emit4Bytes(encodeUnconditionalBranch(true, 0)) // 0 = placeholder + case callInd: + c.Emit4Bytes(encodeUnconditionalBranchReg(regNumberInEncoding[i.rn.realReg()], true)) + case store8, store16, store32, store64, fpuStore32, fpuStore64, fpuStore128: + c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rn.realReg()], i.amode)) + case uLoad8, uLoad16, uLoad32, uLoad64, sLoad8, sLoad16, sLoad32, fpuLoad32, fpuLoad64, fpuLoad128: + c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rd.realReg()], i.amode)) + case vecLoad1R: + c.Emit4Bytes(encodeVecLoad1R( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + vecArrangement(i.u1))) + case condBr: + imm19 := i.condBrOffset() + if imm19%4 != 0 { + panic("imm26 for branch must be a multiple of 4") + } + + imm19U32 := uint32(imm19/4) & 0b111_11111111_11111111 + brCond := i.condBrCond() + switch brCond.kind() { + case condKindRegisterZero: + rt := regNumberInEncoding[brCond.register().RealReg()] + c.Emit4Bytes(encodeCBZCBNZ(rt, false, imm19U32, i.condBr64bit())) + case condKindRegisterNotZero: + rt := regNumberInEncoding[brCond.register().RealReg()] + c.Emit4Bytes(encodeCBZCBNZ(rt, true, imm19U32, i.condBr64bit())) + case condKindCondFlagSet: + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/B-cond--Branch-conditionally- + fl := brCond.flag() + c.Emit4Bytes(0b01010100<<24 | (imm19U32 << 5) | uint32(fl)) + default: + panic("BUG") + } + case movN: + c.Emit4Bytes(encodeMoveWideImmediate(0b00, regNumberInEncoding[i.rd.realReg()], i.u1, i.u2, i.u3)) + case movZ: + c.Emit4Bytes(encodeMoveWideImmediate(0b10, regNumberInEncoding[i.rd.realReg()], i.u1, i.u2, i.u3)) + case movK: + c.Emit4Bytes(encodeMoveWideImmediate(0b11, regNumberInEncoding[i.rd.realReg()], i.u1, i.u2, i.u3)) + case mov32: + to, from := i.rd.realReg(), i.rn.realReg() + c.Emit4Bytes(encodeAsMov32(regNumberInEncoding[from], regNumberInEncoding[to])) + case mov64: + to, from := i.rd.realReg(), i.rn.realReg() + toIsSp := to == sp + fromIsSp := from == sp + c.Emit4Bytes(encodeMov64(regNumberInEncoding[to], regNumberInEncoding[from], toIsSp, fromIsSp)) + case loadP64, storeP64: + rt, rt2 := regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()] + amode := i.amode + rn := regNumberInEncoding[amode.rn.RealReg()] + var pre bool + switch amode.kind { + case addressModeKindPostIndex: + case addressModeKindPreIndex: + pre = true + default: + panic("BUG") + } + 
c.Emit4Bytes(encodePreOrPostIndexLoadStorePair64(pre, kind == loadP64, rn, rt, rt2, amode.imm)) + case loadFpuConst32: + rd := regNumberInEncoding[i.rd.realReg()] + if i.u1 == 0 { + c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement8B)) + } else { + encodeLoadFpuConst32(c, rd, i.u1) + } + case loadFpuConst64: + rd := regNumberInEncoding[i.rd.realReg()] + if i.u1 == 0 { + c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement8B)) + } else { + encodeLoadFpuConst64(c, regNumberInEncoding[i.rd.realReg()], i.u1) + } + case loadFpuConst128: + rd := regNumberInEncoding[i.rd.realReg()] + lo, hi := i.u1, i.u2 + if lo == 0 && hi == 0 { + c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement16B)) + } else { + encodeLoadFpuConst128(c, rd, lo, hi) + } + case aluRRRR: + c.Emit4Bytes(encodeAluRRRR( + aluOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + regNumberInEncoding[i.ra.realReg()], + uint32(i.u3), + )) + case aluRRImmShift: + c.Emit4Bytes(encodeAluRRImm( + aluOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + uint32(i.rm.shiftImm()), + uint32(i.u3), + )) + case aluRRR: + rn := i.rn.realReg() + c.Emit4Bytes(encodeAluRRR( + aluOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[rn], + regNumberInEncoding[i.rm.realReg()], + i.u3 == 1, + rn == sp, + )) + case aluRRRExtend: + rm, exo, to := i.rm.er() + c.Emit4Bytes(encodeAluRRRExtend( + aluOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[rm.RealReg()], + exo, + to, + )) + case aluRRRShift: + r, amt, sop := i.rm.sr() + c.Emit4Bytes(encodeAluRRRShift( + aluOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[r.RealReg()], + uint32(amt), + sop, + i.u3 == 1, + )) + case aluRRBitmaskImm: + c.Emit4Bytes(encodeAluBitmaskImmediate( + aluOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + i.u2, + i.u3 == 1, + )) + case bitRR: + c.Emit4Bytes(encodeBitRR( + bitOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + uint32(i.u2)), + ) + case aluRRImm12: + imm12, shift := i.rm.imm12() + c.Emit4Bytes(encodeAluRRImm12( + aluOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + imm12, shift, + i.u3 == 1, + )) + case fpuRRR: + c.Emit4Bytes(encodeFpuRRR( + fpuBinOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + i.u3 == 1, + )) + case fpuMov64, fpuMov128: + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/MOV--vector---Move-vector--an-alias-of-ORR--vector--register-- + rd := regNumberInEncoding[i.rd.realReg()] + rn := regNumberInEncoding[i.rn.realReg()] + var q uint32 + if kind == fpuMov128 { + q = 0b1 + } + c.Emit4Bytes(q<<30 | 0b1110101<<21 | rn<<16 | 0b000111<<10 | rn<<5 | rd) + case cSet: + rd := regNumberInEncoding[i.rd.realReg()] + cf := condFlag(i.u1) + if i.u2 == 1 { + // https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/CSETM--Conditional-Set-Mask--an-alias-of-CSINV- + // Note that we set 64bit version here. 
+ c.Emit4Bytes(0b1101101010011111<<16 | uint32(cf.invert())<<12 | 0b011111<<5 | rd) + } else { + // https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/CSET--Conditional-Set--an-alias-of-CSINC- + // Note that we set 64bit version here. + c.Emit4Bytes(0b1001101010011111<<16 | uint32(cf.invert())<<12 | 0b111111<<5 | rd) + } + case extend: + c.Emit4Bytes(encodeExtend(i.u3 == 1, byte(i.u1), byte(i.u2), regNumberInEncoding[i.rd.realReg()], regNumberInEncoding[i.rn.realReg()])) + case fpuCmp: + // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/FCMP--Floating-point-quiet-Compare--scalar--?lang=en + rn, rm := regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()] + var ftype uint32 + if i.u3 == 1 { + ftype = 0b01 // double precision. + } + c.Emit4Bytes(0b1111<<25 | ftype<<22 | 1<<21 | rm<<16 | 0b1<<13 | rn<<5) + case udf: + // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/UDF--Permanently-Undefined-?lang=en + if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable { + c.Emit4Bytes(dummyInstruction) + } else { + c.Emit4Bytes(0) + } + case adr: + c.Emit4Bytes(encodeAdr(regNumberInEncoding[i.rd.realReg()], uint32(i.u1))) + case cSel: + c.Emit4Bytes(encodeConditionalSelect( + kind, + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + condFlag(i.u1), + i.u3 == 1, + )) + case fpuCSel: + c.Emit4Bytes(encodeFpuCSel( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + condFlag(i.u1), + i.u3 == 1, + )) + case movToVec: + c.Emit4Bytes(encodeMoveToVec( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + vecArrangement(byte(i.u1)), + vecIndex(i.u2), + )) + case movFromVec, movFromVecSigned: + c.Emit4Bytes(encodeMoveFromVec( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + vecArrangement(byte(i.u1)), + vecIndex(i.u2), + i.kind == movFromVecSigned, + )) + case vecDup: + c.Emit4Bytes(encodeVecDup( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + vecArrangement(byte(i.u1)))) + case vecDupElement: + c.Emit4Bytes(encodeVecDupElement( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + vecArrangement(byte(i.u1)), + vecIndex(i.u2))) + case vecExtract: + c.Emit4Bytes(encodeVecExtract( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + vecArrangement(byte(i.u1)), + uint32(i.u2))) + case vecPermute: + c.Emit4Bytes(encodeVecPermute( + vecOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + vecArrangement(byte(i.u2)))) + case vecMovElement: + c.Emit4Bytes(encodeVecMovElement( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + vecArrangement(i.u1), + uint32(i.u2), uint32(i.u3), + )) + case vecMisc: + c.Emit4Bytes(encodeAdvancedSIMDTwoMisc( + vecOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + vecArrangement(i.u2), + )) + case vecLanes: + c.Emit4Bytes(encodeVecLanes( + vecOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + vecArrangement(i.u2), + )) + case vecShiftImm: + c.Emit4Bytes(encodeVecShiftImm( + vecOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + uint32(i.rm.shiftImm()), + 
vecArrangement(i.u2), + )) + case vecTbl: + c.Emit4Bytes(encodeVecTbl( + 1, + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + vecArrangement(i.u2)), + ) + case vecTbl2: + c.Emit4Bytes(encodeVecTbl( + 2, + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + vecArrangement(i.u2)), + ) + case brTableSequence: + targets := m.jmpTableTargets[i.u1] + encodeBrTableSequence(c, i.rn.reg(), targets) + case fpuToInt, intToFpu: + c.Emit4Bytes(encodeCnvBetweenFloatInt(i)) + case fpuRR: + c.Emit4Bytes(encodeFloatDataOneSource( + fpuUniOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + i.u3 == 1, + )) + case vecRRR: + if op := vecOp(i.u1); op == vecOpBsl || op == vecOpBit || op == vecOpUmlal { + panic(fmt.Sprintf("vecOp %s must use vecRRRRewrite instead of vecRRR", op.String())) + } + fallthrough + case vecRRRRewrite: + c.Emit4Bytes(encodeVecRRR( + vecOp(i.u1), + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + vecArrangement(i.u2), + )) + case cCmpImm: + // Conditional compare (immediate) in https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en + sf := uint32(i.u3 & 0b1) + nzcv := uint32(i.u2 & 0b1111) + cond := uint32(condFlag(i.u1)) + imm := uint32(i.rm.data & 0b11111) + rn := regNumberInEncoding[i.rn.realReg()] + c.Emit4Bytes( + sf<<31 | 0b111101001<<22 | imm<<16 | cond<<12 | 0b1<<11 | rn<<5 | nzcv, + ) + case movFromFPSR: + rt := regNumberInEncoding[i.rd.realReg()] + c.Emit4Bytes(encodeSystemRegisterMove(rt, true)) + case movToFPSR: + rt := regNumberInEncoding[i.rn.realReg()] + c.Emit4Bytes(encodeSystemRegisterMove(rt, false)) + case atomicRmw: + c.Emit4Bytes(encodeAtomicRmw( + atomicRmwOp(i.u1), + regNumberInEncoding[i.rm.realReg()], + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rn.realReg()], + uint32(i.u2), + )) + case atomicCas: + c.Emit4Bytes(encodeAtomicCas( + regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rm.realReg()], + regNumberInEncoding[i.rn.realReg()], + uint32(i.u2), + )) + case atomicLoad: + c.Emit4Bytes(encodeAtomicLoadStore( + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rd.realReg()], + uint32(i.u2), + 1, + )) + case atomicStore: + c.Emit4Bytes(encodeAtomicLoadStore( + regNumberInEncoding[i.rn.realReg()], + regNumberInEncoding[i.rm.realReg()], + uint32(i.u2), + 0, + )) + case dmb: + c.Emit4Bytes(encodeDMB()) + default: + panic(i.String()) + } +} + +func encodeMov64(rd, rn uint32, toIsSp, fromIsSp bool) uint32 { + if toIsSp || fromIsSp { + // This is an alias of ADD (immediate): + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/MOV--to-from-SP---Move-between-register-and-stack-pointer--an-alias-of-ADD--immediate-- + return encodeAddSubtractImmediate(0b100, 0, 0, rn, rd) + } else { + // This is an alias of ORR (shifted register): + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/MOV--register---Move--register---an-alias-of-ORR--shifted-register-- + return encodeLogicalShiftedRegister(0b101, 0, rn, 0, regNumberInEncoding[xzr], rd) + } +} + +// encodeSystemRegisterMove encodes as "System register move" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en +// +// Note that currently we only supports 
read/write of FPSR. +func encodeSystemRegisterMove(rt uint32, fromSystem bool) uint32 { + ret := 0b11010101<<24 | 0b11011<<16 | 0b01000100<<8 | 0b001<<5 | rt + if fromSystem { + ret |= 0b1 << 21 + } + return ret +} + +// encodeVecRRR encodes as either "Advanced SIMD three *" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func encodeVecRRR(op vecOp, rd, rn, rm uint32, arr vecArrangement) uint32 { + switch op { + case vecOpBit: + _, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, 0b10 /* always has size 0b10 */, 0b1, q) + case vecOpBic: + if arr > vecArrangement16B { + panic("unsupported arrangement: " + arr.String()) + } + _, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, 0b01 /* always has size 0b01 */, 0b0, q) + case vecOpBsl: + if arr > vecArrangement16B { + panic("unsupported arrangement: " + arr.String()) + } + _, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, 0b01 /* always has size 0b01 */, 0b1, q) + case vecOpAnd: + if arr > vecArrangement16B { + panic("unsupported arrangement: " + arr.String()) + } + _, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, 0b00 /* always has size 0b00 */, 0b0, q) + case vecOpOrr: + _, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, 0b10 /* always has size 0b10 */, 0b0, q) + case vecOpEOR: + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00011, size, 0b1, q) + case vecOpCmeq: + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10001, size, 0b1, q) + case vecOpCmgt: + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00110, size, 0b0, q) + case vecOpCmhi: + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00110, size, 0b1, q) + case vecOpCmge: + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00111, size, 0b0, q) + case vecOpCmhs: + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00111, size, 0b1, q) + case vecOpFcmeq: + var size, q uint32 + switch arr { + case vecArrangement4S: + size, q = 0b00, 0b1 + case vecArrangement2S: + size, q = 0b00, 0b0 + case vecArrangement2D: + size, q = 0b01, 0b1 + default: + panic("unsupported arrangement: " + arr.String()) + } + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11100, size, 0b0, q) + case vecOpFcmgt: + if arr < vecArrangement2S || arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11100, size, 0b1, q) + case vecOpFcmge: + var size, q uint32 + switch arr { + case vecArrangement4S: + size, q = 0b00, 0b1 + case vecArrangement2S: + size, q = 0b00, 0b0 + case vecArrangement2D: + size, q = 0b01, 0b1 + default: + panic("unsupported arrangement: " + arr.String()) + } + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11100, size, 0b1, q) + case vecOpAdd: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10000, size, 0b0, q) + case vecOpSqadd: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return 
encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00001, size, 0b0, q) + case vecOpUqadd: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00001, size, 0b1, q) + case vecOpAddp: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10111, size, 0b0, q) + case vecOpSqsub: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00101, size, 0b0, q) + case vecOpUqsub: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00101, size, 0b1, q) + case vecOpSub: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10000, size, 0b1, q) + case vecOpFmin: + if arr < vecArrangement2S || arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11110, size, 0b0, q) + case vecOpSmin: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01101, size, 0b0, q) + case vecOpUmin: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01101, size, 0b1, q) + case vecOpFmax: + var size, q uint32 + switch arr { + case vecArrangement4S: + size, q = 0b00, 0b1 + case vecArrangement2S: + size, q = 0b00, 0b0 + case vecArrangement2D: + size, q = 0b01, 0b1 + default: + panic("unsupported arrangement: " + arr.String()) + } + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11110, size, 0b0, q) + case vecOpFadd: + var size, q uint32 + switch arr { + case vecArrangement4S: + size, q = 0b00, 0b1 + case vecArrangement2S: + size, q = 0b00, 0b0 + case vecArrangement2D: + size, q = 0b01, 0b1 + default: + panic("unsupported arrangement: " + arr.String()) + } + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11010, size, 0b0, q) + case vecOpFsub: + if arr < vecArrangement2S || arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11010, size, 0b0, q) + case vecOpFmul: + var size, q uint32 + switch arr { + case vecArrangement4S: + size, q = 0b00, 0b1 + case vecArrangement2S: + size, q = 0b00, 0b0 + case vecArrangement2D: + size, q = 0b01, 0b1 + default: + panic("unsupported arrangement: " + arr.String()) + } + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11011, size, 0b1, q) + case vecOpSqrdmulh: + if arr < vecArrangement4H || arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10110, size, 0b1, q) + case vecOpFdiv: + var size, q uint32 + switch arr { + case vecArrangement4S: + size, q = 0b00, 0b1 + case vecArrangement2S: + size, q = 0b00, 0b0 + case vecArrangement2D: + size, q = 0b01, 0b1 + default: + panic("unsupported arrangement: " + arr.String()) + } + return 
encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b11111, size, 0b1, q) + case vecOpSmax: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01100, size, 0b0, q) + case vecOpUmax: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01100, size, 0b1, q) + case vecOpUmaxp: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10100, size, 0b1, q) + case vecOpUrhadd: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b00010, size, 0b1, q) + case vecOpMul: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b10011, size, 0b0, q) + case vecOpUmlal: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeDifferent(rd, rn, rm, 0b1000, size, 0b1, q) + case vecOpSshl: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01000, size, 0b0, q) + case vecOpUshl: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeSame(rd, rn, rm, 0b01000, size, 0b1, q) + + case vecOpSmull: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, _ := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeDifferent(rd, rn, rm, 0b1100, size, 0b0, 0b0) + + case vecOpSmull2: + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, _ := arrToSizeQEncoded(arr) + return encodeAdvancedSIMDThreeDifferent(rd, rn, rm, 0b1100, size, 0b0, 0b1) + + default: + panic("TODO: " + op.String()) + } +} + +func arrToSizeQEncoded(arr vecArrangement) (size, q uint32) { + switch arr { + case vecArrangement16B: + q = 0b1 + fallthrough + case vecArrangement8B: + size = 0b00 + case vecArrangement8H: + q = 0b1 + fallthrough + case vecArrangement4H: + size = 0b01 + case vecArrangement4S: + q = 0b1 + fallthrough + case vecArrangement2S: + size = 0b10 + case vecArrangement2D: + q = 0b1 + fallthrough + case vecArrangement1D: + size = 0b11 + default: + panic("BUG") + } + return +} + +// encodeAdvancedSIMDThreeSame encodes as "Advanced SIMD three same" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func encodeAdvancedSIMDThreeSame(rd, rn, rm, opcode, size, U, Q uint32) uint32 { + return Q<<30 | U<<29 | 0b111<<25 | size<<22 | 0b1<<21 | rm<<16 | opcode<<11 | 0b1<<10 | rn<<5 | rd +} + +// encodeAdvancedSIMDThreeDifferent encodes as "Advanced SIMD three different" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func encodeAdvancedSIMDThreeDifferent(rd, rn, rm, opcode, size, U, Q uint32) uint32 { + return Q<<30 | U<<29 | 0b111<<25 | size<<22 | 0b1<<21 | rm<<16 | opcode<<12 | rn<<5 | rd +} 
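+
+// As a quick sanity check of the layout above: ADD V0.16B, V1.16B, V2.16B corresponds to
+//
+//	encodeAdvancedSIMDThreeSame(0 /*rd*/, 1 /*rn*/, 2 /*rm*/, 0b10000, 0b00, 0 /*U*/, 1 /*Q*/)
+//
+// which evaluates to 0x4E228420: Q=1 selects the 128-bit form, U=0 and opcode=0b10000 select integer ADD.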
+ +// encodeFloatDataOneSource encodes as "Floating-point data-processing (1 source)" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp +func encodeFloatDataOneSource(op fpuUniOp, rd, rn uint32, dst64bit bool) uint32 { + var opcode, ptype uint32 + switch op { + case fpuUniOpCvt32To64: + opcode = 0b000101 + case fpuUniOpCvt64To32: + opcode = 0b000100 + ptype = 0b01 + case fpuUniOpNeg: + opcode = 0b000010 + if dst64bit { + ptype = 0b01 + } + case fpuUniOpSqrt: + opcode = 0b000011 + if dst64bit { + ptype = 0b01 + } + case fpuUniOpRoundPlus: + opcode = 0b001001 + if dst64bit { + ptype = 0b01 + } + case fpuUniOpRoundMinus: + opcode = 0b001010 + if dst64bit { + ptype = 0b01 + } + case fpuUniOpRoundZero: + opcode = 0b001011 + if dst64bit { + ptype = 0b01 + } + case fpuUniOpRoundNearest: + opcode = 0b001000 + if dst64bit { + ptype = 0b01 + } + case fpuUniOpAbs: + opcode = 0b000001 + if dst64bit { + ptype = 0b01 + } + default: + panic("BUG") + } + return 0b1111<<25 | ptype<<22 | 0b1<<21 | opcode<<15 | 0b1<<14 | rn<<5 | rd +} + +// encodeCnvBetweenFloatInt encodes as "Conversion between floating-point and integer" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func encodeCnvBetweenFloatInt(i *instruction) uint32 { + rd := regNumberInEncoding[i.rd.realReg()] + rn := regNumberInEncoding[i.rn.realReg()] + + var opcode uint32 + var rmode uint32 + var ptype uint32 + var sf uint32 + switch i.kind { + case intToFpu: // Either UCVTF or SCVTF. + rmode = 0b00 + + signed := i.u1 == 1 + src64bit := i.u2 == 1 + dst64bit := i.u3 == 1 + if signed { + opcode = 0b010 + } else { + opcode = 0b011 + } + if src64bit { + sf = 0b1 + } + if dst64bit { + ptype = 0b01 + } else { + ptype = 0b00 + } + case fpuToInt: // Either FCVTZU or FCVTZS. + rmode = 0b11 + + signed := i.u1 == 1 + src64bit := i.u2 == 1 + dst64bit := i.u3 == 1 + + if signed { + opcode = 0b000 + } else { + opcode = 0b001 + } + if dst64bit { + sf = 0b1 + } + if src64bit { + ptype = 0b01 + } else { + ptype = 0b00 + } + } + return sf<<31 | 0b1111<<25 | ptype<<22 | 0b1<<21 | rmode<<19 | opcode<<16 | rn<<5 | rd +} + +// encodeAdr encodes a PC-relative ADR instruction. +// https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/ADR--Form-PC-relative-address- +func encodeAdr(rd uint32, offset uint32) uint32 { + if offset >= 1<<20 { + panic("BUG: too large adr instruction") + } + return offset&0b11<<29 | 0b1<<28 | offset&0b1111111111_1111111100<<3 | rd +} + +// encodeFpuCSel encodes as "Floating-point conditional select" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func encodeFpuCSel(rd, rn, rm uint32, c condFlag, _64bit bool) uint32 { + var ftype uint32 + if _64bit { + ftype = 0b01 // double precision. 
+ } + return 0b1111<<25 | ftype<<22 | 0b1<<21 | rm<<16 | uint32(c)<<12 | 0b11<<10 | rn<<5 | rd +} + +// encodeMoveToVec encodes as "Move general-purpose register to a vector element" (represented as `ins`) in +// https://developer.arm.com/documentation/dui0801/g/A64-SIMD-Vector-Instructions/MOV--vector--from-general- +// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/MOV--from-general---Move-general-purpose-register-to-a-vector-element--an-alias-of-INS--general--?lang=en +func encodeMoveToVec(rd, rn uint32, arr vecArrangement, index vecIndex) uint32 { + var imm5 uint32 + switch arr { + case vecArrangementB: + imm5 |= 0b1 + imm5 |= uint32(index) << 1 + if index > 0b1111 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 15", index)) + } + case vecArrangementH: + imm5 |= 0b10 + imm5 |= uint32(index) << 2 + if index > 0b111 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 7", index)) + } + case vecArrangementS: + imm5 |= 0b100 + imm5 |= uint32(index) << 3 + if index > 0b11 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 3", index)) + } + case vecArrangementD: + imm5 |= 0b1000 + imm5 |= uint32(index) << 4 + if index > 0b1 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 1", index)) + } + default: + panic("Unsupported arrangement " + arr.String()) + } + + return 0b01001110000<<21 | imm5<<16 | 0b000111<<10 | rn<<5 | rd +} + +// encodeMoveToVec encodes as "Move vector element to another vector element, mov (element)" (represented as `ins`) in +// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/MOV--element---Move-vector-element-to-another-vector-element--an-alias-of-INS--element--?lang=en +// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/INS--element---Insert-vector-element-from-another-vector-element-?lang=en +func encodeVecMovElement(rd, rn uint32, arr vecArrangement, srcIndex, dstIndex uint32) uint32 { + var imm4, imm5 uint32 + switch arr { + case vecArrangementB: + imm5 |= 0b1 + imm5 |= srcIndex << 1 + imm4 = dstIndex + if srcIndex > 0b1111 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 15", srcIndex)) + } + case vecArrangementH: + imm5 |= 0b10 + imm5 |= srcIndex << 2 + imm4 = dstIndex << 1 + if srcIndex > 0b111 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 7", srcIndex)) + } + case vecArrangementS: + imm5 |= 0b100 + imm5 |= srcIndex << 3 + imm4 = dstIndex << 2 + if srcIndex > 0b11 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 3", srcIndex)) + } + case vecArrangementD: + imm5 |= 0b1000 + imm5 |= srcIndex << 4 + imm4 = dstIndex << 3 + if srcIndex > 0b1 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 1", srcIndex)) + } + default: + panic("Unsupported arrangement " + arr.String()) + } + + return 0b01101110000<<21 | imm5<<16 | imm4<<11 | 0b1<<10 | rn<<5 | rd +} + +// encodeUnconditionalBranchReg encodes as "Unconditional branch (register)" in: +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en +func encodeUnconditionalBranchReg(rn uint32, link bool) uint32 { + var opc uint32 + if link { + opc = 0b0001 + } + return 0b1101011<<25 | opc<<21 | 0b11111<<16 | rn<<5 +} + +// encodeMoveFromVec encodes as "Move vector element to a general-purpose register" +// (represented as `umov` when dest is 32-bit, `umov` 
otherwise) in +// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/UMOV--Unsigned-Move-vector-element-to-general-purpose-register-?lang=en +// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/MOV--to-general---Move-vector-element-to-general-purpose-register--an-alias-of-UMOV-?lang=en +func encodeMoveFromVec(rd, rn uint32, arr vecArrangement, index vecIndex, signed bool) uint32 { + var op, imm4, q, imm5 uint32 + switch { + case arr == vecArrangementB: + imm5 |= 0b1 + imm5 |= uint32(index) << 1 + if index > 0b1111 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 15", index)) + } + case arr == vecArrangementH: + imm5 |= 0b10 + imm5 |= uint32(index) << 2 + if index > 0b111 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 7", index)) + } + case arr == vecArrangementS && signed: + q = 0b1 + fallthrough + case arr == vecArrangementS: + imm5 |= 0b100 + imm5 |= uint32(index) << 3 + if index > 0b11 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 3", index)) + } + case arr == vecArrangementD && !signed: + imm5 |= 0b1000 + imm5 |= uint32(index) << 4 + q = 0b1 + if index > 0b1 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 1", index)) + } + default: + panic("Unsupported arrangement " + arr.String()) + } + if signed { + op, imm4 = 0, 0b0101 + } else { + op, imm4 = 0, 0b0111 + } + return op<<29 | 0b01110000<<21 | q<<30 | imm5<<16 | imm4<<11 | 1<<10 | rn<<5 | rd +} + +// encodeVecDup encodes as "Duplicate general-purpose register to vector" DUP (general) +// (represented as `dup`) +// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/DUP--general---Duplicate-general-purpose-register-to-vector-?lang=en +func encodeVecDup(rd, rn uint32, arr vecArrangement) uint32 { + var q, imm5 uint32 + switch arr { + case vecArrangement8B: + q, imm5 = 0b0, 0b1 + case vecArrangement16B: + q, imm5 = 0b1, 0b1 + case vecArrangement4H: + q, imm5 = 0b0, 0b10 + case vecArrangement8H: + q, imm5 = 0b1, 0b10 + case vecArrangement2S: + q, imm5 = 0b0, 0b100 + case vecArrangement4S: + q, imm5 = 0b1, 0b100 + case vecArrangement2D: + q, imm5 = 0b1, 0b1000 + default: + panic("Unsupported arrangement " + arr.String()) + } + return q<<30 | 0b001110000<<21 | imm5<<16 | 0b000011<<10 | rn<<5 | rd +} + +// encodeVecDup encodes as "Duplicate vector element to vector or scalar" DUP (element). +// (represented as `dup`) +// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/DUP--element---Duplicate-vector-element-to-vector-or-scalar- +func encodeVecDupElement(rd, rn uint32, arr vecArrangement, srcIndex vecIndex) uint32 { + var q, imm5 uint32 + q = 0b1 + switch arr { + case vecArrangementB: + imm5 |= 0b1 + imm5 |= uint32(srcIndex) << 1 + case vecArrangementH: + imm5 |= 0b10 + imm5 |= uint32(srcIndex) << 2 + case vecArrangementS: + imm5 |= 0b100 + imm5 |= uint32(srcIndex) << 3 + case vecArrangementD: + imm5 |= 0b1000 + imm5 |= uint32(srcIndex) << 4 + default: + panic("unsupported arrangement" + arr.String()) + } + + return q<<30 | 0b001110000<<21 | imm5<<16 | 0b1<<10 | rn<<5 | rd +} + +// encodeVecExtract encodes as "Advanced SIMD extract." +// Currently only `ext` is defined. 
+// https://developer.arm.com/documentation/ddi0602/2023-06/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp +// https://developer.arm.com/documentation/ddi0602/2023-06/SIMD-FP-Instructions/EXT--Extract-vector-from-pair-of-vectors-?lang=en +func encodeVecExtract(rd, rn, rm uint32, arr vecArrangement, index uint32) uint32 { + var q, imm4 uint32 + switch arr { + case vecArrangement8B: + q, imm4 = 0, 0b0111&uint32(index) + case vecArrangement16B: + q, imm4 = 1, 0b1111&uint32(index) + default: + panic("Unsupported arrangement " + arr.String()) + } + return q<<30 | 0b101110000<<21 | rm<<16 | imm4<<11 | rn<<5 | rd +} + +// encodeVecPermute encodes as "Advanced SIMD permute." +// https://developer.arm.com/documentation/ddi0602/2023-06/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp +func encodeVecPermute(op vecOp, rd, rn, rm uint32, arr vecArrangement) uint32 { + var q, size, opcode uint32 + switch op { + case vecOpZip1: + opcode = 0b011 + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q = arrToSizeQEncoded(arr) + default: + panic("TODO: " + op.String()) + } + return q<<30 | 0b001110<<24 | size<<22 | rm<<16 | opcode<<12 | 0b10<<10 | rn<<5 | rd +} + +// encodeConditionalSelect encodes as "Conditional select" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en#condsel +func encodeConditionalSelect(kind instructionKind, rd, rn, rm uint32, c condFlag, _64bit bool) uint32 { + if kind != cSel { + panic("TODO: support other conditional select") + } + + ret := 0b110101<<23 | rm<<16 | uint32(c)<<12 | rn<<5 | rd + if _64bit { + ret |= 0b1 << 31 + } + return ret +} + +const dummyInstruction uint32 = 0x14000000 // "b 0" + +// encodeLoadFpuConst32 encodes the following three instructions: +// +// ldr s8, #8 ;; literal load of data.f32 +// b 8 ;; skip the data +// data.f32 xxxxxxx +func encodeLoadFpuConst32(c backend.Compiler, rd uint32, rawF32 uint64) { + c.Emit4Bytes( + // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/LDR--literal--SIMD-FP---Load-SIMD-FP-Register--PC-relative-literal--?lang=en + 0b111<<26 | (0x8/4)<<5 | rd, + ) + c.Emit4Bytes(encodeUnconditionalBranch(false, 8)) // b 8 + if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable { + // Inlined data.f32 cannot be disassembled, so we add a dummy instruction here. + c.Emit4Bytes(dummyInstruction) + } else { + c.Emit4Bytes(uint32(rawF32)) // data.f32 xxxxxxx + } +} + +// encodeLoadFpuConst64 encodes the following three instructions: +// +// ldr d8, #8 ;; literal load of data.f64 +// b 12 ;; skip the data +// data.f64 xxxxxxx +func encodeLoadFpuConst64(c backend.Compiler, rd uint32, rawF64 uint64) { + c.Emit4Bytes( + // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/LDR--literal--SIMD-FP---Load-SIMD-FP-Register--PC-relative-literal--?lang=en + 0b1<<30 | 0b111<<26 | (0x8/4)<<5 | rd, + ) + c.Emit4Bytes(encodeUnconditionalBranch(false, 12)) // b 12 + if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable { + // Inlined data.f64 cannot be disassembled, so we add dummy instructions here. 
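+ // The two dummy words below stand in for the 8-byte f64 payload, keeping the sequence at its
+ // fixed 16-byte length so the ldr/b offsets above stay valid.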
+ c.Emit4Bytes(dummyInstruction) + c.Emit4Bytes(dummyInstruction) + } else { + // data.f64 xxxxxxx + c.Emit4Bytes(uint32(rawF64)) + c.Emit4Bytes(uint32(rawF64 >> 32)) + } +} + +// encodeLoadFpuConst128 encodes the following three instructions: +// +// ldr v8, #8 ;; literal load of data.f64 +// b 20 ;; skip the data +// data.v128 xxxxxxx +func encodeLoadFpuConst128(c backend.Compiler, rd uint32, lo, hi uint64) { + c.Emit4Bytes( + // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/LDR--literal--SIMD-FP---Load-SIMD-FP-Register--PC-relative-literal--?lang=en + 0b1<<31 | 0b111<<26 | (0x8/4)<<5 | rd, + ) + c.Emit4Bytes(encodeUnconditionalBranch(false, 20)) // b 20 + if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable { + // Inlined data.v128 cannot be disassembled, so we add dummy instructions here. + c.Emit4Bytes(dummyInstruction) + c.Emit4Bytes(dummyInstruction) + c.Emit4Bytes(dummyInstruction) + c.Emit4Bytes(dummyInstruction) + } else { + // data.v128 xxxxxxx + c.Emit4Bytes(uint32(lo)) + c.Emit4Bytes(uint32(lo >> 32)) + c.Emit4Bytes(uint32(hi)) + c.Emit4Bytes(uint32(hi >> 32)) + } +} + +// encodeAluRRRR encodes as Data-processing (3 source) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en +func encodeAluRRRR(op aluOp, rd, rn, rm, ra, _64bit uint32) uint32 { + var oO, op31 uint32 + switch op { + case aluOpMAdd: + op31, oO = 0b000, 0b0 + case aluOpMSub: + op31, oO = 0b000, 0b1 + default: + panic("TODO/BUG") + } + return _64bit<<31 | 0b11011<<24 | op31<<21 | rm<<16 | oO<<15 | ra<<10 | rn<<5 | rd +} + +// encodeBitRR encodes as Data-processing (1 source) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en +func encodeBitRR(op bitOp, rd, rn, _64bit uint32) uint32 { + var opcode2, opcode uint32 + switch op { + case bitOpRbit: + opcode2, opcode = 0b00000, 0b000000 + case bitOpClz: + opcode2, opcode = 0b00000, 0b000100 + default: + panic("TODO/BUG") + } + return _64bit<<31 | 0b1_0_11010110<<21 | opcode2<<15 | opcode<<10 | rn<<5 | rd +} + +func encodeAsMov32(rn, rd uint32) uint32 { + // This is an alias of ORR (shifted register): + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/MOV--register---Move--register---an-alias-of-ORR--shifted-register-- + return encodeLogicalShiftedRegister(0b001, 0, rn, 0, regNumberInEncoding[xzr], rd) +} + +// encodeExtend encodes extension instructions. 
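+// For example, a signed byte-to-64-bit extension (SXTB Xd, Wn) is the alias SBFM Xd, Xn, #0, #7,
+// which is exactly the bit pattern produced by the signed/from==8/to==64 case below.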
+func encodeExtend(signed bool, from, to byte, rd, rn uint32) uint32 { + // UTXB: https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/UXTB--Unsigned-Extend-Byte--an-alias-of-UBFM-?lang=en + // UTXH: https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/UXTH--Unsigned-Extend-Halfword--an-alias-of-UBFM-?lang=en + // STXB: https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/SXTB--Signed-Extend-Byte--an-alias-of-SBFM- + // STXH: https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/SXTH--Sign-Extend-Halfword--an-alias-of-SBFM- + // STXW: https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/SXTW--Sign-Extend-Word--an-alias-of-SBFM- + var _31to10 uint32 + switch { + case !signed && from == 8 && to == 32: + // 32-bit UXTB + _31to10 = 0b0101001100000000000111 + case !signed && from == 16 && to == 32: + // 32-bit UXTH + _31to10 = 0b0101001100000000001111 + case !signed && from == 8 && to == 64: + // 64-bit UXTB + _31to10 = 0b0101001100000000000111 + case !signed && from == 16 && to == 64: + // 64-bit UXTH + _31to10 = 0b0101001100000000001111 + case !signed && from == 32 && to == 64: + return encodeAsMov32(rn, rd) + case signed && from == 8 && to == 32: + // 32-bit SXTB + _31to10 = 0b0001001100000000000111 + case signed && from == 16 && to == 32: + // 32-bit SXTH + _31to10 = 0b0001001100000000001111 + case signed && from == 8 && to == 64: + // 64-bit SXTB + _31to10 = 0b1001001101000000000111 + case signed && from == 16 && to == 64: + // 64-bit SXTH + _31to10 = 0b1001001101000000001111 + case signed && from == 32 && to == 64: + // SXTW + _31to10 = 0b1001001101000000011111 + default: + panic("BUG") + } + return _31to10<<10 | rn<<5 | rd +} + +func encodeLoadOrStore(kind instructionKind, rt uint32, amode addressMode) uint32 { + var _22to31 uint32 + var bits int64 + switch kind { + case uLoad8: + _22to31 = 0b0011100001 + bits = 8 + case sLoad8: + _22to31 = 0b0011100010 + bits = 8 + case uLoad16: + _22to31 = 0b0111100001 + bits = 16 + case sLoad16: + _22to31 = 0b0111100010 + bits = 16 + case uLoad32: + _22to31 = 0b1011100001 + bits = 32 + case sLoad32: + _22to31 = 0b1011100010 + bits = 32 + case uLoad64: + _22to31 = 0b1111100001 + bits = 64 + case fpuLoad32: + _22to31 = 0b1011110001 + bits = 32 + case fpuLoad64: + _22to31 = 0b1111110001 + bits = 64 + case fpuLoad128: + _22to31 = 0b0011110011 + bits = 128 + case store8: + _22to31 = 0b0011100000 + bits = 8 + case store16: + _22to31 = 0b0111100000 + bits = 16 + case store32: + _22to31 = 0b1011100000 + bits = 32 + case store64: + _22to31 = 0b1111100000 + bits = 64 + case fpuStore32: + _22to31 = 0b1011110000 + bits = 32 + case fpuStore64: + _22to31 = 0b1111110000 + bits = 64 + case fpuStore128: + _22to31 = 0b0011110010 + bits = 128 + default: + panic("BUG") + } + + switch amode.kind { + case addressModeKindRegScaledExtended: + return encodeLoadOrStoreExtended(_22to31, + regNumberInEncoding[amode.rn.RealReg()], + regNumberInEncoding[amode.rm.RealReg()], + rt, true, amode.extOp) + case addressModeKindRegScaled: + return encodeLoadOrStoreExtended(_22to31, + regNumberInEncoding[amode.rn.RealReg()], regNumberInEncoding[amode.rm.RealReg()], + rt, true, extendOpNone) + case addressModeKindRegExtended: + return encodeLoadOrStoreExtended(_22to31, + regNumberInEncoding[amode.rn.RealReg()], regNumberInEncoding[amode.rm.RealReg()], + rt, false, amode.extOp) + case addressModeKindRegReg: + return encodeLoadOrStoreExtended(_22to31, + 
regNumberInEncoding[amode.rn.RealReg()], regNumberInEncoding[amode.rm.RealReg()], + rt, false, extendOpNone) + case addressModeKindRegSignedImm9: + // e.g. https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDUR--Load-Register--unscaled-- + return encodeLoadOrStoreSIMM9(_22to31, 0b00 /* unscaled */, regNumberInEncoding[amode.rn.RealReg()], rt, amode.imm) + case addressModeKindPostIndex: + return encodeLoadOrStoreSIMM9(_22to31, 0b01 /* post index */, regNumberInEncoding[amode.rn.RealReg()], rt, amode.imm) + case addressModeKindPreIndex: + return encodeLoadOrStoreSIMM9(_22to31, 0b11 /* pre index */, regNumberInEncoding[amode.rn.RealReg()], rt, amode.imm) + case addressModeKindRegUnsignedImm12: + // "unsigned immediate" in https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Loads-and-Stores?lang=en + rn := regNumberInEncoding[amode.rn.RealReg()] + imm := amode.imm + div := bits / 8 + if imm != 0 && !offsetFitsInAddressModeKindRegUnsignedImm12(byte(bits), imm) { + panic("BUG") + } + imm /= div + return _22to31<<22 | 0b1<<24 | uint32(imm&0b111111111111)<<10 | rn<<5 | rt + default: + panic("BUG") + } +} + +// encodeVecLoad1R encodes as Load one single-element structure and Replicate to all lanes (of one register) in +// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/LD1R--Load-one-single-element-structure-and-Replicate-to-all-lanes--of-one-register--?lang=en#sa_imm +func encodeVecLoad1R(rt, rn uint32, arr vecArrangement) uint32 { + size, q := arrToSizeQEncoded(arr) + return q<<30 | 0b001101010000001100<<12 | size<<10 | rn<<5 | rt +} + +// encodeAluBitmaskImmediate encodes as Logical (immediate) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en +func encodeAluBitmaskImmediate(op aluOp, rd, rn uint32, imm uint64, _64bit bool) uint32 { + var _31to23 uint32 + switch op { + case aluOpAnd: + _31to23 = 0b00_100100 + case aluOpOrr: + _31to23 = 0b01_100100 + case aluOpEor: + _31to23 = 0b10_100100 + case aluOpAnds: + _31to23 = 0b11_100100 + default: + panic("BUG") + } + if _64bit { + _31to23 |= 0b1 << 8 + } + immr, imms, N := bitmaskImmediate(imm, _64bit) + return _31to23<<23 | uint32(N)<<22 | uint32(immr)<<16 | uint32(imms)<<10 | rn<<5 | rd +} + +func bitmaskImmediate(c uint64, is64bit bool) (immr, imms, N byte) { + var size uint32 + switch { + case c != c>>32|c<<32: + size = 64 + case c != c>>16|c<<48: + size = 32 + c = uint64(int32(c)) + case c != c>>8|c<<56: + size = 16 + c = uint64(int16(c)) + case c != c>>4|c<<60: + size = 8 + c = uint64(int8(c)) + case c != c>>2|c<<62: + size = 4 + c = uint64(int64(c<<60) >> 60) + default: + size = 2 + c = uint64(int64(c<<62) >> 62) + } + + neg := false + if int64(c) < 0 { + c = ^c + neg = true + } + + onesSize, nonZeroPos := getOnesSequenceSize(c) + if neg { + nonZeroPos = onesSize + nonZeroPos + onesSize = size - onesSize + } + + var mode byte = 32 + if is64bit && size == 64 { + N, mode = 0b1, 64 + } + + immr = byte((size - nonZeroPos) & (size - 1) & uint32(mode-1)) + imms = byte((onesSize - 1) | 63&^(size<<1-1)) + return +} + +func getOnesSequenceSize(x uint64) (size, nonZeroPos uint32) { + // Take 0b00111000 for example: + y := getLowestBit(x) // = 0b0000100 + nonZeroPos = setBitPos(y) // = 2 + size = setBitPos(x+y) - nonZeroPos // = setBitPos(0b0100000) - 2 = 5 - 2 = 3 + return +} + +func setBitPos(x uint64) (ret uint32) { + for ; ; ret++ { + if x == 0b1 { + break + } + x = x >> 1 + } + return +} + +// 
encodeLoadOrStoreExtended encodes store/load instruction as "extended register offset" in Load/store register (register offset): +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Loads-and-Stores?lang=en +func encodeLoadOrStoreExtended(_22to32 uint32, rn, rm, rt uint32, scaled bool, extOp extendOp) uint32 { + var option uint32 + switch extOp { + case extendOpUXTW: + option = 0b010 + case extendOpSXTW: + option = 0b110 + case extendOpNone: + option = 0b111 + default: + panic("BUG") + } + var s uint32 + if scaled { + s = 0b1 + } + return _22to32<<22 | 0b1<<21 | rm<<16 | option<<13 | s<<12 | 0b10<<10 | rn<<5 | rt +} + +// encodeLoadOrStoreSIMM9 encodes store/load instruction as one of post-index, pre-index or unscaled immediate as in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Loads-and-Stores?lang=en +func encodeLoadOrStoreSIMM9(_22to32, _1011 uint32, rn, rt uint32, imm9 int64) uint32 { + return _22to32<<22 | (uint32(imm9)&0b111111111)<<12 | _1011<<10 | rn<<5 | rt +} + +// encodeFpuRRR encodes as single or double precision (depending on `_64bit`) of Floating-point data-processing (2 source) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func encodeFpuRRR(op fpuBinOp, rd, rn, rm uint32, _64bit bool) (ret uint32) { + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADD--vector--Add-vectors--scalar--floating-point-and-integer- + var opcode uint32 + switch op { + case fpuBinOpAdd: + opcode = 0b0010 + case fpuBinOpSub: + opcode = 0b0011 + case fpuBinOpMul: + opcode = 0b0000 + case fpuBinOpDiv: + opcode = 0b0001 + case fpuBinOpMax: + opcode = 0b0100 + case fpuBinOpMin: + opcode = 0b0101 + default: + panic("BUG") + } + var ptype uint32 + if _64bit { + ptype = 0b01 + } + return 0b1111<<25 | ptype<<22 | 0b1<<21 | rm<<16 | opcode<<12 | 0b1<<11 | rn<<5 | rd +} + +// encodeAluRRImm12 encodes as Add/subtract (immediate) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en +func encodeAluRRImm12(op aluOp, rd, rn uint32, imm12 uint16, shiftBit byte, _64bit bool) uint32 { + var _31to24 uint32 + switch op { + case aluOpAdd: + _31to24 = 0b00_10001 + case aluOpAddS: + _31to24 = 0b01_10001 + case aluOpSub: + _31to24 = 0b10_10001 + case aluOpSubS: + _31to24 = 0b11_10001 + default: + panic("BUG") + } + if _64bit { + _31to24 |= 0b1 << 7 + } + return _31to24<<24 | uint32(shiftBit)<<22 | uint32(imm12&0b111111111111)<<10 | rn<<5 | rd +} + +// encodeAluRRR encodes as Data Processing (shifted register), depending on aluOp. +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_shift +func encodeAluRRRShift(op aluOp, rd, rn, rm, amount uint32, shiftOp shiftOp, _64bit bool) uint32 { + var _31to24 uint32 + var opc, n uint32 + switch op { + case aluOpAdd: + _31to24 = 0b00001011 + case aluOpAddS: + _31to24 = 0b00101011 + case aluOpSub: + _31to24 = 0b01001011 + case aluOpSubS: + _31to24 = 0b01101011 + case aluOpAnd, aluOpOrr, aluOpEor, aluOpAnds: + // "Logical (shifted register)". 
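+ // opc selects AND/ORR/EOR/ANDS in the inner switch; n stays 0 since these are the
+ // non-inverted forms (N=1 would give BIC/ORN/EON/BICS).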
+ switch op { + case aluOpAnd: + // all zeros + case aluOpOrr: + opc = 0b01 + case aluOpEor: + opc = 0b10 + case aluOpAnds: + opc = 0b11 + } + _31to24 = 0b000_01010 + default: + panic(op.String()) + } + + if _64bit { + _31to24 |= 0b1 << 7 + } + + var shift uint32 + switch shiftOp { + case shiftOpLSL: + shift = 0b00 + case shiftOpLSR: + shift = 0b01 + case shiftOpASR: + shift = 0b10 + default: + panic(shiftOp.String()) + } + return opc<<29 | n<<21 | _31to24<<24 | shift<<22 | rm<<16 | (amount << 10) | (rn << 5) | rd +} + +// "Add/subtract (extended register)" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_ext +func encodeAluRRRExtend(ao aluOp, rd, rn, rm uint32, extOp extendOp, to byte) uint32 { + var s, op uint32 + switch ao { + case aluOpAdd: + op = 0b0 + case aluOpAddS: + op, s = 0b0, 0b1 + case aluOpSub: + op = 0b1 + case aluOpSubS: + op, s = 0b1, 0b1 + default: + panic("BUG: extended register operand can be used only for add/sub") + } + + var sf uint32 + if to == 64 { + sf = 0b1 + } + + var option uint32 + switch extOp { + case extendOpUXTB: + option = 0b000 + case extendOpUXTH: + option = 0b001 + case extendOpUXTW: + option = 0b010 + case extendOpSXTB: + option = 0b100 + case extendOpSXTH: + option = 0b101 + case extendOpSXTW: + option = 0b110 + case extendOpSXTX, extendOpUXTX: + panic(fmt.Sprintf("%s is essentially noop, and should be handled much earlier than encoding", extOp.String())) + } + return sf<<31 | op<<30 | s<<29 | 0b1011001<<21 | rm<<16 | option<<13 | rn<<5 | rd +} + +// encodeAluRRR encodes as Data Processing (register), depending on aluOp. +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en +func encodeAluRRR(op aluOp, rd, rn, rm uint32, _64bit, isRnSp bool) uint32 { + var _31to21, _15to10 uint32 + switch op { + case aluOpAdd: + if isRnSp { + // "Extended register" with UXTW. + _31to21 = 0b00001011_001 + _15to10 = 0b011000 + } else { + // "Shifted register" with shift = 0 + _31to21 = 0b00001011_000 + } + case aluOpAddS: + if isRnSp { + panic("TODO") + } + // "Shifted register" with shift = 0 + _31to21 = 0b00101011_000 + case aluOpSub: + if isRnSp { + // "Extended register" with UXTW. + _31to21 = 0b01001011_001 + _15to10 = 0b011000 + } else { + // "Shifted register" with shift = 0 + _31to21 = 0b01001011_000 + } + case aluOpSubS: + if isRnSp { + panic("TODO") + } + // "Shifted register" with shift = 0 + _31to21 = 0b01101011_000 + case aluOpAnd, aluOpOrr, aluOpOrn, aluOpEor, aluOpAnds: + // "Logical (shifted register)". + var opc, n uint32 + switch op { + case aluOpAnd: + // all zeros + case aluOpOrr: + opc = 0b01 + case aluOpOrn: + opc = 0b01 + n = 1 + case aluOpEor: + opc = 0b10 + case aluOpAnds: + opc = 0b11 + } + _31to21 = 0b000_01010_000 | opc<<8 | n + case aluOpLsl, aluOpAsr, aluOpLsr, aluOpRotR: + // "Data-processing (2 source)". + _31to21 = 0b00011010_110 + switch op { + case aluOpLsl: + _15to10 = 0b001000 + case aluOpLsr: + _15to10 = 0b001001 + case aluOpAsr: + _15to10 = 0b001010 + case aluOpRotR: + _15to10 = 0b001011 + } + case aluOpSDiv: + // "Data-processing (2 source)". + _31to21 = 0b11010110 + _15to10 = 0b000011 + case aluOpUDiv: + // "Data-processing (2 source)". 
+ _31to21 = 0b11010110 + _15to10 = 0b000010 + default: + panic(op.String()) + } + if _64bit { + _31to21 |= 0b1 << 10 + } + return _31to21<<21 | rm<<16 | (_15to10 << 10) | (rn << 5) | rd +} + +// encodeLogicalShiftedRegister encodes as Logical (shifted register) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en +func encodeLogicalShiftedRegister(sf_opc uint32, shift_N uint32, rm uint32, imm6 uint32, rn, rd uint32) (ret uint32) { + ret = sf_opc << 29 + ret |= 0b01010 << 24 + ret |= shift_N << 21 + ret |= rm << 16 + ret |= imm6 << 10 + ret |= rn << 5 + ret |= rd + return +} + +// encodeAddSubtractImmediate encodes as Add/subtract (immediate) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en +func encodeAddSubtractImmediate(sf_op_s uint32, sh uint32, imm12 uint32, rn, rd uint32) (ret uint32) { + ret = sf_op_s << 29 + ret |= 0b100010 << 23 + ret |= sh << 22 + ret |= imm12 << 10 + ret |= rn << 5 + ret |= rd + return +} + +// encodePreOrPostIndexLoadStorePair64 encodes as Load/store pair (pre/post-indexed) in +// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers- +// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STP--Store-Pair-of-Registers- +func encodePreOrPostIndexLoadStorePair64(pre bool, load bool, rn, rt, rt2 uint32, imm7 int64) (ret uint32) { + if imm7%8 != 0 { + panic("imm7 for pair load/store must be a multiple of 8") + } + imm7 /= 8 + ret = rt + ret |= rn << 5 + ret |= rt2 << 10 + ret |= (uint32(imm7) & 0b1111111) << 15 + if load { + ret |= 0b1 << 22 + } + ret |= 0b101010001 << 23 + if pre { + ret |= 0b1 << 24 + } + return +} + +// encodeUnconditionalBranch encodes as B or BL instructions: +// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/B--Branch- +// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/BL--Branch-with-Link- +func encodeUnconditionalBranch(link bool, imm26 int64) (ret uint32) { + if imm26%4 != 0 { + panic("imm26 for branch must be a multiple of 4") + } + imm26 /= 4 + ret = uint32(imm26 & 0b11_11111111_11111111_11111111) + ret |= 0b101 << 26 + if link { + ret |= 0b1 << 31 + } + return +} + +// encodeCBZCBNZ encodes as either CBZ or CBNZ: +// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CBZ--Compare-and-Branch-on-Zero- +// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CBNZ--Compare-and-Branch-on-Nonzero- +func encodeCBZCBNZ(rt uint32, nz bool, imm19 uint32, _64bit bool) (ret uint32) { + ret = rt + ret |= imm19 << 5 + if nz { + ret |= 1 << 24 + } + ret |= 0b11010 << 25 + if _64bit { + ret |= 1 << 31 + } + return +} + +// encodeMoveWideImmediate encodes as either MOVZ, MOVN or MOVK, as Move wide (immediate) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en +// +// "shift" must have been divided by 16 at this point. 
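+// For example, MOVZ X0, #0x1234, LSL #16 corresponds to opc=0b10 (MOVZ), rd=0, imm=0x1234,
+// shift=1 and _64bit=1, which assembles to 0xD2A24680.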
+func encodeMoveWideImmediate(opc uint32, rd uint32, imm, shift, _64bit uint64) (ret uint32) { + ret = rd + ret |= uint32(imm&0xffff) << 5 + ret |= (uint32(shift)) << 21 + ret |= 0b100101 << 23 + ret |= opc << 29 + ret |= uint32(_64bit) << 31 + return +} + +// encodeAluRRImm encodes as "Bitfield" in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en#log_imm +func encodeAluRRImm(op aluOp, rd, rn, amount, _64bit uint32) uint32 { + var opc uint32 + var immr, imms uint32 + switch op { + case aluOpLsl: + // LSL (immediate) is an alias for UBFM. + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/UBFM--Unsigned-Bitfield-Move-?lang=en + opc = 0b10 + if amount == 0 { + // This can be encoded as NOP, but we don't do it for consistency: lsr xn, xm, #0 + immr = 0 + if _64bit == 1 { + imms = 0b111111 + } else { + imms = 0b11111 + } + } else { + if _64bit == 1 { + immr = 64 - amount + } else { + immr = (32 - amount) & 0b11111 + } + imms = immr - 1 + } + case aluOpLsr: + // LSR (immediate) is an alias for UBFM. + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LSR--immediate---Logical-Shift-Right--immediate---an-alias-of-UBFM-?lang=en + opc = 0b10 + imms, immr = 0b011111|_64bit<<5, amount + case aluOpAsr: + // ASR (immediate) is an alias for SBFM. + // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/SBFM--Signed-Bitfield-Move-?lang=en + opc = 0b00 + imms, immr = 0b011111|_64bit<<5, amount + default: + panic(op.String()) + } + return _64bit<<31 | opc<<29 | 0b100110<<23 | _64bit<<22 | immr<<16 | imms<<10 | rn<<5 | rd +} + +// encodeVecLanes encodes as Data Processing (Advanced SIMD across lanes) depending on vecOp in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func encodeVecLanes(op vecOp, rd uint32, rn uint32, arr vecArrangement) uint32 { + var u, q, size, opcode uint32 + switch arr { + case vecArrangement8B: + q, size = 0b0, 0b00 + case vecArrangement16B: + q, size = 0b1, 0b00 + case vecArrangement4H: + q, size = 0, 0b01 + case vecArrangement8H: + q, size = 1, 0b01 + case vecArrangement4S: + q, size = 1, 0b10 + default: + panic("unsupported arrangement: " + arr.String()) + } + switch op { + case vecOpUaddlv: + u, opcode = 1, 0b00011 + case vecOpUminv: + u, opcode = 1, 0b11010 + case vecOpAddv: + u, opcode = 0, 0b11011 + default: + panic("unsupported or illegal vecOp: " + op.String()) + } + return q<<30 | u<<29 | 0b1110<<24 | size<<22 | 0b11000<<17 | opcode<<12 | 0b10<<10 | rn<<5 | rd +} + +// encodeVecLanes encodes as Data Processing (Advanced SIMD scalar shift by immediate) depending on vecOp in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func encodeVecShiftImm(op vecOp, rd uint32, rn, amount uint32, arr vecArrangement) uint32 { + var u, q, immh, immb, opcode uint32 + switch op { + case vecOpSshll: + u, opcode = 0b0, 0b10100 + case vecOpUshll: + u, opcode = 0b1, 0b10100 + case vecOpSshr: + u, opcode = 0, 0b00000 + default: + panic("unsupported or illegal vecOp: " + op.String()) + } + switch arr { + case vecArrangement16B: + q = 0b1 + fallthrough + case vecArrangement8B: + immh = 0b0001 + immb = 8 - uint32(amount&0b111) + case vecArrangement8H: + q = 0b1 + fallthrough + case vecArrangement4H: + v := 16 - uint32(amount&0b1111) + immb = v & 0b111 + immh = 0b0010 
| (v >> 3) + case vecArrangement4S: + q = 0b1 + fallthrough + case vecArrangement2S: + v := 32 - uint32(amount&0b11111) + immb = v & 0b111 + immh = 0b0100 | (v >> 3) + case vecArrangement2D: + q = 0b1 + v := 64 - uint32(amount&0b111111) + immb = v & 0b111 + immh = 0b1000 | (v >> 3) + default: + panic("unsupported arrangement: " + arr.String()) + } + return q<<30 | u<<29 | 0b011110<<23 | immh<<19 | immb<<16 | 0b000001<<10 | opcode<<11 | 0b1<<10 | rn<<5 | rd +} + +// encodeVecTbl encodes as Data Processing (Advanced SIMD table lookup) in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp +// +// Note: tblOp may encode tbl1, tbl2... in the future. Currently, it is ignored. +func encodeVecTbl(nregs, rd, rn, rm uint32, arr vecArrangement) uint32 { + var q, op2, len, op uint32 + + switch nregs { + case 1: + // tbl: single-register + len = 0b00 + case 2: + // tbl2: 2-register table + len = 0b01 + default: + panic(fmt.Sprintf("unsupported number or registers %d", nregs)) + } + switch arr { + case vecArrangement8B: + q = 0b0 + case vecArrangement16B: + q = 0b1 + default: + panic("unsupported arrangement: " + arr.String()) + } + + return q<<30 | 0b001110<<24 | op2<<22 | rm<<16 | len<<13 | op<<12 | rn<<5 | rd +} + +// encodeVecMisc encodes as Data Processing (Advanced SIMD two-register miscellaneous) depending on vecOp in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp +func encodeAdvancedSIMDTwoMisc(op vecOp, rd, rn uint32, arr vecArrangement) uint32 { + var q, u, size, opcode uint32 + switch op { + case vecOpCnt: + opcode = 0b00101 + switch arr { + case vecArrangement8B: + q, size = 0b0, 0b00 + case vecArrangement16B: + q, size = 0b1, 0b00 + default: + panic("unsupported arrangement: " + arr.String()) + } + case vecOpCmeq0: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + opcode = 0b01001 + size, q = arrToSizeQEncoded(arr) + case vecOpNot: + u = 1 + opcode = 0b00101 + switch arr { + case vecArrangement8B: + q, size = 0b0, 0b00 + case vecArrangement16B: + q, size = 0b1, 0b00 + default: + panic("unsupported arrangement: " + arr.String()) + } + case vecOpAbs: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + opcode = 0b01011 + u = 0b0 + size, q = arrToSizeQEncoded(arr) + case vecOpNeg: + if arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + opcode = 0b01011 + u = 0b1 + size, q = arrToSizeQEncoded(arr) + case vecOpFabs: + if arr < vecArrangement2S || arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + opcode = 0b01111 + u = 0b0 + size, q = arrToSizeQEncoded(arr) + case vecOpFneg: + if arr < vecArrangement2S || arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + opcode = 0b01111 + u = 0b1 + size, q = arrToSizeQEncoded(arr) + case vecOpFrintm: + u = 0b0 + opcode = 0b11001 + switch arr { + case vecArrangement2S: + q, size = 0b0, 0b00 + case vecArrangement4S: + q, size = 0b1, 0b00 + case vecArrangement2D: + q, size = 0b1, 0b01 + default: + panic("unsupported arrangement: " + arr.String()) + } + case vecOpFrintn: + u = 0b0 + opcode = 0b11000 + switch arr { + case vecArrangement2S: + q, size = 0b0, 0b00 + case vecArrangement4S: + q, size = 0b1, 0b00 + case vecArrangement2D: + q, size = 0b1, 0b01 + default: + panic("unsupported 
arrangement: " + arr.String()) + } + case vecOpFrintp: + u = 0b0 + opcode = 0b11000 + if arr < vecArrangement2S || arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q = arrToSizeQEncoded(arr) + case vecOpFrintz: + u = 0b0 + opcode = 0b11001 + if arr < vecArrangement2S || arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + size, q = arrToSizeQEncoded(arr) + case vecOpFsqrt: + if arr < vecArrangement2S || arr == vecArrangement1D { + panic("unsupported arrangement: " + arr.String()) + } + opcode = 0b11111 + u = 0b1 + size, q = arrToSizeQEncoded(arr) + case vecOpFcvtl: + opcode = 0b10111 + u = 0b0 + switch arr { + case vecArrangement2S: + size, q = 0b01, 0b0 + case vecArrangement4H: + size, q = 0b00, 0b0 + default: + panic("unsupported arrangement: " + arr.String()) + } + case vecOpFcvtn: + opcode = 0b10110 + u = 0b0 + switch arr { + case vecArrangement2S: + size, q = 0b01, 0b0 + case vecArrangement4H: + size, q = 0b00, 0b0 + default: + panic("unsupported arrangement: " + arr.String()) + } + case vecOpFcvtzs: + opcode = 0b11011 + u = 0b0 + switch arr { + case vecArrangement2S: + q, size = 0b0, 0b10 + case vecArrangement4S: + q, size = 0b1, 0b10 + case vecArrangement2D: + q, size = 0b1, 0b11 + default: + panic("unsupported arrangement: " + arr.String()) + } + case vecOpFcvtzu: + opcode = 0b11011 + u = 0b1 + switch arr { + case vecArrangement2S: + q, size = 0b0, 0b10 + case vecArrangement4S: + q, size = 0b1, 0b10 + case vecArrangement2D: + q, size = 0b1, 0b11 + default: + panic("unsupported arrangement: " + arr.String()) + } + case vecOpScvtf: + opcode = 0b11101 + u = 0b0 + switch arr { + case vecArrangement4S: + q, size = 0b1, 0b00 + case vecArrangement2S: + q, size = 0b0, 0b00 + case vecArrangement2D: + q, size = 0b1, 0b01 + default: + panic("unsupported arrangement: " + arr.String()) + } + case vecOpUcvtf: + opcode = 0b11101 + u = 0b1 + switch arr { + case vecArrangement4S: + q, size = 0b1, 0b00 + case vecArrangement2S: + q, size = 0b0, 0b00 + case vecArrangement2D: + q, size = 0b1, 0b01 + default: + panic("unsupported arrangement: " + arr.String()) + } + case vecOpSqxtn: + // When q == 1 it encodes sqxtn2 (operates on upper 64 bits). + opcode = 0b10100 + u = 0b0 + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q = arrToSizeQEncoded(arr) + case vecOpUqxtn: + // When q == 1 it encodes uqxtn2 (operates on upper 64 bits). + opcode = 0b10100 + u = 0b1 + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q = arrToSizeQEncoded(arr) + case vecOpSqxtun: + // When q == 1 it encodes sqxtun2 (operates on upper 64 bits). 
+ opcode = 0b10010 // 0b10100 + u = 0b1 + if arr > vecArrangement4S { + panic("unsupported arrangement: " + arr.String()) + } + size, q = arrToSizeQEncoded(arr) + case vecOpRev64: + opcode = 0b00000 + size, q = arrToSizeQEncoded(arr) + case vecOpXtn: + u = 0b0 + opcode = 0b10010 + size, q = arrToSizeQEncoded(arr) + case vecOpShll: + u = 0b1 + opcode = 0b10011 + switch arr { + case vecArrangement8B: + q, size = 0b0, 0b00 + case vecArrangement4H: + q, size = 0b0, 0b01 + case vecArrangement2S: + q, size = 0b0, 0b10 + default: + panic("unsupported arrangement: " + arr.String()) + } + default: + panic("unsupported or illegal vecOp: " + op.String()) + } + return q<<30 | u<<29 | 0b01110<<24 | size<<22 | 0b10000<<17 | opcode<<12 | 0b10<<10 | rn<<5 | rd +} + +// brTableSequenceOffsetTableBegin is the offset inside the brTableSequence where the table begins after 4 instructions +const brTableSequenceOffsetTableBegin = 16 + +func encodeBrTableSequence(c backend.Compiler, index regalloc.VReg, targets []uint32) { + tmpRegNumber := regNumberInEncoding[tmp] + indexNumber := regNumberInEncoding[index.RealReg()] + + // adr tmpReg, PC+16 (PC+16 is the address of the first label offset) + // ldrsw index, [tmpReg, index, UXTW 2] ;; index = int64(*(tmpReg + index*8)) + // add tmpReg, tmpReg, index + // br tmpReg + // [offset_to_l1, offset_to_l2, ..., offset_to_lN] + c.Emit4Bytes(encodeAdr(tmpRegNumber, 16)) + c.Emit4Bytes(encodeLoadOrStore(sLoad32, indexNumber, + addressMode{kind: addressModeKindRegScaledExtended, rn: tmpRegVReg, rm: index, extOp: extendOpUXTW}, + )) + c.Emit4Bytes(encodeAluRRR(aluOpAdd, tmpRegNumber, tmpRegNumber, indexNumber, true, false)) + c.Emit4Bytes(encodeUnconditionalBranchReg(tmpRegNumber, false)) + + // Offsets are resolved in ResolveRelativeAddress phase. + for _, offset := range targets { + if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable { + // Inlined offset tables cannot be disassembled properly, so pad dummy instructions to make the debugging easier. + c.Emit4Bytes(dummyInstruction) + } else { + c.Emit4Bytes(offset) + } + } +} + +// encodeExitSequence matches the implementation detail of functionABI.emitGoEntryPreamble. +func encodeExitSequence(c backend.Compiler, ctxReg regalloc.VReg) { + // Restore the FP, SP and LR, and return to the Go code: + // ldr lr, [ctxReg, #GoReturnAddress] + // ldr fp, [ctxReg, #OriginalFramePointer] + // ldr tmp, [ctxReg, #OriginalStackPointer] + // mov sp, tmp ;; sp cannot be str'ed directly. + // ret ;; --> return to the Go code + + var ctxEvicted bool + if ctx := ctxReg.RealReg(); ctx == fp || ctx == lr { + // In order to avoid overwriting the context register, we move ctxReg to tmp. 
+ c.Emit4Bytes(encodeMov64(regNumberInEncoding[tmp], regNumberInEncoding[ctx], false, false)) + ctxReg = tmpRegVReg + ctxEvicted = true + } + + restoreLr := encodeLoadOrStore( + uLoad64, + regNumberInEncoding[lr], + addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: ctxReg, + imm: wazevoapi.ExecutionContextOffsetGoReturnAddress.I64(), + }, + ) + + restoreFp := encodeLoadOrStore( + uLoad64, + regNumberInEncoding[fp], + addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: ctxReg, + imm: wazevoapi.ExecutionContextOffsetOriginalFramePointer.I64(), + }, + ) + + restoreSpToTmp := encodeLoadOrStore( + uLoad64, + regNumberInEncoding[tmp], + addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: ctxReg, + imm: wazevoapi.ExecutionContextOffsetOriginalStackPointer.I64(), + }, + ) + + movTmpToSp := encodeAddSubtractImmediate(0b100, 0, 0, + regNumberInEncoding[tmp], regNumberInEncoding[sp]) + + c.Emit4Bytes(restoreFp) + c.Emit4Bytes(restoreLr) + c.Emit4Bytes(restoreSpToTmp) + c.Emit4Bytes(movTmpToSp) + c.Emit4Bytes(encodeRet()) + if !ctxEvicted { + // In order to have the fixed-length exit sequence, we need to padd the binary. + // Since this will never be reached, we insert a dummy instruction. + c.Emit4Bytes(dummyInstruction) + } +} + +func encodeRet() uint32 { + // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/RET--Return-from-subroutine-?lang=en + return 0b1101011001011111<<16 | regNumberInEncoding[lr]<<5 +} + +func encodeAtomicRmw(op atomicRmwOp, rs, rt, rn uint32, size uint32) uint32 { + var _31to21, _15to10, sz uint32 + + switch size { + case 8: + sz = 0b11 + case 4: + sz = 0b10 + case 2: + sz = 0b01 + case 1: + sz = 0b00 + } + + _31to21 = 0b00111000_111 | sz<<9 + + switch op { + case atomicRmwOpAdd: + _15to10 = 0b000000 + case atomicRmwOpClr: + _15to10 = 0b000100 + case atomicRmwOpSet: + _15to10 = 0b001100 + case atomicRmwOpEor: + _15to10 = 0b001000 + case atomicRmwOpSwp: + _15to10 = 0b100000 + } + + return _31to21<<21 | rs<<16 | _15to10<<10 | rn<<5 | rt +} + +func encodeAtomicCas(rs, rt, rn uint32, size uint32) uint32 { + var _31to21, _15to10, sz uint32 + + switch size { + case 8: + sz = 0b11 + case 4: + sz = 0b10 + case 2: + sz = 0b01 + case 1: + sz = 0b00 + } + + _31to21 = 0b00001000_111 | sz<<9 + _15to10 = 0b111111 + + return _31to21<<21 | rs<<16 | _15to10<<10 | rn<<5 | rt +} + +func encodeAtomicLoadStore(rn, rt, size, l uint32) uint32 { + var _31to21, _20to16, _15to10, sz uint32 + + switch size { + case 8: + sz = 0b11 + case 4: + sz = 0b10 + case 2: + sz = 0b01 + case 1: + sz = 0b00 + } + + _31to21 = 0b00001000_100 | sz<<9 | l<<1 + _20to16 = 0b11111 + _15to10 = 0b111111 + + return _31to21<<21 | _20to16<<16 | _15to10<<10 | rn<<5 | rt +} + +func encodeDMB() uint32 { + return 0b11010101000000110011101110111111 +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go new file mode 100644 index 000000000..698b382d4 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_constant.go @@ -0,0 +1,301 @@ +package arm64 + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// lowerConstant allocates a new VReg and inserts the instruction to load the constant value. 
+func (m *machine) lowerConstant(instr *ssa.Instruction) (vr regalloc.VReg) { + val := instr.Return() + valType := val.Type() + + vr = m.compiler.AllocateVReg(valType) + v := instr.ConstantVal() + m.insertLoadConstant(v, valType, vr) + return +} + +// InsertLoadConstantBlockArg implements backend.Machine. +func (m *machine) InsertLoadConstantBlockArg(instr *ssa.Instruction, vr regalloc.VReg) { + val := instr.Return() + valType := val.Type() + v := instr.ConstantVal() + load := m.allocateInstr() + load.asLoadConstBlockArg(v, valType, vr) + m.insert(load) +} + +func (m *machine) lowerLoadConstantBlockArgAfterRegAlloc(i *instruction) { + v, typ, dst := i.loadConstBlockArgData() + m.insertLoadConstant(v, typ, dst) +} + +func (m *machine) insertLoadConstant(v uint64, valType ssa.Type, vr regalloc.VReg) { + if valType.Bits() < 64 { // Clear the redundant bits just in case it's unexpectedly sign-extended, etc. + v = v & ((1 << valType.Bits()) - 1) + } + + switch valType { + case ssa.TypeF32: + loadF := m.allocateInstr() + loadF.asLoadFpuConst32(vr, v) + m.insert(loadF) + case ssa.TypeF64: + loadF := m.allocateInstr() + loadF.asLoadFpuConst64(vr, v) + m.insert(loadF) + case ssa.TypeI32: + if v == 0 { + m.InsertMove(vr, xzrVReg, ssa.TypeI32) + } else { + m.lowerConstantI32(vr, int32(v)) + } + case ssa.TypeI64: + if v == 0 { + m.InsertMove(vr, xzrVReg, ssa.TypeI64) + } else { + m.lowerConstantI64(vr, int64(v)) + } + default: + panic("TODO") + } +} + +// The following logics are based on the old asm/arm64 package. +// https://github.com/tetratelabs/wazero/blob/39f2ff23a6d609e10c82b9cc0b981f6de5b87a9c/internal/asm/arm64/impl.go + +func (m *machine) lowerConstantI32(dst regalloc.VReg, c int32) { + // Following the logic here: + // https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1637 + ic := int64(uint32(c)) + if ic >= 0 && (ic <= 0xfff || (ic&0xfff) == 0 && (uint64(ic>>12) <= 0xfff)) { + if isBitMaskImmediate(uint64(c), false) { + m.lowerConstViaBitMaskImmediate(uint64(uint32(c)), dst, false) + return + } + } + + if t := const16bitAligned(int64(uint32(c))); t >= 0 { + // If the const can fit within 16-bit alignment, for example, 0xffff, 0xffff_0000 or 0xffff_0000_0000_0000 + // We could load it into temporary with movk. + m.insertMOVZ(dst, uint64(uint32(c)>>(16*t)), t, false) + } else if t := const16bitAligned(int64(^c)); t >= 0 { + // Also, if the inverse of the const can fit within 16-bit range, do the same ^^. + m.insertMOVN(dst, uint64(^c>>(16*t)), t, false) + } else if isBitMaskImmediate(uint64(uint32(c)), false) { + m.lowerConstViaBitMaskImmediate(uint64(c), dst, false) + } else { + // Otherwise, we use MOVZ and MOVK to load it. + c16 := uint16(c) + m.insertMOVZ(dst, uint64(c16), 0, false) + c16 = uint16(uint32(c) >> 16) + m.insertMOVK(dst, uint64(c16), 1, false) + } +} + +func (m *machine) lowerConstantI64(dst regalloc.VReg, c int64) { + // Following the logic here: + // https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1798-L1852 + if c >= 0 && (c <= 0xfff || (c&0xfff) == 0 && (uint64(c>>12) <= 0xfff)) { + if isBitMaskImmediate(uint64(c), true) { + m.lowerConstViaBitMaskImmediate(uint64(c), dst, true) + return + } + } + + if t := const16bitAligned(c); t >= 0 { + // If the const can fit within 16-bit alignment, for example, 0xffff, 0xffff_0000 or 0xffff_0000_0000_0000 + // We could load it into temporary with movk. 
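lowerConstantI32 above (and lowerConstantI64 just below) emit a single MOVZ when every set bit of the constant sits in one 16-bit halfword, and a single MOVN when the same holds for its bitwise inverse; const16bitAligned, defined further down in this file, reports which halfword that is. The following standalone sketch (illustration only) reimplements that check to show where a few sample constants land.

package main

import "fmt"

// halfwordSlot returns s/16 when v has set bits only inside the 16-bit halfword
// starting at bit s (s = 0, 16, 32 or 48), and -1 otherwise. It mirrors the
// const16bitAligned helper defined later in the file.
func halfwordSlot(v uint64) int {
	for s := 0; s < 64; s += 16 {
		if v&^(uint64(0xffff)<<uint(s)) == 0 {
			return s / 16
		}
	}
	return -1
}

func main() {
	fmt.Println(halfwordSlot(0xffff0000)) // 1 -> one movz #0xffff, lsl #16
	v := ^uint64(0xffff)                  // 0xffffffffffff0000
	fmt.Println(halfwordSlot(v), halfwordSlot(^v)) // -1 0 -> one movn #0xffff, lsl #0
	fmt.Println(halfwordSlot(0x00ff00ff)) // -1 -> bitmask immediate or movz+movk fallback
}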
+ m.insertMOVZ(dst, uint64(c)>>(16*t), t, true) + } else if t := const16bitAligned(^c); t >= 0 { + // Also, if the reverse of the const can fit within 16-bit range, do the same ^^. + m.insertMOVN(dst, uint64(^c)>>(16*t), t, true) + } else if isBitMaskImmediate(uint64(c), true) { + m.lowerConstViaBitMaskImmediate(uint64(c), dst, true) + } else { + m.load64bitConst(c, dst) + } +} + +func (m *machine) lowerConstViaBitMaskImmediate(c uint64, dst regalloc.VReg, b64 bool) { + instr := m.allocateInstr() + instr.asALUBitmaskImm(aluOpOrr, dst, xzrVReg, c, b64) + m.insert(instr) +} + +// isBitMaskImmediate determines if the value can be encoded as "bitmask immediate". +// +// Such an immediate is a 32-bit or 64-bit pattern viewed as a vector of identical elements of size e = 2, 4, 8, 16, 32, or 64 bits. +// Each element contains the same sub-pattern: a single run of 1 to e-1 non-zero bits, rotated by 0 to e-1 bits. +// +// See https://developer.arm.com/documentation/dui0802/b/A64-General-Instructions/MOV--bitmask-immediate- +func isBitMaskImmediate(x uint64, _64 bool) bool { + // All zeros and ones are not "bitmask immediate" by definition. + if x == 0 || (_64 && x == 0xffff_ffff_ffff_ffff) || (!_64 && x == 0xffff_ffff) { + return false + } + + switch { + case x != x>>32|x<<32: + // e = 64 + case x != x>>16|x<<48: + // e = 32 (x == x>>32|x<<32). + // e.g. 0x00ff_ff00_00ff_ff00 + x = uint64(int32(x)) + case x != x>>8|x<<56: + // e = 16 (x == x>>16|x<<48). + // e.g. 0x00ff_00ff_00ff_00ff + x = uint64(int16(x)) + case x != x>>4|x<<60: + // e = 8 (x == x>>8|x<<56). + // e.g. 0x0f0f_0f0f_0f0f_0f0f + x = uint64(int8(x)) + default: + // e = 4 or 2. + return true + } + return sequenceOfSetbits(x) || sequenceOfSetbits(^x) +} + +// sequenceOfSetbits returns true if the number's binary representation is the sequence set bit (1). +// For example: 0b1110 -> true, 0b1010 -> false +func sequenceOfSetbits(x uint64) bool { + y := getLowestBit(x) + // If x is a sequence of set bit, this should results in the number + // with only one set bit (i.e. power of two). + y += x + return (y-1)&y == 0 +} + +func getLowestBit(x uint64) uint64 { + return x & (^x + 1) +} + +// const16bitAligned check if the value is on the 16-bit alignment. +// If so, returns the shift num divided by 16, and otherwise -1. +func const16bitAligned(v int64) (ret int) { + ret = -1 + for s := 0; s < 64; s += 16 { + if (uint64(v) &^ (uint64(0xffff) << uint(s))) == 0 { + ret = s / 16 + break + } + } + return +} + +// load64bitConst loads a 64-bit constant into the register, following the same logic to decide how to load large 64-bit +// consts as in the Go assembler. +// +// See https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L6632-L6759 +func (m *machine) load64bitConst(c int64, dst regalloc.VReg) { + var bits [4]uint64 + var zeros, negs int + for i := 0; i < 4; i++ { + bits[i] = uint64(c) >> uint(i*16) & 0xffff + if v := bits[i]; v == 0 { + zeros++ + } else if v == 0xffff { + negs++ + } + } + + if zeros == 3 { + // one MOVZ instruction. + for i, v := range bits { + if v != 0 { + m.insertMOVZ(dst, v, i, true) + } + } + } else if negs == 3 { + // one MOVN instruction. + for i, v := range bits { + if v != 0xffff { + v = ^v + m.insertMOVN(dst, v, i, true) + } + } + } else if zeros == 2 { + // one MOVZ then one OVK. + var movz bool + for i, v := range bits { + if !movz && v != 0 { // MOVZ. 
+ m.insertMOVZ(dst, v, i, true) + movz = true + } else if v != 0 { + m.insertMOVK(dst, v, i, true) + } + } + + } else if negs == 2 { + // one MOVN then one or two MOVK. + var movn bool + for i, v := range bits { // Emit MOVN. + if !movn && v != 0xffff { + v = ^v + // https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN + m.insertMOVN(dst, v, i, true) + movn = true + } else if v != 0xffff { + m.insertMOVK(dst, v, i, true) + } + } + + } else if zeros == 1 { + // one MOVZ then two MOVK. + var movz bool + for i, v := range bits { + if !movz && v != 0 { // MOVZ. + m.insertMOVZ(dst, v, i, true) + movz = true + } else if v != 0 { + m.insertMOVK(dst, v, i, true) + } + } + + } else if negs == 1 { + // one MOVN then two MOVK. + var movn bool + for i, v := range bits { // Emit MOVN. + if !movn && v != 0xffff { + v = ^v + // https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN + m.insertMOVN(dst, v, i, true) + movn = true + } else if v != 0xffff { + m.insertMOVK(dst, v, i, true) + } + } + + } else { + // one MOVZ then up to three MOVK. + var movz bool + for i, v := range bits { + if !movz && v != 0 { // MOVZ. + m.insertMOVZ(dst, v, i, true) + movz = true + } else if v != 0 { + m.insertMOVK(dst, v, i, true) + } + } + } +} + +func (m *machine) insertMOVZ(dst regalloc.VReg, v uint64, shift int, dst64 bool) { + instr := m.allocateInstr() + instr.asMOVZ(dst, v, uint64(shift), dst64) + m.insert(instr) +} + +func (m *machine) insertMOVK(dst regalloc.VReg, v uint64, shift int, dst64 bool) { + instr := m.allocateInstr() + instr.asMOVK(dst, v, uint64(shift), dst64) + m.insert(instr) +} + +func (m *machine) insertMOVN(dst regalloc.VReg, v uint64, shift int, dst64 bool) { + instr := m.allocateInstr() + instr.asMOVN(dst, v, uint64(shift), dst64) + m.insert(instr) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go new file mode 100644 index 000000000..2bb234e8c --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr.go @@ -0,0 +1,2221 @@ +package arm64 + +// Files prefixed as lower_instr** do the instruction selection, meaning that lowering SSA level instructions +// into machine specific instructions. +// +// Importantly, what the lower** functions does includes tree-matching; find the pattern from the given instruction tree, +// and merge the multiple instructions if possible. It can be considered as "N:1" instruction selection. + +import ( + "fmt" + "math" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// LowerSingleBranch implements backend.Machine. 
+func (m *machine) LowerSingleBranch(br *ssa.Instruction) { + ectx := m.executableContext + switch br.Opcode() { + case ssa.OpcodeJump: + _, _, targetBlk := br.BranchData() + if br.IsFallthroughJump() { + return + } + b := m.allocateInstr() + target := ectx.GetOrAllocateSSABlockLabel(targetBlk) + if target == labelReturn { + b.asRet() + } else { + b.asBr(target) + } + m.insert(b) + case ssa.OpcodeBrTable: + m.lowerBrTable(br) + default: + panic("BUG: unexpected branch opcode" + br.Opcode().String()) + } +} + +func (m *machine) lowerBrTable(i *ssa.Instruction) { + index, targets := i.BrTableData() + indexOperand := m.getOperand_NR(m.compiler.ValueDefinition(index), extModeNone) + + // Firstly, we have to do the bounds check of the index, and + // set it to the default target (sitting at the end of the list) if it's out of bounds. + + // mov maxIndexReg #maximum_index + // subs wzr, index, maxIndexReg + // csel adjustedIndex, maxIndexReg, index, hs ;; if index is higher or equal than maxIndexReg. + maxIndexReg := m.compiler.AllocateVReg(ssa.TypeI32) + m.lowerConstantI32(maxIndexReg, int32(len(targets)-1)) + subs := m.allocateInstr() + subs.asALU(aluOpSubS, operandNR(xzrVReg), indexOperand, operandNR(maxIndexReg), false) + m.insert(subs) + csel := m.allocateInstr() + adjustedIndex := m.compiler.AllocateVReg(ssa.TypeI32) + csel.asCSel(operandNR(adjustedIndex), operandNR(maxIndexReg), indexOperand, hs, false) + m.insert(csel) + + brSequence := m.allocateInstr() + + tableIndex := m.addJmpTableTarget(targets) + brSequence.asBrTableSequence(adjustedIndex, tableIndex, len(targets)) + m.insert(brSequence) +} + +// LowerConditionalBranch implements backend.Machine. +func (m *machine) LowerConditionalBranch(b *ssa.Instruction) { + exctx := m.executableContext + cval, args, targetBlk := b.BranchData() + if len(args) > 0 { + panic(fmt.Sprintf( + "conditional branch shouldn't have args; likely a bug in critical edge splitting: from %s to %s", + exctx.CurrentSSABlk, + targetBlk, + )) + } + + target := exctx.GetOrAllocateSSABlockLabel(targetBlk) + cvalDef := m.compiler.ValueDefinition(cval) + + switch { + case m.compiler.MatchInstr(cvalDef, ssa.OpcodeIcmp): // This case, we can use the ALU flag set by SUBS instruction. + cvalInstr := cvalDef.Instr + x, y, c := cvalInstr.IcmpData() + cc, signed := condFlagFromSSAIntegerCmpCond(c), c.Signed() + if b.Opcode() == ssa.OpcodeBrz { + cc = cc.invert() + } + + if !m.tryLowerBandToFlag(x, y) { + m.lowerIcmpToFlag(x, y, signed) + } + cbr := m.allocateInstr() + cbr.asCondBr(cc.asCond(), target, false /* ignored */) + m.insert(cbr) + cvalDef.Instr.MarkLowered() + case m.compiler.MatchInstr(cvalDef, ssa.OpcodeFcmp): // This case we can use the Fpu flag directly. 
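Back in lowerBrTable above, the SUBS+CSEL pair clamps any out-of-range index onto the last entry of the target list, which holds the default target; using the unsigned hs condition also catches negative i32 indexes, since they appear as very large unsigned values. A minimal Go sketch of the same clamp, for illustration only:

package main

import "fmt"

// clampBrTableIndex mirrors "subs wzr, index, maxIndex; csel adjusted, maxIndex, index, hs":
// any index that is >= maxIndex as an unsigned 32-bit value is replaced by maxIndex.
func clampBrTableIndex(index uint32, numTargets int) uint32 {
	maxIndex := uint32(numTargets - 1) // the default target sits at the end of the list.
	if index >= maxIndex {             // "hs" = unsigned higher-or-same.
		return maxIndex
	}
	return index
}

func main() {
	fmt.Println(clampBrTableIndex(2, 4))          // 2: in range.
	fmt.Println(clampBrTableIndex(9, 4))          // 3: out of range -> default target.
	fmt.Println(clampBrTableIndex(0xffffffff, 4)) // 3: negative i32 seen as a huge unsigned value.
}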
+ cvalInstr := cvalDef.Instr + x, y, c := cvalInstr.FcmpData() + cc := condFlagFromSSAFloatCmpCond(c) + if b.Opcode() == ssa.OpcodeBrz { + cc = cc.invert() + } + m.lowerFcmpToFlag(x, y) + cbr := m.allocateInstr() + cbr.asCondBr(cc.asCond(), target, false /* ignored */) + m.insert(cbr) + cvalDef.Instr.MarkLowered() + default: + rn := m.getOperand_NR(cvalDef, extModeNone) + var c cond + if b.Opcode() == ssa.OpcodeBrz { + c = registerAsRegZeroCond(rn.nr()) + } else { + c = registerAsRegNotZeroCond(rn.nr()) + } + cbr := m.allocateInstr() + cbr.asCondBr(c, target, false) + m.insert(cbr) + } +} + +func (m *machine) tryLowerBandToFlag(x, y ssa.Value) (ok bool) { + xx := m.compiler.ValueDefinition(x) + yy := m.compiler.ValueDefinition(y) + if xx.IsFromInstr() && xx.Instr.Constant() && xx.Instr.ConstantVal() == 0 { + if m.compiler.MatchInstr(yy, ssa.OpcodeBand) { + bandInstr := yy.Instr + m.lowerBitwiseAluOp(bandInstr, aluOpAnds, true) + ok = true + bandInstr.MarkLowered() + return + } + } + + if yy.IsFromInstr() && yy.Instr.Constant() && yy.Instr.ConstantVal() == 0 { + if m.compiler.MatchInstr(xx, ssa.OpcodeBand) { + bandInstr := xx.Instr + m.lowerBitwiseAluOp(bandInstr, aluOpAnds, true) + ok = true + bandInstr.MarkLowered() + return + } + } + return +} + +// LowerInstr implements backend.Machine. +func (m *machine) LowerInstr(instr *ssa.Instruction) { + if l := instr.SourceOffset(); l.Valid() { + info := m.allocateInstr().asEmitSourceOffsetInfo(l) + m.insert(info) + } + + switch op := instr.Opcode(); op { + case ssa.OpcodeBrz, ssa.OpcodeBrnz, ssa.OpcodeJump, ssa.OpcodeBrTable: + panic("BUG: branching instructions are handled by LowerBranches") + case ssa.OpcodeReturn: + panic("BUG: return must be handled by backend.Compiler") + case ssa.OpcodeIadd, ssa.OpcodeIsub: + m.lowerSubOrAdd(instr, op == ssa.OpcodeIadd) + case ssa.OpcodeFadd, ssa.OpcodeFsub, ssa.OpcodeFmul, ssa.OpcodeFdiv, ssa.OpcodeFmax, ssa.OpcodeFmin: + m.lowerFpuBinOp(instr) + case ssa.OpcodeIconst, ssa.OpcodeF32const, ssa.OpcodeF64const: // Constant instructions are inlined. 
+ case ssa.OpcodeExitWithCode: + execCtx, code := instr.ExitWithCodeData() + m.lowerExitWithCode(m.compiler.VRegOf(execCtx), code) + case ssa.OpcodeExitIfTrueWithCode: + execCtx, c, code := instr.ExitIfTrueWithCodeData() + m.lowerExitIfTrueWithCode(m.compiler.VRegOf(execCtx), c, code) + case ssa.OpcodeStore, ssa.OpcodeIstore8, ssa.OpcodeIstore16, ssa.OpcodeIstore32: + m.lowerStore(instr) + case ssa.OpcodeLoad: + dst := instr.Return() + ptr, offset, typ := instr.LoadData() + m.lowerLoad(ptr, offset, typ, dst) + case ssa.OpcodeVZeroExtLoad: + dst := instr.Return() + ptr, offset, typ := instr.VZeroExtLoadData() + m.lowerLoad(ptr, offset, typ, dst) + case ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32, ssa.OpcodeSload8, ssa.OpcodeSload16, ssa.OpcodeSload32: + ptr, offset, _ := instr.LoadData() + ret := m.compiler.VRegOf(instr.Return()) + m.lowerExtLoad(op, ptr, offset, ret) + case ssa.OpcodeCall, ssa.OpcodeCallIndirect: + m.lowerCall(instr) + case ssa.OpcodeIcmp: + m.lowerIcmp(instr) + case ssa.OpcodeVIcmp: + m.lowerVIcmp(instr) + case ssa.OpcodeVFcmp: + m.lowerVFcmp(instr) + case ssa.OpcodeVCeil: + m.lowerVecMisc(vecOpFrintp, instr) + case ssa.OpcodeVFloor: + m.lowerVecMisc(vecOpFrintm, instr) + case ssa.OpcodeVTrunc: + m.lowerVecMisc(vecOpFrintz, instr) + case ssa.OpcodeVNearest: + m.lowerVecMisc(vecOpFrintn, instr) + case ssa.OpcodeVMaxPseudo: + m.lowerVMinMaxPseudo(instr, true) + case ssa.OpcodeVMinPseudo: + m.lowerVMinMaxPseudo(instr, false) + case ssa.OpcodeBand: + m.lowerBitwiseAluOp(instr, aluOpAnd, false) + case ssa.OpcodeBor: + m.lowerBitwiseAluOp(instr, aluOpOrr, false) + case ssa.OpcodeBxor: + m.lowerBitwiseAluOp(instr, aluOpEor, false) + case ssa.OpcodeIshl: + m.lowerShifts(instr, extModeNone, aluOpLsl) + case ssa.OpcodeSshr: + if instr.Return().Type().Bits() == 64 { + m.lowerShifts(instr, extModeSignExtend64, aluOpAsr) + } else { + m.lowerShifts(instr, extModeSignExtend32, aluOpAsr) + } + case ssa.OpcodeUshr: + if instr.Return().Type().Bits() == 64 { + m.lowerShifts(instr, extModeZeroExtend64, aluOpLsr) + } else { + m.lowerShifts(instr, extModeZeroExtend32, aluOpLsr) + } + case ssa.OpcodeRotl: + m.lowerRotl(instr) + case ssa.OpcodeRotr: + m.lowerRotr(instr) + case ssa.OpcodeSExtend, ssa.OpcodeUExtend: + from, to, signed := instr.ExtendData() + m.lowerExtend(instr.Arg(), instr.Return(), from, to, signed) + case ssa.OpcodeFcmp: + x, y, c := instr.FcmpData() + m.lowerFcmp(x, y, instr.Return(), c) + case ssa.OpcodeImul: + x, y := instr.Arg2() + result := instr.Return() + m.lowerImul(x, y, result) + case ssa.OpcodeUndefined: + undef := m.allocateInstr() + undef.asUDF() + m.insert(undef) + case ssa.OpcodeSelect: + c, x, y := instr.SelectData() + if x.Type() == ssa.TypeV128 { + rc := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.lowerSelectVec(rc, rn, rm, rd) + } else { + m.lowerSelect(c, x, y, instr.Return()) + } + case ssa.OpcodeClz: + x := instr.Arg() + result := instr.Return() + m.lowerClz(x, result) + case ssa.OpcodeCtz: + x := instr.Arg() + result := instr.Return() + m.lowerCtz(x, result) + case ssa.OpcodePopcnt: + x := instr.Arg() + result := instr.Return() + m.lowerPopcnt(x, result) + case ssa.OpcodeFcvtToSint, ssa.OpcodeFcvtToSintSat: + x, ctx := instr.Arg2() + result := instr.Return() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := 
operandNR(m.compiler.VRegOf(result)) + ctxVReg := m.compiler.VRegOf(ctx) + m.lowerFpuToInt(rd, rn, ctxVReg, true, x.Type() == ssa.TypeF64, + result.Type().Bits() == 64, op == ssa.OpcodeFcvtToSintSat) + case ssa.OpcodeFcvtToUint, ssa.OpcodeFcvtToUintSat: + x, ctx := instr.Arg2() + result := instr.Return() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(result)) + ctxVReg := m.compiler.VRegOf(ctx) + m.lowerFpuToInt(rd, rn, ctxVReg, false, x.Type() == ssa.TypeF64, + result.Type().Bits() == 64, op == ssa.OpcodeFcvtToUintSat) + case ssa.OpcodeFcvtFromSint: + x := instr.Arg() + result := instr.Return() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(result)) + m.lowerIntToFpu(rd, rn, true, x.Type() == ssa.TypeI64, result.Type().Bits() == 64) + case ssa.OpcodeFcvtFromUint: + x := instr.Arg() + result := instr.Return() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(result)) + m.lowerIntToFpu(rd, rn, false, x.Type() == ssa.TypeI64, result.Type().Bits() == 64) + case ssa.OpcodeFdemote: + v := instr.Arg() + rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + cnt := m.allocateInstr() + cnt.asFpuRR(fpuUniOpCvt64To32, rd, rn, false) + m.insert(cnt) + case ssa.OpcodeFpromote: + v := instr.Arg() + rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + cnt := m.allocateInstr() + cnt.asFpuRR(fpuUniOpCvt32To64, rd, rn, true) + m.insert(cnt) + case ssa.OpcodeIreduce: + rn := m.getOperand_NR(m.compiler.ValueDefinition(instr.Arg()), extModeNone) + retVal := instr.Return() + rd := m.compiler.VRegOf(retVal) + + if retVal.Type() != ssa.TypeI32 { + panic("TODO?: Ireduce to non-i32") + } + mov := m.allocateInstr() + mov.asMove32(rd, rn.reg()) + m.insert(mov) + case ssa.OpcodeFneg: + m.lowerFpuUniOp(fpuUniOpNeg, instr.Arg(), instr.Return()) + case ssa.OpcodeSqrt: + m.lowerFpuUniOp(fpuUniOpSqrt, instr.Arg(), instr.Return()) + case ssa.OpcodeCeil: + m.lowerFpuUniOp(fpuUniOpRoundPlus, instr.Arg(), instr.Return()) + case ssa.OpcodeFloor: + m.lowerFpuUniOp(fpuUniOpRoundMinus, instr.Arg(), instr.Return()) + case ssa.OpcodeTrunc: + m.lowerFpuUniOp(fpuUniOpRoundZero, instr.Arg(), instr.Return()) + case ssa.OpcodeNearest: + m.lowerFpuUniOp(fpuUniOpRoundNearest, instr.Arg(), instr.Return()) + case ssa.OpcodeFabs: + m.lowerFpuUniOp(fpuUniOpAbs, instr.Arg(), instr.Return()) + case ssa.OpcodeBitcast: + m.lowerBitcast(instr) + case ssa.OpcodeFcopysign: + x, y := instr.Arg2() + m.lowerFcopysign(x, y, instr.Return()) + case ssa.OpcodeSdiv, ssa.OpcodeUdiv: + x, y, ctx := instr.Arg3() + ctxVReg := m.compiler.VRegOf(ctx) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.lowerIDiv(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSdiv) + case ssa.OpcodeSrem, ssa.OpcodeUrem: + x, y, ctx := instr.Arg3() + ctxVReg := m.compiler.VRegOf(ctx) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.lowerIRem(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSrem) + case ssa.OpcodeVconst: + result := m.compiler.VRegOf(instr.Return()) + lo, hi := 
instr.VconstData() + v := m.allocateInstr() + v.asLoadFpuConst128(result, lo, hi) + m.insert(v) + case ssa.OpcodeVbnot: + x := instr.Arg() + ins := m.allocateInstr() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + ins.asVecMisc(vecOpNot, rd, rn, vecArrangement16B) + m.insert(ins) + case ssa.OpcodeVbxor: + x, y := instr.Arg2() + m.lowerVecRRR(vecOpEOR, x, y, instr.Return(), vecArrangement16B) + case ssa.OpcodeVbor: + x, y := instr.Arg2() + m.lowerVecRRR(vecOpOrr, x, y, instr.Return(), vecArrangement16B) + case ssa.OpcodeVband: + x, y := instr.Arg2() + m.lowerVecRRR(vecOpAnd, x, y, instr.Return(), vecArrangement16B) + case ssa.OpcodeVbandnot: + x, y := instr.Arg2() + m.lowerVecRRR(vecOpBic, x, y, instr.Return(), vecArrangement16B) + case ssa.OpcodeVbitselect: + c, x, y := instr.SelectData() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + creg := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone) + tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + // creg is overwritten by BSL, so we need to move it to the result register before the instruction + // in case when it is used somewhere else. + mov := m.allocateInstr() + mov.asFpuMov128(tmp.nr(), creg.nr()) + m.insert(mov) + + ins := m.allocateInstr() + ins.asVecRRRRewrite(vecOpBsl, tmp, rn, rm, vecArrangement16B) + m.insert(ins) + + mov2 := m.allocateInstr() + rd := m.compiler.VRegOf(instr.Return()) + mov2.asFpuMov128(rd, tmp.nr()) + m.insert(mov2) + case ssa.OpcodeVanyTrue, ssa.OpcodeVallTrue: + x, lane := instr.ArgWithLane() + var arr vecArrangement + if op == ssa.OpcodeVallTrue { + arr = ssaLaneToArrangement(lane) + } + rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.lowerVcheckTrue(op, rm, rd, arr) + case ssa.OpcodeVhighBits: + x, lane := instr.ArgWithLane() + rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + arr := ssaLaneToArrangement(lane) + m.lowerVhighBits(rm, rd, arr) + case ssa.OpcodeVIadd: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpAdd, x, y, instr.Return(), arr) + case ssa.OpcodeExtIaddPairwise: + v, lane, signed := instr.ExtIaddPairwiseData() + vv := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) + + tmpLo, tmpHi := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)), operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + var widen vecOp + if signed { + widen = vecOpSshll + } else { + widen = vecOpUshll + } + + var loArr, hiArr, dstArr vecArrangement + switch lane { + case ssa.VecLaneI8x16: + loArr, hiArr, dstArr = vecArrangement8B, vecArrangement16B, vecArrangement8H + case ssa.VecLaneI16x8: + loArr, hiArr, dstArr = vecArrangement4H, vecArrangement8H, vecArrangement4S + case ssa.VecLaneI32x4: + loArr, hiArr, dstArr = vecArrangement2S, vecArrangement4S, vecArrangement2D + default: + panic("unsupported lane " + lane.String()) + } + + widenLo := m.allocateInstr().asVecShiftImm(widen, tmpLo, vv, operandShiftImm(0), loArr) + widenHi := m.allocateInstr().asVecShiftImm(widen, tmpHi, vv, operandShiftImm(0), hiArr) + addp := m.allocateInstr().asVecRRR(vecOpAddp, operandNR(m.compiler.VRegOf(instr.Return())), tmpLo, tmpHi, dstArr) + m.insert(widenLo) + m.insert(widenHi) + m.insert(addp) + + case ssa.OpcodeVSaddSat: + x, y, lane := instr.Arg2WithLane() + arr := 
ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpSqadd, x, y, instr.Return(), arr) + case ssa.OpcodeVUaddSat: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpUqadd, x, y, instr.Return(), arr) + case ssa.OpcodeVIsub: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpSub, x, y, instr.Return(), arr) + case ssa.OpcodeVSsubSat: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpSqsub, x, y, instr.Return(), arr) + case ssa.OpcodeVUsubSat: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpUqsub, x, y, instr.Return(), arr) + case ssa.OpcodeVImin: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpSmin, x, y, instr.Return(), arr) + case ssa.OpcodeVUmin: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpUmin, x, y, instr.Return(), arr) + case ssa.OpcodeVImax: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpSmax, x, y, instr.Return(), arr) + case ssa.OpcodeVUmax: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpUmax, x, y, instr.Return(), arr) + case ssa.OpcodeVAvgRound: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpUrhadd, x, y, instr.Return(), arr) + case ssa.OpcodeVImul: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.lowerVIMul(rd, rn, rm, arr) + case ssa.OpcodeVIabs: + m.lowerVecMisc(vecOpAbs, instr) + case ssa.OpcodeVIneg: + m.lowerVecMisc(vecOpNeg, instr) + case ssa.OpcodeVIpopcnt: + m.lowerVecMisc(vecOpCnt, instr) + case ssa.OpcodeVIshl, + ssa.OpcodeVSshr, ssa.OpcodeVUshr: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.lowerVShift(op, rd, rn, rm, arr) + case ssa.OpcodeVSqrt: + m.lowerVecMisc(vecOpFsqrt, instr) + case ssa.OpcodeVFabs: + m.lowerVecMisc(vecOpFabs, instr) + case ssa.OpcodeVFneg: + m.lowerVecMisc(vecOpFneg, instr) + case ssa.OpcodeVFmin: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpFmin, x, y, instr.Return(), arr) + case ssa.OpcodeVFmax: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpFmax, x, y, instr.Return(), arr) + case ssa.OpcodeVFadd: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpFadd, x, y, instr.Return(), arr) + case ssa.OpcodeVFsub: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpFsub, x, y, instr.Return(), arr) + case ssa.OpcodeVFmul: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpFmul, x, y, instr.Return(), arr) + case ssa.OpcodeSqmulRoundSat: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpSqrdmulh, x, y, instr.Return(), arr) + case ssa.OpcodeVFdiv: + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + m.lowerVecRRR(vecOpFdiv, x, y, instr.Return(), 
arr) + case ssa.OpcodeVFcvtToSintSat, ssa.OpcodeVFcvtToUintSat: + x, lane := instr.ArgWithLane() + arr := ssaLaneToArrangement(lane) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.lowerVfpuToInt(rd, rn, arr, op == ssa.OpcodeVFcvtToSintSat) + case ssa.OpcodeVFcvtFromSint, ssa.OpcodeVFcvtFromUint: + x, lane := instr.ArgWithLane() + arr := ssaLaneToArrangement(lane) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.lowerVfpuFromInt(rd, rn, arr, op == ssa.OpcodeVFcvtFromSint) + case ssa.OpcodeSwidenLow, ssa.OpcodeUwidenLow: + x, lane := instr.ArgWithLane() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + + var arr vecArrangement + switch lane { + case ssa.VecLaneI8x16: + arr = vecArrangement8B + case ssa.VecLaneI16x8: + arr = vecArrangement4H + case ssa.VecLaneI32x4: + arr = vecArrangement2S + } + + shll := m.allocateInstr() + if signed := op == ssa.OpcodeSwidenLow; signed { + shll.asVecShiftImm(vecOpSshll, rd, rn, operandShiftImm(0), arr) + } else { + shll.asVecShiftImm(vecOpUshll, rd, rn, operandShiftImm(0), arr) + } + m.insert(shll) + case ssa.OpcodeSwidenHigh, ssa.OpcodeUwidenHigh: + x, lane := instr.ArgWithLane() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + + arr := ssaLaneToArrangement(lane) + + shll := m.allocateInstr() + if signed := op == ssa.OpcodeSwidenHigh; signed { + shll.asVecShiftImm(vecOpSshll, rd, rn, operandShiftImm(0), arr) + } else { + shll.asVecShiftImm(vecOpUshll, rd, rn, operandShiftImm(0), arr) + } + m.insert(shll) + + case ssa.OpcodeSnarrow, ssa.OpcodeUnarrow: + x, y, lane := instr.Arg2WithLane() + var arr, arr2 vecArrangement + switch lane { + case ssa.VecLaneI16x8: // I16x8 + arr = vecArrangement8B + arr2 = vecArrangement16B // Implies sqxtn2. + case ssa.VecLaneI32x4: + arr = vecArrangement4H + arr2 = vecArrangement8H // Implies sqxtn2. + default: + panic("unsupported lane " + lane.String()) + } + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + + tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + loQxtn := m.allocateInstr() + hiQxtn := m.allocateInstr() + if signed := op == ssa.OpcodeSnarrow; signed { + // Narrow lanes on rn and write them into lower-half of rd. + loQxtn.asVecMisc(vecOpSqxtn, tmp, rn, arr) // low + // Narrow lanes on rm and write them into higher-half of rd. + hiQxtn.asVecMisc(vecOpSqxtn, tmp, rm, arr2) // high (sqxtn2) + } else { + // Narrow lanes on rn and write them into lower-half of rd. + loQxtn.asVecMisc(vecOpSqxtun, tmp, rn, arr) // low + // Narrow lanes on rm and write them into higher-half of rd. 
+ hiQxtn.asVecMisc(vecOpSqxtun, tmp, rm, arr2) // high (sqxtn2) + } + m.insert(loQxtn) + m.insert(hiQxtn) + + mov := m.allocateInstr() + mov.asFpuMov128(rd.nr(), tmp.nr()) + m.insert(mov) + case ssa.OpcodeFvpromoteLow: + x, lane := instr.ArgWithLane() + if lane != ssa.VecLaneF32x4 { + panic("unsupported lane type " + lane.String()) + } + ins := m.allocateInstr() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + ins.asVecMisc(vecOpFcvtl, rd, rn, vecArrangement2S) + m.insert(ins) + case ssa.OpcodeFvdemote: + x, lane := instr.ArgWithLane() + if lane != ssa.VecLaneF64x2 { + panic("unsupported lane type " + lane.String()) + } + ins := m.allocateInstr() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + ins.asVecMisc(vecOpFcvtn, rd, rn, vecArrangement2S) + m.insert(ins) + case ssa.OpcodeExtractlane: + x, index, signed, lane := instr.ExtractlaneData() + + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + + mov := m.allocateInstr() + switch lane { + case ssa.VecLaneI8x16: + mov.asMovFromVec(rd, rn, vecArrangementB, vecIndex(index), signed) + case ssa.VecLaneI16x8: + mov.asMovFromVec(rd, rn, vecArrangementH, vecIndex(index), signed) + case ssa.VecLaneI32x4: + mov.asMovFromVec(rd, rn, vecArrangementS, vecIndex(index), signed) + case ssa.VecLaneI64x2: + mov.asMovFromVec(rd, rn, vecArrangementD, vecIndex(index), signed) + case ssa.VecLaneF32x4: + mov.asVecMovElement(rd, rn, vecArrangementS, vecIndex(0), vecIndex(index)) + case ssa.VecLaneF64x2: + mov.asVecMovElement(rd, rn, vecArrangementD, vecIndex(0), vecIndex(index)) + default: + panic("unsupported lane: " + lane.String()) + } + + m.insert(mov) + + case ssa.OpcodeInsertlane: + x, y, index, lane := instr.InsertlaneData() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + // Initially mov rn to tmp. + mov1 := m.allocateInstr() + mov1.asFpuMov128(tmpReg.nr(), rn.nr()) + m.insert(mov1) + + // movToVec and vecMovElement do not clear the remaining bits to zero, + // thus, we can mov rm in-place to tmp. + mov2 := m.allocateInstr() + switch lane { + case ssa.VecLaneI8x16: + mov2.asMovToVec(tmpReg, rm, vecArrangementB, vecIndex(index)) + case ssa.VecLaneI16x8: + mov2.asMovToVec(tmpReg, rm, vecArrangementH, vecIndex(index)) + case ssa.VecLaneI32x4: + mov2.asMovToVec(tmpReg, rm, vecArrangementS, vecIndex(index)) + case ssa.VecLaneI64x2: + mov2.asMovToVec(tmpReg, rm, vecArrangementD, vecIndex(index)) + case ssa.VecLaneF32x4: + mov2.asVecMovElement(tmpReg, rm, vecArrangementS, vecIndex(index), vecIndex(0)) + case ssa.VecLaneF64x2: + mov2.asVecMovElement(tmpReg, rm, vecArrangementD, vecIndex(index), vecIndex(0)) + } + m.insert(mov2) + + // Finally mov tmp to rd. 
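Stepping back to the Snarrow/Unarrow lowering above: each sqxtn (signed) or sqxtun (signed input, unsigned result) lane saturates a wide lane to the narrower range, with the first source filling the low half of the result and the second the high half. The sketch below is not from this package; it only shows the scalar saturation rule for the signed i16-to-i8 flavour.

package main

import "fmt"

// narrowS16ToS8 saturates a signed 16-bit lane to the signed 8-bit range,
// which is what each sqxtn lane does in the Snarrow lowering above.
func narrowS16ToS8(v int16) int8 {
	switch {
	case v > 127:
		return 127
	case v < -128:
		return -128
	default:
		return int8(v)
	}
}

func main() {
	for _, v := range []int16{100, 300, -5, -4000} {
		fmt.Println(narrowS16ToS8(v)) // 100, 127, -5, -128
	}
}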
+ mov3 := m.allocateInstr() + mov3.asFpuMov128(rd.nr(), tmpReg.nr()) + m.insert(mov3) + + case ssa.OpcodeSwizzle: + x, y, lane := instr.Arg2WithLane() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + + arr := ssaLaneToArrangement(lane) + + // tbl <rd>.<arr>, { <rn>.<arr> }, <rm>.<arr> + tbl1 := m.allocateInstr() + tbl1.asVecTbl(1, rd, rn, rm, arr) + m.insert(tbl1) + + case ssa.OpcodeShuffle: + x, y, lane1, lane2 := instr.ShuffleData() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + + m.lowerShuffle(rd, rn, rm, lane1, lane2) + + case ssa.OpcodeSplat: + x, lane := instr.ArgWithLane() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + + dup := m.allocateInstr() + switch lane { + case ssa.VecLaneI8x16: + dup.asVecDup(rd, rn, vecArrangement16B) + case ssa.VecLaneI16x8: + dup.asVecDup(rd, rn, vecArrangement8H) + case ssa.VecLaneI32x4: + dup.asVecDup(rd, rn, vecArrangement4S) + case ssa.VecLaneI64x2: + dup.asVecDup(rd, rn, vecArrangement2D) + case ssa.VecLaneF32x4: + dup.asVecDupElement(rd, rn, vecArrangementS, vecIndex(0)) + case ssa.VecLaneF64x2: + dup.asVecDupElement(rd, rn, vecArrangementD, vecIndex(0)) + } + m.insert(dup) + + case ssa.OpcodeWideningPairwiseDotProductS: + x, y := instr.Arg2() + xx, yy := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone), + m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + tmp, tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)), operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + m.insert(m.allocateInstr().asVecRRR(vecOpSmull, tmp, xx, yy, vecArrangement8H)) + m.insert(m.allocateInstr().asVecRRR(vecOpSmull2, tmp2, xx, yy, vecArrangement8H)) + m.insert(m.allocateInstr().asVecRRR(vecOpAddp, tmp, tmp, tmp2, vecArrangement4S)) + + rd := operandNR(m.compiler.VRegOf(instr.Return())) + m.insert(m.allocateInstr().asFpuMov128(rd.nr(), tmp.nr())) + + case ssa.OpcodeLoadSplat: + ptr, offset, lane := instr.LoadSplatData() + m.lowerLoadSplat(ptr, offset, lane, instr.Return()) + + case ssa.OpcodeAtomicRmw: + m.lowerAtomicRmw(instr) + + case ssa.OpcodeAtomicCas: + m.lowerAtomicCas(instr) + + case ssa.OpcodeAtomicLoad: + m.lowerAtomicLoad(instr) + + case ssa.OpcodeAtomicStore: + m.lowerAtomicStore(instr) + + case ssa.OpcodeFence: + instr := m.allocateInstr() + instr.asDMB() + m.insert(instr) + + default: + panic("TODO: lowering " + op.String()) + } + m.executableContext.FlushPendingInstructions() +} + +func (m *machine) lowerShuffle(rd, rn, rm operand, lane1, lane2 uint64) { + // `tbl2` requires 2 consecutive registers, so we arbitrarily pick v29, v30. + vReg, wReg := v29VReg, v30VReg + + // Initialize v29, v30 to rn, rm. + movv := m.allocateInstr() + movv.asFpuMov128(vReg, rn.nr()) + m.insert(movv) + + movw := m.allocateInstr() + movw.asFpuMov128(wReg, rm.nr()) + m.insert(movw) + + // `lane1`, `lane2` are already encoded as two u64s with the right layout: + // lane1 := lane[7]<<56 | ... | lane[1]<<8 | lane[0] + // lane2 := lane[15]<<56 | ... | lane[9]<<8 | lane[8] + // Thus, we can use loadFpuConst128. 
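The byte layout described in the comment above can be reproduced with plain shifts; this standalone sketch (illustration only) packs sixteen byte-selector indices into the two 64-bit halves in the order lowerShuffle expects before they are handed to loadFpuConst128.

package main

import "fmt"

// packShuffleLanes packs 16 byte indices into the (lane1, lane2) pair used above:
// lane1 holds lanes[0..7] (lanes[0] in the lowest byte), lane2 holds lanes[8..15].
func packShuffleLanes(lanes [16]byte) (lane1, lane2 uint64) {
	for i := 0; i < 8; i++ {
		lane1 |= uint64(lanes[i]) << (8 * i)
		lane2 |= uint64(lanes[i+8]) << (8 * i)
	}
	return
}

func main() {
	// Identity shuffle: byte i selects byte i of the concatenated 32-byte input.
	var identity [16]byte
	for i := range identity {
		identity[i] = byte(i)
	}
	lane1, lane2 := packShuffleLanes(identity)
	fmt.Printf("0x%016x 0x%016x\n", lane1, lane2) // 0x0706050403020100 0x0f0e0d0c0b0a0908
}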
+ tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + lfc := m.allocateInstr() + lfc.asLoadFpuConst128(tmp.nr(), lane1, lane2) + m.insert(lfc) + + // tbl <rd>.16b, { <vReg>.16B, <wReg>.16b }, <tmp>.16b + tbl2 := m.allocateInstr() + tbl2.asVecTbl(2, rd, operandNR(vReg), tmp, vecArrangement16B) + m.insert(tbl2) +} + +func (m *machine) lowerVShift(op ssa.Opcode, rd, rn, rm operand, arr vecArrangement) { + var modulo byte + switch arr { + case vecArrangement16B: + modulo = 0x7 // Modulo 8. + case vecArrangement8H: + modulo = 0xf // Modulo 16. + case vecArrangement4S: + modulo = 0x1f // Modulo 32. + case vecArrangement2D: + modulo = 0x3f // Modulo 64. + default: + panic("unsupported arrangment " + arr.String()) + } + + rtmp := operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) + vtmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + and := m.allocateInstr() + and.asALUBitmaskImm(aluOpAnd, rtmp.nr(), rm.nr(), uint64(modulo), true) + m.insert(and) + + if op != ssa.OpcodeVIshl { + // Negate the amount to make this as right shift. + neg := m.allocateInstr() + neg.asALU(aluOpSub, rtmp, operandNR(xzrVReg), rtmp, true) + m.insert(neg) + } + + // Copy the shift amount into a vector register as sshl/ushl requires it to be there. + dup := m.allocateInstr() + dup.asVecDup(vtmp, rtmp, arr) + m.insert(dup) + + if op == ssa.OpcodeVIshl || op == ssa.OpcodeVSshr { + sshl := m.allocateInstr() + sshl.asVecRRR(vecOpSshl, rd, rn, vtmp, arr) + m.insert(sshl) + } else { + ushl := m.allocateInstr() + ushl.asVecRRR(vecOpUshl, rd, rn, vtmp, arr) + m.insert(ushl) + } +} + +func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangement) { + tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + // Special case VallTrue for i64x2. + if op == ssa.OpcodeVallTrue && arr == vecArrangement2D { + // cmeq v3?.2d, v2?.2d, #0 + // addp v3?.2d, v3?.2d, v3?.2d + // fcmp v3?, v3? + // cset dst, eq + + ins := m.allocateInstr() + ins.asVecMisc(vecOpCmeq0, tmp, rm, vecArrangement2D) + m.insert(ins) + + addp := m.allocateInstr() + addp.asVecRRR(vecOpAddp, tmp, tmp, tmp, vecArrangement2D) + m.insert(addp) + + fcmp := m.allocateInstr() + fcmp.asFpuCmp(tmp, tmp, true) + m.insert(fcmp) + + cset := m.allocateInstr() + cset.asCSet(rd.nr(), false, eq) + m.insert(cset) + + return + } + + // Create a scalar value with umaxp or uminv, then compare it against zero. + ins := m.allocateInstr() + if op == ssa.OpcodeVanyTrue { + // umaxp v4?.16b, v2?.16b, v2?.16b + ins.asVecRRR(vecOpUmaxp, tmp, rm, rm, vecArrangement16B) + } else { + // uminv d4?, v2?.4s + ins.asVecLanes(vecOpUminv, tmp, rm, arr) + } + m.insert(ins) + + // mov x3?, v4?.d[0] + // ccmp x3?, #0x0, #0x0, al + // cset x3?, ne + // mov x0, x3? + + movv := m.allocateInstr() + movv.asMovFromVec(rd, tmp, vecArrangementD, vecIndex(0), false) + m.insert(movv) + + fc := m.allocateInstr() + fc.asCCmpImm(rd, uint64(0), al, 0, true) + m.insert(fc) + + cset := m.allocateInstr() + cset.asCSet(rd.nr(), false, ne) + m.insert(cset) +} + +func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) { + r0 := operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) + v0 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + v1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + switch arr { + case vecArrangement16B: + // sshr v6?.16b, v2?.16b, #7 + // movz x4?, #0x201, lsl 0 + // movk x4?, #0x804, lsl 16 + // movk x4?, #0x2010, lsl 32 + // movk x4?, #0x8040, lsl 48 + // dup v5?.2d, x4? 
+ // and v6?.16b, v6?.16b, v5?.16b + // ext v5?.16b, v6?.16b, v6?.16b, #8 + // zip1 v5?.16b, v6?.16b, v5?.16b + // addv s5?, v5?.8h + // umov s3?, v5?.h[0] + + // Right arithmetic shift on the original vector and store the result into v1. So we have: + // v1[i] = 0xff if vi<0, 0 otherwise. + sshr := m.allocateInstr() + sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(7), vecArrangement16B) + m.insert(sshr) + + // Load the bit mask into r0. + m.insertMOVZ(r0.nr(), 0x0201, 0, true) + m.insertMOVK(r0.nr(), 0x0804, 1, true) + m.insertMOVK(r0.nr(), 0x2010, 2, true) + m.insertMOVK(r0.nr(), 0x8040, 3, true) + + // dup r0 to v0. + dup := m.allocateInstr() + dup.asVecDup(v0, r0, vecArrangement2D) + m.insert(dup) + + // Lane-wise logical AND with the bit mask, meaning that we have + // v[i] = (1 << i) if vi<0, 0 otherwise. + // + // Below, we use the following notation: + // wi := (1 << i) if vi<0, 0 otherwise. + and := m.allocateInstr() + and.asVecRRR(vecOpAnd, v1, v1, v0, vecArrangement16B) + m.insert(and) + + // Swap the lower and higher 8 byte elements, and write it into v0, meaning that we have + // v0[i] = w(i+8) if i < 8, w(i-8) otherwise. + ext := m.allocateInstr() + ext.asVecExtract(v0, v1, v1, vecArrangement16B, uint32(8)) + m.insert(ext) + + // v = [w0, w8, ..., w7, w15] + zip1 := m.allocateInstr() + zip1.asVecPermute(vecOpZip1, v0, v1, v0, vecArrangement16B) + m.insert(zip1) + + // v.h[0] = w0 + ... + w15 + addv := m.allocateInstr() + addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H) + m.insert(addv) + + // Extract the v.h[0] as the result. + movfv := m.allocateInstr() + movfv.asMovFromVec(rd, v0, vecArrangementH, vecIndex(0), false) + m.insert(movfv) + case vecArrangement8H: + // sshr v6?.8h, v2?.8h, #15 + // movz x4?, #0x1, lsl 0 + // movk x4?, #0x2, lsl 16 + // movk x4?, #0x4, lsl 32 + // movk x4?, #0x8, lsl 48 + // dup v5?.2d, x4? + // lsl x4?, x4?, 0x4 + // ins v5?.d[1], x4? + // and v5?.16b, v6?.16b, v5?.16b + // addv s5?, v5?.8h + // umov s3?, v5?.h[0] + + // Right arithmetic shift on the original vector and store the result into v1. So we have: + // v[i] = 0xffff if vi<0, 0 otherwise. + sshr := m.allocateInstr() + sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(15), vecArrangement8H) + m.insert(sshr) + + // Load the bit mask into r0. + m.lowerConstantI64(r0.nr(), 0x0008000400020001) + + // dup r0 to vector v0. + dup := m.allocateInstr() + dup.asVecDup(v0, r0, vecArrangement2D) + m.insert(dup) + + lsl := m.allocateInstr() + lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(4), true) + m.insert(lsl) + + movv := m.allocateInstr() + movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1)) + m.insert(movv) + + // Lane-wise logical AND with the bitmask, meaning that we have + // v[i] = (1 << i) if vi<0, 0 otherwise for i=0..3 + // = (1 << (i+4)) if vi<0, 0 otherwise for i=3..7 + and := m.allocateInstr() + and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B) + m.insert(and) + + addv := m.allocateInstr() + addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H) + m.insert(addv) + + movfv := m.allocateInstr() + movfv.asMovFromVec(rd, v0, vecArrangementH, vecIndex(0), false) + m.insert(movfv) + case vecArrangement4S: + // sshr v6?.8h, v2?.8h, #15 + // movz x4?, #0x1, lsl 0 + // movk x4?, #0x2, lsl 16 + // movk x4?, #0x4, lsl 32 + // movk x4?, #0x8, lsl 48 + // dup v5?.2d, x4? + // lsl x4?, x4?, 0x4 + // ins v5?.d[1], x4? + // and v5?.16b, v6?.16b, v5?.16b + // addv s5?, v5?.8h + // umov s3?, v5?.h[0] + + // Right arithmetic shift on the original vector and store the result into v1. 
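lowerVhighBits implements the Wasm bitmask family: it collapses the sign bit of every lane into one scalar. The MOVZ/MOVK-built weights used above are simply 1<<laneIndex, so the whole sshr/and/addv dance computes the same result as the scalar loop below (a sketch for the i8x16 case, not code from this package).

package main

import "fmt"

// i8x16BitMask collects the sign bit of each of the 16 byte lanes into bit i of the
// result, which is what the sshr/and/addv sequence above computes on the vector side.
func i8x16BitMask(lanes [16]int8) (mask uint32) {
	for i, v := range lanes {
		if v < 0 {
			mask |= 1 << uint(i)
		}
	}
	return
}

func main() {
	var lanes [16]int8
	lanes[0], lanes[3], lanes[15] = -1, -128, -7
	fmt.Printf("0b%016b\n", i8x16BitMask(lanes)) // bits 0, 3 and 15 set: 0b1000000000001001
}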
So we have: + // v[i] = 0xffffffff if vi<0, 0 otherwise. + sshr := m.allocateInstr() + sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(31), vecArrangement4S) + m.insert(sshr) + + // Load the bit mask into r0. + m.lowerConstantI64(r0.nr(), 0x0000000200000001) + + // dup r0 to vector v0. + dup := m.allocateInstr() + dup.asVecDup(v0, r0, vecArrangement2D) + m.insert(dup) + + lsl := m.allocateInstr() + lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(2), true) + m.insert(lsl) + + movv := m.allocateInstr() + movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1)) + m.insert(movv) + + // Lane-wise logical AND with the bitmask, meaning that we have + // v[i] = (1 << i) if vi<0, 0 otherwise for i in [0, 1] + // = (1 << (i+4)) if vi<0, 0 otherwise for i in [2, 3] + and := m.allocateInstr() + and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B) + m.insert(and) + + addv := m.allocateInstr() + addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement4S) + m.insert(addv) + + movfv := m.allocateInstr() + movfv.asMovFromVec(rd, v0, vecArrangementS, vecIndex(0), false) + m.insert(movfv) + case vecArrangement2D: + // mov d3?, v2?.d[0] + // mov x4?, v2?.d[1] + // lsr x4?, x4?, 0x3f + // lsr d3?, d3?, 0x3f + // add s3?, s3?, w4?, lsl #1 + + // Move the lower 64-bit int into result. + movv0 := m.allocateInstr() + movv0.asMovFromVec(rd, rm, vecArrangementD, vecIndex(0), false) + m.insert(movv0) + + // Move the higher 64-bit int into r0. + movv1 := m.allocateInstr() + movv1.asMovFromVec(r0, rm, vecArrangementD, vecIndex(1), false) + m.insert(movv1) + + // Move the sign bit into the least significant bit. + lsr1 := m.allocateInstr() + lsr1.asALUShift(aluOpLsr, r0, r0, operandShiftImm(63), true) + m.insert(lsr1) + + lsr2 := m.allocateInstr() + lsr2.asALUShift(aluOpLsr, rd, rd, operandShiftImm(63), true) + m.insert(lsr2) + + // rd = (r0<<1) | rd + lsl := m.allocateInstr() + lsl.asALU(aluOpAdd, rd, rd, operandSR(r0.nr(), 1, shiftOpLSL), false) + m.insert(lsl) + default: + panic("Unsupported " + arr.String()) + } +} + +func (m *machine) lowerVecMisc(op vecOp, instr *ssa.Instruction) { + x, lane := instr.ArgWithLane() + arr := ssaLaneToArrangement(lane) + ins := m.allocateInstr() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + ins.asVecMisc(op, rd, rn, arr) + m.insert(ins) +} + +func (m *machine) lowerVecRRR(op vecOp, x, y, ret ssa.Value, arr vecArrangement) { + ins := m.allocateInstr() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(ret)) + ins.asVecRRR(op, rd, rn, rm, arr) + m.insert(ins) +} + +func (m *machine) lowerVIMul(rd, rn, rm operand, arr vecArrangement) { + if arr != vecArrangement2D { + mul := m.allocateInstr() + mul.asVecRRR(vecOpMul, rd, rn, rm, arr) + m.insert(mul) + } else { + tmp1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + tmp3 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + tmpRes := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + // Following the algorithm in https://chromium-review.googlesource.com/c/v8/v8/+/1781696 + rev64 := m.allocateInstr() + rev64.asVecMisc(vecOpRev64, tmp2, rm, vecArrangement4S) + m.insert(rev64) + + mul := m.allocateInstr() + mul.asVecRRR(vecOpMul, tmp2, tmp2, rn, vecArrangement4S) + m.insert(mul) + + xtn1 := m.allocateInstr() + xtn1.asVecMisc(vecOpXtn, tmp1, rn, vecArrangement2S) + m.insert(xtn1) + 
+ addp := m.allocateInstr() + addp.asVecRRR(vecOpAddp, tmp2, tmp2, tmp2, vecArrangement4S) + m.insert(addp) + + xtn2 := m.allocateInstr() + xtn2.asVecMisc(vecOpXtn, tmp3, rm, vecArrangement2S) + m.insert(xtn2) + + // Note: do not write the result directly into result yet. This is the same reason as in bsl. + // In short, in UMLAL instruction, the result register is also one of the source register, and + // the value on the result register is significant. + shll := m.allocateInstr() + shll.asVecMisc(vecOpShll, tmpRes, tmp2, vecArrangement2S) + m.insert(shll) + + umlal := m.allocateInstr() + umlal.asVecRRRRewrite(vecOpUmlal, tmpRes, tmp3, tmp1, vecArrangement2S) + m.insert(umlal) + + mov := m.allocateInstr() + mov.asFpuMov128(rd.nr(), tmpRes.nr()) + m.insert(mov) + } +} + +func (m *machine) lowerVMinMaxPseudo(instr *ssa.Instruction, max bool) { + x, y, lane := instr.Arg2WithLane() + arr := ssaLaneToArrangement(lane) + + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + + // Note: this usage of tmp is important. + // BSL modifies the destination register, so we need to use a temporary register so that + // the actual definition of the destination register happens *after* the BSL instruction. + // That way, we can force the spill instruction to be inserted after the BSL instruction. + tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + + fcmgt := m.allocateInstr() + if max { + fcmgt.asVecRRR(vecOpFcmgt, tmp, rm, rn, arr) + } else { + // If min, swap the args. + fcmgt.asVecRRR(vecOpFcmgt, tmp, rn, rm, arr) + } + m.insert(fcmgt) + + bsl := m.allocateInstr() + bsl.asVecRRRRewrite(vecOpBsl, tmp, rm, rn, vecArrangement16B) + m.insert(bsl) + + res := operandNR(m.compiler.VRegOf(instr.Return())) + mov2 := m.allocateInstr() + mov2.asFpuMov128(res.nr(), tmp.nr()) + m.insert(mov2) +} + +func (m *machine) lowerIRem(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) { + div := m.allocateInstr() + + if signed { + div.asALU(aluOpSDiv, rd, rn, rm, _64bit) + } else { + div.asALU(aluOpUDiv, rd, rn, rm, _64bit) + } + m.insert(div) + + // Check if rm is zero: + m.exitIfNot(execCtxVReg, registerAsRegNotZeroCond(rm.nr()), _64bit, wazevoapi.ExitCodeIntegerDivisionByZero) + + // rd = rn-rd*rm by MSUB instruction. + msub := m.allocateInstr() + msub.asALURRRR(aluOpMSub, rd, rd, rm, rn, _64bit) + m.insert(msub) +} + +func (m *machine) lowerIDiv(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) { + div := m.allocateInstr() + + if signed { + div.asALU(aluOpSDiv, rd, rn, rm, _64bit) + } else { + div.asALU(aluOpUDiv, rd, rn, rm, _64bit) + } + m.insert(div) + + // Check if rm is zero: + m.exitIfNot(execCtxVReg, registerAsRegNotZeroCond(rm.nr()), _64bit, wazevoapi.ExitCodeIntegerDivisionByZero) + + if signed { + // We need to check the signed overflow which happens iff "math.MinInt{32,64} / -1" + minusOneCheck := m.allocateInstr() + // Sets eq condition if rm == -1. + minusOneCheck.asALU(aluOpAddS, operandNR(xzrVReg), rm, operandImm12(1, 0), _64bit) + m.insert(minusOneCheck) + + ccmp := m.allocateInstr() + // If eq condition is set, sets the flag by the result based on "rn - 1", otherwise clears the flag. + ccmp.asCCmpImm(rn, 1, eq, 0, _64bit) + m.insert(ccmp) + + // Check the overflow flag. + m.exitIfNot(execCtxVReg, vs.invert().asCond(), false, wazevoapi.ExitCodeIntegerOverflow) + } +} + +// exitIfNot emits a conditional branch to exit if the condition is not met. 
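lowerIRem above recovers the remainder from the quotient with MSUB (rd = rn - rd*rm), and lowerIDiv guards the two trapping cases of Wasm integer division: a zero divisor, and the single signed overflow MinInt{32,64} / -1 (the remainder does not trap on that pair). Below is a standalone Go sketch of those rules for the 64-bit signed case; the real code raises wazevoapi exit codes rather than returning errors.

package main

import (
	"errors"
	"fmt"
	"math"
)

// wasmSDiv64 mirrors the guards in lowerIDiv for the signed 64-bit case: trap on
// division by zero and on the single overflowing input pair MinInt64 / -1.
func wasmSDiv64(n, d int64) (int64, error) {
	if d == 0 {
		return 0, errors.New("integer division by zero")
	}
	if n == math.MinInt64 && d == -1 {
		return 0, errors.New("integer overflow")
	}
	return n / d, nil
}

// wasmSRem64 mirrors lowerIRem: only division by zero traps, and the remainder is
// recovered from the quotient via the MSUB identity rem = n - (n/d)*d.
func wasmSRem64(n, d int64) (int64, error) {
	if d == 0 {
		return 0, errors.New("integer division by zero")
	}
	if n == math.MinInt64 && d == -1 {
		return 0, nil // the quotient would overflow, but the remainder is well-defined: 0.
	}
	return n - (n/d)*d, nil
}

func main() {
	fmt.Println(wasmSDiv64(7, -2))             // -3 <nil>
	fmt.Println(wasmSRem64(7, -2))             // 1 <nil>
	fmt.Println(wasmSDiv64(math.MinInt64, -1)) // 0 integer overflow
	fmt.Println(wasmSRem64(math.MinInt64, -1)) // 0 <nil>
}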
+// If `c` (cond type) is a register, `cond64bit` must be chosen to indicate whether the register is 32-bit or 64-bit. +// Otherwise, `cond64bit` is ignored. +func (m *machine) exitIfNot(execCtxVReg regalloc.VReg, c cond, cond64bit bool, code wazevoapi.ExitCode) { + execCtxTmp := m.copyToTmp(execCtxVReg) + + cbr := m.allocateInstr() + m.insert(cbr) + m.lowerExitWithCode(execCtxTmp, code) + // Conditional branch target is after exit. + l := m.insertBrTargetLabel() + cbr.asCondBr(c, l, cond64bit) +} + +func (m *machine) lowerFcopysign(x, y, ret ssa.Value) { + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + var tmpI, tmpF operand + _64 := x.Type() == ssa.TypeF64 + if _64 { + tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) + tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) + } else { + tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF32)) + tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI32)) + } + rd := m.compiler.VRegOf(ret) + m.lowerFcopysignImpl(operandNR(rd), rn, rm, tmpI, tmpF, _64) +} + +func (m *machine) lowerFcopysignImpl(rd, rn, rm, tmpI, tmpF operand, _64bit bool) { + // This is exactly the same code emitted by GCC for "__builtin_copysign": + // + // mov x0, -9223372036854775808 + // fmov d2, x0 + // vbit v0.8b, v1.8b, v2.8b + // + + setMSB := m.allocateInstr() + if _64bit { + m.lowerConstantI64(tmpI.nr(), math.MinInt64) + setMSB.asMovToVec(tmpF, tmpI, vecArrangementD, vecIndex(0)) + } else { + m.lowerConstantI32(tmpI.nr(), math.MinInt32) + setMSB.asMovToVec(tmpF, tmpI, vecArrangementS, vecIndex(0)) + } + m.insert(setMSB) + + tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) + + mov := m.allocateInstr() + mov.asFpuMov64(tmpReg.nr(), rn.nr()) + m.insert(mov) + + vbit := m.allocateInstr() + vbit.asVecRRRRewrite(vecOpBit, tmpReg, rm, tmpF, vecArrangement8B) + m.insert(vbit) + + movDst := m.allocateInstr() + movDst.asFpuMov64(rd.nr(), tmpReg.nr()) + m.insert(movDst) +} + +func (m *machine) lowerBitcast(instr *ssa.Instruction) { + v, dstType := instr.BitcastData() + srcType := v.Type() + rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) + rd := operandNR(m.compiler.VRegOf(instr.Return())) + srcInt := srcType.IsInt() + dstInt := dstType.IsInt() + switch { + case srcInt && !dstInt: // Int to Float: + mov := m.allocateInstr() + var arr vecArrangement + if srcType.Bits() == 64 { + arr = vecArrangementD + } else { + arr = vecArrangementS + } + mov.asMovToVec(rd, rn, arr, vecIndex(0)) + m.insert(mov) + case !srcInt && dstInt: // Float to Int: + mov := m.allocateInstr() + var arr vecArrangement + if dstType.Bits() == 64 { + arr = vecArrangementD + } else { + arr = vecArrangementS + } + mov.asMovFromVec(rd, rn, arr, vecIndex(0), false) + m.insert(mov) + default: + panic("TODO?BUG?") + } +} + +func (m *machine) lowerFpuUniOp(op fpuUniOp, in, out ssa.Value) { + rn := m.getOperand_NR(m.compiler.ValueDefinition(in), extModeNone) + rd := operandNR(m.compiler.VRegOf(out)) + + neg := m.allocateInstr() + neg.asFpuRR(op, rd, rn, in.Type().Bits() == 64) + m.insert(neg) +} + +func (m *machine) lowerFpuToInt(rd, rn operand, ctx regalloc.VReg, signed, src64bit, dst64bit, nonTrapping bool) { + if !nonTrapping { + // First of all, we have to clear the FPU flags. + flagClear := m.allocateInstr() + flagClear.asMovToFPSR(xzrVReg) + m.insert(flagClear) + } + + // Then, do the conversion which doesn't trap inherently. 
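Returning to lowerFcopysignImpl above: VBIT with a mask whose only set bit is the MSB copies exactly the sign bit of rm into the destination, which is all copysign needs. Below is a scalar Go sketch of the same bit manipulation for float64 (illustration only; math.Copysign is the stdlib equivalent of what the vector sequence computes per lane).

package main

import (
	"fmt"
	"math"
)

// copysign64 inserts the sign bit of from into the magnitude of to, the same
// bit-level operation the VBIT-with-MSB-mask sequence above performs on a lane.
func copysign64(to, from float64) float64 {
	const signMask uint64 = 1 << 63 // the MSB-only mask materialized via lowerConstantI64(math.MinInt64).
	bits := math.Float64bits(to)&^signMask | math.Float64bits(from)&signMask
	return math.Float64frombits(bits)
}

func main() {
	fmt.Println(copysign64(3.5, -2.0))  // -3.5
	fmt.Println(copysign64(-1.25, 5.0)) // 1.25
	fmt.Println(copysign64(3.5, -2.0) == math.Copysign(3.5, -2.0)) // true
}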
+ cvt := m.allocateInstr() + cvt.asFpuToInt(rd, rn, signed, src64bit, dst64bit) + m.insert(cvt) + + if !nonTrapping { + tmpReg := m.compiler.AllocateVReg(ssa.TypeI64) + + // After the conversion, check the FPU flags. + getFlag := m.allocateInstr() + getFlag.asMovFromFPSR(tmpReg) + m.insert(getFlag) + + execCtx := m.copyToTmp(ctx) + _rn := operandNR(m.copyToTmp(rn.nr())) + + // Check if the conversion was undefined by comparing the status with 1. + // See https://developer.arm.com/documentation/ddi0595/2020-12/AArch64-Registers/FPSR--Floating-point-Status-Register + alu := m.allocateInstr() + alu.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpReg), operandImm12(1, 0), true) + m.insert(alu) + + // If it is not undefined, we can return the result. + ok := m.allocateInstr() + m.insert(ok) + + // Otherwise, we have to choose the status depending on it is overflow or NaN conversion. + + // Comparing itself to check if it is a NaN. + fpuCmp := m.allocateInstr() + fpuCmp.asFpuCmp(_rn, _rn, src64bit) + m.insert(fpuCmp) + // If the VC flag is not set (== VS flag is set), it is a NaN. + m.exitIfNot(execCtx, vc.asCond(), false, wazevoapi.ExitCodeInvalidConversionToInteger) + // Otherwise, it is an overflow. + m.lowerExitWithCode(execCtx, wazevoapi.ExitCodeIntegerOverflow) + + // Conditional branch target is after exit. + l := m.insertBrTargetLabel() + ok.asCondBr(ne.asCond(), l, false /* ignored */) + } +} + +func (m *machine) lowerIntToFpu(rd, rn operand, signed, src64bit, dst64bit bool) { + cvt := m.allocateInstr() + cvt.asIntToFpu(rd, rn, signed, src64bit, dst64bit) + m.insert(cvt) +} + +func (m *machine) lowerFpuBinOp(si *ssa.Instruction) { + instr := m.allocateInstr() + var op fpuBinOp + switch si.Opcode() { + case ssa.OpcodeFadd: + op = fpuBinOpAdd + case ssa.OpcodeFsub: + op = fpuBinOpSub + case ssa.OpcodeFmul: + op = fpuBinOpMul + case ssa.OpcodeFdiv: + op = fpuBinOpDiv + case ssa.OpcodeFmax: + op = fpuBinOpMax + case ssa.OpcodeFmin: + op = fpuBinOpMin + } + x, y := si.Arg2() + xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) + rn := m.getOperand_NR(xDef, extModeNone) + rm := m.getOperand_NR(yDef, extModeNone) + rd := operandNR(m.compiler.VRegOf(si.Return())) + instr.asFpuRRR(op, rd, rn, rm, x.Type().Bits() == 64) + m.insert(instr) +} + +func (m *machine) lowerSubOrAdd(si *ssa.Instruction, add bool) { + x, y := si.Arg2() + if !x.Type().IsInt() { + panic("BUG?") + } + + xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) + rn := m.getOperand_NR(xDef, extModeNone) + rm, yNegated := m.getOperand_MaybeNegatedImm12_ER_SR_NR(yDef, extModeNone) + + var aop aluOp + switch { + case add && !yNegated: // rn+rm = x+y + aop = aluOpAdd + case add && yNegated: // rn-rm = x-(-y) = x+y + aop = aluOpSub + case !add && !yNegated: // rn-rm = x-y + aop = aluOpSub + case !add && yNegated: // rn+rm = x-(-y) = x-y + aop = aluOpAdd + } + rd := operandNR(m.compiler.VRegOf(si.Return())) + alu := m.allocateInstr() + alu.asALU(aop, rd, rn, rm, x.Type().Bits() == 64) + m.insert(alu) +} + +// InsertMove implements backend.Machine. 
+func (m *machine) InsertMove(dst, src regalloc.VReg, typ ssa.Type) { + instr := m.allocateInstr() + switch typ { + case ssa.TypeI32, ssa.TypeI64: + instr.asMove64(dst, src) + case ssa.TypeF32, ssa.TypeF64: + instr.asFpuMov64(dst, src) + case ssa.TypeV128: + instr.asFpuMov128(dst, src) + default: + panic("TODO") + } + m.insert(instr) +} + +func (m *machine) lowerIcmp(si *ssa.Instruction) { + x, y, c := si.IcmpData() + flag := condFlagFromSSAIntegerCmpCond(c) + + in64bit := x.Type().Bits() == 64 + var ext extMode + if in64bit { + if c.Signed() { + ext = extModeSignExtend64 + } else { + ext = extModeZeroExtend64 + } + } else { + if c.Signed() { + ext = extModeSignExtend32 + } else { + ext = extModeZeroExtend32 + } + } + + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext) + rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), ext) + alu := m.allocateInstr() + alu.asALU(aluOpSubS, operandNR(xzrVReg), rn, rm, in64bit) + m.insert(alu) + + cset := m.allocateInstr() + cset.asCSet(m.compiler.VRegOf(si.Return()), false, flag) + m.insert(cset) +} + +func (m *machine) lowerVIcmp(si *ssa.Instruction) { + x, y, c, lane := si.VIcmpData() + flag := condFlagFromSSAIntegerCmpCond(c) + arr := ssaLaneToArrangement(lane) + + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(si.Return())) + + switch flag { + case eq: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr) + m.insert(cmp) + case ne: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr) + m.insert(cmp) + not := m.allocateInstr() + not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B) + m.insert(not) + case ge: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmge, rd, rn, rm, arr) + m.insert(cmp) + case gt: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmgt, rd, rn, rm, arr) + m.insert(cmp) + case le: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmge, rd, rm, rn, arr) // rm, rn are swapped + m.insert(cmp) + case lt: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmgt, rd, rm, rn, arr) // rm, rn are swapped + m.insert(cmp) + case hs: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmhs, rd, rn, rm, arr) + m.insert(cmp) + case hi: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmhi, rd, rn, rm, arr) + m.insert(cmp) + case ls: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmhs, rd, rm, rn, arr) // rm, rn are swapped + m.insert(cmp) + case lo: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpCmhi, rd, rm, rn, arr) // rm, rn are swapped + m.insert(cmp) + } +} + +func (m *machine) lowerVFcmp(si *ssa.Instruction) { + x, y, c, lane := si.VFcmpData() + flag := condFlagFromSSAFloatCmpCond(c) + arr := ssaLaneToArrangement(lane) + + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + rd := operandNR(m.compiler.VRegOf(si.Return())) + + switch flag { + case eq: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr) + m.insert(cmp) + case ne: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr) + m.insert(cmp) + not := m.allocateInstr() + not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B) + m.insert(not) + case ge: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpFcmge, rd, rn, rm, arr) + m.insert(cmp) + case gt: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpFcmgt, rd, rn, rm, arr) + m.insert(cmp) + case mi: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpFcmgt, rd, rm, rn, arr) // 
rm, rn are swapped + m.insert(cmp) + case ls: + cmp := m.allocateInstr() + cmp.asVecRRR(vecOpFcmge, rd, rm, rn, arr) // rm, rn are swapped + m.insert(cmp) + } +} + +func (m *machine) lowerVfpuToInt(rd, rn operand, arr vecArrangement, signed bool) { + cvt := m.allocateInstr() + if signed { + cvt.asVecMisc(vecOpFcvtzs, rd, rn, arr) + } else { + cvt.asVecMisc(vecOpFcvtzu, rd, rn, arr) + } + m.insert(cvt) + + if arr == vecArrangement2D { + narrow := m.allocateInstr() + if signed { + narrow.asVecMisc(vecOpSqxtn, rd, rd, vecArrangement2S) + } else { + narrow.asVecMisc(vecOpUqxtn, rd, rd, vecArrangement2S) + } + m.insert(narrow) + } +} + +func (m *machine) lowerVfpuFromInt(rd, rn operand, arr vecArrangement, signed bool) { + cvt := m.allocateInstr() + if signed { + cvt.asVecMisc(vecOpScvtf, rd, rn, arr) + } else { + cvt.asVecMisc(vecOpUcvtf, rd, rn, arr) + } + m.insert(cvt) +} + +func (m *machine) lowerShifts(si *ssa.Instruction, ext extMode, aluOp aluOp) { + x, amount := si.Arg2() + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext) + rm := m.getOperand_ShiftImm_NR(m.compiler.ValueDefinition(amount), ext, x.Type().Bits()) + rd := operandNR(m.compiler.VRegOf(si.Return())) + + alu := m.allocateInstr() + alu.asALUShift(aluOp, rd, rn, rm, x.Type().Bits() == 64) + m.insert(alu) +} + +func (m *machine) lowerBitwiseAluOp(si *ssa.Instruction, op aluOp, ignoreResult bool) { + x, y := si.Arg2() + + xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) + rn := m.getOperand_NR(xDef, extModeNone) + + var rd operand + if ignoreResult { + rd = operandNR(xzrVReg) + } else { + rd = operandNR(m.compiler.VRegOf(si.Return())) + } + + _64 := x.Type().Bits() == 64 + alu := m.allocateInstr() + if instr := yDef.Instr; instr != nil && instr.Constant() { + c := instr.ConstantVal() + if isBitMaskImmediate(c, _64) { + // Constant bit wise operations can be lowered to a single instruction. + alu.asALUBitmaskImm(op, rd.nr(), rn.nr(), c, _64) + m.insert(alu) + return + } + } + + rm := m.getOperand_SR_NR(yDef, extModeNone) + alu.asALU(op, rd, rn, rm, _64) + m.insert(alu) +} + +func (m *machine) lowerRotl(si *ssa.Instruction) { + x, y := si.Arg2() + r := si.Return() + _64 := r.Type().Bits() == 64 + + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + var tmp operand + if _64 { + tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) + } else { + tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI32)) + } + rd := operandNR(m.compiler.VRegOf(r)) + + // Encode rotl as neg + rotr: neg is a sub against the zero-reg. + m.lowerRotlImpl(rd, rn, rm, tmp, _64) +} + +func (m *machine) lowerRotlImpl(rd, rn, rm, tmp operand, is64bit bool) { + // Encode rotl as neg + rotr: neg is a sub against the zero-reg. 
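+	// This works because rotl(x, n) == rotr(x, width-n), and negating n yields width-n modulo the register
+	// width, which is exactly the amount ROR consumes from its shift register.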
+ neg := m.allocateInstr() + neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), rm, is64bit) + m.insert(neg) + alu := m.allocateInstr() + alu.asALU(aluOpRotR, rd, rn, tmp, is64bit) + m.insert(alu) +} + +func (m *machine) lowerRotr(si *ssa.Instruction) { + x, y := si.Arg2() + + xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) + rn := m.getOperand_NR(xDef, extModeNone) + rm := m.getOperand_NR(yDef, extModeNone) + rd := operandNR(m.compiler.VRegOf(si.Return())) + + alu := m.allocateInstr() + alu.asALU(aluOpRotR, rd, rn, rm, si.Return().Type().Bits() == 64) + m.insert(alu) +} + +func (m *machine) lowerExtend(arg, ret ssa.Value, from, to byte, signed bool) { + rd := m.compiler.VRegOf(ret) + def := m.compiler.ValueDefinition(arg) + + if instr := def.Instr; !signed && from == 32 && instr != nil { + // We can optimize out the unsigned extend because: + // Writes to the W register set bits [63:32] of the X register to zero + // https://developer.arm.com/documentation/den0024/a/An-Introduction-to-the-ARMv8-Instruction-Sets/The-ARMv8-instruction-sets/Distinguishing-between-32-bit-and-64-bit-A64-instructions + switch instr.Opcode() { + case + ssa.OpcodeIadd, ssa.OpcodeIsub, ssa.OpcodeLoad, + ssa.OpcodeBand, ssa.OpcodeBor, ssa.OpcodeBnot, + ssa.OpcodeIshl, ssa.OpcodeUshr, ssa.OpcodeSshr, + ssa.OpcodeRotl, ssa.OpcodeRotr, + ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32: + // So, if the argument is the result of a 32-bit operation, we can just copy the register. + // It is highly likely that this copy will be optimized out after register allocation. + rn := m.compiler.VRegOf(arg) + mov := m.allocateInstr() + // Note: do not use move32 as it will be lowered to a 32-bit move, which is not copy (that is actually the impl of UExtend). + mov.asMove64(rd, rn) + m.insert(mov) + return + default: + } + } + rn := m.getOperand_NR(def, extModeNone) + + ext := m.allocateInstr() + ext.asExtend(rd, rn.nr(), from, to, signed) + m.insert(ext) +} + +func (m *machine) lowerFcmp(x, y, result ssa.Value, c ssa.FloatCmpCond) { + rn, rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone), m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + + fc := m.allocateInstr() + fc.asFpuCmp(rn, rm, x.Type().Bits() == 64) + m.insert(fc) + + cset := m.allocateInstr() + cset.asCSet(m.compiler.VRegOf(result), false, condFlagFromSSAFloatCmpCond(c)) + m.insert(cset) +} + +func (m *machine) lowerImul(x, y, result ssa.Value) { + rd := m.compiler.VRegOf(result) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + + // TODO: if this comes before Add/Sub, we could merge it by putting it into the place of xzrVReg. 
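+	// As emitted below, the multiply is a plain MADD with xzr as the addend, i.e. rd = rn*rm + 0.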
+ + mul := m.allocateInstr() + mul.asALURRRR(aluOpMAdd, operandNR(rd), rn, rm, operandNR(xzrVReg), x.Type().Bits() == 64) + m.insert(mul) +} + +func (m *machine) lowerClz(x, result ssa.Value) { + rd := m.compiler.VRegOf(result) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + clz := m.allocateInstr() + clz.asBitRR(bitOpClz, rd, rn.nr(), x.Type().Bits() == 64) + m.insert(clz) +} + +func (m *machine) lowerCtz(x, result ssa.Value) { + rd := m.compiler.VRegOf(result) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rbit := m.allocateInstr() + _64 := x.Type().Bits() == 64 + var tmpReg regalloc.VReg + if _64 { + tmpReg = m.compiler.AllocateVReg(ssa.TypeI64) + } else { + tmpReg = m.compiler.AllocateVReg(ssa.TypeI32) + } + rbit.asBitRR(bitOpRbit, tmpReg, rn.nr(), _64) + m.insert(rbit) + + clz := m.allocateInstr() + clz.asBitRR(bitOpClz, rd, tmpReg, _64) + m.insert(clz) +} + +func (m *machine) lowerPopcnt(x, result ssa.Value) { + // arm64 doesn't have an instruction for population count on scalar register, + // so we use the vector instruction `cnt`. + // This is exactly what the official Go implements bits.OneCount. + // For example, "func () int { return bits.OneCount(10) }" is compiled as + // + // MOVD $10, R0 ;; Load 10. + // FMOVD R0, F0 + // VCNT V0.B8, V0.B8 + // UADDLV V0.B8, V0 + // + // In aarch64 asm, FMOVD is encoded as `ins`, VCNT is `cnt`, + // and the registers may use different names. In our encoding we use the following + // instructions: + // + // ins v0.d[0], x0 ;; mov from GPR to vec (FMOV above) is encoded as INS + // cnt v0.16b, v0.16b ;; we use vec arrangement 16b + // uaddlv h0, v0.8b ;; h0 is still v0 with the dest width specifier 'H', implied when src arrangement is 8b + // mov x5, v0.d[0] ;; finally we mov the result back to a GPR + // + + rd := operandNR(m.compiler.VRegOf(result)) + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + + rf1 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) + ins := m.allocateInstr() + ins.asMovToVec(rf1, rn, vecArrangementD, vecIndex(0)) + m.insert(ins) + + rf2 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) + cnt := m.allocateInstr() + cnt.asVecMisc(vecOpCnt, rf2, rf1, vecArrangement16B) + m.insert(cnt) + + rf3 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) + uaddlv := m.allocateInstr() + uaddlv.asVecLanes(vecOpUaddlv, rf3, rf2, vecArrangement8B) + m.insert(uaddlv) + + mov := m.allocateInstr() + mov.asMovFromVec(rd, rf3, vecArrangementD, vecIndex(0), false) + m.insert(mov) +} + +// lowerExitWithCode lowers the lowerExitWithCode takes a context pointer as argument. 
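+// It stores the given exit code, the current stack pointer, and the address of the exit point into the
+// execution context pointed to by execCtxVReg, and then emits the exit sequence.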
+func (m *machine) lowerExitWithCode(execCtxVReg regalloc.VReg, code wazevoapi.ExitCode) { + tmpReg1 := m.compiler.AllocateVReg(ssa.TypeI32) + loadExitCodeConst := m.allocateInstr() + loadExitCodeConst.asMOVZ(tmpReg1, uint64(code), 0, true) + + setExitCode := m.allocateInstr() + setExitCode.asStore(operandNR(tmpReg1), + addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(), + }, 32) + + // In order to unwind the stack, we also need to push the current stack pointer: + tmp2 := m.compiler.AllocateVReg(ssa.TypeI64) + movSpToTmp := m.allocateInstr() + movSpToTmp.asMove64(tmp2, spVReg) + strSpToExecCtx := m.allocateInstr() + strSpToExecCtx.asStore(operandNR(tmp2), + addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(), + }, 64) + // Also the address of this exit. + tmp3 := m.compiler.AllocateVReg(ssa.TypeI64) + currentAddrToTmp := m.allocateInstr() + currentAddrToTmp.asAdr(tmp3, 0) + storeCurrentAddrToExecCtx := m.allocateInstr() + storeCurrentAddrToExecCtx.asStore(operandNR(tmp3), + addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(), + }, 64) + + exitSeq := m.allocateInstr() + exitSeq.asExitSequence(execCtxVReg) + + m.insert(loadExitCodeConst) + m.insert(setExitCode) + m.insert(movSpToTmp) + m.insert(strSpToExecCtx) + m.insert(currentAddrToTmp) + m.insert(storeCurrentAddrToExecCtx) + m.insert(exitSeq) +} + +func (m *machine) lowerIcmpToFlag(x, y ssa.Value, signed bool) { + if x.Type() != y.Type() { + panic( + fmt.Sprintf("TODO(maybe): support icmp with different types: v%d=%s != v%d=%s", + x.ID(), x.Type(), y.ID(), y.Type())) + } + + extMod := extModeOf(x.Type(), signed) + + // First operand must be in pure register form. + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extMod) + // Second operand can be in any of Imm12, ER, SR, or NR form supported by the SUBS instructions. + rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), extMod) + + alu := m.allocateInstr() + // subs zr, rn, rm + alu.asALU( + aluOpSubS, + // We don't need the result, just need to set flags. + operandNR(xzrVReg), + rn, + rm, + x.Type().Bits() == 64, + ) + m.insert(alu) +} + +func (m *machine) lowerFcmpToFlag(x, y ssa.Value) { + if x.Type() != y.Type() { + panic("TODO(maybe): support icmp with different types") + } + + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + cmp := m.allocateInstr() + cmp.asFpuCmp(rn, rm, x.Type().Bits() == 64) + m.insert(cmp) +} + +func (m *machine) lowerExitIfTrueWithCode(execCtxVReg regalloc.VReg, cond ssa.Value, code wazevoapi.ExitCode) { + condDef := m.compiler.ValueDefinition(cond) + if !m.compiler.MatchInstr(condDef, ssa.OpcodeIcmp) { + panic("TODO: OpcodeExitIfTrueWithCode must come after Icmp at the moment: " + condDef.Instr.Opcode().String()) + } + condDef.Instr.MarkLowered() + + cvalInstr := condDef.Instr + x, y, c := cvalInstr.IcmpData() + signed := c.Signed() + + if !m.tryLowerBandToFlag(x, y) { + m.lowerIcmpToFlag(x, y, signed) + } + + // We need to copy the execution context to a temp register, because if it's spilled, + // it might end up being reloaded inside the exiting branch. + execCtxTmp := m.copyToTmp(execCtxVReg) + + // We have to skip the entire exit sequence if the condition is false. 
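+	// As in exitIfNot above, the conditional branch is allocated first and patched to target the label that is
+	// inserted after the exit sequence.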
+ cbr := m.allocateInstr() + m.insert(cbr) + m.lowerExitWithCode(execCtxTmp, code) + // conditional branch target is after exit. + l := m.insertBrTargetLabel() + cbr.asCondBr(condFlagFromSSAIntegerCmpCond(c).invert().asCond(), l, false /* ignored */) +} + +func (m *machine) lowerSelect(c, x, y, result ssa.Value) { + cvalDef := m.compiler.ValueDefinition(c) + + var cc condFlag + switch { + case m.compiler.MatchInstr(cvalDef, ssa.OpcodeIcmp): // This case, we can use the ALU flag set by SUBS instruction. + cvalInstr := cvalDef.Instr + x, y, c := cvalInstr.IcmpData() + cc = condFlagFromSSAIntegerCmpCond(c) + m.lowerIcmpToFlag(x, y, c.Signed()) + cvalDef.Instr.MarkLowered() + case m.compiler.MatchInstr(cvalDef, ssa.OpcodeFcmp): // This case we can use the Fpu flag directly. + cvalInstr := cvalDef.Instr + x, y, c := cvalInstr.FcmpData() + cc = condFlagFromSSAFloatCmpCond(c) + m.lowerFcmpToFlag(x, y) + cvalDef.Instr.MarkLowered() + default: + rn := m.getOperand_NR(cvalDef, extModeNone) + if c.Type() != ssa.TypeI32 && c.Type() != ssa.TypeI64 { + panic("TODO?BUG?: support select with non-integer condition") + } + alu := m.allocateInstr() + // subs zr, rn, zr + alu.asALU( + aluOpSubS, + // We don't need the result, just need to set flags. + operandNR(xzrVReg), + rn, + operandNR(xzrVReg), + c.Type().Bits() == 64, + ) + m.insert(alu) + cc = ne + } + + rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) + rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) + + rd := operandNR(m.compiler.VRegOf(result)) + switch x.Type() { + case ssa.TypeI32, ssa.TypeI64: + // csel rd, rn, rm, cc + csel := m.allocateInstr() + csel.asCSel(rd, rn, rm, cc, x.Type().Bits() == 64) + m.insert(csel) + case ssa.TypeF32, ssa.TypeF64: + // fcsel rd, rn, rm, cc + fcsel := m.allocateInstr() + fcsel.asFpuCSel(rd, rn, rm, cc, x.Type().Bits() == 64) + m.insert(fcsel) + default: + panic("BUG") + } +} + +func (m *machine) lowerSelectVec(rc, rn, rm, rd operand) { + // First check if `rc` is zero or not. + checkZero := m.allocateInstr() + checkZero.asALU(aluOpSubS, operandNR(xzrVReg), rc, operandNR(xzrVReg), false) + m.insert(checkZero) + + // Then use CSETM to set all bits to one if `rc` is zero. + allOnesOrZero := m.compiler.AllocateVReg(ssa.TypeI64) + cset := m.allocateInstr() + cset.asCSet(allOnesOrZero, true, ne) + m.insert(cset) + + // Then move the bits to the result vector register. + tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + dup := m.allocateInstr() + dup.asVecDup(tmp2, operandNR(allOnesOrZero), vecArrangement2D) + m.insert(dup) + + // Now that `tmp2` has either all bits one or zero depending on `rc`, + // we can use bsl to select between `rn` and `rm`. + ins := m.allocateInstr() + ins.asVecRRRRewrite(vecOpBsl, tmp2, rn, rm, vecArrangement16B) + m.insert(ins) + + // Finally, move the result to the destination register. 
+ mov2 := m.allocateInstr() + mov2.asFpuMov128(rd.nr(), tmp2.nr()) + m.insert(mov2) +} + +func (m *machine) lowerAtomicRmw(si *ssa.Instruction) { + ssaOp, size := si.AtomicRmwData() + + var op atomicRmwOp + var negateArg bool + var flipArg bool + switch ssaOp { + case ssa.AtomicRmwOpAdd: + op = atomicRmwOpAdd + case ssa.AtomicRmwOpSub: + op = atomicRmwOpAdd + negateArg = true + case ssa.AtomicRmwOpAnd: + op = atomicRmwOpClr + flipArg = true + case ssa.AtomicRmwOpOr: + op = atomicRmwOpSet + case ssa.AtomicRmwOpXor: + op = atomicRmwOpEor + case ssa.AtomicRmwOpXchg: + op = atomicRmwOpSwp + default: + panic(fmt.Sprintf("unknown ssa atomic rmw op: %s", ssaOp)) + } + + addr, val := si.Arg2() + addrDef, valDef := m.compiler.ValueDefinition(addr), m.compiler.ValueDefinition(val) + rn := m.getOperand_NR(addrDef, extModeNone) + rt := operandNR(m.compiler.VRegOf(si.Return())) + rs := m.getOperand_NR(valDef, extModeNone) + + _64 := si.Return().Type().Bits() == 64 + var tmp operand + if _64 { + tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) + } else { + tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI32)) + } + m.lowerAtomicRmwImpl(op, rn, rs, rt, tmp, size, negateArg, flipArg, _64) +} + +func (m *machine) lowerAtomicRmwImpl(op atomicRmwOp, rn, rs, rt, tmp operand, size uint64, negateArg, flipArg, dst64bit bool) { + switch { + case negateArg: + neg := m.allocateInstr() + neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), rs, dst64bit) + m.insert(neg) + case flipArg: + flip := m.allocateInstr() + flip.asALU(aluOpOrn, tmp, operandNR(xzrVReg), rs, dst64bit) + m.insert(flip) + default: + tmp = rs + } + + rmw := m.allocateInstr() + rmw.asAtomicRmw(op, rn, tmp, rt, size) + m.insert(rmw) +} + +func (m *machine) lowerAtomicCas(si *ssa.Instruction) { + addr, exp, repl := si.Arg3() + size := si.AtomicTargetSize() + + addrDef, expDef, replDef := m.compiler.ValueDefinition(addr), m.compiler.ValueDefinition(exp), m.compiler.ValueDefinition(repl) + rn := m.getOperand_NR(addrDef, extModeNone) + rt := m.getOperand_NR(replDef, extModeNone) + rs := m.getOperand_NR(expDef, extModeNone) + tmp := operandNR(m.compiler.AllocateVReg(si.Return().Type())) + + _64 := si.Return().Type().Bits() == 64 + // rs is overwritten by CAS, so we need to move it to the result register before the instruction + // in case when it is used somewhere else. 
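+	// (CAS writes the value loaded from memory back into the "expected" register operand, so the expected value
+	// is copied into tmp first and tmp, not rs, is handed to lowerAtomicCasImpl below.)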
+ mov := m.allocateInstr() + if _64 { + mov.asMove64(tmp.nr(), rs.nr()) + } else { + mov.asMove32(tmp.nr(), rs.nr()) + } + m.insert(mov) + + m.lowerAtomicCasImpl(rn, tmp, rt, size) + + mov2 := m.allocateInstr() + rd := m.compiler.VRegOf(si.Return()) + if _64 { + mov2.asMove64(rd, tmp.nr()) + } else { + mov2.asMove32(rd, tmp.nr()) + } + m.insert(mov2) +} + +func (m *machine) lowerAtomicCasImpl(rn, rs, rt operand, size uint64) { + cas := m.allocateInstr() + cas.asAtomicCas(rn, rs, rt, size) + m.insert(cas) +} + +func (m *machine) lowerAtomicLoad(si *ssa.Instruction) { + addr := si.Arg() + size := si.AtomicTargetSize() + + addrDef := m.compiler.ValueDefinition(addr) + rn := m.getOperand_NR(addrDef, extModeNone) + rt := operandNR(m.compiler.VRegOf(si.Return())) + + m.lowerAtomicLoadImpl(rn, rt, size) +} + +func (m *machine) lowerAtomicLoadImpl(rn, rt operand, size uint64) { + ld := m.allocateInstr() + ld.asAtomicLoad(rn, rt, size) + m.insert(ld) +} + +func (m *machine) lowerAtomicStore(si *ssa.Instruction) { + addr, val := si.Arg2() + size := si.AtomicTargetSize() + + addrDef := m.compiler.ValueDefinition(addr) + valDef := m.compiler.ValueDefinition(val) + rn := m.getOperand_NR(addrDef, extModeNone) + rt := m.getOperand_NR(valDef, extModeNone) + + m.lowerAtomicStoreImpl(rn, rt, size) +} + +func (m *machine) lowerAtomicStoreImpl(rn, rt operand, size uint64) { + ld := m.allocateInstr() + ld.asAtomicStore(rn, rt, size) + m.insert(ld) +} + +// copyToTmp copies the given regalloc.VReg to a temporary register. This is called before cbr to avoid the regalloc issue +// e.g. reload happening in the middle of the exit sequence which is not the path the normal path executes +func (m *machine) copyToTmp(v regalloc.VReg) regalloc.VReg { + typ := m.compiler.TypeOf(v) + mov := m.allocateInstr() + tmp := m.compiler.AllocateVReg(typ) + if typ.IsInt() { + mov.asMove64(tmp, v) + } else { + mov.asFpuMov128(tmp, v) + } + m.insert(mov) + return tmp +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr_operands.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr_operands.go new file mode 100644 index 000000000..d9fbf1789 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_instr_operands.go @@ -0,0 +1,350 @@ +package arm64 + +// This file contains the logic to "find and determine operands" for instructions. +// In order to finalize the form of an operand, we might end up merging/eliminating +// the source instructions into an operand whenever possible. + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +type ( + // operand represents an operand of an instruction whose type is determined by the kind. + operand struct { + kind operandKind + data, data2 uint64 + } + operandKind byte +) + +// Here's the list of operand kinds. We use the abbreviation of the kind name not only for these consts, +// but also names of functions which return the operand of the kind. +const ( + // operandKindNR represents "NormalRegister" (NR). This is literally the register without any special operation unlike others. + operandKindNR operandKind = iota + // operandKindSR represents "Shifted Register" (SR). This is a register which is shifted by a constant. + // Some of the arm64 instructions can take this kind of operand. 
+ operandKindSR + // operandKindER represents "Extended Register (ER). This is a register which is sign/zero-extended to a larger size. + // Some of the arm64 instructions can take this kind of operand. + operandKindER + // operandKindImm12 represents "Immediate 12" (Imm12). This is a 12-bit immediate value which can be either shifted by 12 or not. + // See asImm12 function for detail. + operandKindImm12 + // operandKindShiftImm represents "Shifted Immediate" (ShiftImm) used by shift operations. + operandKindShiftImm +) + +// String implements fmt.Stringer for debugging. +func (o operand) format(size byte) string { + switch o.kind { + case operandKindNR: + return formatVRegSized(o.nr(), size) + case operandKindSR: + r, amt, sop := o.sr() + return fmt.Sprintf("%s, %s #%d", formatVRegSized(r, size), sop, amt) + case operandKindER: + r, eop, _ := o.er() + return fmt.Sprintf("%s %s", formatVRegSized(r, size), eop) + case operandKindImm12: + imm12, shiftBit := o.imm12() + if shiftBit == 1 { + return fmt.Sprintf("#%#x", uint64(imm12)<<12) + } else { + return fmt.Sprintf("#%#x", imm12) + } + default: + panic(fmt.Sprintf("unknown operand kind: %d", o.kind)) + } +} + +// operandNR encodes the given VReg as an operand of operandKindNR. +func operandNR(r regalloc.VReg) operand { + return operand{kind: operandKindNR, data: uint64(r)} +} + +// nr decodes the underlying VReg assuming the operand is of operandKindNR. +func (o operand) nr() regalloc.VReg { + return regalloc.VReg(o.data) +} + +// operandER encodes the given VReg as an operand of operandKindER. +func operandER(r regalloc.VReg, eop extendOp, to byte) operand { + if to < 32 { + panic("TODO?BUG?: when we need to extend to less than 32 bits?") + } + return operand{kind: operandKindER, data: uint64(r), data2: uint64(eop)<<32 | uint64(to)} +} + +// er decodes the underlying VReg, extend operation, and the target size assuming the operand is of operandKindER. +func (o operand) er() (r regalloc.VReg, eop extendOp, to byte) { + return regalloc.VReg(o.data), extendOp(o.data2>>32) & 0xff, byte(o.data2 & 0xff) +} + +// operandSR encodes the given VReg as an operand of operandKindSR. +func operandSR(r regalloc.VReg, amt byte, sop shiftOp) operand { + return operand{kind: operandKindSR, data: uint64(r), data2: uint64(amt)<<32 | uint64(sop)} +} + +// sr decodes the underlying VReg, shift amount, and shift operation assuming the operand is of operandKindSR. +func (o operand) sr() (r regalloc.VReg, amt byte, sop shiftOp) { + return regalloc.VReg(o.data), byte(o.data2>>32) & 0xff, shiftOp(o.data2) & 0xff +} + +// operandImm12 encodes the given imm12 as an operand of operandKindImm12. +func operandImm12(imm12 uint16, shiftBit byte) operand { + return operand{kind: operandKindImm12, data: uint64(imm12) | uint64(shiftBit)<<32} +} + +// imm12 decodes the underlying imm12 data assuming the operand is of operandKindImm12. +func (o operand) imm12() (v uint16, shiftBit byte) { + return uint16(o.data), byte(o.data >> 32) +} + +// operandShiftImm encodes the given amount as an operand of operandKindShiftImm. +func operandShiftImm(amount byte) operand { + return operand{kind: operandKindShiftImm, data: uint64(amount)} +} + +// shiftImm decodes the underlying shift amount data assuming the operand is of operandKindShiftImm. +func (o operand) shiftImm() byte { + return byte(o.data) +} + +// reg returns the register of the operand if applicable. 
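+// The immediate kinds (operandKindImm12 and operandKindShiftImm) carry no register and yield regalloc.VRegInvalid.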
+func (o operand) reg() regalloc.VReg { + switch o.kind { + case operandKindNR: + return o.nr() + case operandKindSR: + r, _, _ := o.sr() + return r + case operandKindER: + r, _, _ := o.er() + return r + case operandKindImm12: + // Does not have a register. + case operandKindShiftImm: + // Does not have a register. + default: + panic(o.kind) + } + return regalloc.VRegInvalid +} + +func (o operand) realReg() regalloc.RealReg { + return o.nr().RealReg() +} + +func (o operand) assignReg(v regalloc.VReg) operand { + switch o.kind { + case operandKindNR: + return operandNR(v) + case operandKindSR: + _, amt, sop := o.sr() + return operandSR(v, amt, sop) + case operandKindER: + _, eop, to := o.er() + return operandER(v, eop, to) + case operandKindImm12: + // Does not have a register. + case operandKindShiftImm: + // Does not have a register. + } + panic(o.kind) +} + +// ensureValueNR returns an operand of either operandKindER, operandKindSR, or operandKindNR from the given value (defined by `def). +// +// `mode` is used to extend the operand if the bit length is smaller than mode.bits(). +// If the operand can be expressed as operandKindImm12, `mode` is ignored. +func (m *machine) getOperand_Imm12_ER_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) { + if def.IsFromBlockParam() { + return operandNR(def.BlkParamVReg) + } + + instr := def.Instr + if instr.Opcode() == ssa.OpcodeIconst { + if imm12Op, ok := asImm12Operand(instr.ConstantVal()); ok { + instr.MarkLowered() + return imm12Op + } + } + return m.getOperand_ER_SR_NR(def, mode) +} + +// getOperand_MaybeNegatedImm12_ER_SR_NR is almost the same as getOperand_Imm12_ER_SR_NR, but this might negate the immediate value. +// If the immediate value is negated, the second return value is true, otherwise always false. +func (m *machine) getOperand_MaybeNegatedImm12_ER_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand, negatedImm12 bool) { + if def.IsFromBlockParam() { + return operandNR(def.BlkParamVReg), false + } + + instr := def.Instr + if instr.Opcode() == ssa.OpcodeIconst { + c := instr.ConstantVal() + if imm12Op, ok := asImm12Operand(c); ok { + instr.MarkLowered() + return imm12Op, false + } + + signExtended := int64(c) + if def.SSAValue().Type().Bits() == 32 { + signExtended = (signExtended << 32) >> 32 + } + negatedWithoutSign := -signExtended + if imm12Op, ok := asImm12Operand(uint64(negatedWithoutSign)); ok { + instr.MarkLowered() + return imm12Op, true + } + } + return m.getOperand_ER_SR_NR(def, mode), false +} + +// ensureValueNR returns an operand of either operandKindER, operandKindSR, or operandKindNR from the given value (defined by `def). +// +// `mode` is used to extend the operand if the bit length is smaller than mode.bits(). 
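+// For example, a value produced by ssa.OpcodeSExtend from 32 to 64 bits can be folded into an extended-register
+// (ER) operand rather than being materialized by a separate extend instruction, provided `mode` allows it.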
+func (m *machine) getOperand_ER_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) { + if def.IsFromBlockParam() { + return operandNR(def.BlkParamVReg) + } + + if m.compiler.MatchInstr(def, ssa.OpcodeSExtend) || m.compiler.MatchInstr(def, ssa.OpcodeUExtend) { + extInstr := def.Instr + + signed := extInstr.Opcode() == ssa.OpcodeSExtend + innerExtFromBits, innerExtToBits := extInstr.ExtendFromToBits() + modeBits, modeSigned := mode.bits(), mode.signed() + if mode == extModeNone || innerExtToBits == modeBits { + eop := extendOpFrom(signed, innerExtFromBits) + extArg := m.getOperand_NR(m.compiler.ValueDefinition(extInstr.Arg()), extModeNone) + op = operandER(extArg.nr(), eop, innerExtToBits) + extInstr.MarkLowered() + return + } + + if innerExtToBits > modeBits { + panic("BUG?TODO?: need the results of inner extension to be larger than the mode") + } + + switch { + case (!signed && !modeSigned) || (signed && modeSigned): + // Two sign/zero extensions are equivalent to one sign/zero extension for the larger size. + eop := extendOpFrom(modeSigned, innerExtFromBits) + op = operandER(m.compiler.VRegOf(extInstr.Arg()), eop, modeBits) + extInstr.MarkLowered() + case (signed && !modeSigned) || (!signed && modeSigned): + // We need to {sign, zero}-extend the result of the {zero,sign} extension. + eop := extendOpFrom(modeSigned, innerExtToBits) + op = operandER(m.compiler.VRegOf(extInstr.Return()), eop, modeBits) + // Note that we failed to merge the inner extension instruction this case. + } + return + } + return m.getOperand_SR_NR(def, mode) +} + +// ensureValueNR returns an operand of either operandKindSR or operandKindNR from the given value (defined by `def). +// +// `mode` is used to extend the operand if the bit length is smaller than mode.bits(). +func (m *machine) getOperand_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) { + if def.IsFromBlockParam() { + return operandNR(def.BlkParamVReg) + } + + if m.compiler.MatchInstr(def, ssa.OpcodeIshl) { + // Check if the shift amount is constant instruction. + targetVal, amountVal := def.Instr.Arg2() + targetVReg := m.getOperand_NR(m.compiler.ValueDefinition(targetVal), extModeNone).nr() + amountDef := m.compiler.ValueDefinition(amountVal) + if amountDef.IsFromInstr() && amountDef.Instr.Constant() { + // If that is the case, we can use the shifted register operand (SR). + c := byte(amountDef.Instr.ConstantVal()) & (targetVal.Type().Bits() - 1) // Clears the unnecessary bits. + def.Instr.MarkLowered() + amountDef.Instr.MarkLowered() + return operandSR(targetVReg, c, shiftOpLSL) + } + } + return m.getOperand_NR(def, mode) +} + +// getOperand_ShiftImm_NR returns an operand of either operandKindShiftImm or operandKindNR from the given value (defined by `def). +func (m *machine) getOperand_ShiftImm_NR(def *backend.SSAValueDefinition, mode extMode, shiftBitWidth byte) (op operand) { + if def.IsFromBlockParam() { + return operandNR(def.BlkParamVReg) + } + + instr := def.Instr + if instr.Constant() { + amount := byte(instr.ConstantVal()) & (shiftBitWidth - 1) // Clears the unnecessary bits. + return operandShiftImm(amount) + } + return m.getOperand_NR(def, mode) +} + +// ensureValueNR returns an operand of operandKindNR from the given value (defined by `def). +// +// `mode` is used to extend the operand if the bit length is smaller than mode.bits(). 
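+// Constants are materialized inline, and when `mode` requires widening a 32-bit value to 64 bits an explicit
+// extend instruction is inserted and the extended register is returned instead.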
+func (m *machine) getOperand_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) { + var v regalloc.VReg + if def.IsFromBlockParam() { + v = def.BlkParamVReg + } else { + instr := def.Instr + if instr.Constant() { + // We inline all the constant instructions so that we could reduce the register usage. + v = m.lowerConstant(instr) + instr.MarkLowered() + } else { + if n := def.N; n == 0 { + v = m.compiler.VRegOf(instr.Return()) + } else { + _, rs := instr.Returns() + v = m.compiler.VRegOf(rs[n-1]) + } + } + } + + r := v + switch inBits := def.SSAValue().Type().Bits(); { + case mode == extModeNone: + case inBits == 32 && (mode == extModeZeroExtend32 || mode == extModeSignExtend32): + case inBits == 32 && mode == extModeZeroExtend64: + extended := m.compiler.AllocateVReg(ssa.TypeI64) + ext := m.allocateInstr() + ext.asExtend(extended, v, 32, 64, false) + m.insert(ext) + r = extended + case inBits == 32 && mode == extModeSignExtend64: + extended := m.compiler.AllocateVReg(ssa.TypeI64) + ext := m.allocateInstr() + ext.asExtend(extended, v, 32, 64, true) + m.insert(ext) + r = extended + case inBits == 64 && (mode == extModeZeroExtend64 || mode == extModeSignExtend64): + } + return operandNR(r) +} + +func asImm12Operand(val uint64) (op operand, ok bool) { + v, shiftBit, ok := asImm12(val) + if !ok { + return operand{}, false + } + return operandImm12(v, shiftBit), true +} + +func asImm12(val uint64) (v uint16, shiftBit byte, ok bool) { + const mask1, mask2 uint64 = 0xfff, 0xfff_000 + if val&^mask1 == 0 { + return uint16(val), 0, true + } else if val&^mask2 == 0 { + return uint16(val >> 12), 1, true + } else { + return 0, 0, false + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go new file mode 100644 index 000000000..4842eaa38 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/lower_mem.go @@ -0,0 +1,440 @@ +package arm64 + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +type ( + // addressMode represents an ARM64 addressing mode. + // + // https://developer.arm.com/documentation/102374/0101/Loads-and-stores---addressing + // TODO: use the bit-packed layout like operand struct. + addressMode struct { + kind addressModeKind + rn, rm regalloc.VReg + extOp extendOp + imm int64 + } + + // addressModeKind represents the kind of ARM64 addressing mode. + addressModeKind byte +) + +const ( + // addressModeKindRegExtended takes a base register and an index register. The index register is sign/zero-extended, + // and then scaled by bits(type)/8. + // + // e.g. 
+ // - ldrh w1, [x2, w3, SXTW #1] ;; sign-extended and scaled by 2 (== LSL #1) + // - strh w1, [x2, w3, UXTW #1] ;; zero-extended and scaled by 2 (== LSL #1) + // - ldr w1, [x2, w3, SXTW #2] ;; sign-extended and scaled by 4 (== LSL #2) + // - str x1, [x2, w3, UXTW #3] ;; zero-extended and scaled by 8 (== LSL #3) + // + // See the following pages: + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--register---Load-Register-Halfword--register-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--register---Load-Register--register-- + addressModeKindRegScaledExtended addressModeKind = iota + + // addressModeKindRegScaled is the same as addressModeKindRegScaledExtended, but without extension factor. + addressModeKindRegScaled + + // addressModeKindRegScaled is the same as addressModeKindRegScaledExtended, but without scale factor. + addressModeKindRegExtended + + // addressModeKindRegReg takes a base register and an index register. The index register is not either scaled or extended. + addressModeKindRegReg + + // addressModeKindRegSignedImm9 takes a base register and a 9-bit "signed" immediate offset (-256 to 255). + // The immediate will be sign-extended, and be added to the base register. + // This is a.k.a. "unscaled" since the immediate is not scaled. + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDUR--Load-Register--unscaled-- + addressModeKindRegSignedImm9 + + // addressModeKindRegUnsignedImm12 takes a base register and a 12-bit "unsigned" immediate offset. scaled by + // the size of the type. In other words, the actual offset will be imm12 * bits(type)/8. + // See "Unsigned offset" in the following pages: + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate-- + addressModeKindRegUnsignedImm12 + + // addressModePostIndex takes a base register and a 9-bit "signed" immediate offset. + // After the load/store, the base register will be updated by the offset. + // + // Note that when this is used for pair load/store, the offset will be 7-bit "signed" immediate offset. + // + // See "Post-index" in the following pages for examples: + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers- + addressModeKindPostIndex + + // addressModePostIndex takes a base register and a 9-bit "signed" immediate offset. + // Before the load/store, the base register will be updated by the offset. + // + // Note that when this is used for pair load/store, the offset will be 7-bit "signed" immediate offset. 
+ // + // See "Pre-index" in the following pages for examples: + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate-- + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers- + addressModeKindPreIndex + + // addressModeKindArgStackSpace is used to resolve the address of the argument stack space + // exiting right above the stack pointer. Since we don't know the exact stack space needed for a function + // at a compilation phase, this is used as a placeholder and further lowered to a real addressing mode like above. + addressModeKindArgStackSpace + + // addressModeKindResultStackSpace is used to resolve the address of the result stack space + // exiting right above the stack pointer. Since we don't know the exact stack space needed for a function + // at a compilation phase, this is used as a placeholder and further lowered to a real addressing mode like above. + addressModeKindResultStackSpace +) + +func (a addressMode) format(dstSizeBits byte) (ret string) { + base := formatVRegSized(a.rn, 64) + if rn := a.rn; rn.RegType() != regalloc.RegTypeInt { + panic("invalid base register type: " + a.rn.RegType().String()) + } else if rn.IsRealReg() && v0 <= a.rn.RealReg() && a.rn.RealReg() <= v30 { + panic("BUG: likely a bug in reg alloc or reset behavior") + } + + switch a.kind { + case addressModeKindRegScaledExtended: + amount := a.sizeInBitsToShiftAmount(dstSizeBits) + ret = fmt.Sprintf("[%s, %s, %s #%#x]", base, formatVRegSized(a.rm, a.indexRegBits()), a.extOp, amount) + case addressModeKindRegScaled: + amount := a.sizeInBitsToShiftAmount(dstSizeBits) + ret = fmt.Sprintf("[%s, %s, lsl #%#x]", base, formatVRegSized(a.rm, a.indexRegBits()), amount) + case addressModeKindRegExtended: + ret = fmt.Sprintf("[%s, %s, %s]", base, formatVRegSized(a.rm, a.indexRegBits()), a.extOp) + case addressModeKindRegReg: + ret = fmt.Sprintf("[%s, %s]", base, formatVRegSized(a.rm, a.indexRegBits())) + case addressModeKindRegSignedImm9: + if a.imm != 0 { + ret = fmt.Sprintf("[%s, #%#x]", base, a.imm) + } else { + ret = fmt.Sprintf("[%s]", base) + } + case addressModeKindRegUnsignedImm12: + if a.imm != 0 { + ret = fmt.Sprintf("[%s, #%#x]", base, a.imm) + } else { + ret = fmt.Sprintf("[%s]", base) + } + case addressModeKindPostIndex: + ret = fmt.Sprintf("[%s], #%#x", base, a.imm) + case addressModeKindPreIndex: + ret = fmt.Sprintf("[%s, #%#x]!", base, a.imm) + case addressModeKindArgStackSpace: + ret = fmt.Sprintf("[#arg_space, #%#x]", a.imm) + case addressModeKindResultStackSpace: + ret = fmt.Sprintf("[#ret_space, #%#x]", a.imm) + } + return +} + +func addressModePreOrPostIndex(rn regalloc.VReg, imm int64, preIndex bool) addressMode { + if !offsetFitsInAddressModeKindRegSignedImm9(imm) { + panic(fmt.Sprintf("BUG: offset %#x does not fit in addressModeKindRegSignedImm9", imm)) + } + if preIndex { + return addressMode{kind: addressModeKindPreIndex, rn: rn, imm: imm} + } else { + return addressMode{kind: addressModeKindPostIndex, rn: rn, imm: imm} + } +} + +func offsetFitsInAddressModeKindRegUnsignedImm12(dstSizeInBits byte, offset int64) bool { + divisor := int64(dstSizeInBits) / 8 + return 0 < offset && offset%divisor == 0 && offset/divisor < 
4096 +} + +func offsetFitsInAddressModeKindRegSignedImm9(offset int64) bool { + return -256 <= offset && offset <= 255 +} + +func (a addressMode) indexRegBits() byte { + bits := a.extOp.srcBits() + if bits != 32 && bits != 64 { + panic("invalid index register for address mode. it must be either 32 or 64 bits") + } + return bits +} + +func (a addressMode) sizeInBitsToShiftAmount(sizeInBits byte) (lsl byte) { + switch sizeInBits { + case 8: + lsl = 0 + case 16: + lsl = 1 + case 32: + lsl = 2 + case 64: + lsl = 3 + } + return +} + +func extLoadSignSize(op ssa.Opcode) (size byte, signed bool) { + switch op { + case ssa.OpcodeUload8: + size, signed = 8, false + case ssa.OpcodeUload16: + size, signed = 16, false + case ssa.OpcodeUload32: + size, signed = 32, false + case ssa.OpcodeSload8: + size, signed = 8, true + case ssa.OpcodeSload16: + size, signed = 16, true + case ssa.OpcodeSload32: + size, signed = 32, true + default: + panic("BUG") + } + return +} + +func (m *machine) lowerExtLoad(op ssa.Opcode, ptr ssa.Value, offset uint32, ret regalloc.VReg) { + size, signed := extLoadSignSize(op) + amode := m.lowerToAddressMode(ptr, offset, size) + load := m.allocateInstr() + if signed { + load.asSLoad(operandNR(ret), amode, size) + } else { + load.asULoad(operandNR(ret), amode, size) + } + m.insert(load) +} + +func (m *machine) lowerLoad(ptr ssa.Value, offset uint32, typ ssa.Type, ret ssa.Value) { + amode := m.lowerToAddressMode(ptr, offset, typ.Bits()) + + dst := m.compiler.VRegOf(ret) + load := m.allocateInstr() + switch typ { + case ssa.TypeI32, ssa.TypeI64: + load.asULoad(operandNR(dst), amode, typ.Bits()) + case ssa.TypeF32, ssa.TypeF64: + load.asFpuLoad(operandNR(dst), amode, typ.Bits()) + case ssa.TypeV128: + load.asFpuLoad(operandNR(dst), amode, 128) + default: + panic("TODO") + } + m.insert(load) +} + +func (m *machine) lowerLoadSplat(ptr ssa.Value, offset uint32, lane ssa.VecLane, ret ssa.Value) { + // vecLoad1R has offset address mode (base+imm) only for post index, so we simply add the offset to the base. + base := m.getOperand_NR(m.compiler.ValueDefinition(ptr), extModeNone).nr() + offsetReg := m.compiler.AllocateVReg(ssa.TypeI64) + m.lowerConstantI64(offsetReg, int64(offset)) + addedBase := m.addReg64ToReg64(base, offsetReg) + + rd := operandNR(m.compiler.VRegOf(ret)) + + ld1r := m.allocateInstr() + ld1r.asVecLoad1R(rd, operandNR(addedBase), ssaLaneToArrangement(lane)) + m.insert(ld1r) +} + +func (m *machine) lowerStore(si *ssa.Instruction) { + // TODO: merge consecutive stores into a single pair store instruction. + value, ptr, offset, storeSizeInBits := si.StoreData() + amode := m.lowerToAddressMode(ptr, offset, storeSizeInBits) + + valueOp := m.getOperand_NR(m.compiler.ValueDefinition(value), extModeNone) + store := m.allocateInstr() + store.asStore(valueOp, amode, storeSizeInBits) + m.insert(store) +} + +// lowerToAddressMode converts a pointer to an addressMode that can be used as an operand for load/store instructions. +func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32, size byte) (amode addressMode) { + // TODO: currently the instruction selection logic doesn't support addressModeKindRegScaledExtended and + // addressModeKindRegScaled since collectAddends doesn't take ssa.OpcodeIshl into account. This should be fixed + // to support more efficient address resolution. 
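+	// The pointer is decomposed into 64-bit addends, 32-bit (to-be-extended) addends, and a constant offset;
+	// lowerToAddressModeFromAddends then folds as much of that as possible into a single addressing mode and
+	// adds any leftovers back with explicit ADD instructions.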
+ + a32s, a64s, offset := m.collectAddends(ptr) + offset += int64(offsetBase) + return m.lowerToAddressModeFromAddends(a32s, a64s, size, offset) +} + +// lowerToAddressModeFromAddends creates an addressMode from a list of addends collected by collectAddends. +// During the construction, this might emit additional instructions. +// +// Extracted as a separate function for easy testing. +func (m *machine) lowerToAddressModeFromAddends(a32s *wazevoapi.Queue[addend32], a64s *wazevoapi.Queue[regalloc.VReg], size byte, offset int64) (amode addressMode) { + switch a64sExist, a32sExist := !a64s.Empty(), !a32s.Empty(); { + case a64sExist && a32sExist: + var base regalloc.VReg + base = a64s.Dequeue() + var a32 addend32 + a32 = a32s.Dequeue() + amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: a32.r, extOp: a32.ext} + case a64sExist && offsetFitsInAddressModeKindRegUnsignedImm12(size, offset): + var base regalloc.VReg + base = a64s.Dequeue() + amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: offset} + offset = 0 + case a64sExist && offsetFitsInAddressModeKindRegSignedImm9(offset): + var base regalloc.VReg + base = a64s.Dequeue() + amode = addressMode{kind: addressModeKindRegSignedImm9, rn: base, imm: offset} + offset = 0 + case a64sExist: + var base regalloc.VReg + base = a64s.Dequeue() + if !a64s.Empty() { + index := a64s.Dequeue() + amode = addressMode{kind: addressModeKindRegReg, rn: base, rm: index, extOp: extendOpUXTX /* indicates index reg is 64-bit */} + } else { + amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0} + } + case a32sExist: + base32 := a32s.Dequeue() + + // First we need 64-bit base. + base := m.compiler.AllocateVReg(ssa.TypeI64) + baseExt := m.allocateInstr() + var signed bool + if base32.ext == extendOpSXTW { + signed = true + } + baseExt.asExtend(base, base32.r, 32, 64, signed) + m.insert(baseExt) + + if !a32s.Empty() { + index := a32s.Dequeue() + amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: index.r, extOp: index.ext} + } else { + amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0} + } + default: // Only static offsets. + tmpReg := m.compiler.AllocateVReg(ssa.TypeI64) + m.lowerConstantI64(tmpReg, offset) + amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpReg, imm: 0} + offset = 0 + } + + baseReg := amode.rn + if offset > 0 { + baseReg = m.addConstToReg64(baseReg, offset) // baseReg += offset + } + + for !a64s.Empty() { + a64 := a64s.Dequeue() + baseReg = m.addReg64ToReg64(baseReg, a64) // baseReg += a64 + } + + for !a32s.Empty() { + a32 := a32s.Dequeue() + baseReg = m.addRegToReg64Ext(baseReg, a32.r, a32.ext) // baseReg += (a32 extended to 64-bit) + } + amode.rn = baseReg + return +} + +var addendsMatchOpcodes = [4]ssa.Opcode{ssa.OpcodeUExtend, ssa.OpcodeSExtend, ssa.OpcodeIadd, ssa.OpcodeIconst} + +func (m *machine) collectAddends(ptr ssa.Value) (addends32 *wazevoapi.Queue[addend32], addends64 *wazevoapi.Queue[regalloc.VReg], offset int64) { + m.addendsWorkQueue.Reset() + m.addends32.Reset() + m.addends64.Reset() + m.addendsWorkQueue.Enqueue(ptr) + + for !m.addendsWorkQueue.Empty() { + v := m.addendsWorkQueue.Dequeue() + + def := m.compiler.ValueDefinition(v) + switch op := m.compiler.MatchInstrOneOf(def, addendsMatchOpcodes[:]); op { + case ssa.OpcodeIadd: + // If the addend is an add, we recursively collect its operands. 
+ x, y := def.Instr.Arg2() + m.addendsWorkQueue.Enqueue(x) + m.addendsWorkQueue.Enqueue(y) + def.Instr.MarkLowered() + case ssa.OpcodeIconst: + // If the addend is constant, we just statically merge it into the offset. + ic := def.Instr + u64 := ic.ConstantVal() + if ic.Return().Type().Bits() == 32 { + offset += int64(int32(u64)) // sign-extend. + } else { + offset += int64(u64) + } + def.Instr.MarkLowered() + case ssa.OpcodeUExtend, ssa.OpcodeSExtend: + input := def.Instr.Arg() + if input.Type().Bits() != 32 { + panic("illegal size: " + input.Type().String()) + } + + var ext extendOp + if op == ssa.OpcodeUExtend { + ext = extendOpUXTW + } else { + ext = extendOpSXTW + } + + inputDef := m.compiler.ValueDefinition(input) + constInst := inputDef.IsFromInstr() && inputDef.Instr.Constant() + switch { + case constInst && ext == extendOpUXTW: + // Zero-extension of a 32-bit constant can be merged into the offset. + offset += int64(uint32(inputDef.Instr.ConstantVal())) + case constInst && ext == extendOpSXTW: + // Sign-extension of a 32-bit constant can be merged into the offset. + offset += int64(int32(inputDef.Instr.ConstantVal())) // sign-extend! + default: + m.addends32.Enqueue(addend32{r: m.getOperand_NR(inputDef, extModeNone).nr(), ext: ext}) + } + def.Instr.MarkLowered() + continue + default: + // If the addend is not one of them, we simply use it as-is (without merging!), optionally zero-extending it. + m.addends64.Enqueue(m.getOperand_NR(def, extModeZeroExtend64 /* optional zero ext */).nr()) + } + } + return &m.addends32, &m.addends64, offset +} + +func (m *machine) addConstToReg64(r regalloc.VReg, c int64) (rd regalloc.VReg) { + rd = m.compiler.AllocateVReg(ssa.TypeI64) + alu := m.allocateInstr() + if imm12Op, ok := asImm12Operand(uint64(c)); ok { + alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), imm12Op, true) + } else if imm12Op, ok = asImm12Operand(uint64(-c)); ok { + alu.asALU(aluOpSub, operandNR(rd), operandNR(r), imm12Op, true) + } else { + tmp := m.compiler.AllocateVReg(ssa.TypeI64) + m.load64bitConst(c, tmp) + alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), operandNR(tmp), true) + } + m.insert(alu) + return +} + +func (m *machine) addReg64ToReg64(rn, rm regalloc.VReg) (rd regalloc.VReg) { + rd = m.compiler.AllocateVReg(ssa.TypeI64) + alu := m.allocateInstr() + alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandNR(rm), true) + m.insert(alu) + return +} + +func (m *machine) addRegToReg64Ext(rn, rm regalloc.VReg, ext extendOp) (rd regalloc.VReg) { + rd = m.compiler.AllocateVReg(ssa.TypeI64) + alu := m.allocateInstr() + alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandER(rm, ext, 64), true) + m.insert(alu) + return +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go new file mode 100644 index 000000000..b435d9ba9 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine.go @@ -0,0 +1,515 @@ +package arm64 + +import ( + "context" + "fmt" + "strings" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +type ( + // machine implements backend.Machine. 
+ machine struct { + compiler backend.Compiler + executableContext *backend.ExecutableContextT[instruction] + currentABI *backend.FunctionABI + + regAlloc regalloc.Allocator + regAllocFn *backend.RegAllocFunction[*instruction, *machine] + + // addendsWorkQueue is used during address lowering, defined here for reuse. + addendsWorkQueue wazevoapi.Queue[ssa.Value] + addends32 wazevoapi.Queue[addend32] + // addends64 is used during address lowering, defined here for reuse. + addends64 wazevoapi.Queue[regalloc.VReg] + unresolvedAddressModes []*instruction + + // condBrRelocs holds the conditional branches which need offset relocation. + condBrRelocs []condBrReloc + + // jmpTableTargets holds the labels of the jump table targets. + jmpTableTargets [][]uint32 + + // spillSlotSize is the size of the stack slot in bytes used for spilling registers. + // During the execution of the function, the stack looks like: + // + // + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | xxxxx | + // | ReturnAddress | + // +-----------------+ <<-| + // | ........... | | + // | spill slot M | | <--- spillSlotSize + // | ............ | | + // | spill slot 2 | | + // | spill slot 1 | <<-+ + // | clobbered N | + // | ........... | + // | clobbered 1 | + // | clobbered 0 | + // SP---> +-----------------+ + // (low address) + // + // and it represents the size of the space between FP and the first spilled slot. This must be a multiple of 16. + // Also note that this is only known after register allocation. + spillSlotSize int64 + spillSlots map[regalloc.VRegID]int64 // regalloc.VRegID to offset. + // clobberedRegs holds real-register backed VRegs saved at the function prologue, and restored at the epilogue. + clobberedRegs []regalloc.VReg + + maxRequiredStackSizeForCalls int64 + stackBoundsCheckDisabled bool + + regAllocStarted bool + } + + addend32 struct { + r regalloc.VReg + ext extendOp + } + + condBrReloc struct { + cbr *instruction + // currentLabelPos is the labelPosition within which condBr is defined. + currentLabelPos *labelPosition + // Next block's labelPosition. + nextLabel label + offset int64 + } + + labelPosition = backend.LabelPosition[instruction] + label = backend.Label +) + +const ( + labelReturn = backend.LabelReturn + labelInvalid = backend.LabelInvalid +) + +// NewBackend returns a new backend for arm64. +func NewBackend() backend.Machine { + m := &machine{ + spillSlots: make(map[regalloc.VRegID]int64), + executableContext: newExecutableContext(), + regAlloc: regalloc.NewAllocator(regInfo), + } + return m +} + +func newExecutableContext() *backend.ExecutableContextT[instruction] { + return backend.NewExecutableContextT[instruction](resetInstruction, setNext, setPrev, asNop0) +} + +// ExecutableContext implements backend.Machine. +func (m *machine) ExecutableContext() backend.ExecutableContext { + return m.executableContext +} + +// RegAlloc implements backend.Machine Function. +func (m *machine) RegAlloc() { + rf := m.regAllocFn + for _, pos := range m.executableContext.OrderedBlockLabels { + rf.AddBlock(pos.SB, pos.L, pos.Begin, pos.End) + } + + m.regAllocStarted = true + m.regAlloc.DoAllocation(rf) + // Now that we know the final spill slot size, we must align spillSlotSize to 16 bytes. + m.spillSlotSize = (m.spillSlotSize + 15) &^ 15 +} + +// Reset implements backend.Machine. 
+func (m *machine) Reset() { + m.clobberedRegs = m.clobberedRegs[:0] + for key := range m.spillSlots { + m.clobberedRegs = append(m.clobberedRegs, regalloc.VReg(key)) + } + for _, key := range m.clobberedRegs { + delete(m.spillSlots, regalloc.VRegID(key)) + } + m.clobberedRegs = m.clobberedRegs[:0] + m.regAllocStarted = false + m.regAlloc.Reset() + m.regAllocFn.Reset() + m.spillSlotSize = 0 + m.unresolvedAddressModes = m.unresolvedAddressModes[:0] + m.maxRequiredStackSizeForCalls = 0 + m.executableContext.Reset() + m.jmpTableTargets = m.jmpTableTargets[:0] +} + +// SetCurrentABI implements backend.Machine SetCurrentABI. +func (m *machine) SetCurrentABI(abi *backend.FunctionABI) { + m.currentABI = abi +} + +// DisableStackCheck implements backend.Machine DisableStackCheck. +func (m *machine) DisableStackCheck() { + m.stackBoundsCheckDisabled = true +} + +// SetCompiler implements backend.Machine. +func (m *machine) SetCompiler(ctx backend.Compiler) { + m.compiler = ctx + m.regAllocFn = backend.NewRegAllocFunction[*instruction, *machine](m, ctx.SSABuilder(), ctx) +} + +func (m *machine) insert(i *instruction) { + ectx := m.executableContext + ectx.PendingInstructions = append(ectx.PendingInstructions, i) +} + +func (m *machine) insertBrTargetLabel() label { + nop, l := m.allocateBrTarget() + m.insert(nop) + return l +} + +func (m *machine) allocateBrTarget() (nop *instruction, l label) { + ectx := m.executableContext + l = ectx.AllocateLabel() + nop = m.allocateInstr() + nop.asNop0WithLabel(l) + pos := ectx.AllocateLabelPosition(l) + pos.Begin, pos.End = nop, nop + ectx.LabelPositions[l] = pos + return +} + +// allocateInstr allocates an instruction. +func (m *machine) allocateInstr() *instruction { + instr := m.executableContext.InstructionPool.Allocate() + if !m.regAllocStarted { + instr.addedBeforeRegAlloc = true + } + return instr +} + +func resetInstruction(i *instruction) { + *i = instruction{} +} + +func (m *machine) allocateNop() *instruction { + instr := m.allocateInstr() + instr.asNop0() + return instr +} + +func (m *machine) resolveAddressingMode(arg0offset, ret0offset int64, i *instruction) { + amode := &i.amode + switch amode.kind { + case addressModeKindResultStackSpace: + amode.imm += ret0offset + case addressModeKindArgStackSpace: + amode.imm += arg0offset + default: + panic("BUG") + } + + var sizeInBits byte + switch i.kind { + case store8, uLoad8: + sizeInBits = 8 + case store16, uLoad16: + sizeInBits = 16 + case store32, fpuStore32, uLoad32, fpuLoad32: + sizeInBits = 32 + case store64, fpuStore64, uLoad64, fpuLoad64: + sizeInBits = 64 + case fpuStore128, fpuLoad128: + sizeInBits = 128 + default: + panic("BUG") + } + + if offsetFitsInAddressModeKindRegUnsignedImm12(sizeInBits, amode.imm) { + amode.kind = addressModeKindRegUnsignedImm12 + } else { + // This case, we load the offset into the temporary register, + // and then use it as the index register. + newPrev := m.lowerConstantI64AndInsert(i.prev, tmpRegVReg, amode.imm) + linkInstr(newPrev, i) + *amode = addressMode{kind: addressModeKindRegReg, rn: amode.rn, rm: tmpRegVReg, extOp: extendOpUXTX /* indicates rm reg is 64-bit */} + } +} + +// resolveRelativeAddresses resolves the relative addresses before encoding. 
+func (m *machine) resolveRelativeAddresses(ctx context.Context) { + ectx := m.executableContext + for { + if len(m.unresolvedAddressModes) > 0 { + arg0offset, ret0offset := m.arg0OffsetFromSP(), m.ret0OffsetFromSP() + for _, i := range m.unresolvedAddressModes { + m.resolveAddressingMode(arg0offset, ret0offset, i) + } + } + + // Reuse the slice to gather the unresolved conditional branches. + m.condBrRelocs = m.condBrRelocs[:0] + + var fn string + var fnIndex int + var labelToSSABlockID map[label]ssa.BasicBlockID + if wazevoapi.PerfMapEnabled { + fn = wazevoapi.GetCurrentFunctionName(ctx) + labelToSSABlockID = make(map[label]ssa.BasicBlockID) + for i, l := range ectx.SsaBlockIDToLabels { + labelToSSABlockID[l] = ssa.BasicBlockID(i) + } + fnIndex = wazevoapi.GetCurrentFunctionIndex(ctx) + } + + // Next, in order to determine the offsets of relative jumps, we have to calculate the size of each label. + var offset int64 + for i, pos := range ectx.OrderedBlockLabels { + pos.BinaryOffset = offset + var size int64 + for cur := pos.Begin; ; cur = cur.next { + switch cur.kind { + case nop0: + l := cur.nop0Label() + if pos, ok := ectx.LabelPositions[l]; ok { + pos.BinaryOffset = offset + size + } + case condBr: + if !cur.condBrOffsetResolved() { + var nextLabel label + if i < len(ectx.OrderedBlockLabels)-1 { + // Note: this is only used when the block ends with fallthrough, + // therefore can be safely assumed that the next block exists when it's needed. + nextLabel = ectx.OrderedBlockLabels[i+1].L + } + m.condBrRelocs = append(m.condBrRelocs, condBrReloc{ + cbr: cur, currentLabelPos: pos, offset: offset + size, + nextLabel: nextLabel, + }) + } + } + size += cur.size() + if cur == pos.End { + break + } + } + + if wazevoapi.PerfMapEnabled { + if size > 0 { + l := pos.L + var labelStr string + if blkID, ok := labelToSSABlockID[l]; ok { + labelStr = fmt.Sprintf("%s::SSA_Block[%s]", l, blkID) + } else { + labelStr = l.String() + } + wazevoapi.PerfMap.AddModuleEntry(fnIndex, offset, uint64(size), fmt.Sprintf("%s:::::%s", fn, labelStr)) + } + } + offset += size + } + + // Before resolving any offsets, we need to check if all the conditional branches can be resolved. + var needRerun bool + for i := range m.condBrRelocs { + reloc := &m.condBrRelocs[i] + cbr := reloc.cbr + offset := reloc.offset + + target := cbr.condBrLabel() + offsetOfTarget := ectx.LabelPositions[target].BinaryOffset + diff := offsetOfTarget - offset + if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 { + // This case the conditional branch is too huge. We place the trampoline instructions at the end of the current block, + // and jump to it. + m.insertConditionalJumpTrampoline(cbr, reloc.currentLabelPos, reloc.nextLabel) + // Then, we need to recall this function to fix up the label offsets + // as they have changed after the trampoline is inserted. + needRerun = true + } + } + if needRerun { + if wazevoapi.PerfMapEnabled { + wazevoapi.PerfMap.Clear() + } + } else { + break + } + } + + var currentOffset int64 + for cur := ectx.RootInstr; cur != nil; cur = cur.next { + switch cur.kind { + case br: + target := cur.brLabel() + offsetOfTarget := ectx.LabelPositions[target].BinaryOffset + diff := offsetOfTarget - currentOffset + divided := diff >> 2 + if divided < minSignedInt26 || divided > maxSignedInt26 { + // This means the currently compiled single function is extremely large. 
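// A rough sketch for intuition: an unconditional B encodes a signed 26-bit word
// offset, so it can reach roughly +/-128MiB from the branch itself, whereas B.cond
// only has a 19-bit word offset (about +/-1MiB) and is therefore rescued by the
// trampoline insertion in the loop above. In Go terms the test applied here is simply:
//
//	divided := diff >> 2 // byte offset -> word offset
//	fits := divided >= minSignedInt26 && divided <= maxSignedInt26
//
// No trampoline mechanism exists for the 26-bit case, hence the panic below.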
+ panic("too large function that requires branch relocation of large unconditional branch larger than 26-bit range") + } + cur.brOffsetResolve(diff) + case condBr: + if !cur.condBrOffsetResolved() { + target := cur.condBrLabel() + offsetOfTarget := ectx.LabelPositions[target].BinaryOffset + diff := offsetOfTarget - currentOffset + if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 { + panic("BUG: branch relocation for large conditional branch larger than 19-bit range must be handled properly") + } + cur.condBrOffsetResolve(diff) + } + case brTableSequence: + tableIndex := cur.u1 + targets := m.jmpTableTargets[tableIndex] + for i := range targets { + l := label(targets[i]) + offsetOfTarget := ectx.LabelPositions[l].BinaryOffset + diff := offsetOfTarget - (currentOffset + brTableSequenceOffsetTableBegin) + targets[i] = uint32(diff) + } + cur.brTableSequenceOffsetsResolved() + case emitSourceOffsetInfo: + m.compiler.AddSourceOffsetInfo(currentOffset, cur.sourceOffsetInfo()) + } + currentOffset += cur.size() + } +} + +const ( + maxSignedInt26 = 1<<25 - 1 + minSignedInt26 = -(1 << 25) + + maxSignedInt19 = 1<<18 - 1 + minSignedInt19 = -(1 << 18) +) + +func (m *machine) insertConditionalJumpTrampoline(cbr *instruction, currentBlk *labelPosition, nextLabel label) { + cur := currentBlk.End + originalTarget := cbr.condBrLabel() + endNext := cur.next + + if cur.kind != br { + // If the current block ends with a conditional branch, we can just insert the trampoline after it. + // Otherwise, we need to insert "skip" instruction to skip the trampoline instructions. + skip := m.allocateInstr() + skip.asBr(nextLabel) + cur = linkInstr(cur, skip) + } + + cbrNewTargetInstr, cbrNewTargetLabel := m.allocateBrTarget() + cbr.setCondBrTargets(cbrNewTargetLabel) + cur = linkInstr(cur, cbrNewTargetInstr) + + // Then insert the unconditional branch to the original, which should be possible to get encoded + // as 26-bit offset should be enough for any practical application. + br := m.allocateInstr() + br.asBr(originalTarget) + cur = linkInstr(cur, br) + + // Update the end of the current block. + currentBlk.End = cur + + linkInstr(cur, endNext) +} + +// Format implements backend.Machine. +func (m *machine) Format() string { + ectx := m.executableContext + begins := map[*instruction]label{} + for l, pos := range ectx.LabelPositions { + begins[pos.Begin] = l + } + + irBlocks := map[label]ssa.BasicBlockID{} + for i, l := range ectx.SsaBlockIDToLabels { + irBlocks[l] = ssa.BasicBlockID(i) + } + + var lines []string + for cur := ectx.RootInstr; cur != nil; cur = cur.next { + if l, ok := begins[cur]; ok { + var labelStr string + if blkID, ok := irBlocks[l]; ok { + labelStr = fmt.Sprintf("%s (SSA Block: %s):", l, blkID) + } else { + labelStr = fmt.Sprintf("%s:", l) + } + lines = append(lines, labelStr) + } + if cur.kind == nop0 { + continue + } + lines = append(lines, "\t"+cur.String()) + } + return "\n" + strings.Join(lines, "\n") + "\n" +} + +// InsertReturn implements backend.Machine. +func (m *machine) InsertReturn() { + i := m.allocateInstr() + i.asRet() + m.insert(i) +} + +func (m *machine) getVRegSpillSlotOffsetFromSP(id regalloc.VRegID, size byte) int64 { + offset, ok := m.spillSlots[id] + if !ok { + offset = m.spillSlotSize + // TODO: this should be aligned depending on the `size` to use Imm12 offset load/store as much as possible. + m.spillSlots[id] = offset + m.spillSlotSize += int64(size) + } + return offset + 16 // spill slot starts above the clobbered registers and the frame size. 
+} + +func (m *machine) clobberedRegSlotSize() int64 { + return int64(len(m.clobberedRegs) * 16) +} + +func (m *machine) arg0OffsetFromSP() int64 { + return m.frameSize() + + 16 + // 16-byte aligned return address + 16 // frame size saved below the clobbered registers. +} + +func (m *machine) ret0OffsetFromSP() int64 { + return m.arg0OffsetFromSP() + m.currentABI.ArgStackSize +} + +func (m *machine) requiredStackSize() int64 { + return m.maxRequiredStackSizeForCalls + + m.frameSize() + + 16 + // 16-byte aligned return address. + 16 // frame size saved below the clobbered registers. +} + +func (m *machine) frameSize() int64 { + s := m.clobberedRegSlotSize() + m.spillSlotSize + if s&0xf != 0 { + panic(fmt.Errorf("BUG: frame size %d is not 16-byte aligned", s)) + } + return s +} + +func (m *machine) addJmpTableTarget(targets []ssa.BasicBlock) (index int) { + // TODO: reuse the slice! + labels := make([]uint32, len(targets)) + for j, target := range targets { + labels[j] = uint32(m.executableContext.GetOrAllocateSSABlockLabel(target)) + } + index = len(m.jmpTableTargets) + m.jmpTableTargets = append(m.jmpTableTargets, labels) + return +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go new file mode 100644 index 000000000..466fac464 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go @@ -0,0 +1,469 @@ +package arm64 + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// PostRegAlloc implements backend.Machine. +func (m *machine) PostRegAlloc() { + m.setupPrologue() + m.postRegAlloc() +} + +// setupPrologue initializes the prologue of the function. +func (m *machine) setupPrologue() { + ectx := m.executableContext + + cur := ectx.RootInstr + prevInitInst := cur.next + + // + // (high address) (high address) + // SP----> +-----------------+ +------------------+ <----+ + // | ....... | | ....... | | + // | ret Y | | ret Y | | + // | ....... | | ....... | | + // | ret 0 | | ret 0 | | + // | arg X | | arg X | | size_of_arg_ret. + // | ....... | ====> | ....... | | + // | arg 1 | | arg 1 | | + // | arg 0 | | arg 0 | <----+ + // |-----------------| | size_of_arg_ret | + // | return address | + // +------------------+ <---- SP + // (low address) (low address) + + // Saves the return address (lr) and the size_of_arg_ret below the SP. + // size_of_arg_ret is used for stack unwinding. + cur = m.createReturnAddrAndSizeOfArgRetSlot(cur) + + if !m.stackBoundsCheckDisabled { + cur = m.insertStackBoundsCheck(m.requiredStackSize(), cur) + } + + // Decrement SP if spillSlotSize > 0. + if m.spillSlotSize == 0 && len(m.spillSlots) != 0 { + panic(fmt.Sprintf("BUG: spillSlotSize=%d, spillSlots=%v\n", m.spillSlotSize, m.spillSlots)) + } + + if regs := m.clobberedRegs; len(regs) > 0 { + // + // (high address) (high address) + // +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | | ....... 
| + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | size_of_arg_ret | | size_of_arg_ret | + // | ReturnAddress | | ReturnAddress | + // SP----> +-----------------+ ====> +-----------------+ + // (low address) | clobbered M | + // | ............ | + // | clobbered 0 | + // +-----------------+ <----- SP + // (low address) + // + _amode := addressModePreOrPostIndex(spVReg, + -16, // stack pointer must be 16-byte aligned. + true, // Decrement before store. + ) + for _, vr := range regs { + // TODO: pair stores to reduce the number of instructions. + store := m.allocateInstr() + store.asStore(operandNR(vr), _amode, regTypeToRegisterSizeInBits(vr.RegType())) + cur = linkInstr(cur, store) + } + } + + if size := m.spillSlotSize; size > 0 { + // Check if size is 16-byte aligned. + if size&0xf != 0 { + panic(fmt.Errorf("BUG: spill slot size %d is not 16-byte aligned", size)) + } + + cur = m.addsAddOrSubStackPointer(cur, spVReg, size, false) + + // At this point, the stack looks like: + // + // (high address) + // +------------------+ + // | ....... | + // | ret Y | + // | ....... | + // | ret 0 | + // | arg X | + // | ....... | + // | arg 1 | + // | arg 0 | + // | size_of_arg_ret | + // | ReturnAddress | + // +------------------+ + // | clobbered M | + // | ............ | + // | clobbered 0 | + // | spill slot N | + // | ............ | + // | spill slot 2 | + // | spill slot 0 | + // SP----> +------------------+ + // (low address) + } + + // We push the frame size into the stack to make it possible to unwind stack: + // + // + // (high address) (high address) + // +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | | ....... | + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | size_of_arg_ret | | size_of_arg_ret | + // | ReturnAddress | | ReturnAddress | + // +-----------------+ ==> +-----------------+ <----+ + // | clobbered M | | clobbered M | | + // | ............ | | ............ | | + // | clobbered 2 | | clobbered 2 | | + // | clobbered 1 | | clobbered 1 | | frame size + // | clobbered 0 | | clobbered 0 | | + // | spill slot N | | spill slot N | | + // | ............ | | ............ | | + // | spill slot 0 | | spill slot 0 | <----+ + // SP---> +-----------------+ | xxxxxx | ;; unused space to make it 16-byte aligned. + // | frame_size | + // +-----------------+ <---- SP + // (low address) + // + cur = m.createFrameSizeSlot(cur, m.frameSize()) + + linkInstr(cur, prevInitInst) +} + +func (m *machine) createReturnAddrAndSizeOfArgRetSlot(cur *instruction) *instruction { + // First we decrement the stack pointer to point the arg0 slot. + var sizeOfArgRetReg regalloc.VReg + s := int64(m.currentABI.AlignedArgResultStackSlotSize()) + if s > 0 { + cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s) + sizeOfArgRetReg = tmpRegVReg + + subSp := m.allocateInstr() + subSp.asALU(aluOpSub, operandNR(spVReg), operandNR(spVReg), operandNR(sizeOfArgRetReg), true) + cur = linkInstr(cur, subSp) + } else { + sizeOfArgRetReg = xzrVReg + } + + // Saves the return address (lr) and the size_of_arg_ret below the SP. + // size_of_arg_ret is used for stack unwinding. 
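// As a rough sketch of the emitted machine code (assuming the pairing of lr with
// either the tmp register holding size_of_arg_ret, or xzr when there are no
// stack-passed args/results), the store pair built below is essentially:
//
//	stp x30, xN, [sp, #-16]!
//
// i.e. one pre-indexed store pair that pushes both values and keeps SP 16-byte aligned.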
+ pstr := m.allocateInstr() + amode := addressModePreOrPostIndex(spVReg, -16, true /* decrement before store */) + pstr.asStorePair64(lrVReg, sizeOfArgRetReg, amode) + cur = linkInstr(cur, pstr) + return cur +} + +func (m *machine) createFrameSizeSlot(cur *instruction, s int64) *instruction { + var frameSizeReg regalloc.VReg + if s > 0 { + cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s) + frameSizeReg = tmpRegVReg + } else { + frameSizeReg = xzrVReg + } + _amode := addressModePreOrPostIndex(spVReg, + -16, // stack pointer must be 16-byte aligned. + true, // Decrement before store. + ) + store := m.allocateInstr() + store.asStore(operandNR(frameSizeReg), _amode, 64) + cur = linkInstr(cur, store) + return cur +} + +// postRegAlloc does multiple things while walking through the instructions: +// 1. Removes the redundant copy instruction. +// 2. Inserts the epilogue. +func (m *machine) postRegAlloc() { + ectx := m.executableContext + for cur := ectx.RootInstr; cur != nil; cur = cur.next { + switch cur.kind { + case ret: + m.setupEpilogueAfter(cur.prev) + case loadConstBlockArg: + lc := cur + next := lc.next + m.executableContext.PendingInstructions = m.executableContext.PendingInstructions[:0] + m.lowerLoadConstantBlockArgAfterRegAlloc(lc) + for _, instr := range m.executableContext.PendingInstructions { + cur = linkInstr(cur, instr) + } + linkInstr(cur, next) + m.executableContext.PendingInstructions = m.executableContext.PendingInstructions[:0] + default: + // Removes the redundant copy instruction. + if cur.IsCopy() && cur.rn.realReg() == cur.rd.realReg() { + prev, next := cur.prev, cur.next + // Remove the copy instruction. + prev.next = next + if next != nil { + next.prev = prev + } + } + } + } +} + +func (m *machine) setupEpilogueAfter(cur *instruction) { + prevNext := cur.next + + // We've stored the frame size in the prologue, and now that we are about to return from this function, we won't need it anymore. + cur = m.addsAddOrSubStackPointer(cur, spVReg, 16, true) + + if s := m.spillSlotSize; s > 0 { + // Adjust SP to the original value: + // + // (high address) (high address) + // +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | | ....... | + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | xxxxx | | xxxxx | + // | ReturnAddress | | ReturnAddress | + // +-----------------+ ====> +-----------------+ + // | clobbered M | | clobbered M | + // | ............ | | ............ | + // | clobbered 1 | | clobbered 1 | + // | clobbered 0 | | clobbered 0 | + // | spill slot N | +-----------------+ <---- SP + // | ............ | + // | spill slot 0 | + // SP---> +-----------------+ + // (low address) + // + cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true) + } + + // First we need to restore the clobbered registers. + if len(m.clobberedRegs) > 0 { + // (high address) + // +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | | ....... | + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | xxxxx | | xxxxx | + // | ReturnAddress | | ReturnAddress | + // +-----------------+ ========> +-----------------+ <---- SP + // | clobbered M | + // | ........... 
| + // | clobbered 1 | + // | clobbered 0 | + // SP---> +-----------------+ + // (low address) + + l := len(m.clobberedRegs) - 1 + for i := range m.clobberedRegs { + vr := m.clobberedRegs[l-i] // reverse order to restore. + load := m.allocateInstr() + amode := addressModePreOrPostIndex(spVReg, + 16, // stack pointer must be 16-byte aligned. + false, // Increment after store. + ) + // TODO: pair loads to reduce the number of instructions. + switch regTypeToRegisterSizeInBits(vr.RegType()) { + case 64: // save int reg. + load.asULoad(operandNR(vr), amode, 64) + case 128: // save vector reg. + load.asFpuLoad(operandNR(vr), amode, 128) + } + cur = linkInstr(cur, load) + } + } + + // Reload the return address (lr). + // + // +-----------------+ +-----------------+ + // | ....... | | ....... | + // | ret Y | | ret Y | + // | ....... | | ....... | + // | ret 0 | | ret 0 | + // | arg X | | arg X | + // | ....... | ===> | ....... | + // | arg 1 | | arg 1 | + // | arg 0 | | arg 0 | + // | xxxxx | +-----------------+ <---- SP + // | ReturnAddress | + // SP----> +-----------------+ + + ldr := m.allocateInstr() + ldr.asULoad(operandNR(lrVReg), + addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64) + cur = linkInstr(cur, ldr) + + if s := int64(m.currentABI.AlignedArgResultStackSlotSize()); s > 0 { + cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true) + } + + linkInstr(cur, prevNext) +} + +// saveRequiredRegs is the set of registers that must be saved/restored during growing stack when there's insufficient +// stack space left. Basically this is the combination of CalleeSavedRegisters plus argument registers execpt for x0, +// which always points to the execution context whenever the native code is entered from Go. +var saveRequiredRegs = []regalloc.VReg{ + x1VReg, x2VReg, x3VReg, x4VReg, x5VReg, x6VReg, x7VReg, + x19VReg, x20VReg, x21VReg, x22VReg, x23VReg, x24VReg, x25VReg, x26VReg, x28VReg, lrVReg, + v0VReg, v1VReg, v2VReg, v3VReg, v4VReg, v5VReg, v6VReg, v7VReg, + v18VReg, v19VReg, v20VReg, v21VReg, v22VReg, v23VReg, v24VReg, v25VReg, v26VReg, v27VReg, v28VReg, v29VReg, v30VReg, v31VReg, +} + +// insertStackBoundsCheck will insert the instructions after `cur` to check the +// stack bounds, and if there's no sufficient spaces required for the function, +// exit the execution and try growing it in Go world. +// +// TODO: we should be able to share the instructions across all the functions to reduce the size of compiled executable. +func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instruction) *instruction { + if requiredStackSize%16 != 0 { + panic("BUG") + } + + if immm12op, ok := asImm12Operand(uint64(requiredStackSize)); ok { + // sub tmp, sp, #requiredStackSize + sub := m.allocateInstr() + sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), immm12op, true) + cur = linkInstr(cur, sub) + } else { + // This case, we first load the requiredStackSize into the temporary register, + cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize) + // Then subtract it. + sub := m.allocateInstr() + sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), operandNR(tmpRegVReg), true) + cur = linkInstr(cur, sub) + } + + tmp2 := x11VReg // Caller save, so it is safe to use it here in the prologue. 
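// The sequence assembled around here amounts to the following pseudo-Go check
// (a sketch; execCtx stands for the execution context that x0 always points to,
// and the field names mirror the wazevoapi offsets used below):
//
//	if sp-requiredStackSize < execCtx.stackBottomPtr {
//		execCtx.stackGrowRequiredSize = requiredStackSize
//		call(execCtx.stackGrowCallTrampolineAddress) // grow the stack in Go, then resume here.
//	}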
+ + // ldr tmp2, [executionContext #StackBottomPtr] + ldr := m.allocateInstr() + ldr.asULoad(operandNR(tmp2), addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: x0VReg, // execution context is always the first argument. + imm: wazevoapi.ExecutionContextOffsetStackBottomPtr.I64(), + }, 64) + cur = linkInstr(cur, ldr) + + // subs xzr, tmp, tmp2 + subs := m.allocateInstr() + subs.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpRegVReg), operandNR(tmp2), true) + cur = linkInstr(cur, subs) + + // b.ge #imm + cbr := m.allocateInstr() + cbr.asCondBr(ge.asCond(), labelInvalid, false /* ignored */) + cur = linkInstr(cur, cbr) + + // Set the required stack size and set it to the exec context. + { + // First load the requiredStackSize into the temporary register, + cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize) + setRequiredStackSize := m.allocateInstr() + setRequiredStackSize.asStore(operandNR(tmpRegVReg), + addressMode{ + kind: addressModeKindRegUnsignedImm12, + // Execution context is always the first argument. + rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.I64(), + }, 64) + + cur = linkInstr(cur, setRequiredStackSize) + } + + ldrAddress := m.allocateInstr() + ldrAddress.asULoad(operandNR(tmpRegVReg), addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: x0VReg, // execution context is always the first argument + imm: wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.I64(), + }, 64) + cur = linkInstr(cur, ldrAddress) + + // Then jumps to the stack grow call sequence's address, meaning + // transferring the control to the code compiled by CompileStackGrowCallSequence. + bl := m.allocateInstr() + bl.asCallIndirect(tmpRegVReg, nil) + cur = linkInstr(cur, bl) + + // Now that we know the entire code, we can finalize how many bytes + // we have to skip when the stack size is sufficient. + var cbrOffset int64 + for _cur := cbr; ; _cur = _cur.next { + cbrOffset += _cur.size() + if _cur == cur { + break + } + } + cbr.condBrOffsetResolve(cbrOffset) + return cur +} + +// CompileStackGrowCallSequence implements backend.Machine. +func (m *machine) CompileStackGrowCallSequence() []byte { + ectx := m.executableContext + + cur := m.allocateInstr() + cur.asNop0() + ectx.RootInstr = cur + + // Save the callee saved and argument registers. + cur = m.saveRegistersInExecutionContext(cur, saveRequiredRegs) + + // Save the current stack pointer. + cur = m.saveCurrentStackPointer(cur, x0VReg) + + // Set the exit status on the execution context. + cur = m.setExitCode(cur, x0VReg, wazevoapi.ExitCodeGrowStack) + + // Exit the execution. + cur = m.storeReturnAddressAndExit(cur) + + // After the exit, restore the saved registers. + cur = m.restoreRegistersInExecutionContext(cur, saveRequiredRegs) + + // Then goes back the original address of this stack grow call. 
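// Putting the steps above together, the shared sequence behaves roughly like the
// following pseudo-Go (a sketch; the helper names are stand-ins for the
// save/restore/exit sequences emitted above):
//
//	saveRegisters(execCtx, saveRequiredRegs)
//	saveStackPointer(execCtx)
//	execCtx.exitCode = ExitCodeGrowStack
//	exitToGo()                                  // the Go side grows the stack.
//	restoreRegisters(execCtx, saveRequiredRegs)
//	return                                      // back to the function that needed more stack.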
+ ret := m.allocateInstr() + ret.asRet() + linkInstr(cur, ret) + + m.encode(ectx.RootInstr) + return m.compiler.Buf() +} + +func (m *machine) addsAddOrSubStackPointer(cur *instruction, rd regalloc.VReg, diff int64, add bool) *instruction { + ectx := m.executableContext + + ectx.PendingInstructions = ectx.PendingInstructions[:0] + m.insertAddOrSubStackPointer(rd, diff, add) + for _, inserted := range ectx.PendingInstructions { + cur = linkInstr(cur, inserted) + } + return cur +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go new file mode 100644 index 000000000..1c8793b73 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go @@ -0,0 +1,152 @@ +package arm64 + +// This file implements the interfaces required for register allocations. See backend.RegAllocFunctionMachine. + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// ClobberedRegisters implements backend.RegAllocFunctionMachine. +func (m *machine) ClobberedRegisters(regs []regalloc.VReg) { + m.clobberedRegs = append(m.clobberedRegs[:0], regs...) +} + +// Swap implements backend.RegAllocFunctionMachine. +func (m *machine) Swap(cur *instruction, x1, x2, tmp regalloc.VReg) { + prevNext := cur.next + var mov1, mov2, mov3 *instruction + if x1.RegType() == regalloc.RegTypeInt { + if !tmp.Valid() { + tmp = tmpRegVReg + } + mov1 = m.allocateInstr().asMove64(tmp, x1) + mov2 = m.allocateInstr().asMove64(x1, x2) + mov3 = m.allocateInstr().asMove64(x2, tmp) + cur = linkInstr(cur, mov1) + cur = linkInstr(cur, mov2) + cur = linkInstr(cur, mov3) + linkInstr(cur, prevNext) + } else { + if !tmp.Valid() { + r2 := x2.RealReg() + // Temporarily spill x1 to stack. + cur = m.InsertStoreRegisterAt(x1, cur, true).prev + // Then move x2 to x1. + cur = linkInstr(cur, m.allocateInstr().asFpuMov128(x1, x2)) + linkInstr(cur, prevNext) + // Then reload the original value on x1 from stack to r2. + m.InsertReloadRegisterAt(x1.SetRealReg(r2), cur, true) + } else { + mov1 = m.allocateInstr().asFpuMov128(tmp, x1) + mov2 = m.allocateInstr().asFpuMov128(x1, x2) + mov3 = m.allocateInstr().asFpuMov128(x2, tmp) + cur = linkInstr(cur, mov1) + cur = linkInstr(cur, mov2) + cur = linkInstr(cur, mov3) + linkInstr(cur, prevNext) + } + } +} + +// InsertMoveBefore implements backend.RegAllocFunctionMachine. +func (m *machine) InsertMoveBefore(dst, src regalloc.VReg, instr *instruction) { + typ := src.RegType() + if typ != dst.RegType() { + panic("BUG: src and dst must have the same type") + } + + mov := m.allocateInstr() + if typ == regalloc.RegTypeInt { + mov.asMove64(dst, src) + } else { + mov.asFpuMov128(dst, src) + } + + cur := instr.prev + prevNext := cur.next + cur = linkInstr(cur, mov) + linkInstr(cur, prevNext) +} + +// SSABlockLabel implements backend.RegAllocFunctionMachine. +func (m *machine) SSABlockLabel(id ssa.BasicBlockID) backend.Label { + return m.executableContext.SsaBlockIDToLabels[id] +} + +// InsertStoreRegisterAt implements backend.RegAllocFunctionMachine. 
+func (m *machine) InsertStoreRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction { + if !v.IsRealReg() { + panic("BUG: VReg must be backed by real reg to be stored") + } + + typ := m.compiler.TypeOf(v) + + var prevNext, cur *instruction + if after { + cur, prevNext = instr, instr.next + } else { + cur, prevNext = instr.prev, instr + } + + offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size()) + var amode addressMode + cur, amode = m.resolveAddressModeForOffsetAndInsert(cur, offsetFromSP, typ.Bits(), spVReg, true) + store := m.allocateInstr() + store.asStore(operandNR(v), amode, typ.Bits()) + + cur = linkInstr(cur, store) + return linkInstr(cur, prevNext) +} + +// InsertReloadRegisterAt implements backend.RegAllocFunctionMachine. +func (m *machine) InsertReloadRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction { + if !v.IsRealReg() { + panic("BUG: VReg must be backed by real reg to be stored") + } + + typ := m.compiler.TypeOf(v) + + var prevNext, cur *instruction + if after { + cur, prevNext = instr, instr.next + } else { + cur, prevNext = instr.prev, instr + } + + offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size()) + var amode addressMode + cur, amode = m.resolveAddressModeForOffsetAndInsert(cur, offsetFromSP, typ.Bits(), spVReg, true) + load := m.allocateInstr() + switch typ { + case ssa.TypeI32, ssa.TypeI64: + load.asULoad(operandNR(v), amode, typ.Bits()) + case ssa.TypeF32, ssa.TypeF64: + load.asFpuLoad(operandNR(v), amode, typ.Bits()) + case ssa.TypeV128: + load.asFpuLoad(operandNR(v), amode, 128) + default: + panic("TODO") + } + + cur = linkInstr(cur, load) + return linkInstr(cur, prevNext) +} + +// LastInstrForInsertion implements backend.RegAllocFunctionMachine. +func (m *machine) LastInstrForInsertion(begin, end *instruction) *instruction { + cur := end + for cur.kind == nop0 { + cur = cur.prev + if cur == begin { + return end + } + } + switch cur.kind { + case br: + return cur + default: + return end + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_relocation.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_relocation.go new file mode 100644 index 000000000..83902d927 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/machine_relocation.go @@ -0,0 +1,117 @@ +package arm64 + +import ( + "encoding/binary" + "fmt" + "math" + "sort" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" +) + +const ( + // trampolineCallSize is the size of the trampoline instruction sequence for each function in an island. + trampolineCallSize = 4*4 + 4 // Four instructions + 32-bit immediate. + + // Unconditional branch offset is encoded as divided by 4 in imm26. + // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/BL--Branch-with-Link-?lang=en + + maxUnconditionalBranchOffset = maxSignedInt26 * 4 + minUnconditionalBranchOffset = minSignedInt26 * 4 + + // trampolineIslandInterval is the range of the trampoline island. + // Half of the range is used for the trampoline island, and the other half is used for the function. + trampolineIslandInterval = maxUnconditionalBranchOffset / 2 + + // maxNumFunctions explicitly specifies the maximum number of functions that can be allowed in a single executable. + maxNumFunctions = trampolineIslandInterval >> 6 + + // maxFunctionExecutableSize is the maximum size of a function that can exist in a trampoline island. 
+ // Conservatively set to 1/4 of the trampoline island interval. + maxFunctionExecutableSize = trampolineIslandInterval >> 2 +) + +// CallTrampolineIslandInfo implements backend.Machine CallTrampolineIslandInfo. +func (m *machine) CallTrampolineIslandInfo(numFunctions int) (interval, size int, err error) { + if numFunctions > maxNumFunctions { + return 0, 0, fmt.Errorf("too many functions: %d > %d", numFunctions, maxNumFunctions) + } + return trampolineIslandInterval, trampolineCallSize * numFunctions, nil +} + +// ResolveRelocations implements backend.Machine ResolveRelocations. +func (m *machine) ResolveRelocations( + refToBinaryOffset []int, + executable []byte, + relocations []backend.RelocationInfo, + callTrampolineIslandOffsets []int, +) { + for _, islandOffset := range callTrampolineIslandOffsets { + encodeCallTrampolineIsland(refToBinaryOffset, islandOffset, executable) + } + + for _, r := range relocations { + instrOffset := r.Offset + calleeFnOffset := refToBinaryOffset[r.FuncRef] + diff := int64(calleeFnOffset) - (instrOffset) + // Check if the diff is within the range of the branch instruction. + if diff < minUnconditionalBranchOffset || diff > maxUnconditionalBranchOffset { + // Find the near trampoline island from callTrampolineIslandOffsets. + islandOffset := searchTrampolineIsland(callTrampolineIslandOffsets, int(instrOffset)) + islandTargetOffset := islandOffset + trampolineCallSize*int(r.FuncRef) + diff = int64(islandTargetOffset) - (instrOffset) + if diff < minUnconditionalBranchOffset || diff > maxUnconditionalBranchOffset { + panic("BUG in trampoline placement") + } + } + binary.LittleEndian.PutUint32(executable[instrOffset:instrOffset+4], encodeUnconditionalBranch(true, diff)) + } +} + +// encodeCallTrampolineIsland encodes a trampoline island for the given functions. +// Each island consists of a trampoline instruction sequence for each function. +// Each trampoline instruction sequence consists of 4 instructions + 32-bit immediate. +func encodeCallTrampolineIsland(refToBinaryOffset []int, islandOffset int, executable []byte) { + for i := 0; i < len(refToBinaryOffset); i++ { + trampolineOffset := islandOffset + trampolineCallSize*i + + fnOffset := refToBinaryOffset[i] + diff := fnOffset - (trampolineOffset + 16) + if diff > math.MaxInt32 || diff < math.MinInt32 { + // This case even amd64 can't handle. 4GB is too big. + panic("too big binary") + } + + // The tmpReg, tmpReg2 is safe to overwrite (in fact any caller-saved register is safe to use). + tmpReg, tmpReg2 := regNumberInEncoding[tmpRegVReg.RealReg()], regNumberInEncoding[x11] + + // adr tmpReg, PC+16: load the address of #diff into tmpReg. + binary.LittleEndian.PutUint32(executable[trampolineOffset:], encodeAdr(tmpReg, 16)) + // ldrsw tmpReg2, [tmpReg]: Load #diff into tmpReg2. + binary.LittleEndian.PutUint32(executable[trampolineOffset+4:], + encodeLoadOrStore(sLoad32, tmpReg2, addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpRegVReg})) + // add tmpReg, tmpReg2, tmpReg: add #diff to the address of #diff, getting the absolute address of the function. + binary.LittleEndian.PutUint32(executable[trampolineOffset+8:], + encodeAluRRR(aluOpAdd, tmpReg, tmpReg, tmpReg2, true, false)) + // br tmpReg: branch to the function without overwriting the link register. 
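// Read together, each per-function trampoline entry is a five-word sequence along
// these lines (a sketch; tmpReg/tmpReg2 are assumed to resolve to x27/x11 as
// defined in reg.go, and the literal is the 32-bit displacement computed above):
//
//	adr   x27, #16       ; address of the trailing literal
//	ldrsw x11, [x27]     ; load the signed 32-bit displacement
//	add   x27, x27, x11  ; absolute address of the target function
//	br    x27            ; jump without touching lr
//	.word diff           ; displacement from the literal to the target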
+ binary.LittleEndian.PutUint32(executable[trampolineOffset+12:], encodeUnconditionalBranchReg(tmpReg, false)) + // #diff + binary.LittleEndian.PutUint32(executable[trampolineOffset+16:], uint32(diff)) + } +} + +// searchTrampolineIsland finds the nearest trampoline island from callTrampolineIslandOffsets. +// Note that even if the offset is in the middle of two islands, it returns the latter one. +// That is ok because the island is always placed in the middle of the range. +// +// precondition: callTrampolineIslandOffsets is sorted in ascending order. +func searchTrampolineIsland(callTrampolineIslandOffsets []int, offset int) int { + l := len(callTrampolineIslandOffsets) + n := sort.Search(l, func(i int) bool { + return callTrampolineIslandOffsets[i] >= offset + }) + if n == l { + n = l - 1 + } + return callTrampolineIslandOffsets[n] +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/reg.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/reg.go new file mode 100644 index 000000000..45737516d --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/reg.go @@ -0,0 +1,397 @@ +package arm64 + +import ( + "fmt" + "strconv" + "strings" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" +) + +// Arm64-specific registers. +// +// See https://developer.arm.com/documentation/dui0801/a/Overview-of-AArch64-state/Predeclared-core-register-names-in-AArch64-state + +const ( + // General purpose registers. Note that we do not distinguish wn and xn registers + // because they are the same from the perspective of register allocator, and + // the size can be determined by the type of the instruction. + + x0 = regalloc.RealRegInvalid + 1 + iota + x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11 + x12 + x13 + x14 + x15 + x16 + x17 + x18 + x19 + x20 + x21 + x22 + x23 + x24 + x25 + x26 + x27 + x28 + x29 + x30 + + // Vector registers. Note that we do not distinguish vn and dn, ... registers + // because they are the same from the perspective of register allocator, and + // the size can be determined by the type of the instruction. 
+ + v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7 + v8 + v9 + v10 + v11 + v12 + v13 + v14 + v15 + v16 + v17 + v18 + v19 + v20 + v21 + v22 + v23 + v24 + v25 + v26 + v27 + v28 + v29 + v30 + v31 + + // Special registers + + xzr + sp + lr = x30 + fp = x29 + tmp = x27 +) + +var ( + x0VReg = regalloc.FromRealReg(x0, regalloc.RegTypeInt) + x1VReg = regalloc.FromRealReg(x1, regalloc.RegTypeInt) + x2VReg = regalloc.FromRealReg(x2, regalloc.RegTypeInt) + x3VReg = regalloc.FromRealReg(x3, regalloc.RegTypeInt) + x4VReg = regalloc.FromRealReg(x4, regalloc.RegTypeInt) + x5VReg = regalloc.FromRealReg(x5, regalloc.RegTypeInt) + x6VReg = regalloc.FromRealReg(x6, regalloc.RegTypeInt) + x7VReg = regalloc.FromRealReg(x7, regalloc.RegTypeInt) + x8VReg = regalloc.FromRealReg(x8, regalloc.RegTypeInt) + x9VReg = regalloc.FromRealReg(x9, regalloc.RegTypeInt) + x10VReg = regalloc.FromRealReg(x10, regalloc.RegTypeInt) + x11VReg = regalloc.FromRealReg(x11, regalloc.RegTypeInt) + x12VReg = regalloc.FromRealReg(x12, regalloc.RegTypeInt) + x13VReg = regalloc.FromRealReg(x13, regalloc.RegTypeInt) + x14VReg = regalloc.FromRealReg(x14, regalloc.RegTypeInt) + x15VReg = regalloc.FromRealReg(x15, regalloc.RegTypeInt) + x16VReg = regalloc.FromRealReg(x16, regalloc.RegTypeInt) + x17VReg = regalloc.FromRealReg(x17, regalloc.RegTypeInt) + x18VReg = regalloc.FromRealReg(x18, regalloc.RegTypeInt) + x19VReg = regalloc.FromRealReg(x19, regalloc.RegTypeInt) + x20VReg = regalloc.FromRealReg(x20, regalloc.RegTypeInt) + x21VReg = regalloc.FromRealReg(x21, regalloc.RegTypeInt) + x22VReg = regalloc.FromRealReg(x22, regalloc.RegTypeInt) + x23VReg = regalloc.FromRealReg(x23, regalloc.RegTypeInt) + x24VReg = regalloc.FromRealReg(x24, regalloc.RegTypeInt) + x25VReg = regalloc.FromRealReg(x25, regalloc.RegTypeInt) + x26VReg = regalloc.FromRealReg(x26, regalloc.RegTypeInt) + x27VReg = regalloc.FromRealReg(x27, regalloc.RegTypeInt) + x28VReg = regalloc.FromRealReg(x28, regalloc.RegTypeInt) + x29VReg = regalloc.FromRealReg(x29, regalloc.RegTypeInt) + x30VReg = regalloc.FromRealReg(x30, regalloc.RegTypeInt) + v0VReg = regalloc.FromRealReg(v0, regalloc.RegTypeFloat) + v1VReg = regalloc.FromRealReg(v1, regalloc.RegTypeFloat) + v2VReg = regalloc.FromRealReg(v2, regalloc.RegTypeFloat) + v3VReg = regalloc.FromRealReg(v3, regalloc.RegTypeFloat) + v4VReg = regalloc.FromRealReg(v4, regalloc.RegTypeFloat) + v5VReg = regalloc.FromRealReg(v5, regalloc.RegTypeFloat) + v6VReg = regalloc.FromRealReg(v6, regalloc.RegTypeFloat) + v7VReg = regalloc.FromRealReg(v7, regalloc.RegTypeFloat) + v8VReg = regalloc.FromRealReg(v8, regalloc.RegTypeFloat) + v9VReg = regalloc.FromRealReg(v9, regalloc.RegTypeFloat) + v10VReg = regalloc.FromRealReg(v10, regalloc.RegTypeFloat) + v11VReg = regalloc.FromRealReg(v11, regalloc.RegTypeFloat) + v12VReg = regalloc.FromRealReg(v12, regalloc.RegTypeFloat) + v13VReg = regalloc.FromRealReg(v13, regalloc.RegTypeFloat) + v14VReg = regalloc.FromRealReg(v14, regalloc.RegTypeFloat) + v15VReg = regalloc.FromRealReg(v15, regalloc.RegTypeFloat) + v16VReg = regalloc.FromRealReg(v16, regalloc.RegTypeFloat) + v17VReg = regalloc.FromRealReg(v17, regalloc.RegTypeFloat) + v18VReg = regalloc.FromRealReg(v18, regalloc.RegTypeFloat) + v19VReg = regalloc.FromRealReg(v19, regalloc.RegTypeFloat) + v20VReg = regalloc.FromRealReg(v20, regalloc.RegTypeFloat) + v21VReg = regalloc.FromRealReg(v21, regalloc.RegTypeFloat) + v22VReg = regalloc.FromRealReg(v22, regalloc.RegTypeFloat) + v23VReg = regalloc.FromRealReg(v23, regalloc.RegTypeFloat) + v24VReg = 
regalloc.FromRealReg(v24, regalloc.RegTypeFloat) + v25VReg = regalloc.FromRealReg(v25, regalloc.RegTypeFloat) + v26VReg = regalloc.FromRealReg(v26, regalloc.RegTypeFloat) + v27VReg = regalloc.FromRealReg(v27, regalloc.RegTypeFloat) + // lr (link register) holds the return address at the function entry. + lrVReg = x30VReg + // tmpReg is used to perform spill/load on large stack offsets, and load large constants. + // Therefore, be cautious to use this register in the middle of the compilation, especially before the register allocation. + // This is the same as golang/go, but it's only described in the source code: + // https://github.com/golang/go/blob/18e17e2cb12837ea2c8582ecdb0cc780f49a1aac/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go#L59 + // https://github.com/golang/go/blob/18e17e2cb12837ea2c8582ecdb0cc780f49a1aac/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go#L13-L15 + tmpRegVReg = regalloc.FromRealReg(tmp, regalloc.RegTypeInt) + v28VReg = regalloc.FromRealReg(v28, regalloc.RegTypeFloat) + v29VReg = regalloc.FromRealReg(v29, regalloc.RegTypeFloat) + v30VReg = regalloc.FromRealReg(v30, regalloc.RegTypeFloat) + v31VReg = regalloc.FromRealReg(v31, regalloc.RegTypeFloat) + xzrVReg = regalloc.FromRealReg(xzr, regalloc.RegTypeInt) + spVReg = regalloc.FromRealReg(sp, regalloc.RegTypeInt) + fpVReg = regalloc.FromRealReg(fp, regalloc.RegTypeInt) +) + +var regNames = [...]string{ + x0: "x0", + x1: "x1", + x2: "x2", + x3: "x3", + x4: "x4", + x5: "x5", + x6: "x6", + x7: "x7", + x8: "x8", + x9: "x9", + x10: "x10", + x11: "x11", + x12: "x12", + x13: "x13", + x14: "x14", + x15: "x15", + x16: "x16", + x17: "x17", + x18: "x18", + x19: "x19", + x20: "x20", + x21: "x21", + x22: "x22", + x23: "x23", + x24: "x24", + x25: "x25", + x26: "x26", + x27: "x27", + x28: "x28", + x29: "x29", + x30: "x30", + xzr: "xzr", + sp: "sp", + v0: "v0", + v1: "v1", + v2: "v2", + v3: "v3", + v4: "v4", + v5: "v5", + v6: "v6", + v7: "v7", + v8: "v8", + v9: "v9", + v10: "v10", + v11: "v11", + v12: "v12", + v13: "v13", + v14: "v14", + v15: "v15", + v16: "v16", + v17: "v17", + v18: "v18", + v19: "v19", + v20: "v20", + v21: "v21", + v22: "v22", + v23: "v23", + v24: "v24", + v25: "v25", + v26: "v26", + v27: "v27", + v28: "v28", + v29: "v29", + v30: "v30", + v31: "v31", +} + +func formatVRegSized(r regalloc.VReg, size byte) (ret string) { + if r.IsRealReg() { + ret = regNames[r.RealReg()] + switch ret[0] { + case 'x': + switch size { + case 32: + ret = strings.Replace(ret, "x", "w", 1) + case 64: + default: + panic("BUG: invalid register size: " + strconv.Itoa(int(size))) + } + case 'v': + switch size { + case 32: + ret = strings.Replace(ret, "v", "s", 1) + case 64: + ret = strings.Replace(ret, "v", "d", 1) + case 128: + ret = strings.Replace(ret, "v", "q", 1) + default: + panic("BUG: invalid register size") + } + } + } else { + switch r.RegType() { + case regalloc.RegTypeInt: + switch size { + case 32: + ret = fmt.Sprintf("w%d?", r.ID()) + case 64: + ret = fmt.Sprintf("x%d?", r.ID()) + default: + panic("BUG: invalid register size: " + strconv.Itoa(int(size))) + } + case regalloc.RegTypeFloat: + switch size { + case 32: + ret = fmt.Sprintf("s%d?", r.ID()) + case 64: + ret = fmt.Sprintf("d%d?", r.ID()) + case 128: + ret = fmt.Sprintf("q%d?", r.ID()) + default: + panic("BUG: invalid register size") + } + default: + panic(fmt.Sprintf("BUG: invalid register type: %d for %s", r.RegType(), r)) + } + } + return +} + +func formatVRegWidthVec(r regalloc.VReg, width vecArrangement) (ret string) { + var id string + wspec := 
strings.ToLower(width.String()) + if r.IsRealReg() { + id = regNames[r.RealReg()][1:] + } else { + id = fmt.Sprintf("%d?", r.ID()) + } + ret = fmt.Sprintf("%s%s", wspec, id) + return +} + +func formatVRegVec(r regalloc.VReg, arr vecArrangement, index vecIndex) (ret string) { + id := fmt.Sprintf("v%d?", r.ID()) + if r.IsRealReg() { + id = regNames[r.RealReg()] + } + ret = fmt.Sprintf("%s.%s", id, strings.ToLower(arr.String())) + if index != vecIndexNone { + ret += fmt.Sprintf("[%d]", index) + } + return +} + +func regTypeToRegisterSizeInBits(r regalloc.RegType) byte { + switch r { + case regalloc.RegTypeInt: + return 64 + case regalloc.RegTypeFloat: + return 128 + default: + panic("BUG: invalid register type") + } +} + +var regNumberInEncoding = [...]uint32{ + x0: 0, + x1: 1, + x2: 2, + x3: 3, + x4: 4, + x5: 5, + x6: 6, + x7: 7, + x8: 8, + x9: 9, + x10: 10, + x11: 11, + x12: 12, + x13: 13, + x14: 14, + x15: 15, + x16: 16, + x17: 17, + x18: 18, + x19: 19, + x20: 20, + x21: 21, + x22: 22, + x23: 23, + x24: 24, + x25: 25, + x26: 26, + x27: 27, + x28: 28, + x29: 29, + x30: 30, + xzr: 31, + sp: 31, + v0: 0, + v1: 1, + v2: 2, + v3: 3, + v4: 4, + v5: 5, + v6: 6, + v7: 7, + v8: 8, + v9: 9, + v10: 10, + v11: 11, + v12: 12, + v13: 13, + v14: 14, + v15: 15, + v16: 16, + v17: 17, + v18: 18, + v19: 19, + v20: 20, + v21: 21, + v22: 22, + v23: 23, + v24: 24, + v25: 25, + v26: 26, + v27: 27, + v28: 28, + v29: 29, + v30: 30, + v31: 31, +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/unwind_stack.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/unwind_stack.go new file mode 100644 index 000000000..edb0e36e3 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64/unwind_stack.go @@ -0,0 +1,90 @@ +package arm64 + +import ( + "encoding/binary" + "reflect" + "unsafe" + + "github.com/tetratelabs/wazero/internal/wasmdebug" +) + +// UnwindStack implements wazevo.unwindStack. +func UnwindStack(sp, _, top uintptr, returnAddresses []uintptr) []uintptr { + l := int(top - sp) + + var stackBuf []byte + { + // TODO: use unsafe.Slice after floor version is set to Go 1.20. + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&stackBuf)) + hdr.Data = sp + hdr.Len = l + hdr.Cap = l + } + + for i := uint64(0); i < uint64(l); { + // (high address) + // +-----------------+ + // | ....... | + // | ret Y | <----+ + // | ....... | | + // | ret 0 | | + // | arg X | | size_of_arg_ret + // | ....... | | + // | arg 1 | | + // | arg 0 | <----+ + // | size_of_arg_ret | + // | ReturnAddress | + // +-----------------+ <----+ + // | ........... | | + // | spill slot M | | + // | ............ | | + // | spill slot 2 | | + // | spill slot 1 | | frame size + // | spill slot 1 | | + // | clobbered N | | + // | ............ | | + // | clobbered 0 | <----+ + // | xxxxxx | ;; unused space to make it 16-byte aligned. + // | frame_size | + // +-----------------+ <---- SP + // (low address) + + frameSize := binary.LittleEndian.Uint64(stackBuf[i:]) + i += frameSize + + 16 // frame size + aligned space. + retAddr := binary.LittleEndian.Uint64(stackBuf[i:]) + i += 8 // ret addr. + sizeOfArgRet := binary.LittleEndian.Uint64(stackBuf[i:]) + i += 8 + sizeOfArgRet + returnAddresses = append(returnAddresses, uintptr(retAddr)) + if len(returnAddresses) == wasmdebug.MaxFrames { + break + } + } + return returnAddresses +} + +// GoCallStackView implements wazevo.goCallStackView. 
+func GoCallStackView(stackPointerBeforeGoCall *uint64) []uint64 { + // (high address) + // +-----------------+ <----+ + // | xxxxxxxxxxx | | ;; optional unused space to make it 16-byte aligned. + // ^ | arg[N]/ret[M] | | + // sliceSize | | ............ | | sliceSize + // | | arg[1]/ret[1] | | + // v | arg[0]/ret[0] | <----+ + // | sliceSize | + // | frame_size | + // +-----------------+ <---- stackPointerBeforeGoCall + // (low address) + ptr := unsafe.Pointer(stackPointerBeforeGoCall) + size := *(*uint64)(unsafe.Add(ptr, 8)) + var view []uint64 + { + sh := (*reflect.SliceHeader)(unsafe.Pointer(&view)) + sh.Data = uintptr(unsafe.Add(ptr, 16)) // skips the (frame_size, sliceSize). + sh.Len = int(size) + sh.Cap = int(size) + } + return view +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/machine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/machine.go new file mode 100644 index 000000000..54ce89e46 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/machine.go @@ -0,0 +1,100 @@ +package backend + +import ( + "context" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +type ( + // Machine is a backend for a specific ISA machine. + Machine interface { + ExecutableContext() ExecutableContext + + // DisableStackCheck disables the stack check for the current compilation for debugging/testing. + DisableStackCheck() + + // SetCurrentABI initializes the FunctionABI for the given signature. + SetCurrentABI(abi *FunctionABI) + + // SetCompiler sets the compilation context used for the lifetime of Machine. + // This is only called once per Machine, i.e. before the first compilation. + SetCompiler(Compiler) + + // LowerSingleBranch is called when the compilation of the given single branch is started. + LowerSingleBranch(b *ssa.Instruction) + + // LowerConditionalBranch is called when the compilation of the given conditional branch is started. + LowerConditionalBranch(b *ssa.Instruction) + + // LowerInstr is called for each instruction in the given block except for the ones marked as already lowered + // via Compiler.MarkLowered. The order is reverse, i.e. from the last instruction to the first one. + // + // Note: this can lower multiple instructions (which produce the inputs) at once whenever it's possible + // for optimization. + LowerInstr(*ssa.Instruction) + + // Reset resets the machine state for the next compilation. + Reset() + + // InsertMove inserts a move instruction from src to dst whose type is typ. + InsertMove(dst, src regalloc.VReg, typ ssa.Type) + + // InsertReturn inserts the return instruction to return from the current function. + InsertReturn() + + // InsertLoadConstantBlockArg inserts the instruction(s) to load the constant value into the given regalloc.VReg. + InsertLoadConstantBlockArg(instr *ssa.Instruction, vr regalloc.VReg) + + // Format returns the string representation of the currently compiled machine code. + // This is only for testing purpose. + Format() string + + // RegAlloc does the register allocation after lowering. + RegAlloc() + + // PostRegAlloc does the post register allocation, e.g. setting up prologue/epilogue, redundant move elimination, etc. + PostRegAlloc() + + // ResolveRelocations resolves the relocations after emitting machine code. 
+ // * refToBinaryOffset: the map from the function reference (ssa.FuncRef) to the executable offset. + // * executable: the binary to resolve the relocations. + // * relocations: the relocations to resolve. + // * callTrampolineIslandOffsets: the offsets of the trampoline islands in the executable. + ResolveRelocations( + refToBinaryOffset []int, + executable []byte, + relocations []RelocationInfo, + callTrampolineIslandOffsets []int, + ) + + // Encode encodes the machine instructions to the Compiler. + Encode(ctx context.Context) error + + // CompileGoFunctionTrampoline compiles the trampoline function to call a Go function of the given exit code and signature. + CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte + + // CompileStackGrowCallSequence returns the sequence of instructions shared by all functions to + // call the stack grow builtin function. + CompileStackGrowCallSequence() []byte + + // CompileEntryPreamble returns the sequence of instructions shared by multiple functions to + // enter the function from Go. + CompileEntryPreamble(signature *ssa.Signature) []byte + + // LowerParams lowers the given parameters. + LowerParams(params []ssa.Value) + + // LowerReturns lowers the given returns. + LowerReturns(returns []ssa.Value) + + // ArgsResultsRegs returns the registers used for arguments and return values. + ArgsResultsRegs() (argResultInts, argResultFloats []regalloc.RealReg) + + // CallTrampolineIslandInfo returns the interval of the offset where the trampoline island is placed, and + // the size of the trampoline island. If islandSize is zero, the trampoline island is not used on this machine. + CallTrampolineIslandInfo(numFunctions int) (interval, islandSize int, err error) + } +) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go new file mode 100644 index 000000000..3f36c84e5 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc.go @@ -0,0 +1,319 @@ +package backend + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// RegAllocFunctionMachine is the interface for the machine specific logic that will be used in RegAllocFunction. +type RegAllocFunctionMachine[I regalloc.InstrConstraint] interface { + // InsertMoveBefore inserts the move instruction from src to dst before the given instruction. + InsertMoveBefore(dst, src regalloc.VReg, instr I) + // InsertStoreRegisterAt inserts the instruction(s) to store the given virtual register at the given instruction. + // If after is true, the instruction(s) will be inserted after the given instruction, otherwise before. + InsertStoreRegisterAt(v regalloc.VReg, instr I, after bool) I + // InsertReloadRegisterAt inserts the instruction(s) to reload the given virtual register at the given instruction. + // If after is true, the instruction(s) will be inserted after the given instruction, otherwise before. + InsertReloadRegisterAt(v regalloc.VReg, instr I, after bool) I + // ClobberedRegisters is called when the register allocation is done and the clobbered registers are known. + ClobberedRegisters(regs []regalloc.VReg) + // Swap swaps the two virtual registers after the given instruction. + Swap(cur I, x1, x2, tmp regalloc.VReg) + // LastInstrForInsertion implements LastInstrForInsertion of regalloc.Function. 
See its comment for details. + LastInstrForInsertion(begin, end I) I + // SSABlockLabel returns the label of the given ssa.BasicBlockID. + SSABlockLabel(id ssa.BasicBlockID) Label +} + +type ( + // RegAllocFunction implements regalloc.Function. + RegAllocFunction[I regalloc.InstrConstraint, m RegAllocFunctionMachine[I]] struct { + m m + ssb ssa.Builder + c Compiler + // iter is the iterator for reversePostOrderBlocks + iter int + reversePostOrderBlocks []RegAllocBlock[I, m] + // labelToRegAllocBlockIndex maps label to the index of reversePostOrderBlocks. + labelToRegAllocBlockIndex map[Label]int + loopNestingForestRoots []ssa.BasicBlock + } + + // RegAllocBlock implements regalloc.Block. + RegAllocBlock[I regalloc.InstrConstraint, m RegAllocFunctionMachine[I]] struct { + // f is the function this instruction belongs to. Used to reuse the regAllocFunctionImpl.predsSlice slice for Defs() and Uses(). + f *RegAllocFunction[I, m] + sb ssa.BasicBlock + l Label + begin, end I + loopNestingForestChildren []ssa.BasicBlock + cur I + id int + cachedLastInstrForInsertion I + } +) + +// NewRegAllocFunction returns a new RegAllocFunction. +func NewRegAllocFunction[I regalloc.InstrConstraint, M RegAllocFunctionMachine[I]](m M, ssb ssa.Builder, c Compiler) *RegAllocFunction[I, M] { + return &RegAllocFunction[I, M]{ + m: m, + ssb: ssb, + c: c, + labelToRegAllocBlockIndex: make(map[Label]int), + } +} + +// AddBlock adds a new block to the function. +func (f *RegAllocFunction[I, M]) AddBlock(sb ssa.BasicBlock, l Label, begin, end I) { + i := len(f.reversePostOrderBlocks) + f.reversePostOrderBlocks = append(f.reversePostOrderBlocks, RegAllocBlock[I, M]{ + f: f, + sb: sb, + l: l, + begin: begin, + end: end, + id: int(sb.ID()), + }) + f.labelToRegAllocBlockIndex[l] = i +} + +// Reset resets the function for the next compilation. +func (f *RegAllocFunction[I, M]) Reset() { + f.reversePostOrderBlocks = f.reversePostOrderBlocks[:0] + f.iter = 0 +} + +// StoreRegisterAfter implements regalloc.Function StoreRegisterAfter. +func (f *RegAllocFunction[I, M]) StoreRegisterAfter(v regalloc.VReg, instr regalloc.Instr) { + m := f.m + m.InsertStoreRegisterAt(v, instr.(I), true) +} + +// ReloadRegisterBefore implements regalloc.Function ReloadRegisterBefore. +func (f *RegAllocFunction[I, M]) ReloadRegisterBefore(v regalloc.VReg, instr regalloc.Instr) { + m := f.m + m.InsertReloadRegisterAt(v, instr.(I), false) +} + +// ReloadRegisterAfter implements regalloc.Function ReloadRegisterAfter. +func (f *RegAllocFunction[I, M]) ReloadRegisterAfter(v regalloc.VReg, instr regalloc.Instr) { + m := f.m + m.InsertReloadRegisterAt(v, instr.(I), true) +} + +// StoreRegisterBefore implements regalloc.Function StoreRegisterBefore. +func (f *RegAllocFunction[I, M]) StoreRegisterBefore(v regalloc.VReg, instr regalloc.Instr) { + m := f.m + m.InsertStoreRegisterAt(v, instr.(I), false) +} + +// ClobberedRegisters implements regalloc.Function ClobberedRegisters. +func (f *RegAllocFunction[I, M]) ClobberedRegisters(regs []regalloc.VReg) { + f.m.ClobberedRegisters(regs) +} + +// SwapBefore implements regalloc.Function SwapBefore. +func (f *RegAllocFunction[I, M]) SwapBefore(x1, x2, tmp regalloc.VReg, instr regalloc.Instr) { + f.m.Swap(instr.Prev().(I), x1, x2, tmp) +} + +// PostOrderBlockIteratorBegin implements regalloc.Function PostOrderBlockIteratorBegin. 
+func (f *RegAllocFunction[I, M]) PostOrderBlockIteratorBegin() regalloc.Block { + f.iter = len(f.reversePostOrderBlocks) - 1 + return f.PostOrderBlockIteratorNext() +} + +// PostOrderBlockIteratorNext implements regalloc.Function PostOrderBlockIteratorNext. +func (f *RegAllocFunction[I, M]) PostOrderBlockIteratorNext() regalloc.Block { + if f.iter < 0 { + return nil + } + b := &f.reversePostOrderBlocks[f.iter] + f.iter-- + return b +} + +// ReversePostOrderBlockIteratorBegin implements regalloc.Function ReversePostOrderBlockIteratorBegin. +func (f *RegAllocFunction[I, M]) ReversePostOrderBlockIteratorBegin() regalloc.Block { + f.iter = 0 + return f.ReversePostOrderBlockIteratorNext() +} + +// ReversePostOrderBlockIteratorNext implements regalloc.Function ReversePostOrderBlockIteratorNext. +func (f *RegAllocFunction[I, M]) ReversePostOrderBlockIteratorNext() regalloc.Block { + if f.iter >= len(f.reversePostOrderBlocks) { + return nil + } + b := &f.reversePostOrderBlocks[f.iter] + f.iter++ + return b +} + +// LoopNestingForestRoots implements regalloc.Function LoopNestingForestRoots. +func (f *RegAllocFunction[I, M]) LoopNestingForestRoots() int { + f.loopNestingForestRoots = f.ssb.LoopNestingForestRoots() + return len(f.loopNestingForestRoots) +} + +// LoopNestingForestRoot implements regalloc.Function LoopNestingForestRoot. +func (f *RegAllocFunction[I, M]) LoopNestingForestRoot(i int) regalloc.Block { + blk := f.loopNestingForestRoots[i] + l := f.m.SSABlockLabel(blk.ID()) + index := f.labelToRegAllocBlockIndex[l] + return &f.reversePostOrderBlocks[index] +} + +// InsertMoveBefore implements regalloc.Function InsertMoveBefore. +func (f *RegAllocFunction[I, M]) InsertMoveBefore(dst, src regalloc.VReg, instr regalloc.Instr) { + f.m.InsertMoveBefore(dst, src, instr.(I)) +} + +// LowestCommonAncestor implements regalloc.Function LowestCommonAncestor. +func (f *RegAllocFunction[I, M]) LowestCommonAncestor(blk1, blk2 regalloc.Block) regalloc.Block { + ret := f.ssb.LowestCommonAncestor(blk1.(*RegAllocBlock[I, M]).sb, blk2.(*RegAllocBlock[I, M]).sb) + l := f.m.SSABlockLabel(ret.ID()) + index := f.labelToRegAllocBlockIndex[l] + return &f.reversePostOrderBlocks[index] +} + +// Idom implements regalloc.Function Idom. +func (f *RegAllocFunction[I, M]) Idom(blk regalloc.Block) regalloc.Block { + builder := f.ssb + idom := builder.Idom(blk.(*RegAllocBlock[I, M]).sb) + if idom == nil { + panic("BUG: idom must not be nil") + } + l := f.m.SSABlockLabel(idom.ID()) + index := f.labelToRegAllocBlockIndex[l] + return &f.reversePostOrderBlocks[index] +} + +// ID implements regalloc.Block. +func (r *RegAllocBlock[I, m]) ID() int32 { return int32(r.id) } + +// BlockParams implements regalloc.Block. +func (r *RegAllocBlock[I, m]) BlockParams(regs *[]regalloc.VReg) []regalloc.VReg { + c := r.f.c + *regs = (*regs)[:0] + for i := 0; i < r.sb.Params(); i++ { + v := c.VRegOf(r.sb.Param(i)) + *regs = append(*regs, v) + } + return *regs +} + +// InstrIteratorBegin implements regalloc.Block. +func (r *RegAllocBlock[I, m]) InstrIteratorBegin() regalloc.Instr { + r.cur = r.begin + return r.cur +} + +// InstrIteratorNext implements regalloc.Block. +func (r *RegAllocBlock[I, m]) InstrIteratorNext() regalloc.Instr { + for { + if r.cur == r.end { + return nil + } + instr := r.cur.Next() + r.cur = instr.(I) + if instr == nil { + return nil + } else if instr.AddedBeforeRegAlloc() { + // Only concerned about the instruction added before regalloc. + return instr + } + } +} + +// InstrRevIteratorBegin implements regalloc.Block. 
+func (r *RegAllocBlock[I, m]) InstrRevIteratorBegin() regalloc.Instr { + r.cur = r.end + return r.cur +} + +// InstrRevIteratorNext implements regalloc.Block. +func (r *RegAllocBlock[I, m]) InstrRevIteratorNext() regalloc.Instr { + for { + if r.cur == r.begin { + return nil + } + instr := r.cur.Prev() + r.cur = instr.(I) + if instr == nil { + return nil + } else if instr.AddedBeforeRegAlloc() { + // Only concerned about the instruction added before regalloc. + return instr + } + } +} + +// FirstInstr implements regalloc.Block. +func (r *RegAllocBlock[I, m]) FirstInstr() regalloc.Instr { + return r.begin +} + +// EndInstr implements regalloc.Block. +func (r *RegAllocBlock[I, m]) EndInstr() regalloc.Instr { + return r.end +} + +// LastInstrForInsertion implements regalloc.Block. +func (r *RegAllocBlock[I, m]) LastInstrForInsertion() regalloc.Instr { + var nil I + if r.cachedLastInstrForInsertion == nil { + r.cachedLastInstrForInsertion = r.f.m.LastInstrForInsertion(r.begin, r.end) + } + return r.cachedLastInstrForInsertion +} + +// Preds implements regalloc.Block. +func (r *RegAllocBlock[I, m]) Preds() int { return r.sb.Preds() } + +// Pred implements regalloc.Block. +func (r *RegAllocBlock[I, m]) Pred(i int) regalloc.Block { + sb := r.sb + pred := sb.Pred(i) + l := r.f.m.SSABlockLabel(pred.ID()) + index := r.f.labelToRegAllocBlockIndex[l] + return &r.f.reversePostOrderBlocks[index] +} + +// Entry implements regalloc.Block. +func (r *RegAllocBlock[I, m]) Entry() bool { return r.sb.EntryBlock() } + +// Succs implements regalloc.Block. +func (r *RegAllocBlock[I, m]) Succs() int { + return r.sb.Succs() +} + +// Succ implements regalloc.Block. +func (r *RegAllocBlock[I, m]) Succ(i int) regalloc.Block { + sb := r.sb + succ := sb.Succ(i) + if succ.ReturnBlock() { + return nil + } + l := r.f.m.SSABlockLabel(succ.ID()) + index := r.f.labelToRegAllocBlockIndex[l] + return &r.f.reversePostOrderBlocks[index] +} + +// LoopHeader implements regalloc.Block. +func (r *RegAllocBlock[I, m]) LoopHeader() bool { + return r.sb.LoopHeader() +} + +// LoopNestingForestChildren implements regalloc.Block. +func (r *RegAllocBlock[I, m]) LoopNestingForestChildren() int { + r.loopNestingForestChildren = r.sb.LoopNestingForestChildren() + return len(r.loopNestingForestChildren) +} + +// LoopNestingForestChild implements regalloc.Block. +func (r *RegAllocBlock[I, m]) LoopNestingForestChild(i int) regalloc.Block { + blk := r.loopNestingForestChildren[i] + l := r.f.m.SSABlockLabel(blk.ID()) + index := r.f.labelToRegAllocBlockIndex[l] + return &r.f.reversePostOrderBlocks[index] +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/api.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/api.go new file mode 100644 index 000000000..23157b478 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/api.go @@ -0,0 +1,136 @@ +package regalloc + +import "fmt" + +// These interfaces are implemented by ISA-specific backends to abstract away the details, and allow the register +// allocators to work on any ISA. +// +// TODO: the interfaces are not stabilized yet, especially x64 will need some changes. E.g. x64 has an addressing mode +// where index can be in memory. 
That kind of info will be useful to reduce the register pressure, and should be leveraged
+// by the register allocators, like https://docs.rs/regalloc2/latest/regalloc2/enum.OperandConstraint.html
+
+type (
+ // Function is the top-level interface to do register allocation, which corresponds to a CFG containing
+ // Block(s).
+ Function interface {
+ // PostOrderBlockIteratorBegin returns the first block in the post-order traversal of the CFG.
+ // In other words, the last blocks in the CFG will be returned first.
+ PostOrderBlockIteratorBegin() Block
+ // PostOrderBlockIteratorNext returns the next block in the post-order traversal of the CFG.
+ PostOrderBlockIteratorNext() Block
+ // ReversePostOrderBlockIteratorBegin returns the first block in the reverse post-order traversal of the CFG.
+ // In other words, the first blocks in the CFG will be returned first.
+ ReversePostOrderBlockIteratorBegin() Block
+ // ReversePostOrderBlockIteratorNext returns the next block in the reverse post-order traversal of the CFG.
+ ReversePostOrderBlockIteratorNext() Block
+ // ClobberedRegisters tells the backend which registers are clobbered by this function.
+ ClobberedRegisters([]VReg)
+ // LoopNestingForestRoots returns the number of roots of the loop nesting forest in a function.
+ LoopNestingForestRoots() int
+ // LoopNestingForestRoot returns the i-th root of the loop nesting forest in a function.
+ LoopNestingForestRoot(i int) Block
+ // LowestCommonAncestor returns the lowest common ancestor of two blocks in the dominator tree.
+ LowestCommonAncestor(blk1, blk2 Block) Block
+ // Idom returns the immediate dominator of the given block.
+ Idom(blk Block) Block
+
+ // The following methods are used to rewrite the function.
+
+ // SwapBefore swaps the two virtual registers just before the given instruction, using tmp as a temporary if needed.
+ SwapBefore(x1, x2, tmp VReg, instr Instr)
+ // StoreRegisterBefore inserts store instruction(s) before the given instruction for the given virtual register.
+ StoreRegisterBefore(v VReg, instr Instr)
+ // StoreRegisterAfter inserts store instruction(s) after the given instruction for the given virtual register.
+ StoreRegisterAfter(v VReg, instr Instr)
+ // ReloadRegisterBefore inserts reload instruction(s) before the given instruction for the given virtual register.
+ ReloadRegisterBefore(v VReg, instr Instr)
+ // ReloadRegisterAfter inserts reload instruction(s) after the given instruction for the given virtual register.
+ ReloadRegisterAfter(v VReg, instr Instr)
+ // InsertMoveBefore inserts move instruction(s) before the given instruction for the given virtual registers.
+ InsertMoveBefore(dst, src VReg, instr Instr)
+ }
+
+ // Block is a basic block in the CFG of a function, and it consists of multiple instructions, and predecessor Block(s).
+ Block interface {
+ // ID returns the unique identifier of this block which is ordered in the reverse post-order traversal of the CFG.
+ ID() int32
+ // BlockParams returns the virtual registers used as the parameters of this block.
+ BlockParams(*[]VReg) []VReg
+ // InstrIteratorBegin returns the first instruction in this block. Instructions added after lowering must be skipped.
+ // Note: multiple Instr(s) will not be held at the same time, so it's safe to use the same impl for the return Instr.
+ InstrIteratorBegin() Instr
+ // InstrIteratorNext returns the next instruction in this block. Instructions added after lowering must be skipped.
+ // Note: multiple Instr(s) will not be held at the same time, so it's safe to use the same impl for the return Instr.
+ InstrIteratorNext() Instr + // InstrRevIteratorBegin is the same as InstrIteratorBegin, but in the reverse order. + InstrRevIteratorBegin() Instr + // InstrRevIteratorNext is the same as InstrIteratorNext, but in the reverse order. + InstrRevIteratorNext() Instr + // FirstInstr returns the fist instruction in this block where instructions will be inserted after it. + FirstInstr() Instr + // EndInstr returns the end instruction in this block. + EndInstr() Instr + // LastInstrForInsertion returns the last instruction in this block where instructions will be inserted before it. + // Such insertions only happen when we need to insert spill/reload instructions to adjust the merge edges. + // At the time of register allocation, all the critical edges are already split, so there is no need + // to worry about the case where branching instruction has multiple successors. + // Therefore, usually, it is the nop instruction, but if the block ends with an unconditional branching, then it returns + // the unconditional branch, not the nop. In other words it is either nop or unconditional branch. + LastInstrForInsertion() Instr + // Preds returns the number of predecessors of this block in the CFG. + Preds() int + // Pred returns the i-th predecessor of this block in the CFG. + Pred(i int) Block + // Entry returns true if the block is for the entry block. + Entry() bool + // Succs returns the number of successors of this block in the CFG. + Succs() int + // Succ returns the i-th successor of this block in the CFG. + Succ(i int) Block + // LoopHeader returns true if this block is a loop header. + LoopHeader() bool + // LoopNestingForestChildren returns the number of children of this block in the loop nesting forest. + LoopNestingForestChildren() int + // LoopNestingForestChild returns the i-th child of this block in the loop nesting forest. + LoopNestingForestChild(i int) Block + } + + // Instr is an instruction in a block, abstracting away the underlying ISA. + Instr interface { + fmt.Stringer + // Next returns the next instruction in the same block. + Next() Instr + // Prev returns the previous instruction in the same block. + Prev() Instr + // Defs returns the virtual registers defined by this instruction. + Defs(*[]VReg) []VReg + // Uses returns the virtual registers used by this instruction. + // Note: multiple returned []VReg will not be held at the same time, so it's safe to use the same slice for this. + Uses(*[]VReg) []VReg + // AssignUse assigns the RealReg-allocated virtual register used by this instruction at the given index. + AssignUse(index int, v VReg) + // AssignDef assigns a RealReg-allocated virtual register defined by this instruction. + // This only accepts one register because we don't allocate registers for multi-def instructions (i.e. call instruction) + AssignDef(VReg) + // IsCopy returns true if this instruction is a move instruction between two registers. + // If true, the instruction is of the form of dst = src, and if the src and dst do not interfere with each other, + // we could coalesce them, and hence the copy can be eliminated from the final code. + IsCopy() bool + // IsCall returns true if this instruction is a call instruction. The result is used to insert + // caller saved register spills and restores. + IsCall() bool + // IsIndirectCall returns true if this instruction is an indirect call instruction which calls a function pointer. + // The result is used to insert caller saved register spills and restores. 
+ IsIndirectCall() bool + // IsReturn returns true if this instruction is a return instruction. + IsReturn() bool + // AddedBeforeRegAlloc returns true if this instruction is added before register allocation. + AddedBeforeRegAlloc() bool + } + + // InstrConstraint is an interface for arch-specific instruction constraints. + InstrConstraint interface { + comparable + Instr + } +) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/reg.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/reg.go new file mode 100644 index 000000000..46df807e6 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/reg.go @@ -0,0 +1,123 @@ +package regalloc + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// VReg represents a register which is assigned to an SSA value. This is used to represent a register in the backend. +// A VReg may or may not be a physical register, and the info of physical register can be obtained by RealReg. +type VReg uint64 + +// VRegID is the lower 32bit of VReg, which is the pure identifier of VReg without RealReg info. +type VRegID uint32 + +// RealReg returns the RealReg of this VReg. +func (v VReg) RealReg() RealReg { + return RealReg(v >> 32) +} + +// IsRealReg returns true if this VReg is backed by a physical register. +func (v VReg) IsRealReg() bool { + return v.RealReg() != RealRegInvalid +} + +// FromRealReg returns a VReg from the given RealReg and RegType. +// This is used to represent a specific pre-colored register in the backend. +func FromRealReg(r RealReg, typ RegType) VReg { + rid := VRegID(r) + if rid > vRegIDReservedForRealNum { + panic(fmt.Sprintf("invalid real reg %d", r)) + } + return VReg(r).SetRealReg(r).SetRegType(typ) +} + +// SetRealReg sets the RealReg of this VReg and returns the updated VReg. +func (v VReg) SetRealReg(r RealReg) VReg { + return VReg(r)<<32 | (v & 0xff_00_ffffffff) +} + +// RegType returns the RegType of this VReg. +func (v VReg) RegType() RegType { + return RegType(v >> 40) +} + +// SetRegType sets the RegType of this VReg and returns the updated VReg. +func (v VReg) SetRegType(t RegType) VReg { + return VReg(t)<<40 | (v & 0x00_ff_ffffffff) +} + +// ID returns the VRegID of this VReg. +func (v VReg) ID() VRegID { + return VRegID(v & 0xffffffff) +} + +// Valid returns true if this VReg is Valid. +func (v VReg) Valid() bool { + return v.ID() != vRegIDInvalid && v.RegType() != RegTypeInvalid +} + +// RealReg represents a physical register. +type RealReg byte + +const RealRegInvalid RealReg = 0 + +const ( + vRegIDInvalid VRegID = 1 << 31 + VRegIDNonReservedBegin = vRegIDReservedForRealNum + vRegIDReservedForRealNum VRegID = 128 + VRegInvalid = VReg(vRegIDInvalid) +) + +// String implements fmt.Stringer. +func (r RealReg) String() string { + switch r { + case RealRegInvalid: + return "invalid" + default: + return fmt.Sprintf("r%d", r) + } +} + +// String implements fmt.Stringer. +func (v VReg) String() string { + if v.IsRealReg() { + return fmt.Sprintf("r%d", v.ID()) + } + return fmt.Sprintf("v%d?", v.ID()) +} + +// RegType represents the type of a register. +type RegType byte + +const ( + RegTypeInvalid RegType = iota + RegTypeInt + RegTypeFloat + NumRegType +) + +// String implements fmt.Stringer. 
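The setters above pack three fields into a single 64-bit VReg: the VRegID in the low 32 bits, the RealReg in bits 32-39, and the RegType in bits 40-47. A small standalone sketch of that round trip, using plain uint64 arithmetic and made-up values rather than the vendored types:

package main

import "fmt"

func main() {
	// Mirror regalloc.VReg's layout: ID in bits 0-31, RealReg in bits 32-39, RegType in bits 40-47.
	const (
		id      uint64 = 7 // would be a VRegID
		realReg uint64 = 3 // would be a RealReg
		regType uint64 = 1 // would be RegTypeInt
	)
	v := id | realReg<<32 | regType<<40

	fmt.Println(v&0xffffffff, (v>>32)&0xff, v>>40) // 7 3 1, i.e. ID(), RealReg(), RegType()
}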
+func (r RegType) String() string { + switch r { + case RegTypeInt: + return "int" + case RegTypeFloat: + return "float" + default: + return "invalid" + } +} + +// RegTypeOf returns the RegType of the given ssa.Type. +func RegTypeOf(p ssa.Type) RegType { + switch p { + case ssa.TypeI32, ssa.TypeI64: + return RegTypeInt + case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: + return RegTypeFloat + default: + panic("invalid type") + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go new file mode 100644 index 000000000..b4450d56f --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regalloc.go @@ -0,0 +1,1212 @@ +// Package regalloc performs register allocation. The algorithm can work on any ISA by implementing the interfaces in +// api.go. +// +// References: +// - https://web.stanford.edu/class/archive/cs/cs143/cs143.1128/lectures/17/Slides17.pdf +// - https://en.wikipedia.org/wiki/Chaitin%27s_algorithm +// - https://llvm.org/ProjectsWithLLVM/2004-Fall-CS426-LS.pdf +// - https://pfalcon.github.io/ssabook/latest/book-full.pdf: Chapter 9. for liveness analysis. +// - https://github.com/golang/go/blob/release-branch.go1.21/src/cmd/compile/internal/ssa/regalloc.go +package regalloc + +import ( + "fmt" + "math" + "strings" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// NewAllocator returns a new Allocator. +func NewAllocator(allocatableRegs *RegisterInfo) Allocator { + a := Allocator{ + regInfo: allocatableRegs, + phiDefInstListPool: wazevoapi.NewPool[phiDefInstList](resetPhiDefInstList), + blockStates: wazevoapi.NewIDedPool[blockState](resetBlockState), + } + a.state.vrStates = wazevoapi.NewIDedPool[vrState](resetVrState) + a.state.reset() + for _, regs := range allocatableRegs.AllocatableRegisters { + for _, r := range regs { + a.allocatableSet = a.allocatableSet.add(r) + } + } + return a +} + +type ( + // RegisterInfo holds the statically-known ISA-specific register information. + RegisterInfo struct { + // AllocatableRegisters is a 2D array of allocatable RealReg, indexed by regTypeNum and regNum. + // The order matters: the first element is the most preferred one when allocating. + AllocatableRegisters [NumRegType][]RealReg + CalleeSavedRegisters RegSet + CallerSavedRegisters RegSet + RealRegToVReg []VReg + // RealRegName returns the name of the given RealReg for debugging. + RealRegName func(r RealReg) string + RealRegType func(r RealReg) RegType + } + + // Allocator is a register allocator. + Allocator struct { + // regInfo is static per ABI/ISA, and is initialized by the machine during Machine.PrepareRegisterAllocator. + regInfo *RegisterInfo + // allocatableSet is a set of allocatable RealReg derived from regInfo. Static per ABI/ISA. + allocatableSet RegSet + allocatedCalleeSavedRegs []VReg + vs []VReg + vs2 []VRegID + phiDefInstListPool wazevoapi.Pool[phiDefInstList] + + // Followings are re-used during various places. + blks []Block + reals []RealReg + currentOccupants regInUseSet + + // Following two fields are updated while iterating the blocks in the reverse postorder. + state state + blockStates wazevoapi.IDedPool[blockState] + } + + // programCounter represents an opaque index into the program which is used to represents a LiveInterval of a VReg. 
+ programCounter int32 + + state struct { + argRealRegs []VReg + regsInUse regInUseSet + vrStates wazevoapi.IDedPool[vrState] + + currentBlockID int32 + + // allocatedRegSet is a set of RealReg that are allocated during the allocation phase. This is reset per function. + allocatedRegSet RegSet + } + + blockState struct { + // liveIns is a list of VReg that are live at the beginning of the block. + liveIns []VRegID + // seen is true if the block is visited during the liveness analysis. + seen bool + // visited is true if the block is visited during the allocation phase. + visited bool + startFromPredIndex int + // startRegs is a list of RealReg that are used at the beginning of the block. This is used to fix the merge edges. + startRegs regInUseSet + // endRegs is a list of RealReg that are used at the end of the block. This is used to fix the merge edges. + endRegs regInUseSet + } + + vrState struct { + v VReg + r RealReg + // defInstr is the instruction that defines this value. If this is the phi value and not the entry block, this is nil. + defInstr Instr + // defBlk is the block that defines this value. If this is the phi value, this is the block whose arguments contain this value. + defBlk Block + // lca = lowest common ancestor. This is the block that is the lowest common ancestor of all the blocks that + // reloads this value. This is used to determine the spill location. Only valid if spilled=true. + lca Block + // lastUse is the program counter of the last use of this value. This changes while iterating the block, and + // should not be used across the blocks as it becomes invalid. To check the validity, use lastUseUpdatedAtBlockID. + lastUse programCounter + lastUseUpdatedAtBlockID int32 + // spilled is true if this value is spilled i.e. the value is reload from the stack somewhere in the program. + // + // Note that this field is used during liveness analysis for different purpose. This is used to determine the + // value is live-in or not. + spilled bool + // isPhi is true if this is a phi value. + isPhi bool + desiredLoc desiredLoc + // phiDefInstList is a list of instructions that defines this phi value. + // This is used to determine the spill location, and only valid if isPhi=true. + *phiDefInstList + } + + // phiDefInstList is a linked list of instructions that defines a phi value. + phiDefInstList struct { + instr Instr + v VReg + next *phiDefInstList + } + + // desiredLoc represents a desired location for a VReg. + desiredLoc uint16 + // desiredLocKind is a kind of desired location for a VReg. + desiredLocKind uint16 +) + +const ( + // desiredLocKindUnspecified is a kind of desired location for a VReg that is not specified. + desiredLocKindUnspecified desiredLocKind = iota + // desiredLocKindStack is a kind of desired location for a VReg that is on the stack, only used for the phi values. + desiredLocKindStack + // desiredLocKindReg is a kind of desired location for a VReg that is in a register. 
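+ // The kind lives in the low two bits of desiredLoc; for desiredLocKindReg, the RealReg is
+ // packed into the bits above them (see newDesiredLocReg and desiredLoc.realReg below).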
+ desiredLocKindReg + desiredLocUnspecified = desiredLoc(desiredLocKindUnspecified) + desiredLocStack = desiredLoc(desiredLocKindStack) +) + +func newDesiredLocReg(r RealReg) desiredLoc { + return desiredLoc(desiredLocKindReg) | desiredLoc(r<<2) +} + +func (d desiredLoc) realReg() RealReg { + return RealReg(d >> 2) +} + +func (d desiredLoc) stack() bool { + return d&3 == desiredLoc(desiredLocKindStack) +} + +func resetPhiDefInstList(l *phiDefInstList) { + l.instr = nil + l.next = nil + l.v = VRegInvalid +} + +func (s *state) dump(info *RegisterInfo) { //nolint:unused + fmt.Println("\t\tstate:") + fmt.Println("\t\t\targRealRegs:", s.argRealRegs) + fmt.Println("\t\t\tregsInUse", s.regsInUse.format(info)) + fmt.Println("\t\t\tallocatedRegSet:", s.allocatedRegSet.format(info)) + fmt.Println("\t\t\tused:", s.regsInUse.format(info)) + var strs []string + for i := 0; i <= s.vrStates.MaxIDEncountered(); i++ { + vs := s.vrStates.Get(i) + if vs == nil { + continue + } + if vs.r != RealRegInvalid { + strs = append(strs, fmt.Sprintf("(v%d: %s)", vs.v.ID(), info.RealRegName(vs.r))) + } + } + fmt.Println("\t\t\tvrStates:", strings.Join(strs, ", ")) +} + +func (s *state) reset() { + s.argRealRegs = s.argRealRegs[:0] + s.vrStates.Reset() + s.allocatedRegSet = RegSet(0) + s.regsInUse.reset() + s.currentBlockID = -1 +} + +func (s *state) setVRegState(v VReg, r RealReg) { + id := int(v.ID()) + st := s.vrStates.GetOrAllocate(id) + st.r = r + st.v = v +} + +func resetVrState(vs *vrState) { + vs.v = VRegInvalid + vs.r = RealRegInvalid + vs.defInstr = nil + vs.defBlk = nil + vs.spilled = false + vs.lastUse = -1 + vs.lastUseUpdatedAtBlockID = -1 + vs.lca = nil + vs.isPhi = false + vs.phiDefInstList = nil + vs.desiredLoc = desiredLocUnspecified +} + +func (s *state) getVRegState(v VRegID) *vrState { + return s.vrStates.GetOrAllocate(int(v)) +} + +func (s *state) useRealReg(r RealReg, v VReg) { + if s.regsInUse.has(r) { + panic("BUG: useRealReg: the given real register is already used") + } + s.regsInUse.add(r, v) + s.setVRegState(v, r) + s.allocatedRegSet = s.allocatedRegSet.add(r) +} + +func (s *state) releaseRealReg(r RealReg) { + current := s.regsInUse.get(r) + if current.Valid() { + s.regsInUse.remove(r) + s.setVRegState(current, RealRegInvalid) + } +} + +// recordReload records that the given VReg is reloaded in the given block. +// This is used to determine the spill location by tracking the lowest common ancestor of all the blocks that reloads the value. +func (vs *vrState) recordReload(f Function, blk Block) { + vs.spilled = true + if vs.lca == nil { + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\t\tv%d is reloaded in blk%d,\n", vs.v.ID(), blk.ID()) + } + vs.lca = blk + } else { + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\t\tv%d is reloaded in blk%d, lca=%d\n", vs.v.ID(), blk.ID(), vs.lca.ID()) + } + vs.lca = f.LowestCommonAncestor(vs.lca, blk) + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("updated lca=%d\n", vs.lca.ID()) + } + } +} + +func (s *state) findOrSpillAllocatable(a *Allocator, allocatable []RealReg, forbiddenMask RegSet, preferred RealReg) (r RealReg) { + r = RealRegInvalid + // First, check if the preferredMask has any allocatable register. + if preferred != RealRegInvalid && !forbiddenMask.has(preferred) && !s.regsInUse.has(preferred) { + for _, candidateReal := range allocatable { + // TODO: we should ensure the preferred register is in the allocatable set in the first place, + // but right now, just in case, we check it here. 
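+ // Returning here confirms the preferred register is both free and allocatable. Otherwise, the
+ // loop further below picks a spill victim: the occupied register whose value has the furthest
+ // recorded last use (or is not used anymore), never one that is backed by a real register.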
+ if candidateReal == preferred { + return preferred + } + } + } + + var lastUseAt programCounter + var spillVReg VReg + for _, candidateReal := range allocatable { + if forbiddenMask.has(candidateReal) { + continue + } + + using := s.regsInUse.get(candidateReal) + if using == VRegInvalid { + // This is not used at this point. + return candidateReal + } + + // Real registers in use should not be spilled, so we skip them. + // For example, if the register is used as an argument register, and it might be + // spilled and not reloaded when it ends up being used as a temporary to pass + // stack based argument. + if using.IsRealReg() { + continue + } + + isPreferred := candidateReal == preferred + + // last == -1 means the value won't be used anymore. + if last := s.getVRegState(using.ID()).lastUse; r == RealRegInvalid || isPreferred || last == -1 || (lastUseAt != -1 && last > lastUseAt) { + lastUseAt = last + r = candidateReal + spillVReg = using + if isPreferred { + break + } + } + } + + if r == RealRegInvalid { + panic("not found any allocatable register") + } + + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\tspilling v%d when lastUseAt=%d and regsInUse=%s\n", spillVReg.ID(), lastUseAt, s.regsInUse.format(a.regInfo)) + } + s.releaseRealReg(r) + return r +} + +func (s *state) findAllocatable(allocatable []RealReg, forbiddenMask RegSet) RealReg { + for _, r := range allocatable { + if !s.regsInUse.has(r) && !forbiddenMask.has(r) { + return r + } + } + return RealRegInvalid +} + +func (s *state) resetAt(bs *blockState) { + s.regsInUse.range_(func(_ RealReg, vr VReg) { + s.setVRegState(vr, RealRegInvalid) + }) + s.regsInUse.reset() + bs.endRegs.range_(func(r RealReg, v VReg) { + id := int(v.ID()) + st := s.vrStates.GetOrAllocate(id) + if st.lastUseUpdatedAtBlockID == s.currentBlockID && st.lastUse == programCounterLiveIn { + s.regsInUse.add(r, v) + s.setVRegState(v, r) + } + }) +} + +func resetBlockState(b *blockState) { + b.seen = false + b.visited = false + b.endRegs.reset() + b.startRegs.reset() + b.startFromPredIndex = -1 + b.liveIns = b.liveIns[:0] +} + +func (b *blockState) dump(a *RegisterInfo) { + fmt.Println("\t\tblockState:") + fmt.Println("\t\t\tstartRegs:", b.startRegs.format(a)) + fmt.Println("\t\t\tendRegs:", b.endRegs.format(a)) + fmt.Println("\t\t\tstartFromPredIndex:", b.startFromPredIndex) + fmt.Println("\t\t\tvisited:", b.visited) +} + +// DoAllocation performs register allocation on the given Function. +func (a *Allocator) DoAllocation(f Function) { + a.livenessAnalysis(f) + a.alloc(f) + a.determineCalleeSavedRealRegs(f) +} + +func (a *Allocator) determineCalleeSavedRealRegs(f Function) { + a.allocatedCalleeSavedRegs = a.allocatedCalleeSavedRegs[:0] + a.state.allocatedRegSet.Range(func(allocatedRealReg RealReg) { + if a.regInfo.CalleeSavedRegisters.has(allocatedRealReg) { + a.allocatedCalleeSavedRegs = append(a.allocatedCalleeSavedRegs, a.regInfo.RealRegToVReg[allocatedRealReg]) + } + }) + f.ClobberedRegisters(a.allocatedCalleeSavedRegs) +} + +func (a *Allocator) getOrAllocateBlockState(blockID int32) *blockState { + return a.blockStates.GetOrAllocate(int(blockID)) +} + +// phiBlk returns the block that defines the given phi value, nil otherwise. +func (s *state) phiBlk(v VRegID) Block { + vs := s.getVRegState(v) + if vs.isPhi { + return vs.defBlk + } + return nil +} + +const ( + programCounterLiveIn = math.MinInt32 + programCounterLiveOut = math.MaxInt32 +) + +// liveAnalysis constructs Allocator.blockLivenessData. 
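+// In short, each block is scanned backwards: a use marks a value live, a definition kills it, and
+// the live-ins of the already-seen successors (minus the successors' own phi params) are merged in.
+// loopTreeDFS then patches loop bodies up so that values live across a loop stay live throughout it.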
+// The algorithm here is described in https://pfalcon.github.io/ssabook/latest/book-full.pdf Chapter 9.2. +func (a *Allocator) livenessAnalysis(f Function) { + s := &a.state + for blk := f.PostOrderBlockIteratorBegin(); blk != nil; blk = f.PostOrderBlockIteratorNext() { // Order doesn't matter. + + // We should gather phi value data. + for _, p := range blk.BlockParams(&a.vs) { + vs := s.getVRegState(p.ID()) + vs.isPhi = true + vs.defBlk = blk + } + } + + for blk := f.PostOrderBlockIteratorBegin(); blk != nil; blk = f.PostOrderBlockIteratorNext() { + blkID := blk.ID() + info := a.getOrAllocateBlockState(blkID) + + a.vs2 = a.vs2[:0] + const ( + flagDeleted = false + flagLive = true + ) + ns := blk.Succs() + for i := 0; i < ns; i++ { + succ := blk.Succ(i) + if succ == nil { + continue + } + + succID := succ.ID() + succInfo := a.getOrAllocateBlockState(succID) + if !succInfo.seen { // This means the back edge. + continue + } + + for _, v := range succInfo.liveIns { + if s.phiBlk(v) != succ { + st := s.getVRegState(v) + // We use .spilled field to store the flag. + st.spilled = flagLive + a.vs2 = append(a.vs2, v) + } + } + } + + for instr := blk.InstrRevIteratorBegin(); instr != nil; instr = blk.InstrRevIteratorNext() { + + var use, def VReg + for _, def = range instr.Defs(&a.vs) { + if !def.IsRealReg() { + id := def.ID() + st := s.getVRegState(id) + // We use .spilled field to store the flag. + st.spilled = flagDeleted + a.vs2 = append(a.vs2, id) + } + } + for _, use = range instr.Uses(&a.vs) { + if !use.IsRealReg() { + id := use.ID() + st := s.getVRegState(id) + // We use .spilled field to store the flag. + st.spilled = flagLive + a.vs2 = append(a.vs2, id) + } + } + + if def.Valid() && s.phiBlk(def.ID()) != nil { + if use.Valid() && use.IsRealReg() { + // If the destination is a phi value, and the source is a real register, this is the beginning of the function. + a.state.argRealRegs = append(a.state.argRealRegs, use) + } + } + } + + for _, v := range a.vs2 { + st := s.getVRegState(v) + // We use .spilled field to store the flag. + if st.spilled == flagLive { //nolint:gosimple + info.liveIns = append(info.liveIns, v) + st.spilled = false + } + } + + info.seen = true + } + + nrs := f.LoopNestingForestRoots() + for i := 0; i < nrs; i++ { + root := f.LoopNestingForestRoot(i) + a.loopTreeDFS(root) + } +} + +// loopTreeDFS implements the Algorithm 9.3 in the book in an iterative way. +func (a *Allocator) loopTreeDFS(entry Block) { + a.blks = a.blks[:0] + a.blks = append(a.blks, entry) + + s := &a.state + for len(a.blks) > 0 { + tail := len(a.blks) - 1 + loop := a.blks[tail] + a.blks = a.blks[:tail] + a.vs2 = a.vs2[:0] + const ( + flagDone = false + flagPending = true + ) + info := a.getOrAllocateBlockState(loop.ID()) + for _, v := range info.liveIns { + if s.phiBlk(v) != loop { + a.vs2 = append(a.vs2, v) + st := s.getVRegState(v) + // We use .spilled field to store the flag. + st.spilled = flagPending + } + } + + var siblingAddedView []VRegID + cn := loop.LoopNestingForestChildren() + for i := 0; i < cn; i++ { + child := loop.LoopNestingForestChild(i) + childID := child.ID() + childInfo := a.getOrAllocateBlockState(childID) + + if i == 0 { + begin := len(childInfo.liveIns) + for _, v := range a.vs2 { + st := s.getVRegState(v) + // We use .spilled field to store the flag. + if st.spilled == flagPending { //nolint:gosimple + st.spilled = flagDone + // TODO: deduplicate, though I don't think it has much impact. 
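+ // Propagate this live-in of the loop header to the child as well, so that values which are
+ // live across the loop remain live throughout the entire loop body.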
+ childInfo.liveIns = append(childInfo.liveIns, v) + } + } + siblingAddedView = childInfo.liveIns[begin:] + } else { + // TODO: deduplicate, though I don't think it has much impact. + childInfo.liveIns = append(childInfo.liveIns, siblingAddedView...) + } + + if child.LoopHeader() { + a.blks = append(a.blks, child) + } + } + + if cn == 0 { + // If there's no forest child, we haven't cleared the .spilled field at this point. + for _, v := range a.vs2 { + st := s.getVRegState(v) + st.spilled = false + } + } + } +} + +// alloc allocates registers for the given function by iterating the blocks in the reverse postorder. +// The algorithm here is derived from the Go compiler's allocator https://github.com/golang/go/blob/release-branch.go1.21/src/cmd/compile/internal/ssa/regalloc.go +// In short, this is a simply linear scan register allocation where each block inherits the register allocation state from +// one of its predecessors. Each block inherits the selected state and starts allocation from there. +// If there's a discrepancy in the end states between predecessors, the adjustments are made to ensure consistency after allocation is done (which we call "fixing merge state"). +// The spill instructions (store into the dedicated slots) are inserted after all the allocations and fixing merge states. That is because +// at the point, we all know where the reloads happen, and therefore we can know the best place to spill the values. More precisely, +// the spill happens in the block that is the lowest common ancestor of all the blocks that reloads the value. +// +// All of these logics are almost the same as Go's compiler which has a dedicated description in the source file ^^. +func (a *Allocator) alloc(f Function) { + // First we allocate each block in the reverse postorder (at least one predecessor should be allocated for each block). + for blk := f.ReversePostOrderBlockIteratorBegin(); blk != nil; blk = f.ReversePostOrderBlockIteratorNext() { + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("========== allocating blk%d ========\n", blk.ID()) + } + if blk.Entry() { + a.finalizeStartReg(blk) + } + a.allocBlock(f, blk) + } + // After the allocation, we all know the start and end state of each block. So we can fix the merge states. + for blk := f.ReversePostOrderBlockIteratorBegin(); blk != nil; blk = f.ReversePostOrderBlockIteratorNext() { + a.fixMergeState(f, blk) + } + // Finally, we insert the spill instructions as we know all the places where the reloads happen. + a.scheduleSpills(f) +} + +func (a *Allocator) updateLiveInVRState(liveness *blockState) { + currentBlockID := a.state.currentBlockID + for _, v := range liveness.liveIns { + vs := a.state.getVRegState(v) + vs.lastUse = programCounterLiveIn + vs.lastUseUpdatedAtBlockID = currentBlockID + } +} + +func (a *Allocator) finalizeStartReg(blk Block) { + bID := blk.ID() + liveness := a.getOrAllocateBlockState(bID) + s := &a.state + currentBlkState := a.getOrAllocateBlockState(bID) + if currentBlkState.startFromPredIndex > -1 { + return + } + + s.currentBlockID = bID + a.updateLiveInVRState(liveness) + + preds := blk.Preds() + var predState *blockState + switch preds { + case 0: // This is the entry block. + case 1: + predID := blk.Pred(0).ID() + predState = a.getOrAllocateBlockState(predID) + currentBlkState.startFromPredIndex = 0 + default: + // TODO: there should be some better heuristic to choose the predecessor. 
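+ // For now, simply inherit the end state of the first predecessor that has already been
+ // allocated; the reverse post-order traversal means at least one such predecessor should
+ // exist for any non-entry block (the panic below guards that assumption).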
+ for i := 0; i < preds; i++ { + predID := blk.Pred(i).ID() + if _predState := a.getOrAllocateBlockState(predID); _predState.visited { + predState = _predState + currentBlkState.startFromPredIndex = i + break + } + } + } + if predState == nil { + if !blk.Entry() { + panic(fmt.Sprintf("BUG: at lease one predecessor should be visited for blk%d", blk.ID())) + } + for _, u := range s.argRealRegs { + s.useRealReg(u.RealReg(), u) + } + currentBlkState.startFromPredIndex = 0 + } else if predState != nil { + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("allocating blk%d starting from blk%d (on index=%d) \n", + bID, blk.Pred(currentBlkState.startFromPredIndex).ID(), currentBlkState.startFromPredIndex) + } + s.resetAt(predState) + } + + s.regsInUse.range_(func(allocated RealReg, v VReg) { + currentBlkState.startRegs.add(allocated, v) + }) + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("finalized start reg for blk%d: %s\n", blk.ID(), currentBlkState.startRegs.format(a.regInfo)) + } +} + +func (a *Allocator) allocBlock(f Function, blk Block) { + bID := blk.ID() + s := &a.state + currentBlkState := a.getOrAllocateBlockState(bID) + s.currentBlockID = bID + + if currentBlkState.startFromPredIndex < 0 { + panic("BUG: startFromPredIndex should be set in finalizeStartReg prior to allocBlock") + } + + // Clears the previous state. + s.regsInUse.range_(func(allocatedRealReg RealReg, vr VReg) { + s.setVRegState(vr, RealRegInvalid) + }) + s.regsInUse.reset() + // Then set the start state. + currentBlkState.startRegs.range_(func(allocatedRealReg RealReg, vr VReg) { + s.useRealReg(allocatedRealReg, vr) + }) + + desiredUpdated := a.vs2[:0] + + // Update the last use of each VReg. + var pc programCounter + for instr := blk.InstrIteratorBegin(); instr != nil; instr = blk.InstrIteratorNext() { + var use, def VReg + for _, use = range instr.Uses(&a.vs) { + if !use.IsRealReg() { + s.getVRegState(use.ID()).lastUse = pc + } + } + + if instr.IsCopy() { + def = instr.Defs(&a.vs)[0] + r := def.RealReg() + if r != RealRegInvalid { + useID := use.ID() + vs := s.getVRegState(useID) + if !vs.isPhi { // TODO: no idea why do we need this. + vs.desiredLoc = newDesiredLocReg(r) + desiredUpdated = append(desiredUpdated, useID) + } + } + } + pc++ + } + + // Mark all live-out values by checking live-in of the successors. + // While doing so, we also update the desired register values. + var succ Block + for i, ns := 0, blk.Succs(); i < ns; i++ { + succ = blk.Succ(i) + if succ == nil { + continue + } + + succID := succ.ID() + succState := a.getOrAllocateBlockState(succID) + for _, v := range succState.liveIns { + if s.phiBlk(v) != succ { + st := s.getVRegState(v) + st.lastUse = programCounterLiveOut + } + } + + if succState.startFromPredIndex > -1 { + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("blk%d -> blk%d: start_regs: %s\n", bID, succID, succState.startRegs.format(a.regInfo)) + } + succState.startRegs.range_(func(allocatedRealReg RealReg, vr VReg) { + vs := s.getVRegState(vr.ID()) + vs.desiredLoc = newDesiredLocReg(allocatedRealReg) + desiredUpdated = append(desiredUpdated, vr.ID()) + }) + for _, p := range succ.BlockParams(&a.vs) { + vs := s.getVRegState(p.ID()) + if vs.desiredLoc.realReg() == RealRegInvalid { + vs.desiredLoc = desiredLocStack + desiredUpdated = append(desiredUpdated, p.ID()) + } + } + } + } + + // Propagate the desired register values from the end of the block to the beginning. 
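+ // Walking the instructions backwards, a copy transfers the desired register of its destination
+ // to its source (unless the source already has a desired location or is a phi param of the
+ // successor), so that the source can later be allocated directly into the register the copy
+ // wants, turning the copy into a no-op that can be coalesced away.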
+ for instr := blk.InstrRevIteratorBegin(); instr != nil; instr = blk.InstrRevIteratorNext() { + if instr.IsCopy() { + def := instr.Defs(&a.vs)[0] + defState := s.getVRegState(def.ID()) + desired := defState.desiredLoc.realReg() + if desired == RealRegInvalid { + continue + } + + use := instr.Uses(&a.vs)[0] + useID := use.ID() + useState := s.getVRegState(useID) + if s.phiBlk(useID) != succ && useState.desiredLoc == desiredLocUnspecified { + useState.desiredLoc = newDesiredLocReg(desired) + desiredUpdated = append(desiredUpdated, useID) + } + } + } + + pc = 0 + for instr := blk.InstrIteratorBegin(); instr != nil; instr = blk.InstrIteratorNext() { + if wazevoapi.RegAllocLoggingEnabled { + fmt.Println(instr) + } + + var currentUsedSet RegSet + killSet := a.reals[:0] + + // Gather the set of registers that will be used in the current instruction. + for _, use := range instr.Uses(&a.vs) { + if use.IsRealReg() { + r := use.RealReg() + currentUsedSet = currentUsedSet.add(r) + if a.allocatableSet.has(r) { + killSet = append(killSet, r) + } + } else { + vs := s.getVRegState(use.ID()) + if r := vs.r; r != RealRegInvalid { + currentUsedSet = currentUsedSet.add(r) + } + } + } + + for i, use := range instr.Uses(&a.vs) { + if !use.IsRealReg() { + vs := s.getVRegState(use.ID()) + killed := vs.lastUse == pc + r := vs.r + + if r == RealRegInvalid { + r = s.findOrSpillAllocatable(a, a.regInfo.AllocatableRegisters[use.RegType()], currentUsedSet, + // Prefer the desired register if it's available. + vs.desiredLoc.realReg()) + vs.recordReload(f, blk) + f.ReloadRegisterBefore(use.SetRealReg(r), instr) + s.useRealReg(r, use) + } + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\ttrying to use v%v on %s\n", use.ID(), a.regInfo.RealRegName(r)) + } + instr.AssignUse(i, use.SetRealReg(r)) + currentUsedSet = currentUsedSet.add(r) + if killed { + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\tkill v%d with %s\n", use.ID(), a.regInfo.RealRegName(r)) + } + killSet = append(killSet, r) + } + } + } + + isIndirect := instr.IsIndirectCall() + call := instr.IsCall() || isIndirect + if call { + addr := RealRegInvalid + if instr.IsIndirectCall() { + addr = a.vs[0].RealReg() + } + a.releaseCallerSavedRegs(addr) + } + + for _, r := range killSet { + s.releaseRealReg(r) + } + a.reals = killSet + + defs := instr.Defs(&a.vs) + switch { + case len(defs) > 1: + // Some instructions define multiple values on real registers. + // E.g. call instructions (following calling convention) / div instruction on x64 that defines both rax and rdx. + // + // Note that currently I assume that such instructions define only the pre colored real registers, not the VRegs + // that require allocations. If we need to support such case, we need to add the logic to handle it here, + // though is there any such instruction? 
+ for _, def := range defs { + if !def.IsRealReg() { + panic("BUG: multiple defs should be on real registers") + } + r := def.RealReg() + if s.regsInUse.has(r) { + s.releaseRealReg(r) + } + s.useRealReg(r, def) + } + case len(defs) == 1: + def := defs[0] + if def.IsRealReg() { + r := def.RealReg() + if a.allocatableSet.has(r) { + if s.regsInUse.has(r) { + s.releaseRealReg(r) + } + s.useRealReg(r, def) + } + } else { + vState := s.getVRegState(def.ID()) + r := vState.r + + if desired := vState.desiredLoc.realReg(); desired != RealRegInvalid { + if r != desired { + if (vState.isPhi && vState.defBlk == succ) || + // If this is not a phi and it's already assigned a real reg, + // this value has multiple definitions, hence we cannot assign the desired register. + (!s.regsInUse.has(desired) && r == RealRegInvalid) { + // If the phi value is passed via a real register, we force the value to be in the desired register. + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\t\tv%d is phi and desiredReg=%s\n", def.ID(), a.regInfo.RealRegName(desired)) + } + if r != RealRegInvalid { + // If the value is already in a different real register, we release it to change the state. + // Otherwise, multiple registers might have the same values at the end, which results in + // messing up the merge state reconciliation. + s.releaseRealReg(r) + } + r = desired + s.releaseRealReg(r) + s.useRealReg(r, def) + } + } + } + + // Allocate a new real register if `def` is not currently assigned one. + // It can happen when multiple instructions define the same VReg (e.g. const loads). + if r == RealRegInvalid { + if instr.IsCopy() { + copySrc := instr.Uses(&a.vs)[0].RealReg() + if a.allocatableSet.has(copySrc) && !s.regsInUse.has(copySrc) { + r = copySrc + } + } + if r == RealRegInvalid { + typ := def.RegType() + r = s.findOrSpillAllocatable(a, a.regInfo.AllocatableRegisters[typ], RegSet(0), RealRegInvalid) + } + s.useRealReg(r, def) + } + dr := def.SetRealReg(r) + instr.AssignDef(dr) + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\tdefining v%d with %s\n", def.ID(), a.regInfo.RealRegName(r)) + } + if vState.isPhi { + if vState.desiredLoc.stack() { // Stack based phi value. + f.StoreRegisterAfter(dr, instr) + // Release the real register as it's not used anymore. + s.releaseRealReg(r) + } else { + // Only the register based phis are necessary to track the defining instructions + // since the stack-based phis are already having stores inserted ^. + n := a.phiDefInstListPool.Allocate() + n.instr = instr + n.next = vState.phiDefInstList + n.v = dr + vState.phiDefInstList = n + } + } else { + vState.defInstr = instr + vState.defBlk = blk + } + } + } + if wazevoapi.RegAllocLoggingEnabled { + fmt.Println(instr) + } + pc++ + } + + s.regsInUse.range_(func(allocated RealReg, v VReg) { + currentBlkState.endRegs.add(allocated, v) + }) + + currentBlkState.visited = true + if wazevoapi.RegAllocLoggingEnabled { + currentBlkState.dump(a.regInfo) + } + + // Reset the desired end location. + for _, v := range desiredUpdated { + vs := s.getVRegState(v) + vs.desiredLoc = desiredLocUnspecified + } + a.vs2 = desiredUpdated[:0] + + for i := 0; i < blk.Succs(); i++ { + succ := blk.Succ(i) + if succ == nil { + continue + } + // If the successor is not visited yet, finalize the start state. 
+ a.finalizeStartReg(succ) + } +} + +func (a *Allocator) releaseCallerSavedRegs(addrReg RealReg) { + s := &a.state + + for i := 0; i < 64; i++ { + allocated := RealReg(i) + if allocated == addrReg { // If this is the call indirect, we should not touch the addr register. + continue + } + if v := s.regsInUse.get(allocated); v.Valid() { + if v.IsRealReg() { + continue // This is the argument register as it's already used by VReg backed by the corresponding RealReg. + } + if !a.regInfo.CallerSavedRegisters.has(allocated) { + // If this is not a caller-saved register, it is safe to keep it across the call. + continue + } + s.releaseRealReg(allocated) + } + } +} + +func (a *Allocator) fixMergeState(f Function, blk Block) { + preds := blk.Preds() + if preds <= 1 { + return + } + + s := &a.state + + // Restores the state at the beginning of the block. + bID := blk.ID() + blkSt := a.getOrAllocateBlockState(bID) + desiredOccupants := &blkSt.startRegs + aliveOnRegVRegs := make(map[VReg]RealReg) + for i := 0; i < 64; i++ { + r := RealReg(i) + if v := blkSt.startRegs.get(r); v.Valid() { + aliveOnRegVRegs[v] = r + } + } + + if wazevoapi.RegAllocLoggingEnabled { + fmt.Println("fixMergeState", blk.ID(), ":", desiredOccupants.format(a.regInfo)) + } + + s.currentBlockID = bID + a.updateLiveInVRState(a.getOrAllocateBlockState(bID)) + + currentOccupants := &a.currentOccupants + for i := 0; i < preds; i++ { + currentOccupants.reset() + if i == blkSt.startFromPredIndex { + continue + } + + currentOccupantsRev := make(map[VReg]RealReg) + pred := blk.Pred(i) + predSt := a.getOrAllocateBlockState(pred.ID()) + for ii := 0; ii < 64; ii++ { + r := RealReg(ii) + if v := predSt.endRegs.get(r); v.Valid() { + if _, ok := aliveOnRegVRegs[v]; !ok { + continue + } + currentOccupants.add(r, v) + currentOccupantsRev[v] = r + } + } + + s.resetAt(predSt) + + // Finds the free registers if any. + intTmp, floatTmp := VRegInvalid, VRegInvalid + if intFree := s.findAllocatable( + a.regInfo.AllocatableRegisters[RegTypeInt], desiredOccupants.set, + ); intFree != RealRegInvalid { + intTmp = FromRealReg(intFree, RegTypeInt) + } + if floatFree := s.findAllocatable( + a.regInfo.AllocatableRegisters[RegTypeFloat], desiredOccupants.set, + ); floatFree != RealRegInvalid { + floatTmp = FromRealReg(floatFree, RegTypeFloat) + } + + if wazevoapi.RegAllocLoggingEnabled { + fmt.Println("\t", pred.ID(), ":", currentOccupants.format(a.regInfo)) + } + + for ii := 0; ii < 64; ii++ { + r := RealReg(ii) + desiredVReg := desiredOccupants.get(r) + if !desiredVReg.Valid() { + continue + } + + currentVReg := currentOccupants.get(r) + if desiredVReg.ID() == currentVReg.ID() { + continue + } + + typ := desiredVReg.RegType() + var tmpRealReg VReg + if typ == RegTypeInt { + tmpRealReg = intTmp + } else { + tmpRealReg = floatTmp + } + a.reconcileEdge(f, r, pred, currentOccupants, currentOccupantsRev, currentVReg, desiredVReg, tmpRealReg, typ) + } + } +} + +func (a *Allocator) reconcileEdge(f Function, + r RealReg, + pred Block, + currentOccupants *regInUseSet, + currentOccupantsRev map[VReg]RealReg, + currentVReg, desiredVReg VReg, + freeReg VReg, + typ RegType, +) { + s := &a.state + if currentVReg.Valid() { + // Both are on reg. + er, ok := currentOccupantsRev[desiredVReg] + if !ok { + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\t\tv%d is desired to be on %s, but currently on the stack\n", + desiredVReg.ID(), a.regInfo.RealRegName(r), + ) + } + // This case is that the desired value is on the stack, but currentVReg is on the target register. 
+ // We need to move the current value to the stack, and reload the desired value. + // TODO: we can do better here. + f.StoreRegisterBefore(currentVReg.SetRealReg(r), pred.LastInstrForInsertion()) + delete(currentOccupantsRev, currentVReg) + + s.getVRegState(desiredVReg.ID()).recordReload(f, pred) + f.ReloadRegisterBefore(desiredVReg.SetRealReg(r), pred.LastInstrForInsertion()) + currentOccupants.add(r, desiredVReg) + currentOccupantsRev[desiredVReg] = r + return + } + + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\t\tv%d is desired to be on %s, but currently on %s\n", + desiredVReg.ID(), a.regInfo.RealRegName(r), a.regInfo.RealRegName(er), + ) + } + f.SwapBefore( + currentVReg.SetRealReg(r), + desiredVReg.SetRealReg(er), + freeReg, + pred.LastInstrForInsertion(), + ) + s.allocatedRegSet = s.allocatedRegSet.add(freeReg.RealReg()) + currentOccupantsRev[desiredVReg] = r + currentOccupantsRev[currentVReg] = er + currentOccupants.add(r, desiredVReg) + currentOccupants.add(er, currentVReg) + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\t\tv%d previously on %s moved to %s\n", currentVReg.ID(), a.regInfo.RealRegName(r), a.regInfo.RealRegName(er)) + } + } else { + // Desired is on reg, but currently the target register is not used. + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("\t\tv%d is desired to be on %s, current not used\n", + desiredVReg.ID(), a.regInfo.RealRegName(r), + ) + } + if currentReg, ok := currentOccupantsRev[desiredVReg]; ok { + f.InsertMoveBefore( + FromRealReg(r, typ), + desiredVReg.SetRealReg(currentReg), + pred.LastInstrForInsertion(), + ) + currentOccupants.remove(currentReg) + } else { + s.getVRegState(desiredVReg.ID()).recordReload(f, pred) + f.ReloadRegisterBefore(desiredVReg.SetRealReg(r), pred.LastInstrForInsertion()) + } + currentOccupantsRev[desiredVReg] = r + currentOccupants.add(r, desiredVReg) + } + + if wazevoapi.RegAllocLoggingEnabled { + fmt.Println("\t", pred.ID(), ":", currentOccupants.format(a.regInfo)) + } +} + +func (a *Allocator) scheduleSpills(f Function) { + states := a.state.vrStates + for i := 0; i <= states.MaxIDEncountered(); i++ { + vs := states.Get(i) + if vs == nil { + continue + } + if vs.spilled { + a.scheduleSpill(f, vs) + } + } +} + +func (a *Allocator) scheduleSpill(f Function, vs *vrState) { + v := vs.v + // If the value is the phi value, we need to insert a spill after each phi definition. + if vs.isPhi { + for defInstr := vs.phiDefInstList; defInstr != nil; defInstr = defInstr.next { + f.StoreRegisterAfter(defInstr.v, defInstr.instr) + } + return + } + + pos := vs.lca + definingBlk := vs.defBlk + r := RealRegInvalid + if definingBlk == nil { + panic(fmt.Sprintf("BUG: definingBlk should not be nil for %s. This is likley a bug in backend lowering logic", vs.v.String())) + } + if pos == nil { + panic(fmt.Sprintf("BUG: pos should not be nil for %s. This is likley a bug in backend lowering logic", vs.v.String())) + } + + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("v%d is spilled in blk%d, lca=blk%d\n", v.ID(), definingBlk.ID(), pos.ID()) + } + for pos != definingBlk { + st := a.getOrAllocateBlockState(pos.ID()) + for ii := 0; ii < 64; ii++ { + rr := RealReg(ii) + if st.startRegs.get(rr) == v { + r = rr + // Already in the register, so we can place the spill at the beginning of the block. 
+ break + } + } + + if r != RealRegInvalid { + break + } + + pos = f.Idom(pos) + } + + if pos == definingBlk { + defInstr := vs.defInstr + defInstr.Defs(&a.vs) + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("schedule spill v%d after %v\n", v.ID(), defInstr) + } + f.StoreRegisterAfter(a.vs[0], defInstr) + } else { + // Found an ancestor block that holds the value in the register at the beginning of the block. + // We need to insert a spill before the last use. + first := pos.FirstInstr() + if wazevoapi.RegAllocLoggingEnabled { + fmt.Printf("schedule spill v%d before %v\n", v.ID(), first) + } + f.StoreRegisterAfter(v.SetRealReg(r), first) + } +} + +// Reset resets the allocator's internal state so that it can be reused. +func (a *Allocator) Reset() { + a.state.reset() + a.blockStates.Reset() + a.phiDefInstListPool.Reset() + a.vs = a.vs[:0] +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go new file mode 100644 index 000000000..e9bf60661 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc/regset.go @@ -0,0 +1,108 @@ +package regalloc + +import ( + "fmt" + "strings" +) + +// NewRegSet returns a new RegSet with the given registers. +func NewRegSet(regs ...RealReg) RegSet { + var ret RegSet + for _, r := range regs { + ret = ret.add(r) + } + return ret +} + +// RegSet represents a set of registers. +type RegSet uint64 + +func (rs RegSet) format(info *RegisterInfo) string { //nolint:unused + var ret []string + for i := 0; i < 64; i++ { + if rs&(1<<uint(i)) != 0 { + ret = append(ret, info.RealRegName(RealReg(i))) + } + } + return strings.Join(ret, ", ") +} + +func (rs RegSet) has(r RealReg) bool { + return rs&(1<<uint(r)) != 0 +} + +func (rs RegSet) add(r RealReg) RegSet { + if r >= 64 { + return rs + } + return rs | 1<<uint(r) +} + +func (rs RegSet) Range(f func(allocatedRealReg RealReg)) { + for i := 0; i < 64; i++ { + if rs&(1<<uint(i)) != 0 { + f(RealReg(i)) + } + } +} + +type regInUseSet struct { + set RegSet + vrs [64]VReg +} + +func (rs *regInUseSet) reset() { + rs.set = 0 + for i := range rs.vrs { + rs.vrs[i] = VRegInvalid + } +} + +func (rs *regInUseSet) format(info *RegisterInfo) string { //nolint:unused + var ret []string + for i := 0; i < 64; i++ { + if rs.set&(1<<uint(i)) != 0 { + vr := rs.vrs[i] + ret = append(ret, fmt.Sprintf("(%s->v%d)", info.RealRegName(RealReg(i)), vr.ID())) + } + } + return strings.Join(ret, ", ") +} + +func (rs *regInUseSet) has(r RealReg) bool { + if r >= 64 { + return false + } + return rs.set&(1<<uint(r)) != 0 +} + +func (rs *regInUseSet) get(r RealReg) VReg { + if r >= 64 { + return VRegInvalid + } + return rs.vrs[r] +} + +func (rs *regInUseSet) remove(r RealReg) { + if r >= 64 { + return + } + rs.set &= ^(1 << uint(r)) + rs.vrs[r] = VRegInvalid +} + +func (rs *regInUseSet) add(r RealReg, vr VReg) { + if r >= 64 { + return + } + rs.set |= 1 << uint(r) + rs.vrs[r] = vr +} + +func (rs *regInUseSet) range_(f func(allocatedRealReg RealReg, vr VReg)) { + for i := 0; i < 64; i++ { + if rs.set&(1<<uint(i)) != 0 { + f(RealReg(i), rs.vrs[i]) + } + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/vdef.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/vdef.go new file mode 100644 index 000000000..edfa962b5 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/backend/vdef.go @@ -0,0 +1,43 @@ 
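regset.go above represents a register set as a single 64-bit mask, with regInUseSet pairing that mask with a 64-entry array recording which VReg currently occupies each RealReg. A minimal standalone sketch of the same bitset idea, with simplified, made-up names (illustrative only, not the vendored code):

package main

import "fmt"

type realReg byte
type regSet uint64

func (rs regSet) add(r realReg) regSet { return rs | 1<<uint(r) }
func (rs regSet) has(r realReg) bool   { return rs&(1<<uint(r)) != 0 }
func (rs regSet) del(r realReg) regSet { return rs &^ (1 << uint(r)) }

func main() {
	var rs regSet
	rs = rs.add(3).add(10)
	fmt.Println(rs.has(3), rs.has(10), rs.has(4)) // true true false
	fmt.Println(rs.del(3).has(3))                 // false
}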
+package backend + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +// SSAValueDefinition represents a definition of an SSA value. +type SSAValueDefinition struct { + // BlockParamValue is valid if Instr == nil + BlockParamValue ssa.Value + + // BlkParamVReg is valid if Instr == nil + BlkParamVReg regalloc.VReg + + // Instr is not nil if this is a definition from an instruction. + Instr *ssa.Instruction + // N is the index of the return value in the instr's return values list. + N int + // RefCount is the number of references to the result. + RefCount int +} + +func (d *SSAValueDefinition) IsFromInstr() bool { + return d.Instr != nil +} + +func (d *SSAValueDefinition) IsFromBlockParam() bool { + return d.Instr == nil +} + +func (d *SSAValueDefinition) SSAValue() ssa.Value { + if d.IsFromBlockParam() { + return d.BlockParamValue + } else { + r, rs := d.Instr.Returns() + if d.N == 0 { + return r + } else { + return rs[d.N-1] + } + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/call_engine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/call_engine.go new file mode 100644 index 000000000..3379c4dde --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/call_engine.go @@ -0,0 +1,722 @@ +package wazevo + +import ( + "context" + "encoding/binary" + "fmt" + "reflect" + "runtime" + "sync/atomic" + "unsafe" + + "github.com/tetratelabs/wazero/api" + "github.com/tetratelabs/wazero/experimental" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" + "github.com/tetratelabs/wazero/internal/expctxkeys" + "github.com/tetratelabs/wazero/internal/internalapi" + "github.com/tetratelabs/wazero/internal/wasm" + "github.com/tetratelabs/wazero/internal/wasmdebug" + "github.com/tetratelabs/wazero/internal/wasmruntime" +) + +type ( + // callEngine implements api.Function. + callEngine struct { + internalapi.WazeroOnly + stack []byte + // stackTop is the pointer to the *aligned* top of the stack. This must be updated + // whenever the stack is changed. This is passed to the assembly function + // at the very beginning of api.Function Call/CallWithStack. + stackTop uintptr + // executable is the pointer to the executable code for this function. + executable *byte + preambleExecutable *byte + // parent is the *moduleEngine from which this callEngine is created. + parent *moduleEngine + // indexInModule is the index of the function in the module. + indexInModule wasm.Index + // sizeOfParamResultSlice is the size of the parameter/result slice. + sizeOfParamResultSlice int + requiredParams int + // execCtx holds various information to be read/written by assembly functions. + execCtx executionContext + // execCtxPtr holds the pointer to the executionContext which doesn't change after callEngine is created. + execCtxPtr uintptr + numberOfResults int + stackIteratorImpl stackIterator + } + + // executionContext is the struct to be read/written by assembly functions. + executionContext struct { + // exitCode holds the wazevoapi.ExitCode describing the state of the function execution. + exitCode wazevoapi.ExitCode + // callerModuleContextPtr holds the moduleContextOpaque for Go function calls. + callerModuleContextPtr *byte + // originalFramePointer holds the original frame pointer of the caller of the assembly function. 
+ originalFramePointer uintptr + // originalStackPointer holds the original stack pointer of the caller of the assembly function. + originalStackPointer uintptr + // goReturnAddress holds the return address to go back to the caller of the assembly function. + goReturnAddress uintptr + // stackBottomPtr holds the pointer to the bottom of the stack. + stackBottomPtr *byte + // goCallReturnAddress holds the return address to go back to the caller of the Go function. + goCallReturnAddress *byte + // stackPointerBeforeGoCall holds the stack pointer before calling a Go function. + stackPointerBeforeGoCall *uint64 + // stackGrowRequiredSize holds the required size of stack grow. + stackGrowRequiredSize uintptr + // memoryGrowTrampolineAddress holds the address of memory grow trampoline function. + memoryGrowTrampolineAddress *byte + // stackGrowCallTrampolineAddress holds the address of stack grow trampoline function. + stackGrowCallTrampolineAddress *byte + // checkModuleExitCodeTrampolineAddress holds the address of check-module-exit-code function. + checkModuleExitCodeTrampolineAddress *byte + // savedRegisters is the opaque space for save/restore registers. + // We want to align 16 bytes for each register, so we use [64][2]uint64. + savedRegisters [64][2]uint64 + // goFunctionCallCalleeModuleContextOpaque is the pointer to the target Go function's moduleContextOpaque. + goFunctionCallCalleeModuleContextOpaque uintptr + // tableGrowTrampolineAddress holds the address of table grow trampoline function. + tableGrowTrampolineAddress *byte + // refFuncTrampolineAddress holds the address of ref-func trampoline function. + refFuncTrampolineAddress *byte + // memmoveAddress holds the address of memmove function implemented by Go runtime. See memmove.go. + memmoveAddress uintptr + // framePointerBeforeGoCall holds the frame pointer before calling a Go function. Note: only used in amd64. + framePointerBeforeGoCall uintptr + // memoryWait32TrampolineAddress holds the address of memory_wait32 trampoline function. + memoryWait32TrampolineAddress *byte + // memoryWait64TrampolineAddress holds the address of memory_wait64 trampoline function. + memoryWait64TrampolineAddress *byte + // memoryNotifyTrampolineAddress holds the address of the memory_notify trampoline function. + memoryNotifyTrampolineAddress *byte + } +) + +func (c *callEngine) requiredInitialStackSize() int { + const initialStackSizeDefault = 10240 + stackSize := initialStackSizeDefault + paramResultInBytes := c.sizeOfParamResultSlice * 8 * 2 // * 8 because uint64 is 8 bytes, and *2 because we need both separated param/result slots. + required := paramResultInBytes + 32 + 16 // 32 is enough to accommodate the call frame info, and 16 exists just in case when []byte is not aligned to 16 bytes. + if required > stackSize { + stackSize = required + } + return stackSize +} + +func (c *callEngine) init() { + stackSize := c.requiredInitialStackSize() + if wazevoapi.StackGuardCheckEnabled { + stackSize += wazevoapi.StackGuardCheckGuardPageSize + } + c.stack = make([]byte, stackSize) + c.stackTop = alignedStackTop(c.stack) + if wazevoapi.StackGuardCheckEnabled { + c.execCtx.stackBottomPtr = &c.stack[wazevoapi.StackGuardCheckGuardPageSize] + } else { + c.execCtx.stackBottomPtr = &c.stack[0] + } + c.execCtxPtr = uintptr(unsafe.Pointer(&c.execCtx)) +} + +// alignedStackTop returns the 16-byte aligned stack top of the given stack. +// 16 bytes should be good for all platforms (arm64/amd64).
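+// As a worked example (hypothetical address, for illustration only): if the last byte of the stack slice sits at 0xc000_1234_3f, then stackAddr&15 == 0xf and the returned top is 0xc000_1234_30, i.e. the address rounded down to the nearest 16-byte boundary inside the slice.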
+func alignedStackTop(s []byte) uintptr { + stackAddr := uintptr(unsafe.Pointer(&s[len(s)-1])) + return stackAddr - (stackAddr & (16 - 1)) +} + +// Definition implements api.Function. +func (c *callEngine) Definition() api.FunctionDefinition { + return c.parent.module.Source.FunctionDefinition(c.indexInModule) +} + +// Call implements api.Function. +func (c *callEngine) Call(ctx context.Context, params ...uint64) ([]uint64, error) { + if c.requiredParams != len(params) { + return nil, fmt.Errorf("expected %d params, but passed %d", c.requiredParams, len(params)) + } + paramResultSlice := make([]uint64, c.sizeOfParamResultSlice) + copy(paramResultSlice, params) + if err := c.callWithStack(ctx, paramResultSlice); err != nil { + return nil, err + } + return paramResultSlice[:c.numberOfResults], nil +} + +func (c *callEngine) addFrame(builder wasmdebug.ErrorBuilder, addr uintptr) (def api.FunctionDefinition, listener experimental.FunctionListener) { + eng := c.parent.parent.parent + cm := eng.compiledModuleOfAddr(addr) + if cm == nil { + // This case, the module might have been closed and deleted from the engine. + // We fall back to searching the imported modules that can be referenced from this callEngine. + + // First, we check itself. + if checkAddrInBytes(addr, c.parent.parent.executable) { + cm = c.parent.parent + } else { + // Otherwise, search all imported modules. TODO: maybe recursive, but not sure it's useful in practice. + p := c.parent + for i := range p.importedFunctions { + candidate := p.importedFunctions[i].me.parent + if checkAddrInBytes(addr, candidate.executable) { + cm = candidate + break + } + } + } + } + + if cm != nil { + index := cm.functionIndexOf(addr) + def = cm.module.FunctionDefinition(cm.module.ImportFunctionCount + index) + var sources []string + if dw := cm.module.DWARFLines; dw != nil { + sourceOffset := cm.getSourceOffset(addr) + sources = dw.Line(sourceOffset) + } + builder.AddFrame(def.DebugName(), def.ParamTypes(), def.ResultTypes(), sources) + if len(cm.listeners) > 0 { + listener = cm.listeners[index] + } + } + return +} + +// CallWithStack implements api.Function. +func (c *callEngine) CallWithStack(ctx context.Context, paramResultStack []uint64) (err error) { + if c.sizeOfParamResultSlice > len(paramResultStack) { + return fmt.Errorf("need %d params, but stack size is %d", c.sizeOfParamResultSlice, len(paramResultStack)) + } + return c.callWithStack(ctx, paramResultStack) +} + +// CallWithStack implements api.Function. +func (c *callEngine) callWithStack(ctx context.Context, paramResultStack []uint64) (err error) { + snapshotEnabled := ctx.Value(expctxkeys.EnableSnapshotterKey{}) != nil + if snapshotEnabled { + ctx = context.WithValue(ctx, expctxkeys.SnapshotterKey{}, c) + } + + if wazevoapi.StackGuardCheckEnabled { + defer func() { + wazevoapi.CheckStackGuardPage(c.stack) + }() + } + + p := c.parent + ensureTermination := p.parent.ensureTermination + m := p.module + if ensureTermination { + select { + case <-ctx.Done(): + // If the provided context is already done, close the module and return the error. + m.CloseWithCtxErr(ctx) + return m.FailIfClosed() + default: + } + } + + var paramResultPtr *uint64 + if len(paramResultStack) > 0 { + paramResultPtr = ¶mResultStack[0] + } + defer func() { + r := recover() + if s, ok := r.(*snapshot); ok { + // A snapshot that wasn't handled was created by a different call engine possibly from a nested wasm invocation, + // let it propagate up to be handled by the caller. 
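+ // (Snapshot restoration works by panicking with the *snapshot value: snapshotRecoverFn on the owning callEngine recovers it and re-installs the saved stack via doRestore, so a snapshot owned by another callEngine must be propagated by re-panicking here rather than being treated as an error.)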
+ panic(s) + } + if r != nil { + type listenerForAbort struct { + def api.FunctionDefinition + lsn experimental.FunctionListener + } + + var listeners []listenerForAbort + builder := wasmdebug.NewErrorBuilder() + def, lsn := c.addFrame(builder, uintptr(unsafe.Pointer(c.execCtx.goCallReturnAddress))) + if lsn != nil { + listeners = append(listeners, listenerForAbort{def, lsn}) + } + returnAddrs := unwindStack( + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), + c.execCtx.framePointerBeforeGoCall, + c.stackTop, + nil, + ) + for _, retAddr := range returnAddrs[:len(returnAddrs)-1] { // the last return addr is the trampoline, so we skip it. + def, lsn = c.addFrame(builder, retAddr) + if lsn != nil { + listeners = append(listeners, listenerForAbort{def, lsn}) + } + } + err = builder.FromRecovered(r) + + for _, lsn := range listeners { + lsn.lsn.Abort(ctx, m, lsn.def, err) + } + } else { + if err != wasmruntime.ErrRuntimeStackOverflow { // Stackoverflow case shouldn't be panic (to avoid extreme stack unwinding). + err = c.parent.module.FailIfClosed() + } + } + + if err != nil { + // Ensures that we can reuse this callEngine even after an error. + c.execCtx.exitCode = wazevoapi.ExitCodeOK + } + }() + + if ensureTermination { + done := m.CloseModuleOnCanceledOrTimeout(ctx) + defer done() + } + + if c.stackTop&(16-1) != 0 { + panic("BUG: stack must be aligned to 16 bytes") + } + entrypoint(c.preambleExecutable, c.executable, c.execCtxPtr, c.parent.opaquePtr, paramResultPtr, c.stackTop) + for { + switch ec := c.execCtx.exitCode; ec & wazevoapi.ExitCodeMask { + case wazevoapi.ExitCodeOK: + return nil + case wazevoapi.ExitCodeGrowStack: + oldsp := uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)) + oldTop := c.stackTop + oldStack := c.stack + var newsp, newfp uintptr + if wazevoapi.StackGuardCheckEnabled { + newsp, newfp, err = c.growStackWithGuarded() + } else { + newsp, newfp, err = c.growStack() + } + if err != nil { + return err + } + adjustClonedStack(oldsp, oldTop, newsp, newfp, c.stackTop) + // Old stack must be alive until the new stack is adjusted. + runtime.KeepAlive(oldStack) + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, newsp, newfp) + case wazevoapi.ExitCodeGrowMemory: + mod := c.callerModuleInstance() + mem := mod.MemoryInstance + s := goCallStackView(c.execCtx.stackPointerBeforeGoCall) + argRes := &s[0] + if res, ok := mem.Grow(uint32(*argRes)); !ok { + *argRes = uint64(0xffffffff) // = -1 in signed 32-bit integer. + } else { + *argRes = uint64(res) + calleeOpaque := opaqueViewFromPtr(uintptr(unsafe.Pointer(c.execCtx.callerModuleContextPtr))) + if mod.Source.MemorySection != nil { // Local memory. + putLocalMemory(calleeOpaque, 8 /* local memory begins at 8 */, mem) + } else { + // Imported memory's owner at offset 16 of the callerModuleContextPtr. 
+ opaquePtr := uintptr(binary.LittleEndian.Uint64(calleeOpaque[16:])) + importedMemOwner := opaqueViewFromPtr(opaquePtr) + putLocalMemory(importedMemOwner, 8 /* local memory begins at 8 */, mem) + } + } + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeTableGrow: + mod := c.callerModuleInstance() + s := goCallStackView(c.execCtx.stackPointerBeforeGoCall) + tableIndex, num, ref := uint32(s[0]), uint32(s[1]), uintptr(s[2]) + table := mod.Tables[tableIndex] + s[0] = uint64(uint32(int32(table.Grow(num, ref)))) + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeCallGoFunction: + index := wazevoapi.GoFunctionIndexFromExitCode(ec) + f := hostModuleGoFuncFromOpaque[api.GoFunction](index, c.execCtx.goFunctionCallCalleeModuleContextOpaque) + func() { + if snapshotEnabled { + defer snapshotRecoverFn(c) + } + f.Call(ctx, goCallStackView(c.execCtx.stackPointerBeforeGoCall)) + }() + // Back to the native code. + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeCallGoFunctionWithListener: + index := wazevoapi.GoFunctionIndexFromExitCode(ec) + f := hostModuleGoFuncFromOpaque[api.GoFunction](index, c.execCtx.goFunctionCallCalleeModuleContextOpaque) + listeners := hostModuleListenersSliceFromOpaque(c.execCtx.goFunctionCallCalleeModuleContextOpaque) + s := goCallStackView(c.execCtx.stackPointerBeforeGoCall) + // Call Listener.Before. + callerModule := c.callerModuleInstance() + listener := listeners[index] + hostModule := hostModuleFromOpaque(c.execCtx.goFunctionCallCalleeModuleContextOpaque) + def := hostModule.FunctionDefinition(wasm.Index(index)) + listener.Before(ctx, callerModule, def, s, c.stackIterator(true)) + // Call into the Go function. + func() { + if snapshotEnabled { + defer snapshotRecoverFn(c) + } + f.Call(ctx, s) + }() + // Call Listener.After. + listener.After(ctx, callerModule, def, s) + // Back to the native code. + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeCallGoModuleFunction: + index := wazevoapi.GoFunctionIndexFromExitCode(ec) + f := hostModuleGoFuncFromOpaque[api.GoModuleFunction](index, c.execCtx.goFunctionCallCalleeModuleContextOpaque) + mod := c.callerModuleInstance() + func() { + if snapshotEnabled { + defer snapshotRecoverFn(c) + } + f.Call(ctx, mod, goCallStackView(c.execCtx.stackPointerBeforeGoCall)) + }() + // Back to the native code. 
+ c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeCallGoModuleFunctionWithListener: + index := wazevoapi.GoFunctionIndexFromExitCode(ec) + f := hostModuleGoFuncFromOpaque[api.GoModuleFunction](index, c.execCtx.goFunctionCallCalleeModuleContextOpaque) + listeners := hostModuleListenersSliceFromOpaque(c.execCtx.goFunctionCallCalleeModuleContextOpaque) + s := goCallStackView(c.execCtx.stackPointerBeforeGoCall) + // Call Listener.Before. + callerModule := c.callerModuleInstance() + listener := listeners[index] + hostModule := hostModuleFromOpaque(c.execCtx.goFunctionCallCalleeModuleContextOpaque) + def := hostModule.FunctionDefinition(wasm.Index(index)) + listener.Before(ctx, callerModule, def, s, c.stackIterator(true)) + // Call into the Go function. + func() { + if snapshotEnabled { + defer snapshotRecoverFn(c) + } + f.Call(ctx, callerModule, s) + }() + // Call Listener.After. + listener.After(ctx, callerModule, def, s) + // Back to the native code. + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeCallListenerBefore: + stack := goCallStackView(c.execCtx.stackPointerBeforeGoCall) + index := wasm.Index(stack[0]) + mod := c.callerModuleInstance() + listener := mod.Engine.(*moduleEngine).listeners[index] + def := mod.Source.FunctionDefinition(index + mod.Source.ImportFunctionCount) + listener.Before(ctx, mod, def, stack[1:], c.stackIterator(false)) + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeCallListenerAfter: + stack := goCallStackView(c.execCtx.stackPointerBeforeGoCall) + index := wasm.Index(stack[0]) + mod := c.callerModuleInstance() + listener := mod.Engine.(*moduleEngine).listeners[index] + def := mod.Source.FunctionDefinition(index + mod.Source.ImportFunctionCount) + listener.After(ctx, mod, def, stack[1:]) + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeCheckModuleExitCode: + // Note: this operation must be done in Go, not native code. The reason is that + // native code cannot be preempted and that means it can block forever if there are not + // enough OS threads (which we don't have control over). 
+ if err := m.FailIfClosed(); err != nil { + panic(err) + } + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeRefFunc: + mod := c.callerModuleInstance() + s := goCallStackView(c.execCtx.stackPointerBeforeGoCall) + funcIndex := wasm.Index(s[0]) + ref := mod.Engine.FunctionInstanceReference(funcIndex) + s[0] = uint64(ref) + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeMemoryWait32: + mod := c.callerModuleInstance() + mem := mod.MemoryInstance + if !mem.Shared { + panic(wasmruntime.ErrRuntimeExpectedSharedMemory) + } + + s := goCallStackView(c.execCtx.stackPointerBeforeGoCall) + timeout, exp, addr := int64(s[0]), uint32(s[1]), uintptr(s[2]) + base := uintptr(unsafe.Pointer(&mem.Buffer[0])) + + offset := uint32(addr - base) + res := mem.Wait32(offset, exp, timeout, func(mem *wasm.MemoryInstance, offset uint32) uint32 { + addr := unsafe.Add(unsafe.Pointer(&mem.Buffer[0]), offset) + return atomic.LoadUint32((*uint32)(addr)) + }) + s[0] = res + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeMemoryWait64: + mod := c.callerModuleInstance() + mem := mod.MemoryInstance + if !mem.Shared { + panic(wasmruntime.ErrRuntimeExpectedSharedMemory) + } + + s := goCallStackView(c.execCtx.stackPointerBeforeGoCall) + timeout, exp, addr := int64(s[0]), uint64(s[1]), uintptr(s[2]) + base := uintptr(unsafe.Pointer(&mem.Buffer[0])) + + offset := uint32(addr - base) + res := mem.Wait64(offset, exp, timeout, func(mem *wasm.MemoryInstance, offset uint32) uint64 { + addr := unsafe.Add(unsafe.Pointer(&mem.Buffer[0]), offset) + return atomic.LoadUint64((*uint64)(addr)) + }) + s[0] = uint64(res) + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeMemoryNotify: + mod := c.callerModuleInstance() + mem := mod.MemoryInstance + + s := goCallStackView(c.execCtx.stackPointerBeforeGoCall) + count, addr := uint32(s[0]), s[1] + offset := uint32(uintptr(addr) - uintptr(unsafe.Pointer(&mem.Buffer[0]))) + res := mem.Notify(offset, count) + s[0] = uint64(res) + c.execCtx.exitCode = wazevoapi.ExitCodeOK + afterGoFunctionCallEntrypoint(c.execCtx.goCallReturnAddress, c.execCtxPtr, + uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall) + case wazevoapi.ExitCodeUnreachable: + panic(wasmruntime.ErrRuntimeUnreachable) + case wazevoapi.ExitCodeMemoryOutOfBounds: + panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) + case wazevoapi.ExitCodeTableOutOfBounds: + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + case wazevoapi.ExitCodeIndirectCallNullPointer: + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + case wazevoapi.ExitCodeIndirectCallTypeMismatch: + panic(wasmruntime.ErrRuntimeIndirectCallTypeMismatch) + case wazevoapi.ExitCodeIntegerOverflow: + panic(wasmruntime.ErrRuntimeIntegerOverflow) + case 
wazevoapi.ExitCodeIntegerDivisionByZero: + panic(wasmruntime.ErrRuntimeIntegerDivideByZero) + case wazevoapi.ExitCodeInvalidConversionToInteger: + panic(wasmruntime.ErrRuntimeInvalidConversionToInteger) + case wazevoapi.ExitCodeUnalignedAtomic: + panic(wasmruntime.ErrRuntimeUnalignedAtomic) + default: + panic("BUG") + } + } +} + +func (c *callEngine) callerModuleInstance() *wasm.ModuleInstance { + return moduleInstanceFromOpaquePtr(c.execCtx.callerModuleContextPtr) +} + +func opaqueViewFromPtr(ptr uintptr) []byte { + var opaque []byte + sh := (*reflect.SliceHeader)(unsafe.Pointer(&opaque)) + sh.Data = ptr + setSliceLimits(sh, 24, 24) + return opaque +} + +const callStackCeiling = uintptr(50000000) // in uint64 (8 bytes) == 400000000 bytes in total == 400mb. + +func (c *callEngine) growStackWithGuarded() (newSP uintptr, newFP uintptr, err error) { + if wazevoapi.StackGuardCheckEnabled { + wazevoapi.CheckStackGuardPage(c.stack) + } + newSP, newFP, err = c.growStack() + if err != nil { + return + } + if wazevoapi.StackGuardCheckEnabled { + c.execCtx.stackBottomPtr = &c.stack[wazevoapi.StackGuardCheckGuardPageSize] + } + return +} + +// growStack grows the stack, and returns the new stack pointer. +func (c *callEngine) growStack() (newSP, newFP uintptr, err error) { + currentLen := uintptr(len(c.stack)) + if callStackCeiling < currentLen { + err = wasmruntime.ErrRuntimeStackOverflow + return + } + + newLen := 2*currentLen + c.execCtx.stackGrowRequiredSize + 16 // Stack might be aligned to 16 bytes, so add 16 bytes just in case. + newSP, newFP, c.stackTop, c.stack = c.cloneStack(newLen) + c.execCtx.stackBottomPtr = &c.stack[0] + return +} + +func (c *callEngine) cloneStack(l uintptr) (newSP, newFP, newTop uintptr, newStack []byte) { + newStack = make([]byte, l) + + relSp := c.stackTop - uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)) + relFp := c.stackTop - c.execCtx.framePointerBeforeGoCall + + // Copy the existing contents in the previous Go-allocated stack into the new one. + var prevStackAligned, newStackAligned []byte + { + sh := (*reflect.SliceHeader)(unsafe.Pointer(&prevStackAligned)) + sh.Data = c.stackTop - relSp + setSliceLimits(sh, relSp, relSp) + } + newTop = alignedStackTop(newStack) + { + newSP = newTop - relSp + newFP = newTop - relFp + sh := (*reflect.SliceHeader)(unsafe.Pointer(&newStackAligned)) + sh.Data = newSP + setSliceLimits(sh, relSp, relSp) + } + copy(newStackAligned, prevStackAligned) + return +} + +func (c *callEngine) stackIterator(onHostCall bool) experimental.StackIterator { + c.stackIteratorImpl.reset(c, onHostCall) + return &c.stackIteratorImpl +} + +// stackIterator implements experimental.StackIterator. +type stackIterator struct { + retAddrs []uintptr + retAddrCursor int + eng *engine + pc uint64 + + currentDef *wasm.FunctionDefinition +} + +func (si *stackIterator) reset(c *callEngine, onHostCall bool) { + if onHostCall { + si.retAddrs = append(si.retAddrs[:0], uintptr(unsafe.Pointer(c.execCtx.goCallReturnAddress))) + } else { + si.retAddrs = si.retAddrs[:0] + } + si.retAddrs = unwindStack(uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)), c.execCtx.framePointerBeforeGoCall, c.stackTop, si.retAddrs) + si.retAddrs = si.retAddrs[:len(si.retAddrs)-1] // the last return addr is the trampoline, so we skip it. + si.retAddrCursor = 0 + si.eng = c.parent.parent.parent +} + +// Next implements the same method as documented on experimental.StackIterator. 
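+// A typical (hypothetical) consumer is an experimental.FunctionListener whose Before hook drains the iterator with a plain loop such as: for si.Next() { def := si.Function().Definition(); _ = def.DebugName() }. +// The walk ends once a return address no longer falls inside any live compiled module, since Next then returns false for that frame.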
+func (si *stackIterator) Next() bool { + if si.retAddrCursor >= len(si.retAddrs) { + return false + } + + addr := si.retAddrs[si.retAddrCursor] + cm := si.eng.compiledModuleOfAddr(addr) + if cm != nil { + index := cm.functionIndexOf(addr) + def := cm.module.FunctionDefinition(cm.module.ImportFunctionCount + index) + si.currentDef = def + si.retAddrCursor++ + si.pc = uint64(addr) + return true + } + return false +} + +// ProgramCounter implements the same method as documented on experimental.StackIterator. +func (si *stackIterator) ProgramCounter() experimental.ProgramCounter { + return experimental.ProgramCounter(si.pc) +} + +// Function implements the same method as documented on experimental.StackIterator. +func (si *stackIterator) Function() experimental.InternalFunction { + return si +} + +// Definition implements the same method as documented on experimental.InternalFunction. +func (si *stackIterator) Definition() api.FunctionDefinition { + return si.currentDef +} + +// SourceOffsetForPC implements the same method as documented on experimental.InternalFunction. +func (si *stackIterator) SourceOffsetForPC(pc experimental.ProgramCounter) uint64 { + upc := uintptr(pc) + cm := si.eng.compiledModuleOfAddr(upc) + return cm.getSourceOffset(upc) +} + +// snapshot implements experimental.Snapshot +type snapshot struct { + sp, fp, top uintptr + returnAddress *byte + stack []byte + savedRegisters [64][2]uint64 + ret []uint64 + c *callEngine +} + +// Snapshot implements the same method as documented on experimental.Snapshotter. +func (c *callEngine) Snapshot() experimental.Snapshot { + returnAddress := c.execCtx.goCallReturnAddress + oldTop, oldSp := c.stackTop, uintptr(unsafe.Pointer(c.execCtx.stackPointerBeforeGoCall)) + newSP, newFP, newTop, newStack := c.cloneStack(uintptr(len(c.stack)) + 16) + adjustClonedStack(oldSp, oldTop, newSP, newFP, newTop) + return &snapshot{ + sp: newSP, + fp: newFP, + top: newTop, + savedRegisters: c.execCtx.savedRegisters, + returnAddress: returnAddress, + stack: newStack, + c: c, + } +} + +// Restore implements the same method as documented on experimental.Snapshot. +func (s *snapshot) Restore(ret []uint64) { + s.ret = ret + panic(s) +} + +func (s *snapshot) doRestore() { + spp := *(**uint64)(unsafe.Pointer(&s.sp)) + view := goCallStackView(spp) + copy(view, s.ret) + + c := s.c + c.stack = s.stack + c.stackTop = s.top + ec := &c.execCtx + ec.stackBottomPtr = &c.stack[0] + ec.stackPointerBeforeGoCall = spp + ec.framePointerBeforeGoCall = s.fp + ec.goCallReturnAddress = s.returnAddress + ec.savedRegisters = s.savedRegisters +} + +// Error implements the same method on error. 
+func (s *snapshot) Error() string { + return "unhandled snapshot restore, this generally indicates restore was called from a different " + + "exported function invocation than snapshot" +} + +func snapshotRecoverFn(c *callEngine) { + if r := recover(); r != nil { + if s, ok := r.(*snapshot); ok && s.c == c { + s.doRestore() + } else { + panic(r) + } + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/engine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/engine.go new file mode 100644 index 000000000..f02b905fc --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/engine.go @@ -0,0 +1,843 @@ +package wazevo + +import ( + "context" + "encoding/hex" + "errors" + "fmt" + "runtime" + "sort" + "sync" + "unsafe" + + "github.com/tetratelabs/wazero/api" + "github.com/tetratelabs/wazero/experimental" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/frontend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" + "github.com/tetratelabs/wazero/internal/filecache" + "github.com/tetratelabs/wazero/internal/platform" + "github.com/tetratelabs/wazero/internal/version" + "github.com/tetratelabs/wazero/internal/wasm" +) + +type ( + // engine implements wasm.Engine. + engine struct { + wazeroVersion string + fileCache filecache.Cache + compiledModules map[wasm.ModuleID]*compiledModule + // sortedCompiledModules is a list of compiled modules sorted by the initial address of the executable. + sortedCompiledModules []*compiledModule + mux sync.RWMutex + // sharedFunctions is compiled functions shared by all modules. + sharedFunctions *sharedFunctions + // setFinalizer defaults to runtime.SetFinalizer, but overridable for tests. + setFinalizer func(obj interface{}, finalizer interface{}) + + // The followings are reused for compiling shared functions. + machine backend.Machine + be backend.Compiler + } + + sharedFunctions struct { + // memoryGrowExecutable is a compiled trampoline executable for memory.grow builtin function. + memoryGrowExecutable []byte + // checkModuleExitCode is a compiled trampoline executable for checking module instance exit code. This + // is used when ensureTermination is true. + checkModuleExitCode []byte + // stackGrowExecutable is a compiled executable for growing stack builtin function. + stackGrowExecutable []byte + // tableGrowExecutable is a compiled trampoline executable for table.grow builtin function. + tableGrowExecutable []byte + // refFuncExecutable is a compiled trampoline executable for ref.func builtin function. + refFuncExecutable []byte + // memoryWait32Executable is a compiled trampoline executable for memory.wait32 builtin function + memoryWait32Executable []byte + // memoryWait64Executable is a compiled trampoline executable for memory.wait64 builtin function + memoryWait64Executable []byte + // memoryNotifyExecutable is a compiled trampoline executable for memory.notify builtin function + memoryNotifyExecutable []byte + listenerBeforeTrampolines map[*wasm.FunctionType][]byte + listenerAfterTrampolines map[*wasm.FunctionType][]byte + } + + // compiledModule is a compiled variant of a wasm.Module and ready to be used for instantiation. + compiledModule struct { + *executables + // functionOffsets maps a local function index to the offset in the executable. 
+ functionOffsets []int + parent *engine + module *wasm.Module + ensureTermination bool + listeners []experimental.FunctionListener + listenerBeforeTrampolines []*byte + listenerAfterTrampolines []*byte + + // The followings are only available for non host modules. + + offsets wazevoapi.ModuleContextOffsetData + sharedFunctions *sharedFunctions + sourceMap sourceMap + } + + executables struct { + executable []byte + entryPreambles [][]byte + } +) + +// sourceMap is a mapping from the offset of the executable to the offset of the original wasm binary. +type sourceMap struct { + // executableOffsets is a sorted list of offsets of the executable. This is index-correlated with wasmBinaryOffsets, + // in other words executableOffsets[i] is the offset of the executable which corresponds to the offset of a Wasm + // binary pointed by wasmBinaryOffsets[i]. + executableOffsets []uintptr + // wasmBinaryOffsets is the counterpart of executableOffsets. + wasmBinaryOffsets []uint64 +} + +var _ wasm.Engine = (*engine)(nil) + +// NewEngine returns the implementation of wasm.Engine. +func NewEngine(ctx context.Context, _ api.CoreFeatures, fc filecache.Cache) wasm.Engine { + machine := newMachine() + be := backend.NewCompiler(ctx, machine, ssa.NewBuilder()) + e := &engine{ + compiledModules: make(map[wasm.ModuleID]*compiledModule), + setFinalizer: runtime.SetFinalizer, + machine: machine, + be: be, + fileCache: fc, + wazeroVersion: version.GetWazeroVersion(), + } + e.compileSharedFunctions() + return e +} + +// CompileModule implements wasm.Engine. +func (e *engine) CompileModule(ctx context.Context, module *wasm.Module, listeners []experimental.FunctionListener, ensureTermination bool) (err error) { + if wazevoapi.PerfMapEnabled { + wazevoapi.PerfMap.Lock() + defer wazevoapi.PerfMap.Unlock() + } + + if _, ok, err := e.getCompiledModule(module, listeners, ensureTermination); ok { // cache hit! 
+ return nil + } else if err != nil { + return err + } + + if wazevoapi.DeterministicCompilationVerifierEnabled { + ctx = wazevoapi.NewDeterministicCompilationVerifierContext(ctx, len(module.CodeSection)) + } + cm, err := e.compileModule(ctx, module, listeners, ensureTermination) + if err != nil { + return err + } + if err = e.addCompiledModule(module, cm); err != nil { + return err + } + + if wazevoapi.DeterministicCompilationVerifierEnabled { + for i := 0; i < wazevoapi.DeterministicCompilationVerifyingIter; i++ { + _, err := e.compileModule(ctx, module, listeners, ensureTermination) + if err != nil { + return err + } + } + } + + if len(listeners) > 0 { + cm.listeners = listeners + cm.listenerBeforeTrampolines = make([]*byte, len(module.TypeSection)) + cm.listenerAfterTrampolines = make([]*byte, len(module.TypeSection)) + for i := range module.TypeSection { + typ := &module.TypeSection[i] + before, after := e.getListenerTrampolineForType(typ) + cm.listenerBeforeTrampolines[i] = before + cm.listenerAfterTrampolines[i] = after + } + } + return nil +} + +func (exec *executables) compileEntryPreambles(m *wasm.Module, machine backend.Machine, be backend.Compiler) { + exec.entryPreambles = make([][]byte, len(m.TypeSection)) + for i := range m.TypeSection { + typ := &m.TypeSection[i] + sig := frontend.SignatureForWasmFunctionType(typ) + be.Init() + buf := machine.CompileEntryPreamble(&sig) + executable := mmapExecutable(buf) + exec.entryPreambles[i] = executable + + if wazevoapi.PerfMapEnabled { + wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&executable[0])), + uint64(len(executable)), fmt.Sprintf("entry_preamble::type=%s", typ.String())) + } + } +} + +func (e *engine) compileModule(ctx context.Context, module *wasm.Module, listeners []experimental.FunctionListener, ensureTermination bool) (*compiledModule, error) { + withListener := len(listeners) > 0 + cm := &compiledModule{ + offsets: wazevoapi.NewModuleContextOffsetData(module, withListener), parent: e, module: module, + ensureTermination: ensureTermination, + executables: &executables{}, + } + + if module.IsHostModule { + return e.compileHostModule(ctx, module, listeners) + } + + importedFns, localFns := int(module.ImportFunctionCount), len(module.FunctionSection) + if localFns == 0 { + return cm, nil + } + + rels := make([]backend.RelocationInfo, 0) + refToBinaryOffset := make([]int, importedFns+localFns) + + if wazevoapi.DeterministicCompilationVerifierEnabled { + // The compilation must be deterministic regardless of the order of functions being compiled. + wazevoapi.DeterministicCompilationVerifierRandomizeIndexes(ctx) + } + + needSourceInfo := module.DWARFLines != nil + + // Creates new compiler instances which are reused for each function. + ssaBuilder := ssa.NewBuilder() + fe := frontend.NewFrontendCompiler(module, ssaBuilder, &cm.offsets, ensureTermination, withListener, needSourceInfo) + machine := newMachine() + be := backend.NewCompiler(ctx, machine, ssaBuilder) + + cm.executables.compileEntryPreambles(module, machine, be) + + totalSize := 0 // Total binary size of the executable. + cm.functionOffsets = make([]int, localFns) + bodies := make([][]byte, localFns) + + // Trampoline relocation related variables. + trampolineInterval, callTrampolineIslandSize, err := machine.CallTrampolineIslandInfo(localFns) + if err != nil { + return nil, err + } + needCallTrampoline := callTrampolineIslandSize > 0 + var callTrampolineIslandOffsets []int // Holds the offsets of trampoline islands. 
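+ // Trampoline islands (a summary of the mechanism below, not additional logic): on targets whose near-call range is limited, CallTrampolineIslandInfo reports a positive island size, and the loop below reserves an island every trampolineInterval bytes of emitted code so that ResolveRelocations can reach any callee either directly or through the nearest island.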
+ + for i := range module.CodeSection { + if wazevoapi.DeterministicCompilationVerifierEnabled { + i = wazevoapi.DeterministicCompilationVerifierGetRandomizedLocalFunctionIndex(ctx, i) + } + + fidx := wasm.Index(i + importedFns) + + if wazevoapi.NeedFunctionNameInContext { + def := module.FunctionDefinition(fidx) + name := def.DebugName() + if len(def.ExportNames()) > 0 { + name = def.ExportNames()[0] + } + ctx = wazevoapi.SetCurrentFunctionName(ctx, i, fmt.Sprintf("[%d/%d]%s", i, len(module.CodeSection)-1, name)) + } + + needListener := len(listeners) > 0 && listeners[i] != nil + body, relsPerFunc, err := e.compileLocalWasmFunction(ctx, module, wasm.Index(i), fe, ssaBuilder, be, needListener) + if err != nil { + return nil, fmt.Errorf("compile function %d/%d: %v", i, len(module.CodeSection)-1, err) + } + + // Align 16-bytes boundary. + totalSize = (totalSize + 15) &^ 15 + cm.functionOffsets[i] = totalSize + + if needSourceInfo { + // At the beginning of the function, we add the offset of the function body so that + // we can resolve the source location of the call site of before listener call. + cm.sourceMap.executableOffsets = append(cm.sourceMap.executableOffsets, uintptr(totalSize)) + cm.sourceMap.wasmBinaryOffsets = append(cm.sourceMap.wasmBinaryOffsets, module.CodeSection[i].BodyOffsetInCodeSection) + + for _, info := range be.SourceOffsetInfo() { + cm.sourceMap.executableOffsets = append(cm.sourceMap.executableOffsets, uintptr(totalSize)+uintptr(info.ExecutableOffset)) + cm.sourceMap.wasmBinaryOffsets = append(cm.sourceMap.wasmBinaryOffsets, uint64(info.SourceOffset)) + } + } + + fref := frontend.FunctionIndexToFuncRef(fidx) + refToBinaryOffset[fref] = totalSize + + // At this point, relocation offsets are relative to the start of the function body, + // so we adjust it to the start of the executable. + for _, r := range relsPerFunc { + r.Offset += int64(totalSize) + rels = append(rels, r) + } + + bodies[i] = body + totalSize += len(body) + if wazevoapi.PrintMachineCodeHexPerFunction { + fmt.Printf("[[[machine code for %s]]]\n%s\n\n", wazevoapi.GetCurrentFunctionName(ctx), hex.EncodeToString(body)) + } + + if needCallTrampoline { + // If the total size exceeds the trampoline interval, we need to add a trampoline island. + if totalSize/trampolineInterval > len(callTrampolineIslandOffsets) { + callTrampolineIslandOffsets = append(callTrampolineIslandOffsets, totalSize) + totalSize += callTrampolineIslandSize + } + } + } + + // Allocate executable memory and then copy the generated machine code. + executable, err := platform.MmapCodeSegment(totalSize) + if err != nil { + panic(err) + } + cm.executable = executable + + for i, b := range bodies { + offset := cm.functionOffsets[i] + copy(executable[offset:], b) + } + + if wazevoapi.PerfMapEnabled { + wazevoapi.PerfMap.Flush(uintptr(unsafe.Pointer(&executable[0])), cm.functionOffsets) + } + + if needSourceInfo { + for i := range cm.sourceMap.executableOffsets { + cm.sourceMap.executableOffsets[i] += uintptr(unsafe.Pointer(&cm.executable[0])) + } + } + + // Resolve relocations for local function calls. + if len(rels) > 0 { + machine.ResolveRelocations(refToBinaryOffset, executable, rels, callTrampolineIslandOffsets) + } + + if runtime.GOARCH == "arm64" { + // On arm64, we cannot give all of rwx at the same time, so we change it to exec. 
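+ // (The segment is mapped writable while the machine code is copied in above and is only flipped to read+execute here, mirroring what mmapExecutable does for the shared trampolines.)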
+ if err = platform.MprotectRX(executable); err != nil { + return nil, err + } + } + cm.sharedFunctions = e.sharedFunctions + e.setFinalizer(cm.executables, executablesFinalizer) + return cm, nil +} + +func (e *engine) compileLocalWasmFunction( + ctx context.Context, + module *wasm.Module, + localFunctionIndex wasm.Index, + fe *frontend.Compiler, + ssaBuilder ssa.Builder, + be backend.Compiler, + needListener bool, +) (body []byte, rels []backend.RelocationInfo, err error) { + typIndex := module.FunctionSection[localFunctionIndex] + typ := &module.TypeSection[typIndex] + codeSeg := &module.CodeSection[localFunctionIndex] + + // Initializes both frontend and backend compilers. + fe.Init(localFunctionIndex, typIndex, typ, codeSeg.LocalTypes, codeSeg.Body, needListener, codeSeg.BodyOffsetInCodeSection) + be.Init() + + // Lower Wasm to SSA. + fe.LowerToSSA() + if wazevoapi.PrintSSA && wazevoapi.PrintEnabledIndex(ctx) { + fmt.Printf("[[[SSA for %s]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), ssaBuilder.Format()) + } + + if wazevoapi.DeterministicCompilationVerifierEnabled { + wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "SSA", ssaBuilder.Format()) + } + + // Run SSA-level optimization passes. + ssaBuilder.RunPasses() + + if wazevoapi.PrintOptimizedSSA && wazevoapi.PrintEnabledIndex(ctx) { + fmt.Printf("[[[Optimized SSA for %s]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), ssaBuilder.Format()) + } + + if wazevoapi.DeterministicCompilationVerifierEnabled { + wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "Optimized SSA", ssaBuilder.Format()) + } + + // Now our ssaBuilder contains the necessary information to further lower them to + // machine code. + original, rels, err := be.Compile(ctx) + if err != nil { + return nil, nil, fmt.Errorf("ssa->machine code: %v", err) + } + + // TODO: optimize as zero copy. + copied := make([]byte, len(original)) + copy(copied, original) + return copied, rels, nil +} + +func (e *engine) compileHostModule(ctx context.Context, module *wasm.Module, listeners []experimental.FunctionListener) (*compiledModule, error) { + machine := newMachine() + be := backend.NewCompiler(ctx, machine, ssa.NewBuilder()) + + num := len(module.CodeSection) + cm := &compiledModule{module: module, listeners: listeners, executables: &executables{}} + cm.functionOffsets = make([]int, num) + totalSize := 0 // Total binary size of the executable. + bodies := make([][]byte, num) + var sig ssa.Signature + for i := range module.CodeSection { + totalSize = (totalSize + 15) &^ 15 + cm.functionOffsets[i] = totalSize + + typIndex := module.FunctionSection[i] + typ := &module.TypeSection[typIndex] + + // We can relax until the index fits together in ExitCode as we do in wazevoapi.ExitCodeCallGoModuleFunctionWithIndex. + // However, 1 << 16 should be large enough for a real use case. + const hostFunctionNumMaximum = 1 << 16 + if i >= hostFunctionNumMaximum { + return nil, fmt.Errorf("too many host functions (maximum %d)", hostFunctionNumMaximum) + } + + sig.ID = ssa.SignatureID(typIndex) // This is important since we reuse the `machine` which caches the ABI based on the SignatureID. + sig.Params = append(sig.Params[:0], + ssa.TypeI64, // First argument must be exec context. + ssa.TypeI64, // The second argument is the moduleContextOpaque of this host module. 
+ ) + for _, t := range typ.Params { + sig.Params = append(sig.Params, frontend.WasmTypeToSSAType(t)) + } + + sig.Results = sig.Results[:0] + for _, t := range typ.Results { + sig.Results = append(sig.Results, frontend.WasmTypeToSSAType(t)) + } + + c := &module.CodeSection[i] + if c.GoFunc == nil { + panic("BUG: GoFunc must be set for host module") + } + + withListener := len(listeners) > 0 && listeners[i] != nil + var exitCode wazevoapi.ExitCode + fn := c.GoFunc + switch fn.(type) { + case api.GoModuleFunction: + exitCode = wazevoapi.ExitCodeCallGoModuleFunctionWithIndex(i, withListener) + case api.GoFunction: + exitCode = wazevoapi.ExitCodeCallGoFunctionWithIndex(i, withListener) + } + + be.Init() + machine.CompileGoFunctionTrampoline(exitCode, &sig, true) + if err := be.Finalize(ctx); err != nil { + return nil, err + } + body := be.Buf() + + if wazevoapi.PerfMapEnabled { + name := module.FunctionDefinition(wasm.Index(i)).DebugName() + wazevoapi.PerfMap.AddModuleEntry(i, + int64(totalSize), + uint64(len(body)), + fmt.Sprintf("trampoline:%s", name)) + } + + // TODO: optimize as zero copy. + copied := make([]byte, len(body)) + copy(copied, body) + bodies[i] = copied + totalSize += len(body) + } + + if totalSize == 0 { + // Empty module. + return cm, nil + } + + // Allocate executable memory and then copy the generated machine code. + executable, err := platform.MmapCodeSegment(totalSize) + if err != nil { + panic(err) + } + cm.executable = executable + + for i, b := range bodies { + offset := cm.functionOffsets[i] + copy(executable[offset:], b) + } + + if wazevoapi.PerfMapEnabled { + wazevoapi.PerfMap.Flush(uintptr(unsafe.Pointer(&executable[0])), cm.functionOffsets) + } + + if runtime.GOARCH == "arm64" { + // On arm64, we cannot give all of rwx at the same time, so we change it to exec. + if err = platform.MprotectRX(executable); err != nil { + return nil, err + } + } + e.setFinalizer(cm.executables, executablesFinalizer) + return cm, nil +} + +// Close implements wasm.Engine. +func (e *engine) Close() (err error) { + e.mux.Lock() + defer e.mux.Unlock() + e.sortedCompiledModules = nil + e.compiledModules = nil + e.sharedFunctions = nil + return nil +} + +// CompiledModuleCount implements wasm.Engine. +func (e *engine) CompiledModuleCount() uint32 { + e.mux.RLock() + defer e.mux.RUnlock() + return uint32(len(e.compiledModules)) +} + +// DeleteCompiledModule implements wasm.Engine. 
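+// Deleting a module also removes it from sortedCompiledModules, so later compiledModuleOfAddr lookups for addresses inside its executable resolve to nil instead of a stale entry.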
+func (e *engine) DeleteCompiledModule(m *wasm.Module) { + e.mux.Lock() + defer e.mux.Unlock() + cm, ok := e.compiledModules[m.ID] + if ok { + if len(cm.executable) > 0 { + e.deleteCompiledModuleFromSortedList(cm) + } + delete(e.compiledModules, m.ID) + } +} + +func (e *engine) addCompiledModuleToSortedList(cm *compiledModule) { + ptr := uintptr(unsafe.Pointer(&cm.executable[0])) + + index := sort.Search(len(e.sortedCompiledModules), func(i int) bool { + return uintptr(unsafe.Pointer(&e.sortedCompiledModules[i].executable[0])) >= ptr + }) + e.sortedCompiledModules = append(e.sortedCompiledModules, nil) + copy(e.sortedCompiledModules[index+1:], e.sortedCompiledModules[index:]) + e.sortedCompiledModules[index] = cm +} + +func (e *engine) deleteCompiledModuleFromSortedList(cm *compiledModule) { + ptr := uintptr(unsafe.Pointer(&cm.executable[0])) + + index := sort.Search(len(e.sortedCompiledModules), func(i int) bool { + return uintptr(unsafe.Pointer(&e.sortedCompiledModules[i].executable[0])) >= ptr + }) + if index >= len(e.sortedCompiledModules) { + return + } + copy(e.sortedCompiledModules[index:], e.sortedCompiledModules[index+1:]) + e.sortedCompiledModules = e.sortedCompiledModules[:len(e.sortedCompiledModules)-1] +} + +func (e *engine) compiledModuleOfAddr(addr uintptr) *compiledModule { + e.mux.RLock() + defer e.mux.RUnlock() + + index := sort.Search(len(e.sortedCompiledModules), func(i int) bool { + return uintptr(unsafe.Pointer(&e.sortedCompiledModules[i].executable[0])) > addr + }) + index -= 1 + if index < 0 { + return nil + } + candidate := e.sortedCompiledModules[index] + if checkAddrInBytes(addr, candidate.executable) { + // If a module is already deleted, the found module may have been wrong. + return candidate + } + return nil +} + +func checkAddrInBytes(addr uintptr, b []byte) bool { + return uintptr(unsafe.Pointer(&b[0])) <= addr && addr <= uintptr(unsafe.Pointer(&b[len(b)-1])) +} + +// NewModuleEngine implements wasm.Engine. +func (e *engine) NewModuleEngine(m *wasm.Module, mi *wasm.ModuleInstance) (wasm.ModuleEngine, error) { + me := &moduleEngine{} + + // Note: imported functions are resolved in moduleEngine.ResolveImportedFunction. 
+ me.importedFunctions = make([]importedFunction, m.ImportFunctionCount) + + compiled, ok := e.getCompiledModuleFromMemory(m) + if !ok { + return nil, errors.New("source module must be compiled before instantiation") + } + me.parent = compiled + me.module = mi + me.listeners = compiled.listeners + + if m.IsHostModule { + me.opaque = buildHostModuleOpaque(m, compiled.listeners) + me.opaquePtr = &me.opaque[0] + } else { + if size := compiled.offsets.TotalSize; size != 0 { + opaque := newAlignedOpaque(size) + me.opaque = opaque + me.opaquePtr = &opaque[0] + } + } + return me, nil +} + +func (e *engine) compileSharedFunctions() { + e.sharedFunctions = &sharedFunctions{ + listenerBeforeTrampolines: make(map[*wasm.FunctionType][]byte), + listenerAfterTrampolines: make(map[*wasm.FunctionType][]byte), + } + + e.be.Init() + { + src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeGrowMemory, &ssa.Signature{ + Params: []ssa.Type{ssa.TypeI64 /* exec context */, ssa.TypeI32}, + Results: []ssa.Type{ssa.TypeI32}, + }, false) + e.sharedFunctions.memoryGrowExecutable = mmapExecutable(src) + if wazevoapi.PerfMapEnabled { + exe := e.sharedFunctions.memoryGrowExecutable + wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "memory_grow_trampoline") + } + } + + e.be.Init() + { + src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeTableGrow, &ssa.Signature{ + Params: []ssa.Type{ssa.TypeI64 /* exec context */, ssa.TypeI32 /* table index */, ssa.TypeI32 /* num */, ssa.TypeI64 /* ref */}, + Results: []ssa.Type{ssa.TypeI32}, + }, false) + e.sharedFunctions.tableGrowExecutable = mmapExecutable(src) + if wazevoapi.PerfMapEnabled { + exe := e.sharedFunctions.tableGrowExecutable + wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "table_grow_trampoline") + } + } + + e.be.Init() + { + src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeCheckModuleExitCode, &ssa.Signature{ + Params: []ssa.Type{ssa.TypeI32 /* exec context */}, + Results: []ssa.Type{ssa.TypeI32}, + }, false) + e.sharedFunctions.checkModuleExitCode = mmapExecutable(src) + if wazevoapi.PerfMapEnabled { + exe := e.sharedFunctions.checkModuleExitCode + wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "check_module_exit_code_trampoline") + } + } + + e.be.Init() + { + src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeRefFunc, &ssa.Signature{ + Params: []ssa.Type{ssa.TypeI64 /* exec context */, ssa.TypeI32 /* function index */}, + Results: []ssa.Type{ssa.TypeI64}, // returns the function reference. + }, false) + e.sharedFunctions.refFuncExecutable = mmapExecutable(src) + if wazevoapi.PerfMapEnabled { + exe := e.sharedFunctions.refFuncExecutable + wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "ref_func_trampoline") + } + } + + e.be.Init() + { + src := e.machine.CompileStackGrowCallSequence() + e.sharedFunctions.stackGrowExecutable = mmapExecutable(src) + if wazevoapi.PerfMapEnabled { + exe := e.sharedFunctions.stackGrowExecutable + wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "stack_grow_trampoline") + } + } + + e.be.Init() + { + src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeMemoryWait32, &ssa.Signature{ + // exec context, timeout, expected, addr + Params: []ssa.Type{ssa.TypeI64, ssa.TypeI64, ssa.TypeI32, ssa.TypeI64}, + // Returns the status. 
+ Results: []ssa.Type{ssa.TypeI32}, + }, false) + e.sharedFunctions.memoryWait32Executable = mmapExecutable(src) + if wazevoapi.PerfMapEnabled { + exe := e.sharedFunctions.memoryWait32Executable + wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "memory_wait32_trampoline") + } + } + + e.be.Init() + { + src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeMemoryWait64, &ssa.Signature{ + // exec context, timeout, expected, addr + Params: []ssa.Type{ssa.TypeI64, ssa.TypeI64, ssa.TypeI64, ssa.TypeI64}, + // Returns the status. + Results: []ssa.Type{ssa.TypeI32}, + }, false) + e.sharedFunctions.memoryWait64Executable = mmapExecutable(src) + if wazevoapi.PerfMapEnabled { + exe := e.sharedFunctions.memoryWait64Executable + wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "memory_wait64_trampoline") + } + } + + e.be.Init() + { + src := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeMemoryNotify, &ssa.Signature{ + // exec context, count, addr + Params: []ssa.Type{ssa.TypeI64, ssa.TypeI32, ssa.TypeI64}, + // Returns the number notified. + Results: []ssa.Type{ssa.TypeI32}, + }, false) + e.sharedFunctions.memoryNotifyExecutable = mmapExecutable(src) + if wazevoapi.PerfMapEnabled { + exe := e.sharedFunctions.memoryNotifyExecutable + wazevoapi.PerfMap.AddEntry(uintptr(unsafe.Pointer(&exe[0])), uint64(len(exe)), "memory_notify_trampoline") + } + } + + e.setFinalizer(e.sharedFunctions, sharedFunctionsFinalizer) +} + +func sharedFunctionsFinalizer(sf *sharedFunctions) { + if err := platform.MunmapCodeSegment(sf.memoryGrowExecutable); err != nil { + panic(err) + } + if err := platform.MunmapCodeSegment(sf.checkModuleExitCode); err != nil { + panic(err) + } + if err := platform.MunmapCodeSegment(sf.stackGrowExecutable); err != nil { + panic(err) + } + if err := platform.MunmapCodeSegment(sf.tableGrowExecutable); err != nil { + panic(err) + } + if err := platform.MunmapCodeSegment(sf.refFuncExecutable); err != nil { + panic(err) + } + if err := platform.MunmapCodeSegment(sf.memoryWait32Executable); err != nil { + panic(err) + } + if err := platform.MunmapCodeSegment(sf.memoryWait64Executable); err != nil { + panic(err) + } + if err := platform.MunmapCodeSegment(sf.memoryNotifyExecutable); err != nil { + panic(err) + } + for _, f := range sf.listenerBeforeTrampolines { + if err := platform.MunmapCodeSegment(f); err != nil { + panic(err) + } + } + for _, f := range sf.listenerAfterTrampolines { + if err := platform.MunmapCodeSegment(f); err != nil { + panic(err) + } + } + + sf.memoryGrowExecutable = nil + sf.checkModuleExitCode = nil + sf.stackGrowExecutable = nil + sf.tableGrowExecutable = nil + sf.refFuncExecutable = nil + sf.memoryWait32Executable = nil + sf.memoryWait64Executable = nil + sf.memoryNotifyExecutable = nil + sf.listenerBeforeTrampolines = nil + sf.listenerAfterTrampolines = nil +} + +func executablesFinalizer(exec *executables) { + if len(exec.executable) > 0 { + if err := platform.MunmapCodeSegment(exec.executable); err != nil { + panic(err) + } + } + exec.executable = nil + + for _, f := range exec.entryPreambles { + if err := platform.MunmapCodeSegment(f); err != nil { + panic(err) + } + } + exec.entryPreambles = nil +} + +func mmapExecutable(src []byte) []byte { + executable, err := platform.MmapCodeSegment(len(src)) + if err != nil { + panic(err) + } + + copy(executable, src) + + if runtime.GOARCH == "arm64" { + // On arm64, we cannot give all of rwx at the same time, so we change it to exec. 
+ if err = platform.MprotectRX(executable); err != nil { + panic(err) + } + } + return executable +} + +func (cm *compiledModule) functionIndexOf(addr uintptr) wasm.Index { + addr -= uintptr(unsafe.Pointer(&cm.executable[0])) + offset := cm.functionOffsets + index := sort.Search(len(offset), func(i int) bool { + return offset[i] > int(addr) + }) + index-- + if index < 0 { + panic("BUG") + } + return wasm.Index(index) +} + +func (e *engine) getListenerTrampolineForType(functionType *wasm.FunctionType) (before, after *byte) { + e.mux.Lock() + defer e.mux.Unlock() + + beforeBuf, ok := e.sharedFunctions.listenerBeforeTrampolines[functionType] + afterBuf := e.sharedFunctions.listenerAfterTrampolines[functionType] + if ok { + return &beforeBuf[0], &afterBuf[0] + } + + beforeSig, afterSig := frontend.SignatureForListener(functionType) + + e.be.Init() + buf := e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeCallListenerBefore, beforeSig, false) + beforeBuf = mmapExecutable(buf) + + e.be.Init() + buf = e.machine.CompileGoFunctionTrampoline(wazevoapi.ExitCodeCallListenerAfter, afterSig, false) + afterBuf = mmapExecutable(buf) + + e.sharedFunctions.listenerBeforeTrampolines[functionType] = beforeBuf + e.sharedFunctions.listenerAfterTrampolines[functionType] = afterBuf + return &beforeBuf[0], &afterBuf[0] +} + +func (cm *compiledModule) getSourceOffset(pc uintptr) uint64 { + offsets := cm.sourceMap.executableOffsets + if len(offsets) == 0 { + return 0 + } + + index := sort.Search(len(offsets), func(i int) bool { + return offsets[i] >= pc + }) + + index-- + if index < 0 { + return 0 + } + return cm.sourceMap.wasmBinaryOffsets[index] +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/engine_cache.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/engine_cache.go new file mode 100644 index 000000000..f7c0450ae --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/engine_cache.go @@ -0,0 +1,296 @@ +package wazevo + +import ( + "bytes" + "context" + "crypto/sha256" + "encoding/binary" + "fmt" + "hash/crc32" + "io" + "runtime" + "unsafe" + + "github.com/tetratelabs/wazero/experimental" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" + "github.com/tetratelabs/wazero/internal/filecache" + "github.com/tetratelabs/wazero/internal/platform" + "github.com/tetratelabs/wazero/internal/u32" + "github.com/tetratelabs/wazero/internal/u64" + "github.com/tetratelabs/wazero/internal/wasm" +) + +var crc = crc32.MakeTable(crc32.Castagnoli) + +// fileCacheKey returns a key for the file cache. +// In order to avoid collisions with the existing compiler, we do not use m.ID directly, +// but instead we rehash it with magic. 
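+// In other words, the key is effectively sha256(m.ID || magic) rather than m.ID itself, so entries written by this engine should never collide with entries the older compiler stores under the same filecache.Cache.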
+func fileCacheKey(m *wasm.Module) (ret filecache.Key) { + s := sha256.New() + s.Write(m.ID[:]) + s.Write(magic) + s.Sum(ret[:0]) + return +} + +func (e *engine) addCompiledModule(module *wasm.Module, cm *compiledModule) (err error) { + e.addCompiledModuleToMemory(module, cm) + if !module.IsHostModule && e.fileCache != nil { + err = e.addCompiledModuleToCache(module, cm) + } + return +} + +func (e *engine) getCompiledModule(module *wasm.Module, listeners []experimental.FunctionListener, ensureTermination bool) (cm *compiledModule, ok bool, err error) { + cm, ok = e.getCompiledModuleFromMemory(module) + if ok { + return + } + cm, ok, err = e.getCompiledModuleFromCache(module) + if ok { + cm.parent = e + cm.module = module + cm.sharedFunctions = e.sharedFunctions + cm.ensureTermination = ensureTermination + cm.offsets = wazevoapi.NewModuleContextOffsetData(module, len(listeners) > 0) + if len(listeners) > 0 { + cm.listeners = listeners + cm.listenerBeforeTrampolines = make([]*byte, len(module.TypeSection)) + cm.listenerAfterTrampolines = make([]*byte, len(module.TypeSection)) + for i := range module.TypeSection { + typ := &module.TypeSection[i] + before, after := e.getListenerTrampolineForType(typ) + cm.listenerBeforeTrampolines[i] = before + cm.listenerAfterTrampolines[i] = after + } + } + e.addCompiledModuleToMemory(module, cm) + ssaBuilder := ssa.NewBuilder() + machine := newMachine() + be := backend.NewCompiler(context.Background(), machine, ssaBuilder) + cm.executables.compileEntryPreambles(module, machine, be) + + // Set the finalizer. + e.setFinalizer(cm.executables, executablesFinalizer) + } + return +} + +func (e *engine) addCompiledModuleToMemory(m *wasm.Module, cm *compiledModule) { + e.mux.Lock() + defer e.mux.Unlock() + e.compiledModules[m.ID] = cm + if len(cm.executable) > 0 { + e.addCompiledModuleToSortedList(cm) + } +} + +func (e *engine) getCompiledModuleFromMemory(module *wasm.Module) (cm *compiledModule, ok bool) { + e.mux.RLock() + defer e.mux.RUnlock() + cm, ok = e.compiledModules[module.ID] + return +} + +func (e *engine) addCompiledModuleToCache(module *wasm.Module, cm *compiledModule) (err error) { + if e.fileCache == nil || module.IsHostModule { + return + } + err = e.fileCache.Add(fileCacheKey(module), serializeCompiledModule(e.wazeroVersion, cm)) + return +} + +func (e *engine) getCompiledModuleFromCache(module *wasm.Module) (cm *compiledModule, hit bool, err error) { + if e.fileCache == nil || module.IsHostModule { + return + } + + // Check if the entries exist in the external cache. + var cached io.ReadCloser + cached, hit, err = e.fileCache.Get(fileCacheKey(module)) + if !hit || err != nil { + return + } + + // Otherwise, we hit the cache on external cache. + // We retrieve *code structures from `cached`. + var staleCache bool + // Note: cached.Close is ensured to be called in deserializeCodes. + cm, staleCache, err = deserializeCompiledModule(e.wazeroVersion, cached) + if err != nil { + hit = false + return + } else if staleCache { + return nil, false, e.fileCache.Delete(fileCacheKey(module)) + } + return +} + +var magic = []byte{'W', 'A', 'Z', 'E', 'V', 'O'} + +func serializeCompiledModule(wazeroVersion string, cm *compiledModule) io.Reader { + buf := bytes.NewBuffer(nil) + // First 6 byte: WAZEVO header. + buf.Write(magic) + // Next 1 byte: length of version: + buf.WriteByte(byte(len(wazeroVersion))) + // Version of wazero. + buf.WriteString(wazeroVersion) + // Number of *code (== locally defined functions in the module): 4 bytes. 
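+ // (For reference, the overall layout written by this function is, with integers little-endian:
+ //   magic (6 bytes) | version length (1 byte) | version string
+ //   | function count (u32) | one u64 offset per function
+ //   | executable length (u64) | executable bytes | CRC32 (Castagnoli) of executable (u32)
+ //   | source-map flag (1 byte) | if 1: entry count (u64) then (wasm offset u64, executable-relative offset u64) pairs.
+ // This is just a summary of the writes below; deserializeCompiledModule reads the same layout back.)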
+ buf.Write(u32.LeBytes(uint32(len(cm.functionOffsets)))) + for _, offset := range cm.functionOffsets { + // The offset of this function in the executable (8 bytes). + buf.Write(u64.LeBytes(uint64(offset))) + } + // The length of code segment (8 bytes). + buf.Write(u64.LeBytes(uint64(len(cm.executable)))) + // Append the native code. + buf.Write(cm.executable) + // Append checksum. + checksum := crc32.Checksum(cm.executable, crc) + buf.Write(u32.LeBytes(checksum)) + if sm := cm.sourceMap; len(sm.executableOffsets) > 0 { + buf.WriteByte(1) // indicates that source map is present. + l := len(sm.wasmBinaryOffsets) + buf.Write(u64.LeBytes(uint64(l))) + executableAddr := uintptr(unsafe.Pointer(&cm.executable[0])) + for i := 0; i < l; i++ { + buf.Write(u64.LeBytes(sm.wasmBinaryOffsets[i])) + // executableOffsets is absolute address, so we need to subtract executableAddr. + buf.Write(u64.LeBytes(uint64(sm.executableOffsets[i] - executableAddr))) + } + } else { + buf.WriteByte(0) // indicates that source map is not present. + } + return bytes.NewReader(buf.Bytes()) +} + +func deserializeCompiledModule(wazeroVersion string, reader io.ReadCloser) (cm *compiledModule, staleCache bool, err error) { + defer reader.Close() + cacheHeaderSize := len(magic) + 1 /* version size */ + len(wazeroVersion) + 4 /* number of functions */ + + // Read the header before the native code. + header := make([]byte, cacheHeaderSize) + n, err := reader.Read(header) + if err != nil { + return nil, false, fmt.Errorf("compilationcache: error reading header: %v", err) + } + + if n != cacheHeaderSize { + return nil, false, fmt.Errorf("compilationcache: invalid header length: %d", n) + } + + if !bytes.Equal(header[:len(magic)], magic) { + return nil, false, fmt.Errorf( + "compilationcache: invalid magic number: got %s but want %s", magic, header[:len(magic)]) + } + + // Check the version compatibility. + versionSize := int(header[len(magic)]) + + cachedVersionBegin, cachedVersionEnd := len(magic)+1, len(magic)+1+versionSize + if cachedVersionEnd >= len(header) { + staleCache = true + return + } else if cachedVersion := string(header[cachedVersionBegin:cachedVersionEnd]); cachedVersion != wazeroVersion { + staleCache = true + return + } + + functionsNum := binary.LittleEndian.Uint32(header[len(header)-4:]) + cm = &compiledModule{functionOffsets: make([]int, functionsNum), executables: &executables{}} + + var eightBytes [8]byte + for i := uint32(0); i < functionsNum; i++ { + // Read the offset of each function in the executable. 
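+ // Each offset was written as a little-endian u64 relative to the start of the executable
+ // (see serializeCompiledModule), so it stays valid after the code is mmapped at a new address below.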
+ var offset uint64 + if offset, err = readUint64(reader, &eightBytes); err != nil { + err = fmt.Errorf("compilationcache: error reading func[%d] executable offset: %v", i, err) + return + } + cm.functionOffsets[i] = int(offset) + } + + executableLen, err := readUint64(reader, &eightBytes) + if err != nil { + err = fmt.Errorf("compilationcache: error reading executable size: %v", err) + return + } + + if executableLen > 0 { + executable, err := platform.MmapCodeSegment(int(executableLen)) + if err != nil { + err = fmt.Errorf("compilationcache: error mmapping executable (len=%d): %v", executableLen, err) + return nil, false, err + } + + _, err = io.ReadFull(reader, executable) + if err != nil { + err = fmt.Errorf("compilationcache: error reading executable (len=%d): %v", executableLen, err) + return nil, false, err + } + + expected := crc32.Checksum(executable, crc) + if _, err = io.ReadFull(reader, eightBytes[:4]); err != nil { + return nil, false, fmt.Errorf("compilationcache: could not read checksum: %v", err) + } else if checksum := binary.LittleEndian.Uint32(eightBytes[:4]); expected != checksum { + return nil, false, fmt.Errorf("compilationcache: checksum mismatch (expected %d, got %d)", expected, checksum) + } + + if runtime.GOARCH == "arm64" { + // On arm64, we cannot give all of rwx at the same time, so we change it to exec. + if err = platform.MprotectRX(executable); err != nil { + return nil, false, err + } + } + cm.executable = executable + } + + if _, err := io.ReadFull(reader, eightBytes[:1]); err != nil { + return nil, false, fmt.Errorf("compilationcache: error reading source map presence: %v", err) + } + + if eightBytes[0] == 1 { + sm := &cm.sourceMap + sourceMapLen, err := readUint64(reader, &eightBytes) + if err != nil { + err = fmt.Errorf("compilationcache: error reading source map length: %v", err) + return nil, false, err + } + executableOffset := uintptr(unsafe.Pointer(&cm.executable[0])) + for i := uint64(0); i < sourceMapLen; i++ { + wasmBinaryOffset, err := readUint64(reader, &eightBytes) + if err != nil { + err = fmt.Errorf("compilationcache: error reading source map[%d] wasm binary offset: %v", i, err) + return nil, false, err + } + executableRelativeOffset, err := readUint64(reader, &eightBytes) + if err != nil { + err = fmt.Errorf("compilationcache: error reading source map[%d] executable offset: %v", i, err) + return nil, false, err + } + sm.wasmBinaryOffsets = append(sm.wasmBinaryOffsets, wasmBinaryOffset) + // executableOffsets is absolute address, so we need to add executableOffset. + sm.executableOffsets = append(sm.executableOffsets, uintptr(executableRelativeOffset)+executableOffset) + } + } + return +} + +// readUint64 strictly reads an uint64 in little-endian byte order, using the +// given array as a buffer. This returns io.EOF if less than 8 bytes were read. +func readUint64(reader io.Reader, b *[8]byte) (uint64, error) { + s := b[0:8] + n, err := reader.Read(s) + if err != nil { + return 0, err + } else if n < 8 { // more strict than reader.Read + return 0, io.EOF + } + + // Read the u64 from the underlying buffer. 
+ ret := binary.LittleEndian.Uint64(s) + return ret, nil +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/entrypoint_amd64.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/entrypoint_amd64.go new file mode 100644 index 000000000..18f60af3a --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/entrypoint_amd64.go @@ -0,0 +1,15 @@ +//go:build amd64 && !tinygo + +package wazevo + +import _ "unsafe" + +// entrypoint is implemented by the backend. +// +//go:linkname entrypoint github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64.entrypoint +func entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultStackPtr *uint64, goAllocatedStackSlicePtr uintptr) + +// entrypoint is implemented by the backend. +// +//go:linkname afterGoFunctionCallEntrypoint github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64.afterGoFunctionCallEntrypoint +func afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/entrypoint_arm64.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/entrypoint_arm64.go new file mode 100644 index 000000000..e16d64f65 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/entrypoint_arm64.go @@ -0,0 +1,15 @@ +//go:build arm64 && !tinygo + +package wazevo + +import _ "unsafe" + +// entrypoint is implemented by the backend. +// +//go:linkname entrypoint github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64.entrypoint +func entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultStackPtr *uint64, goAllocatedStackSlicePtr uintptr) + +// entrypoint is implemented by the backend. +// +//go:linkname afterGoFunctionCallEntrypoint github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64.afterGoFunctionCallEntrypoint +func afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr) diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/entrypoint_others.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/entrypoint_others.go new file mode 100644 index 000000000..8f9d64b2b --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/entrypoint_others.go @@ -0,0 +1,15 @@ +//go:build (!arm64 && !amd64) || tinygo + +package wazevo + +import ( + "runtime" +) + +func entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultStackPtr *uint64, goAllocatedStackSlicePtr uintptr) { + panic(runtime.GOARCH) +} + +func afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr) { + panic(runtime.GOARCH) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/frontend.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/frontend.go new file mode 100644 index 000000000..873a35a55 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/frontend.go @@ -0,0 +1,594 @@ +// Package frontend implements the translation of WebAssembly to SSA IR using the ssa package. 
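+//
+// As a rough illustration (not a test case from this package), a Wasm body like
+//
+//	(func (param i32 i32) (result i32)
+//	  local.get 0
+//	  local.get 1
+//	  i32.add)
+//
+// is lowered to an SSA function whose signature has two extra leading i64 parameters
+// (the execution context and module context pointers, see Compiler.LowerToSSA) and whose
+// body is essentially a single Iadd over the block parameters corresponding to the locals,
+// plus the usual return plumbing.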
+package frontend + +import ( + "bytes" + "math" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" + "github.com/tetratelabs/wazero/internal/wasm" +) + +// Compiler is in charge of lowering Wasm to SSA IR, and does the optimization +// on top of it in architecture-independent way. +type Compiler struct { + // Per-module data that is used across all functions. + + m *wasm.Module + offset *wazevoapi.ModuleContextOffsetData + // ssaBuilder is a ssa.Builder used by this frontend. + ssaBuilder ssa.Builder + signatures map[*wasm.FunctionType]*ssa.Signature + listenerSignatures map[*wasm.FunctionType][2]*ssa.Signature + memoryGrowSig ssa.Signature + memoryWait32Sig ssa.Signature + memoryWait64Sig ssa.Signature + memoryNotifySig ssa.Signature + checkModuleExitCodeSig ssa.Signature + tableGrowSig ssa.Signature + refFuncSig ssa.Signature + memmoveSig ssa.Signature + ensureTermination bool + + // Followings are reset by per function. + + // wasmLocalToVariable maps the index (considered as wasm.Index of locals) + // to the corresponding ssa.Variable. + wasmLocalToVariable [] /* local index to */ ssa.Variable + wasmLocalFunctionIndex wasm.Index + wasmFunctionTypeIndex wasm.Index + wasmFunctionTyp *wasm.FunctionType + wasmFunctionLocalTypes []wasm.ValueType + wasmFunctionBody []byte + wasmFunctionBodyOffsetInCodeSection uint64 + memoryBaseVariable, memoryLenVariable ssa.Variable + needMemory bool + memoryShared bool + globalVariables []ssa.Variable + globalVariablesTypes []ssa.Type + mutableGlobalVariablesIndexes []wasm.Index // index to ^. + needListener bool + needSourceOffsetInfo bool + // br is reused during lowering. + br *bytes.Reader + loweringState loweringState + + knownSafeBounds [] /* ssa.ValueID to */ knownSafeBound + knownSafeBoundsSet []ssa.ValueID + + knownSafeBoundsAtTheEndOfBlocks [] /* ssa.BlockID to */ knownSafeBoundsAtTheEndOfBlock + varLengthKnownSafeBoundWithIDPool wazevoapi.VarLengthPool[knownSafeBoundWithID] + + execCtxPtrValue, moduleCtxPtrValue ssa.Value + + // Following are reused for the known safe bounds analysis. + + pointers []int + bounds [][]knownSafeBoundWithID +} + +type ( + // knownSafeBound represents a known safe bound for a value. + knownSafeBound struct { + // bound is a constant upper bound for the value. + bound uint64 + // absoluteAddr is the absolute address of the value. + absoluteAddr ssa.Value + } + // knownSafeBoundWithID is a knownSafeBound with the ID of the value. + knownSafeBoundWithID struct { + knownSafeBound + id ssa.ValueID + } + knownSafeBoundsAtTheEndOfBlock = wazevoapi.VarLength[knownSafeBoundWithID] +) + +var knownSafeBoundsAtTheEndOfBlockNil = wazevoapi.NewNilVarLength[knownSafeBoundWithID]() + +// NewFrontendCompiler returns a frontend Compiler. 
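+// A single Compiler is reused for all functions in a module: construct it once, then for each
+// function call Init followed by LowerToSSA. A rough sketch (argument values are placeholders):
+//
+//	fc := NewFrontendCompiler(m, ssa.NewBuilder(), offsets, false, false, false)
+//	fc.Init(funcIdx, typeIdx, typ, localTypes, body, false, bodyOffset)
+//	fc.LowerToSSA()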
+func NewFrontendCompiler(m *wasm.Module, ssaBuilder ssa.Builder, offset *wazevoapi.ModuleContextOffsetData, ensureTermination bool, listenerOn bool, sourceInfo bool) *Compiler { + c := &Compiler{ + m: m, + ssaBuilder: ssaBuilder, + br: bytes.NewReader(nil), + offset: offset, + ensureTermination: ensureTermination, + needSourceOffsetInfo: sourceInfo, + varLengthKnownSafeBoundWithIDPool: wazevoapi.NewVarLengthPool[knownSafeBoundWithID](), + } + c.declareSignatures(listenerOn) + return c +} + +func (c *Compiler) declareSignatures(listenerOn bool) { + m := c.m + c.signatures = make(map[*wasm.FunctionType]*ssa.Signature, len(m.TypeSection)+2) + if listenerOn { + c.listenerSignatures = make(map[*wasm.FunctionType][2]*ssa.Signature, len(m.TypeSection)) + } + for i := range m.TypeSection { + wasmSig := &m.TypeSection[i] + sig := SignatureForWasmFunctionType(wasmSig) + sig.ID = ssa.SignatureID(i) + c.signatures[wasmSig] = &sig + c.ssaBuilder.DeclareSignature(&sig) + + if listenerOn { + beforeSig, afterSig := SignatureForListener(wasmSig) + beforeSig.ID = ssa.SignatureID(i) + ssa.SignatureID(len(m.TypeSection)) + afterSig.ID = ssa.SignatureID(i) + ssa.SignatureID(len(m.TypeSection))*2 + c.listenerSignatures[wasmSig] = [2]*ssa.Signature{beforeSig, afterSig} + c.ssaBuilder.DeclareSignature(beforeSig) + c.ssaBuilder.DeclareSignature(afterSig) + } + } + + begin := ssa.SignatureID(len(m.TypeSection)) + if listenerOn { + begin *= 3 + } + c.memoryGrowSig = ssa.Signature{ + ID: begin, + // Takes execution context and the page size to grow. + Params: []ssa.Type{ssa.TypeI64, ssa.TypeI32}, + // Returns the previous page size. + Results: []ssa.Type{ssa.TypeI32}, + } + c.ssaBuilder.DeclareSignature(&c.memoryGrowSig) + + c.checkModuleExitCodeSig = ssa.Signature{ + ID: c.memoryGrowSig.ID + 1, + // Only takes execution context. + Params: []ssa.Type{ssa.TypeI64}, + } + c.ssaBuilder.DeclareSignature(&c.checkModuleExitCodeSig) + + c.tableGrowSig = ssa.Signature{ + ID: c.checkModuleExitCodeSig.ID + 1, + Params: []ssa.Type{ssa.TypeI64 /* exec context */, ssa.TypeI32 /* table index */, ssa.TypeI32 /* num */, ssa.TypeI64 /* ref */}, + // Returns the previous size. + Results: []ssa.Type{ssa.TypeI32}, + } + c.ssaBuilder.DeclareSignature(&c.tableGrowSig) + + c.refFuncSig = ssa.Signature{ + ID: c.tableGrowSig.ID + 1, + Params: []ssa.Type{ssa.TypeI64 /* exec context */, ssa.TypeI32 /* func index */}, + // Returns the function reference. + Results: []ssa.Type{ssa.TypeI64}, + } + c.ssaBuilder.DeclareSignature(&c.refFuncSig) + + c.memmoveSig = ssa.Signature{ + ID: c.refFuncSig.ID + 1, + // dst, src, and the byte count. + Params: []ssa.Type{ssa.TypeI64, ssa.TypeI64, ssa.TypeI64}, + } + + c.ssaBuilder.DeclareSignature(&c.memmoveSig) + + c.memoryWait32Sig = ssa.Signature{ + ID: c.memmoveSig.ID + 1, + // exec context, timeout, expected, addr + Params: []ssa.Type{ssa.TypeI64, ssa.TypeI64, ssa.TypeI32, ssa.TypeI64}, + // Returns the status. + Results: []ssa.Type{ssa.TypeI32}, + } + c.ssaBuilder.DeclareSignature(&c.memoryWait32Sig) + + c.memoryWait64Sig = ssa.Signature{ + ID: c.memoryWait32Sig.ID + 1, + // exec context, timeout, expected, addr + Params: []ssa.Type{ssa.TypeI64, ssa.TypeI64, ssa.TypeI64, ssa.TypeI64}, + // Returns the status. + Results: []ssa.Type{ssa.TypeI32}, + } + c.ssaBuilder.DeclareSignature(&c.memoryWait64Sig) + + c.memoryNotifySig = ssa.Signature{ + ID: c.memoryWait64Sig.ID + 1, + // exec context, count, addr + Params: []ssa.Type{ssa.TypeI64, ssa.TypeI32, ssa.TypeI64}, + // Returns the number notified. 
+ Results: []ssa.Type{ssa.TypeI32}, + } + c.ssaBuilder.DeclareSignature(&c.memoryNotifySig) +} + +// SignatureForWasmFunctionType returns the ssa.Signature for the given wasm.FunctionType. +func SignatureForWasmFunctionType(typ *wasm.FunctionType) ssa.Signature { + sig := ssa.Signature{ + // +2 to pass moduleContextPtr and executionContextPtr. See the inline comment LowerToSSA. + Params: make([]ssa.Type, len(typ.Params)+2), + Results: make([]ssa.Type, len(typ.Results)), + } + sig.Params[0] = executionContextPtrTyp + sig.Params[1] = moduleContextPtrTyp + for j, typ := range typ.Params { + sig.Params[j+2] = WasmTypeToSSAType(typ) + } + for j, typ := range typ.Results { + sig.Results[j] = WasmTypeToSSAType(typ) + } + return sig +} + +// Init initializes the state of frontendCompiler and make it ready for a next function. +func (c *Compiler) Init(idx, typIndex wasm.Index, typ *wasm.FunctionType, localTypes []wasm.ValueType, body []byte, needListener bool, bodyOffsetInCodeSection uint64) { + c.ssaBuilder.Init(c.signatures[typ]) + c.loweringState.reset() + + c.wasmFunctionTypeIndex = typIndex + c.wasmLocalFunctionIndex = idx + c.wasmFunctionTyp = typ + c.wasmFunctionLocalTypes = localTypes + c.wasmFunctionBody = body + c.wasmFunctionBodyOffsetInCodeSection = bodyOffsetInCodeSection + c.needListener = needListener + c.clearSafeBounds() + c.varLengthKnownSafeBoundWithIDPool.Reset() + c.knownSafeBoundsAtTheEndOfBlocks = c.knownSafeBoundsAtTheEndOfBlocks[:0] +} + +// Note: this assumes 64-bit platform (I believe we won't have 32-bit backend ;)). +const executionContextPtrTyp, moduleContextPtrTyp = ssa.TypeI64, ssa.TypeI64 + +// LowerToSSA lowers the current function to SSA function which will be held by ssaBuilder. +// After calling this, the caller will be able to access the SSA info in *Compiler.ssaBuilder. +// +// Note that this only does the naive lowering, and do not do any optimization, instead the caller is expected to do so. +func (c *Compiler) LowerToSSA() { + builder := c.ssaBuilder + + // Set up the entry block. + entryBlock := builder.AllocateBasicBlock() + builder.SetCurrentBlock(entryBlock) + + // Functions always take two parameters in addition to Wasm-level parameters: + // + // 1. executionContextPtr: pointer to the *executionContext in wazevo package. + // This will be used to exit the execution in the face of trap, plus used for host function calls. + // + // 2. moduleContextPtr: pointer to the *moduleContextOpaque in wazevo package. + // This will be used to access memory, etc. Also, this will be used during host function calls. + // + // Note: it's clear that sometimes a function won't need them. For example, + // if the function doesn't trap and doesn't make function call, then + // we might be able to eliminate the parameter. However, if that function + // can be called via call_indirect, then we cannot eliminate because the + // signature won't match with the expected one. + // TODO: maybe there's some way to do this optimization without glitches, but so far I have no clue about the feasibility. + // + // Note: In Wasmtime or many other runtimes, moduleContextPtr is called "vmContext". Also note that `moduleContextPtr` + // is wazero-specific since other runtimes can naturally use the OS-level signal to do this job thanks to the fact that + // they can use native stack vs wazero cannot use Go-routine stack and have to use Go-runtime allocated []byte as a stack. 
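+ //
+ // So the SSA-level signature of every Wasm function compiled here is effectively
+ // (exec_ctx i64, module_ctx i64, <wasm params...>) -> (<wasm results...>),
+ // which is what SignatureForWasmFunctionType constructs above.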
+ c.execCtxPtrValue = entryBlock.AddParam(builder, executionContextPtrTyp) + c.moduleCtxPtrValue = entryBlock.AddParam(builder, moduleContextPtrTyp) + builder.AnnotateValue(c.execCtxPtrValue, "exec_ctx") + builder.AnnotateValue(c.moduleCtxPtrValue, "module_ctx") + + for i, typ := range c.wasmFunctionTyp.Params { + st := WasmTypeToSSAType(typ) + variable := builder.DeclareVariable(st) + value := entryBlock.AddParam(builder, st) + builder.DefineVariable(variable, value, entryBlock) + c.setWasmLocalVariable(wasm.Index(i), variable) + } + c.declareWasmLocals(entryBlock) + c.declareNecessaryVariables() + + c.lowerBody(entryBlock) +} + +// localVariable returns the SSA variable for the given Wasm local index. +func (c *Compiler) localVariable(index wasm.Index) ssa.Variable { + return c.wasmLocalToVariable[index] +} + +func (c *Compiler) setWasmLocalVariable(index wasm.Index, variable ssa.Variable) { + idx := int(index) + if idx >= len(c.wasmLocalToVariable) { + c.wasmLocalToVariable = append(c.wasmLocalToVariable, make([]ssa.Variable, idx+1-len(c.wasmLocalToVariable))...) + } + c.wasmLocalToVariable[idx] = variable +} + +// declareWasmLocals declares the SSA variables for the Wasm locals. +func (c *Compiler) declareWasmLocals(entry ssa.BasicBlock) { + localCount := wasm.Index(len(c.wasmFunctionTyp.Params)) + for i, typ := range c.wasmFunctionLocalTypes { + st := WasmTypeToSSAType(typ) + variable := c.ssaBuilder.DeclareVariable(st) + c.setWasmLocalVariable(wasm.Index(i)+localCount, variable) + + zeroInst := c.ssaBuilder.AllocateInstruction() + switch st { + case ssa.TypeI32: + zeroInst.AsIconst32(0) + case ssa.TypeI64: + zeroInst.AsIconst64(0) + case ssa.TypeF32: + zeroInst.AsF32const(0) + case ssa.TypeF64: + zeroInst.AsF64const(0) + case ssa.TypeV128: + zeroInst.AsVconst(0, 0) + default: + panic("TODO: " + wasm.ValueTypeName(typ)) + } + + c.ssaBuilder.InsertInstruction(zeroInst) + value := zeroInst.Return() + c.ssaBuilder.DefineVariable(variable, value, entry) + } +} + +func (c *Compiler) declareNecessaryVariables() { + if c.needMemory = c.m.MemorySection != nil; c.needMemory { + c.memoryShared = c.m.MemorySection.IsShared + } else if c.needMemory = c.m.ImportMemoryCount > 0; c.needMemory { + for _, imp := range c.m.ImportSection { + if imp.Type == wasm.ExternTypeMemory { + c.memoryShared = imp.DescMem.IsShared + break + } + } + } + + if c.needMemory { + c.memoryBaseVariable = c.ssaBuilder.DeclareVariable(ssa.TypeI64) + c.memoryLenVariable = c.ssaBuilder.DeclareVariable(ssa.TypeI64) + } + + c.globalVariables = c.globalVariables[:0] + c.mutableGlobalVariablesIndexes = c.mutableGlobalVariablesIndexes[:0] + c.globalVariablesTypes = c.globalVariablesTypes[:0] + for _, imp := range c.m.ImportSection { + if imp.Type == wasm.ExternTypeGlobal { + desc := imp.DescGlobal + c.declareWasmGlobal(desc.ValType, desc.Mutable) + } + } + for _, g := range c.m.GlobalSection { + desc := g.Type + c.declareWasmGlobal(desc.ValType, desc.Mutable) + } + + // TODO: add tables. +} + +func (c *Compiler) declareWasmGlobal(typ wasm.ValueType, mutable bool) { + var st ssa.Type + switch typ { + case wasm.ValueTypeI32: + st = ssa.TypeI32 + case wasm.ValueTypeI64, + // Both externref and funcref are represented as I64 since we only support 64-bit platforms. 
+ wasm.ValueTypeExternref, wasm.ValueTypeFuncref: + st = ssa.TypeI64 + case wasm.ValueTypeF32: + st = ssa.TypeF32 + case wasm.ValueTypeF64: + st = ssa.TypeF64 + case wasm.ValueTypeV128: + st = ssa.TypeV128 + default: + panic("TODO: " + wasm.ValueTypeName(typ)) + } + v := c.ssaBuilder.DeclareVariable(st) + index := wasm.Index(len(c.globalVariables)) + c.globalVariables = append(c.globalVariables, v) + c.globalVariablesTypes = append(c.globalVariablesTypes, st) + if mutable { + c.mutableGlobalVariablesIndexes = append(c.mutableGlobalVariablesIndexes, index) + } +} + +// WasmTypeToSSAType converts wasm.ValueType to ssa.Type. +func WasmTypeToSSAType(vt wasm.ValueType) ssa.Type { + switch vt { + case wasm.ValueTypeI32: + return ssa.TypeI32 + case wasm.ValueTypeI64, + // Both externref and funcref are represented as I64 since we only support 64-bit platforms. + wasm.ValueTypeExternref, wasm.ValueTypeFuncref: + return ssa.TypeI64 + case wasm.ValueTypeF32: + return ssa.TypeF32 + case wasm.ValueTypeF64: + return ssa.TypeF64 + case wasm.ValueTypeV128: + return ssa.TypeV128 + default: + panic("TODO: " + wasm.ValueTypeName(vt)) + } +} + +// addBlockParamsFromWasmTypes adds the block parameters to the given block. +func (c *Compiler) addBlockParamsFromWasmTypes(tps []wasm.ValueType, blk ssa.BasicBlock) { + for _, typ := range tps { + st := WasmTypeToSSAType(typ) + blk.AddParam(c.ssaBuilder, st) + } +} + +// formatBuilder outputs the constructed SSA function as a string with a source information. +func (c *Compiler) formatBuilder() string { + return c.ssaBuilder.Format() +} + +// SignatureForListener returns the signatures for the listener functions. +func SignatureForListener(wasmSig *wasm.FunctionType) (*ssa.Signature, *ssa.Signature) { + beforeSig := &ssa.Signature{} + beforeSig.Params = make([]ssa.Type, len(wasmSig.Params)+2) + beforeSig.Params[0] = ssa.TypeI64 // Execution context. + beforeSig.Params[1] = ssa.TypeI32 // Function index. + for i, p := range wasmSig.Params { + beforeSig.Params[i+2] = WasmTypeToSSAType(p) + } + afterSig := &ssa.Signature{} + afterSig.Params = make([]ssa.Type, len(wasmSig.Results)+2) + afterSig.Params[0] = ssa.TypeI64 // Execution context. + afterSig.Params[1] = ssa.TypeI32 // Function index. + for i, p := range wasmSig.Results { + afterSig.Params[i+2] = WasmTypeToSSAType(p) + } + return beforeSig, afterSig +} + +// isBoundSafe returns true if the given value is known to be safe to access up to the given bound. +func (c *Compiler) getKnownSafeBound(v ssa.ValueID) *knownSafeBound { + if int(v) >= len(c.knownSafeBounds) { + return nil + } + return &c.knownSafeBounds[v] +} + +// recordKnownSafeBound records the given safe bound for the given value. +func (c *Compiler) recordKnownSafeBound(v ssa.ValueID, safeBound uint64, absoluteAddr ssa.Value) { + if int(v) >= len(c.knownSafeBounds) { + c.knownSafeBounds = append(c.knownSafeBounds, make([]knownSafeBound, v+1)...) + } + + if exiting := c.knownSafeBounds[v]; exiting.bound == 0 { + c.knownSafeBounds[v] = knownSafeBound{ + bound: safeBound, + absoluteAddr: absoluteAddr, + } + c.knownSafeBoundsSet = append(c.knownSafeBoundsSet, v) + } else if safeBound > exiting.bound { + c.knownSafeBounds[v].bound = safeBound + } +} + +// clearSafeBounds clears the known safe bounds. 
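+// Only the entries recorded in knownSafeBoundsSet are touched, so the cost is proportional
+// to the number of bounds recorded since the previous clear rather than to len(knownSafeBounds).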
+func (c *Compiler) clearSafeBounds() { + for _, v := range c.knownSafeBoundsSet { + ptr := &c.knownSafeBounds[v] + ptr.bound = 0 + ptr.absoluteAddr = ssa.ValueInvalid + } + c.knownSafeBoundsSet = c.knownSafeBoundsSet[:0] +} + +// resetAbsoluteAddressInSafeBounds resets the absolute addresses recorded in the known safe bounds. +func (c *Compiler) resetAbsoluteAddressInSafeBounds() { + for _, v := range c.knownSafeBoundsSet { + ptr := &c.knownSafeBounds[v] + ptr.absoluteAddr = ssa.ValueInvalid + } +} + +func (k *knownSafeBound) valid() bool { + return k != nil && k.bound > 0 +} + +func (c *Compiler) allocateVarLengthValues(_cap int, vs ...ssa.Value) ssa.Values { + builder := c.ssaBuilder + pool := builder.VarLengthPool() + args := pool.Allocate(_cap) + args = args.Append(builder.VarLengthPool(), vs...) + return args +} + +func (c *Compiler) finalizeKnownSafeBoundsAtTheEndOfBlock(bID ssa.BasicBlockID) { + _bID := int(bID) + if l := len(c.knownSafeBoundsAtTheEndOfBlocks); _bID >= l { + c.knownSafeBoundsAtTheEndOfBlocks = append(c.knownSafeBoundsAtTheEndOfBlocks, + make([]knownSafeBoundsAtTheEndOfBlock, _bID+1-len(c.knownSafeBoundsAtTheEndOfBlocks))...) + for i := l; i < len(c.knownSafeBoundsAtTheEndOfBlocks); i++ { + c.knownSafeBoundsAtTheEndOfBlocks[i] = knownSafeBoundsAtTheEndOfBlockNil + } + } + p := &c.varLengthKnownSafeBoundWithIDPool + size := len(c.knownSafeBoundsSet) + allocated := c.varLengthKnownSafeBoundWithIDPool.Allocate(size) + // Sort the known safe bounds by the value ID so that we can use the intersection algorithm in initializeCurrentBlockKnownBounds. + sortSSAValueIDs(c.knownSafeBoundsSet) + for _, vID := range c.knownSafeBoundsSet { + kb := c.knownSafeBounds[vID] + allocated = allocated.Append(p, knownSafeBoundWithID{ + knownSafeBound: kb, + id: vID, + }) + } + c.knownSafeBoundsAtTheEndOfBlocks[bID] = allocated + c.clearSafeBounds() +} + +func (c *Compiler) initializeCurrentBlockKnownBounds() { + currentBlk := c.ssaBuilder.CurrentBlock() + switch preds := currentBlk.Preds(); preds { + case 0: + case 1: + pred := currentBlk.Pred(0).ID() + for _, kb := range c.getKnownSafeBoundsAtTheEndOfBlocks(pred).View() { + // Unless the block is sealed, we cannot assume the absolute address is valid: + // later we might add another predecessor that has no visibility of that value. + addr := ssa.ValueInvalid + if currentBlk.Sealed() { + addr = kb.absoluteAddr + } + c.recordKnownSafeBound(kb.id, kb.bound, addr) + } + default: + c.pointers = c.pointers[:0] + c.bounds = c.bounds[:0] + for i := 0; i < preds; i++ { + c.bounds = append(c.bounds, c.getKnownSafeBoundsAtTheEndOfBlocks(currentBlk.Pred(i).ID()).View()) + c.pointers = append(c.pointers, 0) + } + + // If there are multiple predecessors, we need to find the intersection of the known safe bounds. + + outer: + for { + smallestID := ssa.ValueID(math.MaxUint32) + for i, ptr := range c.pointers { + if ptr >= len(c.bounds[i]) { + break outer + } + cb := &c.bounds[i][ptr] + if id := cb.id; id < smallestID { + smallestID = cb.id + } + } + + // Check if current elements are the same across all lists. + same := true + minBound := uint64(math.MaxUint64) + for i := 0; i < preds; i++ { + cb := &c.bounds[i][c.pointers[i]] + if cb.id != smallestID { + same = false + break + } else { + if cb.bound < minBound { + minBound = cb.bound + } + } + } + + if same { // All elements are the same. + // Absolute address cannot be used in the intersection since the value might be only defined in one of the predecessors. 
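+ //
+ // Tiny worked example (illustrative IDs and bounds): if one predecessor ends with
+ // {v1: 16, v3: 8} and another with {v1: 4, v2: 32}, only v1 survives the merge,
+ // with bound min(16, 4) = 4.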
+ c.recordKnownSafeBound(smallestID, minBound, ssa.ValueInvalid) + } + + // Move pointer(s) for the smallest ID forward (if same, move all). + for i := 0; i < preds; i++ { + cb := &c.bounds[i][c.pointers[i]] + if cb.id == smallestID { + c.pointers[i]++ + } + } + } + } +} + +func (c *Compiler) getKnownSafeBoundsAtTheEndOfBlocks(id ssa.BasicBlockID) knownSafeBoundsAtTheEndOfBlock { + if int(id) >= len(c.knownSafeBoundsAtTheEndOfBlocks) { + return knownSafeBoundsAtTheEndOfBlockNil + } + return c.knownSafeBoundsAtTheEndOfBlocks[id] +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/lower.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/lower.go new file mode 100644 index 000000000..5096a6365 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/lower.go @@ -0,0 +1,4268 @@ +package frontend + +import ( + "encoding/binary" + "fmt" + "math" + "runtime" + "strings" + + "github.com/tetratelabs/wazero/api" + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" + "github.com/tetratelabs/wazero/internal/leb128" + "github.com/tetratelabs/wazero/internal/wasm" +) + +type ( + // loweringState is used to keep the state of lowering. + loweringState struct { + // values holds the values on the Wasm stack. + values []ssa.Value + controlFrames []controlFrame + unreachable bool + unreachableDepth int + tmpForBrTable []uint32 + pc int + } + controlFrame struct { + kind controlFrameKind + // originalStackLen holds the number of values on the Wasm stack + // when start executing this control frame minus params for the block. + originalStackLenWithoutParam int + // blk is the loop header if this is loop, and is the else-block if this is an if frame. + blk, + // followingBlock is the basic block we enter if we reach "end" of block. + followingBlock ssa.BasicBlock + blockType *wasm.FunctionType + // clonedArgs hold the arguments to Else block. + clonedArgs ssa.Values + } + + controlFrameKind byte +) + +// String implements fmt.Stringer for debugging. +func (l *loweringState) String() string { + var str []string + for _, v := range l.values { + str = append(str, fmt.Sprintf("v%v", v.ID())) + } + var frames []string + for i := range l.controlFrames { + frames = append(frames, l.controlFrames[i].kind.String()) + } + return fmt.Sprintf("\n\tunreachable=%v(depth=%d)\n\tstack: %s\n\tcontrol frames: %s", + l.unreachable, l.unreachableDepth, + strings.Join(str, ", "), + strings.Join(frames, ", "), + ) +} + +const ( + controlFrameKindFunction = iota + 1 + controlFrameKindLoop + controlFrameKindIfWithElse + controlFrameKindIfWithoutElse + controlFrameKindBlock +) + +// String implements fmt.Stringer for debugging. +func (k controlFrameKind) String() string { + switch k { + case controlFrameKindFunction: + return "function" + case controlFrameKindLoop: + return "loop" + case controlFrameKindIfWithElse: + return "if_with_else" + case controlFrameKindIfWithoutElse: + return "if_without_else" + case controlFrameKindBlock: + return "block" + default: + panic(k) + } +} + +// isLoop returns true if this is a loop frame. +func (ctrl *controlFrame) isLoop() bool { + return ctrl.kind == controlFrameKindLoop +} + +// reset resets the state of loweringState for reuse. 
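+// The backing slices are truncated rather than reallocated, so their capacity is reused
+// across functions.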
+func (l *loweringState) reset() { + l.values = l.values[:0] + l.controlFrames = l.controlFrames[:0] + l.pc = 0 + l.unreachable = false + l.unreachableDepth = 0 +} + +func (l *loweringState) peek() (ret ssa.Value) { + tail := len(l.values) - 1 + return l.values[tail] +} + +func (l *loweringState) pop() (ret ssa.Value) { + tail := len(l.values) - 1 + ret = l.values[tail] + l.values = l.values[:tail] + return +} + +func (l *loweringState) push(ret ssa.Value) { + l.values = append(l.values, ret) +} + +func (c *Compiler) nPeekDup(n int) ssa.Values { + if n == 0 { + return ssa.ValuesNil + } + + l := c.state() + tail := len(l.values) + + args := c.allocateVarLengthValues(n) + args = args.Append(c.ssaBuilder.VarLengthPool(), l.values[tail-n:tail]...) + return args +} + +func (l *loweringState) ctrlPop() (ret controlFrame) { + tail := len(l.controlFrames) - 1 + ret = l.controlFrames[tail] + l.controlFrames = l.controlFrames[:tail] + return +} + +func (l *loweringState) ctrlPush(ret controlFrame) { + l.controlFrames = append(l.controlFrames, ret) +} + +func (l *loweringState) ctrlPeekAt(n int) (ret *controlFrame) { + tail := len(l.controlFrames) - 1 + return &l.controlFrames[tail-n] +} + +// lowerBody lowers the body of the Wasm function to the SSA form. +func (c *Compiler) lowerBody(entryBlk ssa.BasicBlock) { + c.ssaBuilder.Seal(entryBlk) + + if c.needListener { + c.callListenerBefore() + } + + // Pushes the empty control frame which corresponds to the function return. + c.loweringState.ctrlPush(controlFrame{ + kind: controlFrameKindFunction, + blockType: c.wasmFunctionTyp, + followingBlock: c.ssaBuilder.ReturnBlock(), + }) + + for c.loweringState.pc < len(c.wasmFunctionBody) { + blkBeforeLowering := c.ssaBuilder.CurrentBlock() + c.lowerCurrentOpcode() + blkAfterLowering := c.ssaBuilder.CurrentBlock() + if blkBeforeLowering != blkAfterLowering { + // In Wasm, once a block exits, that means we've done compiling the block. + // Therefore, we finalize the known bounds at the end of the block for the exiting block. + c.finalizeKnownSafeBoundsAtTheEndOfBlock(blkBeforeLowering.ID()) + // After that, we initialize the known bounds for the new compilation target block. + c.initializeCurrentBlockKnownBounds() + } + } +} + +func (c *Compiler) state() *loweringState { + return &c.loweringState +} + +func (c *Compiler) lowerCurrentOpcode() { + op := c.wasmFunctionBody[c.loweringState.pc] + + if c.needSourceOffsetInfo { + c.ssaBuilder.SetCurrentSourceOffset( + ssa.SourceOffset(c.loweringState.pc) + ssa.SourceOffset(c.wasmFunctionBodyOffsetInCodeSection), + ) + } + + builder := c.ssaBuilder + state := c.state() + switch op { + case wasm.OpcodeI32Const: + c := c.readI32s() + if state.unreachable { + break + } + + iconst := builder.AllocateInstruction().AsIconst32(uint32(c)).Insert(builder) + value := iconst.Return() + state.push(value) + case wasm.OpcodeI64Const: + c := c.readI64s() + if state.unreachable { + break + } + iconst := builder.AllocateInstruction().AsIconst64(uint64(c)).Insert(builder) + value := iconst.Return() + state.push(value) + case wasm.OpcodeF32Const: + f32 := c.readF32() + if state.unreachable { + break + } + f32const := builder.AllocateInstruction(). + AsF32const(f32). + Insert(builder). + Return() + state.push(f32const) + case wasm.OpcodeF64Const: + f64 := c.readF64() + if state.unreachable { + break + } + f64const := builder.AllocateInstruction(). + AsF64const(f64). + Insert(builder). 
+ Return() + state.push(f64const) + case wasm.OpcodeI32Add, wasm.OpcodeI64Add: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + iadd := builder.AllocateInstruction() + iadd.AsIadd(x, y) + builder.InsertInstruction(iadd) + value := iadd.Return() + state.push(value) + case wasm.OpcodeI32Sub, wasm.OpcodeI64Sub: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + isub := builder.AllocateInstruction() + isub.AsIsub(x, y) + builder.InsertInstruction(isub) + value := isub.Return() + state.push(value) + case wasm.OpcodeF32Add, wasm.OpcodeF64Add: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + iadd := builder.AllocateInstruction() + iadd.AsFadd(x, y) + builder.InsertInstruction(iadd) + value := iadd.Return() + state.push(value) + case wasm.OpcodeI32Mul, wasm.OpcodeI64Mul: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + imul := builder.AllocateInstruction() + imul.AsImul(x, y) + builder.InsertInstruction(imul) + value := imul.Return() + state.push(value) + case wasm.OpcodeF32Sub, wasm.OpcodeF64Sub: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + isub := builder.AllocateInstruction() + isub.AsFsub(x, y) + builder.InsertInstruction(isub) + value := isub.Return() + state.push(value) + case wasm.OpcodeF32Mul, wasm.OpcodeF64Mul: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + isub := builder.AllocateInstruction() + isub.AsFmul(x, y) + builder.InsertInstruction(isub) + value := isub.Return() + state.push(value) + case wasm.OpcodeF32Div, wasm.OpcodeF64Div: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + isub := builder.AllocateInstruction() + isub.AsFdiv(x, y) + builder.InsertInstruction(isub) + value := isub.Return() + state.push(value) + case wasm.OpcodeF32Max, wasm.OpcodeF64Max: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + isub := builder.AllocateInstruction() + isub.AsFmax(x, y) + builder.InsertInstruction(isub) + value := isub.Return() + state.push(value) + case wasm.OpcodeF32Min, wasm.OpcodeF64Min: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + isub := builder.AllocateInstruction() + isub.AsFmin(x, y) + builder.InsertInstruction(isub) + value := isub.Return() + state.push(value) + case wasm.OpcodeI64Extend8S: + if state.unreachable { + break + } + c.insertIntegerExtend(true, 8, 64) + case wasm.OpcodeI64Extend16S: + if state.unreachable { + break + } + c.insertIntegerExtend(true, 16, 64) + case wasm.OpcodeI64Extend32S, wasm.OpcodeI64ExtendI32S: + if state.unreachable { + break + } + c.insertIntegerExtend(true, 32, 64) + case wasm.OpcodeI64ExtendI32U: + if state.unreachable { + break + } + c.insertIntegerExtend(false, 32, 64) + case wasm.OpcodeI32Extend8S: + if state.unreachable { + break + } + c.insertIntegerExtend(true, 8, 32) + case wasm.OpcodeI32Extend16S: + if state.unreachable { + break + } + c.insertIntegerExtend(true, 16, 32) + case wasm.OpcodeI32Eqz, wasm.OpcodeI64Eqz: + if state.unreachable { + break + } + x := state.pop() + zero := builder.AllocateInstruction() + if op == wasm.OpcodeI32Eqz { + zero.AsIconst32(0) + } else { + zero.AsIconst64(0) + } + builder.InsertInstruction(zero) + icmp := builder.AllocateInstruction(). + AsIcmp(x, zero.Return(), ssa.IntegerCmpCondEqual). + Insert(builder). 
+ Return() + state.push(icmp) + case wasm.OpcodeI32Eq, wasm.OpcodeI64Eq: + if state.unreachable { + break + } + c.insertIcmp(ssa.IntegerCmpCondEqual) + case wasm.OpcodeI32Ne, wasm.OpcodeI64Ne: + if state.unreachable { + break + } + c.insertIcmp(ssa.IntegerCmpCondNotEqual) + case wasm.OpcodeI32LtS, wasm.OpcodeI64LtS: + if state.unreachable { + break + } + c.insertIcmp(ssa.IntegerCmpCondSignedLessThan) + case wasm.OpcodeI32LtU, wasm.OpcodeI64LtU: + if state.unreachable { + break + } + c.insertIcmp(ssa.IntegerCmpCondUnsignedLessThan) + case wasm.OpcodeI32GtS, wasm.OpcodeI64GtS: + if state.unreachable { + break + } + c.insertIcmp(ssa.IntegerCmpCondSignedGreaterThan) + case wasm.OpcodeI32GtU, wasm.OpcodeI64GtU: + if state.unreachable { + break + } + c.insertIcmp(ssa.IntegerCmpCondUnsignedGreaterThan) + case wasm.OpcodeI32LeS, wasm.OpcodeI64LeS: + if state.unreachable { + break + } + c.insertIcmp(ssa.IntegerCmpCondSignedLessThanOrEqual) + case wasm.OpcodeI32LeU, wasm.OpcodeI64LeU: + if state.unreachable { + break + } + c.insertIcmp(ssa.IntegerCmpCondUnsignedLessThanOrEqual) + case wasm.OpcodeI32GeS, wasm.OpcodeI64GeS: + if state.unreachable { + break + } + c.insertIcmp(ssa.IntegerCmpCondSignedGreaterThanOrEqual) + case wasm.OpcodeI32GeU, wasm.OpcodeI64GeU: + if state.unreachable { + break + } + c.insertIcmp(ssa.IntegerCmpCondUnsignedGreaterThanOrEqual) + + case wasm.OpcodeF32Eq, wasm.OpcodeF64Eq: + if state.unreachable { + break + } + c.insertFcmp(ssa.FloatCmpCondEqual) + case wasm.OpcodeF32Ne, wasm.OpcodeF64Ne: + if state.unreachable { + break + } + c.insertFcmp(ssa.FloatCmpCondNotEqual) + case wasm.OpcodeF32Lt, wasm.OpcodeF64Lt: + if state.unreachable { + break + } + c.insertFcmp(ssa.FloatCmpCondLessThan) + case wasm.OpcodeF32Gt, wasm.OpcodeF64Gt: + if state.unreachable { + break + } + c.insertFcmp(ssa.FloatCmpCondGreaterThan) + case wasm.OpcodeF32Le, wasm.OpcodeF64Le: + if state.unreachable { + break + } + c.insertFcmp(ssa.FloatCmpCondLessThanOrEqual) + case wasm.OpcodeF32Ge, wasm.OpcodeF64Ge: + if state.unreachable { + break + } + c.insertFcmp(ssa.FloatCmpCondGreaterThanOrEqual) + case wasm.OpcodeF32Neg, wasm.OpcodeF64Neg: + if state.unreachable { + break + } + x := state.pop() + v := builder.AllocateInstruction().AsFneg(x).Insert(builder).Return() + state.push(v) + case wasm.OpcodeF32Sqrt, wasm.OpcodeF64Sqrt: + if state.unreachable { + break + } + x := state.pop() + v := builder.AllocateInstruction().AsSqrt(x).Insert(builder).Return() + state.push(v) + case wasm.OpcodeF32Abs, wasm.OpcodeF64Abs: + if state.unreachable { + break + } + x := state.pop() + v := builder.AllocateInstruction().AsFabs(x).Insert(builder).Return() + state.push(v) + case wasm.OpcodeF32Copysign, wasm.OpcodeF64Copysign: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + v := builder.AllocateInstruction().AsFcopysign(x, y).Insert(builder).Return() + state.push(v) + + case wasm.OpcodeF32Ceil, wasm.OpcodeF64Ceil: + if state.unreachable { + break + } + x := state.pop() + v := builder.AllocateInstruction().AsCeil(x).Insert(builder).Return() + state.push(v) + case wasm.OpcodeF32Floor, wasm.OpcodeF64Floor: + if state.unreachable { + break + } + x := state.pop() + v := builder.AllocateInstruction().AsFloor(x).Insert(builder).Return() + state.push(v) + case wasm.OpcodeF32Trunc, wasm.OpcodeF64Trunc: + if state.unreachable { + break + } + x := state.pop() + v := builder.AllocateInstruction().AsTrunc(x).Insert(builder).Return() + state.push(v) + case wasm.OpcodeF32Nearest, wasm.OpcodeF64Nearest: + if 
state.unreachable { + break + } + x := state.pop() + v := builder.AllocateInstruction().AsNearest(x).Insert(builder).Return() + state.push(v) + case wasm.OpcodeI64TruncF64S, wasm.OpcodeI64TruncF32S, + wasm.OpcodeI32TruncF64S, wasm.OpcodeI32TruncF32S, + wasm.OpcodeI64TruncF64U, wasm.OpcodeI64TruncF32U, + wasm.OpcodeI32TruncF64U, wasm.OpcodeI32TruncF32U: + if state.unreachable { + break + } + ret := builder.AllocateInstruction().AsFcvtToInt( + state.pop(), + c.execCtxPtrValue, + op == wasm.OpcodeI64TruncF64S || op == wasm.OpcodeI64TruncF32S || op == wasm.OpcodeI32TruncF32S || op == wasm.OpcodeI32TruncF64S, + op == wasm.OpcodeI64TruncF64S || op == wasm.OpcodeI64TruncF32S || op == wasm.OpcodeI64TruncF64U || op == wasm.OpcodeI64TruncF32U, + false, + ).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeMiscPrefix: + state.pc++ + // A misc opcode is encoded as an unsigned variable 32-bit integer. + miscOpUint, num, err := leb128.LoadUint32(c.wasmFunctionBody[state.pc:]) + if err != nil { + // In normal conditions this should never happen because the function has passed validation. + panic(fmt.Sprintf("failed to read misc opcode: %v", err)) + } + state.pc += int(num - 1) + miscOp := wasm.OpcodeMisc(miscOpUint) + switch miscOp { + case wasm.OpcodeMiscI64TruncSatF64S, wasm.OpcodeMiscI64TruncSatF32S, + wasm.OpcodeMiscI32TruncSatF64S, wasm.OpcodeMiscI32TruncSatF32S, + wasm.OpcodeMiscI64TruncSatF64U, wasm.OpcodeMiscI64TruncSatF32U, + wasm.OpcodeMiscI32TruncSatF64U, wasm.OpcodeMiscI32TruncSatF32U: + if state.unreachable { + break + } + ret := builder.AllocateInstruction().AsFcvtToInt( + state.pop(), + c.execCtxPtrValue, + miscOp == wasm.OpcodeMiscI64TruncSatF64S || miscOp == wasm.OpcodeMiscI64TruncSatF32S || miscOp == wasm.OpcodeMiscI32TruncSatF32S || miscOp == wasm.OpcodeMiscI32TruncSatF64S, + miscOp == wasm.OpcodeMiscI64TruncSatF64S || miscOp == wasm.OpcodeMiscI64TruncSatF32S || miscOp == wasm.OpcodeMiscI64TruncSatF64U || miscOp == wasm.OpcodeMiscI64TruncSatF32U, + true, + ).Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeMiscTableSize: + tableIndex := c.readI32u() + if state.unreachable { + break + } + + // Load the table. + loadTableInstancePtr := builder.AllocateInstruction() + loadTableInstancePtr.AsLoad(c.moduleCtxPtrValue, c.offset.TableOffset(int(tableIndex)).U32(), ssa.TypeI64) + builder.InsertInstruction(loadTableInstancePtr) + tableInstancePtr := loadTableInstancePtr.Return() + + // Load the table's length. + loadTableLen := builder.AllocateInstruction(). + AsLoad(tableInstancePtr, tableInstanceLenOffset, ssa.TypeI32). + Insert(builder) + state.push(loadTableLen.Return()) + + case wasm.OpcodeMiscTableGrow: + tableIndex := c.readI32u() + if state.unreachable { + break + } + + c.storeCallerModuleContext() + + tableIndexVal := builder.AllocateInstruction().AsIconst32(tableIndex).Insert(builder).Return() + + num := state.pop() + r := state.pop() + + tableGrowPtr := builder.AllocateInstruction(). + AsLoad(c.execCtxPtrValue, + wazevoapi.ExecutionContextOffsetTableGrowTrampolineAddress.U32(), + ssa.TypeI64, + ).Insert(builder).Return() + + args := c.allocateVarLengthValues(4, c.execCtxPtrValue, tableIndexVal, num, r) + callGrowRet := builder. + AllocateInstruction(). + AsCallIndirect(tableGrowPtr, &c.tableGrowSig, args). + Insert(builder).Return() + state.push(callGrowRet) + + case wasm.OpcodeMiscTableCopy: + dstTableIndex := c.readI32u() + srcTableIndex := c.readI32u() + if state.unreachable { + break + } + + copySize := builder. 
+ AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + srcOffset := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + dstOffset := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + + // Out of bounds check. + dstTableInstancePtr := c.boundsCheckInTable(dstTableIndex, dstOffset, copySize) + srcTableInstancePtr := c.boundsCheckInTable(srcTableIndex, srcOffset, copySize) + + dstTableBaseAddr := c.loadTableBaseAddr(dstTableInstancePtr) + srcTableBaseAddr := c.loadTableBaseAddr(srcTableInstancePtr) + + three := builder.AllocateInstruction().AsIconst64(3).Insert(builder).Return() + + dstOffsetInBytes := builder.AllocateInstruction().AsIshl(dstOffset, three).Insert(builder).Return() + dstAddr := builder.AllocateInstruction().AsIadd(dstTableBaseAddr, dstOffsetInBytes).Insert(builder).Return() + srcOffsetInBytes := builder.AllocateInstruction().AsIshl(srcOffset, three).Insert(builder).Return() + srcAddr := builder.AllocateInstruction().AsIadd(srcTableBaseAddr, srcOffsetInBytes).Insert(builder).Return() + + copySizeInBytes := builder.AllocateInstruction().AsIshl(copySize, three).Insert(builder).Return() + c.callMemmove(dstAddr, srcAddr, copySizeInBytes) + + case wasm.OpcodeMiscMemoryCopy: + state.pc += 2 // +2 to skip two memory indexes which are fixed to zero. + if state.unreachable { + break + } + + copySize := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + srcOffset := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + dstOffset := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + + // Out of bounds check. + memLen := c.getMemoryLenValue(false) + c.boundsCheckInMemory(memLen, dstOffset, copySize) + c.boundsCheckInMemory(memLen, srcOffset, copySize) + + memBase := c.getMemoryBaseValue(false) + dstAddr := builder.AllocateInstruction().AsIadd(memBase, dstOffset).Insert(builder).Return() + srcAddr := builder.AllocateInstruction().AsIadd(memBase, srcOffset).Insert(builder).Return() + + c.callMemmove(dstAddr, srcAddr, copySize) + + case wasm.OpcodeMiscTableFill: + tableIndex := c.readI32u() + if state.unreachable { + break + } + fillSize := state.pop() + value := state.pop() + offset := state.pop() + + fillSizeExt := builder. + AllocateInstruction().AsUExtend(fillSize, 32, 64).Insert(builder).Return() + offsetExt := builder. + AllocateInstruction().AsUExtend(offset, 32, 64).Insert(builder).Return() + tableInstancePtr := c.boundsCheckInTable(tableIndex, offsetExt, fillSizeExt) + + three := builder.AllocateInstruction().AsIconst64(3).Insert(builder).Return() + offsetInBytes := builder.AllocateInstruction().AsIshl(offsetExt, three).Insert(builder).Return() + fillSizeInBytes := builder.AllocateInstruction().AsIshl(fillSizeExt, three).Insert(builder).Return() + + // Calculate the base address of the table. + tableBaseAddr := c.loadTableBaseAddr(tableInstancePtr) + addr := builder.AllocateInstruction().AsIadd(tableBaseAddr, offsetInBytes).Insert(builder).Return() + + // Prepare the loop and following block. + beforeLoop := builder.AllocateBasicBlock() + loopBlk := builder.AllocateBasicBlock() + loopVar := loopBlk.AddParam(builder, ssa.TypeI64) + followingBlk := builder.AllocateBasicBlock() + + // Uses the copy trick for faster filling buffer like memory.fill, but in this case we copy 8 bytes at a time. 
+ // buf := memoryInst.Buffer[offset : offset+fillSize] + // buf[0:8] = value + // for i := 8; i < fillSize; i *= 2 { Begin with 8 bytes. + // copy(buf[i:], buf[:i]) + // } + + // Insert the jump to the beforeLoop block; If the fillSize is zero, then jump to the following block to skip entire logics. + zero := builder.AllocateInstruction().AsIconst64(0).Insert(builder).Return() + ifFillSizeZero := builder.AllocateInstruction().AsIcmp(fillSizeExt, zero, ssa.IntegerCmpCondEqual). + Insert(builder).Return() + builder.AllocateInstruction().AsBrnz(ifFillSizeZero, ssa.ValuesNil, followingBlk).Insert(builder) + c.insertJumpToBlock(ssa.ValuesNil, beforeLoop) + + // buf[0:8] = value + builder.SetCurrentBlock(beforeLoop) + builder.AllocateInstruction().AsStore(ssa.OpcodeStore, value, addr, 0).Insert(builder) + initValue := builder.AllocateInstruction().AsIconst64(8).Insert(builder).Return() + c.insertJumpToBlock(c.allocateVarLengthValues(1, initValue), loopBlk) + + builder.SetCurrentBlock(loopBlk) + dstAddr := builder.AllocateInstruction().AsIadd(addr, loopVar).Insert(builder).Return() + + // If loopVar*2 > fillSizeInBytes, then count must be fillSizeInBytes-loopVar. + var count ssa.Value + { + loopVarDoubled := builder.AllocateInstruction().AsIadd(loopVar, loopVar).Insert(builder).Return() + loopVarDoubledLargerThanFillSize := builder. + AllocateInstruction().AsIcmp(loopVarDoubled, fillSizeInBytes, ssa.IntegerCmpCondUnsignedGreaterThanOrEqual). + Insert(builder).Return() + diff := builder.AllocateInstruction().AsIsub(fillSizeInBytes, loopVar).Insert(builder).Return() + count = builder.AllocateInstruction().AsSelect(loopVarDoubledLargerThanFillSize, diff, loopVar).Insert(builder).Return() + } + + c.callMemmove(dstAddr, addr, count) + + shiftAmount := builder.AllocateInstruction().AsIconst64(1).Insert(builder).Return() + newLoopVar := builder.AllocateInstruction().AsIshl(loopVar, shiftAmount).Insert(builder).Return() + loopVarLessThanFillSize := builder.AllocateInstruction(). + AsIcmp(newLoopVar, fillSizeInBytes, ssa.IntegerCmpCondUnsignedLessThan).Insert(builder).Return() + + builder.AllocateInstruction(). + AsBrnz(loopVarLessThanFillSize, c.allocateVarLengthValues(1, newLoopVar), loopBlk). + Insert(builder) + + c.insertJumpToBlock(ssa.ValuesNil, followingBlk) + builder.SetCurrentBlock(followingBlk) + + builder.Seal(beforeLoop) + builder.Seal(loopBlk) + builder.Seal(followingBlk) + + case wasm.OpcodeMiscMemoryFill: + state.pc++ // Skip the memory index which is fixed to zero. + if state.unreachable { + break + } + + fillSize := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + value := state.pop() + offset := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + + // Out of bounds check. + c.boundsCheckInMemory(c.getMemoryLenValue(false), offset, fillSize) + + // Calculate the base address: + addr := builder.AllocateInstruction().AsIadd(c.getMemoryBaseValue(false), offset).Insert(builder).Return() + + // Uses the copy trick for faster filling buffer: https://gist.github.com/taylorza/df2f89d5f9ab3ffd06865062a4cf015d + // buf := memoryInst.Buffer[offset : offset+fillSize] + // buf[0] = value + // for i := 1; i < fillSize; i *= 2 { + // copy(buf[i:], buf[:i]) + // } + + // Prepare the loop and following block. 
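+ // (For intuition: the doubling loop below needs only O(log2 fillSize) memmove calls; e.g. a
+ // 32-byte fill copies chunks of 1, 2, 4, 8 and 16 bytes after the initial one-byte store.)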
+ beforeLoop := builder.AllocateBasicBlock() + loopBlk := builder.AllocateBasicBlock() + loopVar := loopBlk.AddParam(builder, ssa.TypeI64) + followingBlk := builder.AllocateBasicBlock() + + // Insert the jump to the beforeLoop block; If the fillSize is zero, then jump to the following block to skip entire logics. + zero := builder.AllocateInstruction().AsIconst64(0).Insert(builder).Return() + ifFillSizeZero := builder.AllocateInstruction().AsIcmp(fillSize, zero, ssa.IntegerCmpCondEqual). + Insert(builder).Return() + builder.AllocateInstruction().AsBrnz(ifFillSizeZero, ssa.ValuesNil, followingBlk).Insert(builder) + c.insertJumpToBlock(ssa.ValuesNil, beforeLoop) + + // buf[0] = value + builder.SetCurrentBlock(beforeLoop) + builder.AllocateInstruction().AsStore(ssa.OpcodeIstore8, value, addr, 0).Insert(builder) + initValue := builder.AllocateInstruction().AsIconst64(1).Insert(builder).Return() + c.insertJumpToBlock(c.allocateVarLengthValues(1, initValue), loopBlk) + + builder.SetCurrentBlock(loopBlk) + dstAddr := builder.AllocateInstruction().AsIadd(addr, loopVar).Insert(builder).Return() + + // If loopVar*2 > fillSizeExt, then count must be fillSizeExt-loopVar. + var count ssa.Value + { + loopVarDoubled := builder.AllocateInstruction().AsIadd(loopVar, loopVar).Insert(builder).Return() + loopVarDoubledLargerThanFillSize := builder. + AllocateInstruction().AsIcmp(loopVarDoubled, fillSize, ssa.IntegerCmpCondUnsignedGreaterThanOrEqual). + Insert(builder).Return() + diff := builder.AllocateInstruction().AsIsub(fillSize, loopVar).Insert(builder).Return() + count = builder.AllocateInstruction().AsSelect(loopVarDoubledLargerThanFillSize, diff, loopVar).Insert(builder).Return() + } + + c.callMemmove(dstAddr, addr, count) + + shiftAmount := builder.AllocateInstruction().AsIconst64(1).Insert(builder).Return() + newLoopVar := builder.AllocateInstruction().AsIshl(loopVar, shiftAmount).Insert(builder).Return() + loopVarLessThanFillSize := builder.AllocateInstruction(). + AsIcmp(newLoopVar, fillSize, ssa.IntegerCmpCondUnsignedLessThan).Insert(builder).Return() + + builder.AllocateInstruction(). + AsBrnz(loopVarLessThanFillSize, c.allocateVarLengthValues(1, newLoopVar), loopBlk). + Insert(builder) + + c.insertJumpToBlock(ssa.ValuesNil, followingBlk) + builder.SetCurrentBlock(followingBlk) + + builder.Seal(beforeLoop) + builder.Seal(loopBlk) + builder.Seal(followingBlk) + + case wasm.OpcodeMiscMemoryInit: + index := c.readI32u() + state.pc++ // Skip the memory index which is fixed to zero. + if state.unreachable { + break + } + + copySize := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + offsetInDataInstance := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + offsetInMemory := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + + dataInstPtr := c.dataOrElementInstanceAddr(index, c.offset.DataInstances1stElement) + + // Bounds check. 
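+ // Both the destination range in linear memory and the source range in the data instance are
+ // validated before any copy, so nothing is written if either range is out of bounds.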
+ c.boundsCheckInMemory(c.getMemoryLenValue(false), offsetInMemory, copySize) + c.boundsCheckInDataOrElementInstance(dataInstPtr, offsetInDataInstance, copySize, wazevoapi.ExitCodeMemoryOutOfBounds) + + dataInstBaseAddr := builder.AllocateInstruction().AsLoad(dataInstPtr, 0, ssa.TypeI64).Insert(builder).Return() + srcAddr := builder.AllocateInstruction().AsIadd(dataInstBaseAddr, offsetInDataInstance).Insert(builder).Return() + + memBase := c.getMemoryBaseValue(false) + dstAddr := builder.AllocateInstruction().AsIadd(memBase, offsetInMemory).Insert(builder).Return() + + c.callMemmove(dstAddr, srcAddr, copySize) + + case wasm.OpcodeMiscTableInit: + elemIndex := c.readI32u() + tableIndex := c.readI32u() + if state.unreachable { + break + } + + copySize := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + offsetInElementInstance := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + offsetInTable := builder. + AllocateInstruction().AsUExtend(state.pop(), 32, 64).Insert(builder).Return() + + elemInstPtr := c.dataOrElementInstanceAddr(elemIndex, c.offset.ElementInstances1stElement) + + // Bounds check. + tableInstancePtr := c.boundsCheckInTable(tableIndex, offsetInTable, copySize) + c.boundsCheckInDataOrElementInstance(elemInstPtr, offsetInElementInstance, copySize, wazevoapi.ExitCodeTableOutOfBounds) + + three := builder.AllocateInstruction().AsIconst64(3).Insert(builder).Return() + // Calculates the destination address in the table. + tableOffsetInBytes := builder.AllocateInstruction().AsIshl(offsetInTable, three).Insert(builder).Return() + tableBaseAddr := c.loadTableBaseAddr(tableInstancePtr) + dstAddr := builder.AllocateInstruction().AsIadd(tableBaseAddr, tableOffsetInBytes).Insert(builder).Return() + + // Calculates the source address in the element instance. + srcOffsetInBytes := builder.AllocateInstruction().AsIshl(offsetInElementInstance, three).Insert(builder).Return() + elemInstBaseAddr := builder.AllocateInstruction().AsLoad(elemInstPtr, 0, ssa.TypeI64).Insert(builder).Return() + srcAddr := builder.AllocateInstruction().AsIadd(elemInstBaseAddr, srcOffsetInBytes).Insert(builder).Return() + + copySizeInBytes := builder.AllocateInstruction().AsIshl(copySize, three).Insert(builder).Return() + c.callMemmove(dstAddr, srcAddr, copySizeInBytes) + + case wasm.OpcodeMiscElemDrop: + index := c.readI32u() + if state.unreachable { + break + } + + c.dropDataOrElementInstance(index, c.offset.ElementInstances1stElement) + + case wasm.OpcodeMiscDataDrop: + index := c.readI32u() + if state.unreachable { + break + } + c.dropDataOrElementInstance(index, c.offset.DataInstances1stElement) + + default: + panic("Unknown MiscOp " + wasm.MiscInstructionName(miscOp)) + } + + case wasm.OpcodeI32ReinterpretF32: + if state.unreachable { + break + } + reinterpret := builder.AllocateInstruction(). + AsBitcast(state.pop(), ssa.TypeI32). + Insert(builder).Return() + state.push(reinterpret) + + case wasm.OpcodeI64ReinterpretF64: + if state.unreachable { + break + } + reinterpret := builder.AllocateInstruction(). + AsBitcast(state.pop(), ssa.TypeI64). + Insert(builder).Return() + state.push(reinterpret) + + case wasm.OpcodeF32ReinterpretI32: + if state.unreachable { + break + } + reinterpret := builder.AllocateInstruction(). + AsBitcast(state.pop(), ssa.TypeF32). + Insert(builder).Return() + state.push(reinterpret) + + case wasm.OpcodeF64ReinterpretI64: + if state.unreachable { + break + } + reinterpret := builder.AllocateInstruction(). 
+ AsBitcast(state.pop(), ssa.TypeF64). + Insert(builder).Return() + state.push(reinterpret) + + case wasm.OpcodeI32DivS, wasm.OpcodeI64DivS: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + result := builder.AllocateInstruction().AsSDiv(x, y, c.execCtxPtrValue).Insert(builder).Return() + state.push(result) + + case wasm.OpcodeI32DivU, wasm.OpcodeI64DivU: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + result := builder.AllocateInstruction().AsUDiv(x, y, c.execCtxPtrValue).Insert(builder).Return() + state.push(result) + + case wasm.OpcodeI32RemS, wasm.OpcodeI64RemS: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + result := builder.AllocateInstruction().AsSRem(x, y, c.execCtxPtrValue).Insert(builder).Return() + state.push(result) + + case wasm.OpcodeI32RemU, wasm.OpcodeI64RemU: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + result := builder.AllocateInstruction().AsURem(x, y, c.execCtxPtrValue).Insert(builder).Return() + state.push(result) + + case wasm.OpcodeI32And, wasm.OpcodeI64And: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + and := builder.AllocateInstruction() + and.AsBand(x, y) + builder.InsertInstruction(and) + value := and.Return() + state.push(value) + case wasm.OpcodeI32Or, wasm.OpcodeI64Or: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + or := builder.AllocateInstruction() + or.AsBor(x, y) + builder.InsertInstruction(or) + value := or.Return() + state.push(value) + case wasm.OpcodeI32Xor, wasm.OpcodeI64Xor: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + xor := builder.AllocateInstruction() + xor.AsBxor(x, y) + builder.InsertInstruction(xor) + value := xor.Return() + state.push(value) + case wasm.OpcodeI32Shl, wasm.OpcodeI64Shl: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + ishl := builder.AllocateInstruction() + ishl.AsIshl(x, y) + builder.InsertInstruction(ishl) + value := ishl.Return() + state.push(value) + case wasm.OpcodeI32ShrU, wasm.OpcodeI64ShrU: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + ishl := builder.AllocateInstruction() + ishl.AsUshr(x, y) + builder.InsertInstruction(ishl) + value := ishl.Return() + state.push(value) + case wasm.OpcodeI32ShrS, wasm.OpcodeI64ShrS: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + ishl := builder.AllocateInstruction() + ishl.AsSshr(x, y) + builder.InsertInstruction(ishl) + value := ishl.Return() + state.push(value) + case wasm.OpcodeI32Rotl, wasm.OpcodeI64Rotl: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + rotl := builder.AllocateInstruction() + rotl.AsRotl(x, y) + builder.InsertInstruction(rotl) + value := rotl.Return() + state.push(value) + case wasm.OpcodeI32Rotr, wasm.OpcodeI64Rotr: + if state.unreachable { + break + } + y, x := state.pop(), state.pop() + rotr := builder.AllocateInstruction() + rotr.AsRotr(x, y) + builder.InsertInstruction(rotr) + value := rotr.Return() + state.push(value) + case wasm.OpcodeI32Clz, wasm.OpcodeI64Clz: + if state.unreachable { + break + } + x := state.pop() + clz := builder.AllocateInstruction() + clz.AsClz(x) + builder.InsertInstruction(clz) + value := clz.Return() + state.push(value) + case wasm.OpcodeI32Ctz, wasm.OpcodeI64Ctz: + if state.unreachable { + break + } + x := state.pop() + ctz := builder.AllocateInstruction() + ctz.AsCtz(x) + builder.InsertInstruction(ctz) + value := ctz.Return() + 
state.push(value) + case wasm.OpcodeI32Popcnt, wasm.OpcodeI64Popcnt: + if state.unreachable { + break + } + x := state.pop() + popcnt := builder.AllocateInstruction() + popcnt.AsPopcnt(x) + builder.InsertInstruction(popcnt) + value := popcnt.Return() + state.push(value) + + case wasm.OpcodeI32WrapI64: + if state.unreachable { + break + } + x := state.pop() + wrap := builder.AllocateInstruction().AsIreduce(x, ssa.TypeI32).Insert(builder).Return() + state.push(wrap) + case wasm.OpcodeGlobalGet: + index := c.readI32u() + if state.unreachable { + break + } + v := c.getWasmGlobalValue(index, false) + state.push(v) + case wasm.OpcodeGlobalSet: + index := c.readI32u() + if state.unreachable { + break + } + v := state.pop() + c.setWasmGlobalValue(index, v) + case wasm.OpcodeLocalGet: + index := c.readI32u() + if state.unreachable { + break + } + variable := c.localVariable(index) + if _, ok := c.m.NonStaticLocals[c.wasmLocalFunctionIndex][index]; ok { + state.push(builder.MustFindValue(variable)) + } else { + // If a local is static, we can simply find it in the entry block which is either a function param + // or a zero value. This fast pass helps to avoid the overhead of searching the entire function plus + // avoid adding unnecessary block arguments. + // TODO: I think this optimization should be done in a SSA pass like passRedundantPhiEliminationOpt, + // but somehow there's some corner cases that it fails to optimize. + state.push(builder.MustFindValueInBlk(variable, c.ssaBuilder.EntryBlock())) + } + case wasm.OpcodeLocalSet: + index := c.readI32u() + if state.unreachable { + break + } + variable := c.localVariable(index) + newValue := state.pop() + builder.DefineVariableInCurrentBB(variable, newValue) + + case wasm.OpcodeLocalTee: + index := c.readI32u() + if state.unreachable { + break + } + variable := c.localVariable(index) + newValue := state.peek() + builder.DefineVariableInCurrentBB(variable, newValue) + + case wasm.OpcodeSelect, wasm.OpcodeTypedSelect: + if op == wasm.OpcodeTypedSelect { + state.pc += 2 // ignores the type which is only needed during validation. + } + + if state.unreachable { + break + } + + cond := state.pop() + v2 := state.pop() + v1 := state.pop() + + sl := builder.AllocateInstruction(). + AsSelect(cond, v1, v2). + Insert(builder). + Return() + state.push(sl) + + case wasm.OpcodeMemorySize: + state.pc++ // skips the memory index. + if state.unreachable { + break + } + + var memSizeInBytes ssa.Value + if c.offset.LocalMemoryBegin < 0 { + memInstPtr := builder.AllocateInstruction(). + AsLoad(c.moduleCtxPtrValue, c.offset.ImportedMemoryBegin.U32(), ssa.TypeI64). + Insert(builder). + Return() + + memSizeInBytes = builder.AllocateInstruction(). + AsLoad(memInstPtr, memoryInstanceBufSizeOffset, ssa.TypeI32). + Insert(builder). + Return() + } else { + memSizeInBytes = builder.AllocateInstruction(). + AsLoad(c.moduleCtxPtrValue, c.offset.LocalMemoryLen().U32(), ssa.TypeI32). + Insert(builder). + Return() + } + + amount := builder.AllocateInstruction() + amount.AsIconst32(uint32(wasm.MemoryPageSizeInBits)) + builder.InsertInstruction(amount) + memSize := builder.AllocateInstruction(). + AsUshr(memSizeInBytes, amount.Return()). + Insert(builder). + Return() + state.push(memSize) + + case wasm.OpcodeMemoryGrow: + state.pc++ // skips the memory index. + if state.unreachable { + break + } + + c.storeCallerModuleContext() + + pages := state.pop() + memoryGrowPtr := builder.AllocateInstruction(). 
+ AsLoad(c.execCtxPtrValue, + wazevoapi.ExecutionContextOffsetMemoryGrowTrampolineAddress.U32(), + ssa.TypeI64, + ).Insert(builder).Return() + + args := c.allocateVarLengthValues(1, c.execCtxPtrValue, pages) + callGrowRet := builder. + AllocateInstruction(). + AsCallIndirect(memoryGrowPtr, &c.memoryGrowSig, args). + Insert(builder).Return() + state.push(callGrowRet) + + // After the memory grow, reload the cached memory base and len. + c.reloadMemoryBaseLen() + + case wasm.OpcodeI32Store, + wasm.OpcodeI64Store, + wasm.OpcodeF32Store, + wasm.OpcodeF64Store, + wasm.OpcodeI32Store8, + wasm.OpcodeI32Store16, + wasm.OpcodeI64Store8, + wasm.OpcodeI64Store16, + wasm.OpcodeI64Store32: + + _, offset := c.readMemArg() + if state.unreachable { + break + } + var opSize uint64 + var opcode ssa.Opcode + switch op { + case wasm.OpcodeI32Store, wasm.OpcodeF32Store: + opcode = ssa.OpcodeStore + opSize = 4 + case wasm.OpcodeI64Store, wasm.OpcodeF64Store: + opcode = ssa.OpcodeStore + opSize = 8 + case wasm.OpcodeI32Store8, wasm.OpcodeI64Store8: + opcode = ssa.OpcodeIstore8 + opSize = 1 + case wasm.OpcodeI32Store16, wasm.OpcodeI64Store16: + opcode = ssa.OpcodeIstore16 + opSize = 2 + case wasm.OpcodeI64Store32: + opcode = ssa.OpcodeIstore32 + opSize = 4 + default: + panic("BUG") + } + + value := state.pop() + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), opSize) + builder.AllocateInstruction(). + AsStore(opcode, value, addr, offset). + Insert(builder) + + case wasm.OpcodeI32Load, + wasm.OpcodeI64Load, + wasm.OpcodeF32Load, + wasm.OpcodeF64Load, + wasm.OpcodeI32Load8S, + wasm.OpcodeI32Load8U, + wasm.OpcodeI32Load16S, + wasm.OpcodeI32Load16U, + wasm.OpcodeI64Load8S, + wasm.OpcodeI64Load8U, + wasm.OpcodeI64Load16S, + wasm.OpcodeI64Load16U, + wasm.OpcodeI64Load32S, + wasm.OpcodeI64Load32U: + _, offset := c.readMemArg() + if state.unreachable { + break + } + + var opSize uint64 + switch op { + case wasm.OpcodeI32Load, wasm.OpcodeF32Load: + opSize = 4 + case wasm.OpcodeI64Load, wasm.OpcodeF64Load: + opSize = 8 + case wasm.OpcodeI32Load8S, wasm.OpcodeI32Load8U: + opSize = 1 + case wasm.OpcodeI32Load16S, wasm.OpcodeI32Load16U: + opSize = 2 + case wasm.OpcodeI64Load8S, wasm.OpcodeI64Load8U: + opSize = 1 + case wasm.OpcodeI64Load16S, wasm.OpcodeI64Load16U: + opSize = 2 + case wasm.OpcodeI64Load32S, wasm.OpcodeI64Load32U: + opSize = 4 + default: + panic("BUG") + } + + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), opSize) + load := builder.AllocateInstruction() + switch op { + case wasm.OpcodeI32Load: + load.AsLoad(addr, offset, ssa.TypeI32) + case wasm.OpcodeI64Load: + load.AsLoad(addr, offset, ssa.TypeI64) + case wasm.OpcodeF32Load: + load.AsLoad(addr, offset, ssa.TypeF32) + case wasm.OpcodeF64Load: + load.AsLoad(addr, offset, ssa.TypeF64) + case wasm.OpcodeI32Load8S: + load.AsExtLoad(ssa.OpcodeSload8, addr, offset, false) + case wasm.OpcodeI32Load8U: + load.AsExtLoad(ssa.OpcodeUload8, addr, offset, false) + case wasm.OpcodeI32Load16S: + load.AsExtLoad(ssa.OpcodeSload16, addr, offset, false) + case wasm.OpcodeI32Load16U: + load.AsExtLoad(ssa.OpcodeUload16, addr, offset, false) + case wasm.OpcodeI64Load8S: + load.AsExtLoad(ssa.OpcodeSload8, addr, offset, true) + case wasm.OpcodeI64Load8U: + load.AsExtLoad(ssa.OpcodeUload8, addr, offset, true) + case wasm.OpcodeI64Load16S: + load.AsExtLoad(ssa.OpcodeSload16, addr, offset, true) + case wasm.OpcodeI64Load16U: + load.AsExtLoad(ssa.OpcodeUload16, addr, offset, true) + case wasm.OpcodeI64Load32S: + 
load.AsExtLoad(ssa.OpcodeSload32, addr, offset, true) + case wasm.OpcodeI64Load32U: + load.AsExtLoad(ssa.OpcodeUload32, addr, offset, true) + default: + panic("BUG") + } + builder.InsertInstruction(load) + state.push(load.Return()) + case wasm.OpcodeBlock: + // Note: we do not need to create a BB for this as that would always have only one predecessor + // which is the current BB, and therefore it's always ok to merge them in any way. + + bt := c.readBlockType() + + if state.unreachable { + state.unreachableDepth++ + break + } + + followingBlk := builder.AllocateBasicBlock() + c.addBlockParamsFromWasmTypes(bt.Results, followingBlk) + + state.ctrlPush(controlFrame{ + kind: controlFrameKindBlock, + originalStackLenWithoutParam: len(state.values) - len(bt.Params), + followingBlock: followingBlk, + blockType: bt, + }) + case wasm.OpcodeLoop: + bt := c.readBlockType() + + if state.unreachable { + state.unreachableDepth++ + break + } + + loopHeader, afterLoopBlock := builder.AllocateBasicBlock(), builder.AllocateBasicBlock() + c.addBlockParamsFromWasmTypes(bt.Params, loopHeader) + c.addBlockParamsFromWasmTypes(bt.Results, afterLoopBlock) + + originalLen := len(state.values) - len(bt.Params) + state.ctrlPush(controlFrame{ + originalStackLenWithoutParam: originalLen, + kind: controlFrameKindLoop, + blk: loopHeader, + followingBlock: afterLoopBlock, + blockType: bt, + }) + + args := c.allocateVarLengthValues(originalLen) + args = args.Append(builder.VarLengthPool(), state.values[originalLen:]...) + + // Insert the jump to the header of loop. + br := builder.AllocateInstruction() + br.AsJump(args, loopHeader) + builder.InsertInstruction(br) + + c.switchTo(originalLen, loopHeader) + + if c.ensureTermination { + checkModuleExitCodePtr := builder.AllocateInstruction(). + AsLoad(c.execCtxPtrValue, + wazevoapi.ExecutionContextOffsetCheckModuleExitCodeTrampolineAddress.U32(), + ssa.TypeI64, + ).Insert(builder).Return() + + args := c.allocateVarLengthValues(1, c.execCtxPtrValue) + builder.AllocateInstruction(). + AsCallIndirect(checkModuleExitCodePtr, &c.checkModuleExitCodeSig, args). + Insert(builder) + } + case wasm.OpcodeIf: + bt := c.readBlockType() + + if state.unreachable { + state.unreachableDepth++ + break + } + + v := state.pop() + thenBlk, elseBlk, followingBlk := builder.AllocateBasicBlock(), builder.AllocateBasicBlock(), builder.AllocateBasicBlock() + + // We do not make the Wasm-level block parameters as SSA-level block params for if-else blocks + // since they won't be PHI and the definition is unique. + + // On the other hand, the following block after if-else-end will likely have + // multiple definitions (one in Then and another in Else blocks). + c.addBlockParamsFromWasmTypes(bt.Results, followingBlk) + + args := c.allocateVarLengthValues(len(bt.Params)) + args = args.Append(builder.VarLengthPool(), state.values[len(state.values)-len(bt.Params):]...) + + // Insert the conditional jump to the Else block. + brz := builder.AllocateInstruction() + brz.AsBrz(v, ssa.ValuesNil, elseBlk) + builder.InsertInstruction(brz) + + // Then, insert the jump to the Then block. + br := builder.AllocateInstruction() + br.AsJump(ssa.ValuesNil, thenBlk) + builder.InsertInstruction(br) + + state.ctrlPush(controlFrame{ + kind: controlFrameKindIfWithoutElse, + originalStackLenWithoutParam: len(state.values) - len(bt.Params), + blk: elseBlk, + followingBlock: followingBlk, + blockType: bt, + clonedArgs: args, + }) + + builder.SetCurrentBlock(thenBlk) + + // Then and Else (if exists) have only one predecessor. 
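+ // Sealing a block tells the SSA builder that its set of predecessors is final, so value
+ // lookups that reach it can be resolved immediately (in the style of on-the-fly SSA
+ // construction). The following block is sealed later, when the matching End is reached.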
+ builder.Seal(thenBlk) + builder.Seal(elseBlk) + case wasm.OpcodeElse: + ifctrl := state.ctrlPeekAt(0) + if unreachable := state.unreachable; unreachable && state.unreachableDepth > 0 { + // If it is currently in unreachable and is a nested if, + // we just remove the entire else block. + break + } + + ifctrl.kind = controlFrameKindIfWithElse + if !state.unreachable { + // If this Then block is currently reachable, we have to insert the branching to the following BB. + followingBlk := ifctrl.followingBlock // == the BB after if-then-else. + args := c.nPeekDup(len(ifctrl.blockType.Results)) + c.insertJumpToBlock(args, followingBlk) + } else { + state.unreachable = false + } + + // Reset the stack so that we can correctly handle the else block. + state.values = state.values[:ifctrl.originalStackLenWithoutParam] + elseBlk := ifctrl.blk + for _, arg := range ifctrl.clonedArgs.View() { + state.push(arg) + } + + builder.SetCurrentBlock(elseBlk) + + case wasm.OpcodeEnd: + if state.unreachableDepth > 0 { + state.unreachableDepth-- + break + } + + ctrl := state.ctrlPop() + followingBlk := ctrl.followingBlock + + unreachable := state.unreachable + if !unreachable { + // Top n-th args will be used as a result of the current control frame. + args := c.nPeekDup(len(ctrl.blockType.Results)) + + // Insert the unconditional branch to the target. + c.insertJumpToBlock(args, followingBlk) + } else { // recover from the unreachable state. + state.unreachable = false + } + + switch ctrl.kind { + case controlFrameKindFunction: + break // This is the very end of function. + case controlFrameKindLoop: + // Loop header block can be reached from any br/br_table contained in the loop, + // so now that we've reached End of it, we can seal it. + builder.Seal(ctrl.blk) + case controlFrameKindIfWithoutElse: + // If this is the end of Then block, we have to emit the empty Else block. + elseBlk := ctrl.blk + builder.SetCurrentBlock(elseBlk) + c.insertJumpToBlock(ctrl.clonedArgs, followingBlk) + } + + builder.Seal(followingBlk) + + // Ready to start translating the following block. + c.switchTo(ctrl.originalStackLenWithoutParam, followingBlk) + + case wasm.OpcodeBr: + labelIndex := c.readI32u() + if state.unreachable { + break + } + + targetBlk, argNum := state.brTargetArgNumFor(labelIndex) + args := c.nPeekDup(argNum) + c.insertJumpToBlock(args, targetBlk) + + state.unreachable = true + + case wasm.OpcodeBrIf: + labelIndex := c.readI32u() + if state.unreachable { + break + } + + v := state.pop() + + targetBlk, argNum := state.brTargetArgNumFor(labelIndex) + args := c.nPeekDup(argNum) + var sealTargetBlk bool + if c.needListener && targetBlk.ReturnBlock() { // In this case, we have to call the listener before returning. + // Save the currently active block. + current := builder.CurrentBlock() + + // Allocate the trampoline block to the return where we call the listener. + targetBlk = builder.AllocateBasicBlock() + builder.SetCurrentBlock(targetBlk) + sealTargetBlk = true + + c.callListenerAfter() + + instr := builder.AllocateInstruction() + instr.AsReturn(args) + builder.InsertInstruction(instr) + + args = ssa.ValuesNil + + // Revert the current block. + builder.SetCurrentBlock(current) + } + + // Insert the conditional jump to the target block. + brnz := builder.AllocateInstruction() + brnz.AsBrnz(v, args, targetBlk) + builder.InsertInstruction(brnz) + + if sealTargetBlk { + builder.Seal(targetBlk) + } + + // Insert the unconditional jump to the Else block which corresponds to after br_if. 
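+ // The resulting CFG for br_if is roughly:
+ //
+ //  (current) --brnz(cond, args)--> target block (branch taken)
+ //  (current) --jump--------------> elseBlk      (fall through; translation continues there)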
+ elseBlk := builder.AllocateBasicBlock()
+ c.insertJumpToBlock(ssa.ValuesNil, elseBlk)
+
+ // Now start translating the instructions after br_if.
+ builder.Seal(elseBlk) // The Else block of br_if has the current block as its only predecessor.
+ builder.SetCurrentBlock(elseBlk)
+
+ case wasm.OpcodeBrTable:
+ labels := state.tmpForBrTable
+ labels = labels[:0]
+ labelCount := c.readI32u()
+ for i := 0; i < int(labelCount); i++ {
+ labels = append(labels, c.readI32u())
+ }
+ labels = append(labels, c.readI32u()) // default label.
+ if state.unreachable {
+ break
+ }
+
+ index := state.pop()
+ if labelCount == 0 { // If this br_table has only the default target, we can just emit an unconditional jump.
+ targetBlk, argNum := state.brTargetArgNumFor(labels[0])
+ args := c.nPeekDup(argNum)
+ c.insertJumpToBlock(args, targetBlk)
+ } else {
+ c.lowerBrTable(labels, index)
+ }
+ state.unreachable = true
+
+ case wasm.OpcodeNop:
+ case wasm.OpcodeReturn:
+ if state.unreachable {
+ break
+ }
+ if c.needListener {
+ c.callListenerAfter()
+ }
+
+ results := c.nPeekDup(c.results())
+ instr := builder.AllocateInstruction()
+
+ instr.AsReturn(results)
+ builder.InsertInstruction(instr)
+ state.unreachable = true
+
+ case wasm.OpcodeUnreachable:
+ if state.unreachable {
+ break
+ }
+ exit := builder.AllocateInstruction()
+ exit.AsExitWithCode(c.execCtxPtrValue, wazevoapi.ExitCodeUnreachable)
+ builder.InsertInstruction(exit)
+ state.unreachable = true
+
+ case wasm.OpcodeCallIndirect:
+ typeIndex := c.readI32u()
+ tableIndex := c.readI32u()
+ if state.unreachable {
+ break
+ }
+ c.lowerCallIndirect(typeIndex, tableIndex)
+
+ case wasm.OpcodeCall:
+ fnIndex := c.readI32u()
+ if state.unreachable {
+ break
+ }
+
+ var typIndex wasm.Index
+ if fnIndex < c.m.ImportFunctionCount {
+ // Before transferring control to the callee, we have to store the current module's moduleContextPtr
+ // into execContext.callerModuleContextPtr in case the callee is a Go function.
+ c.storeCallerModuleContext()
+ var fi int
+ for i := range c.m.ImportSection {
+ imp := &c.m.ImportSection[i]
+ if imp.Type == wasm.ExternTypeFunc {
+ if fi == int(fnIndex) {
+ typIndex = imp.DescFunc
+ break
+ }
+ fi++
+ }
+ }
+ } else {
+ typIndex = c.m.FunctionSection[fnIndex-c.m.ImportFunctionCount]
+ }
+ typ := &c.m.TypeSection[typIndex]
+
+ argN := len(typ.Params)
+ tail := len(state.values) - argN
+ vs := state.values[tail:]
+ state.values = state.values[:tail]
+ args := c.allocateVarLengthValues(2+len(vs), c.execCtxPtrValue)
+
+ sig := c.signatures[typ]
+ call := builder.AllocateInstruction()
+ if fnIndex >= c.m.ImportFunctionCount {
+ args = args.Append(builder.VarLengthPool(), c.moduleCtxPtrValue) // In this case, the callee belongs to the current module itself.
+ args = args.Append(builder.VarLengthPool(), vs...)
+ call.AsCall(FunctionIndexToFuncRef(fnIndex), sig, args)
+ builder.InsertInstruction(call)
+ } else {
+ // In this case, we have to read the address of the imported function from the module context.
+ moduleCtx := c.moduleCtxPtrValue
+ loadFuncPtr, loadModuleCtxPtr := builder.AllocateInstruction(), builder.AllocateInstruction()
+ funcPtrOffset, moduleCtxPtrOffset, _ := c.offset.ImportedFunctionOffset(fnIndex)
+ loadFuncPtr.AsLoad(moduleCtx, funcPtrOffset.U32(), ssa.TypeI64)
+ loadModuleCtxPtr.AsLoad(moduleCtx, moduleCtxPtrOffset.U32(), ssa.TypeI64)
+ builder.InsertInstruction(loadFuncPtr)
+ builder.InsertInstruction(loadModuleCtxPtr)
+
+ args = args.Append(builder.VarLengthPool(), loadModuleCtxPtr.Return())
+ args = args.Append(builder.VarLengthPool(), vs...)
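+ // The imported function is invoked indirectly through the code pointer loaded from this
+ // module's context, and the callee's own module context pointer (loaded above) is passed
+ // right after the execution context so the callee sees its own memory and globals.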
+ call.AsCallIndirect(loadFuncPtr.Return(), sig, args) + builder.InsertInstruction(call) + } + + first, rest := call.Returns() + if first.Valid() { + state.push(first) + } + for _, v := range rest { + state.push(v) + } + + c.reloadAfterCall() + + case wasm.OpcodeDrop: + if state.unreachable { + break + } + _ = state.pop() + case wasm.OpcodeF64ConvertI32S, wasm.OpcodeF64ConvertI64S, wasm.OpcodeF64ConvertI32U, wasm.OpcodeF64ConvertI64U: + if state.unreachable { + break + } + result := builder.AllocateInstruction().AsFcvtFromInt( + state.pop(), + op == wasm.OpcodeF64ConvertI32S || op == wasm.OpcodeF64ConvertI64S, + true, + ).Insert(builder).Return() + state.push(result) + case wasm.OpcodeF32ConvertI32S, wasm.OpcodeF32ConvertI64S, wasm.OpcodeF32ConvertI32U, wasm.OpcodeF32ConvertI64U: + if state.unreachable { + break + } + result := builder.AllocateInstruction().AsFcvtFromInt( + state.pop(), + op == wasm.OpcodeF32ConvertI32S || op == wasm.OpcodeF32ConvertI64S, + false, + ).Insert(builder).Return() + state.push(result) + case wasm.OpcodeF32DemoteF64: + if state.unreachable { + break + } + cvt := builder.AllocateInstruction() + cvt.AsFdemote(state.pop()) + builder.InsertInstruction(cvt) + state.push(cvt.Return()) + case wasm.OpcodeF64PromoteF32: + if state.unreachable { + break + } + cvt := builder.AllocateInstruction() + cvt.AsFpromote(state.pop()) + builder.InsertInstruction(cvt) + state.push(cvt.Return()) + + case wasm.OpcodeVecPrefix: + state.pc++ + vecOp := c.wasmFunctionBody[state.pc] + switch vecOp { + case wasm.OpcodeVecV128Const: + state.pc++ + lo := binary.LittleEndian.Uint64(c.wasmFunctionBody[state.pc:]) + state.pc += 8 + hi := binary.LittleEndian.Uint64(c.wasmFunctionBody[state.pc:]) + state.pc += 7 + if state.unreachable { + break + } + ret := builder.AllocateInstruction().AsVconst(lo, hi).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecV128Load: + _, offset := c.readMemArg() + if state.unreachable { + break + } + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), 16) + load := builder.AllocateInstruction() + load.AsLoad(addr, offset, ssa.TypeV128) + builder.InsertInstruction(load) + state.push(load.Return()) + case wasm.OpcodeVecV128Load8Lane, wasm.OpcodeVecV128Load16Lane, wasm.OpcodeVecV128Load32Lane: + _, offset := c.readMemArg() + state.pc++ + if state.unreachable { + break + } + var lane ssa.VecLane + var loadOp ssa.Opcode + var opSize uint64 + switch vecOp { + case wasm.OpcodeVecV128Load8Lane: + loadOp, lane, opSize = ssa.OpcodeUload8, ssa.VecLaneI8x16, 1 + case wasm.OpcodeVecV128Load16Lane: + loadOp, lane, opSize = ssa.OpcodeUload16, ssa.VecLaneI16x8, 2 + case wasm.OpcodeVecV128Load32Lane: + loadOp, lane, opSize = ssa.OpcodeUload32, ssa.VecLaneI32x4, 4 + } + laneIndex := c.wasmFunctionBody[state.pc] + vector := state.pop() + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), opSize) + load := builder.AllocateInstruction(). + AsExtLoad(loadOp, addr, offset, false). + Insert(builder).Return() + ret := builder.AllocateInstruction(). + AsInsertlane(vector, load, laneIndex, lane). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecV128Load64Lane: + _, offset := c.readMemArg() + state.pc++ + if state.unreachable { + break + } + laneIndex := c.wasmFunctionBody[state.pc] + vector := state.pop() + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), 8) + load := builder.AllocateInstruction(). + AsLoad(addr, offset, ssa.TypeI64). 
+ Insert(builder).Return() + ret := builder.AllocateInstruction(). + AsInsertlane(vector, load, laneIndex, ssa.VecLaneI64x2). + Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeVecV128Load32zero, wasm.OpcodeVecV128Load64zero: + _, offset := c.readMemArg() + if state.unreachable { + break + } + + var scalarType ssa.Type + switch vecOp { + case wasm.OpcodeVecV128Load32zero: + scalarType = ssa.TypeF32 + case wasm.OpcodeVecV128Load64zero: + scalarType = ssa.TypeF64 + } + + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), uint64(scalarType.Size())) + + ret := builder.AllocateInstruction(). + AsVZeroExtLoad(addr, offset, scalarType). + Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeVecV128Load8x8u, wasm.OpcodeVecV128Load8x8s, + wasm.OpcodeVecV128Load16x4u, wasm.OpcodeVecV128Load16x4s, + wasm.OpcodeVecV128Load32x2u, wasm.OpcodeVecV128Load32x2s: + _, offset := c.readMemArg() + if state.unreachable { + break + } + var lane ssa.VecLane + var signed bool + switch vecOp { + case wasm.OpcodeVecV128Load8x8s: + signed = true + fallthrough + case wasm.OpcodeVecV128Load8x8u: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecV128Load16x4s: + signed = true + fallthrough + case wasm.OpcodeVecV128Load16x4u: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecV128Load32x2s: + signed = true + fallthrough + case wasm.OpcodeVecV128Load32x2u: + lane = ssa.VecLaneI32x4 + } + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), 8) + load := builder.AllocateInstruction(). + AsLoad(addr, offset, ssa.TypeF64). + Insert(builder).Return() + ret := builder.AllocateInstruction(). + AsWiden(load, lane, signed, true). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecV128Load8Splat, wasm.OpcodeVecV128Load16Splat, + wasm.OpcodeVecV128Load32Splat, wasm.OpcodeVecV128Load64Splat: + _, offset := c.readMemArg() + if state.unreachable { + break + } + var lane ssa.VecLane + var opSize uint64 + switch vecOp { + case wasm.OpcodeVecV128Load8Splat: + lane, opSize = ssa.VecLaneI8x16, 1 + case wasm.OpcodeVecV128Load16Splat: + lane, opSize = ssa.VecLaneI16x8, 2 + case wasm.OpcodeVecV128Load32Splat: + lane, opSize = ssa.VecLaneI32x4, 4 + case wasm.OpcodeVecV128Load64Splat: + lane, opSize = ssa.VecLaneI64x2, 8 + } + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), opSize) + ret := builder.AllocateInstruction(). + AsLoadSplat(addr, offset, lane). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecV128Store: + _, offset := c.readMemArg() + if state.unreachable { + break + } + value := state.pop() + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), 16) + builder.AllocateInstruction(). + AsStore(ssa.OpcodeStore, value, addr, offset). 
+ Insert(builder) + case wasm.OpcodeVecV128Store8Lane, wasm.OpcodeVecV128Store16Lane, + wasm.OpcodeVecV128Store32Lane, wasm.OpcodeVecV128Store64Lane: + _, offset := c.readMemArg() + state.pc++ + if state.unreachable { + break + } + laneIndex := c.wasmFunctionBody[state.pc] + var storeOp ssa.Opcode + var lane ssa.VecLane + var opSize uint64 + switch vecOp { + case wasm.OpcodeVecV128Store8Lane: + storeOp, lane, opSize = ssa.OpcodeIstore8, ssa.VecLaneI8x16, 1 + case wasm.OpcodeVecV128Store16Lane: + storeOp, lane, opSize = ssa.OpcodeIstore16, ssa.VecLaneI16x8, 2 + case wasm.OpcodeVecV128Store32Lane: + storeOp, lane, opSize = ssa.OpcodeIstore32, ssa.VecLaneI32x4, 4 + case wasm.OpcodeVecV128Store64Lane: + storeOp, lane, opSize = ssa.OpcodeStore, ssa.VecLaneI64x2, 8 + } + vector := state.pop() + baseAddr := state.pop() + addr := c.memOpSetup(baseAddr, uint64(offset), opSize) + value := builder.AllocateInstruction(). + AsExtractlane(vector, laneIndex, lane, false). + Insert(builder).Return() + builder.AllocateInstruction(). + AsStore(storeOp, value, addr, offset). + Insert(builder) + case wasm.OpcodeVecV128Not: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsVbnot(v1).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecV128And: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVband(v1, v2).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecV128AndNot: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVbandnot(v1, v2).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecV128Or: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVbor(v1, v2).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecV128Xor: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVbxor(v1, v2).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecV128Bitselect: + if state.unreachable { + break + } + c := state.pop() + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVbitselect(c, v1, v2).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecV128AnyTrue: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsVanyTrue(v1).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16AllTrue, wasm.OpcodeVecI16x8AllTrue, wasm.OpcodeVecI32x4AllTrue, wasm.OpcodeVecI64x2AllTrue: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16AllTrue: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8AllTrue: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4AllTrue: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2AllTrue: + lane = ssa.VecLaneI64x2 + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsVallTrue(v1, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16BitMask, wasm.OpcodeVecI16x8BitMask, wasm.OpcodeVecI32x4BitMask, wasm.OpcodeVecI64x2BitMask: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16BitMask: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8BitMask: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4BitMask: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2BitMask: + lane = ssa.VecLaneI64x2 + } + v1 := state.pop() + 
ret := builder.AllocateInstruction().AsVhighBits(v1, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16Abs, wasm.OpcodeVecI16x8Abs, wasm.OpcodeVecI32x4Abs, wasm.OpcodeVecI64x2Abs: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16Abs: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8Abs: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4Abs: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2Abs: + lane = ssa.VecLaneI64x2 + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsVIabs(v1, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16Neg, wasm.OpcodeVecI16x8Neg, wasm.OpcodeVecI32x4Neg, wasm.OpcodeVecI64x2Neg: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16Neg: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8Neg: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4Neg: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2Neg: + lane = ssa.VecLaneI64x2 + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsVIneg(v1, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16Popcnt: + if state.unreachable { + break + } + lane := ssa.VecLaneI8x16 + v1 := state.pop() + + ret := builder.AllocateInstruction().AsVIpopcnt(v1, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16Add, wasm.OpcodeVecI16x8Add, wasm.OpcodeVecI32x4Add, wasm.OpcodeVecI64x2Add: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16Add: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8Add: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4Add: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2Add: + lane = ssa.VecLaneI64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVIadd(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16AddSatS, wasm.OpcodeVecI16x8AddSatS: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16AddSatS: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8AddSatS: + lane = ssa.VecLaneI16x8 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVSaddSat(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16AddSatU, wasm.OpcodeVecI16x8AddSatU: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16AddSatU: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8AddSatU: + lane = ssa.VecLaneI16x8 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVUaddSat(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16SubSatS, wasm.OpcodeVecI16x8SubSatS: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16SubSatS: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8SubSatS: + lane = ssa.VecLaneI16x8 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVSsubSat(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16SubSatU, wasm.OpcodeVecI16x8SubSatU: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16SubSatU: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8SubSatU: + lane = ssa.VecLaneI16x8 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVUsubSat(v1, v2, 
lane).Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeVecI8x16Sub, wasm.OpcodeVecI16x8Sub, wasm.OpcodeVecI32x4Sub, wasm.OpcodeVecI64x2Sub: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16Sub: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8Sub: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4Sub: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2Sub: + lane = ssa.VecLaneI64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVIsub(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16MinS, wasm.OpcodeVecI16x8MinS, wasm.OpcodeVecI32x4MinS: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16MinS: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8MinS: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4MinS: + lane = ssa.VecLaneI32x4 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVImin(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16MinU, wasm.OpcodeVecI16x8MinU, wasm.OpcodeVecI32x4MinU: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16MinU: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8MinU: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4MinU: + lane = ssa.VecLaneI32x4 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVUmin(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16MaxS, wasm.OpcodeVecI16x8MaxS, wasm.OpcodeVecI32x4MaxS: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16MaxS: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8MaxS: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4MaxS: + lane = ssa.VecLaneI32x4 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVImax(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16MaxU, wasm.OpcodeVecI16x8MaxU, wasm.OpcodeVecI32x4MaxU: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16MaxU: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8MaxU: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4MaxU: + lane = ssa.VecLaneI32x4 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVUmax(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16AvgrU, wasm.OpcodeVecI16x8AvgrU: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16AvgrU: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8AvgrU: + lane = ssa.VecLaneI16x8 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVAvgRound(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI16x8Mul, wasm.OpcodeVecI32x4Mul, wasm.OpcodeVecI64x2Mul: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI16x8Mul: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4Mul: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2Mul: + lane = ssa.VecLaneI64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVImul(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI16x8Q15mulrSatS: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := 
builder.AllocateInstruction().AsSqmulRoundSat(v1, v2, ssa.VecLaneI16x8).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16Eq, wasm.OpcodeVecI16x8Eq, wasm.OpcodeVecI32x4Eq, wasm.OpcodeVecI64x2Eq: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16Eq: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8Eq: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4Eq: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2Eq: + lane = ssa.VecLaneI64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVIcmp(v1, v2, ssa.IntegerCmpCondEqual, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16Ne, wasm.OpcodeVecI16x8Ne, wasm.OpcodeVecI32x4Ne, wasm.OpcodeVecI64x2Ne: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16Ne: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8Ne: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4Ne: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2Ne: + lane = ssa.VecLaneI64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVIcmp(v1, v2, ssa.IntegerCmpCondNotEqual, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16LtS, wasm.OpcodeVecI16x8LtS, wasm.OpcodeVecI32x4LtS, wasm.OpcodeVecI64x2LtS: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16LtS: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8LtS: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4LtS: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2LtS: + lane = ssa.VecLaneI64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVIcmp(v1, v2, ssa.IntegerCmpCondSignedLessThan, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16LtU, wasm.OpcodeVecI16x8LtU, wasm.OpcodeVecI32x4LtU: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16LtU: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8LtU: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4LtU: + lane = ssa.VecLaneI32x4 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVIcmp(v1, v2, ssa.IntegerCmpCondUnsignedLessThan, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16LeS, wasm.OpcodeVecI16x8LeS, wasm.OpcodeVecI32x4LeS, wasm.OpcodeVecI64x2LeS: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16LeS: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8LeS: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4LeS: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2LeS: + lane = ssa.VecLaneI64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVIcmp(v1, v2, ssa.IntegerCmpCondSignedLessThanOrEqual, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16LeU, wasm.OpcodeVecI16x8LeU, wasm.OpcodeVecI32x4LeU: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16LeU: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8LeU: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4LeU: + lane = ssa.VecLaneI32x4 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). 
+ AsVIcmp(v1, v2, ssa.IntegerCmpCondUnsignedLessThanOrEqual, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16GtS, wasm.OpcodeVecI16x8GtS, wasm.OpcodeVecI32x4GtS, wasm.OpcodeVecI64x2GtS: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16GtS: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8GtS: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4GtS: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2GtS: + lane = ssa.VecLaneI64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVIcmp(v1, v2, ssa.IntegerCmpCondSignedGreaterThan, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16GtU, wasm.OpcodeVecI16x8GtU, wasm.OpcodeVecI32x4GtU: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16GtU: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8GtU: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4GtU: + lane = ssa.VecLaneI32x4 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVIcmp(v1, v2, ssa.IntegerCmpCondUnsignedGreaterThan, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16GeS, wasm.OpcodeVecI16x8GeS, wasm.OpcodeVecI32x4GeS, wasm.OpcodeVecI64x2GeS: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16GeS: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8GeS: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4GeS: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2GeS: + lane = ssa.VecLaneI64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVIcmp(v1, v2, ssa.IntegerCmpCondSignedGreaterThanOrEqual, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16GeU, wasm.OpcodeVecI16x8GeU, wasm.OpcodeVecI32x4GeU: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16GeU: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8GeU: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4GeU: + lane = ssa.VecLaneI32x4 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). 
+ AsVIcmp(v1, v2, ssa.IntegerCmpCondUnsignedGreaterThanOrEqual, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Max, wasm.OpcodeVecF64x2Max: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Max: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Max: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVFmax(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Abs, wasm.OpcodeVecF64x2Abs: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Abs: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Abs: + lane = ssa.VecLaneF64x2 + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsVFabs(v1, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Min, wasm.OpcodeVecF64x2Min: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Min: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Min: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVFmin(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Neg, wasm.OpcodeVecF64x2Neg: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Neg: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Neg: + lane = ssa.VecLaneF64x2 + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsVFneg(v1, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Sqrt, wasm.OpcodeVecF64x2Sqrt: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Sqrt: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Sqrt: + lane = ssa.VecLaneF64x2 + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsVSqrt(v1, lane).Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeVecF32x4Add, wasm.OpcodeVecF64x2Add: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Add: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Add: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVFadd(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Sub, wasm.OpcodeVecF64x2Sub: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Sub: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Sub: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVFsub(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Mul, wasm.OpcodeVecF64x2Mul: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Mul: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Mul: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVFmul(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Div, wasm.OpcodeVecF64x2Div: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Div: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Div: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVFdiv(v1, v2, 
lane).Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeVecI16x8ExtaddPairwiseI8x16S, wasm.OpcodeVecI16x8ExtaddPairwiseI8x16U: + if state.unreachable { + break + } + v := state.pop() + signed := vecOp == wasm.OpcodeVecI16x8ExtaddPairwiseI8x16S + ret := builder.AllocateInstruction().AsExtIaddPairwise(v, ssa.VecLaneI8x16, signed).Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeVecI32x4ExtaddPairwiseI16x8S, wasm.OpcodeVecI32x4ExtaddPairwiseI16x8U: + if state.unreachable { + break + } + v := state.pop() + signed := vecOp == wasm.OpcodeVecI32x4ExtaddPairwiseI16x8S + ret := builder.AllocateInstruction().AsExtIaddPairwise(v, ssa.VecLaneI16x8, signed).Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeVecI16x8ExtMulLowI8x16S, wasm.OpcodeVecI16x8ExtMulLowI8x16U: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := c.lowerExtMul( + v1, v2, + ssa.VecLaneI8x16, ssa.VecLaneI16x8, + vecOp == wasm.OpcodeVecI16x8ExtMulLowI8x16S, true) + state.push(ret) + + case wasm.OpcodeVecI16x8ExtMulHighI8x16S, wasm.OpcodeVecI16x8ExtMulHighI8x16U: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := c.lowerExtMul( + v1, v2, + ssa.VecLaneI8x16, ssa.VecLaneI16x8, + vecOp == wasm.OpcodeVecI16x8ExtMulHighI8x16S, false) + state.push(ret) + + case wasm.OpcodeVecI32x4ExtMulLowI16x8S, wasm.OpcodeVecI32x4ExtMulLowI16x8U: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := c.lowerExtMul( + v1, v2, + ssa.VecLaneI16x8, ssa.VecLaneI32x4, + vecOp == wasm.OpcodeVecI32x4ExtMulLowI16x8S, true) + state.push(ret) + + case wasm.OpcodeVecI32x4ExtMulHighI16x8S, wasm.OpcodeVecI32x4ExtMulHighI16x8U: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := c.lowerExtMul( + v1, v2, + ssa.VecLaneI16x8, ssa.VecLaneI32x4, + vecOp == wasm.OpcodeVecI32x4ExtMulHighI16x8S, false) + state.push(ret) + case wasm.OpcodeVecI64x2ExtMulLowI32x4S, wasm.OpcodeVecI64x2ExtMulLowI32x4U: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := c.lowerExtMul( + v1, v2, + ssa.VecLaneI32x4, ssa.VecLaneI64x2, + vecOp == wasm.OpcodeVecI64x2ExtMulLowI32x4S, true) + state.push(ret) + + case wasm.OpcodeVecI64x2ExtMulHighI32x4S, wasm.OpcodeVecI64x2ExtMulHighI32x4U: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := c.lowerExtMul( + v1, v2, + ssa.VecLaneI32x4, ssa.VecLaneI64x2, + vecOp == wasm.OpcodeVecI64x2ExtMulHighI32x4S, false) + state.push(ret) + + case wasm.OpcodeVecI32x4DotI16x8S: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + + ret := builder.AllocateInstruction().AsWideningPairwiseDotProductS(v1, v2).Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeVecF32x4Eq, wasm.OpcodeVecF64x2Eq: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Eq: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Eq: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVFcmp(v1, v2, ssa.FloatCmpCondEqual, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Ne, wasm.OpcodeVecF64x2Ne: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Ne: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Ne: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). 
+ AsVFcmp(v1, v2, ssa.FloatCmpCondNotEqual, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Lt, wasm.OpcodeVecF64x2Lt: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Lt: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Lt: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVFcmp(v1, v2, ssa.FloatCmpCondLessThan, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Le, wasm.OpcodeVecF64x2Le: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Le: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Le: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVFcmp(v1, v2, ssa.FloatCmpCondLessThanOrEqual, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Gt, wasm.OpcodeVecF64x2Gt: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Gt: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Gt: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVFcmp(v1, v2, ssa.FloatCmpCondGreaterThan, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Ge, wasm.OpcodeVecF64x2Ge: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Ge: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Ge: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVFcmp(v1, v2, ssa.FloatCmpCondGreaterThanOrEqual, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Ceil, wasm.OpcodeVecF64x2Ceil: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Ceil: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Ceil: + lane = ssa.VecLaneF64x2 + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsVCeil(v1, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Floor, wasm.OpcodeVecF64x2Floor: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Floor: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Floor: + lane = ssa.VecLaneF64x2 + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsVFloor(v1, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Trunc, wasm.OpcodeVecF64x2Trunc: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Trunc: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Trunc: + lane = ssa.VecLaneF64x2 + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsVTrunc(v1, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Nearest, wasm.OpcodeVecF64x2Nearest: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Nearest: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Nearest: + lane = ssa.VecLaneF64x2 + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsVNearest(v1, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Pmin, wasm.OpcodeVecF64x2Pmin: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Pmin: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Pmin: + lane = 
ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVMinPseudo(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4Pmax, wasm.OpcodeVecF64x2Pmax: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecF32x4Pmax: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Pmax: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVMaxPseudo(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI32x4TruncSatF32x4S, wasm.OpcodeVecI32x4TruncSatF32x4U: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVFcvtToIntSat(v1, ssa.VecLaneF32x4, vecOp == wasm.OpcodeVecI32x4TruncSatF32x4S).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI32x4TruncSatF64x2SZero, wasm.OpcodeVecI32x4TruncSatF64x2UZero: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVFcvtToIntSat(v1, ssa.VecLaneF64x2, vecOp == wasm.OpcodeVecI32x4TruncSatF64x2SZero).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4ConvertI32x4S, wasm.OpcodeVecF32x4ConvertI32x4U: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsVFcvtFromInt(v1, ssa.VecLaneF32x4, vecOp == wasm.OpcodeVecF32x4ConvertI32x4S).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF64x2ConvertLowI32x4S, wasm.OpcodeVecF64x2ConvertLowI32x4U: + if state.unreachable { + break + } + v1 := state.pop() + if runtime.GOARCH == "arm64" { + // TODO: this is weird. fix. + v1 = builder.AllocateInstruction(). + AsWiden(v1, ssa.VecLaneI32x4, vecOp == wasm.OpcodeVecF64x2ConvertLowI32x4S, true).Insert(builder).Return() + } + ret := builder.AllocateInstruction(). + AsVFcvtFromInt(v1, ssa.VecLaneF64x2, vecOp == wasm.OpcodeVecF64x2ConvertLowI32x4S). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16NarrowI16x8S, wasm.OpcodeVecI8x16NarrowI16x8U: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsNarrow(v1, v2, ssa.VecLaneI16x8, vecOp == wasm.OpcodeVecI8x16NarrowI16x8S). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI16x8NarrowI32x4S, wasm.OpcodeVecI16x8NarrowI32x4U: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsNarrow(v1, v2, ssa.VecLaneI32x4, vecOp == wasm.OpcodeVecI16x8NarrowI32x4S). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI16x8ExtendLowI8x16S, wasm.OpcodeVecI16x8ExtendLowI8x16U: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsWiden(v1, ssa.VecLaneI8x16, vecOp == wasm.OpcodeVecI16x8ExtendLowI8x16S, true). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI16x8ExtendHighI8x16S, wasm.OpcodeVecI16x8ExtendHighI8x16U: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsWiden(v1, ssa.VecLaneI8x16, vecOp == wasm.OpcodeVecI16x8ExtendHighI8x16S, false). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI32x4ExtendLowI16x8S, wasm.OpcodeVecI32x4ExtendLowI16x8U: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsWiden(v1, ssa.VecLaneI16x8, vecOp == wasm.OpcodeVecI32x4ExtendLowI16x8S, true). 
+ Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI32x4ExtendHighI16x8S, wasm.OpcodeVecI32x4ExtendHighI16x8U: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsWiden(v1, ssa.VecLaneI16x8, vecOp == wasm.OpcodeVecI32x4ExtendHighI16x8S, false). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI64x2ExtendLowI32x4S, wasm.OpcodeVecI64x2ExtendLowI32x4U: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsWiden(v1, ssa.VecLaneI32x4, vecOp == wasm.OpcodeVecI64x2ExtendLowI32x4S, true). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI64x2ExtendHighI32x4S, wasm.OpcodeVecI64x2ExtendHighI32x4U: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsWiden(v1, ssa.VecLaneI32x4, vecOp == wasm.OpcodeVecI64x2ExtendHighI32x4S, false). + Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeVecF64x2PromoteLowF32x4Zero: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsFvpromoteLow(v1, ssa.VecLaneF32x4). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecF32x4DemoteF64x2Zero: + if state.unreachable { + break + } + v1 := state.pop() + ret := builder.AllocateInstruction(). + AsFvdemote(v1, ssa.VecLaneF64x2). + Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16Shl, wasm.OpcodeVecI16x8Shl, wasm.OpcodeVecI32x4Shl, wasm.OpcodeVecI64x2Shl: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16Shl: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8Shl: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4Shl: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2Shl: + lane = ssa.VecLaneI64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVIshl(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16ShrS, wasm.OpcodeVecI16x8ShrS, wasm.OpcodeVecI32x4ShrS, wasm.OpcodeVecI64x2ShrS: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16ShrS: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8ShrS: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4ShrS: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2ShrS: + lane = ssa.VecLaneI64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVSshr(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16ShrU, wasm.OpcodeVecI16x8ShrU, wasm.OpcodeVecI32x4ShrU, wasm.OpcodeVecI64x2ShrU: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16ShrU: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8ShrU: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4ShrU: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2ShrU: + lane = ssa.VecLaneI64x2 + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsVUshr(v1, v2, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecI8x16ExtractLaneS, wasm.OpcodeVecI16x8ExtractLaneS: + state.pc++ + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16ExtractLaneS: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8ExtractLaneS: + lane = ssa.VecLaneI16x8 + } + v1 := state.pop() + index := c.wasmFunctionBody[state.pc] + ext := builder.AllocateInstruction().AsExtractlane(v1, index, lane, 
true).Insert(builder).Return() + state.push(ext) + case wasm.OpcodeVecI8x16ExtractLaneU, wasm.OpcodeVecI16x8ExtractLaneU, + wasm.OpcodeVecI32x4ExtractLane, wasm.OpcodeVecI64x2ExtractLane, + wasm.OpcodeVecF32x4ExtractLane, wasm.OpcodeVecF64x2ExtractLane: + state.pc++ // Skip the immediate value. + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16ExtractLaneU: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8ExtractLaneU: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4ExtractLane: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2ExtractLane: + lane = ssa.VecLaneI64x2 + case wasm.OpcodeVecF32x4ExtractLane: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2ExtractLane: + lane = ssa.VecLaneF64x2 + } + v1 := state.pop() + index := c.wasmFunctionBody[state.pc] + ext := builder.AllocateInstruction().AsExtractlane(v1, index, lane, false).Insert(builder).Return() + state.push(ext) + case wasm.OpcodeVecI8x16ReplaceLane, wasm.OpcodeVecI16x8ReplaceLane, + wasm.OpcodeVecI32x4ReplaceLane, wasm.OpcodeVecI64x2ReplaceLane, + wasm.OpcodeVecF32x4ReplaceLane, wasm.OpcodeVecF64x2ReplaceLane: + state.pc++ + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16ReplaceLane: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8ReplaceLane: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4ReplaceLane: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2ReplaceLane: + lane = ssa.VecLaneI64x2 + case wasm.OpcodeVecF32x4ReplaceLane: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2ReplaceLane: + lane = ssa.VecLaneF64x2 + } + v2 := state.pop() + v1 := state.pop() + index := c.wasmFunctionBody[state.pc] + ret := builder.AllocateInstruction().AsInsertlane(v1, v2, index, lane).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeVecV128i8x16Shuffle: + state.pc++ + laneIndexes := c.wasmFunctionBody[state.pc : state.pc+16] + state.pc += 15 + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsShuffle(v1, v2, laneIndexes).Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeVecI8x16Swizzle: + if state.unreachable { + break + } + v2 := state.pop() + v1 := state.pop() + ret := builder.AllocateInstruction().AsSwizzle(v1, v2, ssa.VecLaneI8x16).Insert(builder).Return() + state.push(ret) + + case wasm.OpcodeVecI8x16Splat, + wasm.OpcodeVecI16x8Splat, + wasm.OpcodeVecI32x4Splat, + wasm.OpcodeVecI64x2Splat, + wasm.OpcodeVecF32x4Splat, + wasm.OpcodeVecF64x2Splat: + if state.unreachable { + break + } + var lane ssa.VecLane + switch vecOp { + case wasm.OpcodeVecI8x16Splat: + lane = ssa.VecLaneI8x16 + case wasm.OpcodeVecI16x8Splat: + lane = ssa.VecLaneI16x8 + case wasm.OpcodeVecI32x4Splat: + lane = ssa.VecLaneI32x4 + case wasm.OpcodeVecI64x2Splat: + lane = ssa.VecLaneI64x2 + case wasm.OpcodeVecF32x4Splat: + lane = ssa.VecLaneF32x4 + case wasm.OpcodeVecF64x2Splat: + lane = ssa.VecLaneF64x2 + } + v1 := state.pop() + ret := builder.AllocateInstruction().AsSplat(v1, lane).Insert(builder).Return() + state.push(ret) + + default: + panic("TODO: unsupported vector instruction: " + wasm.VectorInstructionName(vecOp)) + } + case wasm.OpcodeAtomicPrefix: + state.pc++ + atomicOp := c.wasmFunctionBody[state.pc] + switch atomicOp { + case wasm.OpcodeAtomicMemoryWait32, wasm.OpcodeAtomicMemoryWait64: + _, offset := c.readMemArg() + if state.unreachable { + break + } + + c.storeCallerModuleContext() + + var opSize uint64 + var trampoline wazevoapi.Offset + var 
sig *ssa.Signature + switch atomicOp { + case wasm.OpcodeAtomicMemoryWait32: + opSize = 4 + trampoline = wazevoapi.ExecutionContextOffsetMemoryWait32TrampolineAddress + sig = &c.memoryWait32Sig + case wasm.OpcodeAtomicMemoryWait64: + opSize = 8 + trampoline = wazevoapi.ExecutionContextOffsetMemoryWait64TrampolineAddress + sig = &c.memoryWait64Sig + } + + timeout := state.pop() + exp := state.pop() + baseAddr := state.pop() + addr := c.atomicMemOpSetup(baseAddr, uint64(offset), opSize) + + memoryWaitPtr := builder.AllocateInstruction(). + AsLoad(c.execCtxPtrValue, + trampoline.U32(), + ssa.TypeI64, + ).Insert(builder).Return() + + args := c.allocateVarLengthValues(3, c.execCtxPtrValue, timeout, exp, addr) + memoryWaitRet := builder.AllocateInstruction(). + AsCallIndirect(memoryWaitPtr, sig, args). + Insert(builder).Return() + state.push(memoryWaitRet) + case wasm.OpcodeAtomicMemoryNotify: + _, offset := c.readMemArg() + if state.unreachable { + break + } + + c.storeCallerModuleContext() + count := state.pop() + baseAddr := state.pop() + addr := c.atomicMemOpSetup(baseAddr, uint64(offset), 4) + + memoryNotifyPtr := builder.AllocateInstruction(). + AsLoad(c.execCtxPtrValue, + wazevoapi.ExecutionContextOffsetMemoryNotifyTrampolineAddress.U32(), + ssa.TypeI64, + ).Insert(builder).Return() + args := c.allocateVarLengthValues(2, c.execCtxPtrValue, count, addr) + memoryNotifyRet := builder.AllocateInstruction(). + AsCallIndirect(memoryNotifyPtr, &c.memoryNotifySig, args). + Insert(builder).Return() + state.push(memoryNotifyRet) + case wasm.OpcodeAtomicI32Load, wasm.OpcodeAtomicI64Load, wasm.OpcodeAtomicI32Load8U, wasm.OpcodeAtomicI32Load16U, wasm.OpcodeAtomicI64Load8U, wasm.OpcodeAtomicI64Load16U, wasm.OpcodeAtomicI64Load32U: + _, offset := c.readMemArg() + if state.unreachable { + break + } + + baseAddr := state.pop() + + var size uint64 + switch atomicOp { + case wasm.OpcodeAtomicI64Load: + size = 8 + case wasm.OpcodeAtomicI32Load, wasm.OpcodeAtomicI64Load32U: + size = 4 + case wasm.OpcodeAtomicI32Load16U, wasm.OpcodeAtomicI64Load16U: + size = 2 + case wasm.OpcodeAtomicI32Load8U, wasm.OpcodeAtomicI64Load8U: + size = 1 + } + + var typ ssa.Type + switch atomicOp { + case wasm.OpcodeAtomicI64Load, wasm.OpcodeAtomicI64Load32U, wasm.OpcodeAtomicI64Load16U, wasm.OpcodeAtomicI64Load8U: + typ = ssa.TypeI64 + case wasm.OpcodeAtomicI32Load, wasm.OpcodeAtomicI32Load16U, wasm.OpcodeAtomicI32Load8U: + typ = ssa.TypeI32 + } + + addr := c.atomicMemOpSetup(baseAddr, uint64(offset), size) + res := builder.AllocateInstruction().AsAtomicLoad(addr, size, typ).Insert(builder).Return() + state.push(res) + case wasm.OpcodeAtomicI32Store, wasm.OpcodeAtomicI64Store, wasm.OpcodeAtomicI32Store8, wasm.OpcodeAtomicI32Store16, wasm.OpcodeAtomicI64Store8, wasm.OpcodeAtomicI64Store16, wasm.OpcodeAtomicI64Store32: + _, offset := c.readMemArg() + if state.unreachable { + break + } + + val := state.pop() + baseAddr := state.pop() + + var size uint64 + switch atomicOp { + case wasm.OpcodeAtomicI64Store: + size = 8 + case wasm.OpcodeAtomicI32Store, wasm.OpcodeAtomicI64Store32: + size = 4 + case wasm.OpcodeAtomicI32Store16, wasm.OpcodeAtomicI64Store16: + size = 2 + case wasm.OpcodeAtomicI32Store8, wasm.OpcodeAtomicI64Store8: + size = 1 + } + + addr := c.atomicMemOpSetup(baseAddr, uint64(offset), size) + builder.AllocateInstruction().AsAtomicStore(addr, val, size).Insert(builder) + case wasm.OpcodeAtomicI32RmwAdd, wasm.OpcodeAtomicI64RmwAdd, wasm.OpcodeAtomicI32Rmw8AddU, wasm.OpcodeAtomicI32Rmw16AddU, wasm.OpcodeAtomicI64Rmw8AddU, 
wasm.OpcodeAtomicI64Rmw16AddU, wasm.OpcodeAtomicI64Rmw32AddU, + wasm.OpcodeAtomicI32RmwSub, wasm.OpcodeAtomicI64RmwSub, wasm.OpcodeAtomicI32Rmw8SubU, wasm.OpcodeAtomicI32Rmw16SubU, wasm.OpcodeAtomicI64Rmw8SubU, wasm.OpcodeAtomicI64Rmw16SubU, wasm.OpcodeAtomicI64Rmw32SubU, + wasm.OpcodeAtomicI32RmwAnd, wasm.OpcodeAtomicI64RmwAnd, wasm.OpcodeAtomicI32Rmw8AndU, wasm.OpcodeAtomicI32Rmw16AndU, wasm.OpcodeAtomicI64Rmw8AndU, wasm.OpcodeAtomicI64Rmw16AndU, wasm.OpcodeAtomicI64Rmw32AndU, + wasm.OpcodeAtomicI32RmwOr, wasm.OpcodeAtomicI64RmwOr, wasm.OpcodeAtomicI32Rmw8OrU, wasm.OpcodeAtomicI32Rmw16OrU, wasm.OpcodeAtomicI64Rmw8OrU, wasm.OpcodeAtomicI64Rmw16OrU, wasm.OpcodeAtomicI64Rmw32OrU, + wasm.OpcodeAtomicI32RmwXor, wasm.OpcodeAtomicI64RmwXor, wasm.OpcodeAtomicI32Rmw8XorU, wasm.OpcodeAtomicI32Rmw16XorU, wasm.OpcodeAtomicI64Rmw8XorU, wasm.OpcodeAtomicI64Rmw16XorU, wasm.OpcodeAtomicI64Rmw32XorU, + wasm.OpcodeAtomicI32RmwXchg, wasm.OpcodeAtomicI64RmwXchg, wasm.OpcodeAtomicI32Rmw8XchgU, wasm.OpcodeAtomicI32Rmw16XchgU, wasm.OpcodeAtomicI64Rmw8XchgU, wasm.OpcodeAtomicI64Rmw16XchgU, wasm.OpcodeAtomicI64Rmw32XchgU: + _, offset := c.readMemArg() + if state.unreachable { + break + } + + val := state.pop() + baseAddr := state.pop() + + var rmwOp ssa.AtomicRmwOp + var size uint64 + switch atomicOp { + case wasm.OpcodeAtomicI32RmwAdd, wasm.OpcodeAtomicI64RmwAdd, wasm.OpcodeAtomicI32Rmw8AddU, wasm.OpcodeAtomicI32Rmw16AddU, wasm.OpcodeAtomicI64Rmw8AddU, wasm.OpcodeAtomicI64Rmw16AddU, wasm.OpcodeAtomicI64Rmw32AddU: + rmwOp = ssa.AtomicRmwOpAdd + switch atomicOp { + case wasm.OpcodeAtomicI64RmwAdd: + size = 8 + case wasm.OpcodeAtomicI32RmwAdd, wasm.OpcodeAtomicI64Rmw32AddU: + size = 4 + case wasm.OpcodeAtomicI32Rmw16AddU, wasm.OpcodeAtomicI64Rmw16AddU: + size = 2 + case wasm.OpcodeAtomicI32Rmw8AddU, wasm.OpcodeAtomicI64Rmw8AddU: + size = 1 + } + case wasm.OpcodeAtomicI32RmwSub, wasm.OpcodeAtomicI64RmwSub, wasm.OpcodeAtomicI32Rmw8SubU, wasm.OpcodeAtomicI32Rmw16SubU, wasm.OpcodeAtomicI64Rmw8SubU, wasm.OpcodeAtomicI64Rmw16SubU, wasm.OpcodeAtomicI64Rmw32SubU: + rmwOp = ssa.AtomicRmwOpSub + switch atomicOp { + case wasm.OpcodeAtomicI64RmwSub: + size = 8 + case wasm.OpcodeAtomicI32RmwSub, wasm.OpcodeAtomicI64Rmw32SubU: + size = 4 + case wasm.OpcodeAtomicI32Rmw16SubU, wasm.OpcodeAtomicI64Rmw16SubU: + size = 2 + case wasm.OpcodeAtomicI32Rmw8SubU, wasm.OpcodeAtomicI64Rmw8SubU: + size = 1 + } + case wasm.OpcodeAtomicI32RmwAnd, wasm.OpcodeAtomicI64RmwAnd, wasm.OpcodeAtomicI32Rmw8AndU, wasm.OpcodeAtomicI32Rmw16AndU, wasm.OpcodeAtomicI64Rmw8AndU, wasm.OpcodeAtomicI64Rmw16AndU, wasm.OpcodeAtomicI64Rmw32AndU: + rmwOp = ssa.AtomicRmwOpAnd + switch atomicOp { + case wasm.OpcodeAtomicI64RmwAnd: + size = 8 + case wasm.OpcodeAtomicI32RmwAnd, wasm.OpcodeAtomicI64Rmw32AndU: + size = 4 + case wasm.OpcodeAtomicI32Rmw16AndU, wasm.OpcodeAtomicI64Rmw16AndU: + size = 2 + case wasm.OpcodeAtomicI32Rmw8AndU, wasm.OpcodeAtomicI64Rmw8AndU: + size = 1 + } + case wasm.OpcodeAtomicI32RmwOr, wasm.OpcodeAtomicI64RmwOr, wasm.OpcodeAtomicI32Rmw8OrU, wasm.OpcodeAtomicI32Rmw16OrU, wasm.OpcodeAtomicI64Rmw8OrU, wasm.OpcodeAtomicI64Rmw16OrU, wasm.OpcodeAtomicI64Rmw32OrU: + rmwOp = ssa.AtomicRmwOpOr + switch atomicOp { + case wasm.OpcodeAtomicI64RmwOr: + size = 8 + case wasm.OpcodeAtomicI32RmwOr, wasm.OpcodeAtomicI64Rmw32OrU: + size = 4 + case wasm.OpcodeAtomicI32Rmw16OrU, wasm.OpcodeAtomicI64Rmw16OrU: + size = 2 + case wasm.OpcodeAtomicI32Rmw8OrU, wasm.OpcodeAtomicI64Rmw8OrU: + size = 1 + } + case wasm.OpcodeAtomicI32RmwXor, wasm.OpcodeAtomicI64RmwXor, 
wasm.OpcodeAtomicI32Rmw8XorU, wasm.OpcodeAtomicI32Rmw16XorU, wasm.OpcodeAtomicI64Rmw8XorU, wasm.OpcodeAtomicI64Rmw16XorU, wasm.OpcodeAtomicI64Rmw32XorU: + rmwOp = ssa.AtomicRmwOpXor + switch atomicOp { + case wasm.OpcodeAtomicI64RmwXor: + size = 8 + case wasm.OpcodeAtomicI32RmwXor, wasm.OpcodeAtomicI64Rmw32XorU: + size = 4 + case wasm.OpcodeAtomicI32Rmw16XorU, wasm.OpcodeAtomicI64Rmw16XorU: + size = 2 + case wasm.OpcodeAtomicI32Rmw8XorU, wasm.OpcodeAtomicI64Rmw8XorU: + size = 1 + } + case wasm.OpcodeAtomicI32RmwXchg, wasm.OpcodeAtomicI64RmwXchg, wasm.OpcodeAtomicI32Rmw8XchgU, wasm.OpcodeAtomicI32Rmw16XchgU, wasm.OpcodeAtomicI64Rmw8XchgU, wasm.OpcodeAtomicI64Rmw16XchgU, wasm.OpcodeAtomicI64Rmw32XchgU: + rmwOp = ssa.AtomicRmwOpXchg + switch atomicOp { + case wasm.OpcodeAtomicI64RmwXchg: + size = 8 + case wasm.OpcodeAtomicI32RmwXchg, wasm.OpcodeAtomicI64Rmw32XchgU: + size = 4 + case wasm.OpcodeAtomicI32Rmw16XchgU, wasm.OpcodeAtomicI64Rmw16XchgU: + size = 2 + case wasm.OpcodeAtomicI32Rmw8XchgU, wasm.OpcodeAtomicI64Rmw8XchgU: + size = 1 + } + } + + addr := c.atomicMemOpSetup(baseAddr, uint64(offset), size) + res := builder.AllocateInstruction().AsAtomicRmw(rmwOp, addr, val, size).Insert(builder).Return() + state.push(res) + case wasm.OpcodeAtomicI32RmwCmpxchg, wasm.OpcodeAtomicI64RmwCmpxchg, wasm.OpcodeAtomicI32Rmw8CmpxchgU, wasm.OpcodeAtomicI32Rmw16CmpxchgU, wasm.OpcodeAtomicI64Rmw8CmpxchgU, wasm.OpcodeAtomicI64Rmw16CmpxchgU, wasm.OpcodeAtomicI64Rmw32CmpxchgU: + _, offset := c.readMemArg() + if state.unreachable { + break + } + + repl := state.pop() + exp := state.pop() + baseAddr := state.pop() + + var size uint64 + switch atomicOp { + case wasm.OpcodeAtomicI64RmwCmpxchg: + size = 8 + case wasm.OpcodeAtomicI32RmwCmpxchg, wasm.OpcodeAtomicI64Rmw32CmpxchgU: + size = 4 + case wasm.OpcodeAtomicI32Rmw16CmpxchgU, wasm.OpcodeAtomicI64Rmw16CmpxchgU: + size = 2 + case wasm.OpcodeAtomicI32Rmw8CmpxchgU, wasm.OpcodeAtomicI64Rmw8CmpxchgU: + size = 1 + } + addr := c.atomicMemOpSetup(baseAddr, uint64(offset), size) + res := builder.AllocateInstruction().AsAtomicCas(addr, exp, repl, size).Insert(builder).Return() + state.push(res) + case wasm.OpcodeAtomicFence: + order := c.readByte() + if state.unreachable { + break + } + if c.needMemory { + builder.AllocateInstruction().AsFence(order).Insert(builder) + } + default: + panic("TODO: unsupported atomic instruction: " + wasm.AtomicInstructionName(atomicOp)) + } + case wasm.OpcodeRefFunc: + funcIndex := c.readI32u() + if state.unreachable { + break + } + + c.storeCallerModuleContext() + + funcIndexVal := builder.AllocateInstruction().AsIconst32(funcIndex).Insert(builder).Return() + + refFuncPtr := builder.AllocateInstruction(). + AsLoad(c.execCtxPtrValue, + wazevoapi.ExecutionContextOffsetRefFuncTrampolineAddress.U32(), + ssa.TypeI64, + ).Insert(builder).Return() + + args := c.allocateVarLengthValues(2, c.execCtxPtrValue, funcIndexVal) + refFuncRet := builder. + AllocateInstruction(). + AsCallIndirect(refFuncPtr, &c.refFuncSig, args). + Insert(builder).Return() + state.push(refFuncRet) + + case wasm.OpcodeRefNull: + c.loweringState.pc++ // skips the reference type as we treat both of them as i64(0). + if state.unreachable { + break + } + ret := builder.AllocateInstruction().AsIconst64(0).Insert(builder).Return() + state.push(ret) + case wasm.OpcodeRefIsNull: + if state.unreachable { + break + } + r := state.pop() + zero := builder.AllocateInstruction().AsIconst64(0).Insert(builder) + icmp := builder.AllocateInstruction(). 
+ AsIcmp(r, zero.Return(), ssa.IntegerCmpCondEqual). + Insert(builder). + Return() + state.push(icmp) + case wasm.OpcodeTableSet: + tableIndex := c.readI32u() + if state.unreachable { + break + } + r := state.pop() + targetOffsetInTable := state.pop() + + elementAddr := c.lowerAccessTableWithBoundsCheck(tableIndex, targetOffsetInTable) + builder.AllocateInstruction().AsStore(ssa.OpcodeStore, r, elementAddr, 0).Insert(builder) + + case wasm.OpcodeTableGet: + tableIndex := c.readI32u() + if state.unreachable { + break + } + targetOffsetInTable := state.pop() + elementAddr := c.lowerAccessTableWithBoundsCheck(tableIndex, targetOffsetInTable) + loaded := builder.AllocateInstruction().AsLoad(elementAddr, 0, ssa.TypeI64).Insert(builder).Return() + state.push(loaded) + default: + panic("TODO: unsupported in wazevo yet: " + wasm.InstructionName(op)) + } + + if wazevoapi.FrontEndLoggingEnabled { + fmt.Println("--------- Translated " + wasm.InstructionName(op) + " --------") + fmt.Println("state: " + c.loweringState.String()) + fmt.Println(c.formatBuilder()) + fmt.Println("--------------------------") + } + c.loweringState.pc++ +} + +func (c *Compiler) lowerExtMul(v1, v2 ssa.Value, from, to ssa.VecLane, signed, low bool) ssa.Value { + // TODO: The sequence `Widen; Widen; VIMul` can be substituted for a single instruction on some ISAs. + builder := c.ssaBuilder + + v1lo := builder.AllocateInstruction().AsWiden(v1, from, signed, low).Insert(builder).Return() + v2lo := builder.AllocateInstruction().AsWiden(v2, from, signed, low).Insert(builder).Return() + + return builder.AllocateInstruction().AsVImul(v1lo, v2lo, to).Insert(builder).Return() +} + +const ( + tableInstanceBaseAddressOffset = 0 + tableInstanceLenOffset = tableInstanceBaseAddressOffset + 8 +) + +func (c *Compiler) lowerAccessTableWithBoundsCheck(tableIndex uint32, elementOffsetInTable ssa.Value) (elementAddress ssa.Value) { + builder := c.ssaBuilder + + // Load the table. + loadTableInstancePtr := builder.AllocateInstruction() + loadTableInstancePtr.AsLoad(c.moduleCtxPtrValue, c.offset.TableOffset(int(tableIndex)).U32(), ssa.TypeI64) + builder.InsertInstruction(loadTableInstancePtr) + tableInstancePtr := loadTableInstancePtr.Return() + + // Load the table's length. + loadTableLen := builder.AllocateInstruction() + loadTableLen.AsLoad(tableInstancePtr, tableInstanceLenOffset, ssa.TypeI32) + builder.InsertInstruction(loadTableLen) + tableLen := loadTableLen.Return() + + // Compare the length and the target, and trap if out of bounds. + checkOOB := builder.AllocateInstruction() + checkOOB.AsIcmp(elementOffsetInTable, tableLen, ssa.IntegerCmpCondUnsignedGreaterThanOrEqual) + builder.InsertInstruction(checkOOB) + exitIfOOB := builder.AllocateInstruction() + exitIfOOB.AsExitIfTrueWithCode(c.execCtxPtrValue, checkOOB.Return(), wazevoapi.ExitCodeTableOutOfBounds) + builder.InsertInstruction(exitIfOOB) + + // Get the base address of wasm.TableInstance.References. + loadTableBaseAddress := builder.AllocateInstruction() + loadTableBaseAddress.AsLoad(tableInstancePtr, tableInstanceBaseAddressOffset, ssa.TypeI64) + builder.InsertInstruction(loadTableBaseAddress) + tableBase := loadTableBaseAddress.Return() + + // Calculate the address of the target function. First we need to multiply targetOffsetInTable by 8 (pointer size). 
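+ // (Below, the multiplication by 8 is implemented as a left shift by 3, since 2^3 = 8.)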
+ multiplyBy8 := builder.AllocateInstruction() + three := builder.AllocateInstruction() + three.AsIconst64(3) + builder.InsertInstruction(three) + multiplyBy8.AsIshl(elementOffsetInTable, three.Return()) + builder.InsertInstruction(multiplyBy8) + targetOffsetInTableMultipliedBy8 := multiplyBy8.Return() + + // Then add the multiplied value to the base which results in the address of the target function (*wazevo.functionInstance) + calcElementAddressInTable := builder.AllocateInstruction() + calcElementAddressInTable.AsIadd(tableBase, targetOffsetInTableMultipliedBy8) + builder.InsertInstruction(calcElementAddressInTable) + return calcElementAddressInTable.Return() +} + +func (c *Compiler) lowerCallIndirect(typeIndex, tableIndex uint32) { + builder := c.ssaBuilder + state := c.state() + + elementOffsetInTable := state.pop() + functionInstancePtrAddress := c.lowerAccessTableWithBoundsCheck(tableIndex, elementOffsetInTable) + loadFunctionInstancePtr := builder.AllocateInstruction() + loadFunctionInstancePtr.AsLoad(functionInstancePtrAddress, 0, ssa.TypeI64) + builder.InsertInstruction(loadFunctionInstancePtr) + functionInstancePtr := loadFunctionInstancePtr.Return() + + // Check if it is not the null pointer. + zero := builder.AllocateInstruction() + zero.AsIconst64(0) + builder.InsertInstruction(zero) + checkNull := builder.AllocateInstruction() + checkNull.AsIcmp(functionInstancePtr, zero.Return(), ssa.IntegerCmpCondEqual) + builder.InsertInstruction(checkNull) + exitIfNull := builder.AllocateInstruction() + exitIfNull.AsExitIfTrueWithCode(c.execCtxPtrValue, checkNull.Return(), wazevoapi.ExitCodeIndirectCallNullPointer) + builder.InsertInstruction(exitIfNull) + + // We need to do the type check. First, load the target function instance's typeID. + loadTypeID := builder.AllocateInstruction() + loadTypeID.AsLoad(functionInstancePtr, wazevoapi.FunctionInstanceTypeIDOffset, ssa.TypeI32) + builder.InsertInstruction(loadTypeID) + actualTypeID := loadTypeID.Return() + + // Next, we load the expected TypeID: + loadTypeIDsBegin := builder.AllocateInstruction() + loadTypeIDsBegin.AsLoad(c.moduleCtxPtrValue, c.offset.TypeIDs1stElement.U32(), ssa.TypeI64) + builder.InsertInstruction(loadTypeIDsBegin) + typeIDsBegin := loadTypeIDsBegin.Return() + + loadExpectedTypeID := builder.AllocateInstruction() + loadExpectedTypeID.AsLoad(typeIDsBegin, uint32(typeIndex)*4 /* size of wasm.FunctionTypeID */, ssa.TypeI32) + builder.InsertInstruction(loadExpectedTypeID) + expectedTypeID := loadExpectedTypeID.Return() + + // Check if the type ID matches. + checkTypeID := builder.AllocateInstruction() + checkTypeID.AsIcmp(actualTypeID, expectedTypeID, ssa.IntegerCmpCondNotEqual) + builder.InsertInstruction(checkTypeID) + exitIfNotMatch := builder.AllocateInstruction() + exitIfNotMatch.AsExitIfTrueWithCode(c.execCtxPtrValue, checkTypeID.Return(), wazevoapi.ExitCodeIndirectCallTypeMismatch) + builder.InsertInstruction(exitIfNotMatch) + + // Now ready to call the function. Load the executable and moduleContextOpaquePtr from the function instance. 
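+ // The executable is the entry address of the callee's compiled code, and the opaque pointer is passed as the callee's module context argument.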
+ loadExecutablePtr := builder.AllocateInstruction() + loadExecutablePtr.AsLoad(functionInstancePtr, wazevoapi.FunctionInstanceExecutableOffset, ssa.TypeI64) + builder.InsertInstruction(loadExecutablePtr) + executablePtr := loadExecutablePtr.Return() + loadModuleContextOpaquePtr := builder.AllocateInstruction() + loadModuleContextOpaquePtr.AsLoad(functionInstancePtr, wazevoapi.FunctionInstanceModuleContextOpaquePtrOffset, ssa.TypeI64) + builder.InsertInstruction(loadModuleContextOpaquePtr) + moduleContextOpaquePtr := loadModuleContextOpaquePtr.Return() + + typ := &c.m.TypeSection[typeIndex] + tail := len(state.values) - len(typ.Params) + vs := state.values[tail:] + state.values = state.values[:tail] + args := c.allocateVarLengthValues(2+len(vs), c.execCtxPtrValue, moduleContextOpaquePtr) + args = args.Append(builder.VarLengthPool(), vs...) + + // Before transfer the control to the callee, we have to store the current module's moduleContextPtr + // into execContext.callerModuleContextPtr in case when the callee is a Go function. + c.storeCallerModuleContext() + + call := builder.AllocateInstruction() + call.AsCallIndirect(executablePtr, c.signatures[typ], args) + builder.InsertInstruction(call) + + first, rest := call.Returns() + if first.Valid() { + state.push(first) + } + for _, v := range rest { + state.push(v) + } + + c.reloadAfterCall() +} + +// memOpSetup inserts the bounds check and calculates the address of the memory operation (loads/stores). +func (c *Compiler) memOpSetup(baseAddr ssa.Value, constOffset, operationSizeInBytes uint64) (address ssa.Value) { + address = ssa.ValueInvalid + builder := c.ssaBuilder + + baseAddrID := baseAddr.ID() + ceil := constOffset + operationSizeInBytes + if known := c.getKnownSafeBound(baseAddrID); known.valid() { + // We reuse the calculated absolute address even if the bound is not known to be safe. + address = known.absoluteAddr + if ceil <= known.bound { + if !address.Valid() { + // This means that, the bound is known to be safe, but the memory base might have changed. + // So, we re-calculate the address. + memBase := c.getMemoryBaseValue(false) + extBaseAddr := builder.AllocateInstruction(). + AsUExtend(baseAddr, 32, 64). + Insert(builder). + Return() + address = builder.AllocateInstruction(). + AsIadd(memBase, extBaseAddr).Insert(builder).Return() + known.absoluteAddr = address // Update the absolute address for the subsequent memory access. + } + return + } + } + + ceilConst := builder.AllocateInstruction() + ceilConst.AsIconst64(ceil) + builder.InsertInstruction(ceilConst) + + // We calculate the offset in 64-bit space. + extBaseAddr := builder.AllocateInstruction(). + AsUExtend(baseAddr, 32, 64). + Insert(builder). + Return() + + // Note: memLen is already zero extended to 64-bit space at the load time. + memLen := c.getMemoryLenValue(false) + + // baseAddrPlusCeil = baseAddr + ceil + baseAddrPlusCeil := builder.AllocateInstruction() + baseAddrPlusCeil.AsIadd(extBaseAddr, ceilConst.Return()) + builder.InsertInstruction(baseAddrPlusCeil) + + // Check for out of bounds memory access: `memLen >= baseAddrPlusCeil`. + cmp := builder.AllocateInstruction() + cmp.AsIcmp(memLen, baseAddrPlusCeil.Return(), ssa.IntegerCmpCondUnsignedLessThan) + builder.InsertInstruction(cmp) + exitIfNZ := builder.AllocateInstruction() + exitIfNZ.AsExitIfTrueWithCode(c.execCtxPtrValue, cmp.Return(), wazevoapi.ExitCodeMemoryOutOfBounds) + builder.InsertInstruction(exitIfNZ) + + // Load the value from memBase + extBaseAddr. 
+ if address == ssa.ValueInvalid { // Reuse the value if the memBase is already calculated at this point. + memBase := c.getMemoryBaseValue(false) + address = builder.AllocateInstruction(). + AsIadd(memBase, extBaseAddr).Insert(builder).Return() + } + + // Record the bound ceil for this baseAddr is known to be safe for the subsequent memory access in the same block. + c.recordKnownSafeBound(baseAddrID, ceil, address) + return +} + +// atomicMemOpSetup inserts the bounds check and calculates the address of the memory operation (loads/stores), including +// the constant offset and performs an alignment check on the final address. +func (c *Compiler) atomicMemOpSetup(baseAddr ssa.Value, constOffset, operationSizeInBytes uint64) (address ssa.Value) { + builder := c.ssaBuilder + + addrWithoutOffset := c.memOpSetup(baseAddr, constOffset, operationSizeInBytes) + var addr ssa.Value + if constOffset == 0 { + addr = addrWithoutOffset + } else { + offset := builder.AllocateInstruction().AsIconst64(constOffset).Insert(builder).Return() + addr = builder.AllocateInstruction().AsIadd(addrWithoutOffset, offset).Insert(builder).Return() + } + + c.memAlignmentCheck(addr, operationSizeInBytes) + + return addr +} + +func (c *Compiler) memAlignmentCheck(addr ssa.Value, operationSizeInBytes uint64) { + if operationSizeInBytes == 1 { + return // No alignment restrictions when accessing a byte + } + var checkBits uint64 + switch operationSizeInBytes { + case 2: + checkBits = 0b1 + case 4: + checkBits = 0b11 + case 8: + checkBits = 0b111 + } + + builder := c.ssaBuilder + + mask := builder.AllocateInstruction().AsIconst64(checkBits).Insert(builder).Return() + masked := builder.AllocateInstruction().AsBand(addr, mask).Insert(builder).Return() + zero := builder.AllocateInstruction().AsIconst64(0).Insert(builder).Return() + cmp := builder.AllocateInstruction().AsIcmp(masked, zero, ssa.IntegerCmpCondNotEqual).Insert(builder).Return() + builder.AllocateInstruction().AsExitIfTrueWithCode(c.execCtxPtrValue, cmp, wazevoapi.ExitCodeUnalignedAtomic).Insert(builder) +} + +func (c *Compiler) callMemmove(dst, src, size ssa.Value) { + args := c.allocateVarLengthValues(3, dst, src, size) + if size.Type() != ssa.TypeI64 { + panic("TODO: memmove size must be i64") + } + + builder := c.ssaBuilder + memmovePtr := builder.AllocateInstruction(). + AsLoad(c.execCtxPtrValue, + wazevoapi.ExecutionContextOffsetMemmoveAddress.U32(), + ssa.TypeI64, + ).Insert(builder).Return() + builder.AllocateInstruction().AsCallGoRuntimeMemmove(memmovePtr, &c.memmoveSig, args).Insert(builder) +} + +func (c *Compiler) reloadAfterCall() { + // Note that when these are not used in the following instructions, they will be optimized out. + // So in any ways, we define them! + + // After calling any function, memory buffer might have changed. So we need to re-define the variable. + // However, if the memory is shared, we don't need to reload the memory base and length as the base will never change. + if c.needMemory && !c.memoryShared { + c.reloadMemoryBaseLen() + } + + // Also, any mutable Global can change. + for _, index := range c.mutableGlobalVariablesIndexes { + _ = c.getWasmGlobalValue(index, true) + } +} + +func (c *Compiler) reloadMemoryBaseLen() { + _ = c.getMemoryBaseValue(true) + _ = c.getMemoryLenValue(true) + + // This function being called means that the memory base might have changed. 
+ // Therefore, we need to clear the absolute addresses recorded in the known safe bounds + // because we cache the absolute address of the memory access per each base offset. + c.resetAbsoluteAddressInSafeBounds() +} + +func (c *Compiler) setWasmGlobalValue(index wasm.Index, v ssa.Value) { + variable := c.globalVariables[index] + opaqueOffset := c.offset.GlobalInstanceOffset(index) + + builder := c.ssaBuilder + if index < c.m.ImportGlobalCount { + loadGlobalInstPtr := builder.AllocateInstruction() + loadGlobalInstPtr.AsLoad(c.moduleCtxPtrValue, uint32(opaqueOffset), ssa.TypeI64) + builder.InsertInstruction(loadGlobalInstPtr) + + store := builder.AllocateInstruction() + store.AsStore(ssa.OpcodeStore, v, loadGlobalInstPtr.Return(), uint32(0)) + builder.InsertInstruction(store) + + } else { + store := builder.AllocateInstruction() + store.AsStore(ssa.OpcodeStore, v, c.moduleCtxPtrValue, uint32(opaqueOffset)) + builder.InsertInstruction(store) + } + + // The value has changed to `v`, so we record it. + builder.DefineVariableInCurrentBB(variable, v) +} + +func (c *Compiler) getWasmGlobalValue(index wasm.Index, forceLoad bool) ssa.Value { + variable := c.globalVariables[index] + typ := c.globalVariablesTypes[index] + opaqueOffset := c.offset.GlobalInstanceOffset(index) + + builder := c.ssaBuilder + if !forceLoad { + if v := builder.FindValueInLinearPath(variable); v.Valid() { + return v + } + } + + var load *ssa.Instruction + if index < c.m.ImportGlobalCount { + loadGlobalInstPtr := builder.AllocateInstruction() + loadGlobalInstPtr.AsLoad(c.moduleCtxPtrValue, uint32(opaqueOffset), ssa.TypeI64) + builder.InsertInstruction(loadGlobalInstPtr) + load = builder.AllocateInstruction(). + AsLoad(loadGlobalInstPtr.Return(), uint32(0), typ) + } else { + load = builder.AllocateInstruction(). 
+ AsLoad(c.moduleCtxPtrValue, uint32(opaqueOffset), typ) + } + + v := load.Insert(builder).Return() + builder.DefineVariableInCurrentBB(variable, v) + return v +} + +const ( + memoryInstanceBufOffset = 0 + memoryInstanceBufSizeOffset = memoryInstanceBufOffset + 8 +) + +func (c *Compiler) getMemoryBaseValue(forceReload bool) ssa.Value { + builder := c.ssaBuilder + variable := c.memoryBaseVariable + if !forceReload { + if v := builder.FindValueInLinearPath(variable); v.Valid() { + return v + } + } + + var ret ssa.Value + if c.offset.LocalMemoryBegin < 0 { + loadMemInstPtr := builder.AllocateInstruction() + loadMemInstPtr.AsLoad(c.moduleCtxPtrValue, c.offset.ImportedMemoryBegin.U32(), ssa.TypeI64) + builder.InsertInstruction(loadMemInstPtr) + memInstPtr := loadMemInstPtr.Return() + + loadBufPtr := builder.AllocateInstruction() + loadBufPtr.AsLoad(memInstPtr, memoryInstanceBufOffset, ssa.TypeI64) + builder.InsertInstruction(loadBufPtr) + ret = loadBufPtr.Return() + } else { + load := builder.AllocateInstruction() + load.AsLoad(c.moduleCtxPtrValue, c.offset.LocalMemoryBase().U32(), ssa.TypeI64) + builder.InsertInstruction(load) + ret = load.Return() + } + + builder.DefineVariableInCurrentBB(variable, ret) + return ret +} + +func (c *Compiler) getMemoryLenValue(forceReload bool) ssa.Value { + variable := c.memoryLenVariable + builder := c.ssaBuilder + if !forceReload && !c.memoryShared { + if v := builder.FindValueInLinearPath(variable); v.Valid() { + return v + } + } + + var ret ssa.Value + if c.offset.LocalMemoryBegin < 0 { + loadMemInstPtr := builder.AllocateInstruction() + loadMemInstPtr.AsLoad(c.moduleCtxPtrValue, c.offset.ImportedMemoryBegin.U32(), ssa.TypeI64) + builder.InsertInstruction(loadMemInstPtr) + memInstPtr := loadMemInstPtr.Return() + + loadBufSizePtr := builder.AllocateInstruction() + if c.memoryShared { + sizeOffset := builder.AllocateInstruction().AsIconst64(memoryInstanceBufSizeOffset).Insert(builder).Return() + addr := builder.AllocateInstruction().AsIadd(memInstPtr, sizeOffset).Insert(builder).Return() + loadBufSizePtr.AsAtomicLoad(addr, 8, ssa.TypeI64) + } else { + loadBufSizePtr.AsLoad(memInstPtr, memoryInstanceBufSizeOffset, ssa.TypeI64) + } + builder.InsertInstruction(loadBufSizePtr) + + ret = loadBufSizePtr.Return() + } else { + load := builder.AllocateInstruction() + if c.memoryShared { + lenOffset := builder.AllocateInstruction().AsIconst64(c.offset.LocalMemoryLen().U64()).Insert(builder).Return() + addr := builder.AllocateInstruction().AsIadd(c.moduleCtxPtrValue, lenOffset).Insert(builder).Return() + load.AsAtomicLoad(addr, 8, ssa.TypeI64) + } else { + load.AsExtLoad(ssa.OpcodeUload32, c.moduleCtxPtrValue, c.offset.LocalMemoryLen().U32(), true) + } + builder.InsertInstruction(load) + ret = load.Return() + } + + builder.DefineVariableInCurrentBB(variable, ret) + return ret +} + +func (c *Compiler) insertIcmp(cond ssa.IntegerCmpCond) { + state, builder := c.state(), c.ssaBuilder + y, x := state.pop(), state.pop() + cmp := builder.AllocateInstruction() + cmp.AsIcmp(x, y, cond) + builder.InsertInstruction(cmp) + value := cmp.Return() + state.push(value) +} + +func (c *Compiler) insertFcmp(cond ssa.FloatCmpCond) { + state, builder := c.state(), c.ssaBuilder + y, x := state.pop(), state.pop() + cmp := builder.AllocateInstruction() + cmp.AsFcmp(x, y, cond) + builder.InsertInstruction(cmp) + value := cmp.Return() + state.push(value) +} + +// storeCallerModuleContext stores the current module's moduleContextPtr into execContext.callerModuleContextPtr. 
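+// This is needed so that, when the callee is a Go function, the caller's module context can be located via the execution context.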
+func (c *Compiler) storeCallerModuleContext() { + builder := c.ssaBuilder + execCtx := c.execCtxPtrValue + store := builder.AllocateInstruction() + store.AsStore(ssa.OpcodeStore, + c.moduleCtxPtrValue, execCtx, wazevoapi.ExecutionContextOffsetCallerModuleContextPtr.U32()) + builder.InsertInstruction(store) +} + +func (c *Compiler) readByte() byte { + v := c.wasmFunctionBody[c.loweringState.pc+1] + c.loweringState.pc++ + return v +} + +func (c *Compiler) readI32u() uint32 { + v, n, err := leb128.LoadUint32(c.wasmFunctionBody[c.loweringState.pc+1:]) + if err != nil { + panic(err) // shouldn't be reached since compilation comes after validation. + } + c.loweringState.pc += int(n) + return v +} + +func (c *Compiler) readI32s() int32 { + v, n, err := leb128.LoadInt32(c.wasmFunctionBody[c.loweringState.pc+1:]) + if err != nil { + panic(err) // shouldn't be reached since compilation comes after validation. + } + c.loweringState.pc += int(n) + return v +} + +func (c *Compiler) readI64s() int64 { + v, n, err := leb128.LoadInt64(c.wasmFunctionBody[c.loweringState.pc+1:]) + if err != nil { + panic(err) // shouldn't be reached since compilation comes after validation. + } + c.loweringState.pc += int(n) + return v +} + +func (c *Compiler) readF32() float32 { + v := math.Float32frombits(binary.LittleEndian.Uint32(c.wasmFunctionBody[c.loweringState.pc+1:])) + c.loweringState.pc += 4 + return v +} + +func (c *Compiler) readF64() float64 { + v := math.Float64frombits(binary.LittleEndian.Uint64(c.wasmFunctionBody[c.loweringState.pc+1:])) + c.loweringState.pc += 8 + return v +} + +// readBlockType reads the block type from the current position of the bytecode reader. +func (c *Compiler) readBlockType() *wasm.FunctionType { + state := c.state() + + c.br.Reset(c.wasmFunctionBody[state.pc+1:]) + bt, num, err := wasm.DecodeBlockType(c.m.TypeSection, c.br, api.CoreFeaturesV2) + if err != nil { + panic(err) // shouldn't be reached since compilation comes after validation. + } + state.pc += int(num) + + return bt +} + +func (c *Compiler) readMemArg() (align, offset uint32) { + state := c.state() + + align, num, err := leb128.LoadUint32(c.wasmFunctionBody[state.pc+1:]) + if err != nil { + panic(fmt.Errorf("read memory align: %v", err)) + } + + state.pc += int(num) + offset, num, err = leb128.LoadUint32(c.wasmFunctionBody[state.pc+1:]) + if err != nil { + panic(fmt.Errorf("read memory offset: %v", err)) + } + + state.pc += int(num) + return align, offset +} + +// insertJumpToBlock inserts a jump instruction to the given block in the current block. +func (c *Compiler) insertJumpToBlock(args ssa.Values, targetBlk ssa.BasicBlock) { + if targetBlk.ReturnBlock() { + if c.needListener { + c.callListenerAfter() + } + } + + builder := c.ssaBuilder + jmp := builder.AllocateInstruction() + jmp.AsJump(args, targetBlk) + builder.InsertInstruction(jmp) +} + +func (c *Compiler) insertIntegerExtend(signed bool, from, to byte) { + state := c.state() + builder := c.ssaBuilder + v := state.pop() + extend := builder.AllocateInstruction() + if signed { + extend.AsSExtend(v, from, to) + } else { + extend.AsUExtend(v, from, to) + } + builder.InsertInstruction(extend) + value := extend.Return() + state.push(value) +} + +func (c *Compiler) switchTo(originalStackLen int, targetBlk ssa.BasicBlock) { + if targetBlk.Preds() == 0 { + c.loweringState.unreachable = true + } + + // Now we should adjust the stack and start translating the continuation block. 
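+ // Concretely: truncate the value stack to its height at block entry, then re-push the target block's parameters below.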
+ c.loweringState.values = c.loweringState.values[:originalStackLen] + + c.ssaBuilder.SetCurrentBlock(targetBlk) + + // At this point, blocks params consist only of the Wasm-level parameters, + // (since it's added only when we are trying to resolve variable *inside* this block). + for i := 0; i < targetBlk.Params(); i++ { + value := targetBlk.Param(i) + c.loweringState.push(value) + } +} + +// results returns the number of results of the current function. +func (c *Compiler) results() int { + return len(c.wasmFunctionTyp.Results) +} + +func (c *Compiler) lowerBrTable(labels []uint32, index ssa.Value) { + state := c.state() + builder := c.ssaBuilder + + f := state.ctrlPeekAt(int(labels[0])) + var numArgs int + if f.isLoop() { + numArgs = len(f.blockType.Params) + } else { + numArgs = len(f.blockType.Results) + } + + targets := make([]ssa.BasicBlock, len(labels)) + + // We need trampoline blocks since depending on the target block structure, we might end up inserting moves before jumps, + // which cannot be done with br_table. Instead, we can do such per-block moves in the trampoline blocks. + // At the linking phase (very end of the backend), we can remove the unnecessary jumps, and therefore no runtime overhead. + currentBlk := builder.CurrentBlock() + for i, l := range labels { + // Args are always on the top of the stack. Note that we should not share the args slice + // among the jump instructions since the args are modified during passes (e.g. redundant phi elimination). + args := c.nPeekDup(numArgs) + targetBlk, _ := state.brTargetArgNumFor(l) + trampoline := builder.AllocateBasicBlock() + builder.SetCurrentBlock(trampoline) + c.insertJumpToBlock(args, targetBlk) + targets[i] = trampoline + } + builder.SetCurrentBlock(currentBlk) + + // If the target block has no arguments, we can just jump to the target block. + brTable := builder.AllocateInstruction() + brTable.AsBrTable(index, targets) + builder.InsertInstruction(brTable) + + for _, trampoline := range targets { + builder.Seal(trampoline) + } +} + +func (l *loweringState) brTargetArgNumFor(labelIndex uint32) (targetBlk ssa.BasicBlock, argNum int) { + targetFrame := l.ctrlPeekAt(int(labelIndex)) + if targetFrame.isLoop() { + targetBlk, argNum = targetFrame.blk, len(targetFrame.blockType.Params) + } else { + targetBlk, argNum = targetFrame.followingBlock, len(targetFrame.blockType.Results) + } + return +} + +func (c *Compiler) callListenerBefore() { + c.storeCallerModuleContext() + + builder := c.ssaBuilder + beforeListeners1stElement := builder.AllocateInstruction(). + AsLoad(c.moduleCtxPtrValue, + c.offset.BeforeListenerTrampolines1stElement.U32(), + ssa.TypeI64, + ).Insert(builder).Return() + + beforeListenerPtr := builder.AllocateInstruction(). + AsLoad(beforeListeners1stElement, uint32(c.wasmFunctionTypeIndex)*8 /* 8 bytes per index */, ssa.TypeI64).Insert(builder).Return() + + entry := builder.EntryBlock() + ps := entry.Params() + + args := c.allocateVarLengthValues(ps, c.execCtxPtrValue, + builder.AllocateInstruction().AsIconst32(c.wasmLocalFunctionIndex).Insert(builder).Return()) + for i := 2; i < ps; i++ { + args = args.Append(builder.VarLengthPool(), entry.Param(i)) + } + + beforeSig := c.listenerSignatures[c.wasmFunctionTyp][0] + builder.AllocateInstruction(). + AsCallIndirect(beforeListenerPtr, beforeSig, args). + Insert(builder) +} + +func (c *Compiler) callListenerAfter() { + c.storeCallerModuleContext() + + builder := c.ssaBuilder + afterListeners1stElement := builder.AllocateInstruction(). 
+ AsLoad(c.moduleCtxPtrValue, + c.offset.AfterListenerTrampolines1stElement.U32(), + ssa.TypeI64, + ).Insert(builder).Return() + + afterListenerPtr := builder.AllocateInstruction(). + AsLoad(afterListeners1stElement, + uint32(c.wasmFunctionTypeIndex)*8 /* 8 bytes per index */, ssa.TypeI64). + Insert(builder). + Return() + + afterSig := c.listenerSignatures[c.wasmFunctionTyp][1] + args := c.allocateVarLengthValues( + c.results()+2, + c.execCtxPtrValue, + builder.AllocateInstruction().AsIconst32(c.wasmLocalFunctionIndex).Insert(builder).Return(), + ) + + l := c.state() + tail := len(l.values) + args = args.Append(c.ssaBuilder.VarLengthPool(), l.values[tail-c.results():tail]...) + builder.AllocateInstruction(). + AsCallIndirect(afterListenerPtr, afterSig, args). + Insert(builder) +} + +const ( + elementOrDataInstanceLenOffset = 8 + elementOrDataInstanceSize = 24 +) + +// dropInstance inserts instructions to drop the element/data instance specified by the given index. +func (c *Compiler) dropDataOrElementInstance(index uint32, firstItemOffset wazevoapi.Offset) { + builder := c.ssaBuilder + instPtr := c.dataOrElementInstanceAddr(index, firstItemOffset) + + zero := builder.AllocateInstruction().AsIconst64(0).Insert(builder).Return() + + // Clear the instance. + builder.AllocateInstruction().AsStore(ssa.OpcodeStore, zero, instPtr, 0).Insert(builder) + builder.AllocateInstruction().AsStore(ssa.OpcodeStore, zero, instPtr, elementOrDataInstanceLenOffset).Insert(builder) + builder.AllocateInstruction().AsStore(ssa.OpcodeStore, zero, instPtr, elementOrDataInstanceLenOffset+8).Insert(builder) +} + +func (c *Compiler) dataOrElementInstanceAddr(index uint32, firstItemOffset wazevoapi.Offset) ssa.Value { + builder := c.ssaBuilder + + _1stItemPtr := builder. + AllocateInstruction(). + AsLoad(c.moduleCtxPtrValue, firstItemOffset.U32(), ssa.TypeI64). + Insert(builder).Return() + + // Each data/element instance is a slice, so we need to multiply index by 16 to get the offset of the target instance. + index = index * elementOrDataInstanceSize + indexExt := builder.AllocateInstruction().AsIconst64(uint64(index)).Insert(builder).Return() + // Then, add the offset to the address of the instance. + instPtr := builder.AllocateInstruction().AsIadd(_1stItemPtr, indexExt).Insert(builder).Return() + return instPtr +} + +func (c *Compiler) boundsCheckInDataOrElementInstance(instPtr, offsetInInstance, copySize ssa.Value, exitCode wazevoapi.ExitCode) { + builder := c.ssaBuilder + dataInstLen := builder.AllocateInstruction(). + AsLoad(instPtr, elementOrDataInstanceLenOffset, ssa.TypeI64). + Insert(builder).Return() + ceil := builder.AllocateInstruction().AsIadd(offsetInInstance, copySize).Insert(builder).Return() + cmp := builder.AllocateInstruction(). + AsIcmp(dataInstLen, ceil, ssa.IntegerCmpCondUnsignedLessThan). + Insert(builder). + Return() + builder.AllocateInstruction(). + AsExitIfTrueWithCode(c.execCtxPtrValue, cmp, exitCode). + Insert(builder) +} + +func (c *Compiler) boundsCheckInTable(tableIndex uint32, offset, size ssa.Value) (tableInstancePtr ssa.Value) { + builder := c.ssaBuilder + dstCeil := builder.AllocateInstruction().AsIadd(offset, size).Insert(builder).Return() + + // Load the table. + tableInstancePtr = builder.AllocateInstruction(). + AsLoad(c.moduleCtxPtrValue, c.offset.TableOffset(int(tableIndex)).U32(), ssa.TypeI64). + Insert(builder).Return() + + // Load the table's length. + tableLen := builder.AllocateInstruction(). 
+ AsLoad(tableInstancePtr, tableInstanceLenOffset, ssa.TypeI32).Insert(builder).Return() + tableLenExt := builder.AllocateInstruction().AsUExtend(tableLen, 32, 64).Insert(builder).Return() + + // Compare the length and the target, and trap if out of bounds. + checkOOB := builder.AllocateInstruction() + checkOOB.AsIcmp(tableLenExt, dstCeil, ssa.IntegerCmpCondUnsignedLessThan) + builder.InsertInstruction(checkOOB) + exitIfOOB := builder.AllocateInstruction() + exitIfOOB.AsExitIfTrueWithCode(c.execCtxPtrValue, checkOOB.Return(), wazevoapi.ExitCodeTableOutOfBounds) + builder.InsertInstruction(exitIfOOB) + return +} + +func (c *Compiler) loadTableBaseAddr(tableInstancePtr ssa.Value) ssa.Value { + builder := c.ssaBuilder + loadTableBaseAddress := builder. + AllocateInstruction(). + AsLoad(tableInstancePtr, tableInstanceBaseAddressOffset, ssa.TypeI64). + Insert(builder) + return loadTableBaseAddress.Return() +} + +func (c *Compiler) boundsCheckInMemory(memLen, offset, size ssa.Value) { + builder := c.ssaBuilder + ceil := builder.AllocateInstruction().AsIadd(offset, size).Insert(builder).Return() + cmp := builder.AllocateInstruction(). + AsIcmp(memLen, ceil, ssa.IntegerCmpCondUnsignedLessThan). + Insert(builder). + Return() + builder.AllocateInstruction(). + AsExitIfTrueWithCode(c.execCtxPtrValue, cmp, wazevoapi.ExitCodeMemoryOutOfBounds). + Insert(builder) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/misc.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/misc.go new file mode 100644 index 000000000..2db2b892c --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/misc.go @@ -0,0 +1,10 @@ +package frontend + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/wasm" +) + +func FunctionIndexToFuncRef(idx wasm.Index) ssa.FuncRef { + return ssa.FuncRef(idx) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/sort_id.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/sort_id.go new file mode 100644 index 000000000..1296706f5 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/sort_id.go @@ -0,0 +1,15 @@ +//go:build go1.21 + +package frontend + +import ( + "slices" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +func sortSSAValueIDs(IDs []ssa.ValueID) { + slices.SortFunc(IDs, func(i, j ssa.ValueID) int { + return int(i) - int(j) + }) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/sort_id_old.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/sort_id_old.go new file mode 100644 index 000000000..2e786a160 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/frontend/sort_id_old.go @@ -0,0 +1,17 @@ +//go:build !go1.21 + +// TODO: delete after the floor Go version is 1.21 + +package frontend + +import ( + "sort" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" +) + +func sortSSAValueIDs(IDs []ssa.ValueID) { + sort.SliceStable(IDs, func(i, j int) bool { + return int(IDs[i]) < int(IDs[j]) + }) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/hostmodule.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/hostmodule.go new file mode 100644 index 000000000..8da7347a9 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/hostmodule.go @@ -0,0 +1,82 @@ +package wazevo + +import ( + 
"encoding/binary" + "reflect" + "unsafe" + + "github.com/tetratelabs/wazero/experimental" + "github.com/tetratelabs/wazero/internal/wasm" +) + +func buildHostModuleOpaque(m *wasm.Module, listeners []experimental.FunctionListener) moduleContextOpaque { + size := len(m.CodeSection)*16 + 32 + ret := newAlignedOpaque(size) + + binary.LittleEndian.PutUint64(ret[0:], uint64(uintptr(unsafe.Pointer(m)))) + + if len(listeners) > 0 { + sliceHeader := (*reflect.SliceHeader)(unsafe.Pointer(&listeners)) + binary.LittleEndian.PutUint64(ret[8:], uint64(sliceHeader.Data)) + binary.LittleEndian.PutUint64(ret[16:], uint64(sliceHeader.Len)) + binary.LittleEndian.PutUint64(ret[24:], uint64(sliceHeader.Cap)) + } + + offset := 32 + for i := range m.CodeSection { + goFn := m.CodeSection[i].GoFunc + writeIface(goFn, ret[offset:]) + offset += 16 + } + return ret +} + +func hostModuleFromOpaque(opaqueBegin uintptr) *wasm.Module { + var opaqueViewOverSlice []byte + sh := (*reflect.SliceHeader)(unsafe.Pointer(&opaqueViewOverSlice)) + sh.Data = opaqueBegin + sh.Len = 32 + sh.Cap = 32 + return *(**wasm.Module)(unsafe.Pointer(&opaqueViewOverSlice[0])) +} + +func hostModuleListenersSliceFromOpaque(opaqueBegin uintptr) []experimental.FunctionListener { + var opaqueViewOverSlice []byte + sh := (*reflect.SliceHeader)(unsafe.Pointer(&opaqueViewOverSlice)) + sh.Data = opaqueBegin + sh.Len = 32 + sh.Cap = 32 + + b := binary.LittleEndian.Uint64(opaqueViewOverSlice[8:]) + l := binary.LittleEndian.Uint64(opaqueViewOverSlice[16:]) + c := binary.LittleEndian.Uint64(opaqueViewOverSlice[24:]) + var ret []experimental.FunctionListener + sh = (*reflect.SliceHeader)(unsafe.Pointer(&ret)) + sh.Data = uintptr(b) + setSliceLimits(sh, uintptr(l), uintptr(c)) + return ret +} + +func hostModuleGoFuncFromOpaque[T any](index int, opaqueBegin uintptr) T { + offset := uintptr(index*16) + 32 + ptr := opaqueBegin + offset + + var opaqueViewOverFunction []byte + sh := (*reflect.SliceHeader)(unsafe.Pointer(&opaqueViewOverFunction)) + sh.Data = ptr + sh.Len = 16 + sh.Cap = 16 + return readIface(opaqueViewOverFunction).(T) +} + +func writeIface(goFn interface{}, buf []byte) { + goFnIface := *(*[2]uint64)(unsafe.Pointer(&goFn)) + binary.LittleEndian.PutUint64(buf, goFnIface[0]) + binary.LittleEndian.PutUint64(buf[8:], goFnIface[1]) +} + +func readIface(buf []byte) interface{} { + b := binary.LittleEndian.Uint64(buf) + s := binary.LittleEndian.Uint64(buf[8:]) + return *(*interface{})(unsafe.Pointer(&[2]uint64{b, s})) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/isa_amd64.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/isa_amd64.go new file mode 100644 index 000000000..da27cc108 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/isa_amd64.go @@ -0,0 +1,30 @@ +//go:build amd64 + +package wazevo + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/amd64" +) + +func newMachine() backend.Machine { + return amd64.NewBackend() +} + +// unwindStack is a function to unwind the stack, and appends return addresses to `returnAddresses` slice. +// The implementation must be aligned with the ABI/Calling convention. 
+func unwindStack(sp, fp, top uintptr, returnAddresses []uintptr) []uintptr { + return amd64.UnwindStack(sp, fp, top, returnAddresses) +} + +// goCallStackView is a function to get a view of the stack before a Go call, which +// is the view of the stack allocated in CompileGoFunctionTrampoline. +func goCallStackView(stackPointerBeforeGoCall *uint64) []uint64 { + return amd64.GoCallStackView(stackPointerBeforeGoCall) +} + +// adjustClonedStack is a function to adjust the stack after it is grown. +// More precisely, absolute addresses (frame pointers) in the stack must be adjusted. +func adjustClonedStack(oldsp, oldTop, sp, fp, top uintptr) { + amd64.AdjustClonedStack(oldsp, oldTop, sp, fp, top) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/isa_arm64.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/isa_arm64.go new file mode 100644 index 000000000..e7a846548 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/isa_arm64.go @@ -0,0 +1,32 @@ +//go:build arm64 + +package wazevo + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/isa/arm64" +) + +func newMachine() backend.Machine { + return arm64.NewBackend() +} + +// unwindStack is a function to unwind the stack, and appends return addresses to `returnAddresses` slice. +// The implementation must be aligned with the ABI/Calling convention. +func unwindStack(sp, fp, top uintptr, returnAddresses []uintptr) []uintptr { + return arm64.UnwindStack(sp, fp, top, returnAddresses) +} + +// goCallStackView is a function to get a view of the stack before a Go call, which +// is the view of the stack allocated in CompileGoFunctionTrampoline. +func goCallStackView(stackPointerBeforeGoCall *uint64) []uint64 { + return arm64.GoCallStackView(stackPointerBeforeGoCall) +} + +// adjustClonedStack is a function to adjust the stack after it is grown. +// More precisely, absolute addresses (frame pointers) in the stack must be adjusted. +func adjustClonedStack(oldsp, oldTop, sp, fp, top uintptr) { + // TODO: currently, the frame pointers are not used, and saved old sps are relative to the current stack pointer, + // so no need to adjustment on arm64. However, when we make it absolute, which in my opinion is better perf-wise + // at the expense of slightly costly stack growth, we need to adjust the pushed frame pointers. +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/isa_other.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/isa_other.go new file mode 100644 index 000000000..c5afc6314 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/isa_other.go @@ -0,0 +1,29 @@ +//go:build !(amd64 || arm64) + +package wazevo + +import ( + "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" +) + +func newMachine() backend.Machine { + panic("unsupported architecture") +} + +// unwindStack is a function to unwind the stack, and appends return addresses to `returnAddresses` slice. +// The implementation must be aligned with the ABI/Calling convention. +func unwindStack(sp, fp, top uintptr, returnAddresses []uintptr) []uintptr { + panic("unsupported architecture") +} + +// goCallStackView is a function to get a view of the stack before a Go call, which +// is the view of the stack allocated in CompileGoFunctionTrampoline. 
+func goCallStackView(stackPointerBeforeGoCall *uint64) []uint64 { + panic("unsupported architecture") +} + +// adjustClonedStack is a function to adjust the stack after it is grown. +// More precisely, absolute addresses (frame pointers) in the stack must be adjusted. +func adjustClonedStack(oldsp, oldTop, sp, fp, top uintptr) { + panic("unsupported architecture") +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/memmove.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/memmove.go new file mode 100644 index 000000000..889922107 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/memmove.go @@ -0,0 +1,11 @@ +package wazevo + +import ( + "reflect" + "unsafe" +) + +//go:linkname memmove runtime.memmove +func memmove(_, _ unsafe.Pointer, _ uintptr) + +var memmovPtr = reflect.ValueOf(memmove).Pointer() diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/module_engine.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/module_engine.go new file mode 100644 index 000000000..ba8f546c0 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/module_engine.go @@ -0,0 +1,344 @@ +package wazevo + +import ( + "encoding/binary" + "unsafe" + + "github.com/tetratelabs/wazero/api" + "github.com/tetratelabs/wazero/experimental" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" + "github.com/tetratelabs/wazero/internal/wasm" + "github.com/tetratelabs/wazero/internal/wasmruntime" +) + +type ( + // moduleEngine implements wasm.ModuleEngine. + moduleEngine struct { + // opaquePtr equals &opaque[0]. + opaquePtr *byte + parent *compiledModule + module *wasm.ModuleInstance + opaque moduleContextOpaque + localFunctionInstances []*functionInstance + importedFunctions []importedFunction + listeners []experimental.FunctionListener + } + + functionInstance struct { + executable *byte + moduleContextOpaquePtr *byte + typeID wasm.FunctionTypeID + indexInModule wasm.Index + } + + importedFunction struct { + me *moduleEngine + indexInModule wasm.Index + } + + // moduleContextOpaque is the opaque byte slice of Module instance specific contents whose size + // is only Wasm-compile-time known, hence dynamic. Its contents are basically the pointers to the module instance, + // specific objects as well as functions. This is sometimes called "VMContext" in other Wasm runtimes. + // + // Internally, the buffer is structured as follows: + // + // type moduleContextOpaque struct { + // moduleInstance *wasm.ModuleInstance + // localMemoryBufferPtr *byte (optional) + // localMemoryLength uint64 (optional) + // importedMemoryInstance *wasm.MemoryInstance (optional) + // importedMemoryOwnerOpaqueCtx *byte (optional) + // importedFunctions [# of importedFunctions]functionInstance + // importedGlobals []ImportedGlobal (optional) + // localGlobals []Global (optional) + // typeIDsBegin &wasm.ModuleInstance.TypeIDs[0] (optional) + // tables []*wasm.TableInstance (optional) + // beforeListenerTrampolines1stElement **byte (optional) + // afterListenerTrampolines1stElement **byte (optional) + // dataInstances1stElement []wasm.DataInstance (optional) + // elementInstances1stElement []wasm.ElementInstance (optional) + // } + // + // type ImportedGlobal struct { + // *Global + // _ uint64 // padding + // } + // + // type Global struct { + // Val, ValHi uint64 + // } + // + // See wazevoapi.NewModuleContextOffsetData for the details of the offsets. 
+ // + // Note that for host modules, the structure is entirely different. See buildHostModuleOpaque. + moduleContextOpaque []byte +) + +func newAlignedOpaque(size int) moduleContextOpaque { + // Check if the size is a multiple of 16. + if size%16 != 0 { + panic("size must be a multiple of 16") + } + buf := make([]byte, size+16) + // Align the buffer to 16 bytes. + rem := uintptr(unsafe.Pointer(&buf[0])) % 16 + buf = buf[16-rem:] + return buf +} + +func putLocalMemory(opaque []byte, offset wazevoapi.Offset, mem *wasm.MemoryInstance) { + s := uint64(len(mem.Buffer)) + var b uint64 + if len(mem.Buffer) > 0 { + b = uint64(uintptr(unsafe.Pointer(&mem.Buffer[0]))) + } + binary.LittleEndian.PutUint64(opaque[offset:], b) + binary.LittleEndian.PutUint64(opaque[offset+8:], s) +} + +func (m *moduleEngine) setupOpaque() { + inst := m.module + offsets := &m.parent.offsets + opaque := m.opaque + + binary.LittleEndian.PutUint64(opaque[offsets.ModuleInstanceOffset:], + uint64(uintptr(unsafe.Pointer(m.module))), + ) + + if lm := offsets.LocalMemoryBegin; lm >= 0 { + putLocalMemory(opaque, lm, inst.MemoryInstance) + } + + // Note: imported memory is resolved in ResolveImportedFunction. + + // Note: imported functions are resolved in ResolveImportedFunction. + + if globalOffset := offsets.GlobalsBegin; globalOffset >= 0 { + for i, g := range inst.Globals { + if i < int(inst.Source.ImportGlobalCount) { + importedME := g.Me.(*moduleEngine) + offset := importedME.parent.offsets.GlobalInstanceOffset(g.Index) + importedMEOpaque := importedME.opaque + binary.LittleEndian.PutUint64(opaque[globalOffset:], + uint64(uintptr(unsafe.Pointer(&importedMEOpaque[offset])))) + } else { + binary.LittleEndian.PutUint64(opaque[globalOffset:], g.Val) + binary.LittleEndian.PutUint64(opaque[globalOffset+8:], g.ValHi) + } + globalOffset += 16 + } + } + + if tableOffset := offsets.TablesBegin; tableOffset >= 0 { + // First we write the first element's address of typeIDs. + if len(inst.TypeIDs) > 0 { + binary.LittleEndian.PutUint64(opaque[offsets.TypeIDs1stElement:], uint64(uintptr(unsafe.Pointer(&inst.TypeIDs[0])))) + } + + // Then we write the table addresses. + for _, table := range inst.Tables { + binary.LittleEndian.PutUint64(opaque[tableOffset:], uint64(uintptr(unsafe.Pointer(table)))) + tableOffset += 8 + } + } + + if beforeListenerOffset := offsets.BeforeListenerTrampolines1stElement; beforeListenerOffset >= 0 { + binary.LittleEndian.PutUint64(opaque[beforeListenerOffset:], uint64(uintptr(unsafe.Pointer(&m.parent.listenerBeforeTrampolines[0])))) + } + if afterListenerOffset := offsets.AfterListenerTrampolines1stElement; afterListenerOffset >= 0 { + binary.LittleEndian.PutUint64(opaque[afterListenerOffset:], uint64(uintptr(unsafe.Pointer(&m.parent.listenerAfterTrampolines[0])))) + } + if len(inst.DataInstances) > 0 { + binary.LittleEndian.PutUint64(opaque[offsets.DataInstances1stElement:], uint64(uintptr(unsafe.Pointer(&inst.DataInstances[0])))) + } + if len(inst.ElementInstances) > 0 { + binary.LittleEndian.PutUint64(opaque[offsets.ElementInstances1stElement:], uint64(uintptr(unsafe.Pointer(&inst.ElementInstances[0])))) + } +} + +// NewFunction implements wasm.ModuleEngine. 
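Aside: newAlignedOpaque above over-allocates by 16 bytes and slices forward so that the opaque context begins on a 16-byte boundary. A minimal standalone sketch of that alignment trick, with invented names and not part of the vendored file:

package main

import (
	"fmt"
	"unsafe"
)

// alignedBuffer returns a size-byte slice whose first element sits on a
// 16-byte boundary, by over-allocating and slicing past the misalignment.
func alignedBuffer(size int) []byte {
	if size%16 != 0 {
		panic("size must be a multiple of 16")
	}
	buf := make([]byte, size+16)
	off := 16 - uintptr(unsafe.Pointer(&buf[0]))%16
	return buf[off : off+uintptr(size)]
}

func main() {
	b := alignedBuffer(64)
	fmt.Println(uintptr(unsafe.Pointer(&b[0]))%16 == 0, len(b)) // true 64
}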
+func (m *moduleEngine) NewFunction(index wasm.Index) api.Function { + if wazevoapi.PrintMachineCodeHexPerFunctionDisassemblable { + panic("When PrintMachineCodeHexPerFunctionDisassemblable enabled, functions must not be called") + } + + localIndex := index + if importedFnCount := m.module.Source.ImportFunctionCount; index < importedFnCount { + imported := &m.importedFunctions[index] + return imported.me.NewFunction(imported.indexInModule) + } else { + localIndex -= importedFnCount + } + + src := m.module.Source + typIndex := src.FunctionSection[localIndex] + typ := src.TypeSection[typIndex] + sizeOfParamResultSlice := typ.ResultNumInUint64 + if ps := typ.ParamNumInUint64; ps > sizeOfParamResultSlice { + sizeOfParamResultSlice = ps + } + p := m.parent + offset := p.functionOffsets[localIndex] + + ce := &callEngine{ + indexInModule: index, + executable: &p.executable[offset], + parent: m, + preambleExecutable: &m.parent.entryPreambles[typIndex][0], + sizeOfParamResultSlice: sizeOfParamResultSlice, + requiredParams: typ.ParamNumInUint64, + numberOfResults: typ.ResultNumInUint64, + } + + ce.execCtx.memoryGrowTrampolineAddress = &m.parent.sharedFunctions.memoryGrowExecutable[0] + ce.execCtx.stackGrowCallTrampolineAddress = &m.parent.sharedFunctions.stackGrowExecutable[0] + ce.execCtx.checkModuleExitCodeTrampolineAddress = &m.parent.sharedFunctions.checkModuleExitCode[0] + ce.execCtx.tableGrowTrampolineAddress = &m.parent.sharedFunctions.tableGrowExecutable[0] + ce.execCtx.refFuncTrampolineAddress = &m.parent.sharedFunctions.refFuncExecutable[0] + ce.execCtx.memoryWait32TrampolineAddress = &m.parent.sharedFunctions.memoryWait32Executable[0] + ce.execCtx.memoryWait64TrampolineAddress = &m.parent.sharedFunctions.memoryWait64Executable[0] + ce.execCtx.memoryNotifyTrampolineAddress = &m.parent.sharedFunctions.memoryNotifyExecutable[0] + ce.execCtx.memmoveAddress = memmovPtr + ce.init() + return ce +} + +// GetGlobalValue implements the same method as documented on wasm.ModuleEngine. +func (m *moduleEngine) GetGlobalValue(i wasm.Index) (lo, hi uint64) { + offset := m.parent.offsets.GlobalInstanceOffset(i) + buf := m.opaque[offset:] + if i < m.module.Source.ImportGlobalCount { + panic("GetGlobalValue should not be called for imported globals") + } + return binary.LittleEndian.Uint64(buf), binary.LittleEndian.Uint64(buf[8:]) +} + +// SetGlobalValue implements the same method as documented on wasm.ModuleEngine. +func (m *moduleEngine) SetGlobalValue(i wasm.Index, lo, hi uint64) { + offset := m.parent.offsets.GlobalInstanceOffset(i) + buf := m.opaque[offset:] + if i < m.module.Source.ImportGlobalCount { + panic("GetGlobalValue should not be called for imported globals") + } + binary.LittleEndian.PutUint64(buf, lo) + binary.LittleEndian.PutUint64(buf[8:], hi) +} + +// OwnsGlobals implements the same method as documented on wasm.ModuleEngine. +func (m *moduleEngine) OwnsGlobals() bool { return true } + +// ResolveImportedFunction implements wasm.ModuleEngine. 
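Aside: GetGlobalValue and SetGlobalValue above treat each non-imported global as a 16-byte little-endian pair (Val, ValHi) inside the opaque buffer. A standalone sketch of that encoding, using hypothetical helper names rather than the real engine types:

package main

import (
	"encoding/binary"
	"fmt"
)

// setGlobal writes the 128-bit (lo, hi) pair into the global's 16-byte slot.
func setGlobal(opaque []byte, offset int, lo, hi uint64) {
	binary.LittleEndian.PutUint64(opaque[offset:], lo)
	binary.LittleEndian.PutUint64(opaque[offset+8:], hi)
}

// getGlobal reads the pair back from the same slot.
func getGlobal(opaque []byte, offset int) (lo, hi uint64) {
	return binary.LittleEndian.Uint64(opaque[offset:]), binary.LittleEndian.Uint64(opaque[offset+8:])
}

func main() {
	opaque := make([]byte, 64)
	setGlobal(opaque, 32, 0xdeadbeef, 1)
	fmt.Println(getGlobal(opaque, 32)) // 3735928559 1
}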
+func (m *moduleEngine) ResolveImportedFunction(index, indexInImportedModule wasm.Index, importedModuleEngine wasm.ModuleEngine) { + executableOffset, moduleCtxOffset, typeIDOffset := m.parent.offsets.ImportedFunctionOffset(index) + importedME := importedModuleEngine.(*moduleEngine) + + if int(indexInImportedModule) >= len(importedME.importedFunctions) { + indexInImportedModule -= wasm.Index(len(importedME.importedFunctions)) + } else { + imported := &importedME.importedFunctions[indexInImportedModule] + m.ResolveImportedFunction(index, imported.indexInModule, imported.me) + return // Recursively resolve the imported function. + } + + offset := importedME.parent.functionOffsets[indexInImportedModule] + typeID := getTypeIDOf(indexInImportedModule, importedME.module) + executable := &importedME.parent.executable[offset] + // Write functionInstance. + binary.LittleEndian.PutUint64(m.opaque[executableOffset:], uint64(uintptr(unsafe.Pointer(executable)))) + binary.LittleEndian.PutUint64(m.opaque[moduleCtxOffset:], uint64(uintptr(unsafe.Pointer(importedME.opaquePtr)))) + binary.LittleEndian.PutUint64(m.opaque[typeIDOffset:], uint64(typeID)) + + // Write importedFunction so that it can be used by NewFunction. + m.importedFunctions[index] = importedFunction{me: importedME, indexInModule: indexInImportedModule} +} + +func getTypeIDOf(funcIndex wasm.Index, m *wasm.ModuleInstance) wasm.FunctionTypeID { + source := m.Source + + var typeIndex wasm.Index + if funcIndex >= source.ImportFunctionCount { + funcIndex -= source.ImportFunctionCount + typeIndex = source.FunctionSection[funcIndex] + } else { + var cnt wasm.Index + for i := range source.ImportSection { + if source.ImportSection[i].Type == wasm.ExternTypeFunc { + if cnt == funcIndex { + typeIndex = source.ImportSection[i].DescFunc + break + } + cnt++ + } + } + } + return m.TypeIDs[typeIndex] +} + +// ResolveImportedMemory implements wasm.ModuleEngine. +func (m *moduleEngine) ResolveImportedMemory(importedModuleEngine wasm.ModuleEngine) { + importedME := importedModuleEngine.(*moduleEngine) + inst := importedME.module + + var memInstPtr uint64 + var memOwnerOpaquePtr uint64 + if offs := importedME.parent.offsets; offs.ImportedMemoryBegin >= 0 { + offset := offs.ImportedMemoryBegin + memInstPtr = binary.LittleEndian.Uint64(importedME.opaque[offset:]) + memOwnerOpaquePtr = binary.LittleEndian.Uint64(importedME.opaque[offset+8:]) + } else { + memInstPtr = uint64(uintptr(unsafe.Pointer(inst.MemoryInstance))) + memOwnerOpaquePtr = uint64(uintptr(unsafe.Pointer(importedME.opaquePtr))) + } + offset := m.parent.offsets.ImportedMemoryBegin + binary.LittleEndian.PutUint64(m.opaque[offset:], memInstPtr) + binary.LittleEndian.PutUint64(m.opaque[offset+8:], memOwnerOpaquePtr) +} + +// DoneInstantiation implements wasm.ModuleEngine. +func (m *moduleEngine) DoneInstantiation() { + if !m.module.Source.IsHostModule { + m.setupOpaque() + } +} + +// FunctionInstanceReference implements wasm.ModuleEngine. 
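Aside: getTypeIDOf above handles the Wasm convention that function indexes cover imported functions first and locally-defined functions second. The simplified, self-contained sketch below (hypothetical types, not the real wasm.Module) shows just that index split:

package main

import "fmt"

// module is a toy stand-in: type index per imported function, followed by
// the type index per locally-defined function (the "function section").
type module struct {
	importedFuncTypes []uint32
	functionSection   []uint32
}

// typeIndexOf resolves a function index (imports first, then locals) to its type index.
func typeIndexOf(m *module, funcIndex uint32) uint32 {
	if n := uint32(len(m.importedFuncTypes)); funcIndex < n {
		return m.importedFuncTypes[funcIndex]
	} else {
		return m.functionSection[funcIndex-n]
	}
}

func main() {
	m := &module{importedFuncTypes: []uint32{3}, functionSection: []uint32{0, 1}}
	fmt.Println(typeIndexOf(m, 0), typeIndexOf(m, 2)) // 3 1
}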
+func (m *moduleEngine) FunctionInstanceReference(funcIndex wasm.Index) wasm.Reference { + if funcIndex < m.module.Source.ImportFunctionCount { + begin, _, _ := m.parent.offsets.ImportedFunctionOffset(funcIndex) + return uintptr(unsafe.Pointer(&m.opaque[begin])) + } + localIndex := funcIndex - m.module.Source.ImportFunctionCount + p := m.parent + executable := &p.executable[p.functionOffsets[localIndex]] + typeID := m.module.TypeIDs[m.module.Source.FunctionSection[localIndex]] + + lf := &functionInstance{ + executable: executable, + moduleContextOpaquePtr: m.opaquePtr, + typeID: typeID, + indexInModule: funcIndex, + } + m.localFunctionInstances = append(m.localFunctionInstances, lf) + return uintptr(unsafe.Pointer(lf)) +} + +// LookupFunction implements wasm.ModuleEngine. +func (m *moduleEngine) LookupFunction(t *wasm.TableInstance, typeId wasm.FunctionTypeID, tableOffset wasm.Index) (*wasm.ModuleInstance, wasm.Index) { + if tableOffset >= uint32(len(t.References)) || t.Type != wasm.RefTypeFuncref { + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + } + rawPtr := t.References[tableOffset] + if rawPtr == 0 { + panic(wasmruntime.ErrRuntimeInvalidTableAccess) + } + + tf := wazevoapi.PtrFromUintptr[functionInstance](rawPtr) + if tf.typeID != typeId { + panic(wasmruntime.ErrRuntimeIndirectCallTypeMismatch) + } + return moduleInstanceFromOpaquePtr(tf.moduleContextOpaquePtr), tf.indexInModule +} + +func moduleInstanceFromOpaquePtr(ptr *byte) *wasm.ModuleInstance { + return *(**wasm.ModuleInstance)(unsafe.Pointer(ptr)) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/reflect.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/reflect.go new file mode 100644 index 000000000..6a03fc65c --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/reflect.go @@ -0,0 +1,11 @@ +//go:build !tinygo + +package wazevo + +import "reflect" + +// setSliceLimits sets both Cap and Len for the given reflected slice. +func setSliceLimits(s *reflect.SliceHeader, l, c uintptr) { + s.Len = int(l) + s.Cap = int(c) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/reflect_tinygo.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/reflect_tinygo.go new file mode 100644 index 000000000..eda3e706a --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/reflect_tinygo.go @@ -0,0 +1,11 @@ +//go:build tinygo + +package wazevo + +import "reflect" + +// setSliceLimits sets both Cap and Len for the given reflected slice. +func setSliceLimits(s *reflect.SliceHeader, l, c uintptr) { + s.Len = l + s.Cap = c +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block.go new file mode 100644 index 000000000..10b6b4b62 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block.go @@ -0,0 +1,407 @@ +package ssa + +import ( + "fmt" + "strconv" + "strings" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// BasicBlock represents the Basic Block of an SSA function. +// Each BasicBlock always ends with branching instructions (e.g. Branch, Return, etc.), +// and at most two branches are allowed. If there's two branches, these two are placed together at the end of the block. +// In other words, there's no branching instruction in the middle of the block. +// +// Note: we use the "block argument" variant of SSA, instead of PHI functions. 
See the package level doc comments. +// +// Note: we use "parameter/param" as a placeholder which represents a variant of PHI, and "argument/arg" as an actual +// Value passed to that "parameter/param". +type BasicBlock interface { + // ID returns the unique ID of this block. + ID() BasicBlockID + + // Name returns the unique string ID of this block. e.g. blk0, blk1, ... + Name() string + + // AddParam adds the parameter to the block whose type specified by `t`. + AddParam(b Builder, t Type) Value + + // Params returns the number of parameters to this block. + Params() int + + // Param returns (Variable, Value) which corresponds to the i-th parameter of this block. + // The returned Value is the definition of the param in this block. + Param(i int) Value + + // InsertInstruction inserts an instruction that implements Value into the tail of this block. + InsertInstruction(raw *Instruction) + + // Root returns the root instruction of this block. + Root() *Instruction + + // Tail returns the tail instruction of this block. + Tail() *Instruction + + // EntryBlock returns true if this block represents the function entry. + EntryBlock() bool + + // ReturnBlock returns ture if this block represents the function return. + ReturnBlock() bool + + // FormatHeader returns the debug string of this block, not including instruction. + FormatHeader(b Builder) string + + // Valid is true if this block is still valid even after optimizations. + Valid() bool + + // Sealed is true if this block has been sealed. + Sealed() bool + + // BeginPredIterator returns the first predecessor of this block. + BeginPredIterator() BasicBlock + + // NextPredIterator returns the next predecessor of this block. + NextPredIterator() BasicBlock + + // Preds returns the number of predecessors of this block. + Preds() int + + // Pred returns the i-th predecessor of this block. + Pred(i int) BasicBlock + + // Succs returns the number of successors of this block. + Succs() int + + // Succ returns the i-th successor of this block. + Succ(i int) BasicBlock + + // LoopHeader returns true if this block is a loop header. + LoopHeader() bool + + // LoopNestingForestChildren returns the children of this block in the loop nesting forest. + LoopNestingForestChildren() []BasicBlock +} + +type ( + // basicBlock is a basic block in a SSA-transformed function. + basicBlock struct { + id BasicBlockID + rootInstr, currentInstr *Instruction + params []blockParam + predIter int + preds []basicBlockPredecessorInfo + success []*basicBlock + // singlePred is the alias to preds[0] for fast lookup, and only set after Seal is called. + singlePred *basicBlock + // lastDefinitions maps Variable to its last definition in this block. + lastDefinitions map[Variable]Value + // unknownsValues are used in builder.findValue. The usage is well-described in the paper. + unknownValues []unknownValue + // invalid is true if this block is made invalid during optimizations. + invalid bool + // sealed is true if this is sealed (all the predecessors are known). + sealed bool + // loopHeader is true if this block is a loop header: + // + // > A loop header (sometimes called the entry point of the loop) is a dominator that is the target + // > of a loop-forming back edge. The loop header dominates all blocks in the loop body. + // > A block may be a loop header for more than one loop. A loop may have multiple entry points, + // > in which case it has no "loop header". + // + // See https://en.wikipedia.org/wiki/Control-flow_graph for more details. 
+ // + // This is modified during the subPassLoopDetection pass. + loopHeader bool + + // loopNestingForestChildren holds the children of this block in the loop nesting forest. + // Non-empty if and only if this block is a loop header (i.e. loopHeader=true) + loopNestingForestChildren []BasicBlock + + // reversePostOrder is used to sort all the blocks in the function in reverse post order. + // This is used in builder.LayoutBlocks. + reversePostOrder int + + // child and sibling are the ones in the dominator tree. + child, sibling *basicBlock + } + // BasicBlockID is the unique ID of a basicBlock. + BasicBlockID uint32 + + // blockParam implements Value and represents a parameter to a basicBlock. + blockParam struct { + // value is the Value that corresponds to the parameter in this block, + // and can be considered as an output of PHI instruction in traditional SSA. + value Value + // typ is the type of the parameter. + typ Type + } + + unknownValue struct { + // variable is the variable that this unknownValue represents. + variable Variable + // value is the value that this unknownValue represents. + value Value + } +) + +const basicBlockIDReturnBlock = 0xffffffff + +// Name implements BasicBlock.Name. +func (bb *basicBlock) Name() string { + if bb.id == basicBlockIDReturnBlock { + return "blk_ret" + } else { + return fmt.Sprintf("blk%d", bb.id) + } +} + +// String implements fmt.Stringer for debugging. +func (bid BasicBlockID) String() string { + if bid == basicBlockIDReturnBlock { + return "blk_ret" + } else { + return fmt.Sprintf("blk%d", bid) + } +} + +// ID implements BasicBlock.ID. +func (bb *basicBlock) ID() BasicBlockID { + return bb.id +} + +// basicBlockPredecessorInfo is the information of a predecessor of a basicBlock. +// predecessor is determined by a pair of block and the branch instruction used to jump to the successor. +type basicBlockPredecessorInfo struct { + blk *basicBlock + branch *Instruction +} + +// EntryBlock implements BasicBlock.EntryBlock. +func (bb *basicBlock) EntryBlock() bool { + return bb.id == 0 +} + +// ReturnBlock implements BasicBlock.ReturnBlock. +func (bb *basicBlock) ReturnBlock() bool { + return bb.id == basicBlockIDReturnBlock +} + +// AddParam implements BasicBlock.AddParam. +func (bb *basicBlock) AddParam(b Builder, typ Type) Value { + paramValue := b.allocateValue(typ) + bb.params = append(bb.params, blockParam{typ: typ, value: paramValue}) + return paramValue +} + +// addParamOn adds a parameter to this block whose value is already allocated. +func (bb *basicBlock) addParamOn(typ Type, value Value) { + bb.params = append(bb.params, blockParam{typ: typ, value: value}) +} + +// Params implements BasicBlock.Params. +func (bb *basicBlock) Params() int { + return len(bb.params) +} + +// Param implements BasicBlock.Param. +func (bb *basicBlock) Param(i int) Value { + p := &bb.params[i] + return p.value +} + +// Valid implements BasicBlock.Valid. +func (bb *basicBlock) Valid() bool { + return !bb.invalid +} + +// Sealed implements BasicBlock.Sealed. +func (bb *basicBlock) Sealed() bool { + return bb.sealed +} + +// InsertInstruction implements BasicBlock.InsertInstruction. 
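Aside: as the BasicBlock documentation above notes, this SSA form uses block parameters plus branch arguments instead of phi nodes: a join block declares a parameter via AddParam, and every incoming branch supplies an argument for it. A tiny standalone illustration of that idea, with toy types rather than the ssa package:

package main

import "fmt"

// branch records which block it comes from and the arguments it passes to
// the target block's parameters (the phi operands, in traditional terms).
type branch struct {
	from string
	args []int
}

func main() {
	// joinBlk declares one parameter; each predecessor passes its own value for it.
	preds := []branch{{from: "blk1", args: []int{10}}, {from: "blk2", args: []int{20}}}
	taken := preds[1] // suppose control flow arrives from blk2
	fmt.Printf("param[0] of joinBlk = %d (from %s)\n", taken.args[0], taken.from)
}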
+func (bb *basicBlock) InsertInstruction(next *Instruction) { + current := bb.currentInstr + if current != nil { + current.next = next + next.prev = current + } else { + bb.rootInstr = next + } + bb.currentInstr = next + + switch next.opcode { + case OpcodeJump, OpcodeBrz, OpcodeBrnz: + target := next.blk.(*basicBlock) + target.addPred(bb, next) + case OpcodeBrTable: + for _, _target := range next.targets { + target := _target.(*basicBlock) + target.addPred(bb, next) + } + } +} + +// NumPreds implements BasicBlock.NumPreds. +func (bb *basicBlock) NumPreds() int { + return len(bb.preds) +} + +// BeginPredIterator implements BasicBlock.BeginPredIterator. +func (bb *basicBlock) BeginPredIterator() BasicBlock { + bb.predIter = 0 + return bb.NextPredIterator() +} + +// NextPredIterator implements BasicBlock.NextPredIterator. +func (bb *basicBlock) NextPredIterator() BasicBlock { + if bb.predIter >= len(bb.preds) { + return nil + } + pred := bb.preds[bb.predIter].blk + bb.predIter++ + return pred +} + +// Preds implements BasicBlock.Preds. +func (bb *basicBlock) Preds() int { + return len(bb.preds) +} + +// Pred implements BasicBlock.Pred. +func (bb *basicBlock) Pred(i int) BasicBlock { + return bb.preds[i].blk +} + +// Succs implements BasicBlock.Succs. +func (bb *basicBlock) Succs() int { + return len(bb.success) +} + +// Succ implements BasicBlock.Succ. +func (bb *basicBlock) Succ(i int) BasicBlock { + return bb.success[i] +} + +// Root implements BasicBlock.Root. +func (bb *basicBlock) Root() *Instruction { + return bb.rootInstr +} + +// Tail implements BasicBlock.Tail. +func (bb *basicBlock) Tail() *Instruction { + return bb.currentInstr +} + +// reset resets the basicBlock to its initial state so that it can be reused for another function. +func resetBasicBlock(bb *basicBlock) { + bb.params = bb.params[:0] + bb.rootInstr, bb.currentInstr = nil, nil + bb.preds = bb.preds[:0] + bb.success = bb.success[:0] + bb.invalid, bb.sealed = false, false + bb.singlePred = nil + bb.unknownValues = bb.unknownValues[:0] + bb.lastDefinitions = wazevoapi.ResetMap(bb.lastDefinitions) + bb.reversePostOrder = -1 + bb.loopNestingForestChildren = bb.loopNestingForestChildren[:0] + bb.loopHeader = false + bb.sibling = nil + bb.child = nil +} + +// addPred adds a predecessor to this block specified by the branch instruction. +func (bb *basicBlock) addPred(blk BasicBlock, branch *Instruction) { + if bb.sealed { + panic("BUG: trying to add predecessor to a sealed block: " + bb.Name()) + } + + pred := blk.(*basicBlock) + for i := range bb.preds { + existingPred := &bb.preds[i] + if existingPred.blk == pred && existingPred.branch != branch { + // If the target is already added, then this must come from the same BrTable, + // otherwise such redundant branch should be eliminated by the frontend. (which should be simpler). + panic(fmt.Sprintf("BUG: redundant non BrTable jumps in %s whose targes are the same", bb.Name())) + } + } + + bb.preds = append(bb.preds, basicBlockPredecessorInfo{ + blk: pred, + branch: branch, + }) + + pred.success = append(pred.success, bb) +} + +// FormatHeader implements BasicBlock.FormatHeader. 
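Aside: InsertInstruction and addPred above keep the CFG edges in sync while instructions are appended: inserting a branch records the edge on both the source (success) and the target (preds). The same bookkeeping in miniature, with toy types and not the vendored implementation:

package main

import "fmt"

// block holds only the edge lists needed for this sketch.
type block struct {
	name         string
	preds, succs []*block
}

// addBranch records a CFG edge on both endpoints, mirroring how inserting a
// jump/branch instruction calls addPred on the target block.
func addBranch(from, to *block) {
	to.preds = append(to.preds, from)
	from.succs = append(from.succs, to)
}

func main() {
	a, b := &block{name: "blk0"}, &block{name: "blk1"}
	addBranch(a, b)
	fmt.Println(a.succs[0].name, b.preds[0].name) // blk1 blk0
}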
+func (bb *basicBlock) FormatHeader(b Builder) string { + ps := make([]string, len(bb.params)) + for i, p := range bb.params { + ps[i] = p.value.formatWithType(b) + } + + if len(bb.preds) > 0 { + preds := make([]string, 0, len(bb.preds)) + for _, pred := range bb.preds { + if pred.blk.invalid { + continue + } + preds = append(preds, fmt.Sprintf("blk%d", pred.blk.id)) + + } + return fmt.Sprintf("blk%d: (%s) <-- (%s)", + bb.id, strings.Join(ps, ","), strings.Join(preds, ",")) + } else { + return fmt.Sprintf("blk%d: (%s)", bb.id, strings.Join(ps, ", ")) + } +} + +// validates validates the basicBlock for debugging purpose. +func (bb *basicBlock) validate(b *builder) { + if bb.invalid { + panic("BUG: trying to validate an invalid block: " + bb.Name()) + } + if len(bb.preds) > 0 { + for _, pred := range bb.preds { + if pred.branch.opcode != OpcodeBrTable { + if target := pred.branch.blk; target != bb { + panic(fmt.Sprintf("BUG: '%s' is not branch to %s, but to %s", + pred.branch.Format(b), bb.Name(), target.Name())) + } + } + + var exp int + if bb.ReturnBlock() { + exp = len(b.currentSignature.Results) + } else { + exp = len(bb.params) + } + + if len(pred.branch.vs.View()) != exp { + panic(fmt.Sprintf( + "BUG: len(argument at %s) != len(params at %s): %d != %d: %s", + pred.blk.Name(), bb.Name(), + len(pred.branch.vs.View()), len(bb.params), pred.branch.Format(b), + )) + } + + } + } +} + +// String implements fmt.Stringer for debugging purpose only. +func (bb *basicBlock) String() string { + return strconv.Itoa(int(bb.id)) +} + +// LoopNestingForestChildren implements BasicBlock.LoopNestingForestChildren. +func (bb *basicBlock) LoopNestingForestChildren() []BasicBlock { + return bb.loopNestingForestChildren +} + +// LoopHeader implements BasicBlock.LoopHeader. +func (bb *basicBlock) LoopHeader() bool { + return bb.loopHeader +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block_sort.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block_sort.go new file mode 100644 index 000000000..e1471edc3 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block_sort.go @@ -0,0 +1,34 @@ +//go:build go1.21 + +package ssa + +import ( + "slices" +) + +func sortBlocks(blocks []*basicBlock) { + slices.SortFunc(blocks, func(i, j *basicBlock) int { + jIsReturn := j.ReturnBlock() + iIsReturn := i.ReturnBlock() + if iIsReturn && jIsReturn { + return 0 + } + if jIsReturn { + return 1 + } + if iIsReturn { + return -1 + } + iRoot, jRoot := i.rootInstr, j.rootInstr + if iRoot == nil && jRoot == nil { // For testing. + return 0 + } + if jRoot == nil { + return 1 + } + if iRoot == nil { + return -1 + } + return i.rootInstr.id - j.rootInstr.id + }) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block_sort_old.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block_sort_old.go new file mode 100644 index 000000000..9dc881dae --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/basic_block_sort_old.go @@ -0,0 +1,24 @@ +//go:build !go1.21 + +// TODO: delete after the floor Go version is 1.21 + +package ssa + +import "sort" + +func sortBlocks(blocks []*basicBlock) { + sort.SliceStable(blocks, func(i, j int) bool { + iBlk, jBlk := blocks[i], blocks[j] + if jBlk.ReturnBlock() { + return true + } + if iBlk.ReturnBlock() { + return false + } + iRoot, jRoot := iBlk.rootInstr, jBlk.rootInstr + if iRoot == nil || jRoot == nil { // For testing. 
+ return true + } + return iBlk.rootInstr.id < jBlk.rootInstr.id + }) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/builder.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/builder.go new file mode 100644 index 000000000..1fc84d2ea --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/builder.go @@ -0,0 +1,731 @@ +package ssa + +import ( + "fmt" + "sort" + "strings" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// Builder is used to builds SSA consisting of Basic Blocks per function. +type Builder interface { + // Init must be called to reuse this builder for the next function. + Init(typ *Signature) + + // Signature returns the Signature of the currently-compiled function. + Signature() *Signature + + // BlockIDMax returns the maximum value of BasicBlocksID existing in the currently-compiled function. + BlockIDMax() BasicBlockID + + // AllocateBasicBlock creates a basic block in SSA function. + AllocateBasicBlock() BasicBlock + + // CurrentBlock returns the currently handled BasicBlock which is set by the latest call to SetCurrentBlock. + CurrentBlock() BasicBlock + + // EntryBlock returns the entry BasicBlock of the currently-compiled function. + EntryBlock() BasicBlock + + // SetCurrentBlock sets the instruction insertion target to the BasicBlock `b`. + SetCurrentBlock(b BasicBlock) + + // DeclareVariable declares a Variable of the given Type. + DeclareVariable(Type) Variable + + // DefineVariable defines a variable in the `block` with value. + // The defining instruction will be inserted into the `block`. + DefineVariable(variable Variable, value Value, block BasicBlock) + + // DefineVariableInCurrentBB is the same as DefineVariable except the definition is + // inserted into the current BasicBlock. Alias to DefineVariable(x, y, CurrentBlock()). + DefineVariableInCurrentBB(variable Variable, value Value) + + // AllocateInstruction returns a new Instruction. + AllocateInstruction() *Instruction + + // InsertInstruction executes BasicBlock.InsertInstruction for the currently handled basic block. + InsertInstruction(raw *Instruction) + + // allocateValue allocates an unused Value. + allocateValue(typ Type) Value + + // MustFindValue searches the latest definition of the given Variable and returns the result. + MustFindValue(variable Variable) Value + + // MustFindValueInBlk is the same as MustFindValue except it searches the latest definition from the given BasicBlock. + MustFindValueInBlk(variable Variable, blk BasicBlock) Value + + // FindValueInLinearPath tries to find the latest definition of the given Variable in the linear path to the current BasicBlock. + // If it cannot find the definition, or it's not sealed yet, it returns ValueInvalid. + FindValueInLinearPath(variable Variable) Value + + // Seal declares that we've known all the predecessors to this block and were added via AddPred. + // After calling this, AddPred will be forbidden. + Seal(blk BasicBlock) + + // AnnotateValue is for debugging purpose. + AnnotateValue(value Value, annotation string) + + // DeclareSignature appends the *Signature to be referenced by various instructions (e.g. OpcodeCall). + DeclareSignature(signature *Signature) + + // Signatures returns the slice of declared Signatures. + Signatures() []*Signature + + // ResolveSignature returns the Signature which corresponds to SignatureID. 
+ ResolveSignature(id SignatureID) *Signature + + // RunPasses runs various passes on the constructed SSA function. + RunPasses() + + // Format returns the debugging string of the SSA function. + Format() string + + // BlockIteratorBegin initializes the state to iterate over all the valid BasicBlock(s) compiled. + // Combined with BlockIteratorNext, we can use this like: + // + // for blk := builder.BlockIteratorBegin(); blk != nil; blk = builder.BlockIteratorNext() { + // // ... + // } + // + // The returned blocks are ordered in the order of AllocateBasicBlock being called. + BlockIteratorBegin() BasicBlock + + // BlockIteratorNext advances the state for iteration initialized by BlockIteratorBegin. + // Returns nil if there's no unseen BasicBlock. + BlockIteratorNext() BasicBlock + + // ValueRefCounts returns the map of ValueID to its reference count. + // The returned slice must not be modified. + ValueRefCounts() []int + + // BlockIteratorReversePostOrderBegin is almost the same as BlockIteratorBegin except it returns the BasicBlock in the reverse post-order. + // This is available after RunPasses is run. + BlockIteratorReversePostOrderBegin() BasicBlock + + // BlockIteratorReversePostOrderNext is almost the same as BlockIteratorPostOrderNext except it returns the BasicBlock in the reverse post-order. + // This is available after RunPasses is run. + BlockIteratorReversePostOrderNext() BasicBlock + + // ReturnBlock returns the BasicBlock which is used to return from the function. + ReturnBlock() BasicBlock + + // InsertUndefined inserts an undefined instruction at the current position. + InsertUndefined() + + // SetCurrentSourceOffset sets the current source offset. The incoming instruction will be annotated with this offset. + SetCurrentSourceOffset(line SourceOffset) + + // LoopNestingForestRoots returns the roots of the loop nesting forest. + LoopNestingForestRoots() []BasicBlock + + // LowestCommonAncestor returns the lowest common ancestor in the dominator tree of the given BasicBlock(s). + LowestCommonAncestor(blk1, blk2 BasicBlock) BasicBlock + + // Idom returns the immediate dominator of the given BasicBlock. + Idom(blk BasicBlock) BasicBlock + + VarLengthPool() *wazevoapi.VarLengthPool[Value] +} + +// NewBuilder returns a new Builder implementation. +func NewBuilder() Builder { + return &builder{ + instructionsPool: wazevoapi.NewPool[Instruction](resetInstruction), + basicBlocksPool: wazevoapi.NewPool[basicBlock](resetBasicBlock), + varLengthPool: wazevoapi.NewVarLengthPool[Value](), + valueAnnotations: make(map[ValueID]string), + signatures: make(map[SignatureID]*Signature), + blkVisited: make(map[*basicBlock]int), + valueIDAliases: make(map[ValueID]Value), + redundantParameterIndexToValue: make(map[int]Value), + returnBlk: &basicBlock{id: basicBlockIDReturnBlock}, + } +} + +// builder implements Builder interface. +type builder struct { + basicBlocksPool wazevoapi.Pool[basicBlock] + instructionsPool wazevoapi.Pool[Instruction] + varLengthPool wazevoapi.VarLengthPool[Value] + signatures map[SignatureID]*Signature + currentSignature *Signature + + // reversePostOrderedBasicBlocks are the BasicBlock(s) ordered in the reverse post-order after passCalculateImmediateDominators. + reversePostOrderedBasicBlocks []*basicBlock + currentBB *basicBlock + returnBlk *basicBlock + + // variables track the types for Variable with the index regarded Variable. + variables []Type + // nextValueID is used by builder.AllocateValue. 
+ nextValueID ValueID + // nextVariable is used by builder.AllocateVariable. + nextVariable Variable + + valueIDAliases map[ValueID]Value + valueAnnotations map[ValueID]string + + // valueRefCounts is used to lower the SSA in backend, and will be calculated + // by the last SSA-level optimization pass. + valueRefCounts []int + + // dominators stores the immediate dominator of each BasicBlock. + // The index is blockID of the BasicBlock. + dominators []*basicBlock + sparseTree dominatorSparseTree + + // loopNestingForestRoots are the roots of the loop nesting forest. + loopNestingForestRoots []BasicBlock + + // The followings are used for optimization passes/deterministic compilation. + instStack []*Instruction + blkVisited map[*basicBlock]int + valueIDToInstruction []*Instruction + blkStack []*basicBlock + blkStack2 []*basicBlock + ints []int + redundantParameterIndexToValue map[int]Value + + // blockIterCur is used to implement blockIteratorBegin and blockIteratorNext. + blockIterCur int + + // donePreBlockLayoutPasses is true if all the passes before LayoutBlocks are called. + donePreBlockLayoutPasses bool + // doneBlockLayout is true if LayoutBlocks is called. + doneBlockLayout bool + // donePostBlockLayoutPasses is true if all the passes after LayoutBlocks are called. + donePostBlockLayoutPasses bool + + currentSourceOffset SourceOffset +} + +func (b *builder) VarLengthPool() *wazevoapi.VarLengthPool[Value] { + return &b.varLengthPool +} + +// ReturnBlock implements Builder.ReturnBlock. +func (b *builder) ReturnBlock() BasicBlock { + return b.returnBlk +} + +// Init implements Builder.Reset. +func (b *builder) Init(s *Signature) { + b.nextVariable = 0 + b.currentSignature = s + resetBasicBlock(b.returnBlk) + b.instructionsPool.Reset() + b.basicBlocksPool.Reset() + b.varLengthPool.Reset() + b.donePreBlockLayoutPasses = false + b.doneBlockLayout = false + b.donePostBlockLayoutPasses = false + for _, sig := range b.signatures { + sig.used = false + } + + b.ints = b.ints[:0] + b.blkStack = b.blkStack[:0] + b.blkStack2 = b.blkStack2[:0] + b.dominators = b.dominators[:0] + b.loopNestingForestRoots = b.loopNestingForestRoots[:0] + + for i := 0; i < b.basicBlocksPool.Allocated(); i++ { + blk := b.basicBlocksPool.View(i) + delete(b.blkVisited, blk) + } + b.basicBlocksPool.Reset() + + for v := ValueID(0); v < b.nextValueID; v++ { + delete(b.valueAnnotations, v) + delete(b.valueIDAliases, v) + b.valueRefCounts[v] = 0 + b.valueIDToInstruction[v] = nil + } + b.nextValueID = 0 + b.reversePostOrderedBasicBlocks = b.reversePostOrderedBasicBlocks[:0] + b.doneBlockLayout = false + for i := range b.valueRefCounts { + b.valueRefCounts[i] = 0 + } + + b.currentSourceOffset = sourceOffsetUnknown +} + +// Signature implements Builder.Signature. +func (b *builder) Signature() *Signature { + return b.currentSignature +} + +// AnnotateValue implements Builder.AnnotateValue. +func (b *builder) AnnotateValue(value Value, a string) { + b.valueAnnotations[value.ID()] = a +} + +// AllocateInstruction implements Builder.AllocateInstruction. +func (b *builder) AllocateInstruction() *Instruction { + instr := b.instructionsPool.Allocate() + instr.id = b.instructionsPool.Allocated() + return instr +} + +// DeclareSignature implements Builder.AnnotateValue. +func (b *builder) DeclareSignature(s *Signature) { + b.signatures[s.ID] = s + s.used = false +} + +// Signatures implements Builder.Signatures. 
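Aside: builder.Init above resets the builder for the next function without discarding its backing storage: slices are truncated with s[:0], maps are cleared entry by entry, and pools are reset. A small standalone sketch of that allocation-free reuse pattern, with invented names:

package main

import "fmt"

// scratch is a toy reusable workspace.
type scratch struct {
	ints []int
	seen map[string]bool
}

// reset truncates the slice (keeping its capacity) and clears the map in place.
func (s *scratch) reset() {
	s.ints = s.ints[:0]
	for k := range s.seen {
		delete(s.seen, k)
	}
}

func main() {
	s := &scratch{seen: map[string]bool{}}
	s.ints = append(s.ints, 1, 2, 3)
	s.seen["a"] = true
	s.reset()
	fmt.Println(len(s.ints), cap(s.ints) >= 3, len(s.seen)) // 0 true 0
}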
+func (b *builder) Signatures() (ret []*Signature) { + for _, sig := range b.signatures { + ret = append(ret, sig) + } + sort.Slice(ret, func(i, j int) bool { + return ret[i].ID < ret[j].ID + }) + return +} + +// SetCurrentSourceOffset implements Builder.SetCurrentSourceOffset. +func (b *builder) SetCurrentSourceOffset(l SourceOffset) { + b.currentSourceOffset = l +} + +func (b *builder) usedSignatures() (ret []*Signature) { + for _, sig := range b.signatures { + if sig.used { + ret = append(ret, sig) + } + } + sort.Slice(ret, func(i, j int) bool { + return ret[i].ID < ret[j].ID + }) + return +} + +// ResolveSignature implements Builder.ResolveSignature. +func (b *builder) ResolveSignature(id SignatureID) *Signature { + return b.signatures[id] +} + +// AllocateBasicBlock implements Builder.AllocateBasicBlock. +func (b *builder) AllocateBasicBlock() BasicBlock { + return b.allocateBasicBlock() +} + +// allocateBasicBlock allocates a new basicBlock. +func (b *builder) allocateBasicBlock() *basicBlock { + id := BasicBlockID(b.basicBlocksPool.Allocated()) + blk := b.basicBlocksPool.Allocate() + blk.id = id + return blk +} + +// Idom implements Builder.Idom. +func (b *builder) Idom(blk BasicBlock) BasicBlock { + return b.dominators[blk.ID()] +} + +// InsertInstruction implements Builder.InsertInstruction. +func (b *builder) InsertInstruction(instr *Instruction) { + b.currentBB.InsertInstruction(instr) + + if l := b.currentSourceOffset; l.Valid() { + // Emit the source offset info only when the instruction has side effect because + // these are the only instructions that are accessed by stack unwinding. + // This reduces the significant amount of the offset info in the binary. + if instr.sideEffect() != sideEffectNone { + instr.annotateSourceOffset(l) + } + } + + resultTypesFn := instructionReturnTypes[instr.opcode] + if resultTypesFn == nil { + panic("TODO: " + instr.Format(b)) + } + + t1, ts := resultTypesFn(b, instr) + if t1.invalid() { + return + } + + r1 := b.allocateValue(t1) + instr.rValue = r1 + + tsl := len(ts) + if tsl == 0 { + return + } + + rValues := b.varLengthPool.Allocate(tsl) + for i := 0; i < tsl; i++ { + rValues = rValues.Append(&b.varLengthPool, b.allocateValue(ts[i])) + } + instr.rValues = rValues +} + +// DefineVariable implements Builder.DefineVariable. +func (b *builder) DefineVariable(variable Variable, value Value, block BasicBlock) { + if b.variables[variable].invalid() { + panic("BUG: trying to define variable " + variable.String() + " but is not declared yet") + } + + if b.variables[variable] != value.Type() { + panic(fmt.Sprintf("BUG: inconsistent type for variable %d: expected %s but got %s", variable, b.variables[variable], value.Type())) + } + bb := block.(*basicBlock) + bb.lastDefinitions[variable] = value +} + +// DefineVariableInCurrentBB implements Builder.DefineVariableInCurrentBB. +func (b *builder) DefineVariableInCurrentBB(variable Variable, value Value) { + b.DefineVariable(variable, value, b.currentBB) +} + +// SetCurrentBlock implements Builder.SetCurrentBlock. +func (b *builder) SetCurrentBlock(bb BasicBlock) { + b.currentBB = bb.(*basicBlock) +} + +// CurrentBlock implements Builder.CurrentBlock. +func (b *builder) CurrentBlock() BasicBlock { + return b.currentBB +} + +// EntryBlock implements Builder.EntryBlock. +func (b *builder) EntryBlock() BasicBlock { + return b.entryBlk() +} + +// DeclareVariable implements Builder.DeclareVariable. 
+func (b *builder) DeclareVariable(typ Type) Variable { + v := b.allocateVariable() + iv := int(v) + if l := len(b.variables); l <= iv { + b.variables = append(b.variables, make([]Type, 2*(l+1))...) + } + b.variables[v] = typ + return v +} + +// allocateVariable allocates a new variable. +func (b *builder) allocateVariable() (ret Variable) { + ret = b.nextVariable + b.nextVariable++ + return +} + +// allocateValue implements Builder.AllocateValue. +func (b *builder) allocateValue(typ Type) (v Value) { + v = Value(b.nextValueID) + v = v.setType(typ) + b.nextValueID++ + return +} + +// FindValueInLinearPath implements Builder.FindValueInLinearPath. +func (b *builder) FindValueInLinearPath(variable Variable) Value { + return b.findValueInLinearPath(variable, b.currentBB) +} + +func (b *builder) findValueInLinearPath(variable Variable, blk *basicBlock) Value { + if val, ok := blk.lastDefinitions[variable]; ok { + return val + } else if !blk.sealed { + return ValueInvalid + } + + if pred := blk.singlePred; pred != nil { + // If this block is sealed and have only one predecessor, + // we can use the value in that block without ambiguity on definition. + return b.findValueInLinearPath(variable, pred) + } + if len(blk.preds) == 1 { + panic("BUG") + } + return ValueInvalid +} + +func (b *builder) MustFindValueInBlk(variable Variable, blk BasicBlock) Value { + typ := b.definedVariableType(variable) + return b.findValue(typ, variable, blk.(*basicBlock)) +} + +// MustFindValue implements Builder.MustFindValue. +func (b *builder) MustFindValue(variable Variable) Value { + typ := b.definedVariableType(variable) + return b.findValue(typ, variable, b.currentBB) +} + +// findValue recursively tries to find the latest definition of a `variable`. The algorithm is described in +// the section 2 of the paper https://link.springer.com/content/pdf/10.1007/978-3-642-37051-9_6.pdf. +// +// TODO: reimplement this in iterative, not recursive, to avoid stack overflow. +func (b *builder) findValue(typ Type, variable Variable, blk *basicBlock) Value { + if val, ok := blk.lastDefinitions[variable]; ok { + // The value is already defined in this block! + return val + } else if !blk.sealed { // Incomplete CFG as in the paper. + // If this is not sealed, that means it might have additional unknown predecessor later on. + // So we temporarily define the placeholder value here (not add as a parameter yet!), + // and record it as unknown. + // The unknown values are resolved when we call seal this block via BasicBlock.Seal(). + value := b.allocateValue(typ) + if wazevoapi.SSALoggingEnabled { + fmt.Printf("adding unknown value placeholder for %s at %d\n", variable, blk.id) + } + blk.lastDefinitions[variable] = value + blk.unknownValues = append(blk.unknownValues, unknownValue{ + variable: variable, + value: value, + }) + return value + } + + if pred := blk.singlePred; pred != nil { + // If this block is sealed and have only one predecessor, + // we can use the value in that block without ambiguity on definition. + return b.findValue(typ, variable, pred) + } else if len(blk.preds) == 0 { + panic("BUG: value is not defined for " + variable.String()) + } + + // If this block has multiple predecessors, we have to gather the definitions, + // and treat them as an argument to this block. + // + // The first thing is to define a new parameter to this block which may or may not be redundant, but + // later we eliminate trivial params in an optimization pass. 
This must be done before finding the + // definitions in the predecessors so that we can break the cycle. + paramValue := blk.AddParam(b, typ) + b.DefineVariable(variable, paramValue, blk) + + // After the new param is added, we have to manipulate the original branching instructions + // in predecessors so that they would pass the definition of `variable` as the argument to + // the newly added PHI. + for i := range blk.preds { + pred := &blk.preds[i] + value := b.findValue(typ, variable, pred.blk) + pred.branch.addArgumentBranchInst(b, value) + } + return paramValue +} + +// Seal implements Builder.Seal. +func (b *builder) Seal(raw BasicBlock) { + blk := raw.(*basicBlock) + if len(blk.preds) == 1 { + blk.singlePred = blk.preds[0].blk + } + blk.sealed = true + + for _, v := range blk.unknownValues { + variable, phiValue := v.variable, v.value + typ := b.definedVariableType(variable) + blk.addParamOn(typ, phiValue) + for i := range blk.preds { + pred := &blk.preds[i] + predValue := b.findValue(typ, variable, pred.blk) + if !predValue.Valid() { + panic("BUG: value is not defined anywhere in the predecessors in the CFG") + } + pred.branch.addArgumentBranchInst(b, predValue) + } + } +} + +// definedVariableType returns the type of the given variable. If the variable is not defined yet, it panics. +func (b *builder) definedVariableType(variable Variable) Type { + typ := b.variables[variable] + if typ.invalid() { + panic(fmt.Sprintf("%s is not defined yet", variable)) + } + return typ +} + +// Format implements Builder.Format. +func (b *builder) Format() string { + str := strings.Builder{} + usedSigs := b.usedSignatures() + if len(usedSigs) > 0 { + str.WriteByte('\n') + str.WriteString("signatures:\n") + for _, sig := range usedSigs { + str.WriteByte('\t') + str.WriteString(sig.String()) + str.WriteByte('\n') + } + } + + var iterBegin, iterNext func() *basicBlock + if b.doneBlockLayout { + iterBegin, iterNext = b.blockIteratorReversePostOrderBegin, b.blockIteratorReversePostOrderNext + } else { + iterBegin, iterNext = b.blockIteratorBegin, b.blockIteratorNext + } + for bb := iterBegin(); bb != nil; bb = iterNext() { + str.WriteByte('\n') + str.WriteString(bb.FormatHeader(b)) + str.WriteByte('\n') + + for cur := bb.Root(); cur != nil; cur = cur.Next() { + str.WriteByte('\t') + str.WriteString(cur.Format(b)) + str.WriteByte('\n') + } + } + return str.String() +} + +// BlockIteratorNext implements Builder.BlockIteratorNext. +func (b *builder) BlockIteratorNext() BasicBlock { + if blk := b.blockIteratorNext(); blk == nil { + return nil // BasicBlock((*basicBlock)(nil)) != BasicBlock(nil) + } else { + return blk + } +} + +// BlockIteratorNext implements Builder.BlockIteratorNext. +func (b *builder) blockIteratorNext() *basicBlock { + index := b.blockIterCur + for { + if index == b.basicBlocksPool.Allocated() { + return nil + } + ret := b.basicBlocksPool.View(index) + index++ + if !ret.invalid { + b.blockIterCur = index + return ret + } + } +} + +// BlockIteratorBegin implements Builder.BlockIteratorBegin. +func (b *builder) BlockIteratorBegin() BasicBlock { + return b.blockIteratorBegin() +} + +// BlockIteratorBegin implements Builder.BlockIteratorBegin. +func (b *builder) blockIteratorBegin() *basicBlock { + b.blockIterCur = 0 + return b.blockIteratorNext() +} + +// BlockIteratorReversePostOrderBegin implements Builder.BlockIteratorReversePostOrderBegin. 
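Aside: the fast path of findValueInLinearPath/findValue above walks sealed blocks that have exactly one predecessor until a block-local definition is found; only the multi-predecessor case needs the block-parameter insertion described above. A miniature of just that linear walk, with toy types and not the real algorithm:

package main

import "fmt"

// miniBlock models only what the linear-path lookup needs: block-local
// definitions and the single sealed predecessor, if any.
type miniBlock struct {
	defs       map[string]int
	singlePred *miniBlock
}

// findLinear walks single-predecessor chains until a definition is found.
func findLinear(b *miniBlock, variable string) (int, bool) {
	for b != nil {
		if v, ok := b.defs[variable]; ok {
			return v, true
		}
		b = b.singlePred
	}
	return 0, false
}

func main() {
	entry := &miniBlock{defs: map[string]int{"x": 7}}
	body := &miniBlock{defs: map[string]int{}, singlePred: entry}
	fmt.Println(findLinear(body, "x")) // 7 true
}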
+func (b *builder) BlockIteratorReversePostOrderBegin() BasicBlock { + return b.blockIteratorReversePostOrderBegin() +} + +// BlockIteratorBegin implements Builder.BlockIteratorBegin. +func (b *builder) blockIteratorReversePostOrderBegin() *basicBlock { + b.blockIterCur = 0 + return b.blockIteratorReversePostOrderNext() +} + +// BlockIteratorReversePostOrderNext implements Builder.BlockIteratorReversePostOrderNext. +func (b *builder) BlockIteratorReversePostOrderNext() BasicBlock { + if blk := b.blockIteratorReversePostOrderNext(); blk == nil { + return nil // BasicBlock((*basicBlock)(nil)) != BasicBlock(nil) + } else { + return blk + } +} + +// BlockIteratorNext implements Builder.BlockIteratorNext. +func (b *builder) blockIteratorReversePostOrderNext() *basicBlock { + if b.blockIterCur >= len(b.reversePostOrderedBasicBlocks) { + return nil + } else { + ret := b.reversePostOrderedBasicBlocks[b.blockIterCur] + b.blockIterCur++ + return ret + } +} + +// ValueRefCounts implements Builder.ValueRefCounts. +func (b *builder) ValueRefCounts() []int { + return b.valueRefCounts +} + +// alias records the alias of the given values. The alias(es) will be +// eliminated in the optimization pass via resolveArgumentAlias. +func (b *builder) alias(dst, src Value) { + b.valueIDAliases[dst.ID()] = src +} + +// resolveArgumentAlias resolves the alias of the arguments of the given instruction. +func (b *builder) resolveArgumentAlias(instr *Instruction) { + if instr.v.Valid() { + instr.v = b.resolveAlias(instr.v) + } + + if instr.v2.Valid() { + instr.v2 = b.resolveAlias(instr.v2) + } + + if instr.v3.Valid() { + instr.v3 = b.resolveAlias(instr.v3) + } + + view := instr.vs.View() + for i, v := range view { + view[i] = b.resolveAlias(v) + } +} + +// resolveAlias resolves the alias of the given value. +func (b *builder) resolveAlias(v Value) Value { + // Some aliases are chained, so we need to resolve them recursively. + for { + if src, ok := b.valueIDAliases[v.ID()]; ok { + v = src + } else { + break + } + } + return v +} + +// entryBlk returns the entry block of the function. +func (b *builder) entryBlk() *basicBlock { + return b.basicBlocksPool.View(0) +} + +// isDominatedBy returns true if the given block `n` is dominated by the given block `d`. +// Before calling this, the builder must pass by passCalculateImmediateDominators. +func (b *builder) isDominatedBy(n *basicBlock, d *basicBlock) bool { + if len(b.dominators) == 0 { + panic("BUG: passCalculateImmediateDominators must be called before calling isDominatedBy") + } + ent := b.entryBlk() + doms := b.dominators + for n != d && n != ent { + n = doms[n.id] + } + return n == d +} + +// BlockIDMax implements Builder.BlockIDMax. +func (b *builder) BlockIDMax() BasicBlockID { + return BasicBlockID(b.basicBlocksPool.Allocated()) +} + +// InsertUndefined implements Builder.InsertUndefined. +func (b *builder) InsertUndefined() { + instr := b.AllocateInstruction() + instr.opcode = OpcodeUndefined + b.InsertInstruction(instr) +} + +// LoopNestingForestRoots implements Builder.LoopNestingForestRoots. +func (b *builder) LoopNestingForestRoots() []BasicBlock { + return b.loopNestingForestRoots +} + +// LowestCommonAncestor implements Builder.LowestCommonAncestor. 
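Aside: resolveAlias above follows chained aliases (for example v3 aliased to v2 aliased to v1) until it reaches a value with no recorded alias. The same loop, standalone and with plain ints in place of ssa.Value:

package main

import "fmt"

// resolve follows the alias chain until a non-aliased value is reached.
func resolve(aliases map[int]int, v int) int {
	for {
		src, ok := aliases[v]
		if !ok {
			return v
		}
		v = src
	}
}

func main() {
	aliases := map[int]int{3: 2, 2: 1} // v3 -> v2 -> v1
	fmt.Println(resolve(aliases, 3))   // 1
	fmt.Println(resolve(aliases, 5))   // 5 (no alias recorded)
}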
+func (b *builder) LowestCommonAncestor(blk1, blk2 BasicBlock) BasicBlock { + return b.sparseTree.findLCA(blk1.ID(), blk2.ID()) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/cmp.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/cmp.go new file mode 100644 index 000000000..15b62ca8e --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/cmp.go @@ -0,0 +1,107 @@ +package ssa + +// IntegerCmpCond represents a condition for integer comparison. +type IntegerCmpCond byte + +const ( + // IntegerCmpCondInvalid represents an invalid condition. + IntegerCmpCondInvalid IntegerCmpCond = iota + // IntegerCmpCondEqual represents "==". + IntegerCmpCondEqual + // IntegerCmpCondNotEqual represents "!=". + IntegerCmpCondNotEqual + // IntegerCmpCondSignedLessThan represents Signed "<". + IntegerCmpCondSignedLessThan + // IntegerCmpCondSignedGreaterThanOrEqual represents Signed ">=". + IntegerCmpCondSignedGreaterThanOrEqual + // IntegerCmpCondSignedGreaterThan represents Signed ">". + IntegerCmpCondSignedGreaterThan + // IntegerCmpCondSignedLessThanOrEqual represents Signed "<=". + IntegerCmpCondSignedLessThanOrEqual + // IntegerCmpCondUnsignedLessThan represents Unsigned "<". + IntegerCmpCondUnsignedLessThan + // IntegerCmpCondUnsignedGreaterThanOrEqual represents Unsigned ">=". + IntegerCmpCondUnsignedGreaterThanOrEqual + // IntegerCmpCondUnsignedGreaterThan represents Unsigned ">". + IntegerCmpCondUnsignedGreaterThan + // IntegerCmpCondUnsignedLessThanOrEqual represents Unsigned "<=". + IntegerCmpCondUnsignedLessThanOrEqual +) + +// String implements fmt.Stringer. +func (i IntegerCmpCond) String() string { + switch i { + case IntegerCmpCondEqual: + return "eq" + case IntegerCmpCondNotEqual: + return "neq" + case IntegerCmpCondSignedLessThan: + return "lt_s" + case IntegerCmpCondSignedGreaterThanOrEqual: + return "ge_s" + case IntegerCmpCondSignedGreaterThan: + return "gt_s" + case IntegerCmpCondSignedLessThanOrEqual: + return "le_s" + case IntegerCmpCondUnsignedLessThan: + return "lt_u" + case IntegerCmpCondUnsignedGreaterThanOrEqual: + return "ge_u" + case IntegerCmpCondUnsignedGreaterThan: + return "gt_u" + case IntegerCmpCondUnsignedLessThanOrEqual: + return "le_u" + default: + panic("invalid integer comparison condition") + } +} + +// Signed returns true if the condition is signed integer comparison. +func (i IntegerCmpCond) Signed() bool { + switch i { + case IntegerCmpCondSignedLessThan, IntegerCmpCondSignedGreaterThanOrEqual, + IntegerCmpCondSignedGreaterThan, IntegerCmpCondSignedLessThanOrEqual: + return true + default: + return false + } +} + +type FloatCmpCond byte + +const ( + // FloatCmpCondInvalid represents an invalid condition. + FloatCmpCondInvalid FloatCmpCond = iota + // FloatCmpCondEqual represents "==". + FloatCmpCondEqual + // FloatCmpCondNotEqual represents "!=". + FloatCmpCondNotEqual + // FloatCmpCondLessThan represents "<". + FloatCmpCondLessThan + // FloatCmpCondLessThanOrEqual represents "<=". + FloatCmpCondLessThanOrEqual + // FloatCmpCondGreaterThan represents ">". + FloatCmpCondGreaterThan + // FloatCmpCondGreaterThanOrEqual represents ">=". + FloatCmpCondGreaterThanOrEqual +) + +// String implements fmt.Stringer. 
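// The condition set above keeps separate signed (lt_s, le_s, ...) and unsigned (lt_u, le_u, ...)
// orderings because the same bit pattern orders differently under the two interpretations.
// Standalone illustration with an all-ones 32-bit value:
package main

import "fmt"

func main() {
	bits := uint32(0xFFFF_FFFF)
	fmt.Println(int32(bits) < 1) // true:  lt_s reads the pattern as -1
	fmt.Println(bits < 1)        // false: lt_u reads it as 4294967295
}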
+func (f FloatCmpCond) String() string { + switch f { + case FloatCmpCondEqual: + return "eq" + case FloatCmpCondNotEqual: + return "neq" + case FloatCmpCondLessThan: + return "lt" + case FloatCmpCondLessThanOrEqual: + return "le" + case FloatCmpCondGreaterThan: + return "gt" + case FloatCmpCondGreaterThanOrEqual: + return "ge" + default: + panic("invalid float comparison condition") + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/funcref.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/funcref.go new file mode 100644 index 000000000..d9620762a --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/funcref.go @@ -0,0 +1,12 @@ +package ssa + +import "fmt" + +// FuncRef is a unique identifier for a function of the frontend, +// and is used to reference the function in function call. +type FuncRef uint32 + +// String implements fmt.Stringer. +func (r FuncRef) String() string { + return fmt.Sprintf("f%d", r) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/instructions.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/instructions.go new file mode 100644 index 000000000..3e3482efc --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/instructions.go @@ -0,0 +1,2967 @@ +package ssa + +import ( + "fmt" + "math" + "strings" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// Opcode represents a SSA instruction. +type Opcode uint32 + +// Instruction represents an instruction whose opcode is specified by +// Opcode. Since Go doesn't have union type, we use this flattened type +// for all instructions, and therefore each field has different meaning +// depending on Opcode. +type Instruction struct { + // id is the unique ID of this instruction which ascends from 0 following the order of program. + id int + opcode Opcode + u1, u2 uint64 + v Value + v2 Value + v3 Value + vs Values + typ Type + blk BasicBlock + targets []BasicBlock + prev, next *Instruction + + rValue Value + rValues Values + gid InstructionGroupID + sourceOffset SourceOffset + live bool + alreadyLowered bool +} + +// SourceOffset represents the offset of the source of an instruction. +type SourceOffset int64 + +const sourceOffsetUnknown = -1 + +// Valid returns true if this source offset is valid. +func (l SourceOffset) Valid() bool { + return l != sourceOffsetUnknown +} + +func (i *Instruction) annotateSourceOffset(line SourceOffset) { + i.sourceOffset = line +} + +// SourceOffset returns the source offset of this instruction. +func (i *Instruction) SourceOffset() SourceOffset { + return i.sourceOffset +} + +// Opcode returns the opcode of this instruction. +func (i *Instruction) Opcode() Opcode { + return i.opcode +} + +// GroupID returns the InstructionGroupID of this instruction. +func (i *Instruction) GroupID() InstructionGroupID { + return i.gid +} + +// MarkLowered marks this instruction as already lowered. +func (i *Instruction) MarkLowered() { + i.alreadyLowered = true +} + +// Lowered returns true if this instruction is already lowered. +func (i *Instruction) Lowered() bool { + return i.alreadyLowered +} + +// resetInstruction resets this instruction to the initial state. 
+func resetInstruction(i *Instruction) { + *i = Instruction{} + i.v = ValueInvalid + i.v2 = ValueInvalid + i.v3 = ValueInvalid + i.rValue = ValueInvalid + i.typ = typeInvalid + i.vs = ValuesNil + i.sourceOffset = sourceOffsetUnknown +} + +// InstructionGroupID is assigned to each instruction and represents a group of instructions +// where each instruction is interchangeable with others except for the last instruction +// in the group which has side effects. In short, InstructionGroupID is determined by the side effects of instructions. +// That means, if there's an instruction with side effect between two instructions, then these two instructions +// will have different instructionGroupID. Note that each block always ends with branching, which is with side effects, +// therefore, instructions in different blocks always have different InstructionGroupID(s). +// +// The notable application of this is used in lowering SSA-level instruction to a ISA specific instruction, +// where we eagerly try to merge multiple instructions into single operation etc. Such merging cannot be done +// if these instruction have different InstructionGroupID since it will change the semantics of a program. +// +// See passDeadCodeElimination. +type InstructionGroupID uint32 + +// Returns Value(s) produced by this instruction if any. +// The `first` is the first return value, and `rest` is the rest of the values. +func (i *Instruction) Returns() (first Value, rest []Value) { + return i.rValue, i.rValues.View() +} + +// Return returns a Value(s) produced by this instruction if any. +// If there's multiple return values, only the first one is returned. +func (i *Instruction) Return() (first Value) { + return i.rValue +} + +// Args returns the arguments to this instruction. +func (i *Instruction) Args() (v1, v2, v3 Value, vs []Value) { + return i.v, i.v2, i.v3, i.vs.View() +} + +// Arg returns the first argument to this instruction. +func (i *Instruction) Arg() Value { + return i.v +} + +// Arg2 returns the first two arguments to this instruction. +func (i *Instruction) Arg2() (Value, Value) { + return i.v, i.v2 +} + +// ArgWithLane returns the first argument to this instruction, and the lane type. +func (i *Instruction) ArgWithLane() (Value, VecLane) { + return i.v, VecLane(i.u1) +} + +// Arg2WithLane returns the first two arguments to this instruction, and the lane type. +func (i *Instruction) Arg2WithLane() (Value, Value, VecLane) { + return i.v, i.v2, VecLane(i.u1) +} + +// ShuffleData returns the first two arguments to this instruction and 2 uint64s `lo`, `hi`. +// +// Note: Each uint64 encodes a sequence of 8 bytes where each byte encodes a VecLane, +// so that the 128bit integer `hi<<64|lo` packs a slice `[16]VecLane`, +// where `lane[0]` is the least significant byte, and `lane[n]` is shifted to offset `n*8`. +func (i *Instruction) ShuffleData() (v Value, v2 Value, lo uint64, hi uint64) { + return i.v, i.v2, i.u1, i.u2 +} + +// Arg3 returns the first three arguments to this instruction. +func (i *Instruction) Arg3() (Value, Value, Value) { + return i.v, i.v2, i.v3 +} + +// Next returns the next instruction laid out next to itself. +func (i *Instruction) Next() *Instruction { + return i.next +} + +// Prev returns the previous instruction laid out prior to itself. +func (i *Instruction) Prev() *Instruction { + return i.prev +} + +// IsBranching returns true if this instruction is a branching instruction. 
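// The ShuffleData comment above describes how the 16 shuffle lane selectors are packed into two
// uint64s: lane[0] sits in the least significant byte of `lo`, lane[8] in the least significant
// byte of `hi`. A standalone round-trip of that packing:
package main

import "fmt"

func packLanes(lanes [16]byte) (lo, hi uint64) {
	for n := 0; n < 8; n++ {
		lo |= uint64(lanes[n]) << (n * 8)
		hi |= uint64(lanes[n+8]) << (n * 8)
	}
	return
}

func unpackLanes(lo, hi uint64) (lanes [16]byte) {
	for n := 0; n < 8; n++ {
		lanes[n] = byte(lo >> (n * 8))
		lanes[n+8] = byte(hi >> (n * 8))
	}
	return
}

func main() {
	in := [16]byte{0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23}
	lo, hi := packLanes(in)
	fmt.Println(unpackLanes(lo, hi) == in) // true: the encoding is lossless
}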
+func (i *Instruction) IsBranching() bool { + switch i.opcode { + case OpcodeJump, OpcodeBrz, OpcodeBrnz, OpcodeBrTable: + return true + default: + return false + } +} + +// TODO: complete opcode comments. +const ( + OpcodeInvalid Opcode = iota + + // OpcodeUndefined is a placeholder for undefined opcode. This can be used for debugging to intentionally + // cause a crash at certain point. + OpcodeUndefined + + // OpcodeJump takes the list of args to the `block` and unconditionally jumps to it. + OpcodeJump + + // OpcodeBrz branches into `blk` with `args` if the value `c` equals zero: `Brz c, blk, args`. + OpcodeBrz + + // OpcodeBrnz branches into `blk` with `args` if the value `c` is not zero: `Brnz c, blk, args`. + OpcodeBrnz + + // OpcodeBrTable takes the index value `index`, and branches into `labelX`. If the `index` is out of range, + // it branches into the last labelN: `BrTable index, [label1, label2, ... labelN]`. + OpcodeBrTable + + // OpcodeExitWithCode exit the execution immediately. + OpcodeExitWithCode + + // OpcodeExitIfTrueWithCode exits the execution immediately if the value `c` is not zero. + OpcodeExitIfTrueWithCode + + // OpcodeReturn returns from the function: `return rvalues`. + OpcodeReturn + + // OpcodeCall calls a function specified by the symbol FN with arguments `args`: `returnvals = Call FN, args...` + // This is a "near" call, which means the call target is known at compile time, and the target is relatively close + // to this function. If the target cannot be reached by near call, the backend fails to compile. + OpcodeCall + + // OpcodeCallIndirect calls a function specified by `callee` which is a function address: `returnvals = call_indirect SIG, callee, args`. + // Note that this is different from call_indirect in Wasm, which also does type checking, etc. + OpcodeCallIndirect + + // OpcodeSplat performs a vector splat operation: `v = Splat.lane x`. + OpcodeSplat + + // OpcodeSwizzle performs a vector swizzle operation: `v = Swizzle.lane x, y`. + OpcodeSwizzle + + // OpcodeInsertlane inserts a lane value into a vector: `v = InsertLane x, y, Idx`. + OpcodeInsertlane + + // OpcodeExtractlane extracts a lane value from a vector: `v = ExtractLane x, Idx`. + OpcodeExtractlane + + // OpcodeLoad loads a Type value from the [base + offset] address: `v = Load base, offset`. + OpcodeLoad + + // OpcodeStore stores a Type value to the [base + offset] address: `Store v, base, offset`. + OpcodeStore + + // OpcodeUload8 loads the 8-bit value from the [base + offset] address, zero-extended to 64 bits: `v = Uload8 base, offset`. + OpcodeUload8 + + // OpcodeSload8 loads the 8-bit value from the [base + offset] address, sign-extended to 64 bits: `v = Sload8 base, offset`. + OpcodeSload8 + + // OpcodeIstore8 stores the 8-bit value to the [base + offset] address, sign-extended to 64 bits: `Istore8 v, base, offset`. + OpcodeIstore8 + + // OpcodeUload16 loads the 16-bit value from the [base + offset] address, zero-extended to 64 bits: `v = Uload16 base, offset`. + OpcodeUload16 + + // OpcodeSload16 loads the 16-bit value from the [base + offset] address, sign-extended to 64 bits: `v = Sload16 base, offset`. + OpcodeSload16 + + // OpcodeIstore16 stores the 16-bit value to the [base + offset] address, zero-extended to 64 bits: `Istore16 v, base, offset`. + OpcodeIstore16 + + // OpcodeUload32 loads the 32-bit value from the [base + offset] address, zero-extended to 64 bits: `v = Uload32 base, offset`. 
+ OpcodeUload32 + + // OpcodeSload32 loads the 32-bit value from the [base + offset] address, sign-extended to 64 bits: `v = Sload32 base, offset`. + OpcodeSload32 + + // OpcodeIstore32 stores the 32-bit value to the [base + offset] address, zero-extended to 64 bits: `Istore16 v, base, offset`. + OpcodeIstore32 + + // OpcodeLoadSplat represents a load that replicates the loaded value to all lanes `v = LoadSplat.lane p, Offset`. + OpcodeLoadSplat + + // OpcodeVZeroExtLoad loads a scalar single/double precision floating point value from the [p + Offset] address, + // and zero-extend it to the V128 value: `v = VExtLoad p, Offset`. + OpcodeVZeroExtLoad + + // OpcodeIconst represents the integer const. + OpcodeIconst + + // OpcodeF32const represents the single-precision const. + OpcodeF32const + + // OpcodeF64const represents the double-precision const. + OpcodeF64const + + // OpcodeVconst represents the 128bit vector const. + OpcodeVconst + + // OpcodeVbor computes binary or between two 128bit vectors: `v = bor x, y`. + OpcodeVbor + + // OpcodeVbxor computes binary xor between two 128bit vectors: `v = bxor x, y`. + OpcodeVbxor + + // OpcodeVband computes binary and between two 128bit vectors: `v = band x, y`. + OpcodeVband + + // OpcodeVbandnot computes binary and-not between two 128bit vectors: `v = bandnot x, y`. + OpcodeVbandnot + + // OpcodeVbnot negates a 128bit vector: `v = bnot x`. + OpcodeVbnot + + // OpcodeVbitselect uses the bits in the control mask c to select the corresponding bit from x when 1 + // and y when 0: `v = bitselect c, x, y`. + OpcodeVbitselect + + // OpcodeShuffle shuffles two vectors using the given 128-bit immediate: `v = shuffle imm, x, y`. + // For each byte in the immediate, a value i in [0, 15] selects the i-th byte in vector x; + // i in [16, 31] selects the (i-16)-th byte in vector y. + OpcodeShuffle + + // OpcodeSelect chooses between two values based on a condition `c`: `v = Select c, x, y`. + OpcodeSelect + + // OpcodeVanyTrue performs a any true operation: `s = VanyTrue a`. + OpcodeVanyTrue + + // OpcodeVallTrue performs a lane-wise all true operation: `s = VallTrue.lane a`. + OpcodeVallTrue + + // OpcodeVhighBits performs a lane-wise extract of the high bits: `v = VhighBits.lane a`. + OpcodeVhighBits + + // OpcodeIcmp compares two integer values with the given condition: `v = icmp Cond, x, y`. + OpcodeIcmp + + // OpcodeVIcmp compares two integer values with the given condition: `v = vicmp Cond, x, y` on vector. + OpcodeVIcmp + + // OpcodeIcmpImm compares an integer value with the immediate value on the given condition: `v = icmp_imm Cond, x, Y`. + OpcodeIcmpImm + + // OpcodeIadd performs an integer addition: `v = Iadd x, y`. + OpcodeIadd + + // OpcodeVIadd performs an integer addition: `v = VIadd.lane x, y` on vector. + OpcodeVIadd + + // OpcodeVSaddSat performs a signed saturating vector addition: `v = VSaddSat.lane x, y` on vector. + OpcodeVSaddSat + + // OpcodeVUaddSat performs an unsigned saturating vector addition: `v = VUaddSat.lane x, y` on vector. + OpcodeVUaddSat + + // OpcodeIsub performs an integer subtraction: `v = Isub x, y`. + OpcodeIsub + + // OpcodeVIsub performs an integer subtraction: `v = VIsub.lane x, y` on vector. + OpcodeVIsub + + // OpcodeVSsubSat performs a signed saturating vector subtraction: `v = VSsubSat.lane x, y` on vector. + OpcodeVSsubSat + + // OpcodeVUsubSat performs an unsigned saturating vector subtraction: `v = VUsubSat.lane x, y` on vector. 
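// OpcodeVbitselect above is a pure bitwise select: each result bit comes from x where the mask
// bit is 1 and from y where it is 0. The same identity on a 64-bit scalar:
package main

import "fmt"

func bitselect(c, x, y uint64) uint64 {
	return (x & c) | (y &^ c) // &^ is Go's AND-NOT, i.e. y & ^c
}

func main() {
	v := bitselect(0x00000000_FFFFFFFF, 0x1111111111111111, 0x2222222222222222)
	fmt.Printf("%#x\n", v) // 0x2222222211111111: low half from x, high half from y
}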
+ OpcodeVUsubSat + + // OpcodeVImin performs a signed integer min: `v = VImin.lane x, y` on vector. + OpcodeVImin + + // OpcodeVUmin performs an unsigned integer min: `v = VUmin.lane x, y` on vector. + OpcodeVUmin + + // OpcodeVImax performs a signed integer max: `v = VImax.lane x, y` on vector. + OpcodeVImax + + // OpcodeVUmax performs an unsigned integer max: `v = VUmax.lane x, y` on vector. + OpcodeVUmax + + // OpcodeVAvgRound performs an unsigned integer rounding average: `v = VAvgRound.lane x, y` on vector. + OpcodeVAvgRound + + // OpcodeVImul performs an integer multiplication: `v = VImul.lane x, y` on vector. + OpcodeVImul + + // OpcodeVIneg negates the given integer vector value: `v = VIneg x`. + OpcodeVIneg + + // OpcodeVIpopcnt counts the number of 1-bits in the given vector: `v = VIpopcnt x`. + OpcodeVIpopcnt + + // OpcodeVIabs returns the absolute value for the given vector value: `v = VIabs.lane x`. + OpcodeVIabs + + // OpcodeVIshl shifts x left by (y mod lane-width): `v = VIshl.lane x, y` on vector. + OpcodeVIshl + + // OpcodeVUshr shifts x right by (y mod lane-width), unsigned: `v = VUshr.lane x, y` on vector. + OpcodeVUshr + + // OpcodeVSshr shifts x right by (y mod lane-width), signed: `v = VSshr.lane x, y` on vector. + OpcodeVSshr + + // OpcodeVFabs takes the absolute value of a floating point value: `v = VFabs.lane x` on vector. + OpcodeVFabs + + // OpcodeVFmax takes the maximum of two floating point values: `v = VFmax.lane x, y` on vector. + OpcodeVFmax + + // OpcodeVFmin takes the minimum of two floating point values: `v = VFmin.lane x, y` on vector. + OpcodeVFmin + + // OpcodeVFneg negates the given floating point vector value: `v = VFneg x`. + OpcodeVFneg + + // OpcodeVFadd performs a floating point addition: `v = VFadd.lane x, y` on vector. + OpcodeVFadd + + // OpcodeVFsub performs a floating point subtraction: `v = VFsub.lane x, y` on vector. + OpcodeVFsub + + // OpcodeVFmul performs a floating point multiplication: `v = VFmul.lane x, y` on vector. + OpcodeVFmul + + // OpcodeVFdiv performs a floating point division: `v = VFdiv.lane x, y` on vector. + OpcodeVFdiv + + // OpcodeVFcmp compares two float values with the given condition: `v = VFcmp.lane Cond, x, y` on vector. + OpcodeVFcmp + + // OpcodeVCeil takes the ceiling of the given floating point value: `v = ceil.lane x` on vector. + OpcodeVCeil + + // OpcodeVFloor takes the floor of the given floating point value: `v = floor.lane x` on vector. + OpcodeVFloor + + // OpcodeVTrunc takes the truncation of the given floating point value: `v = trunc.lane x` on vector. + OpcodeVTrunc + + // OpcodeVNearest takes the nearest integer of the given floating point value: `v = nearest.lane x` on vector. + OpcodeVNearest + + // OpcodeVMaxPseudo computes the lane-wise maximum value `v = VMaxPseudo.lane x, y` on vector defined as `x < y ? y : x`. + OpcodeVMaxPseudo + + // OpcodeVMinPseudo computes the lane-wise minimum value `v = VMinPseudo.lane x, y` on vector defined as `y < x ? y : x`. + OpcodeVMinPseudo + + // OpcodeVSqrt takes the square root of the given floating point value: `v = VSqrt.lane x` on vector. + OpcodeVSqrt + + // OpcodeVFcvtToUintSat converts a floating point value to an unsigned integer: `v = FcvtToUintSat.lane x` on vector. + OpcodeVFcvtToUintSat + + // OpcodeVFcvtToSintSat converts a floating point value to a signed integer: `v = VFcvtToSintSat.lane x` on vector. + OpcodeVFcvtToSintSat + + // OpcodeVFcvtFromUint converts a floating point value from an unsigned integer: `v = FcvtFromUint.lane x` on vector.
+ // x is always a 32-bit integer lane, and the result is either a 32-bit or 64-bit floating point-sized vector. + OpcodeVFcvtFromUint + + // OpcodeVFcvtFromSint converts a floating point value from a signed integer: `v = VFcvtFromSint.lane x` on vector. + // x is always a 32-bit integer lane, and the result is either a 32-bit or 64-bit floating point-sized vector. + OpcodeVFcvtFromSint + + // OpcodeImul performs an integer multiplication: `v = Imul x, y`. + OpcodeImul + + // OpcodeUdiv performs the unsigned integer division `v = Udiv x, y`. + OpcodeUdiv + + // OpcodeSdiv performs the signed integer division `v = Sdiv x, y`. + OpcodeSdiv + + // OpcodeUrem computes the remainder of the unsigned integer division `v = Urem x, y`. + OpcodeUrem + + // OpcodeSrem computes the remainder of the signed integer division `v = Srem x, y`. + OpcodeSrem + + // OpcodeBand performs a binary and: `v = Band x, y`. + OpcodeBand + + // OpcodeBor performs a binary or: `v = Bor x, y`. + OpcodeBor + + // OpcodeBxor performs a binary xor: `v = Bxor x, y`. + OpcodeBxor + + // OpcodeBnot performs a binary not: `v = Bnot x`. + OpcodeBnot + + // OpcodeRotl rotates the given integer value to the left: `v = Rotl x, y`. + OpcodeRotl + + // OpcodeRotr rotates the given integer value to the right: `v = Rotr x, y`. + OpcodeRotr + + // OpcodeIshl does logical shift left: `v = Ishl x, y`. + OpcodeIshl + + // OpcodeUshr does logical shift right: `v = Ushr x, y`. + OpcodeUshr + + // OpcodeSshr does arithmetic shift right: `v = Sshr x, y`. + OpcodeSshr + + // OpcodeClz counts the number of leading zeros: `v = clz x`. + OpcodeClz + + // OpcodeCtz counts the number of trailing zeros: `v = ctz x`. + OpcodeCtz + + // OpcodePopcnt counts the number of 1-bits: `v = popcnt x`. + OpcodePopcnt + + // OpcodeFcmp compares two floating point values: `v = fcmp Cond, x, y`. + OpcodeFcmp + + // OpcodeFadd performs a floating point addition: / `v = Fadd x, y`. + OpcodeFadd + + // OpcodeFsub performs a floating point subtraction: `v = Fsub x, y`. + OpcodeFsub + + // OpcodeFmul performs a floating point multiplication: `v = Fmul x, y`. + OpcodeFmul + + // OpcodeSqmulRoundSat performs a lane-wise saturating rounding multiplication + // in Q15 format: `v = SqmulRoundSat.lane x,y` on vector. + OpcodeSqmulRoundSat + + // OpcodeFdiv performs a floating point division: `v = Fdiv x, y`. + OpcodeFdiv + + // OpcodeSqrt takes the square root of the given floating point value: `v = sqrt x`. + OpcodeSqrt + + // OpcodeFneg negates the given floating point value: `v = Fneg x`. + OpcodeFneg + + // OpcodeFabs takes the absolute value of the given floating point value: `v = fabs x`. + OpcodeFabs + + // OpcodeFcopysign copies the sign of the second floating point value to the first floating point value: + // `v = Fcopysign x, y`. + OpcodeFcopysign + + // OpcodeFmin takes the minimum of two floating point values: `v = fmin x, y`. + OpcodeFmin + + // OpcodeFmax takes the maximum of two floating point values: `v = fmax x, y`. + OpcodeFmax + + // OpcodeCeil takes the ceiling of the given floating point value: `v = ceil x`. + OpcodeCeil + + // OpcodeFloor takes the floor of the given floating point value: `v = floor x`. + OpcodeFloor + + // OpcodeTrunc takes the truncation of the given floating point value: `v = trunc x`. + OpcodeTrunc + + // OpcodeNearest takes the nearest integer of the given floating point value: `v = nearest x`. + OpcodeNearest + + // OpcodeBitcast is a bitcast operation: `v = bitcast x`. 
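// The scalar bit and float opcodes above (Rotl, Clz, Ctz, Popcnt, Fcopysign, ...) have direct Go
// standard-library counterparts, which is a convenient way to sanity-check expected results:
package main

import (
	"fmt"
	"math"
	"math/bits"
)

func main() {
	x := uint32(0b1010_0000)
	fmt.Println(bits.RotateLeft32(x, 1)) // rotl: 320
	fmt.Println(bits.LeadingZeros32(x))  // clz: 24
	fmt.Println(bits.TrailingZeros32(x)) // ctz: 5
	fmt.Println(bits.OnesCount32(x))     // popcnt: 2
	fmt.Println(math.Copysign(1.5, -2))  // fcopysign: -1.5
}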
+ OpcodeBitcast + + // OpcodeIreduce narrow the given integer: `v = Ireduce x`. + OpcodeIreduce + + // OpcodeSnarrow converts two input vectors x, y into a smaller lane vector by narrowing each lane, signed `v = Snarrow.lane x, y`. + OpcodeSnarrow + + // OpcodeUnarrow converts two input vectors x, y into a smaller lane vector by narrowing each lane, unsigned `v = Unarrow.lane x, y`. + OpcodeUnarrow + + // OpcodeSwidenLow converts low half of the smaller lane vector to a larger lane vector, sign extended: `v = SwidenLow.lane x`. + OpcodeSwidenLow + + // OpcodeSwidenHigh converts high half of the smaller lane vector to a larger lane vector, sign extended: `v = SwidenHigh.lane x`. + OpcodeSwidenHigh + + // OpcodeUwidenLow converts low half of the smaller lane vector to a larger lane vector, zero (unsigned) extended: `v = UwidenLow.lane x`. + OpcodeUwidenLow + + // OpcodeUwidenHigh converts high half of the smaller lane vector to a larger lane vector, zero (unsigned) extended: `v = UwidenHigh.lane x`. + OpcodeUwidenHigh + + // OpcodeExtIaddPairwise is a lane-wise integer extended pairwise addition producing extended results (twice wider results than the inputs): `v = extiadd_pairwise x, y` on vector. + OpcodeExtIaddPairwise + + // OpcodeWideningPairwiseDotProductS is a lane-wise widening pairwise dot product with signed saturation: `v = WideningPairwiseDotProductS x, y` on vector. + // Currently, the only lane is i16, and the result is i32. + OpcodeWideningPairwiseDotProductS + + // OpcodeUExtend zero-extends the given integer: `v = UExtend x, from->to`. + OpcodeUExtend + + // OpcodeSExtend sign-extends the given integer: `v = SExtend x, from->to`. + OpcodeSExtend + + // OpcodeFpromote promotes the given floating point value: `v = Fpromote x`. + OpcodeFpromote + + // OpcodeFvpromoteLow converts the two lower single-precision floating point lanes + // to the two double-precision lanes of the result: `v = FvpromoteLow.lane x` on vector. + OpcodeFvpromoteLow + + // OpcodeFdemote demotes the given float point value: `v = Fdemote x`. + OpcodeFdemote + + // OpcodeFvdemote converts the two double-precision floating point lanes + // to two lower single-precision lanes of the result `v = Fvdemote.lane x`. + OpcodeFvdemote + + // OpcodeFcvtToUint converts a floating point value to an unsigned integer: `v = FcvtToUint x`. + OpcodeFcvtToUint + + // OpcodeFcvtToSint converts a floating point value to a signed integer: `v = FcvtToSint x`. + OpcodeFcvtToSint + + // OpcodeFcvtToUintSat converts a floating point value to an unsigned integer: `v = FcvtToUintSat x` which saturates on overflow. + OpcodeFcvtToUintSat + + // OpcodeFcvtToSintSat converts a floating point value to a signed integer: `v = FcvtToSintSat x` which saturates on overflow. + OpcodeFcvtToSintSat + + // OpcodeFcvtFromUint converts an unsigned integer to a floating point value: `v = FcvtFromUint x`. + OpcodeFcvtFromUint + + // OpcodeFcvtFromSint converts a signed integer to a floating point value: `v = FcvtFromSint x`. + OpcodeFcvtFromSint + + // OpcodeAtomicRmw is atomic read-modify-write operation: `v = atomic_rmw op, p, offset, value`. + OpcodeAtomicRmw + + // OpcodeAtomicCas is atomic compare-and-swap operation. + OpcodeAtomicCas + + // OpcodeAtomicLoad is atomic load operation. + OpcodeAtomicLoad + + // OpcodeAtomicStore is atomic store operation. + OpcodeAtomicStore + + // OpcodeFence is a memory fence operation. + OpcodeFence + + // opcodeEnd marks the end of the opcode list. 
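// FcvtToSintSat/FcvtToUintSat above clamp out-of-range inputs and map NaN to zero instead of
// trapping (the non-saturating variants trap; see the side-effect table below). A standalone
// scalar sketch of the saturating signed conversion to 32 bits:
package main

import (
	"fmt"
	"math"
)

func fcvtToSint32Sat(f float64) int32 {
	switch {
	case math.IsNaN(f):
		return 0
	case f <= math.MinInt32:
		return math.MinInt32
	case f >= math.MaxInt32:
		return math.MaxInt32
	default:
		return int32(f) // in-range values truncate toward zero
	}
}

func main() {
	fmt.Println(fcvtToSint32Sat(1e12))       // 2147483647
	fmt.Println(fcvtToSint32Sat(-1e12))      // -2147483648
	fmt.Println(fcvtToSint32Sat(math.NaN())) // 0
	fmt.Println(fcvtToSint32Sat(-3.9))       // -3
}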
+ opcodeEnd +) + +// AtomicRmwOp represents the atomic read-modify-write operation. +type AtomicRmwOp byte + +const ( + // AtomicRmwOpAdd is an atomic add operation. + AtomicRmwOpAdd AtomicRmwOp = iota + // AtomicRmwOpSub is an atomic sub operation. + AtomicRmwOpSub + // AtomicRmwOpAnd is an atomic and operation. + AtomicRmwOpAnd + // AtomicRmwOpOr is an atomic or operation. + AtomicRmwOpOr + // AtomicRmwOpXor is an atomic xor operation. + AtomicRmwOpXor + // AtomicRmwOpXchg is an atomic swap operation. + AtomicRmwOpXchg +) + +// String implements the fmt.Stringer. +func (op AtomicRmwOp) String() string { + switch op { + case AtomicRmwOpAdd: + return "add" + case AtomicRmwOpSub: + return "sub" + case AtomicRmwOpAnd: + return "and" + case AtomicRmwOpOr: + return "or" + case AtomicRmwOpXor: + return "xor" + case AtomicRmwOpXchg: + return "xchg" + } + panic(fmt.Sprintf("unknown AtomicRmwOp: %d", op)) +} + +// returnTypesFn provides the info to determine the type of instruction. +// t1 is the type of the first result, ts are the types of the remaining results. +type returnTypesFn func(b *builder, instr *Instruction) (t1 Type, ts []Type) + +var ( + returnTypesFnNoReturns returnTypesFn = func(b *builder, instr *Instruction) (t1 Type, ts []Type) { return typeInvalid, nil } + returnTypesFnSingle = func(b *builder, instr *Instruction) (t1 Type, ts []Type) { return instr.typ, nil } + returnTypesFnI32 = func(b *builder, instr *Instruction) (t1 Type, ts []Type) { return TypeI32, nil } + returnTypesFnF32 = func(b *builder, instr *Instruction) (t1 Type, ts []Type) { return TypeF32, nil } + returnTypesFnF64 = func(b *builder, instr *Instruction) (t1 Type, ts []Type) { return TypeF64, nil } + returnTypesFnV128 = func(b *builder, instr *Instruction) (t1 Type, ts []Type) { return TypeV128, nil } +) + +// sideEffect provides the info to determine if an instruction has side effects which +// is used to determine if it can be optimized out, interchanged with others, etc. +type sideEffect byte + +const ( + sideEffectUnknown sideEffect = iota + // sideEffectStrict represents an instruction with side effects, and should be always alive plus cannot be reordered. + sideEffectStrict + // sideEffectTraps represents an instruction that can trap, and should be always alive but can be reordered within the group. + sideEffectTraps + // sideEffectNone represents an instruction without side effects, and can be eliminated if the result is not used, plus can be reordered within the group. + sideEffectNone +) + +// instructionSideEffects provides the info to determine if an instruction has side effects. +// Instructions with side effects must not be eliminated regardless whether the result is used or not. 
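// The three side-effect classes above answer two independent questions for the optimizer: may the
// instruction be deleted when its result is unused, and may it be moved within its instruction
// group? A toy decision helper mirroring that reading (illustrative only, not the wazero API):
package main

import "fmt"

type effect byte

const (
	strict effect = iota // always alive, never reordered (stores, calls, branches, ...)
	traps                // always alive, but may be reordered within the group
	none                 // removable if unused, and may be reordered within the group
)

func removableIfUnused(e effect) bool  { return e == none }
func reorderableInGroup(e effect) bool { return e == none || e == traps }

func main() {
	fmt.Println(removableIfUnused(traps), reorderableInGroup(traps))   // false true
	fmt.Println(removableIfUnused(strict), reorderableInGroup(strict)) // false false
}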
+var instructionSideEffects = [opcodeEnd]sideEffect{ + OpcodeUndefined: sideEffectStrict, + OpcodeJump: sideEffectStrict, + OpcodeIconst: sideEffectNone, + OpcodeCall: sideEffectStrict, + OpcodeCallIndirect: sideEffectStrict, + OpcodeIadd: sideEffectNone, + OpcodeImul: sideEffectNone, + OpcodeIsub: sideEffectNone, + OpcodeIcmp: sideEffectNone, + OpcodeExtractlane: sideEffectNone, + OpcodeInsertlane: sideEffectNone, + OpcodeBand: sideEffectNone, + OpcodeBor: sideEffectNone, + OpcodeBxor: sideEffectNone, + OpcodeRotl: sideEffectNone, + OpcodeRotr: sideEffectNone, + OpcodeFcmp: sideEffectNone, + OpcodeFadd: sideEffectNone, + OpcodeClz: sideEffectNone, + OpcodeCtz: sideEffectNone, + OpcodePopcnt: sideEffectNone, + OpcodeLoad: sideEffectNone, + OpcodeLoadSplat: sideEffectNone, + OpcodeUload8: sideEffectNone, + OpcodeUload16: sideEffectNone, + OpcodeUload32: sideEffectNone, + OpcodeSload8: sideEffectNone, + OpcodeSload16: sideEffectNone, + OpcodeSload32: sideEffectNone, + OpcodeSExtend: sideEffectNone, + OpcodeUExtend: sideEffectNone, + OpcodeSwidenLow: sideEffectNone, + OpcodeUwidenLow: sideEffectNone, + OpcodeSwidenHigh: sideEffectNone, + OpcodeUwidenHigh: sideEffectNone, + OpcodeSnarrow: sideEffectNone, + OpcodeUnarrow: sideEffectNone, + OpcodeSwizzle: sideEffectNone, + OpcodeShuffle: sideEffectNone, + OpcodeSplat: sideEffectNone, + OpcodeFsub: sideEffectNone, + OpcodeF32const: sideEffectNone, + OpcodeF64const: sideEffectNone, + OpcodeIshl: sideEffectNone, + OpcodeSshr: sideEffectNone, + OpcodeUshr: sideEffectNone, + OpcodeStore: sideEffectStrict, + OpcodeIstore8: sideEffectStrict, + OpcodeIstore16: sideEffectStrict, + OpcodeIstore32: sideEffectStrict, + OpcodeExitWithCode: sideEffectStrict, + OpcodeExitIfTrueWithCode: sideEffectStrict, + OpcodeReturn: sideEffectStrict, + OpcodeBrz: sideEffectStrict, + OpcodeBrnz: sideEffectStrict, + OpcodeBrTable: sideEffectStrict, + OpcodeFdiv: sideEffectNone, + OpcodeFmul: sideEffectNone, + OpcodeFmax: sideEffectNone, + OpcodeSqmulRoundSat: sideEffectNone, + OpcodeSelect: sideEffectNone, + OpcodeFmin: sideEffectNone, + OpcodeFneg: sideEffectNone, + OpcodeFcvtToSint: sideEffectTraps, + OpcodeFcvtToUint: sideEffectTraps, + OpcodeFcvtFromSint: sideEffectNone, + OpcodeFcvtFromUint: sideEffectNone, + OpcodeFcvtToSintSat: sideEffectNone, + OpcodeFcvtToUintSat: sideEffectNone, + OpcodeVFcvtFromUint: sideEffectNone, + OpcodeVFcvtFromSint: sideEffectNone, + OpcodeFdemote: sideEffectNone, + OpcodeFvpromoteLow: sideEffectNone, + OpcodeFvdemote: sideEffectNone, + OpcodeFpromote: sideEffectNone, + OpcodeBitcast: sideEffectNone, + OpcodeIreduce: sideEffectNone, + OpcodeSqrt: sideEffectNone, + OpcodeCeil: sideEffectNone, + OpcodeFloor: sideEffectNone, + OpcodeTrunc: sideEffectNone, + OpcodeNearest: sideEffectNone, + OpcodeSdiv: sideEffectTraps, + OpcodeSrem: sideEffectTraps, + OpcodeUdiv: sideEffectTraps, + OpcodeUrem: sideEffectTraps, + OpcodeFabs: sideEffectNone, + OpcodeFcopysign: sideEffectNone, + OpcodeExtIaddPairwise: sideEffectNone, + OpcodeVconst: sideEffectNone, + OpcodeVbor: sideEffectNone, + OpcodeVbxor: sideEffectNone, + OpcodeVband: sideEffectNone, + OpcodeVbandnot: sideEffectNone, + OpcodeVbnot: sideEffectNone, + OpcodeVbitselect: sideEffectNone, + OpcodeVanyTrue: sideEffectNone, + OpcodeVallTrue: sideEffectNone, + OpcodeVhighBits: sideEffectNone, + OpcodeVIadd: sideEffectNone, + OpcodeVSaddSat: sideEffectNone, + OpcodeVUaddSat: sideEffectNone, + OpcodeVIsub: sideEffectNone, + OpcodeVSsubSat: sideEffectNone, + OpcodeVUsubSat: sideEffectNone, + OpcodeVIcmp: 
sideEffectNone, + OpcodeVImin: sideEffectNone, + OpcodeVUmin: sideEffectNone, + OpcodeVImax: sideEffectNone, + OpcodeVUmax: sideEffectNone, + OpcodeVAvgRound: sideEffectNone, + OpcodeVImul: sideEffectNone, + OpcodeVIabs: sideEffectNone, + OpcodeVIneg: sideEffectNone, + OpcodeVIpopcnt: sideEffectNone, + OpcodeVIshl: sideEffectNone, + OpcodeVSshr: sideEffectNone, + OpcodeVUshr: sideEffectNone, + OpcodeVSqrt: sideEffectNone, + OpcodeVFabs: sideEffectNone, + OpcodeVFmin: sideEffectNone, + OpcodeVFmax: sideEffectNone, + OpcodeVFneg: sideEffectNone, + OpcodeVFadd: sideEffectNone, + OpcodeVFsub: sideEffectNone, + OpcodeVFmul: sideEffectNone, + OpcodeVFdiv: sideEffectNone, + OpcodeVFcmp: sideEffectNone, + OpcodeVCeil: sideEffectNone, + OpcodeVFloor: sideEffectNone, + OpcodeVTrunc: sideEffectNone, + OpcodeVNearest: sideEffectNone, + OpcodeVMaxPseudo: sideEffectNone, + OpcodeVMinPseudo: sideEffectNone, + OpcodeVFcvtToUintSat: sideEffectNone, + OpcodeVFcvtToSintSat: sideEffectNone, + OpcodeVZeroExtLoad: sideEffectNone, + OpcodeAtomicRmw: sideEffectStrict, + OpcodeAtomicLoad: sideEffectStrict, + OpcodeAtomicStore: sideEffectStrict, + OpcodeAtomicCas: sideEffectStrict, + OpcodeFence: sideEffectStrict, + OpcodeWideningPairwiseDotProductS: sideEffectNone, +} + +// sideEffect returns true if this instruction has side effects. +func (i *Instruction) sideEffect() sideEffect { + if e := instructionSideEffects[i.opcode]; e == sideEffectUnknown { + panic("BUG: side effect info not registered for " + i.opcode.String()) + } else { + return e + } +} + +// instructionReturnTypes provides the function to determine the return types of an instruction. +var instructionReturnTypes = [opcodeEnd]returnTypesFn{ + OpcodeExtIaddPairwise: returnTypesFnV128, + OpcodeVbor: returnTypesFnV128, + OpcodeVbxor: returnTypesFnV128, + OpcodeVband: returnTypesFnV128, + OpcodeVbnot: returnTypesFnV128, + OpcodeVbandnot: returnTypesFnV128, + OpcodeVbitselect: returnTypesFnV128, + OpcodeVanyTrue: returnTypesFnI32, + OpcodeVallTrue: returnTypesFnI32, + OpcodeVhighBits: returnTypesFnI32, + OpcodeVIadd: returnTypesFnV128, + OpcodeVSaddSat: returnTypesFnV128, + OpcodeVUaddSat: returnTypesFnV128, + OpcodeVIsub: returnTypesFnV128, + OpcodeVSsubSat: returnTypesFnV128, + OpcodeVUsubSat: returnTypesFnV128, + OpcodeVIcmp: returnTypesFnV128, + OpcodeVImin: returnTypesFnV128, + OpcodeVUmin: returnTypesFnV128, + OpcodeVImax: returnTypesFnV128, + OpcodeVUmax: returnTypesFnV128, + OpcodeVImul: returnTypesFnV128, + OpcodeVAvgRound: returnTypesFnV128, + OpcodeVIabs: returnTypesFnV128, + OpcodeVIneg: returnTypesFnV128, + OpcodeVIpopcnt: returnTypesFnV128, + OpcodeVIshl: returnTypesFnV128, + OpcodeVSshr: returnTypesFnV128, + OpcodeVUshr: returnTypesFnV128, + OpcodeExtractlane: returnTypesFnSingle, + OpcodeInsertlane: returnTypesFnV128, + OpcodeBand: returnTypesFnSingle, + OpcodeFcopysign: returnTypesFnSingle, + OpcodeBitcast: returnTypesFnSingle, + OpcodeBor: returnTypesFnSingle, + OpcodeBxor: returnTypesFnSingle, + OpcodeRotl: returnTypesFnSingle, + OpcodeRotr: returnTypesFnSingle, + OpcodeIshl: returnTypesFnSingle, + OpcodeSshr: returnTypesFnSingle, + OpcodeSdiv: returnTypesFnSingle, + OpcodeSrem: returnTypesFnSingle, + OpcodeUdiv: returnTypesFnSingle, + OpcodeUrem: returnTypesFnSingle, + OpcodeUshr: returnTypesFnSingle, + OpcodeJump: returnTypesFnNoReturns, + OpcodeUndefined: returnTypesFnNoReturns, + OpcodeIconst: returnTypesFnSingle, + OpcodeSelect: returnTypesFnSingle, + OpcodeSExtend: returnTypesFnSingle, + OpcodeUExtend: returnTypesFnSingle, + 
OpcodeSwidenLow: returnTypesFnV128, + OpcodeUwidenLow: returnTypesFnV128, + OpcodeSwidenHigh: returnTypesFnV128, + OpcodeUwidenHigh: returnTypesFnV128, + OpcodeSnarrow: returnTypesFnV128, + OpcodeUnarrow: returnTypesFnV128, + OpcodeSwizzle: returnTypesFnSingle, + OpcodeShuffle: returnTypesFnV128, + OpcodeSplat: returnTypesFnV128, + OpcodeIreduce: returnTypesFnSingle, + OpcodeFabs: returnTypesFnSingle, + OpcodeSqrt: returnTypesFnSingle, + OpcodeCeil: returnTypesFnSingle, + OpcodeFloor: returnTypesFnSingle, + OpcodeTrunc: returnTypesFnSingle, + OpcodeNearest: returnTypesFnSingle, + OpcodeCallIndirect: func(b *builder, instr *Instruction) (t1 Type, ts []Type) { + sigID := SignatureID(instr.u1) + sig, ok := b.signatures[sigID] + if !ok { + panic("BUG") + } + switch len(sig.Results) { + case 0: + t1 = typeInvalid + case 1: + t1 = sig.Results[0] + default: + t1, ts = sig.Results[0], sig.Results[1:] + } + return + }, + OpcodeCall: func(b *builder, instr *Instruction) (t1 Type, ts []Type) { + sigID := SignatureID(instr.u2) + sig, ok := b.signatures[sigID] + if !ok { + panic("BUG") + } + switch len(sig.Results) { + case 0: + t1 = typeInvalid + case 1: + t1 = sig.Results[0] + default: + t1, ts = sig.Results[0], sig.Results[1:] + } + return + }, + OpcodeLoad: returnTypesFnSingle, + OpcodeVZeroExtLoad: returnTypesFnV128, + OpcodeLoadSplat: returnTypesFnV128, + OpcodeIadd: returnTypesFnSingle, + OpcodeIsub: returnTypesFnSingle, + OpcodeImul: returnTypesFnSingle, + OpcodeIcmp: returnTypesFnI32, + OpcodeFcmp: returnTypesFnI32, + OpcodeFadd: returnTypesFnSingle, + OpcodeFsub: returnTypesFnSingle, + OpcodeFdiv: returnTypesFnSingle, + OpcodeFmul: returnTypesFnSingle, + OpcodeFmax: returnTypesFnSingle, + OpcodeFmin: returnTypesFnSingle, + OpcodeSqmulRoundSat: returnTypesFnV128, + OpcodeF32const: returnTypesFnF32, + OpcodeF64const: returnTypesFnF64, + OpcodeClz: returnTypesFnSingle, + OpcodeCtz: returnTypesFnSingle, + OpcodePopcnt: returnTypesFnSingle, + OpcodeStore: returnTypesFnNoReturns, + OpcodeIstore8: returnTypesFnNoReturns, + OpcodeIstore16: returnTypesFnNoReturns, + OpcodeIstore32: returnTypesFnNoReturns, + OpcodeExitWithCode: returnTypesFnNoReturns, + OpcodeExitIfTrueWithCode: returnTypesFnNoReturns, + OpcodeReturn: returnTypesFnNoReturns, + OpcodeBrz: returnTypesFnNoReturns, + OpcodeBrnz: returnTypesFnNoReturns, + OpcodeBrTable: returnTypesFnNoReturns, + OpcodeUload8: returnTypesFnSingle, + OpcodeUload16: returnTypesFnSingle, + OpcodeUload32: returnTypesFnSingle, + OpcodeSload8: returnTypesFnSingle, + OpcodeSload16: returnTypesFnSingle, + OpcodeSload32: returnTypesFnSingle, + OpcodeFcvtToSint: returnTypesFnSingle, + OpcodeFcvtToUint: returnTypesFnSingle, + OpcodeFcvtFromSint: returnTypesFnSingle, + OpcodeFcvtFromUint: returnTypesFnSingle, + OpcodeFcvtToSintSat: returnTypesFnSingle, + OpcodeFcvtToUintSat: returnTypesFnSingle, + OpcodeVFcvtFromUint: returnTypesFnV128, + OpcodeVFcvtFromSint: returnTypesFnV128, + OpcodeFneg: returnTypesFnSingle, + OpcodeFdemote: returnTypesFnF32, + OpcodeFvdemote: returnTypesFnV128, + OpcodeFvpromoteLow: returnTypesFnV128, + OpcodeFpromote: returnTypesFnF64, + OpcodeVconst: returnTypesFnV128, + OpcodeVFabs: returnTypesFnV128, + OpcodeVSqrt: returnTypesFnV128, + OpcodeVFmax: returnTypesFnV128, + OpcodeVFmin: returnTypesFnV128, + OpcodeVFneg: returnTypesFnV128, + OpcodeVFadd: returnTypesFnV128, + OpcodeVFsub: returnTypesFnV128, + OpcodeVFmul: returnTypesFnV128, + OpcodeVFdiv: returnTypesFnV128, + OpcodeVFcmp: returnTypesFnV128, + OpcodeVCeil: returnTypesFnV128, + 
OpcodeVFloor: returnTypesFnV128, + OpcodeVTrunc: returnTypesFnV128, + OpcodeVNearest: returnTypesFnV128, + OpcodeVMaxPseudo: returnTypesFnV128, + OpcodeVMinPseudo: returnTypesFnV128, + OpcodeVFcvtToUintSat: returnTypesFnV128, + OpcodeVFcvtToSintSat: returnTypesFnV128, + OpcodeAtomicRmw: returnTypesFnSingle, + OpcodeAtomicLoad: returnTypesFnSingle, + OpcodeAtomicStore: returnTypesFnNoReturns, + OpcodeAtomicCas: returnTypesFnSingle, + OpcodeFence: returnTypesFnNoReturns, + OpcodeWideningPairwiseDotProductS: returnTypesFnV128, +} + +// AsLoad initializes this instruction as a load instruction with OpcodeLoad. +func (i *Instruction) AsLoad(ptr Value, offset uint32, typ Type) *Instruction { + i.opcode = OpcodeLoad + i.v = ptr + i.u1 = uint64(offset) + i.typ = typ + return i +} + +// AsExtLoad initializes this instruction as an extending load instruction with the given load opcode. +func (i *Instruction) AsExtLoad(op Opcode, ptr Value, offset uint32, dst64bit bool) *Instruction { + i.opcode = op + i.v = ptr + i.u1 = uint64(offset) + if dst64bit { + i.typ = TypeI64 + } else { + i.typ = TypeI32 + } + return i +} + +// AsVZeroExtLoad initializes this instruction as a load instruction with OpcodeVZeroExtLoad. +func (i *Instruction) AsVZeroExtLoad(ptr Value, offset uint32, scalarType Type) *Instruction { + i.opcode = OpcodeVZeroExtLoad + i.v = ptr + i.u1 = uint64(offset) + i.u2 = uint64(scalarType) + i.typ = TypeV128 + return i +} + +// VZeroExtLoadData returns the operands for a load instruction. The returned `typ` is the scalar type of the load target. +func (i *Instruction) VZeroExtLoadData() (ptr Value, offset uint32, typ Type) { + return i.v, uint32(i.u1), Type(i.u2) +} + +// AsLoadSplat initializes this instruction as a load instruction with OpcodeLoadSplat. +func (i *Instruction) AsLoadSplat(ptr Value, offset uint32, lane VecLane) *Instruction { + i.opcode = OpcodeLoadSplat + i.v = ptr + i.u1 = uint64(offset) + i.u2 = uint64(lane) + i.typ = TypeV128 + return i +} + +// LoadData returns the operands for a load instruction. +func (i *Instruction) LoadData() (ptr Value, offset uint32, typ Type) { + return i.v, uint32(i.u1), i.typ +} + +// LoadSplatData returns the operands for a load splat instruction. +func (i *Instruction) LoadSplatData() (ptr Value, offset uint32, lane VecLane) { + return i.v, uint32(i.u1), VecLane(i.u2) +} + +// AsStore initializes this instruction as a store instruction with OpcodeStore. +func (i *Instruction) AsStore(storeOp Opcode, value, ptr Value, offset uint32) *Instruction { + i.opcode = storeOp + i.v = value + i.v2 = ptr + + var dstSize uint64 + switch storeOp { + case OpcodeStore: + dstSize = uint64(value.Type().Bits()) + case OpcodeIstore8: + dstSize = 8 + case OpcodeIstore16: + dstSize = 16 + case OpcodeIstore32: + dstSize = 32 + default: + panic("invalid store opcode: " + storeOp.String()) + } + i.u1 = uint64(offset) | dstSize<<32 + return i +} + +// StoreData returns the operands for a store instruction. +func (i *Instruction) StoreData() (value, ptr Value, offset uint32, storeSizeInBits byte) { + return i.v, i.v2, uint32(i.u1), byte(i.u1 >> 32) +} + +// AsIconst64 initializes this instruction as a 64-bit integer constant instruction with OpcodeIconst. +func (i *Instruction) AsIconst64(v uint64) *Instruction { + i.opcode = OpcodeIconst + i.typ = TypeI64 + i.u1 = v + return i +} + +// AsIconst32 initializes this instruction as a 32-bit integer constant instruction with OpcodeIconst.
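// AsStore above packs the 32-bit offset and the store width in bits into the single u1 field
// (offset in the low half, width in the high half), and StoreData unpacks it. A standalone
// round-trip of that encoding:
package main

import "fmt"

func encodeStore(offset uint32, sizeInBits byte) uint64 {
	return uint64(offset) | uint64(sizeInBits)<<32
}

func decodeStore(u1 uint64) (offset uint32, sizeInBits byte) {
	return uint32(u1), byte(u1 >> 32)
}

func main() {
	u1 := encodeStore(16, 8) // e.g. an Istore8 at offset 16
	off, size := decodeStore(u1)
	fmt.Println(off, size) // 16 8
}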
+func (i *Instruction) AsIconst32(v uint32) *Instruction { + i.opcode = OpcodeIconst + i.typ = TypeI32 + i.u1 = uint64(v) + return i +} + +// AsIadd initializes this instruction as an integer addition instruction with OpcodeIadd. +func (i *Instruction) AsIadd(x, y Value) *Instruction { + i.opcode = OpcodeIadd + i.v = x + i.v2 = y + i.typ = x.Type() + return i +} + +// AsVIadd initializes this instruction as an integer addition instruction with OpcodeVIadd on a vector. +func (i *Instruction) AsVIadd(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVIadd + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsWideningPairwiseDotProductS initializes this instruction as a lane-wise widening pairwise dot product with signed saturation instruction +// with OpcodeWideningPairwiseDotProductS on a vector. +func (i *Instruction) AsWideningPairwiseDotProductS(x, y Value) *Instruction { + i.opcode = OpcodeWideningPairwiseDotProductS + i.v = x + i.v2 = y + i.typ = TypeV128 + return i +} + +// AsExtIaddPairwise initializes this instruction as a lane-wise integer extended pairwise addition instruction +// with OpcodeExtIaddPairwise on a vector. +func (i *Instruction) AsExtIaddPairwise(x Value, srcLane VecLane, signed bool) *Instruction { + i.opcode = OpcodeExtIaddPairwise + i.v = x + i.u1 = uint64(srcLane) + if signed { + i.u2 = 1 + } + i.typ = TypeV128 + return i +} + +// ExtIaddPairwiseData returns the operands for a lane-wise integer extended pairwise addition instruction. +func (i *Instruction) ExtIaddPairwiseData() (x Value, srcLane VecLane, signed bool) { + return i.v, VecLane(i.u1), i.u2 != 0 +} + +// AsVSaddSat initializes this instruction as a vector addition with saturation instruction with OpcodeVSaddSat on a vector. +func (i *Instruction) AsVSaddSat(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVSaddSat + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVUaddSat initializes this instruction as a vector addition with saturation instruction with OpcodeVUaddSat on a vector. +func (i *Instruction) AsVUaddSat(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVUaddSat + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVIsub initializes this instruction as an integer subtraction instruction with OpcodeVIsub on a vector. +func (i *Instruction) AsVIsub(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVIsub + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVSsubSat initializes this instruction as a vector subtraction with saturation instruction with OpcodeVSsubSat on a vector. +func (i *Instruction) AsVSsubSat(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVSsubSat + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVUsubSat initializes this instruction as a vector subtraction with saturation instruction with OpcodeVUsubSat on a vector. +func (i *Instruction) AsVUsubSat(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVUsubSat + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVImin initializes this instruction as a signed integer min instruction with OpcodeVImin on a vector. +func (i *Instruction) AsVImin(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVImin + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVUmin initializes this instruction as an unsigned integer min instruction with OpcodeVUmin on a vector.
+func (i *Instruction) AsVUmin(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVUmin + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVImax initializes this instruction as a signed integer max instruction with OpcodeVImax on a vector. +func (i *Instruction) AsVImax(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVImax + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVUmax initializes this instruction as an unsigned integer max instruction with OpcodeVUmax on a vector. +func (i *Instruction) AsVUmax(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVUmax + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVAvgRound initializes this instruction as an unsigned integer avg instruction, truncating to zero with OpcodeVAvgRound on a vector. +func (i *Instruction) AsVAvgRound(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVAvgRound + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVImul initializes this instruction as an integer multiplication with OpcodeVImul on a vector. +func (i *Instruction) AsVImul(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVImul + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsSqmulRoundSat initializes this instruction as a lane-wise saturating rounding multiplication +// in Q15 format with OpcodeSqmulRoundSat on a vector. +func (i *Instruction) AsSqmulRoundSat(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeSqmulRoundSat + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVIabs initializes this instruction as a vector absolute value with OpcodeVIabs. +func (i *Instruction) AsVIabs(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeVIabs + i.v = x + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVIneg initializes this instruction as a vector negation with OpcodeVIneg. +func (i *Instruction) AsVIneg(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeVIneg + i.v = x + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVIpopcnt initializes this instruction as a Population Count instruction with OpcodeVIpopcnt on a vector. +func (i *Instruction) AsVIpopcnt(x Value, lane VecLane) *Instruction { + if lane != VecLaneI8x16 { + panic("Unsupported lane type " + lane.String()) + } + i.opcode = OpcodeVIpopcnt + i.v = x + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVSqrt initializes this instruction as a sqrt instruction with OpcodeVSqrt on a vector. +func (i *Instruction) AsVSqrt(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeVSqrt + i.v = x + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVFabs initializes this instruction as a float abs instruction with OpcodeVFabs on a vector. +func (i *Instruction) AsVFabs(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeVFabs + i.v = x + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVFneg initializes this instruction as a float neg instruction with OpcodeVFneg on a vector. +func (i *Instruction) AsVFneg(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeVFneg + i.v = x + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVFmax initializes this instruction as a float max instruction with OpcodeVFmax on a vector. 
+func (i *Instruction) AsVFmax(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVFmax + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVFmin initializes this instruction as a float min instruction with OpcodeVFmin on a vector. +func (i *Instruction) AsVFmin(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVFmin + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVFadd initializes this instruction as a floating point add instruction with OpcodeVFadd on a vector. +func (i *Instruction) AsVFadd(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVFadd + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVFsub initializes this instruction as a floating point subtraction instruction with OpcodeVFsub on a vector. +func (i *Instruction) AsVFsub(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVFsub + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVFmul initializes this instruction as a floating point multiplication instruction with OpcodeVFmul on a vector. +func (i *Instruction) AsVFmul(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVFmul + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVFdiv initializes this instruction as a floating point division instruction with OpcodeVFdiv on a vector. +func (i *Instruction) AsVFdiv(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVFdiv + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsImul initializes this instruction as an integer multiplication instruction with OpcodeImul. +func (i *Instruction) AsImul(x, y Value) *Instruction { + i.opcode = OpcodeImul + i.v = x + i.v2 = y + i.typ = x.Type() + return i +} + +// Insert inserts this instruction into the Builder via InsertInstruction, and returns it. +func (i *Instruction) Insert(b Builder) *Instruction { + b.InsertInstruction(i) + return i +} + +// AsIsub initializes this instruction as an integer subtraction instruction with OpcodeIsub. +func (i *Instruction) AsIsub(x, y Value) *Instruction { + i.opcode = OpcodeIsub + i.v = x + i.v2 = y + i.typ = x.Type() + return i +} + +// AsIcmp initializes this instruction as an integer comparison instruction with OpcodeIcmp. +func (i *Instruction) AsIcmp(x, y Value, c IntegerCmpCond) *Instruction { + i.opcode = OpcodeIcmp + i.v = x + i.v2 = y + i.u1 = uint64(c) + i.typ = TypeI32 + return i +} + +// AsFcmp initializes this instruction as a floating-point comparison instruction with OpcodeFcmp. +func (i *Instruction) AsFcmp(x, y Value, c FloatCmpCond) { + i.opcode = OpcodeFcmp + i.v = x + i.v2 = y + i.u1 = uint64(c) + i.typ = TypeI32 +} + +// AsVIcmp initializes this instruction as an integer vector comparison instruction with OpcodeVIcmp. +func (i *Instruction) AsVIcmp(x, y Value, c IntegerCmpCond, lane VecLane) *Instruction { + i.opcode = OpcodeVIcmp + i.v = x + i.v2 = y + i.u1 = uint64(c) + i.u2 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsVFcmp initializes this instruction as a float comparison instruction with OpcodeVFcmp on vector. +func (i *Instruction) AsVFcmp(x, y Value, c FloatCmpCond, lane VecLane) *Instruction { + i.opcode = OpcodeVFcmp + i.v = x + i.v2 = y + i.u1 = uint64(c) + i.typ = TypeV128 + i.u2 = uint64(lane) + return i +} + +// AsVCeil initializes this instruction as an instruction with OpcodeVCeil.
+func (i *Instruction) AsVCeil(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeVCeil + i.v = x + i.typ = x.Type() + i.u1 = uint64(lane) + return i +} + +// AsVFloor initializes this instruction as an instruction with OpcodeVFloor. +func (i *Instruction) AsVFloor(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeVFloor + i.v = x + i.typ = x.Type() + i.u1 = uint64(lane) + return i +} + +// AsVTrunc initializes this instruction as an instruction with OpcodeVTrunc. +func (i *Instruction) AsVTrunc(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeVTrunc + i.v = x + i.typ = x.Type() + i.u1 = uint64(lane) + return i +} + +// AsVNearest initializes this instruction as an instruction with OpcodeVNearest. +func (i *Instruction) AsVNearest(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeVNearest + i.v = x + i.typ = x.Type() + i.u1 = uint64(lane) + return i +} + +// AsVMaxPseudo initializes this instruction as an instruction with OpcodeVMaxPseudo. +func (i *Instruction) AsVMaxPseudo(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVMaxPseudo + i.typ = x.Type() + i.v = x + i.v2 = y + i.u1 = uint64(lane) + return i +} + +// AsVMinPseudo initializes this instruction as an instruction with OpcodeVMinPseudo. +func (i *Instruction) AsVMinPseudo(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeVMinPseudo + i.typ = x.Type() + i.v = x + i.v2 = y + i.u1 = uint64(lane) + return i +} + +// AsSDiv initializes this instruction as a signed integer division instruction with OpcodeSdiv. +func (i *Instruction) AsSDiv(x, y, ctx Value) *Instruction { + i.opcode = OpcodeSdiv + i.v = x + i.v2 = y + i.v3 = ctx + i.typ = x.Type() + return i +} + +// AsUDiv initializes this instruction as an unsigned integer division instruction with OpcodeUdiv. +func (i *Instruction) AsUDiv(x, y, ctx Value) *Instruction { + i.opcode = OpcodeUdiv + i.v = x + i.v2 = y + i.v3 = ctx + i.typ = x.Type() + return i +} + +// AsSRem initializes this instruction as a signed integer remainder instruction with OpcodeSrem. +func (i *Instruction) AsSRem(x, y, ctx Value) *Instruction { + i.opcode = OpcodeSrem + i.v = x + i.v2 = y + i.v3 = ctx + i.typ = x.Type() + return i +} + +// AsURem initializes this instruction as an unsigned integer remainder instruction with OpcodeUrem. +func (i *Instruction) AsURem(x, y, ctx Value) *Instruction { + i.opcode = OpcodeUrem + i.v = x + i.v2 = y + i.v3 = ctx + i.typ = x.Type() + return i +} + +// AsBand initializes this instruction as an integer bitwise and instruction with OpcodeBand. +func (i *Instruction) AsBand(x, amount Value) *Instruction { + i.opcode = OpcodeBand + i.v = x + i.v2 = amount + i.typ = x.Type() + return i +} + +// AsBor initializes this instruction as an integer bitwise or instruction with OpcodeBor. +func (i *Instruction) AsBor(x, amount Value) { + i.opcode = OpcodeBor + i.v = x + i.v2 = amount + i.typ = x.Type() +} + +// AsBxor initializes this instruction as an integer bitwise xor instruction with OpcodeBxor. +func (i *Instruction) AsBxor(x, amount Value) { + i.opcode = OpcodeBxor + i.v = x + i.v2 = amount + i.typ = x.Type() +} + +// AsIshl initializes this instruction as an integer shift left instruction with OpcodeIshl. +func (i *Instruction) AsIshl(x, amount Value) *Instruction { + i.opcode = OpcodeIshl + i.v = x + i.v2 = amount + i.typ = x.Type() + return i +} + +// AsVIshl initializes this instruction as an integer shift left instruction with OpcodeVIshl on vector.
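// AsVMinPseudo/AsVMaxPseudo above build the Wasm-style "pmin"/"pmax" operations: plain
// compare-and-select ternaries, unlike VFmin/VFmax which follow the NaN-propagating min/max
// semantics. A scalar illustration, assuming pmax is `x < y ? y : x`:
package main

import (
	"fmt"
	"math"
)

func pmax(x, y float64) float64 {
	if x < y {
		return y
	}
	return x
}

func main() {
	nan := math.NaN()
	fmt.Println(pmax(1, nan))     // 1: the comparison against NaN is false, so x is kept
	fmt.Println(math.Max(1, nan)) // NaN: IEEE-style max propagates NaN
	negZero := math.Copysign(0, -1)
	fmt.Println(pmax(negZero, 0)) // -0: equal operands keep x; zero signs are not canonicalized
}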
+func (i *Instruction) AsVIshl(x, amount Value, lane VecLane) *Instruction { + i.opcode = OpcodeVIshl + i.v = x + i.v2 = amount + i.u1 = uint64(lane) + i.typ = x.Type() + return i +} + +// AsUshr initializes this instruction as an integer unsigned shift right (logical shift right) instruction with OpcodeUshr. +func (i *Instruction) AsUshr(x, amount Value) *Instruction { + i.opcode = OpcodeUshr + i.v = x + i.v2 = amount + i.typ = x.Type() + return i +} + +// AsVUshr initializes this instruction as an integer unsigned shift right (logical shift right) instruction with OpcodeVUshr on vector. +func (i *Instruction) AsVUshr(x, amount Value, lane VecLane) *Instruction { + i.opcode = OpcodeVUshr + i.v = x + i.v2 = amount + i.u1 = uint64(lane) + i.typ = x.Type() + return i +} + +// AsSshr initializes this instruction as an integer signed shift right (arithmetic shift right) instruction with OpcodeSshr. +func (i *Instruction) AsSshr(x, amount Value) *Instruction { + i.opcode = OpcodeSshr + i.v = x + i.v2 = amount + i.typ = x.Type() + return i +} + +// AsVSshr initializes this instruction as an integer signed shift right (arithmetic shift right) instruction with OpcodeVSshr on vector. +func (i *Instruction) AsVSshr(x, amount Value, lane VecLane) *Instruction { + i.opcode = OpcodeVSshr + i.v = x + i.v2 = amount + i.u1 = uint64(lane) + i.typ = x.Type() + return i +} + +// AsExtractlane initializes this instruction as an extract lane instruction with OpcodeExtractlane on vector. +func (i *Instruction) AsExtractlane(x Value, index byte, lane VecLane, signed bool) *Instruction { + i.opcode = OpcodeExtractlane + i.v = x + // We do not have a field for signedness, but `index` is a byte, + // so we just encode the flag in the high bits of `u1`. + i.u1 = uint64(index) + if signed { + i.u1 = i.u1 | 1<<32 + } + i.u2 = uint64(lane) + switch lane { + case VecLaneI8x16, VecLaneI16x8, VecLaneI32x4: + i.typ = TypeI32 + case VecLaneI64x2: + i.typ = TypeI64 + case VecLaneF32x4: + i.typ = TypeF32 + case VecLaneF64x2: + i.typ = TypeF64 + } + return i +} + +// AsInsertlane initializes this instruction as an insert lane instruction with OpcodeInsertlane on vector. +func (i *Instruction) AsInsertlane(x, y Value, index byte, lane VecLane) *Instruction { + i.opcode = OpcodeInsertlane + i.v = x + i.v2 = y + i.u1 = uint64(index) + i.u2 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsShuffle initializes this instruction as a shuffle instruction with OpcodeShuffle on vector. +func (i *Instruction) AsShuffle(x, y Value, lane []byte) *Instruction { + i.opcode = OpcodeShuffle + i.v = x + i.v2 = y + // Encode the 16 bytes as 8 bytes in u1, and 8 bytes in u2. + i.u1 = uint64(lane[7])<<56 | uint64(lane[6])<<48 | uint64(lane[5])<<40 | uint64(lane[4])<<32 | uint64(lane[3])<<24 | uint64(lane[2])<<16 | uint64(lane[1])<<8 | uint64(lane[0]) + i.u2 = uint64(lane[15])<<56 | uint64(lane[14])<<48 | uint64(lane[13])<<40 | uint64(lane[12])<<32 | uint64(lane[11])<<24 | uint64(lane[10])<<16 | uint64(lane[9])<<8 | uint64(lane[8]) + i.typ = TypeV128 + return i +} + +// AsSwizzle initializes this instruction as an insert lane instruction with OpcodeSwizzle on vector. +func (i *Instruction) AsSwizzle(x, y Value, lane VecLane) *Instruction { + i.opcode = OpcodeSwizzle + i.v = x + i.v2 = y + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsSplat initializes this instruction as an insert lane instruction with OpcodeSplat on vector. 
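+//
+// Sketch (illustrative; b and the scalar Value x are assumed to exist, and AllocateInstruction is
+// assumed to be the usual Builder allocation helper): broadcast a 32-bit integer into every lane
+// of a v128 value.
+//
+//	v := b.AllocateInstruction().AsSplat(x, VecLaneI32x4).Insert(b).Return()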
+func (i *Instruction) AsSplat(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeSplat + i.v = x + i.u1 = uint64(lane) + i.typ = TypeV128 + return i +} + +// AsRotl initializes this instruction as a word rotate left instruction with OpcodeRotl. +func (i *Instruction) AsRotl(x, amount Value) { + i.opcode = OpcodeRotl + i.v = x + i.v2 = amount + i.typ = x.Type() +} + +// AsRotr initializes this instruction as a word rotate right instruction with OpcodeRotr. +func (i *Instruction) AsRotr(x, amount Value) { + i.opcode = OpcodeRotr + i.v = x + i.v2 = amount + i.typ = x.Type() +} + +// IcmpData returns the operands and comparison condition of this integer comparison instruction. +func (i *Instruction) IcmpData() (x, y Value, c IntegerCmpCond) { + return i.v, i.v2, IntegerCmpCond(i.u1) +} + +// FcmpData returns the operands and comparison condition of this floating-point comparison instruction. +func (i *Instruction) FcmpData() (x, y Value, c FloatCmpCond) { + return i.v, i.v2, FloatCmpCond(i.u1) +} + +// VIcmpData returns the operands and comparison condition of this integer comparison instruction on vector. +func (i *Instruction) VIcmpData() (x, y Value, c IntegerCmpCond, l VecLane) { + return i.v, i.v2, IntegerCmpCond(i.u1), VecLane(i.u2) +} + +// VFcmpData returns the operands and comparison condition of this float comparison instruction on vector. +func (i *Instruction) VFcmpData() (x, y Value, c FloatCmpCond, l VecLane) { + return i.v, i.v2, FloatCmpCond(i.u1), VecLane(i.u2) +} + +// ExtractlaneData returns the operands and sign flag of Extractlane on vector. +func (i *Instruction) ExtractlaneData() (x Value, index byte, signed bool, l VecLane) { + x = i.v + index = byte(0b00001111 & i.u1) + signed = i.u1>>32 != 0 + l = VecLane(i.u2) + return +} + +// InsertlaneData returns the operands and sign flag of Insertlane on vector. +func (i *Instruction) InsertlaneData() (x, y Value, index byte, l VecLane) { + x = i.v + y = i.v2 + index = byte(i.u1) + l = VecLane(i.u2) + return +} + +// AsFadd initializes this instruction as a floating-point addition instruction with OpcodeFadd. +func (i *Instruction) AsFadd(x, y Value) { + i.opcode = OpcodeFadd + i.v = x + i.v2 = y + i.typ = x.Type() +} + +// AsFsub initializes this instruction as a floating-point subtraction instruction with OpcodeFsub. +func (i *Instruction) AsFsub(x, y Value) { + i.opcode = OpcodeFsub + i.v = x + i.v2 = y + i.typ = x.Type() +} + +// AsFmul initializes this instruction as a floating-point multiplication instruction with OpcodeFmul. +func (i *Instruction) AsFmul(x, y Value) { + i.opcode = OpcodeFmul + i.v = x + i.v2 = y + i.typ = x.Type() +} + +// AsFdiv initializes this instruction as a floating-point division instruction with OpcodeFdiv. +func (i *Instruction) AsFdiv(x, y Value) { + i.opcode = OpcodeFdiv + i.v = x + i.v2 = y + i.typ = x.Type() +} + +// AsFmin initializes this instruction to take the minimum of two floating-points with OpcodeFmin. +func (i *Instruction) AsFmin(x, y Value) { + i.opcode = OpcodeFmin + i.v = x + i.v2 = y + i.typ = x.Type() +} + +// AsFmax initializes this instruction to take the maximum of two floating-points with OpcodeFmax. +func (i *Instruction) AsFmax(x, y Value) { + i.opcode = OpcodeFmax + i.v = x + i.v2 = y + i.typ = x.Type() +} + +// AsF32const initializes this instruction as a 32-bit floating-point constant instruction with OpcodeF32const. 
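+//
+// The 32-bit IEEE-754 bit pattern is stored in u1 via math.Float32bits, so a consumer can recover
+// the value with the inverse conversion (sketch, for an instruction built with AsF32const):
+//
+//	f := math.Float32frombits(uint32(instr.ConstantVal()))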
+func (i *Instruction) AsF32const(f float32) *Instruction { + i.opcode = OpcodeF32const + i.typ = TypeF64 + i.u1 = uint64(math.Float32bits(f)) + return i +} + +// AsF64const initializes this instruction as a 64-bit floating-point constant instruction with OpcodeF64const. +func (i *Instruction) AsF64const(f float64) *Instruction { + i.opcode = OpcodeF64const + i.typ = TypeF64 + i.u1 = math.Float64bits(f) + return i +} + +// AsVconst initializes this instruction as a vector constant instruction with OpcodeVconst. +func (i *Instruction) AsVconst(lo, hi uint64) *Instruction { + i.opcode = OpcodeVconst + i.typ = TypeV128 + i.u1 = lo + i.u2 = hi + return i +} + +// AsVbnot initializes this instruction as a vector negation instruction with OpcodeVbnot. +func (i *Instruction) AsVbnot(v Value) *Instruction { + i.opcode = OpcodeVbnot + i.typ = TypeV128 + i.v = v + return i +} + +// AsVband initializes this instruction as an and vector instruction with OpcodeVband. +func (i *Instruction) AsVband(x, y Value) *Instruction { + i.opcode = OpcodeVband + i.typ = TypeV128 + i.v = x + i.v2 = y + return i +} + +// AsVbor initializes this instruction as an or vector instruction with OpcodeVbor. +func (i *Instruction) AsVbor(x, y Value) *Instruction { + i.opcode = OpcodeVbor + i.typ = TypeV128 + i.v = x + i.v2 = y + return i +} + +// AsVbxor initializes this instruction as a xor vector instruction with OpcodeVbxor. +func (i *Instruction) AsVbxor(x, y Value) *Instruction { + i.opcode = OpcodeVbxor + i.typ = TypeV128 + i.v = x + i.v2 = y + return i +} + +// AsVbandnot initializes this instruction as an and-not vector instruction with OpcodeVbandnot. +func (i *Instruction) AsVbandnot(x, y Value) *Instruction { + i.opcode = OpcodeVbandnot + i.typ = TypeV128 + i.v = x + i.v2 = y + return i +} + +// AsVbitselect initializes this instruction as a bit select vector instruction with OpcodeVbitselect. +func (i *Instruction) AsVbitselect(c, x, y Value) *Instruction { + i.opcode = OpcodeVbitselect + i.typ = TypeV128 + i.v = c + i.v2 = x + i.v3 = y + return i +} + +// AsVanyTrue initializes this instruction as an anyTrue vector instruction with OpcodeVanyTrue. +func (i *Instruction) AsVanyTrue(x Value) *Instruction { + i.opcode = OpcodeVanyTrue + i.typ = TypeI32 + i.v = x + return i +} + +// AsVallTrue initializes this instruction as an allTrue vector instruction with OpcodeVallTrue. +func (i *Instruction) AsVallTrue(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeVallTrue + i.typ = TypeI32 + i.v = x + i.u1 = uint64(lane) + return i +} + +// AsVhighBits initializes this instruction as a highBits vector instruction with OpcodeVhighBits. +func (i *Instruction) AsVhighBits(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeVhighBits + i.typ = TypeI32 + i.v = x + i.u1 = uint64(lane) + return i +} + +// VconstData returns the operands of this vector constant instruction. +func (i *Instruction) VconstData() (lo, hi uint64) { + return i.u1, i.u2 +} + +// AsReturn initializes this instruction as a return instruction with OpcodeReturn. +func (i *Instruction) AsReturn(vs wazevoapi.VarLength[Value]) *Instruction { + i.opcode = OpcodeReturn + i.vs = vs + return i +} + +// AsIreduce initializes this instruction as a reduction instruction with OpcodeIreduce. 
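+//
+// Ireduce narrows an integer value to a smaller integer type, e.g. i64 -> i32. Sketch
+// (illustrative; b and the i64 Value v are assumed to exist, and AllocateInstruction is assumed
+// to be the usual Builder allocation helper):
+//
+//	lo := b.AllocateInstruction().AsIreduce(v, TypeI32).Insert(b).Return()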
+func (i *Instruction) AsIreduce(v Value, dstType Type) *Instruction { + i.opcode = OpcodeIreduce + i.v = v + i.typ = dstType + return i +} + +// AsWiden initializes this instruction as a signed or unsigned widen instruction +// on low half or high half of the given vector with OpcodeSwidenLow, OpcodeUwidenLow, OpcodeSwidenHigh, OpcodeUwidenHigh. +func (i *Instruction) AsWiden(v Value, lane VecLane, signed, low bool) *Instruction { + switch { + case signed && low: + i.opcode = OpcodeSwidenLow + case !signed && low: + i.opcode = OpcodeUwidenLow + case signed && !low: + i.opcode = OpcodeSwidenHigh + case !signed && !low: + i.opcode = OpcodeUwidenHigh + } + i.v = v + i.u1 = uint64(lane) + return i +} + +// AsAtomicLoad initializes this instruction as an atomic load. +// The size is in bytes and must be 1, 2, 4, or 8. +func (i *Instruction) AsAtomicLoad(addr Value, size uint64, typ Type) *Instruction { + i.opcode = OpcodeAtomicLoad + i.u1 = size + i.v = addr + i.typ = typ + return i +} + +// AsAtomicLoad initializes this instruction as an atomic store. +// The size is in bytes and must be 1, 2, 4, or 8. +func (i *Instruction) AsAtomicStore(addr, val Value, size uint64) *Instruction { + i.opcode = OpcodeAtomicStore + i.u1 = size + i.v = addr + i.v2 = val + i.typ = val.Type() + return i +} + +// AsAtomicRmw initializes this instruction as an atomic read-modify-write. +// The size is in bytes and must be 1, 2, 4, or 8. +func (i *Instruction) AsAtomicRmw(op AtomicRmwOp, addr, val Value, size uint64) *Instruction { + i.opcode = OpcodeAtomicRmw + i.u1 = uint64(op) + i.u2 = size + i.v = addr + i.v2 = val + i.typ = val.Type() + return i +} + +// AsAtomicCas initializes this instruction as an atomic compare-and-swap. +// The size is in bytes and must be 1, 2, 4, or 8. +func (i *Instruction) AsAtomicCas(addr, exp, repl Value, size uint64) *Instruction { + i.opcode = OpcodeAtomicCas + i.u1 = size + i.v = addr + i.v2 = exp + i.v3 = repl + i.typ = repl.Type() + return i +} + +// AsFence initializes this instruction as a memory fence. +// A single byte immediate may be used to indicate fence ordering in the future +// but is currently always 0 and ignored. +func (i *Instruction) AsFence(order byte) *Instruction { + i.opcode = OpcodeFence + i.u1 = uint64(order) + return i +} + +// AtomicRmwData returns the data for this atomic read-modify-write instruction. +func (i *Instruction) AtomicRmwData() (op AtomicRmwOp, size uint64) { + return AtomicRmwOp(i.u1), i.u2 +} + +// AtomicTargetSize returns the target memory size of the atomic instruction. +func (i *Instruction) AtomicTargetSize() (size uint64) { + return i.u1 +} + +// ReturnVals returns the return values of OpcodeReturn. +func (i *Instruction) ReturnVals() []Value { + return i.vs.View() +} + +// AsExitWithCode initializes this instruction as a trap instruction with OpcodeExitWithCode. +func (i *Instruction) AsExitWithCode(ctx Value, code wazevoapi.ExitCode) { + i.opcode = OpcodeExitWithCode + i.v = ctx + i.u1 = uint64(code) +} + +// AsExitIfTrueWithCode initializes this instruction as a trap instruction with OpcodeExitIfTrueWithCode. +func (i *Instruction) AsExitIfTrueWithCode(ctx, c Value, code wazevoapi.ExitCode) *Instruction { + i.opcode = OpcodeExitIfTrueWithCode + i.v = ctx + i.v2 = c + i.u1 = uint64(code) + return i +} + +// ExitWithCodeData returns the context and exit code of OpcodeExitWithCode. 
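+//
+// Sketch of backend-side use (illustrative; instr is assumed to be an OpcodeExitWithCode
+// instruction being lowered):
+//
+//	execCtx, code := instr.ExitWithCodeData()
+//	// execCtx and code are then encoded into the generated exit/trap sequence.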
+func (i *Instruction) ExitWithCodeData() (ctx Value, code wazevoapi.ExitCode) { + return i.v, wazevoapi.ExitCode(i.u1) +} + +// ExitIfTrueWithCodeData returns the context and exit code of OpcodeExitWithCode. +func (i *Instruction) ExitIfTrueWithCodeData() (ctx, c Value, code wazevoapi.ExitCode) { + return i.v, i.v2, wazevoapi.ExitCode(i.u1) +} + +// InvertBrx inverts either OpcodeBrz or OpcodeBrnz to the other. +func (i *Instruction) InvertBrx() { + switch i.opcode { + case OpcodeBrz: + i.opcode = OpcodeBrnz + case OpcodeBrnz: + i.opcode = OpcodeBrz + default: + panic("BUG") + } +} + +// BranchData returns the branch data for this instruction necessary for backends. +func (i *Instruction) BranchData() (condVal Value, blockArgs []Value, target BasicBlock) { + switch i.opcode { + case OpcodeJump: + condVal = ValueInvalid + case OpcodeBrz, OpcodeBrnz: + condVal = i.v + default: + panic("BUG") + } + blockArgs = i.vs.View() + target = i.blk + return +} + +// BrTableData returns the branch table data for this instruction necessary for backends. +func (i *Instruction) BrTableData() (index Value, targets []BasicBlock) { + if i.opcode != OpcodeBrTable { + panic("BUG: BrTableData only available for OpcodeBrTable") + } + index = i.v + targets = i.targets + return +} + +// AsJump initializes this instruction as a jump instruction with OpcodeJump. +func (i *Instruction) AsJump(vs Values, target BasicBlock) *Instruction { + i.opcode = OpcodeJump + i.vs = vs + i.blk = target + return i +} + +// IsFallthroughJump returns true if this instruction is a fallthrough jump. +func (i *Instruction) IsFallthroughJump() bool { + if i.opcode != OpcodeJump { + panic("BUG: IsFallthrough only available for OpcodeJump") + } + return i.opcode == OpcodeJump && i.u1 != 0 +} + +// AsFallthroughJump marks this instruction as a fallthrough jump. +func (i *Instruction) AsFallthroughJump() { + if i.opcode != OpcodeJump { + panic("BUG: AsFallthroughJump only available for OpcodeJump") + } + i.u1 = 1 +} + +// AsBrz initializes this instruction as a branch-if-zero instruction with OpcodeBrz. +func (i *Instruction) AsBrz(v Value, args Values, target BasicBlock) { + i.opcode = OpcodeBrz + i.v = v + i.vs = args + i.blk = target +} + +// AsBrnz initializes this instruction as a branch-if-not-zero instruction with OpcodeBrnz. +func (i *Instruction) AsBrnz(v Value, args Values, target BasicBlock) *Instruction { + i.opcode = OpcodeBrnz + i.v = v + i.vs = args + i.blk = target + return i +} + +// AsBrTable initializes this instruction as a branch-table instruction with OpcodeBrTable. +func (i *Instruction) AsBrTable(index Value, targets []BasicBlock) { + i.opcode = OpcodeBrTable + i.v = index + i.targets = targets +} + +// AsCall initializes this instruction as a call instruction with OpcodeCall. +func (i *Instruction) AsCall(ref FuncRef, sig *Signature, args Values) { + i.opcode = OpcodeCall + i.u1 = uint64(ref) + i.vs = args + i.u2 = uint64(sig.ID) + sig.used = true +} + +// CallData returns the call data for this instruction necessary for backends. +func (i *Instruction) CallData() (ref FuncRef, sigID SignatureID, args []Value) { + if i.opcode != OpcodeCall { + panic("BUG: CallData only available for OpcodeCall") + } + ref = FuncRef(i.u1) + sigID = SignatureID(i.u2) + args = i.vs.View() + return +} + +// AsCallIndirect initializes this instruction as a call-indirect instruction with OpcodeCallIndirect. 
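+//
+// Sketch (illustrative; funcPtr, sig and args are assumed to have been prepared by the caller,
+// with args built from the builder's variable-length value pool, and AllocateInstruction assumed
+// to be the usual Builder allocation helper):
+//
+//	call := b.AllocateInstruction()
+//	call.AsCallIndirect(funcPtr, sig, args)
+//	call.Insert(b)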
+func (i *Instruction) AsCallIndirect(funcPtr Value, sig *Signature, args Values) *Instruction { + i.opcode = OpcodeCallIndirect + i.typ = TypeF64 + i.vs = args + i.v = funcPtr + i.u1 = uint64(sig.ID) + sig.used = true + return i +} + +// AsCallGoRuntimeMemmove is the same as AsCallIndirect, but with a special flag set to indicate that it is a call to the Go runtime memmove function. +func (i *Instruction) AsCallGoRuntimeMemmove(funcPtr Value, sig *Signature, args Values) *Instruction { + i.AsCallIndirect(funcPtr, sig, args) + i.u2 = 1 + return i +} + +// CallIndirectData returns the call indirect data for this instruction necessary for backends. +func (i *Instruction) CallIndirectData() (funcPtr Value, sigID SignatureID, args []Value, isGoMemmove bool) { + if i.opcode != OpcodeCallIndirect { + panic("BUG: CallIndirectData only available for OpcodeCallIndirect") + } + funcPtr = i.v + sigID = SignatureID(i.u1) + args = i.vs.View() + isGoMemmove = i.u2 == 1 + return +} + +// AsClz initializes this instruction as a Count Leading Zeroes instruction with OpcodeClz. +func (i *Instruction) AsClz(x Value) { + i.opcode = OpcodeClz + i.v = x + i.typ = x.Type() +} + +// AsCtz initializes this instruction as a Count Trailing Zeroes instruction with OpcodeCtz. +func (i *Instruction) AsCtz(x Value) { + i.opcode = OpcodeCtz + i.v = x + i.typ = x.Type() +} + +// AsPopcnt initializes this instruction as a Population Count instruction with OpcodePopcnt. +func (i *Instruction) AsPopcnt(x Value) { + i.opcode = OpcodePopcnt + i.v = x + i.typ = x.Type() +} + +// AsFneg initializes this instruction as an instruction with OpcodeFneg. +func (i *Instruction) AsFneg(x Value) *Instruction { + i.opcode = OpcodeFneg + i.v = x + i.typ = x.Type() + return i +} + +// AsSqrt initializes this instruction as an instruction with OpcodeSqrt. +func (i *Instruction) AsSqrt(x Value) *Instruction { + i.opcode = OpcodeSqrt + i.v = x + i.typ = x.Type() + return i +} + +// AsFabs initializes this instruction as an instruction with OpcodeFabs. +func (i *Instruction) AsFabs(x Value) *Instruction { + i.opcode = OpcodeFabs + i.v = x + i.typ = x.Type() + return i +} + +// AsFcopysign initializes this instruction as an instruction with OpcodeFcopysign. +func (i *Instruction) AsFcopysign(x, y Value) *Instruction { + i.opcode = OpcodeFcopysign + i.v = x + i.v2 = y + i.typ = x.Type() + return i +} + +// AsCeil initializes this instruction as an instruction with OpcodeCeil. +func (i *Instruction) AsCeil(x Value) *Instruction { + i.opcode = OpcodeCeil + i.v = x + i.typ = x.Type() + return i +} + +// AsFloor initializes this instruction as an instruction with OpcodeFloor. +func (i *Instruction) AsFloor(x Value) *Instruction { + i.opcode = OpcodeFloor + i.v = x + i.typ = x.Type() + return i +} + +// AsTrunc initializes this instruction as an instruction with OpcodeTrunc. +func (i *Instruction) AsTrunc(x Value) *Instruction { + i.opcode = OpcodeTrunc + i.v = x + i.typ = x.Type() + return i +} + +// AsNearest initializes this instruction as an instruction with OpcodeNearest. +func (i *Instruction) AsNearest(x Value) *Instruction { + i.opcode = OpcodeNearest + i.v = x + i.typ = x.Type() + return i +} + +// AsBitcast initializes this instruction as an instruction with OpcodeBitcast. +func (i *Instruction) AsBitcast(x Value, dstType Type) *Instruction { + i.opcode = OpcodeBitcast + i.v = x + i.typ = dstType + return i +} + +// BitcastData returns the operands for a bitcast instruction. 
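+//
+// For an instruction built with AsBitcast(x, dstType), this returns x and dstType: the value
+// whose bits are reinterpreted, and the type they are reinterpreted as.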
+func (i *Instruction) BitcastData() (x Value, dstType Type) { + return i.v, i.typ +} + +// AsFdemote initializes this instruction as an instruction with OpcodeFdemote. +func (i *Instruction) AsFdemote(x Value) { + i.opcode = OpcodeFdemote + i.v = x + i.typ = TypeF32 +} + +// AsFpromote initializes this instruction as an instruction with OpcodeFpromote. +func (i *Instruction) AsFpromote(x Value) { + i.opcode = OpcodeFpromote + i.v = x + i.typ = TypeF64 +} + +// AsFcvtFromInt initializes this instruction as an instruction with either OpcodeFcvtFromUint or OpcodeFcvtFromSint +func (i *Instruction) AsFcvtFromInt(x Value, signed bool, dst64bit bool) *Instruction { + if signed { + i.opcode = OpcodeFcvtFromSint + } else { + i.opcode = OpcodeFcvtFromUint + } + i.v = x + if dst64bit { + i.typ = TypeF64 + } else { + i.typ = TypeF32 + } + return i +} + +// AsFcvtToInt initializes this instruction as an instruction with either OpcodeFcvtToUint or OpcodeFcvtToSint +func (i *Instruction) AsFcvtToInt(x, ctx Value, signed bool, dst64bit bool, sat bool) *Instruction { + switch { + case signed && !sat: + i.opcode = OpcodeFcvtToSint + case !signed && !sat: + i.opcode = OpcodeFcvtToUint + case signed && sat: + i.opcode = OpcodeFcvtToSintSat + case !signed && sat: + i.opcode = OpcodeFcvtToUintSat + } + i.v = x + i.v2 = ctx + if dst64bit { + i.typ = TypeI64 + } else { + i.typ = TypeI32 + } + return i +} + +// AsVFcvtToIntSat initializes this instruction as an instruction with either OpcodeVFcvtToSintSat or OpcodeVFcvtToUintSat +func (i *Instruction) AsVFcvtToIntSat(x Value, lane VecLane, signed bool) *Instruction { + if signed { + i.opcode = OpcodeVFcvtToSintSat + } else { + i.opcode = OpcodeVFcvtToUintSat + } + i.v = x + i.u1 = uint64(lane) + return i +} + +// AsVFcvtFromInt initializes this instruction as an instruction with either OpcodeVFcvtToSintSat or OpcodeVFcvtToUintSat +func (i *Instruction) AsVFcvtFromInt(x Value, lane VecLane, signed bool) *Instruction { + if signed { + i.opcode = OpcodeVFcvtFromSint + } else { + i.opcode = OpcodeVFcvtFromUint + } + i.v = x + i.u1 = uint64(lane) + return i +} + +// AsNarrow initializes this instruction as an instruction with either OpcodeSnarrow or OpcodeUnarrow +func (i *Instruction) AsNarrow(x, y Value, lane VecLane, signed bool) *Instruction { + if signed { + i.opcode = OpcodeSnarrow + } else { + i.opcode = OpcodeUnarrow + } + i.v = x + i.v2 = y + i.u1 = uint64(lane) + return i +} + +// AsFvpromoteLow initializes this instruction as an instruction with OpcodeFvpromoteLow +func (i *Instruction) AsFvpromoteLow(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeFvpromoteLow + i.v = x + i.u1 = uint64(lane) + return i +} + +// AsFvdemote initializes this instruction as an instruction with OpcodeFvdemote +func (i *Instruction) AsFvdemote(x Value, lane VecLane) *Instruction { + i.opcode = OpcodeFvdemote + i.v = x + i.u1 = uint64(lane) + return i +} + +// AsSExtend initializes this instruction as a sign extension instruction with OpcodeSExtend. +func (i *Instruction) AsSExtend(v Value, from, to byte) *Instruction { + i.opcode = OpcodeSExtend + i.v = v + i.u1 = uint64(from)<<8 | uint64(to) + if to == 64 { + i.typ = TypeI64 + } else { + i.typ = TypeI32 + } + return i +} + +// AsUExtend initializes this instruction as an unsigned extension instruction with OpcodeUExtend. 
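+//
+// The source and destination widths are packed into u1 as from<<8|to and can be read back via
+// ExtendData or ExtendFromToBits. Sketch (illustrative; b and the i32 Value v are assumed to
+// exist, and AllocateInstruction is assumed to be the usual Builder allocation helper):
+//
+//	extend := b.AllocateInstruction().AsUExtend(v, 32, 64).Insert(b)
+//	from, to, signed := extend.ExtendData() // 32, 64, false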
+func (i *Instruction) AsUExtend(v Value, from, to byte) *Instruction { + i.opcode = OpcodeUExtend + i.v = v + i.u1 = uint64(from)<<8 | uint64(to) + if to == 64 { + i.typ = TypeI64 + } else { + i.typ = TypeI32 + } + return i +} + +func (i *Instruction) ExtendData() (from, to byte, signed bool) { + if i.opcode != OpcodeSExtend && i.opcode != OpcodeUExtend { + panic("BUG: ExtendData only available for OpcodeSExtend and OpcodeUExtend") + } + from = byte(i.u1 >> 8) + to = byte(i.u1) + signed = i.opcode == OpcodeSExtend + return +} + +// AsSelect initializes this instruction as an unsigned extension instruction with OpcodeSelect. +func (i *Instruction) AsSelect(c, x, y Value) *Instruction { + i.opcode = OpcodeSelect + i.v = c + i.v2 = x + i.v3 = y + i.typ = x.Type() + return i +} + +// SelectData returns the select data for this instruction necessary for backends. +func (i *Instruction) SelectData() (c, x, y Value) { + c = i.v + x = i.v2 + y = i.v3 + return +} + +// ExtendFromToBits returns the from and to bit size for the extension instruction. +func (i *Instruction) ExtendFromToBits() (from, to byte) { + from = byte(i.u1 >> 8) + to = byte(i.u1) + return +} + +// Format returns a string representation of this instruction with the given builder. +// For debugging purposes only. +func (i *Instruction) Format(b Builder) string { + var instSuffix string + switch i.opcode { + case OpcodeExitWithCode: + instSuffix = fmt.Sprintf(" %s, %s", i.v.Format(b), wazevoapi.ExitCode(i.u1)) + case OpcodeExitIfTrueWithCode: + instSuffix = fmt.Sprintf(" %s, %s, %s", i.v2.Format(b), i.v.Format(b), wazevoapi.ExitCode(i.u1)) + case OpcodeIadd, OpcodeIsub, OpcodeImul, OpcodeFadd, OpcodeFsub, OpcodeFmin, OpcodeFmax, OpcodeFdiv, OpcodeFmul: + instSuffix = fmt.Sprintf(" %s, %s", i.v.Format(b), i.v2.Format(b)) + case OpcodeIcmp: + instSuffix = fmt.Sprintf(" %s, %s, %s", IntegerCmpCond(i.u1), i.v.Format(b), i.v2.Format(b)) + case OpcodeFcmp: + instSuffix = fmt.Sprintf(" %s, %s, %s", FloatCmpCond(i.u1), i.v.Format(b), i.v2.Format(b)) + case OpcodeSExtend, OpcodeUExtend: + instSuffix = fmt.Sprintf(" %s, %d->%d", i.v.Format(b), i.u1>>8, i.u1&0xff) + case OpcodeCall, OpcodeCallIndirect: + view := i.vs.View() + vs := make([]string, len(view)) + for idx := range vs { + vs[idx] = view[idx].Format(b) + } + if i.opcode == OpcodeCallIndirect { + instSuffix = fmt.Sprintf(" %s:%s, %s", i.v.Format(b), SignatureID(i.u1), strings.Join(vs, ", ")) + } else { + instSuffix = fmt.Sprintf(" %s:%s, %s", FuncRef(i.u1), SignatureID(i.u2), strings.Join(vs, ", ")) + } + case OpcodeStore, OpcodeIstore8, OpcodeIstore16, OpcodeIstore32: + instSuffix = fmt.Sprintf(" %s, %s, %#x", i.v.Format(b), i.v2.Format(b), uint32(i.u1)) + case OpcodeLoad, OpcodeVZeroExtLoad: + instSuffix = fmt.Sprintf(" %s, %#x", i.v.Format(b), int32(i.u1)) + case OpcodeLoadSplat: + instSuffix = fmt.Sprintf(".%s %s, %#x", VecLane(i.u2), i.v.Format(b), int32(i.u1)) + case OpcodeUload8, OpcodeUload16, OpcodeUload32, OpcodeSload8, OpcodeSload16, OpcodeSload32: + instSuffix = fmt.Sprintf(" %s, %#x", i.v.Format(b), int32(i.u1)) + case OpcodeSelect, OpcodeVbitselect: + instSuffix = fmt.Sprintf(" %s, %s, %s", i.v.Format(b), i.v2.Format(b), i.v3.Format(b)) + case OpcodeIconst: + switch i.typ { + case TypeI32: + instSuffix = fmt.Sprintf("_32 %#x", uint32(i.u1)) + case TypeI64: + instSuffix = fmt.Sprintf("_64 %#x", i.u1) + } + case OpcodeVconst: + instSuffix = fmt.Sprintf(" %016x %016x", i.u1, i.u2) + case OpcodeF32const: + instSuffix = fmt.Sprintf(" %f", math.Float32frombits(uint32(i.u1))) 
+ case OpcodeF64const: + instSuffix = fmt.Sprintf(" %f", math.Float64frombits(i.u1)) + case OpcodeReturn: + view := i.vs.View() + if len(view) == 0 { + break + } + vs := make([]string, len(view)) + for idx := range vs { + vs[idx] = view[idx].Format(b) + } + instSuffix = fmt.Sprintf(" %s", strings.Join(vs, ", ")) + case OpcodeJump: + view := i.vs.View() + vs := make([]string, len(view)+1) + if i.IsFallthroughJump() { + vs[0] = " fallthrough" + } else { + vs[0] = " " + i.blk.(*basicBlock).Name() + } + for idx := range view { + vs[idx+1] = view[idx].Format(b) + } + + instSuffix = strings.Join(vs, ", ") + case OpcodeBrz, OpcodeBrnz: + view := i.vs.View() + vs := make([]string, len(view)+2) + vs[0] = " " + i.v.Format(b) + vs[1] = i.blk.(*basicBlock).Name() + for idx := range view { + vs[idx+2] = view[idx].Format(b) + } + instSuffix = strings.Join(vs, ", ") + case OpcodeBrTable: + // `BrTable index, [label1, label2, ... labelN]` + instSuffix = fmt.Sprintf(" %s", i.v.Format(b)) + instSuffix += ", [" + for i, target := range i.targets { + blk := target.(*basicBlock) + if i == 0 { + instSuffix += blk.Name() + } else { + instSuffix += ", " + blk.Name() + } + } + instSuffix += "]" + case OpcodeBand, OpcodeBor, OpcodeBxor, OpcodeRotr, OpcodeRotl, OpcodeIshl, OpcodeSshr, OpcodeUshr, + OpcodeSdiv, OpcodeUdiv, OpcodeFcopysign, OpcodeSrem, OpcodeUrem, + OpcodeVbnot, OpcodeVbxor, OpcodeVbor, OpcodeVband, OpcodeVbandnot, OpcodeVIcmp, OpcodeVFcmp: + instSuffix = fmt.Sprintf(" %s, %s", i.v.Format(b), i.v2.Format(b)) + case OpcodeUndefined: + case OpcodeClz, OpcodeCtz, OpcodePopcnt, OpcodeFneg, OpcodeFcvtToSint, OpcodeFcvtToUint, OpcodeFcvtFromSint, + OpcodeFcvtFromUint, OpcodeFcvtToSintSat, OpcodeFcvtToUintSat, OpcodeFdemote, OpcodeFpromote, OpcodeIreduce, OpcodeBitcast, OpcodeSqrt, OpcodeFabs, + OpcodeCeil, OpcodeFloor, OpcodeTrunc, OpcodeNearest: + instSuffix = " " + i.v.Format(b) + case OpcodeVIadd, OpcodeExtIaddPairwise, OpcodeVSaddSat, OpcodeVUaddSat, OpcodeVIsub, OpcodeVSsubSat, OpcodeVUsubSat, + OpcodeVImin, OpcodeVUmin, OpcodeVImax, OpcodeVUmax, OpcodeVImul, OpcodeVAvgRound, + OpcodeVFadd, OpcodeVFsub, OpcodeVFmul, OpcodeVFdiv, + OpcodeVIshl, OpcodeVSshr, OpcodeVUshr, + OpcodeVFmin, OpcodeVFmax, OpcodeVMinPseudo, OpcodeVMaxPseudo, + OpcodeSnarrow, OpcodeUnarrow, OpcodeSwizzle, OpcodeSqmulRoundSat: + instSuffix = fmt.Sprintf(".%s %s, %s", VecLane(i.u1), i.v.Format(b), i.v2.Format(b)) + case OpcodeVIabs, OpcodeVIneg, OpcodeVIpopcnt, OpcodeVhighBits, OpcodeVallTrue, OpcodeVanyTrue, + OpcodeVFabs, OpcodeVFneg, OpcodeVSqrt, OpcodeVCeil, OpcodeVFloor, OpcodeVTrunc, OpcodeVNearest, + OpcodeVFcvtToUintSat, OpcodeVFcvtToSintSat, OpcodeVFcvtFromUint, OpcodeVFcvtFromSint, + OpcodeFvpromoteLow, OpcodeFvdemote, OpcodeSwidenLow, OpcodeUwidenLow, OpcodeSwidenHigh, OpcodeUwidenHigh, + OpcodeSplat: + instSuffix = fmt.Sprintf(".%s %s", VecLane(i.u1), i.v.Format(b)) + case OpcodeExtractlane: + var signedness string + if i.u1 != 0 { + signedness = "signed" + } else { + signedness = "unsigned" + } + instSuffix = fmt.Sprintf(".%s %d, %s (%s)", VecLane(i.u2), 0x0000FFFF&i.u1, i.v.Format(b), signedness) + case OpcodeInsertlane: + instSuffix = fmt.Sprintf(".%s %d, %s, %s", VecLane(i.u2), i.u1, i.v.Format(b), i.v2.Format(b)) + case OpcodeShuffle: + lanes := make([]byte, 16) + for idx := 0; idx < 8; idx++ { + lanes[idx] = byte(i.u1 >> (8 * idx)) + } + for idx := 0; idx < 8; idx++ { + lanes[idx+8] = byte(i.u2 >> (8 * idx)) + } + // Prints Shuffle.[0 1 2 3 4 5 6 7 ...] 
v2, v3 + instSuffix = fmt.Sprintf(".%v %s, %s", lanes, i.v.Format(b), i.v2.Format(b)) + case OpcodeAtomicRmw: + instSuffix = fmt.Sprintf(" %s_%d, %s, %s", AtomicRmwOp(i.u1), 8*i.u2, i.v.Format(b), i.v2.Format(b)) + case OpcodeAtomicLoad: + instSuffix = fmt.Sprintf("_%d, %s", 8*i.u1, i.v.Format(b)) + case OpcodeAtomicStore: + instSuffix = fmt.Sprintf("_%d, %s, %s", 8*i.u1, i.v.Format(b), i.v2.Format(b)) + case OpcodeAtomicCas: + instSuffix = fmt.Sprintf("_%d, %s, %s, %s", 8*i.u1, i.v.Format(b), i.v2.Format(b), i.v3.Format(b)) + case OpcodeFence: + instSuffix = fmt.Sprintf(" %d", i.u1) + case OpcodeWideningPairwiseDotProductS: + instSuffix = fmt.Sprintf(" %s, %s", i.v.Format(b), i.v2.Format(b)) + default: + panic(fmt.Sprintf("TODO: format for %s", i.opcode)) + } + + instr := i.opcode.String() + instSuffix + + var rvs []string + if rv := i.rValue; rv.Valid() { + rvs = append(rvs, rv.formatWithType(b)) + } + + for _, v := range i.rValues.View() { + rvs = append(rvs, v.formatWithType(b)) + } + + if len(rvs) > 0 { + return fmt.Sprintf("%s = %s", strings.Join(rvs, ", "), instr) + } else { + return instr + } +} + +// addArgumentBranchInst adds an argument to this instruction. +func (i *Instruction) addArgumentBranchInst(b *builder, v Value) { + switch i.opcode { + case OpcodeJump, OpcodeBrz, OpcodeBrnz: + i.vs = i.vs.Append(&b.varLengthPool, v) + default: + panic("BUG: " + i.opcode.String()) + } +} + +// Constant returns true if this instruction is a constant instruction. +func (i *Instruction) Constant() bool { + switch i.opcode { + case OpcodeIconst, OpcodeF32const, OpcodeF64const: + return true + } + return false +} + +// ConstantVal returns the constant value of this instruction. +// How to interpret the return value depends on the opcode. +func (i *Instruction) ConstantVal() (ret uint64) { + switch i.opcode { + case OpcodeIconst, OpcodeF32const, OpcodeF64const: + ret = i.u1 + default: + panic("TODO") + } + return +} + +// String implements fmt.Stringer. 
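+//
+// For example, OpcodeVIshl.String() returns "VIshl", and OpcodeExitWithCode.String() returns "Exit".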
+func (o Opcode) String() (ret string) { + switch o { + case OpcodeInvalid: + return "invalid" + case OpcodeUndefined: + return "Undefined" + case OpcodeJump: + return "Jump" + case OpcodeBrz: + return "Brz" + case OpcodeBrnz: + return "Brnz" + case OpcodeBrTable: + return "BrTable" + case OpcodeExitWithCode: + return "Exit" + case OpcodeExitIfTrueWithCode: + return "ExitIfTrue" + case OpcodeReturn: + return "Return" + case OpcodeCall: + return "Call" + case OpcodeCallIndirect: + return "CallIndirect" + case OpcodeSplat: + return "Splat" + case OpcodeSwizzle: + return "Swizzle" + case OpcodeInsertlane: + return "Insertlane" + case OpcodeExtractlane: + return "Extractlane" + case OpcodeLoad: + return "Load" + case OpcodeLoadSplat: + return "LoadSplat" + case OpcodeStore: + return "Store" + case OpcodeUload8: + return "Uload8" + case OpcodeSload8: + return "Sload8" + case OpcodeIstore8: + return "Istore8" + case OpcodeUload16: + return "Uload16" + case OpcodeSload16: + return "Sload16" + case OpcodeIstore16: + return "Istore16" + case OpcodeUload32: + return "Uload32" + case OpcodeSload32: + return "Sload32" + case OpcodeIstore32: + return "Istore32" + case OpcodeIconst: + return "Iconst" + case OpcodeF32const: + return "F32const" + case OpcodeF64const: + return "F64const" + case OpcodeVconst: + return "Vconst" + case OpcodeShuffle: + return "Shuffle" + case OpcodeSelect: + return "Select" + case OpcodeVanyTrue: + return "VanyTrue" + case OpcodeVallTrue: + return "VallTrue" + case OpcodeVhighBits: + return "VhighBits" + case OpcodeIcmp: + return "Icmp" + case OpcodeIcmpImm: + return "IcmpImm" + case OpcodeVIcmp: + return "VIcmp" + case OpcodeIadd: + return "Iadd" + case OpcodeIsub: + return "Isub" + case OpcodeImul: + return "Imul" + case OpcodeUdiv: + return "Udiv" + case OpcodeSdiv: + return "Sdiv" + case OpcodeUrem: + return "Urem" + case OpcodeSrem: + return "Srem" + case OpcodeBand: + return "Band" + case OpcodeBor: + return "Bor" + case OpcodeBxor: + return "Bxor" + case OpcodeBnot: + return "Bnot" + case OpcodeRotl: + return "Rotl" + case OpcodeRotr: + return "Rotr" + case OpcodeIshl: + return "Ishl" + case OpcodeUshr: + return "Ushr" + case OpcodeSshr: + return "Sshr" + case OpcodeClz: + return "Clz" + case OpcodeCtz: + return "Ctz" + case OpcodePopcnt: + return "Popcnt" + case OpcodeFcmp: + return "Fcmp" + case OpcodeFadd: + return "Fadd" + case OpcodeFsub: + return "Fsub" + case OpcodeFmul: + return "Fmul" + case OpcodeFdiv: + return "Fdiv" + case OpcodeSqmulRoundSat: + return "SqmulRoundSat" + case OpcodeSqrt: + return "Sqrt" + case OpcodeFneg: + return "Fneg" + case OpcodeFabs: + return "Fabs" + case OpcodeFcopysign: + return "Fcopysign" + case OpcodeFmin: + return "Fmin" + case OpcodeFmax: + return "Fmax" + case OpcodeCeil: + return "Ceil" + case OpcodeFloor: + return "Floor" + case OpcodeTrunc: + return "Trunc" + case OpcodeNearest: + return "Nearest" + case OpcodeBitcast: + return "Bitcast" + case OpcodeIreduce: + return "Ireduce" + case OpcodeSnarrow: + return "Snarrow" + case OpcodeUnarrow: + return "Unarrow" + case OpcodeSwidenLow: + return "SwidenLow" + case OpcodeSwidenHigh: + return "SwidenHigh" + case OpcodeUwidenLow: + return "UwidenLow" + case OpcodeUwidenHigh: + return "UwidenHigh" + case OpcodeExtIaddPairwise: + return "IaddPairwise" + case OpcodeWideningPairwiseDotProductS: + return "WideningPairwiseDotProductS" + case OpcodeUExtend: + return "UExtend" + case OpcodeSExtend: + return "SExtend" + case OpcodeFpromote: + return "Fpromote" + case OpcodeFdemote: + return 
"Fdemote" + case OpcodeFvdemote: + return "Fvdemote" + case OpcodeFcvtToUint: + return "FcvtToUint" + case OpcodeFcvtToSint: + return "FcvtToSint" + case OpcodeFcvtToUintSat: + return "FcvtToUintSat" + case OpcodeFcvtToSintSat: + return "FcvtToSintSat" + case OpcodeFcvtFromUint: + return "FcvtFromUint" + case OpcodeFcvtFromSint: + return "FcvtFromSint" + case OpcodeAtomicRmw: + return "AtomicRmw" + case OpcodeAtomicCas: + return "AtomicCas" + case OpcodeAtomicLoad: + return "AtomicLoad" + case OpcodeAtomicStore: + return "AtomicStore" + case OpcodeFence: + return "Fence" + case OpcodeVbor: + return "Vbor" + case OpcodeVbxor: + return "Vbxor" + case OpcodeVband: + return "Vband" + case OpcodeVbandnot: + return "Vbandnot" + case OpcodeVbnot: + return "Vbnot" + case OpcodeVbitselect: + return "Vbitselect" + case OpcodeVIadd: + return "VIadd" + case OpcodeVSaddSat: + return "VSaddSat" + case OpcodeVUaddSat: + return "VUaddSat" + case OpcodeVSsubSat: + return "VSsubSat" + case OpcodeVUsubSat: + return "VUsubSat" + case OpcodeVAvgRound: + return "OpcodeVAvgRound" + case OpcodeVIsub: + return "VIsub" + case OpcodeVImin: + return "VImin" + case OpcodeVUmin: + return "VUmin" + case OpcodeVImax: + return "VImax" + case OpcodeVUmax: + return "VUmax" + case OpcodeVImul: + return "VImul" + case OpcodeVIabs: + return "VIabs" + case OpcodeVIneg: + return "VIneg" + case OpcodeVIpopcnt: + return "VIpopcnt" + case OpcodeVIshl: + return "VIshl" + case OpcodeVUshr: + return "VUshr" + case OpcodeVSshr: + return "VSshr" + case OpcodeVFabs: + return "VFabs" + case OpcodeVFmax: + return "VFmax" + case OpcodeVFmin: + return "VFmin" + case OpcodeVFneg: + return "VFneg" + case OpcodeVFadd: + return "VFadd" + case OpcodeVFsub: + return "VFsub" + case OpcodeVFmul: + return "VFmul" + case OpcodeVFdiv: + return "VFdiv" + case OpcodeVFcmp: + return "VFcmp" + case OpcodeVCeil: + return "VCeil" + case OpcodeVFloor: + return "VFloor" + case OpcodeVTrunc: + return "VTrunc" + case OpcodeVNearest: + return "VNearest" + case OpcodeVMaxPseudo: + return "VMaxPseudo" + case OpcodeVMinPseudo: + return "VMinPseudo" + case OpcodeVSqrt: + return "VSqrt" + case OpcodeVFcvtToUintSat: + return "VFcvtToUintSat" + case OpcodeVFcvtToSintSat: + return "VFcvtToSintSat" + case OpcodeVFcvtFromUint: + return "VFcvtFromUint" + case OpcodeVFcvtFromSint: + return "VFcvtFromSint" + case OpcodeFvpromoteLow: + return "FvpromoteLow" + case OpcodeVZeroExtLoad: + return "VZeroExtLoad" + } + panic(fmt.Sprintf("unknown opcode %d", o)) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass.go new file mode 100644 index 000000000..a2e986cd1 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass.go @@ -0,0 +1,417 @@ +package ssa + +import ( + "fmt" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// RunPasses implements Builder.RunPasses. +// +// The order here matters; some pass depends on the previous ones. +// +// Note that passes suffixed with "Opt" are the optimization passes, meaning that they edit the instructions and blocks +// while the other passes are not, like passEstimateBranchProbabilities does not edit them, but only calculates the additional information. 
+func (b *builder) RunPasses() { + b.runPreBlockLayoutPasses() + b.runBlockLayoutPass() + b.runPostBlockLayoutPasses() + b.runFinalizingPasses() +} + +func (b *builder) runPreBlockLayoutPasses() { + passSortSuccessors(b) + passDeadBlockEliminationOpt(b) + passRedundantPhiEliminationOpt(b) + // The result of passCalculateImmediateDominators will be used by various passes below. + passCalculateImmediateDominators(b) + passNopInstElimination(b) + + // TODO: implement either conversion of irreducible CFG into reducible one, or irreducible CFG detection where we panic. + // WebAssembly program shouldn't result in irreducible CFG, but we should handle it properly in just in case. + // See FixIrreducible pass in LLVM: https://llvm.org/doxygen/FixIrreducible_8cpp_source.html + + // TODO: implement more optimization passes like: + // block coalescing. + // Copy-propagation. + // Constant folding. + // Common subexpression elimination. + // Arithmetic simplifications. + // and more! + + // passDeadCodeEliminationOpt could be more accurate if we do this after other optimizations. + passDeadCodeEliminationOpt(b) + b.donePreBlockLayoutPasses = true +} + +func (b *builder) runBlockLayoutPass() { + if !b.donePreBlockLayoutPasses { + panic("runBlockLayoutPass must be called after all pre passes are done") + } + passLayoutBlocks(b) + b.doneBlockLayout = true +} + +// runPostBlockLayoutPasses runs the post block layout passes. After this point, CFG is somewhat stable, +// but still can be modified before finalizing passes. At this point, critical edges are split by passLayoutBlocks. +func (b *builder) runPostBlockLayoutPasses() { + if !b.doneBlockLayout { + panic("runPostBlockLayoutPasses must be called after block layout pass is done") + } + // TODO: Do more. e.g. tail duplication, loop unrolling, etc. + + b.donePostBlockLayoutPasses = true +} + +// runFinalizingPasses runs the finalizing passes. After this point, CFG should not be modified. +func (b *builder) runFinalizingPasses() { + if !b.donePostBlockLayoutPasses { + panic("runFinalizingPasses must be called after post block layout passes are done") + } + // Critical edges are split, so we fix the loop nesting forest. + passBuildLoopNestingForest(b) + passBuildDominatorTree(b) + // Now that we know the final placement of the blocks, we can explicitly mark the fallthrough jumps. + b.markFallthroughJumps() +} + +// passDeadBlockEliminationOpt searches the unreachable blocks, and sets the basicBlock.invalid flag true if so. +func passDeadBlockEliminationOpt(b *builder) { + entryBlk := b.entryBlk() + b.clearBlkVisited() + b.blkStack = append(b.blkStack, entryBlk) + for len(b.blkStack) > 0 { + reachableBlk := b.blkStack[len(b.blkStack)-1] + b.blkStack = b.blkStack[:len(b.blkStack)-1] + b.blkVisited[reachableBlk] = 0 // the value won't be used in this pass. + + if !reachableBlk.sealed && !reachableBlk.ReturnBlock() { + panic(fmt.Sprintf("%s is not sealed", reachableBlk)) + } + + if wazevoapi.SSAValidationEnabled { + reachableBlk.validate(b) + } + + for _, succ := range reachableBlk.success { + if _, ok := b.blkVisited[succ]; ok { + continue + } + b.blkStack = append(b.blkStack, succ) + } + } + + for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() { + if _, ok := b.blkVisited[blk]; !ok { + blk.invalid = true + } + } +} + +// passRedundantPhiEliminationOpt eliminates the redundant PHIs (in our terminology, parameters of a block). 
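+//
+// For example (illustrative SSA): if every predecessor of blk1 passes the same value v1 as the
+// argument for blk1's parameter v9 (arguments that are v9 itself, e.g. from a loop back-edge,
+// are ignored), then v9 is redundant:
+//
+//	blk0: ... Jump blk1, v1
+//	blk2: ... Jump blk1, v1
+//	blk1(v9: i32): ...
+//
+// The parameter and the corresponding branch arguments are removed, and v9 is aliased to v1.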
+func passRedundantPhiEliminationOpt(b *builder) { + redundantParameterIndexes := b.ints[:0] // reuse the slice from previous iterations. + + // TODO: this might be costly for large programs, but at least, as far as I did the experiment, it's almost the + // same as the single iteration version in terms of the overall compilation time. That *might be* mostly thanks to the fact + // that removing many PHIs results in the reduction of the total instructions, not because of this indefinite iteration is + // relatively small. For example, sqlite speedtest binary results in the large number of redundant PHIs, + // the maximum number of iteration was 22, which seems to be acceptable but not that small either since the + // complexity here is O(BlockNum * Iterations) at the worst case where BlockNum might be the order of thousands. + for { + changed := false + _ = b.blockIteratorBegin() // skip entry block! + // Below, we intentionally use the named iteration variable name, as this comes with inevitable nested for loops! + for blk := b.blockIteratorNext(); blk != nil; blk = b.blockIteratorNext() { + paramNum := len(blk.params) + + for paramIndex := 0; paramIndex < paramNum; paramIndex++ { + phiValue := blk.params[paramIndex].value + redundant := true + + nonSelfReferencingValue := ValueInvalid + for predIndex := range blk.preds { + br := blk.preds[predIndex].branch + // Resolve the alias in the arguments so that we could use the previous iteration's result. + b.resolveArgumentAlias(br) + pred := br.vs.View()[paramIndex] + if pred == phiValue { + // This is self-referencing: PHI from the same PHI. + continue + } + + if !nonSelfReferencingValue.Valid() { + nonSelfReferencingValue = pred + continue + } + + if nonSelfReferencingValue != pred { + redundant = false + break + } + } + + if !nonSelfReferencingValue.Valid() { + // This shouldn't happen, and must be a bug in builder.go. + panic("BUG: params added but only self-referencing") + } + + if redundant { + b.redundantParameterIndexToValue[paramIndex] = nonSelfReferencingValue + redundantParameterIndexes = append(redundantParameterIndexes, paramIndex) + } + } + + if len(b.redundantParameterIndexToValue) == 0 { + continue + } + changed = true + + // Remove the redundant PHIs from the argument list of branching instructions. + for predIndex := range blk.preds { + var cur int + predBlk := blk.preds[predIndex] + branchInst := predBlk.branch + view := branchInst.vs.View() + for argIndex, value := range view { + if _, ok := b.redundantParameterIndexToValue[argIndex]; !ok { + view[cur] = value + cur++ + } + } + branchInst.vs.Cut(cur) + } + + // Still need to have the definition of the value of the PHI (previously as the parameter). + for _, redundantParamIndex := range redundantParameterIndexes { + phiValue := blk.params[redundantParamIndex].value + onlyValue := b.redundantParameterIndexToValue[redundantParamIndex] + // Create an alias in this block from the only phi argument to the phi value. + b.alias(phiValue, onlyValue) + } + + // Finally, Remove the param from the blk. + var cur int + for paramIndex := 0; paramIndex < paramNum; paramIndex++ { + param := blk.params[paramIndex] + if _, ok := b.redundantParameterIndexToValue[paramIndex]; !ok { + blk.params[cur] = param + cur++ + } + } + blk.params = blk.params[:cur] + + // Clears the map for the next iteration. 
+ for _, paramIndex := range redundantParameterIndexes { + delete(b.redundantParameterIndexToValue, paramIndex) + } + redundantParameterIndexes = redundantParameterIndexes[:0] + } + + if !changed { + break + } + } + + // Reuse the slice for the future passes. + b.ints = redundantParameterIndexes +} + +// passDeadCodeEliminationOpt traverses all the instructions, and calculates the reference count of each Value, and +// eliminates all the unnecessary instructions whose ref count is zero. +// The results are stored at builder.valueRefCounts. This also assigns a InstructionGroupID to each Instruction +// during the process. This is the last SSA-level optimization pass and after this, +// the SSA function is ready to be used by backends. +// +// TODO: the algorithm here might not be efficient. Get back to this later. +func passDeadCodeEliminationOpt(b *builder) { + nvid := int(b.nextValueID) + if nvid >= len(b.valueRefCounts) { + b.valueRefCounts = append(b.valueRefCounts, make([]int, b.nextValueID)...) + } + if nvid >= len(b.valueIDToInstruction) { + b.valueIDToInstruction = append(b.valueIDToInstruction, make([]*Instruction, b.nextValueID)...) + } + + // First, we gather all the instructions with side effects. + liveInstructions := b.instStack[:0] + // During the process, we will assign InstructionGroupID to each instruction, which is not + // relevant to dead code elimination, but we need in the backend. + var gid InstructionGroupID + for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() { + for cur := blk.rootInstr; cur != nil; cur = cur.next { + cur.gid = gid + switch cur.sideEffect() { + case sideEffectTraps: + // The trappable should always be alive. + liveInstructions = append(liveInstructions, cur) + case sideEffectStrict: + liveInstructions = append(liveInstructions, cur) + // The strict side effect should create different instruction groups. + gid++ + } + + r1, rs := cur.Returns() + if r1.Valid() { + b.valueIDToInstruction[r1.ID()] = cur + } + for _, r := range rs { + b.valueIDToInstruction[r.ID()] = cur + } + } + } + + // Find all the instructions referenced by live instructions transitively. + for len(liveInstructions) > 0 { + tail := len(liveInstructions) - 1 + live := liveInstructions[tail] + liveInstructions = liveInstructions[:tail] + if live.live { + // If it's already marked alive, this is referenced multiple times, + // so we can skip it. + continue + } + live.live = true + + // Before we walk, we need to resolve the alias first. + b.resolveArgumentAlias(live) + + v1, v2, v3, vs := live.Args() + if v1.Valid() { + producingInst := b.valueIDToInstruction[v1.ID()] + if producingInst != nil { + liveInstructions = append(liveInstructions, producingInst) + } + } + + if v2.Valid() { + producingInst := b.valueIDToInstruction[v2.ID()] + if producingInst != nil { + liveInstructions = append(liveInstructions, producingInst) + } + } + + if v3.Valid() { + producingInst := b.valueIDToInstruction[v3.ID()] + if producingInst != nil { + liveInstructions = append(liveInstructions, producingInst) + } + } + + for _, v := range vs { + producingInst := b.valueIDToInstruction[v.ID()] + if producingInst != nil { + liveInstructions = append(liveInstructions, producingInst) + } + } + } + + // Now that all the live instructions are flagged as live=true, we eliminate all dead instructions. 
+ for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() { + for cur := blk.rootInstr; cur != nil; cur = cur.next { + if !cur.live { + // Remove the instruction from the list. + if prev := cur.prev; prev != nil { + prev.next = cur.next + } else { + blk.rootInstr = cur.next + } + if next := cur.next; next != nil { + next.prev = cur.prev + } + continue + } + + // If the value alive, we can be sure that arguments are used definitely. + // Hence, we can increment the value reference counts. + v1, v2, v3, vs := cur.Args() + if v1.Valid() { + b.incRefCount(v1.ID(), cur) + } + if v2.Valid() { + b.incRefCount(v2.ID(), cur) + } + if v3.Valid() { + b.incRefCount(v3.ID(), cur) + } + for _, v := range vs { + b.incRefCount(v.ID(), cur) + } + } + } + + b.instStack = liveInstructions // we reuse the stack for the next iteration. +} + +func (b *builder) incRefCount(id ValueID, from *Instruction) { + if wazevoapi.SSALoggingEnabled { + fmt.Printf("v%d referenced from %v\n", id, from.Format(b)) + } + b.valueRefCounts[id]++ +} + +// clearBlkVisited clears the b.blkVisited map so that we can reuse it for multiple places. +func (b *builder) clearBlkVisited() { + b.blkStack2 = b.blkStack2[:0] + for key := range b.blkVisited { + b.blkStack2 = append(b.blkStack2, key) + } + for _, blk := range b.blkStack2 { + delete(b.blkVisited, blk) + } + b.blkStack2 = b.blkStack2[:0] +} + +// passNopInstElimination eliminates the instructions which is essentially a no-op. +func passNopInstElimination(b *builder) { + if int(b.nextValueID) >= len(b.valueIDToInstruction) { + b.valueIDToInstruction = append(b.valueIDToInstruction, make([]*Instruction, b.nextValueID)...) + } + + for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() { + for cur := blk.rootInstr; cur != nil; cur = cur.next { + r1, rs := cur.Returns() + if r1.Valid() { + b.valueIDToInstruction[r1.ID()] = cur + } + for _, r := range rs { + b.valueIDToInstruction[r.ID()] = cur + } + } + } + + for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() { + for cur := blk.rootInstr; cur != nil; cur = cur.next { + switch cur.Opcode() { + // TODO: add more logics here. + case OpcodeIshl, OpcodeSshr, OpcodeUshr: + x, amount := cur.Arg2() + definingInst := b.valueIDToInstruction[amount.ID()] + if definingInst == nil { + // If there's no defining instruction, that means the amount is coming from the parameter. + continue + } + if definingInst.Constant() { + v := definingInst.ConstantVal() + + if x.Type().Bits() == 64 { + v = v % 64 + } else { + v = v % 32 + } + if v == 0 { + b.alias(cur.Return(), x) + } + } + } + } + } +} + +// passSortSuccessors sorts the successors of each block in the natural program order. +func passSortSuccessors(b *builder) { + for i := 0; i < b.basicBlocksPool.Allocated(); i++ { + blk := b.basicBlocksPool.View(i) + sortBlocks(blk.success) + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_blk_layouts.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_blk_layouts.go new file mode 100644 index 000000000..9068180a0 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_blk_layouts.go @@ -0,0 +1,335 @@ +package ssa + +import ( + "fmt" + "strings" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// passLayoutBlocks implements Builder.LayoutBlocks. This re-organizes builder.reversePostOrderedBasicBlocks. +// +// TODO: there are tons of room for improvement here. e.g. 
LLVM has BlockPlacementPass using BlockFrequencyInfo, +// BranchProbabilityInfo, and LoopInfo to do a much better job. Also, if we have the profiling instrumentation +// like ball-larus algorithm, then we could do profile-guided optimization. Basically all of them are trying +// to maximize the fall-through opportunities which is most efficient. +// +// Here, fallthrough happens when a block ends with jump instruction whose target is the right next block in the +// builder.reversePostOrderedBasicBlocks. +// +// Currently, we just place blocks using the DFS reverse post-order of the dominator tree with the heuristics: +// 1. a split edge trampoline towards a loop header will be placed as a fallthrough. +// 2. we invert the brz and brnz if it makes the fallthrough more likely. +// +// This heuristic is done in maybeInvertBranches function. +func passLayoutBlocks(b *builder) { + b.clearBlkVisited() + + // We might end up splitting critical edges which adds more basic blocks, + // so we store the currently existing basic blocks in nonSplitBlocks temporarily. + // That way we can iterate over the original basic blocks while appending new ones into reversePostOrderedBasicBlocks. + nonSplitBlocks := b.blkStack[:0] + for i, blk := range b.reversePostOrderedBasicBlocks { + if !blk.Valid() { + continue + } + nonSplitBlocks = append(nonSplitBlocks, blk) + if i != len(b.reversePostOrderedBasicBlocks)-1 { + _ = maybeInvertBranches(blk, b.reversePostOrderedBasicBlocks[i+1]) + } + } + + var trampolines []*basicBlock + + // Reset the order slice since we update on the fly by splitting critical edges. + b.reversePostOrderedBasicBlocks = b.reversePostOrderedBasicBlocks[:0] + uninsertedTrampolines := b.blkStack2[:0] + for _, blk := range nonSplitBlocks { + for i := range blk.preds { + pred := blk.preds[i].blk + if _, ok := b.blkVisited[pred]; ok || !pred.Valid() { + continue + } else if pred.reversePostOrder < blk.reversePostOrder { + // This means the edge is critical, and this pred is the trampoline and yet to be inserted. + // Split edge trampolines must come before the destination in reverse post-order. + b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, pred) + b.blkVisited[pred] = 0 // mark as inserted, the value is not used. + } + } + + // Now that we've already added all the potential trampoline blocks incoming to this block, + // we can add this block itself. + b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, blk) + b.blkVisited[blk] = 0 // mark as inserted, the value is not used. + + if len(blk.success) < 2 { + // There won't be critical edge originating from this block. + continue + } else if blk.currentInstr.opcode == OpcodeBrTable { + // We don't split critical edges here, because at the construction site of BrTable, we already split the edges. + continue + } + + for sidx, succ := range blk.success { + if !succ.ReturnBlock() && // If the successor is a return block, we need to split the edge any way because we need "epilogue" to be inserted. + // Plus if there's no multiple incoming edges to this successor, (pred, succ) is not critical. + len(succ.preds) < 2 { + continue + } + + // Otherwise, we are sure this is a critical edge. To modify the CFG, we need to find the predecessor info + // from the successor. + var predInfo *basicBlockPredecessorInfo + for i := range succ.preds { // This linear search should not be a problem since the number of predecessors should almost always small. 
+ pred := &succ.preds[i] + if pred.blk == blk { + predInfo = pred + break + } + } + + if predInfo == nil { + // This must be a bug in somewhere around branch manipulation. + panic("BUG: predecessor info not found while the successor exists in successors list") + } + + if wazevoapi.SSALoggingEnabled { + fmt.Printf("trying to split edge from %d->%d at %s\n", + blk.ID(), succ.ID(), predInfo.branch.Format(b)) + } + + trampoline := b.splitCriticalEdge(blk, succ, predInfo) + // Update the successors slice because the target is no longer the original `succ`. + blk.success[sidx] = trampoline + + if wazevoapi.SSAValidationEnabled { + trampolines = append(trampolines, trampoline) + } + + if wazevoapi.SSALoggingEnabled { + fmt.Printf("edge split from %d->%d at %s as %d->%d->%d \n", + blk.ID(), succ.ID(), predInfo.branch.Format(b), + blk.ID(), trampoline.ID(), succ.ID()) + } + + fallthroughBranch := blk.currentInstr + if fallthroughBranch.opcode == OpcodeJump && fallthroughBranch.blk == trampoline { + // This can be lowered as fallthrough at the end of the block. + b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, trampoline) + b.blkVisited[trampoline] = 0 // mark as inserted, the value is not used. + } else { + uninsertedTrampolines = append(uninsertedTrampolines, trampoline) + } + } + + for _, trampoline := range uninsertedTrampolines { + if trampoline.success[0].reversePostOrder <= trampoline.reversePostOrder { // "<=", not "<" because the target might be itself. + // This means the critical edge was backward, so we insert after the current block immediately. + b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, trampoline) + b.blkVisited[trampoline] = 0 // mark as inserted, the value is not used. + } // If the target is forward, we can wait to insert until the target is inserted. + } + uninsertedTrampolines = uninsertedTrampolines[:0] // Reuse the stack for the next block. + } + + if wazevoapi.SSALoggingEnabled { + var bs []string + for _, blk := range b.reversePostOrderedBasicBlocks { + bs = append(bs, blk.Name()) + } + fmt.Println("ordered blocks: ", strings.Join(bs, ", ")) + } + + if wazevoapi.SSAValidationEnabled { + for _, trampoline := range trampolines { + if _, ok := b.blkVisited[trampoline]; !ok { + panic("BUG: trampoline block not inserted: " + trampoline.FormatHeader(b)) + } + trampoline.validate(b) + } + } + + // Reuse the stack for the next iteration. + b.blkStack2 = uninsertedTrampolines[:0] +} + +// markFallthroughJumps finds the fallthrough jumps and marks them as such. +func (b *builder) markFallthroughJumps() { + l := len(b.reversePostOrderedBasicBlocks) - 1 + for i, blk := range b.reversePostOrderedBasicBlocks { + if i < l { + cur := blk.currentInstr + if cur.opcode == OpcodeJump && cur.blk == b.reversePostOrderedBasicBlocks[i+1] { + cur.AsFallthroughJump() + } + } + } +} + +// maybeInvertBranches inverts the branch instructions if it is likely possible to the fallthrough more likely with simple heuristics. +// nextInRPO is the next block in the reverse post-order. +// +// Returns true if the branch is inverted for testing purpose. 
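+// No inversion is attempted when the block ends with br_table, or when either of the two branches carries block arguments.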
+func maybeInvertBranches(now *basicBlock, nextInRPO *basicBlock) bool { + fallthroughBranch := now.currentInstr + if fallthroughBranch.opcode == OpcodeBrTable { + return false + } + + condBranch := fallthroughBranch.prev + if condBranch == nil || (condBranch.opcode != OpcodeBrnz && condBranch.opcode != OpcodeBrz) { + return false + } + + if len(fallthroughBranch.vs.View()) != 0 || len(condBranch.vs.View()) != 0 { + // If either one of them has arguments, we don't invert the branches. + return false + } + + // So this block has two branches (a conditional branch followed by an unconditional branch) at the end. + // We can invert the condition of the branch if it makes the fallthrough more likely. + + fallthroughTarget, condTarget := fallthroughBranch.blk.(*basicBlock), condBranch.blk.(*basicBlock) + + if fallthroughTarget.loopHeader { + // First, if the tail's target is loopHeader, we don't need to do anything here, + // because the edge is likely to be critical edge for complex loops (e.g. loop with branches inside it). + // That means, we will split the edge in the end of LayoutBlocks function, and insert the trampoline block + // right after this block, which will be fallthrough in any way. + return false + } else if condTarget.loopHeader { + // On the other hand, if the condBranch's target is loopHeader, we invert the condition of the branch + // so that we could get the fallthrough to the trampoline block. + goto invert + } + + if fallthroughTarget == nextInRPO { + // Also, if the tail's target is the next block in the reverse post-order, we don't need to do anything here, + // because if this is not critical edge, we would end up placing these two blocks adjacent to each other. + // Even if it is the critical edge, we place the trampoline block right after this block, which will be fallthrough in any way. + return false + } else if condTarget == nextInRPO { + // If the condBranch's target is the next block in the reverse post-order, we invert the condition of the branch + // so that we could get the fallthrough to the block. + goto invert + } else { + return false + } + +invert: + for i := range fallthroughTarget.preds { + pred := &fallthroughTarget.preds[i] + if pred.branch == fallthroughBranch { + pred.branch = condBranch + break + } + } + for i := range condTarget.preds { + pred := &condTarget.preds[i] + if pred.branch == condBranch { + pred.branch = fallthroughBranch + break + } + } + + condBranch.InvertBrx() + condBranch.blk = fallthroughTarget + fallthroughBranch.blk = condTarget + if wazevoapi.SSALoggingEnabled { + fmt.Printf("inverting branches at %d->%d and %d->%d\n", + now.ID(), fallthroughTarget.ID(), now.ID(), condTarget.ID()) + } + + return true +} + +// splitCriticalEdge splits the critical edge between the given predecessor (`pred`) and successor (owning `predInfo`). +// +// - `pred` is the source of the critical edge, +// - `succ` is the destination of the critical edge, +// - `predInfo` is the predecessor info in the succ.preds slice which represents the critical edge. +// +// Why splitting critical edges is important? See following links: +// +// - https://en.wikipedia.org/wiki/Control-flow_graph +// - https://nickdesaulniers.github.io/blog/2023/01/27/critical-edge-splitting/ +// +// The returned basic block is the trampoline block which is inserted to split the critical edge. 
+func (b *builder) splitCriticalEdge(pred, succ *basicBlock, predInfo *basicBlockPredecessorInfo) *basicBlock { + // In the following, we convert the following CFG: + // + // pred --(originalBranch)--> succ + // + // to the following CFG: + // + // pred --(newBranch)--> trampoline --(originalBranch)-> succ + // + // where trampoline is a new basic block which is created to split the critical edge. + + trampoline := b.allocateBasicBlock() + if int(trampoline.id) >= len(b.dominators) { + b.dominators = append(b.dominators, make([]*basicBlock, trampoline.id+1)...) + } + b.dominators[trampoline.id] = pred + + originalBranch := predInfo.branch + + // Replace originalBranch with the newBranch. + newBranch := b.AllocateInstruction() + newBranch.opcode = originalBranch.opcode + newBranch.blk = trampoline + switch originalBranch.opcode { + case OpcodeJump: + case OpcodeBrz, OpcodeBrnz: + originalBranch.opcode = OpcodeJump // Trampoline consists of one unconditional branch. + newBranch.v = originalBranch.v + originalBranch.v = ValueInvalid + default: + panic("BUG: critical edge shouldn't be originated from br_table") + } + swapInstruction(pred, originalBranch, newBranch) + + // Replace the original branch with the new branch. + trampoline.rootInstr = originalBranch + trampoline.currentInstr = originalBranch + trampoline.success = append(trampoline.success, succ) // Do not use []*basicBlock{pred} because we might have already allocated the slice. + trampoline.preds = append(trampoline.preds, // same as ^. + basicBlockPredecessorInfo{blk: pred, branch: newBranch}) + b.Seal(trampoline) + + // Update the original branch to point to the trampoline. + predInfo.blk = trampoline + predInfo.branch = originalBranch + + if wazevoapi.SSAValidationEnabled { + trampoline.validate(b) + } + + if len(trampoline.params) > 0 { + panic("trampoline should not have params") + } + + // Assign the same order as the original block so that this will be placed before the actual destination. + trampoline.reversePostOrder = pred.reversePostOrder + return trampoline +} + +// swapInstruction replaces `old` in the block `blk` with `New`. +func swapInstruction(blk *basicBlock, old, New *Instruction) { + if blk.rootInstr == old { + blk.rootInstr = New + next := old.next + New.next = next + next.prev = New + } else { + if blk.currentInstr == old { + blk.currentInstr = New + } + prev := old.prev + prev.next, New.prev = New, prev + if next := old.next; next != nil { + New.next, next.prev = next, New + } + } + old.prev, old.next = nil, nil +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_cfg.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_cfg.go new file mode 100644 index 000000000..50cb9c475 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/pass_cfg.go @@ -0,0 +1,312 @@ +package ssa + +import ( + "fmt" + "math" + "strings" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// passCalculateImmediateDominators calculates immediate dominators for each basic block. +// The result is stored in b.dominators. This make it possible for the following passes to +// use builder.isDominatedBy to check if a block is dominated by another block. +// +// At the last of pass, this function also does the loop detection and sets the basicBlock.loop flag. 
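+// (Concretely, subPassLoopDetection sets basicBlock.loopHeader on every block that dominates one of its own predecessors.)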
+func passCalculateImmediateDominators(b *builder) { + reversePostOrder := b.reversePostOrderedBasicBlocks[:0] + exploreStack := b.blkStack[:0] + b.clearBlkVisited() + + entryBlk := b.entryBlk() + + // Store the reverse postorder from the entrypoint into reversePostOrder slice. + // This calculation of reverse postorder is not described in the paper, + // so we use heuristic to calculate it so that we could potentially handle arbitrary + // complex CFGs under the assumption that success is sorted in program's natural order. + // That means blk.success[i] always appears before blk.success[i+1] in the source program, + // which is a reasonable assumption as long as SSA Builder is properly used. + // + // First we push blocks in postorder iteratively visit successors of the entry block. + exploreStack = append(exploreStack, entryBlk) + const visitStateUnseen, visitStateSeen, visitStateDone = 0, 1, 2 + b.blkVisited[entryBlk] = visitStateSeen + for len(exploreStack) > 0 { + tail := len(exploreStack) - 1 + blk := exploreStack[tail] + exploreStack = exploreStack[:tail] + switch b.blkVisited[blk] { + case visitStateUnseen: + // This is likely a bug in the frontend. + panic("BUG: unsupported CFG") + case visitStateSeen: + // This is the first time to pop this block, and we have to see the successors first. + // So push this block again to the stack. + exploreStack = append(exploreStack, blk) + // And push the successors to the stack if necessary. + for _, succ := range blk.success { + if succ.ReturnBlock() || succ.invalid { + continue + } + if b.blkVisited[succ] == visitStateUnseen { + b.blkVisited[succ] = visitStateSeen + exploreStack = append(exploreStack, succ) + } + } + // Finally, we could pop this block once we pop all of its successors. + b.blkVisited[blk] = visitStateDone + case visitStateDone: + // Note: at this point we push blk in postorder despite its name. + reversePostOrder = append(reversePostOrder, blk) + } + } + // At this point, reversePostOrder has postorder actually, so we reverse it. + for i := len(reversePostOrder)/2 - 1; i >= 0; i-- { + j := len(reversePostOrder) - 1 - i + reversePostOrder[i], reversePostOrder[j] = reversePostOrder[j], reversePostOrder[i] + } + + for i, blk := range reversePostOrder { + blk.reversePostOrder = i + } + + // Reuse the dominators slice if possible from the previous computation of function. + b.dominators = b.dominators[:cap(b.dominators)] + if len(b.dominators) < b.basicBlocksPool.Allocated() { + // Generously reserve space in the slice because the slice will be reused future allocation. + b.dominators = append(b.dominators, make([]*basicBlock, b.basicBlocksPool.Allocated())...) + } + calculateDominators(reversePostOrder, b.dominators) + + // Reuse the slices for the future use. + b.blkStack = exploreStack + + // For the following passes. + b.reversePostOrderedBasicBlocks = reversePostOrder + + // Ready to detect loops! + subPassLoopDetection(b) +} + +// calculateDominators calculates the immediate dominator of each node in the CFG, and store the result in `doms`. +// The algorithm is based on the one described in the paper "A Simple, Fast Dominance Algorithm" +// https://www.cs.rice.edu/~keith/EMBED/dom.pdf which is a faster/simple alternative to the well known Lengauer-Tarjan algorithm. +// +// The following code almost matches the pseudocode in the paper with one exception (see the code comment below). +// +// The result slice `doms` must be pre-allocated with the size larger than the size of dfsBlocks. 
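+// On return, doms[blk.id] holds the immediate dominator of blk; the entry block is mapped to itself.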
+func calculateDominators(reversePostOrderedBlks []*basicBlock, doms []*basicBlock) { + entry, reversePostOrderedBlks := reversePostOrderedBlks[0], reversePostOrderedBlks[1: /* skips entry point */] + for _, blk := range reversePostOrderedBlks { + doms[blk.id] = nil + } + doms[entry.id] = entry + + changed := true + for changed { + changed = false + for _, blk := range reversePostOrderedBlks { + var u *basicBlock + for i := range blk.preds { + pred := blk.preds[i].blk + // Skip if this pred is not reachable yet. Note that this is not described in the paper, + // but it is necessary to handle nested loops etc. + if doms[pred.id] == nil { + continue + } + + if u == nil { + u = pred + continue + } else { + u = intersect(doms, u, pred) + } + } + if doms[blk.id] != u { + doms[blk.id] = u + changed = true + } + } + } +} + +// intersect returns the common dominator of blk1 and blk2. +// +// This is the `intersect` function in the paper. +func intersect(doms []*basicBlock, blk1 *basicBlock, blk2 *basicBlock) *basicBlock { + finger1, finger2 := blk1, blk2 + for finger1 != finger2 { + // Move the 'finger1' upwards to its immediate dominator. + for finger1.reversePostOrder > finger2.reversePostOrder { + finger1 = doms[finger1.id] + } + // Move the 'finger2' upwards to its immediate dominator. + for finger2.reversePostOrder > finger1.reversePostOrder { + finger2 = doms[finger2.id] + } + } + return finger1 +} + +// subPassLoopDetection detects loops in the function using the immediate dominators. +// +// This is run at the last of passCalculateImmediateDominators. +func subPassLoopDetection(b *builder) { + for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() { + for i := range blk.preds { + pred := blk.preds[i].blk + if pred.invalid { + continue + } + if b.isDominatedBy(pred, blk) { + blk.loopHeader = true + } + } + } +} + +// buildLoopNestingForest builds the loop nesting forest for the function. +// This must be called after branch splitting since it relies on the CFG. +func passBuildLoopNestingForest(b *builder) { + ent := b.entryBlk() + doms := b.dominators + for _, blk := range b.reversePostOrderedBasicBlocks { + n := doms[blk.id] + for !n.loopHeader && n != ent { + n = doms[n.id] + } + + if n == ent && blk.loopHeader { + b.loopNestingForestRoots = append(b.loopNestingForestRoots, blk) + } else if n == ent { + } else if n.loopHeader { + n.loopNestingForestChildren = append(n.loopNestingForestChildren, blk) + } + } + + if wazevoapi.SSALoggingEnabled { + for _, root := range b.loopNestingForestRoots { + printLoopNestingForest(root.(*basicBlock), 0) + } + } +} + +func printLoopNestingForest(root *basicBlock, depth int) { + fmt.Println(strings.Repeat("\t", depth), "loop nesting forest root:", root.ID()) + for _, child := range root.loopNestingForestChildren { + fmt.Println(strings.Repeat("\t", depth+1), "child:", child.ID()) + if child.LoopHeader() { + printLoopNestingForest(child.(*basicBlock), depth+2) + } + } +} + +type dominatorSparseTree struct { + time int + euler []*basicBlock + first, depth []int + table [][]int +} + +// passBuildDominatorTree builds the dominator tree for the function, and constructs builder.sparseTree. +func passBuildDominatorTree(b *builder) { + // First we materialize the children of each node in the dominator tree. + idoms := b.dominators + for _, blk := range b.reversePostOrderedBasicBlocks { + parent := idoms[blk.id] + if parent == nil { + panic("BUG") + } else if parent == blk { + // This is the entry block. 
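+ // Its immediate dominator is itself, so it gets no parent (child/sibling links) in the dominator tree.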
+ continue + } + if prev := parent.child; prev == nil { + parent.child = blk + } else { + parent.child = blk + blk.sibling = prev + } + } + + // Reset the state from the previous computation. + n := b.basicBlocksPool.Allocated() + st := &b.sparseTree + st.euler = append(st.euler[:0], make([]*basicBlock, 2*n-1)...) + st.first = append(st.first[:0], make([]int, n)...) + for i := range st.first { + st.first[i] = -1 + } + st.depth = append(st.depth[:0], make([]int, 2*n-1)...) + st.time = 0 + + // Start building the sparse tree. + st.eulerTour(b.entryBlk(), 0) + st.buildSparseTable() +} + +func (dt *dominatorSparseTree) eulerTour(node *basicBlock, height int) { + if wazevoapi.SSALoggingEnabled { + fmt.Println(strings.Repeat("\t", height), "euler tour:", node.ID()) + } + dt.euler[dt.time] = node + dt.depth[dt.time] = height + if dt.first[node.id] == -1 { + dt.first[node.id] = dt.time + } + dt.time++ + + for child := node.child; child != nil; child = child.sibling { + dt.eulerTour(child, height+1) + dt.euler[dt.time] = node // add the current node again after visiting a child + dt.depth[dt.time] = height + dt.time++ + } +} + +// buildSparseTable builds a sparse table for RMQ queries. +func (dt *dominatorSparseTree) buildSparseTable() { + n := len(dt.depth) + k := int(math.Log2(float64(n))) + 1 + table := dt.table + + if n >= len(table) { + table = append(table, make([][]int, n+1)...) + } + for i := range table { + if len(table[i]) < k { + table[i] = append(table[i], make([]int, k)...) + } + table[i][0] = i + } + + for j := 1; 1<<j <= n; j++ { + for i := 0; i+(1<<j)-1 < n; i++ { + if dt.depth[table[i][j-1]] < dt.depth[table[i+(1<<(j-1))][j-1]] { + table[i][j] = table[i][j-1] + } else { + table[i][j] = table[i+(1<<(j-1))][j-1] + } + } + } + dt.table = table +} + +// rmq performs a range minimum query on the sparse table. +func (dt *dominatorSparseTree) rmq(l, r int) int { + table := dt.table + depth := dt.depth + j := int(math.Log2(float64(r - l + 1))) + if depth[table[l][j]] <= depth[table[r-(1<<j)+1][j]] { + return table[l][j] + } + return table[r-(1<<j)+1][j] +} + +// findLCA finds the LCA using the Euler tour and RMQ. +func (dt *dominatorSparseTree) findLCA(u, v BasicBlockID) *basicBlock { + first := dt.first + if first[u] > first[v] { + u, v = v, u + } + return dt.euler[dt.rmq(first[u], first[v])] +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/signature.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/signature.go new file mode 100644 index 000000000..43483395a --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/signature.go @@ -0,0 +1,49 @@ +package ssa + +import ( + "fmt" + "strings" +) + +// Signature is a function prototype. +type Signature struct { + // ID is a unique identifier for this signature used to lookup. + ID SignatureID + // Params and Results are the types of the parameters and results of the function. + Params, Results []Type + + // used is true if this is used by the currently-compiled function. + // Debugging only. + used bool +} + +// String implements fmt.Stringer. 
+func (s *Signature) String() string { + str := strings.Builder{} + str.WriteString(s.ID.String()) + str.WriteString(": ") + if len(s.Params) > 0 { + for _, typ := range s.Params { + str.WriteString(typ.String()) + } + } else { + str.WriteByte('v') + } + str.WriteByte('_') + if len(s.Results) > 0 { + for _, typ := range s.Results { + str.WriteString(typ.String()) + } + } else { + str.WriteByte('v') + } + return str.String() +} + +// SignatureID is an unique identifier used to lookup. +type SignatureID int + +// String implements fmt.Stringer. +func (s SignatureID) String() string { + return fmt.Sprintf("sig%d", s) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/ssa.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/ssa.go new file mode 100644 index 000000000..b477e58bd --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/ssa.go @@ -0,0 +1,14 @@ +// Package ssa is used to construct SSA function. By nature this is free of Wasm specific thing +// and ISA. +// +// We use the "block argument" variant of SSA: https://en.wikipedia.org/wiki/Static_single-assignment_form#Block_arguments +// which is equivalent to the traditional PHI function based one, but more convenient during optimizations. +// However, in this package's source code comment, we might use PHI whenever it seems necessary in order to be aligned with +// existing literatures, e.g. SSA level optimization algorithms are often described using PHI nodes. +// +// The rationale doc for the choice of "block argument" by MLIR of LLVM is worth a read: +// https://mlir.llvm.org/docs/Rationale/Rationale/#block-arguments-vs-phi-nodes +// +// The algorithm to resolve variable definitions used here is based on the paper +// "Simple and Efficient Construction of Static Single Assignment Form": https://link.springer.com/content/pdf/10.1007/978-3-642-37051-9_6.pdf. +package ssa diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/type.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/type.go new file mode 100644 index 000000000..e8c8cd9de --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/type.go @@ -0,0 +1,112 @@ +package ssa + +type Type byte + +const ( + typeInvalid Type = iota + + // TODO: add 8, 16 bit types when it's needed for optimizations. + + // TypeI32 represents an integer type with 32 bits. + TypeI32 + + // TypeI64 represents an integer type with 64 bits. + TypeI64 + + // TypeF32 represents 32-bit floats in the IEEE 754. + TypeF32 + + // TypeF64 represents 64-bit floats in the IEEE 754. + TypeF64 + + // TypeV128 represents 128-bit SIMD vectors. + TypeV128 +) + +// String implements fmt.Stringer. +func (t Type) String() (ret string) { + switch t { + case typeInvalid: + return "invalid" + case TypeI32: + return "i32" + case TypeI64: + return "i64" + case TypeF32: + return "f32" + case TypeF64: + return "f64" + case TypeV128: + return "v128" + default: + panic(int(t)) + } +} + +// IsInt returns true if the type is an integer type. +func (t Type) IsInt() bool { + return t == TypeI32 || t == TypeI64 +} + +// IsFloat returns true if the type is a floating point type. +func (t Type) IsFloat() bool { + return t == TypeF32 || t == TypeF64 +} + +// Bits returns the number of bits required to represent the type. 
+func (t Type) Bits() byte { + switch t { + case TypeI32, TypeF32: + return 32 + case TypeI64, TypeF64: + return 64 + case TypeV128: + return 128 + default: + panic(int(t)) + } +} + +// Size returns the number of bytes required to represent the type. +func (t Type) Size() byte { + return t.Bits() / 8 +} + +func (t Type) invalid() bool { + return t == typeInvalid +} + +// VecLane represents a lane in a SIMD vector. +type VecLane byte + +const ( + VecLaneInvalid VecLane = 1 + iota + VecLaneI8x16 + VecLaneI16x8 + VecLaneI32x4 + VecLaneI64x2 + VecLaneF32x4 + VecLaneF64x2 +) + +// String implements fmt.Stringer. +func (vl VecLane) String() (ret string) { + switch vl { + case VecLaneInvalid: + return "invalid" + case VecLaneI8x16: + return "i8x16" + case VecLaneI16x8: + return "i16x8" + case VecLaneI32x4: + return "i32x4" + case VecLaneI64x2: + return "i64x2" + case VecLaneF32x4: + return "f32x4" + case VecLaneF64x2: + return "f64x2" + default: + panic(int(vl)) + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/vs.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/vs.go new file mode 100644 index 000000000..bcf83cbf8 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/ssa/vs.go @@ -0,0 +1,87 @@ +package ssa + +import ( + "fmt" + "math" + + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" +) + +// Variable is a unique identifier for a source program's variable and will correspond to +// multiple ssa Value(s). +// +// For example, `Local 1` is a Variable in WebAssembly, and Value(s) will be created for it +// whenever it executes `local.set 1`. +// +// Variable is useful to track the SSA Values of a variable in the source program, and +// can be used to find the corresponding latest SSA Value via Builder.FindValue. +type Variable uint32 + +// String implements fmt.Stringer. +func (v Variable) String() string { + return fmt.Sprintf("var%d", v) +} + +// Value represents an SSA value with a type information. The relationship with Variable is 1: N (including 0), +// that means there might be multiple Variable(s) for a Value. +// +// Higher 32-bit is used to store Type for this value. +type Value uint64 + +// ValueID is the lower 32bit of Value, which is the pure identifier of Value without type info. +type ValueID uint32 + +const ( + valueIDInvalid ValueID = math.MaxUint32 + ValueInvalid Value = Value(valueIDInvalid) +) + +// Format creates a debug string for this Value using the data stored in Builder. +func (v Value) Format(b Builder) string { + if annotation, ok := b.(*builder).valueAnnotations[v.ID()]; ok { + return annotation + } + return fmt.Sprintf("v%d", v.ID()) +} + +func (v Value) formatWithType(b Builder) (ret string) { + if annotation, ok := b.(*builder).valueAnnotations[v.ID()]; ok { + ret = annotation + ":" + v.Type().String() + } else { + ret = fmt.Sprintf("v%d:%s", v.ID(), v.Type()) + } + + if wazevoapi.SSALoggingEnabled { // This is useful to check live value analysis bugs. + if bd := b.(*builder); bd.donePostBlockLayoutPasses { + id := v.ID() + ret += fmt.Sprintf("(ref=%d)", bd.valueRefCounts[id]) + } + } + return ret +} + +// Valid returns true if this value is valid. +func (v Value) Valid() bool { + return v.ID() != valueIDInvalid +} + +// Type returns the Type of this value. +func (v Value) Type() Type { + return Type(v >> 32) +} + +// ID returns the valueID of this value. 
+func (v Value) ID() ValueID { + return ValueID(v) +} + +// setType sets a type to this Value and returns the updated Value. +func (v Value) setType(typ Type) Value { + return v | Value(typ)<<32 +} + +// Values is a slice of Value. Use this instead of []Value to reuse the underlying memory. +type Values = wazevoapi.VarLength[Value] + +// ValuesNil is a nil Values. +var ValuesNil = wazevoapi.NewNilVarLength[Value]() diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/debug_options.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/debug_options.go new file mode 100644 index 000000000..2db61e219 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/debug_options.go @@ -0,0 +1,196 @@ +package wazevoapi + +import ( + "context" + "encoding/hex" + "fmt" + "math/rand" + "os" + "time" +) + +// These consts are used various places in the wazevo implementations. +// Instead of defining them in each file, we define them here so that we can quickly iterate on +// debugging without spending "where do we have debug logging?" time. + +// ----- Debug logging ----- +// These consts must be disabled by default. Enable them only when debugging. + +const ( + FrontEndLoggingEnabled = false + SSALoggingEnabled = false + RegAllocLoggingEnabled = false +) + +// ----- Output prints ----- +// These consts must be disabled by default. Enable them only when debugging. + +const ( + PrintSSA = false + PrintOptimizedSSA = false + PrintSSAToBackendIRLowering = false + PrintRegisterAllocated = false + PrintFinalizedMachineCode = false + PrintMachineCodeHexPerFunction = printMachineCodeHexPerFunctionUnmodified || PrintMachineCodeHexPerFunctionDisassemblable //nolint + printMachineCodeHexPerFunctionUnmodified = false + // PrintMachineCodeHexPerFunctionDisassemblable prints the machine code while modifying the actual result + // to make it disassemblable. This is useful when debugging the final machine code. See the places where this is used for detail. + // When this is enabled, functions must not be called. + PrintMachineCodeHexPerFunctionDisassemblable = false +) + +// printTarget is the function index to print the machine code. This is used for debugging to print the machine code +// of a specific function. +const printTarget = -1 + +// PrintEnabledIndex returns true if the current function index is the print target. +func PrintEnabledIndex(ctx context.Context) bool { + if printTarget == -1 { + return true + } + return GetCurrentFunctionIndex(ctx) == printTarget +} + +// ----- Validations ----- +const ( + // SSAValidationEnabled enables the SSA validation. This is disabled by default since the operation is expensive. + SSAValidationEnabled = false +) + +// ----- Stack Guard Check ----- +const ( + // StackGuardCheckEnabled enables the stack guard check to ensure that our stack bounds check works correctly. + StackGuardCheckEnabled = false + StackGuardCheckGuardPageSize = 8096 +) + +// CheckStackGuardPage checks the given stack guard page is not corrupted. +func CheckStackGuardPage(s []byte) { + for i := 0; i < StackGuardCheckGuardPageSize; i++ { + if s[i] != 0 { + panic( + fmt.Sprintf("BUG: stack guard page is corrupted:\n\tguard_page=%s\n\tstack=%s", + hex.EncodeToString(s[:StackGuardCheckGuardPageSize]), + hex.EncodeToString(s[StackGuardCheckGuardPageSize:]), + )) + } + } +} + +// ----- Deterministic compilation verifier ----- + +const ( + // DeterministicCompilationVerifierEnabled enables the deterministic compilation verifier. 
This is disabled by default + // since the operation is expensive. But when in doubt, enable this to make sure the compilation is deterministic. + DeterministicCompilationVerifierEnabled = false + DeterministicCompilationVerifyingIter = 5 +) + +type ( + verifierState struct { + initialCompilationDone bool + maybeRandomizedIndexes []int + r *rand.Rand + values map[string]string + } + verifierStateContextKey struct{} + currentFunctionNameKey struct{} + currentFunctionIndexKey struct{} +) + +// NewDeterministicCompilationVerifierContext creates a new context with the deterministic compilation verifier used per wasm.Module. +func NewDeterministicCompilationVerifierContext(ctx context.Context, localFunctions int) context.Context { + maybeRandomizedIndexes := make([]int, localFunctions) + for i := range maybeRandomizedIndexes { + maybeRandomizedIndexes[i] = i + } + r := rand.New(rand.NewSource(time.Now().UnixNano())) + return context.WithValue(ctx, verifierStateContextKey{}, &verifierState{ + r: r, maybeRandomizedIndexes: maybeRandomizedIndexes, values: map[string]string{}, + }) +} + +// DeterministicCompilationVerifierRandomizeIndexes randomizes the indexes for the deterministic compilation verifier. +// To get the randomized index, use DeterministicCompilationVerifierGetRandomizedLocalFunctionIndex. +func DeterministicCompilationVerifierRandomizeIndexes(ctx context.Context) { + state := ctx.Value(verifierStateContextKey{}).(*verifierState) + if !state.initialCompilationDone { + // If this is the first attempt, we use the index as-is order. + state.initialCompilationDone = true + return + } + r := state.r + r.Shuffle(len(state.maybeRandomizedIndexes), func(i, j int) { + state.maybeRandomizedIndexes[i], state.maybeRandomizedIndexes[j] = state.maybeRandomizedIndexes[j], state.maybeRandomizedIndexes[i] + }) +} + +// DeterministicCompilationVerifierGetRandomizedLocalFunctionIndex returns the randomized index for the given `index` +// which is assigned by DeterministicCompilationVerifierRandomizeIndexes. +func DeterministicCompilationVerifierGetRandomizedLocalFunctionIndex(ctx context.Context, index int) int { + state := ctx.Value(verifierStateContextKey{}).(*verifierState) + ret := state.maybeRandomizedIndexes[index] + return ret +} + +// VerifyOrSetDeterministicCompilationContextValue verifies that the `newValue` is the same as the previous value for the given `scope` +// and the current function name. If the previous value doesn't exist, it sets the value to the given `newValue`. +// +// If the verification fails, this prints the diff and exits the process. +func VerifyOrSetDeterministicCompilationContextValue(ctx context.Context, scope string, newValue string) { + fn := ctx.Value(currentFunctionNameKey{}).(string) + key := fn + ": " + scope + verifierCtx := ctx.Value(verifierStateContextKey{}).(*verifierState) + oldValue, ok := verifierCtx.values[key] + if !ok { + verifierCtx.values[key] = newValue + return + } + if oldValue != newValue { + fmt.Printf( + `BUG: Deterministic compilation failed for function%s at scope="%s". + +This is mostly due to (but might not be limited to): + * Resetting ssa.Builder, backend.Compiler or frontend.Compiler, etc doens't work as expected, and the compilation has been affected by the previous iterations. + * Using a map with non-deterministic iteration order. 
+ +---------- [old] ---------- +%s + +---------- [new] ---------- +%s +`, + fn, scope, oldValue, newValue, + ) + os.Exit(1) + } +} + +// nolint +const NeedFunctionNameInContext = PrintSSA || + PrintOptimizedSSA || + PrintSSAToBackendIRLowering || + PrintRegisterAllocated || + PrintFinalizedMachineCode || + PrintMachineCodeHexPerFunction || + DeterministicCompilationVerifierEnabled || + PerfMapEnabled + +// SetCurrentFunctionName sets the current function name to the given `functionName`. +func SetCurrentFunctionName(ctx context.Context, index int, functionName string) context.Context { + ctx = context.WithValue(ctx, currentFunctionNameKey{}, functionName) + ctx = context.WithValue(ctx, currentFunctionIndexKey{}, index) + return ctx +} + +// GetCurrentFunctionName returns the current function name. +func GetCurrentFunctionName(ctx context.Context) string { + ret, _ := ctx.Value(currentFunctionNameKey{}).(string) + return ret +} + +// GetCurrentFunctionIndex returns the current function index. +func GetCurrentFunctionIndex(ctx context.Context) int { + ret, _ := ctx.Value(currentFunctionIndexKey{}).(int) + return ret +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/exitcode.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/exitcode.go new file mode 100644 index 000000000..5ad594982 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/exitcode.go @@ -0,0 +1,109 @@ +package wazevoapi + +// ExitCode is an exit code of an execution of a function. +type ExitCode uint32 + +const ( + ExitCodeOK ExitCode = iota + ExitCodeGrowStack + ExitCodeGrowMemory + ExitCodeUnreachable + ExitCodeMemoryOutOfBounds + // ExitCodeCallGoModuleFunction is an exit code for a call to an api.GoModuleFunction. + ExitCodeCallGoModuleFunction + // ExitCodeCallGoFunction is an exit code for a call to an api.GoFunction. + ExitCodeCallGoFunction + ExitCodeTableOutOfBounds + ExitCodeIndirectCallNullPointer + ExitCodeIndirectCallTypeMismatch + ExitCodeIntegerDivisionByZero + ExitCodeIntegerOverflow + ExitCodeInvalidConversionToInteger + ExitCodeCheckModuleExitCode + ExitCodeCallListenerBefore + ExitCodeCallListenerAfter + ExitCodeCallGoModuleFunctionWithListener + ExitCodeCallGoFunctionWithListener + ExitCodeTableGrow + ExitCodeRefFunc + ExitCodeMemoryWait32 + ExitCodeMemoryWait64 + ExitCodeMemoryNotify + ExitCodeUnalignedAtomic + exitCodeMax +) + +const ExitCodeMask = 0xff + +// String implements fmt.Stringer. 
+func (e ExitCode) String() string { + switch e { + case ExitCodeOK: + return "ok" + case ExitCodeGrowStack: + return "grow_stack" + case ExitCodeCallGoModuleFunction: + return "call_go_module_function" + case ExitCodeCallGoFunction: + return "call_go_function" + case ExitCodeUnreachable: + return "unreachable" + case ExitCodeMemoryOutOfBounds: + return "memory_out_of_bounds" + case ExitCodeUnalignedAtomic: + return "unaligned_atomic" + case ExitCodeTableOutOfBounds: + return "table_out_of_bounds" + case ExitCodeIndirectCallNullPointer: + return "indirect_call_null_pointer" + case ExitCodeIndirectCallTypeMismatch: + return "indirect_call_type_mismatch" + case ExitCodeIntegerDivisionByZero: + return "integer_division_by_zero" + case ExitCodeIntegerOverflow: + return "integer_overflow" + case ExitCodeInvalidConversionToInteger: + return "invalid_conversion_to_integer" + case ExitCodeCheckModuleExitCode: + return "check_module_exit_code" + case ExitCodeCallListenerBefore: + return "call_listener_before" + case ExitCodeCallListenerAfter: + return "call_listener_after" + case ExitCodeCallGoModuleFunctionWithListener: + return "call_go_module_function_with_listener" + case ExitCodeCallGoFunctionWithListener: + return "call_go_function_with_listener" + case ExitCodeGrowMemory: + return "grow_memory" + case ExitCodeTableGrow: + return "table_grow" + case ExitCodeRefFunc: + return "ref_func" + case ExitCodeMemoryWait32: + return "memory_wait32" + case ExitCodeMemoryWait64: + return "memory_wait64" + case ExitCodeMemoryNotify: + return "memory_notify" + } + panic("TODO") +} + +func ExitCodeCallGoModuleFunctionWithIndex(index int, withListener bool) ExitCode { + if withListener { + return ExitCodeCallGoModuleFunctionWithListener | ExitCode(index<<8) + } + return ExitCodeCallGoModuleFunction | ExitCode(index<<8) +} + +func ExitCodeCallGoFunctionWithIndex(index int, withListener bool) ExitCode { + if withListener { + return ExitCodeCallGoFunctionWithListener | ExitCode(index<<8) + } + return ExitCodeCallGoFunction | ExitCode(index<<8) +} + +func GoFunctionIndexFromExitCode(exitCode ExitCode) int { + return int(exitCode >> 8) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/offsetdata.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/offsetdata.go new file mode 100644 index 000000000..fe6161b04 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/offsetdata.go @@ -0,0 +1,216 @@ +package wazevoapi + +import ( + "github.com/tetratelabs/wazero/internal/wasm" +) + +const ( + // FunctionInstanceSize is the size of wazevo.functionInstance. 
+ FunctionInstanceSize = 24 + // FunctionInstanceExecutableOffset is an offset of `executable` field in wazevo.functionInstance + FunctionInstanceExecutableOffset = 0 + // FunctionInstanceModuleContextOpaquePtrOffset is an offset of `moduleContextOpaquePtr` field in wazevo.functionInstance + FunctionInstanceModuleContextOpaquePtrOffset = 8 + // FunctionInstanceTypeIDOffset is an offset of `typeID` field in wazevo.functionInstance + FunctionInstanceTypeIDOffset = 16 +) + +const ( + // ExecutionContextOffsetExitCodeOffset is an offset of `exitCode` field in wazevo.executionContext + ExecutionContextOffsetExitCodeOffset Offset = 0 + // ExecutionContextOffsetCallerModuleContextPtr is an offset of `callerModuleContextPtr` field in wazevo.executionContext + ExecutionContextOffsetCallerModuleContextPtr Offset = 8 + // ExecutionContextOffsetOriginalFramePointer is an offset of `originalFramePointer` field in wazevo.executionContext + ExecutionContextOffsetOriginalFramePointer Offset = 16 + // ExecutionContextOffsetOriginalStackPointer is an offset of `originalStackPointer` field in wazevo.executionContext + ExecutionContextOffsetOriginalStackPointer Offset = 24 + // ExecutionContextOffsetGoReturnAddress is an offset of `goReturnAddress` field in wazevo.executionContext + ExecutionContextOffsetGoReturnAddress Offset = 32 + // ExecutionContextOffsetStackBottomPtr is an offset of `stackBottomPtr` field in wazevo.executionContext + ExecutionContextOffsetStackBottomPtr Offset = 40 + // ExecutionContextOffsetGoCallReturnAddress is an offset of `goCallReturnAddress` field in wazevo.executionContext + ExecutionContextOffsetGoCallReturnAddress Offset = 48 + // ExecutionContextOffsetStackPointerBeforeGoCall is an offset of `StackPointerBeforeGoCall` field in wazevo.executionContext + ExecutionContextOffsetStackPointerBeforeGoCall Offset = 56 + // ExecutionContextOffsetStackGrowRequiredSize is an offset of `stackGrowRequiredSize` field in wazevo.executionContext + ExecutionContextOffsetStackGrowRequiredSize Offset = 64 + // ExecutionContextOffsetMemoryGrowTrampolineAddress is an offset of `memoryGrowTrampolineAddress` field in wazevo.executionContext + ExecutionContextOffsetMemoryGrowTrampolineAddress Offset = 72 + // ExecutionContextOffsetStackGrowCallTrampolineAddress is an offset of `stackGrowCallTrampolineAddress` field in wazevo.executionContext. + ExecutionContextOffsetStackGrowCallTrampolineAddress Offset = 80 + // ExecutionContextOffsetCheckModuleExitCodeTrampolineAddress is an offset of `checkModuleExitCodeTrampolineAddress` field in wazevo.executionContext. 
+ ExecutionContextOffsetCheckModuleExitCodeTrampolineAddress Offset = 88 + // ExecutionContextOffsetSavedRegistersBegin is an offset of the first element of `savedRegisters` field in wazevo.executionContext + ExecutionContextOffsetSavedRegistersBegin Offset = 96 + // ExecutionContextOffsetGoFunctionCallCalleeModuleContextOpaque is an offset of `goFunctionCallCalleeModuleContextOpaque` field in wazevo.executionContext + ExecutionContextOffsetGoFunctionCallCalleeModuleContextOpaque Offset = 1120 + // ExecutionContextOffsetTableGrowTrampolineAddress is an offset of `tableGrowTrampolineAddress` field in wazevo.executionContext + ExecutionContextOffsetTableGrowTrampolineAddress Offset = 1128 + // ExecutionContextOffsetRefFuncTrampolineAddress is an offset of `refFuncTrampolineAddress` field in wazevo.executionContext + ExecutionContextOffsetRefFuncTrampolineAddress Offset = 1136 + ExecutionContextOffsetMemmoveAddress Offset = 1144 + ExecutionContextOffsetFramePointerBeforeGoCall Offset = 1152 + ExecutionContextOffsetMemoryWait32TrampolineAddress Offset = 1160 + ExecutionContextOffsetMemoryWait64TrampolineAddress Offset = 1168 + ExecutionContextOffsetMemoryNotifyTrampolineAddress Offset = 1176 +) + +// ModuleContextOffsetData allows the compilers to get the information about offsets to the fields of wazevo.moduleContextOpaque, +// This is unique per module. +type ModuleContextOffsetData struct { + TotalSize int + ModuleInstanceOffset, + LocalMemoryBegin, + ImportedMemoryBegin, + ImportedFunctionsBegin, + GlobalsBegin, + TypeIDs1stElement, + TablesBegin, + BeforeListenerTrampolines1stElement, + AfterListenerTrampolines1stElement, + DataInstances1stElement, + ElementInstances1stElement Offset +} + +// ImportedFunctionOffset returns an offset of the i-th imported function. +// Each item is stored as wazevo.functionInstance whose size matches FunctionInstanceSize. +func (m *ModuleContextOffsetData) ImportedFunctionOffset(i wasm.Index) ( + executableOffset, moduleCtxOffset, typeIDOffset Offset, +) { + base := m.ImportedFunctionsBegin + Offset(i)*FunctionInstanceSize + return base, base + 8, base + 16 +} + +// GlobalInstanceOffset returns an offset of the i-th global instance. +func (m *ModuleContextOffsetData) GlobalInstanceOffset(i wasm.Index) Offset { + return m.GlobalsBegin + Offset(i)*16 +} + +// Offset represents an offset of a field of a struct. +type Offset int32 + +// U32 encodes an Offset as uint32 for convenience. +func (o Offset) U32() uint32 { + return uint32(o) +} + +// I64 encodes an Offset as int64 for convenience. +func (o Offset) I64() int64 { + return int64(o) +} + +// U64 encodes an Offset as int64 for convenience. +func (o Offset) U64() uint64 { + return uint64(o) +} + +// LocalMemoryBase returns an offset of the first byte of the local memory. +func (m *ModuleContextOffsetData) LocalMemoryBase() Offset { + return m.LocalMemoryBegin +} + +// LocalMemoryLen returns an offset of the length of the local memory buffer. +func (m *ModuleContextOffsetData) LocalMemoryLen() Offset { + if l := m.LocalMemoryBegin; l >= 0 { + return l + 8 + } + return -1 +} + +// TableOffset returns an offset of the i-th table instance. +func (m *ModuleContextOffsetData) TableOffset(tableIndex int) Offset { + return m.TablesBegin + Offset(tableIndex)*8 +} + +// NewModuleContextOffsetData creates a ModuleContextOffsetData determining the structure of moduleContextOpaque for the given Module. +// The structure is described in the comment of wazevo.moduleContextOpaque. 
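+// Fields are laid out in this order: module instance pointer, local memory, imported memory, imported functions, globals, type IDs and tables, listener trampolines, data instances, and element instances, aligned to 8 or 16 bytes where required.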
+func NewModuleContextOffsetData(m *wasm.Module, withListener bool) ModuleContextOffsetData { + ret := ModuleContextOffsetData{} + var offset Offset + + ret.ModuleInstanceOffset = 0 + offset += 8 + + if m.MemorySection != nil { + ret.LocalMemoryBegin = offset + // buffer base + memory size. + const localMemorySizeInOpaqueModuleContext = 16 + offset += localMemorySizeInOpaqueModuleContext + } else { + // Indicates that there's no local memory + ret.LocalMemoryBegin = -1 + } + + if m.ImportMemoryCount > 0 { + offset = align8(offset) + // *wasm.MemoryInstance + imported memory's owner (moduleContextOpaque) + const importedMemorySizeInOpaqueModuleContext = 16 + ret.ImportedMemoryBegin = offset + offset += importedMemorySizeInOpaqueModuleContext + } else { + // Indicates that there's no imported memory + ret.ImportedMemoryBegin = -1 + } + + if m.ImportFunctionCount > 0 { + offset = align8(offset) + ret.ImportedFunctionsBegin = offset + // Each function is stored wazevo.functionInstance. + size := int(m.ImportFunctionCount) * FunctionInstanceSize + offset += Offset(size) + } else { + ret.ImportedFunctionsBegin = -1 + } + + if globals := int(m.ImportGlobalCount) + len(m.GlobalSection); globals > 0 { + // Align to 16 bytes for globals, as f32/f64/v128 might be loaded via SIMD instructions. + offset = align16(offset) + ret.GlobalsBegin = offset + // Pointers to *wasm.GlobalInstance. + offset += Offset(globals) * 16 + } else { + ret.GlobalsBegin = -1 + } + + if tables := len(m.TableSection) + int(m.ImportTableCount); tables > 0 { + offset = align8(offset) + ret.TypeIDs1stElement = offset + offset += 8 // First element of TypeIDs. + + ret.TablesBegin = offset + // Pointers to *wasm.TableInstance. + offset += Offset(tables) * 8 + } else { + ret.TypeIDs1stElement = -1 + ret.TablesBegin = -1 + } + + if withListener { + offset = align8(offset) + ret.BeforeListenerTrampolines1stElement = offset + offset += 8 // First element of BeforeListenerTrampolines. + + ret.AfterListenerTrampolines1stElement = offset + offset += 8 // First element of AfterListenerTrampolines. + } else { + ret.BeforeListenerTrampolines1stElement = -1 + ret.AfterListenerTrampolines1stElement = -1 + } + + ret.DataInstances1stElement = offset + offset += 8 // First element of DataInstances. + + ret.ElementInstances1stElement = offset + offset += 8 // First element of ElementInstances. + + ret.TotalSize = int(align16(offset)) + return ret +} + +func align16(o Offset) Offset { + return (o + 15) &^ 15 +} + +func align8(o Offset) Offset { + return (o + 7) &^ 7 +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/perfmap.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/perfmap.go new file mode 100644 index 000000000..642c7f75d --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/perfmap.go @@ -0,0 +1,96 @@ +package wazevoapi + +import ( + "fmt" + "os" + "strconv" + "sync" +) + +var PerfMap *Perfmap + +func init() { + if PerfMapEnabled { + pid := os.Getpid() + filename := "/tmp/perf-" + strconv.Itoa(pid) + ".map" + + fh, err := os.OpenFile(filename, os.O_APPEND|os.O_RDWR|os.O_CREATE, 0o644) + if err != nil { + panic(err) + } + + PerfMap = &Perfmap{fh: fh} + } +} + +// Perfmap holds perfmap entries to be flushed into a perfmap file. 
+type Perfmap struct { + entries []entry + mux sync.Mutex + fh *os.File +} + +type entry struct { + index int + offset int64 + size uint64 + name string +} + +func (f *Perfmap) Lock() { + f.mux.Lock() +} + +func (f *Perfmap) Unlock() { + f.mux.Unlock() +} + +// AddModuleEntry adds a perfmap entry into the perfmap file. +// index is the index of the function in the module, offset is the offset of the function in the module, +// size is the size of the function, and name is the name of the function. +// +// Note that the entries are not flushed into the perfmap file until Flush is called, +// and the entries are module-scoped; Perfmap must be locked until Flush is called. +func (f *Perfmap) AddModuleEntry(index int, offset int64, size uint64, name string) { + e := entry{index: index, offset: offset, size: size, name: name} + if f.entries == nil { + f.entries = []entry{e} + return + } + f.entries = append(f.entries, e) +} + +// Flush writes the perfmap entries into the perfmap file where the entries are adjusted by the given `addr` and `functionOffsets`. +func (f *Perfmap) Flush(addr uintptr, functionOffsets []int) { + defer func() { + _ = f.fh.Sync() + }() + + for _, e := range f.entries { + if _, err := f.fh.WriteString(fmt.Sprintf("%x %s %s\n", + uintptr(e.offset)+addr+uintptr(functionOffsets[e.index]), + strconv.FormatUint(e.size, 16), + e.name, + )); err != nil { + panic(err) + } + } + f.entries = f.entries[:0] +} + +// Clear clears the perfmap entries not yet flushed. +func (f *Perfmap) Clear() { + f.entries = f.entries[:0] +} + +// AddEntry writes a perfmap entry directly into the perfmap file, not using the entries. +func (f *Perfmap) AddEntry(addr uintptr, size uint64, name string) { + _, err := f.fh.WriteString(fmt.Sprintf("%x %s %s\n", + addr, + strconv.FormatUint(size, 16), + name, + )) + if err != nil { + panic(err) + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/perfmap_disabled.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/perfmap_disabled.go new file mode 100644 index 000000000..bcc4e545c --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/perfmap_disabled.go @@ -0,0 +1,5 @@ +//go:build !perfmap + +package wazevoapi + +const PerfMapEnabled = false diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/perfmap_enabled.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/perfmap_enabled.go new file mode 100644 index 000000000..2a39879ec --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/perfmap_enabled.go @@ -0,0 +1,5 @@ +//go:build perfmap + +package wazevoapi + +const PerfMapEnabled = true diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/pool.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/pool.go new file mode 100644 index 000000000..3149fdc9e --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/pool.go @@ -0,0 +1,215 @@ +package wazevoapi + +const poolPageSize = 128 + +// Pool is a pool of T that can be allocated and reset. +// This is useful to avoid unnecessary allocations. +type Pool[T any] struct { + pages []*[poolPageSize]T + resetFn func(*T) + allocated, index int +} + +// NewPool returns a new Pool. +// resetFn is called when a new T is allocated in Pool.Allocate. 
+func NewPool[T any](resetFn func(*T)) Pool[T] { + var ret Pool[T] + ret.resetFn = resetFn + ret.Reset() + return ret +} + +// Allocated returns the number of allocated T currently in the pool. +func (p *Pool[T]) Allocated() int { + return p.allocated +} + +// Allocate allocates a new T from the pool. +func (p *Pool[T]) Allocate() *T { + if p.index == poolPageSize { + if len(p.pages) == cap(p.pages) { + p.pages = append(p.pages, new([poolPageSize]T)) + } else { + i := len(p.pages) + p.pages = p.pages[:i+1] + if p.pages[i] == nil { + p.pages[i] = new([poolPageSize]T) + } + } + p.index = 0 + } + ret := &p.pages[len(p.pages)-1][p.index] + if p.resetFn != nil { + p.resetFn(ret) + } + p.index++ + p.allocated++ + return ret +} + +// View returns the pointer to i-th item from the pool. +func (p *Pool[T]) View(i int) *T { + page, index := i/poolPageSize, i%poolPageSize + return &p.pages[page][index] +} + +// Reset resets the pool. +func (p *Pool[T]) Reset() { + p.pages = p.pages[:0] + p.index = poolPageSize + p.allocated = 0 +} + +// IDedPool is a pool of T that can be allocated and reset, with a way to get T by an ID. +type IDedPool[T any] struct { + pool Pool[T] + idToItems []*T + maxIDEncountered int +} + +// NewIDedPool returns a new IDedPool. +func NewIDedPool[T any](resetFn func(*T)) IDedPool[T] { + return IDedPool[T]{pool: NewPool[T](resetFn)} +} + +// GetOrAllocate returns the T with the given id. +func (p *IDedPool[T]) GetOrAllocate(id int) *T { + if p.maxIDEncountered < id { + p.maxIDEncountered = id + } + if id >= len(p.idToItems) { + p.idToItems = append(p.idToItems, make([]*T, id-len(p.idToItems)+1)...) + } + if p.idToItems[id] == nil { + p.idToItems[id] = p.pool.Allocate() + } + return p.idToItems[id] +} + +// Get returns the T with the given id, or nil if it's not allocated. +func (p *IDedPool[T]) Get(id int) *T { + if id >= len(p.idToItems) { + return nil + } + return p.idToItems[id] +} + +// Reset resets the pool. +func (p *IDedPool[T]) Reset() { + p.pool.Reset() + for i := range p.idToItems { + p.idToItems[i] = nil + } + p.maxIDEncountered = -1 +} + +// MaxIDEncountered returns the maximum id encountered so far. +func (p *IDedPool[T]) MaxIDEncountered() int { + return p.maxIDEncountered +} + +// arraySize is the size of the array used in VarLengthPool's arrayPool. +// This is chosen to be 8, which is empirically a good number among 8, 12, 16 and 20. +const arraySize = 8 + +// VarLengthPool is a pool of VarLength[T] that can be allocated and reset. +type ( + VarLengthPool[T any] struct { + arrayPool Pool[varLengthPoolArray[T]] + slicePool Pool[[]T] + } + // varLengthPoolArray wraps an array and keeps track of the next index to be used to avoid the heap allocation. + varLengthPoolArray[T any] struct { + arr [arraySize]T + next int + } +) + +// VarLength is a variable length array that can be reused via a pool. +type VarLength[T any] struct { + arr *varLengthPoolArray[T] + slc *[]T +} + +// NewVarLengthPool returns a new VarLengthPool. +func NewVarLengthPool[T any]() VarLengthPool[T] { + return VarLengthPool[T]{ + arrayPool: NewPool[varLengthPoolArray[T]](func(v *varLengthPoolArray[T]) { + v.next = 0 + }), + slicePool: NewPool[[]T](func(i *[]T) { + *i = (*i)[:0] + }), + } +} + +// NewNilVarLength returns a new VarLength[T] with a nil backing. +func NewNilVarLength[T any]() VarLength[T] { + return VarLength[T]{} +} + +// Allocate allocates a new VarLength[T] from the pool. 
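+// knownMin is a size hint: requests of at most arraySize elements are backed by a pooled fixed-size array, and larger ones by a pooled slice.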
+func (p *VarLengthPool[T]) Allocate(knownMin int) VarLength[T] { + if knownMin <= arraySize { + arr := p.arrayPool.Allocate() + return VarLength[T]{arr: arr} + } + slc := p.slicePool.Allocate() + return VarLength[T]{slc: slc} +} + +// Reset resets the pool. +func (p *VarLengthPool[T]) Reset() { + p.arrayPool.Reset() + p.slicePool.Reset() +} + +// Append appends items to the backing slice just like the `append` builtin function in Go. +func (i VarLength[T]) Append(p *VarLengthPool[T], items ...T) VarLength[T] { + if i.slc != nil { + *i.slc = append(*i.slc, items...) + return i + } + + if i.arr == nil { + i.arr = p.arrayPool.Allocate() + } + + arr := i.arr + if arr.next+len(items) <= arraySize { + for _, item := range items { + arr.arr[arr.next] = item + arr.next++ + } + } else { + slc := p.slicePool.Allocate() + // Copy the array to the slice. + for ptr := 0; ptr < arr.next; ptr++ { + *slc = append(*slc, arr.arr[ptr]) + } + i.slc = slc + *i.slc = append(*i.slc, items...) + } + return i +} + +// View returns the backing slice. +func (i VarLength[T]) View() []T { + if i.slc != nil { + return *i.slc + } else if i.arr != nil { + arr := i.arr + return arr.arr[:arr.next] + } + return nil +} + +// Cut cuts the backing slice to the given length. +// Precondition: n <= len(i.backing). +func (i VarLength[T]) Cut(n int) { + if i.slc != nil { + *i.slc = (*i.slc)[:n] + } else if i.arr != nil { + i.arr.next = n + } +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/ptr.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/ptr.go new file mode 100644 index 000000000..f21e1a5d8 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/ptr.go @@ -0,0 +1,15 @@ +package wazevoapi + +import "unsafe" + +// PtrFromUintptr resurrects the original *T from the given uintptr. +// The caller of this function MUST be sure that ptr is valid. +func PtrFromUintptr[T any](ptr uintptr) *T { + // Wraps ptrs as the double pointer in order to avoid the unsafe access as detected by race detector. + // + // For example, if we have (*function)(unsafe.Pointer(ptr)) instead, then the race detector's "checkptr" + // subroutine wanrs as "checkptr: pointer arithmetic result points to invalid allocation" + // https://github.com/golang/go/blob/1ce7fcf139417d618c2730010ede2afb41664211/src/runtime/checkptr.go#L69 + var wrapped *uintptr = &ptr + return *(**T)(unsafe.Pointer(wrapped)) +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/queue.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/queue.go new file mode 100644 index 000000000..e3118fa69 --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/queue.go @@ -0,0 +1,26 @@ +package wazevoapi + +// Queue is the resettable queue where the underlying slice is reused. 
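+// The zero value is ready to use.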
+type Queue[T any] struct { + index int + Data []T +} + +func (q *Queue[T]) Enqueue(v T) { + q.Data = append(q.Data, v) +} + +func (q *Queue[T]) Dequeue() (ret T) { + ret = q.Data[q.index] + q.index++ + return +} + +func (q *Queue[T]) Empty() bool { + return q.index >= len(q.Data) +} + +func (q *Queue[T]) Reset() { + q.index = 0 + q.Data = q.Data[:0] +} diff --git a/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/resetmap.go b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/resetmap.go new file mode 100644 index 000000000..7177fbb4b --- /dev/null +++ b/vendor/github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi/resetmap.go @@ -0,0 +1,13 @@ +package wazevoapi + +// ResetMap resets the map to an empty state, or creates a new map if it is nil. +func ResetMap[K comparable, V any](m map[K]V) map[K]V { + if m == nil { + m = make(map[K]V) + } else { + for v := range m { + delete(m, v) + } + } + return m +} |
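The wazevoapi helpers above (Pool, VarLengthPool, Queue, ResetMap) all follow the same reset-and-reuse pattern: allocate scratch data during one compilation, then call Reset so the backing storage is kept for the next one. The sketch below is illustrative only and is not part of the vendored package; the node type and the exampleReuse function are hypothetical, and the code is assumed to live inside package wazevoapi since these APIs are internal.

package wazevoapi

// node is a hypothetical element type used only for this illustration.
type node struct{ id int }

// exampleReuse sketches the intended allocate -> use -> Reset cycle.
func exampleReuse() {
	pool := NewPool[node](func(n *node) { n.id = 0 }) // resetFn runs on every Allocate.
	var q Queue[*node]                                // the zero value is usable.
	vp := NewVarLengthPool[int]()
	seen := map[int]struct{}{}

	for iteration := 0; iteration < 2; iteration++ {
		a, b := pool.Allocate(), pool.Allocate()
		a.id, b.id = 1, 2

		q.Enqueue(a)
		q.Enqueue(b)
		for !q.Empty() {
			n := q.Dequeue()
			seen[n.id] = struct{}{}
		}

		// VarLength keeps small lists in a pooled fixed-size array and only
		// spills to a pooled slice once they outgrow arraySize elements.
		vs := vp.Allocate(2)
		vs = vs.Append(&vp, 10, 20, 30)
		_ = vs.View() // [10 20 30]

		// Reset everything: the underlying arrays, slices, and map storage
		// are retained for the next iteration instead of being reallocated.
		pool.Reset()
		q.Reset()
		vp.Reset()
		seen = ResetMap(seen)
	}
}

As with Pool's own comment, the point of the pattern is to avoid re-allocating per-compilation scratch data for every function being compiled.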