Diffstat (limited to 'vendor/github.com/ugorji/go/codec/helper.go')
| -rw-r--r-- | vendor/github.com/ugorji/go/codec/helper.go | 1750 |
1 file changed, 1012 insertions, 738 deletions
diff --git a/vendor/github.com/ugorji/go/codec/helper.go b/vendor/github.com/ugorji/go/codec/helper.go index ecd87ba53..e737ef2b9 100644 --- a/vendor/github.com/ugorji/go/codec/helper.go +++ b/vendor/github.com/ugorji/go/codec/helper.go @@ -190,14 +190,17 @@ package codec // These are the TransientAddrK and TransientAddr2K methods of decPerType. import ( + "bytes" "encoding" "encoding/binary" + "encoding/hex" "errors" "fmt" "io" "math" "reflect" "runtime" + "runtime/debug" "sort" "strconv" "strings" @@ -207,18 +210,25 @@ import ( "unicode/utf8" ) -// if debugging is true, then -// - within Encode/Decode, do not recover from panic's -// - etc -// -// Note: Negative tests that check for errors will fail, so only use this -// when debugging, and run only one test at a time preferably. -// -// Note: RPC tests depend on getting the error from an Encode/Decode call. -// Consequently, they will always fail if debugging = true. -const debugging = false - const ( + // if debugging is true, then + // - within Encode/Decode, do not recover from panics + // - etc + // + // Note: Negative tests that check for errors will fail, so only use this + // when debugging, and preferably run only one test at a time. + // + // Note: RPC tests depend on getting the error from an Encode/Decode call. + // Consequently, they will always fail if debugging = true. + // + // It is generally set to false + debugging = false + + // if debugLogging is false, debugf calls will be a no-op. + // + // It is generally set to true + debugLogging = true + // containerLenUnknown is length returned from Read(Map|Array)Len // when a format doesn't know a priori. // For example, json doesn't pre-determine the length of a container (sequence/map). @@ -240,9 +250,6 @@ const ( // This constant flag will enable or disable it. supportMarshalInterfaces = true - // bytesFreeListNoCache is used for debugging, when we want to skip using a cache of []byte. - bytesFreeListNoCache = false - // size of the cacheline: defaulting to value for archs: amd64, arm64, 386 // should use "runtime/internal/sys".CacheLineSize, but that is not exposed. cacheLineSize = 64 @@ -250,11 +257,38 @@ const ( wordSizeBits = 32 << (^uint(0) >> 63) // strconv.IntSize wordSize = wordSizeBits / 8 + // byteBufSize is the default size of []byte used + // possibly for bufioWriter, etc + byteBufSize = 1 << 10 // 4:16, 6:64, 8:256, 10:1024 + // MARKER: determines whether to skip calling fastpath(En|De)codeTypeSwitch. // Calling the fastpath switch in encode() or decode() could be redundant, // as we still have to introspect it again within fnLoad // to determine the function to use for values of that type. skipFastpathTypeSwitchInDirectCall = false + + // maxArrayLen is the maximum value of int (derived from the bit size of uint), + // and determines the maximum length of any array. + maxArrayLen = 1<<((32<<(^uint(0)>>63))-1) - 1 + + // ---- below this line, useXXX consts should be true + + usePoolForSFIs = true + useArenaForSFIs = true + + usePoolForTypeInfoLoad = true + + usePoolForSideEncode = true + + usePoolForSideDecode = true + + useBytesFreeList = true + + useSfiRvFreeList = true + + // ---- below this line, useXXX consts should be false + + useBytesFreeListPutGetSeparateCalls = false ) const cpu32Bit = ^uint(0)>>32 == 0 @@ -267,16 +301,6 @@ const ( rkindChan = rkind(reflect.Chan) ) -type mapKeyFastKind uint8 - -const ( - mapKeyFastKind32 = iota + 1 - mapKeyFastKind32ptr - mapKeyFastKind64 - mapKeyFastKind64ptr - mapKeyFastKindStr -) - var ( // use a global mutex to ensure each Handle is initialized.
// We do this, so we don't have to store the basicHandle mutex @@ -289,7 +313,10 @@ var ( digitCharBitset bitset256 numCharBitset bitset256 whitespaceCharBitset bitset256 - asciiAlphaNumBitset bitset256 + // asciiAlphaNumBitset bitset256 + + jsonCharHtmlSafeBitset bitset256 + jsonCharSafeBitset bitset256 // numCharWithExpBitset64 bitset64 // numCharNoExpBitset64 bitset64 @@ -313,8 +340,6 @@ var ( // scalarBitset sets bit for all kinds which are scalars/primitives and thus immutable scalarBitset bitset32 - mapKeyFastKindVals [32]mapKeyFastKind - // codecgen is set to true by codecgen, so that tests, etc can use this information as needed. codecgen bool @@ -322,6 +347,21 @@ var ( zeroByteSlice = oneByteArr[:0:0] eofReader devNullReader + + // string containing all values of a uint8 in sequence. + // We maintain a [256]byte slice, for efficiently making strings with one byte. + // str256 string + + // handleNewFns []handleNewFn + + basicErrDecorator errDecoratorDef + + // sentinel value passed to panicValToErr, signifying to call recover yourself + callRecoverSentinel = new(byte) + + // debugstackOnce is used to put a single debugStack at a certain point (during debugging). + // To use, just call debugstackOnce() wherever you need to see a stack only once. + debugstackOnce = sync.OnceFunc(debug.PrintStack) ) var ( @@ -340,41 +380,13 @@ var ( errNoFormatHandle = errors.New("no handle (cannot identify format)") ) -var pool4tiload = sync.Pool{ +var poolForTypeInfoLoad = sync.Pool{ New: func() interface{} { - return &typeInfoLoad{ - etypes: make([]uintptr, 0, 4), - sfis: make([]structFieldInfo, 0, 4), - sfiNames: make(map[string]uint16, 4), - } + return newTypeInfoLoad() }, } func init() { - xx := func(f mapKeyFastKind, k ...reflect.Kind) { - for _, v := range k { - mapKeyFastKindVals[byte(v)&31] = f // 'v % 32' equal to 'v & 31' - } - } - - var f mapKeyFastKind - - f = mapKeyFastKind64 - if wordSizeBits == 32 { - f = mapKeyFastKind32 - } - xx(f, reflect.Int, reflect.Uint, reflect.Uintptr) - - f = mapKeyFastKind64ptr - if wordSizeBits == 32 { - f = mapKeyFastKind32ptr - } - xx(f, reflect.Ptr) - - xx(mapKeyFastKindStr, reflect.String) - xx(mapKeyFastKind32, reflect.Uint32, reflect.Int32, reflect.Float32) - xx(mapKeyFastKind64, reflect.Uint64, reflect.Int64, reflect.Float64) - numBoolBitset. set(byte(reflect.Bool)). set(byte(reflect.Int)). @@ -425,9 +437,9 @@ func init() { // set(byte(reflect.String)) for i := byte(0); i <= utf8.RuneSelf; i++ { - if (i >= '0' && i <= '9') || (i >= 'a' && i <= 'z') || (i >= 'A' && i <= 'Z') { - asciiAlphaNumBitset.set(i) - } + // if (i >= '0' && i <= '9') || (i >= 'a' && i <= 'z') || (i >= 'A' && i <= 'Z') || i == '_' { + // asciiAlphaNumBitset.set(i) + // } switch i { case ' ', '\t', '\r', '\n': whitespaceCharBitset.set(i) @@ -440,16 +452,74 @@ func init() { numCharBitset.set(i) } } + + // populate the safe values as true: note: ASCII control characters are (0-31) + // jsonCharSafeBitset: all true except (0-31) " \ + // jsonCharHtmlSafeBitset: all true except (0-31) " \ < > & + for i := byte(32); i < utf8.RuneSelf; i++ { + switch i { + case '"', '\\': + case '<', '>', '&': + jsonCharSafeBitset.set(i) // = true + default: + jsonCharSafeBitset.set(i) + jsonCharHtmlSafeBitset.set(i) + } + } } -// driverStateManager supports the runtime state of an (enc|dec)Driver. 
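The init loops above classify bytes by setting bits in bitset256 values (whitespaceCharBitset, numCharBitset, jsonCharSafeBitset, jsonCharHtmlSafeBitset). The bitset256 type itself lives outside this hunk; what follows is a minimal sketch of such a byte-indexed bitset, assuming only the set/isset names used by the calls above:

package main

import "fmt"

// bitset256 sketch: one bit per possible byte value, packed into 32 bytes
// (half of the cacheLineSize = 64 noted earlier in this diff).
type bitset256 [32]byte

// set marks pos as a member; returning the receiver allows the chained
// numBoolBitset.set(...).set(...) style seen in init above.
func (x *bitset256) set(pos byte) *bitset256 {
    x[pos>>3] |= 1 << (pos & 7) // byte index = pos/8, bit index = pos%8
    return x
}

func (x *bitset256) isset(pos byte) bool {
    return x[pos>>3]&(1<<(pos&7)) != 0
}

func main() {
    var ws bitset256
    for _, c := range []byte{' ', '\t', '\r', '\n'} {
        ws.set(c)
    }
    fmt.Println(ws.isset(' '), ws.isset('x')) // true false
}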
+func searchRtids(s []uintptr, v uintptr) (i uint, ok bool) { + var h uint + var j uint = uint(len(s)) +LOOP: + if i < j { + h = (i + j) >> 1 // avoid overflow when computing h // h = i + (j-i)/2 + if s[h] < v { + i = h + 1 + } else { + j = h + } + goto LOOP + } + return i, i < uint(len(s)) && s[i] == v +} + +// circularRefChecker holds interfaces during an encoding (if CheckCircularRef=true) // -// During a side(En|De)code call, we can capture the state, -// and then restore it later to continue the primary encoding/decoding. -type driverStateManager interface { - resetState() - captureState() interface{} - restoreState(state interface{}) +// We considered using a []uintptr (slice of pointer addresses) retrievable via rv.UnsafeAddr. +// However, it is possible for the same pointer to point to 2 different types e.g. +// +// type T struct { tHelper } +// Here, for var v T; &v and &v.tHelper are the same pointer. +// +// Consequently, we need a tuple of type and pointer, which interface{} natively provides. +// +// Note: the following references, if seen, can lead to circular references +// - Pointer to struct/slice/array/map (any container) +// - map (reference, where a value in a kv pair could be the map itself) +// - addr of slice/array element +// - addr of struct field +type circularRefChecker []interface{} // []uintptr + +func (ci *circularRefChecker) push(v interface{}) { + for _, vv := range *ci { + if eq4i(v, vv) { // error if sptr already seen + halt.errorf("circular reference found: %p, %T", v, v) + } + } + *ci = append(*ci, v) +} + +func (_ *circularRefChecker) canPushElemKind(elemKind reflect.Kind) bool { + switch elemKind { + case reflect.Struct, reflect.Slice, reflect.Array, reflect.Map: + return true + } + return false +} + +func (ci *circularRefChecker) pop(num int) { + *ci = (*ci)[:len(*ci)-num] } type bdAndBdread struct { @@ -457,10 +527,7 @@ type bdAndBdread struct { bd byte } -func (x bdAndBdread) captureState() interface{} { return x } -func (x *bdAndBdread) resetState() { x.bd, x.bdRead = 0, false } -func (x *bdAndBdread) reset() { x.resetState() } -func (x *bdAndBdread) restoreState(v interface{}) { *x = v.(bdAndBdread) } +func (x *bdAndBdread) reset() { x.bd, x.bdRead = 0, false } type clsErr struct { err error // error on closing @@ -570,6 +637,7 @@ type fauxUnion struct { b bool // state + a dBytesAttachState v valueType } @@ -580,12 +648,21 @@ type typeInfoLoad struct { sfiNames map[string]uint16 } +func newTypeInfoLoad() *typeInfoLoad { + return &typeInfoLoad{ + etypes: make([]uintptr, 0, 4), + sfis: make([]structFieldInfo, 0, 4), + sfiNames: make(map[string]uint16, 4), + } +} + func (x *typeInfoLoad) reset() { x.etypes = x.etypes[:0] x.sfis = x.sfis[:0] - for k := range x.sfiNames { // optimized to zero the map - delete(x.sfiNames, k) - } + clear(x.sfiNames) + // for k := range x.sfiNames { // optimized to zero the map + // delete(x.sfiNames, k) + // } } // mirror json.Marshaler and json.Unmarshaler here, @@ -606,6 +683,15 @@ type isCodecEmptyer interface { IsCodecEmpty() bool } +type outOfBoundsError struct { + capacity uint + requested uint +} + +func (x *outOfBoundsError) Error() string { + return sprintf("out of bounds with capacity = %d, requested %d", x.capacity, x.requested) +} + type codecError struct { err error name string @@ -638,6 +724,8 @@ func wrapCodecErr(in error, name string, numbytesread int, encode bool) (out err var ( bigen bigenHelper + // bigenB bigenWriter[bytesEncAppenderM] + // bigenIO bigenWriter[bufioEncWriterM] bigenstd =
binary.BigEndian @@ -671,11 +759,10 @@ var ( jsonMarshalerTyp = reflect.TypeOf((*jsonMarshaler)(nil)).Elem() jsonUnmarshalerTyp = reflect.TypeOf((*jsonUnmarshaler)(nil)).Elem() - selferTyp = reflect.TypeOf((*Selfer)(nil)).Elem() - missingFielderTyp = reflect.TypeOf((*MissingFielder)(nil)).Elem() - iszeroTyp = reflect.TypeOf((*isZeroer)(nil)).Elem() - isCodecEmptyerTyp = reflect.TypeOf((*isCodecEmptyer)(nil)).Elem() - isSelferViaCodecgenerTyp = reflect.TypeOf((*isSelferViaCodecgener)(nil)).Elem() + selferTyp = reflect.TypeOf((*Selfer)(nil)).Elem() + missingFielderTyp = reflect.TypeOf((*MissingFielder)(nil)).Elem() + iszeroTyp = reflect.TypeOf((*isZeroer)(nil)).Elem() + isCodecEmptyerTyp = reflect.TypeOf((*isCodecEmptyer)(nil)).Elem() uint8TypId = rt2id(uint8Typ) uint8SliceTypId = rt2id(uint8SliceTyp) @@ -732,10 +819,6 @@ type Selfer interface { CodecDecodeSelf(*Decoder) } -type isSelferViaCodecgener interface { - codecSelferViaCodecgen() -} - // MissingFielder defines the interface allowing structs to internally decode or encode // values which do not map to struct fields. // @@ -782,6 +865,18 @@ type MapBySlice interface { MapBySlice() } +// const ( +// rtidfn_Enc_IO = iota +// rtidfn_Enc_IO_NoExt +// rtidfn_Dec_IO +// rtidfn_Dec_IO_NoExt + +// rtidfn_Enc_Bytes +// rtidfn_Enc_Bytes_NoExt +// rtidfn_Dec_Bytes +// rtidfn_Dec_Bytes_NoExt +// ) + // basicHandleRuntimeState holds onto all BasicHandle runtime and cached config information. // // Storing this outside BasicHandle allows us to create shallow copies of a Handle, which can be modified // temporarily when running tests in parallel, without the risk that a test executing // in parallel with other tests sees transient modified values not meant for it. type basicHandleRuntimeState struct { - // these are used during runtime. - // At init time, they should have nothing in them. - rtidFns atomicRtidFnSlice - rtidFnsNoExt atomicRtidFnSlice - // Note: basicHandleRuntimeState is not comparable, due to these slices here (extHandle, intf2impls). // If *[]T is used instead, this becomes comparable, at the cost of extra indirection. // These slices are used all the time, so keep as slices (not pointers). extHandle intf2impls + // these keep track of the []codecRtidFns for this handle. + // We used a non-generic value so we can: + // - keep these within BasicHandle + // - work around recursive limitations of go's generics + rtidFnsEncIO, + rtidFnsEncNoExtIO, + rtidFnsEncBytes, + rtidFnsEncNoExtBytes, + rtidFnsDecIO, + rtidFnsDecNoExtIO, + rtidFnsDecBytes, + rtidFnsDecNoExtBytes atomicRtidFnSlice + + sideEncPool sync.Pool + sideDecPool sync.Pool + mu sync.Mutex jsonHandle bool binaryHandle bool - // timeBuiltin is initialized from TimeNotBuiltin, and used internally. + // timeBuiltin is initialized from TimeNotBuiltin, and used internally by setExt. // once initialized, it cannot be changed, as the function for encoding/decoding time.Time // will have been cached and the TimeNotBuiltin value will not be consulted thereafter. timeBuiltin bool - _ bool // padding + // _ bool // padding } // BasicHandle encapsulates the common options and extension functions.
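The Selfer interface shown in this hunk lets a type take over its own encoding and decoding entirely. A minimal sketch of an implementation (Temp is illustrative, not from the library); here it encodes as a bare number rather than as a struct/map:

package main

import (
    "fmt"

    "github.com/ugorji/go/codec"
)

// Temp implements codec.Selfer, so the function lookup uses these methods
// instead of the reflection-driven struct path.
type Temp struct{ Celsius float64 }

func (t *Temp) CodecEncodeSelf(e *codec.Encoder) { e.MustEncode(t.Celsius) }
func (t *Temp) CodecDecodeSelf(d *codec.Decoder) { d.MustDecode(&t.Celsius) }

func main() {
    var buf []byte
    _ = codec.NewEncoderBytes(&buf, &codec.JsonHandle{}).Encode(&Temp{21.5})
    fmt.Println(string(buf)) // 21.5, not {"Celsius":21.5}
}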
@@ -827,7 +933,7 @@ type BasicHandle struct { // If not configured, the default TypeInfos is used, which uses struct tag keys: codec, json TypeInfos *TypeInfos - *basicHandleRuntimeState + basicHandleRuntimeState // ---- cache line @@ -865,6 +971,7 @@ type BasicHandle struct { // ---- cache line inited uint32 // holds if inited, and also handle flags (binary encoding, json handler, etc) + // name string } // initHandle does a one-time initialization of the handle. @@ -886,17 +993,48 @@ func initHandle(hh Handle) { // is not sufficient, since a race condition can occur within init(Handle) function. // init is made noinline, so that this function can be inlined by its caller. if atomic.LoadUint32(&x.inited) == 0 { - x.initHandle(hh) + initHandle2(x, hh) + } +} + +// initHandle2 should be called only from codec.initHandle global function. +// make it uninlineable, as it is called at most once for each handle. +// +//go:noinline +func initHandle2(x *BasicHandle, hh Handle) { + handleInitMu.Lock() + defer handleInitMu.Unlock() // use defer, as halt may panic below + if x.inited != 0 { + return + } + x.jsonHandle = hh.isJson() + x.binaryHandle = hh.isBinary() + x.basicInit() + + x.sideEncPool.New = func() any { + return NewEncoderBytes(nil, hh).encoderI + } + x.sideDecPool.New = func() any { + return NewDecoderBytes(nil, hh).decoderI } + + // hh.init() + + atomic.StoreUint32(&x.inited, 1) } func (x *BasicHandle) basicInit() { - x.rtidFns.store(nil) - x.rtidFnsNoExt.store(nil) + // ensure MapType and SliceType are of correct type + if x.MapType != nil && x.MapType.Kind() != reflect.Map { + halt.onerror(errMapTypeNotMapKind) + } + if x.SliceType != nil && x.SliceType.Kind() != reflect.Slice { + halt.onerror(errSliceTypeNotSliceKind) + } x.timeBuiltin = !x.TimeNotBuiltin } -func (x *BasicHandle) init() {} +// func (x *BasicHandle) init() {} func (x *BasicHandle) isInited() bool { return atomic.LoadUint32(&x.inited) != 0 @@ -907,20 +1045,6 @@ func (x *BasicHandle) clearInited() { atomic.StoreUint32(&x.inited, 0) } -// TimeBuiltin returns whether time.Time OOTB support is used, -// based on the initial configuration of TimeNotBuiltin -func (x *basicHandleRuntimeState) TimeBuiltin() bool { - return x.timeBuiltin -} - -func (x *basicHandleRuntimeState) isJs() bool { - return x.jsonHandle -} - -func (x *basicHandleRuntimeState) isBe() bool { - return x.binaryHandle -} - func (x *basicHandleRuntimeState) setExt(rt reflect.Type, tag uint64, ext Ext) (err error) { rk := rt.Kind() for rk == reflect.Ptr { @@ -957,32 +1081,6 @@ func (x *basicHandleRuntimeState) setExt(rt reflect.Type, tag uint64, ext Ext) ( return } -// initHandle should be called only from codec.initHandle global function. -// make it uninlineable, as it is called at most once for each handle. 
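initHandle and initHandle2 above form a classic double-checked initialization: an inlineable atomic fast path, then a mutex-guarded slow path that re-checks inited before doing the one-time work. A standalone sketch of the same pattern:

package main

import (
    "sync"
    "sync/atomic"
)

type handle struct {
    inited uint32
    mu     sync.Mutex
    state  string
}

// get mirrors initHandle/initHandle2: a cheap atomic load on the hot path,
// and a locked re-check so concurrent callers initialize exactly once.
func (h *handle) get() string {
    if atomic.LoadUint32(&h.inited) == 0 {
        h.mu.Lock()
        if h.inited == 0 { // re-check under the lock
            h.state = "ready"
            atomic.StoreUint32(&h.inited, 1)
        }
        h.mu.Unlock()
    }
    return h.state
}

func main() { _ = (&handle{}).get() }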
-// -//go:noinline -func (x *BasicHandle) initHandle(hh Handle) { - handleInitMu.Lock() - defer handleInitMu.Unlock() // use defer, as halt may panic below - if x.inited == 0 { - if x.basicHandleRuntimeState == nil { - x.basicHandleRuntimeState = new(basicHandleRuntimeState) - } - x.jsonHandle = hh.isJson() - x.binaryHandle = hh.isBinary() - // ensure MapType and SliceType are of correct type - if x.MapType != nil && x.MapType.Kind() != reflect.Map { - halt.onerror(errMapTypeNotMapKind) - } - if x.SliceType != nil && x.SliceType.Kind() != reflect.Slice { - halt.onerror(errSliceTypeNotSliceKind) - } - x.basicInit() - hh.init() - atomic.StoreUint32(&x.inited, 1) - } -} - func (x *BasicHandle) getBasicHandle() *BasicHandle { return x } @@ -994,305 +1092,13 @@ func (x *BasicHandle) typeInfos() *TypeInfos { return defTypeInfos } +// getTypeInfo expects a non-pointer func (x *BasicHandle) getTypeInfo(rtid uintptr, rt reflect.Type) (pti *typeInfo) { return x.typeInfos().get(rtid, rt) } -func findRtidFn(s []codecRtidFn, rtid uintptr) (i uint, fn *codecFn) { - // binary search. adapted from sort/search.go. - // Note: we use goto (instead of for loop) so this can be inlined. - - // h, i, j := 0, 0, len(s) - var h uint // var h, i uint - var j = uint(len(s)) -LOOP: - if i < j { - h = (i + j) >> 1 // avoid overflow when computing h // h = i + (j-i)/2 - if s[h].rtid < rtid { - i = h + 1 - } else { - j = h - } - goto LOOP - } - if i < uint(len(s)) && s[i].rtid == rtid { - fn = s[i].fn - } - return -} - -func (x *BasicHandle) fn(rt reflect.Type) (fn *codecFn) { - return x.fnVia(rt, x.typeInfos(), &x.rtidFns, x.CheckCircularRef, true) -} - -func (x *BasicHandle) fnNoExt(rt reflect.Type) (fn *codecFn) { - return x.fnVia(rt, x.typeInfos(), &x.rtidFnsNoExt, x.CheckCircularRef, false) -} - -func (x *basicHandleRuntimeState) fnVia(rt reflect.Type, tinfos *TypeInfos, fs *atomicRtidFnSlice, checkCircularRef, checkExt bool) (fn *codecFn) { - rtid := rt2id(rt) - sp := fs.load() - if sp != nil { - if _, fn = findRtidFn(sp, rtid); fn != nil { - return - } - } - - fn = x.fnLoad(rt, rtid, tinfos, checkCircularRef, checkExt) - x.mu.Lock() - sp = fs.load() - // since this is an atomic load/store, we MUST use a different array each time, - // else we have a data race when a store is happening simultaneously with a findRtidFn call. - if sp == nil { - sp = []codecRtidFn{{rtid, fn}} - fs.store(sp) - } else { - idx, fn2 := findRtidFn(sp, rtid) - if fn2 == nil { - sp2 := make([]codecRtidFn, len(sp)+1) - copy(sp2[idx+1:], sp[idx:]) - copy(sp2, sp[:idx]) - sp2[idx] = codecRtidFn{rtid, fn} - fs.store(sp2) - } - } - x.mu.Unlock() - return -} - -func fnloadFastpathUnderlying(ti *typeInfo) (f *fastpathE, u reflect.Type) { - var rtid uintptr - var idx int - rtid = rt2id(ti.fastpathUnderlying) - idx = fastpathAvIndex(rtid) - if idx == -1 { - return - } - f = &fastpathAv[idx] - if uint8(reflect.Array) == ti.kind { - u = reflectArrayOf(ti.rt.Len(), ti.elem) - } else { - u = f.rt - } - return -} - -func (x *basicHandleRuntimeState) fnLoad(rt reflect.Type, rtid uintptr, tinfos *TypeInfos, checkCircularRef, checkExt bool) (fn *codecFn) { - fn = new(codecFn) - fi := &(fn.i) - ti := tinfos.get(rtid, rt) - fi.ti = ti - rk := reflect.Kind(ti.kind) - - // anything can be an extension except the built-in ones: time, raw and rawext. - // ensure we check for these types, then if extension, before checking if - // it implementes one of the pre-declared interfaces. 
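The function-loading logic above resolves encode/decode functions in a fixed order: builtins (time.Time, Raw, RawExt), then registered extensions, then Selfer, then BinaryMarshaler for binary handles, JSONMarshaler for the json handle, and TextMarshaler last. A sketch of what that precedence means for a caller, assuming it holds as documented (the Hue type is illustrative):

package main

import (
    "fmt"

    "github.com/ugorji/go/codec"
)

// Hue implements both the json and text marshaler pairs. Per the precedence
// above, the json handle should prefer MarshalJSON; a text handle without
// json support would fall back to MarshalText.
type Hue int

func (h Hue) MarshalJSON() ([]byte, error) { return []byte(`{"hue":1}`), nil }
func (h *Hue) UnmarshalJSON([]byte) error  { return nil } // no-op for the sketch
func (h Hue) MarshalText() ([]byte, error) { return []byte("hue-1"), nil }
func (h *Hue) UnmarshalText([]byte) error  { return nil } // no-op for the sketch

func main() {
    var buf []byte
    _ = codec.NewEncoderBytes(&buf, &codec.JsonHandle{}).Encode(Hue(1))
    fmt.Println(string(buf)) // {"hue":1}: MarshalJSON wins over MarshalText
}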
- - fi.addrDf = true - // fi.addrEf = true - - if rtid == timeTypId && x.timeBuiltin { - fn.fe = (*Encoder).kTime - fn.fd = (*Decoder).kTime - } else if rtid == rawTypId { - fn.fe = (*Encoder).raw - fn.fd = (*Decoder).raw - } else if rtid == rawExtTypId { - fn.fe = (*Encoder).rawExt - fn.fd = (*Decoder).rawExt - fi.addrD = true - fi.addrE = true - } else if xfFn := x.getExt(rtid, checkExt); xfFn != nil { - fi.xfTag, fi.xfFn = xfFn.tag, xfFn.ext - fn.fe = (*Encoder).ext - fn.fd = (*Decoder).ext - fi.addrD = true - if rk == reflect.Struct || rk == reflect.Array { - fi.addrE = true - } - } else if (ti.flagSelfer || ti.flagSelferPtr) && - !(checkCircularRef && ti.flagSelferViaCodecgen && ti.kind == byte(reflect.Struct)) { - // do not use Selfer generated by codecgen if it is a struct and CheckCircularRef=true - fn.fe = (*Encoder).selferMarshal - fn.fd = (*Decoder).selferUnmarshal - fi.addrD = ti.flagSelferPtr - fi.addrE = ti.flagSelferPtr - } else if supportMarshalInterfaces && x.isBe() && - (ti.flagBinaryMarshaler || ti.flagBinaryMarshalerPtr) && - (ti.flagBinaryUnmarshaler || ti.flagBinaryUnmarshalerPtr) { - fn.fe = (*Encoder).binaryMarshal - fn.fd = (*Decoder).binaryUnmarshal - fi.addrD = ti.flagBinaryUnmarshalerPtr - fi.addrE = ti.flagBinaryMarshalerPtr - } else if supportMarshalInterfaces && !x.isBe() && x.isJs() && - (ti.flagJsonMarshaler || ti.flagJsonMarshalerPtr) && - (ti.flagJsonUnmarshaler || ti.flagJsonUnmarshalerPtr) { - //If JSON, we should check JSONMarshal before textMarshal - fn.fe = (*Encoder).jsonMarshal - fn.fd = (*Decoder).jsonUnmarshal - fi.addrD = ti.flagJsonUnmarshalerPtr - fi.addrE = ti.flagJsonMarshalerPtr - } else if supportMarshalInterfaces && !x.isBe() && - (ti.flagTextMarshaler || ti.flagTextMarshalerPtr) && - (ti.flagTextUnmarshaler || ti.flagTextUnmarshalerPtr) { - fn.fe = (*Encoder).textMarshal - fn.fd = (*Decoder).textUnmarshal - fi.addrD = ti.flagTextUnmarshalerPtr - fi.addrE = ti.flagTextMarshalerPtr - } else { - if fastpathEnabled && (rk == reflect.Map || rk == reflect.Slice || rk == reflect.Array) { - // by default (without using unsafe), - // if an array is not addressable, converting from an array to a slice - // requires an allocation (see helper_not_unsafe.go: func rvGetSlice4Array). - // - // (Non-addressable arrays mostly occur as keys/values from a map). - // - // However, fastpath functions are mostly for slices of numbers or strings, - // which are small by definition and thus allocation should be fast/cheap in time. - // - // Consequently, the value of doing this quick allocation to elide the overhead cost of - // non-optimized (not-unsafe) reflection is a fair price. 
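The fast-path reuse for named container types described here hinges on converting a reflect.Value of the named type to its underlying type, so the pre-generated []int (etc.) functions can be applied. A standalone sketch of that conversion (IDs is illustrative):

package main

import (
    "fmt"
    "reflect"
)

// IDs is a named type whose underlying type []int would have a fastpath.
type IDs []int

func main() {
    v := reflect.ValueOf(IDs{1, 2, 3})
    u := reflect.TypeOf([]int(nil))

    // For slices, Convert copies only the slice header, not the elements,
    // which is why routing named types through the underlying fastpath is cheap.
    c := v.Convert(u)
    fmt.Println(c.Type(), c.Interface()) // []int [1 2 3]
}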
- var rtid2 uintptr - if !ti.flagHasPkgPath { // un-named type (slice or mpa or array) - rtid2 = rtid - if rk == reflect.Array { - rtid2 = rt2id(ti.key) // ti.key for arrays = reflect.SliceOf(ti.elem) - } - if idx := fastpathAvIndex(rtid2); idx != -1 { - fn.fe = fastpathAv[idx].encfn - fn.fd = fastpathAv[idx].decfn - fi.addrD = true - fi.addrDf = false - if rk == reflect.Array { - fi.addrD = false // decode directly into array value (slice made from it) - } - } - } else { // named type (with underlying type of map or slice or array) - // try to use mapping for underlying type - xfe, xrt := fnloadFastpathUnderlying(ti) - if xfe != nil { - xfnf := xfe.encfn - xfnf2 := xfe.decfn - if rk == reflect.Array { - fi.addrD = false // decode directly into array value (slice made from it) - fn.fd = func(d *Decoder, xf *codecFnInfo, xrv reflect.Value) { - xfnf2(d, xf, rvConvert(xrv, xrt)) - } - } else { - fi.addrD = true - fi.addrDf = false // meaning it can be an address(ptr) or a value - xptr2rt := reflect.PtrTo(xrt) - fn.fd = func(d *Decoder, xf *codecFnInfo, xrv reflect.Value) { - if xrv.Kind() == reflect.Ptr { - xfnf2(d, xf, rvConvert(xrv, xptr2rt)) - } else { - xfnf2(d, xf, rvConvert(xrv, xrt)) - } - } - } - fn.fe = func(e *Encoder, xf *codecFnInfo, xrv reflect.Value) { - xfnf(e, xf, rvConvert(xrv, xrt)) - } - } - } - } - if fn.fe == nil && fn.fd == nil { - switch rk { - case reflect.Bool: - fn.fe = (*Encoder).kBool - fn.fd = (*Decoder).kBool - case reflect.String: - // Do not use different functions based on StringToRaw option, as that will statically - // set the function for a string type, and if the Handle is modified thereafter, - // behaviour is non-deterministic - // i.e. DO NOT DO: - // if x.StringToRaw { - // fn.fe = (*Encoder).kStringToRaw - // } else { - // fn.fe = (*Encoder).kStringEnc - // } - - fn.fe = (*Encoder).kString - fn.fd = (*Decoder).kString - case reflect.Int: - fn.fd = (*Decoder).kInt - fn.fe = (*Encoder).kInt - case reflect.Int8: - fn.fe = (*Encoder).kInt8 - fn.fd = (*Decoder).kInt8 - case reflect.Int16: - fn.fe = (*Encoder).kInt16 - fn.fd = (*Decoder).kInt16 - case reflect.Int32: - fn.fe = (*Encoder).kInt32 - fn.fd = (*Decoder).kInt32 - case reflect.Int64: - fn.fe = (*Encoder).kInt64 - fn.fd = (*Decoder).kInt64 - case reflect.Uint: - fn.fd = (*Decoder).kUint - fn.fe = (*Encoder).kUint - case reflect.Uint8: - fn.fe = (*Encoder).kUint8 - fn.fd = (*Decoder).kUint8 - case reflect.Uint16: - fn.fe = (*Encoder).kUint16 - fn.fd = (*Decoder).kUint16 - case reflect.Uint32: - fn.fe = (*Encoder).kUint32 - fn.fd = (*Decoder).kUint32 - case reflect.Uint64: - fn.fe = (*Encoder).kUint64 - fn.fd = (*Decoder).kUint64 - case reflect.Uintptr: - fn.fe = (*Encoder).kUintptr - fn.fd = (*Decoder).kUintptr - case reflect.Float32: - fn.fe = (*Encoder).kFloat32 - fn.fd = (*Decoder).kFloat32 - case reflect.Float64: - fn.fe = (*Encoder).kFloat64 - fn.fd = (*Decoder).kFloat64 - case reflect.Complex64: - fn.fe = (*Encoder).kComplex64 - fn.fd = (*Decoder).kComplex64 - case reflect.Complex128: - fn.fe = (*Encoder).kComplex128 - fn.fd = (*Decoder).kComplex128 - case reflect.Chan: - fn.fe = (*Encoder).kChan - fn.fd = (*Decoder).kChan - case reflect.Slice: - fn.fe = (*Encoder).kSlice - fn.fd = (*Decoder).kSlice - case reflect.Array: - fi.addrD = false // decode directly into array value (slice made from it) - fn.fe = (*Encoder).kArray - fn.fd = (*Decoder).kArray - case reflect.Struct: - if ti.anyOmitEmpty || - ti.flagMissingFielder || - ti.flagMissingFielderPtr { - fn.fe = (*Encoder).kStruct - } else { - 
fn.fe = (*Encoder).kStructNoOmitempty - } - fn.fd = (*Decoder).kStruct - case reflect.Map: - fn.fe = (*Encoder).kMap - fn.fd = (*Decoder).kMap - case reflect.Interface: - // encode: reflect.Interface are handled already by preEncodeValue - fn.fd = (*Decoder).kInterface - fn.fe = (*Encoder).kErr - default: - // reflect.Ptr and reflect.Interface are handled already by preEncodeValue - fn.fe = (*Encoder).kErr - fn.fd = (*Decoder).kErr - } - } - } - return +func (x *BasicHandle) getTypeInfo4RT(rt reflect.Type) (pti *typeInfo) { + return x.typeInfos().get(rt2id(rt), rt) } // Handle defines a specific encoding format. It also stores any runtime state @@ -1313,14 +1119,18 @@ func (x *basicHandleRuntimeState) fnLoad(rt reflect.Type, rtid uintptr, tinfos * type Handle interface { Name() string getBasicHandle() *BasicHandle - newEncDriver() encDriver - newDecDriver() decDriver isBinary() bool isJson() bool // json is special for now, so track it // desc describes the current byte descriptor, or returns "unknown[XXX]" if not understood. desc(bd byte) string // init initializes the handle based on handle-specific info (beyond what is in BasicHandle) - init() + // init() + // clone() Handle + newEncoderBytes(out *[]byte) encoderI + newEncoder(w io.Writer) encoderI + + newDecoderBytes(in []byte) decoderI + newDecoder(r io.Reader) decoderI } // Raw represents raw formatted bytes. @@ -1330,11 +1140,13 @@ type Handle interface { type Raw []byte // RawExt represents raw unprocessed extension data. +// // Some codecs will decode extension data as a *RawExt // if there is no registered extension for the tag. // -// Only one of Data or Value is nil. -// If Data is nil, then the content of the RawExt is in the Value. +// On encode, encode the Data. If nil, then try to encode the Value. +// +// On decode: store tag, then store bytes and/or decode into Value. type RawExt struct { Tag uint64 // Data is the []byte which represents the raw ext. If nil, ext is exposed in Value. @@ -1454,12 +1266,10 @@ type extFailWrapper struct { type binaryEncodingType struct{} func (binaryEncodingType) isBinary() bool { return true } -func (binaryEncodingType) isJson() bool { return false } type textEncodingType struct{} func (textEncodingType) isBinary() bool { return false } -func (textEncodingType) isJson() bool { return false } type notJsonType struct{} @@ -1487,11 +1297,8 @@ func (noBuiltInTypes) DecodeBuiltin(rt uintptr, v interface{}) {} // retrofitted from stdlib: encoding/binary/BigEndian (ByteOrder) type bigenHelper struct{} -func (z bigenHelper) PutUint16(v uint16) (b [2]byte) { - return [...]byte{ - byte(v >> 8), - byte(v), - } +func (z bigenHelper) PutUint16(v uint16) (b1, b2 byte) { + return byte(v >> 8), byte(v) } func (z bigenHelper) PutUint32(v uint32) (b [4]byte) { @@ -1539,23 +1346,6 @@ func (z bigenHelper) Uint64(b [8]byte) (v uint64) { uint64(b[0])<<56 } -func (z bigenHelper) writeUint16(w *encWr, v uint16) { - x := z.PutUint16(v) - w.writen2(x[0], x[1]) -} - -func (z bigenHelper) writeUint32(w *encWr, v uint32) { - // w.writeb((z.PutUint32(v))[:]) - // x := z.PutUint32(v) - // w.writeb(x[:]) - // w.writen4(x[0], x[1], x[2], x[3]) - w.writen4(z.PutUint32(v)) -} - -func (z bigenHelper) writeUint64(w *encWr, v uint64) { - w.writen8(z.PutUint64(v)) -} - type extTypeTagFn struct { rtid uintptr rtidptr uintptr @@ -1584,14 +1374,13 @@ func (x *BasicHandle) AddExt(rt reflect.Type, tag byte, // An error is returned if that is not honored. // To Deregister an ext, call SetExt with nil Ext. 
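SetBytesExt/SetInterfaceExt are the non-deprecated registration paths referenced just below. A sketch of registering an InterfaceExt on a cbor handle (Point, pointExt, and tag 42 are illustrative; the concrete types seen by ConvertExt/UpdateExt depend on the format and call site, hence the defensive conversions):

package main

import (
    "reflect"

    "github.com/ugorji/go/codec"
)

type Point struct{ X, Y int }

// pointExt converts Point to/from a 2-element slice carried under a custom tag.
type pointExt struct{}

func (pointExt) ConvertExt(v interface{}) interface{} {
    switch p := v.(type) {
    case Point:
        return []int{p.X, p.Y}
    case *Point:
        return []int{p.X, p.Y}
    }
    return nil
}

func (pointExt) UpdateExt(dst interface{}, src interface{}) {
    s := src.([]interface{})
    *dst.(*Point) = Point{X: toInt(s[0]), Y: toInt(s[1])}
}

// toInt hedges over the integer type the decoder produced.
func toInt(v interface{}) int {
    switch n := v.(type) {
    case int64:
        return int(n)
    case uint64:
        return int(n)
    }
    return 0
}

func main() {
    h := new(codec.CborHandle)
    if err := h.SetInterfaceExt(reflect.TypeOf(Point{}), 42, pointExt{}); err != nil {
        panic(err)
    }
    var buf []byte
    _ = codec.NewEncoderBytes(&buf, h).Encode(Point{1, 2})
    var p Point
    _ = codec.NewDecoderBytes(buf, h).Decode(&p) // p == Point{1, 2}
}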
// -// Deprecated: Use SetBytesExt or SetInterfaceExt on the Handle instead. +// It returns an error if called after the Handle has been initialized. +// +// Deprecated: Use SetBytesExt or SetInterfaceExt on the Handle instead (which *may* internally call this) func (x *BasicHandle) SetExt(rt reflect.Type, tag uint64, ext Ext) (err error) { if x.isInited() { return errHandleInited } - if x.basicHandleRuntimeState == nil { - x.basicHandleRuntimeState = new(basicHandleRuntimeState) - } return x.basicHandleRuntimeState.setExt(rt, tag, ext) } @@ -1672,6 +1461,16 @@ func (o intf2impls) intf2impl(rtid uintptr) (rv reflect.Value) { return } +type structFieldInfoNode struct { + offset uint16 + index uint16 + kind uint8 + numderef uint8 + _ uint16 // padding + + typ reflect.Type +} + // structFieldInfoPathNode is a node in a tree, which allows us to easily // walk the anonymous path. // // In the typical case, a node is terminal, i.e. its parent // will be nil and this information becomes a value (not needing any indirection). type structFieldInfoPathNode struct { parent *structFieldInfoPathNode - - offset uint16 - index uint16 - kind uint8 - numderef uint8 - - // encNameAsciiAlphaNum and omitEmpty should be in structFieldInfo, - // but are kept here for tighter packaging. - - encNameAsciiAlphaNum bool // the encName only contains ascii alphabet and numbers - omitEmpty bool - - typ reflect.Type + structFieldInfoNode } // depth returns number of valid nodes in the hierarchy @@ -1705,46 +1492,105 @@ TOP: return } -// field returns the field of the struct. -func (path *structFieldInfoPathNode) field(v reflect.Value) (rv2 reflect.Value) { - if parent := path.parent; parent != nil { - v = parent.field(v) - for j, k := uint8(0), parent.numderef; j < k; j++ { - if rvIsNil(v) { - return +// MARKER: fully working code - commented out as we inline the code in sfi.field(No)Alloc +// // field returns the field of the struct. +// func (n *structFieldInfoPathNode) field(v reflect.Value, alloc, base bool) (rv reflect.Value) { +// if n.parent != nil { +// v = n.parent.field(v, alloc, true) +// if !v.IsValid() { +// return +// } +// } +// return n.structFieldInfoNode.field(v, alloc, base) +// } + +type structFieldInfo struct { + encName string // encode name + + // encNameHash uintptr + + // fieldName string // currently unused + + encNameEscape4Json bool + omitEmpty bool + + ptrKind bool + + encBuiltin bool // is field supported for encoding as a builtin? + decBuiltin bool // is field addr supported for decoding as a builtin?
+ + node structFieldInfoNode + parents []structFieldInfoNode + + // path structFieldInfoPathNode + + baseTyp reflect.Type + ptrTyp reflect.Type +} + +// MARKER: fully working code - commented out as we inline the code in sfi.field(No)Alloc +// func (n *structFieldInfo) field(v reflect.Value, alloc, base bool) (rv reflect.Value) { +// for i := range n.parents { +// v = n.parents[i].field(v, alloc, true) +// if !v.IsValid() { +// return +// } +// } +// return n.node.field(v, alloc, base) +// } + +func (n *structFieldInfo) fieldAlloc(v reflect.Value) reflect.Value { + // return n.path.field(v, true, true) + // return n.field(v, true, true) + var j, nd uint8 + for i := range n.parents { + v = n.parents[i].rvField(v) + nd = n.parents[i].numderef + for j = 0; j < nd; j++ { + if rvPtrIsNil(v) { + rvSetDirect(v, reflect.New(v.Type().Elem())) } v = v.Elem() } } - return path.rvField(v) + v = n.node.rvField(v) + nd = n.node.numderef + for j = 0; j < nd; j++ { + if rvPtrIsNil(v) { + rvSetDirect(v, reflect.New(v.Type().Elem())) + } + v = v.Elem() + } + return v } -// fieldAlloc returns the field of the struct. -// It allocates if a nil value was seen while searching. -func (path *structFieldInfoPathNode) fieldAlloc(v reflect.Value) (rv2 reflect.Value) { - if parent := path.parent; parent != nil { - v = parent.fieldAlloc(v) - for j, k := uint8(0), parent.numderef; j < k; j++ { - if rvIsNil(v) { - rvSetDirect(v, reflect.New(v.Type().Elem())) +func (n *structFieldInfo) fieldNoAlloc(v reflect.Value, base bool) (rv reflect.Value) { + // return n.path.field(v, false, base) + // return n.field(v, false, base) + var j, nd uint8 + for i := range n.parents { + v = n.parents[i].rvField(v) + nd = n.parents[i].numderef + for j = 0; j < nd; j++ { + if rvPtrIsNil(v) { + return reflect.Value{} } v = v.Elem() } } - return path.rvField(v) -} - -type structFieldInfo struct { - encName string // encode name - - // encNameHash uintptr - - // fieldName string // currently unused - - // encNameAsciiAlphaNum and omitEmpty should be here, - // but are stored in structFieldInfoPathNode for tighter packaging. - - path structFieldInfoPathNode + v = n.node.rvField(v) + rv = v + nd = n.node.numderef + for j = 0; j < nd; j++ { + if rvPtrIsNil(v) { + return reflect.Value{} + } + v = v.Elem() + } + if base { + rv = v + } + return } func parseStructInfo(stag string) (toArray, omitEmpty bool, keytype valueType) { @@ -1777,22 +1623,168 @@ func parseStructInfo(stag string) (toArray, omitEmpty bool, keytype valueType) { return } -func (si *structFieldInfo) parseTag(stag string) { +func parseStructFieldTag(stag string) (encName string, omitEmpty bool) { if stag == "" { return } for i, s := range strings.Split(stag, ",") { if i == 0 { if s != "" { - si.encName = s + encName = s } - } else { - switch s { - case "omitempty": - si.path.omitEmpty = true + continue + } + if s == "omitempty" { + omitEmpty = true + } + } + return +} + +// ---- + +type uint8To32TrieNode struct { + uint8To32TrieNodeNoKids + kids uint8To32TrieNodeKids +} + +func (x *uint8To32TrieNode) reset(v uint8) { + x.key = v + x.value = 0 + x.valid = false + x.truncKids() +} + +func (x *uint8To32TrieNode) expandKids() (r *uint8To32TrieNode) { + // since we want to reuse the slices, let's not use append as it will + // always overwrite the value. 
Only append if we're expanding + kids := x.getKids() + if cap(kids) > len(kids) { + kids = kids[:len(kids)+1] + } else { + kids = append(kids, uint8To32TrieNode{}) + } + x.setKids(kids) + r = &kids[len(kids)-1] + return +} + +func (x *uint8To32TrieNode) put(v uint8) (r *uint8To32TrieNode) { + kids := x.getKids() + for i := range kids { + if kids[i].key == v { + return &kids[i] + } + } + + r = x.expandKids() + r.reset(v) + return r +} + +func (x *uint8To32TrieNode) puts(s string, v uint32) (r *uint8To32TrieNode) { + for _, c := range []byte(s) { + x = x.put(c) + } + x.value = v + x.valid = true + return x +} + +func (x *uint8To32TrieNode) gets(s []byte) (v uint32, ok bool) { +TOP: + for _, b := range s { + kids := x.getKids() + for i := range kids { + if kids[i].key == b { + x = &kids[i] + continue TOP } } + return 0, false + } + return x.value, x.valid +} + +func (x *uint8To32TrieNode) deepNumKids() (n int) { + kids := x.getKids() + n = len(kids) + for i := range kids { + n += kids[i].deepNumKids() } + return +} + +// arena just helps all the nodes stay close for better cache-line performance. +// It basically tries to load up all the nodes within a contiguous space of memory. +type uint8To32TrieNodeArena struct { + arena []uint8To32TrieNode + cursor int +} + +func (x *uint8To32TrieNodeArena) init(v *uint8To32TrieNode) (r *uint8To32TrieNode) { + x.arena = make([]uint8To32TrieNode, v.deepNumKids()+1) // incl one for the node, and one buffer + r = &x.arena[0] + x.cursor++ + x.clone(r, v) + return +} + +func (x *uint8To32TrieNodeArena) clone(dst, src *uint8To32TrieNode) { + dst.uint8To32TrieNodeNoKids = src.uint8To32TrieNodeNoKids + // dst.kids = nil + srckids := src.getKids() + c := len(srckids) + if c == 0 { + return + } + dstkids := x.arena[x.cursor:][:c:c] + dst.setKids(dstkids) + x.cursor += c + for i := range srckids { + x.clone(&dstkids[i], &srckids[i]) + } +} + +// ---- + +var pool4SFIs = sync.Pool{ + New: func() interface{} { + return &uint8To32TrieNode{} + }, +} + +func (x *structFieldInfos) finish() { + var src *uint8To32TrieNode + if usePoolForSFIs { + src = pool4SFIs.Get().(*uint8To32TrieNode) + } else { + src = &x.t + } + x.loadSearchTrie(src) + if useArenaForSFIs { + var ar uint8To32TrieNodeArena + x.t = *(ar.init(src)) + } + if usePoolForSFIs { + src.reset(0) + pool4SFIs.Put(src) + } +} + +func (x *structFieldInfos) loadSearchTrie(src *uint8To32TrieNode) { + // load the search trie + for i, v := range x.source() { + src.puts(v.encName, uint32(i)) + } +} + +func (x *structFieldInfos) search(name []byte) (sfi *structFieldInfo) { + n, ok := x.t.gets(name) + if ok { + sfi = x.source()[n] + } + return } type sfiSortedByEncName []*structFieldInfo @@ -1821,7 +1813,7 @@ type typeInfo4Container struct { tielem *typeInfo } -// typeInfo keeps static (non-changing readonly)information +// typeInfo keeps static (non-changing readonly) information // about each (non-ptr) type referenced in the encode/decode sequence. // // During an encode/decode sequence, we work as below: @@ -1842,12 +1834,15 @@ type typeInfo struct { kind uint8 chandir uint8 - anyOmitEmpty bool // true if a struct, and any of the fields are tagged "omitempty" - toArray bool // whether this (struct) type should be encoded as an array - keyType valueType // if struct, how is the field name stored in a stream? default is string - mbs bool // base type (T or *T) is a MapBySlice - - sfi4Name map[string]*structFieldInfo // map. 
used for finding sfi given a name + // simple=true if a struct, AND + // - none of the fields are tagged "omitempty" + // - no missingFielder + // - keyType is always string + // - noEsc4Json on any fields + simple bool + toArray bool // whether this (struct) type should be encoded as an array + keyType valueType // if struct, how is the field name stored in a stream? default is string + mbs bool // base type (T or *T) is a MapBySlice *typeInfo4Container @@ -1861,8 +1856,7 @@ type typeInfo struct { flagComparable bool flagCanTransient bool - flagMarshalInterface bool // does this have custom (un)marshal implementation? - flagSelferViaCodecgen bool + flagMarshalInterface bool // does this have custom (un)marshal implementation? // custom implementation flags flagIsZeroer bool @@ -1895,132 +1889,127 @@ type typeInfo struct { flagMissingFielder bool flagMissingFielderPtr bool + flagEncBuiltin bool + flagDecBuiltin bool + infoFieldOmitempty bool + // MARKER - may need padding here (like 6 bytes - auto-handled) sfi structFieldInfos } func (ti *typeInfo) siForEncName(name []byte) (si *structFieldInfo) { - return ti.sfi4Name[string(name)] + return ti.sfi.search(name) } func (ti *typeInfo) resolve(x []structFieldInfo, ss map[string]uint16) (n int) { n = len(x) - for i := range x { ui := uint16(i) - xn := x[i].encName + sf := &x[ui] + xn := sf.encName j, ok := ss[xn] - if ok { - i2clear := ui // index to be cleared - if x[i].path.depth() < x[j].path.depth() { // this one is shallower - ss[xn] = ui - i2clear = j - } - if x[i2clear].encName != "" { - x[i2clear].encName = "" - n-- - } - } else { + if !ok { + ss[xn] = ui + continue + } + if ui == j { + continue + } + // if x[i].path.depth() < x[j].path.depth() { // this one is shallower + sf2 := &x[j] + if len(sf.parents) < len(sf2.parents) { // this one is shallower ss[xn] = ui + sf = sf2 + } + if sf.encName == "" { + continue } + sf.encName = "" + n-- } - return } func (ti *typeInfo) init(x []structFieldInfo, n int) { - var anyOmitEmpty bool + simple := true + + if ti.flagMissingFielder || ti.flagMissingFielderPtr || + ti.keyType != valueTypeString { + simple = false + } // remove all the nils (non-ready) - m := make(map[string]*structFieldInfo, n) + // m := make(map[string]*structFieldInfo, n) w := make([]structFieldInfo, n) y := make([]*structFieldInfo, n+n) z := y[n:] y = y[:n] n = 0 for i := range x { - if x[i].encName == "" { + sfi := &x[i] + if sfi.encName == "" { continue } - if !anyOmitEmpty && x[i].path.omitEmpty { - anyOmitEmpty = true + if simple && (sfi.omitEmpty || sfi.encNameEscape4Json) { + simple = false } - w[n] = x[i] - y[n] = &w[n] - m[x[i].encName] = &w[n] + w[n] = *sfi + sfi = &w[n] + y[n] = sfi + // m[sfi.encName] = sfi n++ } if n != len(y) { - halt.errorf("failure reading struct %v - expecting %d of %d valid fields, got %d", ti.rt, len(y), len(x), n) + halt.errorf("failure reading struct %v - expecting %d of %d valid fields, got %d", ti.rt, len(y), len(x), any(n)) } + ti.simple = simple + copy(z, y) sort.Sort(sfiSortedByEncName(z)) - ti.anyOmitEmpty = anyOmitEmpty ti.sfi.load(y, z) - ti.sfi4Name = m + ti.sfi.finish() + // ti.sfi.byName = m } -// Handling flagCanTransient +// isCanTransient returns whether this type can be transient. +// +// # Handling flagCanTransient // // We support transient optimization if the kind of the type is -// a number, bool, string, or slice (of number/bool). -// In addition, we also support if the kind is struct or array, -// and the type does not contain any pointers recursively). 
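Returning to the uint8To32TrieNode machinery earlier in this hunk: structFieldInfos.search walks one byte of the encoded field name at a time, so a decode needs no map[string] lookup or string allocation. A simplified, self-contained version of the same puts/gets scheme (the real type splits node data from the kids slice, pools nodes, and can arena-allocate them for cache locality):

package main

import "fmt"

// trieNode sketch: each node maps one byte of the name to a child;
// terminal nodes carry the field's index in the sfi source slice.
type trieNode struct {
    key   uint8
    value uint32
    valid bool
    kids  []trieNode
}

func (x *trieNode) put(c uint8) *trieNode {
    for i := range x.kids {
        if x.kids[i].key == c {
            return &x.kids[i]
        }
    }
    x.kids = append(x.kids, trieNode{key: c})
    return &x.kids[len(x.kids)-1]
}

func (x *trieNode) puts(s string, v uint32) {
    for _, c := range []byte(s) {
        x = x.put(c)
    }
    x.value, x.valid = v, true
}

func (x *trieNode) gets(s []byte) (uint32, bool) {
TOP:
    for _, b := range s {
        for i := range x.kids {
            if x.kids[i].key == b {
                x = &x.kids[i]
                continue TOP
            }
        }
        return 0, false
    }
    return x.value, x.valid
}

func main() {
    var root trieNode
    for i, name := range []string{"id", "name", "nameSpace"} {
        root.puts(name, uint32(i))
    }
    fmt.Println(root.gets([]byte("nameSpace"))) // 2 true
}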
+// - a number, bool, string +// - slice (of number/bool) +// - struct with no reference values (pointers, interface, etc) recursively +// - array with no reference values (pointers, interface, etc) recursively // -// Noteworthy that all reference types (string, slice, func, map, ptr, interface, etc) have pointers. +// NOTE: all reference types (string, slice, func, map, ptr, interface, etc) have pointers. // // If using transient for a type with a pointer, there is the potential for data corruption // when GC tries to follow a "transient" pointer which may become a non-pointer soon after. -// - -func transientBitsetFlags() *bitset32 { - if transientValueHasStringSlice { - return &numBoolStrSliceBitset +func isCanTransient(t reflect.Type, inclStrSlice bool) (v bool) { + k := t.Kind() + bset := &numBoolBitset + if inclStrSlice { + bset = &numBoolStrSliceBitset } - return &numBoolBitset -} - -func isCanTransient(t reflect.Type, k reflect.Kind) (v bool) { - var bs = transientBitsetFlags() - if bs.isset(byte(k)) { + if bset.isset(byte(k)) { v = true - } else if k == reflect.Slice { - elem := t.Elem() - v = numBoolBitset.isset(byte(elem.Kind())) } else if k == reflect.Array { - elem := t.Elem() - v = isCanTransient(elem, elem.Kind()) + v = isCanTransient(t.Elem(), false) } else if k == reflect.Struct { v = true for j, jlen := 0, t.NumField(); j < jlen; j++ { f := t.Field(j) - if !isCanTransient(f.Type, f.Type.Kind()) { - v = false - return + if !isCanTransient(f.Type, false) { + return false } } - } else { - v = false } return } -func (ti *typeInfo) doSetFlagCanTransient() { - if transientSizeMax > 0 { - ti.flagCanTransient = ti.size <= transientSizeMax - } else { - ti.flagCanTransient = true - } - if ti.flagCanTransient { - if !transientBitsetFlags().isset(ti.kind) { - ti.flagCanTransient = isCanTransient(ti.rt, reflect.Kind(ti.kind)) - } - } -} - type rtid2ti struct { rtid uintptr ti *typeInfo @@ -2031,7 +2020,7 @@ type rtid2ti struct { // It is configured with a set of tag keys, which are used to get // configuration for the type. type TypeInfos struct { - infos atomicTypeInfoSlice + infos atomic.Pointer[[]rtid2ti] // atomicTypeInfoSlice mu sync.Mutex _ uint64 // padding (cache-aligned) tags []string @@ -2088,9 +2077,9 @@ func (x *TypeInfos) get(rtid uintptr, rt reflect.Type) (pti *typeInfo) { } func (x *TypeInfos) find(rtid uintptr) (pti *typeInfo) { - sp := x.infos.load() + sp := x.infos.Load() if sp != nil { - _, pti = findTypeInfo(sp, rtid) + _, pti = findTypeInfo(*sp, rtid) } return } @@ -2099,7 +2088,7 @@ func (x *TypeInfos) load(rt reflect.Type) (pti *typeInfo) { rk := rt.Kind() if rk == reflect.Ptr { // || (rk == reflect.Interface && rtid != intfTypId) { - halt.errorf("invalid kind passed to TypeInfos.get: %v - %v", rk, rt) + halt.errorf("invalid kind passed to TypeInfos.get: %v - %v", rk.String(), rt) } rtid := rt2id(rt) @@ -2108,7 +2097,7 @@ func (x *TypeInfos) load(rt reflect.Type) (pti *typeInfo) { // it may lead to duplication, but that's ok. 
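A toy mirror of the isCanTransient rules just shown, handy for checking which types qualify (this assumes the documented rules: scalars everywhere, string and slices-of-scalars only at the top level; the library additionally gates on size via isTransientType4Size):

package main

import (
    "fmt"
    "reflect"
)

// canTransient mirrors the rules above: numbers/bool always qualify; string
// and []scalar only when inclStrSlice (i.e. at the top level); structs and
// arrays qualify if no field/element introduces a pointer-carrying kind.
func canTransient(t reflect.Type, inclStrSlice bool) bool {
    switch k := t.Kind(); {
    case k >= reflect.Bool && k <= reflect.Float64: // bool, ints, uints, floats
        return true
    case k == reflect.String && inclStrSlice:
        return true
    case k == reflect.Slice && inclStrSlice:
        ek := t.Elem().Kind()
        return ek >= reflect.Bool && ek <= reflect.Float64
    case k == reflect.Array:
        return canTransient(t.Elem(), false)
    case k == reflect.Struct:
        for i := 0; i < t.NumField(); i++ {
            if !canTransient(t.Field(i).Type, false) {
                return false
            }
        }
        return true
    }
    return false
}

func main() {
    fmt.Println(canTransient(reflect.TypeOf([4]int{}), true))             // true
    fmt.Println(canTransient(reflect.TypeOf(struct{ S string }{}), true)) // false: string carries a pointer
}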
ti := typeInfo{ rt: rt, - ptr: reflect.PtrTo(rt), + ptr: reflect.PointerTo(rt), rtid: rtid, kind: uint8(rk), size: uint32(rt.Size()), @@ -2119,6 +2108,12 @@ func (x *TypeInfos) load(rt reflect.Type) (pti *typeInfo) { flagHasPkgPath: rt.PkgPath() != "", } + _, ti.flagEncBuiltin = searchRtids(encBuiltinRtids, rtid) + _, ti.flagDecBuiltin = searchRtids(decBuiltinRtids, rtid) + if !ti.flagDecBuiltin { + _, ti.flagDecBuiltin = searchRtids(decBuiltinRtids, rt2id(ti.ptr)) + } + // bset sets custom implementation flags bset := func(when bool, b *bool) { if when { @@ -2159,11 +2154,7 @@ func (x *TypeInfos) load(rt reflect.Type) (pti *typeInfo) { bset(b1, &ti.flagIsCodecEmptyer) bset(b2, &ti.flagIsCodecEmptyerPtr) - b1, b2 = implIntf(rt, isSelferViaCodecgenerTyp) - ti.flagSelferViaCodecgen = b1 || b2 - ti.flagMarshalInterface = ti.flagSelfer || ti.flagSelferPtr || - ti.flagSelferViaCodecgen || ti.flagBinaryMarshaler || ti.flagBinaryMarshalerPtr || ti.flagBinaryUnmarshaler || ti.flagBinaryUnmarshalerPtr || ti.flagTextMarshaler || ti.flagTextMarshalerPtr || @@ -2175,7 +2166,7 @@ func (x *TypeInfos) load(rt reflect.Type) (pti *typeInfo) { // bset(b1, &ti.flagComparable) ti.flagComparable = b1 - ti.doSetFlagCanTransient() + ti.flagCanTransient = isTransientType4Size(ti.size) && isCanTransient(ti.rt, true) var tt reflect.Type switch rk { @@ -2187,14 +2178,22 @@ func (x *TypeInfos) load(rt reflect.Type) (pti *typeInfo) { } else { ti.keyType = valueTypeString } - pp, pi := &pool4tiload, pool4tiload.Get() - pv := pi.(*typeInfoLoad) - pv.reset() + var pi interface{} + var pv *typeInfoLoad + if usePoolForTypeInfoLoad { + pi = poolForTypeInfoLoad.Get() + pv = pi.(*typeInfoLoad) + pv.reset() + } else { + pv = newTypeInfoLoad() + } pv.etypes = append(pv.etypes, ti.rtid) - x.rget(rt, rtid, nil, pv, omitEmpty) + x.rget(rt, nil, pv, omitEmpty) n := ti.resolve(pv.sfis, pv.sfiNames) ti.init(pv.sfis, n) - pp.Put(pi) + if usePoolForTypeInfoLoad { + poolForTypeInfoLoad.Put(pi) + } case reflect.Map: ti.typeInfo4Container = new(typeInfo4Container) ti.elem = rt.Elem() @@ -2265,13 +2264,16 @@ func (x *TypeInfos) load(rt reflect.Type) (pti *typeInfo) { } x.mu.Lock() - sp := x.infos.load() + var sp []rtid2ti + if spt := x.infos.Load(); spt != nil { + sp = *spt + } // since this is an atomic load/store, we MUST use a different array each time, // else we have a data race when a store is happening simultaneously with a findRtidFn call. if sp == nil { pti = &ti sp = []rtid2ti{{rtid, pti}} - x.infos.store(sp) + x.infos.Store(&sp) } else { var idx uint idx, pti = findTypeInfo(sp, rtid) @@ -2281,15 +2283,14 @@ func (x *TypeInfos) load(rt reflect.Type) (pti *typeInfo) { copy(sp2[idx+1:], sp[idx:]) copy(sp2, sp[:idx]) sp2[idx] = rtid2ti{rtid, pti} - x.infos.store(sp2) + x.infos.Store(&sp2) } } x.mu.Unlock() return } -func (x *TypeInfos) rget(rt reflect.Type, rtid uintptr, - path *structFieldInfoPathNode, pv *typeInfoLoad, omitEmpty bool) { +func (x *TypeInfos) rget(rt reflect.Type, path *structFieldInfoPathNode, pv *typeInfoLoad, defaultOmitEmpty bool) { // Read up fields and store how to access the value. 
// // It uses go's rules for message selectors, @@ -2318,41 +2319,40 @@ LOOP: if stag == "-" { continue } - var si structFieldInfo var numderef uint8 = 0 - for xft := f.Type; xft.Kind() == reflect.Ptr; xft = xft.Elem() { + ft := f.Type + for ; ft.Kind() == reflect.Ptr; ft = ft.Elem() { numderef++ } - var parsed bool + var encName string + var parsed, omitEmpty bool + + ftid := rt2id(ft) // if anonymous and no struct tag (or it's blank), // and a struct (or pointer to struct), inline it. if f.Anonymous && fkind != reflect.Interface { // ^^ redundant but ok: per go spec, an embedded pointer type cannot be to an interface - ft := f.Type - isPtr := ft.Kind() == reflect.Ptr - for ft.Kind() == reflect.Ptr { - ft = ft.Elem() - } + isPtr := f.Type.Kind() == reflect.Ptr isStruct := ft.Kind() == reflect.Struct // Ignore embedded fields of unexported non-struct types. // Also, from go1.10, ignore pointers to unexported struct types // because unmarshal cannot assign a new struct to an unexported field. // See https://golang.org/issue/21357 - if (isUnexported && !isStruct) || (!allowSetUnexportedEmbeddedPtr && isUnexported && isPtr) { + if isUnexported && (!isStruct || isPtr) { continue } doInline := stag == "" if !doInline { - si.parseTag(stag) + encName, omitEmpty = parseStructFieldTag(stag) parsed = true - doInline = si.encName == "" // si.isZero() + doInline = encName == "" // si.isZero() } if doInline && isStruct { // if etypes contains this, don't call rget again (as fields are already seen here) - ftid := rt2id(ft) + // // We cannot recurse forever, but we need to track other field depths. // So - we break if we see a type twice (not the first time). // This should be sufficient to handle an embedded type that refers to its @@ -2371,54 +2371,83 @@ LOOP: if processIt { pv.etypes = append(pv.etypes, ftid) path2 := &structFieldInfoPathNode{ - parent: path, - typ: f.Type, - offset: uint16(f.Offset), - index: j, - kind: uint8(fkind), - numderef: numderef, + parent: path, + structFieldInfoNode: structFieldInfoNode{ + typ: f.Type, + offset: uint16(f.Offset), + index: j, + kind: uint8(fkind), + numderef: numderef, + }, } - x.rget(ft, ftid, path2, pv, omitEmpty) + x.rget(ft, path2, pv, defaultOmitEmpty) } continue } } // after the anonymous dance: if an unexported field, skip - if isUnexported || f.Name == "" { // f.Name cannot be "", but defensively handle it + if isUnexported || f.Name == "" || f.Name == structInfoFieldName { // f.Name cannot be "", but defensively handle it continue } - si.path = structFieldInfoPathNode{ - parent: path, + if !parsed { + encName, omitEmpty = parseStructFieldTag(stag) + parsed = true + } + if encName == "" { + encName = f.Name + } + if defaultOmitEmpty { + omitEmpty = true + } + + var si = structFieldInfo{ + encName: encName, + omitEmpty: omitEmpty, + ptrKind: fkind == reflect.Ptr, + baseTyp: ft, + ptrTyp: reflect.PointerTo(ft), + } + + // si.path = structFieldInfoPathNode{ + // parent: path, + // structFieldInfoNode: structFieldInfoNode{ + // typ: f.Type, + // offset: uint16(f.Offset), + // index: j, + // kind: uint8(fkind), + // numderef: numderef, + // }, + // } + + si.node = structFieldInfoNode{ typ: f.Type, offset: uint16(f.Offset), index: j, kind: uint8(fkind), numderef: numderef, - // set asciiAlphaNum to true (default); checked and may be set to false below - encNameAsciiAlphaNum: true, - // note: omitEmpty might have been set in an earlier parseTag call, etc - so carry it forward - omitEmpty: si.path.omitEmpty, } - if !parsed { - si.encName = f.Name - 
si.parseTag(stag) - parsed = true - } else if si.encName == "" { - si.encName = f.Name + if path != nil { + si.parents = make([]structFieldInfoNode, path.depth()) + for k, p := len(si.parents)-1, path; k >= 0; k-- { + si.parents[k] = p.structFieldInfoNode + p = p.parent + } } - // si.encNameHash = maxUintptr() // hashShortString(bytesView(si.encName)) - - if omitEmpty { - si.path.omitEmpty = true + // ftid = rt2id(ft) where ft = si.baseTyp) + _, si.encBuiltin = searchRtids(encBuiltinRtids, ftid) + _, si.decBuiltin = searchRtids(decBuiltinRtids, ftid) + if !si.decBuiltin { + _, si.decBuiltin = searchRtids(decBuiltinRtids, rt2id(si.ptrTyp)) } + // si.encNameHash = maxUintptr() // hashShortString(bytesView(si.encName)) for i := len(si.encName) - 1; i >= 0; i-- { // bounds-check elimination - if !asciiAlphaNumBitset.isset(si.encName[i]) { - si.path.encNameAsciiAlphaNum = false + if !jsonCharSafeBitset.isset(si.encName[i]) { + si.encNameEscape4Json = true break } } @@ -2427,6 +2456,29 @@ LOOP: } } +type timeRv struct { + v time.Time + r reflect.Value +} + +type bytesRv struct { + v []byte + r reflect.Value +} + +type stringIntf struct { + v string + i interface{} +} + +func cmpTimeRv(v1, v2 timeRv) int { + return v1.v.Compare(v2.v) +} + +func cmpBytesRv(v1, v2 bytesRv) int { + return bytes.Compare(v1.v, v2.v) +} + func implIntf(rt, iTyp reflect.Type) (base bool, indir bool) { // return rt.Implements(iTyp), reflect.PtrTo(rt).Implements(iTyp) @@ -2441,11 +2493,15 @@ func implIntf(rt, iTyp reflect.Type) (base bool, indir bool) { if base { indir = true } else { - indir = reflect.PtrTo(rt).Implements(iTyp) + indir = reflect.PointerTo(rt).Implements(iTyp) } return } +func bytesOK(bs []byte, _ bool) []byte { + return bs +} + func bool2int(b bool) (v uint8) { // MARKER: optimized to be a single instruction if b { @@ -2455,25 +2511,97 @@ func bool2int(b bool) (v uint8) { } func isSliceBoundsError(s string) bool { - return strings.Contains(s, "index out of range") || - strings.Contains(s, "slice bounds out of range") + return strings.Contains(s, "index out of range") || // indexing error + strings.Contains(s, "slice bounds out of range") || // slicing error + strings.Contains(s, "cannot convert slice with length") // slice-->array error } func sprintf(format string, v ...interface{}) string { return fmt.Sprintf(format, v...) } -func panicValToErr(h errDecorator, v interface{}, err *error) { - if v == *err { +func snip(v []byte) []byte { + return v[:min(96, len(v))] +} + +// These constants are used within debugf. +// If the first parameter to debugf is one of these, it determines +// the ANSI color used within the ANSI terminal. +// +// They make it easier to write different groups of debug statements +// with a visual aid. +const ( + hlSFX = "\033[0m" + hlPFX = "\033[1;" + hlBLACK = hlPFX + "30" + "m" + hlRED = hlPFX + "31" + "m" + hlGREEN = hlPFX + "32" + "m" + hlYELLOW = hlPFX + "33" + "m" + hlBLUE = hlPFX + "34" + "m" + hlPURPLE = hlPFX + "35" + "m" + hlCYAN = hlPFX + "36" + "m" + hlWHITE = hlPFX + "37" + "m" + // hlORANGE = hlYELLOW +) + +// debugf will print debug statements to the screen whether or not debugging is on +// +// Note: if first parameter in a is one of the hlXXX vars, then we treat it as a hint +// to highlight in different colors. 
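Taken together, parseStructFieldTag and the shallowest-field-wins resolution in rget/resolve give the embedding semantics sketched below (Base and User are illustrative; the default struct tag keys include codec and json):

package main

import (
    "fmt"

    "github.com/ugorji/go/codec"
)

type Base struct {
    ID   int `codec:"id"`
    Skip int `codec:"-"` // ignored, per the "-" rule in rget
}

type User struct {
    Base          // embedded: fields promoted per Go selector rules
    ID     string `codec:"id"`             // shallower field shadows Base.ID
    Name   string `codec:"name,omitempty"` // omitted when empty
}

func main() {
    var buf []byte
    _ = codec.NewEncoderBytes(&buf, &codec.JsonHandle{}).Encode(User{ID: "u1"})
    fmt.Println(string(buf)) // {"id":"u1"}: Base.ID lost the name conflict, Name was omitted
}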
+// +//go:noinline +func debugf(s string, a ...any) { + if !debugLogging { + return + } + if len(s) == 0 { + return + } + if s[len(s)-1] != '\n' { + s = s + "\n" + } + if len(a) > 0 { + switch a[0] { + case hlBLACK, hlRED, hlGREEN, hlYELLOW, hlBLUE, hlPURPLE, hlCYAN, hlWHITE: + s = a[0].(string) + s + hlSFX + a = a[1:] + } + } + fmt.Printf(s, a...) +} + +func panicToErr(h errDecorator, fn func()) (err error) { + defer panicValToErr(h, callRecoverSentinel, &err, nil, debugging) + fn() + return +} + +// panicValToErr will convert a panic value into an error +// +// err and recovered are guaranteed to be not nil +func panicValToErr(h errDecorator, recovered interface{}, err, errCopy *error, panicAgain bool) { + if recovered == callRecoverSentinel { + recovered = recover() + } + if recovered == nil || err == nil { return } - switch xerr := v.(type) { - case nil: + if recovered == *err { + goto HANDLE_COPY + } + switch xerr := recovered.(type) { + case *outOfBoundsError: + h.wrapErr(xerr, err) case runtime.Error: - d, dok := h.(*Decoder) - if dok && d.bytes && isSliceBoundsError(xerr.Error()) { - *err = io.ErrUnexpectedEOF - } else { + switch d := h.(type) { + case decoderI: + if d.isBytes() && isSliceBoundsError(xerr.Error()) { + // *err = io.ErrUnexpectedEOF + h.wrapErr(io.ErrUnexpectedEOF, err) + } else { + h.wrapErr(xerr, err) + } + default: h.wrapErr(xerr, err) } case error: @@ -2481,62 +2609,55 @@ func panicValToErr(h errDecorator, v interface{}, err *error) { case nil: case io.EOF, io.ErrUnexpectedEOF, errEncoderNotInitialized, errDecoderNotInitialized: // treat as special (bubble up) - *err = xerr + // *err = xerr + h.wrapErr(xerr, err) default: h.wrapErr(xerr, err) } + case string: + h.wrapErr(errors.New(xerr), err) + // *err = errors.New(xerr) default: // we don't expect this to happen (as this library always panics with an error) - h.wrapErr(fmt.Errorf("%v", v), err) + h.wrapErr(fmt.Errorf("%v", recovered), err) + } +HANDLE_COPY: + if errCopy != nil { + *errCopy = *err + } + if panicAgain { + panic(*err) } } -func usableByteSlice(bs []byte, slen int) (out []byte, changed bool) { +func usableByteSlice(bs []byte, slen int) (out []byte, isMadeNew bool) { const maxCap = 1024 * 1024 * 64 // 64MB - const skipMaxCap = false // allow to test - if slen <= 0 { - return []byte{}, true + // const skipMaxCap = false // allow to test + + // if slen <= 0 { + // return bs[:0], false // return zeroByteSlice, true + // } + + // slen=0 means it's defined-length of 0. + // slen<0 means non-defined length which would be determined in future. + + // if bs is nil, for length=0, ensure we don't return a nil []byte, + // which will cause DecodeBytes (caller) to return a nil []byte incorrectly. + if slen == 0 { + return zeroByteSlice, false + } + if slen < 0 { + return bs[:0], false } if slen <= cap(bs) { return bs[:slen], false } // slen > cap(bs) ... handle memory overload appropriately - if skipMaxCap || slen <= maxCap { - return make([]byte, slen), true - } - return make([]byte, maxCap), true -} - -func mapKeyFastKindFor(k reflect.Kind) mapKeyFastKind { - return mapKeyFastKindVals[k&31] -} - -// ---- - -type codecFnInfo struct { - ti *typeInfo - xfFn Ext - xfTag uint64 - addrD bool - addrDf bool // force: if addrD, then decode function MUST take a ptr - addrE bool - // addrEf bool // force: if addrE, then encode function MUST take a ptr -} - -// codecFn encapsulates the captured variables and the encode function. 
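panicToErr and panicValToErr above funnel panics into returned errors, using callRecoverSentinel so the helper can invoke recover itself. The core defer/recover-to-error pattern they build on, minus the sentinel and errDecorator plumbing:

package main

import (
    "errors"
    "fmt"
)

// safeCall converts a panic inside fn into a returned error, switching on
// the recovered value's type much like panicValToErr does.
func safeCall(fn func()) (err error) {
    defer func() {
        if r := recover(); r != nil {
            switch x := r.(type) {
            case error:
                err = x
            case string:
                err = errors.New(x)
            default:
                err = fmt.Errorf("%v", r)
            }
        }
    }()
    fn()
    return
}

func main() {
    err := safeCall(func() { panic("boom") })
    fmt.Println(err) // boom
}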
-// This way, we only do some calculations one times, and pass to the -// code block that should be called (encapsulated in a function) -// instead of executing the checks every time. -type codecFn struct { - i codecFnInfo - fe func(*Encoder, *codecFnInfo, reflect.Value) - fd func(*Decoder, *codecFnInfo, reflect.Value) - // _ [1]uint64 // padding (cache-aligned) -} - -type codecRtidFn struct { - rtid uintptr - fn *codecFn + return make([]byte, min(slen, maxCap)), true + // if skipMaxCap || slen <= maxCap { + // return make([]byte, slen), true + // } + // return make([]byte, maxCap), true } func makeExt(ext interface{}) Ext { @@ -2552,12 +2673,25 @@ func makeExt(ext interface{}) Ext { } func baseRV(v interface{}) (rv reflect.Value) { + // MARKER TODO try using rv4i not reflect.ValueOf // use reflect.ValueOf, not rv4i, as of go 1.16beta, rv4i was not inlineable for rv = reflect.ValueOf(v); rv.Kind() == reflect.Ptr; rv = rv.Elem() { } return } +func baseRVRV(v reflect.Value) (rv reflect.Value) { + for rv = v; rv.Kind() == reflect.Ptr; rv = rv.Elem() { + } + return +} + +func baseRT(v reflect.Type) (vv reflect.Type) { + for vv = v; vv.Kind() == reflect.Ptr; vv = vv.Elem() { + } + return +} + // ---- // these "checkOverflow" functions must be inlinable, and not call anybody. @@ -2613,25 +2747,25 @@ func (checkOverflow) SignedInt(v uint64) (overflow bool) { func (x checkOverflow) Float32V(v float64) float64 { if x.Float32(v) { - halt.errorf("float32 overflow: %v", v) + halt.errorFloat("float32 overflow: ", v) } return v } func (x checkOverflow) UintV(v uint64, bitsize uint8) uint64 { if x.Uint(v, bitsize) { - halt.errorf("uint64 overflow: %v", v) + halt.errorUint("uint64 overflow: ", v) } return v } func (x checkOverflow) IntV(v int64, bitsize uint8) int64 { if x.Int(v, bitsize) { - halt.errorf("int64 overflow: %v", v) + halt.errorInt("int64 overflow: ", v) } return v } func (x checkOverflow) SignedIntV(v uint64) int64 { if x.SignedInt(v) { - halt.errorf("uint64 to int64 overflow: %v", v) + halt.errorUint("uint64 to int64 overflow: ", v) } return int64(v) } @@ -2660,6 +2794,91 @@ func isNumberChar(v byte) bool { // ----------------------- +func pruneSignExt(v []byte, pos bool) (n int) { + if len(v) < 2 { + } else if pos && v[0] == 0 { + for ; v[n] == 0 && n+1 < len(v) && (v[n+1]&(1<<7) == 0); n++ { + } + } else if !pos && v[0] == 0xff { + for ; v[n] == 0xff && n+1 < len(v) && (v[n+1]&(1<<7) != 0); n++ { + } + } + return +} + +func halfFloatToFloatBits(h uint16) (f uint32) { + // retrofitted from: + // - OGRE (Object-Oriented Graphics Rendering Engine) + // function: halfToFloatI https://www.ogre3d.org/docs/api/1.9/_ogre_bitwise_8h_source.html + + s := uint32(h >> 15) + m := uint32(h & 0x03ff) + e := int32((h >> 10) & 0x1f) + + if e == 0 { + if m == 0 { // plus or minus 0 + return s << 31 + } + // Denormalized number -- renormalize it + for (m & 0x0400) == 0 { + m <<= 1 + e -= 1 + } + e += 1 + m &= ^uint32(0x0400) + } else if e == 31 { + if m == 0 { // Inf + return (s << 31) | 0x7f800000 + } + return (s << 31) | 0x7f800000 | (m << 13) // NaN + } + e = e + (127 - 15) + m = m << 13 + return (s << 31) | (uint32(e) << 23) | m +} + +func floatToHalfFloatBits(i uint32) (h uint16) { + // retrofitted from: + // - OGRE (Object-Oriented Graphics Rendering Engine) + // function: halfToFloatI https://www.ogre3d.org/docs/api/1.9/_ogre_bitwise_8h_source.html + // - http://www.java2s.com/example/java-utility-method/float-to/floattohalf-float-f-fae00.html + s := (i >> 16) & 0x8000 + e := int32(((i >> 23) & 0xff) 
- (127 - 15))
+	m := i & 0x7fffff
+
+	var h32 uint32
+
+	if e <= 0 {
+		if e < -10 { // zero
+			h32 = s // track -0 vs +0
+		} else {
+			m = (m | 0x800000) >> uint32(1-e)
+			h32 = s | (m >> 13)
+		}
+	} else if e == 0xff-(127-15) {
+		if m == 0 { // Inf
+			h32 = s | 0x7c00
+		} else { // NAN
+			m >>= 13
+			var me uint32
+			if m == 0 {
+				me = 1
+			}
+			h32 = s | 0x7c00 | m | me
+		}
+	} else {
+		if e > 30 { // Overflow
+			h32 = s | 0x7c00
+		} else {
+			h32 = s | (uint32(e) << 10) | (m >> 13)
+		}
+	}
+	h = uint16(h32)
+	return
+}
+
+// -----------------------
+
 type ioFlusher interface {
 	Flush() error
 }
@@ -2719,31 +2938,67 @@ func (x *bitset256) isset(pos byte) bool {
 
 // ------------
 
+// panicHdl will panic with the parameters passed.
 type panicHdl struct{}
 
-// errorv will panic if err is defined (not nil)
 func (panicHdl) onerror(err error) {
 	if err != nil {
 		panic(err)
 	}
 }
 
+func (panicHdl) error(err error) { panic(err) }
+
+func (panicHdl) errorStr(s string) { panic(s) }
+
+func (panicHdl) errorStr2(s, s2 string) { panic(s + s2) }
+
+func (panicHdl) errorBytes(s string, p1 []byte) { panic(s + stringView(p1)) }
+
+func (v panicHdl) errorByte(prefix string, p1 byte) {
+	panic(stringView(append(panicHdlBytes(prefix), p1)))
+}
+
+func (v panicHdl) errorInt(prefix string, p1 int64) {
+	panic(stringView(strconv.AppendInt(panicHdlBytes(prefix), p1, 10)))
+	// bs := make([]byte, len(prefix)+8)
+	// bs = append(bs, prefix...)
+	// bs = strconv.AppendInt(bs, p1, 10)
+	// panic(stringView(bs))
+}
+
+func (v panicHdl) errorUint(prefix string, p1 uint64) {
+	panic(stringView(strconv.AppendUint(panicHdlBytes(prefix), p1, 10)))
+}
+
+func (v panicHdl) errorFloat(prefix string, p1 float64) {
+	panic(stringView(strconv.AppendFloat(panicHdlBytes(prefix), p1, 'G', -1, 64)))
+}
+
+// MARKER
+// consider adding //go:noinline to errorf and maybe other methods
+
 // errorf will always panic, using the parameters passed.
 //
 // Note: it is ok to pass in a stringView, as it will just pass it directly
 // to a fmt.Sprintf call and not hold onto it.
 //
-//go:noinline
+// Since this is an unexported call, we will not be defensive.
+// Callers should ensure a non-empty string and 1+ parameters.
 func (panicHdl) errorf(format string, params ...interface{}) {
-	if format == "" {
-		panic(errPanicUndefined)
-	}
-	if len(params) == 0 {
-		panic(errors.New(format))
-	}
+	// if format == "" {
+	// 	panic(errPanicUndefined)
+	// }
+	// if len(params) == 0 {
+	// 	panic(errors.New(format))
+	// }
 	panic(fmt.Errorf(format, params...))
}
 
+func panicHdlBytes(prefix string) []byte {
+	return append(make([]byte, 0, len(prefix)+8), prefix...)
+}
+
 // ----------------------------------------------------
 
 type errDecorator interface {
@@ -2778,12 +3033,12 @@ func (mustHdl) Float(s float64, err error) float64 {
 
 // -------------------
 
 func freelistCapacity(length int) (capacity int) {
-	for capacity = 8; capacity <= length; capacity *= 2 {
+	for capacity = 8; capacity < length; capacity *= 2 {
 	}
 	return
 }
 
-// bytesFreelist is a list of byte buffers, sorted by cap.
+// bytesFreeList is a list of byte buffers, sorted by cap.
 //
 // In anecdotal testing (running go test -tsd 1..6), we couldn't get
 // the length of the list > 4 at any time. So we believe a linear search
@@ -2807,12 +3062,12 @@ func freelistCapacity(length int) (capacity int) {
 //	if !byteSliceSameData(v0, v1) {
 //		blist.put(v0)
 //	}
-type bytesFreelist [][]byte
+type bytesFreeList [][]byte
 
 // peek returns a slice of possibly non-zero'ed bytes, with len=0,
 // and with the largest capacity from the list.
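The freelistCapacity hunk above is a behavior change, not just part of the bytesFreelist to bytesFreeList rename: with the old `<=` condition, a request whose length was already a power of two (8 or larger) was rounded up to the next power; with `<`, such a request now gets exactly that capacity. A minimal standalone sketch of the boundary — the Old/New names below are illustrative copies, not identifiers in the codec package:

package main

import "fmt"

// freelistCapacityOld/New copy the two loop conditions from the hunk above,
// purely to show where their results diverge.
func freelistCapacityOld(length int) (capacity int) {
	for capacity = 8; capacity <= length; capacity *= 2 {
	}
	return
}

func freelistCapacityNew(length int) (capacity int) {
	for capacity = 8; capacity < length; capacity *= 2 {
	}
	return
}

func main() {
	for _, n := range []int{7, 8, 9, 64, 65} {
		fmt.Println(n, freelistCapacityOld(n), freelistCapacityNew(n))
	}
	// 7 8 8
	// 8 16 8    <- an exact power of two no longer doubles
	// 9 16 16
	// 64 128 64 <- likewise
	// 65 128 128
}

Only exact powers of two are affected; every other length still rounds up to the next power as before.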
-func (x *bytesFreelist) peek(length int, pop bool) (out []byte) { - if bytesFreeListNoCache { +func (x *bytesFreeList) peek(length int, pop bool) (out []byte) { + if !useBytesFreeList { return make([]byte, 0, freelistCapacity(length)) } y := *x @@ -2838,15 +3093,12 @@ func (x *bytesFreelist) peek(length int, pop bool) (out []byte) { // get returns a slice of possibly non-zero'ed bytes, with len=0, // and with cap >= length requested. -func (x *bytesFreelist) get(length int) (out []byte) { - if bytesFreeListNoCache { +func (x *bytesFreeList) get(length int) (out []byte) { + if !useBytesFreeList { return make([]byte, 0, freelistCapacity(length)) } y := *x - // MARKER: do not use range, as range is not currently inlineable as of go 1.16-beta - // for i, v := range y { - for i := 0; i < len(y); i++ { - v := y[i] + for i, v := range y { if cap(v) >= length { // *x = append(y[:i], y[i+1:]...) copy(y[i:], y[i+1:]) @@ -2857,21 +3109,21 @@ func (x *bytesFreelist) get(length int) (out []byte) { return make([]byte, 0, freelistCapacity(length)) } -func (x *bytesFreelist) put(v []byte) { - if bytesFreeListNoCache || cap(v) == 0 { +func (x *bytesFreeList) put(v []byte) { + if !useBytesFreeList || cap(v) == 0 { return } if len(v) != 0 { v = v[:0] } + // v = v[:0] // append the new value, then try to put it in a better position y := append(*x, v) *x = y - // MARKER: do not use range, as range is not currently inlineable as of go 1.16-beta + // MARKER: use simple for loop, so as not to create new slice // for i, z := range y[:len(y)-1] { for i := 0; i < len(y)-1; i++ { - z := y[i] - if cap(z) > cap(v) { + if cap(y[i]) > cap(v) { copy(y[i+1:], y[i:]) y[i] = v return @@ -2879,24 +3131,23 @@ func (x *bytesFreelist) put(v []byte) { } } -func (x *bytesFreelist) check(v []byte, length int) (out []byte) { +func (x *bytesFreeList) check(v []byte, length int) (out []byte) { // ensure inlineable, by moving slow-path out to its own function if cap(v) >= length { return v[:0] } - return x.checkPutGet(v, length) + return x.putGet(v, length) } -func (x *bytesFreelist) checkPutGet(v []byte, length int) []byte { - // checkPutGet broken out into its own function, so check is inlineable in general case - const useSeparateCalls = false +func (x *bytesFreeList) putGet(v []byte, length int) []byte { + // putGet broken out into its own function, so check is inlineable in general case - if useSeparateCalls { + if useBytesFreeListPutGetSeparateCalls { x.put(v) return x.get(length) } - if bytesFreeListNoCache { + if !useBytesFreeList { return make([]byte, 0, freelistCapacity(length)) } @@ -2907,8 +3158,9 @@ func (x *bytesFreelist) checkPutGet(v []byte, length int) []byte { y = append(y, v) *x = y } - for i := 0; i < len(y); i++ { - z := y[i] + // for i := 0; i < len(y); i++ { + // z := y[i] + for i, z := range y { if put { if cap(z) >= length { copy(y[i:], y[i+1:]) @@ -2929,7 +3181,7 @@ func (x *bytesFreelist) checkPutGet(v []byte, length int) []byte { // ------------------------- -// sfiRvFreelist is used by Encoder for encoding structs, +// sfiRvFreeList is used by Encoder for encoding structs, // where we have to gather the fields first and then // analyze them for omitEmpty, before knowing the length of the array/map to encode. // @@ -2938,9 +3190,12 @@ func (x *bytesFreelist) checkPutGet(v []byte, length int) []byte { // // In the general case, the length of this list at most times is 1, // so linear search is fine. 
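Before the diff moves on to the sfiRv free list below, the get/put contract of bytesFreeList is easier to see in isolation. The following is a minimal re-sketch under simplifying assumptions — hypothetical freeList and capacityFor names, no peek/check/putGet, and no useBytesFreeList gate — not the vendored implementation:

package main

import "fmt"

type freeList [][]byte

func capacityFor(length int) (c int) {
	for c = 8; c < length; c *= 2 {
	}
	return
}

func (x *freeList) get(length int) []byte {
	y := *x
	for i, v := range y { // sorted by cap: the first fit is also the smallest fit
		if cap(v) >= length {
			copy(y[i:], y[i+1:])
			*x = y[:len(y)-1]
			return v
		}
	}
	return make([]byte, 0, capacityFor(length))
}

func (x *freeList) put(v []byte) {
	if cap(v) == 0 {
		return
	}
	v = v[:0]
	y := append(*x, v)
	*x = y
	for i := 0; i < len(y)-1; i++ { // insertion keeps the list sorted by cap
		if cap(y[i]) > cap(v) {
			copy(y[i+1:], y[i:])
			y[i] = v
			return
		}
	}
}

func main() {
	var fl freeList
	b := fl.get(100)         // empty list: allocates len 0, cap 128
	b = append(b, "data"...) // caller grows it as needed
	fl.put(b)                // file it back, len reset to 0
	c := fl.get(50)          // reuses the same cap-128 backing array
	fmt.Println(cap(b), len(c), cap(c)) // 128 0 128
}

Keeping the list sorted by capacity is what makes the linear scan in get a best-fit search: the first buffer that is large enough is also the smallest one that is.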
-type sfiRvFreelist [][]sfiRv
+type sfiRvFreeList [][]sfiRv
 
-func (x *sfiRvFreelist) get(length int) (out []sfiRv) {
+func (x *sfiRvFreeList) get(length int) (out []sfiRv) {
+	if !useSfiRvFreeList {
+		return make([]sfiRv, 0, freelistCapacity(length))
+	}
 	y := *x
 
 	// MARKER: do not use range, as range is not currently inlineable as of go 1.16-beta
@@ -2957,7 +3212,10 @@ func (x *sfiRvFreelist) get(length int) (out []sfiRv) {
 	return make([]sfiRv, 0, freelistCapacity(length))
 }
 
-func (x *sfiRvFreelist) put(v []sfiRv) {
+func (x *sfiRvFreeList) put(v []sfiRv) {
+	if !useSfiRvFreeList {
+		return
+	}
 	if len(v) != 0 {
 		v = v[:0]
 	}
@@ -3002,3 +3260,19 @@
 	}
 	return
 }
+
+// ----
+
+type bytesEncoder interface {
+	Encode(dst, src []byte)
+	Decode(dst, src []byte) (n int, err error)
+	EncodedLen(n int) int
+	DecodedLen(n int) int
+}
+
+type hexEncoder struct{}
+
+func (hexEncoder) Encode(dst, src []byte) { hex.Encode(dst, src) }
+func (hexEncoder) Decode(dst, src []byte) (n int, err error) { return hex.Decode(dst, src) }
+func (hexEncoder) EncodedLen(n int) int { return hex.EncodedLen(n) }
+func (hexEncoder) DecodedLen(n int) int { return hex.DecodedLen(n) }
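The bytesEncoder interface added at the end of the diff mirrors the shape of the encoding/hex package-level functions, which is why hexEncoder is a one-line adapter per method; any byte-to-text encoding with the same length accounting could stand in behind it. A standalone sketch of driving the interface — roundTrip is a hypothetical helper for illustration, not codec API:

package main

import (
	"encoding/hex"
	"fmt"
)

type bytesEncoder interface {
	Encode(dst, src []byte)
	Decode(dst, src []byte) (n int, err error)
	EncodedLen(n int) int
	DecodedLen(n int) int
}

type hexEncoder struct{}

func (hexEncoder) Encode(dst, src []byte)                    { hex.Encode(dst, src) }
func (hexEncoder) Decode(dst, src []byte) (n int, err error) { return hex.Decode(dst, src) }
func (hexEncoder) EncodedLen(n int) int                      { return hex.EncodedLen(n) }
func (hexEncoder) DecodedLen(n int) int                      { return hex.DecodedLen(n) }

// roundTrip sizes both buffers via the interface, so it works for any
// bytesEncoder implementation, not just hex.
func roundTrip(e bytesEncoder, src []byte) ([]byte, error) {
	enc := make([]byte, e.EncodedLen(len(src)))
	e.Encode(enc, src)
	dec := make([]byte, e.DecodedLen(len(enc)))
	n, err := e.Decode(dec, enc)
	return dec[:n], err
}

func main() {
	out, err := roundTrip(hexEncoder{}, []byte("codec"))
	fmt.Println(string(out), err) // codec <nil>
}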

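As a closing worked example for the half-float helpers earlier in the diff: for normal numbers, floatToHalfFloatBits reduces to re-biasing the exponent from 127 to 15 and dropping the low 13 mantissa bits. This standalone sketch (normal-number path only, ignoring the zero/denormal/Inf/NaN branches) converts 1.5:

package main

import (
	"fmt"
	"math"
)

func main() {
	i := math.Float32bits(1.5)             // 0x3fc00000
	s := (i >> 16) & 0x8000                // sign bit, moved to the half-float position
	e := int32((i>>23)&0xff) - (127 - 15)  // re-bias exponent: float32 bias 127 -> half bias 15
	m := i & 0x7fffff                      // 23-bit mantissa
	h := uint16(s | uint32(e)<<10 | m>>13) // normal-number path of floatToHalfFloatBits
	fmt.Printf("%#x\n", h)                 // 0x3e00, i.e. 1.5 as a half-float
}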