From 0d6e1720b336e0ed59719dfd8a7bd897a678db26 Mon Sep 17 00:00:00 2001 From: "Rob Moore (MakerX)" Date: Tue, 27 Feb 2024 06:20:37 +0800 Subject: [PATCH] feat: Added `useRawBinaryStrings` option to Decoder to allow override of default UTF-8 behaviour (#3) --- README.md | 25 ++++++++++------- src/Decoder.ts | 28 ++++++++++++++++--- test/decode-raw-strings.test.ts | 49 +++++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+), 14 deletions(-) create mode 100644 test/decode-raw-strings.test.ts diff --git a/README.md b/README.md index 5984ec1..5a213af 100644 --- a/README.md +++ b/README.md @@ -115,6 +115,7 @@ console.log(buffer); | extensionCodec | ExtensionCodec | `ExtensionCodec.defaultCodec` | | context | user-defined | - | | forceBigIntToInt64 | boolean | false | +| useRawBinaryStrings | boolean | false | | maxDepth | number | `100` | | initialBufferSize | number | `2048` | | sortKeys | boolean | false | @@ -122,6 +123,8 @@ console.log(buffer); | forceIntegerToFloat | boolean | false | | ignoreUndefined | boolean | false | +To skip UTF-8 decoding of strings, `useRawBinaryStrings` can be set to `true`. In this case, strings are decoded into `Uint8Array`. + ### `decode(buffer: ArrayLike | BufferSource, options?: DecoderOptions): unknown` It decodes `buffer` that includes a MessagePack-encoded object, and returns the decoded object typed `unknown`. @@ -522,18 +525,19 @@ The mapping of integers varies on the setting of `intMode`. 
| number (53-bit int) | int family | number or bigint (\*2) | | number (64-bit float) | float family | number (64-bit float) | | bigint | int family | number or bigint (\*2) | -| string | str family | string | -| ArrayBufferView | bin family | Uint8Array (\*3) | +| string | str family | string (\*3) | +| ArrayBufferView | bin family | Uint8Array (\*4) | | Array | array family | Array | -| Object | map family | Object (\*4) | -| Date | timestamp ext family | Date (\*5) | +| Object | map family | Object (\*5) | +| Date | timestamp ext family | Date (\*6) | | bigint | int family | bigint | -- \*1 Both `null` and `undefined` are mapped to `nil` (`0xC0`) type, and are decoded into `null` -- \*2 MessagePack ints are decoded as either numbers or bigints depending on the [IntMode](#intmode) used during decoding. -- \*3 Any `ArrayBufferView`s including NodeJS's `Buffer` are mapped to `bin` family, and are decoded into `Uint8Array` -- \*4 In handling `Object`, it is regarded as `Record` in terms of TypeScript -- \*5 MessagePack timestamps may have nanoseconds, which will lost when it is decoded into JavaScript `Date`. This behavior can be overridden by registering `-1` for the extension codec. +* \*1 Both `null` and `undefined` are mapped to `nil` (`0xC0`) type, and are decoded into `null` +* \*2 MessagePack ints are decoded as either numbers or bigints depending on the [IntMode](#intmode) used during decoding. +* \*3 If you'd like to skip UTF-8 decoding of strings, set `useRawBinaryStrings: true`. In this case, strings are decoded into `Uint8Array`. +* \*4 Any `ArrayBufferView`s including NodeJS's `Buffer` are mapped to `bin` family, and are decoded into `Uint8Array` +* \*5 In handling `Object`, it is regarded as `Record` in terms of TypeScript +* \*6 MessagePack timestamps may have nanoseconds, which will be lost when it is decoded into JavaScript `Date`. This behavior can be overridden by registering `-1` for the extension codec. 
If you set `useBigInt64: true`, the following mapping is used: @@ -550,8 +554,9 @@ If you set `useBigInt64: true`, the following mapping is used: | Object | map family | Object | | Date | timestamp ext family | Date | -- \*5 If the bigint is larger than the max value of uint64 or smaller than the min value of int64, then the behavior is undefined. +* \*6 If the bigint is larger than the max value of uint64 or smaller than the min value of int64, then the behavior is undefined. +* \*7 If the bigint is larger than the max value of uint64 or smaller than the min value of int64, then the behavior is undefined. ## Prerequisites This is a universal JavaScript library that supports major browsers and NodeJS. diff --git a/src/Decoder.ts b/src/Decoder.ts index a369959..adf6101 100644 --- a/src/Decoder.ts +++ b/src/Decoder.ts @@ -27,6 +27,17 @@ export type DecoderOptions = Readonly< */ intMode?: IntMode; + /** + * By default, string values will be decoded as UTF-8 strings. However, if this option is true, + * string values will be returned as Uint8Arrays without additional decoding. + * + * This is useful if the strings may contain invalid UTF-8 sequences. + * + * Note that this option only applies to string values, not map keys. Additionally, when + * enabled, raw string length is limited by the maxBinLength option. + */ + useRawBinaryStrings: boolean; + /** * Maximum string length. 
* @@ -202,6 +213,7 @@ export class Decoder { private readonly extensionCodec: ExtensionCodecType; private readonly context: ContextType; private readonly intMode: IntMode; + private readonly useRawBinaryStrings: boolean; private readonly maxStrLength: number; private readonly maxBinLength: number; private readonly maxArrayLength: number; @@ -222,6 +234,7 @@ export class Decoder { this.context = (options as { context: ContextType } | undefined)?.context as ContextType; // needs a type assertion because EncoderOptions has no context property when ContextType is undefined this.intMode = options?.intMode ?? (options?.useBigInt64 ? IntMode.AS_ENCODED : IntMode.UNSAFE_NUMBER); + this.useRawBinaryStrings = options?.useRawBinaryStrings ?? false; this.maxStrLength = options?.maxStrLength ?? UINT32_MAX; this.maxBinLength = options?.maxBinLength ?? UINT32_MAX; this.maxArrayLength = options?.maxArrayLength ?? UINT32_MAX; @@ -406,7 +419,7 @@ export class Decoder { } else { // fixstr (101x xxxx) 0xa0 - 0xbf const byteLength = headByte - 0xa0; - object = this.decodeUtf8String(byteLength, 0); + object = this.decodeString(byteLength, 0); } } else if (headByte === 0xc0) { // nil @@ -450,15 +463,15 @@ export class Decoder { } else if (headByte === 0xd9) { // str 8 const byteLength = this.lookU8(); - object = this.decodeUtf8String(byteLength, 1); + object = this.decodeString(byteLength, 1); } else if (headByte === 0xda) { // str 16 const byteLength = this.lookU16(); - object = this.decodeUtf8String(byteLength, 2); + object = this.decodeString(byteLength, 2); } else if (headByte === 0xdb) { // str 32 const byteLength = this.lookU32(); - object = this.decodeUtf8String(byteLength, 4); + object = this.decodeString(byteLength, 4); } else if (headByte === 0xdc) { // array 16 const size = this.readU16(); @@ -636,6 +649,13 @@ export class Decoder { this.stack.pushArrayState(size); } + private decodeString(byteLength: number, headerOffset: number): string | Uint8Array { + if 
(!this.useRawBinaryStrings || this.stateIsMapKey()) { + return this.decodeUtf8String(byteLength, headerOffset); + } + return this.decodeBinary(byteLength, headerOffset); + } + private decodeUtf8String(byteLength: number, headerOffset: number): string { if (byteLength > this.maxStrLength) { throw new DecodeError( diff --git a/test/decode-raw-strings.test.ts b/test/decode-raw-strings.test.ts new file mode 100644 index 0000000..dd6d7f8 --- /dev/null +++ b/test/decode-raw-strings.test.ts @@ -0,0 +1,49 @@ +import assert from "assert"; +import { encode, decode } from "../src"; +import type { DecoderOptions } from "../src"; + +describe("decode with useRawBinaryStrings specified", () => { + const options = { useRawBinaryStrings: true } satisfies DecoderOptions; + + it("decodes string as binary", () => { + const actual = decode(encode("foo"), options); + const expected = Uint8Array.from([0x66, 0x6f, 0x6f]); + assert.deepStrictEqual(actual, expected); + }); + + it("decodes invalid UTF-8 string as binary", () => { + const invalidUtf8String = Uint8Array.from([ + 61, 180, 118, 220, 39, 166, 43, 68, 219, 116, 105, 84, 121, 46, 122, 136, 233, 221, 15, 174, 247, 19, 50, 176, + 184, 221, 66, 188, 171, 36, 135, 121, + ]); + const encoded = Uint8Array.from([ + 217, 32, 61, 180, 118, 220, 39, 166, 43, 68, 219, 116, 105, 84, 121, 46, 122, 136, 233, 221, 15, 174, 247, 19, 50, + 176, 184, 221, 66, 188, 171, 36, 135, 121, + ]); + + const actual = decode(encoded, options); + assert.deepStrictEqual(actual, invalidUtf8String); + }); + + it("decodes object keys as strings", () => { + const actual = decode(encode({ key: "foo" }), options); + const expected = { key: Uint8Array.from([0x66, 0x6f, 0x6f]) }; + assert.deepStrictEqual(actual, expected); + }); + + it("ignores maxStrLength", () => { + const lengthLimitedOptions = { ...options, maxStrLength: 1 } satisfies DecoderOptions; + + const actual = decode(encode("foo"), lengthLimitedOptions); + const expected = Uint8Array.from([0x66, 0x6f, 
0x6f]); + assert.deepStrictEqual(actual, expected); + }); + + it("respects maxBinLength", () => { + const lengthLimitedOptions = { ...options, maxBinLength: 1 } satisfies DecoderOptions; + + assert.throws(() => { + decode(encode("foo"), lengthLimitedOptions); + }, /max length exceeded/i); + }); +});