diff --git a/lib/internals/decode.js b/lib/internals/decode.js new file mode 100644 index 0000000..0fc0d70 --- /dev/null +++ b/lib/internals/decode.js @@ -0,0 +1,55 @@ +"use strict"; + +/** + * Lenient percent-decoder used only as a fallback when the fast path + * (`fast-decode-uri-component`) returns `null` for input that contains a + * syntactically valid percent-escape that is not valid UTF-8. + * + * It mirrors how `node:querystring` decodes such a token: each `%XX` escape + * with two hex digits contributes its byte, every other code unit (a stray + * "%" included) is masked to its low byte, and the bytes are read as UTF-8, + * so invalid sequences become U+FFFD instead of staying raw "%XX" text. A + * token with no decodable escape (only malformed ones such as "%zz") is + * returned unchanged, exactly as node:querystring leaves it. + * + * @param {string} str - Raw key or value token to decode. + * @returns {string} Decoded text, or `str` itself if no escape was decodable. + */ +function decodeString(str) { + const len = str.length; + const buf = Buffer.allocUnsafe(len); /* NOTE(review): Buffer is a Node global; confirm no browser target */ + let bufLen = 0; /* one byte per loop pass, so len suffices; only buf[0, bufLen) is read */ + let hasEscape = false; + + for (let i = 0; i < len; i++) { + const c = str.charCodeAt(i); + + if (c === 37 /* % */ && i + 2 < len) { + const hi = hexValue(str.charCodeAt(i + 1)); + const lo = hexValue(str.charCodeAt(i + 2)); + + if (hi !== -1 && lo !== -1) { + buf[bufLen++] = (hi << 4) | lo; + hasEscape = true; + i += 2; /* skip both hex digits */ + continue; + } + } + + buf[bufLen++] = c & 0xff; /* literal or malformed "%": keep the low byte */ + } + + // No decodable escape: node:querystring returns the token untouched. 
+ if (!hasEscape) return str; + + return buf.toString("utf8", 0, bufLen); +} + +function hexValue(c) { + if (c >= 48 && c <= 57) return c - 48; // "0"-"9" -> 0-9 + if (c >= 65 && c <= 70) return c - 55; // "A"-"F" -> 10-15 + if (c >= 97 && c <= 102) return c - 87; // "a"-"f" -> 10-15 + return -1; // not a hex digit +} + +module.exports = { decodeString }; diff --git a/lib/parse.js b/lib/parse.js index 0e49fff..0ac91ba 100644 --- a/lib/parse.js +++ b/lib/parse.js @@ -1,6 +1,7 @@ "use strict"; const fastDecode = require("fast-decode-uri-component"); +const { decodeString } = require("./internals/decode"); const plusRegex = /\+/g; const Empty = function () {}; @@ -55,7 +56,11 @@ function parse(input) { // Optimization: Do not decode if it's not necessary. if (shouldDecodeKey) { - key = fastDecode(key) || key; + const decodedKey = fastDecode(key); + // fastDecode returns null on invalid UTF-8; fall back to a lenient + // decoder so the result matches node:querystring (U+FFFD) instead of + // keeping the raw "%XX" text. + key = decodedKey === null ? decodeString(key) : decodedKey; } if (hasBothKeyValuePair) { @@ -66,7 +71,8 @@ function parse(input) { } if (shouldDecodeValue) { - value = fastDecode(value) || value; + const decodedValue = fastDecode(value); + value = decodedValue === null ? decodeString(value) : decodedValue; // same lenient fallback as for the key } } const currentValue = result[key]; diff --git a/test/parse.test.ts b/test/parse.test.ts index 8e97d93..839b463 100644 --- a/test/parse.test.ts +++ b/test/parse.test.ts @@ -60,3 +60,26 @@ test("should parse large numbers", () => { "918854443121279438895193", ); }); + +test("matches node:querystring on invalid UTF-8 escapes in values", () => { + // Valid %XX syntax, invalid UTF-8 (stray, truncated, impossible or overlong + // bytes, an encoded surrogate). node:querystring yields U+FFFD, not "%XX". 
+ for (const input of [ + "a=%C3", + "a=%ff", + "a=%80", + "a=%E4%B8", + "a=%F0%9F%98", + "a=%ED%A0%80", + "a=%C0%80", + "a=%C3%A9%E4%B8", // valid U+00E9 (%C3%A9) followed by a truncated sequence + ]) { + assert.deepEqual(qs.parse(input), querystring.parse(input), input); + } +}); + +test("matches node:querystring on invalid UTF-8 escapes in keys", () => { // same check via the key-decoding path + for (const input of ["%C3=x", "%ff=x", "%E4%B8=x"]) { + assert.deepEqual(qs.parse(input), querystring.parse(input), input); + } +});