From 88f6fc6a1684326ae1f947ea8ec24ad0bff0f449 Mon Sep 17 00:00:00 2001 From: Leo Kettmeir Date: Mon, 20 Feb 2023 18:47:42 +0100 Subject: [PATCH] refactor: use ops for idna & punycode (#17817) Towards https://github.com/denoland/deno/issues/17809 --- Cargo.lock | 1 + ext/node/Cargo.toml | 1 + ext/node/idna.rs | 26 ++ ext/node/lib.rs | 5 + ext/node/polyfills/dns.ts | 2 +- ext/node/polyfills/internal/dns/promises.ts | 2 +- ext/node/polyfills/internal/idna.ts | 357 -------------------- ext/node/polyfills/punycode.ts | 26 +- ext/node/polyfills/url.ts | 15 +- 9 files changed, 55 insertions(+), 380 deletions(-) create mode 100644 ext/node/idna.rs diff --git a/Cargo.lock b/Cargo.lock index 3946ec0e59..b5a464ddee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1191,6 +1191,7 @@ version = "0.26.0" dependencies = [ "deno_core", "digest 0.10.6", + "idna 0.3.0", "md-5", "md4", "once_cell", diff --git a/ext/node/Cargo.toml b/ext/node/Cargo.toml index 1b7d613ef6..600e4fb820 100644 --- a/ext/node/Cargo.toml +++ b/ext/node/Cargo.toml @@ -16,6 +16,7 @@ path = "lib.rs" [dependencies] deno_core.workspace = true digest = { version = "0.10.5", features = ["core-api", "std"] } +idna = "0.3.0" md-5 = "0.10.5" md4 = "0.10.2" once_cell.workspace = true diff --git a/ext/node/idna.rs b/ext/node/idna.rs new file mode 100644 index 0000000000..8f7cfe34ab --- /dev/null +++ b/ext/node/idna.rs @@ -0,0 +1,26 @@ +// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license. + +use deno_core::error::AnyError; +use deno_core::op; + +#[op] +pub fn op_node_idna_domain_to_ascii( + domain: String, +) -> Result<String, AnyError> { + Ok(idna::domain_to_ascii(&domain)?) 
+} + +#[op] +pub fn op_node_idna_domain_to_unicode(domain: String) -> String { + idna::domain_to_unicode(&domain).0 +} + +#[op] +pub fn op_node_idna_punycode_decode(domain: String) -> String { + idna::punycode::decode_to_string(&domain).unwrap_or_default() +} + +#[op] +pub fn op_node_idna_punycode_encode(domain: String) -> String { + idna::punycode::encode_str(&domain).unwrap_or_default() +} diff --git a/ext/node/lib.rs b/ext/node/lib.rs index fc61dc317e..fb389d8505 100644 --- a/ext/node/lib.rs +++ b/ext/node/lib.rs @@ -14,6 +14,7 @@ use std::rc::Rc; mod crypto; pub mod errors; +mod idna; mod ops; mod package_json; mod path; @@ -376,6 +377,10 @@ pub fn init_polyfill() -> Extension { winerror::op_node_sys_to_uv_error::decl(), v8::op_v8_cached_data_version_tag::decl(), v8::op_v8_get_heap_statistics::decl(), + idna::op_node_idna_domain_to_ascii::decl(), + idna::op_node_idna_domain_to_unicode::decl(), + idna::op_node_idna_punycode_decode::decl(), + idna::op_node_idna_punycode_encode::decl(), op_node_build_os::decl(), ]) .build() diff --git a/ext/node/polyfills/dns.ts b/ext/node/polyfills/dns.ts index 7b8752e13d..1626517f13 100644 --- a/ext/node/polyfills/dns.ts +++ b/ext/node/polyfills/dns.ts @@ -89,7 +89,7 @@ import { GetAddrInfoReqWrap, QueryReqWrap, } from "internal:deno_node/polyfills/internal_binding/cares_wrap.ts"; -import { toASCII } from "internal:deno_node/polyfills/internal/idna.ts"; +import { toASCII } from "internal:deno_node/polyfills/punycode.ts"; import { notImplemented } from "internal:deno_node/polyfills/_utils.ts"; function onlookup( diff --git a/ext/node/polyfills/internal/dns/promises.ts b/ext/node/polyfills/internal/dns/promises.ts index e5e15dc871..40f57fd2c0 100644 --- a/ext/node/polyfills/internal/dns/promises.ts +++ b/ext/node/polyfills/internal/dns/promises.ts @@ -56,7 +56,7 @@ import { GetAddrInfoReqWrap, QueryReqWrap, } from "internal:deno_node/polyfills/internal_binding/cares_wrap.ts"; -import { toASCII } from 
"internal:deno_node/polyfills/internal/idna.ts"; +import { toASCII } from "internal:deno_node/polyfills/punycode.ts"; function onlookup( this: GetAddrInfoReqWrap, diff --git a/ext/node/polyfills/internal/idna.ts b/ext/node/polyfills/internal/idna.ts index f71c06eba6..dd7fe45a67 100644 --- a/ext/node/polyfills/internal/idna.ts +++ b/ext/node/polyfills/internal/idna.ts @@ -48,74 +48,6 @@ "use strict"; -/** Highest positive signed 32-bit float value */ -const maxInt = 2147483647; // aka. 0x7FFFFFFF or 2^31-1 - -/** Bootstring parameters */ -const base = 36; -const tMin = 1; -const tMax = 26; -const skew = 38; -const damp = 700; -const initialBias = 72; -const initialN = 128; // 0x80 -const delimiter = "-"; // '\x2D' - -/** Regular expressions */ -export const regexPunycode = /^xn--/; -export const regexNonASCII = /[^\0-\x7E]/; // non-ASCII chars -const regexSeparators = /[\x2E\u3002\uFF0E\uFF61]/g; // RFC 3490 separators - -/** Error messages */ -const errors: Record<string, string> = { - "overflow": "Overflow: input needs wider integers to process", - "not-basic": "Illegal input >= 0x80 (not a basic code point)", - "invalid-input": "Invalid input", -}; - -/** Convenience shortcuts */ -const baseMinusTMin = base - tMin; -const floor = Math.floor; - -/** - * A generic error utility function. - * - * @param type The error type. - * @return Throws a `RangeError` with the applicable error message. - */ -function error(type: string) { - throw new RangeError(errors[type]); -} - -/** - * A simple `Array#map`-like wrapper to work with domain name strings or email - * addresses. - * - * @param domain The domain name or email address. - * @param callback The function that gets called for every - * character. - * @return A new string of characters returned by the callback - * function. 
- */ -function mapDomain(str: string, fn: (label: string) => string) { - const parts = str.split("@"); - let result = ""; - - if (parts.length > 1) { - // In email addresses, only the domain name should be punycoded. Leave - // the local part (i.e. everything up to `@`) intact. - result = parts[0] + "@"; - str = parts[1]; - } - - // Avoid `split(regex)` for IE8 compatibility. See #17. - str = str.replace(regexSeparators, "\x2E"); - const labels = str.split("."); - const encoded = labels.map(fn).join("."); - - return result + encoded; -} - /** * Creates an array containing the numeric code points of each Unicode * character in the string. While JavaScript uses UCS-2 internally, @@ -170,292 +102,3 @@ export const ucs2 = { decode: ucs2decode, encode: ucs2encode, }; - -/** - * Converts a basic code point into a digit/integer. - * @see `digitToBasic()` - * @private - * @param codePoint The basic numeric code point value. - * @returns The numeric value of a basic code point (for use in - * representing integers) in the range `0` to `base - 1`, or `base` if - * the code point does not represent a value. - */ -function basicToDigit(codePoint: number) { - if (codePoint - 0x30 < 0x0A) { - return codePoint - 0x16; - } - if (codePoint - 0x41 < 0x1A) { - return codePoint - 0x41; - } - if (codePoint - 0x61 < 0x1A) { - return codePoint - 0x61; - } - return base; -} - -/** - * Converts a digit/integer into a basic code point. - * - * @param digit The numeric value of a basic code point. - * @return The basic code point whose value (when used for - * representing integers) is `digit`, which needs to be in the range - * `0` to `base - 1`. If `flag` is non-zero, the uppercase form is - * used; else, the lowercase form is used. The behavior is undefined - * if `flag` is non-zero and `digit` has no uppercase form. 
- */ -function digitToBasic(digit: number, flag: number) { - // 0..25 map to ASCII a..z or A..Z - // 26..35 map to ASCII 0..9 - return digit + 22 + 75 * Number(digit < 26) - (Number(flag != 0) << 5); -} - -/** - * Bias adaptation function as per section 3.4 of RFC 3492. - * https://tools.ietf.org/html/rfc3492#section-3.4 - */ -function adapt(delta: number, numPoints: number, firstTime: boolean) { - let k = 0; - delta = firstTime ? Math.floor(delta / damp) : delta >> 1; - delta += Math.floor(delta / numPoints); - - for (; /* no initialization */ delta > baseMinusTMin * tMax >> 1; k += base) { - delta = Math.floor(delta / baseMinusTMin); - } - - return Math.floor(k + (baseMinusTMin + 1) * delta / (delta + skew)); -} - -/** - * Converts a Punycode string of ASCII-only symbols to a string of Unicode - * symbols. - * @memberOf punycode - * @param input The Punycode string of ASCII-only symbols. - * @returns The resulting string of Unicode symbols. - */ -export function decode(input: string): string { - // Don't use UCS-2. - const output = []; - const inputLength = input.length; - let i = 0; - let n = initialN; - let bias = initialBias; - - // Handle the basic code points: let `basic` be the number of input code - // points before the last delimiter, or `0` if there is none, then copy - // the first basic code points to the output. - - let basic = input.lastIndexOf(delimiter); - if (basic < 0) { - basic = 0; - } - - for (let j = 0; j < basic; ++j) { - // if it's not a basic code point - if (input.charCodeAt(j) >= 0x80) { - error("not-basic"); - } - output.push(input.charCodeAt(j)); - } - - // Main decoding loop: start just after the last delimiter if any basic code - // points were copied; start at the beginning otherwise. - - for ( - let index = basic > 0 ? basic + 1 : 0; - index < inputLength; - /* no final expression */ - ) { - // `index` is the index of the next character to be consumed. 
- // Decode a generalized variable-length integer into `delta`, - // which gets added to `i`. The overflow checking is easier - // if we increase `i` as we go, then subtract off its starting - // value at the end to obtain `delta`. - const oldi = i; - for (let w = 1, k = base;; /* no condition */ k += base) { - if (index >= inputLength) { - error("invalid-input"); - } - - const digit = basicToDigit(input.charCodeAt(index++)); - - if (digit >= base || digit > floor((maxInt - i) / w)) { - error("overflow"); - } - - i += digit * w; - const t = k <= bias ? tMin : (k >= bias + tMax ? tMax : k - bias); - - if (digit < t) { - break; - } - - const baseMinusT = base - t; - if (w > floor(maxInt / baseMinusT)) { - error("overflow"); - } - - w *= baseMinusT; - } - - const out = output.length + 1; - bias = adapt(i - oldi, out, oldi == 0); - - // `i` was supposed to wrap around from `out` to `0`, - // incrementing `n` each time, so we'll fix that now: - if (floor(i / out) > maxInt - n) { - error("overflow"); - } - - n += floor(i / out); - i %= out; - - // Insert `n` at position `i` of the output. - output.splice(i++, 0, n); - } - - return String.fromCodePoint(...output); -} - -/** - * Converts a string of Unicode symbols (e.g. a domain name label) to a - * Punycode string of ASCII-only symbols. - * - * @param str The string of Unicode symbols. - * @return The resulting Punycode string of ASCII-only symbols. - */ -export function encode(str: string) { - const output = []; - - // Convert the input in UCS-2 to an array of Unicode code points. - const input = ucs2decode(str); - - // Cache the length. - const inputLength = input.length; - - // Initialize the state. - let n = initialN; - let delta = 0; - let bias = initialBias; - - // Handle the basic code points. 
- for (const currentValue of input) { - if (currentValue < 0x80) { - output.push(String.fromCharCode(currentValue)); - } - } - - const basicLength = output.length; - let handledCPCount = basicLength; - - // `handledCPCount` is the number of code points that have been handled; - // `basicLength` is the number of basic code points. - - // Finish the basic string with a delimiter unless it's empty. - if (basicLength) { - output.push(delimiter); - } - - // Main encoding loop: - while (handledCPCount < inputLength) { - // All non-basic code points < n have been handled already. Find the next - // larger one: - let m = maxInt; - - for (const currentValue of input) { - if (currentValue >= n && currentValue < m) { - m = currentValue; - } - } - - // Increase `delta` enough to advance the decoder's <n,i> state to <m,0>, - // but guard against overflow. - const handledCPCountPlusOne = handledCPCount + 1; - - if (m - n > Math.floor((maxInt - delta) / handledCPCountPlusOne)) { - error("overflow"); - } - - delta += (m - n) * handledCPCountPlusOne; - n = m; - - for (const currentValue of input) { - if (currentValue < n && ++delta > maxInt) { - error("overflow"); - } - - if (currentValue == n) { - // Represent delta as a generalized variable-length integer. - let q = delta; - - for (let k = base;; /* no condition */ k += base) { - const t = k <= bias ? tMin : (k >= bias + tMax ? tMax : k - bias); - - if (q < t) { - break; - } - - const qMinusT = q - t; - const baseMinusT = base - t; - - output.push( - String.fromCharCode(digitToBasic(t + qMinusT % baseMinusT, 0)), - ); - - q = Math.floor(qMinusT / baseMinusT); - } - - output.push(String.fromCharCode(digitToBasic(q, 0))); - - bias = adapt( - delta, - handledCPCountPlusOne, - handledCPCount == basicLength, - ); - - delta = 0; - ++handledCPCount; - } - } - - ++delta; - ++n; - } - - return output.join(""); -} - -/** - * Converts a Punycode string representing a domain name or an email address - * to Unicode. 
Only the Punycoded parts of the input will be converted, i.e. - * it doesn't matter if you call it on a string that has already been - * converted to Unicode. - * @memberOf punycode - * @param input The Punycoded domain name or email address to - * convert to Unicode. - * @returns The Unicode representation of the given Punycode - * string. - */ -export function toUnicode(input: string) { - return mapDomain(input, function (string) { - return regexPunycode.test(string) - ? decode(string.slice(4).toLowerCase()) - : string; - }); -} - -/** - * Converts a Unicode string representing a domain name or an email address to - * Punycode. Only the non-ASCII parts of the domain name will be converted, - * i.e. it doesn't matter if you call it with a domain that's already in - * ASCII. - * - * @param input The domain name or email address to convert, as a - * Unicode string. - * @return The Punycode representation of the given domain name or - * email address. - */ -export function toASCII(input: string): string { - return mapDomain(input, function (str: string) { - return regexNonASCII.test(str) ? "xn--" + encode(str) : str; - }); -} diff --git a/ext/node/polyfills/punycode.ts b/ext/node/polyfills/punycode.ts index f58871c0aa..30fb727c2f 100644 --- a/ext/node/polyfills/punycode.ts +++ b/ext/node/polyfills/punycode.ts @@ -1,12 +1,24 @@ // Copyright 2018-2023 the Deno authors. All rights reserved. MIT license. 
-import { - decode, - encode, - toASCII, - toUnicode, - ucs2, -} from "internal:deno_node/polyfills/internal/idna.ts"; +import { ucs2 } from "internal:deno_node/polyfills/internal/idna.ts"; + +const { ops } = globalThis.__bootstrap.core; + +function toASCII(domain) { + return ops.op_node_idna_domain_to_ascii(domain); +} + +function toUnicode(domain) { + return ops.op_node_idna_domain_to_unicode(domain); +} + +function decode(domain) { + return ops.op_node_idna_punycode_decode(domain); +} + +function encode(domain) { + return ops.op_node_idna_punycode_encode(domain); +} export { decode, encode, toASCII, toUnicode, ucs2 }; diff --git a/ext/node/polyfills/url.ts b/ext/node/polyfills/url.ts index 31b1e676f9..6d38fd1ffc 100644 --- a/ext/node/polyfills/url.ts +++ b/ext/node/polyfills/url.ts @@ -67,12 +67,7 @@ import { CHAR_ZERO_WIDTH_NOBREAK_SPACE, } from "internal:deno_node/polyfills/path/_constants.ts"; import * as path from "internal:deno_node/polyfills/path.ts"; -import { - regexNonASCII, - regexPunycode, - toASCII, - toUnicode, -} from "internal:deno_node/polyfills/internal/idna.ts"; +import { toASCII, toUnicode } from "internal:deno_node/polyfills/punycode.ts"; import { isWindows, osType } from "internal:deno_node/polyfills/_util/os.ts"; import { encodeStr, @@ -1263,10 +1258,6 @@ export function resolveObject(source: string | Url, relative: string) { * @see https://www.rfc-editor.org/rfc/rfc3490#section-4 */ export function domainToASCII(domain: string) { - if (regexPunycode.test(domain) && regexNonASCII.test(domain)) { - return ""; // Failure case - } - return toASCII(domain); } @@ -1277,10 +1268,6 @@ export function domainToASCII(domain: string) { * @see https://www.rfc-editor.org/rfc/rfc3490#section-4 */ export function domainToUnicode(domain: string) { - if (regexPunycode.test(domain) && regexNonASCII.test(domain)) { - return ""; // Failure case - } - return toUnicode(domain); }