fix(fetch): Correctly decode multipart/form-data names and filenames (#19145)

Currently the `multipart/form-data` parser in `Request.prototype.formData` and `Response.prototype.formData` decodes non-ASCII filenames incorrectly, as if they were encoded in Latin-1 rather than UTF-8. This happens because the header section of each `multipart/form-data` entry is decoded as Latin-1 in order to be parsed with `Headers`, which only allows `ByteString`s, but the names and filenames are never decoded correctly. This PR fixes this as a post-processing step. Note that the `multipart/form-data` parsing for this APIs in the Fetch spec is very much underspecified, and it does not specify that names and filenames must be decoded as UTF-8. However, it does require that the bodies of non-`File` entries are decoded as UTF-8, and in browsers, names and filenames always use the same encoding as the body. Closes #19142.
2025-01-21 21:50:00 -05:00 · 2023-05-16 17:49:35 +02:00 · 2023-05-16 17:49:35 +02:00 · 9ba2c4c42f
commit 9ba2c4c42f
parent 1c74b41855
2 changed files with 81 additions and 5 deletions
--- a/cli/tests/unit/body_test.ts
+++ b/cli/tests/unit/body_test.ts
@ -53,6 +53,59 @@ Deno.test(
  },
 );

+// FormData: non-ASCII names and filenames
+Deno.test(
+  { permissions: { net: true } },
+  async function bodyMultipartFormDataNonAsciiNames() {
+    const boundary = "----01230123";
+    const payload = [
+      `--${boundary}`,
+      `Content-Disposition: form-data; name="文字"`,
+      "",
+      "文字",
+      `--${boundary}`,
+      `Content-Disposition: form-data; name="file"; filename="文字"`,
+      "Content-Type: application/octet-stream",
+      "",
+      "",
+      `--${boundary}--`,
+    ].join("\r\n");
+
+    const body = buildBody(
+      new TextEncoder().encode(payload),
+      new Headers({
+        "Content-Type": `multipart/form-data; boundary=${boundary}`,
+      }),
+    );
+
+    const formData = await body.formData();
+    assert(formData.has("文字"));
+    assertEquals(formData.get("文字"), "文字");
+    assert(formData.has("file"));
+    assert(formData.get("file") instanceof File);
+    assertEquals((formData.get("file") as File).name, "文字");
+  },
+);
+
+// FormData: non-ASCII names and filenames roundtrip
+Deno.test(
+  { permissions: { net: true } },
+  async function bodyMultipartFormDataNonAsciiRoundtrip() {
+    const inFormData = new FormData();
+    inFormData.append("文字", "文字");
+    inFormData.append("file", new File([], "文字"));
+
+    const body = buildBody(inFormData);
+
+    const formData = await body.formData();
+    assert(formData.has("文字"));
+    assertEquals(formData.get("文字"), "文字");
+    assert(formData.has("file"));
+    assert(formData.get("file") instanceof File);
+    assertEquals((formData.get("file") as File).name, "文字");
+  },
+);
+
 Deno.test(
  { permissions: { net: true } },
  async function bodyURLEncodedFormData() {
--- a/ext/fetch/21_formdata.js
+++ b/ext/fetch/21_formdata.js
@ -41,6 +41,7 @@ const {
  StringPrototypeReplaceAll,
  TypeError,
  TypedArrayPrototypeSubarray,
+  Uint8Array,
 } = primordials;

 const entryList = Symbol("entry list");
@ -358,6 +359,20 @@ function parseContentDisposition(value) {
  return params;
 }

+/**
+ * Decodes a string containing UTF-8 mistakenly decoded as Latin-1 and
+ * decodes it correctly.
+ * @param {string} latin1String
+ * @returns {string}
+ */
+function decodeLatin1StringAsUtf8(latin1String) {
+  const buffer = new Uint8Array(latin1String.length);
+  for (let i = 0; i < latin1String.length; i++) {
+    buffer[i] = latin1String.charCodeAt(i);
+  }
+  return core.decode(buffer);
+}
+
 const CRLF = "\r\n";
 const LF = StringPrototypeCodePointAt(CRLF, 1);
 const CR = StringPrototypeCodePointAt(CRLF, 0);
@ -465,23 +480,31 @@ class MultipartParser {
            i - boundaryIndex - 1,
          );
          // https://fetch.spec.whatwg.org/#ref-for-dom-body-formdata
-          const filename = MapPrototypeGet(disposition, "filename");
-          const name = MapPrototypeGet(disposition, "name");
+          // These are UTF-8 decoded as if it was Latin-1.
+          // TODO(@andreubotella): Maybe we shouldn't be parsing entry headers
+          // as Latin-1.
+          const latin1Filename = MapPrototypeGet(disposition, "filename");
+          const latin1Name = MapPrototypeGet(disposition, "name");

          state = 5;
          // Reset
          boundaryIndex = 0;
          headerText = "";

-          if (!name) {
+          if (!latin1Name) {
            continue; // Skip, unknown name
          }

-          if (filename) {
+          const name = decodeLatin1StringAsUtf8(latin1Name);
+          if (latin1Filename) {
            const blob = new Blob([content], {
              type: headers.get("Content-Type") || "application/octet-stream",
            });
-            formData.append(name, blob, filename);
+            formData.append(
+              name,
+              blob,
+              decodeLatin1StringAsUtf8(latin1Filename),
+            );
          } else {
            formData.append(name, core.decode(content));
          }