Add encoding/csv (denoland/deno_std#432)

Original: c8a7dcdcd0
2025-03-03 09:31:22 -05:00 · 2019-05-24 15:33:42 +02:00 · 2019-05-24 15:33:42 +02:00 · aed65ff333
commit aed65ff333
parent 31db7c4dba
3 changed files with 612 additions and 0 deletions
--- a/encoding/csv.ts
+++ b/encoding/csv.ts
@ -0,0 +1,151 @@
+// Ported from Go:
+// https://github.com/golang/go/blob/go1.12.5/src/encoding/csv/
+// Copyright 2018-2019 the Deno authors. All rights reserved. MIT license.
+
+import { BufReader, BufState } from "../io/bufio.ts";
+import { TextProtoReader } from "../textproto/mod.ts";
+
+// Characters that are rejected as field delimiter or comment marker.
+const INVALID_RUNE = ["\r", "\n", '"'];
+
+/**
+ * Error reported when a CSV record cannot be parsed.
+ *
+ * `StartLine` and `Line` hold the two line indices handed to the
+ * constructor (the line the record starts on and the line on which the
+ * problem was detected).
+ */
+export class ParseError extends Error {
+  StartLine: number;
+  Line: number;
+  /**
+   * @param start line index stored in `StartLine`
+   * @param line line index stored in `Line`
+   * @param message human readable description of the problem
+   */
+  constructor(start: number, line: number, message: string) {
+    super(message);
+    // Report as "ParseError" rather than the generic "Error" in stack
+    // traces and string conversions.
+    this.name = "ParseError";
+    this.StartLine = start;
+    this.Line = line;
+  }
+}
+
+/** Options controlling how CSV text is split into records and fields. */
+export interface ParseOptions {
+  /** Field delimiter. */
+  comma: string;
+  /** When true, leading white space is removed from every field. */
+  trimLeadingSpace: boolean;
+  /** Marker that turns a line into a comment, which is then skipped. */
+  comment?: string;
+  /** When true, a quote inside a non-quoted field is not an error. */
+  lazyQuotes?: boolean;
+  /**
+   * Number of fields expected in each record. 0 means "use the field count
+   * of the first record"; when omitted the field count is not checked.
+   */
+  fieldsPerRecord?: number;
+}
+
+/**
+ * Validates the delimiter related parse options.
+ *
+ * @param opt options to validate
+ * @returns an `Error` with the message "Invalid Delimiter" when `comma` is
+ *   empty, when `comma` or `comment` is one of the reserved characters
+ *   ("\r", "\n", '"'), or when both are the same string; `null` otherwise
+ */
+function chkOptions(opt: ParseOptions): Error | null {
+  const { comma, comment } = opt;
+  // Splitting a line on an empty delimiter yields single characters, so an
+  // empty `comma` is rejected together with the reserved characters.
+  const badComma = comma === "" || INVALID_RUNE.includes(comma);
+  // `comment` is optional: only check it when one was supplied.
+  const badComment = comment !== undefined && INVALID_RUNE.includes(comment);
+  if (badComma || badComment || comma === comment) {
+    return new Error("Invalid Delimiter");
+  }
+  return null;
+}
+
+/**
+ * Reads one line from `reader` and splits it into CSV fields.
+ *
+ * Blank lines, and lines whose first non-blank text is `opt.comment`, yield
+ * an empty record. Every field optionally has its leading white space
+ * removed (`opt.trimLeadingSpace`) and then loses one surrounding pair of
+ * double quotes, or a lone leading quote. Unless `opt.lazyQuotes` is set, a
+ * quote remaining inside a field that does not start with a quote is
+ * reported as a `ParseError`.
+ *
+ * NOTE: the line is split on `opt.comma` before quotes are examined, so a
+ * delimiter inside a quoted field still separates fields.
+ *
+ * @param Startline line index recorded in a `ParseError` produced here
+ * @param reader buffered source the line is read from
+ * @param opt parse options; the default uses "," as delimiter and "#" as
+ *   comment marker
+ * @returns the fields together with the state reported by the line read, or
+ *   an empty record and a `ParseError` when a bare quote is found
+ */
+export async function read(
+  Startline: number,
+  reader: BufReader,
+  opt: ParseOptions = { comma: ",", comment: "#", trimLeadingSpace: false }
+): Promise<[string[], BufState]> {
+  const tp = new TextProtoReader(reader);
+  const [rawLine, err] = await tp.readLine();
+  let line: string = rawLine;
+
+  // Normalize \r\n to \n on all input lines.
+  if (line.endsWith("\r\n")) {
+    line = line.substring(0, line.length - 2) + "\n";
+  }
+
+  const trimmedLine = line.trimLeft();
+  if (trimmedLine.length === 0) {
+    return [[], err];
+  }
+
+  // A line starting with the comment marker is ignored. `startsWith` is used
+  // instead of comparing the first UTF-16 code unit so that markers outside
+  // the Basic Multilingual Plane are recognised as well.
+  if (opt.comment && trimmedLine.startsWith(opt.comment)) {
+    return [[], err];
+  }
+
+  const fields: string[] = [];
+  for (let field of line.split(opt.comma)) {
+    if (opt.trimLeadingSpace) {
+      field = field.trimLeft();
+    }
+    if (
+      // A field that is a single quote has no closing quote to strip; the
+      // length check keeps it out of this branch so the opening quote is
+      // removed by the branch below.
+      field.length >= 2 &&
+      field[0] === '"' &&
+      field[field.length - 1] === '"'
+    ) {
+      field = field.substring(1, field.length - 1);
+    } else if (field[0] === '"') {
+      field = field.substring(1);
+    }
+
+    // In strict mode a quote may only remain at the very start of a field.
+    if (!opt.lazyQuotes && field[0] !== '"' && field.includes('"')) {
+      return [
+        [],
+        new ParseError(Startline, Startline, 'bare " in non-quoted-field')
+      ];
+    }
+    fields.push(field);
+  }
+  return [fields, err];
+}
+
+/**
+ * Reads every record from `reader` until the input is exhausted.
+ *
+ * Blank and comment lines are skipped. When `opt.fieldsPerRecord` is 0 the
+ * field count of the first record is required of every later record; a
+ * positive value is required of every record; when it is omitted or
+ * negative no check is made.
+ *
+ * @param reader buffered source of CSV text
+ * @param opt parse options; the default uses "," as delimiter, keeps leading
+ *   white space and treats stray quotes as errors
+ * @returns `[records, null]` on success. Invalid options, or an error while
+ *   reading a line, give `[recordsReadSoFar, error]`; a record with the
+ *   wrong number of fields gives `[null, ParseError]`.
+ */
+export async function readAll(
+  reader: BufReader,
+  opt: ParseOptions = {
+    comma: ",",
+    trimLeadingSpace: false,
+    lazyQuotes: false
+  }
+): Promise<[string[][], BufState]> {
+  const result: string[][] = [];
+  // Number of fields every record must have; 0 means "do not check".
+  let expectedFields = 0;
+  let err: BufState;
+  let lineResult: string[];
+  let first = true;
+  let lineIndex = 0;
+  err = chkOptions(opt);
+  if (err) return [result, err];
+
+  for (;;) {
+    [lineResult, err] = await read(lineIndex, reader, opt);
+    if (err) break;
+    lineIndex++;
+
+    // Blank and comment lines come back as empty records. They are not data
+    // and must not be used to establish the expected field count.
+    if (lineResult.length === 0) continue;
+
+    // If fieldsPerRecord is 0, the expected count is the number of fields
+    // in the first record.
+    if (first) {
+      first = false;
+      if (opt.fieldsPerRecord !== undefined) {
+        expectedFields =
+          opt.fieldsPerRecord === 0 ? lineResult.length : opt.fieldsPerRecord;
+      }
+    }
+
+    if (expectedFields > 0 && expectedFields !== lineResult.length) {
+      return [
+        null,
+        new ParseError(lineIndex, lineIndex, "wrong number of fields")
+      ];
+    }
+    result.push(lineResult);
+  }
+  // Reaching the end of the input is the normal way out of the loop.
+  if (err !== "EOF") {
+    return [result, err];
+  }
+  return [result, null];
+}
--- a/encoding/csv_test.ts
+++ b/encoding/csv_test.ts
@ -0,0 +1,460 @@
+// Test ported from Golang
+// https://github.com/golang/go/blob/2cc15b1/src/encoding/csv/reader_test.go
+import { test, runIfMain } from "../testing/mod.ts";
+import { assertEquals, assert } from "../testing/asserts.ts";
+import { readAll } from "./csv.ts";
+import { StringReader } from "../io/readers.ts";
+import { BufReader } from "../io/bufio.ts";
+
+const ErrInvalidDelim = "Invalid Delimiter";
+const ErrFieldCount = "wrong number of fields";
+const ErrBareQuote = 'bare " in non-quoted-field';
+
+// TODO(zekth): Activate remaining tests
+// Table of test cases consumed by the loop that follows it. Each entry has a
+// `Name` and either an expected `Output` or an expected `Error` message;
+// `Input` is the CSV text to parse. The optional `Comma`, `Comment`,
+// `TrimLeadingSpace`, `LazyQuotes` and `UseFieldsPerRecord`/`FieldsPerRecord`
+// keys are turned into parse options by that loop (`ReuseRecord` is not read
+// by it). Commented-out entries are cases that are not enabled yet.
+const testCases = [
+  {
+    Name: "Simple",
+    Input: "a,b,c\n",
+    Output: [["a", "b", "c"]]
+  },
+  {
+    Name: "CRLF",
+    Input: "a,b\r\nc,d\r\n",
+    Output: [["a", "b"], ["c", "d"]]
+  },
+  {
+    Name: "BareCR",
+    Input: "a,b\rc,d\r\n",
+    Output: [["a", "b\rc", "d"]]
+  },
+  //   {
+  //     Name: "RFC4180test",
+  //     Input: `#field1,field2,field3
+  // "aaa","bbb","ccc"
+  // "a,a","bbb","ccc"
+  // zzz,yyy,xxx`,
+  //     UseFieldsPerRecord: true,
+  //     FieldsPerRecord: 0,
+  //     Output: [
+  //       ["#field1", "field2", "field3"],
+  //       ["aaa", "bbb", "ccc"],
+  //       ["a,a", `bbb`, "ccc"],
+  //       ["zzz", "yyy", "xxx"]
+  //     ]
+  //   },
+  {
+    Name: "NoEOLTest",
+    Input: "a,b,c",
+    Output: [["a", "b", "c"]]
+  },
+  {
+    Name: "Semicolon",
+    Input: "a;b;c\n",
+    Output: [["a", "b", "c"]],
+    Comma: ";"
+  },
+  //   {
+  //     Name: "MultiLine",
+  //     Input: `"two
+  // line","one line","three
+  // line
+  // field"`,
+  //     Output: [["two\nline"], ["one line"], ["three\nline\nfield"]]
+  //   },
+  {
+    Name: "BlankLine",
+    Input: "a,b,c\n\nd,e,f\n\n",
+    Output: [["a", "b", "c"], ["d", "e", "f"]]
+  },
+  {
+    Name: "BlankLineFieldCount",
+    Input: "a,b,c\n\nd,e,f\n\n",
+    Output: [["a", "b", "c"], ["d", "e", "f"]],
+    UseFieldsPerRecord: true,
+    FieldsPerRecord: 0
+  },
+  {
+    Name: "TrimSpace",
+    Input: " a,  b,   c\n",
+    Output: [["a", "b", "c"]],
+    TrimLeadingSpace: true
+  },
+  {
+    Name: "LeadingSpace",
+    Input: " a,  b,   c\n",
+    Output: [[" a", "  b", "   c"]]
+  },
+  {
+    Name: "Comment",
+    Input: "#1,2,3\na,b,c\n#comment",
+    Output: [["a", "b", "c"]],
+    Comment: "#"
+  },
+  {
+    Name: "NoComment",
+    Input: "#1,2,3\na,b,c",
+    Output: [["#1", "2", "3"], ["a", "b", "c"]]
+  },
+  {
+    Name: "LazyQuotes",
+    Input: `a "word","1"2",a","b`,
+    Output: [[`a "word"`, `1"2`, `a"`, `b`]],
+    LazyQuotes: true
+  },
+  {
+    Name: "BareQuotes",
+    Input: `a "word","1"2",a"`,
+    Output: [[`a "word"`, `1"2`, `a"`]],
+    LazyQuotes: true
+  },
+  {
+    Name: "BareDoubleQuotes",
+    Input: `a""b,c`,
+    Output: [[`a""b`, `c`]],
+    LazyQuotes: true
+  },
+  {
+    Name: "BadDoubleQuotes",
+    Input: `a""b,c`,
+    Error: ErrBareQuote
+    // Error: &ParseError{StartLine: 1, Line: 1, Column: 1, Err: ErrBareQuote},
+  },
+  {
+    Name: "TrimQuote",
+    Input: ` "a"," b",c`,
+    Output: [["a", " b", "c"]],
+    TrimLeadingSpace: true
+  },
+  {
+    Name: "BadBareQuote",
+    Input: `a "word","b"`,
+    Error: ErrBareQuote
+    // Error: true //&ParseError{StartLine: 1, Line: 1, Column: 2, Err: ErrBareQuote},
+  },
+  {
+    Name: "BadTrailingQuote",
+    Input: `"a word",b"`,
+    Error: ErrBareQuote
+  },
+  {
+    Name: "ExtraneousQuote",
+    Input: `"a "word","b"`,
+    Error: ErrBareQuote
+  },
+  {
+    Name: "BadFieldCount",
+    Input: "a,b,c\nd,e",
+    Error: ErrFieldCount,
+    UseFieldsPerRecord: true,
+    FieldsPerRecord: 0
+  },
+  {
+    Name: "BadFieldCount1",
+    Input: `a,b,c`,
+    // Error:              &ParseError{StartLine: 1, Line: 1, Err: ErrFieldCount},
+    UseFieldsPerRecord: true,
+    FieldsPerRecord: 2,
+    Error: ErrFieldCount
+  },
+  {
+    Name: "FieldCount",
+    Input: "a,b,c\nd,e",
+    Output: [["a", "b", "c"], ["d", "e"]]
+  },
+  {
+    Name: "TrailingCommaEOF",
+    Input: "a,b,c,",
+    Output: [["a", "b", "c", ""]]
+  },
+  {
+    Name: "TrailingCommaEOL",
+    Input: "a,b,c,\n",
+    Output: [["a", "b", "c", ""]]
+  },
+  {
+    Name: "TrailingCommaSpaceEOF",
+    Input: "a,b,c, ",
+    Output: [["a", "b", "c", ""]],
+    TrimLeadingSpace: true
+  },
+  {
+    Name: "TrailingCommaSpaceEOL",
+    Input: "a,b,c, \n",
+    Output: [["a", "b", "c", ""]],
+    TrimLeadingSpace: true
+  },
+  {
+    Name: "TrailingCommaLine3",
+    Input: "a,b,c\nd,e,f\ng,hi,",
+    Output: [["a", "b", "c"], ["d", "e", "f"], ["g", "hi", ""]],
+    TrimLeadingSpace: true
+  },
+  {
+    Name: "NotTrailingComma3",
+    Input: "a,b,c, \n",
+    Output: [["a", "b", "c", " "]]
+  },
+  {
+    Name: "CommaFieldTest",
+    Input: `x,y,z,w
+x,y,z,
+x,y,,
+x,,,
+,,,
+"x","y","z","w"
+"x","y","z",""
+"x","y","",""
+"x","","",""
+"","","",""
+`,
+    Output: [
+      ["x", "y", "z", "w"],
+      ["x", "y", "z", ""],
+      ["x", "y", "", ""],
+      ["x", "", "", ""],
+      ["", "", "", ""],
+      ["x", "y", "z", "w"],
+      ["x", "y", "z", ""],
+      ["x", "y", "", ""],
+      ["x", "", "", ""],
+      ["", "", "", ""]
+    ]
+  },
+  {
+    Name: "TrailingCommaIneffective1",
+    Input: "a,b,\nc,d,e",
+    Output: [["a", "b", ""], ["c", "d", "e"]],
+    TrimLeadingSpace: true
+  },
+  {
+    Name: "ReadAllReuseRecord",
+    Input: "a,b\nc,d",
+    Output: [["a", "b"], ["c", "d"]],
+    ReuseRecord: true
+  },
+  // {
+  //   Name: "StartLine1", // Issue 19019
+  //   Input: 'a,"b\nc"d,e',
+  //   Error: true
+  //   // Error: &ParseError{StartLine: 1, Line: 2, Column: 1, Err: ErrQuote},
+  // },
+  // {
+  //   Name: "StartLine2",
+  //   Input: 'a,b\n"d\n\n,e',
+  //   Error: true
+  //   // Error: &ParseError{StartLine: 2, Line: 5, Column: 0, Err: ErrQuote},
+  // },
+  // {
+  //   Name: "CRLFInQuotedField", // Issue 21201
+  //   Input: 'A,"Hello\r\nHi",B\r\n',
+  //   Output: [["A", "Hello\nHi", "B"]]
+  // },
+  {
+    Name: "BinaryBlobField", // Issue 19410
+    Input: "x09\x41\xb4\x1c,aktau",
+    Output: [["x09A\xb4\x1c", "aktau"]]
+  },
+  // {
+  //   Name: "TrailingCR",
+  //   Input: "field1,field2\r",
+  //   Output: [["field1", "field2"]]
+  // },
+  // {
+  //   Name: "QuotedTrailingCR",
+  //   Input: '"field"\r',
+  //   Output: [['"field"']]
+  // },
+  // {
+  //   Name: "QuotedTrailingCRCR",
+  //   Input: '"field"\r\r',
+  //   Error: true,
+  //   // Error: &ParseError{StartLine: 1, Line: 1, Column: 6, Err: ErrQuote},
+  // },
+  // {
+  //   Name: "FieldCR",
+  //   Input: "field\rfield\r",
+  //   Output: [["field\rfield"]]
+  // },
+  // {
+  //   Name: "FieldCRCR",
+  //   Input: "field\r\rfield\r\r",
+  //   Output: [["field\r\rfield\r"]]
+  // },
+  {
+    Name: "FieldCRCRLF",
+    Input: "field\r\r\nfield\r\r\n",
+    Output: [["field\r"], ["field\r"]]
+  },
+  {
+    Name: "FieldCRCRLFCR",
+    Input: "field\r\r\n\rfield\r\r\n\r",
+    Output: [["field\r"], ["\rfield\r"]]
+  },
+  // {
+  //   Name: "FieldCRCRLFCRCR",
+  //   Input: "field\r\r\n\r\rfield\r\r\n\r\r",
+  //   Output: [["field\r"], ["\r\rfield\r"], ["\r"]]
+  // },
+  // {
+  //   Name: "MultiFieldCRCRLFCRCR",
+  //   Input: "field1,field2\r\r\n\r\rfield1,field2\r\r\n\r\r,",
+  //   Output: [["field1", "field2\r"], ["\r\rfield1", "field2\r"], ["\r\r", ""]]
+  // },
+  {
+    Name: "NonASCIICommaAndComment",
+    Input: "a£b,c£ \td,e\n€ comment\n",
+    Output: [["a", "b,c", "d,e"]],
+    TrimLeadingSpace: true,
+    Comma: "£",
+    Comment: "€"
+  },
+  {
+    Name: "NonASCIICommaAndCommentWithQuotes",
+    Input: 'a€"  b,"€ c\nλ comment\n',
+    Output: [["a", "  b,", " c"]],
+    Comma: "€",
+    Comment: "λ"
+  },
+  {
+    // λ and θ start with the same byte.
+    // This tests that the parser doesn't confuse such characters.
+    Name: "NonASCIICommaConfusion",
+    Input: '"abθcd"λefθgh',
+    Output: [["abθcd", "efθgh"]],
+    Comma: "λ",
+    Comment: "€"
+  },
+  {
+    Name: "NonASCIICommentConfusion",
+    Input: "λ\nλ\nθ\nλ\n",
+    Output: [["λ"], ["λ"], ["λ"]],
+    Comment: "θ"
+  },
+  // {
+  //   Name: "QuotedFieldMultipleLF",
+  //   Input: '"\n\n\n\n"',
+  //   Output: [["\n\n\n\n"]]
+  // },
+  // {
+  //   Name: "MultipleCRLF",
+  //   Input: "\r\n\r\n\r\n\r\n"
+  // },
+  //  {
+  //   // The implementation may read each line in several chunks if it doesn't fit entirely
+  //   // in the read buffer, so we should test the code to handle that condition.
+  //   Name:    "HugeLines",
+  //   Input:   strings.Repeat("#ignore\n", 10000) + strings.Repeat("@", 5000) + "," + strings.Repeat("*", 5000),
+  //   Output:  [[strings.Repeat("@", 5000), strings.Repeat("*", 5000)]],
+  //   Comment: '#',
+  // },
+  {
+    Name: "QuoteWithTrailingCRLF",
+    Input: '"foo"bar"\r\n',
+    Error: ErrBareQuote
+    // Error: &ParseError{StartLine: 1, Line: 1, Column: 4, Err: ErrQuote},
+  },
+  {
+    Name: "LazyQuoteWithTrailingCRLF",
+    Input: '"foo"bar"\r\n',
+    Output: [[`foo"bar`]],
+    LazyQuotes: true
+  },
+  // {
+  //   Name: "DoubleQuoteWithTrailingCRLF",
+  //   Input: '"foo""bar"\r\n',
+  //   Output: [[`foo"bar`]]
+  // },
+  // {
+  //   Name: "EvenQuotes",
+  //   Input: `""""""""`,
+  //   Output: [[`"""`]]
+  // },
+  // {
+  //   Name: "OddQuotes",
+  //   Input: `"""""""`,
+  //   Error: true
+  //   // Error:" &ParseError{StartLine: 1, Line: 1, Column: 7, Err: ErrQuote}",
+  // },
+  // {
+  //   Name: "LazyOddQuotes",
+  //   Input: `"""""""`,
+  //   Output: [[`"""`]],
+  //   LazyQuotes: true
+  // },
+  {
+    Name: "BadComma1",
+    Comma: "\n",
+    Error: ErrInvalidDelim
+  },
+  {
+    Name: "BadComma2",
+    Comma: "\r",
+    Error: ErrInvalidDelim
+  },
+  {
+    Name: "BadComma3",
+    Comma: '"',
+    Error: ErrInvalidDelim
+  },
+  {
+    Name: "BadComment1",
+    Comment: "\n",
+    Error: ErrInvalidDelim
+  },
+  {
+    Name: "BadComment2",
+    Comment: "\r",
+    Error: ErrInvalidDelim
+  },
+  {
+    Name: "BadCommaComment",
+    Comma: "X",
+    Comment: "X",
+    Error: ErrInvalidDelim
+  }
+];
+// Register one test per table entry: derive the parse options from the
+// entry's optional keys, parse its input and compare with the expectation.
+for (const t of testCases) {
+  test({
+    name: `[CSV] ${t.Name}`,
+    async fn(): Promise<void> {
+      // Keys missing from an entry fall back to the parser defaults.
+      const options = {
+        comma: t.Comma ? t.Comma : ",",
+        comment: t.Comment ? t.Comment : undefined,
+        trimLeadingSpace: t.TrimLeadingSpace ? true : false,
+        fieldsPerRecord: t.UseFieldsPerRecord ? t.FieldsPerRecord : undefined,
+        lazyQuotes: t.LazyQuotes ? t.LazyQuotes : false
+      };
+      const source = new BufReader(new StringReader(t.Input));
+      const actual = await readAll(source, options);
+
+      if (!t.Error) {
+        // Success case: the records must match and no error is reported.
+        const expected = [t.Output, null];
+        assertEquals(actual, expected);
+        return;
+      }
+
+      // Failure case: an error must be present and carry the expected text.
+      assert(!!actual[1]);
+      // eslint-disable-next-line @typescript-eslint/no-explicit-any
+      const e: any = actual[1];
+      assertEquals(e.message, t.Error);
+    }
+  });
+}
+
+runIfMain(import.meta);
--- a/encoding/test.ts
+++ b/encoding/test.ts
@ -1,2 +1,3 @@
 // Copyright 2018-2019 the Deno authors. All rights reserved. MIT license.
 import "./toml_test.ts";
+import "./csv_test.ts";