Example papaparse (csv parser) binding

In case this helps anyone, here are some quick, rough bindings to PapaParse.

open! Import

// Typed view of the object returned by `parse` when rows are keyed by header.
module ParseResult = {
  module Error = {
    module Code = {
      // at version 5.3.1, there are four possible values
      type t = [
        | #MissingQuotes
        | #TooFewFields
        | #TooManyFields
        | #UndetectableDelimiter
        | #UnexpectedError(string)
      ] // UnexpectedError is for guarding against the future

      // Decodes the raw `code` string of an error. Any string that is not one
      // of the four known codes is preserved inside #UnexpectedError.
      let ofString = (raw: string): t =>
        switch raw {
        | "MissingQuotes" => #MissingQuotes
        | "TooFewFields" => #TooFewFields
        | "TooManyFields" => #TooManyFields
        | "UndetectableDelimiter" => #UndetectableDelimiter
        | other => #UnexpectedError(other)
        }
    }

    // One entry of the result's `errors` array.
    // NOTE(review): PapaParse may omit `row` on errors that are not tied to a
    //   specific line (e.g. delimiter detection) - confirm before relying on it.
    type t = {
      @as("type") type_: string, // at version 5.3.1, there are three possible values
      code: string, // raw code; decode with Code.ofString
      message: string,
      row: int,
    }

    // Human-readable one-line description: "At row <row>: <message>".
    let toString = ({row, message, _}: t) =>
      "At row " ++ Belt.Int.toString(row) ++ ": " ++ message
  }
  module Meta = {
    type t = {fields: array<string>}
  }
  module Row = {
    // A parsed row, keyed by field header.
    type t = Js.Dict.t<string>

    // Note: we use unsafeGet because we assume that we
    //   have already ensured that there are no #TooFewFields errors
    //   and the headerCheck has passed before reaching this point
    let get = (t, fieldHeader) => t->Js.Dict.unsafeGet(fieldHeader)
  }

  type t = {
    data: array<Row.t>,
    errors: array<Error.t>,
    meta: Meta.t,
  }

  // The header names reported in the result's `meta.fields`.
  let foundHeaders = ({meta, _}: t) => meta.fields
}

// Options record passed as the second argument to `parse`.
// Only the three options this file actually sets are modelled.
module ParseConfig = {
  type t = {
    // Field separator; both parse helpers in this file pass ",".
    delimiter: string,
    // true in parseCsvAsDictionaries, false in parseUnstructured.
    header: bool,
    // Both parse helpers set this to true so that a trailing newline
    // does not produce an error (at the cost of masking empty lines mid-file).
    skipEmptyLines: bool,
  }
}

// Result shape with rows as plain string arrays instead of dictionaries
// (presumably what `parse` yields with `header: false` - confirm).
module UnstructuredParseResult = {
  module Row = {
    // One parsed row: its cells as strings.
    type t = array<string>

    // Cell at `columnIndex`, or None when the index is out of bounds.
    let get = (t, columnIndex) => t->Belt.Array.get(columnIndex)

    // The row's cells as a list.
    let toList = t => Belt.List.fromArray(t)

    // Number of cells in the row.
    let length = t => Belt.Array.length(t)
  }

  type t = {
    data: array<Row.t>,
    errors: array<ParseResult.Error.t>,
  }
}

// Returns the first error in `errors` whose raw code decodes (via
// ParseResult.Error.Code.ofString) to `code`, or None when no error matches.
let getFirstError = (
  errors: array<ParseResult.Error.t>,
  code: ParseResult.Error.Code.t,
): option<ParseResult.Error.t> =>
  Belt.Array.getBy(errors, error => ParseResult.Error.Code.ofString(error.code) == code)

// Binding to the `parse` export of the "papaparse" module: (input, config) => result.
// The return type is the unconstrained 'a, so the result takes whatever type the
// call site asks for; nothing checks that it matches the config that was passed.
@module("papaparse") external parse: (string, ParseConfig.t) => 'a = "parse"

// Parses comma-delimited CSV text with `header: true`, so rows come back
// keyed by header name.
//
// The result is pinned to ParseResult.t: the underlying `parse` binding returns
// an unconstrained 'a, which would otherwise let a caller treat the result as
// any type at all without a compiler error.
let parseCsvAsDictionaries = (data): ParseResult.t =>
  parse(
    data,
    {
      delimiter: ",",
      header: true,
      // Note: this may mask empty lines in the middle of the file,
      //  but it also ensures that a trailing newline will not produce an error
      skipEmptyLines: true,
    },
  )

// Parses comma-delimited CSV text with `header: false`, so every line
// (including the first) comes back as a plain array of cell strings.
//
// The result is pinned to UnstructuredParseResult.t: the underlying `parse`
// binding returns an unconstrained 'a, which would otherwise let a caller treat
// the result as any type at all without a compiler error.
let parseUnstructured = (data): UnstructuredParseResult.t =>
  parse(
    data,
    {
      delimiter: ",",
      header: false,
      // Note: this may mask empty lines in the middle of the file,
      //  but it also ensures that a trailing newline will not produce an error
      skipEmptyLines: true,
    },
  )

// Binding to the `unparse` export of the "papaparse" module: takes an array of
// string arrays and returns a single string (presumably rows of cells in,
// CSV text out - confirm against the PapaParse docs).
// Note: we don't bind to the config parameter because
//  we don't use any of the configuration.
@module("papaparse") external unparse: array<array<string>> => string = "unparse"

Some quick context:

  • npm options are papaparse (9.9k stars, 770k downloads, updated June), csv-parse (2.9k stars, 1,400k downloads, updated July), fast-csv (1.1k stars, 680k downloads, updated July), and csv-parser (1.1k stars, 490k downloads, updated Dec 2020)
  • I spent some time debating whether to use header: true. In my case, I'm not sure if the header names will be reliable, so I will probably use some detection based on the field contents, plus a regex against the header name, to determine the exact column for the data that I am working on.
  • I like having papaparse take care of field escaping and end of line delimiter detection
  • If I spent more time on this, I would change ParseResult.t to something like result<(array<Row.t>, Meta.t, array<Warning.t>), array<Error.t>>

I wish tablecloth was alive so that I could have used delimited_parsing.

3 Likes