dashbitco / nimble_parsec

A simple and fast library for text-based parser combinators

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Library Abuse or Slow Compilation Times

foresttoney opened this issue · comments

I am migrating an old service that communicates with an external platform via FTP / txt files. Because I have to parse these txt files, I thought this library would be more appropriate than just pattern matching on the byte offsets. At any rate, I wrote a handful of combinators and my tests already eclipse the 10-second compilation warning. Here is a representative sample of the test and implementation:

test "parse_file_header_record" do
  # Sample fixed-width file header record fed to the parser under test.
  raw_record =
    "02000011697978     TEST       100720OYTKK                 F030000000     1    "

  # Fields the parser is expected to extract, collected into a single map.
  expected_fields = %{
    record_code: 2,
    sequence_number: 1,
    file_source: "1697978",
    file_source_name: "TEST",
    creation_date: ~D[2010-07-20],
    ecu: "OYTKK",
    filename: "",
    version: "F03",
    destination: "0000000",
    type: "1"
  }

  # A successful parse returns the accumulated results, the unconsumed input
  # (none here), the context, the line position and the byte offset.
  assert POA.parse_file_header_record(raw_record) ==
           {:ok, [expected_fields], "", %{}, {1, 0}, 80}
end

# Example of a combinator defined in ParserUtils that is reused for multiple record types
@doc """
Builds a combinator for the record code of a record.

Matches the literal string `x`, converts the matched text to an integer via
`String.to_integer/1`, and tags the value as `:record_code`. On failure the
parser reports that it expected "record_code to be an integer".
"""
def parse_record_code(x) do
  # MFA applied to the matched string to turn it into an integer.
  to_integer = {String, :to_integer, []}

  x
  |> string()
  |> map(to_integer)
  |> unwrap_and_tag(:record_code)
  |> label("record_code to be an integer")
end

# Parser for a file header record (record code "02").
#
# The record is parsed as a sequence of fields. Each kept field is tagged with
# its name, and the final `reduce/2` collects the tagged pairs into one map.
# Spans matched by `ParserUtils.parse_reserved/1` are wrapped in `ignore/2`
# and therefore do not appear in the result.
defparsec(
    :parse_file_header_record,
    empty()
    |> concat(ParserUtils.parse_record_code("02"))
    |> concat(ParserUtils.parse_sequence_number())
    |> concat(ParserUtils.parse_file_source())
    |> ignore(ParserUtils.parse_reserved(5))
    |> concat(ParserUtils.parse_file_source_name())
    |> concat(
      ParserUtils.parse_yyMMdd()
      |> unwrap_and_tag(:creation_date)
      |> label("creation_date to be valid yyMMdd date")
    )
    |> concat(
      # Any 5 UTF-8 codepoints (the empty range list places no restriction).
      utf8_string([], 5)
      |> unwrap_and_tag(:ecu)
      |> label("ecu to be 5 characters")
    )
    |> concat(ParserUtils.parse_filename(17))
    |> concat(ParserUtils.parse_version())
    |> concat(
      # Any 7 UTF-8 codepoints.
      utf8_string([], 7)
      |> unwrap_and_tag(:destination)
      |> label("destination to be 7 characters")
    )
    |> ignore(ParserUtils.parse_reserved(5))
    |> concat(ParserUtils.parse_type())
    |> ignore(ParserUtils.parse_reserved(4))
    # The record must consume the whole input.
    |> eos()
    # Collect the tagged {key, value} pairs into a single map.
    |> reduce({Enum, :into, [%{}]})
)

I spent a bit of time with the `debug: true` dump, and the inlined code is quite large and deeply nested. As an example:

(
    line =
      (
        line =
          (
            line =
              (
                line =
                  (
                    line =
                      (
                        line =
                          (
                            line =
                              (
                                line =
                                  (
                                    line =
                                      (
                                        line =
                                          (
                                            line =
                                              (
                                                line =
                                                  (
                                                    line =
                                                      (
                                                        line =
                                                          (
                                                            line =
                                                              (
                                                                line =
                                                                  (
                                                                    line =
                                                                      (
                                                                        line =
                                                                          (
                                                                            line =
                                                                              (
                                                                                line =
                                                                                  (
                                                                                    line =
                                                                                      (
                                                                                        line =
                                                                                          comb__line

                                                                                        case x5 do
                                                                                          10 ->
                                                                                            {elem(
                                                                                               line,
                                                                                               0
                                                                                             ) +
                                                                                               1,
                                                                                             comb__offset +
                                                                                               7 +
                                                                                               byte_size(
                                                                                                 <<x5::utf8>>
                                                                                               )}

                                                                                          _ ->
                                                                                            line
                                                                                        end
                                                                                      )

                                                                                    case x6 do
                                                                                      10 ->
                                                                                        {elem(
                                                                                           line,
                                                                                           0
                                                                                         ) + 1,
                                                                                         comb__offset +
                                                                                           7 +
                                                                                           byte_size(
                                                                                             <<x5::utf8>>
                                                                                           ) +
                                                                                           byte_size(
                                                                                             <<x6::utf8>>
                                                                                           )}

                                                                                      _ ->
                                                                                        line
                                                                                    end
                                                                                  ) ...

My question is whether I am abusing the library (incorrectly constructing combinators or using it outside the intended purpose) or if there is a larger performance issue with the inlining.

Correct. nimble_parsec is about code generation, so you may end up abusing it. You can use `parsec` to encapsulate highly repetitive code, or use other combinators for composition, to avoid repetitive code.