takeP equivalent

Question

takeP equivalent

foresttoney opened this issue 6 months ago · comments

Forest Loren Toney III commented 6 months ago

I often find myself needing to take the next n bytes of the input. For a contrived but representative example:

# Contrived example: match the literal "00", skip the next five UTF-8
# codepoints (kept out of the result by ignore/1), then parse a single digit.
# debug: true prints the generated clauses shown further down.
defparsec(
  :example_parser,
  string("00")   
  |> ignore(utf8_string([], 5))
  |> integer(1),
  debug: true
)

The issue is that when using something like ascii_string or utf8_string, you get quite a bit of unnecessary inlining.

# Generated success clause for :example_parser. The head matches the literal
# "00", five UTF-8 codepoints (x0..x4) and one further byte x5; the guard
# restricts x5 to 48..57 (ASCII "0".."9"). The five codepoints are matched but
# never pushed onto acc.
defp example_parser__0(
  <<"00", x0::utf8, x1::utf8, x2::utf8, x3::utf8, x4::utf8, x5, rest::binary>>,
  acc,
  stack,
  context,
  comb__line,
  comb__offset
) when x5 >= 48 and x5 <= 57 do
  example_parser__1(
  rest,
  # x5 - 48 turns the ASCII digit into its integer value.
  [x5 - 48, "00"] ++ acc,
  stack,
  context,
  # Line tracking, unrolled once per codepoint. Each nested block rebinds
  # `line`; a codepoint equal to 10 (newline) yields a new tuple whose first
  # element is incremented and whose second element is the offset just past
  # that codepoint. Any other codepoint leaves `line` as it was.
  (
    line =
      (
        line =
          (
            line =
              (
                line =
                  (
                    line = comb__line

                    case x0 do
                      10 -> {elem(line, 0) + 1, comb__offset + 2 + byte_size(<<x0::utf8>>)}
                      _ -> line
                    end
                  )

                case x1 do
                  10 ->
                    {elem(line, 0) + 1,
                     comb__offset + 2 + byte_size(<<x0::utf8>>) + byte_size(<<x1::utf8>>)}

                  _ ->
                    line
                end
              )

            case x2 do
              10 ->
                {elem(line, 0) + 1,
                 comb__offset + 2 + byte_size(<<x0::utf8>>) + byte_size(<<x1::utf8>>) +
                   byte_size(<<x2::utf8>>)}

              _ ->
                line
            end
          )

        case x3 do
          10 ->
            {elem(line, 0) + 1,
             comb__offset + 2 + byte_size(<<x0::utf8>>) + byte_size(<<x1::utf8>>) +
               byte_size(<<x2::utf8>>) + byte_size(<<x3::utf8>>)}

          _ ->
            line
        end
      )

    case x4 do
      10 ->
        {elem(line, 0) + 1,
         comb__offset + 2 + byte_size(<<x0::utf8>>) + byte_size(<<x1::utf8>>) +
           byte_size(<<x2::utf8>>) + byte_size(<<x3::utf8>>) + byte_size(<<x4::utf8>>)}

      _ ->
        line
    end
  ),
  # New offset: 2 bytes for "00", the encoded size of each of the five
  # codepoints, plus 1 byte for the digit x5.
  comb__offset + 2 + byte_size(<<x0::utf8>>) + byte_size(<<x1::utf8>>) + byte_size(<<x2::utf8>>) +
    byte_size(<<x3::utf8>>) + byte_size(<<x4::utf8>>) + 1
)
end

# Fallback clause: input that does not match the clause above produces an
# :error tuple carrying the message plus the input, context, line and offset
# exactly as received. acc and stack are unused.
defp example_parser__0(rest, _acc, _stack, context, line, offset) do
  {:error,
 "expected string \"00\", followed by utf8 codepoint, followed by utf8 codepoint, followed by utf8 codepoint, followed by utf8 codepoint, followed by utf8 codepoint, followed by ASCII character in the range \"0\" to \"9\"",
 rest, context, line, offset}
end

# Final generated step: returns the accumulated results, remaining input,
# context, line and offset in an :ok tuple. The stack argument is unused.
defp example_parser__1(rest, acc, _stack, context, line, offset) do
  {:ok, acc, rest, context, line, offset}
end

I think it would make sense to have something like takeP from megaparsec. Perhaps it would be called byte and it would compile to something like:

# Proposed usage (byte/1 is the hypothetical combinator suggested in this
# issue, not an existing function): skip exactly five bytes between the "00"
# prefix and the digit.
defparsec(
  :example_parser,
  string("00")   
  |> ignore(byte(5))
  |> integer(1),
  debug: true
)

defp example_parser__0(
  <<"00", x0::binary-size(5), x1, rest::binary>>,
  acc,
  stack,
  context,
  comb__line,
  comb__offset
) when x1 >= 48 and x1 <= 57 do
  example_parser__1(
  rest,
  [x1 - 48, "00"] ++ acc,
  stack,
  context, ...

Like takeP, it would return a parse error if there weren't enough tokens (the pattern match fails). I know that compile times can grow when parsing fixed-width files, so perhaps this might help.

José Valim · Answer 1 · Mon Nov 27 2023 08:57:22 GMT+0800 (China Standard Time)

Yeah, #30 was meant to add it, but I never got around to it. If you want to send a PR, it will be welcome!

Forest Loren Toney III · Answer 2 · Mon Nov 27 2023 09:44:32 GMT+0800 (China Standard Time)

Yeah - this is exactly right. I haven't spent much time diving into the library internals, but I will see what I can come up with.

José Valim · Answer 3 · Tue Nov 28 2023 19:33:27 GMT+0800 (China Standard Time)

Closing in favor of PR!