dashbitco / nimble_parsec

A simple and fast library for text-based parser combinators

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Parsing IPv6 addresses

liamwhite opened this issue · comments

I implemented a combinator (roughly) as per the ABNF described by the IETF:

        IPv4address = d8 "." d8 "." d8 "." d8

        d8          = DIGIT               ; 0-9
                    / %x31-39 DIGIT       ; 10-99
                    / "1" 2DIGIT          ; 100-199
                    / "2" %x30-34 DIGIT   ; 200-249
                    / "25" %x30-35        ; 250-255

        IPv6address =                          6(h16 ":") ls32
                    /                     "::" 5(h16 ":") ls32
                    / [             h16 ] "::" 4(h16 ":") ls32
                    / [ *1(h16 ":") h16 ] "::" 3(h16 ":") ls32
                    / [ *2(h16 ":") h16 ] "::" 2(h16 ":") ls32
                    / [ *3(h16 ":") h16 ] "::"   h16 ":"  ls32
                    / [ *4(h16 ":") h16 ] "::"            ls32
                    / [ *5(h16 ":") h16 ] "::"             h16
                    / [ *6(h16 ":") h16 ] "::"

        ls32        = h16 ":" h16 / IPv4address

        h16         = 1*4HEXDIG
  # One to three ASCII digits. The 0-255 range from the ABNF `d8` rule is not
  # enforced here.
  ipv4_octet =
    ascii_string([?0..?9], min: 1, max: 3)

  # Dotted quad: three "octet." pairs followed by a final octet.
  ipv4_address =
    times(ipv4_octet |> string("."), 3)
    |> concat(ipv4_octet)

  # ABNF `h16`: one to four hexadecimal digits (either case).
  ipv6_hexadectet =
    ascii_string('0123456789abcdefABCDEF', min: 1, max: 4)

  # ABNF `ls32`: either `h16 ":" h16` or an embedded IPv4 address.
  ipv6_ls32 =
    choice([
      ipv6_hexadectet |> string(":") |> concat(ipv6_hexadectet),
      ipv4_address
    ])

  # A single `h16 ":"` group, the repeated unit in the ABNF.
  ipv6_fragment =
    ipv6_hexadectet |> string(":")

  # One alternative per line of the ABNF `IPv6address` rule, in the same order.
  # NOTE(review): as reported below, this version does not match inputs such as
  # "fe80::362c:b162:1a49:bf12"; the optional(...) / times(..., max: n) prefixes
  # appear not to backtrack the way the ABNF's optional groups would.
  ipv6_address =
    choice([
      times(ipv6_fragment, 6) |> concat(ipv6_ls32),
      string("::") |> times(ipv6_fragment, 5) |> concat(ipv6_ls32),
      optional(ipv6_hexadectet) |> string("::") |> times(ipv6_fragment, 4) |> concat(ipv6_ls32),
      optional(times(ipv6_fragment, max: 1) |> concat(ipv6_hexadectet)) |> string("::") |> times(ipv6_fragment, 3) |> concat(ipv6_ls32),
      optional(times(ipv6_fragment, max: 2) |> concat(ipv6_hexadectet)) |> string("::") |> times(ipv6_fragment, 2) |> concat(ipv6_ls32),
      optional(times(ipv6_fragment, max: 3) |> concat(ipv6_hexadectet)) |> string("::") |> concat(ipv6_fragment) |> concat(ipv6_ls32),
      optional(times(ipv6_fragment, max: 4) |> concat(ipv6_hexadectet)) |> string("::") |> concat(ipv6_ls32),
      optional(times(ipv6_fragment, max: 5) |> concat(ipv6_hexadectet)) |> string("::") |> concat(ipv6_hexadectet),
      optional(times(ipv6_fragment, max: 6) |> concat(ipv6_hexadectet)) |> string("::")
    ])

I have checked this combinator over and over again and I am fairly sure it is implemented exactly per the BNF, but I cannot get it to match addresses like fe80::362c:b162:1a49:bf12 or 2000:4000:6000:8000::a. What's going wrong?

Can you please provide the whole module with the definition and how you are trying to call it?

Here's a whole module:

# Reproduction module for the report: parses an IPv4 or IPv6 address with an
# optional CIDR suffix and exposes it as `IpAddressParser.ip_address/1`.
defmodule IpAddressParser do
  import NimbleParsec

  # One to three ASCII digits. The 0-255 range is not enforced here.
  ipv4_octet =
    ascii_string([?0..?9], min: 1, max: 3)

  # Dotted quad: three "octet." pairs followed by a final octet.
  ipv4_address =
    times(ipv4_octet |> string("."), 3)
    |> concat(ipv4_octet)

  # ABNF `h16`: one to four hexadecimal digits (either case).
  ipv6_hexadectet =
    ascii_string('0123456789abcdefABCDEF', min: 1, max: 4)

  # ABNF `ls32`: either `h16 ":" h16` or an embedded IPv4 address.
  ipv6_ls32 =
    choice([
      ipv6_hexadectet |> string(":") |> concat(ipv6_hexadectet),
      ipv4_address
    ])

  # A single `h16 ":"` group, the repeated unit in the ABNF.
  ipv6_fragment =
    ipv6_hexadectet |> string(":")

  # One alternative per line of the ABNF `IPv6address` rule, in the same order.
  # NOTE(review): the iex session in this report shows this version rejecting
  # "fe80::362c:b162:1a49:bf12" and "2000:4000:6000:8000::a"; the
  # optional(...) / times(..., max: n) prefixes appear not to backtrack the way
  # the ABNF's optional groups would.
  ipv6_address =
    choice([
      times(ipv6_fragment, 6) |> concat(ipv6_ls32),
      string("::") |> times(ipv6_fragment, 5) |> concat(ipv6_ls32),
      optional(ipv6_hexadectet) |> string("::") |> times(ipv6_fragment, 4) |> concat(ipv6_ls32),
      optional(times(ipv6_fragment, max: 1) |> concat(ipv6_hexadectet)) |> string("::") |> times(ipv6_fragment, 3) |> concat(ipv6_ls32),
      optional(times(ipv6_fragment, max: 2) |> concat(ipv6_hexadectet)) |> string("::") |> times(ipv6_fragment, 2) |> concat(ipv6_ls32),
      optional(times(ipv6_fragment, max: 3) |> concat(ipv6_hexadectet)) |> string("::") |> concat(ipv6_fragment) |> concat(ipv6_ls32),
      optional(times(ipv6_fragment, max: 4) |> concat(ipv6_hexadectet)) |> string("::") |> concat(ipv6_ls32),
      optional(times(ipv6_fragment, max: 5) |> concat(ipv6_hexadectet)) |> string("::") |> concat(ipv6_hexadectet),
      optional(times(ipv6_fragment, max: 6) |> concat(ipv6_hexadectet)) |> string("::")
    ])

  # CIDR suffix: "/" followed by one to three digits (no range check).
  cidr_prefix =
    string("/")
    |> ascii_string([?0..?9], min: 1, max: 3)

  # Top-level parser: tries IPv4 before IPv6, then an optional CIDR suffix.
  # The matched pieces are joined into one string and returned tagged as
  # `ip: "..."`; the label supplies the text used in the error message.
  ip_address =
    choice([
      ipv4_address,
      ipv6_address
    ])
    |> optional(cidr_prefix)
    |> reduce({Enum, :join, []})
    |> label("a valid IPv4 or IPv6 address and optional CIDR prefix")
    |> unwrap_and_tag(:ip)

  # Generates the public `ip_address` parsing function from the combinator.
  defparsec :ip_address, ip_address
end

And here's how I'm calling it:

iex(1)> IpAddressParser.ip_address("127.0.0.1")
{:ok, [ip: "127.0.0.1"], "", %{}, {1, 0}, 9}
iex(2)> IpAddressParser.ip_address("::127.0.0.1")
{:ok, [ip: "::127.0.0.1"], "", %{}, {1, 0}, 11}
iex(3)> IpAddressParser.ip_address("fe80::362c:b162:1a49:bf12")  
{:error, "expected a valid IPv4 or IPv6 address and optional CIDR prefix",
 "fe80::362c:b162:1a49:bf12", %{}, {1, 0}, 0}
iex(4)> IpAddressParser.ip_address("2000:4000:6000:8000::a")   
{:error, "expected a valid IPv4 or IPv6 address and optional CIDR prefix",
 "2000:4000:6000:8000::a", %{}, {1, 0}, 0}
iex(5)> IpAddressParser.ip_address("::0")                   
{:ok, [ip: "::0"], "", %{}, {1, 0}, 3}
iex(6)> IpAddressParser.ip_address("::") 
{:ok, [ip: "::"], "", %{}, {1, 0}, 2}

I believe your issue is a mixture of choice with optional. Optional always succeeds, which means you are always entering this clause:

 optional(ipv6_hexadectet) |> string("::") |> times(ipv6_fragment, 4) |> concat(ipv6_ls32)

Remove the optionals and build on top of that. Basically, since they are inside a choice, they are all optional already. A similar issue may happen with times(). Once any of the times succeeds, it won't backtrack all the way up to the choice if it fails later on. The backtracking happens for each immediate choice and not for each "clause" inside the choice.

Okay, that seems to work. Thank you.

Yes, here you go:

  # ABNF `d8`: a decimal octet in 0-255. Alternatives are listed from the most
  # specific (250-255) down to a single digit so the longest form is tried first.
  # The matched characters are then turned into one string.
  ipv4_octet =
    choice([
      ascii_char('2') |> ascii_char('5') |> ascii_char([?0..?5]),
      ascii_char('2') |> ascii_char([?0..?4]) |> ascii_char([?0..?9]),
      ascii_char('1') |> ascii_char([?0..?9]) |> ascii_char([?0..?9]),
      ascii_char([?1..?9]) |> ascii_char([?0..?9]),
      ascii_char([?0..?9])
    ])
    |> reduce({List, :to_string, []})

  # Dotted quad: three "octet." pairs followed by a final octet.
  ipv4_address =
    times(ipv4_octet |> string("."), 3)
    |> concat(ipv4_octet)

  # IPv4 CIDR suffix: "/" followed by 0-32 (30-32, then 10-29, then one digit).
  # The reduce covers the whole chain, so the "/" is part of the resulting string.
  ipv4_prefix =
    ascii_char('/')
    |> choice([
      ascii_char('3') |> ascii_char([?0..?2]),
      ascii_char([?1..?2]) |> ascii_char([?0..?9]),
      ascii_char([?0..?9])
    ])
    |> reduce({List, :to_string, []})

  # ABNF `h16`: one to four hexadecimal digits (either case).
  ipv6_hexadectet =
    ascii_string('0123456789abcdefABCDEF', min: 1, max: 4)

  # ABNF `ls32`: either `h16 ":" h16` or an embedded IPv4 address.
  ipv6_ls32 =
    choice([
      ipv6_hexadectet |> string(":") |> concat(ipv6_hexadectet),
      ipv4_address
    ])

  # A single `h16 ":"` group, the repeated unit in the ABNF.
  ipv6_fragment =
    ipv6_hexadectet |> string(":")

  # ABNF `IPv6address` with every optional prefix expanded into its own
  # alternative, so no alternative relies on optional/variable-count times.
  # Each blank-line-separated group corresponds to one ABNF line (same tail
  # after "::"), listed from the longest prefix before "::" down to none.
  # The order of alternatives is significant: choice takes the first match.
  ipv6_address =
    choice([
      times(ipv6_fragment, 6) |> concat(ipv6_ls32),
      string("::") |> times(ipv6_fragment, 5) |> concat(ipv6_ls32),

      ipv6_hexadectet |> string("::") |> times(ipv6_fragment, 4) |> concat(ipv6_ls32),
      string("::") |> times(ipv6_fragment, 4) |> concat(ipv6_ls32),
      
      times(ipv6_fragment, 1) |> concat(ipv6_hexadectet) |> string("::") |> times(ipv6_fragment, 3) |> concat(ipv6_ls32),
      ipv6_hexadectet |> string("::") |> times(ipv6_fragment, 3) |> concat(ipv6_ls32),
      string("::") |> times(ipv6_fragment, 3) |> concat(ipv6_ls32),

      times(ipv6_fragment, 2) |> concat(ipv6_hexadectet) |> string("::") |> times(ipv6_fragment, 2) |> concat(ipv6_ls32),
      times(ipv6_fragment, 1) |> concat(ipv6_hexadectet) |> string("::") |> times(ipv6_fragment, 2) |> concat(ipv6_ls32),
      ipv6_hexadectet |> string("::") |> times(ipv6_fragment, 2) |> concat(ipv6_ls32),
      string("::") |> times(ipv6_fragment, 2) |> concat(ipv6_ls32),

      times(ipv6_fragment, 3) |> concat(ipv6_hexadectet) |> string("::") |> concat(ipv6_fragment) |> concat(ipv6_ls32),
      times(ipv6_fragment, 2) |> concat(ipv6_hexadectet) |> string("::") |> concat(ipv6_fragment) |> concat(ipv6_ls32),
      times(ipv6_fragment, 1) |> concat(ipv6_hexadectet) |> string("::") |> concat(ipv6_fragment) |> concat(ipv6_ls32),
      ipv6_hexadectet |> string("::") |> concat(ipv6_fragment) |> concat(ipv6_ls32),
      string("::") |> concat(ipv6_fragment) |> concat(ipv6_ls32),

      times(ipv6_fragment, 4) |> concat(ipv6_hexadectet) |> string("::") |> concat(ipv6_ls32),
      times(ipv6_fragment, 3) |> concat(ipv6_hexadectet) |> string("::") |> concat(ipv6_ls32),
      times(ipv6_fragment, 2) |> concat(ipv6_hexadectet) |> string("::") |> concat(ipv6_ls32),
      times(ipv6_fragment, 1) |> concat(ipv6_hexadectet) |> string("::") |> concat(ipv6_ls32),
      ipv6_hexadectet |> string("::") |> concat(ipv6_ls32),
      string("::") |> concat(ipv6_ls32),

      times(ipv6_fragment, 5) |> concat(ipv6_hexadectet) |> string("::") |> concat(ipv6_hexadectet),
      times(ipv6_fragment, 4) |> concat(ipv6_hexadectet) |> string("::") |> concat(ipv6_hexadectet),
      times(ipv6_fragment, 3) |> concat(ipv6_hexadectet) |> string("::") |> concat(ipv6_hexadectet),
      times(ipv6_fragment, 2) |> concat(ipv6_hexadectet) |> string("::") |> concat(ipv6_hexadectet),
      times(ipv6_fragment, 1) |> concat(ipv6_hexadectet) |> string("::") |> concat(ipv6_hexadectet),
      ipv6_hexadectet |> string("::") |> concat(ipv6_hexadectet),
      string("::") |> concat(ipv6_hexadectet),

      times(ipv6_fragment, 6) |> concat(ipv6_hexadectet) |> string("::"),
      times(ipv6_fragment, 5) |> concat(ipv6_hexadectet) |> string("::"),
      times(ipv6_fragment, 4) |> concat(ipv6_hexadectet) |> string("::"),
      times(ipv6_fragment, 3) |> concat(ipv6_hexadectet) |> string("::"),
      times(ipv6_fragment, 2) |> concat(ipv6_hexadectet) |> string("::"),
      times(ipv6_fragment, 1) |> concat(ipv6_hexadectet) |> string("::"),
      ipv6_hexadectet |> string("::"),
      string("::")
    ])

  # IPv6 CIDR suffix: "/" followed by 0-128 (120-128, 100-119, 10-99, one digit).
  # The reduce covers the whole chain, so the "/" is part of the resulting string.
  ipv6_prefix =
    ascii_char('/')
    |> choice([
      ascii_char('1') |> ascii_char('2') |> ascii_char([?0..?8]),
      ascii_char('1') |> ascii_char([?0..?1]) |> ascii_char([?0..?9]),
      ascii_char([?1..?9]) |> ascii_char([?0..?9]),
      ascii_char([?0..?9])
    ])
    |> reduce({List, :to_string, []})

  # Top-level parser: each address family is paired with its own prefix range.
  # IPv4 is tried first. The matched pieces are joined into one string and
  # tagged as `:ip`; the label supplies the text used in the error message.
  ip_address =
    choice([
      ipv4_address |> optional(ipv4_prefix),
      ipv6_address |> optional(ipv6_prefix)
    ])
    |> reduce({Enum, :join, []})
    |> label("a valid IPv4 or IPv6 address and optional CIDR prefix")
    |> unwrap_and_tag(:ip)

Wow is it slow to compile, but it does seem to work with everything I've tested so far.

You can do a choice on the ending though. For example, you repeat times(ipv6_fragment, 1) multiple times. But I imagine you can do this:

# Sketch: match the shared "h16:h16::" prefix once, then branch only on the
# tail that follows "::". Tails are ordered from longest to shortest so that
# choice tries the most specific form first.
times(ipv6_fragment, 1)
|> concat(ipv6_hexadectet)
|> string("::")
|> choice([
  times(ipv6_fragment, 3) |> concat(ipv6_ls32),
  times(ipv6_fragment, 2) |> concat(ipv6_ls32),
  ipv6_fragment |> concat(ipv6_ls32),
  ipv6_ls32,
  # A lone trailing h16 with no colon, as in the ABNF `"::" h16` rule.
  ipv6_hexadectet,
  # Nothing after "::".
  empty()
])

which in turn seems to be the same as:

# Sketch: compact form of the tail alternatives after a "h16:h16::" prefix,
# collapsing the 1-3 fragment cases into a single ranged times.
# NOTE(review): a ranged times may consume as many fragments as it can and not
# give one back when the following ls32 fails, so this may not be strictly
# equivalent to listing each count as its own alternative - verify with inputs
# such as "a:b::c:d:e" before relying on it.
times(ipv6_fragment, 1)
|> concat(ipv6_hexadectet)
|> string("::")
|> choice([
  times(ipv6_fragment, min: 1, max: 3) |> concat(ipv6_ls32),
  ipv6_ls32,
  # A lone trailing h16 with no colon, as in the ABNF `"::" h16` rule.
  ipv6_hexadectet,
  # Nothing after "::".
  empty()
])

You can probably refactor all rules to something similar to this.

These combinators take a really long time to compile. I'm using Elixir 1.9.1; is there any way I can improve the compile time?

They will take a long time to compile because there is a lot of duplication. If you refactor them as I showed above, it should speed up compilation.

Also check the documentation, in particular parsec, for ways to reduce compilation time.