eddelbuettel / rcppsimdjson

Rcpp Bindings for the 'simdjson' Header Library

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Duplicate object names and data frames

knapply opened this issue · comments

Tracking #21 (comment)

"The names within an object SHOULD be unique."... but are not required to be.

# A bare object: the printed result keeps both "a" entries (1 and 3) as
# separate list elements that share the same name.
RcppSimdJson:::.deserialize_json('{"a":1,"b":2,"a":3}')
#> $a
#> [1] 1
#> 
#> $b
#> [1] 2
#> 
#> $a
#> [1] 3

# The same object wrapped in an array: the printed result has one "a" column,
# holding the first value (1); the second "a" (3) does not appear.
RcppSimdJson:::.deserialize_json('[{"a":1,"b":2,"a":3}]')
#>   a b
#> 1 1 2

# Two objects, each repeating "a": again only the first "a" of each object
# ("a1", "a3") shows up in the printed result.
RcppSimdJson:::.deserialize_json('[{"a":"a1","b":"b1","a":"a2"},
                                   {"a":"a3","b":"b2","a":"a4"}]')
#>    a  b
#> 1 a1 b1
#> 2 a3 b2
commented

> if there's an informal standard among current R packages

Yeah, I too was surprised at the definition of unique keys. I went down the same route as you (and, I believe, jsonlite), where simplified data.frames remove duplicate keys.
But `simplify = FALSE` lets the user keep them.

# NOTE(review): `from_json` is presumably jsonify::from_json, with jsonify
# attached -- confirm before running.

# Default call: the printed result has a single "a" column holding the first
# value (1).
from_json('[{"a":1,"b":2,"a":3}]')
# a b
# 1 1 2

# With simplification off, both "a" entries appear in the printed list.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved constant.
from_json('[{"a":1,"b":2,"a":3}]', simplify = FALSE)
# [[1]]
# [[1]]$a
# [1] 1
# 
# [[1]]$b
# [1] 2
# 
# [[1]]$a
# [1] 3

# NOTE(review): `from_json` is presumably jsonify::from_json, with jsonify
# attached -- confirm before running.

# Default call on two objects that each repeat "a": the printed result shows
# only the first "a" of each object ("a1", "a3").
from_json('[{"a":"a1","b":"b1","a":"a2"},
          {"a":"a3","b":"b2","a":"a4"}]')
# a  b
# 1 a1 b1
# 2 a3 b2

# With simplification off, every key/value pair appears in the printed list,
# including both "a" entries of each object.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved constant.
from_json('[{"a":"a1","b":"b1","a":"a2"},
          {"a":"a3","b":"b2","a":"a4"}]', simplify = FALSE)

# [[1]]
# [[1]]$a
# [1] "a1"
# 
# [[1]]$b
# [1] "b1"
# 
# [[1]]$a
# [1] "a2"
# 
# 
# [[2]]
# [[2]]$a
# [1] "a3"
# 
# [[2]]$b
# [1] "b2"
# 
# [[2]]$a
# [1] "a4"

Oh! I could've sworn jsonify and jsonlite kept duplicates in the data frames.

In that case, I think we can consider the behavior as "standard" then.

# Check whether jsonlite, jsonify and RcppSimdJson return identical results
# for an array of objects that repeat the key "a".
json <- '[{"a":"a1","b":"b1","a":"a2"},
          {"a":"a3","b":"b2","a":"a4"}]'

# Do two parsers return identical results for `json`?
agrees <- function(.f1, .f2) identical(.f1(json), .f2(json))

# Each package's parser, called with its default arguments.
simplifiers <- list(
  jsonlite = jsonlite::fromJSON,
  jsonify = jsonify::from_json,
  RcppSimdJson = RcppSimdJson:::.deserialize_json
)

# The same parsers wrapped with the arguments shown; going by the list name
# these are meant to skip simplification (the meaning of `simplify_to = 1`
# is not visible here -- confirm against RcppSimdJson).
non_simplifiers <- list(
  jsonlite = function(x) jsonlite::fromJSON(x, simplifyVector = FALSE),
  jsonify = function(x) jsonify::from_json(x, simplify = FALSE),
  RcppSimdJson = function(x) {
    RcppSimdJson:::.deserialize_json(x, simplify_to = 1)
  }
)

# Every ordered pairing of parsers, including each parser with itself.
simplify_tests <- expand.grid(f1 = simplifiers, f2 = simplifiers)
no_simplify_tests <- expand.grid(f1 = non_simplifiers, f2 = non_simplifiers)

# One logical per pairing, grouped by scenario.
test_results <- list(
  all_identical_simplify_df = mapply(
    agrees, simplify_tests$f1, simplify_tests$f2
  ),
  all_identical_no_simplify_df = mapply(
    agrees, no_simplify_tests$f1, no_simplify_tests$f2
  )
)

# Collapse each scenario to a single flag: did every pairing agree?
vapply(test_results, all, logical(1))
#>    all_identical_simplify_df all_identical_no_simplify_df 
#>                         TRUE                         TRUE