njtierney / naniar

Tidy data structures, summaries, and visualisations for missing data

Home Page:http://naniar.njtierney.com/

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Helpers to identify which rows and variables have over a certain % or number of missings

njtierney opened this issue · comments

As discussed with @dicook, these helpers could be used to identify which rows and variables need imputation/dropping/exploring.

Perhaps something like this, with an example workflow:

library(tidyverse)
# which row IDs have > X% missing
# which variables have > X% missing

# Flag the rows of `data` whose proportion of missing values exceeds `prop`.
#
# data: passed unchanged to naniar::prop_miss_row().
# prop: threshold on the row-wise proportion; the comparison is strict (>),
#       so a row sitting exactly at `prop` is not flagged.
#
# Returns the result of comparing naniar::prop_miss_row(data) with `prop`.
which_prop_miss_row <- function(data, prop) {
  row_props <- naniar::prop_miss_row(data)
  row_props > prop
}

# Proportion of missing (NA) values in each column of `data`.
#
# data: a two-dimensional object such as a data frame or matrix.
#
# Returns the column means of the is.na() mask, i.e. one proportion per
# column, carrying the column names when `data` has them.
prop_miss_cols <- function(data) {
  na_mask <- is.na(data)
  colMeans(na_mask)
}

# Flag the variables (columns) of `data` whose proportion of missing values
# exceeds `prop`.
#
# data: passed unchanged to prop_miss_cols().
# prop: threshold on the column-wise proportion; the comparison is strict (>),
#       so a column sitting exactly at `prop` is not flagged.
#
# Returns the result of comparing prop_miss_cols(data) with `prop`.
which_prop_miss_var <- function(data, prop) {
  col_props <- prop_miss_cols(data)
  col_props > prop
}

# Flag each row of airquality that has more than 10% of its values missing.
which_prop_miss_row(airquality, 0.1)
#>   [1] FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE
#>  [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#>  [25]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
#>  [37]  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE
#>  [49] FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
#>  [61]  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
#>  [73] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE
#>  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
#>  [97]  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE
#> [109] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE
#> [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#> [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#> [145] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
# Flag each variable of airquality that has more than 10% of its values missing.
which_prop_miss_var(airquality, 0.1)
#>   Ozone Solar.R    Wind    Temp   Month     Day 
#>    TRUE   FALSE   FALSE   FALSE   FALSE   FALSE

# Total number of rows in airquality, as a baseline for the filtered count.
nrow(airquality)
#> [1] 153

# Keep only the rows with more than 10% of their values missing and count them.
nrow(filter(airquality, which_prop_miss_row(airquality, 0.1)))
#> [1] 42

# Flag the variables once, then split the column positions by that flag.
is_over_10_pct <- which_prop_miss_var(airquality, 0.1)
vars_over_10_pct <- which(is_over_10_pct)
vars_under_10_pct <- which(!is_over_10_pct)
# Show the variables with more than 10% missing values as a tibble.
as_tibble(select(airquality, all_of(vars_over_10_pct)))
#> # A tibble: 153 × 1
#>    Ozone
#>    <int>
#>  1    41
#>  2    36
#>  3    12
#>  4    18
#>  5    NA
#>  6    28
#>  7    23
#>  8    19
#>  9     8
#> 10    NA
#> # … with 143 more rows

# Show the remaining variables (not over the 10% threshold) as a tibble.
as_tibble(select(airquality, all_of(vars_under_10_pct)))
#> # A tibble: 153 × 5
#>    Solar.R  Wind  Temp Month   Day
#>      <int> <dbl> <int> <int> <int>
#>  1     190   7.4    67     5     1
#>  2     118   8      72     5     2
#>  3     149  12.6    74     5     3
#>  4     313  11.5    62     5     4
#>  5      NA  14.3    56     5     5
#>  6      NA  14.9    66     5     6
#>  7     299   8.6    65     5     7
#>  8      99  13.8    59     5     8
#>  9      19  20.1    61     5     9
#> 10     194   8.6    69     5    10
#> # … with 143 more rows

Created on 2023-04-06 with reprex v2.0.2

Session info
# Print the session details (R version, platform, loaded packages) for this run.
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.2.3 (2023-03-15)
#>  os       macOS Ventura 13.2
#>  system   aarch64, darwin20
#>  ui       X11
#>  language (EN)
#>  collate  en_US.UTF-8
#>  ctype    en_US.UTF-8
#>  tz       Australia/Hobart
#>  date     2023-04-06
#>  pandoc   2.19.2 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package       * version date (UTC) lib source
#>  assertthat      0.2.1   2019-03-21 [1] CRAN (R 4.2.0)
#>  backports       1.4.1   2021-12-13 [1] CRAN (R 4.2.0)
#>  broom           1.0.3   2023-01-25 [1] CRAN (R 4.2.0)
#>  cellranger      1.1.0   2016-07-27 [1] CRAN (R 4.2.0)
#>  cli             3.6.0   2023-01-09 [1] CRAN (R 4.2.0)
#>  colorspace      2.1-0   2023-01-23 [1] CRAN (R 4.2.0)
#>  crayon          1.5.2   2022-09-29 [1] CRAN (R 4.2.0)
#>  DBI             1.1.3   2022-06-18 [1] CRAN (R 4.2.0)
#>  dbplyr          2.3.0   2023-01-16 [1] CRAN (R 4.2.0)
#>  digest          0.6.31  2022-12-11 [1] CRAN (R 4.2.0)
#>  dplyr         * 1.1.0   2023-01-29 [1] CRAN (R 4.2.1)
#>  ellipsis        0.3.2   2021-04-29 [1] CRAN (R 4.2.0)
#>  evaluate        0.20    2023-01-17 [1] CRAN (R 4.2.0)
#>  fansi           1.0.4   2023-01-22 [1] CRAN (R 4.2.0)
#>  fastmap         1.1.0   2021-01-25 [1] CRAN (R 4.2.0)
#>  forcats       * 1.0.0   2023-01-29 [1] CRAN (R 4.2.0)
#>  fs              1.6.1   2023-02-06 [1] CRAN (R 4.2.0)
#>  gargle          1.3.0   2023-01-30 [1] CRAN (R 4.2.0)
#>  generics        0.1.3   2022-07-05 [1] CRAN (R 4.2.0)
#>  ggplot2       * 3.4.1   2023-02-10 [1] CRAN (R 4.2.0)
#>  glue            1.6.2   2022-02-24 [1] CRAN (R 4.2.0)
#>  googledrive     2.0.0   2021-07-08 [1] CRAN (R 4.2.0)
#>  googlesheets4   1.0.1   2022-08-13 [1] CRAN (R 4.2.0)
#>  gtable          0.3.1   2022-09-01 [1] CRAN (R 4.2.0)
#>  haven           2.5.1   2022-08-22 [1] CRAN (R 4.2.0)
#>  hms             1.1.2   2022-08-19 [1] CRAN (R 4.2.0)
#>  htmltools       0.5.4   2022-12-07 [1] CRAN (R 4.2.0)
#>  httr            1.4.4   2022-08-17 [1] CRAN (R 4.2.0)
#>  jsonlite        1.8.4   2022-12-06 [1] CRAN (R 4.2.0)
#>  knitr           1.42    2023-01-25 [1] CRAN (R 4.2.0)
#>  lifecycle       1.0.3   2022-10-07 [1] CRAN (R 4.2.0)
#>  lubridate       1.9.1   2023-01-24 [1] CRAN (R 4.2.0)
#>  magrittr        2.0.3   2022-03-30 [1] CRAN (R 4.2.0)
#>  modelr          0.1.10  2022-11-11 [1] CRAN (R 4.2.0)
#>  munsell         0.5.0   2018-06-12 [1] CRAN (R 4.2.0)
#>  naniar          1.0.0   2023-02-02 [1] CRAN (R 4.2.0)
#>  pillar          1.8.1   2022-08-19 [1] CRAN (R 4.2.0)
#>  pkgconfig       2.0.3   2019-09-22 [1] CRAN (R 4.2.0)
#>  purrr         * 1.0.1   2023-01-10 [1] CRAN (R 4.2.0)
#>  R.cache         0.16.0  2022-07-21 [1] CRAN (R 4.2.0)
#>  R.methodsS3     1.8.2   2022-06-13 [1] CRAN (R 4.2.0)
#>  R.oo            1.25.0  2022-06-12 [1] CRAN (R 4.2.0)
#>  R.utils         2.12.2  2022-11-11 [1] CRAN (R 4.2.0)
#>  R6              2.5.1   2021-08-19 [1] CRAN (R 4.2.0)
#>  readr         * 2.1.3   2022-10-01 [1] CRAN (R 4.2.0)
#>  readxl          1.4.1   2022-08-17 [1] CRAN (R 4.2.0)
#>  reprex          2.0.2   2022-08-17 [1] CRAN (R 4.2.0)
#>  rlang           1.0.6   2022-09-24 [1] CRAN (R 4.2.0)
#>  rmarkdown       2.20    2023-01-19 [1] CRAN (R 4.2.0)
#>  rstudioapi      0.14    2022-08-22 [1] CRAN (R 4.2.0)
#>  rvest           1.0.3   2022-08-19 [1] CRAN (R 4.2.0)
#>  scales          1.2.1   2022-08-20 [1] CRAN (R 4.2.0)
#>  sessioninfo     1.2.2   2021-12-06 [1] CRAN (R 4.2.0)
#>  stringi         1.7.12  2023-01-11 [1] CRAN (R 4.2.0)
#>  stringr       * 1.5.0   2022-12-02 [1] CRAN (R 4.2.0)
#>  styler          1.9.0   2023-01-15 [1] CRAN (R 4.2.0)
#>  tibble        * 3.1.8   2022-07-22 [1] CRAN (R 4.2.0)
#>  tidyr         * 1.3.0   2023-01-24 [1] CRAN (R 4.2.0)
#>  tidyselect      1.2.0   2022-10-10 [1] CRAN (R 4.2.0)
#>  tidyverse     * 1.3.2   2022-07-18 [1] CRAN (R 4.2.0)
#>  timechange      0.2.0   2023-01-11 [1] CRAN (R 4.2.0)
#>  tzdb            0.3.0   2022-03-28 [1] CRAN (R 4.2.0)
#>  utf8            1.2.3   2023-01-31 [1] CRAN (R 4.2.0)
#>  vctrs           0.5.2   2023-01-23 [1] CRAN (R 4.2.0)
#>  visdat          0.6.0   2023-02-02 [1] local
#>  withr           2.5.0   2022-03-03 [1] CRAN (R 4.2.0)
#>  xfun            0.37    2023-01-31 [1] CRAN (R 4.2.0)
#>  xml2            1.3.3   2021-11-30 [1] CRAN (R 4.2.0)
#>  yaml            2.3.7   2023-01-23 [1] CRAN (R 4.2.0)
#> 
#>  [1] /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
#> 
#> ──────────────────────────────────────────────────────────────────────────────

There could also be equivalent `which_n_miss_row()` / `which_n_miss_var()` functions that threshold on the number of missing values rather than the proportion.