
First and foremost, for this vignette we need to install and load the following libraries.
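A setup chunk like the following is assumed; the exact set of packages is an inference from the calls used in the chunks below.

```r
# Assumed setup: packages used throughout this vignette.
library(dataRC)   # convert_files(), create_partial_dictionary(), ...
library(arrow)    # open_dataset(), read_parquet(), write_parquet()
library(dplyr)    # pipes and verbs used below
library(readxl)   # read_excel() to inspect the dictionary
library(stringr)  # str_detect() in the 2022 example
```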

Let's consider a sample of the Nonfetal Vital Statistics (NVS) dataset, a public Colombian dataset that records all deaths between 1998 and 2022 of people born alive. This data is published by the Colombian National Administrative Department of Statistics (DANE, by its Spanish acronym) on its website, where each year can be downloaded in traditional formats (sav, txt, csv, dta). The number of variables per year ranges from 48 to 69, while the number of rows can reach up to 363,089.

For this vignette I sampled 100 rows for each year (each row records a death). Of course, such a small sample does not demand significant computational resources. However, in its original form this dataset requires more than 2 GB of storage, a demand that is reduced to 123.8 MB by converting the files to parquet format. This efficiency gain can easily be replicated with the function convert_files(), as shown in the next chunk of code.

# Folder where is stored the data.
folder <- system.file('extdata', package = 'dataRC')
# The files to convert are all the ones that begin with "nofetal", have a
# four-digit year, and end with ".csv".
files <- list.files(folder, pattern = '^nofetal\\d{4}\\.csv$')
convert_files(
  folder,
  files, 
  # I want to convert the files to parquet.
  new_extension = 'parquet', 
  # I want to store the parquet data in the same folder.
  new_folder = folder,
  # Do not display progress messages.
  verbose = FALSE)

As this dataset has been collected since the previous century, it is no surprise that there have been many changes in how the data is stored. For instance, the table below shows that the number of variables per year has changed drastically across the years, and even for years with the same number of variables, the number of columns per datatype differs.

# The new parquet files.
files <- list.files(folder, pattern = '^nofetal\\d{4}\\.parquet$') 

# Get the number of columns per year.
cols_per_year <- sapply(files, function(file) arrow::open_dataset(
  file.path(folder, file)) %>% ncol) %>% unlist %>% unname
# Get the number of columns per year and column classes.
classes_per_year <- sapply(files, function(x) arrow::read_parquet(
  file.path(folder, x)) %>% sapply(class) %>% table) 
# Count the columns of a given class per year (0 if the class is absent).
count_class <- function(class_name) {
  sapply(classes_per_year, function(x) {
    n <- x[names(x) == class_name]
    if (length(n) > 0) n else 0
  }) %>% unlist %>% unname
}
character_per_year <- count_class('character')
integer_per_year <- count_class('integer')
logical_per_year <- count_class('logical')
numeric_per_year <- count_class('numeric')

data.frame(year = 1998:2022, n_vars = cols_per_year, 
           character_vars = character_per_year,
           integer_vars = integer_per_year,
           logical_vars = logical_per_year,
           numeric_vars = numeric_per_year
           ) %>% print
#>    year n_vars character_vars integer_vars logical_vars numeric_vars
#> 1  1998     48              2           36           10            0
#> 2  1999     48              7           39            2            0
#> 3  2000     48              2           36           10            0
#> 4  2001     48              8           39            0            1
#> 5  2002     48              7           40            1            0
#> 6  2003     48              7           39            1            1
#> 7  2004     48              8           39            0            1
#> 8  2005     48              8           39            0            1
#> 9  2006     48              7           39            1            1
#> 10 2007     48              7           40            1            0
#> 11 2008     68              7           54            7            0
#> 12 2009     68             12           54            2            0
#> 13 2010     68             10           53            5            0
#> 14 2011     68             11           54            3            0
#> 15 2012     69             13           51            4            1
#> 16 2013     69             15           50            3            1
#> 17 2014     68             11           54            3            0
#> 18 2015     68             13           50            5            0
#> 19 2016     68             11           51            6            0
#> 20 2017     68             13           39           16            0
#> 21 2018     68             12           48            8            0
#> 22 2019     56              3           50            3            0
#> 23 2020     55              3           46            6            0
#> 24 2021     56              4           50            2            0
#> 25 2022     57              4           40           13            0

Besides, the number of unique variable names across all years is 145, even though the vast majority of columns are common across years.

sapply(files, function(file) arrow::open_dataset(
    file.path(folder, file)) %>% names) %>% unlist %>% unname %>% unique %>% length
#> [1] 145

This kind of problematic scenario is well suited for the functions create_partial_dictionary() and sort_partial_dictionary(), which create a “partial” dictionary that eases the process of unifying the data.

dict_path <- file.path(folder, 'dict.xlsx')
unlink(dict_path)
create_partial_dictionary(folder, files, dict_path, verbose = FALSE)
sort_partial_dictionary(dict_path, overwrite = TRUE)
#> 
#> Save sorted dictionary in /home/runner/.cache/R/renv/library/dataRC-f4d37bc5/R-4.4/x86_64-pc-linux-gnu/dataRC/extdata/dict.xlsx.

dict <- readxl::read_excel(dict_path, sheet = 'colname') %>% print
#> # A tibble: 75 × 30
#>    uniname   uniclass  class_mode unique_classes   coverage nofetal1998.parquet
#>    <chr>     <chr>     <chr>      <chr>               <dbl> <chr>              
#>  1 A_DEFUN   integer   integer    integer               100 a_defun            
#>  2 ANO       integer   integer    integer               100 ano                
#>  3 AREA_RES  integer   integer    integer               100 area_res           
#>  4 ASIS_MED  integer   integer    integer               100 asis_med           
#>  5 C_BAS1    character character  character             100 c_bas1             
#>  6 C_MUERTE  NA        integer    integer; logical      100 c_muerte           
#>  7 CAU_HOMOL integer   integer    integer               100 cau_homol          
#>  8 COD_DPTO  integer   integer    integer               100 cod_dpto           
#>  9 COD_MUNIC integer   integer    integer               100 cod_munic          
#> 10 CODMUNOC  NA        integer    logical; integer      100 codmunoc           
#> # ℹ 65 more rows
#> # ℹ 24 more variables: nofetal1999.parquet <chr>, nofetal2000.parquet <chr>,
#> #   nofetal2001.parquet <chr>, nofetal2002.parquet <chr>,
#> #   nofetal2003.parquet <chr>, nofetal2004.parquet <chr>,
#> #   nofetal2005.parquet <chr>, nofetal2006.parquet <chr>,
#> #   nofetal2007.parquet <chr>, nofetal2008.parquet <chr>,
#> #   nofetal2009.parquet <chr>, nofetal2010.parquet <chr>, …
unlink(dict_path)

This is done by the previous chunk of code, which writes an xlsx file with two sheets: colname and colclass. Both sheets have one column per file: in the former, each column holds all the variable names of the file, while in the latter it holds the datatype (class) of each variable. Besides, the first column, uniname, is common to both sheets. Based on the variable names of all files, this column proposes a name to group common variables across files. Variables are considered common if their names are equal across files when ignoring upper and lower case. Of course, this is a naive approach, so the user must review whether the uninames can be reduced further. For instance, let's imagine that we have three files that each have a column with the month. In the first file this column is named month, in the second MONTH, and in the third mes (Spanish for month). The associated partial dictionary will propose two different uninames, MONTH and MES; however, the user should manually combine these uninames into one.
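As a minimal sketch of that grouping rule (plain base R, not the package internals), names that match after case-folding collapse into a single proposed uniname:

```r
# Hypothetical column names of the month variable across three files.
observed <- c('month', 'MONTH', 'mes')

# Grouping ignores case, so 'month' and 'MONTH' collapse into one
# proposed uniname, while 'mes' remains a separate one.
uninames <- unique(toupper(observed))
print(uninames)  # "MONTH" "MES"
```

This is why the MONTH/mes pair above still needs a manual merge: case-folding alone cannot know the two words mean the same thing.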

In turn, based on the colclass sheet, the colname sheet has three columns related to the classes of the variables per uniname. The first one is uniclass, which should be manually filled by the user with the desired class for all the variables associated with each uniname. To fill it, it is useful to look at the columns class_mode and unique_classes: the former holds the mode of the classes (or modes, in case of ties) and the latter holds the unique classes across files. As shown later, the uniname and uniclass columns will ease the processing of the data.
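As a hedged sketch of how class_mode and unique_classes can be derived from the per-file classes of a variable (base R only, not the package's actual implementation):

```r
# Hypothetical classes of one variable across five files.
classes <- c('integer', 'integer', 'logical', 'integer', 'logical')

tab <- table(classes)
# Mode(s): the most frequent class(es); ties would yield several modes.
class_mode <- names(tab)[tab == max(tab)]
# Unique classes across files, as reported in `unique_classes`.
unique_classes <- paste(sort(unique(classes)), collapse = '; ')

print(class_mode)      # "integer"
print(unique_classes)  # "integer; logical"
```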

For our sample of the NVS we have the following partial dictionary. The dictionary is sorted by uniname in descending order of number of matches; in the presence of ties, the natural sort order is used. For instance, from the first row (uniname == A_DEFUN) to the 37th (uniname == TIPO_EMB), every uniname has a matching variable in each file, so no manual work is needed. However, IDADMISALUD (2008-2011 and 2014-2022) and IDADMISALU (2012-2013) could be joined into one uniname.

dict %>% filter(uniname %in% c('IDADMISALUD', 'IDADMISALU'))
#> # A tibble: 2 × 30
#>   uniname     uniclass class_mode unique_classes coverage nofetal1998.parquet
#>   <chr>       <chr>    <chr>      <chr>             <dbl> <chr>              
#> 1 IDADMISALUD integer  integer    integer              52 NA                 
#> 2 IDADMISALU  integer  integer    integer               8 NA                 
#> # ℹ 24 more variables: nofetal1999.parquet <chr>, nofetal2000.parquet <chr>,
#> #   nofetal2001.parquet <chr>, nofetal2002.parquet <chr>,
#> #   nofetal2003.parquet <chr>, nofetal2004.parquet <chr>,
#> #   nofetal2005.parquet <chr>, nofetal2006.parquet <chr>,
#> #   nofetal2007.parquet <chr>, nofetal2008.parquet <chr>,
#> #   nofetal2009.parquet <chr>, nofetal2010.parquet <chr>,
#> #   nofetal2011.parquet <chr>, nofetal2012.parquet <chr>, …

The unification can be done in three steps. First, determine which uniname you want to keep; in my case I prefer IDADMISALUD, since it has more matches and IDADMISALU contains a Spanish typo. Second, update the rows of the selected uniname with the rows of the discarded uniname. Finally, delete the discarded uniname. This could easily be done directly in Excel; however, the following code also satisfies our needs.

print('Before unification:')
#> [1] "Before unification:"
dict %>% filter(uniname == 'IDADMISALUD') %>% 
  select(nofetal2012.parquet, nofetal2013.parquet)
#> # A tibble: 1 × 2
#>   nofetal2012.parquet nofetal2013.parquet
#>   <chr>               <chr>              
#> 1 NA                  NA

dict <- dict %>% 
  # Update the rows of `IDADMISALUD`.
  dplyr::mutate(
    nofetal2012.parquet = dplyr::if_else(
      uniname == 'IDADMISALUD', 'IDADMISALU', nofetal2012.parquet),
    nofetal2013.parquet = dplyr::if_else(
      uniname == 'IDADMISALUD', 'IDADMISALU', nofetal2013.parquet)) %>% 
  # Delete `IDADMISALU`.
  dplyr::filter(uniname != 'IDADMISALU')

print('After unification:')
#> [1] "After unification:"
dict %>% dplyr::filter(uniname == 'IDADMISALUD') %>% 
  dplyr::select(nofetal2012.parquet, nofetal2013.parquet)
#> # A tibble: 1 × 2
#>   nofetal2012.parquet nofetal2013.parquet
#>   <chr>               <chr>              
#> 1 IDADMISALU          IDADMISALU

The manual review of the uninames and uniclasses can be carried out gradually, as each variable is needed. For instance, let's suppose that we are interested in answering whether women or men have a higher number of deaths in Colombia. For this we just need the year and sex variables. By looking at our dictionary, we notice that the uninames ANO and SEXO have an integer-class variable in all files (although the names differ in case across files).

dict %>% dplyr::filter(uniname %in% c('ANO', 'SEXO'))
#> # A tibble: 2 × 30
#>   uniname uniclass class_mode unique_classes coverage nofetal1998.parquet
#>   <chr>   <chr>    <chr>      <chr>             <dbl> <chr>              
#> 1 ANO     integer  integer    integer             100 ano                
#> 2 SEXO    integer  integer    integer             100 sexo               
#> # ℹ 24 more variables: nofetal1999.parquet <chr>, nofetal2000.parquet <chr>,
#> #   nofetal2001.parquet <chr>, nofetal2002.parquet <chr>,
#> #   nofetal2003.parquet <chr>, nofetal2004.parquet <chr>,
#> #   nofetal2005.parquet <chr>, nofetal2006.parquet <chr>,
#> #   nofetal2007.parquet <chr>, nofetal2008.parquet <chr>,
#> #   nofetal2009.parquet <chr>, nofetal2010.parquet <chr>,
#> #   nofetal2011.parquet <chr>, nofetal2012.parquet <chr>, …

To achieve our objective we can employ the following code. Initially it could seem a bit overwhelming; however, the intuition is simple. You begin by determining which columns you need in SELECTED_COLUMNS <- c('ANO', 'SEXO'). Then, for each file, keep only the desired columns, rename them to their uninames, compute the number of deaths per sex, and save the results in df. Finally, calculate the proportion of deaths per sex in each year. Notice that the vast majority of the code iterates over the files and unifies the names, so you just need to modify three things:

  • Which columns you need in SELECTED_COLUMNS <- c('ANO', 'SEXO').
  • The processing per file.
  • The processing of the whole data.

In the following chunk, these amount to just four lines; the rest is the same for any scenario. In other words, this is a template!

# BEGIN MODIFY: Which columns do you need?
SELECTED_COLUMNS <- c('ANO', 'SEXO')
# END MODIFY. 

df <- NULL
for (file in files) {
  # Open file with lazy evaluation for speed (you could also use
  # `read_parquet()`).
  df0 <- arrow::open_dataset(file.path(folder, file)) %>%
    unify_colnames(dict, file, SELECTED_COLUMNS) %>% 
    unify_classes(dict, file, SELECTED_COLUMNS) %>% 
    relocate_columns(SELECTED_COLUMNS) %>% 
    # BEGIN MODIFY: Process your data as required with the `uniname`s.
    dplyr::group_by(ANO, SEXO) %>% 
    summarise(n_ = n()) %>% ungroup %>% 
    # END MODIFY. 
    
    # End lazy evaluation. If you use `read_parquet()` this is unnecessary.
    collect
  
  # Save the iteration results.
  # NOTE: If all files are going to have the same columns, it is recommended
  # to use `rbind()`.
  df <- dplyr::bind_rows(df, df0)
}
# MODIFY: Process your data as required.
df %>% group_by(ANO) %>% mutate(prop_ = n_ / sum(n_))
#> # A tibble: 50 × 4
#> # Groups:   ANO [25]
#>      ANO  SEXO    n_ prop_
#>    <int> <int> <int> <dbl>
#>  1  1998     2    43  0.43
#>  2  1998     1    57  0.57
#>  3  1999     1    78  0.78
#>  4  1999     2    22  0.22
#>  5  2000     2    38  0.38
#>  6  2000     1    62  0.62
#>  7  2001     1    78  0.78
#>  8  2001     2    22  0.22
#>  9  2002     2    40  0.4 
#> 10  2002     1    60  0.6 
#> # ℹ 40 more rows
# END MODIFY. 

But what if, for a given uniname, the variable has different classes across files? Well, we need a small extra step. To see this, let's artificially change the class of ANO from integer to character in the 2022 file.

path_file_2022 <- file.path(folder, files[stringr::str_detect(files, '2022')])
arrow::open_dataset(path_file_2022) %>% 
  mutate(ANO = as.character(ANO)) %>% 
  write_parquet(path_file_2022)

Now, this is not a major problem: if we run the previous code we obtain the same results, with the only difference that ANO will now be a character. However, in many scenarios this could be problematic and inefficient. To solve it we need to make two changes to our current methodology. First, we must manually fill the uniclass column of our dictionary for the problematic variable (in this case, ANO must be an integer). Again, this could easily be done in Excel, but for our purposes we will use code.

dict <- dict %>% mutate(uniclass = if_else(uniname == 'ANO', 'integer', uniclass))

Then, we will add a few lines to our template so that, for each file, the selected columns are forced to have the datatype given in uniclass. The following code supports "integer", "logical", "numeric", "character" and "date". This makes our template larger, but does not add more work for us.
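As a minimal sketch of what such class forcing could look like in plain R (a hypothetical helper, not the actual `unify_classes()` implementation), covering the five supported classes:

```r
# Hypothetical helper: coerce a vector to one of the supported classes.
force_class <- function(x, uniclass) {
  switch(uniclass,
         integer   = as.integer(x),
         logical   = as.logical(x),
         numeric   = as.numeric(x),
         character = as.character(x),
         date      = as.Date(x),
         stop('Unsupported class: ', uniclass))
}

# E.g. the character ANO of the 2022 file goes back to integer.
force_class(c('2020', '2021'), 'integer')  # 2020 2021
```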

# BEGIN MODIFY: Which columns do you need?
SELECTED_COLUMNS <- c('ANO', 'SEXO')
# END MODIFY. 

df <- NULL
for (file in files) {
  # Open file with lazy evaluation for speed (you could also use
  # `read_parquet()`).
  df0 <- arrow::open_dataset(file.path(folder, file)) %>%
    # Select `SELECTED_COLUMNS` and rename it with the `uniname`s.
    unify_colnames(dict, file, SELECTED_COLUMNS) %>% 
    # Force the class. NOTE: Lazy evaluation is picky, so if you have
    # problematic values (e.g. "" to numeric), you will need to filter those
    # observations or collect.
    unify_classes(dict, file, SELECTED_COLUMNS) %>% 
    # Preserve order.
    relocate_columns(SELECTED_COLUMNS) %>% 
    
    # BEGIN MODIFY: Process your data as required with the `uniname`s. NOTE:
    # Again, lazy evaluation is picky and is still under development, so it
    # does not support every function. Some operations, such as
    # `dplyr::slice()` or window functions, cannot be done within it. If
    # that is the case, you can collect and do the processing normally;
    # however, this could consume a lot of time, so you may need to be
    # creative and look for another way to achieve the objective. One option
    # is to employ `arrow::to_duckdb()`, which allows you to use more
    # functions without increasing the time consumption too much.
    dplyr::group_by(ANO, SEXO) %>% 
    summarise(n_ = n()) %>% ungroup %>% 
    # END MODIFY. 
  
    # End lazy evaluation. If you use `read_parquet()` this is unnecessary.
    collect
  
  # Save the iteration results.
  # NOTE: If all files are going to have the same columns, it is recommended
  # to use `rbind()`.
  df <- dplyr::bind_rows(df, df0)
}
# MODIFY: Process your data as required.
df %>% group_by(ANO) %>% mutate(prop_ = n_ / sum(n_))
#> # A tibble: 50 × 4
#> # Groups:   ANO [25]
#>      ANO  SEXO    n_ prop_
#>    <int> <int> <int> <dbl>
#>  1  1998     2    43  0.43
#>  2  1998     1    57  0.57
#>  3  1999     1    78  0.78
#>  4  1999     2    22  0.22
#>  5  2000     2    38  0.38
#>  6  2000     1    62  0.62
#>  7  2001     1    78  0.78
#>  8  2001     2    22  0.22
#>  9  2002     2    40  0.4 
#> 10  2002     1    60  0.6 
#> # ℹ 40 more rows
# END MODIFY.