
Correct the columns' order
relocate_columns.Rd
This function is part of a group of functions intended to solve a scenario
where there is equivalent data that is potentially stored heterogeneously
(e.g., different column names and datatypes). In particular, this function
relocates the columns to follow the order of selected_columns
.
Arguments
- df
Data frame, data frame extension (e.g. a tibble), or a lazy data frame (e.g. from dbplyr or dtplyr). Its columns should be a subset of
selected_columns
.
- selected_columns
Atomic vector that is a subset of the uninames column of dict. In other words, it is the set of desired columns, each of which should be present in at least one file of the group of files.
Value
An object of the same type as df whose columns are potentially relocated to replicate the order of selected_columns.
See also
For a full example, see the vignette
process_data_with_partial_dict
in the
website
or with the command vignette('process_data_with_partial_dict', package = 'dataRC')
.
Examples
if (FALSE) {
  # Parameters
  folder <- 'my_folder'
  files <- list.files(folder)
  dict <- readxl::read_excel('my_dict.xlsx')
  selected_columns <- dict$uniname[1L:3L]

  # Make a unique database using lazy evaluation.
  # Each file is opened as an Arrow dataset, passed through unify_colnames()
  # and unify_classes(), and only materialised by collect(). The per-file
  # results are gathered in a list and bound once, rather than growing `df`
  # on every loop iteration.
  df <- lapply(files, function(file) {
    arrow::open_dataset(file.path(folder, file)) %>%
      unify_colnames(dict, file, selected_columns) %>%
      unify_classes(dict, file, selected_columns) %>%
      dplyr::collect()
  }) %>%
    dplyr::bind_rows()

  # Reorder the columns to follow selected_columns, then save the result.
  df %>%
    relocate_columns(selected_columns) %>%
    arrow::write_parquet('unified_data.parquet')
}
if (FALSE) {
  # Parameters
  folder <- 'my_folder'
  files <- list.files(folder)
  dict <- readxl::read_excel('my_dict.xlsx')
  selected_columns <- c('ID', 'YEAR', 'MONTH')

  # Count number of people per month using lazy evaluation.
  # open_dataset() (instead of read_parquet(), which loads the whole file
  # into memory) keeps every step up to collect() lazy, so only the small
  # per-file summary is materialised. n() is left unqualified so that arrow
  # can translate it as part of the lazy query.
  counts_per_file <- lapply(files, function(file) {
    arrow::open_dataset(file.path(folder, file)) %>%
      unify_colnames(dict, file, selected_columns) %>%
      unify_classes(dict, file, selected_columns) %>%
      dplyr::distinct() %>%
      dplyr::group_by(YEAR, MONTH) %>%
      dplyr::summarise(n_ = n()) %>%
      dplyr::collect()
  })

  # Bind the per-file summaries once, rather than calling rbind() in a loop.
  df <- dplyr::bind_rows(counts_per_file)

  # Add up the per-file counts to get the total per month; `.groups = 'drop'`
  # makes sure an ungrouped table is written to disk.
  df %>%
    relocate_columns(selected_columns) %>%
    dplyr::group_by(YEAR, MONTH) %>%
    dplyr::summarise(n_ = sum(n_), .groups = 'drop') %>%
    arrow::write_parquet('people_per_month.parquet')
}