Seed Notebook: SALURBAL Templating Data

Author

Ran Li, Jess Uruchima

Published

June 24, 2024

Standard Notebook Setup
{ ## 'Global Setup (For any notebook)'
  
  #### Configure relative path (Change!)
  path_global_setup_function = '../../R/setup/global_setup.R'
  
  #### Generic global setup code (Do not change!)
  source(path_global_setup_function)
  setup = global_setup(path_global_setup_function)
  invisible(list2env(setup$sourced_objects, envir = .GlobalEnv))
  global_context = setup$global_context
}

{ ## Context
  
  ## Local Objects
  path_templating_data_folder = generate_relative_path(global_context$notebook_relative_root,
                                                       'datasets/_templating_data/')
  path_seeds_folder = generate_relative_path(global_context$notebook_relative_root,'clean/seeds')
  path_templating_data_seed = file.path(path_seeds_folder, 'templating_data.xlsx')
  path_templating_dictionary_seed = file.path(path_seeds_folder, 'templating_dictionary.json')
  path_templating_df_seed = file.path(path_seeds_folder, 'templating_data.parquet')
  
  ## Local Context
  context = c(global_context)
  
}

The SALURBAL system uses composite keys and dataframe templating to handle the highly heterogeneous relationships between data and metadata. Our dictionary table is organized the same way: it is grouped by composite key combinations such as by_key, by_key_iso2, by_key_iso2_year, and by_key_dataset_iso2.
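
For example, a metadata field such as source_value can be defined once for a key, or separately for each country and year. The sketch below illustrates the two shapes such tables can take; the by_key row mirrors actual values shown later in this notebook, while the by_key_iso2_year rows are hypothetical.

Heterogeneous Metadata Sketch
## A value defined once per key applies everywhere that key is used ...
by_key_example = tribble(
  ~key,      ~source_value,
  "{{BRT}}", "BRTData"
)

## ... while a (hypothetical) by_key_iso2_year entry differs by country and year.
by_key_iso2_year_example = tribble(
  ~key,         ~iso2, ~year,  ~source_value,
  "{{Census}}", "BR",  "2010", "<Brazil 2010 census source>",
  "{{Census}}", "MX",  "2010", "<Mexico 2010 census source>"
)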

This notebook contains the logic for ingestion, processing, and testing/validation of SALURBAL key-value dictionary tables. The source tables (the templating .xlsx files in the _templating_data folder) are ingested and processed into a single intermediate data structure for validation. The validation process includes unit tests for structure and content issues. If the source tables pass the validation tests, they are exported to the clean/seeds folder as a single .xlsx file (along with a JSON templating dictionary and a Parquet dataframe) for use during the ETL process.

1. Operationalize

1.1 Import Tables

Import individual templating tables.

Listing Source Files
files_all = list.files(path = path_templating_data_folder, pattern = '.xlsx')  %>%
  discard(~str_detect(.x,'~')) %>% 
  keep(~str_detect(.x,'src__|context__')) 
src_files = files_all %>%  keep(~str_detect(.x,'^src__'))
context_files = files_all %>%  keep(~str_detect(.x,'^context__'))
files_all
 [1] "context__key.xlsx"                  "src__bec-brt.xlsx"                 
 [3] "src__bec-global-surface-water.xlsx" "src__bec-min-wage.xlsx"            
 [5] "src__bec-osm.xlsx"                  "src__bec_coastline.xlsx"           
 [7] "src__bec_gas-price.xlsx"            "src__bec_ggmcf.xlsx"               
 [9] "src__bec_NDVI.xlsx"                 "src__bec_sentinel.xlsx"            
[11] "src__bec_simple_sources.xlsx"       "src__bec_SRTM.xlsx"                
[13] "src__bec_surface_water.xlsx"        "src__census.xlsx"                  
[15] "src__employment_survey.xlsx"        "src__fb-popdensity.xlsx"           
[17] "src__health-survey-adult.xlsx"      "src__health-survey-child.xlsx"     
[19] "src__household_survey.xlsx"         "src__political-participation.xlsx" 
[21] "src__population-BEC.xlsx"           "src__population.xlsx"              
[23] "src__spatial_l3.xlsx"               "src__temperature.xlsx"             
[25] "src__urban-footprint.xlsx"          "src__vehicle-registration.xlsx"    
[27] "src__vital_registration.xlsx"      

Currently we have 26 data source files and 1 utility templating data file. The first step is to compile them all into a single intermediate source data structure so that they can be validated by testing for structure and content issues.

Importing Source Files
## Value columns that every templating table must provide (checked by valid_value_cols below)
required_cols = c("public_value", "source_value", "acknowledgements_value")

src_schema = lst(
  pk = c('key','iso2','year','dataset','dataset_id'),
  fields = c('public_value','source_value','acknowledgements_value',
             'source_URL_value','source_terms_of_use_URL_value',
             'value'),
  columns = c(pk, fields)
)

## import all templating data files
templating_data_all = list.files(path = path_templating_data_folder, 
                        pattern = '.xlsx',
                        full.names = T) %>% 
  # keep(~str_detect(.x,'employ'))
  discard(~str_detect(.x,'~')) %>% 
  discard(~str_detect(.x,'//src')) %>% 
  map(~import_salurbal_xslx_src(.x, src_schema) %>% 
        map(~unpack_columns(.x, list(context$vec__admin_composite_keys_all))))

templating_data_all_names = templating_data_all %>% map(~names(.x)) %>% unlist() %>% unique() %>% sort()



templating_data = templating_data_all_names %>% 
  map(function(table_name){
    # table_name = 'by_key'
    src_sheet_raw = templating_data_all %>% 
      map(~.x[[table_name]]) %>% 
      discard(~is.null(.x)) %>% 
      bind_rows() %>% 
      unpack_string_columns()
    ## Initialize required columns
    missing_cols <- setdiff(required_cols, names(src_sheet_raw))
    if (length(missing_cols)>0){
      missing_cols %>% 
        walk(~{
          add_cols <- map(set_names(.x), ~ rep("", nrow(src_sheet_raw)))
          src_sheet_raw <<- bind_cols(src_sheet_raw, add_cols)
        })
    }
    return(src_sheet_raw)
  }) %>% 
  set_names(templating_data_all_names) 
Preview Key and Composite Key Combinations
summary_table <- map_dfr(names(templating_data), function(list_name) {
  templating_data[[list_name]] %>%
    group_by(key) %>%
    summarise(
      row_count = n(),
      composite_key = list_name,
      .groups = "drop"
    )
}) %>%
  arrange(desc(row_count)) %>%
  select(key,composite_key,  row_count)

reactable(summary_table,
          searchable = T)

1.2 Templating data

Templating engines require key-value pairs during the rendering process. The information we have now is in the form of a dataframe that includes both the root keyword and some positional information (iso2, year, dataset). To make this composite information available to the templating engine, we operationalize a composite key string that captures this relational information, so the correct value can be looked up during rendering. A few notes:

  • The order of the composite key components must be standardized.
    • Let's standardize it to: sort(c('dataset_id', 'iso2', 'strata_id', 'geo', 'year')); see the sketch below.
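
As a minimal sketch (using hypothetical component values) of the composite key string built by the glue() call in tidy_templating_data below: components that do not apply for a given table are left as empty strings and simply collapse into consecutive underscores.

Composite Key String Sketch
## Format: {{<key>__<dataset_id>__<geo>__<iso2>__<strata_id>__<year>__<column>}}
key_string = "BRT"       # root keyword with the braces stripped
dataset_id = ""; geo = ""; iso2 = ""; strata_id = ""; year = ""
column     = "public"    # value column with the `_value` suffix removed

glue("{{{{{key_string}__{dataset_id}__{geo}__{iso2}__{strata_id}__{year}__{column}}}}}")
#> {{BRT____________public}}
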
## Helper functions
add_linkage_column <- function(df_list) {
  df_list %>%
    imap(~ .x %>% mutate(linkage = .y))
}


generate_core_composite_key_columns = function(table_tmp, context){
  
  cols = setdiff(context$vec__admin_composite_keys_all, names(table_tmp))
  new_cols <- cols %>%
    set_names() %>%
    map(~ rep('', nrow(table_tmp)))
  
   table_wide = table_tmp %>%
      # Add missing columns as empty strings
      add_column(!!!new_cols) %>%
      # Ensure all specified columns are character type
      mutate(across(all_of(context$vec__admin_composite_keys_all), as.character)) %>% 
      select(all_of(context$vec__admin_composite_keys_all), everything())
   
  return(table_wide)
}


tidy_templating_data = function(table_tmp, context){
    # table_tmp = templating_data %>% add_linkage_column() %>% pluck(1)
    
    ## Add missing composite key columns
    table_wide = generate_core_composite_key_columns(table_tmp, context)
    
 
    
    ## Tidy
    composite_keys = c('key',context$vec__admin_composite_keys_all,'linkage')
    table_tidy = table_wide %>% 
      pivot_longer(cols = -all_of(composite_keys), names_to = 'column', values_to = 'value') %>% 
      filter(!is.na(value)) %>% 
      filter(!value=='') %>% 
      filter(str_detect(key, "\\{\\{"))
    
    ## Process  composite_key
    tidy_template_data = table_tidy %>% 
      mutate(column = str_remove_all(column, '_value'),
             key_string = str_remove_all(key, "\\{|\\}")) %>% 
      mutate(composite_key = glue("{{{{{key_string}__{dataset_id}__{geo}__{iso2}__{strata_id}__{year}__{column}}}}}")) %>% 
      select(composite_key, value, everything()) 
    
    return(tidy_template_data)
    
}

## Main logic
df_templating_data = templating_data %>% 
  add_linkage_column() %>% 
  map_df(~{
    tidy_templating_data( .x, context)
  })  

head(df_templating_data)
composite_key                                value                           key                     linkage  column                   key_string
{{BRT____________public}}                    1                               {{BRT}}                 by_key   public                   BRT
{{BRT____________source}}                    BRTData                         {{BRT}}                 by_key   source                   BRT
{{BRT____________acknowledgements}}          BRTData https://brtdata.org     {{BRT}}                 by_key   acknowledgements         BRT
{{BRT____________source_URL}}                https://brtdata.org             {{BRT}}                 by_key   source_URL               BRT
{{BRT____________source_terms_of_use_URL}}   https://brtdata.org/info/terms  {{BRT}}                 by_key   source_terms_of_use_URL  BRT
{{GlobalSurfaceWater____________public}}     1                               {{GlobalSurfaceWater}}  by_key   public                   GlobalSurfaceWater
(Columns not shown: dataset_id, dataset_instance, dataset_version, day, geo, iso2, month, observation_id, observation_type, schema_version, strata_id, var_name, and year; all are empty in these by_key rows.)

We now have a tidy dataframe that includes the composite key information. This will be used to generate the templating dictionary.

dictionary_templating_data =  as.list(
  setNames(df_templating_data$value, 
           str_remove_all(df_templating_data$composite_key, "\\{\\{|\\}\\}")))

head(dictionary_templating_data)
$BRT____________public
[1] "1"

$BRT____________source
[1] "BRTData"

$BRT____________acknowledgements
[1] "BRTData https://brtdata.org"

$BRT____________source_URL
[1] "https://brtdata.org"

$BRT____________source_terms_of_use_URL
[1] "https://brtdata.org/info/terms"

$GlobalSurfaceWater____________public
[1] "1"

2. Validation

Now that we have our upstream source tables ingested, let's do some testing to validate both the structure and the integrity of the source dictionary.

2.1 Unit Tests

Unit tests include:

  • unique_composite_keys: checks that all composite keys are unique

  • valid_value_cols: checks that all tables have the three expected value columns (public_value, source_value, acknowledgements_value)

  • valid_key: keys must not contain underscores; only numbers, letters, and dashes are allowed

  • valid_src_utf8: checks that everything is valid UTF-8 encoding

  • valid_no_seperator: checks that no separator characters (;) are present, as everything should already be tidy

  • valid_tidy_iso2_year: checks that src__by_iso2_year is tidy

  • valid_pk: checks that the primary key columns match the table names

unique_composite_keys
df_invalid = df_templating_data %>% 
  add_count(composite_key) %>%
  filter(n>1)

if (nrow(df_invalid) > 0 ) cli_abort("Some composite keys are not unique. Please check!")
valid_key
df_invalid = df_templating_data %>% 
  filter(str_detect(key, "_"))

if (nrow(df_invalid) > 0 ) cli_abort('Some keys contain invalid characters (underscores). Please make sure that no keys contain underscores.')
valid_value_cols
valid_value_cols = templating_data %>% 
  imap_lgl(~{
    table_tmp = .x
    index <- .y
    valid_tmp = all(required_cols%in%names(table_tmp))
    if (!valid_tmp) cli_alert_warning("{index} Missing expected value columns")
    return(valid_tmp)
  }) %>% 
  all()

if (!valid_value_cols) cli_abort('Some src sheets are missing expected columns.')
valid_utf8
valid_src_utf8 = templating_data %>% 
  map_lgl(~validate_df_utf8(.x)) %>% 
  all()

if (!valid_src_utf8) cli_abort('_src `valid_utf8` ERROR: invalid encoding detected!')
valid_tidy_iso2_year
valid_tidy_iso2_year = templating_data$by_key_iso2_year %>% 
  add_count(key, iso2, year) %>% 
  filter(n > 1) %>%  # more than one row per key/iso2/year means the table is not tidy
  nrow() == 0 

if (!valid_tidy_iso2_year) cli_abort('_src `valid_tidy_iso2_year` ERROR: _src_iso2_year is not tidy. please check')
valid_pk
qc_keys_per_table = templating_data %>%
  map(~{
    pk_tmp_str = .x %>% 
      select(-contains('value')) %>% 
      names() %>% 
      discard(~.x=='key') %>% 
      sort() %>% 
      recode('dataset_id' = 'dataset') %>% 
      paste(collapse = '_')
    if (pk_tmp_str == '') { 
      return('by_key')
    } else { 
      return(paste0("by_key_",pk_tmp_str))
    }
  })

## Parenthesize the comparison so it is evaluated before piping into keep()
invalid_tables = (unlist(qc_keys_per_table) == names(qc_keys_per_table)) %>% keep(~.x == FALSE)

valid_pk = length(invalid_tables) == 0

if (!valid_pk) cli_abort('_src `valid_pk` ERROR: table name and primary keys in src tables are inconsistent. Please check!')
valid_no_seperator
# Quarantined for now... will come back to this.
valid_no_seperator = map2_lgl(
  templating_data, names(templating_data),
  function(table, table_name){
    # table_name = 'by_key_iso2'; table = templating_data[[table_name]]
    invalid_rows = table %>%
      mutate(across(where(is.character), ~ str_detect(., ";"))) %>%
      mutate(r = row_number()) %>%
      filter(rowSums(across(where(is.logical))) > 0) %>%
      pull(r)
    df_invalid = table %>% slice(invalid_rows)
    valid_tmp = nrow(df_invalid) == 0
    if (!valid_tmp) {
      cli_alert_danger("{table_name} has invalid separators! Please check!")
    }
    return(valid_tmp)
  }) %>%
  all()

2.2 Test Results

  • valid_value_cols
  • valid_src_utf8
  • valid_tidy_iso2_year
  • valid_pk
valid = all(valid_value_cols,
            valid_src_utf8, 
            # valid_no_seperator, 
            valid_tidy_iso2_year, 
            valid_pk)    
if (!valid) cli_abort('Some tests failed. Please check the results above.')

2.3 Verify assertions

The primary export is a named list that originates from a dataframe. Let's just test that dataframe.

df_templating_data = df_templating_data  %>%
    assert(no_salurbal_seperator, everything(), -value, error_fun = assertr::error_report)
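
Note that no_salurbal_seperator is a predicate sourced during global setup rather than defined in this notebook. A minimal sketch of what such an assertr-style column predicate could look like (an assumption, not the actual definition):

Predicate Sketch
## Hypothetical sketch: TRUE where the value contains no ";" separator (NAs pass).
no_salurbal_seperator_sketch <- function(x) {
  is.na(x) | !grepl(";", x, fixed = TRUE)
}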

3. Export

## Export seeds (xlsx, JSON, parquet)
if (valid){
  
  ## Export templating data excel
  write_salurbal_xlsx(
    list_cdbk = templating_data,
    path = path_templating_data_seed,
    local_context = NULL)
  dictionary_templating_data %>% 
    jsonlite::write_json(path_templating_dictionary_seed, 
                         pretty = T, auto_unbox  = T)
  df_templating_data %>% arrow::write_parquet(sink = path_templating_df_seed)
 
  ## Snapshot
  snapshot_excel(folder =  path_templating_data_folder)
  snapshot_excel(path =  path_templating_data_seed )
  
  ## Export for Reporting
  summary_table %>% 
    arrow::write_parquet(
      file.path(context$repo_reports_source_templating_data,'summary_table.parquet')
    )
  templating_data$by_key %>% 
    arrow::write_parquet(
      file.path(context$repo_reports_source_templating_data,'templating_data__by_key.parquet')
    )
  templating_data$by_key_iso2 %>% 
    arrow::write_parquet(
      file.path(context$repo_reports_source_templating_data,'templating_data__by_key_iso2.parquet')
    )
  templating_data$by_key_iso2_year %>%
    arrow::write_parquet(
      file.path(context$repo_reports_source_templating_data,'templating_data__by_key_iso2_year.parquet')
    )
  templating_data$by_key_dataset_iso2 %>%
    arrow::write_parquet(
      file.path(context$repo_reports_source_templating_data,
                'templating_data__by_key_dataset_iso2.parquet')
    )
  
  ## CLI
  cli_alert_success('Valid templating data + seeds written to /clean/seeds/templating_data.xlsx')
}

Preview Templating Data

The table below shows the templating keys available and the composite key combinations at which each key is defined.

Below we provide a preview of each of the tables within the SALURBAL Templating Data resource. You can use the ‘Search’ box in each of these tables for a preliminary look at the templating data available to other processes within the SALURBAL system.

by_key

by_key_iso2

by_key_iso2_year

by_key_dataset_iso2

Preview Templating Dictionary

reactable(df_templating_data,
          searchable = T)