SALURBAL Portal Marts v1.1

Authors

Ran Li (Maintainer)

Jessica Uruchima

Published

October 5, 2024

This is the legacy workflow for operationalizing JSON mart items for the portal. It pulls from two upstream notebooks: the cube notebooks that produce the validated metadata cube and the validated data cube opened in the setup below.

There are a few sections to this notebook:

  • 0. Setup
  • 1. Global Items
  • 2. Front pages
  • 3. Data Catalog + Data About pages
  • 4. Variable Specific Pages
  • 5. Check out page

0. Setup

Setup
{ ## 'Global Setup (For any notebook)'
  
  #### Configure relative path (Change!)
  path_global_setup_function = '../../../../R/setup/global_setup.R'
  
  #### Generic global setup code (Do not change!)
  source(path_global_setup_function)
  setup = global_setup(path_global_setup_function)
  invisible(list2env(setup$sourced_objects, envir = .GlobalEnv))
  global_context = setup$global_context
}

{ ## Local Setup (For any notebook)
  
  local_context_object = lst(
    cache_folder = file.path(
      global_context$path_server_etl_cache, 
      '_salurbal_cube_v1'),
    cache_path_validated_metadata_cube = file.path(
      cache_folder, 'validated_metadata_cube.parquet' ),
    cache_path_validated_data_cube = file.path(
      cache_folder, 'validated_data_cube.parquet' ),
    path_cache_utility_1 = file.path(
      global_context$path_server_etl_cache, 
      '_salurbal_portal_mart_v1',
      'utility_1.parquet'),
    path_cache_utility_2 = file.path(
      global_context$path_server_etl_cache, 
      '_salurbal_portal_mart_v1',
      'utility_2.parquet')
  )
  
  ## Local Objects
  local_context = c(local_context_object, global_context)
  validated_metadata_cube = local_context$cache_path_validated_metadata_cube %>%
    arrow::open_dataset()
  validated_data_cube = local_context$cache_path_validated_data_cube %>% 
    arrow::open_dataset()

}
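
Note that validated_metadata_cube and validated_data_cube are lazy Arrow datasets: dplyr verbs on them only build a query plan, and nothing is read from disk until collect() is called. A minimal illustration of the pattern used throughout this notebook:

Code
## Lazy Arrow dataset: the count() is only planned here...
query = validated_metadata_cube %>% count(domain)

## ...and executed (read from the parquet files) on collect()
query %>% collect()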

1. Global Items

1.1 iso2

Code
## Operationalize
df_iso2 = local_context$xwalk_iso2 %>%
  rename(country = country_label, country_url = country_name) %>% 
  arrange(salid0)

## Write
df_iso2 %>%
  jsonlite::toJSON() %>%
  xfun::write_utf8(
    file.path(
      global_context$notebook_relative_root,
      "clean/4-salurbal-hub/0-seeds/iso2.json"
    )
  )

## Preview
head(df_iso2)

1.2 centroids

Code
## Operationalize
df_centroids = read_parquet(local_context$l1ad_centroids_df_path) %>%
  select(country = country_label,
         salid1 = id1,
         salid1_name = l1_name,
         salid1_label = l1_label,
         lng = long,
         lat,
         iso2) %>% 
  arrange(salid1)

## Write
df_centroids %>%
  jsonlite::toJSON() %>%
  xfun::write_utf8(
    file.path(
      global_context$notebook_relative_root,
      "clean/4-salurbal-hub/0-seeds/centroids.json"
    )
  )

## Preview
head(df_centroids)

1.3 City metadata

Code
## Operationalize
xwalk_l2_count  = local_context$xwalk_l2 %>%
  count(psalid1) %>%
  select( salid1 = psalid1,
          n_l2 = n) %>% 
  arrange(salid1)

## Write
xwalk_l2_count %>%
  jsonlite::toJSON() %>%
  xfun::write_utf8(
    file.path(
      global_context$notebook_relative_root,
      "clean/4-salurbal-hub/0-seeds/l1ad_metadata.json"
    )
  )

## Preview
head(xwalk_l2_count)

1.4 Domains

Code
## Operationalize
xwalk_domain_icon = read_excel(
  file.path(
    global_context$notebook_relative_root,
    "code/domains/domains_subdomains_icons.xlsx"
  ),
  sheet = 'domains')  %>%
  mutate_all(~str_squish(.x))
df_domains = validated_metadata_cube %>%
  count(domain,var_name) %>% 
  count(domain, name = 'n_vars') %>%
  collect() %>% 
  mutate(url = str_to_lower(domain) %>% 
           str_replace_all(" ", "-") %>% 
           str_remove_all(',')) %>%
  left_join(xwalk_domain_icon) %>%
  left_join(local_context$df_domains ) %>%
  rename(name = domain) %>% 
  arrange(name)

## Export
df_domains %>%
  jsonlite::toJSON(pretty = T) %>%
  xfun::write_utf8(
    file.path(
      global_context$notebook_relative_root,
      "clean/4-salurbal-hub/0-seeds/domains.json"
    )
  )


## Preview
head(df_domains)

1.5 Subdomains

Code
## Write
local_context$df_subdomains %>%
      jsonlite::toJSON(pretty = T) %>%
      xfun::write_utf8(
        file.path(
          global_context$notebook_relative_root,
          "clean/4-salurbal-hub/0-seeds/subdomains.json"
        )
      )
 
## Preview
head(local_context$df_subdomains)

2. Front pages

2.1 Front page stats

Code
## Operationalize
list_etc = list(
  n_variables = validated_metadata_cube %>%
    count(var_name) %>% 
    collect() %>% 
    nrow(),
  n_countries = length(unique(local_context$xwalk_iso2$country_name)),
  n_cities = length(unique(local_context$xwalk_l1ad$salid1)),
  n_nbhd = length(unique(local_context$xwalk_l2$salid2))
)

## Write
list_etc %>%
  jsonlite::toJSON() %>%
  xfun::write_utf8(
    file.path(
      global_context$notebook_relative_root,
      "clean/4-salurbal-hub/0-seeds/etc.json"
    )
  )

## Preview
list_etc

3. Data Catalog + Data About pages

3.1 variables.json

This is our general-purpose variable-level JSON object. It includes variable-level metadata as well as variable-level summaries of important data-point-level information. Let's first operationalize those summary tables!
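
For orientation, each entry in variables.json combines variable-level metadata with the availability summaries assembled below; a single entry ends up shaped roughly like this (hypothetical values, trimmed to a few fields):

Code
## Illustrative shape of one variables.json entry (hypothetical values)
example_variable = list(
  var_name   = "EXAMPLEVAR",
  var_label  = "Example variable",
  domain     = "Example domain",
  subdomain  = "Example subdomain",
  year_min   = "2010",
  year_max   = "2020",
  public_any = "1"
)
jsonlite::toJSON(list(example_variable), pretty = TRUE, auto_unbox = TRUE)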

Upstream Intermediate Tables

There are two upstream intermediate tables we will need.

Validate Cubes

This is ad-hoc validation of SALURBAL cubes for this notebook. All QC and validation will be migrated over to the cube notebooks after clarifying semantics, documentation, and testing.

Note that some of these summaries are calculated from the data cube, which is a small subset. This is problematic if variables are being dropped, so let's check.

Code
df_qc = validated_metadata_cube %>% 
  select(dataset_instance, schema_version, var_name) %>% 
  mutate(metadata = T) %>% 
  distinct() %>% 
  collect() %>% 
  left_join(
    validated_data_cube %>% 
      select(var_name) %>% 
      mutate(data = T) %>% 
      distinct() %>% 
      collect()
  ) 

## 216 variables have metadata but are not in the data cube
df_qc %>% count(metadata, data)

## Let's look at what these actually are
df_qc %>% 
  filter(is.na(data)) %>% 
  count(dataset_instance, schema_version)

df_pop

This is a general-use L1AD population estimate.

Code
## Clean
df_pop = file.path(
  global_context$path_server_dbt_sources,
  'schema_v1_deprecating',
  'PRJT_v1.0_internal.parquet') %>% 
  read_parquet() %>% 
  filter(geo == 'L1AD',
         strata_id == 'Sex_Overall',
         year == '2020') %>% 
  rename(salid1 = salid) %>% 
  left_join(global_context$xwalk_l1ad) %>% 
  select(salid1, pop = value) %>%
  mutate(pop = as.numeric(pop) %>% round(0))

## Export
df_pop %>% 
  arrow::write_parquet(
    file.path(
      global_context$notebook_relative_root,
      "clean/2-processed/df_pop_l1_2020.parquet"
    )
  )

df_strata

This compiles population strata information for each dataset. The v2 schema should have all of this in the metadata cube, so eventually this will all pull from the SALURBAL metadata cube, but for now we need to work with how v1 schema things are set up:

  • Pull in strata_description from metadata cube
  • Pull in strata details from strata.csv
  • Combine

Let’s get started!

Pull in strata_description from metadata cube

Get strata_desc from metadata cube
## Clean
df_strata_prod_description = validated_metadata_cube %>% 
  select(var_name, strata_id, strata_description,
        dataset_instance, dataset_id, version, dataset_version, schema_version) %>% 
  distinct() %>% 
  collect() %>% 
  verify(columns_must_not_have_NA_NULL_cells(., list(c('strata_id')))) %>% 
  verify(composite_key_uniqueness(., local_context)) %>% 
  verify(columns_must_not_have_EMPTY_cells(., list(c('dataset_instance','version','dataset_id')))) 

## Preview
head(df_strata_prod_description)
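
The verify() predicates used here and below (columns_must_not_have_NA_NULL_cells(), composite_key_uniqueness(), etc.) are sourced in the global setup. As a rough sketch of the contract, assuming assertr-style semantics where the predicate returns a single logical:

Code
## Minimal sketch only -- the production implementations are sourced
## from the global setup, not defined in this notebook.
columns_must_not_have_NA_NULL_cells_sketch = function(df, column_sets) {
  cols = unique(unlist(column_sets))
  all(map_lgl(cols, ~ !any(is.na(df[[.x]]))))
}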

Pull in a subset of strata details from the schema v1 strata.csv files. Note there are more of these than strata IDs in the metadata cube because not all repo files are in production cubes, so we subset for the correct ones. Also note that as we shift towards schema v2, where this information lives in the metadata cube, this section of logic will be deprecated.

Get strata details from strata.csv files
## Clean
df_strata_csv = list.files(
  file.path(
    global_context$notebook_relative_root,
    'datasets'
  ),
  recursive = T,
  pattern = '2-strata.csv',
  full.names = T
) %>% 
  map_df(~{
    
    ## Setup paths
    path_tmp = .x # .x = '../../../../datasets/area-level/UCNT_Longit/v2.0/2-strata.csv'
    path_split = path_tmp %>% str_split('/')
    path_schema_v2_metadata = str_replace(path_tmp, '2-strata.csv', '_loading.parquet')
    
    
    ## Clean
    version_tmp = path_split[[1]][length(path_split[[1]])-1]
    dataset_id_tmp = path_split[[1]][length(path_split[[1]])-2]
    dataset_instance_tmp = glue("{dataset_id_tmp}_{version_tmp}") %>% paste()
    schema_version_tmp = ifelse(file.exists(path_schema_v2_metadata),'v2','v1' )
    
    ## Export
    read_csv(.x, show_col_types  = F) %>% 
      mutate_all(~as.character(.x)) %>% 
      mutate(dataset_instance = dataset_instance_tmp,
             dataset_id = dataset_id_tmp,
             version = version_tmp,
             dataset_version = version_tmp,
             schema_version = schema_version_tmp) 

  }) %>% 
  select(-var_name_raw, -var_name_san, - strata_raw) %>% 
  distinct() %>% 
  filter(!str_detect(var_name,'_LCI|_UCI')) %>% 
  distinct()  %>% 
  mutate(strata_id =  ifelse(is.na(strata_id) | strata_id=='NA', 
                             '', strata_id) %>% 
           str_remove_all("_NA_NA$")) %>% 
  select(dataset_instance, var_name, strata_id, everything()) %>%
  mutate_if(is.character, ~ map_chr(.x, ~ ifelse(is.na(.x), '', .x))) %>% 
  verify(columns_must_not_have_NA_NULL_cells(., list(c('strata_id')))) %>% 
  verify(composite_key_uniqueness(., local_context)) %>% 
  verify(columns_must_not_have_EMPTY_cells(., 
                                           list(c('dataset_instance',
                                                  'version',
                                                  'dataset_id')))) 

## Validation: check that every production strata_id is present here
vec_key_production = df_strata_prod_description %>% 
  mutate(key = paste(var_name, strata_id)%>% 
           str_trim()) %>% 
  pull(key) %>% unique()
vec_key_repo = df_strata_csv %>% 
  mutate(key = paste(var_name, strata_id)%>% 
           str_trim()) %>% 
  pull(key) %>% unique()
pass = all(vec_key_production %in% vec_key_repo)
if (!pass) cli_abort("Not all production strata_id values have associated info here!")


## Preview
head(df_strata_csv)

Okay, looks good. Let's combine!

Code
## Merge
df_strata_stage = df_strata_prod_description %>% 
  left_join(df_strata_csv) %>% 
  verify(full_referential_integrity(., df_strata_prod_description, local_context)) %>% 
  verify(columns_must_not_have_NA_NULL_cells(., list(c('strata_id')))) %>% 
  verify(composite_key_uniqueness(., local_context)) %>% 
  verify(columns_must_not_have_EMPTY_cells(., list(c('dataset_instance','version','dataset_id')))) %>% 
  select(dataset_instance, var_name, strata_description, everything()) %>%  
  arrange(desc(strata_id), var_name, strata_1_name, strata_2_name, 
          strata_1_value, strata_2_value) %>%
  mutate_if(is.character, ~ map_chr(.x, ~ ifelse(is.na(.x), '', .x))) 


## Preview
head(df_strata_stage)

Now that we have the strata details, let's reshape them into a structure that contains summary statistics.

Code
{ ## df_strata ---------------------------------------------------------------
  xwalk_n_strata = df_strata_stage %>%
    select(var_name, strata_1_name, strata_2_name) %>% 
    distinct() %>% 
    group_by(var_name) %>% 
    mutate(n_strata = sum(strata_1_name!='', strata_2_name!='')) %>% 
    ungroup() %>% 
    select(var_name, n_strata)
  df_strata = df_strata_stage %>% 
    left_join(xwalk_n_strata) %>% 
    rowwise() %>% 
    mutate(var_name_nested = ifelse(strata_id == '',
                                    var_name,
                                    paste0(var_name, '_',strata_id ))) %>% 
    ungroup()%>% 
    select(-any_of(c('var_name_san','strata_raw'))) %>% 
    arrange(dataset_instance, var_name, strata_id) %>% 
    select(var_name, var_name_nested, strata_id, everything()) %>% 
    verify(full_referential_integrity(., df_strata_prod_description, local_context)) %>% 
    verify(composite_key_uniqueness(., local_context)) 
  
  ## QC
  regex__invalid_nested_patterns = c('L2') %>% paste(collapse = '|')
  vec__invalid_var_nested =  df_strata %>% 
    filter(str_detect(var_name_nested,regex__invalid_nested_patterns)|is.na(var_name_nested)) %>% 
    pull(var_name_nested)
  if (length(vec__invalid_var_nested) > 0) cli_abort("Invalid patterns detected in var_name_nested!")
  
  
  ## Version control
  df_strata %>% 
    write_json("version_control/df_strata.json", pretty = T)
}

Variable level metadata summaries

Here we generate variable level information

  • From data cube
    • What years are available for this measure
    • What geographies and strata are available for this measure
    • What countries this variable has data for

From metadata cube

Code
{  ## Op. attribute availability  -------------
  
  ## Cleaning
  xwalk_availability_strata = df_strata  %>%
    group_by(var_name) %>%
    group_modify(
      ~  {
        dfa = .x
        attr1 = unique(dfa$strata_1_name)
        attr2 = unique(dfa$strata_2_name)
        
        n_attributes_tmp =  case_when(
          is.na(attr1)  &
            is.na(attr2)  ~  0,
          !is.na(attr1)  &
            is.na(attr2)  ~  1,
          TRUE ~ 2
        )
        attr1_obj = list("name"  =  attr1, "choices" = list(unique(dfa$strata_1_value))[[1]])
        attr2_obj = list("name"  =  attr2, "choices" = list(unique(dfa$strata_2_value))[[1]])
        attr_objs_tmp = NULL
        if (n_attributes_tmp  ==  1)
        {
          attr_objs_tmp = list(attr1_obj)
        }
        if (n_attributes_tmp == 2) {
          attr_objs_tmp = list(attr1_obj, attr2_obj)
        }
        
        tibble(n_attributes = n_attributes_tmp,
               attr_objs  = list(attr_objs_tmp))
      }) %>%
    ungroup() %>% 
    arrange(var_name)
  
  ## Version Control
  xwalk_availability_strata %>% 
    write_json("version_control/xwalk_availability_strata.json", 
               pretty = T)
  
  cli_alert("xwalk_availability_strata done!")
}
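
For intuition, a variable stratified by two attributes would yield an attr_objs entry shaped like this (hypothetical attribute names and choices):

Code
## Hypothetical attr_objs entry for a doubly-stratified variable
example_attr_objs = list(
  list(name = "Age", choices = c("0-18", "19-64", "65+")),
  list(name = "Sex", choices = c("Female", "Male"))
)
jsonlite::toJSON(example_attr_objs, pretty = TRUE, auto_unbox = TRUE)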

{ ## Op. public availability ---------------------------------------------------
  
  ## Cleaning
  xwalk_availability_public = validated_metadata_cube %>%
    group_by(var_name) %>% 
    summarize(public_any = ifelse(any(public == '1'),'1','0')) %>% 
    collect()
  
  ## Version control
  xwalk_availability_public %>% 
    write_json("version_control/xwalk_availability_public.json", 
               pretty = T)
  cli_alert("xwalk_availability_public done!")
}

Now that we have summaries of data-point-level information aggregated to the variable level, we can pull our variable-level metadata and merge those summaries in.

From data cube

Code
{ ## Op. year availability  ------------

  ## Clean
  xwalk_availability_year = validated_data_cube %>% 
    select(var_name, year) %>% 
    distinct() %>% 
    collect() %>%
    group_by(var_name) %>%
    summarize(
      year_min = min(year),
      year_max = max(year),
      n_years = length(unique(year)),
      years = list(sort(unique(year))),
      years_packed = pack_year_string(unique(year))
    ) %>%
    ungroup() %>% 
    arrange(var_name, year_min, year_max)
  
  ## Version Control
  xwalk_availability_year %>% 
    write_json("version_control/xwalk_availability_year.json", 
               pretty = T)

cli_alert("xwalk_availability_year done!")
}
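
pack_year_string() is a sourced helper; assuming its job is to collapse runs of consecutive years into a compact range string, its behavior would be roughly:

Code
## Sketch of the assumed behavior of pack_year_string(), e.g.
## c(2000, 2001, 2002, 2005) -> "2000-2002, 2005"
pack_year_string_sketch = function(years) {
  y = sort(unique(as.integer(years)))
  runs = split(y, cumsum(c(1, diff(y) != 1)))
  paste(
    map_chr(runs, ~ ifelse(length(.x) == 1,
                           as.character(.x[1]),
                           paste0(.x[1], "-", .x[length(.x)]))),
    collapse = ", ")
}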




{ ## Op. geo availability  --------------------
  xwalk_availability_geo =  validated_data_cube %>% 
    count(var_name, geo, strata_id) %>% 
    collect() %>%
    left_join(df_strata) %>%
    count(var_name, geo, strata_1_name, strata_2_name) %>%
    select(-n) %>%
    mutate(grouper = var_name) %>%
    group_by(grouper) %>%
    group_modify(
      ~ tibble(
        var_name = unique(.x$var_name),
        geo_available = list(unique(.x$geo) %>% keep(~.x%in%c("L1AD","L2","L2_5","L3","COUNTRY"))),
        attributes_available =  {
          attr_all = unique(c(.x$strata_1_name, .x$strata_2_name))
          list(attr_all[!is.na(attr_all)])
        }
      )) %>%
    ungroup() %>%
    select(-grouper)
  cli_alert("xwalk_availability_geo done!")
}

{ ## Op. country availability ----------------------
  xwalk_availability_country = validated_data_cube %>% 
    count(var_name, iso2) %>% 
    collect() %>%
    left_join(local_context$xwalk_iso2 %>% select(iso2, country = country_label)) %>%
    count(var_name, country) %>%
    arrange(var_name, country) %>%
    group_by(var_name) %>%
    summarize(countries = paste(country, collapse = ', ')) %>%
    ungroup()
  cli_alert("xwalk_availability_country done!")
}

Variable level object

Code
## Pull variable level metadata
df_variable_level_metadata = validated_metadata_cube %>%
  select(
    observation_type,
    domain,
    subdomain,
    var_name,
    var_label,
    var_def,
    value_type,
    units,
    coding,
    dataset_id,
    dataset_instance,
    version,
    dataset_notes 
  ) %>%
  distinct() %>%
  collect() %>%
  arrange(domain, subdomain, var_label)  %>%  
  verify(valid_non_missing_value(., 
                                 list(c('domain', 'subdomain', 'var_def')), 
                                 local_context)) %>%
  verify(composite_key_uniqueness(., local_context)) 

## Merge in the summaries
df_variables = df_variable_level_metadata %>%
  left_join(xwalk_availability_geo) %>%  ## needed downstream: df_algolia (3.2) selects geo_available
  left_join(xwalk_availability_strata) %>%
  # left_join(xwalk_availability_country) %>%
  left_join(xwalk_availability_year) %>%
  left_join(xwalk_availability_public) %>%  
  verify(composite_key_uniqueness(., local_context)) %>% 
  verify(full_referential_integrity(., df_variable_level_metadata, 
                                    local_context)) %>%  
  verify(valid_non_missing_value(., 
                                 list(c('domain', 'subdomain', 'var_def','public_any')), 
                                 local_context))

## Preview
head(df_variables)

Looks okay. Let's export as JSON.

Code
df_variables   %>%   
  jsonlite::toJSON() %>%
  xfun::write_utf8(
    file.path(
      global_context$notebook_relative_root,
      "clean/4-salurbal-hub/1-variables/variables.json"
    )
  )

cli_alert_success("variables.json generated!")

3.2 algolia-variable-catalog.json

This is the JSON object that we load into Algolia. It is just a subset of variables.json, with some labels recoded for the search filter.

Code
## Operationalize
df_algolia =  df_variables %>%
      mutate(download = ifelse(public_any == '1',
                               'Limited Public Access',
                               'Authorization Required')) %>%
      select(var_name,
             var_def, 
             var_label,
             domain, subdomain,
             geo = geo_available,
             download
      ) %>%
      distinct()

## Preview
head(df_algolia)

## Write
df_algolia %>%
  jsonlite::toJSON(pretty = T) %>%
  xfun::write_utf8(glue("../../../clean/4-salurbal-hub/algolia-variable-catalog.json"))

3.3 Sunburst Plots

4. Variable Specific Pages

These are JSON objects that contain details for the variable-specific pages.

4.1 variable_availability.json

This takes a while as it is a bit more complex, with pivots (expect about 2 minutes). It is a JSON object that contains the availability of each variable by year, country, and attribute.

Code
## Operationalize (legacy)

if (!file.exists('metadata_cache/df_availability_cache.parquet')){
  df_availability = validated_metadata_cube %>%
    select(var_name, iso2, year, public) %>%
    distinct() %>%
    collect() %>% 
    group_split(var_name) %>%
    map_df(~{
      
      var_tmp = unique(.x$var_name)
      cli_alert("Start processing - {var_tmp}")
      
      df_availability_data =  validated_data_cube  %>%
        filter(geo %in% local_context$vec__accepted_geo,
               var_name == var_tmp) %>%
        select(var_name, iso2, year, public) %>%
        distinct() %>% 
        collect()
      
      df_availability_metadata = validated_metadata_cube %>%
        filter(geo %in% local_context$vec__accepted_geo,
               var_name == var_tmp) %>%
        select(var_name, iso2, year, public) %>%
        distinct() %>% 
        collect()
      
      ## Combinations present in metadata but absent from the data cube
      df_availability_metadata_diff = dplyr::setdiff(df_availability_metadata, df_availability_data) %>%
        mutate(public = '9')
      
      ## Wide table: one column per iso2, '0' filled where no records exist
      df_availability_tmp = bind_rows(df_availability_data, df_availability_metadata_diff) %>%
        pivot_wider(names_from = iso2, values_from = public) %>%
        replace(is.na(.), "0") %>%
        arrange(var_name, year)
      
      return(df_availability_tmp)
    })
  
  df_availability %>% arrow::write_parquet("metadata_cache/df_availability_cache.parquet")
  
} else {
  df_availability = arrow::read_parquet("metadata_cache/df_availability_cache.parquet")
}

Let's take a look. Each row is a variable-year, with one column per country; cells carry the public flag, '9' for combinations present in metadata but absent from the data cube, and '0' where no records exist.

Code
df_availability %>% 
  sample_n(10)

Looks okay, let's export!

Code
df_availability %>%
  jsonlite::toJSON(pretty = T,
                   null = 'null',
                   na = 'null') %>%
  write(glue("../../../clean/4-salurbal-hub/1-variables/variable_availability.json"))

4.2 variable_sources.json

Provides information on the data sources underlying each variable.

Code
## Operationalize
df_sources = arrow::read_parquet(glue("../../../clean/2-processed/df_sources.parquet"))
df_sources_out = df_sources %>%
  unpack_year() %>%
  pack_year() %>%
  pack_string_column('iso2') %>%
  distinct() 

## Export
df_sources_out %>%
  jsonlite::toJSON(pretty = T) %>%
  xfun::write_utf8(glue("../../../clean/4-salurbal-hub/1-variables/variable_sources.json"))

## Preview
head(sample_n(df_sources_out,10))
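
unpack_year(), pack_year(), and pack_string_column() are sourced helpers as well. A guess at pack_string_column()'s intent, assuming it collapses a column's distinct values into one comma-separated string per remaining key (an assumption, not the production code):

Code
## Hypothetical sketch of pack_string_column(); assumed behavior only
pack_string_column_sketch = function(df, col) {
  df %>%
    group_by(across(-all_of(col))) %>%
    summarize(
      "{col}" := paste(sort(unique(.data[[col]])), collapse = ", "),
      .groups = "drop")
}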

4.3 variables_attr.json

Details on variable-level population strata availability.

Code
variables_attr = df_strata %>%
  arrange(var_name, var_name_nested)

variables_attr %>%
  jsonlite::toJSON(pretty = T) %>%
  xfun::write_utf8(glue("../../../clean/4-salurbal-hub/1-variables/variables_attr.json"))

head(variables_attr)

4.4 l1ad_preview_data.json

This is a public subset of the data for preview.

Code
## Operationalize
df_l1_data_preview = validated_data_cube %>% 
  ## Subset
  filter(geo == 'L1AD', public == '1') %>% 
  select(var_name,
         var_label,
         iso2,
         salid1 = salid,
         year,
         value,
         value_type )  %>% 
  collect() %>% 
  ## Subset by variable
  group_by(var_name) %>% 
  slice(1:30) %>% 
  ungroup() %>% 
  ## Merge in metadata
  left_join(df_variables)  %>% 
  left_join(local_context$xwalk_l1ad  %>% select(salid1, salid1_label= l1_label))  %>% 
  ## Append population
  left_join(df_pop)  


df_l1_data_preview %>%
  jsonlite::toJSON() %>%
  xfun::write_utf8("../../../clean/4-salurbal-hub/1-variables/l1ad_preview_data.json")

5. Check out page

The checkout page requires just a simple count of n_strata (n_attr).

Code
xwalk_attr_form = df_variables %>% 
  select(var_name, var_label) %>% 
  left_join(df_strata %>% select(var_name, n_attr  = n_strata) %>% distinct())

xwalk_attr_form %>%
  jsonlite::toJSON() %>% 
  xfun::write_utf8("../../../clean/4-salurbal-hub/99-checkout/xwalk_attr_form.json")