SALURBAL Portal Marts v1.1

Authors

Ran Li (Maintainer)

Jessica Uruchima

Published

October 5, 2024

This is the legacy workflow for operationalizing JSON mart items for the portal. It pulls from two upstream notebooks: the cube notebooks that produce the validated metadata cube and the validated data cube opened in the setup below.

There are a few sections to this notebook:

  • 0. Setup
  • 1. Global Items
  • 2. Front pages
  • 3. Data Catalog + Data About pages
  • 4. Variable Specific Pages
  • 5. Check out page

0. Setup

Setup
{ ## 'Global Setup (For any notebook)'
  
  #### Configure relative path (Change!)
  path_global_setup_function = '../../../../R/setup/global_setup.R'
  
  #### Generic global setup code (Do not change!)
  source(path_global_setup_function)
  setup = global_setup(path_global_setup_function)
  invisible(list2env(setup$sourced_objects, envir = .GlobalEnv))
  global_context = setup$global_context
}

{ ## Local Setup (For any notebook)
  
  local_context_object = lst(
    cache_folder = file.path(
      global_context$path_server_etl_cache, 
      '_salurbal_cube_v1'),
    cache_path_validated_metadata_cube = file.path(
      cache_folder, 'validated_metadata_cube.parquet' ),
    cache_path_validated_data_cube = file.path(
      cache_folder, 'validated_data_cube.parquet' ),
    path_cache_utility_1 = file.path(
      global_context$path_server_etl_cache, 
      '_salurbal_portal_mart_v1',
      'utility_1.parquet'),
    path_cache_utility_2 = file.path(
      global_context$path_server_etl_cache, 
      '_salurbal_portal_mart_v1',
      'utility_2.parquet')
  )
  
  ## Local Objects
  local_context = c(local_context_object, global_context)
  validated_metadata_cube = local_context$cache_path_validated_metadata_cube %>%
    arrow::open_dataset()
  validated_data_cube = local_context$cache_path_validated_data_cube %>% 
    arrow::open_dataset()

}
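
Note that validated_metadata_cube and validated_data_cube are lazy Arrow datasets: dplyr verbs on them only build a query plan, and nothing is read from disk until collect() is called. A minimal illustration of the pattern used throughout this notebook:

Code
## Lazy Arrow dataset: the count() is only planned here...
query = validated_metadata_cube %>% count(domain)

## ...and executed (read from the parquet files) on collect()
query %>% collect()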

1. Global Items

1.1 iso2

Code
## Operationalize
df_iso2 = local_context$xwalk_iso2 %>%
  rename(country = country_label, country_url = country_name) %>% 
  arrange(salid0)

## Write
df_iso2 %>%
  jsonlite::toJSON() %>%
  xfun::write_utf8(
    file.path(
      global_context$notebook_relative_root,
      "clean/4-salurbal-hub/0-seeds/iso2.json"
    )
  )

## Preview
head(df_iso2)

1.2 centroids

Code
## Operationalize
df_centroids = read_parquet(local_context$l1ad_centroids_df_path) %>%
  select(country = country_label,
         salid1 = id1,
         salid1_name = l1_name,
         salid1_label = l1_label,
         lng = long,
         lat,
         iso2) %>% 
  arrange(salid1)

## Write
df_centroids %>%
  jsonlite::toJSON() %>%
  xfun::write_utf8(
    file.path(
      global_context$notebook_relative_root,
      "clean/4-salurbal-hub/0-seeds/centroids.json"
    )
  )

## Preview
head(df_centroids)

1.3 City metadata

Code
## Operationalize
xwalk_l2_count  = local_context$xwalk_l2 %>%
  count(psalid1) %>%
  select( salid1 = psalid1,
          n_l2 = n) %>% 
  arrange(salid1)

## Write
xwalk_l2_count %>%
  jsonlite::toJSON() %>%
  xfun::write_utf8(
    file.path(
      global_context$notebook_relative_root,
      "clean/4-salurbal-hub/0-seeds/l1ad_metadata.json"
    )
  )

## Preview
head(xwalk_l2_count)

1.4 Domains

Code
## Operationalize
xwalk_domain_icon = read_excel(
  file.path(
    global_context$notebook_relative_root,
    "code/domains/domains_subdomains_icons.xlsx"
  ),
  sheet = 'domains')  %>%
  mutate_all(~str_squish(.x))
df_domains = validated_metadata_cube %>%
  count(domain,var_name) %>% 
  count(domain, name = 'n_vars') %>%
  collect() %>% 
  mutate(url = str_to_lower(domain) %>% 
           str_replace_all(" ", "-") %>% 
           str_remove_all(',')) %>%
  left_join(xwalk_domain_icon) %>%
  left_join(local_context$df_domains ) %>%
  rename(name = domain) %>% 
  arrange(name)

## Export
df_domains %>%
  jsonlite::toJSON(pretty = T) %>%
  xfun::write_utf8(
    file.path(
      global_context$notebook_relative_root,
      "clean/4-salurbal-hub/0-seeds/domains.json"
    )
  )


## Preview
head(df_domains)

1.5 Subdomains

Code
## Write
local_context$df_subdomains %>%
      jsonlite::toJSON(pretty = T) %>%
      xfun::write_utf8(
        file.path(
          global_context$notebook_relative_root,
          "clean/4-salurbal-hub/0-seeds/subdomains.json"
        )
      )
 
## Preview
head(local_context$df_subdomains)

2. Front pages

2.1 Front page stats

Code
## Operationalize
list_etc = list(
  n_variables = validated_metadata_cube %>%
    count(var_name) %>% 
    collect() %>% 
    nrow(),
  n_countries = length(unique(local_context$xwalk_iso2$country_name)),
  n_cities = length(unique(local_context$xwalk_l1ad$salid1)),
  n_nbhd = length(unique(local_context$xwalk_l2$salid2))
)

## Write
list_etc %>%
  jsonlite::toJSON() %>%
  xfun::write_utf8(
    file.path(
      global_context$notebook_relative_root,
      "clean/4-salurbal-hub/0-seeds/etc.json"
    )
  )

## Preview
list_etc

3. Data Catalog + Data About pages

3.1 variables.json

This is our general-purpose variable-level JSON object. It includes variable-level metadata as well as variable-level summaries of important data-point-level information. Let's first operationalize those summary tables!
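
For orientation, each entry in variables.json combines variable-level metadata with the availability summaries assembled below; a single entry ends up shaped roughly like this (hypothetical values, trimmed to a few fields):

Code
## Illustrative shape of one variables.json entry (hypothetical values)
example_variable = list(
  var_name   = "EXAMPLEVAR",
  var_label  = "Example variable",
  domain     = "Example domain",
  subdomain  = "Example subdomain",
  year_min   = "2010",
  year_max   = "2020",
  public_any = "1"
)
jsonlite::toJSON(list(example_variable), pretty = TRUE, auto_unbox = TRUE)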

Upstream Intermediate Tables

There are two upstream intermediate tables we will need.

Validate Cubes

This is ad-hoc validation of SALURBAL cubes for this notebook. All QC and validation will be migrated over to the cube notebooks after clarifying semantics, documentation, and testing.

Note that some of these summaries are calculated from the data cube, which is a small subset. This is problematic if variables are being dropped, so let's check.

Code
df_qc = validated_metadata_cube %>% 
  select(dataset_instance, schema_version, var_name) %>% 
  mutate(metadata = T) %>% 
  distinct() %>% 
  collect() %>% 
  left_join(
    validated_data_cube %>% 
      select(var_name) %>% 
      mutate(data = T) %>% 
      distinct() %>% 
      collect()
  ) 

## 216 variables have metadata but are not in the data cube
df_qc %>% count(metadata, data)

## Let's look at what these actually are
df_qc %>% 
  filter(is.na(data)) %>% 
  count(dataset_instance, schema_version)

df_pop

This is a general-use L1AD population estimate.

Code
## Clean
df_pop = file.path(
  global_context$path_server_dbt_sources,
  'schema_v1_deprecating',
  'PRJT_v1.0_internal.parquet') %>% 
  read_parquet() %>% 
  filter(geo == 'L1AD',
         strata_id == 'Sex_Overall',
         year == '2020') %>% 
  rename(salid1 = salid) %>% 
  left_join(global_context$xwalk_l1ad) %>% 
  select(salid1, pop = value) %>%
  mutate(pop = as.numeric(pop) %>% round(0))

## Export
df_pop %>% 
  arrow::write_parquet(
    file.path(
      global_context$notebook_relative_root,
      "clean/2-processed/df_pop_l1_2020.parquet"
    )
  )

df_strata

This compiles population strata information for each dataset. The v2 schema should have all of this in the metadata cube, so eventually this will all pull from the SALURBAL metadata cube, but for now we need to work with how v1 schema things are set up:

  • Pull in strata_description from metadata cube
  • Pull in strata details from strata.csv
  • Combine

Let’s get started!

Pull in strata_description from metadata cube

Get strata_desc from metadata cube
## Clean
df_strata_prod_description = validated_metadata_cube %>% 
  select(var_name, strata_id, strata_description,
        dataset_instance, dataset_id, version, dataset_version, schema_version) %>% 
  distinct() %>% 
  collect() %>% 
  verify(columns_must_not_have_NA_NULL_cells(., list(c('strata_id')))) %>% 
  verify(composite_key_uniqueness(., local_context)) %>% 
  verify(columns_must_not_have_EMPTY_cells(., list(c('dataset_instance','version','dataset_id')))) 

## Preview
head(df_strata_prod_description)
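
The verify() predicates used here and below (columns_must_not_have_NA_NULL_cells(), composite_key_uniqueness(), etc.) are sourced in the global setup. As a rough sketch of the contract, assuming assertr-style semantics where the predicate returns a single logical:

Code
## Minimal sketch only -- the production implementations are sourced
## from the global setup, not defined in this notebook.
columns_must_not_have_NA_NULL_cells_sketch = function(df, column_sets) {
  cols = unique(unlist(column_sets))
  all(map_lgl(cols, ~ !any(is.na(df[[.x]]))))
}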

Pull in a subset of strata details from the schema v1 strata.csv files. Note there are more of these than strata IDs in the metadata cube because not all repo files are in production cubes, so we subset for the correct ones. Also note that as we shift towards schema v2, where this information lives in the metadata cube, this section of logic will be deprecated.

Get strata details from strata.csv files
## Clean
df_strata_csv = list.files(
  file.path(
    global_context$notebook_relative_root,
    'datasets'
  ),
  recursive = T,
  pattern = '2-strata.csv',
  full.names = T
) %>% 
  map_df(~{
    
    ## Setup paths
    path_tmp = .x # .x = '../../../../datasets/area-level/UCNT_Longit/v2.0/2-strata.csv'
    path_split = path_tmp %>% str_split('/')
    path_schema_v2_metadata = str_replace(path_tmp, '2-strata.csv', '_loading.parquet')
    
    
    ## Clean
    version_tmp = path_split[[1]][length(path_split[[1]])-1]
    dataset_id_tmp = path_split[[1]][length(path_split[[1]])-2]
    dataset_instance_tmp = glue("{dataset_id_tmp}_{version_tmp}") %>% paste()
    schema_version_tmp = ifelse(file.exists(path_schema_v2_metadata),'v2','v1' )
    
    ## Export
    read_csv(.x, show_col_types  = F) %>% 
      mutate_all(~as.character(.x)) %>% 
      mutate(dataset_instance = dataset_instance_tmp,
             dataset_id = dataset_id_tmp,
             version = version_tmp,
             dataset_version = version_tmp,
             schema_version = schema_version_tmp) 

  }) %>% 
  select(-var_name_raw, -var_name_san, - strata_raw) %>% 
  distinct() %>% 
  filter(!str_detect(var_name,'_LCI|_UCI')) %>% 
  distinct()  %>% 
  mutate(strata_id =  ifelse(is.na(strata_id) | strata_id=='NA', 
                             '', strata_id) %>% 
           str_remove_all("_NA_NA$")) %>% 
  select(dataset_instance, var_name, strata_id, everything()) %>%
  mutate_if(is.character, ~ map_chr(.x, ~ ifelse(is.na(.x), '', .x))) %>% 
  verify(columns_must_not_have_NA_NULL_cells(., list(c('strata_id')))) %>% 
  verify(composite_key_uniqueness(., local_context)) %>% 
  verify(columns_must_not_have_EMPTY_cells(., 
                                           list(c('dataset_instance',
                                                  'version',
                                                  'dataset_id')))) 

## Validation: check that every production strata_id is present here
vec_key_production = df_strata_prod_description %>% 
  mutate(key = paste(var_name, strata_id)%>% 
           str_trim()) %>% 
  pull(key) %>% unique()
vec_key_repo = df_strata_csv %>% 
  mutate(key = paste(var_name, strata_id)%>% 
           str_trim()) %>% 
  pull(key) %>% unique()
pass = all(vec_key_production %in% vec_key_repo)
if (!pass) cli_abort("Not all production strata_id values have associated info here!")


## Preview
head(df_strata_csv)

Okay, looks good. Let's combine!

Code
## Merge
df_strata_stage = df_strata_prod_description %>% 
  left_join(df_strata_csv) %>% 
  verify(full_referential_integrity(., df_strata_prod_description, local_context)) %>% 
  verify(columns_must_not_have_NA_NULL_cells(., list(c('strata_id')))) %>% 
  verify(composite_key_uniqueness(., local_context)) %>% 
  verify(columns_must_not_have_EMPTY_cells(., list(c('dataset_instance','version','dataset_id')))) %>% 
  select(dataset_instance, var_name, strata_description, everything()) %>%  
  arrange(desc(strata_id), var_name, strata_1_name, strata_2_name, 
          strata_1_value, strata_2_value) %>%
  mutate_if(is.character, ~ map_chr(.x, ~ ifelse(is.na(.x), '', .x))) 


## Preview
head(df_strata_stage)

Now that we have the strata details, let's reshape them into a structure that contains summary statistics.

Code
{ ## df_strata ---------------------------------------------------------------
  xwalk_n_strata = df_strata_stage %>%
    select(var_name, strata_1_name, strata_2_name) %>% 
    distinct() %>% 
    group_by(var_name) %>% 
    mutate(n_strata = sum(strata_1_name!='', strata_2_name!='')) %>% 
    ungroup() %>% 
    select(var_name, n_strata)
  df_strata = df_strata_stage %>% 
    left_join(xwalk_n_strata) %>% 
    rowwise() %>% 
    mutate(var_name_nested = ifelse(strata_id == '',
                                    var_name,
                                    paste0(var_name, '_',strata_id ))) %>% 
    ungroup()%>% 
    select(-any_of(c('var_name_san','strata_raw'))) %>% 
    arrange(dataset_instance, var_name, strata_id) %>% 
    select(var_name, var_name_nested, strata_id, everything()) %>% 
    verify(full_referential_integrity(., df_strata_prod_description, local_context)) %>% 
    verify(composite_key_uniqueness(., local_context)) 
  
  ## QC
  regex__invalid_nested_patterns = c('L2') %>% paste(collapse = '|')
  vec__invalid_var_nested =  df_strata %>% 
    filter(str_detect(var_name_nested,regex__invalid_nested_patterns)|is.na(var_name_nested)) %>% 
    pull(var_name_nested)
  if (length(vec__invalid_var_nested) > 0) cli_abort("Invalid patterns detected in var_name_nested!")
  
  
  ## Version control
  df_strata %>% 
    write_json("version_control/df_strata.json", pretty = T)
}

Variable level metadata summaries

Here we generate variable level information

  • From data cube
    • What years are available for this measure
    • What geographies and strata are available for this measure
    • What countries this variable has data for

From metadata cube

Code
{  ## Op. attribute availability  -------------
  
  ## Cleaning
  xwalk_availability_strata = df_strata  %>%
    group_by(var_name) %>%
    group_modify(
      ~  {
        dfa = .x
        attr1 = unique(dfa$strata_1_name)
        attr2 = unique(dfa$strata_2_name)
        
        n_attributes_tmp =  case_when(
          is.na(attr1)  &
            is.na(attr2)  ~  0,
          !is.na(attr1)  &
            is.na(attr2)  ~  1,
          TRUE ~ 2
        )
        attr1_obj = list("name"  =  attr1, "choices" = list(unique(dfa$strata_1_value))[[1]])
        attr2_obj = list("name"  =  attr2, "choices" = list(unique(dfa$strata_2_value))[[1]])
        attr_objs_tmp = NULL
        if (n_attributes_tmp  ==  1)
        {
          attr_objs_tmp = list(attr1_obj)
        }
        if (n_attributes_tmp == 2) {
          attr_objs_tmp = list(attr1_obj, attr2_obj)
        }
        
        tibble(n_attributes = n_attributes_tmp,
               attr_objs  = list(attr_objs_tmp))
      }) %>%
    ungroup() %>% 
    arrange(var_name)
  
  ## Version Control
  xwalk_availability_strata %>% 
    write_json("version_control/xwalk_availability_strata.json", 
               pretty = T)
  
  cli_alert("xwalk_availability_strata done!")
}
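
For intuition, a variable stratified by two attributes would yield an attr_objs entry shaped like this (hypothetical attribute names and choices):

Code
## Hypothetical attr_objs entry for a doubly-stratified variable
example_attr_objs = list(
  list(name = "Age", choices = c("0-18", "19-64", "65+")),
  list(name = "Sex", choices = c("Female", "Male"))
)
jsonlite::toJSON(example_attr_objs, pretty = TRUE, auto_unbox = TRUE)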

{ ## Op. public availability ---------------------------------------------------
  
  ## Cleaning
  xwalk_availability_public = validated_metadata_cube %>%
    group_by(var_name) %>% 
    summarize(public_any = ifelse(any(public == '1'),'1','0')) %>% 
    collect()
  
  ## Version control
  xwalk_availability_public %>% 
    write_json("version_control/xwalk_availability_public.json", 
               pretty = T)
  cli_alert("xwalk_availability_public done!")
}

Now that we have summaries of data-point-level information aggregated to the variable level, we can pull our variable-level metadata and merge those summaries in.

From data cube

Code
{ ## Op. year availability  ------------

  ## Clean
  xwalk_availability_year = validated_data_cube %>% 
    select(var_name, year) %>% 
    distinct() %>% 
    collect() %>%
    group_by(var_name) %>%
    summarize(
      year_min = min(year),
      year_max = max(year),
      n_years = length(unique(year)),
      years = list(sort(unique(year))),
      years_packed = pack_year_string(unique(year))
    ) %>%
    ungroup() %>% 
    arrange(var_name, year_min, year_max)
  
  ## Version Control
  xwalk_availability_year %>% 
    write_json("version_control/xwalk_availability_year.json", 
               pretty = T)

cli_alert("xwalk_availability_year done!")
}
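
pack_year_string() is a sourced helper; assuming its job is to collapse runs of consecutive years into a compact range string, its behavior would be roughly:

Code
## Sketch of the assumed behavior of pack_year_string(), e.g.
## c(2000, 2001, 2002, 2005) -> "2000-2002, 2005"
pack_year_string_sketch = function(years) {
  y = sort(unique(as.integer(years)))
  runs = split(y, cumsum(c(1, diff(y) != 1)))
  paste(
    map_chr(runs, ~ ifelse(length(.x) == 1,
                           as.character(.x[1]),
                           paste0(.x[1], "-", .x[length(.x)]))),
    collapse = ", ")
}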




{ ## Op. geo availability  --------------------
  xwalk_availability_geo =  validated_data_cube %>% 
    count(var_name, geo, strata_id) %>% 
    collect() %>%
    left_join(df_strata) %>%
    count(var_name, geo, strata_1_name, strata_2_name) %>%
    select(-n) %>%
    mutate(grouper = var_name) %>%
    group_by(grouper) %>%
    group_modify(
      ~ tibble(
        var_name = unique(.x$var_name),
        geo_available = list(unique(.x$geo) %>% keep(~.x%in%c("L1AD","L2","L2_5","L3","COUNTRY"))),
        attributes_available =  {
          attr_all = unique(c(.x$strata_1_name, .x$strata_2_name))
          list(attr_all[!is.na(attr_all)])
        }
      )) %>%
    ungroup() %>%
    select(-grouper)
  cli_alert("xwalk_availability_geo done!")
}

{ ## Op. country availability ----------------------
  xwalk_availability_country = validated_data_cube %>% 
    count(var_name, iso2) %>% 
    collect() %>%
    left_join(local_context$xwalk_iso2 %>% select(iso2, country = country_label)) %>%
    count(var_name, country) %>%
    arrange(var_name, country) %>%
    group_by(var_name) %>%
    summarize(countries = paste(country, collapse = ', ')) %>%
    ungroup()
  cli_alert("xwalk_availability_country done!")
}

Variable level object

Code
## Pull variable level metadata
df_variable_level_metadata = validated_metadata_cube %>%
  select(
    observation_type,
    domain,
    subdomain,
    var_name,
    var_label,
    var_def,
    value_type,
    units,
    coding,
    dataset_id,
    dataset_instance,
    version,
    dataset_notes 
  ) %>%
  distinct() %>%
  collect() %>%
  arrange(domain, subdomain, var_label)  %>%  
  verify(valid_non_missing_value(., 
                                 list(c('domain', 'subdomain', 'var_def')), 
                                 local_context)) %>%
  verify(composite_key_uniqueness(., local_context)) 

## Merge in the summaries
df_variables = df_variable_level_metadata %>%
  left_join(xwalk_availability_geo) %>%  ## needed downstream: df_algolia (3.2) selects geo_available
  left_join(xwalk_availability_strata) %>%
  # left_join(xwalk_availability_country) %>%
  left_join(xwalk_availability_year) %>%
  left_join(xwalk_availability_public) %>%  
  verify(composite_key_uniqueness(., local_context)) %>% 
  verify(full_referential_integrity(., df_variable_level_metadata, 
                                    local_context)) %>%  
  verify(valid_non_missing_value(., 
                                 list(c('domain', 'subdomain', 'var_def','public_any')), 
                                 local_context))

## Preview
head(df_variables)

Looks okay. Let's export as JSON.

Code
df_variables   %>%   
  jsonlite::toJSON() %>%
  xfun::write_utf8(
    file.path(
      global_context$notebook_relative_root,
      "clean/4-salurbal-hub/1-variables/variables.json"
    )
  )

cli_alert_success("variables.json generated!")

3.2 algolia-variable-catalog.json

This is the JSON object that we load into Algolia. It is just a subset of variables.json, with some labels recoded for the search filter.

Code
## Operationalize
df_algolia =  df_variables %>%
      mutate(download = ifelse(public_any == '1',
                               'Limited Public Access',
                               'Authorization Required')) %>%
      select(var_name,
             var_def, 
             var_label,
             domain, subdomain,
             geo = geo_available,
             download
      ) %>%
      distinct()

## Preview
head(df_algolia)

## Write
df_algolia %>%
  jsonlite::toJSON(pretty = T) %>%
  xfun::write_utf8(glue("../../../clean/4-salurbal-hub/algolia-variable-catalog.json"))

3.3 Sunburst Plots

4. Variable Specific Pages

These are JSON objects that contain details for the variable-specific pages.

4.1 variable_availability.json

This takes a while as it is a bit more complex, with pivots (expect about 2 minutes). It is a JSON object that contains the availability of each variable by year, country, and attribute.

Code
## Operationalize (legacy)

if (!file.exists('metadata_cache/df_availability_cache.parquet')){
  df_availability = validated_metadata_cube %>%
    select(var_name, iso2, year, public) %>%
    distinct() %>%
    collect() %>% 
    group_split(var_name) %>%
    map_df(~{
      
      var_tmp = unique(.x$var_name)
      cli_alert("Start processing - {var_tmp}")
      
      df_availability_data =  validated_data_cube  %>%
        filter(geo %in% local_context$vec__accepted_geo,
               var_name == var_tmp) %>%
        select(var_name, iso2, year, public) %>%
        distinct() %>% 
        collect()
      
      df_availability_metadata = validated_metadata_cube %>%
        filter(geo %in% local_context$vec__accepted_geo,
               var_name == var_tmp) %>%
        select(var_name, iso2, year, public) %>%
        distinct() %>% 
        collect()
      
      ## Combinations present in metadata but absent from the data cube
      df_availability_metadata_diff = dplyr::setdiff(df_availability_metadata, df_availability_data) %>%
        mutate(public = '9')
      
      ## Wide table: one column per iso2, '0' filled where no records exist
      df_availability_tmp = bind_rows(df_availability_data, df_availability_metadata_diff) %>%
        pivot_wider(names_from = iso2, values_from = public) %>%
        replace(is.na(.), "0") %>%
        arrange(var_name, year)
      
      return(df_availability_tmp)
    })
  
  df_availability %>% arrow::write_parquet("metadata_cache/df_availability_cache.parquet")
  
} else {
  df_availability = arrow::read_parquet("metadata_cache/df_availability_cache.parquet")
}

Let's take a look. Each row is a variable-year, with one column per country; cells carry the public flag, '9' for combinations present in metadata but absent from the data cube, and '0' where no records exist.

Code
df_availability %>% 
  sample_n(10)

Looks okay, let's export!

Code
df_availability %>%
  jsonlite::toJSON(pretty = T,
                   null = 'null',
                   na = 'null') %>%
  write(glue("../../../clean/4-salurbal-hub/1-variables/variable_availability.json"))

4.2 variable_sources.json

Provides information on the data sources underlying each variable.

Code
## Operationalize
df_sources = arrow::read_parquet(glue("../../../clean/2-processed/df_sources.parquet"))
df_sources_out = df_sources %>%
  unpack_year() %>%
  pack_year() %>%
  pack_string_column('iso2') %>%
  distinct() 

## Export
df_sources_out %>%
  jsonlite::toJSON(pretty = T) %>%
  xfun::write_utf8(glue("../../../clean/4-salurbal-hub/1-variables/variable_sources.json"))

## Preview
head(sample_n(df_sources_out,10))
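
unpack_year(), pack_year(), and pack_string_column() are sourced helpers as well. A guess at pack_string_column()'s intent, assuming it collapses a column's distinct values into one comma-separated string per remaining key (an assumption, not the production code):

Code
## Hypothetical sketch of pack_string_column(); assumed behavior only
pack_string_column_sketch = function(df, col) {
  df %>%
    group_by(across(-all_of(col))) %>%
    summarize(
      "{col}" := paste(sort(unique(.data[[col]])), collapse = ", "),
      .groups = "drop")
}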

4.3 variables_attr.json

Details on variable-level population strata availability.

Code
variables_attr = df_strata %>%
  arrange(var_name, var_name_nested)

variables_attr %>%
  jsonlite::toJSON(pretty = T) %>%
  xfun::write_utf8(glue("../../../clean/4-salurbal-hub/1-variables/variables_attr.json"))

head(variables_attr)

4.4 l1ad_preview_data.json

This is a public subset of the data for preview.

Code
## Operationalize
df_l1_data_preview = validated_data_cube %>% 
  ## Subset
  filter(geo == 'L1AD', public == '1') %>% 
  select(var_name,
         var_label,
         iso2,
         salid1 = salid,
         year,
         value,
         value_type )  %>% 
  collect() %>% 
  ## Subset by variable
  group_by(var_name) %>% 
  slice(1:30) %>% 
  ungroup() %>% 
  ## Merge in metadata
  left_join(df_variables)  %>% 
  left_join(local_context$xwalk_l1ad  %>% select(salid1, salid1_label= l1_label))  %>% 
  ## Append population
  left_join(df_pop)  


df_l1_data_preview %>%
  jsonlite::toJSON() %>%
  xfun::write_utf8("../../../clean/4-salurbal-hub/1-variables/l1ad_preview_data.json")

5. Check out page

The checkout page requires just a simple count of n_strata (n_attr).

Code
xwalk_attr_form = df_variables %>% 
  select(var_name, var_label) %>% 
  left_join(df_strata %>% select(var_name, n_attr  = n_strata) %>% distinct())

xwalk_attr_form %>%
  jsonlite::toJSON() %>% 
  xfun::write_utf8("../../../clean/4-salurbal-hub/99-checkout/xwalk_attr_form.json")