---
title: "SALURBAL Portal Marts v1.1"
author:
  - name: Ran Li (Maintainer)
    orcid: 0000-0002-4699-4755
  - name: Jessica Uruchima
date: last-modified
format:
  html:
    page-layout: full
    toc: true
    self-contained: true
    code-fold: true
    df-print: kable
    code-tools: true
comments:
  hypothesis:
    theme: clean
editor: source
execute:
  warning: false
  message: false
  eval: false
editor_options:
  chunk_output_type: console
---

This notebook generates the data marts consumed by the SALURBAL portal. It sits
downstream of:

-   salurbal-cubes.qmd - which generates project-wide cubes
-   salurbal-metadata-mart.qmd - which generates some intermediate tables that summarize metadata

Outputs fall into two groups:

-   Global Items: generate data that is used across the portal
-   Data Catalog: generate data that is used just for the data catalog and Algolia

# 0. Setup

```{r}
#| code-summary: 'Setup'
#| code-fold: true

## Global Setup (For any notebook)

#### Configure relative path (Change!)
path_global_setup_function <- '../../../../R/setup/global_setup.R'

#### Generic global setup code (Do not change!)
source(path_global_setup_function)
setup <- global_setup(path_global_setup_function)
invisible(list2env(setup$sourced_objects, envir = .GlobalEnv))
global_context <- setup$global_context

## Local Setup (For this notebook)
# NOTE(review): assignment targets were stripped in the corrupted source;
# names below are reconstructed from downstream usage — confirm against VCS.
local_context_object <- lst(
  cache_folder = file.path(global_context$path_server_etl_cache, '_salurbal_cube_v1'),
  cache_path_validated_metadata_cube = file.path(cache_folder, 'validated_metadata_cube.parquet'),
  cache_path_validated_data_cube = file.path(cache_folder, 'validated_data_cube.parquet'),
  path_cache_utility_1 = file.path(global_context$path_server_etl_cache, '_salurbal_portal_mart_v1', 'utility_1.parquet'),
  path_cache_utility_2 = file.path(global_context$path_server_etl_cache, '_salurbal_portal_mart_v1', 'utility_2.parquet')
)

## Local Objects
local_context <- c(local_context_object, global_context)

# Lazy Arrow datasets: queries are pushed down and only materialized at collect().
validated_metadata_cube <- local_context$cache_path_validated_metadata_cube %>%
  arrow::open_dataset()
validated_data_cube <- local_context$cache_path_validated_data_cube %>%
  arrow::open_dataset()
```

# 1. Global Items

## 1.1 iso2

```{r}
## Operationalize
df_iso2 <- local_context$xwalk_iso2 %>%
  rename(country = country_label, country_url = country_name) %>%
  arrange(salid0)

## Write
df_iso2 %>%
  jsonlite::toJSON() %>%
  xfun::write_utf8(file.path(
    local_context$notebook_relative_root,
    "clean/4-salurbal-hub/0-seeds/iso2.json"
  ))

## Preview
head(df_iso2)
```

## 1.2 centroids

```{r}
## Operationalize
# NOTE(review): the latitude column was stripped in the corrupted source;
# `lat` is assumed as the companion of `lng = long` — confirm against VCS.
df_centroids <- read_parquet(local_context$l1ad_centroids_df_path) %>%
  select(
    country = country_label,
    salid1 = id1,
    salid1_name = l1_name,
    salid1_label = l1_label,
    lng = long,
    lat
  ) %>%
  arrange(salid1)

## Write
df_centroids %>%
  jsonlite::toJSON() %>%
  xfun::write_utf8(file.path(
    local_context$notebook_relative_root,
    "clean/4-salurbal-hub/0-seeds/centroids.json"
  ))

## Preview
head(df_centroids)
```

## 1.3 City metadata

```{r}
## Operationalize
# Count of L2 units nested in each L1AD city.
xwalk_l2_count <- local_context$xwalk_l2 %>%
  count(psalid1) %>%
  select(salid1 = psalid1, n_l2 = n) %>%
  arrange(salid1)

## Write
xwalk_l2_count %>%
  jsonlite::toJSON() %>%
  xfun::write_utf8(file.path(
    local_context$notebook_relative_root,
    "clean/4-salurbal-hub/0-seeds/l1ad_metadata.json"
  ))

## Preview
head(xwalk_l2_count)
```

## 1.4 Domains

```{r}
## Operationalize
xwalk_domain_icon <- read_excel(
  file.path(
    local_context$notebook_relative_root,
    "code/domains/domains_subdomains_icons.xlsx"
  ),
  sheet = 'domains'
) %>%
  mutate_all(~ str_squish(.x))

df_domains <- validated_metadata_cube %>%
  count(domain, var_name) %>%
  count(domain, name = 'n_vars') %>%
  collect() %>%
  # Slugify the domain label for portal URLs.
  mutate(
    url = str_to_lower(domain) %>%
      str_replace_all(" ", "-") %>%
      str_remove_all(',')
  ) %>%
  left_join(xwalk_domain_icon) %>%
  left_join(local_context$df_domains) %>%
  rename(name = domain) %>%
  arrange(name)

## Export
df_domains %>%
  jsonlite::toJSON(pretty = T) %>%
  xfun::write_utf8(file.path(
    local_context$notebook_relative_root,
    "clean/4-salurbal-hub/0-seeds/domains.json"
  ))

## Preview
head(df_domains)
```

## 1.5 Subdomains

```{r}
## Write
local_context$df_subdomains %>%
  jsonlite::toJSON(pretty = T) %>%
  xfun::write_utf8(file.path(
    local_context$notebook_relative_root,
    "clean/4-salurbal-hub/0-seeds/subdomains.json"
  ))

## Preview
head(local_context$df_subdomains)
```
# 2. Front Pages

## 2.1 Front page stats

```{r}
## Operationalize
front_page_stats <- list(
  n_variables = validated_metadata_cube %>% count(var_name) %>% collect() %>% nrow(),
  n_countries = length(unique(local_context$xwalk_iso2$country_name)),
  n_cities = length(unique(local_context$xwalk_l1ad$salid1)),
  n_nbhd = length(unique(local_context$xwalk_l2$salid2))
)

## Write
front_page_stats %>%
  jsonlite::toJSON() %>%
  xfun::write_utf8(file.path(
    local_context$notebook_relative_root,
    "clean/4-salurbal-hub/0-seeds/etc.json"
  ))

## Preview
```

# 3. Data Catalog + Data About pages

## 3.1 variables.json

### Upstream Intermediate Tables

#### Validate Cubes

```{r}
#| eval: false
# Ad hoc QC: which variables appear in the metadata cube but not the data cube?
df_cube_check <- validated_metadata_cube %>%
  select(dataset_instance, schema_version, var_name) %>%
  mutate(metadata = T) %>%
  distinct() %>%
  collect() %>%
  left_join(
    validated_data_cube %>%
      select(var_name) %>%
      mutate(data = T) %>%
      distinct() %>%
      collect()
  )

## 216 variables have metadata but are not in the data cube
df_cube_check %>% count(metadata, data)

## Let's look at what these actually are
df_cube_check %>%
  filter(is.na(data)) %>%
  count(dataset_instance, schema_version)
```

#### df_pop

```{r}
## Clean
# NOTE(review): the filter column names were stripped in the corrupted source;
# `strata_id == 'Sex_Overall'` and `year == '2020'` are reconstructed — confirm.
df_pop <- file.path(
  local_context$path_server_dbt_sources,
  'schema_v1_deprecating',
  'PRJT_v1.0_internal.parquet'
) %>%
  read_parquet() %>%
  filter(
    geo == 'L1AD',
    strata_id == 'Sex_Overall',
    year == '2020'
  ) %>%
  rename(salid1 = salid) %>%
  left_join(global_context$xwalk_l1ad) %>%
  select(salid1, pop = value) %>%
  mutate(pop = as.numeric(pop) %>% round(0))

## Export
df_pop %>%
  arrow::write_parquet(file.path(
    local_context$notebook_relative_root,
    "clean/2-processed/df_pop_l1_2020.parquet"
  ))
```

#### df_strata

-   Pull in strata_description from metadata cube
-   Pull in strata details from strata.csv
-   Combine

```{r}
#| code-summary: "Get strata_desc from metadata cube"
## Clean
# NOTE(review): the select() column list was truncated in the corrupted source;
# dataset_instance/version/dataset_id are implied by the verify() checks below.
df_strata_prod_description <- validated_metadata_cube %>%
  select(var_name, strata_id, strata_description, dataset_instance, version, dataset_id) %>%
  distinct() %>%
  collect() %>%
  verify(columns_must_not_have_NA_NULL_cells(., list(c('strata_id')))) %>%
  verify(composite_key_uniqueness(., local_context)) %>%
  verify(columns_must_not_have_EMPTY_cells(., list(c('dataset_instance', 'version', 'dataset_id'))))

## Preview
head(df_strata_prod_description)
```

```{r}
#| code-summary: 'Get strata details from strata.csv files'
## Clean
df_strata_csv <- list.files(
  file.path(local_context$notebook_relative_root, 'datasets'),
  recursive = T,
  pattern = '2-strata.csv',
  full.names = T
) %>%
  map_df(~ {
    ## Setup paths
    path_tmp <- .x # .x = '../../../../datasets/area-level/UCNT_Longit/v2.0/2-strata.csv'
    path_split <- path_tmp %>% str_split('/')
    # Presence of the loading parquet next to the csv marks a schema v2 dataset.
    path_schema_v2_metadata <- str_replace(path_tmp, '2-strata.csv', '_loading.parquet')

    ## Clean
    # Dataset version and id are encoded in the two folders above the csv.
    version_tmp <- path_split[[1]][length(path_split[[1]]) - 1]
    dataset_id_tmp <- path_split[[1]][length(path_split[[1]]) - 2]
    dataset_instance_tmp <- glue("{dataset_id_tmp}_{version_tmp}") %>% paste()
    schema_version_tmp <- ifelse(file.exists(path_schema_v2_metadata), 'v2', 'v1')

    ## Export
    read_csv(.x, show_col_types = F) %>%
      mutate_all(~ as.character(.x)) %>%
      mutate(
        dataset_instance = dataset_instance_tmp,
        dataset_id = dataset_id_tmp,
        version = version_tmp,
        dataset_version = version_tmp,
        schema_version = schema_version_tmp
      ) %>%
      select(-var_name_raw, -var_name_san, -strata_raw)
  }) %>%
  distinct() %>%
  # Confidence-interval companion variables are not cataloged separately.
  filter(!str_detect(var_name, '_LCI|_UCI')) %>%
  distinct() %>%
  mutate(
    strata_id = ifelse(is.na(strata_id) | strata_id == 'NA', '', strata_id) %>%
      str_remove_all("_NA_NA$")
  ) %>%
  select(dataset_instance, var_name, strata_id, everything()) %>%
  mutate_if(is.character, ~ map_chr(.x, ~ ifelse(is.na(.x), '', .x))) %>%
  verify(columns_must_not_have_NA_NULL_cells(., list(c('strata_id')))) %>%
  verify(composite_key_uniqueness(., local_context)) %>%
  verify(columns_must_not_have_EMPTY_cells(., list(c('dataset_instance', 'version', 'dataset_id'))))

## Validation: Check that all production strata_id are present here
vec_key_production <- df_strata_prod_description %>%
  mutate(key = paste(var_name, strata_id) %>% str_trim()) %>%
  pull(key) %>%
  unique()
vec_key_repo <- df_strata_csv %>%
  mutate(key = paste(var_name, strata_id) %>% str_trim()) %>%
  pull(key) %>%
  unique()
pass <- all(vec_key_production %in% vec_key_repo)
if (!pass) cli_abort("Not all production strata id have associated info here!")

## Preview
head(df_strata_csv)
```

```{r}
## Merge
df_strata_stage <- df_strata_prod_description %>%
  left_join(df_strata_csv) %>%
  verify(full_referential_integrity(., df_strata_prod_description, local_context)) %>%
  verify(columns_must_not_have_NA_NULL_cells(., list(c('strata_id')))) %>%
  verify(composite_key_uniqueness(., local_context)) %>%
  verify(columns_must_not_have_EMPTY_cells(., list(c('dataset_instance', 'version', 'dataset_id')))) %>%
  select(dataset_instance, var_name, strata_description, everything()) %>%
  arrange(desc(strata_id), var_name, strata_1_name, strata_2_name) %>%
  mutate_if(is.character, ~ map_chr(.x, ~ ifelse(is.na(.x), '', .x)))

## Preview
head(df_strata_stage)
```

```{r}
## df_strata ---------------------------------------------------------------
# Number of non-empty strata dimensions per variable.
xwalk_n_strata <- df_strata_stage %>%
  select(var_name, strata_1_name, strata_2_name) %>%
  distinct() %>%
  group_by(var_name) %>%
  mutate(n_strata = sum(strata_1_name != '', strata_2_name != '')) %>%
  ungroup() %>%
  select(var_name, n_strata)

# NOTE(review): the TRUE branch of this ifelse() was stripped in the corrupted
# source; plain `var_name` (no suffix for the unstratified row) is assumed.
df_strata <- df_strata_stage %>%
  left_join(xwalk_n_strata) %>%
  rowwise() %>%
  mutate(
    var_name_nested = ifelse(
      strata_id == '',
      var_name,
      paste0(var_name, '_', strata_id)
    )
  ) %>%
  ungroup() %>%
  select(-any_of(c('var_name_san', 'strata_raw'))) %>%
  arrange(dataset_instance, var_name, strata_id) %>%
  select(var_name, var_name_nested, strata_id, everything()) %>%
  verify(full_referential_integrity(., df_strata_prod_description, local_context)) %>%
  verify(composite_key_uniqueness(., local_context))

## QC
regex__invalid_nested_patterns <- c('L2') %>% paste(collapse = '|')
vec__invalid_var_nested <- df_strata %>%
  filter(
    str_detect(var_name_nested, regex__invalid_nested_patterns) |
      is.na(var_name_nested)
  ) %>%
  pull(var_name_nested)
if (length(vec__invalid_var_nested) > 0) cli_abort("Invalid patterns detected in var_name_nested!")

## Version control
df_strata %>% write_json("version_control/df_strata.json", pretty = T)
```

### Variable level metadata summaries

-   From data cube
    -   What years are available for this measure
    -   What geographic levels and strata are available for this measure
    -   What countries this variable has data for

#### From metadata cube

```{r}
## Op. attribute availability -------------

## Cleaning
xwalk_availability_strata <- df_strata %>%
  group_by(var_name) %>%
  group_modify(~ {
    dfa <- .x
    attr1 <- unique(dfa$strata_1_name)
    attr2 <- unique(dfa$strata_2_name)
    n_attributes_tmp <- case_when(
      is.na(attr1) & is.na(attr2) ~ 0,
      !is.na(attr1) & is.na(attr2) ~ 1,
      TRUE ~ 2
    )
    attr1_obj <- list("name" = attr1, "choices" = list(unique(dfa$strata_1_value))[[1]])
    attr2_obj <- list("name" = attr2, "choices" = list(unique(dfa$strata_2_value))[[1]])
    attr_objs_tmp <- NULL
    if (n_attributes_tmp == 1) attr_objs_tmp <- list(attr1_obj)
    if (n_attributes_tmp == 2) {
      attr_objs_tmp <- list(attr1_obj, attr2_obj)
    }
    tibble(
      n_attributes = n_attributes_tmp,
      attr_objs = list(attr_objs_tmp)
    )
  }) %>%
  ungroup() %>%
  arrange(var_name)

## Version Control
xwalk_availability_strata %>%
  write_json("version_control/xwalk_availability_strata.json", pretty = T)
cli_alert("xwalk_availability_strata done!")

## Op. public availability -------------------------------------------------

## Cleaning
xwalk_availability_public <- validated_metadata_cube %>%
  group_by(var_name) %>%
  summarize(public_any = ifelse(any(public == '1'), '1', '0')) %>%
  collect()

## Version control
xwalk_availability_public %>%
  write_json("version_control/xwalk_availability_public.json", pretty = T)
cli_alert("xwalk_availability_public done!")
```

#### From data cube

```{r}
## Op. year availability ------------

## Clean
xwalk_availability_year <- validated_data_cube %>%
  select(var_name, year) %>%
  distinct() %>%
  collect() %>%
  group_by(var_name) %>%
  summarize(
    year_min = min(year),
    year_max = max(year),
    n_years = length(unique(year)),
    years = list(sort(unique(year))),
    years_packed = pack_year_string(unique(year))
  ) %>%
  ungroup() %>%
  arrange(var_name, year_min, year_max)

## Version Control
xwalk_availability_year %>%
  write_json("version_control/xwalk_availability_year.json", pretty = T)
cli_alert("xwalk_availability_year done!")

## Op. geo availability --------------------
xwalk_availability_geo <- validated_data_cube %>%
  count(var_name, geo, strata_id) %>%
  collect() %>%
  left_join(df_strata) %>%
  count(var_name, geo, strata_1_name, strata_2_name) %>%
  select(-n) %>%
  mutate(grouper = var_name) %>%
  group_by(grouper) %>%
  group_modify(~ tibble(
    var_name = unique(.x$var_name),
    geo_available = list(
      unique(.x$geo) %>%
        keep(~ .x %in% c("L1AD", "L2", "L2_5", "L3", "COUNTRY"))
    ),
    attributes_available = {
      attr_all <- unique(c(.x$strata_1_name, .x$strata_2_name))
      list(attr_all[!is.na(attr_all)])
    }
  )) %>%
  ungroup() %>%
  select(-grouper)
cli_alert("xwalk_availability_geo done!")

## Op. country availability ----------------------
xwalk_availability_country <- validated_data_cube %>%
  count(var_name, iso2) %>%
  collect() %>%
  left_join(local_context$xwalk_iso2 %>% select(iso2, country = country_label)) %>%
  count(var_name, country) %>%
  arrange(var_name, country) %>%
  group_by(var_name) %>%
  summarize(countries = paste(country, collapse = ', ')) %>%
  ungroup()
cli_alert("xwalk_availability_country done!")
```

### Variable level object

```{r}
## Pull variable level metadata
# NOTE(review): the select() column list was stripped in the corrupted source;
# the columns below are implied by the arrange()/verify() calls — confirm.
df_variable_level_metadata <- validated_metadata_cube %>%
  select(var_name, var_label, domain, subdomain, var_def) %>%
  distinct() %>%
  collect() %>%
  arrange(domain, subdomain, var_label) %>%
  verify(valid_non_missing_value(., list(c('domain', 'subdomain', 'var_def')), local_context)) %>%
  verify(composite_key_uniqueness(., local_context))

## Merge in the summaries
df_variables <- df_variable_level_metadata %>%
  # left_join(xwalk_availability_geo) %>%
  left_join(xwalk_availability_strata) %>%
  # left_join(xwalk_availability_country) %>%
  left_join(xwalk_availability_year) %>%
  left_join(xwalk_availability_public) %>%
  verify(composite_key_uniqueness(., local_context)) %>%
  verify(full_referential_integrity(., df_variable_level_metadata, local_context)) %>%
  verify(valid_non_missing_value(., list(c('domain', 'subdomain', 'var_def', 'public_any')), local_context))

## Preview
head(df_variables)
```

```{r}
df_variables %>%
  jsonlite::toJSON() %>%
  xfun::write_utf8(file.path(
    local_context$notebook_relative_root,
    "clean/4-salurbal-hub/1-variables/variables.json"
  ))
cli_alert_success("variables.json generated!")
```

## 3.2 algolia-variable-catalog.json

```{r}
## Operationalize
# NOTE(review): part of the select() column list was stripped in the corrupted
# source; only var_name and geo are certain — confirm against VCS.
df_algolia <- df_variables %>%
  mutate(download = ifelse(public_any == '1', 'Limited Public Access', 'Authorization Required')) %>%
  select(var_name, geo = geo_available, everything()) %>%
  distinct()

## Preview
head(df_algolia)

## Write
df_algolia %>%
  jsonlite::toJSON(pretty = T) %>%
  xfun::write_utf8(glue("../../../clean/4-salurbal-hub/algolia-variable-catalog.json"))
```

## 3.3 Sunburst Plots

# 4. Variable Specific Pages

## 4.1 variable_availability.json

```{r}
## Operationalize (legacy)
# Expensive per-variable scan, so the result is cached on disk; delete the
# parquet cache to force a rebuild.
if (!file.exists('metadata_cache/df_availability_cache.parquet')) {
  df_availability <- validated_metadata_cube %>%
    select(var_name, iso2, year, public) %>%
    distinct() %>%
    collect() %>%
    group_split(var_name) %>%
    map_df(~ {
      var_tmp <- unique(.x$var_name)
      cli_alert("Start processing - {var_tmp}")
      df_availability_data <- validated_data_cube %>%
        filter(
          geo %in% local_context$vec__accepted_geo,
          var_name == var_tmp
        ) %>%
        select(var_name, iso2, year, public) %>%
        distinct() %>%
        collect()
      df_availability_metadata <- validated_metadata_cube %>%
        filter(
          geo %in% local_context$vec__accepted_geo,
          var_name == var_tmp
        ) %>%
        select(var_name, iso2, year, public) %>%
        distinct() %>%
        collect()
      # Rows present in metadata but absent from data get the sentinel '9'.
      df_availibilty_metadata_diff <- dplyr::setdiff(df_availability_metadata, df_availability_data) %>%
        mutate(public = '9')
      df_availibilty_tmp <- bind_rows(df_availability_data, df_availibilty_metadata_diff) %>%
        pivot_wider(names_from = iso2, values_from = public) %>%
        replace(is.na(.), "0") %>%
        arrange(var_name, year)
      return(df_availibilty_tmp)
    })
  df_availability %>%
    arrow::write_parquet("metadata_cache/df_availability_cache.parquet")
} else {
  df_availability <- arrow::read_parquet("metadata_cache/df_availability_cache.parquet")
}
```

```{r}
df_availability %>% sample_n(10)
```

```{r}
df_availability %>%
  jsonlite::toJSON(pretty = T, null = 'null', na = 'null') %>%
  write(glue("../../../clean/4-salurbal-hub/1-variables/variable_availability.json"))
```

## 4.2 variable_sources.json

```{r}
## Operationalize
df_sources <- arrow::read_parquet(glue("../../../clean/2-processed/df_sources.parquet"))
df_sources_out <- df_sources %>%
  unpack_year() %>%
  pack_year() %>%
  pack_string_column('iso2') %>%
  distinct()

## Export
df_sources_out %>%
  jsonlite::toJSON(pretty = T) %>%
  xfun::write_utf8(glue("../../../clean/4-salurbal-hub/1-variables/variable_sources.json"))

## Preview
head(sample_n(df_sources_out, 10))
```

## 4.3 variables_attr.json

```{r}
variables_attr <- df_strata %>% arrange(var_name, var_name_nested)
variables_attr %>%
  jsonlite::toJSON(pretty = T) %>%
  xfun::write_utf8(glue("../../../clean/4-salurbal-hub/1-variables/variables_attr.json"))
head(variables_attr)
```

## 4.4 l1ad_preview_data.json

```{r}
## Operationalize
# NOTE(review): part of the select() column list was stripped in the corrupted
# source; value/year columns are assumed for the preview — confirm against VCS.
df_validated_data_cube %>%
  ## Subset
  filter(geo == 'L1AD', public == '1') %>%
  select(var_name, salid1 = salid, value, year) %>%
  ## Subset by variable
  group_by(var_name) %>%
  slice(1:30) %>%
  ungroup() %>%
  ## Merge in metadata
  left_join(df_variables) %>%
  left_join(local_context$xwalk_l1ad %>% select(salid1, salid1_label = l1_label)) %>%
  ## Append population
  left_join(df_pop) %>%
  jsonlite::toJSON() %>%
  xfun::write_utf8("../../../clean/4-salurbal-hub/1-variables/l1ad_preview_data.json")
```

# 5. Check out page

Generate the crosswalk of attribute counts per variable (`n_attr`).

```{r}
xwalk_attr_form <- df_variables %>%
  select(var_name, var_label) %>%
  left_join(
    df_strata %>%
      select(var_name, n_attr = n_strata) %>%
      distinct()
  )
xwalk_attr_form %>%
  jsonlite::toJSON() %>%
  xfun::write_utf8("../../../clean/4-salurbal-hub/99-checkout/xwalk_attr_form.json")
```