Portal v2.0 - City profiles/comparisons Mart

Author

Ran Li (Maintainer)

Published

April 16, 2025

Setup
{ ## 'Global Setup (For any notebook)'
  
  pacman::p_load(tidyverse, arrow, assertr, reactable)
  context = lst(
    version = 'v2.0',
    path_dbt = '//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/Wellcome_Trust/Data Methods Core/Dashboards/dbt/salurbal-dbt-server/models/v1.1'
  )
  setwd(file.path(here::here(),'_etl',context$version))  
}


{ ## Local Setup (For any notebook)
  
  api__admin_cube = file.path(context$path_dbt,'core__admin_cube.parquet') %>%
    arrow::open_dataset()
  api__metadata = file.path(context$path_dbt,'api__metadata_cube.parquet') %>%
    arrow::open_dataset()
  api__portal = file.path(context$path_dbt,'api__portal.parquet') %>%
    arrow::open_dataset()

}

xwalk_l1 = "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/xwalk_l1ad_v1/xwalk_l1ad_v1.parquet" |> 
  arrow::open_dataset() |>
  dplyr::collect()
  
xwalk_l1  
salid0 country_name iso2 salid1 salid1_name vintage
102 Brasil BR 102101 Rio Branco salurbal-1
102 Brasil BR 102102 Arapiraca salurbal-1
102 Brasil BR 102103 Maceio salurbal-1
102 Brasil BR 102104 Macapa salurbal-1
102 Brasil BR 102105 Manaus salurbal-1
102 Brasil BR 102106 Alagoinhas salurbal-1
102 Brasil BR 102107 Barreiras salurbal-1
102 Brasil BR 102108 Feira De Santana salurbal-1
102 Brasil BR 102109 Ilheus salurbal-1
102 Brasil BR 102110 Itabuna salurbal-1
102 Brasil BR 102111 Jequie salurbal-1
102 Brasil BR 102112 Porto Seguro salurbal-1
102 Brasil BR 102113 Salvador salurbal-1
102 Brasil BR 102114 Teixeira De Freitas salurbal-1
102 Brasil BR 102115 Vitoria Da Conquista salurbal-1
102 Brasil BR 102116 Fortaleza salurbal-1
102 Brasil BR 102117 Juazeiro Do Norte salurbal-1
102 Brasil BR 102118 Sobral salurbal-1
102 Brasil BR 102119 Brasilia salurbal-1
102 Brasil BR 102120 Cachoeiro De Itapemirim salurbal-1
102 Brasil BR 102121 Guarapari salurbal-1
102 Brasil BR 102122 Linhares salurbal-1
102 Brasil BR 102123 Vitoria salurbal-1
102 Brasil BR 102124 Anapolis salurbal-1
102 Brasil BR 102125 Goiania salurbal-1
102 Brasil BR 102126 Rio Verde salurbal-1
102 Brasil BR 102127 Caxias salurbal-1
102 Brasil BR 102128 Imperatriz salurbal-1
102 Brasil BR 102129 Sao Luis salurbal-1
102 Brasil BR 102130 Cuiaba salurbal-1
102 Brasil BR 102131 Rondonopolis salurbal-1
102 Brasil BR 102132 Campo Grande salurbal-1
102 Brasil BR 102133 Dourados salurbal-1
102 Brasil BR 102134 Araguari salurbal-1
102 Brasil BR 102135 Barbacena salurbal-1
102 Brasil BR 102136 Belo Horizonte salurbal-1
102 Brasil BR 102137 Conselheiro Lafaiete salurbal-1
102 Brasil BR 102138 Divinopolis salurbal-1
102 Brasil BR 102139 Governador Valadares salurbal-1
102 Brasil BR 102140 Ipatinga salurbal-1
102 Brasil BR 102141 Itabira salurbal-1
102 Brasil BR 102142 Juiz De Fora salurbal-1
102 Brasil BR 102143 Montes Claros salurbal-1
102 Brasil BR 102144 Passos salurbal-1
102 Brasil BR 102145 Patos De Minas salurbal-1
102 Brasil BR 102146 Pocos De Caldas salurbal-1
102 Brasil BR 102147 Pouso Alegre salurbal-1
102 Brasil BR 102148 Sete Lagoas salurbal-1
102 Brasil BR 102149 Teofilo Otoni salurbal-1
102 Brasil BR 102150 Uberaba salurbal-1
102 Brasil BR 102151 Uberlandia salurbal-1
102 Brasil BR 102152 Varginha salurbal-1
102 Brasil BR 102153 Belem salurbal-1
102 Brasil BR 102154 Castanhal salurbal-1
102 Brasil BR 102155 Maraba salurbal-1
102 Brasil BR 102156 Parauapebas salurbal-1
102 Brasil BR 102157 Santarem salurbal-1
102 Brasil BR 102158 Campina Grande salurbal-1
102 Brasil BR 102159 Joao Pessoa salurbal-1
102 Brasil BR 102160 Apucarana salurbal-1
102 Brasil BR 102161 Arapongas salurbal-1
102 Brasil BR 102162 Cascavel salurbal-1
102 Brasil BR 102163 Curitiba salurbal-1
102 Brasil BR 102164 Foz Do Iguacu salurbal-1
102 Brasil BR 102165 Guarapuava salurbal-1
102 Brasil BR 102166 Londrina salurbal-1
102 Brasil BR 102167 Maringa salurbal-1
102 Brasil BR 102168 Paranagua salurbal-1
102 Brasil BR 102169 Ponta Grossa salurbal-1
102 Brasil BR 102170 Toledo salurbal-1
102 Brasil BR 102172 Caruaru salurbal-1
102 Brasil BR 102173 Garanhuns salurbal-1
102 Brasil BR 102174 Petrolina salurbal-1
102 Brasil BR 102175 Recife salurbal-1
102 Brasil BR 102176 Vitoria De Santo Antao salurbal-1
102 Brasil BR 102177 Parnaiba salurbal-1
102 Brasil BR 102178 Teresina salurbal-1
102 Brasil BR 102179 Angra Dos Reis salurbal-1
102 Brasil BR 102180 Araruama salurbal-1
102 Brasil BR 102181 Cabo Frio salurbal-1
102 Brasil BR 102182 Campos Dos Goytacazes salurbal-1
102 Brasil BR 102184 Macae salurbal-1
102 Brasil BR 102187 Nova Friburgo salurbal-1
102 Brasil BR 102188 Petropolis salurbal-1
102 Brasil BR 102189 Resende salurbal-1
102 Brasil BR 102190 Rio De Janeiro salurbal-1
102 Brasil BR 102191 Teresopolis salurbal-1
102 Brasil BR 102192 Volta Redonda salurbal-1
102 Brasil BR 102193 Mossoro salurbal-1
102 Brasil BR 102194 Natal salurbal-1
102 Brasil BR 102195 Caxias Do Sul salurbal-1
102 Brasil BR 102196 Passo Fundo salurbal-1
102 Brasil BR 102197 Pelotas salurbal-1
102 Brasil BR 102198 Porto Alegre salurbal-1
102 Brasil BR 102199 Rio Grande salurbal-1
102 Brasil BR 102200 Santa Cruz Do Sul salurbal-1
102 Brasil BR 102201 Santa Maria salurbal-1
102 Brasil BR 102202 Uruguaiana salurbal-1
102 Brasil BR 102203 Ji-Parana salurbal-1
102 Brasil BR 102204 Porto Velho salurbal-1
102 Brasil BR 102205 Boa Vista salurbal-1
102 Brasil BR 102206 Balneario Camboriu salurbal-1
102 Brasil BR 102207 Blumenau salurbal-1
102 Brasil BR 102208 Brusque salurbal-1
102 Brasil BR 102209 Chapeco salurbal-1
102 Brasil BR 102210 Criciuma salurbal-1
102 Brasil BR 102211 Florianopolis salurbal-1
102 Brasil BR 102212 Itajai salurbal-1
102 Brasil BR 102213 Jaragua Do Sul salurbal-1
102 Brasil BR 102214 Joinville salurbal-1
102 Brasil BR 102215 Lages salurbal-1
102 Brasil BR 102216 Aracatuba salurbal-1
102 Brasil BR 102217 Araraquara salurbal-1
102 Brasil BR 102218 Araras salurbal-1
102 Brasil BR 102219 Atibaia salurbal-1
102 Brasil BR 102220 Santos salurbal-1
102 Brasil BR 102221 Barretos salurbal-1
102 Brasil BR 102222 Bauru salurbal-1
102 Brasil BR 102223 Birigui salurbal-1
102 Brasil BR 102224 Botucatu salurbal-1
102 Brasil BR 102225 Braganca Paulista salurbal-1
102 Brasil BR 102226 Campinas salurbal-1
102 Brasil BR 102227 Catanduva salurbal-1
102 Brasil BR 102228 Franca salurbal-1
102 Brasil BR 102229 Guaratingueta salurbal-1
102 Brasil BR 102231 Itapetininga salurbal-1
102 Brasil BR 102233 Jau salurbal-1
102 Brasil BR 102234 Jundiai salurbal-1
102 Brasil BR 102235 Limeira salurbal-1
102 Brasil BR 102236 Marilia salurbal-1
102 Brasil BR 102237 Mogi Guacu salurbal-1
102 Brasil BR 102238 Ourinhos salurbal-1
102 Brasil BR 102240 Piracicaba salurbal-1
102 Brasil BR 102241 Presidente Prudente salurbal-1
102 Brasil BR 102242 Ribeirao Preto salurbal-1
102 Brasil BR 102243 Rio Claro salurbal-1
102 Brasil BR 102245 Sao Carlos salurbal-1
102 Brasil BR 102246 Sao Jose Do Rio Preto salurbal-1
102 Brasil BR 102247 Sao Jose Dos Campos salurbal-1
102 Brasil BR 102248 Sao Paulo salurbal-1
102 Brasil BR 102249 Sertaozinho salurbal-1
102 Brasil BR 102250 Sorocaba salurbal-1
102 Brasil BR 102251 Tatui salurbal-1
102 Brasil BR 102252 Taubate salurbal-1
102 Brasil BR 102253 Aracaju salurbal-1
102 Brasil BR 102254 Araguaina salurbal-1
102 Brasil BR 102255 Palmas salurbal-1
102 Brasil BR 102256 Tubarao salurbal-1
102 Brasil BR 102257 Bento Goncalves salurbal-1
102 Brasil BR 102258 Caraguatatuba salurbal-1
102 Brasil BR 102259 Parobe salurbal-1
102 Brasil BR 102260 Rio Das Ostras salurbal-1
203 Guatemala GT 203101 Escuintla salurbal-1
203 Guatemala GT 203102 Guatemala City salurbal-1
203 Guatemala GT 203103 Quetzaltenango salurbal-1
206 Panama PA 206101 Panama City salurbal-1
206 Panama PA 206102 Colon salurbal-1
206 Panama PA 206103 David salurbal-1

Variables

pct_young/elderly

Import PRJ_v3.0

We pull from PRJ_v3.0 staged data.

Code
## API query
df_pop_age_sex = '//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/Wellcome_Trust/Data Methods Core//Dashboards/etl/cache/PRJ_v2.0/PRJ_v2.0_staged_data_cube.parquet' |> 
  open_dataset() |> 
  filter(var_name == 'PRJPOP', geo == "L1AD") |> 
  collect() |> 
  mutate(pop = parse_number(value),
         year = parse_number(year),
         strata_split = str_split(strata_id,"_"),
         sex = strata_split |> map_chr(~.x[2]),
         age = strata_split |> map_chr(~.x[4]) ) |> 
  select(iso2, salid = observation_id, year, sex, age, pop,
         var_name, geo)  %>%
  assert_rows(col_concat, is_uniq, salid, year, sex, age)
df_pop_age = df_pop_age_sex %>% 
  group_by(iso2, salid, year, age) %>% 
  summarize(pop_age = sum((pop)))  %>% 
  ungroup()
df_pop = df_pop_age_sex %>% 
  group_by(iso2, salid, year) %>% 
  summarize(pop = sum((pop)))  %>% 
  ungroup() |> 
  assert_rows(col_concat, is_uniq, salid, year) 

Input Validation

Let’s do a quick sanity check on the api results before moving on.

Code
df_pop |> 
  ggplot(aes(x = year, y = pop, color = salid, group = salid)) + 
  geom_line() +
  geom_point() +
  facet_wrap(~iso2) +
  scale_x_continuous(breaks = seq(2000,2020,10)) +
  scale_y_continuous(labels = function(x) x/1000000) + 
  theme_bw() +
  labs(title = 'L1AD population over time',
       subtitle = 'Not sane  =',
       y = "Population (millions)") +
  theme(legend.position = 'none')

This does not look right at all - Brazile should have the largest cities; Sao Paolo should be the largest city. Let’s take a look at Sao Paulo, BR

The values look okay.

Clean

Let’ operationalize the required variables pct_young and pct_elderly

Code
## Intermediate
df_pop_age_grp  = df_pop_age %>% 
  mutate( age_grp = case_when(
    age<5 ~ 'pop_young',
    age >= 65 ~ 'pop_elderly',
    TRUE ~ 'other'
  ) ) %>% 
  filter(age_grp!='other') %>%  
  group_by(salid, year, age_grp) %>% 
  summarize(pop_age_grp = sum(pop_age)) %>% 
  ungroup() %>% 
  pivot_wider(names_from = age_grp, values_from = pop_age_grp) %>% 
  left_join(df_pop |> rename(total_pop = pop)) %>% 
  mutate(pct_young = (pop_young/total_pop)*100,
         pct_elderly =  (pop_elderly/total_pop)*100) |> 
  verify(!is.na(pct_elderly)) |> 
  verify(!is.na(pct_young)) |>
  verify(between(pct_young, 0, 100)) %>%
  verify(between(pct_elderly, 0, 100)) 

## Final
df_pop_final = df_pop_age_grp %>% 
  select(salid, year, contains('pct_')) %>% 
  pivot_longer(cols = c(pct_young, pct_elderly)) %>% 
  mutate(value_clean = sprintf("%.2f",  (value)),
         units = '%',
         value_type = 'numeric',
         var_name = name,
         var_label = name %>% recode("pct_young" = "% Young children (<5)",
                                     "pct_elderly" = "% Elderly (>=65)" ))  |> 
  left_join(df_pop_age_grp |> 
              select(salid , iso2) |> 
              distinct()) |> 
  mutate(geo = "L1AD",
         year_raw = as.character(year),
         year = as.numeric(year),
         source = 'PRJ_v3.0  PRJPOP') |> 
    select(geo, salid, iso2, year, year_raw, 
           value, var_name,
           var_label, value_type,value_clean, source, everything()) 
df_pop_final |> 
  head() 
geo salid iso2 year year_raw value var_name var_label value_type value_clean source name units
L1AD 101101 AR 2001 2001 66.06366 pct_young % Young children (<5) numeric 66.06 PRJ_v3.0 PRJPOP pct_young %
L1AD 101101 AR 2001 2001 12.25562 pct_elderly % Elderly (>=65) numeric 12.26 PRJ_v3.0 PRJPOP pct_elderly %
L1AD 101101 AR 2002 2002 65.92662 pct_young % Young children (<5) numeric 65.93 PRJ_v3.0 PRJPOP pct_young %
L1AD 101101 AR 2002 2002 12.35011 pct_elderly % Elderly (>=65) numeric 12.35 PRJ_v3.0 PRJPOP pct_elderly %
L1AD 101101 AR 2003 2003 65.79001 pct_young % Young children (<5) numeric 65.79 PRJ_v3.0 PRJPOP pct_young %
L1AD 101101 AR 2003 2003 12.44431 pct_elderly % Elderly (>=65) numeric 12.44 PRJ_v3.0 PRJPOP pct_elderly %

Validations

Code
df_pop_final %>%
  # Group by country, year and population type to get average values
  group_by(iso2, year, var_label) %>%
  summarize(avg_value = mean(value, na.rm = TRUE), .groups = "drop") %>%
  # Get the top 8 countries by data volume for a cleaner visualization
  group_by(iso2) %>%
  mutate(data_points = n()) %>%
  ungroup() %>%
  arrange(desc(data_points)) %>%
  filter(iso2 %in% unique(iso2)[1:8]) %>%
  # Create the plot
  ggplot(aes(x = year, y = avg_value, color = var_label)) +
  geom_line(linewidth = 1, alpha = 0.8) +
  geom_point(size = 2, alpha = 0.7) +
  facet_wrap(~iso2, ncol = 2) +
  scale_color_brewer(palette = "Set1", name = "Population Group") +
  scale_y_continuous(labels = function(x) paste0(x, "%")) +
  scale_x_continuous(breaks = seq(min(df_pop_final$year), 
                                  max(df_pop_final$year), 
                                  by = 2)) +
  labs(
    title = "Age Demographics Trends by Country (2001-Present)",
    subtitle = "Looks sane",
    x = "Year",
    y = "Percentage of Population",
    caption = "Data shown for 8 countries with most observations"
  ) +
  theme_bw() +
  theme(
    legend.position = "bottom",
    # strip.background = element_rect(fill = "lightgrey", color = NA),
    strip.text = element_text(face = "bold"),
    panel.grid.minor = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

life_expectancy

Clean

This is simple with no transformations required; jsut an API query

Code
df_le = "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/Wellcome_Trust/Data Methods Core/Dashboards/dbt/salurbal-dbt-server/models/v1.1/core__admin_cube.parquet" %>% 
  open_dataset() |> 
  filter(var_name == 'LEALE',
         geo == 'L1AD') %>% 
  collect() |> 
  mutate(
    var_name = 'LEALE__LEMEDIAN_L1_v1.0',
    value = parse_number(value),
    value_clean = sprintf("%.2f",  (value)),
    strata_split = str_split(strata_id,"_"),
    male =  strata_split |> map_chr(~.x[2]) ,
    age = strata_split |> map_chr(~.x[4]),,
    var_label = male %>% recode(
    "1" = 'Life expectancy for males',
    "0"='Life expectancy for females' 
  )) %>% 
  filter(age %in% c(0,60)) |> 
  select(var_name, var_label, geo, salid, iso2, year, 
         strata_id, male,age,value, value_type, 
         value_clean, units, source) |> 
  assert_rows(col_concat, is_uniq, male,age, salid, year, iso2)  |> 
  rowwise() |> 
  mutate(year_raw = as.character(year),
         year = year_raw |> 
           str_split( "-") |> 
           unlist() |> 
           as.numeric() |> 
           mean()) |> 
  ungroup() |> 
    select(geo, salid, iso2, year, year_raw, 
           value, var_name,
           var_label, value_type,value_clean, source, everything()) 

Validation

sanity check. this is bar chart

Code
## this is barchart not longitudinal. there are groups in `grp` and iso2; i don't want year on any axis. i want one bar per per salid, faceted by group and country 
df_le |>
  mutate( age = age |> recode('0'='At birth','60'='Age 60'),
         male = male |> recode('0'='Female','1'='Male')
    ) |> 
  ggplot(aes(x = iso2, y = value)) +
  geom_boxplot() +
  geom_point() +
  facet_grid(age~male) +
  labs(
    title = 'Life Expectancy look sane'
  )

total/infant_deaths

Import DTH_v3.0

Code
df_dth_v3 =  "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/Wellcome_Trust/Data Methods Core//Dashboards/etl/cache/DTH_v3.0/DTH_v3.0_staged_data_cube.parquet" |> 
  open_dataset() |> 
  collect()

Clean

total_death + infant_deaths

Code
## API Call
df_dth = df_dth_v3 %>% 
  filter(var_name == 'DTHDEATHS',
         geo == 'L1AD') %>%
  collect() |> 
  rename(salid = observation_id) |> 
  add_count(year, salid) %>%    
  arrange(n) %>% 
  mutate(year = parse_number(year),
         value = parse_number(value),
         strata_split = str_split(strata_id,"_"),
         age = strata_split |> map_chr(~.x[4]),
         sex = strata_split |> map_chr(~.x[2])) |> 
  select(var_name, dataset_instance, geo, iso2, salid, year,
         strata_id, age,sex ,value, n ) 

## Total
df_dth_total = df_dth %>% 
  group_by(var_name, geo, iso2, salid, year) %>%
  summarize(n_total_deaths__DTH_v3 = sum(value)) %>%
  ungroup()   %>% 
  mutate(
    var_name = 'n_total_deaths__DTH_v3',
    var_label = 'Total deaths from DTH_v3',
    source = 'DTH_v3.0',
    value_type = 'numeric',
    value = n_total_deaths__DTH_v3,
    value_clean = as.character(value),
    year_raw = as.character(year),
    year = as.numeric(year)) |>  
  select(geo, salid, iso2, year, year_raw, 
         value, var_name,
         var_label, value_type,value_clean, source,
         everything()) 

## Infant
df_infant_dth = df_dth %>% 
  filter(age == 0) |> 
  group_by(var_name, geo, iso2, salid, year) %>%
  summarize(n_infant_deaths__DTH_v3 = sum(value)) %>%
  ungroup()   %>% 
  mutate(
    var_name = 'n_infant_deaths__DTH_v3',
    var_label = 'Number of Infant deaths from DTH_v3',
    source = 'DTH_v3.0',
    value_type = 'numeric',
    value = n_infant_deaths__DTH_v3,
    value_clean = as.character(value),
    year_raw = as.character(year),
    year = as.numeric(year)) |>  
  select(geo, salid, iso2, year, year_raw, 
         value, var_name,
         var_label, value_type,value_clean, source,
         everything()) |> 
  glimpse()
Rows: 7,121
Columns: 12
$ geo                     <chr> "L1AD", "L1AD", "L1AD", "L1AD", "L1AD", "L1AD"…
$ salid                   <chr> "101101", "101101", "101101", "101101", "10110…
$ iso2                    <chr> "AR", "AR", "AR", "AR", "AR", "AR", "AR", "AR"…
$ year                    <dbl> 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012…
$ year_raw                <chr> "2005", "2006", "2007", "2008", "2009", "2010"…
$ value                   <dbl> 56, 54, 51, 45, 48, 47, 58, 34, 44, 36, 52, 41…
$ var_name                <chr> "n_infant_deaths__DTH_v3", "n_infant_deaths__D…
$ var_label               <chr> "Number of Infant deaths from DTH_v3", "Number…
$ value_type              <chr> "numeric", "numeric", "numeric", "numeric", "n…
$ value_clean             <chr> "56", "54", "51", "45", "48", "47", "58", "34"…
$ source                  <chr> "DTH_v3.0", "DTH_v3.0", "DTH_v3.0", "DTH_v3.0"…
$ n_infant_deaths__DTH_v3 <dbl> 56, 54, 51, 45, 48, 47, 58, 34, 44, 36, 52, 41…

Validation

Let’s do a sanity check

Code
## Total Deaths
df_dth_total |> 
  ggplot(aes(x = year, y = value, color = salid, group = salid)) +
  geom_line() +
  geom_point() +
  facet_wrap(~iso2, scales = 'free_y') +
  scale_x_continuous(breaks = seq(2000,2020,10)) +
  theme_bw() +
  labs(title = ' total deaths - L1AD over time',
       subtitle = 'looks sane? right?') +
  theme(legend.position = 'none')

Code
## Infant Deaths
df_infant_dth |> 
  ggplot(aes(x = year, y = value, 
             color = salid, group = salid)) +
  geom_line() +
  geom_point() +
  ## free y scale facet
  facet_wrap(~iso2, scales = 'free_y') +
  scale_x_continuous(breaks = seq(2000,2020,10)) +
  theme_bw() +
  labs(title = 'infant deaths - L1AD  over time',
       subtitle = 'looks sane? right?') +
  theme(legend.position = 'none')

births

Clean BTH_v1

Code
df_infant_bth = "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/Wellcome_Trust/Data Methods Core/Dashboards/dbt/salurbal-dbt-server/models/v1.1/core__admin_cube.parquet" %>% 
  open_dataset()  %>% 
  filter(var_name == 'BTHBIRTHS') %>% 
  collect() |> 
  group_by(var_name, geo, iso2, salid, year, source) %>% 
  summarize(n_births = sum(parse_number(value))) %>% 
  ungroup() %>% 
  filter(year%in%df_infant_dth$year) |> 
  mutate(year = parse_number(year)) |> 
  assert_rows(col_concat, is_uniq, salid, year, iso2)  |> 
  mutate(
    var_label = 'Number of births from BTHBIRTHS',
    value_type = 'numeric',
    var_name = 'n_births__BTH_v1',
    source = "BTHBIRTHS__BTH_v1.0",
    value = n_births,
    n_births__BTH_v1 = value,
    value_clean = as.character(value),
    year_raw = as.character(year)) |> 
  ungroup() |> 
    select(geo, salid, iso2, year, year_raw, 
           value, var_name,
           var_label, value_type,value_clean, source, everything()) 



head(df_infant_bth)
geo salid iso2 year year_raw value var_name var_label value_type value_clean source n_births n_births__BTH_v1
L1AD 101101 AR 2009 2009 9031 n_births__BTH_v1 Number of births from BTHBIRTHS numeric 9031 BTHBIRTHS__BTH_v1.0 9031 9031
L1AD 101101 AR 2010 2010 9753 n_births__BTH_v1 Number of births from BTHBIRTHS numeric 9753 BTHBIRTHS__BTH_v1.0 9753 9753
L1AD 101101 AR 2011 2011 9746 n_births__BTH_v1 Number of births from BTHBIRTHS numeric 9746 BTHBIRTHS__BTH_v1.0 9746 9746
L1AD 101101 AR 2012 2012 9741 n_births__BTH_v1 Number of births from BTHBIRTHS numeric 9741 BTHBIRTHS__BTH_v1.0 9741 9741
L1AD 101101 AR 2013 2013 9738 n_births__BTH_v1 Number of births from BTHBIRTHS numeric 9738 BTHBIRTHS__BTH_v1.0 9738 9738
L1AD 101101 AR 2014 2014 10135 n_births__BTH_v1 Number of births from BTHBIRTHS numeric 10135 BTHBIRTHS__BTH_v1.0 10135 10135

Validation

Sanity check

Code
df_infant_bth |> 
  ggplot(aes(x = year, y = value, 
             color = salid, group = salid)) +
  geom_line() +
  geom_point() +
  ## free y scale facet
  facet_wrap(~iso2, scales = 'free_y') +
  scale_x_continuous(breaks = seq(2000,2020,10)) +
  theme_bw() +
  labs(title = 'infant births - L1AD  over time',
       subtitle = 'looks sane? right?') +
  theme(legend.position = 'none')

infant_mortality

Clean

Code
df_infant_mort = df_infant_dth %>%  
  select(geo, salid, iso2, year, year_raw,
         n_infant_deaths__DTH_v3) |> 
  left_join(df_infant_bth |> 
              select(geo, salid, iso2, year, year_raw,
                     n_births__BTH_v1)) |> 
  filter(!is.na(n_births__BTH_v1)) |> 
  rowwise()  %>%
  mutate(
    var_name = 'infant_mortality_rate',
    var_label = 'Infant Mortality',
    source = "n_births__BTH_v1 + n_infant_deaths__DTH_v3",
    value = (n_infant_deaths__DTH_v3/n_births__BTH_v1)*1000,
    value_type = 'numeric',
    value_clean = sprintf("%.2f",  (value)),
    units = 'Deaths per 1,000 births',
  ) %>% 
  assert_rows(col_concat, is_uniq, salid, year, iso2) |> 
  ungroup() |> 
    select(geo, salid, iso2, year, year_raw, 
           value, var_name,
           var_label, value_type,value_clean, source, everything()) 

Validation

Sanity Check

Code
df_infant_mort |> 
  ggplot(aes(x = year, y = value, 
             color = salid, group = salid)) +
  geom_line() +
  geom_point() +
  ## free y scale facet
  facet_wrap(~iso2) +
  scale_x_continuous(breaks = seq(2000,2020,10)) +
  theme_bw() +
  labs(title = 'infant mortality - L1AD  over time',
       subtitle = 'looks sane? right?') +
  theme(legend.position = 'none')

pct_urban (ERROR)

Not sure how to operatinoalize % urban area in a city. Tried with these BEC variables but not working - ask DMC.

Clean

Code
df_area  = "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/Wellcome_Trust/Data Methods Core/Dashboards/dbt/salurbal-dbt-server/models/v1.1/core__admin_cube.parquet" %>% 
  open_dataset()  %>% 
  filter(var_name == 'BECADAREA', geo=='L1AD')  %>% 
  select(var_name1 = var_name, 
         geo, salid = observation_id, iso2, 
         total_area = value,
         source1 = source) |> 
  collect()

df_urban_area  = "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/Wellcome_Trust/Data Methods Core/Dashboards/dbt/salurbal-dbt-server/models/v1.1/core__admin_cube.parquet" %>% 
  open_dataset()  %>% 
  filter(var_name == 'BECTUAREA', geo=='L1AD')  %>% 
  select(var_name2 = var_name, 
         year,
         geo,salid = observation_id , iso2, 
         urban_area = value,
         source2 = source) |> 
  collect() 

df_pct_urban_int =  df_urban_area %>%
  left_join(df_area)  %>%
  filter(!is.na(urban_area) & !is.na(total_area)) %>% 
  rowwise()  %>%
  mutate(
    urban_area = parse_number(urban_area),
    total_area = parse_number(total_area),
    var_name = list(c(var_name1, var_name2)),
    var_label = 'Percent urban',
    source = source2,
    value = (urban_area / total_area) * 100,
    value_clean = sprintf("%.2f",  (value)),
    units = '%'
  ) %>%
  ungroup()

df_pct_urban = df_pct_urban_int|> 
  mutate(
    var_name = 'pct_urban',
    source  = glue::glue("{var_name1} + {var_name2}"),
    year_raw = as.character(year),
         year = as.numeric(year),
         value_type = 'numeric') |> 
  ungroup() |> 
    select(geo, salid, iso2, year, year_raw, 
           value, var_name,
           var_label, value_type,value_clean, source,
           everything()) |> 
  assert_rows(col_concat, is_uniq, salid, year, iso2) |> 
  glimpse()
Rows: 2,960
Columns: 18
$ geo         <chr> "L1AD", "L1AD", "L1AD", "L1AD", "L1AD", "L1AD", "L1AD", "L…
$ salid       <chr> "101101", "101103", "101104", "101105", "101106", "101107"…
$ iso2        <chr> "AR", "AR", "AR", "AR", "AR", "AR", "AR", "AR", "AR", "AR"…
$ year        <dbl> 1985, 1985, 1985, 1985, 1985, 1985, 1985, 1985, 1985, 1985…
$ year_raw    <chr> "1985", "1985", "1985", "1985", "1985", "1985", "1985", "1…
$ value       <dbl> 1.423159e+01, 5.461491e+02, 2.861140e+02, 2.977578e+01, 5.…
$ var_name    <chr> "pct_urban", "pct_urban", "pct_urban", "pct_urban", "pct_u…
$ var_label   <chr> "Percent urban", "Percent urban", "Percent urban", "Percen…
$ value_type  <chr> "numeric", "numeric", "numeric", "numeric", "numeric", "nu…
$ value_clean <chr> "14.23", "546.15", "286.11", "29.78", "56.85", "614.81", "…
$ source      <glue> "BECADAREA + BECTUAREA", "BECADAREA + BECTUAREA", "BECADA…
$ var_name2   <chr> "BECTUAREA", "BECTUAREA", "BECTUAREA", "BECTUAREA", "BECTU…
$ urban_area  <dbl> 325.71, 7877.43, 1938.15, 1448.55, 669.60, 18261.36, 2479.…
$ source2     <chr> "High spatiotemporal resolution mapping of global urban ch…
$ var_name1   <chr> "BECADAREA", "BECADAREA", "BECADAREA", "BECADAREA", "BECAD…
$ total_area  <dbl> 2288.6409, 1442.3590, 677.4047, 4864.8594, 1177.8457, 2970…
$ source1     <chr> "Argentina Secretaria de Energia Radios Censales  from Arg…
$ units       <chr> "%", "%", "%", "%", "%", "%", "%", "%", "%", "%", "%", "%"…
Code
head(df_pct_urban)
geo salid iso2 year year_raw value var_name var_label value_type value_clean source var_name2 urban_area source2 var_name1 total_area source1 units
L1AD 101101 AR 1985 1985 14.23159 pct_urban Percent urban numeric 14.23 BECADAREA + BECTUAREA BECTUAREA 325.71 High spatiotemporal resolution mapping of global urban change from 1985 to 2015; Global Urban Footprint (GUF), 2012 BECADAREA 2288.6409 Argentina Secretaria de Energia Radios Censales from Argentina Ministerio de Energía y Minería.Secretaría de Coordinación de Planeamiento Energético.Dirección Nacional de Información Energética.Tecnología de la Información. %
L1AD 101103 AR 1985 1985 546.14906 pct_urban Percent urban numeric 546.15 BECADAREA + BECTUAREA BECTUAREA 7877.43 High spatiotemporal resolution mapping of global urban change from 1985 to 2015; Global Urban Footprint (GUF), 2012 BECADAREA 1442.3590 Argentina Secretaria de Energia Radios Censales from Argentina Ministerio de Energía y Minería.Secretaría de Coordinación de Planeamiento Energético.Dirección Nacional de Información Energética.Tecnología de la Información. %
L1AD 101104 AR 1985 1985 286.11404 pct_urban Percent urban numeric 286.11 BECADAREA + BECTUAREA BECTUAREA 1938.15 High spatiotemporal resolution mapping of global urban change from 1985 to 2015; Global Urban Footprint (GUF), 2012 BECADAREA 677.4047 Argentina Secretaria de Energia Radios Censales from Argentina Ministerio de Energía y Minería.Secretaría de Coordinación de Planeamiento Energético.Dirección Nacional de Información Energética.Tecnología de la Información. %
L1AD 101105 AR 1985 1985 29.77578 pct_urban Percent urban numeric 29.78 BECADAREA + BECTUAREA BECTUAREA 1448.55 High spatiotemporal resolution mapping of global urban change from 1985 to 2015; Global Urban Footprint (GUF), 2012 BECADAREA 4864.8594 Argentina Secretaria de Energia Radios Censales from Argentina Ministerio de Energía y Minería.Secretaría de Coordinación de Planeamiento Energético.Dirección Nacional de Información Energética.Tecnología de la Información. %
L1AD 101106 AR 1985 1985 56.84955 pct_urban Percent urban numeric 56.85 BECADAREA + BECTUAREA BECTUAREA 669.60 High spatiotemporal resolution mapping of global urban change from 1985 to 2015; Global Urban Footprint (GUF), 2012 BECADAREA 1177.8457 Argentina Secretaria de Energia Radios Censales from Argentina Ministerio de Energía y Minería.Secretaría de Coordinación de Planeamiento Energético.Dirección Nacional de Información Energética.Tecnología de la Información. %
L1AD 101107 AR 1985 1985 614.81327 pct_urban Percent urban numeric 614.81 BECADAREA + BECTUAREA BECTUAREA 18261.36 High spatiotemporal resolution mapping of global urban change from 1985 to 2015; Global Urban Footprint (GUF), 2012 BECADAREA 2970.2287 Argentina Secretaria de Energia Radios Censales from Argentina Ministerio de Energía y Minería.Secretaría de Coordinación de Planeamiento Energético.Dirección Nacional de Información Energética.Tecnología de la Información. %

Validation

Code
df_pct_urban |> 
  ggplot(aes(x = iso2, y = value))+
  geom_boxplot()

Sanity check of componnets. This is an issue ( why is total area less than urban_area in some cases.)

```{r}
df_pct_urban_int |> 
  verify(total_area > urban_area)


df_pct_urban_int |> 
  filter(total_area < urban_area) |> 
  count(geo, iso2,year)
```

sanity chekc of pct_urban

```{r}
df_pct_urban |> 
  mutate(year = parse_number(year)) |>
  ggplot(aes(x = year, y = value, 
             color = salid, group = salid)) +
  geom_line() +
  geom_point() +
  ## free y scale facet
  facet_wrap(~iso2, scales = 'free_y') +
  scale_x_continuous(breaks = seq(2000,2020,10)) +
  theme_bw() +
  labs(title = 'Percent Urban - L1AD  over time',
       subtitle = 'looks sane? right?') +
  theme(legend.position = 'none')
```

Violent Deaths (To Check with Jess/Goro)

Not sure how t numbers look; it seems like 40% of deaths coded to violence (1580) seems liek a lot.

Import DTH3_v1.0

Code
## This was bound to legacy api 
df_dth_ghe3 = "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/Wellcome_Trust/Data Methods Core//Dashboards/etl/cache/DTH3_v1.0/DTH3_v1.0_staged_data_cube.parquet" |> 
  open_dataset() |> 
  collect() |> 
  rename(salid = observation_id) |> 
  filter(geo == 'L1AD') %>% 
  filter(var_name == 'DTH3DEATHS') %>%
  collect() |> 
  mutate( year = as.numeric(year))  %>% 
  assert_rows(col_concat, is_uniq, salid, year, strata_id)

Clean

Using staged data.

Code
## Vioence GHE
df_dth_ghe3_total = df_dth_ghe3 |> 
  group_by(var_name, geo, iso2, salid, year) %>%
  summarize(total_deaths = sum(parse_number(value))) %>%
  ungroup() |> 
  left_join(df_dth_total |> 
              select(
                geo, iso2, salid, year,
                total_deaths_dth3 = value))
  
df_dth_ghe3_violent = df_dth_ghe3 |> 
  filter(strata_id %in% c('Sex_1_GHE-Tier-3_1580',
                          'Sex_0_GHE-Tier-3_1580')) |> 
  group_by(var_name, geo, iso2, salid, year) %>%
  summarize(n_violent_deaths = sum(parse_number(value))) %>%
  ungroup() 


## Merge total
df_dth_ghe3_merge =  df_dth_ghe3_violent  %>%
  left_join(df_dth_ghe3_total) 

## Calculate percent
df_pct_violent_deaths = df_dth_ghe3_merge %>%
  rowwise() %>% 
  mutate(
    year_raw = as.character(year),
    year = as.numeric(year),
    source = "DTH3DEATHS__DTH3_v1.0 (Sex_1_GHE-Tier-3_1580 + Sex_0_GHE-Tier-3_1580)",
    var_name = 'pct_violent_death',
    var_label = '% violent deaths',
    value_type = 'numeric',
    units = '%',
    value = (n_violent_deaths / total_deaths) * 100,
    value_clean = sprintf("%.2f",  (value)),
  )%>%  
  ungroup()  |> 
    select(geo, salid, iso2, year, year_raw, 
           value, var_name,
           var_label, value_type,value_clean, source,
           everything())

Validation

Code
df_pct_violent_deaths |> 
  ggplot(aes(x = year, y = value, 
             color = salid, group = salid)) +
  geom_line() +
  geom_point() +
  ## free y scale facet
  facet_wrap(~iso2 ) +
  scale_x_continuous(breaks = seq(2000,2020,10)) +
  theme_bw() +
  labs(title = 'Violent Deaths - L1AD  over time',
       subtitle = 'looks sane? right?') +
  theme(legend.position = 'none')

Comparisons Datastore

This will be a longitudinal dataset with the above data. Let’s first compile

Compile

Code
df_longitudinal = list(
  df_pop_final,
  df_le,
  df_infant_dth,
  df_dth_total,
  df_infant_mort,
  df_infant_bth,
  df_pct_urban,
  df_pct_violent_deaths
) |> 
  bind_rows() |> 
    select(geo, salid, iso2, year, year_raw, 
           value, var_name,
           var_label, value_type,value_clean, source)

df_longitudinal |>  count(var_name)
var_name n
LEALE__LEMEDIAN_L1_v1.0 1464
infant_mortality_rate 6889
n_births__BTH_v1 31833
n_infant_deaths__DTH_v3 7121
n_total_deaths__DTH_v3 7121
pct_elderly 7672
pct_urban 2960
pct_violent_death 7121
pct_young 7672

EDA

Let’s check

Summary Metrics

These include SALURBAL wide and country specific summary metrics. Let’s first compile.