# SALURBAL Google Analytics Ingestions

This notebook ingests Google Analytics data for our data warehouse. We will leverage the googleAnalyticsR package.
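The chunks below assume the following packages are attached. This setup chunk is a sketch added for completeness; in particular, `start_date` and `end_date` are referenced later without being defined in this section, so the values here are placeholders only.

```{r}
#| code-summary: "Setup (sketch)"
library(googleAnalyticsR)  # GA4 client: ga_auth(), ga_account_list(), ga_meta(), ga_data()
library(tidyverse)         # dplyr, stringr, lubridate, readr verbs used throughout
library(janitor)           # clean_names()
library(arrow)             # read_parquet(), write_parquet()
library(assertr)           # verify(), not_na()
library(reactable)         # interactive tables

## Placeholder reporting window (hypothetical values; the real window is
## defined outside this section)
start_date = "2023-01-01"
end_date   = Sys.Date()
```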
## Properties
```{r}
#| code-summary: "Connect to Google Analytics"
#| echo: false
ga_auth(email="ranli2011@gmail.com")
# ga_auth()
# account_df <- ga_accounts()
# account_df$id
```

Let's get a list of all of our Google Analytics accounts, focusing on SALURBAL.

```{r}
## All SALURBAL Properties
salurbal_ga = ga_account_list("ga4") %>%
  filter(str_detect(property_name, "salurbal"))

## Portal Property
ga_portal = salurbal_ga %>%
  filter(property_name == 'salurbal-data-portal')
property_id = ga_portal$propertyId

salurbal_ga
```
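Note that `property_id` assumes the filter matched exactly one property; a defensive check (a sketch, not in the original notebook) could fail fast otherwise:

```{r}
#| eval: false
## Sketch: stop if the portal filter matched zero or multiple properties
stopifnot(nrow(ga_portal) == 1)
```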
Here is a table of the dimensions and metrics available for us to analyze.
```{r}
df_metrics_available = ga_meta(
  version = c("data"),
  propertyId = property_id,
  cached = TRUE,
  no_api = FALSE
) %>%
  as_tibble()

df_metrics_available %>%
  select(type, name = uiName, description) %>%
  arrange(desc(type), name) %>%
  reactable(searchable = TRUE)
```
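The same metadata can also be filtered programmatically; for example, a sketch (not part of the original pipeline) that pulls out every session-related field:

```{r}
#| eval: false
## Sketch: find metadata rows whose UI name mentions "session"
df_metrics_available %>%
  filter(str_detect(tolower(uiName), "session")) %>%
  select(type, uiName, description)
```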
## Portal

Let's get some basic monthly information about the SALURBAL portal, and see what is available for this property.
### Download

#### Daily
Let’s just get basic metrics.
```{r}
if (!file.exists('metadata_cache/df_daily.parquet')) {
  df_daily = ga_data(
    property_id,
    metrics = c(
      "sessions",
      "newUsers",
      "activeUsers",
      "screenPageViews",
      "bounceRate",
      "averageSessionDuration",
      "eventsPerSession"
    ),
    dimensions = c('date'),
    date_range = c(start_date, end_date),
    limit = -1
  ) %>%
    arrange(desc(date))

  ## Standardize to snake_case up front so a fresh pull matches the cached schema
  df_daily = df_daily %>%
    clean_names()

  df_daily %>%
    write_parquet("metadata_cache/df_daily.parquet")

  list(
    name = 'df_daily',
    nrow = nrow(df_daily),
    ncol = ncol(df_daily),
    date_frozen = Sys.Date()
  ) |>
    jsonlite::write_json("metadata_cache/df_daily_metadata.json", auto_unbox = TRUE, pretty = TRUE)
} else {
  df_daily = read_parquet("metadata_cache/df_daily.parquet")
}
```
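This fetch-once-then-freeze pattern repeats for every pull below, so it could be factored into a helper. A minimal sketch, assuming the same cache layout (`fetch_or_cache` is a hypothetical function, not part of the notebook):

```{r}
#| eval: false
## Sketch: generic freeze/cache wrapper. fetch_fn is a zero-argument
## function that pulls fresh data from the API.
fetch_or_cache = function(name, fetch_fn, cache_dir = "metadata_cache") {
  path = file.path(cache_dir, paste0(name, ".parquet"))
  if (!file.exists(path)) {
    df = fetch_fn() %>% clean_names()
    write_parquet(df, path)
    list(name = name, nrow = nrow(df), ncol = ncol(df), date_frozen = Sys.Date()) |>
      jsonlite::write_json(
        file.path(cache_dir, paste0(name, "_metadata.json")),
        auto_unbox = TRUE,
        pretty = TRUE
      )
    df
  } else {
    read_parquet(path)
  }
}
```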
```{r}
df_daily %>%
  reactable()
```

#### Daily by Country
```{r}
if (!file.exists('metadata_cache/df_daily_country.parquet')) {
  df_daily_country = ga_data(
    property_id,
    metrics = c(
      "sessions",
      "newUsers",
      "activeUsers",
      "screenPageViews",
      "bounceRate",
      "averageSessionDuration",
      "eventsPerSession"
    ),
    dimensions = c('date', 'country'),
    date_range = c(start_date, end_date),
    limit = -1
  ) %>%
    arrange(desc(activeUsers))

  ## Standardize to snake_case up front so a fresh pull matches the cached schema
  df_daily_country = df_daily_country %>%
    clean_names()

  df_daily_country %>%
    write_parquet("metadata_cache/df_daily_country.parquet")

  list(
    name = 'df_daily_country',
    nrow = nrow(df_daily_country),
    ncol = ncol(df_daily_country),
    date_frozen = Sys.Date()
  ) |>
    jsonlite::write_json("metadata_cache/df_daily_country_metadata.json", auto_unbox = TRUE, pretty = TRUE)
} else {
  df_daily_country = read_parquet("metadata_cache/df_daily_country.parquet")
}
```
```{r}
df_daily_country %>%
  reactable()
```

#### Validate complete data: Country vs Unstratified

Here we compare the unstratified daily data against the daily-by-country data summed up by day. I assume there might be some differences, because not every user will have country data (blocked cookies, VPNs, etc.), so there is value in keeping both datasets.
```{r}
df_daily_aggregated = df_daily_country %>%
  summarise(new_users = sum(new_users), .by = c('date')) |>
  arrange(date)

df_qc = df_daily |>
  select(date, new_users) |>
  left_join(df_daily_aggregated, by = 'date', suffix = c('_daily', '_daily_country')) |>
  mutate(diff = new_users_daily - new_users_daily_country) |>
  arrange(desc(diff)) |>
  ## Validation
  verify(diff == 0)
```

Looks good. Our daily country data sums up to the unstratified daily data, so we can use these datasets interchangeably without worrying about dropped users.
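If this `verify()` ever fails, the offending days can be inspected by rebuilding the comparison without the terminal assertion (a sketch using the same pipeline as above):

```{r}
#| eval: false
## Sketch: list the days where the stratified sum disagrees with the
## unstratified total
df_daily |>
  select(date, new_users) |>
  left_join(df_daily_aggregated, by = 'date', suffix = c('_daily', '_daily_country')) |>
  mutate(diff = new_users_daily - new_users_daily_country) |>
  filter(diff != 0)
```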
### Metrics

#### New users by Month
This is the one metric we have on the usage dashboard. Let’s summarize new users by month.
```{r}
df_monthly_new_users = df_daily %>%
  select(date, new_users) %>%
  mutate(
    year = year(date),
    month = month(date),
    year_month_str = glue::glue("{year}-{month}")
  ) %>%
  group_by(year_month_str) %>%
  summarise(new_users = sum(new_users)) %>%
  ungroup() %>%
  mutate(date = ymd(glue::glue("{year_month_str}-01"))) %>%
  ## Format date as "Month, Year" (e.g. "January, 2024")
  mutate(Name = format(date, "%B, %Y")) %>%
  arrange(desc(date)) %>%
  select(Name, `New Users` = new_users, Date = date)

df_monthly_new_users %>%
  write_csv("metadata_cache/df_monthly_new_users.csv")

df_monthly_new_users %>%
  jsonlite::write_json("metadata_cache/df_monthly_new_users.json", auto_unbox = TRUE, pretty = TRUE)
```
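As a design note, the `{year}-{month}` string round-trip above can be collapsed into a single `lubridate::floor_date()` call; a minimal equivalent sketch:

```{r}
#| eval: false
## Sketch: monthly rollup via floor_date() instead of building and
## re-parsing a "{year}-{month}" string
df_daily %>%
  mutate(date = floor_date(date, "month")) %>%
  summarise(new_users = sum(new_users), .by = date) %>%
  arrange(desc(date))
```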
Let's preview this.

```{r}
df_monthly_new_users %>%
  reactable()
```

#### New users by Month and Country
```{r}
df_monthly_new_users_country = df_daily_country %>%
  ## Map country names (including Spanish/Portuguese spellings) to ISO2 codes
  mutate(iso2_group = case_when(
    country %in% c("Brazil", "Brasil") ~ "BR",
    country %in% c("Colombia") ~ "CO",
    country %in% c("Mexico", "México") ~ "MX",
    country %in% c("Argentina") ~ "AR",
    country %in% c("Chile") ~ "CL",
    country %in% c("Peru", "Perú") ~ "PE",
    country %in% c("Ecuador") ~ "EC",
    country %in% c("Guatemala") ~ "GT",
    country %in% c("Panama", "Panamá") ~ "PA",
    country %in% c("Honduras") ~ "HN",
    country %in% c("Nicaragua") ~ "NI",
    country %in% c("Costa Rica") ~ "CR",
    country %in% c("El Salvador") ~ "SV",
    country %in% c("United States") ~ "US",
    TRUE ~ "Other"
  )) |>
  select(date, iso2_group, new_users) %>%
  mutate(
    year = year(date),
    month = month(date),
    year_month_str = glue::glue("{year}-{month}")
  ) %>%
  group_by(year_month_str, iso2_group) %>%
  summarise(new_users = sum(new_users), .groups = 'drop') %>%
  mutate(date = ymd(glue::glue("{year_month_str}-01"))) %>%
  mutate(month_name = format(date, "%B, %Y")) %>%
  arrange(desc(date), desc(new_users)) %>%
  select(Month = month_name, Country = iso2_group, `New Users` = new_users, Date = date) |>
  verify(not_na(Country))
```
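The hand-rolled `case_when()` mapping works, but the same lookup could be done with the countrycode package (a sketch, assuming countrycode is installed; unmatched names return `NA`, which we fold into "Other"):

```{r}
#| eval: false
## Sketch: ISO2 lookup via countrycode instead of a manual case_when()
library(countrycode)

df_daily_country %>%
  mutate(
    iso2_group = countrycode(country, origin = "country.name", destination = "iso2c"),
    iso2_group = coalesce(iso2_group, "Other")
  )
```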
```{r}
df_monthly_new_users_country %>%
  write_csv("metadata_cache/df_monthly_new_users_country.csv")

df_monthly_new_users_country %>%
  jsonlite::write_json("metadata_cache/df_monthly_new_users_country.json", auto_unbox = TRUE, pretty = TRUE)
```

Let's preview.
```{r}
df_monthly_new_users_country %>%
  reactable()
```