SALURBAL Google Analytics Ingestions

Author

Ran Li (Maintainer)

Published

October 1, 2025

This notebook ingests Google Analytics data for our data warehouse. We will leverage the googleAnalyticsR package.
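
The chunks below assume the required packages are already attached and that a reporting date range (start_date, end_date) has been defined. Here is a minimal setup sketch; the package list and the date range are assumptions and should be adjusted to the actual reporting window.

```{r}
#| code-summary: "Setup (assumed packages and date range)"
## Packages used throughout this notebook
library(googleAnalyticsR)  # Google Analytics Data API client
library(dplyr)             # data manipulation
library(stringr)           # string helpers (str_detect)
library(lubridate)         # date helpers (ymd, year, month)
library(janitor)           # clean_names()
library(arrow)             # read_parquet() / write_parquet()
library(readr)             # write_csv()
library(reactable)         # interactive tables
library(assertr)           # verify() / not_na() checks

## Reporting window passed to ga_data() below (illustrative values)
start_date = "2023-01-01"
end_date = format(Sys.Date(), "%Y-%m-%d")
```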

Properties

```{r}
#| code-summary: "Connect to Google Analytics"
#| echo: false

## Authenticate with the Google account that owns the SALURBAL properties
ga_auth(email = "ranli2011@gmail.com")

## Interactive exploration (run manually to list available accounts):
# ga_auth()
# account_df <- ga_accounts()
# account_df$id
```

Let's get a list of all of our Google Analytics accounts, focusing on SALURBAL.

```{r}
## All SALURBAL Properties
salurbal_ga = ga_account_list("ga4") %>% 
  filter(str_detect(property_name, "salurbal"))

## Portal Property
ga_portal = salurbal_ga %>% 
  filter(property_name == 'salurbal-data-portal')
property_id = ga_portal$propertyId

salurbal_ga
```

Here is a table of the dimensions and metrics available for us to analyze.

```{r}
df_metrics_available = ga_meta(
  version = c("data"),
  propertyId = property_id,
  cached = TRUE,
  no_api = FALSE
) %>% 
  as_tibble()

df_metrics_available %>% 
  select(type, name = uiName, description) %>% 
  arrange(desc(type), name) %>% 
  reactable(searchable = TRUE)
```
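
For a quicker lookup than the searchable table, the metadata can also be filtered directly. Here is a small sketch, assuming we want the fields whose UI name mentions users.

```{r}
## Quick lookup: fields whose UI name mentions "user"
df_metrics_available %>% 
  filter(str_detect(str_to_lower(uiName), "user")) %>% 
  select(type, uiName, description)
```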

Portal

Let's get some basic monthly information about the SALURBAL portal and see what is available for this property.

Download

Daily

Let’s just get basic metrics.

```{r}
if (!file.exists('metadata_cache/df_daily.parquet')){
  ## Pull daily metrics from the Data API, then standardize column names
  df_daily = ga_data(
    property_id,
    metrics = c(
      "sessions", 
      "newUsers",
      "activeUsers", 
      "screenPageViews",
      "bounceRate",
      "averageSessionDuration",
      "eventsPerSession"
    ),
    dimensions = c('date'),
    date_range = c(start_date, end_date),
    limit = -1
  ) %>% 
    arrange(desc(date)) %>% 
    clean_names()
  
  df_daily %>% 
    write_parquet("metadata_cache/df_daily.parquet")

  list(
    name = 'df_daily',
    nrow = nrow(df_daily),
    ncol = ncol(df_daily),
    date_frozen = Sys.Date()
  ) |> 
    jsonlite::write_json("metadata_cache/df_daily_metadata.json", auto_unbox = TRUE, pretty = TRUE)
} else {
  df_daily = read_parquet("metadata_cache/df_daily.parquet")
}
```

```{r}
df_daily %>% 
  reactable()
```

Daily by Country

```{r}
if (!file.exists('metadata_cache/df_daily_country.parquet')){
  
  ## Pull the same daily metrics stratified by country
  df_daily_country = ga_data(
    property_id,
    metrics = c(
      "sessions", 
      "newUsers",
      "activeUsers", 
      "screenPageViews",
      "bounceRate",
      "averageSessionDuration",
      "eventsPerSession"
    ),
    dimensions = c('date','country'),
    date_range = c(start_date, end_date),
    limit = -1
  ) %>%
    arrange(desc(activeUsers)) %>%
    clean_names()
  
  df_daily_country %>%
    write_parquet("metadata_cache/df_daily_country.parquet")

  list(
    name = 'df_daily_country',
    nrow = nrow(df_daily_country),
    ncol = ncol(df_daily_country),
    date_frozen = Sys.Date()
  ) |> 
    jsonlite::write_json("metadata_cache/df_daily_country_metadata.json", auto_unbox = TRUE, pretty = TRUE)

} else {
  df_daily_country = read_parquet("metadata_cache/df_daily_country.parquet")
}
```

```{r}
df_daily_country %>%
  reactable()
```

Validate complete data: Country vs Unstratified

Here we compare the unstratified daily data against the daily-by-country data summed up by day. I assume there might be some differences because not every user will have country data (blocked cookies, VPNs, etc.), so there is value in keeping both datasets.

```{r}
df_daily_aggregated = df_daily_country %>%
  summarise(new_users = sum(new_users), .by = c('date')) |> 
  arrange(date)

df_qc = df_daily |> 
  select(date, new_users) |> 
  left_join(df_daily_aggregated, by = 'date', suffix = c('_daily', '_daily_country')) |> 
  mutate(diff = new_users_daily - new_users_daily_country) |> 
  arrange(desc(diff)) |> 
  ## Validation: daily totals must match the country-stratified sums exactly
  verify(diff == 0)
```

Looks good. Our daily country data sums up to the unstratified daily data, so we can use these datasets interchangeably without worrying about dropped users.

Metrics

New users by Month

This is the one metric we have on the usage dashboard. Let’s summarize new users by month.

```{r}
df_monthly_new_users = df_daily %>% 
  select(date, new_users) %>% 
  mutate(year = year(date),
         month = month(date),
         year_month_str = glue::glue("{year}-{month}")) %>% 
  group_by(year_month_str) %>%
  summarise(new_users = sum(new_users)) %>%
  ungroup() %>% 
  mutate(date = ymd(glue::glue("{year_month_str}-01"))) %>% 
  ## Format date as "Month, YYYY"
  mutate(Name = format(date, "%B, %Y")) %>% 
  arrange(desc(date)) %>% 
  select(Name, `New Users` = new_users, Date = date)

df_monthly_new_users %>% 
  write_csv("metadata_cache/df_monthly_new_users.csv")

df_monthly_new_users %>% 
  jsonlite::write_json("metadata_cache/df_monthly_new_users.json", auto_unbox = TRUE, pretty = TRUE)
```

Let’s preview this

```{r}
df_monthly_new_users %>% 
  reactable()
```
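
Since these exports are what downstream consumers (for example, the usage dashboard) would read, a quick sanity check is to read the JSON back and confirm it round-trips. This is a minimal sketch, not the dashboard's actual loading code.

```{r}
## Read the JSON export back and confirm the monthly totals round-trip
df_check = jsonlite::read_json(
  "metadata_cache/df_monthly_new_users.json",
  simplifyVector = TRUE
) %>% 
  as_tibble()

stopifnot(sum(df_check$`New Users`) == sum(df_monthly_new_users$`New Users`))
```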

New users by Month and Country

```{r}
df_monthly_new_users_country = df_daily_country %>% 
  ## Map country names (including Spanish/Portuguese spellings) to ISO2 codes
  mutate(iso2_group = case_when(
    country %in% c("Brazil", "Brasil") ~ "BR",
    country %in% c("Colombia") ~ "CO",
    country %in% c("Mexico", "México") ~ "MX",
    country %in% c("Argentina") ~ "AR",
    country %in% c("Chile") ~ "CL",
    country %in% c("Peru", "Perú") ~ "PE",
    country %in% c("Ecuador") ~ "EC",
    country %in% c("Guatemala") ~ "GT",
    country %in% c("Panama", "Panamá") ~ "PA",
    country %in% c("Honduras") ~ "HN",
    country %in% c("Nicaragua") ~ "NI",
    country %in% c("Costa Rica") ~ "CR",
    country %in% c("El Salvador") ~ "SV",
    country %in% c("United States") ~ "US",
    TRUE ~ "Other"
  )) |> 
  select(date, iso2_group, new_users) %>% 
  mutate(year = year(date),
         month = month(date),
         year_month_str = glue::glue("{year}-{month}")) %>% 
  group_by(year_month_str, iso2_group) %>%
  summarise(new_users = sum(new_users), .groups = 'drop') %>%
  mutate(date = ymd(glue::glue("{year_month_str}-01"))) %>% 
  mutate(month_name = format(date, "%B, %Y")) %>% 
  arrange(desc(date), desc(new_users)) %>% 
  select(Month = month_name, Country = iso2_group, `New Users` = new_users, Date = date) |> 
  verify(not_na(Country))
```

```{r}
df_monthly_new_users_country %>% 
  write_csv("metadata_cache/df_monthly_new_users_country.csv")

df_monthly_new_users_country %>% 
  jsonlite::write_json("metadata_cache/df_monthly_new_users_country.json", auto_unbox = TRUE, pretty = TRUE)
```

Let’s Preview

```{r}
df_monthly_new_users_country %>% 
  reactable()
```