# SALURBAL Google Analytics Ingestions

This notebook ingests Google Analytics data for our data warehouse. We will leverage the googleAnalyticsR package.
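The chunks below assume the following packages are attached. This setup chunk is a sketch added for completeness; in particular, `start_date` and `end_date` are referenced later without being defined in this section, so the values here are placeholders only.

```{r}
#| code-summary: "Setup (sketch)"
library(googleAnalyticsR)  # GA4 client: ga_auth(), ga_account_list(), ga_meta(), ga_data()
library(tidyverse)         # dplyr, stringr, lubridate, readr verbs used throughout
library(janitor)           # clean_names()
library(arrow)             # read_parquet(), write_parquet()
library(assertr)           # verify(), not_na()
library(reactable)         # interactive tables

## Placeholder reporting window (hypothetical values; the real window is
## defined outside this section)
start_date = "2023-01-01"
end_date   = Sys.Date()
```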
## Properties
```{r}
#| code-summary: "Connect to Google Analytics"
#| echo: false
ga_auth(email="ranli2011@gmail.com")
# ga_auth()
# account_df <- ga_accounts()
# account_df$id
```

Let's get a list of all of our Google Analytics accounts, focusing on SALURBAL.

```{r}
## All SALURBAL Properties
salurbal_ga = ga_account_list("ga4") %>%
  filter(str_detect(property_name, "salurbal"))

## Portal Property
ga_portal = salurbal_ga %>%
  filter(property_name == 'salurbal-data-portal')
property_id = ga_portal$propertyId

salurbal_ga
```
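Note that `property_id` assumes the filter matched exactly one property; a defensive check (a sketch, not in the original notebook) could fail fast otherwise:

```{r}
#| eval: false
## Sketch: stop if the portal filter matched zero or multiple properties
stopifnot(nrow(ga_portal) == 1)
```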
Here is a table of the dimensions and metrics available for us to analyze.
```{r}
df_metrics_available = ga_meta(
  version = c("data"),
  propertyId = property_id,
  cached = TRUE,
  no_api = FALSE
) %>%
  as_tibble()

df_metrics_available %>%
  select(type, name = uiName, description) %>%
  arrange(desc(type), name) %>%
  reactable(searchable = TRUE)
```
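The same metadata can also be filtered programmatically; for example, a sketch (not part of the original pipeline) that pulls out every session-related field:

```{r}
#| eval: false
## Sketch: find metadata rows whose UI name mentions "session"
df_metrics_available %>%
  filter(str_detect(tolower(uiName), "session")) %>%
  select(type, uiName, description)
```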
## Portal

Let's get some basic monthly information about the SALURBAL portal, and see what is available for this property.
### Download

#### Daily
Let’s just get basic metrics.
```{r}
if (!file.exists('metadata_cache/df_daily.parquet')) {
  df_daily = ga_data(
    property_id,
    metrics = c(
      "sessions",
      "newUsers",
      "activeUsers",
      "screenPageViews",
      "bounceRate",
      "averageSessionDuration",
      "eventsPerSession"
    ),
    dimensions = c('date'),
    date_range = c(start_date, end_date),
    limit = -1
  ) %>%
    arrange(desc(date))

  ## Standardize to snake_case up front so a fresh pull matches the cached schema
  df_daily = df_daily %>%
    clean_names()

  df_daily %>%
    write_parquet("metadata_cache/df_daily.parquet")

  list(
    name = 'df_daily',
    nrow = nrow(df_daily),
    ncol = ncol(df_daily),
    date_frozen = Sys.Date()
  ) |>
    jsonlite::write_json("metadata_cache/df_daily_metadata.json", auto_unbox = TRUE, pretty = TRUE)
} else {
  df_daily = read_parquet("metadata_cache/df_daily.parquet")
}
```
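This fetch-once-then-freeze pattern repeats for every pull below, so it could be factored into a helper. A minimal sketch, assuming the same cache layout (`fetch_or_cache` is a hypothetical function, not part of the notebook):

```{r}
#| eval: false
## Sketch: generic freeze/cache wrapper. fetch_fn is a zero-argument
## function that pulls fresh data from the API.
fetch_or_cache = function(name, fetch_fn, cache_dir = "metadata_cache") {
  path = file.path(cache_dir, paste0(name, ".parquet"))
  if (!file.exists(path)) {
    df = fetch_fn() %>% clean_names()
    write_parquet(df, path)
    list(name = name, nrow = nrow(df), ncol = ncol(df), date_frozen = Sys.Date()) |>
      jsonlite::write_json(
        file.path(cache_dir, paste0(name, "_metadata.json")),
        auto_unbox = TRUE,
        pretty = TRUE
      )
    df
  } else {
    read_parquet(path)
  }
}
```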
```{r}
df_daily %>%
  reactable()
```

#### Daily by Country
```{r}
if (!file.exists('metadata_cache/df_daily_country.parquet')) {
  df_daily_country = ga_data(
    property_id,
    metrics = c(
      "sessions",
      "newUsers",
      "activeUsers",
      "screenPageViews",
      "bounceRate",
      "averageSessionDuration",
      "eventsPerSession"
    ),
    dimensions = c('date', 'country'),
    date_range = c(start_date, end_date),
    limit = -1
  ) %>%
    arrange(desc(activeUsers))

  ## Standardize to snake_case up front so a fresh pull matches the cached schema
  df_daily_country = df_daily_country %>%
    clean_names()

  df_daily_country %>%
    write_parquet("metadata_cache/df_daily_country.parquet")

  list(
    name = 'df_daily_country',
    nrow = nrow(df_daily_country),
    ncol = ncol(df_daily_country),
    date_frozen = Sys.Date()
  ) |>
    jsonlite::write_json("metadata_cache/df_daily_country_metadata.json", auto_unbox = TRUE, pretty = TRUE)
} else {
  df_daily_country = read_parquet("metadata_cache/df_daily_country.parquet")
}
```
```{r}
df_daily_country %>%
  reactable()
```

#### Validate complete data: Country vs Unstratified

Here we compare the unstratified daily data against the daily-by-country data summed up by day. I assume there might be some differences, because not every user will have country data (blocked cookies, VPNs, etc.), so there is value in keeping both datasets.
```{r}
df_daily_aggregated = df_daily_country %>%
  summarise(new_users = sum(new_users), .by = c('date')) |>
  arrange(date)

df_qc = df_daily |>
  select(date, new_users) |>
  left_join(df_daily_aggregated, by = 'date', suffix = c('_daily', '_daily_country')) |>
  mutate(diff = new_users_daily - new_users_daily_country) |>
  arrange(desc(diff)) |>
  ## Validation
  verify(diff == 0)
```

Looks good. Our daily country data sums up to the unstratified daily data, so we can use these datasets interchangeably without worrying about dropped users.
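If this `verify()` ever fails, the offending days can be inspected by rebuilding the comparison without the terminal assertion (a sketch using the same pipeline as above):

```{r}
#| eval: false
## Sketch: list the days where the stratified sum disagrees with the
## unstratified total
df_daily |>
  select(date, new_users) |>
  left_join(df_daily_aggregated, by = 'date', suffix = c('_daily', '_daily_country')) |>
  mutate(diff = new_users_daily - new_users_daily_country) |>
  filter(diff != 0)
```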
### Metrics

#### New users by Month
This is the one metric we have on the usage dashboard. Let’s summarize new users by month.
```{r}
df_monthly_new_users = df_daily %>%
  select(date, new_users) %>%
  mutate(
    year = year(date),
    month = month(date),
    year_month_str = glue::glue("{year}-{month}")
  ) %>%
  group_by(year_month_str) %>%
  summarise(new_users = sum(new_users)) %>%
  ungroup() %>%
  mutate(date = ymd(glue::glue("{year_month_str}-01"))) %>%
  ## Format date as "Month, Year" (e.g. "January, 2024")
  mutate(Name = format(date, "%B, %Y")) %>%
  arrange(desc(date)) %>%
  select(Name, `New Users` = new_users, Date = date)

df_monthly_new_users %>%
  write_csv("metadata_cache/df_monthly_new_users.csv")

df_monthly_new_users %>%
  jsonlite::write_json("metadata_cache/df_monthly_new_users.json", auto_unbox = TRUE, pretty = TRUE)
```
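As a design note, the `{year}-{month}` string round-trip above can be collapsed into a single `lubridate::floor_date()` call; a minimal equivalent sketch:

```{r}
#| eval: false
## Sketch: monthly rollup via floor_date() instead of building and
## re-parsing a "{year}-{month}" string
df_daily %>%
  mutate(date = floor_date(date, "month")) %>%
  summarise(new_users = sum(new_users), .by = date) %>%
  arrange(desc(date))
```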
Let's preview this.

```{r}
df_monthly_new_users %>%
  reactable()
```

#### New users by Month and Country
```{r}
df_monthly_new_users_country = df_daily_country %>%
  ## Map country names (including Spanish/Portuguese spellings) to ISO2 codes
  mutate(iso2_group = case_when(
    country %in% c("Brazil", "Brasil") ~ "BR",
    country %in% c("Colombia") ~ "CO",
    country %in% c("Mexico", "México") ~ "MX",
    country %in% c("Argentina") ~ "AR",
    country %in% c("Chile") ~ "CL",
    country %in% c("Peru", "Perú") ~ "PE",
    country %in% c("Ecuador") ~ "EC",
    country %in% c("Guatemala") ~ "GT",
    country %in% c("Panama", "Panamá") ~ "PA",
    country %in% c("Honduras") ~ "HN",
    country %in% c("Nicaragua") ~ "NI",
    country %in% c("Costa Rica") ~ "CR",
    country %in% c("El Salvador") ~ "SV",
    country %in% c("United States") ~ "US",
    TRUE ~ "Other"
  )) |>
  select(date, iso2_group, new_users) %>%
  mutate(
    year = year(date),
    month = month(date),
    year_month_str = glue::glue("{year}-{month}")
  ) %>%
  group_by(year_month_str, iso2_group) %>%
  summarise(new_users = sum(new_users), .groups = 'drop') %>%
  mutate(date = ymd(glue::glue("{year_month_str}-01"))) %>%
  mutate(month_name = format(date, "%B, %Y")) %>%
  arrange(desc(date), desc(new_users)) %>%
  select(Month = month_name, Country = iso2_group, `New Users` = new_users, Date = date) |>
  verify(not_na(Country))
```
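The hand-rolled `case_when()` mapping works, but the same lookup could be done with the countrycode package (a sketch, assuming countrycode is installed; unmatched names return `NA`, which we fold into "Other"):

```{r}
#| eval: false
## Sketch: ISO2 lookup via countrycode instead of a manual case_when()
library(countrycode)

df_daily_country %>%
  mutate(
    iso2_group = countrycode(country, origin = "country.name", destination = "iso2c"),
    iso2_group = coalesce(iso2_group, "Other")
  )
```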
```{r}
df_monthly_new_users_country %>%
  write_csv("metadata_cache/df_monthly_new_users_country.csv")

df_monthly_new_users_country %>%
  jsonlite::write_json("metadata_cache/df_monthly_new_users_country.json", auto_unbox = TRUE, pretty = TRUE)
```

Let's preview.
```{r}
df_monthly_new_users_country %>%
  reactable()
```