# SALURBAL Google Analytics Ingestions

This notebook ingests Google Analytics data for our data warehouse. We will leverage the googleAnalyticsR package.

## Properties

```{r}
#| code-summary: "Connect to Google Analytics"
#| echo: false

## Packages assumed from the functions used in this notebook
## (clean_names -> janitor, write_parquet -> arrow, verify -> assertr)
library(googleAnalyticsR)
library(tidyverse)
library(janitor)
library(arrow)
library(assertr)
library(reactable)

ga_auth(email = "ranli2011@gmail.com")
# ga_auth()
# account_df <- ga_accounts()
# account_df$id
```

Let's get a list of all of our Google Analytics accounts, focusing on SALURBAL.

```{r}
## All SALURBAL Properties
salurbal_ga = ga_account_list("ga4") %>%
  filter(str_detect(property_name, "salurbal"))

## Portal Property
ga_portal = salurbal_ga %>%
  filter(property_name == 'salurbal-data-portal')
property_id = ga_portal$propertyId

salurbal_ga
```
Here is a table of the dimensions and metrics available for us to analyze.
```{r}
df_metrics_available = ga_meta(
  version = c("data"),
  propertyId = property_id,
  cached = TRUE,
  no_api = FALSE
) %>%
  as_tibble()

df_metrics_available %>%
  select(type, name = uiName, description) %>%
  arrange(desc(type), name) %>%
  reactable(searchable = TRUE)
```
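Before querying, it can be worth confirming that the metric API names used later in this notebook actually exist for this property. A minimal sketch, assuming the `ga_meta()` output includes an `apiName` column (this check is not part of the original pipeline):

```{r}
# Optional check (assumption: ga_meta() output has an apiName column):
# confirm the metric API names queried below exist for this property.
metrics_used = c("sessions", "newUsers", "activeUsers", "screenPageViews",
                 "bounceRate", "averageSessionDuration", "eventsPerSession")
all(metrics_used %in% df_metrics_available$apiName)
```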
## Portal

Let's get some basic monthly information about the SALURBAL portal and see what is available for this property.

### Download

#### Daily

Let's just get basic metrics.
```{r}
if (!file.exists('metadata_cache/df_daily.parquet')) {
  ## start_date / end_date are assumed to be defined earlier in the notebook
  df_daily = ga_data(
    property_id,
    metrics = c(
      "sessions",
      "newUsers",
      "activeUsers",
      "screenPageViews",
      "bounceRate",
      "averageSessionDuration",
      "eventsPerSession"
    ),
    dimensions = c('date'),
    date_range = c(start_date, end_date),
    limit = -1
  ) %>%
    arrange(desc(date))

  ## Reassign so the in-memory copy matches the cleaned, frozen copy
  df_daily = df_daily %>% clean_names()
  df_daily %>% write_parquet("metadata_cache/df_daily.parquet")

  list(
    name = 'df_daily',
    nrow = nrow(df_daily),
    ncol = ncol(df_daily),
    date_frozen = Sys.Date()
  ) |>
    jsonlite::write_json("metadata_cache/df_daily_metadata.json", auto_unbox = TRUE, pretty = TRUE)
} else {
  df_daily = read_parquet("metadata_cache/df_daily.parquet")
}
```

```{r}
df_daily %>%
  reactable()
```
#### Daily by Country
```{r}
if (!file.exists('metadata_cache/df_daily_country.parquet')) {
  df_daily_country = ga_data(
    property_id,
    metrics = c(
      "sessions",
      "newUsers",
      "activeUsers",
      "screenPageViews",
      "bounceRate",
      "averageSessionDuration",
      "eventsPerSession"
    ),
    dimensions = c('date', 'country'),
    date_range = c(start_date, end_date),
    limit = -1
  ) %>%
    arrange(desc(activeUsers))

  ## Reassign so the in-memory copy matches the cleaned, frozen copy
  df_daily_country = df_daily_country %>% clean_names()
  df_daily_country %>% write_parquet("metadata_cache/df_daily_country.parquet")

  list(
    name = 'df_daily_country',
    nrow = nrow(df_daily_country),
    ncol = ncol(df_daily_country),
    date_frozen = Sys.Date()
  ) |>
    jsonlite::write_json("metadata_cache/df_daily_country_metadata.json", auto_unbox = TRUE, pretty = TRUE)
} else {
  df_daily_country = read_parquet("metadata_cache/df_daily_country.parquet")
}
```

```{r}
df_daily_country %>%
  reactable()
```
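Both chunks above repeat the same freeze-to-parquet pattern: fetch once, clean the names, write the parquet plus a small metadata JSON, then read from the cache on subsequent runs. A minimal sketch of how that could be factored out (the helper name and signature are hypothetical, not part of this notebook):

```{r}
#| eval: false
# Hypothetical helper (not in the original notebook): factors out the
# freeze-to-parquet caching pattern used by the two chunks above.
fetch_or_read_cached = function(name, fetch_fn) {
  path = glue::glue("metadata_cache/{name}.parquet")
  if (!file.exists(path)) {
    df = fetch_fn() %>% clean_names()
    df %>% write_parquet(path)
    list(name = name, nrow = nrow(df), ncol = ncol(df), date_frozen = Sys.Date()) |>
      jsonlite::write_json(glue::glue("metadata_cache/{name}_metadata.json"),
                           auto_unbox = TRUE, pretty = TRUE)
    df
  } else {
    read_parquet(path)
  }
}
```

Each frozen dataset would then be a one-liner, e.g. `df_daily = fetch_or_read_cached('df_daily', \() ga_data(...))`.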
#### Validate complete data: Country vs Unstratified

Here we compare the unstratified daily data to the daily-by-country data summed up by day. I assume there might be some differences because not every user will have country data (blocked cookies, VPNs, etc.), so there is value in keeping both datasets.
```{r}
df_daily_aggregated = df_daily_country %>%
  summarise(new_users = sum(new_users), .by = c('date')) |>
  arrange(date)

df_qc = df_daily |>
  select(date, new_users) |>
  left_join(df_daily_aggregated, by = 'date', suffix = c('_daily', '_daily_country')) |>
  mutate(diff = new_users_daily - new_users_daily_country) |>
  arrange(desc(diff)) |>
  ## Validation
  verify(diff == 0)
```
Looks good. Our daily country data sums up to the unstratified daily data, so we can utilize these datasets interchangeably without worrying about dropped users.
### Metrics

#### New users by Month

This is the one metric we have on the usage dashboard. Let's summarize new users by month.
```{r}
df_monthly_new_users = df_daily %>%
  select(date, new_users) %>%
  mutate(year = year(date),
         month = month(date),
         year_month_str = glue::glue("{year}-{month}")) %>%
  group_by(year_month_str) %>%
  summarise(new_users = sum(new_users)) %>%
  ungroup() %>%
  mutate(date = ymd(glue::glue("{year_month_str}-01"))) %>%
  ## Format the display name as "Month, Year"
  mutate(Name = format(date, "%B, %Y")) %>%
  arrange(desc(date)) %>%
  select(Name, `New Users` = new_users, Date = date)

df_monthly_new_users %>%
  write_csv("metadata_cache/df_monthly_new_users.csv")

df_monthly_new_users %>%
  jsonlite::write_json("metadata_cache/df_monthly_new_users.json", auto_unbox = TRUE, pretty = TRUE)
```
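As an optional sanity check (not part of the original pipeline), the frozen JSON can be read back to confirm it round-trips cleanly:

```{r}
# Optional sanity check (not in the original pipeline): read the frozen JSON
# back and confirm it parses as expected.
jsonlite::read_json("metadata_cache/df_monthly_new_users.json",
                    simplifyVector = TRUE) %>%
  head()
```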
Let's preview this.

```{r}
df_monthly_new_users %>%
  reactable()
```
#### New users by Month and Country
```{r}
df_monthly_new_users_country = df_daily_country %>%
  mutate(iso2_group = case_when(
    country %in% c("Brazil", "Brasil") ~ "BR",
    country %in% c("Colombia") ~ "CO",
    country %in% c("Mexico", "México") ~ "MX",
    country %in% c("Argentina") ~ "AR",
    country %in% c("Chile") ~ "CL",
    country %in% c("Peru", "Perú") ~ "PE",
    country %in% c("Ecuador") ~ "EC",
    country %in% c("Guatemala") ~ "GT",
    country %in% c("Panama", "Panamá") ~ "PA",
    country %in% c("Honduras") ~ "HN",
    country %in% c("Nicaragua") ~ "NI",
    country %in% c("Costa Rica") ~ "CR",
    country %in% c("El Salvador") ~ "SV",
    country %in% c("United States") ~ "US",
    TRUE ~ "Other"
  )) |>
  select(date, iso2_group, new_users) %>%
  mutate(year = year(date),
         month = month(date),
         year_month_str = glue::glue("{year}-{month}")) %>%
  group_by(year_month_str, iso2_group) %>%
  summarise(new_users = sum(new_users), .groups = 'drop') %>%
  mutate(date = ymd(glue::glue("{year_month_str}-01"))) %>%
  mutate(month_name = format(date, "%B, %Y")) %>%
  arrange(desc(date), desc(new_users)) %>%
  select(Month = month_name, Country = iso2_group, `New Users` = new_users, Date = date) |>
  verify(not_na(Country))

df_monthly_new_users_country %>%
  write_csv("metadata_cache/df_monthly_new_users_country.csv")

df_monthly_new_users_country %>%
  jsonlite::write_json("metadata_cache/df_monthly_new_users_country.json", auto_unbox = TRUE, pretty = TRUE)
```
Let's preview.

```{r}
df_monthly_new_users_country %>%
  reactable()
```