## Plotdf_mort_year %>%# Assuming iso2_source is already a column in your data# If not, you might need to create it with mutate firstggplot(aes(x = year, y = n, color = iso2_source)) +geom_line(size =1) +geom_point(size =2.5, alpha =0.7) +# Add text labels at the end of each linegeom_text(data = df_mort_year %>%group_by(iso2_source) %>%filter(year ==max(year)) %>%ungroup(),aes(label = iso2),hjust =-0.3, vjust =0.5,fontface ="bold",size =3.5) +scale_y_log10(labels = scales::comma_format()) +scale_color_brewer(palette ="Set1", name ="Country/Source") +labs(title ="Death Records by Country and Year",subtitle ="Log scale used to show variation across different data sources",x ="Year",y ="Number of Records (log scale)",caption ="Data source: CCUH Aim 2 Harmonized death records" ) +theme_minimal() +theme(plot.title =element_text(face ="bold", size =16),plot.subtitle =element_text(size =12, color ="gray40"),axis.title =element_text(face ="bold"),legend.position ="none",panel.grid.minor =element_blank(),panel.border =element_rect(fill =NA, color ="gray80"),plot.caption =element_text(hjust =0, size =9, color ="gray40"),plot.margin =margin(t =10, r =15, b =10, l =10) ) +scale_x_continuous(breaks =seq(min(df_mort_year$year), max(df_mort_year$year), by =2),limits =c(min(df_mort_year$year), max(df_mort_year$year) +1) # Add space for labels ) +guides(color =guide_legend(override.aes =list(size =2), ncol =1))
Looks okay to me so far.
Age
Missingness in age by country and year
Code
# Analyze missingness in age by country and yearage_missingness <- df_mort %>%group_by(iso2, year) %>%summarize(total_records =n(),missing_age =sum(is.na(dinage)),pct_missing =round(missing_age / total_records *100, 2) ) %>%collect() %>%arrange(year) %>%ungroup() %>%verify(filter(.,iso2!='BR') %>%pull(pct_missing) <3)# Create a table of the resultsage_missingness %>%pivot_wider(id_cols = iso2,names_from = year,values_from = pct_missing )
iso2
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
US
0.1
0.07
0.16
0.13
0.14
0.15
0.1
0.12
0.14
0.11
0.13
4e-02
1e-02
1e-02
4e-02
4e-02
5e-02
7e-02
5e-02
5e-02
0.02
0.03
0.02
BR
NA
NA
NA
100.00
100.00
100.00
100.0
100.00
100.00
100.00
100.00
1e+02
1e+02
1e+02
1e+02
1e+02
1e+02
1e+02
1e+02
1e+02
NA
NA
NA
GT
NA
NA
NA
NA
NA
NA
NA
NA
NA
0.00
0.00
0e+00
0e+00
0e+00
0e+00
0e+00
0e+00
0e+00
0e+00
0e+00
0.00
NA
NA
PA
NA
NA
NA
NA
NA
NA
NA
NA
NA
NA
NA
NA
0e+00
0e+00
0e+00
0e+00
0e+00
0e+00
0e+00
NA
0.00
NA
NA
BR does nto ahve ‘DINAGE’ or age of death in years
other ocuntires this has minimal < 1% missingness
Let’s do a EDA plot on DINAGE.
Code
# Create stacked bar charts for death distribution by age# First, prepare the data by age group and year# Create age groups based on dinageage_distribution <- df_mort %>%mutate(# Create age group labels based on dinageage_group =case_when(is.na(dinage) ~"Unknown", dinage <5~"0-4", dinage <15~"5-14", dinage <25~"15-24", dinage <45~"25-44", dinage <65~"45-64", dinage <85~"65-84",TRUE~"85+" ),# Convert year to a standard numeric formatyear =as.numeric(year) ) %>%group_by(iso2, year, age_group) %>%summarize(deaths =n()) %>%collect() %>%# Make sure age groups are ordered correctly for the visualizationmutate(age_group =factor(age_group, levels =c("0-4", "5-14", "15-24", "25-44", "45-64", "65-84", "85+", "Unknown")))# Calculate percentage distributionage_pct_distribution <- age_distribution %>%group_by(iso2, year) %>%mutate(total_deaths =sum(deaths),pct_deaths = deaths / total_deaths *100 ) %>%ungroup()# 100% stacked bar chart showing proportionsggplot(age_pct_distribution, aes(x = year, y = pct_deaths, fill = age_group)) +geom_col(position ="fill") +facet_wrap(~ iso2, ncol =1) +scale_fill_brewer(palette ="RdYlBu", name ="Age Group", direction =-1) +scale_y_continuous(labels = scales::percent_format(scale =1)) +labs(title ="Proportional Distribution of Deaths by Age Group and Year",subtitle ="100% stacked bars (based on dinage)",x ="Year",y ="Percentage of Deaths" ) +theme_minimal() +theme(plot.title =element_text(face ="bold"),axis.text.x =element_text(angle =45, hjust =1),strip.text =element_text(face ="bold", size =12),legend.position ="right" )
Makes sense. Some interesting insights:
infnat/child mortality in GT and PA are decreasing? Thats good right?
Age 5 year categories
Brazil does nto have age in years
Brazil only has 5 year age cats
Code
# Analyze missingness in age by country and yearage_cat_missingness <- df_mort %>%group_by(iso2, year) %>%summarize(total_records =n(),missing_age =sum(is.na(minage5c)),pct_missing =round(missing_age / total_records *100, 2) ) %>%collect() %>%arrange(year) %>%ungroup()# Create a table of the resultsage_cat_missingness %>%pivot_wider(id_cols = iso2,names_from = year,values_from = pct_missing )
iso2
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
US
0.1
0.07
0.16
0.13
0.14
0.15
0.10
0.12
0.14
0.11
0.13
0.04
0.01
0.01
0.04
0.04
0.05
0.07
0.05
0.05
0.02
0.03
0.02
BR
NA
NA
NA
2.46
2.62
0.10
2.29
2.16
2.16
2.15
2.05
2.14
2.30
2.20
2.23
2.25
2.01
2.11
2.06
1.98
NA
NA
NA
GT
NA
NA
NA
NA
NA
NA
NA
NA
NA
0.00
0.00
0.00
0.00
0.00
0.00
0.00
0.00
0.00
0.00
0.00
0.00
NA
NA
PA
NA
NA
NA
NA
NA
NA
NA
NA
NA
NA
NA
NA
0.00
0.00
0.00
0.00
0.00
0.00
0.00
NA
0.00
NA
NA
Code
# Visualize missingnessggplot(age_cat_missingness, aes(x = year, y = pct_missing, color = iso2)) +geom_line(size =1) +geom_point(size =2.5) +facet_wrap(~ iso2, ncol =2) +labs(title ="Percentage of Missing Age Values by Country and Year",x ="Year",y ="Percentage Missing (%)" ) +theme_minimal() +scale_color_brewer(palette ="Set1") +theme(legend.position ="none",panel.grid.minor =element_blank(),plot.title =element_text(face ="bold"),strip.text =element_text(face ="bold", size =12) ) +ylim(0,100)
5 year age categories is available in all datasets
there is < 3 % missingness of this in BR and < 1% in all other countries
Let’s jsut check the the age categories are consisnte across the harmozianitons.
Code
# EDA plot for minage5c with three age groups# Create simplified age groups based on minage5cage_distribution_three_groups <- df_mort %>%mutate(# Create three age group categoriesage_group =case_when(is.na(minage5c) ~"Unknown", minage5c <5~"0-4", minage5c >=65~"65+",TRUE~"5-64" ),# Convert year to a standard numeric formatyear =as.numeric(year) ) %>%group_by(iso2, year, age_group) %>%summarize(deaths =n()) %>%collect() %>%# Make sure age groups are ordered correctly for the visualizationmutate(age_group =factor(age_group, levels =c("0-4", "5-64", "65+", "Unknown")))# Calculate percentage distributionage_pct_three_groups <- age_distribution_three_groups %>%group_by(iso2, year) %>%mutate(total_deaths =sum(deaths),pct_deaths = deaths / total_deaths *100 ) %>%ungroup()# 100% stacked bar chart showing proportionsggplot(age_pct_three_groups, aes(x = year, y = pct_deaths, fill = age_group)) +geom_col(position ="fill") +facet_wrap(~ iso2, ncol =2) +scale_fill_manual(values =c("0-4"="#2c7bb6", "5-64"="#ffffbf", "65+"="#d7191c", "Unknown"="#cccccc"),name ="Age Group" ) +scale_y_continuous(labels = scales::percent_format(scale =1)) +labs(title ="Proportional Distribution of Deaths by Age Group and Year",subtitle ="Showing three main age groups: 0-4, 5-64, and 65+",x ="Year",y ="Percentage of Deaths" ) +theme_minimal() +theme(plot.title =element_text(face ="bold"),axis.text.x =element_text(angle =45, hjust =1),strip.text =element_text(face ="bold", size =12),legend.position ="right" )
Looks sensible.
Most deaths are older grousp in all counrites
In GT and PA there are relativel ymore 0-4 deaths
2. Death Counts
This is a translation of legacy code into our datawarehouse. The legacy logic from Derek’s :
Here we translate this logic into the harmonized Aim 2 death records set. THe rationale: docueminetn lineage for transaprency and 2) so this can be reused across manuscripts. So the deliverable here is
daily counts at nbhd level
Dates without any deaths should be filled with NA or 0.
two datasets: 1) total 2) stratafied by age>65
These were done in SQL models. The logic was not too complex to warrant a python model or excesively complex to require a dplyr notebook. Below we just quality check the results.
df_counts_dplyr %>%rename(death = death_r) %>%left_join(df_schema) %>%summarize(death =sum(death), .by =c(date, iso2, city_name)) %>%ggplot(aes(x = date, y = death, group = city_name, color = city_name))+geom_line() +facet_wrap(~iso2)+labs(title ='Something is off with Belo Horizonte numbers',subtitle ="City level death trends from nbhd sums for CCUH Cities")
To check with Usama about Belo Horizonte numbers. Lets dial in on neighborhood level for Belo Horizonte. First let’s see what year had that drop
# Visualize distribution by countryggplot(neighborhood_stats, aes(x = iso2, y = mean_daily_deaths)) +geom_boxplot() +labs(title ="Mean Neighbhborhood-level Daily Deaths by Country",x ="Country", y ="Mean Daily Deaths") +theme_minimal()
Lets also look at trends
Code
df_counts %>%left_join(df_schema) %>%summarize(death =sum(death), .by =c(date, iso2, city_name)) %>%ggplot(aes(x = date, y = death, group = city_name, color = city_name))+geom_line() +facet_wrap(~iso2)+labs(title ='Something is off with Belo Horizonte numbers',subtitle ="City level death trends from nbhd sums for CCUH Cities")
To check with Usama about Belo Horizonte numbers.
3. Death Counts by Age
Here we do death counts stratafied by two age groups: 1) <65 and then 65+. Note htat there are some recrods missing age (<0.1% in USA recrods and ~ 2% in Brazil records); These records were removed from this tabulation.
Develop
import polars as plimport datetimedf_mort = pl.scan_parquet("//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/harmonized__ccuh_aim_2__death_records.parquet")df_mort_age = (df_mort .with_columns( pl.when(pl.col("minage5c") <=64) .then(pl.lit('<65')) # Add pl.lit() here .otherwise(pl.lit('65+')) # Add pl.lit() here .alias("age_category") ))df_schema = pl.scan_parquet("//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/base__ccuh__aim2_city_subcity_nbhd__schema.parquet").collect()
Let’s do the transformation; first getting some intermediates.
# Visualize distribution by countryggplot(neighborhood_stats, aes(x = iso2, y = mean_daily_deaths, fill = age_category)) +geom_boxplot(position =position_dodge(width =0.8), outlier.alpha =0.5) +scale_fill_brewer(palette ="Set1", name ="Age Group") +labs(title ="Mean Daily Deaths by Country and Age Group",x ="Country", y ="Mean Daily Deaths") +theme_minimal() +theme(legend.position ="bottom") +scale_y_continuous(trans ="log10", # Log scale to better show distributionlabels = scales::number_format(accuracy =0.001) ) +geom_hline(yintercept =0.1, linetype ="dashed", color ="gray50", alpha =0.7)
---title: "ccuh_aim2_mortality__v1"author: - name: Ran Li (Maintainer) orcid: 0000-0002-4699-4755date: last-modifiedformat: html: toc: true self-contained: true code-fold: true df-print: kable code-tools: true comments: hypothesis: theme: cleaneditor: sourceexecute: warning: false message: false eval: trueeditor_options: chunk_output_type: console---```{r}#| code-summary: "Setup"#| code-fold: truepacman::p_load( tidyverse, arrow, glue, cli, janitor, reactable, sf, geoarrow, rmapshaper, leaflet, here, assertr, assertthat)library(reticulate)use_virtualenv(here::here('.venv'))py_config()py_list_packages()py_module_available("polars")py_module_available("numpy")context =lst(dbt_server ="//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0",schema =file.path(dbt_server,'base__ccuh__aim2_city_subcity_nbhd__schema.parquet'),records =file.path(dbt_server,'harmonized__ccuh_aim_2__death_records.parquet'),counts =file.path(dbt_server,'harmonized__ccuh_aim_2__nbhd_death_counts.parquet'),counts_age =file.path(dbt_server,'harmonized__ccuh_aim_2__nbhd_death_counts__age_grp_65.parquet'),)## Schemadf_schema = context$schema %>%open_dataset() %>%collect()```# 1. Death recordsThese death records are harmonized for CCUH Aim 2; details and documentation can be found at [CCUH Aim 2 Harmonized death records (Harmonized Dataset)](https://www.notion.so/drexel-ccuh/CCUH-Aim-2-Harmonized-death-records-Harmonized-Dataset-1a457008e885803ca9e5cd2997558357). Here we just do some exploratory analysis to understand the scope of the data and do preliminary QC.```{r}#| code-summary: "Import + Validate"df_mort = context$records %>%open_dataset() %>%collect() %>%verify(not_na(nbhd_geoid)) %>%verify(not_na(iso2)) %>%verify(not_na(dod)) %>%verify(all(unique(city_geoid) %in%unique(df_schema$city_geoid))) %>%verify(all(unique(nbhd_geoid) %in%unique(df_schema$nbhd_geoid)))```### YearAvailability Across country and year```{r}df_mort_year = df_mort %>%count(iso2, source_table, year) %>%collect() %>%arrange(year)%>%mutate(iso2_source =glue("{iso2} ({source_table})"))## Tabledf_mort_year %>%pivot_wider(names_from = year, values_from = n) ## Plotdf_mort_year %>%# Assuming iso2_source is already a column in your data# If not, you might need to create it with mutate firstggplot(aes(x = year, y = n, color = iso2_source)) +geom_line(size =1) +geom_point(size =2.5, alpha =0.7) +# Add text labels at the end of each linegeom_text(data = df_mort_year %>%group_by(iso2_source) %>%filter(year ==max(year)) %>%ungroup(),aes(label = iso2),hjust =-0.3, vjust =0.5,fontface ="bold",size =3.5) +scale_y_log10(labels = scales::comma_format()) +scale_color_brewer(palette ="Set1", name ="Country/Source") +labs(title ="Death Records by Country and Year",subtitle ="Log scale used to show variation across different data sources",x ="Year",y ="Number of Records (log scale)",caption ="Data source: CCUH Aim 2 Harmonized death records" ) +theme_minimal() +theme(plot.title =element_text(face ="bold", size =16),plot.subtitle =element_text(size =12, color ="gray40"),axis.title =element_text(face ="bold"),legend.position ="none",panel.grid.minor =element_blank(),panel.border =element_rect(fill =NA, color ="gray80"),plot.caption =element_text(hjust =0, size =9, color ="gray40"),plot.margin =margin(t =10, r =15, b =10, l =10) ) +scale_x_continuous(breaks =seq(min(df_mort_year$year), max(df_mort_year$year), by =2),limits =c(min(df_mort_year$year), max(df_mort_year$year) +1) # Add space for labels ) +guides(color =guide_legend(override.aes =list(size =2), ncol =1))```Looks okay to me so far.### AgeMissingness in age by country and year```{r}# Analyze missingness in age by country and yearage_missingness <- df_mort %>%group_by(iso2, year) %>%summarize(total_records =n(),missing_age =sum(is.na(dinage)),pct_missing =round(missing_age / total_records *100, 2) ) %>%collect() %>%arrange(year) %>%ungroup() %>%verify(filter(.,iso2!='BR') %>%pull(pct_missing) <3)# Create a table of the resultsage_missingness %>%pivot_wider(id_cols = iso2,names_from = year,values_from = pct_missing )```- BR does nto ahve 'DINAGE' or age of death in years- other ocuntires this has minimal < 1% missingnessLet's do a EDA plot on DINAGE.```{r}# Create stacked bar charts for death distribution by age# First, prepare the data by age group and year# Create age groups based on dinageage_distribution <- df_mort %>%mutate(# Create age group labels based on dinageage_group =case_when(is.na(dinage) ~"Unknown", dinage <5~"0-4", dinage <15~"5-14", dinage <25~"15-24", dinage <45~"25-44", dinage <65~"45-64", dinage <85~"65-84",TRUE~"85+" ),# Convert year to a standard numeric formatyear =as.numeric(year) ) %>%group_by(iso2, year, age_group) %>%summarize(deaths =n()) %>%collect() %>%# Make sure age groups are ordered correctly for the visualizationmutate(age_group =factor(age_group, levels =c("0-4", "5-14", "15-24", "25-44", "45-64", "65-84", "85+", "Unknown")))# Calculate percentage distributionage_pct_distribution <- age_distribution %>%group_by(iso2, year) %>%mutate(total_deaths =sum(deaths),pct_deaths = deaths / total_deaths *100 ) %>%ungroup()# 100% stacked bar chart showing proportionsggplot(age_pct_distribution, aes(x = year, y = pct_deaths, fill = age_group)) +geom_col(position ="fill") +facet_wrap(~ iso2, ncol =1) +scale_fill_brewer(palette ="RdYlBu", name ="Age Group", direction =-1) +scale_y_continuous(labels = scales::percent_format(scale =1)) +labs(title ="Proportional Distribution of Deaths by Age Group and Year",subtitle ="100% stacked bars (based on dinage)",x ="Year",y ="Percentage of Deaths" ) +theme_minimal() +theme(plot.title =element_text(face ="bold"),axis.text.x =element_text(angle =45, hjust =1),strip.text =element_text(face ="bold", size =12),legend.position ="right" )```Makes sense. Some interesting insights:- infnat/child mortality in GT and PA are decreasing? Thats good right?### Age 5 year categories- Brazil does nto have age in years- Brazil only has 5 year age cats```{r}# Analyze missingness in age by country and yearage_cat_missingness <- df_mort %>%group_by(iso2, year) %>%summarize(total_records =n(),missing_age =sum(is.na(minage5c)),pct_missing =round(missing_age / total_records *100, 2) ) %>%collect() %>%arrange(year) %>%ungroup()# Create a table of the resultsage_cat_missingness %>%pivot_wider(id_cols = iso2,names_from = year,values_from = pct_missing ) # Visualize missingnessggplot(age_cat_missingness, aes(x = year, y = pct_missing, color = iso2)) +geom_line(size =1) +geom_point(size =2.5) +facet_wrap(~ iso2, ncol =2) +labs(title ="Percentage of Missing Age Values by Country and Year",x ="Year",y ="Percentage Missing (%)" ) +theme_minimal() +scale_color_brewer(palette ="Set1") +theme(legend.position ="none",panel.grid.minor =element_blank(),plot.title =element_text(face ="bold"),strip.text =element_text(face ="bold", size =12) ) +ylim(0,100)```- 5 year age categories is available in all datasets- there is < 3 % missingness of this in BR and < 1% in all other countriesLet's jsut check the the age categories are consisnte across the harmozianitons. ```{r}# EDA plot for minage5c with three age groups# Create simplified age groups based on minage5cage_distribution_three_groups <- df_mort %>%mutate(# Create three age group categoriesage_group =case_when(is.na(minage5c) ~"Unknown", minage5c <5~"0-4", minage5c >=65~"65+",TRUE~"5-64" ),# Convert year to a standard numeric formatyear =as.numeric(year) ) %>%group_by(iso2, year, age_group) %>%summarize(deaths =n()) %>%collect() %>%# Make sure age groups are ordered correctly for the visualizationmutate(age_group =factor(age_group, levels =c("0-4", "5-64", "65+", "Unknown")))# Calculate percentage distributionage_pct_three_groups <- age_distribution_three_groups %>%group_by(iso2, year) %>%mutate(total_deaths =sum(deaths),pct_deaths = deaths / total_deaths *100 ) %>%ungroup()# 100% stacked bar chart showing proportionsggplot(age_pct_three_groups, aes(x = year, y = pct_deaths, fill = age_group)) +geom_col(position ="fill") +facet_wrap(~ iso2, ncol =2) +scale_fill_manual(values =c("0-4"="#2c7bb6", "5-64"="#ffffbf", "65+"="#d7191c", "Unknown"="#cccccc"),name ="Age Group" ) +scale_y_continuous(labels = scales::percent_format(scale =1)) +labs(title ="Proportional Distribution of Deaths by Age Group and Year",subtitle ="Showing three main age groups: 0-4, 5-64, and 65+",x ="Year",y ="Percentage of Deaths" ) +theme_minimal() +theme(plot.title =element_text(face ="bold"),axis.text.x =element_text(angle =45, hjust =1),strip.text =element_text(face ="bold", size =12),legend.position ="right" )```Looks sensible.- Most deaths are older grousp in all counrites- In GT and PA there are relativel ymore 0-4 deaths# 2. Death CountsThis is a translation of legacy code into our datawarehouse. The legacy logic from Derek's : ```{r}## Connect to hamronized DIN database v1db_harmonized__salurbal_din__v1 = '//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/dev/harmonized__salurbal_din__v1.parquet' %>% arrow::open_dataset() |> rename(date = dod, age = dinage ) ## L2 countsdf_L2 <- db_harmonized__salurbal_din__v1 %>% summarize(death = n(), .by = c(salid2, date)) |> collect() %>% complete(salid2, date, fill = list(death = 0)) |> arrange(salid2, date)## L2 by age group countsdf_L2_age <- db_harmonized__salurbal_din__v1 |> mutate(age_category = ifelse(age <= 64, "<65", "65+")) |> summarize(death = n(), .by = c(salid2, date, age_category)) |> collect() %>% complete(salid2, date, age_category, fill = list(death = 0)) |> arrange(salid2, date)```Here we translate this logic into the harmonized Aim 2 death records set. THe rationale: docueminetn lineage for transaprency and 2) so this can be reused across manuscripts. So the deliverable here is - daily counts at nbhd level- Dates without any deaths should be filled with NA or 0.- two datasets: 1) total 2) stratafied by age>65These were done in SQL models. The logic was not too complex to warrant a python model or excesively complex to require a dplyr notebook. Below we just quality check the results. ## Develop### Dplyr Let's first operationalize death counts in R.```{r}#| code-summary: "Dplyr model"#| code-fold: falsedf_nbhd_year = df_mort %>%select(nbhd_geoid, year) %>%distinct() head(df_nbhd_year)df_date_template <- df_nbhd_year %>%mutate(start_date =make_date(year, 1, 1),end_date =make_date(year, 12, 31) ) %>%rowwise() %>%mutate(date =list(seq(start_date, end_date, by ="day"))) %>%unnest(date) %>%select(nbhd_geoid, year, date) head(df_date_template)df_counts_raw_dplyr = df_mort %>%rename(date = dod) %>%summarize(death =n(), .by =c(nbhd_geoid, date)) |>collect()head(df_counts_raw_dplyr)df_counts_dplyr = df_date_template %>%left_join(df_counts_raw_dplyr) %>%mutate(death =replace_na(death, 0)) %>%filter(nbhd_geoid %in%df_schema$nbhd_geoid) %>%arrange(nbhd_geoid, date) %>%rename(death_r = death) %>%left_join(select(df_schema, nbhd_geoid, iso2) %>%distinct()) %>%add_count(nbhd_geoid, year, name ='n_days') %>%## Validationsverify(not_na(iso2))%>%verify(not_na(nbhd_geoid ))%>%verify(not_na(date)) %>%verify(not_na(death_r)) %>%verify(between(n_days, 365,366))head(df_counts_dplyr)```#### Belo Horizonte UndercountingLets do a quick trends EDA ```{r}df_counts_dplyr %>%rename(death = death_r) %>%left_join(df_schema) %>%summarize(death =sum(death), .by =c(date, iso2, city_name)) %>%ggplot(aes(x = date, y = death, group = city_name, color = city_name))+geom_line() +facet_wrap(~iso2)+labs(title ='Something is off with Belo Horizonte numbers',subtitle ="City level death trends from nbhd sums for CCUH Cities")```To check with Usama about Belo Horizonte numbers.Lets dial in on neighborhood level for Belo Horizonte. First let's see what year had that drop```{r}df_qc = df_counts_dplyr %>%left_join(df_schema) %>%rename(death = death_r) %>%mutate(month =month(date)) %>%group_by(city_name, city_geoid, nbhd_geoid, month, year) %>%summarize(death =sum(death)) %>%ungroup() df_qc %>%filter(city_name =='Belo Horizonte') %>%mutate(date =make_date(year, month, 1) ) %>%ggplot(aes(x = date, y = death, group = nbhd_geoid))+geom_line(alpha =0.05)+theme(legend.position ='none' )+labs(title ='Neighborhood death trends Belo Horizonte',subtitle ='weird shift starting 2006 (hihglihghted as between red lines)' ) +geom_vline(xintercept =make_date(2006, 1, 1),color ='red', linetype ='dashed', alpha =1) +geom_vline(xintercept =make_date(2006, 6, 1),color ='red', linetype ='dashed', alpha =1) +scale_x_date(date_breaks ="1 year", date_labels ="%Y")```For now Will quarentine the data for this city prior to 2007 due to visible undercounting. ### PolarsPolars. ```{python}#| code-summary: "Set up Python"#| code-fold: falseimport polars as plimport datetimedf_mort = pl.scan_parquet("//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/harmonized__ccuh_aim_2__death_records.parquet")df_schema = pl.scan_parquet("//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/base__ccuh__aim2_city_subcity_nbhd__schema.parquet").collect()```Let's do the transformation; first getting some intermediates.```{python}#| code-summary: "Polars Model"#| code-fold: falsedf_nbhd_year = ( df_mort .select("nbhd_geoid", "year") .with_columns(pl.col('year').cast(pl.Int32)) .unique() .collect())print(df_nbhd_year.head())df_date_template = ( df_nbhd_year .group_by(['nbhd_geoid', 'year']) .agg([ pl.date_range( pl.date(pl.col('year').first(), 1, 1), pl.date(pl.col('year').first(), 12, 31), "1d" ).alias('date') ]) .explode('date'))print(df_date_template.head())df_counts_raw_polars = ( df_mort .rename({"dod": "date"}) .group_by(["nbhd_geoid", "date"]) .agg([ pl.len().alias("death") ]) .collect())print(df_counts_raw_polars.head())df_counts_polars = ( df_date_template .join( df_counts_raw_polars, on=["nbhd_geoid", "date"], how="left" ) .with_columns([ pl.col("death").fill_null(0) ]) .filter( pl.col("nbhd_geoid").is_in(df_schema.select("nbhd_geoid").to_series().to_list()) ) .sort(["nbhd_geoid", "date"]) .rename({"death": "death_r"}) .join( df_schema.select(["nbhd_geoid", "city_name", "city_geoid", "iso2"]).unique(), on="nbhd_geoid", how="left" )## Remove Belo Horizonte nbhd data before 2007 due to visible undercounting .filter(~((pl.col("city_name") =="Belo Horizonte") & (pl.col("year") <2007)) ))print(df_counts_polars.head())```## Validate DBT Let's compare results with DBT.```{r}#| code-fold: false## Import DBT Results with validationsdf_counts = context$counts %>%open_dataset() %>%select(iso2, city_name,nbhd_geoid, date , death) %>%collect()|>arrange(nbhd_geoid, date) %>%verify(not_na(iso2))%>%verify(not_na(nbhd_geoid ))%>%verify(not_na(date)) %>%verify(not_na(death)) %>%verify(all(unique(nbhd_geoid) %in%unique(df_schema$nbhd_geoid)))## Check Identicalidentical( df_counts, df_counts_dplyr %>%left_join(df_schema |>select(nbhd_geoid,iso2, city_name)) |>filter(!(city_name =="Belo Horizonte"& year <2007)) %>%rename(death = death_r) %>%select(names(df_counts))) |> assertthat::assert_that()```Looks good. The SQL results are identical to the Dplyr results. Increases evidence that our semantics were properly transalted. ## Visual InspectionNot sure how to check. but let's start with some trends.```{r}# Calculate mean daily deaths by neighborhoodneighborhood_stats <- df_counts %>%group_by(iso2, nbhd_geoid) %>%summarize(mean_daily_deaths =mean(death),median_daily_deaths =median(death),max_daily_deaths =max(death),total_deaths =sum(death),.groups ="drop" )# View summary statisticsneighborhood_stats %>%group_by(iso2) %>%summarize(n_neighborhoods =n(),avg_mean_deaths =mean(mean_daily_deaths),median_mean_deaths =median(mean_daily_deaths),min_mean_deaths =min(mean_daily_deaths),max_mean_deaths =max(mean_daily_deaths) )%>% knitr::kable()# Visualize distribution by countryggplot(neighborhood_stats, aes(x = iso2, y = mean_daily_deaths)) +geom_boxplot() +labs(title ="Mean Neighbhborhood-level Daily Deaths by Country",x ="Country", y ="Mean Daily Deaths") +theme_minimal()```Lets also look at trends```{r}df_counts %>%left_join(df_schema) %>%summarize(death =sum(death), .by =c(date, iso2, city_name)) %>%ggplot(aes(x = date, y = death, group = city_name, color = city_name))+geom_line() +facet_wrap(~iso2)+labs(title ='Something is off with Belo Horizonte numbers',subtitle ="City level death trends from nbhd sums for CCUH Cities")```To check with Usama about Belo Horizonte numbers.# 3. Death Counts by AgeHere we do death counts stratafied by two age groups: 1) <65 and then 65+. Note htat there are some recrods missing age (<0.1% in USA recrods and ~ 2% in Brazil records); These records were removed from this tabulation. ## Develop```{python}#| code-summary: "Set up Python"#| code-fold: falseimport polars as plimport datetimedf_mort = pl.scan_parquet("//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/harmonized__ccuh_aim_2__death_records.parquet")df_mort_age = (df_mort .with_columns( pl.when(pl.col("minage5c") <=64) .then(pl.lit('<65')) # Add pl.lit() here .otherwise(pl.lit('65+')) # Add pl.lit() here .alias("age_category") ))df_schema = pl.scan_parquet("//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/base__ccuh__aim2_city_subcity_nbhd__schema.parquet").collect()```Let's do the transformation; first getting some intermediates.```{python}#| code-summary: "Polars Model"#| code-fold: falsedf_nbhd_year = ( df_mort_age .select("nbhd_geoid", "year", "age_category") .with_columns(pl.col('year').cast(pl.Int32)) .unique() .collect())print(df_nbhd_year.head())df_date_template = ( df_nbhd_year .group_by(['nbhd_geoid', 'year', "age_category"]) .agg([ pl.date_range( pl.date(pl.col('year').first(), 1, 1), pl.date(pl.col('year').first(), 12, 31), "1d" ).alias('date') ]) .explode('date'))print(df_date_template.head())df_counts_raw_polars = ( df_mort_age .rename({"dod": "date"}) .group_by(["nbhd_geoid", "date", "age_category"]) .agg([ pl.len().alias("death") ]) .collect())print(df_counts_raw_polars.head())df_counts_polars = ( df_date_template .join( df_counts_raw_polars, on=["nbhd_geoid", "date", "age_category"], how="left" ) .with_columns([ pl.col("death").fill_null(0) ]) .filter( pl.col("nbhd_geoid").is_in(df_schema.select("nbhd_geoid").to_series().to_list()) ) .sort(["nbhd_geoid", "date"]) .join( df_schema.select(["nbhd_geoid", "city_name", "city_geoid", "iso2"]).unique(), on="nbhd_geoid", how="left" )## Remove Belo Horizonte nbhd data before 2007 due to visible undercounting .filter(~((pl.col("city_name") =="Belo Horizonte") & (pl.col("year") <2007)) ))print(df_counts_polars.head())```## Validate Let's import the DBT results and just validate against eh unstratified results.```{r}#| code-fold: false## Import DBT Resultsdf_counts_age = context$counts_age %>%open_dataset() %>%select(iso2, nbhd_geoid, age_category, date, death) %>%collect()|>arrange(age_category, nbhd_geoid, date) %>%verify(not_na(age_category)) %>%verify(not_na(iso2)) %>%verify(not_na(nbhd_geoid ))%>%verify(not_na(date)) %>%verify(not_na(death)) %>%verify(all(unique(nbhd_geoid) %in%unique(df_schema$nbhd_geoid)))## Dplyr computationsdf_counts_reconstructed = df_counts_age |>group_by(iso2, nbhd_geoid, date) |>summarize(death =sum(death)) |>ungroup()|>arrange(nbhd_geoid, date)## Check Identicalidentical( df_counts_reconstructed, df_counts %>%select(names(df_counts_reconstructed))) |>assert_that()```Looks good. The DBT strataffied results when aggregated to unstratified are identical to the unstratfiaeid results. ## Visual InspectionNot sure how to check. but let's start with some trends.```{r}# Calculate mean daily deaths by neighborhoodneighborhood_stats <- df_counts_age %>%group_by(iso2, nbhd_geoid, age_category) %>%summarize(mean_daily_deaths =mean(death),median_daily_deaths =median(death),max_daily_deaths =max(death),total_deaths =sum(death),.groups ="drop" )# View summary statisticsneighborhood_stats %>%group_by(iso2, age_category) %>%summarize(n_neighborhoods =n(),avg_mean_deaths =mean(mean_daily_deaths),median_mean_deaths =median(mean_daily_deaths),min_mean_deaths =min(mean_daily_deaths),max_mean_deaths =max(mean_daily_deaths) )%>% knitr::kable()# Visualize distribution by countryggplot(neighborhood_stats, aes(x = iso2, y = mean_daily_deaths, fill = age_category)) +geom_boxplot(position =position_dodge(width =0.8), outlier.alpha =0.5) +scale_fill_brewer(palette ="Set1", name ="Age Group") +labs(title ="Mean Daily Deaths by Country and Age Group",x ="Country", y ="Mean Daily Deaths") +theme_minimal() +theme(legend.position ="bottom") +scale_y_continuous(trans ="log10", # Log scale to better show distributionlabels = scales::number_format(accuracy =0.001) ) +geom_hline(yintercept =0.1, linetype ="dashed", color ="gray50", alpha =0.7)```# 4. Access```{r}df_aim2_mort = "`r context$records`" |> arrow::open_dataset() df_aim2_nbhd_deaths = "`r context$counts`" |> arrow::open_dataset() df_aim2_nbhd_deaths_age = "`r context$counts_age`" |> arrow::open_dataset() ```