ccuh_aim2_mortality__v1

Author

Ran Li (Maintainer)

Published

March 2, 2025

Setup

pacman::p_load(
  tidyverse, arrow, glue, cli, janitor, reactable,
  sf, geoarrow, rmapshaper, leaflet, here,
  assertr, assertthat)

library(reticulate)
use_virtualenv(here::here('.venv'))
py_config()

python:         D:/GitHub/ccuh-data-infrastructure/ccuh-notebooks/_harmonization/ccuh_aim2/ccuh_aim2_models_v1/.venv/Scripts/python.exe
libpython:      C:/Users/rl627/AppData/Local/Programs/Python/Python313/python313.dll
pythonhome:     D:/GitHub/ccuh-data-infrastructure/ccuh-notebooks/_harmonization/ccuh_aim2/ccuh_aim2_models_v1/.venv
version:        3.13.2 (tags/v3.13.2:4f8bb39, Feb  4 2025, 15:23:48) [MSC v.1942 64 bit (AMD64)]
Architecture:   64bit
numpy:          D:/GitHub/ccuh-data-infrastructure/ccuh-notebooks/_harmonization/ccuh_aim2/ccuh_aim2_models_v1/.venv/Lib/site-packages/numpy
numpy_version:  2.2.3

NOTE: Python version was forced by use_python() function

Setup

py_list_packages()

package	version	requirement
asttokens	3.0.0	asttokens==3.0.0
colorama	0.4.6	colorama==0.4.6
comm	0.2.2	comm==0.2.2
debugpy	1.8.12	debugpy==1.8.12
decorator	5.2.1	decorator==5.2.1
executing	2.2.0	executing==2.2.0
ipykernel	6.29.5	ipykernel==6.29.5
ipython	9.0.0	ipython==9.0.0
ipython_pygments_lexers	1.1.1	ipython_pygments_lexers==1.1.1
jedi	0.19.2	jedi==0.19.2
jupyter_client	8.6.3	jupyter_client==8.6.3
jupyter_core	5.7.2	jupyter_core==5.7.2
matplotlib-inline	0.1.7	matplotlib-inline==0.1.7
nest-asyncio	1.6.0	nest-asyncio==1.6.0
numpy	2.2.3	numpy==2.2.3
packaging	24.2	packaging==24.2
parso	0.8.4	parso==0.8.4
platformdirs	4.3.6	platformdirs==4.3.6
polars	1.24.0	polars==1.24.0
prompt_toolkit	3.0.50	prompt_toolkit==3.0.50
psutil	7.0.0	psutil==7.0.0
pure_eval	0.2.3	pure_eval==0.2.3
Pygments	2.19.1	Pygments==2.19.1
python-dateutil	2.9.0.post0	python-dateutil==2.9.0.post0
pywin32	308	pywin32==308
pyzmq	26.2.1	pyzmq==26.2.1
six	1.17.0	six==1.17.0
stack-data	0.6.3	stack-data==0.6.3
tornado	6.4.2	tornado==6.4.2
traitlets	5.14.3	traitlets==5.14.3
wcwidth	0.2.13	wcwidth==0.2.13

Setup

py_module_available("polars")

[1] TRUE

Setup

py_module_available("numpy")

[1] TRUE

Setup

context = lst(
  dbt_server = "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0",
  schema = file.path(dbt_server,'base__ccuh__aim2_city_subcity_nbhd__schema.parquet'),
  records = file.path(dbt_server,'harmonized__ccuh_aim_2__death_records.parquet'),
  counts = file.path(dbt_server,'harmonized__ccuh_aim_2__nbhd_death_counts.parquet'),
  counts_age = file.path(dbt_server,'harmonized__ccuh_aim_2__nbhd_death_counts__age_grp_65.parquet'),
)


## Schema
df_schema = context$schema %>% 
  open_dataset() %>% 
  collect()

1. Death records

These death records are harmonized for CCUH Aim 2; details and documentation can be found at CCUH Aim 2 Harmonized death records (Harmonized Dataset).

Here we just do some exploratory analysis to understand the scope of the data and do preliminary QC.

Import + Validate

df_mort = context$records %>% 
  open_dataset() %>% 
  collect() %>% 
  verify(not_na(nbhd_geoid)) %>% 
  verify(not_na(iso2)) %>% 
  verify(not_na(dod)) %>% 
  verify(all(unique(city_geoid) %in%
               unique(df_schema$city_geoid))) %>%
  verify(all(unique(nbhd_geoid) %in% 
               unique(df_schema$nbhd_geoid)))

Year

Availability Across country and year

Code

df_mort_year = df_mort %>% 
  count(iso2, source_table, year)  %>% 
  collect() %>% 
  arrange(year)%>% 
  mutate(iso2_source = glue("{iso2} ({source_table})"))

## Table
df_mort_year %>% 
  pivot_wider(names_from = year, values_from = n)

iso2	source_table	iso2_source	2000	2001	2002	2003	2004	2005	2006	2007	2008	2009	2010	2011	2012	2013	2014	2015	2016	2017	2018	2019	2020	2021	2022
US	base__pennsylvania_records_death_v1	US (base__pennsylvania_records_death_v1)	37865	36835	36859	36685	35895	36344	35259	35205	34576	34409	34233	35376	34920	35224	35252	35858	36379	36907	36902	36950	44402	40881	39726
BR	base__salurbal__sim_BR__v1	BR (base__salurbal__sim_BR__v1)	NA	NA	NA	173645	168776	175555	187072	194640	198625	206220	214386	214831	214509	220794	225234	228127	238044	231127	239244	241595	NA	NA	NA
GT	base__salurbal__din_GT__v1	GT (base__salurbal__din_GT__v1)	NA	NA	NA	NA	NA	NA	NA	NA	NA	16852	18033	17509	17120	18982	18987	19674	19831	19524	19628	20578	25297	NA	NA
PA	base__salurbal__din_PA__v1	PA (base__salurbal__din_PA__v1)	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	7667	8331	7966	8560	8773	8932	9165	NA	13158	NA	NA

Code

## Plot
df_mort_year %>% 
  # Assuming iso2_source is already a column in your data
  # If not, you might need to create it with mutate first
  ggplot(aes(x = year, y = n, color = iso2_source)) +
  geom_line(size = 1) +
  geom_point(size = 2.5, alpha = 0.7) +
  # Add text labels at the end of each line
  geom_text(data = df_mort_year %>% 
              group_by(iso2_source) %>% 
              filter(year == max(year)) %>%
              ungroup(),
            aes(label = iso2),
            hjust = -0.3, 
            vjust = 0.5,
            fontface = "bold",
            size = 3.5) +
  scale_y_log10(labels = scales::comma_format()) +
  scale_color_brewer(palette = "Set1", name = "Country/Source") +
  labs(
    title = "Death Records by Country and Year",
    subtitle = "Log scale used to show variation across different data sources",
    x = "Year",
    y = "Number of Records (log scale)",
    caption = "Data source: CCUH Aim 2 Harmonized death records"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 12, color = "gray40"),
    axis.title = element_text(face = "bold"),
    legend.position = "none",
    panel.grid.minor = element_blank(),
    panel.border = element_rect(fill = NA, color = "gray80"),
    plot.caption = element_text(hjust = 0, size = 9, color = "gray40"),
    plot.margin = margin(t = 10, r = 15, b = 10, l = 10)
  ) +
  scale_x_continuous(
    breaks = seq(min(df_mort_year$year), max(df_mort_year$year), by = 2),
    limits = c(min(df_mort_year$year), max(df_mort_year$year) + 1)  # Add space for labels
  ) +
  guides(color = guide_legend(override.aes = list(size = 2), ncol = 1))

Looks okay to me so far.

Age

Missingness in age by country and year

Code

# Analyze missingness in age by country and year
age_missingness <- df_mort %>%
  group_by(iso2, year) %>%
  summarize(
    total_records = n(),
    missing_age = sum(is.na(dinage)),
    pct_missing = round(missing_age / total_records * 100, 2)
  ) %>%
  collect() %>%
  arrange(year) %>% 
  ungroup() %>% 
  verify(
    filter(.,iso2!='BR') %>%
      pull(pct_missing) < 3)

# Create a table of the results
age_missingness %>%
  pivot_wider(
    id_cols = iso2,
    names_from = year,
    values_from = pct_missing
  )

iso2	2000	2001	2002	2003	2004	2005	2006	2007	2008	2009	2010	2011	2012	2013	2014	2015	2016	2017	2018	2019	2020	2021	2022
US	0.1	0.07	0.16	0.13	0.14	0.15	0.1	0.12	0.14	0.11	0.13	4e-02	1e-02	1e-02	4e-02	4e-02	5e-02	7e-02	5e-02	5e-02	0.02	0.03	0.02
BR	NA	NA	NA	100.00	100.00	100.00	100.0	100.00	100.00	100.00	100.00	1e+02	1e+02	1e+02	1e+02	1e+02	1e+02	1e+02	1e+02	1e+02	NA	NA	NA
GT	NA	NA	NA	NA	NA	NA	NA	NA	NA	0.00	0.00	0e+00	0e+00	0e+00	0e+00	0e+00	0e+00	0e+00	0e+00	0e+00	0.00	NA	NA
PA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	0e+00	0e+00	0e+00	0e+00	0e+00	0e+00	0e+00	NA	0.00	NA	NA

BR does nto ahve ‘DINAGE’ or age of death in years
other ocuntires this has minimal < 1% missingness

Let’s do a EDA plot on DINAGE.

Code

# Create stacked bar charts for death distribution by age
# First, prepare the data by age group and year

# Create age groups based on dinage
age_distribution <- df_mort %>%
  mutate(
    # Create age group labels based on dinage
    age_group = case_when(
      is.na(dinage) ~ "Unknown",
      dinage < 5 ~ "0-4",
      dinage < 15 ~ "5-14",
      dinage < 25 ~ "15-24",
      dinage < 45 ~ "25-44",
      dinage < 65 ~ "45-64",
      dinage < 85 ~ "65-84",
      TRUE ~ "85+"
    ),
    # Convert year to a standard numeric format
    year = as.numeric(year)
  ) %>%
  group_by(iso2, year, age_group) %>%
  summarize(deaths = n()) %>%
  collect() %>% 
  # Make sure age groups are ordered correctly for the visualization
  mutate(age_group = factor(age_group, levels = c("0-4", "5-14", "15-24", "25-44", "45-64", "65-84", "85+", "Unknown")))

# Calculate percentage distribution
age_pct_distribution <- age_distribution %>%
  group_by(iso2, year) %>%
  mutate(
    total_deaths = sum(deaths),
    pct_deaths = deaths / total_deaths * 100
  ) %>%
  ungroup()

# 100% stacked bar chart showing proportions
ggplot(age_pct_distribution, aes(x = year, y = pct_deaths, fill = age_group)) +
  geom_col(position = "fill") +
  facet_wrap(~ iso2, ncol = 1) +
  scale_fill_brewer(palette = "RdYlBu", name = "Age Group", direction = -1) +
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  labs(
    title = "Proportional Distribution of Deaths by Age Group and Year",
    subtitle = "100% stacked bars (based on dinage)",
    x = "Year",
    y = "Percentage of Deaths"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text = element_text(face = "bold", size = 12),
    legend.position = "right"
  )

Makes sense. Some interesting insights:

infnat/child mortality in GT and PA are decreasing? Thats good right?

Age 5 year categories

Brazil does nto have age in years
Brazil only has 5 year age cats

Code

# Analyze missingness in age by country and year
age_cat_missingness <- df_mort %>%
  group_by(iso2, year) %>%
  summarize(
    total_records = n(),
    missing_age = sum(is.na(minage5c)),
    pct_missing = round(missing_age / total_records * 100, 2)
  ) %>%
  collect() %>%
  arrange(year) %>% 
  ungroup()

# Create a table of the results
age_cat_missingness %>%
  pivot_wider(
    id_cols = iso2,
    names_from = year,
    values_from = pct_missing
  )

iso2	2000	2001	2002	2003	2004	2005	2006	2007	2008	2009	2010	2011	2012	2013	2014	2015	2016	2017	2018	2019	2020	2021	2022
US	0.1	0.07	0.16	0.13	0.14	0.15	0.10	0.12	0.14	0.11	0.13	0.04	0.01	0.01	0.04	0.04	0.05	0.07	0.05	0.05	0.02	0.03	0.02
BR	NA	NA	NA	2.46	2.62	0.10	2.29	2.16	2.16	2.15	2.05	2.14	2.30	2.20	2.23	2.25	2.01	2.11	2.06	1.98	NA	NA	NA
GT	NA	NA	NA	NA	NA	NA	NA	NA	NA	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	NA	NA
PA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	0.00	0.00	0.00	0.00	0.00	0.00	0.00	NA	0.00	NA	NA

Code

# Visualize missingness
ggplot(age_cat_missingness, aes(x = year, y = pct_missing, color = iso2)) +
  geom_line(size = 1) +
  geom_point(size = 2.5) +
  facet_wrap(~ iso2, ncol = 2) +
  labs(
    title = "Percentage of Missing Age Values by Country and Year",
    x = "Year",
    y = "Percentage Missing (%)"
  ) +
  theme_minimal() +
  scale_color_brewer(palette = "Set1") +
  theme(
    legend.position = "none",
    panel.grid.minor = element_blank(),
    plot.title = element_text(face = "bold"),
    strip.text = element_text(face = "bold", size = 12)
  ) +
  ylim(0,100)

5 year age categories is available in all datasets
there is < 3 % missingness of this in BR and < 1% in all other countries

Let’s jsut check the the age categories are consisnte across the harmozianitons.

Code

# EDA plot for minage5c with three age groups

# Create simplified age groups based on minage5c
age_distribution_three_groups <- df_mort %>%
  mutate(
    # Create three age group categories
    age_group = case_when(
      is.na(minage5c) ~ "Unknown",
      minage5c < 5 ~ "0-4",
      minage5c >= 65 ~ "65+",
      TRUE ~ "5-64"
    ),
    # Convert year to a standard numeric format
    year = as.numeric(year)
  ) %>%
  group_by(iso2, year, age_group) %>%
  summarize(deaths = n()) %>%
  collect() %>%
  # Make sure age groups are ordered correctly for the visualization
  mutate(age_group = factor(age_group, levels = c("0-4", "5-64", "65+", "Unknown")))

# Calculate percentage distribution
age_pct_three_groups <- age_distribution_three_groups %>%
  group_by(iso2, year) %>%
  mutate(
    total_deaths = sum(deaths),
    pct_deaths = deaths / total_deaths * 100
  ) %>%
  ungroup()

# 100% stacked bar chart showing proportions
ggplot(age_pct_three_groups, aes(x = year, y = pct_deaths, fill = age_group)) +
  geom_col(position = "fill") +
  facet_wrap(~ iso2, ncol = 2) +
  scale_fill_manual(
    values = c("0-4" = "#2c7bb6", "5-64" = "#ffffbf", "65+" = "#d7191c", "Unknown" = "#cccccc"),
    name = "Age Group"
  ) +
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  labs(
    title = "Proportional Distribution of Deaths by Age Group and Year",
    subtitle = "Showing three main age groups: 0-4, 5-64, and 65+",
    x = "Year",
    y = "Percentage of Deaths"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text = element_text(face = "bold", size = 12),
    legend.position = "right"
  )

Looks sensible.

Most deaths are older grousp in all counrites
In GT and PA there are relativel ymore 0-4 deaths

2. Death Counts

This is a translation of legacy code into our datawarehouse. The legacy logic from Derek’s :

```{r}
## Connect to hamronized DIN database v1
db_harmonized__salurbal_din__v1 = '//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/dev/harmonized__salurbal_din__v1.parquet' %>% 
  arrow::open_dataset() |> 
  rename(date = dod, age = dinage ) 

## L2 counts
df_L2 <- db_harmonized__salurbal_din__v1  %>% 
  summarize(death = n(), .by = c(salid2, date)) |> 
  collect() %>% 
  complete(salid2, date, fill = list(death = 0)) |> 
  arrange(salid2, date)

## L2 by age group counts
df_L2_age <- db_harmonized__salurbal_din__v1 |> 
  mutate(age_category = ifelse(age <= 64, "<65", "65+")) |> 
  summarize(death = n(), .by = c(salid2, date, age_category)) |> 
  collect() %>% 
  complete(salid2, date, age_category,
           fill = list(death = 0)) |> 
  arrange(salid2, date)

```

Here we translate this logic into the harmonized Aim 2 death records set. THe rationale: docueminetn lineage for transaprency and 2) so this can be reused across manuscripts. So the deliverable here is

daily counts at nbhd level
Dates without any deaths should be filled with NA or 0.
two datasets: 1) total 2) stratafied by age>65

These were done in SQL models. The logic was not too complex to warrant a python model or excesively complex to require a dplyr notebook. Below we just quality check the results.

Develop

Dplyr

Let’s first operationalize death counts in R.

df_nbhd_year = df_mort %>% 
  select(nbhd_geoid, year) %>% 
  distinct() 
head(df_nbhd_year)

nbhd_geoid	year
20310210	2020
20310216	2020
20310215	2020
20310220	2020
20310213	2020
20310219	2020

df_date_template <- df_nbhd_year %>%
  mutate(
    start_date = make_date(year, 1, 1),
    end_date = make_date(year, 12, 31)
  ) %>%
  rowwise() %>%
  mutate(date = list(seq(start_date, end_date, by = "day"))) %>%
  unnest(date) %>%
  select(nbhd_geoid, year, date) 
head(df_date_template)

nbhd_geoid	year	date
20310210	2020	2020-01-01
20310210	2020	2020-01-02
20310210	2020	2020-01-03
20310210	2020	2020-01-04
20310210	2020	2020-01-05
20310210	2020	2020-01-06

df_counts_raw_dplyr = df_mort %>% 
  rename(date = dod) %>% 
  summarize(death = n(), .by = c(nbhd_geoid, date)) |> 
  collect()
head(df_counts_raw_dplyr)

nbhd_geoid	date	death
20310210	2020-07-13	75
20310216	2020-12-05	10
20310215	2020-11-04	6
20310210	2020-03-04	24
20310210	2020-06-19	50
20310210	2020-10-29	26

df_counts_dplyr = df_date_template %>% 
  left_join(df_counts_raw_dplyr) %>% 
  mutate(death = replace_na(death, 0)) %>% 
  filter(nbhd_geoid %in%df_schema$nbhd_geoid) %>% 
  arrange(nbhd_geoid, date) %>% 
  rename(death_r = death) %>%
  left_join(select(df_schema, nbhd_geoid, iso2) %>% distinct()) %>%
  add_count(nbhd_geoid, year, name = 'n_days') %>% 
  ## Validations
  verify(not_na(iso2))%>%
  verify(not_na(nbhd_geoid  ))%>%
  verify(not_na(date)) %>%
  verify(not_na(death_r)) %>% 
  verify(between(n_days, 365,366))
head(df_counts_dplyr)

nbhd_geoid	year	date	death_r	iso2	n_days
10213610100	2003	2003-01-01	1	BR	365
10213610100	2003	2003-01-02	0	BR	365
10213610100	2003	2003-01-03	0	BR	365
10213610100	2003	2003-01-04	0	BR	365
10213610100	2003	2003-01-05	0	BR	365
10213610100	2003	2003-01-06	0	BR	365

Belo Horizonte Undercounting

Lets do a quick trends EDA

Code

df_counts_dplyr %>%
  rename(death = death_r) %>% 
  left_join(df_schema) %>% 
  summarize(death = sum(death), .by = c(date, iso2, city_name)) %>% 
  ggplot(aes(x = date, y = death, group = city_name, color = city_name))+
  geom_line() +
  facet_wrap(~iso2)+
  labs(
    title = 'Something is off with Belo Horizonte numbers',
    subtitle = "City level death trends from nbhd sums for CCUH Cities")

To check with Usama about Belo Horizonte numbers. Lets dial in on neighborhood level for Belo Horizonte. First let’s see what year had that drop

Code

df_qc = df_counts_dplyr %>% 
  left_join(df_schema)  %>% 
  rename(death = death_r) %>% 
  mutate(month = month(date)) %>% 
  group_by(city_name, city_geoid, nbhd_geoid, month, year) %>% 
  summarize(death = sum(death)) %>% 
  ungroup() 
  

df_qc %>% 
  filter(city_name == 'Belo Horizonte') %>% 
  mutate(date = make_date(year, month, 1) ) %>% 
  ggplot(aes(x = date, y = death, group = nbhd_geoid))+
  geom_line(alpha = 0.05)+
  theme(
    legend.position = 'none'
  )+
  labs(
    title = 'Neighborhood death trends Belo Horizonte',
    subtitle = 'weird shift starting 2006 (hihglihghted as between red lines)'
  ) +
  geom_vline(xintercept = make_date(2006, 1, 1),
             color = 'red', linetype = 'dashed', alpha = 1) +
  geom_vline(xintercept = make_date(2006, 6, 1),
             color = 'red', linetype = 'dashed', alpha = 1) +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y")

For now Will quarentine the data for this city prior to 2007 due to visible undercounting.

Polars

Polars.

import polars as pl
import datetime

df_mort = pl.scan_parquet("//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/harmonized__ccuh_aim_2__death_records.parquet")

df_schema = pl.scan_parquet("//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/base__ccuh__aim2_city_subcity_nbhd__schema.parquet").collect()

Let’s do the transformation; first getting some intermediates.

df_nbhd_year = (
  df_mort
    .select("nbhd_geoid", "year")
    .with_columns(pl.col('year').cast(pl.Int32))
    .unique()
    .collect()
)
print(df_nbhd_year.head())

shape: (5, 2)
┌────────────────┬──────┐
│ nbhd_geoid     ┆ year │
│ ---            ┆ ---  │
│ str            ┆ i32  │
╞════════════════╪══════╡
│ 10219013108    ┆ 2008 │
│ DSPHPHLNBHD_36 ┆ 2011 │
│ nbhd_7         ┆ 2013 │
│ 10224810397    ┆ 2007 │
│ 10224820108    ┆ 2011 │
└────────────────┴──────┘

df_date_template = (
  df_nbhd_year
    .group_by(['nbhd_geoid', 'year'])
    .agg([
        pl.date_range(
            pl.date(pl.col('year').first(), 1, 1),  
            pl.date(pl.col('year').first(), 12, 31), 
            "1d"
        ).alias('date')
    ])
    .explode('date')
)
print(df_date_template.head())

shape: (5, 3)
┌────────────┬──────┬────────────┐
│ nbhd_geoid ┆ year ┆ date       │
│ ---        ┆ ---  ┆ ---        │
│ str        ┆ i32  ┆ date       │
╞════════════╪══════╪════════════╡
│ 20610117   ┆ 2015 ┆ 2015-01-01 │
│ 20610117   ┆ 2015 ┆ 2015-01-02 │
│ 20610117   ┆ 2015 ┆ 2015-01-03 │
│ 20610117   ┆ 2015 ┆ 2015-01-04 │
│ 20610117   ┆ 2015 ┆ 2015-01-05 │
└────────────┴──────┴────────────┘

df_counts_raw_polars = (
  df_mort
    .rename({"dod": "date"})
    .group_by(["nbhd_geoid", "date"])
    .agg([
        pl.len().alias("death")
    ])
    .collect()
)
print(df_counts_raw_polars.head())

shape: (5, 3)
┌────────────────┬────────────┬───────┐
│ nbhd_geoid     ┆ date       ┆ death │
│ ---            ┆ ---        ┆ ---   │
│ str            ┆ date       ┆ u32   │
╞════════════════╪════════════╪═══════╡
│ 10224810407    ┆ 2007-09-20 ┆ 1     │
│ 10213616106    ┆ 2004-02-25 ┆ 1     │
│ DSPHPHLNBHD_03 ┆ 2013-10-24 ┆ 1     │
│ 10224817124    ┆ 2008-03-27 ┆ 1     │
│ 10224810170    ┆ 2010-05-09 ┆ 1     │
└────────────────┴────────────┴───────┘

df_counts_polars = (
    df_date_template
    .join(
        df_counts_raw_polars,
        on=["nbhd_geoid", "date"],
        how="left"
    )
    .with_columns([
        pl.col("death").fill_null(0)
    ])
    .filter(
        pl.col("nbhd_geoid").is_in(df_schema.select("nbhd_geoid").to_series().to_list())
    )
    .sort(["nbhd_geoid", "date"])
    .rename({"death": "death_r"})
    .join(
        df_schema.select(["nbhd_geoid", "city_name", "city_geoid", "iso2"]).unique(),
        on="nbhd_geoid",
        how="left"
    )
    ## Remove Belo Horizonte nbhd data before 2007 due to visible undercounting
    .filter(
        ~((pl.col("city_name") == "Belo Horizonte") & (pl.col("year") < 2007))
    )
)
print(df_counts_polars.head())

shape: (5, 7)
┌─────────────┬──────┬────────────┬─────────┬────────────────┬────────────┬──────┐
│ nbhd_geoid  ┆ year ┆ date       ┆ death_r ┆ city_name      ┆ city_geoid ┆ iso2 │
│ ---         ┆ ---  ┆ ---        ┆ ---     ┆ ---            ┆ ---        ┆ ---  │
│ str         ┆ i32  ┆ date       ┆ u32     ┆ str            ┆ str        ┆ str  │
╞═════════════╪══════╪════════════╪═════════╪════════════════╪════════════╪══════╡
│ 10213610100 ┆ 2007 ┆ 2007-01-01 ┆ 0       ┆ Belo Horizonte ┆ 102136     ┆ BR   │
│ 10213610100 ┆ 2007 ┆ 2007-01-02 ┆ 1       ┆ Belo Horizonte ┆ 102136     ┆ BR   │
│ 10213610100 ┆ 2007 ┆ 2007-01-03 ┆ 0       ┆ Belo Horizonte ┆ 102136     ┆ BR   │
│ 10213610100 ┆ 2007 ┆ 2007-01-04 ┆ 0       ┆ Belo Horizonte ┆ 102136     ┆ BR   │
│ 10213610100 ┆ 2007 ┆ 2007-01-05 ┆ 0       ┆ Belo Horizonte ┆ 102136     ┆ BR   │
└─────────────┴──────┴────────────┴─────────┴────────────────┴────────────┴──────┘

Validate DBT

Let’s compare results with DBT.

## Import DBT Results with validations
df_counts = context$counts %>% 
  open_dataset() %>% 
  select(iso2, city_name,nbhd_geoid, date , death) %>% 
  collect()|> 
  arrange(nbhd_geoid, date)  %>% 
  verify(not_na(iso2))%>% 
  verify(not_na(nbhd_geoid  ))%>% 
  verify(not_na(date))  %>% 
  verify(not_na(death)) %>% 
  verify(all(unique(nbhd_geoid) %in% 
               unique(df_schema$nbhd_geoid)))

## Check Identical
identical(
  df_counts,
  df_counts_dplyr %>% 
    left_join(df_schema |> select(nbhd_geoid,iso2, city_name)) |> 
    filter(!(city_name == "Belo Horizonte" & year < 2007)) %>% 
    rename(death = death_r) %>% 
    select(names(df_counts))
) |> 
  assertthat::assert_that()

[1] TRUE

Looks good. The SQL results are identical to the Dplyr results. Increases evidence that our semantics were properly transalted.

Visual Inspection

Not sure how to check. but let’s start with some trends.

Code

# Calculate mean daily deaths by neighborhood
neighborhood_stats <- df_counts %>%
  group_by(iso2, nbhd_geoid) %>%
  summarize(
    mean_daily_deaths = mean(death),
    median_daily_deaths = median(death),
    max_daily_deaths = max(death),
    total_deaths = sum(death),
    .groups = "drop"
  )

# View summary statistics
neighborhood_stats %>%
  group_by(iso2) %>%
  summarize(
    n_neighborhoods = n(),
    avg_mean_deaths = mean(mean_daily_deaths),
    median_mean_deaths = median(mean_daily_deaths),
    min_mean_deaths = min(mean_daily_deaths),
    max_mean_deaths = max(mean_daily_deaths)
  )%>%
  knitr::kable()

iso2	n_neighborhoods	avg_mean_deaths	median_mean_deaths	min_mean_deaths	max_mean_deaths
BR	1242	0.4682538	0.4125463	0.0505718	5.329844
GT	14	3.7810860	1.5515629	0.3212412	27.229067
PA	53	0.4683222	0.4296955	0.0212111	2.046527
US	152	0.6601203	0.5366623	0.1168908	1.823830

Code

# Visualize distribution by country
ggplot(neighborhood_stats, aes(x = iso2, y = mean_daily_deaths)) +
  geom_boxplot() +
  labs(title = "Mean Neighbhborhood-level Daily Deaths by Country",
       x = "Country", y = "Mean Daily Deaths") +
  theme_minimal()

Lets also look at trends

Code

df_counts %>% 
  left_join(df_schema) %>% 
  summarize(death = sum(death), .by = c(date, iso2, city_name)) %>% 
  ggplot(aes(x = date, y = death, group = city_name, color = city_name))+
  geom_line() +
  facet_wrap(~iso2)+
  labs(
    title = 'Something is off with Belo Horizonte numbers',
    subtitle = "City level death trends from nbhd sums for CCUH Cities")

To check with Usama about Belo Horizonte numbers.

3. Death Counts by Age

Here we do death counts stratafied by two age groups: 1) <65 and then 65+. Note htat there are some recrods missing age (<0.1% in USA recrods and ~ 2% in Brazil records); These records were removed from this tabulation.

Develop

import polars as pl
import datetime

df_mort = pl.scan_parquet("//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/harmonized__ccuh_aim_2__death_records.parquet")
df_mort_age = (df_mort
    .with_columns(
        pl.when(pl.col("minage5c") <= 64)
        .then(pl.lit('<65'))  # Add pl.lit() here
        .otherwise(pl.lit('65+'))  # Add pl.lit() here
        .alias("age_category")
    )
)

df_schema = pl.scan_parquet("//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/base__ccuh__aim2_city_subcity_nbhd__schema.parquet").collect()

Let’s do the transformation; first getting some intermediates.

df_nbhd_year = (
  df_mort_age
    .select("nbhd_geoid", "year", "age_category")
    .with_columns(pl.col('year').cast(pl.Int32))
    .unique()
    .collect()
)
print(df_nbhd_year.head())

shape: (5, 3)
┌─────────────┬──────┬──────────────┐
│ nbhd_geoid  ┆ year ┆ age_category │
│ ---         ┆ ---  ┆ ---          │
│ str         ┆ i32  ┆ str          │
╞═════════════╪══════╪══════════════╡
│ 10213614103 ┆ 2019 ┆ 65+          │
│ nbhd_67     ┆ 2016 ┆ <65          │
│ 10224821100 ┆ 2005 ┆ <65          │
│ 10224810374 ┆ 2018 ┆ <65          │
│ 10213610141 ┆ 2014 ┆ 65+          │
└─────────────┴──────┴──────────────┘

df_date_template = (
  df_nbhd_year
    .group_by(['nbhd_geoid', 'year', "age_category"])
    .agg([
        pl.date_range(
            pl.date(pl.col('year').first(), 1, 1),  
            pl.date(pl.col('year').first(), 12, 31), 
            "1d"
        ).alias('date')
    ])
    .explode('date')
)
print(df_date_template.head())

shape: (5, 4)
┌─────────────┬──────┬──────────────┬────────────┐
│ nbhd_geoid  ┆ year ┆ age_category ┆ date       │
│ ---         ┆ ---  ┆ ---          ┆ ---        │
│ str         ┆ i32  ┆ str          ┆ date       │
╞═════════════╪══════╪══════════════╪════════════╡
│ 10222613103 ┆ 2010 ┆ 65+          ┆ 2010-01-01 │
│ 10222613103 ┆ 2010 ┆ 65+          ┆ 2010-01-02 │
│ 10222613103 ┆ 2010 ┆ 65+          ┆ 2010-01-03 │
│ 10222613103 ┆ 2010 ┆ 65+          ┆ 2010-01-04 │
│ 10222613103 ┆ 2010 ┆ 65+          ┆ 2010-01-05 │
└─────────────┴──────┴──────────────┴────────────┘

df_counts_raw_polars = (
  df_mort_age
    .rename({"dod": "date"})
    .group_by(["nbhd_geoid", "date", "age_category"])
    .agg([
        pl.len().alias("death")
    ])
    .collect()
)
print(df_counts_raw_polars.head())

shape: (5, 4)
┌────────────────┬────────────┬──────────────┬───────┐
│ nbhd_geoid     ┆ date       ┆ age_category ┆ death │
│ ---            ┆ ---        ┆ ---          ┆ ---   │
│ str            ┆ date       ┆ str          ┆ u32   │
╞════════════════╪════════════╪══════════════╪═══════╡
│ 20310216       ┆ 2016-05-13 ┆ <65          ┆ 8     │
│ 10224810261    ┆ 2017-02-14 ┆ 65+          ┆ 1     │
│ DSPHPHLNBHD_06 ┆ 2004-08-21 ┆ <65          ┆ 1     │
│ 10224822119    ┆ 2010-03-15 ┆ <65          ┆ 1     │
│ 10224810352    ┆ 2008-02-12 ┆ 65+          ┆ 3     │
└────────────────┴────────────┴──────────────┴───────┘

df_counts_polars = (
    df_date_template
    .join(
        df_counts_raw_polars,
        on=["nbhd_geoid", "date", "age_category"],
        how="left"
    )
    .with_columns([
        pl.col("death").fill_null(0)
    ])
    .filter(
        pl.col("nbhd_geoid").is_in(df_schema.select("nbhd_geoid").to_series().to_list())
    )
    .sort(["nbhd_geoid", "date"])
    .join(
        df_schema.select(["nbhd_geoid", "city_name", "city_geoid", "iso2"]).unique(),
        on="nbhd_geoid",
        how="left"
    )
    ## Remove Belo Horizonte nbhd data before 2007 due to visible undercounting
    .filter(
        ~((pl.col("city_name") == "Belo Horizonte") & (pl.col("year") < 2007))
    )
)
print(df_counts_polars.head())

shape: (5, 8)
┌─────────────┬──────┬──────────────┬────────────┬───────┬────────────────┬────────────┬──────┐
│ nbhd_geoid  ┆ year ┆ age_category ┆ date       ┆ death ┆ city_name      ┆ city_geoid ┆ iso2 │
│ ---         ┆ ---  ┆ ---          ┆ ---        ┆ ---   ┆ ---            ┆ ---        ┆ ---  │
│ str         ┆ i32  ┆ str          ┆ date       ┆ u32   ┆ str            ┆ str        ┆ str  │
╞═════════════╪══════╪══════════════╪════════════╪═══════╪════════════════╪════════════╪══════╡
│ 10213610100 ┆ 2007 ┆ 65+          ┆ 2007-01-01 ┆ 0     ┆ Belo Horizonte ┆ 102136     ┆ BR   │
│ 10213610100 ┆ 2007 ┆ <65          ┆ 2007-01-01 ┆ 0     ┆ Belo Horizonte ┆ 102136     ┆ BR   │
│ 10213610100 ┆ 2007 ┆ 65+          ┆ 2007-01-02 ┆ 1     ┆ Belo Horizonte ┆ 102136     ┆ BR   │
│ 10213610100 ┆ 2007 ┆ <65          ┆ 2007-01-02 ┆ 0     ┆ Belo Horizonte ┆ 102136     ┆ BR   │
│ 10213610100 ┆ 2007 ┆ 65+          ┆ 2007-01-03 ┆ 0     ┆ Belo Horizonte ┆ 102136     ┆ BR   │
└─────────────┴──────┴──────────────┴────────────┴───────┴────────────────┴────────────┴──────┘

Validate

Let’s import the DBT results and just validate against eh unstratified results.

## Import DBT Results
df_counts_age = context$counts_age %>% 
  open_dataset() %>% 
  select(iso2, nbhd_geoid, age_category, date, death) %>% 
  collect()|> 
  arrange(age_category, nbhd_geoid, date) %>% 
  verify(not_na(age_category)) %>% 
  verify(not_na(iso2)) %>% 
  verify(not_na(nbhd_geoid  ))%>% 
  verify(not_na(date))  %>% 
  verify(not_na(death)) %>% 
  verify(all(unique(nbhd_geoid) %in% 
               unique(df_schema$nbhd_geoid)))
 
  
## Dplyr computations
df_counts_reconstructed = df_counts_age |>  
  group_by(iso2, nbhd_geoid, date) |> 
  summarize(death = sum(death)) |> 
  ungroup()|> 
  arrange(nbhd_geoid, date)
 

 

## Check Identical
identical(
  df_counts_reconstructed,
  df_counts %>% 
    select(names(df_counts_reconstructed))
) |> 
  assert_that()

[1] TRUE

Looks good. The DBT strataffied results when aggregated to unstratified are identical to the unstratfiaeid results.

Visual Inspection

Not sure how to check. but let’s start with some trends.

Code

# Calculate mean daily deaths by neighborhood
neighborhood_stats <- df_counts_age %>%
  group_by(iso2, nbhd_geoid, age_category) %>%
  summarize(
    mean_daily_deaths = mean(death),
    median_daily_deaths = median(death),
    max_daily_deaths = max(death),
    total_deaths = sum(death),
    .groups = "drop"
  )

# View summary statistics
neighborhood_stats %>%
  group_by(iso2, age_category) %>%
  summarize(
    n_neighborhoods = n(),
    avg_mean_deaths = mean(mean_daily_deaths),
    median_mean_deaths = median(mean_daily_deaths),
    min_mean_deaths = min(mean_daily_deaths),
    max_mean_deaths = max(mean_daily_deaths)
  )%>%
  knitr::kable()

iso2	age_category	n_neighborhoods	avg_mean_deaths	median_mean_deaths	min_mean_deaths	max_mean_deaths
BR	65+	1242	0.2760775	0.2206860	0.0230311	3.1306168
BR	<65	1242	0.1921762	0.1687872	0.0212595	2.1992269
GT	65+	14	1.7096411	0.6583390	0.1615332	12.3025325
GT	<65	14	2.0714449	0.8932238	0.1597080	14.9265343
PA	65+	53	0.2815362	0.2322956	0.0147109	1.3640096
PA	<65	53	0.1868396	0.1847417	0.0065002	0.6825180
US	65+	152	0.4907350	0.4127485	0.0791572	1.4623259
US	<65	152	0.1693854	0.1069516	0.0274967	0.6043328

Code

# Visualize distribution by country
ggplot(neighborhood_stats, aes(x = iso2, y = mean_daily_deaths, fill = age_category)) +
  geom_boxplot(position = position_dodge(width = 0.8), outlier.alpha = 0.5) +
  scale_fill_brewer(palette = "Set1", name = "Age Group") +
  labs(title = "Mean Daily Deaths by Country and Age Group",
       x = "Country", 
       y = "Mean Daily Deaths") +
  theme_minimal() +
  theme(legend.position = "bottom") +
  scale_y_continuous(
    trans = "log10",  # Log scale to better show distribution
    labels = scales::number_format(accuracy = 0.001)
  ) +
  geom_hline(yintercept = 0.1, linetype = "dashed", color = "gray50", alpha = 0.7)

4. Access

```{r}
df_aim2_mort =  "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/harmonized__ccuh_aim_2__death_records.parquet" |> 
  arrow::open_dataset() 
  
df_aim2_nbhd_deaths =  "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/harmonized__ccuh_aim_2__nbhd_death_counts.parquet" |> 
  arrow::open_dataset() 
  
df_aim2_nbhd_deaths_age =  "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/harmonized__ccuh_aim_2__nbhd_death_counts__age_grp_65.parquet" |> 
  arrow::open_dataset() 
```