ccuh_aim2_mortality__v1

Author

Ran Li (Maintainer)

Published

March 2, 2025

Setup
pacman::p_load(
  tidyverse, arrow, glue, cli, janitor, reactable,
  sf, geoarrow, rmapshaper, leaflet, here,
  assertr, assertthat)

library(reticulate)
use_virtualenv(here::here('.venv'))
py_config()
python:         D:/GitHub/ccuh-data-infrastructure/ccuh-notebooks/_harmonization/ccuh_aim2/ccuh_aim2_models_v1/.venv/Scripts/python.exe
libpython:      C:/Users/rl627/AppData/Local/Programs/Python/Python313/python313.dll
pythonhome:     D:/GitHub/ccuh-data-infrastructure/ccuh-notebooks/_harmonization/ccuh_aim2/ccuh_aim2_models_v1/.venv
version:        3.13.2 (tags/v3.13.2:4f8bb39, Feb  4 2025, 15:23:48) [MSC v.1942 64 bit (AMD64)]
Architecture:   64bit
numpy:          D:/GitHub/ccuh-data-infrastructure/ccuh-notebooks/_harmonization/ccuh_aim2/ccuh_aim2_models_v1/.venv/Lib/site-packages/numpy
numpy_version:  2.2.3

NOTE: Python version was forced by use_python() function
Setup
py_list_packages()
package version requirement
asttokens 3.0.0 asttokens==3.0.0
colorama 0.4.6 colorama==0.4.6
comm 0.2.2 comm==0.2.2
debugpy 1.8.12 debugpy==1.8.12
decorator 5.2.1 decorator==5.2.1
executing 2.2.0 executing==2.2.0
ipykernel 6.29.5 ipykernel==6.29.5
ipython 9.0.0 ipython==9.0.0
ipython_pygments_lexers 1.1.1 ipython_pygments_lexers==1.1.1
jedi 0.19.2 jedi==0.19.2
jupyter_client 8.6.3 jupyter_client==8.6.3
jupyter_core 5.7.2 jupyter_core==5.7.2
matplotlib-inline 0.1.7 matplotlib-inline==0.1.7
nest-asyncio 1.6.0 nest-asyncio==1.6.0
numpy 2.2.3 numpy==2.2.3
packaging 24.2 packaging==24.2
parso 0.8.4 parso==0.8.4
platformdirs 4.3.6 platformdirs==4.3.6
polars 1.24.0 polars==1.24.0
prompt_toolkit 3.0.50 prompt_toolkit==3.0.50
psutil 7.0.0 psutil==7.0.0
pure_eval 0.2.3 pure_eval==0.2.3
Pygments 2.19.1 Pygments==2.19.1
python-dateutil 2.9.0.post0 python-dateutil==2.9.0.post0
pywin32 308 pywin32==308
pyzmq 26.2.1 pyzmq==26.2.1
six 1.17.0 six==1.17.0
stack-data 0.6.3 stack-data==0.6.3
tornado 6.4.2 tornado==6.4.2
traitlets 5.14.3 traitlets==5.14.3
wcwidth 0.2.13 wcwidth==0.2.13
Setup
py_module_available("polars")
[1] TRUE
Setup
py_module_available("numpy")
[1] TRUE
Setup
context = lst(
  dbt_server = "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0",
  schema = file.path(dbt_server,'base__ccuh__aim2_city_subcity_nbhd__schema.parquet'),
  records = file.path(dbt_server,'harmonized__ccuh_aim_2__death_records.parquet'),
  counts = file.path(dbt_server,'harmonized__ccuh_aim_2__nbhd_death_counts.parquet'),
  counts_age = file.path(dbt_server,'harmonized__ccuh_aim_2__nbhd_death_counts__age_grp_65.parquet'),
)


## Schema
df_schema = context$schema %>% 
  open_dataset() %>% 
  collect()

1. Death records

These death records are harmonized for CCUH Aim 2; details and documentation can be found at CCUH Aim 2 Harmonized death records (Harmonized Dataset).

Here we just do some exploratory analysis to understand the scope of the data and do preliminary QC.

Import + Validate
df_mort = context$records %>% 
  open_dataset() %>% 
  collect() %>% 
  verify(not_na(nbhd_geoid)) %>% 
  verify(not_na(iso2)) %>% 
  verify(not_na(dod)) %>% 
  verify(all(unique(city_geoid) %in%
               unique(df_schema$city_geoid))) %>%
  verify(all(unique(nbhd_geoid) %in% 
               unique(df_schema$nbhd_geoid)))

Year

Availability Across country and year

Code
df_mort_year = df_mort %>% 
  count(iso2, source_table, year)  %>% 
  collect() %>% 
  arrange(year)%>% 
  mutate(iso2_source = glue("{iso2} ({source_table})"))

## Table
df_mort_year %>% 
  pivot_wider(names_from = year, values_from = n) 
iso2 source_table iso2_source 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022
US base__pennsylvania_records_death_v1 US (base__pennsylvania_records_death_v1) 37865 36835 36859 36685 35895 36344 35259 35205 34576 34409 34233 35376 34920 35224 35252 35858 36379 36907 36902 36950 44402 40881 39726
BR base__salurbal__sim_BR__v1 BR (base__salurbal__sim_BR__v1) NA NA NA 173645 168776 175555 187072 194640 198625 206220 214386 214831 214509 220794 225234 228127 238044 231127 239244 241595 NA NA NA
GT base__salurbal__din_GT__v1 GT (base__salurbal__din_GT__v1) NA NA NA NA NA NA NA NA NA 16852 18033 17509 17120 18982 18987 19674 19831 19524 19628 20578 25297 NA NA
PA base__salurbal__din_PA__v1 PA (base__salurbal__din_PA__v1) NA NA NA NA NA NA NA NA NA NA NA NA 7667 8331 7966 8560 8773 8932 9165 NA 13158 NA NA
Code
## Plot
df_mort_year %>% 
  # Assuming iso2_source is already a column in your data
  # If not, you might need to create it with mutate first
  ggplot(aes(x = year, y = n, color = iso2_source)) +
  geom_line(size = 1) +
  geom_point(size = 2.5, alpha = 0.7) +
  # Add text labels at the end of each line
  geom_text(data = df_mort_year %>% 
              group_by(iso2_source) %>% 
              filter(year == max(year)) %>%
              ungroup(),
            aes(label = iso2),
            hjust = -0.3, 
            vjust = 0.5,
            fontface = "bold",
            size = 3.5) +
  scale_y_log10(labels = scales::comma_format()) +
  scale_color_brewer(palette = "Set1", name = "Country/Source") +
  labs(
    title = "Death Records by Country and Year",
    subtitle = "Log scale used to show variation across different data sources",
    x = "Year",
    y = "Number of Records (log scale)",
    caption = "Data source: CCUH Aim 2 Harmonized death records"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 12, color = "gray40"),
    axis.title = element_text(face = "bold"),
    legend.position = "none",
    panel.grid.minor = element_blank(),
    panel.border = element_rect(fill = NA, color = "gray80"),
    plot.caption = element_text(hjust = 0, size = 9, color = "gray40"),
    plot.margin = margin(t = 10, r = 15, b = 10, l = 10)
  ) +
  scale_x_continuous(
    breaks = seq(min(df_mort_year$year), max(df_mort_year$year), by = 2),
    limits = c(min(df_mort_year$year), max(df_mort_year$year) + 1)  # Add space for labels
  ) +
  guides(color = guide_legend(override.aes = list(size = 2), ncol = 1))

Looks okay to me so far.

Age

Missingness in age by country and year

Code
# Analyze missingness in age by country and year
age_missingness <- df_mort %>%
  group_by(iso2, year) %>%
  summarize(
    total_records = n(),
    missing_age = sum(is.na(dinage)),
    pct_missing = round(missing_age / total_records * 100, 2)
  ) %>%
  collect() %>%
  arrange(year) %>% 
  ungroup() %>% 
  verify(
    filter(.,iso2!='BR') %>%
      pull(pct_missing) < 3)

# Create a table of the results
age_missingness %>%
  pivot_wider(
    id_cols = iso2,
    names_from = year,
    values_from = pct_missing
  )
iso2 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022
US 0.1 0.07 0.16 0.13 0.14 0.15 0.1 0.12 0.14 0.11 0.13 4e-02 1e-02 1e-02 4e-02 4e-02 5e-02 7e-02 5e-02 5e-02 0.02 0.03 0.02
BR NA NA NA 100.00 100.00 100.00 100.0 100.00 100.00 100.00 100.00 1e+02 1e+02 1e+02 1e+02 1e+02 1e+02 1e+02 1e+02 1e+02 NA NA NA
GT NA NA NA NA NA NA NA NA NA 0.00 0.00 0e+00 0e+00 0e+00 0e+00 0e+00 0e+00 0e+00 0e+00 0e+00 0.00 NA NA
PA NA NA NA NA NA NA NA NA NA NA NA NA 0e+00 0e+00 0e+00 0e+00 0e+00 0e+00 0e+00 NA 0.00 NA NA
  • BR does nto ahve ‘DINAGE’ or age of death in years
  • other ocuntires this has minimal < 1% missingness

Let’s do a EDA plot on DINAGE.

Code
# Create stacked bar charts for death distribution by age
# First, prepare the data by age group and year

# Create age groups based on dinage
age_distribution <- df_mort %>%
  mutate(
    # Create age group labels based on dinage
    age_group = case_when(
      is.na(dinage) ~ "Unknown",
      dinage < 5 ~ "0-4",
      dinage < 15 ~ "5-14",
      dinage < 25 ~ "15-24",
      dinage < 45 ~ "25-44",
      dinage < 65 ~ "45-64",
      dinage < 85 ~ "65-84",
      TRUE ~ "85+"
    ),
    # Convert year to a standard numeric format
    year = as.numeric(year)
  ) %>%
  group_by(iso2, year, age_group) %>%
  summarize(deaths = n()) %>%
  collect() %>% 
  # Make sure age groups are ordered correctly for the visualization
  mutate(age_group = factor(age_group, levels = c("0-4", "5-14", "15-24", "25-44", "45-64", "65-84", "85+", "Unknown")))

# Calculate percentage distribution
age_pct_distribution <- age_distribution %>%
  group_by(iso2, year) %>%
  mutate(
    total_deaths = sum(deaths),
    pct_deaths = deaths / total_deaths * 100
  ) %>%
  ungroup()

# 100% stacked bar chart showing proportions
ggplot(age_pct_distribution, aes(x = year, y = pct_deaths, fill = age_group)) +
  geom_col(position = "fill") +
  facet_wrap(~ iso2, ncol = 1) +
  scale_fill_brewer(palette = "RdYlBu", name = "Age Group", direction = -1) +
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  labs(
    title = "Proportional Distribution of Deaths by Age Group and Year",
    subtitle = "100% stacked bars (based on dinage)",
    x = "Year",
    y = "Percentage of Deaths"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text = element_text(face = "bold", size = 12),
    legend.position = "right"
  )

Makes sense. Some interesting insights:

  • infnat/child mortality in GT and PA are decreasing? Thats good right?

Age 5 year categories

  • Brazil does nto have age in years
  • Brazil only has 5 year age cats
Code
# Analyze missingness in age by country and year
age_cat_missingness <- df_mort %>%
  group_by(iso2, year) %>%
  summarize(
    total_records = n(),
    missing_age = sum(is.na(minage5c)),
    pct_missing = round(missing_age / total_records * 100, 2)
  ) %>%
  collect() %>%
  arrange(year) %>% 
  ungroup()

# Create a table of the results
age_cat_missingness %>%
  pivot_wider(
    id_cols = iso2,
    names_from = year,
    values_from = pct_missing
  ) 
iso2 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022
US 0.1 0.07 0.16 0.13 0.14 0.15 0.10 0.12 0.14 0.11 0.13 0.04 0.01 0.01 0.04 0.04 0.05 0.07 0.05 0.05 0.02 0.03 0.02
BR NA NA NA 2.46 2.62 0.10 2.29 2.16 2.16 2.15 2.05 2.14 2.30 2.20 2.23 2.25 2.01 2.11 2.06 1.98 NA NA NA
GT NA NA NA NA NA NA NA NA NA 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 NA NA
PA NA NA NA NA NA NA NA NA NA NA NA NA 0.00 0.00 0.00 0.00 0.00 0.00 0.00 NA 0.00 NA NA
Code
# Visualize missingness
ggplot(age_cat_missingness, aes(x = year, y = pct_missing, color = iso2)) +
  geom_line(size = 1) +
  geom_point(size = 2.5) +
  facet_wrap(~ iso2, ncol = 2) +
  labs(
    title = "Percentage of Missing Age Values by Country and Year",
    x = "Year",
    y = "Percentage Missing (%)"
  ) +
  theme_minimal() +
  scale_color_brewer(palette = "Set1") +
  theme(
    legend.position = "none",
    panel.grid.minor = element_blank(),
    plot.title = element_text(face = "bold"),
    strip.text = element_text(face = "bold", size = 12)
  ) +
  ylim(0,100)

  • 5 year age categories is available in all datasets
  • there is < 3 % missingness of this in BR and < 1% in all other countries

Let’s jsut check the the age categories are consisnte across the harmozianitons.

Code
# EDA plot for minage5c with three age groups

# Create simplified age groups based on minage5c
age_distribution_three_groups <- df_mort %>%
  mutate(
    # Create three age group categories
    age_group = case_when(
      is.na(minage5c) ~ "Unknown",
      minage5c < 5 ~ "0-4",
      minage5c >= 65 ~ "65+",
      TRUE ~ "5-64"
    ),
    # Convert year to a standard numeric format
    year = as.numeric(year)
  ) %>%
  group_by(iso2, year, age_group) %>%
  summarize(deaths = n()) %>%
  collect() %>%
  # Make sure age groups are ordered correctly for the visualization
  mutate(age_group = factor(age_group, levels = c("0-4", "5-64", "65+", "Unknown")))

# Calculate percentage distribution
age_pct_three_groups <- age_distribution_three_groups %>%
  group_by(iso2, year) %>%
  mutate(
    total_deaths = sum(deaths),
    pct_deaths = deaths / total_deaths * 100
  ) %>%
  ungroup()

# 100% stacked bar chart showing proportions
ggplot(age_pct_three_groups, aes(x = year, y = pct_deaths, fill = age_group)) +
  geom_col(position = "fill") +
  facet_wrap(~ iso2, ncol = 2) +
  scale_fill_manual(
    values = c("0-4" = "#2c7bb6", "5-64" = "#ffffbf", "65+" = "#d7191c", "Unknown" = "#cccccc"),
    name = "Age Group"
  ) +
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  labs(
    title = "Proportional Distribution of Deaths by Age Group and Year",
    subtitle = "Showing three main age groups: 0-4, 5-64, and 65+",
    x = "Year",
    y = "Percentage of Deaths"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text = element_text(face = "bold", size = 12),
    legend.position = "right"
  )

Looks sensible.

  • Most deaths are older grousp in all counrites
  • In GT and PA there are relativel ymore 0-4 deaths

2. Death Counts

This is a translation of legacy code into our datawarehouse. The legacy logic from Derek’s :

```{r}
## Connect to hamronized DIN database v1
db_harmonized__salurbal_din__v1 = '//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/dev/harmonized__salurbal_din__v1.parquet' %>% 
  arrow::open_dataset() |> 
  rename(date = dod, age = dinage ) 

## L2 counts
df_L2 <- db_harmonized__salurbal_din__v1  %>% 
  summarize(death = n(), .by = c(salid2, date)) |> 
  collect() %>% 
  complete(salid2, date, fill = list(death = 0)) |> 
  arrange(salid2, date)

## L2 by age group counts
df_L2_age <- db_harmonized__salurbal_din__v1 |> 
  mutate(age_category = ifelse(age <= 64, "<65", "65+")) |> 
  summarize(death = n(), .by = c(salid2, date, age_category)) |> 
  collect() %>% 
  complete(salid2, date, age_category,
           fill = list(death = 0)) |> 
  arrange(salid2, date)

```

Here we translate this logic into the harmonized Aim 2 death records set. THe rationale: docueminetn lineage for transaprency and 2) so this can be reused across manuscripts. So the deliverable here is

  • daily counts at nbhd level
  • Dates without any deaths should be filled with NA or 0.
  • two datasets: 1) total 2) stratafied by age>65

These were done in SQL models. The logic was not too complex to warrant a python model or excesively complex to require a dplyr notebook. Below we just quality check the results.

Develop

Dplyr

Let’s first operationalize death counts in R.

df_nbhd_year = df_mort %>% 
  select(nbhd_geoid, year) %>% 
  distinct() 
head(df_nbhd_year)
nbhd_geoid year
20310210 2020
20310216 2020
20310215 2020
20310220 2020
20310213 2020
20310219 2020
df_date_template <- df_nbhd_year %>%
  mutate(
    start_date = make_date(year, 1, 1),
    end_date = make_date(year, 12, 31)
  ) %>%
  rowwise() %>%
  mutate(date = list(seq(start_date, end_date, by = "day"))) %>%
  unnest(date) %>%
  select(nbhd_geoid, year, date) 
head(df_date_template)
nbhd_geoid year date
20310210 2020 2020-01-01
20310210 2020 2020-01-02
20310210 2020 2020-01-03
20310210 2020 2020-01-04
20310210 2020 2020-01-05
20310210 2020 2020-01-06
df_counts_raw_dplyr = df_mort %>% 
  rename(date = dod) %>% 
  summarize(death = n(), .by = c(nbhd_geoid, date)) |> 
  collect()
head(df_counts_raw_dplyr)
nbhd_geoid date death
20310210 2020-07-13 75
20310216 2020-12-05 10
20310215 2020-11-04 6
20310210 2020-03-04 24
20310210 2020-06-19 50
20310210 2020-10-29 26
df_counts_dplyr = df_date_template %>% 
  left_join(df_counts_raw_dplyr) %>% 
  mutate(death = replace_na(death, 0)) %>% 
  filter(nbhd_geoid %in%df_schema$nbhd_geoid) %>% 
  arrange(nbhd_geoid, date) %>% 
  rename(death_r = death) %>%
  left_join(select(df_schema, nbhd_geoid, iso2) %>% distinct()) %>%
  add_count(nbhd_geoid, year, name = 'n_days') %>% 
  ## Validations
  verify(not_na(iso2))%>%
  verify(not_na(nbhd_geoid  ))%>%
  verify(not_na(date)) %>%
  verify(not_na(death_r)) %>% 
  verify(between(n_days, 365,366))
head(df_counts_dplyr)
nbhd_geoid year date death_r iso2 n_days
10213610100 2003 2003-01-01 1 BR 365
10213610100 2003 2003-01-02 0 BR 365
10213610100 2003 2003-01-03 0 BR 365
10213610100 2003 2003-01-04 0 BR 365
10213610100 2003 2003-01-05 0 BR 365
10213610100 2003 2003-01-06 0 BR 365

Belo Horizonte Undercounting

Lets do a quick trends EDA

Code
df_counts_dplyr %>%
  rename(death = death_r) %>% 
  left_join(df_schema) %>% 
  summarize(death = sum(death), .by = c(date, iso2, city_name)) %>% 
  ggplot(aes(x = date, y = death, group = city_name, color = city_name))+
  geom_line() +
  facet_wrap(~iso2)+
  labs(
    title = 'Something is off with Belo Horizonte numbers',
    subtitle = "City level death trends from nbhd sums for CCUH Cities")

To check with Usama about Belo Horizonte numbers. Lets dial in on neighborhood level for Belo Horizonte. First let’s see what year had that drop

Code
df_qc = df_counts_dplyr %>% 
  left_join(df_schema)  %>% 
  rename(death = death_r) %>% 
  mutate(month = month(date)) %>% 
  group_by(city_name, city_geoid, nbhd_geoid, month, year) %>% 
  summarize(death = sum(death)) %>% 
  ungroup() 
  

df_qc %>% 
  filter(city_name == 'Belo Horizonte') %>% 
  mutate(date = make_date(year, month, 1) ) %>% 
  ggplot(aes(x = date, y = death, group = nbhd_geoid))+
  geom_line(alpha = 0.05)+
  theme(
    legend.position = 'none'
  )+
  labs(
    title = 'Neighborhood death trends Belo Horizonte',
    subtitle = 'weird shift starting 2006 (hihglihghted as between red lines)'
  ) +
  geom_vline(xintercept = make_date(2006, 1, 1),
             color = 'red', linetype = 'dashed', alpha = 1) +
  geom_vline(xintercept = make_date(2006, 6, 1),
             color = 'red', linetype = 'dashed', alpha = 1) +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y")

For now Will quarentine the data for this city prior to 2007 due to visible undercounting.

Polars

Polars.

import polars as pl
import datetime

df_mort = pl.scan_parquet("//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/harmonized__ccuh_aim_2__death_records.parquet")

df_schema = pl.scan_parquet("//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/base__ccuh__aim2_city_subcity_nbhd__schema.parquet").collect()

Let’s do the transformation; first getting some intermediates.

df_nbhd_year = (
  df_mort
    .select("nbhd_geoid", "year")
    .with_columns(pl.col('year').cast(pl.Int32))
    .unique()
    .collect()
)
print(df_nbhd_year.head())
shape: (5, 2)
┌────────────────┬──────┐
│ nbhd_geoid     ┆ year │
│ ---            ┆ ---  │
│ str            ┆ i32  │
╞════════════════╪══════╡
│ 10219013108    ┆ 2008 │
│ DSPHPHLNBHD_36 ┆ 2011 │
│ nbhd_7         ┆ 2013 │
│ 10224810397    ┆ 2007 │
│ 10224820108    ┆ 2011 │
└────────────────┴──────┘
df_date_template = (
  df_nbhd_year
    .group_by(['nbhd_geoid', 'year'])
    .agg([
        pl.date_range(
            pl.date(pl.col('year').first(), 1, 1),  
            pl.date(pl.col('year').first(), 12, 31), 
            "1d"
        ).alias('date')
    ])
    .explode('date')
)
print(df_date_template.head())
shape: (5, 3)
┌────────────┬──────┬────────────┐
│ nbhd_geoid ┆ year ┆ date       │
│ ---        ┆ ---  ┆ ---        │
│ str        ┆ i32  ┆ date       │
╞════════════╪══════╪════════════╡
│ 20610117   ┆ 2015 ┆ 2015-01-01 │
│ 20610117   ┆ 2015 ┆ 2015-01-02 │
│ 20610117   ┆ 2015 ┆ 2015-01-03 │
│ 20610117   ┆ 2015 ┆ 2015-01-04 │
│ 20610117   ┆ 2015 ┆ 2015-01-05 │
└────────────┴──────┴────────────┘
df_counts_raw_polars = (
  df_mort
    .rename({"dod": "date"})
    .group_by(["nbhd_geoid", "date"])
    .agg([
        pl.len().alias("death")
    ])
    .collect()
)
print(df_counts_raw_polars.head())
shape: (5, 3)
┌────────────────┬────────────┬───────┐
│ nbhd_geoid     ┆ date       ┆ death │
│ ---            ┆ ---        ┆ ---   │
│ str            ┆ date       ┆ u32   │
╞════════════════╪════════════╪═══════╡
│ 10224810407    ┆ 2007-09-20 ┆ 1     │
│ 10213616106    ┆ 2004-02-25 ┆ 1     │
│ DSPHPHLNBHD_03 ┆ 2013-10-24 ┆ 1     │
│ 10224817124    ┆ 2008-03-27 ┆ 1     │
│ 10224810170    ┆ 2010-05-09 ┆ 1     │
└────────────────┴────────────┴───────┘
df_counts_polars = (
    df_date_template
    .join(
        df_counts_raw_polars,
        on=["nbhd_geoid", "date"],
        how="left"
    )
    .with_columns([
        pl.col("death").fill_null(0)
    ])
    .filter(
        pl.col("nbhd_geoid").is_in(df_schema.select("nbhd_geoid").to_series().to_list())
    )
    .sort(["nbhd_geoid", "date"])
    .rename({"death": "death_r"})
    .join(
        df_schema.select(["nbhd_geoid", "city_name", "city_geoid", "iso2"]).unique(),
        on="nbhd_geoid",
        how="left"
    )
    ## Remove Belo Horizonte nbhd data before 2007 due to visible undercounting
    .filter(
        ~((pl.col("city_name") == "Belo Horizonte") & (pl.col("year") < 2007))
    )
)
print(df_counts_polars.head())
shape: (5, 7)
┌─────────────┬──────┬────────────┬─────────┬────────────────┬────────────┬──────┐
│ nbhd_geoid  ┆ year ┆ date       ┆ death_r ┆ city_name      ┆ city_geoid ┆ iso2 │
│ ---         ┆ ---  ┆ ---        ┆ ---     ┆ ---            ┆ ---        ┆ ---  │
│ str         ┆ i32  ┆ date       ┆ u32     ┆ str            ┆ str        ┆ str  │
╞═════════════╪══════╪════════════╪═════════╪════════════════╪════════════╪══════╡
│ 10213610100 ┆ 2007 ┆ 2007-01-01 ┆ 0       ┆ Belo Horizonte ┆ 102136     ┆ BR   │
│ 10213610100 ┆ 2007 ┆ 2007-01-02 ┆ 1       ┆ Belo Horizonte ┆ 102136     ┆ BR   │
│ 10213610100 ┆ 2007 ┆ 2007-01-03 ┆ 0       ┆ Belo Horizonte ┆ 102136     ┆ BR   │
│ 10213610100 ┆ 2007 ┆ 2007-01-04 ┆ 0       ┆ Belo Horizonte ┆ 102136     ┆ BR   │
│ 10213610100 ┆ 2007 ┆ 2007-01-05 ┆ 0       ┆ Belo Horizonte ┆ 102136     ┆ BR   │
└─────────────┴──────┴────────────┴─────────┴────────────────┴────────────┴──────┘

Validate DBT

Let’s compare results with DBT.

## Import DBT Results with validations
df_counts = context$counts %>% 
  open_dataset() %>% 
  select(iso2, city_name,nbhd_geoid, date , death) %>% 
  collect()|> 
  arrange(nbhd_geoid, date)  %>% 
  verify(not_na(iso2))%>% 
  verify(not_na(nbhd_geoid  ))%>% 
  verify(not_na(date))  %>% 
  verify(not_na(death)) %>% 
  verify(all(unique(nbhd_geoid) %in% 
               unique(df_schema$nbhd_geoid)))

## Check Identical
identical(
  df_counts,
  df_counts_dplyr %>% 
    left_join(df_schema |> select(nbhd_geoid,iso2, city_name)) |> 
    filter(!(city_name == "Belo Horizonte" & year < 2007)) %>% 
    rename(death = death_r) %>% 
    select(names(df_counts))
) |> 
  assertthat::assert_that()
[1] TRUE

Looks good. The SQL results are identical to the Dplyr results. Increases evidence that our semantics were properly transalted.

Visual Inspection

Not sure how to check. but let’s start with some trends.

Code
# Calculate mean daily deaths by neighborhood
neighborhood_stats <- df_counts %>%
  group_by(iso2, nbhd_geoid) %>%
  summarize(
    mean_daily_deaths = mean(death),
    median_daily_deaths = median(death),
    max_daily_deaths = max(death),
    total_deaths = sum(death),
    .groups = "drop"
  )

# View summary statistics
neighborhood_stats %>%
  group_by(iso2) %>%
  summarize(
    n_neighborhoods = n(),
    avg_mean_deaths = mean(mean_daily_deaths),
    median_mean_deaths = median(mean_daily_deaths),
    min_mean_deaths = min(mean_daily_deaths),
    max_mean_deaths = max(mean_daily_deaths)
  )%>%
  knitr::kable()
iso2 n_neighborhoods avg_mean_deaths median_mean_deaths min_mean_deaths max_mean_deaths
BR 1242 0.4682538 0.4125463 0.0505718 5.329844
GT 14 3.7810860 1.5515629 0.3212412 27.229067
PA 53 0.4683222 0.4296955 0.0212111 2.046527
US 152 0.6601203 0.5366623 0.1168908 1.823830
Code
# Visualize distribution by country
ggplot(neighborhood_stats, aes(x = iso2, y = mean_daily_deaths)) +
  geom_boxplot() +
  labs(title = "Mean Neighbhborhood-level Daily Deaths by Country",
       x = "Country", y = "Mean Daily Deaths") +
  theme_minimal()

Lets also look at trends

Code
df_counts %>% 
  left_join(df_schema) %>% 
  summarize(death = sum(death), .by = c(date, iso2, city_name)) %>% 
  ggplot(aes(x = date, y = death, group = city_name, color = city_name))+
  geom_line() +
  facet_wrap(~iso2)+
  labs(
    title = 'Something is off with Belo Horizonte numbers',
    subtitle = "City level death trends from nbhd sums for CCUH Cities")

To check with Usama about Belo Horizonte numbers.

3. Death Counts by Age

Here we do death counts stratafied by two age groups: 1) <65 and then 65+. Note htat there are some recrods missing age (<0.1% in USA recrods and ~ 2% in Brazil records); These records were removed from this tabulation.

Develop

import polars as pl
import datetime

df_mort = pl.scan_parquet("//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/harmonized__ccuh_aim_2__death_records.parquet")
df_mort_age = (df_mort
    .with_columns(
        pl.when(pl.col("minage5c") <= 64)
        .then(pl.lit('<65'))  # Add pl.lit() here
        .otherwise(pl.lit('65+'))  # Add pl.lit() here
        .alias("age_category")
    )
)

df_schema = pl.scan_parquet("//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/base__ccuh__aim2_city_subcity_nbhd__schema.parquet").collect()

Let’s do the transformation; first getting some intermediates.

df_nbhd_year = (
  df_mort_age
    .select("nbhd_geoid", "year", "age_category")
    .with_columns(pl.col('year').cast(pl.Int32))
    .unique()
    .collect()
)
print(df_nbhd_year.head())
shape: (5, 3)
┌─────────────┬──────┬──────────────┐
│ nbhd_geoid  ┆ year ┆ age_category │
│ ---         ┆ ---  ┆ ---          │
│ str         ┆ i32  ┆ str          │
╞═════════════╪══════╪══════════════╡
│ 10213614103 ┆ 2019 ┆ 65+          │
│ nbhd_67     ┆ 2016 ┆ <65          │
│ 10224821100 ┆ 2005 ┆ <65          │
│ 10224810374 ┆ 2018 ┆ <65          │
│ 10213610141 ┆ 2014 ┆ 65+          │
└─────────────┴──────┴──────────────┘
df_date_template = (
  df_nbhd_year
    .group_by(['nbhd_geoid', 'year', "age_category"])
    .agg([
        pl.date_range(
            pl.date(pl.col('year').first(), 1, 1),  
            pl.date(pl.col('year').first(), 12, 31), 
            "1d"
        ).alias('date')
    ])
    .explode('date')
)
print(df_date_template.head())
shape: (5, 4)
┌─────────────┬──────┬──────────────┬────────────┐
│ nbhd_geoid  ┆ year ┆ age_category ┆ date       │
│ ---         ┆ ---  ┆ ---          ┆ ---        │
│ str         ┆ i32  ┆ str          ┆ date       │
╞═════════════╪══════╪══════════════╪════════════╡
│ 10222613103 ┆ 2010 ┆ 65+          ┆ 2010-01-01 │
│ 10222613103 ┆ 2010 ┆ 65+          ┆ 2010-01-02 │
│ 10222613103 ┆ 2010 ┆ 65+          ┆ 2010-01-03 │
│ 10222613103 ┆ 2010 ┆ 65+          ┆ 2010-01-04 │
│ 10222613103 ┆ 2010 ┆ 65+          ┆ 2010-01-05 │
└─────────────┴──────┴──────────────┴────────────┘
df_counts_raw_polars = (
  df_mort_age
    .rename({"dod": "date"})
    .group_by(["nbhd_geoid", "date", "age_category"])
    .agg([
        pl.len().alias("death")
    ])
    .collect()
)
print(df_counts_raw_polars.head())
shape: (5, 4)
┌────────────────┬────────────┬──────────────┬───────┐
│ nbhd_geoid     ┆ date       ┆ age_category ┆ death │
│ ---            ┆ ---        ┆ ---          ┆ ---   │
│ str            ┆ date       ┆ str          ┆ u32   │
╞════════════════╪════════════╪══════════════╪═══════╡
│ 20310216       ┆ 2016-05-13 ┆ <65          ┆ 8     │
│ 10224810261    ┆ 2017-02-14 ┆ 65+          ┆ 1     │
│ DSPHPHLNBHD_06 ┆ 2004-08-21 ┆ <65          ┆ 1     │
│ 10224822119    ┆ 2010-03-15 ┆ <65          ┆ 1     │
│ 10224810352    ┆ 2008-02-12 ┆ 65+          ┆ 3     │
└────────────────┴────────────┴──────────────┴───────┘
df_counts_polars = (
    df_date_template
    .join(
        df_counts_raw_polars,
        on=["nbhd_geoid", "date", "age_category"],
        how="left"
    )
    .with_columns([
        pl.col("death").fill_null(0)
    ])
    .filter(
        pl.col("nbhd_geoid").is_in(df_schema.select("nbhd_geoid").to_series().to_list())
    )
    .sort(["nbhd_geoid", "date"])
    .join(
        df_schema.select(["nbhd_geoid", "city_name", "city_geoid", "iso2"]).unique(),
        on="nbhd_geoid",
        how="left"
    )
    ## Remove Belo Horizonte nbhd data before 2007 due to visible undercounting
    .filter(
        ~((pl.col("city_name") == "Belo Horizonte") & (pl.col("year") < 2007))
    )
)
print(df_counts_polars.head())
shape: (5, 8)
┌─────────────┬──────┬──────────────┬────────────┬───────┬────────────────┬────────────┬──────┐
│ nbhd_geoid  ┆ year ┆ age_category ┆ date       ┆ death ┆ city_name      ┆ city_geoid ┆ iso2 │
│ ---         ┆ ---  ┆ ---          ┆ ---        ┆ ---   ┆ ---            ┆ ---        ┆ ---  │
│ str         ┆ i32  ┆ str          ┆ date       ┆ u32   ┆ str            ┆ str        ┆ str  │
╞═════════════╪══════╪══════════════╪════════════╪═══════╪════════════════╪════════════╪══════╡
│ 10213610100 ┆ 2007 ┆ 65+          ┆ 2007-01-01 ┆ 0     ┆ Belo Horizonte ┆ 102136     ┆ BR   │
│ 10213610100 ┆ 2007 ┆ <65          ┆ 2007-01-01 ┆ 0     ┆ Belo Horizonte ┆ 102136     ┆ BR   │
│ 10213610100 ┆ 2007 ┆ 65+          ┆ 2007-01-02 ┆ 1     ┆ Belo Horizonte ┆ 102136     ┆ BR   │
│ 10213610100 ┆ 2007 ┆ <65          ┆ 2007-01-02 ┆ 0     ┆ Belo Horizonte ┆ 102136     ┆ BR   │
│ 10213610100 ┆ 2007 ┆ 65+          ┆ 2007-01-03 ┆ 0     ┆ Belo Horizonte ┆ 102136     ┆ BR   │
└─────────────┴──────┴──────────────┴────────────┴───────┴────────────────┴────────────┴──────┘

Validate

Let’s import the DBT results and just validate against eh unstratified results.

## Import DBT Results
df_counts_age = context$counts_age %>% 
  open_dataset() %>% 
  select(iso2, nbhd_geoid, age_category, date, death) %>% 
  collect()|> 
  arrange(age_category, nbhd_geoid, date) %>% 
  verify(not_na(age_category)) %>% 
  verify(not_na(iso2)) %>% 
  verify(not_na(nbhd_geoid  ))%>% 
  verify(not_na(date))  %>% 
  verify(not_na(death)) %>% 
  verify(all(unique(nbhd_geoid) %in% 
               unique(df_schema$nbhd_geoid)))
 
  
## Dplyr computations
df_counts_reconstructed = df_counts_age |>  
  group_by(iso2, nbhd_geoid, date) |> 
  summarize(death = sum(death)) |> 
  ungroup()|> 
  arrange(nbhd_geoid, date)
 

 

## Check Identical
identical(
  df_counts_reconstructed,
  df_counts %>% 
    select(names(df_counts_reconstructed))
) |> 
  assert_that()
[1] TRUE

Looks good. The DBT strataffied results when aggregated to unstratified are identical to the unstratfiaeid results.

Visual Inspection

Not sure how to check. but let’s start with some trends.

Code
# Calculate mean daily deaths by neighborhood
neighborhood_stats <- df_counts_age %>%
  group_by(iso2, nbhd_geoid, age_category) %>%
  summarize(
    mean_daily_deaths = mean(death),
    median_daily_deaths = median(death),
    max_daily_deaths = max(death),
    total_deaths = sum(death),
    .groups = "drop"
  )

# View summary statistics
neighborhood_stats %>%
  group_by(iso2, age_category) %>%
  summarize(
    n_neighborhoods = n(),
    avg_mean_deaths = mean(mean_daily_deaths),
    median_mean_deaths = median(mean_daily_deaths),
    min_mean_deaths = min(mean_daily_deaths),
    max_mean_deaths = max(mean_daily_deaths)
  )%>%
  knitr::kable()
iso2 age_category n_neighborhoods avg_mean_deaths median_mean_deaths min_mean_deaths max_mean_deaths
BR 65+ 1242 0.2760775 0.2206860 0.0230311 3.1306168
BR <65 1242 0.1921762 0.1687872 0.0212595 2.1992269
GT 65+ 14 1.7096411 0.6583390 0.1615332 12.3025325
GT <65 14 2.0714449 0.8932238 0.1597080 14.9265343
PA 65+ 53 0.2815362 0.2322956 0.0147109 1.3640096
PA <65 53 0.1868396 0.1847417 0.0065002 0.6825180
US 65+ 152 0.4907350 0.4127485 0.0791572 1.4623259
US <65 152 0.1693854 0.1069516 0.0274967 0.6043328
Code
# Visualize distribution by country
ggplot(neighborhood_stats, aes(x = iso2, y = mean_daily_deaths, fill = age_category)) +
  geom_boxplot(position = position_dodge(width = 0.8), outlier.alpha = 0.5) +
  scale_fill_brewer(palette = "Set1", name = "Age Group") +
  labs(title = "Mean Daily Deaths by Country and Age Group",
       x = "Country", 
       y = "Mean Daily Deaths") +
  theme_minimal() +
  theme(legend.position = "bottom") +
  scale_y_continuous(
    trans = "log10",  # Log scale to better show distribution
    labels = scales::number_format(accuracy = 0.001)
  ) +
  geom_hline(yintercept = 0.1, linetype = "dashed", color = "gray50", alpha = 0.7)

4. Access

```{r}
df_aim2_mort =  "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/harmonized__ccuh_aim_2__death_records.parquet" |> 
  arrow::open_dataset() 
  
df_aim2_nbhd_deaths =  "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/harmonized__ccuh_aim_2__nbhd_death_counts.parquet" |> 
  arrow::open_dataset() 
  
df_aim2_nbhd_deaths_age =  "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/harmonized__ccuh_aim_2__nbhd_death_counts__age_grp_65.parquet" |> 
  arrow::open_dataset() 
```