harmonized__ccuh_aim_2__nbhd_exposures_wri__v1

Author

Ran Li (Maintainer)

Published

March 14, 2025

# --- Setup ----
# NOTE(review): setwd() in scripts is generally discouraged; kept because the
# notebook's relative paths appear to depend on it -- confirm before removing.
setwd(here::here("ccuh-notebooks/_harmonization/ccuh_aim2/ccuh_aim2_models_v1"))
pacman::p_load(tidyverse, arrow, cli, reactable, readxl, glue, janitor, assertr, haven)

# City/neighborhood schema for CCUH Aim 2 (parquet on the shared drive).
df_schema_aim2 <-
  arrow::open_dataset("//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/schema_ccuh_aim2_city_nbhd/schema_ccuh_aim2_city_nbhd.parquet") |>
  dplyr::collect()

1. Develop

R

# Load the WRI data-API extract from the data warehouse freeze.
wri_api <- "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/wri__data_api__v1.parquet" |>
  open_dataset() |>
  collect()


# Reference result: Brazilian L2_5 (neighborhood) rows for the Aim 2 cities,
# averaged per neighborhood x variable, then widened to one row per nbhd.
df_ref <- wri_api |>
  filter(var_name != 'iisu_cat') |>   # categorical variable; cannot be averaged
  filter(geo == 'L2_5', iso2 == 'BR', salid1 %in% df_schema_aim2$city_geoid) |>
  rename(nbhd_geoid = observation_id) |>
  verify(not_na(value)) |>            # fail fast on missing values
  group_by(nbhd_geoid, var_name) |>
  # Explicit .groups = "drop" replaces the implicit default (which emitted the
  # "summarise() has grouped output" message) and the redundant ungroup().
  summarize(value = mean(as.numeric(value)), .groups = "drop") |>
  pivot_wider(names_from = var_name, values_from = value)

Python

```{python}

import polars as pl

df_schema_aim2 = pl.read_parquet("//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/schema_ccuh_aim2_city_nbhd/schema_ccuh_aim2_city_nbhd.parquet")
wri_api = pl.read_parquet("//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/wri__data_api__v1.parquet")


(wri_api
 .filter(pl.col("var_name") != "iisu_cat")
 .filter((pl.col("geo") == "L2_5") & 
         (pl.col("iso2") == "BR") & 
         pl.col("salid1").is_in(df_schema_aim2["city_geoid"]))
 .rename({"observation_id": "nbhd_geoid"})
 .filter(pl.col("value").is_not_null())
 .group_by(["nbhd_geoid", "var_name"])
 .agg(pl.col("value").cast(pl.Float64).mean())
 .pivot(index="nbhd_geoid", 
        columns="var_name", 
        values="value")
)
```

2. Validate DBT results

Compare with Reference

## Import
# BUG FIX: the original chained assignment (`df_dbt = wri_api = ...`) silently
# clobbered the `wri_api` table loaded earlier; assign only df_dbt here.
df_dbt <- "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/harmonized__ccuh_aim_2__nbhd_exposures_wri__v1.parquet" |> 
  open_dataset() |> 
  collect() |> 
  select(all_of(names(df_ref))) |>  # all_of(): error clearly if a column is absent
  arrange(nbhd_geoid)


## Verify
# Round every numeric column to 1 decimal place on both sides, then require
# the DBT output to be exactly identical to the reference pipeline's result.
assertthat::assert_that(
  identical(
    df_dbt |>
      arrange(nbhd_geoid) |>
      mutate(across(where(is.numeric), \(x) round(x, 1))),
    df_ref |>
      arrange(nbhd_geoid) |>
      mutate(across(where(is.numeric), \(x) round(x, 1)))
  )
)
[1] TRUE

Validations

Let’s do some basic validations.

  • No missing values
  • All expected neighborhoods
# Expected neighborhoods: Brazilian nbhds in the two Aim 2 WRI cities.
wri_nbhds <- df_schema_aim2 |>
  filter(iso2 == 'BR') |>
  filter(city_name %in% c('Belo Horizonte','Campinas')) |>
  pull(nbhd_geoid)

# Fail loudly if any cell is missing or the neighborhood set differs from
# the schema's expectation.
df_dbt <- df_dbt |>
  assert(not_na, everything()) |>
  verify(setequal(unique(nbhd_geoid), unique(wri_nbhds)))

Visual Inspection

Let's just do a visual inspection of four variables.

# Long format for the four inspection variables, with city labels attached.
df_long <- df_dbt |> 
  select(nbhd_geoid, iisu, greenNDVI, perc_green, perc_informal) |> 
  pivot_longer(-nbhd_geoid) |> 
  # Explicit key silences the "Joining with `by = ...`" message and guards
  # against accidental multi-key joins if the schema gains shared columns.
  left_join(df_schema_aim2 |> select(nbhd_geoid, city_name),
            by = "nbhd_geoid")

# Density of each variable by city, free scales per facet.
ggplot(df_long, aes(x = value, fill = city_name)) +
  geom_density(alpha = 0.7) +
  facet_wrap(~name, scales = "free") +
  labs(title = "Distribution of WRI crossectional nbhd data by City",
       x = "Value", y = "Density", fill = "City") +
  theme_minimal()

Looks okay