# Setup: move to the notebook's folder (project-relative path via here) and
# load all packages used below in one call.
# NOTE(review): setwd() is generally discouraged in scripts; acceptable here
# because this is an interactive notebook and the path is project-relative.
setwd(here::here("ccuh-notebooks/_harmonization/ccuh_aim2/ccuh_aim2_models_v1"))
pacman::p_load(tidyverse, arrow, cli, reactable, readxl, glue, janitor, assertr, haven)
# Import the Aim 2 city/neighborhood schema (parquet on the shared drive).
# One row per neighborhood with its parent city id (city_geoid) — used below
# to restrict the WRI data to Aim 2 cities.
df_schema_aim2 = "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/schema_ccuh_aim2_city_nbhd/schema_ccuh_aim2_city_nbhd.parquet" |>
  arrow::open_dataset() |>
  dplyr::collect()
harmonized__ccuh_aim_2__nbhd_exposures_wri__v1
1. Develop the reference implementation

R
# Import the raw WRI API extract (long format: one row per
# observation_id x var_name x value) from the data warehouse freeze.
wri_api = "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/wri__data_api__v1.parquet" |>
  open_dataset() |>
  collect()
# Build the reference table to validate the DBT model against:
# Brazilian L2_5 neighborhoods in Aim 2 cities, one column per WRI variable.
df_ref = wri_api |>
  # iisu_cat is categorical and cannot be averaged; drop it
  filter(var_name != 'iisu_cat') |>
  filter(geo == 'L2_5', iso2 == 'BR', salid1 %in% df_schema_aim2$city_geoid) |>
  rename(nbhd_geoid = observation_id) |>
  # fail loudly if any value is missing before aggregating
  verify(not_na(value)) |>
  group_by(nbhd_geoid, var_name) |>
  # values arrive as character; coerce before averaging duplicates
  summarize(value = mean(as.numeric(value))) |>
  ungroup() |>
  # long -> wide: one row per neighborhood, one column per variable
  pivot_wider(names_from = var_name, values_from = value)
Python
```{python}
import polars as pl
df_schema_aim2 = pl.read_parquet("//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/schema_ccuh_aim2_city_nbhd/schema_ccuh_aim2_city_nbhd.parquet")
wri_api = pl.read_parquet("//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/wri__data_api__v1.parquet")
(wri_api
.filter(pl.col("var_name") != "iisu_cat")
.filter((pl.col("geo") == "L2_5") &
(pl.col("iso2") == "BR") &
pl.col("salid1").is_in(df_schema_aim2["city_geoid"]))
.rename({"observation_id": "nbhd_geoid"})
.filter(pl.col("value").is_not_null())
.group_by(["nbhd_geoid", "var_name"])
.agg(pl.col("value").cast(pl.Float64).mean())
.pivot(index="nbhd_geoid",
columns="var_name",
values="value")
)
```
2. Validate DBT results
Compare with Reference
## Import
# Import the DBT-produced table, aligned to the reference: same columns
# (and column order) as df_ref, sorted by neighborhood id.
df_dbt = "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/harmonized__ccuh_aim_2__nbhd_exposures_wri__v1.parquet" |>
  open_dataset() |>
  collect() |>
  select(names(df_ref)) |>
  arrange(nbhd_geoid)
## Verify
# Compare DBT output against the reference. Numeric columns are rounded to
# 1 decimal first so immaterial floating-point differences don't fail the
# check; assert_that() errors if the two tables are not identical.
identical(
  df_dbt |>
    arrange(nbhd_geoid) |>
    mutate(across(where(is.numeric), ~round(., 1))),
  df_ref |>
    arrange(nbhd_geoid) |>
    mutate(across(where(is.numeric), ~round(., 1)))
) |>
  assertthat::assert_that()
[1] TRUE
Validations
Let’s do some basic validations.
- No missing values
- All expected neighborhoods
# Expected neighborhood ids: all BR neighborhoods in the two WRI cities.
wri_nbhds = df_schema_aim2 |>
  filter(iso2 == 'BR',
         city_name %in% c('Belo Horizonte','Campinas')) |>
  pull(nbhd_geoid)
# Basic validations (assertr): no missing values anywhere, and the set of
# neighborhoods matches exactly the expected WRI neighborhoods.
df_dbt = df_dbt |>
  assert(not_na, everything()) |>
  verify(setequal(unique(nbhd_geoid), unique(wri_nbhds)))
Visual Inspection
Let’s just do a visual inspection of four variables.
# Reshape four variables to long format and attach the city name for
# plotting. Explicit `by` avoids relying on the default join-key message.
df_long = df_dbt |>
  select(nbhd_geoid, iisu, greenNDVI, perc_green, perc_informal) |>
  pivot_longer(-nbhd_geoid) |>
  left_join(df_schema_aim2 |> select(nbhd_geoid, city_name),
            by = "nbhd_geoid")
# One density panel per variable, colored by city.
df_long |>
  ggplot(aes(x = value, fill = city_name)) +
  geom_density(alpha = 0.7) +
  facet_wrap(~name, scales = "free") +
  labs(
    title = "Distribution of WRI crossectional nbhd data by City",
    x = "Value",
    y = "Density",
    fill = "City"
  ) +
  theme_minimal()
Looks okay