# Setup: move to the notebook's folder (project-relative path via here) and
# load all packages used below in one call.
# NOTE(review): setwd() is generally discouraged in scripts; acceptable here
# because this is an interactive notebook and the path is project-relative.
setwd(here::here("ccuh-notebooks/_harmonization/ccuh_aim2/ccuh_aim2_models_v1"))
pacman::p_load(tidyverse, arrow, cli, reactable, readxl, glue, janitor, assertr, haven)
# Import the Aim 2 city/neighborhood schema (parquet on the shared drive).
# One row per neighborhood with its parent city id (city_geoid) — used below
# to restrict the WRI data to Aim 2 cities.
df_schema_aim2 = "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/schema_ccuh_aim2_city_nbhd/schema_ccuh_aim2_city_nbhd.parquet" |>
  arrow::open_dataset() |>
  dplyr::collect()
harmonized__ccuh_aim_2__nbhd_exposures_wri__v1
1. Develop the reference implementation

R
# Import the raw WRI API extract (long format: one row per
# observation_id x var_name x value) from the data warehouse freeze.
wri_api = "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/wri__data_api__v1.parquet" |>
  open_dataset() |>
  collect()
# Build the reference table to validate the DBT model against:
# Brazilian L2_5 neighborhoods in Aim 2 cities, one column per WRI variable.
df_ref = wri_api |>
  # iisu_cat is categorical and cannot be averaged; drop it
  filter(var_name != 'iisu_cat') |>
  filter(geo == 'L2_5', iso2 == 'BR', salid1 %in% df_schema_aim2$city_geoid) |>
  rename(nbhd_geoid = observation_id) |>
  # fail loudly if any value is missing before aggregating
  verify(not_na(value)) |>
  group_by(nbhd_geoid, var_name) |>
  # values arrive as character; coerce before averaging duplicates
  summarize(value = mean(as.numeric(value))) |>
  ungroup() |>
  # long -> wide: one row per neighborhood, one column per variable
  pivot_wider(names_from = var_name, values_from = value)
Python
```{python}
import polars as pl
df_schema_aim2 = pl.read_parquet("//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/schema_ccuh_aim2_city_nbhd/schema_ccuh_aim2_city_nbhd.parquet")
wri_api = pl.read_parquet("//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/wri__data_api__v1.parquet")
(wri_api
.filter(pl.col("var_name") != "iisu_cat")
.filter((pl.col("geo") == "L2_5") &
(pl.col("iso2") == "BR") &
pl.col("salid1").is_in(df_schema_aim2["city_geoid"]))
.rename({"observation_id": "nbhd_geoid"})
.filter(pl.col("value").is_not_null())
.group_by(["nbhd_geoid", "var_name"])
.agg(pl.col("value").cast(pl.Float64).mean())
.pivot(index="nbhd_geoid",
columns="var_name",
values="value")
)
```
2. Validate DBT results
Compare with Reference
## Import
# Import the DBT-produced table, aligned to the reference: same columns
# (and column order) as df_ref, sorted by neighborhood id.
df_dbt = "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/_datawarehouse/v_0_1_0/harmonized__ccuh_aim_2__nbhd_exposures_wri__v1.parquet" |>
  open_dataset() |>
  collect() |>
  select(names(df_ref)) |>
  arrange(nbhd_geoid)
## Verify
# Compare DBT output against the reference. Numeric columns are rounded to
# 1 decimal first so immaterial floating-point differences don't fail the
# check; assert_that() errors if the two tables are not identical.
identical(
  df_dbt |>
    arrange(nbhd_geoid) |>
    mutate(across(where(is.numeric), ~round(., 1))),
  df_ref |>
    arrange(nbhd_geoid) |>
    mutate(across(where(is.numeric), ~round(., 1)))
) |>
  assertthat::assert_that()
[1] TRUE
Validations
Let’s do some basic validations.
- No missing values
- All expected neighborhoods
# Expected neighborhood ids: all BR neighborhoods in the two WRI cities.
wri_nbhds = df_schema_aim2 |>
  filter(iso2 == 'BR',
         city_name %in% c('Belo Horizonte','Campinas')) |>
  pull(nbhd_geoid)
# Basic validations (assertr): no missing values anywhere, and the set of
# neighborhoods matches exactly the expected WRI neighborhoods.
df_dbt = df_dbt |>
  assert(not_na, everything()) |>
  verify(setequal(unique(nbhd_geoid), unique(wri_nbhds)))
Visual Inspection
Let’s just do a visual inspection of four variables.
# Reshape four variables to long format and attach the city name for
# plotting. Explicit `by` avoids relying on the default join-key message.
df_long = df_dbt |>
  select(nbhd_geoid, iisu, greenNDVI, perc_green, perc_informal) |>
  pivot_longer(-nbhd_geoid) |>
  left_join(df_schema_aim2 |> select(nbhd_geoid, city_name),
            by = "nbhd_geoid")
# One density panel per variable, colored by city.
df_long |>
  ggplot(aes(x = value, fill = city_name)) +
  geom_density(alpha = 0.7) +
  facet_wrap(~name, scales = "free") +
  labs(
    title = "Distribution of WRI crossectional nbhd data by City",
    x = "Value",
    y = "Density",
    fill = "City"
  ) +
  theme_minimal()
Looks okay