Pull tract10 data from ACS for CCUH
Author

Ran Li (Maintainer)

Published

April 10, 2025

Setup
## Dependencies
pacman::p_load(tidyverse, assertr, arrow, duckdb, reactable, glue, tidycensus, sf)

## Local Context
context = lst(
  server = '//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/acs_v1',
  server_cache = file.path(server, 'cache'),
  server_cache_df_var = file.path(server_cache, 'df_var.parquet'),
  model = 'acs__v1',
  final = file.path(server, glue("{model}.parquet"))
)

## Parameters
params = lst(
  geo = c("tract", 'zcta', "county subdivision",'county'),
  year = 2012:2020,
  state = c("PA")
)

{ # Global objects ----------------------------------------------------------
  
  ## List of vairables in ACS tables
  if (!file.exists(context$server_cache_df_var)){
    
    df_vars_acs = 2009:2020 %>% 
      map_df(~{
        tidycensus::load_variables(year = .x, dataset = 'acs5') %>% 
          mutate(year = .x)
      })
    
    df_vars_acs %>% arrow::write_parquet(context$server_cache_df_var)
    
  } else {
    df_vars_acs = context$server_cache_df_var %>% 
      open_dataset() %>% 
      collect()
  }

}

1. Intro

Please refer to this documentation log for ACS data for CCUH. Note that in this first version we will just pull a subset of requested variables to support the first three papers focusing on FAIR best practices and good docuemntation. We have a list of tables identified which we will pull data from at the tract10 level and operationalize any requested summaries.

Tables

Starting with a few example ACS datasets (UHC nbhd dataset, MS2 data pull, Shreya pull) we have identified a ACS tables from which the data comes from - these are documented here.

Lets subset hte ACS variabel list to those in these tables.

Table of ACS variables of interest
tables_of_interest = acs_tables <- c(
  # MS2 Tables
  "B01003", # Total population
  "B25001", # Housing units
  "B25002", # Occupancy status
  "B25003", # Tenure
  "B25014", # Occupants per room
  "B25034", # Year structure built
  "B25047", # Plumbing facilities
  "B25051", # Kitchen facilities
  "B25070", # Gross rent as percentage of income
  
  # Shreya Tables
  "B01001", # Sex by age
  "B03002", # Race/ethnicity
  "B15003", # Educational attainment
  "B17001", # Poverty status
  "B19013", # Median household income
  "B23025", # Employment status
  "B25035", # Median year structure built
  "B25044", # Tenure by vehicles available
  "B25077", # Median value
  "B27001", # Health insurance coverage
  
  # Additional UHC Tables
  "B15002", # Sex by educational attainment
  "B23001", # Sex by employment status
  "B25010"  # Average household size by tenure
) %>% 
  paste0(.,"_") %>% 
  paste(collapse = '|')

df_tables = df_vars_acs %>% 
  filter(str_detect(name, tables_of_interest)) %>%
  arrange(concept) %>% 
  select(-year, -geography) %>% 
  distinct()

reactable(df_tables, searchable = T)

2. Components

MS2 Data Pull

These variables were operatioanlized ad hoc for Tara’s CCUH MS2, we just reproduce and integrate into the CCUH ACS dataset here. Before we start we need to pull in ALAND which is needed to calculate population density. This is a variable we can calculate based on total population and ALAND from the TIGER files. Let’s pull in the ALAND data first - note we have operatainolized this in the CCUH Tiger Boundaries API.

## Connect
api_tiger = "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/tiger10_boundaries__v1.1/tiger_us_boundaries_v1.parquet" %>% 
  open_dataset()

## query
df_ALAND = api_tiger %>% 
  filter(!cb) %>% 
  filter(vintage%in% c("tract10", 'zcta10', "cousub10", 'county10')) %>% 
  select(geoid, vintage, ALAND) %>% 
  distinct() %>% 
  collect()
 
## Preview
df_ALAND %>% 
  head() %>% 
  reactable()

Looks good now we can use this along with ACS population to calculate pop density. Now lets do our ACS Pull and cleaning.

MS2 - nbhd variables function
get_ms2_acs_data <- function(geography, year) {
  
  ## Pull ACS Data
  #### geography = 'county subdivision'; year = 2017
  #### geography = 'county subdivision'; year = 2012
  #### geography = 'tract'; year = 2020
  
  vars_tmp = c(
      
      ## OCCUPANCY STATUS  
      "B25002_001",   ## Total
      "B25002_002",   ## Occupied
      
      ## YEAR STRUCTURE BUILT
      "B25034_001",   # Total
      "B25034_007",   # pre-2015: 1960 to 1969; 2015+: 1970 to 1979
      "B25034_008",   # pre-2015: 1950 to 1959; 2015+: 1960 to 1969
      "B25034_009",   # pre-2015: 1940 to 1949; 2015+: 1950 to 1959
      "B25034_010",   # pre-2015: 1939 or earlier; 2015+: 1940 to 1949
      "B25034_011",    # pre-2015: NA; 2015+: 1939 or earlier
      
      ## TENURE (RENTER)
      "B25003_001",   ## Total!!Renter occupied
      "B25003_003",   ## Renter occupied
      
      ## PLUMBING FACILITIES FOR ALL HOUSING UNITS
      "B25047_001",   ## Total
      "B25047_003",   ## Lacking complete plumbing facilities
      
      ## KITCHEN FACILITIES FOR ALL HOUSING UNITS
      "B25051_001",   ## Total
      "B25051_003",   ## Lacking complete kitchen facilities
      
      ## TENURE BY OCCUPANTS PER ROOM 
      "B25014_001",   ## Total
      "B25014_006",   ## TENURE BY OCCUPANTS PER ROOM (1.51 to 2.00 or more occupants per room)
      "B25014_007",   ## TENURE BY OCCUPANTS PER ROOM (2.01 or more occupants per room)
      
      ## GROSS RENT AS A PERCENTAGE OF HOUSEHOLD INCOME IN THE PAST 12 MONTHS
      "B25070_001",   ## Total
      "B25070_007",   ## (30.0-34.9% or more)
      "B25070_008",   ## (34.9-39.9% or more)
      "B25070_009",   ## (40.0 to 49.9 % or more)
      "B25070_010",   ## (50% or more)
      
      ## TOTAL POPULATION (general)
      "B01003_001"  ## Total 
    ) 
  
  if (year<2015) vars_tmp = vars_tmp %>% purrr::discard(~.x=='B25034_011')
  
  df_pull = get_acs(
    geography = geography,
    state = "PA", 
    year = year,
    survey = "acs5",
    variables = vars_tmp,
    output = "wide"
  ) %>% 
    ## Standardize ZCTA to ZCTA5
    mutate(
      geo = geography,
      GEOID = if_else(geo == 'zcta', str_sub(GEOID, -5), GEOID)) %>% 
    rename(geoid = GEOID) %>% 
    ## Merge in ALAND
    left_join(df_ALAND) # %>%  assert(not_na, c('ALAND', 'B01003_001E')) Relax as some are missing.
  
  { # QC  -----------------------------------------------------------
    
    ## If pct_missing_ALAND units is significant (> 5%) then quarentine all ALAND downstream variables (tract10-2020)
    n_rows = nrow(df_pull)
    n_rows_missing_ALAND = df_pull %>% filter(is.na(ALAND)) %>% nrow()
    pct_missing_ALAND = n_rows_missing_ALAND/n_rows * 100
    if (pct_missing_ALAND > 5){
      df_pull = df_pull %>% mutate(ALAND = NA)
      cli::cli_alert_danger("Quarentine ALAND variables due to systematic ALAND missingness")
    }
    
  }
  
  ## Summary Metrics
  df_wide = df_pull %>%
    mutate(
      year = year,
      pct_occupied = B25002_002E / B25002_001E * 100,
      pct_pre_1970 = ifelse(
        year < 2015,
        (B25034_007E + B25034_008E + B25034_009E + B25034_010E) / B25034_001E,
        (B25034_008E + B25034_009E + B25034_010E + B25034_011E) / B25034_001E
      ),
      pct_renter = B25003_003E / B25003_001E * 100,
      pct_lacking_plumbing = B25047_003E / B25047_001E * 100,
      pct_lacking_kitchen = B25051_003E / B25051_001E * 100,
      pct_crowded_1_5more = (B25014_006E + B25014_007E) / B25014_001E * 100,
      pct_crowded_2more = B25014_007E / B25014_001E * 100,
      pct_high_grapi_30more = (B25070_007E + B25070_008E + B25070_009E + B25070_010E) / B25070_001E * 100,
      pct_high_grapi_50more = B25070_010E / B25070_001E * 100,
      area_sq_km = ALAND / 1e6,  # Convert square meters to square kilometers
      population_density_per_sq_km = B01003_001E / area_sq_km
    ) %>%
    select(geoid , year, geo, total_population = B01003_001E, starts_with("pct_"), 
           area_sq_km, population_density_per_sq_km)
  
  
  return(df_wide)
}

get_ms2_acs_data("county subdivision", 2012)

Note we quarent pct_pre_1970 to 2015+ due to different code.

Here is the metadata.

Code
df_ms2_metadata <- tribble(
  ~column_name, ~description, ~data_type, ~source,
  "geoid", "Unique identifier for the geography", "character", "Census GEOID",
  "year", "Year of the ACS data", "integer", "ACS 5-year estimates",
  "geo", "Geography type (tract, county subdivision, etc.)", "character", "Function input parameter",
  "total_population", "Total population estimate", "double", "ACS variable B01003_001",
  "pct_occupied", "Percentage of occupied housing units", "double", "B25002_002/B25002_001 * 100",
  "pct_pre_1970", "Percentage of housing units built before 1970 (Note: Different calculation pre/post 2015)", "double", "Custom calculation based on year, using B25034 series",
  "pct_renter", "Percentage of renter-occupied housing units", "double", "B25003_003/B25003_001 * 100",
  "pct_lacking_plumbing", "Percentage lacking complete plumbing facilities", "double", "B25047_003/B25047_001 * 100",
  "pct_lacking_kitchen", "Percentage lacking complete kitchen facilities", "double", "B25051_003/B25051_001 * 100",
  "pct_crowded_1_5more", "Percentage with 1.51 or more occupants per room", "double", "(B25014_006 + B25014_007)/B25014_001 * 100",
  "pct_crowded_2more", "Percentage with 2.01 or more occupants per room", "double", "B25014_007/B25014_001 * 100",
  "pct_high_grapi_30more", "Percentage paying 30% or more of income on rent", "double", "(B25070_007 + B25070_008 + B25070_009 + B25070_010)/B25070_001 * 100",
  "pct_high_grapi_50more", "Percentage paying 50% or more of income on rent", "double", "B25070_010/B25070_001 * 100",
  "area_sq_km", "Land area in square kilometers", "double", "ALAND/1e6",
  "population_density_per_sq_km", "Population per square kilometer", "double", "total_population/area_sq_km"
)

df_ms2_metadata %>% 
  reactable()

Shreya’s Data Pull

These were contribtued by Shreya see this GitHub issue.

Shreya ACS code
get_shreya_acs_data <- function(geography, year) {
  # Get ACS data
  df_pull <- get_acs(
    geography = geography,
    state = "PA",
    year = year,
    survey = "acs5",
    variables = c(
       # Demographics (B01001)
      total_age = "B01001_001",
      # Males by age group
      total_males = "B01001_002",
      males_under_5 = "B01001_003",
      males_5_9 = "B01001_004",
      males_10_14 = "B01001_005",
      males_15_17 = "B01001_006",
      males_18_19 = "B01001_007",
      males_20 = "B01001_008",
      males_21 = "B01001_009",
      males_22_24 = "B01001_010",
      males_25_29 = "B01001_011",
      males_30_34 = "B01001_012",
      males_35_39 = "B01001_013",
      males_40_44 = "B01001_014",
      males_45_49 = "B01001_015",
      males_50_54 = "B01001_016",
      males_55_59 = "B01001_017",
      males_60_61 = "B01001_018",
      males_62_64 = "B01001_019",
      males_65_66 = "B01001_020",
      males_67_69 = "B01001_021", 
      males_70_74 = "B01001_022",
      males_75_79 = "B01001_023",
      males_80_84 = "B01001_024",
      males_85_plus = "B01001_025",
      
      # Females by age group
      total_females = "B01001_026",
      females_under_5 = "B01001_027",
      females_5_9 = "B01001_028",
      females_10_14 = "B01001_029",
      females_15_17 = "B01001_030",
      females_18_19 = "B01001_031",
      females_20 = "B01001_032",
      females_21 = "B01001_033",
      females_22_24 = "B01001_034",
      females_25_29 = "B01001_035",
      females_30_34 = "B01001_036",
      females_35_39 = "B01001_037",
      females_40_44 = "B01001_038",
      females_45_49 = "B01001_039",
      females_50_54 = "B01001_040",
      females_55_59 = "B01001_041",
      females_60_61 = "B01001_042",
      females_62_64 = "B01001_043",
      females_65_66 = "B01001_044",
      females_67_69 = "B01001_045",
      females_70_74 = "B01001_046",
      females_75_79 = "B01001_047",
      females_80_84 = "B01001_048",
      females_85_plus = "B01001_049",
      
      # Education (B15003)
      total_education = "B15003_001",
      no_schooling = "B15003_002",
      nursery_school = "B15003_003",
      kindergarten = "B15003_004",
      grade_1 = "B15003_005",
      grade_2 = "B15003_006",
      grade_3 = "B15003_007",
      grade_4 = "B15003_008",
      grade_5 = "B15003_009",
      grade_6 = "B15003_010",
      grade_7 = "B15003_011",
      grade_8 = "B15003_012",
      grade_9 = "B15003_013",
      grade_10 = "B15003_014",
      grade_11 = "B15003_015",
      grade_12_no_diploma = "B15003_016",
      high_school_graduate = "B15003_017",
      ged = "B15003_018",
      some_college_less_1yr = "B15003_019",
      some_college_1yr_plus = "B15003_020",
      associates_degree = "B15003_021",
      college_edu = "B15003_022",
      masters_degree = "B15003_023",
      professional_school_degree = "B15003_024",
      doctorate_degree = "B15003_025",
      
      
      # Employment (B23025)
      labor_force = "B23025_002",
      unemployed = "B23025_005",
      
      # Vehicle Access (B25044)
      total_vehicle = "B25044_001",
      no_vehicle = "B25044_003",
      total_vehicle_renters = "B25044_009",
      no_vehicle_renters = "B25044_010",
      
      # Health Insurance (B27001)
      total_pop_insurance = "B27001_001",
      no_insurance_male = "B27001_005",
      no_insurance_female = "B27001_030",
      
      # Housing
      median_year_built = "B25035_001",
      median_housing_value = "B25077_001",
      
      # Overcrowding (B25014)
      total_occupied = "B25014_002",
      overcrowded = "B25014_007",
      total_occupied_renters = "B25014_008",
      overcrowded_renters = "B25014_013",
      
      # Race/Ethnicity (B03002)
      total_pop_ethnicity = "B03002_001",
      nh_white = "B03002_003",
      nh_black = "B03002_004",
      nh_asian = "B03002_006",
      hispanic = "B03002_012",
      
      # Income and Poverty (B19013, B17001)
      median_income = "B19013_001",
      poverty_universe = "B17001_001",
      below_poverty = "B17001_002"
    ),
    output = "wide"
  )
  
  # Calculate derived variables
  df_wide <- df_pull %>%
    mutate(
      geo = geography,
      year = year,
      
      ## Consolidated 5-year age groups
      pop_5yr_age_cat_0_4 = males_under_5E + males_under_5E,
      pop_5yr_age_cat_5_9 = males_5_9E + females_5_9E,
      pop_5yr_age_cat_10_14 = males_10_14E + females_10_14E,
      pop_5yr_age_cat_15_19 = males_15_17E + males_18_19E + females_15_17E + females_18_19E,
      pop_5yr_age_cat_20_24 = males_20E + males_21E + males_22_24E + females_20E + females_21E + females_22_24E,
      pop_5yr_age_cat_25_29 = males_25_29E + females_25_29E,
      pop_5yr_age_cat_30_34 = males_30_34E + females_30_34E,
      pop_5yr_age_cat_35_39 = males_35_39E + females_35_39E,
      pop_5yr_age_cat_40_44 = males_40_44E + females_40_44E,
      pop_5yr_age_cat_45_49 = males_45_49E + females_45_49E,
      pop_5yr_age_cat_50_54 = males_50_54E + females_50_54E,
      pop_5yr_age_cat_55_59 = males_55_59E + females_55_59E,
      pop_5yr_age_cat_60_64 = males_60_61E + males_62_64E + females_60_61E + females_62_64E,
      pop_5yr_age_cat_65_69 = males_65_66E + males_67_69E + females_65_66E + females_67_69E,
      pop_5yr_age_cat_70_74 = males_70_74E + females_70_74E,
      pop_5yr_age_cat_75_79 = males_75_79E + females_75_79E,
      pop_5yr_age_cat_80_84 = males_80_84E + females_80_84E,
      pop_5yr_age_cat_85_plus = males_85_plusE + females_85_plusE,
      
      
      ## Age Age (65) calculations
      total_male_65_plus = males_65_66E + males_67_69E + males_70_74E + males_75_79E + males_80_84E + males_85_plusE,
      total_female_65_plus = females_65_66E + females_67_69E + females_70_74E + females_75_79E + females_80_84E + females_85_plusE, 
      total_65_plus = total_male_65_plus + total_female_65_plus,
      pct_65_plus = 100 * total_65_plus / total_ageE,
      pct_0_64 = 100 * (total_ageE - total_65_plus) / total_ageE,
      
      # Education percentages
        ACS_MINPR = 100 * (total_educationE - no_schoolingE - nursery_schoolE + kindergartenE -
        grade_1E - grade_2E - grade_3E - grade_4E - grade_5E) / total_educationE,
      pct_hs_or_less = 100 * (no_schoolingE + nursery_schoolE + kindergartenE +
        grade_1E + grade_2E + grade_3E + grade_4E + grade_5E +
        grade_6E + grade_7E + grade_8E + grade_9E + grade_10E +
        grade_11E + grade_12_no_diplomaE) / total_educationE,
      pct_hs_grad = 100 * (high_school_graduateE + gedE) / total_educationE,
      pct_bachelors = 100 * college_eduE / total_educationE,
      pct_some_college = 100 * (some_college_less_1yrE + some_college_1yr_plusE + associates_degreeE) / total_educationE, 
      pct_bachelors_or_greater = 100 * (college_eduE + masters_degreeE + professional_school_degreeE +
        doctorate_degreeE) / total_educationE,
      
      # Housing and overcrowding
      pct_overcrowding_owners = 100 * overcrowdedE / total_occupiedE,
      pct_overcrowding_renters = 100 * overcrowded_rentersE / total_occupied_rentersE,
      
      # Employment
      pct_unemployed = 100 * unemployedE / labor_forceE,
      
      # Health insurance
      pct_no_health_insurance = 100 * (no_insurance_maleE + no_insurance_femaleE) / total_pop_insuranceE,
      
      # Vehicle access
      pct_vehicle_access = 100 * (1 - (no_vehicleE + no_vehicle_rentersE) /
        (total_vehicleE + total_vehicle_rentersE)),
      
      # Race/ethnicity percentages
      pct_nh_black = 100 * nh_blackE / total_pop_ethnicityE,
      pct_nh_white = 100 * nh_whiteE / total_pop_ethnicityE,
      pct_nh_asian = 100 * nh_asianE / total_pop_ethnicityE,
      pct_hispanic = 100 * hispanicE / total_pop_ethnicityE,
      
      # ICE calculations
      ice_race_black = (nh_whiteE - nh_blackE) / total_pop_ethnicityE,
      ice_race_hispanic = (nh_whiteE - hispanicE) / total_pop_ethnicityE,
      ice_race_asian = (nh_whiteE - nh_asianE) / total_pop_ethnicityE,
      
      # Poverty
      pct_below_poverty = 100 * below_povertyE / poverty_universeE,
      
      # Clean median year built
      median_year_built = ifelse(median_year_builtE == 0, NA, median_year_builtE)
    ) %>%
    select(
      geoid = GEOID,
      year,
      geo,
      median_income = median_incomeE,
      median_housing_value = median_housing_valueE,
      median_year_built,
      total_male_65_plus,
      total_female_65_plus,
      total_age_sex = total_ageE,
      starts_with("pct_"),
      starts_with("pop_5yr"),
      ACS_MINPR,
      starts_with("ice_")
    )
  
  return(df_wide)
}

dfa =  get_shreya_acs_data('county', 2018)

Here is the metadata

Code
df_shreya_metadata <- tribble(
  ~column_name, ~description, ~data_type, ~source,
  "geoid", "Census geographic identifier", "character", "Census GEOID",
  "year", "Year of ACS data", "integer", "ACS 5-year estimates",
  "geo", "Geographic level (tract/zcta)", "character", "Geographic specification",
  "median_income", "Median household income in past 12 months", "double", "ACS B19013_001",
  "median_housing_value", "Median value of owner-occupied housing units", "double", "ACS B25077_001",
  "median_year_built", "Median year structure built", "double", "ACS B25035_001",
  "pct_65_plus", "Percent population 65 years and over", "double", "Calculated from sum of B01001 age groups 65+",
  "pct_0_64", "Percent population under 65 years", "double", "Calculated as 100 - pct_65_plus",
  "pct_hs_or_less", "Percent with high school or less education", "double", "Sum of B15003_002 through B15003_016 divided by total",
  "ACS_MINPR", "Percentage of the population aged 25 or older who completed primary education or above. In general, completed primary is considered to be attained at 6 years of education.","double", "total minus all below 6th grade categories divided by total",
  "pct_hs_grad", "Percent high school graduates (including GED)", "double", "B15003_017 + B15003_018 divided by total",
  "pct_some_college", "Percent with some college credit or associate's degree, no bachelor's", "double", "B15003_019 + B15003_020 + B15003_021 divided by total",
  "pct_bachelors", "Percent with bachelor's degree", "double", "B15003_022 divided by total",
  "pct_bachelors_or_greater", "Percent with bachelor's degree or higher", "double", "B15003_022 + B15003_023 + B15003_024 + B15003_025 divided by total",
  "pct_overcrowding_owners", "Percent owner units with >1.51 occupants per room", "double", "B25014_007 divided by B25014_002",
  "pct_overcrowding_renters", "Percent renter units with >1.51 occupants per room", "double", "B25014_013 divided by B25014_008",
  "pct_unemployed", "Unemployment rate", "double", "B23025_005 divided by B23025_002",
  "pct_no_health_insurance", "Percent without health insurance", "double", "(B27001_005 + B27001_030) divided by B27001_001",
  "pct_vehicle_access", "Percent households with vehicle access", "double", "1 - (B25044_003 + B25044_010)/(B25044_001 + B25044_009)",
  "pct_nh_black", "Percent non-Hispanic Black alone", "double", "B03002_004 divided by B03002_001",
  "pct_nh_white", "Percent non-Hispanic White alone", "double", "B03002_003 divided by B03002_001",
  "pct_nh_asian", "Percent non-Hispanic Asian alone", "double", "B03002_006 divided by B03002_001",
  "pct_hispanic", "Percent Hispanic or Latino (any race)", "double", "B03002_012 divided by B03002_001",
  "pct_below_poverty", "Percent below poverty level", "double", "B17001_002 divided by B17001_001",
  "ice_race_black", "Index of Concentration at Extremes (ICE) White vs Black", "double", "(White - Black)/Total population",
  "ice_race_hispanic", "ICE White vs Hispanic", "double", "(White - Hispanic)/Total population",
  "ice_race_asian", "ICE White vs Asian", "double", "(White - Asian)/Total population"
)

df_shreya_metadata %>% 
  reactable()

UHC RDC variables

Here is the code for non-stratiatifed UHC variables that we haven’t added so far.

Remaining UHC ACS variables code
get_uhc_acs_data <- function(geography, year) {
  
  df_pull = get_acs(
    geography = geography,
    state = "PA",
    year = year,
    survey = "acs5",
    variables = c(
      # Household size (B25010)
      hh_size_owner = "B25010_002",
      hh_size_renter = "B25010_003",
      
      # Crowding >0.5ppr (B25014)
      total_occupied = "B25014_001",
      crowd_half_1_owner = "B25014_005", # 0.5 to 1.0 persons per room
      crowd_half_1_renter = "B25014_011",
      
      # Labor force (B23001)
      pop_16_over = "B23001_001",
      not_in_labor_force = "B23001_004",
      
      # Some college education (B15002)
      pop_25_over = "B15002_001",
      some_college = "B15002_014"
    ),
    output = "wide"
  )
  
  df_wide = df_pull %>%
    mutate(
      geo = geography,
      year = year,
      
      # Average household sizes
      avg_hh_size_owner = hh_size_ownerE,
      avg_hh_size_renter = hh_size_renterE,
      
      # >0.5 persons per room crowding
      pct_crowd_half = (crowd_half_1_ownerE + crowd_half_1_renterE) / 
                      total_occupiedE * 100,
      
      # Labor force
      pct_not_in_labor_force = not_in_labor_forceE / pop_16_overE * 100,
      
    ) %>%
    select(
      geoid = GEOID,
      year,
      geo,
      starts_with("avg_"),
      starts_with("pct_")
    )
  
  return(df_wide)
 }


get_uhc_acs_data('county', 2018)

here is the metadta

Code
df_uhc_metadata <- tribble(
  ~column_name, ~description, ~data_type, ~source,
  "geoid", "Census geographic identifier", "character", "Census GEOID",
  "year", "Year of ACS data", "integer", "ACS 5-year estimates",
  "geo", "Geographic level", "character", "Geographic specification",
  "avg_hh_size_owner", "Average household size in owner-occupied units", "double", "B25010_002",
  "avg_hh_size_renter", "Average household size in renter-occupied units", "double", "B25010_003",
  "pct_crowd_half", "Percentage of occupied housing units with >0.5 persons per room", "double", "B25014_005 + B25014_011/B25014_001",
  "pct_not_in_labor_force", "Percentage not in labor force among population 16+ years", "double", "B23001_004/B23001_001"
)

df_uhc_metadata %>% 
  reactable()

3. Data

Harmonize

Here we pull ACS data and operationalize the requested neighborhood level variables for CCUH. Lets test the harmonization of a single year and geography (zcta)

```{r}

## Component data
zcta_ms2_data <- get_ms2_acs_data("zcta", 2019)
zcta_shreya_data <- get_shreya_acs_data("zcta", 2019)
zcta_uhc_data <- get_uhc_acs_data("zcta", 2019)


## Harmonize
df_harmonize_tmp = zcta_ms2_data %>% 
  left_join(zcta_shreya_data) %>% 
  left_join(zcta_uhc_data)
```

Process

Now lets scale this and run the computations across our parameter space. Here is our parameter space.

params
$geo
[1] "tract"              "zcta"               "county subdivision"
[4] "county"            

$year
[1] 2012 2013 2014 2015 2016 2017 2018 2019 2020

$state
[1] "PA"

Now lets turn this into combiantions.

df_params = tibble(
  geo = list(params$geo),
  year = params$year
) %>% 
  unnest(cols = c(geo))

df_params %>% reactable()

So we have 36 combinations of year and geographic level. Lets run these and cache results.

df_params %>%
  ## Error for this combiations - to debug later
  filter(!(geo == "zcta" & year == 2020)) %>%  
  group_by(row_number()) %>% 
  group_walk(~{
    
    ## Set up
    ##  geo_tmp = 'county'; year_tmp = 2015
    geo_tmp = .x$geo
    year_tmp = .x$year
    file_out_tmp = glue("{geo_tmp}_{year_tmp}_results.parquet")
    out_tmp = file.path("cache/_processed/",file_out_tmp)
    cli::cli_alert("Start processing: {geo_tmp} - {year_tmp}")
    if (file.exists(out_tmp)) {
      cli::cli_alert_info("Already processed! Skipping!!")
      return()
    }
    
    ## Component data
    ms2_data <- get_ms2_acs_data(geo_tmp, year_tmp)
    shreya_data <- get_shreya_acs_data(geo_tmp, year_tmp)
    uhc_data <- get_uhc_acs_data(geo_tmp, year_tmp)
    
    ## Harmonize
    df_harmonize_tmp = ms2_data %>%
      left_join(shreya_data) %>%
      left_join(uhc_data)
    
    ## Cache
    df_harmonize_tmp %>% arrow::write_parquet(out_tmp)
    cli::cli_alert_success("Done processing; results cached!")
    
  })

Lets compile

Compile

Lets connect to the results.

Connect to results.
df_data = "cache/_processed/" %>% 
  open_dataset() %>% 
  collect() 
glimpse(df_data)
Rows: 68,895
Columns: 64
$ geoid                        <chr> "4206564376", "4206315560", "4211747080",…
$ year                         <int> 2012, 2012, 2012, 2012, 2012, 2012, 2012,…
$ geo                          <chr> "county subdivision", "county subdivision…
$ total_population             <dbl> 2754, 2294, 3599, 722, 437, 834, 435, 123…
$ pct_occupied                 <dbl> 87.17156, 94.94737, 88.15789, 97.45223, 7…
$ pct_pre_1970                 <dbl> 0.7318393, 0.5178947, 0.5567434, 0.337579…
$ pct_renter                   <dbl> 35.460993, 16.075388, 50.932836, 40.19607…
$ pct_lacking_plumbing         <dbl> 6.0278207, 1.4736842, 0.0000000, 0.000000…
$ pct_lacking_kitchen          <dbl> 8.1143740, 1.4736842, 1.2335526, 0.000000…
$ pct_crowded_1_5more          <dbl> 0.0000000, 0.7760532, 0.0000000, 0.000000…
$ pct_crowded_2more            <dbl> 0.0000000, 0.5543237, 0.0000000, 0.000000…
$ pct_high_grapi_30more        <dbl> 36.25000, 40.68966, 66.66667, 65.04065, 1…
$ pct_high_grapi_50more        <dbl> 25.750000, 11.724138, 45.787546, 33.33333…
$ area_sq_km                   <dbl> 3.793760, 87.033848, 4.956900, 1.509299, …
$ population_density_per_sq_km <dbl> 725.928894, 26.357561, 726.058625, 478.36…
$ median_income                <dbl> 38298, 40667, 27500, 32125, 52500, 38977,…
$ median_housing_value         <dbl> 65100, 104100, 135000, 105200, 125900, 86…
$ median_year_built            <dbl> 1939, 1968, 1963, 1978, 1972, 1961, 1975,…
$ total_male_65_plus           <dbl> 182, 219, 199, 45, 53, 80, 85, 86, 93, 99…
$ total_female_65_plus         <dbl> 256, 238, 182, 52, 36, 97, 62, 156, 83, 1…
$ total_age_sex                <dbl> 2754, 2294, 3599, 722, 437, 834, 435, 123…
$ pct_65_plus                  <dbl> 15.90414, 19.92153, 10.58627, 13.43490, 2…
$ pct_0_64                     <dbl> 84.09586, 80.07847, 89.41373, 86.56510, 7…
$ pct_hs_or_less               <dbl> 11.833333, 10.903614, 8.537491, 10.557769…
$ pct_hs_grad                  <dbl> 56.05556, 53.97590, 26.28062, 46.41434, 5…
$ pct_bachelors                <dbl> 7.777778, 9.759036, 19.896065, 13.346614,…
$ pct_some_college             <dbl> 21.05556, 21.02410, 26.20638, 26.09562, 1…
$ pct_bachelors_or_greater     <dbl> 11.055556, 14.096386, 38.975501, 16.93227…
$ pct_overcrowding_owners      <dbl> 0.000000, 0.660502, 0.000000, 0.000000, 0…
$ pct_overcrowding_renters     <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0…
$ pct_unemployed               <dbl> 7.545671, 9.728718, 20.572057, 9.939759, …
$ pct_no_health_insurance      <dbl> 48.80174, 45.98954, 56.04335, 53.46260, 4…
$ pct_vehicle_access           <dbl> 89.65969, 98.47182, 91.59456, 94.17249, 9…
$ pct_nh_black                 <dbl> 0.1452433, 0.0000000, 6.1961656, 0.277008…
$ pct_nh_white                 <dbl> 98.03922, 95.33566, 87.94109, 98.33795, 9…
$ pct_nh_asian                 <dbl> 0.0000000, 0.0000000, 1.7227008, 0.000000…
$ pct_hispanic                 <dbl> 1.3071895, 4.6643418, 3.1397610, 0.415512…
$ pct_below_poverty            <dbl> 26.511289, 13.199301, 30.609598, 28.75000…
$ pop_5yr_age_cat_0_4          <dbl> 230, 122, 44, 28, 28, 12, 26, 70, 84, 64,…
$ pop_5yr_age_cat_5_9          <dbl> 151, 175, 107, 63, 22, 74, 18, 65, 57, 63…
$ pop_5yr_age_cat_10_14        <dbl> 167, 111, 104, 41, 20, 51, 4, 68, 108, 44…
$ pop_5yr_age_cat_15_19        <dbl> 244, 139, 1027, 41, 34, 30, 33, 95, 83, 5…
$ pop_5yr_age_cat_20_24        <dbl> 170, 127, 916, 40, 19, 48, 0, 80, 46, 114…
$ pop_5yr_age_cat_25_29        <dbl> 146, 82, 226, 79, 13, 20, 8, 75, 31, 32, …
$ pop_5yr_age_cat_30_34        <dbl> 179, 79, 106, 66, 22, 33, 27, 62, 61, 69,…
$ pop_5yr_age_cat_35_39        <dbl> 166, 102, 95, 40, 22, 73, 10, 57, 85, 55,…
$ pop_5yr_age_cat_40_44        <dbl> 211, 146, 69, 49, 27, 52, 39, 79, 84, 119…
$ pop_5yr_age_cat_45_49        <dbl> 64, 191, 91, 31, 32, 89, 44, 105, 127, 14…
$ pop_5yr_age_cat_50_54        <dbl> 201, 236, 179, 37, 52, 35, 34, 76, 98, 98…
$ pop_5yr_age_cat_55_59        <dbl> 211, 234, 147, 29, 36, 64, 32, 98, 106, 1…
$ pop_5yr_age_cat_60_64        <dbl> 184, 133, 53, 74, 30, 62, 24, 56, 91, 35,…
$ pop_5yr_age_cat_65_69        <dbl> 162, 136, 76, 40, 41, 22, 61, 49, 45, 71,…
$ pop_5yr_age_cat_70_74        <dbl> 73, 95, 110, 28, 19, 61, 42, 85, 52, 24, …
$ pop_5yr_age_cat_75_79        <dbl> 34, 85, 66, 2, 16, 48, 21, 41, 41, 30, 12…
$ pop_5yr_age_cat_80_84        <dbl> 71, 106, 109, 10, 8, 32, 11, 34, 24, 36, …
$ pop_5yr_age_cat_85_plus      <dbl> 98, 35, 20, 17, 5, 14, 12, 33, 14, 60, 10…
$ ACS_MINPR                    <dbl> 98.61111, 99.15663, 99.48033, 98.80478, 1…
$ ice_race_black               <dbl> 0.9789397, 0.9533566, 0.8174493, 0.980609…
$ ice_race_hispanic            <dbl> 0.9673203, 0.9067132, 0.8480133, 0.979224…
$ ice_race_asian               <dbl> 0.9803922, 0.9533566, 0.8621839, 0.983379…
$ avg_hh_size_owner            <dbl> 2.31, 2.51, 2.15, 2.26, 2.34, 2.16, 1.92,…
$ avg_hh_size_renter           <dbl> 2.61, 2.71, 2.07, 2.45, 2.51, 2.29, 2.58,…
$ pct_crowd_half               <dbl> 0.0000000, 0.0000000, 0.5597015, 0.000000…
$ pct_not_in_labor_force       <dbl> 1.0999083, 0.4212744, 5.2889025, 1.388888…

Tabulate across geo and time.

Code
df_data %>% 
  count(geo, year) %>% 
  reactable()

Let’s just create a tidy dataframe to make it easier to QC.

Code
#|code-fold: false

df_tidy = df_data %>% 
  select('geoid','year','geo', contains('pct_')) %>% 
  pivot_longer(-c('geoid','year','geo'), names_to = 'var') 

df_tidy %>% 
  sample_n(5) %>% 
  reactable()

Invalid %

First check if if variables is a pct - check that values and between 0-100 or NA.

df_valid = df_tidy %>% 
  filter(str_detect(var, "pct_")) %>% 
  filter(!is.na(value)) %>% 
  verify(value <= 100) %>% 
  verify(value >= 0)

Missingness

Next let’s check the patterns of missingness. We noticed high missingness in year 2012; so that was quarentined for now.

Code
df_missing = df_tidy %>% 
  filter(year >2012) %>% 
  summarize(
    rows = n(),
    rows_missing = sum(is.na(value)),
    pct_missing = rows_missing/rows *100, 
    .by = c(year, geo, var)
  ) %>% 
  arrange(desc(pct_missing))

ggplot(df_missing, 
       aes(x = pct_missing, 
           y = reorder(var, pct_missing), 
           fill = geo)) +
  geom_bar(stat = "identity", 
           position = "dodge") +
  facet_wrap(~year) +
  scale_x_continuous(limits = c(0, 100)) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 8),
    legend.position = "bottom"
  ) +
  labs(
    title = "Missing Data Percentages by Year and Geography",
    x = "Percent Missing",
    y = "Variable",
    fill = "Geography Type"
  )

Other than that missingness is somewhat expected and looks okay for now.

Visual Inspection

Educational Attainment

Let’s do a quick visual inspection of educaiton attainment variable.

Code
df_data %>%
  select(geoid, year, geo,
         pct_bachelors_or_greater,
         ACS_MINPR) |> 
  pivot_longer(
    cols = c(
      pct_bachelors_or_greater,
      ACS_MINPR),
    names_to = "education_level",
    values_to = "percentage"
  ) |> 
  ggplot(aes(x = as.factor(year), y = percentage, fill = education_level)) +
  geom_violin(alpha = 0.5, position = position_dodge(width = 0.8)) +
  geom_boxplot(width = 0.2, alpha = 0.8, position = position_dodge(width = 0.8)) +
  facet_wrap(~geo) +
  theme_minimal() +
  labs(
    title = "Distribution of Educational Attainment",
    subtitle = "Comparing Greater Bachelor's Degree vs Greater than Primary Education",
    x = "Year",
    y = "Percentage",
    fill = "Education Level"
  ) +
  theme(
    plot.title = element_text(size = 12),
    legend.position = "bottom"
  )

Unemployment

Code
df_data %>%
  ggplot(aes(x = as.factor(year), y = pct_unemployed )) +
  geom_violin(fill = "salmon", alpha = 0.5) +
  geom_boxplot(width = 0.2, alpha = 0.8) +
  facet_wrap(~geo) +
  theme_minimal() +
  labs(
    title = "Distribution of Unemployment Rate",
    x = "Year",
    y = "Unemployment Rate (%)"
  ) +
  theme(plot.title = element_text(size = 10))

Overcrowding

Code
df_data %>%
  ggplot(aes(x = as.factor(year), y = pct_crowded_2more )) +
  geom_violin(fill = "lightgreen", alpha = 0.5) +
  geom_boxplot(width = 0.2, alpha = 0.8) +
  facet_wrap(~geo) +
  theme_minimal() +
  labs(
    title = "Distribution of Overcrowded Housing",
    x = "Year",
    y = "Percent Households with 1.51+ Occupants per Room"
  ) +
  theme(plot.title = element_text(size = 10))

Population Density

Lets plot tract level population density as

Code
df_data %>% 
  ggplot(aes(x = as.factor(year), y = population_density_per_sq_km)) +
  geom_violin(fill = "lightblue", alpha = 0.7) +
  scale_y_log10() +
  facet_wrap(~geo) +
  theme_minimal() +
  labs(
    title = "Distribution of Population Density in PA",
    x = "Year",
    y = "Population Density (per sq km, log scale)"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 0)
  )

Looks okay. Tract level values peaks between 10,000-20,000.

Population Age Sex

Just a quick EDA on the age sex population counts

Code
# Create long format data for gender comparison
elderly_data <- df_data %>%
  select(geoid, year, geo, total_male_65_plus, total_female_65_plus) %>%
  pivot_longer(
    cols = c(total_male_65_plus, total_female_65_plus),
    names_to = "gender",
    values_to = "population"
  ) %>%
  mutate(
    gender = case_when(
      gender == "total_male_65_plus" ~ "Male 65+",
      gender == "total_female_65_plus" ~ "Female 65+",
      TRUE ~ gender
    )
  )

# Create the visualization
ggplot(elderly_data, aes(x = as.factor(year), y = population, fill = gender)) +
  geom_violin(alpha = 0.5, position = position_dodge(width = 0.8)) +
  geom_boxplot(width = 0.2, alpha = 0.8, position = position_dodge(width = 0.8)) +
  facet_wrap(~geo, scales = "free_y") +
  scale_fill_manual(values = c("Male 65+" = "steelblue", "Female 65+" = "darkorange")) +
  labs(
    title = "Distribution of Elderly Population (65+) by Gender",
    subtitle = "Comparison Across Geographic Areas and Years",
    x = "Year",
    y = "Population Count",
    fill = "Gender Group"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(size = 12, face = "bold"),
    plot.subtitle = element_text(size = 10),
    strip.text = element_text(face = "bold")
  )

Looks sane.

Population Pct_65 Density

Code
pct_distribution <- df_data %>%
  # Select the appropriate columns from your new dataset
  select(geoid, pct_65_plus, pct_0_64) %>%
  # Rename for better plot labels
  rename("65 and older" = pct_65_plus, "Under 65" = pct_0_64) %>%
  # Pivot to long format for plotting
  pivot_longer(
    cols = c("Under 65", "65 and older"),
    names_to = "age_category",
    values_to = "percentage"
  )

# Create overlapping density plot
ggplot(pct_distribution, aes(x = percentage, fill = age_category)) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = c("Under 65" = "steelblue", "65 and older" = "firebrick")) +
  theme_minimal() +
  labs(title = "Distribution of Population Percentages by Age Group",
       subtitle = "Tract10 Level Data",
       x = "Percentage",
       y = "Density",
       fill = "Age Category") +
  theme(legend.position = "top")

Looks sane.

Population Pyrmid (random tracts)

Code
# Select 4 random tract geoids

set.seed(123)
random_tracts <- df_data |> 
  filter(geo == 'county') |>
  filter(year == 2015) |> 
  pull(geoid) |> 
  unique() |> 
  sample(4)



df_data_all = random_tracts |> 
  map_df(\(tract_tmp){
    # Extract data for the tract
    tract_data <- df_data %>% 
      filter(geoid == tract_tmp) |> 
      filter(year == 2015)
    
    # Get age columns - these are different in your new dataset
    # They follow the pattern pop_5yr_age_cat_X_Y in the new data
    age_columns <- grep("^pop_5yr_age_cat", names(df_data), value = TRUE)
    
    # Create dataframe with age data
    age_data <- data.frame(
      age_group = age_columns,
      population = as.numeric(tract_data[1, age_columns]),
      tract_id = tract_tmp,
      year = tract_data$year,
      total_pop = tract_data$total_population
    ) |> 
      mutate(age = str_remove_all(age_group, "pop_5yr_age_cat_") |> 
               parse_number()) |> 
      select(tract = tract_id, year, age, pop = population)
  })
    


# Filter data to include selected tracts
selected_tracts_data <- df_data_all 

# Create factor with correct ordering for age groups
selected_tracts_data$age_factor <- factor(selected_tracts_data$age, 
                                          levels = c(0, 5, 10, 15, 20, 25, 30, 35, 
                                                     40, 45, 50, 55, 60, 65, 70, 75, 
                                                     80, 85))

# Create tract label for facets
selected_tracts_data$tract_label <- paste0("Tract: ", selected_tracts_data$tract, " (", selected_tracts_data$year, ")")

# Create a single plot with facets
selected_tracts_data |> 
  ggplot(aes(x = age_factor, y = pop)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  facet_wrap(~ tract, scales = "free_x")+
  theme_minimal() +
  labs(title = "Population by Age Group for Selected County",
       x = "Age Group (Years)",
       y = "Population")

Looks okay.

Export

if (!file.exists(context$final)){
  df_data %>% write_parquet(context$final)
}

4. Metadata

Okay let’s compile the metadata

df_metadata =  list(
  df_ms2_metadata,
  df_shreya_metadata %>% filter(!column_name%in%c('geoid','year','geo')),
  df_uhc_metadata %>% filter(!column_name%in%c('geoid','year','geo'))
) %>% 
  bind_rows()

5. Access

Data

```{r}
library(arrow)
library(tidyverse)

ds = "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/ccuh-server/freeze/acs_v1/acs__v1.parquet" |> 
  arrow::open_dataset()
  
df = ds %>% 
  collect()
```

Metadata

Code
df_metadata %>% 
  reactable(
    searchable = T,
    defaultPageSize = 99
  )