Please refer to this documentation log for ACS data for CCUH. Note that in this first version we will just pull a subset of requested variables to support the first three papers focusing on FAIR best practices and good docuemntation. We have a list of tables identified which we will pull data from at the tract10 level and operationalize any requested summaries.
Tables
Starting with a few example ACS datasets (UHC nbhd dataset, MS2 data pull, Shreya pull) we have identified a ACS tables from which the data comes from - these are documented here.
Lets subset hte ACS variabel list to those in these tables.
Table of ACS variables of interest
tables_of_interest = acs_tables <-c(# MS2 Tables"B01003", # Total population"B25001", # Housing units"B25002", # Occupancy status"B25003", # Tenure"B25014", # Occupants per room"B25034", # Year structure built"B25047", # Plumbing facilities"B25051", # Kitchen facilities"B25070", # Gross rent as percentage of income# Shreya Tables"B01001", # Sex by age"B03002", # Race/ethnicity"B15003", # Educational attainment"B17001", # Poverty status"B19013", # Median household income"B23025", # Employment status"B25035", # Median year structure built"B25044", # Tenure by vehicles available"B25077", # Median value"B27001", # Health insurance coverage# Additional UHC Tables"B15002", # Sex by educational attainment"B23001", # Sex by employment status"B25010"# Average household size by tenure) %>%paste0(.,"_") %>%paste(collapse ='|')df_tables = df_vars_acs %>%filter(str_detect(name, tables_of_interest)) %>%arrange(concept) %>%select(-year, -geography) %>%distinct()reactable(df_tables, searchable = T)
2. Components
MS2 Data Pull
These variables were operatioanlized ad hoc for Tara’s CCUH MS2, we just reproduce and integrate into the CCUH ACS dataset here. Before we start we need to pull in ALAND which is needed to calculate population density. This is a variable we can calculate based on total population and ALAND from the TIGER files. Let’s pull in the ALAND data first - note we have operatainolized this in the CCUH Tiger Boundaries API.
Looks good now we can use this along with ACS population to calculate pop density. Now lets do our ACS Pull and cleaning.
MS2 - nbhd variables function
get_ms2_acs_data <-function(geography, year) {## Pull ACS Data#### geography = 'county subdivision'; year = 2017#### geography = 'county subdivision'; year = 2012#### geography = 'tract'; year = 2020 vars_tmp =c(## OCCUPANCY STATUS "B25002_001", ## Total"B25002_002", ## Occupied## YEAR STRUCTURE BUILT"B25034_001", # Total"B25034_007", # pre-2015: 1960 to 1969; 2015+: 1970 to 1979"B25034_008", # pre-2015: 1950 to 1959; 2015+: 1960 to 1969"B25034_009", # pre-2015: 1940 to 1949; 2015+: 1950 to 1959"B25034_010", # pre-2015: 1939 or earlier; 2015+: 1940 to 1949"B25034_011", # pre-2015: NA; 2015+: 1939 or earlier## TENURE (RENTER)"B25003_001", ## Total!!Renter occupied"B25003_003", ## Renter occupied## PLUMBING FACILITIES FOR ALL HOUSING UNITS"B25047_001", ## Total"B25047_003", ## Lacking complete plumbing facilities## KITCHEN FACILITIES FOR ALL HOUSING UNITS"B25051_001", ## Total"B25051_003", ## Lacking complete kitchen facilities## TENURE BY OCCUPANTS PER ROOM "B25014_001", ## Total"B25014_006", ## TENURE BY OCCUPANTS PER ROOM (1.51 to 2.00 or more occupants per room)"B25014_007", ## TENURE BY OCCUPANTS PER ROOM (2.01 or more occupants per room)## GROSS RENT AS A PERCENTAGE OF HOUSEHOLD INCOME IN THE PAST 12 MONTHS"B25070_001", ## Total"B25070_007", ## (30.0-34.9% or more)"B25070_008", ## (34.9-39.9% or more)"B25070_009", ## (40.0 to 49.9 % or more)"B25070_010", ## (50% or more)## TOTAL POPULATION (general)"B01003_001"## Total ) if (year<2015) vars_tmp = vars_tmp %>% purrr::discard(~.x=='B25034_011') df_pull =get_acs(geography = geography,state ="PA", year = year,survey ="acs5",variables = vars_tmp,output ="wide" ) %>%## Standardize ZCTA to ZCTA5mutate(geo = geography,GEOID =if_else(geo =='zcta', str_sub(GEOID, -5), GEOID)) %>%rename(geoid = GEOID) %>%## Merge in ALANDleft_join(df_ALAND) # %>% assert(not_na, c('ALAND', 'B01003_001E')) Relax as some are missing. { # QC -----------------------------------------------------------## If pct_missing_ALAND units is significant (> 5%) then quarentine all ALAND downstream variables (tract10-2020) n_rows =nrow(df_pull) n_rows_missing_ALAND = df_pull %>%filter(is.na(ALAND)) %>%nrow() pct_missing_ALAND = n_rows_missing_ALAND/n_rows *100if (pct_missing_ALAND >5){ df_pull = df_pull %>%mutate(ALAND =NA) cli::cli_alert_danger("Quarentine ALAND variables due to systematic ALAND missingness") } }## Summary Metrics df_wide = df_pull %>%mutate(year = year,pct_occupied = B25002_002E / B25002_001E *100,pct_pre_1970 =ifelse( year <2015, (B25034_007E + B25034_008E + B25034_009E + B25034_010E) / B25034_001E, (B25034_008E + B25034_009E + B25034_010E + B25034_011E) / B25034_001E ),pct_renter = B25003_003E / B25003_001E *100,pct_lacking_plumbing = B25047_003E / B25047_001E *100,pct_lacking_kitchen = B25051_003E / B25051_001E *100,pct_crowded_1_5more = (B25014_006E + B25014_007E) / B25014_001E *100,pct_crowded_2more = B25014_007E / B25014_001E *100,pct_high_grapi_30more = (B25070_007E + B25070_008E + B25070_009E + B25070_010E) / B25070_001E *100,pct_high_grapi_50more = B25070_010E / B25070_001E *100,area_sq_km = ALAND /1e6, # Convert square meters to square kilometerspopulation_density_per_sq_km = B01003_001E / area_sq_km ) %>%select(geoid , year, geo, total_population = B01003_001E, starts_with("pct_"), area_sq_km, population_density_per_sq_km)return(df_wide)}get_ms2_acs_data("county subdivision", 2012)
Note we quarent pct_pre_1970 to 2015+ due to different code.
Here is the metadata.
Code
df_ms2_metadata <-tribble(~column_name, ~description, ~data_type, ~source,"geoid", "Unique identifier for the geography", "character", "Census GEOID","year", "Year of the ACS data", "integer", "ACS 5-year estimates","geo", "Geography type (tract, county subdivision, etc.)", "character", "Function input parameter","total_population", "Total population estimate", "double", "ACS variable B01003_001","pct_occupied", "Percentage of occupied housing units", "double", "B25002_002/B25002_001 * 100","pct_pre_1970", "Percentage of housing units built before 1970 (Note: Different calculation pre/post 2015)", "double", "Custom calculation based on year, using B25034 series","pct_renter", "Percentage of renter-occupied housing units", "double", "B25003_003/B25003_001 * 100","pct_lacking_plumbing", "Percentage lacking complete plumbing facilities", "double", "B25047_003/B25047_001 * 100","pct_lacking_kitchen", "Percentage lacking complete kitchen facilities", "double", "B25051_003/B25051_001 * 100","pct_crowded_1_5more", "Percentage with 1.51 or more occupants per room", "double", "(B25014_006 + B25014_007)/B25014_001 * 100","pct_crowded_2more", "Percentage with 2.01 or more occupants per room", "double", "B25014_007/B25014_001 * 100","pct_high_grapi_30more", "Percentage paying 30% or more of income on rent", "double", "(B25070_007 + B25070_008 + B25070_009 + B25070_010)/B25070_001 * 100","pct_high_grapi_50more", "Percentage paying 50% or more of income on rent", "double", "B25070_010/B25070_001 * 100","area_sq_km", "Land area in square kilometers", "double", "ALAND/1e6","population_density_per_sq_km", "Population per square kilometer", "double", "total_population/area_sq_km")df_ms2_metadata %>%reactable()
Shreya’s Data Pull
These were contribtued by Shreya see this GitHub issue.
df_shreya_metadata <-tribble(~column_name, ~description, ~data_type, ~source,"geoid", "Census geographic identifier", "character", "Census GEOID","year", "Year of ACS data", "integer", "ACS 5-year estimates","geo", "Geographic level (tract/zcta)", "character", "Geographic specification","median_income", "Median household income in past 12 months", "double", "ACS B19013_001","median_housing_value", "Median value of owner-occupied housing units", "double", "ACS B25077_001","median_year_built", "Median year structure built", "double", "ACS B25035_001","pct_65_plus", "Percent population 65 years and over", "double", "Calculated from sum of B01001 age groups 65+","pct_0_64", "Percent population under 65 years", "double", "Calculated as 100 - pct_65_plus","pct_hs_or_less", "Percent with high school or less education", "double", "Sum of B15003_002 through B15003_016 divided by total","ACS_MINPR", "Percentage of the population aged 25 or older who completed primary education or above. In general, completed primary is considered to be attained at 6 years of education.","double", "total minus all below 6th grade categories divided by total","pct_hs_grad", "Percent high school graduates (including GED)", "double", "B15003_017 + B15003_018 divided by total","pct_some_college", "Percent with some college credit or associate's degree, no bachelor's", "double", "B15003_019 + B15003_020 + B15003_021 divided by total","pct_bachelors", "Percent with bachelor's degree", "double", "B15003_022 divided by total","pct_bachelors_or_greater", "Percent with bachelor's degree or higher", "double", "B15003_022 + B15003_023 + B15003_024 + B15003_025 divided by total","pct_overcrowding_owners", "Percent owner units with >1.51 occupants per room", "double", "B25014_007 divided by B25014_002","pct_overcrowding_renters", "Percent renter units with >1.51 occupants per room", "double", "B25014_013 divided by B25014_008","pct_unemployed", "Unemployment rate", "double", "B23025_005 divided by B23025_002","pct_no_health_insurance", "Percent without health insurance", "double", "(B27001_005 + B27001_030) divided by B27001_001","pct_vehicle_access", "Percent households with vehicle access", "double", "1 - (B25044_003 + B25044_010)/(B25044_001 + B25044_009)","pct_nh_black", "Percent non-Hispanic Black alone", "double", "B03002_004 divided by B03002_001","pct_nh_white", "Percent non-Hispanic White alone", "double", "B03002_003 divided by B03002_001","pct_nh_asian", "Percent non-Hispanic Asian alone", "double", "B03002_006 divided by B03002_001","pct_hispanic", "Percent Hispanic or Latino (any race)", "double", "B03002_012 divided by B03002_001","pct_below_poverty", "Percent below poverty level", "double", "B17001_002 divided by B17001_001","ice_race_black", "Index of Concentration at Extremes (ICE) White vs Black", "double", "(White - Black)/Total population","ice_race_hispanic", "ICE White vs Hispanic", "double", "(White - Hispanic)/Total population","ice_race_asian", "ICE White vs Asian", "double", "(White - Asian)/Total population")df_shreya_metadata %>%reactable()
df_uhc_metadata <-tribble(~column_name, ~description, ~data_type, ~source,"geoid", "Census geographic identifier", "character", "Census GEOID","year", "Year of ACS data", "integer", "ACS 5-year estimates","geo", "Geographic level", "character", "Geographic specification","avg_hh_size_owner", "Average household size in owner-occupied units", "double", "B25010_002","avg_hh_size_renter", "Average household size in renter-occupied units", "double", "B25010_003","pct_crowd_half", "Percentage of occupied housing units with >0.5 persons per room", "double", "B25014_005 + B25014_011/B25014_001","pct_not_in_labor_force", "Percentage not in labor force among population 16+ years", "double", "B23001_004/B23001_001")df_uhc_metadata %>%reactable()
3. Data
Harmonize
Here we pull ACS data and operationalize the requested neighborhood level variables for CCUH. Lets test the harmonization of a single year and geography (zcta)
df_data %>%ggplot(aes(x =as.factor(year), y = pct_crowded_2more )) +geom_violin(fill ="lightgreen", alpha =0.5) +geom_boxplot(width =0.2, alpha =0.8) +facet_wrap(~geo) +theme_minimal() +labs(title ="Distribution of Overcrowded Housing",x ="Year",y ="Percent Households with 1.51+ Occupants per Room" ) +theme(plot.title =element_text(size =10))
Population Density
Lets plot tract level population density as
Code
df_data %>%ggplot(aes(x =as.factor(year), y = population_density_per_sq_km)) +geom_violin(fill ="lightblue", alpha =0.7) +scale_y_log10() +facet_wrap(~geo) +theme_minimal() +labs(title ="Distribution of Population Density in PA",x ="Year",y ="Population Density (per sq km, log scale)" ) +theme(plot.title =element_text(hjust =0.5),axis.text.x =element_text(angle =0) )
Looks okay. Tract level values peaks between 10,000-20,000.
Population Age Sex
Just a quick EDA on the age sex population counts
Code
# Create long format data for gender comparisonelderly_data <- df_data %>%select(geoid, year, geo, total_male_65_plus, total_female_65_plus) %>%pivot_longer(cols =c(total_male_65_plus, total_female_65_plus),names_to ="gender",values_to ="population" ) %>%mutate(gender =case_when( gender =="total_male_65_plus"~"Male 65+", gender =="total_female_65_plus"~"Female 65+",TRUE~ gender ) )# Create the visualizationggplot(elderly_data, aes(x =as.factor(year), y = population, fill = gender)) +geom_violin(alpha =0.5, position =position_dodge(width =0.8)) +geom_boxplot(width =0.2, alpha =0.8, position =position_dodge(width =0.8)) +facet_wrap(~geo, scales ="free_y") +scale_fill_manual(values =c("Male 65+"="steelblue", "Female 65+"="darkorange")) +labs(title ="Distribution of Elderly Population (65+) by Gender",subtitle ="Comparison Across Geographic Areas and Years",x ="Year",y ="Population Count",fill ="Gender Group" ) +theme_minimal() +theme(legend.position ="bottom",plot.title =element_text(size =12, face ="bold"),plot.subtitle =element_text(size =10),strip.text =element_text(face ="bold") )
Looks sane.
Population Pct_65 Density
Code
pct_distribution <- df_data %>%# Select the appropriate columns from your new datasetselect(geoid, pct_65_plus, pct_0_64) %>%# Rename for better plot labelsrename("65 and older"= pct_65_plus, "Under 65"= pct_0_64) %>%# Pivot to long format for plottingpivot_longer(cols =c("Under 65", "65 and older"),names_to ="age_category",values_to ="percentage" )# Create overlapping density plotggplot(pct_distribution, aes(x = percentage, fill = age_category)) +geom_density(alpha =0.5) +scale_fill_manual(values =c("Under 65"="steelblue", "65 and older"="firebrick")) +theme_minimal() +labs(title ="Distribution of Population Percentages by Age Group",subtitle ="Tract10 Level Data",x ="Percentage",y ="Density",fill ="Age Category") +theme(legend.position ="top")
Looks sane.
Population Pyrmid (random tracts)
Code
# Select 4 random tract geoidsset.seed(123)random_tracts <- df_data |>filter(geo =='county') |>filter(year ==2015) |>pull(geoid) |>unique() |>sample(4)df_data_all = random_tracts |>map_df(\(tract_tmp){# Extract data for the tract tract_data <- df_data %>%filter(geoid == tract_tmp) |>filter(year ==2015)# Get age columns - these are different in your new dataset# They follow the pattern pop_5yr_age_cat_X_Y in the new data age_columns <-grep("^pop_5yr_age_cat", names(df_data), value =TRUE)# Create dataframe with age data age_data <-data.frame(age_group = age_columns,population =as.numeric(tract_data[1, age_columns]),tract_id = tract_tmp,year = tract_data$year,total_pop = tract_data$total_population ) |>mutate(age =str_remove_all(age_group, "pop_5yr_age_cat_") |>parse_number()) |>select(tract = tract_id, year, age, pop = population) })# Filter data to include selected tractsselected_tracts_data <- df_data_all # Create factor with correct ordering for age groupsselected_tracts_data$age_factor <-factor(selected_tracts_data$age, levels =c(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85))# Create tract label for facetsselected_tracts_data$tract_label <-paste0("Tract: ", selected_tracts_data$tract, " (", selected_tracts_data$year, ")")# Create a single plot with facetsselected_tracts_data |>ggplot(aes(x = age_factor, y = pop)) +geom_bar(stat ="identity", fill ="steelblue") +coord_flip() +facet_wrap(~ tract, scales ="free_x")+theme_minimal() +labs(title ="Population by Age Group for Selected County",x ="Age Group (Years)",y ="Population")
Looks okay.
Export
if (!file.exists(context$final)){ df_data %>%write_parquet(context$final)}