From 896c80ba52bcb92ab2622804c08cc5b0c73b4371 Mon Sep 17 00:00:00 2001 From: Dylan Date: Tue, 7 Apr 2026 11:54:43 -0700 Subject: [PATCH 1/9] draft new acs release script --- .github/workflows/check_acs_release.yml | 30 +++++++++ tools/check_acs_release.py | 88 +++++++++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 .github/workflows/check_acs_release.yml create mode 100644 tools/check_acs_release.py diff --git a/.github/workflows/check_acs_release.yml b/.github/workflows/check_acs_release.yml new file mode 100644 index 00000000..7604cdc4 --- /dev/null +++ b/.github/workflows/check_acs_release.yml @@ -0,0 +1,30 @@ +name: Check for new ACS TIGER_DP release + +on: + schedule: + - cron: "0 15 1 * *" + workflow_dispatch: + +jobs: + check-release: + runs-on: ubuntu-latest + permissions: + contents: read + issues: write + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install requests PyGithub + + - name: Check Census release and open issue + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: python tools/check_acs_release.py \ No newline at end of file diff --git a/tools/check_acs_release.py b/tools/check_acs_release.py new file mode 100644 index 00000000..f36f9f64 --- /dev/null +++ b/tools/check_acs_release.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +import os +import sys +import requests +from github import Github + +REPO = "oturns/geosnap" +ISSUE_PREFIX = "New ACS release detected:" +CENSUS_ROOT = "https://www2.census.gov/geo/tiger/TIGER_DP" +TIMEOUT = 30 + +# Temporary until this can be inferred from package metadata or datastore contents +LATEST_SUPPORTED_YEAR = 2021 + +def census_year_url(year: int) -> str: + return f"{CENSUS_ROOT}/{year}ACS/" + +def expected_files(year: int) -> list[str]: + # block group + tract are the most relevant sentinels for geosnap's tooling + return [ 
+ f"ACS_{year}_5YR_BG.gdb.zip", + f"ACS_{year}_5YR_TRACT.gdb.zip", + ] + +def fetch_directory_listing(year: int) -> str | None: + url = census_year_url(year) + resp = requests.get(url, timeout=TIMEOUT) + if resp.status_code == 404: + return None + resp.raise_for_status() + return resp.text + +def census_release_ready(year: int) -> tuple[bool, list[str]]: + html = fetch_directory_listing(year) + if html is None: + return False, [] + missing = [name for name in expected_files(year) if name not in html] + return len(missing) == 0, missing + +def issue_exists(repo, year: int) -> bool: + query = f'repo:{REPO} is:issue is:open "{ISSUE_PREFIX} {year}"' + return Github(os.environ["GITHUB_TOKEN"]).search_issues(query).totalCount > 0 + +def open_issue(repo, year: int, missing: list[str]) -> None: + title = f"{ISSUE_PREFIX} {year}" + body = f"""A new ACS vintage appears to be available on Census. + +Checked: +- {census_year_url(year)} + +Expected files: +- {expected_files(year)[0]} +- {expected_files(year)[1]} + +Missing files at check time: +- {", ".join(missing) if missing else "None"} + +This issue was opened automatically by the release-check workflow. +""" + repo.create_issue(title=title, body=body) + +def main() -> int: + token = os.environ.get("GITHUB_TOKEN") + if not token: + print("Missing GITHUB_TOKEN", file=sys.stderr) + return 1 + + year = LATEST_SUPPORTED_YEAR + 1 + ready, missing = census_release_ready(year) + + if not ready: + print(f"{year} release not ready. 
Missing: {missing}") + return 0 + + gh = Github(token) + repo = gh.get_repo(REPO) + + if issue_exists(repo, year): + print(f"Issue for {year} already exists.") + return 0 + + open_issue(repo, year, missing) + print(f"Opened issue for {year}.") + return 0 + +if __name__ == "__main__": + raise SystemExit(main()) From 4e8afca35573dbe98de6eb91e156f0d28da4020e Mon Sep 17 00:00:00 2001 From: Dylan Date: Thu, 9 Apr 2026 11:40:30 -0700 Subject: [PATCH 2/9] attempt to process acs too --- .github/workflows/check_acs_release.yml | 15 +- environment.yml | 2 + tools/check_acs_release.py | 204 +++++++++++++++++++----- 3 files changed, 180 insertions(+), 41 deletions(-) diff --git a/.github/workflows/check_acs_release.yml b/.github/workflows/check_acs_release.yml index 7604cdc4..de48941f 100644 --- a/.github/workflows/check_acs_release.yml +++ b/.github/workflows/check_acs_release.yml @@ -15,16 +15,21 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 + - uses: conda-incubator/setup-miniconda@v3 with: + activate-environment: geosnap + environment-file: environment.yml python-version: "3.11" + auto-activate-base: false - - name: Install dependencies + - name: Install package and extra dependencies + shell: bash -l {0} run: | - python -m pip install --upgrade pip - pip install requests PyGithub + pip install -e . 
+ pip install PyGithub - - name: Check Census release and open issue + - name: Check Census release and process if available + shell: bash -l {0} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: python tools/check_acs_release.py \ No newline at end of file diff --git a/environment.yml b/environment.yml index af76fa00..c1775513 100644 --- a/environment.yml +++ b/environment.yml @@ -26,3 +26,5 @@ dependencies: - pandarm - ibis-framework - ultraplot + - pyogrio + - dask-geopandas diff --git a/tools/check_acs_release.py b/tools/check_acs_release.py index f36f9f64..5ff19c59 100644 --- a/tools/check_acs_release.py +++ b/tools/check_acs_release.py @@ -1,27 +1,42 @@ from __future__ import annotations - import os import sys +from pathlib import Path +import geopandas as gpd import requests from github import Github +from geosnap.io.util import get_census_gdb, convert_census_gdb, process_acs REPO = "oturns/geosnap" ISSUE_PREFIX = "New ACS release detected:" CENSUS_ROOT = "https://www2.census.gov/geo/tiger/TIGER_DP" TIMEOUT = 30 -# Temporary until this can be inferred from package metadata or datastore contents +# TODO: make this update dynamically LATEST_SUPPORTED_YEAR = 2021 +# Start with one geography to keep memory lower and behavior predictable. 
+GEOM_LEVEL = "blockgroup" # "blockgroup" or "tract" +LEVEL_CODE = "bg" if GEOM_LEVEL == "blockgroup" else "tract" +FILE_SUFFIX = "BG" if GEOM_LEVEL == "blockgroup" else "TRACT" + +# ensure the file actually has stuff in it +MIN_EXPECTED_SIZE_BYTES = 1_250_000_000 + +WORKDIR = Path("build") / f"{LATEST_SUPPORTED_YEAR + 1}_{LEVEL_CODE}" + + def census_year_url(year: int) -> str: return f"{CENSUS_ROOT}/{year}ACS/" -def expected_files(year: int) -> list[str]: - # block group + tract are the most relevant sentinels for geosnap's tooling - return [ - f"ACS_{year}_5YR_BG.gdb.zip", - f"ACS_{year}_5YR_TRACT.gdb.zip", - ] + +def expected_file(year: int) -> str: + return f"ACS_{year}_5YR_{FILE_SUFFIX}.gdb.zip" + + +def expected_file_url(year: int) -> str: + return f"{census_year_url(year)}{expected_file(year)}" + def fetch_directory_listing(year: int) -> str | None: url = census_year_url(year) @@ -31,34 +46,130 @@ def fetch_directory_listing(year: int) -> str | None: resp.raise_for_status() return resp.text -def census_release_ready(year: int) -> tuple[bool, list[str]]: + +def remote_file_size_bytes(url: str) -> int | None: + """ + Try to get the remote file size from HTTP headers. + + Returns: + int: size in bytes if available + None: if the server does not provide Content-Length + """ + resp = requests.head(url, allow_redirects=True, timeout=TIMEOUT) + + if resp.status_code == 404: + return None + + # Some servers do not return Content-Length on HEAD. Fall back to GET stream. 
+ if resp.ok: + content_length = resp.headers.get("Content-Length") + if content_length is not None: + return int(content_length) + + resp = requests.get(url, stream=True, allow_redirects=True, timeout=TIMEOUT) + + if resp.status_code == 404: + return None + + resp.raise_for_status() + content_length = resp.headers.get("Content-Length") + if content_length is None: + return None + return int(content_length) + + +def census_release_status(year: int) -> tuple[bool, str]: + """ + Check whether the release is ready for processing. + + A release is considered ready only if the year directory exists, + the expected file is listed in the directory, and the remote + file size is at least MIN_EXPECTED_SIZE_BYTES + """ html = fetch_directory_listing(year) if html is None: - return False, [] - missing = [name for name in expected_files(year) if name not in html] - return len(missing) == 0, missing + return False, f"{census_year_url(year)} not found" + + filename = expected_file(year) + if filename not in html: + return False, f"{filename} not listed in {census_year_url(year)}" + + file_url = expected_file_url(year) + size_bytes = remote_file_size_bytes(file_url) + + if size_bytes is None: + return False, f"Could not determine remote file size for {file_url}" + + if size_bytes < MIN_EXPECTED_SIZE_BYTES: + return ( + False, + f"{filename} is present but too small " + f"({size_bytes:,} bytes < {MIN_EXPECTED_SIZE_BYTES:,} bytes)", + ) + + return ( + True, + f"{filename} is present and large enough " + f"({size_bytes:,} bytes >= {MIN_EXPECTED_SIZE_BYTES:,} bytes)", + ) -def issue_exists(repo, year: int) -> bool: + +def issue_exists(year: int) -> bool: query = f'repo:{REPO} is:issue is:open "{ISSUE_PREFIX} {year}"' - return Github(os.environ["GITHUB_TOKEN"]).search_issues(query).totalCount > 0 + gh = Github(os.environ["GITHUB_TOKEN"]) + return gh.search_issues(query).totalCount > 0 + + +def open_issue(year: int, body: str) -> None: + gh = Github(os.environ["GITHUB_TOKEN"]) + repo = 
gh.get_repo(REPO) + repo.create_issue( + title=f"{ISSUE_PREFIX} {year}", + body=body, + ) + + +def ensure_workdir() -> None: + WORKDIR.mkdir(parents=True, exist_ok=True) -def open_issue(repo, year: int, missing: list[str]) -> None: - title = f"{ISSUE_PREFIX} {year}" - body = f"""A new ACS vintage appears to be available on Census. -Checked: -- {census_year_url(year)} +def download_raw_gdb(year: int) -> Path: + ensure_workdir() + get_census_gdb( + years=[year], + geom_level=GEOM_LEVEL, + output_dir=str(WORKDIR), + protocol="https", + ) + return WORKDIR / expected_file(year) -Expected files: -- {expected_files(year)[0]} -- {expected_files(year)[1]} -Missing files at check time: -- {", ".join(missing) if missing else "None"} +def convert_raw_gdb(year: int, gdb_path: Path) -> Path: + convert_census_gdb( + year=str(year), + level=LEVEL_CODE, + gdb_path=str(gdb_path), + layers=None, + save_intermediate=True, + overwrite=False, + combine=True, + output_dir=str(WORKDIR), + ) + return WORKDIR / f"acs_demographic_profile_{year}_{LEVEL_CODE}.parquet" + + +def build_processed_acs(year: int, combined_path: Path) -> Path: + df = gpd.read_parquet(combined_path) + + if "GEOID" not in df.columns: + df = df.reset_index() + + processed = process_acs(df) + + out_path = WORKDIR / f"acs_{year}_{LEVEL_CODE}.parquet" + processed.to_parquet(out_path) + return out_path -This issue was opened automatically by the release-check workflow. -""" - repo.create_issue(title=title, body=body) def main() -> int: token = os.environ.get("GITHUB_TOKEN") @@ -67,22 +178,43 @@ def main() -> int: return 1 year = LATEST_SUPPORTED_YEAR + 1 - ready, missing = census_release_ready(year) + + ready, status_message = census_release_status(year) + print(status_message) if not ready: - print(f"{year} release not ready. 
Missing: {missing}") + print(f"{year} release not ready for processing.") return 0 - gh = Github(token) - repo = gh.get_repo(REPO) + try: + gdb_path = download_raw_gdb(year) + print(f"Downloaded: {gdb_path}") + + combined_path = convert_raw_gdb(year, gdb_path) + print(f"Combined parquet: {combined_path}") + + final_path = build_processed_acs(year, combined_path) + print(f"Processed ACS parquet: {final_path}") - if issue_exists(repo, year): - print(f"Issue for {year} already exists.") return 0 - open_issue(repo, year, missing) - print(f"Opened issue for {year}.") - return 0 + except Exception as exc: + msg = ( + f"Detected Census ACS release for {year}, but automated processing failed.\n\n" + f"Checked directory: {census_year_url(year)}\n" + f"Checked file: {expected_file_url(year)}\n\n" + f"Preflight check: {status_message}\n\n" + f"Error:\n```\n{exc}\n```" + ) + print(msg, file=sys.stderr) + + if os.environ.get("DISABLE_GITHUB_ISSUES", "").lower() not in {"1", "true", "yes"}: + if not issue_exists(year): + open_issue(year, msg) + else: + print("Skipping issue creation because DISABLE_GITHUB_ISSUES is set.", file=sys.stderr) + + return 1 if __name__ == "__main__": raise SystemExit(main()) From b283b1819da2f146c87d679f2eaa674acfd08994 Mon Sep 17 00:00:00 2001 From: Dylan Date: Thu, 9 Apr 2026 12:12:24 -0700 Subject: [PATCH 3/9] GEOID renamed in non-geo layers of 2022 acs vintage --- geosnap/io/util.py | 39 +++++++++++++++++++++++++++++--------- tools/check_acs_release.py | 21 +++++++++++++++++--- 2 files changed, 48 insertions(+), 12 deletions(-) diff --git a/geosnap/io/util.py b/geosnap/io/util.py index 30d9fd22..020eb62d 100644 --- a/geosnap/io/util.py +++ b/geosnap/io/util.py @@ -163,17 +163,38 @@ def convert_census_gdb( ) # remove prefix for bgs tables.append(df) else: - df = ( - dgpd.read_file(gdb_path, layer=i, npartitions=npartitions) - .compute() - .set_index("GEOID") - ) + raw = dgpd.read_file(gdb_path, layer=i, npartitions=npartitions).compute() + + if 
"GEOID" in raw.columns: + geoid_col = "GEOID" + elif "GEOIDFQ" in raw.columns: + geoid_col = "GEOIDFQ" + elif "GEOID_Data" in raw.columns: + geoid_col = "GEOID_Data" + else: + raise KeyError( + f"No GEOID-like column found in layer {i}. Columns are: {list(raw.columns)}" + ) + + df = raw.set_index(geoid_col) + if "ACS_" not in i: # only the geoms have the ACS prefix - df = df[df.columns[df.columns.str.contains("e")]] - df.columns = pd.Series(df.columns).apply(reformat_acs_vars) + # newer vintages already use normalized names like B02001_E001. + # older vintages may still use names like B02001e1. + uppercase_estimates = df.columns[df.columns.str.contains("_E", regex=False)] + lowercase_estimates = df.columns[df.columns.str.contains("e", regex=False)] + + if len(uppercase_estimates) > 0: + df = df[uppercase_estimates] + else: + df = df[lowercase_estimates] + df.columns = pd.Series(df.columns).apply(reformat_acs_vars) + df = df.dropna(axis=1, how="all") - df.index = df.index.str.replace("14000US", "") # remove prefix for tracts - df.index = df.index.str.replace("15000US", "") # remove prefix for bgs + df.index = df.index.astype(str) + df.index = df.index.str.replace("14000US", "", regex=False) + df.index = df.index.str.replace("15000US", "", regex=False) + if combine: tables.append(df) if save_intermediate: diff --git a/tools/check_acs_release.py b/tools/check_acs_release.py index 5ff19c59..f01f321c 100644 --- a/tools/check_acs_release.py +++ b/tools/check_acs_release.py @@ -132,19 +132,34 @@ def open_issue(year: int, body: str) -> None: def ensure_workdir() -> None: WORKDIR.mkdir(parents=True, exist_ok=True) - + def download_raw_gdb(year: int) -> Path: ensure_workdir() + filename = expected_file(year) get_census_gdb( years=[year], geom_level=GEOM_LEVEL, output_dir=str(WORKDIR), protocol="https", ) - return WORKDIR / expected_file(year) + matches = [p.resolve() for p in WORKDIR.rglob(filename) if p.is_file()] + if not matches: + raise FileNotFoundError( + f"Could not 
find downloaded file {filename} under {WORKDIR.resolve()}" + ) + + if len(matches) > 1: + print("Multiple file matches found:", file=sys.stderr) + for match in matches: + print(f" {match}", file=sys.stderr) + chosen = matches[0] + print(f"Using downloaded zip file: {chosen}", file=sys.stderr) + return chosen def convert_raw_gdb(year: int, gdb_path: Path) -> Path: + gdb_path = gdb_path.resolve() + convert_census_gdb( year=str(year), level=LEVEL_CODE, @@ -198,7 +213,7 @@ def main() -> int: return 0 - except Exception as exc: + except Exception as exc: msg = ( f"Detected Census ACS release for {year}, but automated processing failed.\n\n" f"Checked directory: {census_year_url(year)}\n" From 6236c82452397ad0ae7e4931ab2e0a7ffb0233f7 Mon Sep 17 00:00:00 2001 From: Dylan Date: Thu, 9 Apr 2026 14:18:23 -0700 Subject: [PATCH 4/9] no vars present in 2022 --- build/examine_output.ipynb | 108 +++++++++++++++++++++++++++++++++++++ geosnap/io/util.py | 9 ++-- 2 files changed, 113 insertions(+), 4 deletions(-) create mode 100644 build/examine_output.ipynb diff --git a/build/examine_output.ipynb b/build/examine_output.ipynb new file mode 100644 index 00000000..08b25009 --- /dev/null +++ b/build/examine_output.ipynb @@ -0,0 +1,108 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bf7ad3ec-6101-454a-9092-fcbe217ba030", + "metadata": {}, + "source": [ + "# Examine output\n", + "\n", + "use this notebook to see how effective the processing is for 2022." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "ec35a66c-96b3-402c-aae4-feac48a1e742", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Needed ACS vars: 260\n", + "Missing in 2021: 171\n", + "Missing in 2022: 260\n", + "Newly missing in 2022: 89\n", + "Missing in 2021 but present in 2022: 0\n", + "\n", + "First 100 newly missing in 2022:\n", + "['B01001_003E', 'B01001_004E', 'B01001_005E', 'B01001_006E', 'B01001_018E', 'B01001_019E', 'B01001_020E', 'B01001_021E', 'B01001_022E', 'B01001_023E', 'B01001_024E', 'B01001_025E', 'B01001_027E', 'B01001_028E', 'B01001_029E', 'B01001_030E', 'B01001_042E', 'B01001_043E', 'B01001_044E', 'B01001_045E', 'B01001_046E', 'B01001_047E', 'B01001_048E', 'B01001_049E', 'B01003_001E', 'B02001_006E', 'B03002_003E', 'B03002_004E', 'B03002_005E', 'B03002_006E', 'B03002_007E', 'B03002_012E', 'B12001_001E', 'B12001_005E', 'B12001_007E', 'B12001_009E', 'B12001_010E', 'B12001_016E', 'B12001_018E', 'B12001_019E', 'B15002_001E', 'B15002_003E', 'B15002_004E', 'B15002_005E', 'B15002_006E', 'B15002_007E', 'B15002_008E', 'B15002_009E', 'B15002_010E', 'B15002_015E', 'B15002_016E', 'B15002_017E', 'B15002_018E', 'B15002_020E', 'B15002_021E', 'B15002_022E', 'B15002_023E', 'B15002_024E', 'B15002_025E', 'B15002_026E', 'B15002_027E', 'B15002_032E', 'B15002_033E', 'B15002_034E', 'B15002_035E', 'B17010_001E', 'B17010_004E', 'B17010_011E', 'B17010_017E', 'B19001_001E', 'B19013_001E', 'B19301_001E', 'B21001_002E', 'B25002_001E', 'B25002_002E', 'B25002_003E', 'B25003_001E', 'B25003_002E', 'B25003_003E', 'B25024_001E', 'B25024_004E', 'B25024_005E', 'B25024_006E', 'B25024_007E', 'B25024_008E', 'B25024_009E', 'B25058_001E', 'B25077_001E', 'C24010_001E']\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import geopandas as gpd\n", + "from geosnap.io.util import _normalize_relation\n", + "\n", + "df21 = gpd.read_parquet(\"2021_bg/acs_demographic_profile_2021_bg.parquet\")\n", + "df22 = 
gpd.read_parquet(\"2022_bg/acs_demographic_profile_2022_bg.parquet\")\n", + "\n", + "vars_df = pd.read_csv(\"../geosnap/io/variables.csv\")\n", + "\n", + "needed = set()\n", + "for rel in vars_df[\"acs\"].dropna():\n", + " expr = _normalize_relation(rel)\n", + " pieces = (\n", + " expr.replace(\"+\", \",\")\n", + " .replace(\"-\", \",\")\n", + " .replace(\"(\", \"\")\n", + " .replace(\")\", \"\")\n", + " .split(\",\")\n", + " )\n", + " for piece in pieces:\n", + " piece = piece.strip()\n", + " if piece:\n", + " needed.add(piece)\n", + "\n", + "present21 = set(df21.columns)\n", + "present22 = set(df22.columns)\n", + "\n", + "missing21 = sorted(needed - present21)\n", + "missing22 = sorted(needed - present22)\n", + "\n", + "newly_missing_in_2022 = sorted((needed - present22) - (needed - present21))\n", + "fixed_in_2022 = sorted((needed - present21) - (needed - present22))\n", + "\n", + "print(\"Needed ACS vars:\", len(needed))\n", + "print(\"Missing in 2021:\", len(missing21))\n", + "print(\"Missing in 2022:\", len(missing22))\n", + "print(\"Newly missing in 2022:\", len(newly_missing_in_2022))\n", + "print(\"Missing in 2021 but present in 2022:\", len(fixed_in_2022))\n", + "\n", + "print(\"\\nFirst 100 newly missing in 2022:\")\n", + "print(newly_missing_in_2022[:100])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "328e5680-602f-4a4a-b501-829bf1263214", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/geosnap/io/util.py b/geosnap/io/util.py index 020eb62d..c5792f41 100644 --- a/geosnap/io/util.py 
+++ b/geosnap/io/util.py @@ -134,13 +134,14 @@ def convert_census_gdb( if gdb_path is None: warn("No `gdb_path` given. Data will be pulled from the Census server") gdb_path = f"https://www2.census.gov/geo/tiger/TIGER_DP/{year}ACS/ACS_{year}_5YR_{level.upper()}.gdb.zip" - if layers is None: # grab them all except the metadata + if layers is None: # grab them all except metadata layers year_suffix = year[-2:] meta_str = f"{level.upper()}_METADATA_20{year_suffix}" layers = [layer[0] for layer in ogr.list_layers(gdb_path)] - if meta_str in layers: - layers.remove(meta_str) - + layers = [ + layer for layer in layers + if layer != meta_str and not layer.endswith("_METADATA") + ] tables = list() existing_files = os.listdir(output_dir) for i in tqdm(layers): From b442ac4a8cf34b30e1d9dbf2a3cb62c5ab40013c Mon Sep 17 00:00:00 2001 From: Dylan Date: Thu, 9 Apr 2026 15:01:17 -0700 Subject: [PATCH 5/9] updated examine notebook --- build/examine_output.ipynb | 550 ++++++++++++++++++++++++++++++++++++- 1 file changed, 547 insertions(+), 3 deletions(-) diff --git a/build/examine_output.ipynb b/build/examine_output.ipynb index 08b25009..8adc8a65 100644 --- a/build/examine_output.ipynb +++ b/build/examine_output.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "id": "ec35a66c-96b3-402c-aae4-feac48a1e742", "metadata": {}, "outputs": [ @@ -62,7 +62,7 @@ "missing21 = sorted(needed - present21)\n", "missing22 = sorted(needed - present22)\n", "\n", - "newly_missing_in_2022 = sorted((needed - present22) - (needed - present21))\n", + "newly_missing_in_2022 = sorted(set(missing22) - set(missing21))\n", "fixed_in_2022 = sorted((needed - present21) - (needed - present22))\n", "\n", "print(\"Needed ACS vars:\", len(needed))\n", @@ -75,10 +75,554 @@ "print(newly_missing_in_2022[:100])" ] }, + { + "cell_type": "markdown", + "id": "328e5680-602f-4a4a-b501-829bf1263214", + "metadata": {}, + "source": [ + "# Are variables gone or just renamed?\n", + 
"per @knaaptime: Its possible these have different analogues/variable names in the new ACS but we'd need to dig more\n", + "\n", + "let's go dig more" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "42ace1ac-64d4-4918-899f-ccf1e285b215", + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "\n", + "vars2021 = requests.get(\"https://api.census.gov/data/2021/acs/acs5/variables.json\").json()[\"variables\"]\n", + "vars2022 = requests.get(\"https://api.census.gov/data/2022/acs/acs5/variables.json\").json()[\"variables\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e1730a77-37b0-431d-88d0-9bf9d092ce9c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Still exist in 2022: 89\n", + "Gone in 2022: 0\n" + ] + } + ], + "source": [ + "missing = newly_missing_in_2022\n", + "\n", + "still_exist = []\n", + "gone = []\n", + "\n", + "for var in missing:\n", + " if var in vars2022:\n", + " still_exist.append(var)\n", + " else:\n", + " gone.append(var)\n", + "\n", + "print(\"Still exist in 2022:\", len(still_exist))\n", + "print(\"Gone in 2022:\", len(gone))" + ] + }, + { + "cell_type": "markdown", + "id": "22e852ab-8064-4e81-ad82-5c5aca45db3b", + "metadata": {}, + "source": [ + "So like, the variables still exist per the metadata (variables json provided by the ACS), but they are not present where I expect them (tiger product). 
First, let's ID all the variables by their census identifiers:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "7d7c5d72-e33f-446e-86ff-f0bf309c683e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "B01001_003E -> Estimate!!Total:!!Male:!!Under 5 years\n", + "B01001_004E -> Estimate!!Total:!!Male:!!5 to 9 years\n", + "B01001_005E -> Estimate!!Total:!!Male:!!10 to 14 years\n", + "B01001_006E -> Estimate!!Total:!!Male:!!15 to 17 years\n", + "B01001_018E -> Estimate!!Total:!!Male:!!60 and 61 years\n", + "B01001_019E -> Estimate!!Total:!!Male:!!62 to 64 years\n", + "B01001_020E -> Estimate!!Total:!!Male:!!65 and 66 years\n", + "B01001_021E -> Estimate!!Total:!!Male:!!67 to 69 years\n", + "B01001_022E -> Estimate!!Total:!!Male:!!70 to 74 years\n", + "B01001_023E -> Estimate!!Total:!!Male:!!75 to 79 years\n", + "B01001_024E -> Estimate!!Total:!!Male:!!80 to 84 years\n", + "B01001_025E -> Estimate!!Total:!!Male:!!85 years and over\n", + "B01001_027E -> Estimate!!Total:!!Female:!!Under 5 years\n", + "B01001_028E -> Estimate!!Total:!!Female:!!5 to 9 years\n", + "B01001_029E -> Estimate!!Total:!!Female:!!10 to 14 years\n", + "B01001_030E -> Estimate!!Total:!!Female:!!15 to 17 years\n", + "B01001_042E -> Estimate!!Total:!!Female:!!60 and 61 years\n", + "B01001_043E -> Estimate!!Total:!!Female:!!62 to 64 years\n", + "B01001_044E -> Estimate!!Total:!!Female:!!65 and 66 years\n", + "B01001_045E -> Estimate!!Total:!!Female:!!67 to 69 years\n", + "B01001_046E -> Estimate!!Total:!!Female:!!70 to 74 years\n", + "B01001_047E -> Estimate!!Total:!!Female:!!75 to 79 years\n", + "B01001_048E -> Estimate!!Total:!!Female:!!80 to 84 years\n", + "B01001_049E -> Estimate!!Total:!!Female:!!85 years and over\n", + "B01003_001E -> Estimate!!Total\n", + "B02001_006E -> Estimate!!Total:!!Native Hawaiian and Other Pacific Islander alone\n", + "B03002_003E -> Estimate!!Total:!!Not Hispanic or Latino:!!White alone\n", + 
"B03002_004E -> Estimate!!Total:!!Not Hispanic or Latino:!!Black or African American alone\n", + "B03002_005E -> Estimate!!Total:!!Not Hispanic or Latino:!!American Indian and Alaska Native alone\n", + "B03002_006E -> Estimate!!Total:!!Not Hispanic or Latino:!!Asian alone\n", + "B03002_007E -> Estimate!!Total:!!Not Hispanic or Latino:!!Native Hawaiian and Other Pacific Islander alone\n", + "B03002_012E -> Estimate!!Total:!!Hispanic or Latino:\n", + "B12001_001E -> Estimate!!Total:\n", + "B12001_005E -> Estimate!!Total:!!Male:!!Now married:!!Married, spouse present\n", + "B12001_007E -> Estimate!!Total:!!Male:!!Now married:!!Married, spouse absent:!!Separated\n", + "B12001_009E -> Estimate!!Total:!!Male:!!Widowed\n", + "B12001_010E -> Estimate!!Total:!!Male:!!Divorced\n", + "B12001_016E -> Estimate!!Total:!!Female:!!Now married:!!Married, spouse absent:!!Separated\n", + "B12001_018E -> Estimate!!Total:!!Female:!!Widowed\n", + "B12001_019E -> Estimate!!Total:!!Female:!!Divorced\n", + "B15002_001E -> Estimate!!Total:\n", + "B15002_003E -> Estimate!!Total:!!Male:!!No schooling completed\n", + "B15002_004E -> Estimate!!Total:!!Male:!!Nursery to 4th grade\n", + "B15002_005E -> Estimate!!Total:!!Male:!!5th and 6th grade\n", + "B15002_006E -> Estimate!!Total:!!Male:!!7th and 8th grade\n", + "B15002_007E -> Estimate!!Total:!!Male:!!9th grade\n", + "B15002_008E -> Estimate!!Total:!!Male:!!10th grade\n", + "B15002_009E -> Estimate!!Total:!!Male:!!11th grade\n", + "B15002_010E -> Estimate!!Total:!!Male:!!12th grade, no diploma\n", + "B15002_015E -> Estimate!!Total:!!Male:!!Bachelor's degree\n", + "B15002_016E -> Estimate!!Total:!!Male:!!Master's degree\n", + "B15002_017E -> Estimate!!Total:!!Male:!!Professional school degree\n", + "B15002_018E -> Estimate!!Total:!!Male:!!Doctorate degree\n", + "B15002_020E -> Estimate!!Total:!!Female:!!No schooling completed\n", + "B15002_021E -> Estimate!!Total:!!Female:!!Nursery to 4th grade\n", + "B15002_022E -> 
Estimate!!Total:!!Female:!!5th and 6th grade\n", + "B15002_023E -> Estimate!!Total:!!Female:!!7th and 8th grade\n", + "B15002_024E -> Estimate!!Total:!!Female:!!9th grade\n", + "B15002_025E -> Estimate!!Total:!!Female:!!10th grade\n", + "B15002_026E -> Estimate!!Total:!!Female:!!11th grade\n", + "B15002_027E -> Estimate!!Total:!!Female:!!12th grade, no diploma\n", + "B15002_032E -> Estimate!!Total:!!Female:!!Bachelor's degree\n", + "B15002_033E -> Estimate!!Total:!!Female:!!Master's degree\n", + "B15002_034E -> Estimate!!Total:!!Female:!!Professional school degree\n", + "B15002_035E -> Estimate!!Total:!!Female:!!Doctorate degree\n", + "B17010_001E -> Estimate!!Total:\n", + "B17010_004E -> Estimate!!Total:!!Income in the past 12 months below poverty level:!!Married-couple family:!!With related children of the householder under 18 years:\n", + "B17010_011E -> Estimate!!Total:!!Income in the past 12 months below poverty level:!!Other family:!!Male householder, no spouse present:!!With related children of the householder under 18 years:\n", + "B17010_017E -> Estimate!!Total:!!Income in the past 12 months below poverty level:!!Other family:!!Female householder, no spouse present:!!With related children of the householder under 18 years:\n", + "B19001_001E -> Estimate!!Total:\n", + "B19013_001E -> Estimate!!Median household income in the past 12 months (in 2022 inflation-adjusted dollars)\n", + "B19301_001E -> Estimate!!Per capita income in the past 12 months (in 2022 inflation-adjusted dollars)\n", + "B21001_002E -> Estimate!!Total:!!Veteran\n", + "B25002_001E -> Estimate!!Total:\n", + "B25002_002E -> Estimate!!Total:!!Occupied\n", + "B25002_003E -> Estimate!!Total:!!Vacant\n", + "B25003_001E -> Estimate!!Total:\n", + "B25003_002E -> Estimate!!Total:!!Owner occupied\n", + "B25003_003E -> Estimate!!Total:!!Renter occupied\n", + "B25024_001E -> Estimate!!Total:\n", + "B25024_004E -> Estimate!!Total:!!2\n", + "B25024_005E -> Estimate!!Total:!!3 or 4\n", + "B25024_006E -> 
Estimate!!Total:!!5 to 9\n", + "B25024_007E -> Estimate!!Total:!!10 to 19\n", + "B25024_008E -> Estimate!!Total:!!20 to 49\n", + "B25024_009E -> Estimate!!Total:!!50 or more\n", + "B25058_001E -> Estimate!!Median contract rent\n", + "B25077_001E -> Estimate!!Median value (dollars)\n", + "C24010_001E -> Estimate!!Total:\n" + ] + } + ], + "source": [ + "for var in still_exist:\n", + " print(var, \"->\", vars2022[var][\"label\"])" + ] + }, + { + "cell_type": "markdown", + "id": "f563f412-019a-464f-828b-f7ef1bf0f680", + "metadata": {}, + "source": [ + "# Group by census table\n", + "Now, let's group those into their repsective census tables to figure out what still needs to be missing" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "a9b932cb-6f06-4c1e-8b81-bc47e95e9237", + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "from collections import defaultdict" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "c6422685-f1be-4cdd-bb90-2dab660a20d8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Unique table groups: 17\n", + "B01001: 24 vars\n", + "B01003: 1 vars\n", + "B02001: 1 vars\n", + "B03002: 6 vars\n", + "B12001: 8 vars\n", + "B15002: 25 vars\n", + "B17010: 4 vars\n", + "B19001: 1 vars\n", + "B19013: 1 vars\n", + "B19301: 1 vars\n", + "B21001: 1 vars\n", + "B25002: 3 vars\n", + "B25003: 3 vars\n", + "B25024: 7 vars\n", + "B25058: 1 vars\n", + "B25077: 1 vars\n", + "C24010: 1 vars\n" + ] + } + ], + "source": [ + "def variable_to_table_group(var: str) -> str | None:\n", + " \"\"\"\n", + " Convert an ACS variable like B01003_001E to its table/group name B01003.\n", + " \"\"\"\n", + " m = re.match(r\"^([A-Z0-9]+)_\\d+[A-Z]$\", var)\n", + " if m:\n", + " return m.group(1)\n", + " return None\n", + "\n", + "\n", + "def group_variables_by_table(vars_list: list[str]) -> dict[str, list[str]]:\n", + " groups = defaultdict(list)\n", + " unparsed = []\n", + "\n", + " 
for var in sorted(set(vars_list)):\n", + " group = variable_to_table_group(var)\n", + " if group is None:\n", + " unparsed.append(var)\n", + " else:\n", + " groups[group].append(var)\n", + "\n", + " if unparsed:\n", + " print(\"Could not parse these variables:\")\n", + " for var in unparsed:\n", + " print(\" \", var)\n", + "\n", + " return dict(sorted(groups.items()))\n", + "\n", + "\n", + "groups = group_variables_by_table(newly_missing_in_2022)\n", + "\n", + "print(f\"Unique table groups: {len(groups)}\")\n", + "for group, vars_ in groups.items():\n", + " print(f\"{group}: {len(vars_)} vars\")" + ] + }, + { + "cell_type": "markdown", + "id": "a41071b9-a655-4061-be9d-643f939aead7", + "metadata": {}, + "source": [ + "## Another way to inspect the tables/groups" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "328e5680-602f-4a4a-b501-829bf1263214", + "id": "5b1cdaaa-fe11-423a-ab8d-3b65ef31c1b5", + "metadata": {}, + "outputs": [], + "source": [ + "def describe_group(group: str, vars_meta: dict) -> pd.DataFrame:\n", + " rows = []\n", + " for var, meta in vars_meta.items():\n", + " if var.startswith(f\"{group}_\"):\n", + " rows.append(\n", + " {\n", + " \"variable\": var,\n", + " \"label\": meta.get(\"label\"),\n", + " \"concept\": meta.get(\"concept\"),\n", + " \"predicateType\": meta.get(\"predicateType\"),\n", + " \"group\": meta.get(\"group\"),\n", + " }\n", + " )\n", + " return pd.DataFrame(rows).sort_values(\"variable\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "500ab2ae-734e-4c33-80b2-840dd3278ac7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
variablelabelconceptpredicateTypegroup
40B01001_001EEstimate!!Total:Sex by AgeintB01001
39B01001_002EEstimate!!Total:!!Male:Sex by AgeintB01001
42B01001_003EEstimate!!Total:!!Male:!!Under 5 yearsSex by AgeintB01001
41B01001_004EEstimate!!Total:!!Male:!!5 to 9 yearsSex by AgeintB01001
45B01001_005EEstimate!!Total:!!Male:!!10 to 14 yearsSex by AgeintB01001
43B01001_006EEstimate!!Total:!!Male:!!15 to 17 yearsSex by AgeintB01001
44B01001_007EEstimate!!Total:!!Male:!!18 and 19 yearsSex by AgeintB01001
47B01001_008EEstimate!!Total:!!Male:!!20 yearsSex by AgeintB01001
46B01001_009EEstimate!!Total:!!Male:!!21 yearsSex by AgeintB01001
48B01001_010EEstimate!!Total:!!Male:!!22 to 24 yearsSex by AgeintB01001
1B01001_011EEstimate!!Total:!!Male:!!25 to 29 yearsSex by AgeintB01001
0B01001_012EEstimate!!Total:!!Male:!!30 to 34 yearsSex by AgeintB01001
3B01001_013EEstimate!!Total:!!Male:!!35 to 39 yearsSex by AgeintB01001
2B01001_014EEstimate!!Total:!!Male:!!40 to 44 yearsSex by AgeintB01001
5B01001_015EEstimate!!Total:!!Male:!!45 to 49 yearsSex by AgeintB01001
4B01001_016EEstimate!!Total:!!Male:!!50 to 54 yearsSex by AgeintB01001
7B01001_017EEstimate!!Total:!!Male:!!55 to 59 yearsSex by AgeintB01001
8B01001_018EEstimate!!Total:!!Male:!!60 and 61 yearsSex by AgeintB01001
6B01001_019EEstimate!!Total:!!Male:!!62 to 64 yearsSex by AgeintB01001
9B01001_020EEstimate!!Total:!!Male:!!65 and 66 yearsSex by AgeintB01001
\n", + "
" + ], + "text/plain": [ + " variable label concept \\\n", + "40 B01001_001E Estimate!!Total: Sex by Age \n", + "39 B01001_002E Estimate!!Total:!!Male: Sex by Age \n", + "42 B01001_003E Estimate!!Total:!!Male:!!Under 5 years Sex by Age \n", + "41 B01001_004E Estimate!!Total:!!Male:!!5 to 9 years Sex by Age \n", + "45 B01001_005E Estimate!!Total:!!Male:!!10 to 14 years Sex by Age \n", + "43 B01001_006E Estimate!!Total:!!Male:!!15 to 17 years Sex by Age \n", + "44 B01001_007E Estimate!!Total:!!Male:!!18 and 19 years Sex by Age \n", + "47 B01001_008E Estimate!!Total:!!Male:!!20 years Sex by Age \n", + "46 B01001_009E Estimate!!Total:!!Male:!!21 years Sex by Age \n", + "48 B01001_010E Estimate!!Total:!!Male:!!22 to 24 years Sex by Age \n", + "1 B01001_011E Estimate!!Total:!!Male:!!25 to 29 years Sex by Age \n", + "0 B01001_012E Estimate!!Total:!!Male:!!30 to 34 years Sex by Age \n", + "3 B01001_013E Estimate!!Total:!!Male:!!35 to 39 years Sex by Age \n", + "2 B01001_014E Estimate!!Total:!!Male:!!40 to 44 years Sex by Age \n", + "5 B01001_015E Estimate!!Total:!!Male:!!45 to 49 years Sex by Age \n", + "4 B01001_016E Estimate!!Total:!!Male:!!50 to 54 years Sex by Age \n", + "7 B01001_017E Estimate!!Total:!!Male:!!55 to 59 years Sex by Age \n", + "8 B01001_018E Estimate!!Total:!!Male:!!60 and 61 years Sex by Age \n", + "6 B01001_019E Estimate!!Total:!!Male:!!62 to 64 years Sex by Age \n", + "9 B01001_020E Estimate!!Total:!!Male:!!65 and 66 years Sex by Age \n", + "\n", + " predicateType group \n", + "40 int B01001 \n", + "39 int B01001 \n", + "42 int B01001 \n", + "41 int B01001 \n", + "45 int B01001 \n", + "43 int B01001 \n", + "44 int B01001 \n", + "47 int B01001 \n", + "46 int B01001 \n", + "48 int B01001 \n", + "1 int B01001 \n", + "0 int B01001 \n", + "3 int B01001 \n", + "2 int B01001 \n", + "5 int B01001 \n", + "4 int B01001 \n", + "7 int B01001 \n", + "8 int B01001 \n", + "6 int B01001 \n", + "9 int B01001 " + ] + }, + "execution_count": 16, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "# Example:\n", + "describe_group(\"B01001\", vars2022).head(20)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "531953dc-7766-414a-b428-49a24df68742", "metadata": {}, "outputs": [], "source": [] From ba613fabf56d489fbbdf45a2d1af89d6a2006038 Mon Sep 17 00:00:00 2001 From: Dylan Date: Mon, 13 Apr 2026 13:37:05 -0700 Subject: [PATCH 6/9] diagnostic of 2022 col names --- build/examine_output.ipynb | 5128 +++++++++++++++++++++++++++++++++++- 1 file changed, 5039 insertions(+), 89 deletions(-) diff --git a/build/examine_output.ipynb b/build/examine_output.ipynb index 8adc8a65..9d0587c3 100644 --- a/build/examine_output.ipynb +++ b/build/examine_output.ipynb @@ -10,82 +10,6 @@ "use this notebook to see how effective the processing is for 2022." ] }, - { - "cell_type": "code", - "execution_count": 5, - "id": "ec35a66c-96b3-402c-aae4-feac48a1e742", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Needed ACS vars: 260\n", - "Missing in 2021: 171\n", - "Missing in 2022: 260\n", - "Newly missing in 2022: 89\n", - "Missing in 2021 but present in 2022: 0\n", - "\n", - "First 100 newly missing in 2022:\n", - "['B01001_003E', 'B01001_004E', 'B01001_005E', 'B01001_006E', 'B01001_018E', 'B01001_019E', 'B01001_020E', 'B01001_021E', 'B01001_022E', 'B01001_023E', 'B01001_024E', 'B01001_025E', 'B01001_027E', 'B01001_028E', 'B01001_029E', 'B01001_030E', 'B01001_042E', 'B01001_043E', 'B01001_044E', 'B01001_045E', 'B01001_046E', 'B01001_047E', 'B01001_048E', 'B01001_049E', 'B01003_001E', 'B02001_006E', 'B03002_003E', 'B03002_004E', 'B03002_005E', 'B03002_006E', 'B03002_007E', 'B03002_012E', 'B12001_001E', 'B12001_005E', 'B12001_007E', 'B12001_009E', 'B12001_010E', 'B12001_016E', 'B12001_018E', 'B12001_019E', 'B15002_001E', 'B15002_003E', 'B15002_004E', 'B15002_005E', 'B15002_006E', 'B15002_007E', 'B15002_008E', 'B15002_009E', 'B15002_010E', 'B15002_015E', 
'B15002_016E', 'B15002_017E', 'B15002_018E', 'B15002_020E', 'B15002_021E', 'B15002_022E', 'B15002_023E', 'B15002_024E', 'B15002_025E', 'B15002_026E', 'B15002_027E', 'B15002_032E', 'B15002_033E', 'B15002_034E', 'B15002_035E', 'B17010_001E', 'B17010_004E', 'B17010_011E', 'B17010_017E', 'B19001_001E', 'B19013_001E', 'B19301_001E', 'B21001_002E', 'B25002_001E', 'B25002_002E', 'B25002_003E', 'B25003_001E', 'B25003_002E', 'B25003_003E', 'B25024_001E', 'B25024_004E', 'B25024_005E', 'B25024_006E', 'B25024_007E', 'B25024_008E', 'B25024_009E', 'B25058_001E', 'B25077_001E', 'C24010_001E']\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "import geopandas as gpd\n", - "from geosnap.io.util import _normalize_relation\n", - "\n", - "df21 = gpd.read_parquet(\"2021_bg/acs_demographic_profile_2021_bg.parquet\")\n", - "df22 = gpd.read_parquet(\"2022_bg/acs_demographic_profile_2022_bg.parquet\")\n", - "\n", - "vars_df = pd.read_csv(\"../geosnap/io/variables.csv\")\n", - "\n", - "needed = set()\n", - "for rel in vars_df[\"acs\"].dropna():\n", - " expr = _normalize_relation(rel)\n", - " pieces = (\n", - " expr.replace(\"+\", \",\")\n", - " .replace(\"-\", \",\")\n", - " .replace(\"(\", \"\")\n", - " .replace(\")\", \"\")\n", - " .split(\",\")\n", - " )\n", - " for piece in pieces:\n", - " piece = piece.strip()\n", - " if piece:\n", - " needed.add(piece)\n", - "\n", - "present21 = set(df21.columns)\n", - "present22 = set(df22.columns)\n", - "\n", - "missing21 = sorted(needed - present21)\n", - "missing22 = sorted(needed - present22)\n", - "\n", - "newly_missing_in_2022 = sorted(set(missing22) - set(missing21))\n", - "fixed_in_2022 = sorted((needed - present21) - (needed - present22))\n", - "\n", - "print(\"Needed ACS vars:\", len(needed))\n", - "print(\"Missing in 2021:\", len(missing21))\n", - "print(\"Missing in 2022:\", len(missing22))\n", - "print(\"Newly missing in 2022:\", len(newly_missing_in_2022))\n", - "print(\"Missing in 2021 but present in 2022:\", 
len(fixed_in_2022))\n", - "\n", - "print(\"\\nFirst 100 newly missing in 2022:\")\n", - "print(newly_missing_in_2022[:100])" - ] - }, - { - "cell_type": "markdown", - "id": "328e5680-602f-4a4a-b501-829bf1263214", - "metadata": {}, - "source": [ - "# Are variables gone or just renamed?\n", - "per @knaaptime: Its possible these have different analogues/variable names in the new ACS but we'd need to dig more\n", - "\n", - "let's go dig more" - ] - }, { "cell_type": "code", "execution_count": 6, @@ -245,15 +169,6 @@ " print(var, \"->\", vars2022[var][\"label\"])" ] }, - { - "cell_type": "markdown", - "id": "f563f412-019a-464f-828b-f7ef1bf0f680", - "metadata": {}, - "source": [ - "# Group by census table\n", - "Now, let's group those into their repsective census tables to figure out what still needs to be missing" - ] - }, { "cell_type": "code", "execution_count": 13, @@ -336,7 +251,9 @@ { "cell_type": "markdown", "id": "a41071b9-a655-4061-be9d-643f939aead7", - "metadata": {}, + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, "source": [ "## Another way to inspect the tables/groups" ] @@ -620,12 +537,5045 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "id": "531953dc-7766-414a-b428-49a24df68742", "metadata": {}, + "source": [ + "## Reclaim new naming format\n", + "\n", + "Follow Eli's comment on the PR:\n", + "\n", + "`ok, now that i've looked at ont of the 2022 tables in the geodatabase, the reason you're getting no results is the naming convention has changed. Your PR includes an update for the geoid column, but there are other systematic changes. In the new tables, the variables are named (as an example): B02001_E001. 
We need to have processing that anticipates this format, then converts it to the canonical form (like the json tables, B02001_001E (where E/M is the final character of the variable rather than the leading character)`" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "86339013-67d1-4f85-a213-a82608665c59", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import geopandas as gpd\n", + "\n", + "import re\n", + "from pathlib import Path\n", + "import pyarrow.parquet as pq\n", + "\n", + "from IPython.display import display" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "fe5f95eb-4a88-418b-82ee-9f7b21555570", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2021 dir: /home/dylan/projects/geosnap/build/2021_bg\n", + "2022 dir: /home/dylan/projects/geosnap/build/2022_bg\n", + "Report dir: /home/dylan/projects/geosnap/build/reports\n" + ] + } + ], + "source": [ + "# Adjust this path if needed\n", + "BUILD_ROOT = Path(\"../build\")\n", + "\n", + "DIR_2021 = BUILD_ROOT / \"2021_bg\"\n", + "DIR_2022 = BUILD_ROOT / \"2022_bg\"\n", + "REPORT_DIR = BUILD_ROOT / \"reports\"\n", + "REPORT_DIR.mkdir(parents=True, exist_ok=True)\n", + "\n", + "print(\"2021 dir:\", DIR_2021.resolve())\n", + "print(\"2022 dir:\", DIR_2022.resolve())\n", + "print(\"Report dir:\", REPORT_DIR.resolve())" + ] + }, + { + "cell_type": "markdown", + "id": "2b02ca80-1327-4041-94de-56bb39bdf512", + "metadata": {}, + "source": [ + "## Inspect the new naming on one file" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "7c4c5ad9-bade-4dfb-918e-ee3b49afbb50", + "metadata": {}, + "outputs": [], + "source": [ + "test1 = pd.read_parquet(f'{DIR_2022}/acs_2022_X14_SCHOOL_ENROLLMENT_bg.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "6adaade2-f526-49b4-b798-5660fe8773ac", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
B14002_E001B14002_E002B14002_E003B14002_E004B14002_E005B14002_E006B14002_E007B14002_E008B14002_E009B14002_E010...B14007I_E010B14007I_E011B14007I_E012B14007I_E013B14007I_E014B14007I_E015B14007I_E016B14007I_E017B14007I_E018B14007I_E019
GEOIDFQ
1500000US0101795480021375.0529.045.024.024.00.00.00.00.00.0...0.00.00.014.00.00.00.00.00.021.0
1500000US010179548004773.0409.038.00.00.00.00.00.00.027.0...0.00.00.00.00.00.00.00.00.030.0
1500000US010179548003281.085.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.05.0
1500000US010150011031539.0321.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
1500000US010150024003970.0421.093.00.00.00.00.00.00.033.0...0.00.00.00.00.00.00.00.00.00.0
\n", + "

5 rows × 268 columns

\n", + "
" + ], + "text/plain": [ + " B14002_E001 B14002_E002 B14002_E003 B14002_E004 \\\n", + "GEOIDFQ \n", + "1500000US010179548002 1375.0 529.0 45.0 24.0 \n", + "1500000US010179548004 773.0 409.0 38.0 0.0 \n", + "1500000US010179548003 281.0 85.0 0.0 0.0 \n", + "1500000US010150011031 539.0 321.0 0.0 0.0 \n", + "1500000US010150024003 970.0 421.0 93.0 0.0 \n", + "\n", + " B14002_E005 B14002_E006 B14002_E007 B14002_E008 \\\n", + "GEOIDFQ \n", + "1500000US010179548002 24.0 0.0 0.0 0.0 \n", + "1500000US010179548004 0.0 0.0 0.0 0.0 \n", + "1500000US010179548003 0.0 0.0 0.0 0.0 \n", + "1500000US010150011031 0.0 0.0 0.0 0.0 \n", + "1500000US010150024003 0.0 0.0 0.0 0.0 \n", + "\n", + " B14002_E009 B14002_E010 ... B14007I_E010 \\\n", + "GEOIDFQ ... \n", + "1500000US010179548002 0.0 0.0 ... 0.0 \n", + "1500000US010179548004 0.0 27.0 ... 0.0 \n", + "1500000US010179548003 0.0 0.0 ... 0.0 \n", + "1500000US010150011031 0.0 0.0 ... 0.0 \n", + "1500000US010150024003 0.0 33.0 ... 0.0 \n", + "\n", + " B14007I_E011 B14007I_E012 B14007I_E013 B14007I_E014 \\\n", + "GEOIDFQ \n", + "1500000US010179548002 0.0 0.0 14.0 0.0 \n", + "1500000US010179548004 0.0 0.0 0.0 0.0 \n", + "1500000US010179548003 0.0 0.0 0.0 0.0 \n", + "1500000US010150011031 0.0 0.0 0.0 0.0 \n", + "1500000US010150024003 0.0 0.0 0.0 0.0 \n", + "\n", + " B14007I_E015 B14007I_E016 B14007I_E017 B14007I_E018 \\\n", + "GEOIDFQ \n", + "1500000US010179548002 0.0 0.0 0.0 0.0 \n", + "1500000US010179548004 0.0 0.0 0.0 0.0 \n", + "1500000US010179548003 0.0 0.0 0.0 0.0 \n", + "1500000US010150011031 0.0 0.0 0.0 0.0 \n", + "1500000US010150024003 0.0 0.0 0.0 0.0 \n", + "\n", + " B14007I_E019 \n", + "GEOIDFQ \n", + "1500000US010179548002 21.0 \n", + "1500000US010179548004 30.0 \n", + "1500000US010179548003 5.0 \n", + "1500000US010150011031 0.0 \n", + "1500000US010150024003 0.0 \n", + "\n", + "[5 rows x 268 columns]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test1.head()" + ] + 
}, + { + "cell_type": "code", + "execution_count": 28, + "id": "d21c2ddf-80fc-4e5f-92ac-7cccc49c880d", + "metadata": {}, "outputs": [], - "source": [] + "source": [ + "test1_2021 = pd.read_parquet(f'{DIR_2021}/acs_2021_X14_SCHOOL_ENROLLMENT_bg.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "3d7aae74-69e3-4b3f-aba1-6a222190e1f7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
B14002_001EB14002_002EB14002_003EB14002_004EB14002_005EB14002_006EB14002_007EB14002_008EB14002_009EB14002_010E...B14007I_010EB14007I_011EB14007I_012EB14007I_013EB14007I_014EB14007I_015EB14007I_016EB14007I_017EB14007I_018EB14007I_019E
GEOID
010010201001691.0296.046.04.00.04.00.00.00.031.0...0.00.00.00.00.00.00.00.00.013.0
0100102010021038.0558.0145.00.00.00.00.00.00.087.0...0.00.00.00.00.00.00.00.00.09.0
010010202001782.0324.077.07.00.07.00.00.00.029.0...0.00.00.00.00.00.00.00.00.00.0
0100102020021146.0703.067.00.00.00.00.00.00.028.0...0.00.00.00.00.00.00.00.00.00.0
0100102030012667.01256.0329.00.00.00.025.025.00.0117.0...0.00.00.00.00.00.00.02.00.05.0
\n", + "

5 rows × 268 columns

\n", + "
" + ], + "text/plain": [ + " B14002_001E B14002_002E B14002_003E B14002_004E B14002_005E \\\n", + "GEOID \n", + "010010201001 691.0 296.0 46.0 4.0 0.0 \n", + "010010201002 1038.0 558.0 145.0 0.0 0.0 \n", + "010010202001 782.0 324.0 77.0 7.0 0.0 \n", + "010010202002 1146.0 703.0 67.0 0.0 0.0 \n", + "010010203001 2667.0 1256.0 329.0 0.0 0.0 \n", + "\n", + " B14002_006E B14002_007E B14002_008E B14002_009E B14002_010E \\\n", + "GEOID \n", + "010010201001 4.0 0.0 0.0 0.0 31.0 \n", + "010010201002 0.0 0.0 0.0 0.0 87.0 \n", + "010010202001 7.0 0.0 0.0 0.0 29.0 \n", + "010010202002 0.0 0.0 0.0 0.0 28.0 \n", + "010010203001 0.0 25.0 25.0 0.0 117.0 \n", + "\n", + " ... B14007I_010E B14007I_011E B14007I_012E B14007I_013E \\\n", + "GEOID ... \n", + "010010201001 ... 0.0 0.0 0.0 0.0 \n", + "010010201002 ... 0.0 0.0 0.0 0.0 \n", + "010010202001 ... 0.0 0.0 0.0 0.0 \n", + "010010202002 ... 0.0 0.0 0.0 0.0 \n", + "010010203001 ... 0.0 0.0 0.0 0.0 \n", + "\n", + " B14007I_014E B14007I_015E B14007I_016E B14007I_017E \\\n", + "GEOID \n", + "010010201001 0.0 0.0 0.0 0.0 \n", + "010010201002 0.0 0.0 0.0 0.0 \n", + "010010202001 0.0 0.0 0.0 0.0 \n", + "010010202002 0.0 0.0 0.0 0.0 \n", + "010010203001 0.0 0.0 0.0 2.0 \n", + "\n", + " B14007I_018E B14007I_019E \n", + "GEOID \n", + "010010201001 0.0 13.0 \n", + "010010201002 0.0 9.0 \n", + "010010202001 0.0 0.0 \n", + "010010202002 0.0 0.0 \n", + "010010203001 0.0 5.0 \n", + "\n", + "[5 rows x 268 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test1_2021.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "e28b0089-3da0-4794-8bb6-fa606d5f5e8b", + "metadata": {}, + "outputs": [], + "source": [ + "test2 = pd.read_parquet(f'{DIR_2022}/acs_2022_X03_HISPANIC_OR_LATINO_ORIGIN_bg.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "d4700f62-dc2e-4097-bea6-78b9b6c4e482", + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
B03002_E001B03002_E002B03002_E003B03002_E004B03002_E005B03002_E006B03002_E007B03002_E008B03002_E009B03002_E010...B03002_E015B03002_E016B03002_E017B03002_E018B03002_E019B03002_E020B03002_E021B03003_E001B03003_E002B03003_E003
GEOIDFQ
1500000US0101795480021375.01340.0149.01191.00.00.00.00.00.00.0...0.00.00.00.00.00.00.01375.01340.035.0
1500000US010179548004797.0767.0450.0314.00.00.03.00.00.00.0...0.00.00.030.00.00.00.0797.0767.030.0
1500000US010179548003281.0276.0138.0138.00.00.00.00.00.00.0...0.00.00.05.00.00.00.0281.0276.05.0
1500000US010150011031560.0560.0560.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.0560.0560.00.0
1500000US0101500240031003.01003.0871.045.00.00.00.00.087.00.0...0.00.00.00.00.00.00.01003.01003.00.0
\n", + "

5 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + " B03002_E001 B03002_E002 B03002_E003 B03002_E004 \\\n", + "GEOIDFQ \n", + "1500000US010179548002 1375.0 1340.0 149.0 1191.0 \n", + "1500000US010179548004 797.0 767.0 450.0 314.0 \n", + "1500000US010179548003 281.0 276.0 138.0 138.0 \n", + "1500000US010150011031 560.0 560.0 560.0 0.0 \n", + "1500000US010150024003 1003.0 1003.0 871.0 45.0 \n", + "\n", + " B03002_E005 B03002_E006 B03002_E007 B03002_E008 \\\n", + "GEOIDFQ \n", + "1500000US010179548002 0.0 0.0 0.0 0.0 \n", + "1500000US010179548004 0.0 0.0 3.0 0.0 \n", + "1500000US010179548003 0.0 0.0 0.0 0.0 \n", + "1500000US010150011031 0.0 0.0 0.0 0.0 \n", + "1500000US010150024003 0.0 0.0 0.0 0.0 \n", + "\n", + " B03002_E009 B03002_E010 ... B03002_E015 \\\n", + "GEOIDFQ ... \n", + "1500000US010179548002 0.0 0.0 ... 0.0 \n", + "1500000US010179548004 0.0 0.0 ... 0.0 \n", + "1500000US010179548003 0.0 0.0 ... 0.0 \n", + "1500000US010150011031 0.0 0.0 ... 0.0 \n", + "1500000US010150024003 87.0 0.0 ... 0.0 \n", + "\n", + " B03002_E016 B03002_E017 B03002_E018 B03002_E019 \\\n", + "GEOIDFQ \n", + "1500000US010179548002 0.0 0.0 0.0 0.0 \n", + "1500000US010179548004 0.0 0.0 30.0 0.0 \n", + "1500000US010179548003 0.0 0.0 5.0 0.0 \n", + "1500000US010150011031 0.0 0.0 0.0 0.0 \n", + "1500000US010150024003 0.0 0.0 0.0 0.0 \n", + "\n", + " B03002_E020 B03002_E021 B03003_E001 B03003_E002 \\\n", + "GEOIDFQ \n", + "1500000US010179548002 0.0 0.0 1375.0 1340.0 \n", + "1500000US010179548004 0.0 0.0 797.0 767.0 \n", + "1500000US010179548003 0.0 0.0 281.0 276.0 \n", + "1500000US010150011031 0.0 0.0 560.0 560.0 \n", + "1500000US010150024003 0.0 0.0 1003.0 1003.0 \n", + "\n", + " B03003_E003 \n", + "GEOIDFQ \n", + "1500000US010179548002 35.0 \n", + "1500000US010179548004 30.0 \n", + "1500000US010179548003 5.0 \n", + "1500000US010150011031 0.0 \n", + "1500000US010150024003 0.0 \n", + "\n", + "[5 rows x 24 columns]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "test2.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "a2f23636-b9d8-47fb-90a2-3b5c94b12cbe", + "metadata": {}, + "outputs": [], + "source": [ + "test2_2021 = pd.read_parquet(f'{DIR_2021}/acs_2021_X03_HISPANIC_OR_LATINO_ORIGIN_bg.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "b39f8d07-ad85-40b2-94ba-425625aa3c75", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
B03002_001EB03002_002EB03002_003EB03002_004EB03002_005EB03002_006EB03002_007EB03002_008EB03002_009EB03002_010E...B03002_015EB03002_016EB03002_017EB03002_018EB03002_019EB03002_020EB03002_021EB03003_001EB03003_002EB03003_003E
GEOID
010010201001693.0674.0587.016.00.00.00.00.071.00.0...0.00.00.00.00.00.00.0693.0674.019.0
0100102010021098.01089.0887.0155.00.038.00.00.09.00.0...0.00.00.00.00.00.00.01098.01089.09.0
010010202001844.0834.0336.0421.00.00.00.00.077.00.0...0.00.00.00.00.00.00.0844.0834.010.0
0100102020021166.01166.0439.0667.00.00.00.08.052.027.0...0.00.00.00.00.00.00.01166.01166.00.0
0100102030012685.02672.02011.0531.00.026.00.00.0104.00.0...0.00.00.07.06.06.00.02685.02672.013.0
\n", + "

5 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + " B03002_001E B03002_002E B03002_003E B03002_004E B03002_005E \\\n", + "GEOID \n", + "010010201001 693.0 674.0 587.0 16.0 0.0 \n", + "010010201002 1098.0 1089.0 887.0 155.0 0.0 \n", + "010010202001 844.0 834.0 336.0 421.0 0.0 \n", + "010010202002 1166.0 1166.0 439.0 667.0 0.0 \n", + "010010203001 2685.0 2672.0 2011.0 531.0 0.0 \n", + "\n", + " B03002_006E B03002_007E B03002_008E B03002_009E B03002_010E \\\n", + "GEOID \n", + "010010201001 0.0 0.0 0.0 71.0 0.0 \n", + "010010201002 38.0 0.0 0.0 9.0 0.0 \n", + "010010202001 0.0 0.0 0.0 77.0 0.0 \n", + "010010202002 0.0 0.0 8.0 52.0 27.0 \n", + "010010203001 26.0 0.0 0.0 104.0 0.0 \n", + "\n", + " ... B03002_015E B03002_016E B03002_017E B03002_018E \\\n", + "GEOID ... \n", + "010010201001 ... 0.0 0.0 0.0 0.0 \n", + "010010201002 ... 0.0 0.0 0.0 0.0 \n", + "010010202001 ... 0.0 0.0 0.0 0.0 \n", + "010010202002 ... 0.0 0.0 0.0 0.0 \n", + "010010203001 ... 0.0 0.0 0.0 7.0 \n", + "\n", + " B03002_019E B03002_020E B03002_021E B03003_001E B03003_002E \\\n", + "GEOID \n", + "010010201001 0.0 0.0 0.0 693.0 674.0 \n", + "010010201002 0.0 0.0 0.0 1098.0 1089.0 \n", + "010010202001 0.0 0.0 0.0 844.0 834.0 \n", + "010010202002 0.0 0.0 0.0 1166.0 1166.0 \n", + "010010203001 6.0 6.0 0.0 2685.0 2672.0 \n", + "\n", + " B03003_003E \n", + "GEOID \n", + "010010201001 19.0 \n", + "010010201002 9.0 \n", + "010010202001 10.0 \n", + "010010202002 0.0 \n", + "010010203001 13.0 \n", + "\n", + "[5 rows x 24 columns]" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test2_2021.head()" + ] + }, + { + "cell_type": "markdown", + "id": "0d85ad48-1e93-417e-a8d1-2d97f728fce8", + "metadata": {}, + "source": [ + "It would be really cool if the 'E' moving was the only naming convention change with the new vintage" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "0346257d-4d09-4e46-8692-856895d490dc", + "metadata": {}, + "outputs": [], + 
"source": [ + "# New 2022-style ACS naming:\n", + "# B02001_E001\n", + "# B02001_M001\n", + "NEW_STYLE_ACS_RE = re.compile(r\"^([A-Z0-9]+)_([EM])(\\d{3})$\", re.IGNORECASE)\n", + "\n", + "# Canonical ACS naming:\n", + "# B02001_001E\n", + "# B02001_001M\n", + "CANONICAL_ACS_RE = re.compile(r\"^([A-Z0-9]+)_(\\d{3})([EM])$\", re.IGNORECASE)\n", + "\n", + "# Flexible GEOID-like matcher\n", + "GEOID_RE = re.compile(r\"^GEOID([A-Z_].*)?$\", re.IGNORECASE)\n", + "\n", + "\n", + "def read_parquet_columns(path: Path) -> list[str]:\n", + " \"\"\"Read parquet schema only, not the data.\"\"\"\n", + " schema = pq.ParquetFile(path).schema_arrow\n", + " return schema.names\n", + "\n", + "\n", + "def canonicalize_column(col: str) -> str:\n", + " \"\"\"\n", + " Normalize ACS variable names to canonical form.\n", + "\n", + " Examples:\n", + " B02001_E001 -> B02001_001E\n", + " B02001_M001 -> B02001_001M\n", + " B02001_001E -> B02001_001E\n", + " \"\"\"\n", + " c = col.strip()\n", + "\n", + " m = NEW_STYLE_ACS_RE.match(c)\n", + " if m:\n", + " stem, suffix, digits = m.groups()\n", + " return f\"{stem.upper()}_{digits}{suffix.upper()}\"\n", + "\n", + " m = CANONICAL_ACS_RE.match(c)\n", + " if m:\n", + " stem, digits, suffix = m.groups()\n", + " return f\"{stem.upper()}_{digits}{suffix.upper()}\"\n", + "\n", + " return c\n", + "\n", + "\n", + "def classify_column(col: str) -> str:\n", + " c = col.strip()\n", + "\n", + " if GEOID_RE.match(c) or c in {\"GEOIDFQ\", \"GEOID_Data\"}:\n", + " return \"geoid_like\"\n", + "\n", + " if NEW_STYLE_ACS_RE.match(c):\n", + " return \"acs_new_style\"\n", + "\n", + " if CANONICAL_ACS_RE.match(c):\n", + " return \"acs_canonical\"\n", + "\n", + " return \"other\"\n", + "\n", + "\n", + "def layer_key(path: Path) -> str:\n", + " \"\"\"\n", + " Convert a filename into a year-agnostic layer key.\n", + " \"\"\"\n", + " name = path.name\n", + "\n", + " if re.fullmatch(r\"acs_\\d{4}_bg\\.parquet\", name):\n", + " return \"ALL_BG\"\n", + "\n", + " if 
re.fullmatch(r\"acs_demographic_profile_\\d{4}_bg\\.parquet\", name):\n", + " return \"DEMOGRAPHIC_PROFILE\"\n", + "\n", + " m = re.fullmatch(r\"acs_\\d{4}_(.+?)_bg\\.parquet\", name)\n", + " if m:\n", + " return m.group(1)\n", + "\n", + " return name" + ] + }, + { + "cell_type": "markdown", + "id": "dc6866a6-b51a-4056-b083-46382112e98a", + "metadata": {}, + "source": [ + "## Sanity check: do the helper functions work?\n", + "\n", + "Expected: \n", + "- B02001_E001 should become B02001_001E\n", + "- B02001_M001 should become B02001_001M\n", + "- canonical names should stay unchanged\n", + "- GEOID-like columns should stay unchanged" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "c7647e0c-b8e9-4271-b446-3ea81609bbe0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
originalclassificationcanonical
0B02001_E001acs_new_styleB02001_001E
1B02001_M001acs_new_styleB02001_001M
2B02001_001Eacs_canonicalB02001_001E
3B19013_001Eacs_canonicalB19013_001E
4GEOIDgeoid_likeGEOID
5GEOIDFQgeoid_likeGEOIDFQ
6GEOID_Datageoid_likeGEOID_Data
7NAMEotherNAME
\n", + "
" + ], + "text/plain": [ + " original classification canonical\n", + "0 B02001_E001 acs_new_style B02001_001E\n", + "1 B02001_M001 acs_new_style B02001_001M\n", + "2 B02001_001E acs_canonical B02001_001E\n", + "3 B19013_001E acs_canonical B19013_001E\n", + "4 GEOID geoid_like GEOID\n", + "5 GEOIDFQ geoid_like GEOIDFQ\n", + "6 GEOID_Data geoid_like GEOID_Data\n", + "7 NAME other NAME" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "test_cols = [\n", + " \"B02001_E001\",\n", + " \"B02001_M001\",\n", + " \"B02001_001E\",\n", + " \"B19013_001E\",\n", + " \"GEOID\",\n", + " \"GEOIDFQ\",\n", + " \"GEOID_Data\",\n", + " \"NAME\",\n", + "]\n", + "\n", + "test_df = pd.DataFrame({\n", + " \"original\": test_cols,\n", + " \"classification\": [classify_column(c) for c in test_cols],\n", + " \"canonical\": [canonicalize_column(c) for c in test_cols],\n", + "})\n", + "\n", + "display(test_df)" + ] + }, + { + "cell_type": "markdown", + "id": "c7fc46df-fb64-4190-9de2-249ea92c0309", + "metadata": {}, + "source": [ + "## Compare the files between vintages\n", + "\n", + "Just verifying comparable tables and identifying what is new" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "07bd7b42-cdb1-475f-9cd8-9d1adfbdadc5", + "metadata": {}, + "outputs": [], + "source": [ + "def parse_parquet_filename(path: Path) -> dict:\n", + " \"\"\"\n", + " Parse known ACS parquet filenames into structured parts and normalize\n", + " year-specific pieces so 2021 and 2022 comparable files align.\n", + " \"\"\"\n", + " name = path.name\n", + "\n", + " # acs_demographic_profile_2022_bg.parquet\n", + " m = re.match(\n", + " r\"^acs_demographic_profile_(\\d{4})_(\\w+)\\.parquet$\",\n", + " name,\n", + " flags=re.IGNORECASE,\n", + " )\n", + " if m:\n", + " year, geography = m.groups()\n", + " return {\n", + " \"file\": name,\n", + " \"year\": int(year),\n", + " \"kind\": \"demographic_profile\",\n", + " \"x_code\": None,\n", + " \"table_name\": 
\"demographic_profile\",\n", + " \"geography\": geography,\n", + " \"group_key\": f\"demographic_profile::{geography}\",\n", + " \"sort_key\": (9998, \"demographic_profile\", geography),\n", + " }\n", + "\n", + " # acs_2022_X29_VOTING_AGE_POPULATION_bg.parquet\n", + " m = re.match(\n", + " r\"^acs_(\\d{4})_(X\\d{2})_(.+)_(\\w+)\\.parquet$\",\n", + " name,\n", + " flags=re.IGNORECASE,\n", + " )\n", + " if m:\n", + " year, x_code, table_name, geography = m.groups()\n", + " x_code = x_code.upper()\n", + " return {\n", + " \"file\": name,\n", + " \"year\": int(year),\n", + " \"kind\": \"x_table\",\n", + " \"x_code\": x_code,\n", + " \"table_name\": table_name,\n", + " \"geography\": geography,\n", + " \"group_key\": f\"{x_code}::{table_name}::{geography}\",\n", + " \"sort_key\": (int(x_code[1:]), table_name, geography),\n", + " }\n", + "\n", + " # acs_2022_ACS_2022_5YR_BG_bg.parquet\n", + " # normalize ACS_2021_5YR_BG and ACS_2022_5YR_BG to ACS_5YR_BG\n", + " m = re.match(\n", + " r\"^acs_(\\d{4})_(ACS_\\d{4}_5YR_[A-Z]+)_(\\w+)\\.parquet$\",\n", + " name,\n", + " flags=re.IGNORECASE,\n", + " )\n", + " if m:\n", + " year, source_name, geography = m.groups()\n", + " source_name_norm = re.sub(r\"ACS_\\d{4}_5YR_\", \"ACS_5YR_\", source_name, flags=re.IGNORECASE)\n", + " return {\n", + " \"file\": name,\n", + " \"year\": int(year),\n", + " \"kind\": \"whole_gdb\",\n", + " \"x_code\": None,\n", + " \"table_name\": source_name_norm,\n", + " \"geography\": geography,\n", + " \"group_key\": f\"whole_gdb::{source_name_norm}::{geography}\",\n", + " \"sort_key\": (9996, source_name_norm, geography),\n", + " }\n", + "\n", + " # acs_2022_bg.parquet\n", + " m = re.match(\n", + " r\"^acs_(\\d{4})_(\\w+)\\.parquet$\",\n", + " name,\n", + " flags=re.IGNORECASE,\n", + " )\n", + " if m:\n", + " year, geography = m.groups()\n", + " return {\n", + " \"file\": name,\n", + " \"year\": int(year),\n", + " \"kind\": \"combined\",\n", + " \"x_code\": None,\n", + " \"table_name\": 
\"combined\",\n", + " \"geography\": geography,\n", + " \"group_key\": f\"combined::{geography}\",\n", + " \"sort_key\": (9997, \"combined\", geography),\n", + " }\n", + "\n", + " return {\n", + " \"file\": name,\n", + " \"year\": None,\n", + " \"kind\": \"unknown\",\n", + " \"x_code\": None,\n", + " \"table_name\": name,\n", + " \"geography\": None,\n", + " \"group_key\": f\"unknown::{name}\",\n", + " \"sort_key\": (9999, name, \"\"),\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "ffbb0145-e40e-42b6-8793-77409c74dab4", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2021 parquet files: 26\n", + "2022 parquet files: 34\n" + ] + } + ], + "source": [ + "files_2021 = sorted(DIR_2021.glob(\"*.parquet\"))\n", + "files_2022 = sorted(DIR_2022.glob(\"*.parquet\"))\n", + "\n", + "parsed_2021 = pd.DataFrame([parse_parquet_filename(p) for p in files_2021])\n", + "parsed_2022 = pd.DataFrame([parse_parquet_filename(p) for p in files_2022])\n", + "\n", + "print(f\"2021 parquet files: {len(parsed_2021)}\")\n", + "print(f\"2022 parquet files: {len(parsed_2022)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "450715be-8255-45b7-9c3c-b828d78ed639", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
file_2021file_2022exists_2021exists_2022
0acs_2021_X01_AGE_AND_SEX_bg.parquetacs_2022_X01_AGE_AND_SEX_bg.parquetTrueTrue
1acs_2021_X02_RACE_bg.parquetacs_2022_X02_RACE_bg.parquetTrueTrue
2acs_2021_X03_HISPANIC_OR_LATINO_ORIGIN_bg.parquetacs_2022_X03_HISPANIC_OR_LATINO_ORIGIN_bg.parquetTrueTrue
3NaNacs_2022_X04_ANCESTRY_bg.parquetFalseTrue
4NaNacs_2022_X05_FOREIGN_BORN_CITIZENSHIP_bg.parquetFalseTrue
5NaNacs_2022_X06_PLACE_OF_BIRTH_bg.parquetFalseTrue
6acs_2021_X07_MIGRATION_bg.parquetacs_2022_X07_MIGRATION_bg.parquetTrueTrue
7acs_2021_X08_COMMUTING_bg.parquetacs_2022_X08_COMMUTING_bg.parquetTrueTrue
8acs_2021_X09_CHILDREN_HOUSEHOLD_RELATIONSHIP_b...acs_2022_X09_CHILDREN_HOUSEHOLD_RELATIONSHIP_b...TrueTrue
9NaNacs_2022_X10_GRANDPARENTS_GRANDCHILDREN_bg.par...FalseTrue
10acs_2021_X11_HOUSEHOLD_FAMILY_SUBFAMILIES_bg.p...acs_2022_X11_HOUSEHOLD_FAMILY_SUBFAMILIES_bg.p...TrueTrue
11acs_2021_X12_MARITAL_STATUS_AND_HISTORY_bg.par...acs_2022_X12_MARITAL_STATUS_AND_HISTORY_bg.par...TrueTrue
12NaNacs_2022_X13_FERTILITY_bg.parquetFalseTrue
13acs_2021_X14_SCHOOL_ENROLLMENT_bg.parquetacs_2022_X14_SCHOOL_ENROLLMENT_bg.parquetTrueTrue
14acs_2021_X15_EDUCATIONAL_ATTAINMENT_bg.parquetacs_2022_X15_EDUCATIONAL_ATTAINMENT_bg.parquetTrueTrue
15acs_2021_X16_LANGUAGE_SPOKEN_AT_HOME_bg.parquetacs_2022_X16_LANGUAGE_SPOKEN_AT_HOME_bg.parquetTrueTrue
16acs_2021_X17_POVERTY_bg.parquetacs_2022_X17_POVERTY_bg.parquetTrueTrue
17NaNacs_2022_X18_DISABILITY_bg.parquetFalseTrue
18acs_2021_X19_INCOME_bg.parquetacs_2022_X19_INCOME_bg.parquetTrueTrue
19acs_2021_X20_EARNINGS_bg.parquetacs_2022_X20_EARNINGS_bg.parquetTrueTrue
20acs_2021_X21_VETERAN_STATUS_bg.parquetacs_2022_X21_VETERAN_STATUS_bg.parquetTrueTrue
21acs_2021_X22_FOOD_STAMPS_bg.parquetacs_2022_X22_FOOD_STAMPS_bg.parquetTrueTrue
22acs_2021_X23_EMPLOYMENT_STATUS_bg.parquetacs_2022_X23_EMPLOYMENT_STATUS_bg.parquetTrueTrue
23acs_2021_X24_INDUSTRY_OCCUPATION_bg.parquetacs_2022_X24_INDUSTRY_OCCUPATION_bg.parquetTrueTrue
24acs_2021_X25_HOUSING_CHARACTERISTICS_bg.parquetacs_2022_X25_HOUSING_CHARACTERISTICS_bg.parquetTrueTrue
25NaNacs_2022_X26_GROUP_QUARTERS_bg.parquetFalseTrue
26acs_2021_X27_HEALTH_INSURANCE_bg.parquetacs_2022_X27_HEALTH_INSURANCE_bg.parquetTrueTrue
27acs_2021_X28_COMPUTER_AND_INTERNET_USE_bg.parquetacs_2022_X28_COMPUTER_AND_INTERNET_USE_bg.parquetTrueTrue
28acs_2021_X29_VOTING_AGE_POPULATION_bg.parquetacs_2022_X29_VOTING_AGE_POPULATION_bg.parquetTrueTrue
29NaNacs_2022_X98_UNWEIGHTED_HOUSING_UNIT_SAMPLE_bg...FalseTrue
30acs_2021_X99_IMPUTATION_bg.parquetacs_2022_X99_IMPUTATION_bg.parquetTrueTrue
31acs_2021_ACS_2021_5YR_BG_bg.parquetacs_2022_ACS_2022_5YR_BG_bg.parquetTrueTrue
32acs_2021_bg.parquetacs_2022_bg.parquetTrueTrue
33acs_demographic_profile_2021_bg.parquetacs_demographic_profile_2022_bg.parquetTrueTrue
\n", + "
" + ], + "text/plain": [ + " file_2021 \\\n", + "0 acs_2021_X01_AGE_AND_SEX_bg.parquet \n", + "1 acs_2021_X02_RACE_bg.parquet \n", + "2 acs_2021_X03_HISPANIC_OR_LATINO_ORIGIN_bg.parquet \n", + "3 NaN \n", + "4 NaN \n", + "5 NaN \n", + "6 acs_2021_X07_MIGRATION_bg.parquet \n", + "7 acs_2021_X08_COMMUTING_bg.parquet \n", + "8 acs_2021_X09_CHILDREN_HOUSEHOLD_RELATIONSHIP_b... \n", + "9 NaN \n", + "10 acs_2021_X11_HOUSEHOLD_FAMILY_SUBFAMILIES_bg.p... \n", + "11 acs_2021_X12_MARITAL_STATUS_AND_HISTORY_bg.par... \n", + "12 NaN \n", + "13 acs_2021_X14_SCHOOL_ENROLLMENT_bg.parquet \n", + "14 acs_2021_X15_EDUCATIONAL_ATTAINMENT_bg.parquet \n", + "15 acs_2021_X16_LANGUAGE_SPOKEN_AT_HOME_bg.parquet \n", + "16 acs_2021_X17_POVERTY_bg.parquet \n", + "17 NaN \n", + "18 acs_2021_X19_INCOME_bg.parquet \n", + "19 acs_2021_X20_EARNINGS_bg.parquet \n", + "20 acs_2021_X21_VETERAN_STATUS_bg.parquet \n", + "21 acs_2021_X22_FOOD_STAMPS_bg.parquet \n", + "22 acs_2021_X23_EMPLOYMENT_STATUS_bg.parquet \n", + "23 acs_2021_X24_INDUSTRY_OCCUPATION_bg.parquet \n", + "24 acs_2021_X25_HOUSING_CHARACTERISTICS_bg.parquet \n", + "25 NaN \n", + "26 acs_2021_X27_HEALTH_INSURANCE_bg.parquet \n", + "27 acs_2021_X28_COMPUTER_AND_INTERNET_USE_bg.parquet \n", + "28 acs_2021_X29_VOTING_AGE_POPULATION_bg.parquet \n", + "29 NaN \n", + "30 acs_2021_X99_IMPUTATION_bg.parquet \n", + "31 acs_2021_ACS_2021_5YR_BG_bg.parquet \n", + "32 acs_2021_bg.parquet \n", + "33 acs_demographic_profile_2021_bg.parquet \n", + "\n", + " file_2022 exists_2021 \\\n", + "0 acs_2022_X01_AGE_AND_SEX_bg.parquet True \n", + "1 acs_2022_X02_RACE_bg.parquet True \n", + "2 acs_2022_X03_HISPANIC_OR_LATINO_ORIGIN_bg.parquet True \n", + "3 acs_2022_X04_ANCESTRY_bg.parquet False \n", + "4 acs_2022_X05_FOREIGN_BORN_CITIZENSHIP_bg.parquet False \n", + "5 acs_2022_X06_PLACE_OF_BIRTH_bg.parquet False \n", + "6 acs_2022_X07_MIGRATION_bg.parquet True \n", + "7 acs_2022_X08_COMMUTING_bg.parquet True \n", + "8 
acs_2022_X09_CHILDREN_HOUSEHOLD_RELATIONSHIP_b... True \n", + "9 acs_2022_X10_GRANDPARENTS_GRANDCHILDREN_bg.par... False \n", + "10 acs_2022_X11_HOUSEHOLD_FAMILY_SUBFAMILIES_bg.p... True \n", + "11 acs_2022_X12_MARITAL_STATUS_AND_HISTORY_bg.par... True \n", + "12 acs_2022_X13_FERTILITY_bg.parquet False \n", + "13 acs_2022_X14_SCHOOL_ENROLLMENT_bg.parquet True \n", + "14 acs_2022_X15_EDUCATIONAL_ATTAINMENT_bg.parquet True \n", + "15 acs_2022_X16_LANGUAGE_SPOKEN_AT_HOME_bg.parquet True \n", + "16 acs_2022_X17_POVERTY_bg.parquet True \n", + "17 acs_2022_X18_DISABILITY_bg.parquet False \n", + "18 acs_2022_X19_INCOME_bg.parquet True \n", + "19 acs_2022_X20_EARNINGS_bg.parquet True \n", + "20 acs_2022_X21_VETERAN_STATUS_bg.parquet True \n", + "21 acs_2022_X22_FOOD_STAMPS_bg.parquet True \n", + "22 acs_2022_X23_EMPLOYMENT_STATUS_bg.parquet True \n", + "23 acs_2022_X24_INDUSTRY_OCCUPATION_bg.parquet True \n", + "24 acs_2022_X25_HOUSING_CHARACTERISTICS_bg.parquet True \n", + "25 acs_2022_X26_GROUP_QUARTERS_bg.parquet False \n", + "26 acs_2022_X27_HEALTH_INSURANCE_bg.parquet True \n", + "27 acs_2022_X28_COMPUTER_AND_INTERNET_USE_bg.parquet True \n", + "28 acs_2022_X29_VOTING_AGE_POPULATION_bg.parquet True \n", + "29 acs_2022_X98_UNWEIGHTED_HOUSING_UNIT_SAMPLE_bg... 
False \n", + "30 acs_2022_X99_IMPUTATION_bg.parquet True \n", + "31 acs_2022_ACS_2022_5YR_BG_bg.parquet True \n", + "32 acs_2022_bg.parquet True \n", + "33 acs_demographic_profile_2022_bg.parquet True \n", + "\n", + " exists_2022 \n", + "0 True \n", + "1 True \n", + "2 True \n", + "3 True \n", + "4 True \n", + "5 True \n", + "6 True \n", + "7 True \n", + "8 True \n", + "9 True \n", + "10 True \n", + "11 True \n", + "12 True \n", + "13 True \n", + "14 True \n", + "15 True \n", + "16 True \n", + "17 True \n", + "18 True \n", + "19 True \n", + "20 True \n", + "21 True \n", + "22 True \n", + "23 True \n", + "24 True \n", + "25 True \n", + "26 True \n", + "27 True \n", + "28 True \n", + "29 True \n", + "30 True \n", + "31 True \n", + "32 True \n", + "33 True " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "compare_files_df = (\n", + " parsed_2021.rename(columns={\"file\": \"file_2021\", \"year\": \"year_2021\"})\n", + " .merge(\n", + " parsed_2022.rename(columns={\"file\": \"file_2022\", \"year\": \"year_2022\"}),\n", + " on=[\"group_key\", \"kind\", \"x_code\", \"table_name\", \"geography\", \"sort_key\"],\n", + " how=\"outer\",\n", + " )\n", + " .sort_values([\"sort_key\", \"kind\", \"table_name\", \"group_key\"])\n", + " .reset_index(drop=True)\n", + ")\n", + "\n", + "compare_files_df[\"exists_2021\"] = compare_files_df[\"file_2021\"].notna()\n", + "compare_files_df[\"exists_2022\"] = compare_files_df[\"file_2022\"].notna()\n", + "\n", + "display(\n", + " compare_files_df[\n", + " [\n", + " \"file_2021\",\n", + " \"file_2022\",\n", + " \"exists_2021\",\n", + " \"exists_2022\",\n", + " ]\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "0a7eada9-7cc9-4ec9-a8c7-a96007df98dc", + "metadata": {}, + "source": [ + "The next cell classifies each column in `cols21` and `cols22` with `classify_column` and counts the classifications for each vintage." + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "cee97f24-0ed4-42d5-acab-5e142c65599a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", +
"text": [ + "2021 classification counts\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
classificationcount
0acs_canonical35
1geoid_like1
\n", + "
" + ], + "text/plain": [ + " classification count\n", + "0 acs_canonical 35\n", + "1 geoid_like 1" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2022 classification counts\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
classificationcount
0acs_new_style37
1geoid_like1
\n", + "
" + ], + "text/plain": [ + " classification count\n", + "0 acs_new_style 37\n", + "1 geoid_like 1" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "inspect21 = pd.DataFrame({\n", + " \"column\": cols21,\n", + " \"classification\": [classify_column(c) for c in cols21],\n", + " \"canonical\": [canonicalize_column(c) for c in cols21],\n", + " \"changed\": [c != canonicalize_column(c) for c in cols21],\n", + "})\n", + "inspect22 = pd.DataFrame({\n", + " \"column\": cols22,\n", + " \"classification\": [classify_column(c) for c in cols22],\n", + " \"canonical\": [canonicalize_column(c) for c in cols22],\n", + " \"changed\": [c != canonicalize_column(c) for c in cols22],\n", + "})\n", + "\n", + "print(\"2021 classification counts\")\n", + "display(inspect21[\"classification\"].value_counts().rename_axis(\"classification\").reset_index(name=\"count\"))\n", + "\n", + "print(\"2022 classification counts\")\n", + "display(inspect22[\"classification\"].value_counts().rename_axis(\"classification\").reset_index(name=\"count\"))" + ] + }, + { + "cell_type": "markdown", + "id": "630c9973-5243-4bef-b807-fbfab1eb1623", + "metadata": {}, + "source": [ + "We want to see no columns changed for 2021, but many for 2022" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "02e64267-af20-45de-a07f-1d6cbad20f32", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2021 columns changed by canonicalization\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
columnclassificationcanonicalchanged
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [column, classification, canonical, changed]\n", + "Index: []" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2022 columns changed by canonicalization\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + "
columnclassificationcanonicalchanged
0B02001_E001acs_new_styleB02001_001ETrue
1B02001_E002acs_new_styleB02001_002ETrue
2B02001_E003acs_new_styleB02001_003ETrue
3B02001_E004acs_new_styleB02001_004ETrue
4B02001_E005acs_new_styleB02001_005ETrue
5B02001_E006acs_new_styleB02001_006ETrue
6B02001_E007acs_new_styleB02001_007ETrue
7B02001_E008acs_new_styleB02001_008ETrue
8B02001_E009acs_new_styleB02001_009ETrue
9B02001_E010acs_new_styleB02001_010ETrue
10B02008_E001acs_new_styleB02008_001ETrue
11B02009_E001acs_new_styleB02009_001ETrue
12B02010_E001acs_new_styleB02010_001ETrue
13B02011_E001acs_new_styleB02011_001ETrue
14B02012_E001acs_new_styleB02012_001ETrue
15B02013_E001acs_new_styleB02013_001ETrue
16C02003_E001acs_new_styleC02003_001ETrue
17C02003_E002acs_new_styleC02003_002ETrue
18C02003_E003acs_new_styleC02003_003ETrue
19C02003_E004acs_new_styleC02003_004ETrue
20C02003_E005acs_new_styleC02003_005ETrue
21C02003_E006acs_new_styleC02003_006ETrue
22C02003_E007acs_new_styleC02003_007ETrue
23C02003_E008acs_new_styleC02003_008ETrue
24C02003_E009acs_new_styleC02003_009ETrue
25C02003_E010acs_new_styleC02003_010ETrue
26C02003_E011acs_new_styleC02003_011ETrue
27C02003_E012acs_new_styleC02003_012ETrue
28C02003_E013acs_new_styleC02003_013ETrue
29C02003_E014acs_new_styleC02003_014ETrue
\n", + "
" + ], + "text/plain": [ + " column classification canonical changed\n", + "0 B02001_E001 acs_new_style B02001_001E True\n", + "1 B02001_E002 acs_new_style B02001_002E True\n", + "2 B02001_E003 acs_new_style B02001_003E True\n", + "3 B02001_E004 acs_new_style B02001_004E True\n", + "4 B02001_E005 acs_new_style B02001_005E True\n", + "5 B02001_E006 acs_new_style B02001_006E True\n", + "6 B02001_E007 acs_new_style B02001_007E True\n", + "7 B02001_E008 acs_new_style B02001_008E True\n", + "8 B02001_E009 acs_new_style B02001_009E True\n", + "9 B02001_E010 acs_new_style B02001_010E True\n", + "10 B02008_E001 acs_new_style B02008_001E True\n", + "11 B02009_E001 acs_new_style B02009_001E True\n", + "12 B02010_E001 acs_new_style B02010_001E True\n", + "13 B02011_E001 acs_new_style B02011_001E True\n", + "14 B02012_E001 acs_new_style B02012_001E True\n", + "15 B02013_E001 acs_new_style B02013_001E True\n", + "16 C02003_E001 acs_new_style C02003_001E True\n", + "17 C02003_E002 acs_new_style C02003_002E True\n", + "18 C02003_E003 acs_new_style C02003_003E True\n", + "19 C02003_E004 acs_new_style C02003_004E True\n", + "20 C02003_E005 acs_new_style C02003_005E True\n", + "21 C02003_E006 acs_new_style C02003_006E True\n", + "22 C02003_E007 acs_new_style C02003_007E True\n", + "23 C02003_E008 acs_new_style C02003_008E True\n", + "24 C02003_E009 acs_new_style C02003_009E True\n", + "25 C02003_E010 acs_new_style C02003_010E True\n", + "26 C02003_E011 acs_new_style C02003_011E True\n", + "27 C02003_E012 acs_new_style C02003_012E True\n", + "28 C02003_E013 acs_new_style C02003_013E True\n", + "29 C02003_E014 acs_new_style C02003_014E True" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "print(\"2021 columns changed by canonicalization\")\n", + "display(inspect21[inspect21[\"changed\"]].head(30))\n", + "\n", + "print(\"2022 columns changed by canonicalization\")\n", + "display(inspect22[inspect22[\"changed\"]].head(30))" + ] + }, + { + 
"cell_type": "code", + "execution_count": 52, + "id": "adc06724-05ea-496a-bbfd-c346768b991e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
layern_cols_2021_rawn_cols_2022_rawraw_overlapcanonical_overlapraw_only_2021raw_only_2022canonical_only_2021canonical_only_2022
0X02_RACE3638035363813
\n", + "
" + ], + "text/plain": [ + " layer n_cols_2021_raw n_cols_2022_raw raw_overlap canonical_overlap \\\n", + "0 X02_RACE 36 38 0 35 \n", + "\n", + " raw_only_2021 raw_only_2022 canonical_only_2021 canonical_only_2022 \n", + "0 36 38 1 3 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "raw_overlap = len(set(cols21) & set(cols22))\n", + "canon_overlap = len({canonicalize_column(c) for c in cols21} & {canonicalize_column(c) for c in cols22})\n", + "\n", + "comparison_df = pd.DataFrame([{\n", + " \"layer\": layer,\n", + " \"n_cols_2021_raw\": len(cols21),\n", + " \"n_cols_2022_raw\": len(cols22),\n", + " \"raw_overlap\": raw_overlap,\n", + " \"canonical_overlap\": canon_overlap,\n", + " \"raw_only_2021\": len(set(cols21) - set(cols22)),\n", + " \"raw_only_2022\": len(set(cols22) - set(cols21)),\n", + " \"canonical_only_2021\": len({canonicalize_column(c) for c in cols21} - {canonicalize_column(c) for c in cols22}),\n", + " \"canonical_only_2022\": len({canonicalize_column(c) for c in cols22} - {canonicalize_column(c) for c in cols21}),\n", + "}])\n", + "\n", + "display(comparison_df)" + ] + }, + { + "cell_type": "markdown", + "id": "f91994dd-0836-43a2-aaf3-822c51139d3f", + "metadata": {}, + "source": [ + "The variables specifically:" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "96232773-2474-44aa-b785-504ed0f2885b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
canonicalraw_2021raw_2022different_raw_names
0B02001_001EB02001_001EB02001_E001True
1B02001_002EB02001_002EB02001_E002True
2B02001_003EB02001_003EB02001_E003True
3B02001_004EB02001_004EB02001_E004True
4B02001_005EB02001_005EB02001_E005True
5B02001_006EB02001_006EB02001_E006True
6B02001_007EB02001_007EB02001_E007True
7B02001_008EB02001_008EB02001_E008True
8B02001_009EB02001_009EB02001_E009True
9B02001_010EB02001_010EB02001_E010True
10B02008_001EB02008_001EB02008_E001True
11B02009_001EB02009_001EB02009_E001True
12B02010_001EB02010_001EB02010_E001True
13B02011_001EB02011_001EB02011_E001True
14B02012_001EB02012_001EB02012_E001True
15B02013_001EB02013_001EB02013_E001True
16C02003_001EC02003_001EC02003_E001True
17C02003_002EC02003_002EC02003_E002True
18C02003_003EC02003_003EC02003_E003True
19C02003_004EC02003_004EC02003_E004True
20C02003_005EC02003_005EC02003_E005True
21C02003_006EC02003_006EC02003_E006True
22C02003_007EC02003_007EC02003_E007True
23C02003_008EC02003_008EC02003_E008True
24C02003_009EC02003_009EC02003_E009True
25C02003_010EC02003_010EC02003_E010True
26C02003_011EC02003_011EC02003_E011True
27C02003_012EC02003_012EC02003_E012True
28C02003_013EC02003_013EC02003_E013True
29C02003_014EC02003_014EC02003_E014True
30C02003_015EC02003_015EC02003_E015True
31C02003_016EC02003_016EC02003_E016True
32C02003_017EC02003_017EC02003_E017True
33C02003_018EC02003_018EC02003_E018True
34C02003_019EC02003_019EC02003_E019True
\n", + "
" + ], + "text/plain": [ + " canonical raw_2021 raw_2022 different_raw_names\n", + "0 B02001_001E B02001_001E B02001_E001 True\n", + "1 B02001_002E B02001_002E B02001_E002 True\n", + "2 B02001_003E B02001_003E B02001_E003 True\n", + "3 B02001_004E B02001_004E B02001_E004 True\n", + "4 B02001_005E B02001_005E B02001_E005 True\n", + "5 B02001_006E B02001_006E B02001_E006 True\n", + "6 B02001_007E B02001_007E B02001_E007 True\n", + "7 B02001_008E B02001_008E B02001_E008 True\n", + "8 B02001_009E B02001_009E B02001_E009 True\n", + "9 B02001_010E B02001_010E B02001_E010 True\n", + "10 B02008_001E B02008_001E B02008_E001 True\n", + "11 B02009_001E B02009_001E B02009_E001 True\n", + "12 B02010_001E B02010_001E B02010_E001 True\n", + "13 B02011_001E B02011_001E B02011_E001 True\n", + "14 B02012_001E B02012_001E B02012_E001 True\n", + "15 B02013_001E B02013_001E B02013_E001 True\n", + "16 C02003_001E C02003_001E C02003_E001 True\n", + "17 C02003_002E C02003_002E C02003_E002 True\n", + "18 C02003_003E C02003_003E C02003_E003 True\n", + "19 C02003_004E C02003_004E C02003_E004 True\n", + "20 C02003_005E C02003_005E C02003_E005 True\n", + "21 C02003_006E C02003_006E C02003_E006 True\n", + "22 C02003_007E C02003_007E C02003_E007 True\n", + "23 C02003_008E C02003_008E C02003_E008 True\n", + "24 C02003_009E C02003_009E C02003_E009 True\n", + "25 C02003_010E C02003_010E C02003_E010 True\n", + "26 C02003_011E C02003_011E C02003_E011 True\n", + "27 C02003_012E C02003_012E C02003_E012 True\n", + "28 C02003_013E C02003_013E C02003_E013 True\n", + "29 C02003_014E C02003_014E C02003_E014 True\n", + "30 C02003_015E C02003_015E C02003_E015 True\n", + "31 C02003_016E C02003_016E C02003_E016 True\n", + "32 C02003_017E C02003_017E C02003_E017 True\n", + "33 C02003_018E C02003_018E C02003_E018 True\n", + "34 C02003_019E C02003_019E C02003_E019 True" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "canon21 = pd.DataFrame({\n", + " \"canonical\": 
[canonicalize_column(c) for c in cols21],\n", + " \"raw_2021\": cols21,\n", + "}).drop_duplicates()\n", + "\n", + "canon22 = pd.DataFrame({\n", + " \"canonical\": [canonicalize_column(c) for c in cols22],\n", + " \"raw_2022\": cols22,\n", + "}).drop_duplicates()\n", + "\n", + "aligned = canon21.merge(canon22, on=\"canonical\", how=\"outer\")\n", + "aligned[\"different_raw_names\"] = (\n", + " aligned[\"raw_2021\"].notna() &\n", + " aligned[\"raw_2022\"].notna() &\n", + " (aligned[\"raw_2021\"] != aligned[\"raw_2022\"])\n", + ")\n", + "\n", + "display(aligned[aligned[\"different_raw_names\"]])" + ] + }, + { + "cell_type": "markdown", + "id": "9f73607b-c8bc-4a67-8af2-914ed1344323", + "metadata": {}, + "source": [ + "The mismatch is almost entirely due to naming convention changes, not missing data.\n", + "\n", + "The canonicalization rule recovers ~97% (35/36) of the 2021 variables.\n", + "\n", + "The remaining differences are small and real (1 dropped, 3 added), not a pipeline failure." + ] + }, + { + "cell_type": "markdown", + "id": "94c36107-74d7-457a-8237-9ba0da0c4141", + "metadata": {}, + "source": [ + "## New variables in 2022\n", + "\n", + "The two cells below show the variables that remain unaligned after applying the canonicalization rule." + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "f0c52177-9a6e-40d9-86d9-f7057f5d137a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
canonicalraw_2021raw_2022different_raw_names
37GEOIDGEOIDNaNFalse
\n", + "
" + ], + "text/plain": [ + " canonical raw_2021 raw_2022 different_raw_names\n", + "37 GEOID GEOID NaN False" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(aligned[aligned[\"raw_2022\"].isna()])" + ] + }, + { + "cell_type": "markdown", + "id": "40e4dde4-2029-458e-b2e6-215039994577", + "metadata": {}, + "source": [ + "We already know about the GEOID difference (`GEOID` appears only in 2021, `GEOIDFQ` only in 2022), but the two `C02003` variables below are new to this table in the 2022 vintage." + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "58e50b3d-e88a-4d2f-9354-78cb45e93061", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
canonicalraw_2021raw_2022different_raw_names
35C02003_020ENaNC02003_E020False
36C02003_021ENaNC02003_E021False
38GEOIDFQNaNGEOIDFQFalse
\n", + "
" + ], + "text/plain": [ + " canonical raw_2021 raw_2022 different_raw_names\n", + "35 C02003_020E NaN C02003_E020 False\n", + "36 C02003_021E NaN C02003_E021 False\n", + "38 GEOIDFQ NaN GEOIDFQ False" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(aligned[aligned[\"raw_2021\"].isna()])" + ] + }, + { + "cell_type": "markdown", + "id": "ffd54d82-0269-4cba-80ea-0dc6f0f2d243", + "metadata": {}, + "source": [ + "## GEOID grepper\n", + "\n", + "Here we check whether `classify_column` recognizes the different GEOID-like columns in each vintage." + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "d65d088f-8a9e-4751-b1cf-44e1b61cb631", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2021 GEOID-like columns: ['GEOID']\n", + "2022 GEOID-like columns: ['GEOIDFQ']\n" + ] + } + ], + "source": [ + "geoid21 = [c for c in cols21 if classify_column(c) == \"geoid_like\"]\n", + "geoid22 = [c for c in cols22 if classify_column(c) == \"geoid_like\"]\n", + "\n", + "print(\"2021 GEOID-like columns:\", geoid21)\n", + "print(\"2022 GEOID-like columns:\", geoid22)" + ] + }, + { + "cell_type": "markdown", + "id": "a9de6051-c55e-41dc-ba29-dce2444ade31", + "metadata": {}, + "source": [ + "# Inspect all layers\n", + "\n", + "Now we apply these new functions to all of the files." + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "189808d9-6d21-44be-baf0-c5bf4da9b232", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
yearfilelayercolumnclassificationcanonicalchanged
02021acs_2021_ACS_2021_5YR_BG_bg.parquetACS_2021_5YR_BGSTATEFPotherSTATEFPFalse
12021acs_2021_ACS_2021_5YR_BG_bg.parquetACS_2021_5YR_BGCOUNTYFPotherCOUNTYFPFalse
22021acs_2021_ACS_2021_5YR_BG_bg.parquetACS_2021_5YR_BGTRACTCEotherTRACTCEFalse
32021acs_2021_ACS_2021_5YR_BG_bg.parquetACS_2021_5YR_BGBLKGRPCEotherBLKGRPCEFalse
42021acs_2021_ACS_2021_5YR_BG_bg.parquetACS_2021_5YR_BGNAMELSADotherNAMELSADFalse
\n", + "
" + ], + "text/plain": [ + " year file layer column \\\n", + "0 2021 acs_2021_ACS_2021_5YR_BG_bg.parquet ACS_2021_5YR_BG STATEFP \n", + "1 2021 acs_2021_ACS_2021_5YR_BG_bg.parquet ACS_2021_5YR_BG COUNTYFP \n", + "2 2021 acs_2021_ACS_2021_5YR_BG_bg.parquet ACS_2021_5YR_BG TRACTCE \n", + "3 2021 acs_2021_ACS_2021_5YR_BG_bg.parquet ACS_2021_5YR_BG BLKGRPCE \n", + "4 2021 acs_2021_ACS_2021_5YR_BG_bg.parquet ACS_2021_5YR_BG NAMELSAD \n", + "\n", + " classification canonical changed \n", + "0 other STATEFP False \n", + "1 other COUNTYFP False \n", + "2 other TRACTCE False \n", + "3 other BLKGRPCE False \n", + "4 other NAMELSAD False " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total inspected columns: 16998\n" + ] + } + ], + "source": [ + "def inspect_file(path: Path, year: int) -> pd.DataFrame:\n", + " cols = read_parquet_columns(path)\n", + " return pd.DataFrame({\n", + " \"year\": year,\n", + " \"file\": path.name,\n", + " \"layer\": layer_key(path),\n", + " \"column\": cols,\n", + " \"classification\": [classify_column(c) for c in cols],\n", + " \"canonical\": [canonicalize_column(c) for c in cols],\n", + " \"changed\": [c != canonicalize_column(c) for c in cols],\n", + " })\n", + "\n", + "\n", + "all_column_frames = []\n", + "\n", + "for layer in all_layers:\n", + " if layer in idx_2021:\n", + " all_column_frames.append(inspect_file(idx_2021[layer], 2021))\n", + " if layer in idx_2022:\n", + " all_column_frames.append(inspect_file(idx_2022[layer], 2022))\n", + "\n", + "all_columns_df = pd.concat(all_column_frames, ignore_index=True)\n", + "\n", + "display(all_columns_df.head()) # can look closer here\n", + "print(\"Total inspected columns:\", len(all_columns_df))" + ] + }, + { + "cell_type": "markdown", + "id": "b33d9bef-394c-478d-8a6c-ab347283a2d3", + "metadata": {}, + "source": [ + "## Summarize across all layers" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + 
"id": "f54a3278-cd56-44dd-9f2c-17407595ac16", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
yearfilelayeracs_canonicalacs_new_stylegeoid_likeothern_changed_by_canonicalization
02021acs_2021_ACS_2021_5YR_BG_bg.parquetACS_2021_5YR_BG002140
262022acs_2022_ACS_2022_5YR_BG_bg.parquetACS_2022_5YR_BG002140
242021acs_2021_bg.parquetALL_BG001370
582022acs_2022_bg.parquetALL_BG00110
252021acs_demographic_profile_2021_bg.parquetDEMOGRAPHIC_PROFILE415902140
592022acs_demographic_profile_2022_bg.parquetDEMOGRAPHIC_PROFILE042611154261
12021acs_2021_X01_AGE_AND_SEX_bg.parquetX01_AGE_AND_SEX800100
272022acs_2022_X01_AGE_AND_SEX_bg.parquetX01_AGE_AND_SEX0801080
22021acs_2021_X02_RACE_bg.parquetX02_RACE350100
282022acs_2022_X02_RACE_bg.parquetX02_RACE0371037
32021acs_2021_X03_HISPANIC_OR_LATINO_ORIGIN_bg.parquetX03_HISPANIC_OR_LATINO_ORIGIN240100
292022acs_2022_X03_HISPANIC_OR_LATINO_ORIGIN_bg.parquetX03_HISPANIC_OR_LATINO_ORIGIN0241024
302022acs_2022_X04_ANCESTRY_bg.parquetX04_ANCESTRY00100
312022acs_2022_X05_FOREIGN_BORN_CITIZENSHIP_bg.parquetX05_FOREIGN_BORN_CITIZENSHIP00100
322022acs_2022_X06_PLACE_OF_BIRTH_bg.parquetX06_PLACE_OF_BIRTH00100
42021acs_2021_X07_MIGRATION_bg.parquetX07_MIGRATION800100
332022acs_2022_X07_MIGRATION_bg.parquetX07_MIGRATION0801080
52021acs_2021_X08_COMMUTING_bg.parquetX08_COMMUTING2900100
342022acs_2022_X08_COMMUTING_bg.parquetX08_COMMUTING029010290
62021acs_2021_X09_CHILDREN_HOUSEHOLD_RELATIONSHIP_b...X09_CHILDREN_HOUSEHOLD_RELATIONSHIP1030100
352022acs_2022_X09_CHILDREN_HOUSEHOLD_RELATIONSHIP_b...X09_CHILDREN_HOUSEHOLD_RELATIONSHIP010310103
362022acs_2022_X10_GRANDPARENTS_GRANDCHILDREN_bg.par...X10_GRANDPARENTS_GRANDCHILDREN00100
72021acs_2021_X11_HOUSEHOLD_FAMILY_SUBFAMILIES_bg.p...X11_HOUSEHOLD_FAMILY_SUBFAMILIES3390100
372022acs_2022_X11_HOUSEHOLD_FAMILY_SUBFAMILIES_bg.p...X11_HOUSEHOLD_FAMILY_SUBFAMILIES033910339
82021acs_2021_X12_MARITAL_STATUS_AND_HISTORY_bg.par...X12_MARITAL_STATUS_AND_HISTORY190100
382022acs_2022_X12_MARITAL_STATUS_AND_HISTORY_bg.par...X12_MARITAL_STATUS_AND_HISTORY0191019
392022acs_2022_X13_FERTILITY_bg.parquetX13_FERTILITY00100
92021acs_2021_X14_SCHOOL_ENROLLMENT_bg.parquetX14_SCHOOL_ENROLLMENT2680100
402022acs_2022_X14_SCHOOL_ENROLLMENT_bg.parquetX14_SCHOOL_ENROLLMENT026810268
102021acs_2021_X15_EDUCATIONAL_ATTAINMENT_bg.parquetX15_EDUCATIONAL_ATTAINMENT1750100
412022acs_2022_X15_EDUCATIONAL_ATTAINMENT_bg.parquetX15_EDUCATIONAL_ATTAINMENT017510175
112021acs_2021_X16_LANGUAGE_SPOKEN_AT_HOME_bg.parquetX16_LANGUAGE_SPOKEN_AT_HOME810100
422022acs_2022_X16_LANGUAGE_SPOKEN_AT_HOME_bg.parquetX16_LANGUAGE_SPOKEN_AT_HOME0811081
122021acs_2021_X17_POVERTY_bg.parquetX17_POVERTY1510100
432022acs_2022_X17_POVERTY_bg.parquetX17_POVERTY015110151
442022acs_2022_X18_DISABILITY_bg.parquetX18_DISABILITY00100
132021acs_2021_X19_INCOME_bg.parquetX19_INCOME2250100
452022acs_2022_X19_INCOME_bg.parquetX19_INCOME022510225
142021acs_2021_X20_EARNINGS_bg.parquetX20_EARNINGS600100
462022acs_2022_X20_EARNINGS_bg.parquetX20_EARNINGS0601060
152021acs_2021_X21_VETERAN_STATUS_bg.parquetX21_VETERAN_STATUS860100
472022acs_2022_X21_VETERAN_STATUS_bg.parquetX21_VETERAN_STATUS0861086
162021acs_2021_X22_FOOD_STAMPS_bg.parquetX22_FOOD_STAMPS70100
482022acs_2022_X22_FOOD_STAMPS_bg.parquetX22_FOOD_STAMPS07107
172021acs_2021_X23_EMPLOYMENT_STATUS_bg.parquetX23_EMPLOYMENT_STATUS3310100
492022acs_2022_X23_EMPLOYMENT_STATUS_bg.parquetX23_EMPLOYMENT_STATUS033110331
182021acs_2021_X24_INDUSTRY_OCCUPATION_bg.parquetX24_INDUSTRY_OCCUPATION3390100
502022acs_2022_X24_INDUSTRY_OCCUPATION_bg.parquetX24_INDUSTRY_OCCUPATION033910339
192021acs_2021_X25_HOUSING_CHARACTERISTICS_bg.parquetX25_HOUSING_CHARACTERISTICS8700100
512022acs_2022_X25_HOUSING_CHARACTERISTICS_bg.parquetX25_HOUSING_CHARACTERISTICS097010970
522022acs_2022_X26_GROUP_QUARTERS_bg.parquetX26_GROUP_QUARTERS00100
202021acs_2021_X27_HEALTH_INSURANCE_bg.parquetX27_HEALTH_INSURANCE660100
532022acs_2022_X27_HEALTH_INSURANCE_bg.parquetX27_HEALTH_INSURANCE0661066
212021acs_2021_X28_COMPUTER_AND_INTERNET_USE_bg.parquetX28_COMPUTER_AND_INTERNET_USE2130100
542022acs_2022_X28_COMPUTER_AND_INTERNET_USE_bg.parquetX28_COMPUTER_AND_INTERNET_USE021310213
222021acs_2021_X29_VOTING_AGE_POPULATION_bg.parquetX29_VOTING_AGE_POPULATION170100
552022acs_2022_X29_VOTING_AGE_POPULATION_bg.parquetX29_VOTING_AGE_POPULATION0171017
562022acs_2022_X98_UNWEIGHTED_HOUSING_UNIT_SAMPLE_bg...X98_UNWEIGHTED_HOUSING_UNIT_SAMPLE00100
232021acs_2021_X99_IMPUTATION_bg.parquetX99_IMPUTATION3000100
572022acs_2022_X99_IMPUTATION_bg.parquetX99_IMPUTATION030010300
\n", + "
" + ], + "text/plain": [ + " year file \\\n", + "0 2021 acs_2021_ACS_2021_5YR_BG_bg.parquet \n", + "26 2022 acs_2022_ACS_2022_5YR_BG_bg.parquet \n", + "24 2021 acs_2021_bg.parquet \n", + "58 2022 acs_2022_bg.parquet \n", + "25 2021 acs_demographic_profile_2021_bg.parquet \n", + "59 2022 acs_demographic_profile_2022_bg.parquet \n", + "1 2021 acs_2021_X01_AGE_AND_SEX_bg.parquet \n", + "27 2022 acs_2022_X01_AGE_AND_SEX_bg.parquet \n", + "2 2021 acs_2021_X02_RACE_bg.parquet \n", + "28 2022 acs_2022_X02_RACE_bg.parquet \n", + "3 2021 acs_2021_X03_HISPANIC_OR_LATINO_ORIGIN_bg.parquet \n", + "29 2022 acs_2022_X03_HISPANIC_OR_LATINO_ORIGIN_bg.parquet \n", + "30 2022 acs_2022_X04_ANCESTRY_bg.parquet \n", + "31 2022 acs_2022_X05_FOREIGN_BORN_CITIZENSHIP_bg.parquet \n", + "32 2022 acs_2022_X06_PLACE_OF_BIRTH_bg.parquet \n", + "4 2021 acs_2021_X07_MIGRATION_bg.parquet \n", + "33 2022 acs_2022_X07_MIGRATION_bg.parquet \n", + "5 2021 acs_2021_X08_COMMUTING_bg.parquet \n", + "34 2022 acs_2022_X08_COMMUTING_bg.parquet \n", + "6 2021 acs_2021_X09_CHILDREN_HOUSEHOLD_RELATIONSHIP_b... \n", + "35 2022 acs_2022_X09_CHILDREN_HOUSEHOLD_RELATIONSHIP_b... \n", + "36 2022 acs_2022_X10_GRANDPARENTS_GRANDCHILDREN_bg.par... \n", + "7 2021 acs_2021_X11_HOUSEHOLD_FAMILY_SUBFAMILIES_bg.p... \n", + "37 2022 acs_2022_X11_HOUSEHOLD_FAMILY_SUBFAMILIES_bg.p... \n", + "8 2021 acs_2021_X12_MARITAL_STATUS_AND_HISTORY_bg.par... \n", + "38 2022 acs_2022_X12_MARITAL_STATUS_AND_HISTORY_bg.par... 
\n", + "39 2022 acs_2022_X13_FERTILITY_bg.parquet \n", + "9 2021 acs_2021_X14_SCHOOL_ENROLLMENT_bg.parquet \n", + "40 2022 acs_2022_X14_SCHOOL_ENROLLMENT_bg.parquet \n", + "10 2021 acs_2021_X15_EDUCATIONAL_ATTAINMENT_bg.parquet \n", + "41 2022 acs_2022_X15_EDUCATIONAL_ATTAINMENT_bg.parquet \n", + "11 2021 acs_2021_X16_LANGUAGE_SPOKEN_AT_HOME_bg.parquet \n", + "42 2022 acs_2022_X16_LANGUAGE_SPOKEN_AT_HOME_bg.parquet \n", + "12 2021 acs_2021_X17_POVERTY_bg.parquet \n", + "43 2022 acs_2022_X17_POVERTY_bg.parquet \n", + "44 2022 acs_2022_X18_DISABILITY_bg.parquet \n", + "13 2021 acs_2021_X19_INCOME_bg.parquet \n", + "45 2022 acs_2022_X19_INCOME_bg.parquet \n", + "14 2021 acs_2021_X20_EARNINGS_bg.parquet \n", + "46 2022 acs_2022_X20_EARNINGS_bg.parquet \n", + "15 2021 acs_2021_X21_VETERAN_STATUS_bg.parquet \n", + "47 2022 acs_2022_X21_VETERAN_STATUS_bg.parquet \n", + "16 2021 acs_2021_X22_FOOD_STAMPS_bg.parquet \n", + "48 2022 acs_2022_X22_FOOD_STAMPS_bg.parquet \n", + "17 2021 acs_2021_X23_EMPLOYMENT_STATUS_bg.parquet \n", + "49 2022 acs_2022_X23_EMPLOYMENT_STATUS_bg.parquet \n", + "18 2021 acs_2021_X24_INDUSTRY_OCCUPATION_bg.parquet \n", + "50 2022 acs_2022_X24_INDUSTRY_OCCUPATION_bg.parquet \n", + "19 2021 acs_2021_X25_HOUSING_CHARACTERISTICS_bg.parquet \n", + "51 2022 acs_2022_X25_HOUSING_CHARACTERISTICS_bg.parquet \n", + "52 2022 acs_2022_X26_GROUP_QUARTERS_bg.parquet \n", + "20 2021 acs_2021_X27_HEALTH_INSURANCE_bg.parquet \n", + "53 2022 acs_2022_X27_HEALTH_INSURANCE_bg.parquet \n", + "21 2021 acs_2021_X28_COMPUTER_AND_INTERNET_USE_bg.parquet \n", + "54 2022 acs_2022_X28_COMPUTER_AND_INTERNET_USE_bg.parquet \n", + "22 2021 acs_2021_X29_VOTING_AGE_POPULATION_bg.parquet \n", + "55 2022 acs_2022_X29_VOTING_AGE_POPULATION_bg.parquet \n", + "56 2022 acs_2022_X98_UNWEIGHTED_HOUSING_UNIT_SAMPLE_bg... 
\n", + "23 2021 acs_2021_X99_IMPUTATION_bg.parquet \n", + "57 2022 acs_2022_X99_IMPUTATION_bg.parquet \n", + "\n", + " layer acs_canonical acs_new_style \\\n", + "0 ACS_2021_5YR_BG 0 0 \n", + "26 ACS_2022_5YR_BG 0 0 \n", + "24 ALL_BG 0 0 \n", + "58 ALL_BG 0 0 \n", + "25 DEMOGRAPHIC_PROFILE 4159 0 \n", + "59 DEMOGRAPHIC_PROFILE 0 4261 \n", + "1 X01_AGE_AND_SEX 80 0 \n", + "27 X01_AGE_AND_SEX 0 80 \n", + "2 X02_RACE 35 0 \n", + "28 X02_RACE 0 37 \n", + "3 X03_HISPANIC_OR_LATINO_ORIGIN 24 0 \n", + "29 X03_HISPANIC_OR_LATINO_ORIGIN 0 24 \n", + "30 X04_ANCESTRY 0 0 \n", + "31 X05_FOREIGN_BORN_CITIZENSHIP 0 0 \n", + "32 X06_PLACE_OF_BIRTH 0 0 \n", + "4 X07_MIGRATION 80 0 \n", + "33 X07_MIGRATION 0 80 \n", + "5 X08_COMMUTING 290 0 \n", + "34 X08_COMMUTING 0 290 \n", + "6 X09_CHILDREN_HOUSEHOLD_RELATIONSHIP 103 0 \n", + "35 X09_CHILDREN_HOUSEHOLD_RELATIONSHIP 0 103 \n", + "36 X10_GRANDPARENTS_GRANDCHILDREN 0 0 \n", + "7 X11_HOUSEHOLD_FAMILY_SUBFAMILIES 339 0 \n", + "37 X11_HOUSEHOLD_FAMILY_SUBFAMILIES 0 339 \n", + "8 X12_MARITAL_STATUS_AND_HISTORY 19 0 \n", + "38 X12_MARITAL_STATUS_AND_HISTORY 0 19 \n", + "39 X13_FERTILITY 0 0 \n", + "9 X14_SCHOOL_ENROLLMENT 268 0 \n", + "40 X14_SCHOOL_ENROLLMENT 0 268 \n", + "10 X15_EDUCATIONAL_ATTAINMENT 175 0 \n", + "41 X15_EDUCATIONAL_ATTAINMENT 0 175 \n", + "11 X16_LANGUAGE_SPOKEN_AT_HOME 81 0 \n", + "42 X16_LANGUAGE_SPOKEN_AT_HOME 0 81 \n", + "12 X17_POVERTY 151 0 \n", + "43 X17_POVERTY 0 151 \n", + "44 X18_DISABILITY 0 0 \n", + "13 X19_INCOME 225 0 \n", + "45 X19_INCOME 0 225 \n", + "14 X20_EARNINGS 60 0 \n", + "46 X20_EARNINGS 0 60 \n", + "15 X21_VETERAN_STATUS 86 0 \n", + "47 X21_VETERAN_STATUS 0 86 \n", + "16 X22_FOOD_STAMPS 7 0 \n", + "48 X22_FOOD_STAMPS 0 7 \n", + "17 X23_EMPLOYMENT_STATUS 331 0 \n", + "49 X23_EMPLOYMENT_STATUS 0 331 \n", + "18 X24_INDUSTRY_OCCUPATION 339 0 \n", + "50 X24_INDUSTRY_OCCUPATION 0 339 \n", + "19 X25_HOUSING_CHARACTERISTICS 870 0 \n", + "51 X25_HOUSING_CHARACTERISTICS 0 970 \n", + "52 
X26_GROUP_QUARTERS 0 0 \n", + "20 X27_HEALTH_INSURANCE 66 0 \n", + "53 X27_HEALTH_INSURANCE 0 66 \n", + "21 X28_COMPUTER_AND_INTERNET_USE 213 0 \n", + "54 X28_COMPUTER_AND_INTERNET_USE 0 213 \n", + "22 X29_VOTING_AGE_POPULATION 17 0 \n", + "55 X29_VOTING_AGE_POPULATION 0 17 \n", + "56 X98_UNWEIGHTED_HOUSING_UNIT_SAMPLE 0 0 \n", + "23 X99_IMPUTATION 300 0 \n", + "57 X99_IMPUTATION 0 300 \n", + "\n", + " geoid_like other n_changed_by_canonicalization \n", + "0 2 14 0 \n", + "26 2 14 0 \n", + "24 1 37 0 \n", + "58 1 1 0 \n", + "25 2 14 0 \n", + "59 1 15 4261 \n", + "1 1 0 0 \n", + "27 1 0 80 \n", + "2 1 0 0 \n", + "28 1 0 37 \n", + "3 1 0 0 \n", + "29 1 0 24 \n", + "30 1 0 0 \n", + "31 1 0 0 \n", + "32 1 0 0 \n", + "4 1 0 0 \n", + "33 1 0 80 \n", + "5 1 0 0 \n", + "34 1 0 290 \n", + "6 1 0 0 \n", + "35 1 0 103 \n", + "36 1 0 0 \n", + "7 1 0 0 \n", + "37 1 0 339 \n", + "8 1 0 0 \n", + "38 1 0 19 \n", + "39 1 0 0 \n", + "9 1 0 0 \n", + "40 1 0 268 \n", + "10 1 0 0 \n", + "41 1 0 175 \n", + "11 1 0 0 \n", + "42 1 0 81 \n", + "12 1 0 0 \n", + "43 1 0 151 \n", + "44 1 0 0 \n", + "13 1 0 0 \n", + "45 1 0 225 \n", + "14 1 0 0 \n", + "46 1 0 60 \n", + "15 1 0 0 \n", + "47 1 0 86 \n", + "16 1 0 0 \n", + "48 1 0 7 \n", + "17 1 0 0 \n", + "49 1 0 331 \n", + "18 1 0 0 \n", + "50 1 0 339 \n", + "19 1 0 0 \n", + "51 1 0 970 \n", + "52 1 0 0 \n", + "20 1 0 0 \n", + "53 1 0 66 \n", + "21 1 0 0 \n", + "54 1 0 213 \n", + "22 1 0 0 \n", + "55 1 0 17 \n", + "56 1 0 0 \n", + "23 1 0 0 \n", + "57 1 0 300 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "file_summary_df = (\n", + " all_columns_df\n", + " .groupby([\"year\", \"file\", \"layer\", \"classification\"])\n", + " .size()\n", + " .unstack(fill_value=0)\n", + " .reset_index()\n", + ")\n", + "\n", + "if \"acs_canonical\" not in file_summary_df.columns:\n", + " file_summary_df[\"acs_canonical\"] = 0\n", + "if \"acs_new_style\" not in file_summary_df.columns:\n", + " 
file_summary_df[\"acs_new_style\"] = 0\n", + "if \"geoid_like\" not in file_summary_df.columns:\n", + " file_summary_df[\"geoid_like\"] = 0\n", + "if \"other\" not in file_summary_df.columns:\n", + " file_summary_df[\"other\"] = 0\n", + "\n", + "changed_summary = (\n", + " all_columns_df\n", + " .groupby([\"year\", \"file\", \"layer\"])[\"changed\"]\n", + " .sum()\n", + " .reset_index(name=\"n_changed_by_canonicalization\")\n", + ")\n", + "\n", + "file_summary_df = file_summary_df.merge(\n", + " changed_summary,\n", + " on=[\"year\", \"file\", \"layer\"],\n", + " how=\"left\"\n", + ")\n", + "\n", + "display(file_summary_df.sort_values([\"layer\", \"year\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "7611da36-111c-4d2c-b138-03d6052fe05e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
layerexists_2021exists_2022n_cols_2021_rawn_cols_2022_rawraw_overlapcanonical_overlapraw_only_2021raw_only_2022canonical_only_2021canonical_only_2022
0ACS_2021_5YR_BGTrueFalse16000160160
1ACS_2022_5YR_BGFalseTrue01600016016
2ALL_BGTrueTrue38222360360
3DEMOGRAPHIC_PROFILETrueTrue41754277144173416142632104
4X01_AGE_AND_SEXTrueTrue8181080818111
5X02_RACETrueTrue3638035363813
6X03_HISPANIC_OR_LATINO_ORIGINTrueTrue2525024252511
7X04_ANCESTRYFalseTrue01000101
8X05_FOREIGN_BORN_CITIZENSHIPFalseTrue01000101
9X06_PLACE_OF_BIRTHFalseTrue01000101
10X07_MIGRATIONTrueTrue8181080818111
11X08_COMMUTINGTrueTrue291291029029129111
12X09_CHILDREN_HOUSEHOLD_RELATIONSHIPTrueTrue104104010310410411
13X10_GRANDPARENTS_GRANDCHILDRENFalseTrue01000101
14X11_HOUSEHOLD_FAMILY_SUBFAMILIESTrueTrue340340033934034011
15X12_MARITAL_STATUS_AND_HISTORYTrueTrue2020019202011
16X13_FERTILITYFalseTrue01000101
17X14_SCHOOL_ENROLLMENTTrueTrue269269026826926911
18X15_EDUCATIONAL_ATTAINMENTTrueTrue176176017517617611
19X16_LANGUAGE_SPOKEN_AT_HOMETrueTrue8282081828211
20X17_POVERTYTrueTrue152152015115215211
21X18_DISABILITYFalseTrue01000101
22X19_INCOMETrueTrue226226022522622611
23X20_EARNINGSTrueTrue6161060616111
24X21_VETERAN_STATUSTrueTrue8787086878711
25X22_FOOD_STAMPSTrueTrue88078811
26X23_EMPLOYMENT_STATUSTrueTrue332332033133233211
27X24_INDUSTRY_OCCUPATIONTrueTrue340340033934034011
28X25_HOUSING_CHARACTERISTICSTrueTrue87197108708719711101
29X26_GROUP_QUARTERSFalseTrue01000101
30X27_HEALTH_INSURANCETrueTrue6767066676711
31X28_COMPUTER_AND_INTERNET_USETrueTrue214214021321421411
32X29_VOTING_AGE_POPULATIONTrueTrue1818017181811
33X98_UNWEIGHTED_HOUSING_UNIT_SAMPLEFalseTrue01000101
34X99_IMPUTATIONTrueTrue301301030030130111
\n", + "
" + ], + "text/plain": [ + " layer exists_2021 exists_2022 \\\n", + "0 ACS_2021_5YR_BG True False \n", + "1 ACS_2022_5YR_BG False True \n", + "2 ALL_BG True True \n", + "3 DEMOGRAPHIC_PROFILE True True \n", + "4 X01_AGE_AND_SEX True True \n", + "5 X02_RACE True True \n", + "6 X03_HISPANIC_OR_LATINO_ORIGIN True True \n", + "7 X04_ANCESTRY False True \n", + "8 X05_FOREIGN_BORN_CITIZENSHIP False True \n", + "9 X06_PLACE_OF_BIRTH False True \n", + "10 X07_MIGRATION True True \n", + "11 X08_COMMUTING True True \n", + "12 X09_CHILDREN_HOUSEHOLD_RELATIONSHIP True True \n", + "13 X10_GRANDPARENTS_GRANDCHILDREN False True \n", + "14 X11_HOUSEHOLD_FAMILY_SUBFAMILIES True True \n", + "15 X12_MARITAL_STATUS_AND_HISTORY True True \n", + "16 X13_FERTILITY False True \n", + "17 X14_SCHOOL_ENROLLMENT True True \n", + "18 X15_EDUCATIONAL_ATTAINMENT True True \n", + "19 X16_LANGUAGE_SPOKEN_AT_HOME True True \n", + "20 X17_POVERTY True True \n", + "21 X18_DISABILITY False True \n", + "22 X19_INCOME True True \n", + "23 X20_EARNINGS True True \n", + "24 X21_VETERAN_STATUS True True \n", + "25 X22_FOOD_STAMPS True True \n", + "26 X23_EMPLOYMENT_STATUS True True \n", + "27 X24_INDUSTRY_OCCUPATION True True \n", + "28 X25_HOUSING_CHARACTERISTICS True True \n", + "29 X26_GROUP_QUARTERS False True \n", + "30 X27_HEALTH_INSURANCE True True \n", + "31 X28_COMPUTER_AND_INTERNET_USE True True \n", + "32 X29_VOTING_AGE_POPULATION True True \n", + "33 X98_UNWEIGHTED_HOUSING_UNIT_SAMPLE False True \n", + "34 X99_IMPUTATION True True \n", + "\n", + " n_cols_2021_raw n_cols_2022_raw raw_overlap canonical_overlap \\\n", + "0 16 0 0 0 \n", + "1 0 16 0 0 \n", + "2 38 2 2 2 \n", + "3 4175 4277 14 4173 \n", + "4 81 81 0 80 \n", + "5 36 38 0 35 \n", + "6 25 25 0 24 \n", + "7 0 1 0 0 \n", + "8 0 1 0 0 \n", + "9 0 1 0 0 \n", + "10 81 81 0 80 \n", + "11 291 291 0 290 \n", + "12 104 104 0 103 \n", + "13 0 1 0 0 \n", + "14 340 340 0 339 \n", + "15 20 20 0 19 \n", + "16 0 1 0 0 \n", + "17 269 269 0 268 \n", + 
"18 176 176 0 175 \n", + "19 82 82 0 81 \n", + "20 152 152 0 151 \n", + "21 0 1 0 0 \n", + "22 226 226 0 225 \n", + "23 61 61 0 60 \n", + "24 87 87 0 86 \n", + "25 8 8 0 7 \n", + "26 332 332 0 331 \n", + "27 340 340 0 339 \n", + "28 871 971 0 870 \n", + "29 0 1 0 0 \n", + "30 67 67 0 66 \n", + "31 214 214 0 213 \n", + "32 18 18 0 17 \n", + "33 0 1 0 0 \n", + "34 301 301 0 300 \n", + "\n", + " raw_only_2021 raw_only_2022 canonical_only_2021 canonical_only_2022 \n", + "0 16 0 16 0 \n", + "1 0 16 0 16 \n", + "2 36 0 36 0 \n", + "3 4161 4263 2 104 \n", + "4 81 81 1 1 \n", + "5 36 38 1 3 \n", + "6 25 25 1 1 \n", + "7 0 1 0 1 \n", + "8 0 1 0 1 \n", + "9 0 1 0 1 \n", + "10 81 81 1 1 \n", + "11 291 291 1 1 \n", + "12 104 104 1 1 \n", + "13 0 1 0 1 \n", + "14 340 340 1 1 \n", + "15 20 20 1 1 \n", + "16 0 1 0 1 \n", + "17 269 269 1 1 \n", + "18 176 176 1 1 \n", + "19 82 82 1 1 \n", + "20 152 152 1 1 \n", + "21 0 1 0 1 \n", + "22 226 226 1 1 \n", + "23 61 61 1 1 \n", + "24 87 87 1 1 \n", + "25 8 8 1 1 \n", + "26 332 332 1 1 \n", + "27 340 340 1 1 \n", + "28 871 971 1 101 \n", + "29 0 1 0 1 \n", + "30 67 67 1 1 \n", + "31 214 214 1 1 \n", + "32 18 18 1 1 \n", + "33 0 1 0 1 \n", + "34 301 301 1 1 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def layer_compare(layer: str) -> dict:\n", + " cols21 = set(read_parquet_columns(idx_2021[layer])) if layer in idx_2021 else set()\n", + " cols22 = set(read_parquet_columns(idx_2022[layer])) if layer in idx_2022 else set()\n", + "\n", + " canon21 = {canonicalize_column(c) for c in cols21}\n", + " canon22 = {canonicalize_column(c) for c in cols22}\n", + "\n", + " return {\n", + " \"layer\": layer,\n", + " \"exists_2021\": layer in idx_2021,\n", + " \"exists_2022\": layer in idx_2022,\n", + " \"n_cols_2021_raw\": len(cols21),\n", + " \"n_cols_2022_raw\": len(cols22),\n", + " \"raw_overlap\": len(cols21 & cols22),\n", + " \"canonical_overlap\": len(canon21 & canon22),\n", + " \"raw_only_2021\": 
len(cols21 - cols22),\n", + " \"raw_only_2022\": len(cols22 - cols21),\n", + " \"canonical_only_2021\": len(canon21 - canon22),\n", + " \"canonical_only_2022\": len(canon22 - canon21),\n", + " }\n", + "\n", + "layer_comparison_df = pd.DataFrame([layer_compare(layer) for layer in all_layers])\n", + "display(layer_comparison_df.sort_values(\"layer\"))\n", + "\n", + "\n", + "# Ignore the first two rows (the ACS_*_5YR_BG geometry layers). In the last two columns (canonical_only_*), a value of `1` is just the differently named GEOID column.\n", + "# Some tables have large differences: Demographic Profile, Housing Characteristics, and Race all have > 1\n", + "# Some tables are entirely new in 2022 (X04, X05, X06, X10, X13, X18, X26, X98)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "ffd516ce-4eca-49b1-9559-6e449fcf1285", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
layercanonicalraw_2021raw_2022different_raw_names
72DEMOGRAPHIC_PROFILEB01001_001EB01001_001EB01001_E001True
73DEMOGRAPHIC_PROFILEB01001_002EB01001_002EB01001_E002True
74DEMOGRAPHIC_PROFILEB01001_003EB01001_003EB01001_E003True
75DEMOGRAPHIC_PROFILEB01001_004EB01001_004EB01001_E004True
76DEMOGRAPHIC_PROFILEB01001_005EB01001_005EB01001_E005True
..................
8657X99_IMPUTATIONB99283_001EB99283_001EB99283_E001True
8658X99_IMPUTATIONB99283_002EB99283_002EB99283_E002True
8659X99_IMPUTATIONB99283_003EB99283_003EB99283_E003True
8660X99_IMPUTATIONB99283_004EB99283_004EB99283_E004True
8661X99_IMPUTATIONB99283_005EB99283_005EB99283_E005True
\n", + "

8318 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " layer canonical raw_2021 raw_2022 \\\n", + "72 DEMOGRAPHIC_PROFILE B01001_001E B01001_001E B01001_E001 \n", + "73 DEMOGRAPHIC_PROFILE B01001_002E B01001_002E B01001_E002 \n", + "74 DEMOGRAPHIC_PROFILE B01001_003E B01001_003E B01001_E003 \n", + "75 DEMOGRAPHIC_PROFILE B01001_004E B01001_004E B01001_E004 \n", + "76 DEMOGRAPHIC_PROFILE B01001_005E B01001_005E B01001_E005 \n", + "... ... ... ... ... \n", + "8657 X99_IMPUTATION B99283_001E B99283_001E B99283_E001 \n", + "8658 X99_IMPUTATION B99283_002E B99283_002E B99283_E002 \n", + "8659 X99_IMPUTATION B99283_003E B99283_003E B99283_E003 \n", + "8660 X99_IMPUTATION B99283_004E B99283_004E B99283_E004 \n", + "8661 X99_IMPUTATION B99283_005E B99283_005E B99283_E005 \n", + "\n", + " different_raw_names \n", + "72 True \n", + "73 True \n", + "74 True \n", + "75 True \n", + "76 True \n", + "... ... \n", + "8657 True \n", + "8658 True \n", + "8659 True \n", + "8660 True \n", + "8661 True \n", + "\n", + "[8318 rows x 5 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rows with same canonical variable but different raw names: 8318\n" + ] + } + ], + "source": [ + "canon_pairs = (\n", + " all_columns_df[[\"year\", \"layer\", \"column\", \"canonical\"]]\n", + " .drop_duplicates()\n", + ")\n", + "\n", + "canon_pivot = canon_pairs.pivot_table(\n", + " index=[\"layer\", \"canonical\"],\n", + " columns=\"year\",\n", + " values=\"column\",\n", + " aggfunc=\"first\"\n", + ").reset_index()\n", + "\n", + "canon_pivot.columns.name = None\n", + "if 2021 not in canon_pivot.columns:\n", + " canon_pivot[2021] = None\n", + "if 2022 not in canon_pivot.columns:\n", + " canon_pivot[2022] = None\n", + "\n", + "canon_pivot = canon_pivot.rename(columns={2021: \"raw_2021\", 2022: \"raw_2022\"})\n", + "canon_pivot[\"different_raw_names\"] = (\n", + " canon_pivot[\"raw_2021\"].notna() &\n", + " canon_pivot[\"raw_2022\"].notna() &\n", 
+ " (canon_pivot[\"raw_2021\"] != canon_pivot[\"raw_2022\"])\n", + ")\n", + "\n", + "renamed_df = canon_pivot[canon_pivot[\"different_raw_names\"]].sort_values([\"layer\", \"canonical\"])\n", + "\n", + "display(renamed_df)\n", + "print(\"Rows with same canonical variable but different raw names:\", len(renamed_df))" + ] } ], "metadata": { From cec1fea2609201342d8d3af300010362d036b4a8 Mon Sep 17 00:00:00 2001 From: Dylan Date: Mon, 13 Apr 2026 13:51:13 -0700 Subject: [PATCH 7/9] housing characteristics --- build/examine_output.ipynb | 571 ++++++++++++++++++++++++++++++++++++- 1 file changed, 568 insertions(+), 3 deletions(-) diff --git a/build/examine_output.ipynb b/build/examine_output.ipynb index 9d0587c3..6d51c410 100644 --- a/build/examine_output.ipynb +++ b/build/examine_output.ipynb @@ -2019,9 +2019,7 @@ "cell_type": "code", "execution_count": 44, "id": "ffbb0145-e40e-42b6-8793-77409c74dab4", - "metadata": { - "scrolled": true - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -5576,6 +5574,573 @@ "display(renamed_df)\n", "print(\"Rows with same canonical variable but different raw names:\", len(renamed_df))" ] + }, + { + "cell_type": "markdown", + "id": "858ac216-1a48-45ff-a944-2bc83ef2add3", + "metadata": {}, + "source": [ + "# Housing characteristics??\n", + "\n", + "Why is this table so much different? Are these really 100 new variables?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "0442576a-e1dd-4633-8af8-141234f0fff6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2021 exists: True ../build/2021_bg/acs_2021_X25_HOUSING_CHARACTERISTICS_bg.parquet\n", + "2022 exists: True ../build/2022_bg/acs_2022_X25_HOUSING_CHARACTERISTICS_bg.parquet\n" + ] + } + ], + "source": [ + "BUILD_ROOT = Path(\"../build\")\n", + "\n", + "file_2021 = BUILD_ROOT / \"2021_bg\" / \"acs_2021_X25_HOUSING_CHARACTERISTICS_bg.parquet\"\n", + "file_2022 = BUILD_ROOT / \"2022_bg\" / \"acs_2022_X25_HOUSING_CHARACTERISTICS_bg.parquet\"\n", + "\n", + "print(\"2021 exists:\", file_2021.exists(), file_2021)\n", + "print(\"2022 exists:\", file_2022.exists(), file_2022)" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "e704d78f-58d4-4627-bdff-2e5ef7598442", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "n_cols_2021_raw: 871\n", + "n_cols_2022_raw: 971\n" + ] + } + ], + "source": [ + "def read_parquet_columns(path: Path) -> list[str]:\n", + " return pq.ParquetFile(path).schema_arrow.names\n", + "\n", + "cols_2021 = read_parquet_columns(file_2021)\n", + "cols_2022 = read_parquet_columns(file_2022)\n", + "\n", + "print(\"n_cols_2021_raw:\", len(cols_2021))\n", + "print(\"n_cols_2022_raw:\", len(cols_2022))" + ] + }, + { + "cell_type": "markdown", + "id": "601a490f-c552-4b3b-86bc-124b7ddeb9b8", + "metadata": {}, + "source": [ + "Canonicalize the columns" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "dc4577d0-2679-490e-a143-1a9bdee2b0f9", + "metadata": {}, + "outputs": [], + "source": [ + "df21 = pd.DataFrame({\"raw_2021\": cols_2021})\n", + "df21[\"canonical\"] = df21[\"raw_2021\"].map(canonicalize_column)\n", + "\n", + "df22 = pd.DataFrame({\"raw_2022\": cols_2022})\n", + "df22[\"canonical\"] = df22[\"raw_2022\"].map(canonicalize_column)" + ] + }, + { + "cell_type": 
"code", + "execution_count": 75, + "id": "81387311-9998-4fa3-bd88-aaa8b7b21693", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
raw_2021canonicalraw_2022present_2021present_2022
0B25001_001EB25001_001EB25001_E001TrueTrue
1B25002_001EB25002_001EB25002_E001TrueTrue
2B25002_002EB25002_002EB25002_E002TrueTrue
3B25002_003EB25002_003EB25002_E003TrueTrue
4B25003A_001EB25003A_001EB25003A_E001TrueTrue
\n", + "
" + ], + "text/plain": [ + " raw_2021 canonical raw_2022 present_2021 present_2022\n", + "0 B25001_001E B25001_001E B25001_E001 True True\n", + "1 B25002_001E B25002_001E B25002_E001 True True\n", + "2 B25002_002E B25002_002E B25002_E002 True True\n", + "3 B25002_003E B25002_003E B25002_E003 True True\n", + "4 B25003A_001E B25003A_001E B25003A_E001 True True" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "aligned = (\n", + " df21.merge(df22, on=\"canonical\", how=\"outer\")\n", + " .sort_values(\"canonical\")\n", + " .reset_index(drop=True)\n", + ")\n", + "\n", + "aligned[\"present_2021\"] = aligned[\"raw_2021\"].notna()\n", + "aligned[\"present_2022\"] = aligned[\"raw_2022\"].notna()\n", + "\n", + "display(aligned.head(5))" + ] + }, + { + "cell_type": "markdown", + "id": "43fd67f6-9a62-404c-9bbc-c0788836e0e5", + "metadata": {}, + "source": [ + "## Now get vars only present in 2022" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "62bd73a2-cb6d-4565-a1d5-2106c55a1410", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ACS variables only in 2022: 100\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
raw_2021canonicalraw_2022present_2021present_2022
73NaNB25008A_001EB25008A_E001FalseTrue
74NaNB25008A_002EB25008A_E002FalseTrue
75NaNB25008A_003EB25008A_E003FalseTrue
76NaNB25008B_001EB25008B_E001FalseTrue
77NaNB25008B_002EB25008B_E002FalseTrue
\n", + "
" + ], + "text/plain": [ + " raw_2021 canonical raw_2022 present_2021 present_2022\n", + "73 NaN B25008A_001E B25008A_E001 False True\n", + "74 NaN B25008A_002E B25008A_E002 False True\n", + "75 NaN B25008A_003E B25008A_E003 False True\n", + "76 NaN B25008B_001E B25008B_E001 False True\n", + "77 NaN B25008B_002E B25008B_E002 False True" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "only_2022 = aligned[  # canonical names with a 2022 column but no 2021 column\n", + " aligned[\"raw_2021\"].isna() & aligned[\"raw_2022\"].notna()\n", + "].copy()\n", + "\n", + "acs_pattern = r\"^[A-Z0-9]+_\\d{3}[EM]$\"\n", + "\n", + "acs_only_2022 = only_2022[\n", + " only_2022[\"canonical\"].str.match(acs_pattern, na=False)\n", + "].copy()\n", + "\n", + "print(\"ACS variables only in 2022:\", len(acs_only_2022))\n", + "display(acs_only_2022.head())" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "75c5d294-23c5-4b59-9a08-bf593e70c620", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
table_prefixn_new_vars
0B2513613
1B2514013
2B251379
3B25008A3
4B25008E3
5B25008B3
6B25008C3
7B25008D3
8B25008I3
9B25010A3
10B25010B3
11B25010C3
12B25010D3
13B25008F3
14B25008G3
15B25008H3
16B25010G3
17B25010F3
18B25010E3
19B25010H3
20B25010I3
21B25077C1
22B25077A1
23B25077B1
24B25077F1
25B25077E1
26B25077D1
27B25077G1
28B25077I1
29B25077H1
30B251381
31B251391
\n", + "
" + ], + "text/plain": [ + " table_prefix n_new_vars\n", + "0 B25136 13\n", + "1 B25140 13\n", + "2 B25137 9\n", + "3 B25008A 3\n", + "4 B25008E 3\n", + "5 B25008B 3\n", + "6 B25008C 3\n", + "7 B25008D 3\n", + "8 B25008I 3\n", + "9 B25010A 3\n", + "10 B25010B 3\n", + "11 B25010C 3\n", + "12 B25010D 3\n", + "13 B25008F 3\n", + "14 B25008G 3\n", + "15 B25008H 3\n", + "16 B25010G 3\n", + "17 B25010F 3\n", + "18 B25010E 3\n", + "19 B25010H 3\n", + "20 B25010I 3\n", + "21 B25077C 1\n", + "22 B25077A 1\n", + "23 B25077B 1\n", + "24 B25077F 1\n", + "25 B25077E 1\n", + "26 B25077D 1\n", + "27 B25077G 1\n", + "28 B25077I 1\n", + "29 B25077H 1\n", + "30 B25138 1\n", + "31 B25139 1" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "acs_only_2022[\"prefix\"] = acs_only_2022[\"canonical\"].str.extract(r\"^([A-Z0-9]+)_\")\n", + "\n", + "prefix_counts = (\n", + " acs_only_2022[\"prefix\"]\n", + " .value_counts()\n", + " .rename_axis(\"table_prefix\")\n", + " .reset_index(name=\"n_new_vars\")\n", + ")\n", + "\n", + "display(prefix_counts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cfc8b9b7-dc30-443e-8905-fffde0057daa", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From 67bdab47e5f5bec63b6d97d61ab66121e00740cc Mon Sep 17 00:00:00 2001 From: Dylan Date: Tue, 14 Apr 2026 10:11:13 -0700 Subject: [PATCH 8/9] fix for 2022 var names 0.1 --- geosnap/io/util.py | 127 +++++++++++++++++++++++++++++++++------------ 1 file changed, 95 insertions(+), 32 deletions(-) diff --git a/geosnap/io/util.py b/geosnap/io/util.py index 020eb62d..eaf848d1 100644 --- a/geosnap/io/util.py +++ b/geosnap/io/util.py @@ -7,6 +7,7 @@ import pandas as pd import pooch from tqdm.auto import tqdm +import re def get_census_gdb(years=None, geom_level="blockgroup", output_dir=".", protocol="ftp"): @@ -54,25 +55,94 @@ def get_census_gdb(years=None, geom_level="blockgroup", output_dir=".", protocol pooch.retrieve(urls[protocol], None, 
progressbar=True, fname=fn, path=pth) -def reformat_acs_vars(col): - """Convert variable names to the same format used by the Census Detailed Tables API. - +def normalize_acs_vars(col): + """Normalize ACS variable names to the canonical Census API format. + See for variable descriptions + Supported conversions + ---------------- + Old-style TIGER_DP names: + B02001e1 -> B02001_001E + B19013e1 -> B19013_001E + + Newer TIGER_DP names: + B02001_E001 -> B02001_001E + B02001_M001 -> B02001_001M + + Already-canonical names: + B02001_001E -> B02001_001E + B02001_001M -> B02001_001M + + GEOID-like columns are returned unchanged. Parameters ---------- col : str - column name to adjust + Column name to adjust. Returns ------- str - reformatted column name + Normalized ACS-style column name. """ - pieces = col.split("e") - formatted = pieces[0] + "_" + pieces[1].rjust(3, "0") + "E" - return formatted + col = str(col).strip() + if col in {"GEOID", "GEOIDFQ", "GEOID_Data", "geometry"}: + return col + + # Older style: B02001e1 -> B02001_001E + old_style = re.match(r"^([A-Za-z0-9]+)e(\d+)$", col) + if old_style: + stem, num = old_style.groups() + return f"{stem.upper()}_{num.rjust(3, '0')}E" + + # 2022 style: B02001_E001 -> B02001_001E + new_style = re.match(r"^([A-Za-z0-9]+)_([EM])(\d{3})$", col, flags=re.IGNORECASE) + if new_style: + stem, suffix, num = new_style.groups() + return f"{stem.upper()}_{num}{suffix.upper()}" + + canonical = re.match(r"^([A-Za-z0-9]+)_(\d{3})([EM])$", col, flags=re.IGNORECASE) + if canonical: + stem, num, suffix = canonical.groups() + return f"{stem.upper()}_{num}{suffix.upper()}" + + return col + + +def find_geoid_column(columns): + """Identify the GEOID-like column in a set of column names. + + Supports naming conventions used across Census vintages, e.g.: + GEOID + GEOIDFQ + GEOID_Data + GEOID20, GEOID10, etc. 
+ + Parameters + ---------- + columns : iterable + Collection of column names (DataFrame.columns) + + Returns + ------- + str or None + Name of the detected GEOID-like column, or None if not found + """ + # Preferred explicit matches first (most stable) + priority = ["GEOID", "GEOIDFQ", "GEOID_Data"] + for candidate in priority: + if candidate in columns: + return candidate + + # Fallback: regex match for any GEOID-like column + for col in columns: + if re.match(r"^GEOID", str(col), flags=re.IGNORECASE): + return col + + # If no GEOID column found, warn + warn(f"No GEOID-like column found. Columns are: {list(columns)}") + return None def convert_census_gdb( @@ -138,8 +208,10 @@ def convert_census_gdb( year_suffix = year[-2:] meta_str = f"{level.upper()}_METADATA_20{year_suffix}" layers = [layer[0] for layer in ogr.list_layers(gdb_path)] - if meta_str in layers: - layers.remove(meta_str) + layers = [ + layer for layer in layers + if layer != meta_str and not layer.endswith("_METADATA") + ] tables = list() existing_files = os.listdir(output_dir) @@ -165,30 +237,21 @@ def convert_census_gdb( else: raw = dgpd.read_file(gdb_path, layer=i, npartitions=npartitions).compute() - if "GEOID" in raw.columns: - geoid_col = "GEOID" - elif "GEOIDFQ" in raw.columns: - geoid_col = "GEOIDFQ" - elif "GEOID_Data" in raw.columns: - geoid_col = "GEOID_Data" - else: - raise KeyError( - f"No GEOID-like column found in layer {i}. Columns are: {list(raw.columns)}" - ) - + geoid_col = find_geoid_column(raw.columns) + if geoid_col is None: + warn(f"Skipping layer {i} because no GEOID column was found") + continue + df = raw.set_index(geoid_col) - if "ACS_" not in i: # only the geoms have the ACS prefix - # newer vintages already use normalized names like B02001_E001. - # older vintages may still use names like B02001e1. 
- uppercase_estimates = df.columns[df.columns.str.contains("_E", regex=False)] - lowercase_estimates = df.columns[df.columns.str.contains("e", regex=False)] - - if len(uppercase_estimates) > 0: - df = df[uppercase_estimates] - else: - df = df[lowercase_estimates] - df.columns = pd.Series(df.columns).apply(reformat_acs_vars) + if "ACS_" not in i: # only the geoms have the ACS prefix + candidate_cols = df.columns[ + df.columns.str.contains("_E", regex=False) + | df.columns.str.contains("_M", regex=False) + | df.columns.str.contains("e", regex=False) + ] + df = df[candidate_cols] + df.columns = pd.Index([normalize_acs_vars(col) for col in df.columns]) df = df.dropna(axis=1, how="all") df.index = df.index.astype(str) From cbf51a25a166f9388cb13d5ab6df744fdbaf68f0 Mon Sep 17 00:00:00 2001 From: Dylan Date: Tue, 14 Apr 2026 12:18:31 -0700 Subject: [PATCH 9/9] fix for 2022 var names 0.2 --- build/examine_output.ipynb | 5175 +++++++++--------------------------- geosnap/io/util.py | 18 +- geosnap/io/variables.csv | 390 +-- 3 files changed, 1510 insertions(+), 4073 deletions(-) diff --git a/build/examine_output.ipynb b/build/examine_output.ipynb index 6d51c410..378766a0 100644 --- a/build/examine_output.ipynb +++ b/build/examine_output.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "42ace1ac-64d4-4918-899f-ccf1e285b215", "metadata": {}, "outputs": [], @@ -25,19 +25,10 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "e1730a77-37b0-431d-88d0-9bf9d092ce9c", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Still exist in 2022: 89\n", - "Gone in 2022: 0\n" - ] - } - ], + "outputs": [], "source": [ "missing = newly_missing_in_2022\n", "\n", @@ -64,106 +55,10 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "7d7c5d72-e33f-446e-86ff-f0bf309c683e", "metadata": {}, - "outputs": [ - { - "name": "stdout", - 
"output_type": "stream", - "text": [ - "B01001_003E -> Estimate!!Total:!!Male:!!Under 5 years\n", - "B01001_004E -> Estimate!!Total:!!Male:!!5 to 9 years\n", - "B01001_005E -> Estimate!!Total:!!Male:!!10 to 14 years\n", - "B01001_006E -> Estimate!!Total:!!Male:!!15 to 17 years\n", - "B01001_018E -> Estimate!!Total:!!Male:!!60 and 61 years\n", - "B01001_019E -> Estimate!!Total:!!Male:!!62 to 64 years\n", - "B01001_020E -> Estimate!!Total:!!Male:!!65 and 66 years\n", - "B01001_021E -> Estimate!!Total:!!Male:!!67 to 69 years\n", - "B01001_022E -> Estimate!!Total:!!Male:!!70 to 74 years\n", - "B01001_023E -> Estimate!!Total:!!Male:!!75 to 79 years\n", - "B01001_024E -> Estimate!!Total:!!Male:!!80 to 84 years\n", - "B01001_025E -> Estimate!!Total:!!Male:!!85 years and over\n", - "B01001_027E -> Estimate!!Total:!!Female:!!Under 5 years\n", - "B01001_028E -> Estimate!!Total:!!Female:!!5 to 9 years\n", - "B01001_029E -> Estimate!!Total:!!Female:!!10 to 14 years\n", - "B01001_030E -> Estimate!!Total:!!Female:!!15 to 17 years\n", - "B01001_042E -> Estimate!!Total:!!Female:!!60 and 61 years\n", - "B01001_043E -> Estimate!!Total:!!Female:!!62 to 64 years\n", - "B01001_044E -> Estimate!!Total:!!Female:!!65 and 66 years\n", - "B01001_045E -> Estimate!!Total:!!Female:!!67 to 69 years\n", - "B01001_046E -> Estimate!!Total:!!Female:!!70 to 74 years\n", - "B01001_047E -> Estimate!!Total:!!Female:!!75 to 79 years\n", - "B01001_048E -> Estimate!!Total:!!Female:!!80 to 84 years\n", - "B01001_049E -> Estimate!!Total:!!Female:!!85 years and over\n", - "B01003_001E -> Estimate!!Total\n", - "B02001_006E -> Estimate!!Total:!!Native Hawaiian and Other Pacific Islander alone\n", - "B03002_003E -> Estimate!!Total:!!Not Hispanic or Latino:!!White alone\n", - "B03002_004E -> Estimate!!Total:!!Not Hispanic or Latino:!!Black or African American alone\n", - "B03002_005E -> Estimate!!Total:!!Not Hispanic or Latino:!!American Indian and Alaska Native alone\n", - "B03002_006E -> Estimate!!Total:!!Not 
Hispanic or Latino:!!Asian alone\n", - "B03002_007E -> Estimate!!Total:!!Not Hispanic or Latino:!!Native Hawaiian and Other Pacific Islander alone\n", - "B03002_012E -> Estimate!!Total:!!Hispanic or Latino:\n", - "B12001_001E -> Estimate!!Total:\n", - "B12001_005E -> Estimate!!Total:!!Male:!!Now married:!!Married, spouse present\n", - "B12001_007E -> Estimate!!Total:!!Male:!!Now married:!!Married, spouse absent:!!Separated\n", - "B12001_009E -> Estimate!!Total:!!Male:!!Widowed\n", - "B12001_010E -> Estimate!!Total:!!Male:!!Divorced\n", - "B12001_016E -> Estimate!!Total:!!Female:!!Now married:!!Married, spouse absent:!!Separated\n", - "B12001_018E -> Estimate!!Total:!!Female:!!Widowed\n", - "B12001_019E -> Estimate!!Total:!!Female:!!Divorced\n", - "B15002_001E -> Estimate!!Total:\n", - "B15002_003E -> Estimate!!Total:!!Male:!!No schooling completed\n", - "B15002_004E -> Estimate!!Total:!!Male:!!Nursery to 4th grade\n", - "B15002_005E -> Estimate!!Total:!!Male:!!5th and 6th grade\n", - "B15002_006E -> Estimate!!Total:!!Male:!!7th and 8th grade\n", - "B15002_007E -> Estimate!!Total:!!Male:!!9th grade\n", - "B15002_008E -> Estimate!!Total:!!Male:!!10th grade\n", - "B15002_009E -> Estimate!!Total:!!Male:!!11th grade\n", - "B15002_010E -> Estimate!!Total:!!Male:!!12th grade, no diploma\n", - "B15002_015E -> Estimate!!Total:!!Male:!!Bachelor's degree\n", - "B15002_016E -> Estimate!!Total:!!Male:!!Master's degree\n", - "B15002_017E -> Estimate!!Total:!!Male:!!Professional school degree\n", - "B15002_018E -> Estimate!!Total:!!Male:!!Doctorate degree\n", - "B15002_020E -> Estimate!!Total:!!Female:!!No schooling completed\n", - "B15002_021E -> Estimate!!Total:!!Female:!!Nursery to 4th grade\n", - "B15002_022E -> Estimate!!Total:!!Female:!!5th and 6th grade\n", - "B15002_023E -> Estimate!!Total:!!Female:!!7th and 8th grade\n", - "B15002_024E -> Estimate!!Total:!!Female:!!9th grade\n", - "B15002_025E -> Estimate!!Total:!!Female:!!10th grade\n", - "B15002_026E -> 
Estimate!!Total:!!Female:!!11th grade\n", - "B15002_027E -> Estimate!!Total:!!Female:!!12th grade, no diploma\n", - "B15002_032E -> Estimate!!Total:!!Female:!!Bachelor's degree\n", - "B15002_033E -> Estimate!!Total:!!Female:!!Master's degree\n", - "B15002_034E -> Estimate!!Total:!!Female:!!Professional school degree\n", - "B15002_035E -> Estimate!!Total:!!Female:!!Doctorate degree\n", - "B17010_001E -> Estimate!!Total:\n", - "B17010_004E -> Estimate!!Total:!!Income in the past 12 months below poverty level:!!Married-couple family:!!With related children of the householder under 18 years:\n", - "B17010_011E -> Estimate!!Total:!!Income in the past 12 months below poverty level:!!Other family:!!Male householder, no spouse present:!!With related children of the householder under 18 years:\n", - "B17010_017E -> Estimate!!Total:!!Income in the past 12 months below poverty level:!!Other family:!!Female householder, no spouse present:!!With related children of the householder under 18 years:\n", - "B19001_001E -> Estimate!!Total:\n", - "B19013_001E -> Estimate!!Median household income in the past 12 months (in 2022 inflation-adjusted dollars)\n", - "B19301_001E -> Estimate!!Per capita income in the past 12 months (in 2022 inflation-adjusted dollars)\n", - "B21001_002E -> Estimate!!Total:!!Veteran\n", - "B25002_001E -> Estimate!!Total:\n", - "B25002_002E -> Estimate!!Total:!!Occupied\n", - "B25002_003E -> Estimate!!Total:!!Vacant\n", - "B25003_001E -> Estimate!!Total:\n", - "B25003_002E -> Estimate!!Total:!!Owner occupied\n", - "B25003_003E -> Estimate!!Total:!!Renter occupied\n", - "B25024_001E -> Estimate!!Total:\n", - "B25024_004E -> Estimate!!Total:!!2\n", - "B25024_005E -> Estimate!!Total:!!3 or 4\n", - "B25024_006E -> Estimate!!Total:!!5 to 9\n", - "B25024_007E -> Estimate!!Total:!!10 to 19\n", - "B25024_008E -> Estimate!!Total:!!20 to 49\n", - "B25024_009E -> Estimate!!Total:!!50 or more\n", - "B25058_001E -> Estimate!!Median contract rent\n", - "B25077_001E -> 
Estimate!!Median value (dollars)\n", - "C24010_001E -> Estimate!!Total:\n" - ] - } - ], + "outputs": [], "source": [ "for var in still_exist:\n", " print(var, \"->\", vars2022[var][\"label\"])" @@ -171,7 +66,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "a9b932cb-6f06-4c1e-8b81-bc47e95e9237", "metadata": {}, "outputs": [], @@ -182,35 +77,10 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "c6422685-f1be-4cdd-bb90-2dab660a20d8", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Unique table groups: 17\n", - "B01001: 24 vars\n", - "B01003: 1 vars\n", - "B02001: 1 vars\n", - "B03002: 6 vars\n", - "B12001: 8 vars\n", - "B15002: 25 vars\n", - "B17010: 4 vars\n", - "B19001: 1 vars\n", - "B19013: 1 vars\n", - "B19301: 1 vars\n", - "B21001: 1 vars\n", - "B25002: 3 vars\n", - "B25003: 3 vars\n", - "B25024: 7 vars\n", - "B25058: 1 vars\n", - "B25077: 1 vars\n", - "C24010: 1 vars\n" - ] - } - ], + "outputs": [], "source": [ "def variable_to_table_group(var: str) -> str | None:\n", " \"\"\"\n", @@ -283,254 +153,10 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "500ab2ae-734e-4c33-80b2-840dd3278ac7", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
variablelabelconceptpredicateTypegroup
40B01001_001EEstimate!!Total:Sex by AgeintB01001
39B01001_002EEstimate!!Total:!!Male:Sex by AgeintB01001
42B01001_003EEstimate!!Total:!!Male:!!Under 5 yearsSex by AgeintB01001
41B01001_004EEstimate!!Total:!!Male:!!5 to 9 yearsSex by AgeintB01001
45B01001_005EEstimate!!Total:!!Male:!!10 to 14 yearsSex by AgeintB01001
43B01001_006EEstimate!!Total:!!Male:!!15 to 17 yearsSex by AgeintB01001
44B01001_007EEstimate!!Total:!!Male:!!18 and 19 yearsSex by AgeintB01001
47B01001_008EEstimate!!Total:!!Male:!!20 yearsSex by AgeintB01001
46B01001_009EEstimate!!Total:!!Male:!!21 yearsSex by AgeintB01001
48B01001_010EEstimate!!Total:!!Male:!!22 to 24 yearsSex by AgeintB01001
1B01001_011EEstimate!!Total:!!Male:!!25 to 29 yearsSex by AgeintB01001
0B01001_012EEstimate!!Total:!!Male:!!30 to 34 yearsSex by AgeintB01001
3B01001_013EEstimate!!Total:!!Male:!!35 to 39 yearsSex by AgeintB01001
2B01001_014EEstimate!!Total:!!Male:!!40 to 44 yearsSex by AgeintB01001
5B01001_015EEstimate!!Total:!!Male:!!45 to 49 yearsSex by AgeintB01001
4B01001_016EEstimate!!Total:!!Male:!!50 to 54 yearsSex by AgeintB01001
7B01001_017EEstimate!!Total:!!Male:!!55 to 59 yearsSex by AgeintB01001
8B01001_018EEstimate!!Total:!!Male:!!60 and 61 yearsSex by AgeintB01001
6B01001_019EEstimate!!Total:!!Male:!!62 to 64 yearsSex by AgeintB01001
9B01001_020EEstimate!!Total:!!Male:!!65 and 66 yearsSex by AgeintB01001
\n", - "
" - ], - "text/plain": [ - " variable label concept \\\n", - "40 B01001_001E Estimate!!Total: Sex by Age \n", - "39 B01001_002E Estimate!!Total:!!Male: Sex by Age \n", - "42 B01001_003E Estimate!!Total:!!Male:!!Under 5 years Sex by Age \n", - "41 B01001_004E Estimate!!Total:!!Male:!!5 to 9 years Sex by Age \n", - "45 B01001_005E Estimate!!Total:!!Male:!!10 to 14 years Sex by Age \n", - "43 B01001_006E Estimate!!Total:!!Male:!!15 to 17 years Sex by Age \n", - "44 B01001_007E Estimate!!Total:!!Male:!!18 and 19 years Sex by Age \n", - "47 B01001_008E Estimate!!Total:!!Male:!!20 years Sex by Age \n", - "46 B01001_009E Estimate!!Total:!!Male:!!21 years Sex by Age \n", - "48 B01001_010E Estimate!!Total:!!Male:!!22 to 24 years Sex by Age \n", - "1 B01001_011E Estimate!!Total:!!Male:!!25 to 29 years Sex by Age \n", - "0 B01001_012E Estimate!!Total:!!Male:!!30 to 34 years Sex by Age \n", - "3 B01001_013E Estimate!!Total:!!Male:!!35 to 39 years Sex by Age \n", - "2 B01001_014E Estimate!!Total:!!Male:!!40 to 44 years Sex by Age \n", - "5 B01001_015E Estimate!!Total:!!Male:!!45 to 49 years Sex by Age \n", - "4 B01001_016E Estimate!!Total:!!Male:!!50 to 54 years Sex by Age \n", - "7 B01001_017E Estimate!!Total:!!Male:!!55 to 59 years Sex by Age \n", - "8 B01001_018E Estimate!!Total:!!Male:!!60 and 61 years Sex by Age \n", - "6 B01001_019E Estimate!!Total:!!Male:!!62 to 64 years Sex by Age \n", - "9 B01001_020E Estimate!!Total:!!Male:!!65 and 66 years Sex by Age \n", - "\n", - " predicateType group \n", - "40 int B01001 \n", - "39 int B01001 \n", - "42 int B01001 \n", - "41 int B01001 \n", - "45 int B01001 \n", - "43 int B01001 \n", - "44 int B01001 \n", - "47 int B01001 \n", - "46 int B01001 \n", - "48 int B01001 \n", - "1 int B01001 \n", - "0 int B01001 \n", - "3 int B01001 \n", - "2 int B01001 \n", - "5 int B01001 \n", - "4 int B01001 \n", - "7 int B01001 \n", - "8 int B01001 \n", - "6 int B01001 \n", - "9 int B01001 " - ] - }, - "execution_count": 16, - "metadata": {}, - 
"output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Example:\n", "describe_group(\"B01001\", vars2022).head(20)" @@ -550,7 +176,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 1, "id": "86339013-67d1-4f85-a213-a82608665c59", "metadata": {}, "outputs": [], @@ -567,7 +193,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 2, "id": "fe5f95eb-4a88-418b-82ee-9f7b21555570", "metadata": {}, "outputs": [ @@ -605,7 +231,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 3, "id": "7c4c5ad9-bade-4dfb-918e-ee3b49afbb50", "metadata": {}, "outputs": [], @@ -615,7 +241,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 4, "id": "6adaade2-f526-49b4-b798-5660fe8773ac", "metadata": {}, "outputs": [ @@ -865,7 +491,7 @@ "[5 rows x 268 columns]" ] }, - "execution_count": 24, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -876,7 +502,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 5, "id": "d21c2ddf-80fc-4e5f-92ac-7cccc49c880d", "metadata": {}, "outputs": [], @@ -886,7 +512,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 6, "id": "3d7aae74-69e3-4b3f-aba1-6a222190e1f7", "metadata": {}, "outputs": [ @@ -1128,7 +754,7 @@ "[5 rows x 268 columns]" ] }, - "execution_count": 29, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -1139,7 +765,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 7, "id": "e28b0089-3da0-4794-8bb6-fa606d5f5e8b", "metadata": {}, "outputs": [], @@ -1149,7 +775,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 8, "id": "d4700f62-dc2e-4097-bea6-78b9b6c4e482", "metadata": {}, "outputs": [ @@ -1399,7 +1025,7 @@ "[5 rows x 24 columns]" ] }, - "execution_count": 26, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -1410,7 +1036,7 @@ }, { "cell_type": 
"code", - "execution_count": 30, + "execution_count": 9, "id": "a2f23636-b9d8-47fb-90a2-3b5c94b12cbe", "metadata": {}, "outputs": [], @@ -1420,7 +1046,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 10, "id": "b39f8d07-ad85-40b2-94ba-425625aa3c75", "metadata": {}, "outputs": [ @@ -1662,7 +1288,7 @@ "[5 rows x 24 columns]" ] }, - "execution_count": 31, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -1681,7 +1307,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 11, "id": "0346257d-4d09-4e46-8692-856895d490dc", "metadata": {}, "outputs": [], @@ -1780,7 +1406,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 12, "id": "c7647e0c-b8e9-4271-b446-3ea81609bbe0", "metadata": {}, "outputs": [ @@ -1912,7 +1538,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 13, "id": "07bd7b42-cdb1-475f-9cd8-9d1adfbdadc5", "metadata": {}, "outputs": [], @@ -2017,7 +1643,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 14, "id": "ffbb0145-e40e-42b6-8793-77409c74dab4", "metadata": {}, "outputs": [ @@ -2043,7 +1669,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 15, "id": "450715be-8255-45b7-9c3c-b828d78ed639", "metadata": {}, "outputs": [ @@ -2466,19 +2092,103 @@ "This cell " ] }, + { + "cell_type": "markdown", + "id": "630c9973-5243-4bef-b807-fbfab1eb1623", + "metadata": {}, + "source": [ + "We want to see no columns changed for 2021, but many for 2022" + ] + }, + { + "cell_type": "markdown", + "id": "858ac216-1a48-45ff-a944-2bc83ef2add3", + "metadata": {}, + "source": [ + "# Housing characteristics??\n", + "\n", + "Why is this table so much different? Are these really 100 new variables?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "0442576a-e1dd-4633-8af8-141234f0fff6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2021 exists: True ../build/2021_bg/acs_2021_X25_HOUSING_CHARACTERISTICS_bg.parquet\n", + "2022 exists: True ../build/2022_bg/acs_2022_X25_HOUSING_CHARACTERISTICS_bg.parquet\n" + ] + } + ], + "source": [ + "BUILD_ROOT = Path(\"../build\")\n", + "\n", + "file_2021 = BUILD_ROOT / \"2021_bg\" / \"acs_2021_X25_HOUSING_CHARACTERISTICS_bg.parquet\"\n", + "file_2022 = BUILD_ROOT / \"2022_bg\" / \"acs_2022_X25_HOUSING_CHARACTERISTICS_bg.parquet\"\n", + "\n", + "print(\"2021 exists:\", file_2021.exists(), file_2021)\n", + "print(\"2022 exists:\", file_2022.exists(), file_2022)" + ] + }, { "cell_type": "code", - "execution_count": 50, - "id": "cee97f24-0ed4-42d5-acab-5e142c65599a", + "execution_count": 17, + "id": "e704d78f-58d4-4627-bdff-2e5ef7598442", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "2021 classification counts\n" + "n_cols_2021_raw: 871\n", + "n_cols_2022_raw: 971\n" ] - }, + } + ], + "source": [ + "def read_parquet_columns(path: Path) -> list[str]:\n", + " return pq.ParquetFile(path).schema_arrow.names\n", + "\n", + "cols_2021 = read_parquet_columns(file_2021)\n", + "cols_2022 = read_parquet_columns(file_2022)\n", + "\n", + "print(\"n_cols_2021_raw:\", len(cols_2021))\n", + "print(\"n_cols_2022_raw:\", len(cols_2022))" + ] + }, + { + "cell_type": "markdown", + "id": "601a490f-c552-4b3b-86bc-124b7ddeb9b8", + "metadata": {}, + "source": [ + "Canonicalize the columns" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "dc4577d0-2679-490e-a143-1a9bdee2b0f9", + "metadata": {}, + "outputs": [], + "source": [ + "df21 = pd.DataFrame({\"raw_2021\": cols_2021})\n", + "df21[\"canonical\"] = df21[\"raw_2021\"].map(canonicalize_column)\n", + "\n", + "df22 = pd.DataFrame({\"raw_2022\": cols_2022})\n", + 
"df22[\"canonical\"] = df22[\"raw_2022\"].map(canonicalize_column)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "81387311-9998-4fa3-bd88-aaa8b7b21693", + "metadata": {}, + "outputs": [ { "data": { "text/html": [ @@ -2500,85 +2210,65 @@ " \n", " \n", " \n", - " classification\n", - " count\n", + " raw_2021\n", + " canonical\n", + " raw_2022\n", + " present_2021\n", + " present_2022\n", " \n", " \n", " \n", " \n", " 0\n", - " acs_canonical\n", - " 35\n", + " B25001_001E\n", + " B25001_001E\n", + " B25001_E001\n", + " True\n", + " True\n", " \n", " \n", " 1\n", - " geoid_like\n", - " 1\n", + " B25002_001E\n", + " B25002_001E\n", + " B25002_E001\n", + " True\n", + " True\n", " \n", - " \n", - "\n", - "" - ], - "text/plain": [ - " classification count\n", - "0 acs_canonical 35\n", - "1 geoid_like 1" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2022 classification counts\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
classificationcount
2B25002_002EB25002_002EB25002_E002TrueTrue
0acs_new_style373B25002_003EB25002_003EB25002_E003TrueTrue
1geoid_like14B25003A_001EB25003A_001EB25003A_E001TrueTrue
\n", "
" ], "text/plain": [ - " classification count\n", - "0 acs_new_style 37\n", - "1 geoid_like 1" + " raw_2021 canonical raw_2022 present_2021 present_2022\n", + "0 B25001_001E B25001_001E B25001_E001 True True\n", + "1 B25002_001E B25002_001E B25002_E001 True True\n", + "2 B25002_002E B25002_002E B25002_E002 True True\n", + "3 B25002_003E B25002_003E B25002_E003 True True\n", + "4 B25003A_001E B25003A_001E B25003A_E001 True True" ] }, "metadata": {}, @@ -2586,95 +2276,386 @@ } ], "source": [ - "inspect21 = pd.DataFrame({\n", - " \"column\": cols21,\n", - " \"classification\": [classify_column(c) for c in cols21],\n", - " \"canonical\": [canonicalize_column(c) for c in cols21],\n", - " \"changed\": [c != canonicalize_column(c) for c in cols21],\n", - "})\n", - "inspect22 = pd.DataFrame({\n", - " \"column\": cols22,\n", - " \"classification\": [classify_column(c) for c in cols22],\n", - " \"canonical\": [canonicalize_column(c) for c in cols22],\n", - " \"changed\": [c != canonicalize_column(c) for c in cols22],\n", - "})\n", + "aligned = (\n", + " df21.merge(df22, on=\"canonical\", how=\"outer\")\n", + " .sort_values(\"canonical\")\n", + " .reset_index(drop=True)\n", + ")\n", "\n", - "print(\"2021 classification counts\")\n", - "display(inspect21[\"classification\"].value_counts().rename_axis(\"classification\").reset_index(name=\"count\"))\n", + "aligned[\"present_2021\"] = aligned[\"raw_2021\"].notna()\n", + "aligned[\"present_2022\"] = aligned[\"raw_2022\"].notna()\n", "\n", - "print(\"2022 classification counts\")\n", - "display(inspect22[\"classification\"].value_counts().rename_axis(\"classification\").reset_index(name=\"count\"))" + "display(aligned.head(5))" ] }, { "cell_type": "markdown", - "id": "630c9973-5243-4bef-b807-fbfab1eb1623", + "id": "cfc8b9b7-dc30-443e-8905-fffde0057daa", "metadata": {}, "source": [ - "We want to see no columns changed for 2021, but many for 2022" + "# Test the fix\n", + "\n", + "Pushed a change on 4/14/26" + ] + }, + { + 
"cell_type": "markdown", + "id": "e5d0658b-e91a-496d-bbae-1a7a0d1c3759", + "metadata": {}, + "source": [ + "## Synthetic Tests" ] }, { "cell_type": "code", - "execution_count": 51, - "id": "02e64267-af20-45de-a07f-1d6cbad20f32", + "execution_count": 20, + "id": "e31113d7-dd10-4027-8c4b-83035577effb", "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2021 columns changed by canonicalization\n" - ] - }, { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
columnclassificationcanonicalchanged
\n", - "
" - ], "text/plain": [ - "Empty DataFrame\n", - "Columns: [column, classification, canonical, changed]\n", - "Index: []" + "'/home/dylan/projects/geosnap/build'" ] }, + "execution_count": 20, "metadata": {}, - "output_type": "display_data" - }, + "output_type": "execute_result" + } + ], + "source": [ + "pwd" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "1c88c2e2-444b-4711-87b5-8f8745121d2d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Obtaining file:///home/dylan/projects/geosnap\n", + " Installing build dependencies ... \u001b[?25ldone\n", + "\u001b[?25h Checking if build backend supports build_editable ... \u001b[?25ldone\n", + "\u001b[?25h Getting requirements to build editable ... \u001b[?25ldone\n", + "\u001b[?25h Preparing editable metadata (pyproject.toml) ... \u001b[?25ldone\n", + "\u001b[?25hRequirement already satisfied: numpy in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.3.5)\n", + "Requirement already satisfied: pandas in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.3.3)\n", + "Requirement already satisfied: geopandas>=0.9 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.1.1)\n", + "Requirement already satisfied: matplotlib in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (3.10.8)\n", + "Requirement already satisfied: scikit-learn in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.8.0)\n", + "Requirement already satisfied: seaborn in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.13.2)\n", + "Requirement already satisfied: libpysal in 
/home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (4.13.0)\n", + "Requirement already satisfied: mapclassify in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.10.0)\n", + "Requirement already satisfied: giddy>=2.2.1 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.3.8)\n", + "Requirement already satisfied: xlrd in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.0.2)\n", + "Requirement already satisfied: platformdirs in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (4.5.1)\n", + "Requirement already satisfied: tqdm in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (4.67.1)\n", + "Requirement already satisfied: quilt3>=3.6 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (7.0.0)\n", + "Requirement already satisfied: pyarrow>=0.14.1 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (22.0.0)\n", + "Requirement already satisfied: contextily in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.7.0)\n", + "Requirement already satisfied: tobler>=0.8.2 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.12.1)\n", + "Requirement already satisfied: spopt>=0.3.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.7.0)\n", + "Requirement already satisfied: segregation>=2.1 in 
/home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.5.3)\n", + "Requirement already satisfied: pyproj>=3 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (3.7.2)\n", + "Requirement already satisfied: pandarm in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.0.3)\n", + "Requirement already satisfied: pooch in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.8.2)\n", + "Requirement already satisfied: ibis-framework in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (12.0.0)\n", + "Requirement already satisfied: packaging in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geopandas>=0.9->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (25.0)\n", + "Requirement already satisfied: shapely>=2.0.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geopandas>=0.9->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.1.2)\n", + "Requirement already satisfied: esda<2.9,>=2.7 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from giddy>=2.2.1->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.8.0)\n", + "Requirement already satisfied: quantecon>=0.8 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from giddy>=2.2.1->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.10.1)\n", + "Requirement already satisfied: scipy>=1.12 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from giddy>=2.2.1->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.16.3)\n", + "Requirement already satisfied: beautifulsoup4>=4.10 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from libpysal->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) 
(4.14.3)\n", + "Requirement already satisfied: requests>=2.27 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from libpysal->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.32.5)\n", + "Requirement already satisfied: soupsieve>=1.6.1 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from beautifulsoup4>=4.10->libpysal->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.8)\n", + "Requirement already satisfied: typing-extensions>=4.0.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from beautifulsoup4>=4.10->libpysal->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (4.15.0)\n", + "Requirement already satisfied: networkx>=3.2 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from mapclassify->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (3.6.1)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from pandas->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from pandas->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2025.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from pandas->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2025.2)\n", + "Requirement already satisfied: certifi in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from pyproj>=3->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2026.1.4)\n", + "Requirement already satisfied: six>=1.5 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from python-dateutil>=2.8.2->pandas->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.17.0)\n", + "Requirement already satisfied: numba>=0.49.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from 
quantecon>=0.8->giddy>=2.2.1->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.63.1)\n", + "Requirement already satisfied: sympy in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from quantecon>=0.8->giddy>=2.2.1->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.14.0)\n", + "Requirement already satisfied: llvmlite<0.47,>=0.46.0dev0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from numba>=0.49.0->quantecon>=0.8->giddy>=2.2.1->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.46.0)\n", + "Requirement already satisfied: boto3>=1.21.7 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.40.70)\n", + "Requirement already satisfied: jsonlines==1.2.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.2.0)\n", + "Requirement already satisfied: PyYAML>=5.1 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (6.0.3)\n", + "Requirement already satisfied: tenacity!=8.4.0,>=5.1.1 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (9.1.2)\n", + "Requirement already satisfied: requests_futures==1.0.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.0.0)\n", + "Requirement already satisfied: jsonschema<5,>=3 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (4.25.1)\n", + "Requirement already satisfied: pydantic<3.0.0,>=2.0.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.12.5)\n", + "Requirement already satisfied: attrs>=22.2.0 in 
/home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from jsonschema<5,>=3->quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (25.4.0)\n", + "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from jsonschema<5,>=3->quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2025.9.1)\n", + "Requirement already satisfied: referencing>=0.28.4 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from jsonschema<5,>=3->quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.37.0)\n", + "Requirement already satisfied: rpds-py>=0.7.1 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from jsonschema<5,>=3->quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.30.0)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from pydantic<3.0.0,>=2.0.0->quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.41.5 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from pydantic<3.0.0,>=2.0.0->quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.41.5)\n", + "Requirement already satisfied: typing-inspection>=0.4.2 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from pydantic<3.0.0,>=2.0.0->quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.4.2)\n", + "Requirement already satisfied: botocore<1.41.0,>=1.40.70 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from boto3>=1.21.7->quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.40.70)\n", + "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from boto3>=1.21.7->quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.0.1)\n", + "Requirement already satisfied: 
s3transfer<0.15.0,>=0.14.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from boto3>=1.21.7->quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.14.0)\n", + "Requirement already satisfied: urllib3!=2.2.0,<3,>=1.25.4 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from botocore<1.41.0,>=1.40.70->boto3>=1.21.7->quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.6.1)\n", + "Requirement already satisfied: charset_normalizer<4,>=2 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from requests>=2.27->libpysal->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (3.4.4)\n", + "Requirement already satisfied: idna<4,>=2.5 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from requests>=2.27->libpysal->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (3.11)\n", + "Requirement already satisfied: joblib>=1.3.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from scikit-learn->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.5.2)\n", + "Requirement already satisfied: threadpoolctl>=3.2.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from scikit-learn->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (3.6.0)\n", + "Requirement already satisfied: deprecation in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from segregation>=2.1->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.1.0)\n", + "Requirement already satisfied: pointpats>=2.4.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from spopt>=0.3.0->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.5.2)\n", + "Requirement already satisfied: pulp>=2.8 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from spopt>=0.3.0->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.8.0)\n", + "Requirement already satisfied: spaghetti>=1.7.4 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from 
spopt>=0.3.0->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.7.6)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from matplotlib->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.3.3)\n", + "Requirement already satisfied: cycler>=0.10 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from matplotlib->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from matplotlib->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (4.61.0)\n", + "Requirement already satisfied: kiwisolver>=1.3.1 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from matplotlib->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.4.9)\n", + "Requirement already satisfied: pillow>=8 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from matplotlib->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (12.0.0)\n", + "Requirement already satisfied: pyparsing>=3 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from matplotlib->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (3.2.5)\n", + "Requirement already satisfied: rtree>=1.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from spaghetti>=1.7.4->spopt>=0.3.0->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.4.1)\n", + "Requirement already satisfied: rasterio in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from tobler>=0.8.2->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.4.3)\n", + "Requirement already satisfied: statsmodels in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from tobler>=0.8.2->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.14.6)\n", + "Requirement already satisfied: rasterstats in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from 
tobler>=0.8.2->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.20.0)\n", + "Requirement already satisfied: geopy in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from contextily->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.4.1)\n", + "Requirement already satisfied: mercantile in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from contextily->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.2.1)\n", + "Requirement already satisfied: xyzservices in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from contextily->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2025.11.0)\n", + "Requirement already satisfied: geographiclib<3,>=1.52 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geopy->contextily->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.1)\n", + "Requirement already satisfied: atpublic>=2.3 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from ibis-framework->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (7.0.0)\n", + "Requirement already satisfied: parsy>=2 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from ibis-framework->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.2)\n", + "Requirement already satisfied: sqlglot!=26.32.0,>=23.4 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from ibis-framework->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (30.4.3)\n", + "Requirement already satisfied: toolz>=0.11 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from ibis-framework->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.1.0)\n", + "Requirement already satisfied: click>=3.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from mercantile->contextily->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (8.3.1)\n", + "Requirement already satisfied: tables>=3.1 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from 
pandarm->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (3.10.2)\n", + "Requirement already satisfied: numexpr>=2.6.2 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from tables>=3.1->pandarm->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.14.1)\n", + "Requirement already satisfied: py-cpuinfo in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from tables>=3.1->pandarm->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (9.0.0)\n", + "Requirement already satisfied: affine in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from rasterio->tobler>=0.8.2->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.4.0)\n", + "Requirement already satisfied: cligj>=0.5 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from rasterio->tobler>=0.8.2->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.7.2)\n", + "Requirement already satisfied: click-plugins in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from rasterio->tobler>=0.8.2->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.1.1.2)\n", + "Requirement already satisfied: fiona in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from rasterstats->tobler>=0.8.2->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.10.1)\n", + "Requirement already satisfied: simplejson in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from rasterstats->tobler>=0.8.2->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (3.20.2)\n", + "Requirement already satisfied: patsy>=0.5.6 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from statsmodels->tobler>=0.8.2->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.0.2)\n", + "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from sympy->quantecon>=0.8->giddy>=2.2.1->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.3.0)\n", + "Building wheels for collected packages: geosnap\n", + " Building editable for 
geosnap (pyproject.toml) ... \u001b[?25ldone\n", + "\u001b[?25h Created wheel for geosnap: filename=geosnap-0.16.1.dev34+g2f9cb9274.d20260414-0.editable-py3-none-any.whl size=8639 sha256=d324b655c7466dd269e80cd78beb59a593c7e69b5f00ccc85ec83ddc2dd95bd5\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-aoabk68n/wheels/50/05/f1/5afabb92124d2b3b9c0f2213aed7af8d580c81096a101a27a2\n", + "Successfully built geosnap\n", + "Installing collected packages: geosnap\n", + " Attempting uninstall: geosnap\n", + " Found existing installation: geosnap 0.16.1.dev34+g2f9cb9274.d20260414\n", + " Uninstalling geosnap-0.16.1.dev34+g2f9cb9274.d20260414:\n", + " Successfully uninstalled geosnap-0.16.1.dev34+g2f9cb9274.d20260414\n", + "Successfully installed geosnap-0.16.1.dev34+g2f9cb9274.d20260414\n" + ] + } + ], + "source": [ + "!pip install -e /home/dylan/projects/geosnap" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "1afee93e-6d67-426d-a950-c5d6cda5abf2", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages/numba/np/ufunc/parallel.py:373: NumbaWarning: \u001b[1mThe TBB threading layer requires TBB version 2021 update 6 or later i.e., TBB_INTERFACE_VERSION >= 12060. Found TBB_INTERFACE_VERSION = 12050. 
The TBB threading layer is disabled.\u001b[0m\n", + " warnings.warn(problem)\n" + ] + } + ], + "source": [ + "from geosnap.io.util import normalize_acs_vars" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "0a728115-10a0-484e-807b-79191367ad16", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "B02001e1 -> B02001_001E\n", + "B02001e12 -> B02001_012E\n", + "B02001_E001 -> B02001_001E\n", + "B02001_M001 -> B02001_001M\n", + "B02001_001E -> B02001_001E\n", + "B02001_001M -> B02001_001M\n", + "GEOID -> GEOID\n", + "GEOIDFQ -> GEOIDFQ\n", + "GEOID_Data -> GEOID_Data\n", + "geometry -> geometry\n", + "NAME -> NAME\n" + ] + } + ], + "source": [ + "tests = [\n", + " \"B02001e1\",\n", + " \"B02001e12\",\n", + " \"B02001_E001\",\n", + " \"B02001_M001\",\n", + " \"B02001_001E\",\n", + " \"B02001_001M\",\n", + " \"GEOID\",\n", + " \"GEOIDFQ\",\n", + " \"GEOID_Data\",\n", + " \"geometry\",\n", + " \"NAME\",\n", + "]\n", + "\n", + "for t in tests:\n", + " print(f\"{t:15} -> {normalize_acs_vars(t)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "ca1f46fb-c65c-43a7-bb7a-e80b2eb0ac32", + "metadata": {}, + "outputs": [], + "source": [ + "from geosnap.io.util import find_geoid_column" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "8fa4058c-e98f-4e3e-b75c-029fac66dd3a", + "metadata": {}, + "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "2022 columns changed by canonicalization\n" + "['GEOID', 'B01001_E001'] -> GEOID\n", + "['GEOIDFQ', 'B01001_E001'] -> GEOIDFQ\n", + "['GEOID_Data', 'B01001_E001'] -> GEOID_Data\n", + "['GEOID20', 'B01001_E001'] -> GEOID20\n", + "['NAME', 'B01001_E001'] -> None\n" ] }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/dylan/projects/geosnap/geosnap/io/util.py:144: UserWarning: No GEOID-like column found. Columns are: ['NAME', 'B01001_E001']\n", + " warn(f\"No GEOID-like column found. 
Columns are: {list(columns)}\")\n" + ] + } + ], + "source": [ + "cases = [\n", + " [\"GEOID\", \"B01001_E001\"],\n", + " [\"GEOIDFQ\", \"B01001_E001\"],\n", + " [\"GEOID_Data\", \"B01001_E001\"],\n", + " [\"GEOID20\", \"B01001_E001\"],\n", + " [\"NAME\", \"B01001_E001\"],\n", + "]\n", + "\n", + "for cols in cases:\n", + " print(cols, \"->\", find_geoid_column(cols))" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "75f32b9d-c11a-49a1-ac0e-6959b1e8f77b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "All columns:\n", + "['B02001e1', 'B02001e12', 'B02001_E001', 'B02001_M001', 'B02001_001E', 'B02001_001M', 'GEOID', 'GEOIDFQ', 'NAME', 'geometry', 'random_column', 'B02001X001']\n" + ] + } + ], + "source": [ + "cols = pd.Index([\n", + " # old style\n", + " \"B02001e1\", \"B02001e12\",\n", + "\n", + " # 2022 style\n", + " \"B02001_E001\", \"B02001_M001\",\n", + "\n", + " # canonical style\n", + " \"B02001_001E\", \"B02001_001M\",\n", + "\n", + " # noise / non-ACS\n", + " \"GEOID\", \"GEOIDFQ\", \"NAME\", \"geometry\",\n", + " \"random_column\", \"B02001X001\",\n", + "])\n", + "\n", + "df = pd.DataFrame(columns=cols)\n", + "\n", + "print(\"All columns:\")\n", + "print(list(df.columns))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "13c8b8ca-0650-4ed0-94a2-10232e61251b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Selected columns:\n", + "['B02001e1', 'B02001e12', 'B02001_E001', 'B02001_M001', 'B02001_001E', 'B02001_001M']\n" + ] + } + ], + "source": [ + "candidate_cols = df.columns[\n", + " df.columns.str.match(r\"^[A-Za-z0-9]+e\\d+$\", na=False) # old style\n", + " | df.columns.str.match(r\"^[A-Za-z0-9]+_[EM]\\d{3}$\", na=False) # 2022 style\n", + " | df.columns.str.match(r\"^[A-Za-z0-9]+_\\d{3}[EM]$\", na=False) # canonical style\n", + "]\n", + "\n", + "print(\"Selected columns:\")\n", + "print(list(candidate_cols))\n", + 
"# should not contain any GEOID or 'geometry' or 'random_column'" + ] + }, + { + "cell_type": "markdown", + "id": "c07cbfc6-fd89-47e4-95b1-36f3efeffb85", + "metadata": {}, + "source": [ + "# Examine output from `process_acs`\n", + "After turning the new conversion function loose, I applied the `process_acs` function on the resulting combined demographic profile. Let's look at both here" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "1f72c657-b3a7-4967-9957-51293ac16145", + "metadata": {}, + "outputs": [], + "source": [ + "demographic_profile = pd.read_parquet('2022_bg/acs_demographic_profile_2022_bg.parquet')\n", + "processed_acs = pd.read_parquet('2022_bg/acs_2022_bg_processed.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "b0fb731b-039f-47da-b5b0-cee0a574a0ba", + "metadata": {}, + "outputs": [ { "data": { "text/html": [ @@ -2696,259 +2677,366 @@ " \n", " \n", " \n", - " column\n", - " classification\n", - " canonical\n", - " changed\n", + " STATEFP\n", + " COUNTYFP\n", + " TRACTCE\n", + " BLKGRPCE\n", + " NAMELSAD\n", + " MTFCC\n", + " FUNCSTAT\n", + " ALAND\n", + " AWATER\n", + " INTPTLAT\n", + " ...\n", + " B01002H_003E\n", + " B01002H_003M\n", + " B01002I_001E\n", + " B01002I_001M\n", + " B01002I_002E\n", + " B01002I_002M\n", + " B01002I_003E\n", + " B01002I_003M\n", + " B01003_001E\n", + " B01003_001M\n", " \n", " \n", " \n", " \n", - " 0\n", - " B02001_E001\n", - " acs_new_style\n", - " B02001_001E\n", - " True\n", - " \n", - " \n", - " 1\n", - " B02001_E002\n", - " acs_new_style\n", - " B02001_002E\n", - " True\n", - " \n", - " \n", - " 2\n", - " B02001_E003\n", - " acs_new_style\n", - " B02001_003E\n", - " True\n", - " \n", - " \n", - " 3\n", - " B02001_E004\n", - " acs_new_style\n", - " B02001_004E\n", - " True\n", - " \n", - " \n", - " 4\n", - " B02001_E005\n", - " acs_new_style\n", - " B02001_005E\n", - " True\n", - " \n", - " \n", - " 5\n", - " B02001_E006\n", - " acs_new_style\n", - " 
B02001_006E\n", - " True\n", - " \n", - " \n", - " 6\n", - " B02001_E007\n", - " acs_new_style\n", - " B02001_007E\n", - " True\n", - " \n", - " \n", - " 7\n", - " B02001_E008\n", - " acs_new_style\n", - " B02001_008E\n", - " True\n", - " \n", - " \n", - " 8\n", - " B02001_E009\n", - " acs_new_style\n", - " B02001_009E\n", - " True\n", - " \n", - " \n", - " 9\n", - " B02001_E010\n", - " acs_new_style\n", - " B02001_010E\n", - " True\n", - " \n", - " \n", - " 10\n", - " B02008_E001\n", - " acs_new_style\n", - " B02008_001E\n", - " True\n", - " \n", - " \n", - " 11\n", - " B02009_E001\n", - " acs_new_style\n", - " B02009_001E\n", - " True\n", - " \n", - " \n", - " 12\n", - " B02010_E001\n", - " acs_new_style\n", - " B02010_001E\n", - " True\n", - " \n", - " \n", - " 13\n", - " B02011_E001\n", - " acs_new_style\n", - " B02011_001E\n", - " True\n", - " \n", - " \n", - " 14\n", - " B02012_E001\n", - " acs_new_style\n", - " B02012_001E\n", - " True\n", - " \n", - " \n", - " 15\n", - " B02013_E001\n", - " acs_new_style\n", - " B02013_001E\n", - " True\n", - " \n", - " \n", - " 16\n", - " C02003_E001\n", - " acs_new_style\n", - " C02003_001E\n", - " True\n", - " \n", - " \n", - " 17\n", - " C02003_E002\n", - " acs_new_style\n", - " C02003_002E\n", - " True\n", - " \n", - " \n", - " 18\n", - " C02003_E003\n", - " acs_new_style\n", - " C02003_003E\n", - " True\n", - " \n", - " \n", - " 19\n", - " C02003_E004\n", - " acs_new_style\n", - " C02003_004E\n", - " True\n", - " \n", - " \n", - " 20\n", - " C02003_E005\n", - " acs_new_style\n", - " C02003_005E\n", - " True\n", - " \n", - " \n", - " 21\n", - " C02003_E006\n", - " acs_new_style\n", - " C02003_006E\n", - " True\n", - " \n", - " \n", - " 22\n", - " C02003_E007\n", - " acs_new_style\n", - " C02003_007E\n", - " True\n", - " \n", - " \n", - " 23\n", - " C02003_E008\n", - " acs_new_style\n", - " C02003_008E\n", - " True\n", + " 010179548002\n", + " 01\n", + " 017\n", + " 954800\n", + " 2\n", + " Block Group 2\n", + " 
G5030\n", + " S\n", + " 1094218.0\n", + " 0.0\n", + " +32.8662046\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", - " 24\n", - " C02003_E009\n", - " acs_new_style\n", - " C02003_009E\n", - " True\n", + " 010179548004\n", + " 01\n", + " 017\n", + " 954800\n", + " 4\n", + " Block Group 4\n", + " G5030\n", + " S\n", + " 2392140.0\n", + " 0.0\n", + " +32.8482537\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", - " 25\n", - " C02003_E010\n", - " acs_new_style\n", - " C02003_010E\n", - " True\n", + " 010179548003\n", + " 01\n", + " 017\n", + " 954800\n", + " 3\n", + " Block Group 3\n", + " G5030\n", + " S\n", + " 902949.0\n", + " 0.0\n", + " +32.8577594\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", - " 26\n", - " C02003_E011\n", - " acs_new_style\n", - " C02003_011E\n", - " True\n", + " 010150011031\n", + " 01\n", + " 015\n", + " 001103\n", + " 1\n", + " Block Group 1\n", + " G5030\n", + " S\n", + " 2346322.0\n", + " 94061.0\n", + " +33.5892886\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", - " 27\n", - " C02003_E012\n", - " acs_new_style\n", - " C02003_012E\n", - " True\n", + " 010150024003\n", + " 01\n", + " 015\n", + " 002400\n", + " 3\n", + " Block Group 3\n", + " G5030\n", + " S\n", + " 38223047.0\n", + " 173264.0\n", + " +33.9079142\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", - " 28\n", - " C02003_E013\n", - " acs_new_style\n", - " C02003_013E\n", - " True\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " 
...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", " \n", " \n", - " 29\n", - " C02003_E014\n", - " acs_new_style\n", - " C02003_014E\n", - " True\n", + " 1500000US720210302002\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " NaN\n", + " NaN\n", + " None\n", + " ...\n", + " -666666666.0\n", + " -222222222.0\n", + " 57.6\n", + " 13.3\n", + " 53.7\n", + " 42.9\n", + " 58.5\n", + " 17.3\n", + " 597.0\n", + " 239.0\n", + " \n", + " \n", + " 1500000US720210314012\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " NaN\n", + " NaN\n", + " None\n", + " ...\n", + " -666666666.0\n", + " -222222222.0\n", + " 58.4\n", + " 12.9\n", + " 56.5\n", + " 13.9\n", + " 63.3\n", + " 15.0\n", + " 977.0\n", + " 285.0\n", + " \n", + " \n", + " 1500000US720210312021\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " NaN\n", + " NaN\n", + " None\n", + " ...\n", + " -666666666.0\n", + " -222222222.0\n", + " 44.5\n", + " 6.9\n", + " 49.2\n", + " 14.6\n", + " 43.7\n", + " 5.3\n", + " 1837.0\n", + " 372.0\n", + " \n", + " \n", + " 1500000US720531504003\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " NaN\n", + " NaN\n", + " None\n", + " ...\n", + " -666666666.0\n", + " -222222222.0\n", + " 38.7\n", + " 13.0\n", + " 33.8\n", + " 11.7\n", + " 47.1\n", + " 15.1\n", + " 1115.0\n", + " 365.0\n", + " \n", + " \n", + " 1500000US721153304003\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " NaN\n", + " NaN\n", + " None\n", + " ...\n", + " -666666666.0\n", + " -222222222.0\n", + " 40.2\n", + " 10.5\n", + " 40.2\n", + " 10.7\n", + " 40.3\n", + " 16.3\n", + " 1892.0\n", + " 618.0\n", " \n", " \n", "\n", + "

484672 rows × 8237 columns

\n", "" ], "text/plain": [ - " column classification canonical changed\n", - "0 B02001_E001 acs_new_style B02001_001E True\n", - "1 B02001_E002 acs_new_style B02001_002E True\n", - "2 B02001_E003 acs_new_style B02001_003E True\n", - "3 B02001_E004 acs_new_style B02001_004E True\n", - "4 B02001_E005 acs_new_style B02001_005E True\n", - "5 B02001_E006 acs_new_style B02001_006E True\n", - "6 B02001_E007 acs_new_style B02001_007E True\n", - "7 B02001_E008 acs_new_style B02001_008E True\n", - "8 B02001_E009 acs_new_style B02001_009E True\n", - "9 B02001_E010 acs_new_style B02001_010E True\n", - "10 B02008_E001 acs_new_style B02008_001E True\n", - "11 B02009_E001 acs_new_style B02009_001E True\n", - "12 B02010_E001 acs_new_style B02010_001E True\n", - "13 B02011_E001 acs_new_style B02011_001E True\n", - "14 B02012_E001 acs_new_style B02012_001E True\n", - "15 B02013_E001 acs_new_style B02013_001E True\n", - "16 C02003_E001 acs_new_style C02003_001E True\n", - "17 C02003_E002 acs_new_style C02003_002E True\n", - "18 C02003_E003 acs_new_style C02003_003E True\n", - "19 C02003_E004 acs_new_style C02003_004E True\n", - "20 C02003_E005 acs_new_style C02003_005E True\n", - "21 C02003_E006 acs_new_style C02003_006E True\n", - "22 C02003_E007 acs_new_style C02003_007E True\n", - "23 C02003_E008 acs_new_style C02003_008E True\n", - "24 C02003_E009 acs_new_style C02003_009E True\n", - "25 C02003_E010 acs_new_style C02003_010E True\n", - "26 C02003_E011 acs_new_style C02003_011E True\n", - "27 C02003_E012 acs_new_style C02003_012E True\n", - "28 C02003_E013 acs_new_style C02003_013E True\n", - "29 C02003_E014 acs_new_style C02003_014E True" + " STATEFP COUNTYFP TRACTCE BLKGRPCE NAMELSAD MTFCC \\\n", + "010179548002 01 017 954800 2 Block Group 2 G5030 \n", + "010179548004 01 017 954800 4 Block Group 4 G5030 \n", + "010179548003 01 017 954800 3 Block Group 3 G5030 \n", + "010150011031 01 015 001103 1 Block Group 1 G5030 \n", + "010150024003 01 015 002400 3 Block Group 3 G5030 \n", + 
"... ... ... ... ... ... ... \n", + "1500000US720210302002 None None None None None None \n", + "1500000US720210314012 None None None None None None \n", + "1500000US720210312021 None None None None None None \n", + "1500000US720531504003 None None None None None None \n", + "1500000US721153304003 None None None None None None \n", + "\n", + " FUNCSTAT ALAND AWATER INTPTLAT ... \\\n", + "010179548002 S 1094218.0 0.0 +32.8662046 ... \n", + "010179548004 S 2392140.0 0.0 +32.8482537 ... \n", + "010179548003 S 902949.0 0.0 +32.8577594 ... \n", + "010150011031 S 2346322.0 94061.0 +33.5892886 ... \n", + "010150024003 S 38223047.0 173264.0 +33.9079142 ... \n", + "... ... ... ... ... ... \n", + "1500000US720210302002 None NaN NaN None ... \n", + "1500000US720210314012 None NaN NaN None ... \n", + "1500000US720210312021 None NaN NaN None ... \n", + "1500000US720531504003 None NaN NaN None ... \n", + "1500000US721153304003 None NaN NaN None ... \n", + "\n", + " B01002H_003E B01002H_003M B01002I_001E B01002I_001M \\\n", + "010179548002 NaN NaN NaN NaN \n", + "010179548004 NaN NaN NaN NaN \n", + "010179548003 NaN NaN NaN NaN \n", + "010150011031 NaN NaN NaN NaN \n", + "010150024003 NaN NaN NaN NaN \n", + "... ... ... ... ... \n", + "1500000US720210302002 -666666666.0 -222222222.0 57.6 13.3 \n", + "1500000US720210314012 -666666666.0 -222222222.0 58.4 12.9 \n", + "1500000US720210312021 -666666666.0 -222222222.0 44.5 6.9 \n", + "1500000US720531504003 -666666666.0 -222222222.0 38.7 13.0 \n", + "1500000US721153304003 -666666666.0 -222222222.0 40.2 10.5 \n", + "\n", + " B01002I_002E B01002I_002M B01002I_003E B01002I_003M \\\n", + "010179548002 NaN NaN NaN NaN \n", + "010179548004 NaN NaN NaN NaN \n", + "010179548003 NaN NaN NaN NaN \n", + "010150011031 NaN NaN NaN NaN \n", + "010150024003 NaN NaN NaN NaN \n", + "... ... ... ... ... 
\n", + "1500000US720210302002 53.7 42.9 58.5 17.3 \n", + "1500000US720210314012 56.5 13.9 63.3 15.0 \n", + "1500000US720210312021 49.2 14.6 43.7 5.3 \n", + "1500000US720531504003 33.8 11.7 47.1 15.1 \n", + "1500000US721153304003 40.2 10.7 40.3 16.3 \n", + "\n", + " B01003_001E B01003_001M \n", + "010179548002 NaN NaN \n", + "010179548004 NaN NaN \n", + "010179548003 NaN NaN \n", + "010150011031 NaN NaN \n", + "010150024003 NaN NaN \n", + "... ... ... \n", + "1500000US720210302002 597.0 239.0 \n", + "1500000US720210314012 977.0 285.0 \n", + "1500000US720210312021 1837.0 372.0 \n", + "1500000US720531504003 1115.0 365.0 \n", + "1500000US721153304003 1892.0 618.0 \n", + "\n", + "[484672 rows x 8237 columns]" ] }, "metadata": {}, @@ -2956,17 +3044,13 @@ } ], "source": [ - "print(\"2021 columns changed by canonicalization\")\n", - "display(inspect21[inspect21[\"changed\"]].head(30))\n", - "\n", - "print(\"2022 columns changed by canonicalization\")\n", - "display(inspect22[inspect22[\"changed\"]].head(30))" + "display(demographic_profile)" ] }, { "cell_type": "code", - "execution_count": 52, - "id": "adc06724-05ea-496a-bbfd-c346768b991e", + "execution_count": 38, + "id": "27e38351-3998-423c-83bb-67de3b008f82", "metadata": {}, "outputs": [ { @@ -2990,3131 +3074,493 @@ " \n", " \n", " \n", - " layer\n", - " n_cols_2021_raw\n", - " n_cols_2022_raw\n", - " raw_overlap\n", - " canonical_overlap\n", - " raw_only_2021\n", - " raw_only_2022\n", - " canonical_only_2021\n", - " canonical_only_2022\n", + " n_total_housing_units\n", + " n_vacant_housing_units\n", + " n_occupied_housing_units\n", + " n_owner_occupied_housing_units\n", + " n_renter_occupied_housing_units\n", + " n_housing_units_multiunit_structures_denom\n", + " n_total_housing_units_sample\n", + " median_home_value\n", + " median_contract_rent\n", + " n_occupied_housing_units_sample\n", + " ...\n", + " p_owner_occupied_units\n", + " p_married\n", + " p_female_headed_families\n", + " p_nonhisp_white_persons\n", + " 
p_nonhisp_black_persons\n", + " p_hispanic_persons\n", + " p_native_persons\n", + " p_hawaiian_persons\n", + " p_veterans\n", + " geometry\n", " \n", - " \n", - " \n", " \n", - " 0\n", - " X02_RACE\n", - " 36\n", - " 38\n", - " 0\n", - " 35\n", - " 36\n", - " 38\n", - " 1\n", - " 3\n", - " \n", - " \n", - "\n", - "" - ], - "text/plain": [ - " layer n_cols_2021_raw n_cols_2022_raw raw_overlap canonical_overlap \\\n", - "0 X02_RACE 36 38 0 35 \n", - "\n", - " raw_only_2021 raw_only_2022 canonical_only_2021 canonical_only_2022 \n", - "0 36 38 1 3 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "raw_overlap = len(set(cols21) & set(cols22))\n", - "canon_overlap = len({canonicalize_column(c) for c in cols21} & {canonicalize_column(c) for c in cols22})\n", - "\n", - "comparison_df = pd.DataFrame([{\n", - " \"layer\": layer,\n", - " \"n_cols_2021_raw\": len(cols21),\n", - " \"n_cols_2022_raw\": len(cols22),\n", - " \"raw_overlap\": raw_overlap,\n", - " \"canonical_overlap\": canon_overlap,\n", - " \"raw_only_2021\": len(set(cols21) - set(cols22)),\n", - " \"raw_only_2022\": len(set(cols22) - set(cols21)),\n", - " \"canonical_only_2021\": len({canonicalize_column(c) for c in cols21} - {canonicalize_column(c) for c in cols22}),\n", - " \"canonical_only_2022\": len({canonicalize_column(c) for c in cols22} - {canonicalize_column(c) for c in cols21}),\n", - "}])\n", - "\n", - "display(comparison_df)" - ] - }, - { - "cell_type": "markdown", - "id": "f91994dd-0836-43a2-aaf3-822c51139d3f", - "metadata": {}, - "source": [ - "The variables specifically:" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "id": "96232773-2474-44aa-b785-504ed0f2885b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
canonicalraw_2021raw_2022different_raw_names
0B02001_001EB02001_001EB02001_E001True
1B02001_002EB02001_002EB02001_E002True
2B02001_003EB02001_003EB02001_E003True
3B02001_004EB02001_004EB02001_E004True
4B02001_005EB02001_005EB02001_E005True
5B02001_006EB02001_006EB02001_E006True
6B02001_007EB02001_007EB02001_E007True
7B02001_008EB02001_008EB02001_E008True
8B02001_009EB02001_009EB02001_E009True
9B02001_010EB02001_010EB02001_E010True
10B02008_001EB02008_001EB02008_E001True
11B02009_001EB02009_001EB02009_E001True
12B02010_001EB02010_001EB02010_E001True
13B02011_001EB02011_001EB02011_E001True
14B02012_001EB02012_001EB02012_E001True
15B02013_001EB02013_001EB02013_E001True
16C02003_001EC02003_001EC02003_E001True
17C02003_002EC02003_002EC02003_E002True
18C02003_003EC02003_003EC02003_E003True
19C02003_004EC02003_004EC02003_E004True
20C02003_005EC02003_005EC02003_E005True
21C02003_006EC02003_006EC02003_E006True
22C02003_007EC02003_007EC02003_E007True
23C02003_008EC02003_008EC02003_E008True
24C02003_009EC02003_009EC02003_E009True
25C02003_010EC02003_010EC02003_E010True
26C02003_011EC02003_011EC02003_E011True
27C02003_012EC02003_012EC02003_E012True
28C02003_013EC02003_013EC02003_E013True
29C02003_014EC02003_014EC02003_E014True
30C02003_015EC02003_015EC02003_E015True
31C02003_016EC02003_016EC02003_E016True
32C02003_017EC02003_017EC02003_E017True
33C02003_018EC02003_018EC02003_E018True
34C02003_019EC02003_019EC02003_E019True
\n", - "
" - ], - "text/plain": [ - " canonical raw_2021 raw_2022 different_raw_names\n", - "0 B02001_001E B02001_001E B02001_E001 True\n", - "1 B02001_002E B02001_002E B02001_E002 True\n", - "2 B02001_003E B02001_003E B02001_E003 True\n", - "3 B02001_004E B02001_004E B02001_E004 True\n", - "4 B02001_005E B02001_005E B02001_E005 True\n", - "5 B02001_006E B02001_006E B02001_E006 True\n", - "6 B02001_007E B02001_007E B02001_E007 True\n", - "7 B02001_008E B02001_008E B02001_E008 True\n", - "8 B02001_009E B02001_009E B02001_E009 True\n", - "9 B02001_010E B02001_010E B02001_E010 True\n", - "10 B02008_001E B02008_001E B02008_E001 True\n", - "11 B02009_001E B02009_001E B02009_E001 True\n", - "12 B02010_001E B02010_001E B02010_E001 True\n", - "13 B02011_001E B02011_001E B02011_E001 True\n", - "14 B02012_001E B02012_001E B02012_E001 True\n", - "15 B02013_001E B02013_001E B02013_E001 True\n", - "16 C02003_001E C02003_001E C02003_E001 True\n", - "17 C02003_002E C02003_002E C02003_E002 True\n", - "18 C02003_003E C02003_003E C02003_E003 True\n", - "19 C02003_004E C02003_004E C02003_E004 True\n", - "20 C02003_005E C02003_005E C02003_E005 True\n", - "21 C02003_006E C02003_006E C02003_E006 True\n", - "22 C02003_007E C02003_007E C02003_E007 True\n", - "23 C02003_008E C02003_008E C02003_E008 True\n", - "24 C02003_009E C02003_009E C02003_E009 True\n", - "25 C02003_010E C02003_010E C02003_E010 True\n", - "26 C02003_011E C02003_011E C02003_E011 True\n", - "27 C02003_012E C02003_012E C02003_E012 True\n", - "28 C02003_013E C02003_013E C02003_E013 True\n", - "29 C02003_014E C02003_014E C02003_E014 True\n", - "30 C02003_015E C02003_015E C02003_E015 True\n", - "31 C02003_016E C02003_016E C02003_E016 True\n", - "32 C02003_017E C02003_017E C02003_E017 True\n", - "33 C02003_018E C02003_018E C02003_E018 True\n", - "34 C02003_019E C02003_019E C02003_E019 True" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "canon21 = pd.DataFrame({\n", - " \"canonical\": 
[canonicalize_column(c) for c in cols21],\n", - " \"raw_2021\": cols21,\n", - "}).drop_duplicates()\n", - "\n", - "canon22 = pd.DataFrame({\n", - " \"canonical\": [canonicalize_column(c) for c in cols22],\n", - " \"raw_2022\": cols22,\n", - "}).drop_duplicates()\n", - "\n", - "aligned = canon21.merge(canon22, on=\"canonical\", how=\"outer\")\n", - "aligned[\"different_raw_names\"] = (\n", - " aligned[\"raw_2021\"].notna() &\n", - " aligned[\"raw_2022\"].notna() &\n", - " (aligned[\"raw_2021\"] != aligned[\"raw_2022\"])\n", - ")\n", - "\n", - "display(aligned[aligned[\"different_raw_names\"]])" - ] - }, - { - "cell_type": "markdown", - "id": "9f73607b-c8bc-4a67-8af2-914ed1344323", - "metadata": {}, - "source": [ - "The mismatch is almost entirely due to naming convention changes, not missing data.\n", - "\n", - "the canonicalization rule recovers ~97% (35/36) of variables.\n", - "\n", - "The remaining differences are small and real (1 dropped, 3 added), not a pipeline failure." - ] - }, - { - "cell_type": "markdown", - "id": "94c36107-74d7-457a-8237-9ba0da0c4141", - "metadata": {}, - "source": [ - "## New variables in 2022\n", - "\n", - "The two cells below show the unaligned variables that are not covered by the new rule." - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "id": "f0c52177-9a6e-40d9-86d9-f7057f5d137a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
canonicalraw_2021raw_2022different_raw_names
37GEOIDGEOIDNaNFalse
\n", - "
" - ], - "text/plain": [ - " canonical raw_2021 raw_2022 different_raw_names\n", - "37 GEOID GEOID NaN False" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display(aligned[aligned[\"raw_2022\"].isna()])" - ] - }, - { - "cell_type": "markdown", - "id": "40e4dde4-2029-458e-b2e6-215039994577", - "metadata": {}, - "source": [ - "The GEOID difference we know about, but the two variables below are new to this table this vintage." - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "id": "58e50b3d-e88a-4d2f-9354-78cb45e93061", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
canonicalraw_2021raw_2022different_raw_names
35C02003_020ENaNC02003_E020False
36C02003_021ENaNC02003_E021False
38GEOIDFQNaNGEOIDFQFalse
\n", - "
" - ], - "text/plain": [ - " canonical raw_2021 raw_2022 different_raw_names\n", - "35 C02003_020E NaN C02003_E020 False\n", - "36 C02003_021E NaN C02003_E021 False\n", - "38 GEOIDFQ NaN GEOIDFQ False" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display(aligned[aligned[\"raw_2021\"].isna()])" - ] - }, - { - "cell_type": "markdown", - "id": "ffd54d82-0269-4cba-80ea-0dc6f0f2d243", - "metadata": {}, - "source": [ - "## GEOID grepper\n", - "\n", - "hoping that `classify_columns` will recognize multiple GEOID-like columns" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "id": "d65d088f-8a9e-4751-b1cf-44e1b61cb631", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2021 GEOID-like columns: ['GEOID']\n", - "2022 GEOID-like columns: ['GEOIDFQ']\n" - ] - } - ], - "source": [ - "geoid21 = [c for c in cols21 if classify_column(c) == \"geoid_like\"]\n", - "geoid22 = [c for c in cols22 if classify_column(c) == \"geoid_like\"]\n", - "\n", - "print(\"2021 GEOID-like columns:\", geoid21)\n", - "print(\"2022 GEOID-like columns:\", geoid22)" - ] - }, - { - "cell_type": "markdown", - "id": "a9de6051-c55e-41dc-ba29-dce2444ade31", - "metadata": {}, - "source": [ - "# Inspect all layers\n", - "\n", - "Now we put these new functions onto all the files" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "id": "189808d9-6d21-44be-baf0-c5bf4da9b232", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
yearfilelayercolumnclassificationcanonicalchanged
02021acs_2021_ACS_2021_5YR_BG_bg.parquetACS_2021_5YR_BGSTATEFPotherSTATEFPFalse
12021acs_2021_ACS_2021_5YR_BG_bg.parquetACS_2021_5YR_BGCOUNTYFPotherCOUNTYFPFalse
22021acs_2021_ACS_2021_5YR_BG_bg.parquetACS_2021_5YR_BGTRACTCEotherTRACTCEFalse
32021acs_2021_ACS_2021_5YR_BG_bg.parquetACS_2021_5YR_BGBLKGRPCEotherBLKGRPCEFalse
42021acs_2021_ACS_2021_5YR_BG_bg.parquetACS_2021_5YR_BGNAMELSADotherNAMELSADFalse
\n", - "
" - ], - "text/plain": [ - " year file layer column \\\n", - "0 2021 acs_2021_ACS_2021_5YR_BG_bg.parquet ACS_2021_5YR_BG STATEFP \n", - "1 2021 acs_2021_ACS_2021_5YR_BG_bg.parquet ACS_2021_5YR_BG COUNTYFP \n", - "2 2021 acs_2021_ACS_2021_5YR_BG_bg.parquet ACS_2021_5YR_BG TRACTCE \n", - "3 2021 acs_2021_ACS_2021_5YR_BG_bg.parquet ACS_2021_5YR_BG BLKGRPCE \n", - "4 2021 acs_2021_ACS_2021_5YR_BG_bg.parquet ACS_2021_5YR_BG NAMELSAD \n", - "\n", - " classification canonical changed \n", - "0 other STATEFP False \n", - "1 other COUNTYFP False \n", - "2 other TRACTCE False \n", - "3 other BLKGRPCE False \n", - "4 other NAMELSAD False " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total inspected columns: 16998\n" - ] - } - ], - "source": [ - "def inspect_file(path: Path, year: int) -> pd.DataFrame:\n", - " cols = read_parquet_columns(path)\n", - " return pd.DataFrame({\n", - " \"year\": year,\n", - " \"file\": path.name,\n", - " \"layer\": layer_key(path),\n", - " \"column\": cols,\n", - " \"classification\": [classify_column(c) for c in cols],\n", - " \"canonical\": [canonicalize_column(c) for c in cols],\n", - " \"changed\": [c != canonicalize_column(c) for c in cols],\n", - " })\n", - "\n", - "\n", - "all_column_frames = []\n", - "\n", - "for layer in all_layers:\n", - " if layer in idx_2021:\n", - " all_column_frames.append(inspect_file(idx_2021[layer], 2021))\n", - " if layer in idx_2022:\n", - " all_column_frames.append(inspect_file(idx_2022[layer], 2022))\n", - "\n", - "all_columns_df = pd.concat(all_column_frames, ignore_index=True)\n", - "\n", - "display(all_columns_df.head()) # can look closer here\n", - "print(\"Total inspected columns:\", len(all_columns_df))" - ] - }, - { - "cell_type": "markdown", - "id": "b33d9bef-394c-478d-8a6c-ab347283a2d3", - "metadata": {}, - "source": [ - "## Summarize across all layers" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - 
"id": "f54a3278-cd56-44dd-9f2c-17407595ac16", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
yearfilelayeracs_canonicalacs_new_stylegeoid_likeothern_changed_by_canonicalization
02021acs_2021_ACS_2021_5YR_BG_bg.parquetACS_2021_5YR_BG002140
262022acs_2022_ACS_2022_5YR_BG_bg.parquetACS_2022_5YR_BG002140
242021acs_2021_bg.parquetALL_BG001370
582022acs_2022_bg.parquetALL_BG00110
252021acs_demographic_profile_2021_bg.parquetDEMOGRAPHIC_PROFILE415902140
592022acs_demographic_profile_2022_bg.parquetDEMOGRAPHIC_PROFILE042611154261
12021acs_2021_X01_AGE_AND_SEX_bg.parquetX01_AGE_AND_SEX800100
272022acs_2022_X01_AGE_AND_SEX_bg.parquetX01_AGE_AND_SEX0801080
22021acs_2021_X02_RACE_bg.parquetX02_RACE350100
282022acs_2022_X02_RACE_bg.parquetX02_RACE0371037
32021acs_2021_X03_HISPANIC_OR_LATINO_ORIGIN_bg.parquetX03_HISPANIC_OR_LATINO_ORIGIN240100
292022acs_2022_X03_HISPANIC_OR_LATINO_ORIGIN_bg.parquetX03_HISPANIC_OR_LATINO_ORIGIN0241024
302022acs_2022_X04_ANCESTRY_bg.parquetX04_ANCESTRY00100
312022acs_2022_X05_FOREIGN_BORN_CITIZENSHIP_bg.parquetX05_FOREIGN_BORN_CITIZENSHIP00100
322022acs_2022_X06_PLACE_OF_BIRTH_bg.parquetX06_PLACE_OF_BIRTH00100
42021acs_2021_X07_MIGRATION_bg.parquetX07_MIGRATION800100
332022acs_2022_X07_MIGRATION_bg.parquetX07_MIGRATION0801080
52021acs_2021_X08_COMMUTING_bg.parquetX08_COMMUTING2900100
342022acs_2022_X08_COMMUTING_bg.parquetX08_COMMUTING029010290
62021acs_2021_X09_CHILDREN_HOUSEHOLD_RELATIONSHIP_b...X09_CHILDREN_HOUSEHOLD_RELATIONSHIP1030100
352022acs_2022_X09_CHILDREN_HOUSEHOLD_RELATIONSHIP_b...X09_CHILDREN_HOUSEHOLD_RELATIONSHIP010310103
362022acs_2022_X10_GRANDPARENTS_GRANDCHILDREN_bg.par...X10_GRANDPARENTS_GRANDCHILDREN00100
72021acs_2021_X11_HOUSEHOLD_FAMILY_SUBFAMILIES_bg.p...X11_HOUSEHOLD_FAMILY_SUBFAMILIES3390100
372022acs_2022_X11_HOUSEHOLD_FAMILY_SUBFAMILIES_bg.p...X11_HOUSEHOLD_FAMILY_SUBFAMILIES033910339
82021acs_2021_X12_MARITAL_STATUS_AND_HISTORY_bg.par...X12_MARITAL_STATUS_AND_HISTORY190100
382022acs_2022_X12_MARITAL_STATUS_AND_HISTORY_bg.par...X12_MARITAL_STATUS_AND_HISTORY0191019
392022acs_2022_X13_FERTILITY_bg.parquetX13_FERTILITY00100
92021acs_2021_X14_SCHOOL_ENROLLMENT_bg.parquetX14_SCHOOL_ENROLLMENT2680100
402022acs_2022_X14_SCHOOL_ENROLLMENT_bg.parquetX14_SCHOOL_ENROLLMENT026810268
102021acs_2021_X15_EDUCATIONAL_ATTAINMENT_bg.parquetX15_EDUCATIONAL_ATTAINMENT1750100
412022acs_2022_X15_EDUCATIONAL_ATTAINMENT_bg.parquetX15_EDUCATIONAL_ATTAINMENT017510175
112021acs_2021_X16_LANGUAGE_SPOKEN_AT_HOME_bg.parquetX16_LANGUAGE_SPOKEN_AT_HOME810100
422022acs_2022_X16_LANGUAGE_SPOKEN_AT_HOME_bg.parquetX16_LANGUAGE_SPOKEN_AT_HOME0811081
122021acs_2021_X17_POVERTY_bg.parquetX17_POVERTY1510100
432022acs_2022_X17_POVERTY_bg.parquetX17_POVERTY015110151
442022acs_2022_X18_DISABILITY_bg.parquetX18_DISABILITY00100
132021acs_2021_X19_INCOME_bg.parquetX19_INCOME2250100
452022acs_2022_X19_INCOME_bg.parquetX19_INCOME022510225
142021acs_2021_X20_EARNINGS_bg.parquetX20_EARNINGS600100
462022acs_2022_X20_EARNINGS_bg.parquetX20_EARNINGS0601060
152021acs_2021_X21_VETERAN_STATUS_bg.parquetX21_VETERAN_STATUS860100
472022acs_2022_X21_VETERAN_STATUS_bg.parquetX21_VETERAN_STATUS0861086
162021acs_2021_X22_FOOD_STAMPS_bg.parquetX22_FOOD_STAMPS70100
482022acs_2022_X22_FOOD_STAMPS_bg.parquetX22_FOOD_STAMPS07107
172021acs_2021_X23_EMPLOYMENT_STATUS_bg.parquetX23_EMPLOYMENT_STATUS3310100
492022acs_2022_X23_EMPLOYMENT_STATUS_bg.parquetX23_EMPLOYMENT_STATUS033110331
182021acs_2021_X24_INDUSTRY_OCCUPATION_bg.parquetX24_INDUSTRY_OCCUPATION3390100
502022acs_2022_X24_INDUSTRY_OCCUPATION_bg.parquetX24_INDUSTRY_OCCUPATION033910339
192021acs_2021_X25_HOUSING_CHARACTERISTICS_bg.parquetX25_HOUSING_CHARACTERISTICS8700100
512022acs_2022_X25_HOUSING_CHARACTERISTICS_bg.parquetX25_HOUSING_CHARACTERISTICS097010970
522022acs_2022_X26_GROUP_QUARTERS_bg.parquetX26_GROUP_QUARTERS00100
202021acs_2021_X27_HEALTH_INSURANCE_bg.parquetX27_HEALTH_INSURANCE660100
532022acs_2022_X27_HEALTH_INSURANCE_bg.parquetX27_HEALTH_INSURANCE0661066
212021acs_2021_X28_COMPUTER_AND_INTERNET_USE_bg.parquetX28_COMPUTER_AND_INTERNET_USE2130100
542022acs_2022_X28_COMPUTER_AND_INTERNET_USE_bg.parquetX28_COMPUTER_AND_INTERNET_USE021310213
222021acs_2021_X29_VOTING_AGE_POPULATION_bg.parquetX29_VOTING_AGE_POPULATION170100
552022acs_2022_X29_VOTING_AGE_POPULATION_bg.parquetX29_VOTING_AGE_POPULATION0171017
562022acs_2022_X98_UNWEIGHTED_HOUSING_UNIT_SAMPLE_bg...X98_UNWEIGHTED_HOUSING_UNIT_SAMPLE00100
232021acs_2021_X99_IMPUTATION_bg.parquetX99_IMPUTATION3000100
572022acs_2022_X99_IMPUTATION_bg.parquetX99_IMPUTATION030010300
\n", - "
" - ], - "text/plain": [ - " year file \\\n", - "0 2021 acs_2021_ACS_2021_5YR_BG_bg.parquet \n", - "26 2022 acs_2022_ACS_2022_5YR_BG_bg.parquet \n", - "24 2021 acs_2021_bg.parquet \n", - "58 2022 acs_2022_bg.parquet \n", - "25 2021 acs_demographic_profile_2021_bg.parquet \n", - "59 2022 acs_demographic_profile_2022_bg.parquet \n", - "1 2021 acs_2021_X01_AGE_AND_SEX_bg.parquet \n", - "27 2022 acs_2022_X01_AGE_AND_SEX_bg.parquet \n", - "2 2021 acs_2021_X02_RACE_bg.parquet \n", - "28 2022 acs_2022_X02_RACE_bg.parquet \n", - "3 2021 acs_2021_X03_HISPANIC_OR_LATINO_ORIGIN_bg.parquet \n", - "29 2022 acs_2022_X03_HISPANIC_OR_LATINO_ORIGIN_bg.parquet \n", - "30 2022 acs_2022_X04_ANCESTRY_bg.parquet \n", - "31 2022 acs_2022_X05_FOREIGN_BORN_CITIZENSHIP_bg.parquet \n", - "32 2022 acs_2022_X06_PLACE_OF_BIRTH_bg.parquet \n", - "4 2021 acs_2021_X07_MIGRATION_bg.parquet \n", - "33 2022 acs_2022_X07_MIGRATION_bg.parquet \n", - "5 2021 acs_2021_X08_COMMUTING_bg.parquet \n", - "34 2022 acs_2022_X08_COMMUTING_bg.parquet \n", - "6 2021 acs_2021_X09_CHILDREN_HOUSEHOLD_RELATIONSHIP_b... \n", - "35 2022 acs_2022_X09_CHILDREN_HOUSEHOLD_RELATIONSHIP_b... \n", - "36 2022 acs_2022_X10_GRANDPARENTS_GRANDCHILDREN_bg.par... \n", - "7 2021 acs_2021_X11_HOUSEHOLD_FAMILY_SUBFAMILIES_bg.p... \n", - "37 2022 acs_2022_X11_HOUSEHOLD_FAMILY_SUBFAMILIES_bg.p... \n", - "8 2021 acs_2021_X12_MARITAL_STATUS_AND_HISTORY_bg.par... \n", - "38 2022 acs_2022_X12_MARITAL_STATUS_AND_HISTORY_bg.par... 
\n", - "39 2022 acs_2022_X13_FERTILITY_bg.parquet \n", - "9 2021 acs_2021_X14_SCHOOL_ENROLLMENT_bg.parquet \n", - "40 2022 acs_2022_X14_SCHOOL_ENROLLMENT_bg.parquet \n", - "10 2021 acs_2021_X15_EDUCATIONAL_ATTAINMENT_bg.parquet \n", - "41 2022 acs_2022_X15_EDUCATIONAL_ATTAINMENT_bg.parquet \n", - "11 2021 acs_2021_X16_LANGUAGE_SPOKEN_AT_HOME_bg.parquet \n", - "42 2022 acs_2022_X16_LANGUAGE_SPOKEN_AT_HOME_bg.parquet \n", - "12 2021 acs_2021_X17_POVERTY_bg.parquet \n", - "43 2022 acs_2022_X17_POVERTY_bg.parquet \n", - "44 2022 acs_2022_X18_DISABILITY_bg.parquet \n", - "13 2021 acs_2021_X19_INCOME_bg.parquet \n", - "45 2022 acs_2022_X19_INCOME_bg.parquet \n", - "14 2021 acs_2021_X20_EARNINGS_bg.parquet \n", - "46 2022 acs_2022_X20_EARNINGS_bg.parquet \n", - "15 2021 acs_2021_X21_VETERAN_STATUS_bg.parquet \n", - "47 2022 acs_2022_X21_VETERAN_STATUS_bg.parquet \n", - "16 2021 acs_2021_X22_FOOD_STAMPS_bg.parquet \n", - "48 2022 acs_2022_X22_FOOD_STAMPS_bg.parquet \n", - "17 2021 acs_2021_X23_EMPLOYMENT_STATUS_bg.parquet \n", - "49 2022 acs_2022_X23_EMPLOYMENT_STATUS_bg.parquet \n", - "18 2021 acs_2021_X24_INDUSTRY_OCCUPATION_bg.parquet \n", - "50 2022 acs_2022_X24_INDUSTRY_OCCUPATION_bg.parquet \n", - "19 2021 acs_2021_X25_HOUSING_CHARACTERISTICS_bg.parquet \n", - "51 2022 acs_2022_X25_HOUSING_CHARACTERISTICS_bg.parquet \n", - "52 2022 acs_2022_X26_GROUP_QUARTERS_bg.parquet \n", - "20 2021 acs_2021_X27_HEALTH_INSURANCE_bg.parquet \n", - "53 2022 acs_2022_X27_HEALTH_INSURANCE_bg.parquet \n", - "21 2021 acs_2021_X28_COMPUTER_AND_INTERNET_USE_bg.parquet \n", - "54 2022 acs_2022_X28_COMPUTER_AND_INTERNET_USE_bg.parquet \n", - "22 2021 acs_2021_X29_VOTING_AGE_POPULATION_bg.parquet \n", - "55 2022 acs_2022_X29_VOTING_AGE_POPULATION_bg.parquet \n", - "56 2022 acs_2022_X98_UNWEIGHTED_HOUSING_UNIT_SAMPLE_bg... 
\n", - "23 2021 acs_2021_X99_IMPUTATION_bg.parquet \n", - "57 2022 acs_2022_X99_IMPUTATION_bg.parquet \n", - "\n", - " layer acs_canonical acs_new_style \\\n", - "0 ACS_2021_5YR_BG 0 0 \n", - "26 ACS_2022_5YR_BG 0 0 \n", - "24 ALL_BG 0 0 \n", - "58 ALL_BG 0 0 \n", - "25 DEMOGRAPHIC_PROFILE 4159 0 \n", - "59 DEMOGRAPHIC_PROFILE 0 4261 \n", - "1 X01_AGE_AND_SEX 80 0 \n", - "27 X01_AGE_AND_SEX 0 80 \n", - "2 X02_RACE 35 0 \n", - "28 X02_RACE 0 37 \n", - "3 X03_HISPANIC_OR_LATINO_ORIGIN 24 0 \n", - "29 X03_HISPANIC_OR_LATINO_ORIGIN 0 24 \n", - "30 X04_ANCESTRY 0 0 \n", - "31 X05_FOREIGN_BORN_CITIZENSHIP 0 0 \n", - "32 X06_PLACE_OF_BIRTH 0 0 \n", - "4 X07_MIGRATION 80 0 \n", - "33 X07_MIGRATION 0 80 \n", - "5 X08_COMMUTING 290 0 \n", - "34 X08_COMMUTING 0 290 \n", - "6 X09_CHILDREN_HOUSEHOLD_RELATIONSHIP 103 0 \n", - "35 X09_CHILDREN_HOUSEHOLD_RELATIONSHIP 0 103 \n", - "36 X10_GRANDPARENTS_GRANDCHILDREN 0 0 \n", - "7 X11_HOUSEHOLD_FAMILY_SUBFAMILIES 339 0 \n", - "37 X11_HOUSEHOLD_FAMILY_SUBFAMILIES 0 339 \n", - "8 X12_MARITAL_STATUS_AND_HISTORY 19 0 \n", - "38 X12_MARITAL_STATUS_AND_HISTORY 0 19 \n", - "39 X13_FERTILITY 0 0 \n", - "9 X14_SCHOOL_ENROLLMENT 268 0 \n", - "40 X14_SCHOOL_ENROLLMENT 0 268 \n", - "10 X15_EDUCATIONAL_ATTAINMENT 175 0 \n", - "41 X15_EDUCATIONAL_ATTAINMENT 0 175 \n", - "11 X16_LANGUAGE_SPOKEN_AT_HOME 81 0 \n", - "42 X16_LANGUAGE_SPOKEN_AT_HOME 0 81 \n", - "12 X17_POVERTY 151 0 \n", - "43 X17_POVERTY 0 151 \n", - "44 X18_DISABILITY 0 0 \n", - "13 X19_INCOME 225 0 \n", - "45 X19_INCOME 0 225 \n", - "14 X20_EARNINGS 60 0 \n", - "46 X20_EARNINGS 0 60 \n", - "15 X21_VETERAN_STATUS 86 0 \n", - "47 X21_VETERAN_STATUS 0 86 \n", - "16 X22_FOOD_STAMPS 7 0 \n", - "48 X22_FOOD_STAMPS 0 7 \n", - "17 X23_EMPLOYMENT_STATUS 331 0 \n", - "49 X23_EMPLOYMENT_STATUS 0 331 \n", - "18 X24_INDUSTRY_OCCUPATION 339 0 \n", - "50 X24_INDUSTRY_OCCUPATION 0 339 \n", - "19 X25_HOUSING_CHARACTERISTICS 870 0 \n", - "51 X25_HOUSING_CHARACTERISTICS 0 970 \n", - "52 
X26_GROUP_QUARTERS 0 0 \n", - "20 X27_HEALTH_INSURANCE 66 0 \n", - "53 X27_HEALTH_INSURANCE 0 66 \n", - "21 X28_COMPUTER_AND_INTERNET_USE 213 0 \n", - "54 X28_COMPUTER_AND_INTERNET_USE 0 213 \n", - "22 X29_VOTING_AGE_POPULATION 17 0 \n", - "55 X29_VOTING_AGE_POPULATION 0 17 \n", - "56 X98_UNWEIGHTED_HOUSING_UNIT_SAMPLE 0 0 \n", - "23 X99_IMPUTATION 300 0 \n", - "57 X99_IMPUTATION 0 300 \n", - "\n", - " geoid_like other n_changed_by_canonicalization \n", - "0 2 14 0 \n", - "26 2 14 0 \n", - "24 1 37 0 \n", - "58 1 1 0 \n", - "25 2 14 0 \n", - "59 1 15 4261 \n", - "1 1 0 0 \n", - "27 1 0 80 \n", - "2 1 0 0 \n", - "28 1 0 37 \n", - "3 1 0 0 \n", - "29 1 0 24 \n", - "30 1 0 0 \n", - "31 1 0 0 \n", - "32 1 0 0 \n", - "4 1 0 0 \n", - "33 1 0 80 \n", - "5 1 0 0 \n", - "34 1 0 290 \n", - "6 1 0 0 \n", - "35 1 0 103 \n", - "36 1 0 0 \n", - "7 1 0 0 \n", - "37 1 0 339 \n", - "8 1 0 0 \n", - "38 1 0 19 \n", - "39 1 0 0 \n", - "9 1 0 0 \n", - "40 1 0 268 \n", - "10 1 0 0 \n", - "41 1 0 175 \n", - "11 1 0 0 \n", - "42 1 0 81 \n", - "12 1 0 0 \n", - "43 1 0 151 \n", - "44 1 0 0 \n", - "13 1 0 0 \n", - "45 1 0 225 \n", - "14 1 0 0 \n", - "46 1 0 60 \n", - "15 1 0 0 \n", - "47 1 0 86 \n", - "16 1 0 0 \n", - "48 1 0 7 \n", - "17 1 0 0 \n", - "49 1 0 331 \n", - "18 1 0 0 \n", - "50 1 0 339 \n", - "19 1 0 0 \n", - "51 1 0 970 \n", - "52 1 0 0 \n", - "20 1 0 0 \n", - "53 1 0 66 \n", - "21 1 0 0 \n", - "54 1 0 213 \n", - "22 1 0 0 \n", - "55 1 0 17 \n", - "56 1 0 0 \n", - "23 1 0 0 \n", - "57 1 0 300 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "file_summary_df = (\n", - " all_columns_df\n", - " .groupby([\"year\", \"file\", \"layer\", \"classification\"])\n", - " .size()\n", - " .unstack(fill_value=0)\n", - " .reset_index()\n", - ")\n", - "\n", - "if \"acs_canonical\" not in file_summary_df.columns:\n", - " file_summary_df[\"acs_canonical\"] = 0\n", - "if \"acs_new_style\" not in file_summary_df.columns:\n", - " 
file_summary_df[\"acs_new_style\"] = 0\n", - "if \"geoid_like\" not in file_summary_df.columns:\n", - " file_summary_df[\"geoid_like\"] = 0\n", - "if \"other\" not in file_summary_df.columns:\n", - " file_summary_df[\"other\"] = 0\n", - "\n", - "changed_summary = (\n", - " all_columns_df\n", - " .groupby([\"year\", \"file\", \"layer\"])[\"changed\"]\n", - " .sum()\n", - " .reset_index(name=\"n_changed_by_canonicalization\")\n", - ")\n", - "\n", - "file_summary_df = file_summary_df.merge(\n", - " changed_summary,\n", - " on=[\"year\", \"file\", \"layer\"],\n", - " how=\"left\"\n", - ")\n", - "\n", - "display(file_summary_df.sort_values([\"layer\", \"year\"]))" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "id": "7611da36-111c-4d2c-b138-03d6052fe05e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
layerexists_2021exists_2022n_cols_2021_rawn_cols_2022_rawraw_overlapcanonical_overlapraw_only_2021raw_only_2022canonical_only_2021canonical_only_2022
0ACS_2021_5YR_BGTrueFalse16000160160
1ACS_2022_5YR_BGFalseTrue01600016016
2ALL_BGTrueTrue38222360360
3DEMOGRAPHIC_PROFILETrueTrue41754277144173416142632104
4X01_AGE_AND_SEXTrueTrue8181080818111
5X02_RACETrueTrue3638035363813
6X03_HISPANIC_OR_LATINO_ORIGINTrueTrue2525024252511
7X04_ANCESTRYFalseTrue01000101
8X05_FOREIGN_BORN_CITIZENSHIPFalseTrue01000101
9X06_PLACE_OF_BIRTHFalseTrue01000101
10X07_MIGRATIONTrueTrue8181080818111
11X08_COMMUTINGTrueTrue291291029029129111
12X09_CHILDREN_HOUSEHOLD_RELATIONSHIPTrueTrue104104010310410411
13X10_GRANDPARENTS_GRANDCHILDRENFalseTrue01000101
14X11_HOUSEHOLD_FAMILY_SUBFAMILIESTrueTrue340340033934034011
15X12_MARITAL_STATUS_AND_HISTORYTrueTrue2020019202011
16X13_FERTILITYFalseTrue01000101
17X14_SCHOOL_ENROLLMENTTrueTrue269269026826926911
18X15_EDUCATIONAL_ATTAINMENTTrueTrue176176017517617611
19X16_LANGUAGE_SPOKEN_AT_HOMETrueTrue8282081828211
20X17_POVERTYTrueTrue152152015115215211
21X18_DISABILITYFalseTrue01000101
22X19_INCOMETrueTrue226226022522622611
23X20_EARNINGSTrueTrue6161060616111
24X21_VETERAN_STATUSTrueTrue8787086878711
25X22_FOOD_STAMPSTrueTrue88078811
26X23_EMPLOYMENT_STATUSTrueTrue332332033133233211
27X24_INDUSTRY_OCCUPATIONTrueTrue340340033934034011
28X25_HOUSING_CHARACTERISTICSTrueTrue87197108708719711101
29X26_GROUP_QUARTERSFalseTrue01000101
30X27_HEALTH_INSURANCETrueTrue6767066676711
31X28_COMPUTER_AND_INTERNET_USETrueTrue214214021321421411
32X29_VOTING_AGE_POPULATIONTrueTrue1818017181811
33X98_UNWEIGHTED_HOUSING_UNIT_SAMPLEFalseTrue01000101
34X99_IMPUTATIONTrueTrue301301030030130111
\n", - "
" - ], - "text/plain": [ - " layer exists_2021 exists_2022 \\\n", - "0 ACS_2021_5YR_BG True False \n", - "1 ACS_2022_5YR_BG False True \n", - "2 ALL_BG True True \n", - "3 DEMOGRAPHIC_PROFILE True True \n", - "4 X01_AGE_AND_SEX True True \n", - "5 X02_RACE True True \n", - "6 X03_HISPANIC_OR_LATINO_ORIGIN True True \n", - "7 X04_ANCESTRY False True \n", - "8 X05_FOREIGN_BORN_CITIZENSHIP False True \n", - "9 X06_PLACE_OF_BIRTH False True \n", - "10 X07_MIGRATION True True \n", - "11 X08_COMMUTING True True \n", - "12 X09_CHILDREN_HOUSEHOLD_RELATIONSHIP True True \n", - "13 X10_GRANDPARENTS_GRANDCHILDREN False True \n", - "14 X11_HOUSEHOLD_FAMILY_SUBFAMILIES True True \n", - "15 X12_MARITAL_STATUS_AND_HISTORY True True \n", - "16 X13_FERTILITY False True \n", - "17 X14_SCHOOL_ENROLLMENT True True \n", - "18 X15_EDUCATIONAL_ATTAINMENT True True \n", - "19 X16_LANGUAGE_SPOKEN_AT_HOME True True \n", - "20 X17_POVERTY True True \n", - "21 X18_DISABILITY False True \n", - "22 X19_INCOME True True \n", - "23 X20_EARNINGS True True \n", - "24 X21_VETERAN_STATUS True True \n", - "25 X22_FOOD_STAMPS True True \n", - "26 X23_EMPLOYMENT_STATUS True True \n", - "27 X24_INDUSTRY_OCCUPATION True True \n", - "28 X25_HOUSING_CHARACTERISTICS True True \n", - "29 X26_GROUP_QUARTERS False True \n", - "30 X27_HEALTH_INSURANCE True True \n", - "31 X28_COMPUTER_AND_INTERNET_USE True True \n", - "32 X29_VOTING_AGE_POPULATION True True \n", - "33 X98_UNWEIGHTED_HOUSING_UNIT_SAMPLE False True \n", - "34 X99_IMPUTATION True True \n", - "\n", - " n_cols_2021_raw n_cols_2022_raw raw_overlap canonical_overlap \\\n", - "0 16 0 0 0 \n", - "1 0 16 0 0 \n", - "2 38 2 2 2 \n", - "3 4175 4277 14 4173 \n", - "4 81 81 0 80 \n", - "5 36 38 0 35 \n", - "6 25 25 0 24 \n", - "7 0 1 0 0 \n", - "8 0 1 0 0 \n", - "9 0 1 0 0 \n", - "10 81 81 0 80 \n", - "11 291 291 0 290 \n", - "12 104 104 0 103 \n", - "13 0 1 0 0 \n", - "14 340 340 0 339 \n", - "15 20 20 0 19 \n", - "16 0 1 0 0 \n", - "17 269 269 0 268 \n", - 
"18 176 176 0 175 \n", - "19 82 82 0 81 \n", - "20 152 152 0 151 \n", - "21 0 1 0 0 \n", - "22 226 226 0 225 \n", - "23 61 61 0 60 \n", - "24 87 87 0 86 \n", - "25 8 8 0 7 \n", - "26 332 332 0 331 \n", - "27 340 340 0 339 \n", - "28 871 971 0 870 \n", - "29 0 1 0 0 \n", - "30 67 67 0 66 \n", - "31 214 214 0 213 \n", - "32 18 18 0 17 \n", - "33 0 1 0 0 \n", - "34 301 301 0 300 \n", - "\n", - " raw_only_2021 raw_only_2022 canonical_only_2021 canonical_only_2022 \n", - "0 16 0 16 0 \n", - "1 0 16 0 16 \n", - "2 36 0 36 0 \n", - "3 4161 4263 2 104 \n", - "4 81 81 1 1 \n", - "5 36 38 1 3 \n", - "6 25 25 1 1 \n", - "7 0 1 0 1 \n", - "8 0 1 0 1 \n", - "9 0 1 0 1 \n", - "10 81 81 1 1 \n", - "11 291 291 1 1 \n", - "12 104 104 1 1 \n", - "13 0 1 0 1 \n", - "14 340 340 1 1 \n", - "15 20 20 1 1 \n", - "16 0 1 0 1 \n", - "17 269 269 1 1 \n", - "18 176 176 1 1 \n", - "19 82 82 1 1 \n", - "20 152 152 1 1 \n", - "21 0 1 0 1 \n", - "22 226 226 1 1 \n", - "23 61 61 1 1 \n", - "24 87 87 1 1 \n", - "25 8 8 1 1 \n", - "26 332 332 1 1 \n", - "27 340 340 1 1 \n", - "28 871 971 1 101 \n", - "29 0 1 0 1 \n", - "30 67 67 1 1 \n", - "31 214 214 1 1 \n", - "32 18 18 1 1 \n", - "33 0 1 0 1 \n", - "34 301 301 1 1 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "def layer_compare(layer: str) -> dict:\n", - " cols21 = set(read_parquet_columns(idx_2021[layer])) if layer in idx_2021 else set()\n", - " cols22 = set(read_parquet_columns(idx_2022[layer])) if layer in idx_2022 else set()\n", - "\n", - " canon21 = {canonicalize_column(c) for c in cols21}\n", - " canon22 = {canonicalize_column(c) for c in cols22}\n", - "\n", - " return {\n", - " \"layer\": layer,\n", - " \"exists_2021\": layer in idx_2021,\n", - " \"exists_2022\": layer in idx_2022,\n", - " \"n_cols_2021_raw\": len(cols21),\n", - " \"n_cols_2022_raw\": len(cols22),\n", - " \"raw_overlap\": len(cols21 & cols22),\n", - " \"canonical_overlap\": len(canon21 & canon22),\n", - " \"raw_only_2021\": 
len(cols21 - cols22),\n", - " \"raw_only_2022\": len(cols22 - cols21),\n", - " \"canonical_only_2021\": len(canon21 - canon22),\n", - " \"canonical_only_2022\": len(canon22 - canon21),\n", - " }\n", - "\n", - "layer_comparison_df = pd.DataFrame([layer_compare(layer) for layer in all_layers])\n", - "display(layer_comparison_df.sort_values(\"layer\"))\n", - "\n", - "\n", - "# Ignore the first two rows. In the last two columns (canonical_only), the columns which have `1` are going to be the differently named GEOID columns. \n", - "# Some tables have large differences: Demographic Profile, Housing Characteristics, and Race all have > 1\n", - "# Some tables are entirely new in 2022 (X04, X05, X06, X10, X13, X18, X26, X98)" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "id": "ffd516ce-4eca-49b1-9559-6e449fcf1285", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
layercanonicalraw_2021raw_2022different_raw_names
72DEMOGRAPHIC_PROFILEB01001_001EB01001_001EB01001_E001True
73DEMOGRAPHIC_PROFILEB01001_002EB01001_002EB01001_E002True
74DEMOGRAPHIC_PROFILEB01001_003EB01001_003EB01001_E003True
75DEMOGRAPHIC_PROFILEB01001_004EB01001_004EB01001_E004True
76DEMOGRAPHIC_PROFILEB01001_005EB01001_005EB01001_E005True
..................
8657X99_IMPUTATIONB99283_001EB99283_001EB99283_E001True
8658X99_IMPUTATIONB99283_002EB99283_002EB99283_E002True
8659X99_IMPUTATIONB99283_003EB99283_003EB99283_E003True
8660X99_IMPUTATIONB99283_004EB99283_004EB99283_E004True
8661X99_IMPUTATIONB99283_005EB99283_005EB99283_E005True
\n", - "

8318 rows × 5 columns

\n", - "
" - ], - "text/plain": [ - " layer canonical raw_2021 raw_2022 \\\n", - "72 DEMOGRAPHIC_PROFILE B01001_001E B01001_001E B01001_E001 \n", - "73 DEMOGRAPHIC_PROFILE B01001_002E B01001_002E B01001_E002 \n", - "74 DEMOGRAPHIC_PROFILE B01001_003E B01001_003E B01001_E003 \n", - "75 DEMOGRAPHIC_PROFILE B01001_004E B01001_004E B01001_E004 \n", - "76 DEMOGRAPHIC_PROFILE B01001_005E B01001_005E B01001_E005 \n", - "... ... ... ... ... \n", - "8657 X99_IMPUTATION B99283_001E B99283_001E B99283_E001 \n", - "8658 X99_IMPUTATION B99283_002E B99283_002E B99283_E002 \n", - "8659 X99_IMPUTATION B99283_003E B99283_003E B99283_E003 \n", - "8660 X99_IMPUTATION B99283_004E B99283_004E B99283_E004 \n", - "8661 X99_IMPUTATION B99283_005E B99283_005E B99283_E005 \n", - "\n", - " different_raw_names \n", - "72 True \n", - "73 True \n", - "74 True \n", - "75 True \n", - "76 True \n", - "... ... \n", - "8657 True \n", - "8658 True \n", - "8659 True \n", - "8660 True \n", - "8661 True \n", - "\n", - "[8318 rows x 5 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rows with same canonical variable but different raw names: 8318\n" - ] - } - ], - "source": [ - "canon_pairs = (\n", - " all_columns_df[[\"year\", \"layer\", \"column\", \"canonical\"]]\n", - " .drop_duplicates()\n", - ")\n", - "\n", - "canon_pivot = canon_pairs.pivot_table(\n", - " index=[\"layer\", \"canonical\"],\n", - " columns=\"year\",\n", - " values=\"column\",\n", - " aggfunc=\"first\"\n", - ").reset_index()\n", - "\n", - "canon_pivot.columns.name = None\n", - "if 2021 not in canon_pivot.columns:\n", - " canon_pivot[2021] = None\n", - "if 2022 not in canon_pivot.columns:\n", - " canon_pivot[2022] = None\n", - "\n", - "canon_pivot = canon_pivot.rename(columns={2021: \"raw_2021\", 2022: \"raw_2022\"})\n", - "canon_pivot[\"different_raw_names\"] = (\n", - " canon_pivot[\"raw_2021\"].notna() &\n", - " canon_pivot[\"raw_2022\"].notna() &\n", 
- " (canon_pivot[\"raw_2021\"] != canon_pivot[\"raw_2022\"])\n", - ")\n", - "\n", - "renamed_df = canon_pivot[canon_pivot[\"different_raw_names\"]].sort_values([\"layer\", \"canonical\"])\n", - "\n", - "display(renamed_df)\n", - "print(\"Rows with same canonical variable but different raw names:\", len(renamed_df))" - ] - }, - { - "cell_type": "markdown", - "id": "858ac216-1a48-45ff-a944-2bc83ef2add3", - "metadata": {}, - "source": [ - "# Housing characteristics??\n", - "\n", - "Why is this table so much different? Are these really 100 new variables?" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "id": "0442576a-e1dd-4633-8af8-141234f0fff6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2021 exists: True ../build/2021_bg/acs_2021_X25_HOUSING_CHARACTERISTICS_bg.parquet\n", - "2022 exists: True ../build/2022_bg/acs_2022_X25_HOUSING_CHARACTERISTICS_bg.parquet\n" - ] - } - ], - "source": [ - "BUILD_ROOT = Path(\"../build\")\n", - "\n", - "file_2021 = BUILD_ROOT / \"2021_bg\" / \"acs_2021_X25_HOUSING_CHARACTERISTICS_bg.parquet\"\n", - "file_2022 = BUILD_ROOT / \"2022_bg\" / \"acs_2022_X25_HOUSING_CHARACTERISTICS_bg.parquet\"\n", - "\n", - "print(\"2021 exists:\", file_2021.exists(), file_2021)\n", - "print(\"2022 exists:\", file_2022.exists(), file_2022)" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "id": "e704d78f-58d4-4627-bdff-2e5ef7598442", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "n_cols_2021_raw: 871\n", - "n_cols_2022_raw: 971\n" - ] - } - ], - "source": [ - "def read_parquet_columns(path: Path) -> list[str]:\n", - " return pq.ParquetFile(path).schema_arrow.names\n", - "\n", - "cols_2021 = read_parquet_columns(file_2021)\n", - "cols_2022 = read_parquet_columns(file_2022)\n", - "\n", - "print(\"n_cols_2021_raw:\", len(cols_2021))\n", - "print(\"n_cols_2022_raw:\", len(cols_2022))" - ] - }, - { - "cell_type": "markdown", - 
"id": "601a490f-c552-4b3b-86bc-124b7ddeb9b8", - "metadata": {}, - "source": [ - "Canonicalize the columns" - ] - }, - { - "cell_type": "code", - "execution_count": 73, - "id": "dc4577d0-2679-490e-a143-1a9bdee2b0f9", - "metadata": {}, - "outputs": [], - "source": [ - "df21 = pd.DataFrame({\"raw_2021\": cols_2021})\n", - "df21[\"canonical\"] = df21[\"raw_2021\"].map(canonicalize_column)\n", - "\n", - "df22 = pd.DataFrame({\"raw_2022\": cols_2022})\n", - "df22[\"canonical\"] = df22[\"raw_2022\"].map(canonicalize_column)" - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "id": "81387311-9998-4fa3-bd88-aaa8b7b21693", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
GEOIDraw_2021canonicalraw_2022present_2021present_2022
0B25001_001EB25001_001EB25001_E001TrueTrue
1B25002_001EB25002_001EB25002_E001TrueTrue
2B25002_002EB25002_002EB25002_E002TrueTrue
3B25002_003EB25002_003EB25002_E003TrueTrue
4B25003A_001EB25003A_001EB25003A_E001TrueTrue
\n", - "
" - ], - "text/plain": [ - " raw_2021 canonical raw_2022 present_2021 present_2022\n", - "0 B25001_001E B25001_001E B25001_E001 True True\n", - "1 B25002_001E B25002_001E B25002_E001 True True\n", - "2 B25002_002E B25002_002E B25002_E002 True True\n", - "3 B25002_003E B25002_003E B25002_E003 True True\n", - "4 B25003A_001E B25003A_001E B25003A_E001 True True" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "aligned = (\n", - " df21.merge(df22, on=\"canonical\", how=\"outer\")\n", - " .sort_values(\"canonical\")\n", - " .reset_index(drop=True)\n", - ")\n", - "\n", - "aligned[\"present_2021\"] = aligned[\"raw_2021\"].notna()\n", - "aligned[\"present_2022\"] = aligned[\"raw_2022\"].notna()\n", - "\n", - "display(aligned.head(5))" - ] - }, - { - "cell_type": "markdown", - "id": "43fd67f6-9a62-404c-9bbc-c0788836e0e5", - "metadata": {}, - "source": [ - "## Now get vars only present in 2022" - ] - }, - { - "cell_type": "code", - "execution_count": 83, - "id": "62bd73a2-cb6d-4565-a1d5-2106c55a1410", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ACS variables only in 2022: 100\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
raw_2021canonicalraw_2022present_2021present_2022
73NaNB25008A_001EB25008A_E001FalseTrue
74NaNB25008A_002EB25008A_E002FalseTrue
75NaNB25008A_003EB25008A_E003FalseTrue
76NaNB25008B_001EB25008B_E001FalseTrue
77NaNB25008B_002EB25008B_E002FalseTrue
\n", - "
" - ], - "text/plain": [ - " raw_2021 canonical raw_2022 present_2021 present_2022\n", - "73 NaN B25008A_001E B25008A_E001 False True\n", - "74 NaN B25008A_002E B25008A_E002 False True\n", - "75 NaN B25008A_003E B25008A_E003 False True\n", - "76 NaN B25008B_001E B25008B_E001 False True\n", - "77 NaN B25008B_002E B25008B_E002 False True" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "only_2021 = aligned[\n", - " aligned[\"raw_2021\"].notna() & aligned[\"raw_2022\"].isna()\n", - "].copy()\n", - "\n", - "acs_pattern = r\"^[A-Z0-9]+_\\d{3}[EM]$\"\n", - "\n", - "acs_only_2022 = only_2022[\n", - " only_2022[\"canonical\"].str.match(acs_pattern, na=False)\n", - "].copy()\n", - "\n", - "print(\"ACS variables only in 2022:\", len(acs_only_2022))\n", - "display(acs_only_2022.head())" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "id": "75c5d294-23c5-4b59-9a08-bf593e70c620", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " 
\n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
table_prefixn_new_vars
0B2513613
1B2514013
2B251379
3B25008A3
4B25008E3
5B25008B3
6B25008C3
7B25008D3
8B25008I3
9B25010A3
10B25010B3
11B25010C3
12B25010D3
13B25008F3
14B25008G3
15B25008H3
16B25010G3
17B25010F3
18B25010E3
19B25010H3
20B25010I3
21B25077C1010179548002NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNb'\\x01\\x06\\x00\\x00\\x00\\x01\\x00\\x00\\x00\\x01\\x03...
22B25077A1010179548004NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNb'\\x01\\x06\\x00\\x00\\x00\\x01\\x00\\x00\\x00\\x01\\x03...
23B25077B1010179548003NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNb'\\x01\\x06\\x00\\x00\\x00\\x01\\x00\\x00\\x00\\x01\\x03...
24B25077F1010150011031NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNb'\\x01\\x06\\x00\\x00\\x00\\x01\\x00\\x00\\x00\\x01\\x03...
25B25077E1010150024003NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNb'\\x01\\x06\\x00\\x00\\x00\\x01\\x00\\x00\\x00\\x01\\x03...
26B25077D1..................................................................
27B25077G11500000US720210302002332.0104.0228.0159.069.0332.0332.0155500.0582.0228.0...68.6746999.85401515.0000000.0000000.0100.0000000.00.09.882747None
28B25077I11500000US720210314012599.0155.0444.0336.0108.0599.0599.0132400.0519.0444.0...74.12353919.5496420.0000000.0000000.0100.0000000.00.05.424770None
29B25077H11500000US720210312021821.0129.0692.0554.0138.0821.0821.0103000.0460.0692.0...84.28745410.9178169.8591550.0000000.098.5302120.00.03.647251None
30B2513811500000US720531504003564.0113.0451.0303.0148.0564.0564.093700.0440.0451.0...79.96453915.14818914.8014440.0000000.0100.0000000.00.00.807175None
31B2513911500000US721153304003787.0121.0666.0472.0194.0787.0787.093900.0426.0666.0...84.62515912.53391212.8019321.3213530.098.6786470.00.01.162791None
\n", + "

484672 rows × 37 columns

\n", "
" ], "text/plain": [ - " table_prefix n_new_vars\n", - "0 B25136 13\n", - "1 B25140 13\n", - "2 B25137 9\n", - "3 B25008A 3\n", - "4 B25008E 3\n", - "5 B25008B 3\n", - "6 B25008C 3\n", - "7 B25008D 3\n", - "8 B25008I 3\n", - "9 B25010A 3\n", - "10 B25010B 3\n", - "11 B25010C 3\n", - "12 B25010D 3\n", - "13 B25008F 3\n", - "14 B25008G 3\n", - "15 B25008H 3\n", - "16 B25010G 3\n", - "17 B25010F 3\n", - "18 B25010E 3\n", - "19 B25010H 3\n", - "20 B25010I 3\n", - "21 B25077C 1\n", - "22 B25077A 1\n", - "23 B25077B 1\n", - "24 B25077F 1\n", - "25 B25077E 1\n", - "26 B25077D 1\n", - "27 B25077G 1\n", - "28 B25077I 1\n", - "29 B25077H 1\n", - "30 B25138 1\n", - "31 B25139 1" + " n_total_housing_units n_vacant_housing_units \\\n", + "GEOID \n", + "010179548002 NaN NaN \n", + "010179548004 NaN NaN \n", + "010179548003 NaN NaN \n", + "010150011031 NaN NaN \n", + "010150024003 NaN NaN \n", + "... ... ... \n", + "1500000US720210302002 332.0 104.0 \n", + "1500000US720210314012 599.0 155.0 \n", + "1500000US720210312021 821.0 129.0 \n", + "1500000US720531504003 564.0 113.0 \n", + "1500000US721153304003 787.0 121.0 \n", + "\n", + " n_occupied_housing_units \\\n", + "GEOID \n", + "010179548002 NaN \n", + "010179548004 NaN \n", + "010179548003 NaN \n", + "010150011031 NaN \n", + "010150024003 NaN \n", + "... ... \n", + "1500000US720210302002 228.0 \n", + "1500000US720210314012 444.0 \n", + "1500000US720210312021 692.0 \n", + "1500000US720531504003 451.0 \n", + "1500000US721153304003 666.0 \n", + "\n", + " n_owner_occupied_housing_units \\\n", + "GEOID \n", + "010179548002 NaN \n", + "010179548004 NaN \n", + "010179548003 NaN \n", + "010150011031 NaN \n", + "010150024003 NaN \n", + "... ... 
\n", + "1500000US720210302002 159.0 \n", + "1500000US720210314012 336.0 \n", + "1500000US720210312021 554.0 \n", + "1500000US720531504003 303.0 \n", + "1500000US721153304003 472.0 \n", + "\n", + " n_renter_occupied_housing_units \\\n", + "GEOID \n", + "010179548002 NaN \n", + "010179548004 NaN \n", + "010179548003 NaN \n", + "010150011031 NaN \n", + "010150024003 NaN \n", + "... ... \n", + "1500000US720210302002 69.0 \n", + "1500000US720210314012 108.0 \n", + "1500000US720210312021 138.0 \n", + "1500000US720531504003 148.0 \n", + "1500000US721153304003 194.0 \n", + "\n", + " n_housing_units_multiunit_structures_denom \\\n", + "GEOID \n", + "010179548002 NaN \n", + "010179548004 NaN \n", + "010179548003 NaN \n", + "010150011031 NaN \n", + "010150024003 NaN \n", + "... ... \n", + "1500000US720210302002 332.0 \n", + "1500000US720210314012 599.0 \n", + "1500000US720210312021 821.0 \n", + "1500000US720531504003 564.0 \n", + "1500000US721153304003 787.0 \n", + "\n", + " n_total_housing_units_sample median_home_value \\\n", + "GEOID \n", + "010179548002 NaN NaN \n", + "010179548004 NaN NaN \n", + "010179548003 NaN NaN \n", + "010150011031 NaN NaN \n", + "010150024003 NaN NaN \n", + "... ... ... \n", + "1500000US720210302002 332.0 155500.0 \n", + "1500000US720210314012 599.0 132400.0 \n", + "1500000US720210312021 821.0 103000.0 \n", + "1500000US720531504003 564.0 93700.0 \n", + "1500000US721153304003 787.0 93900.0 \n", + "\n", + " median_contract_rent n_occupied_housing_units_sample \\\n", + "GEOID \n", + "010179548002 NaN NaN \n", + "010179548004 NaN NaN \n", + "010179548003 NaN NaN \n", + "010150011031 NaN NaN \n", + "010150024003 NaN NaN \n", + "... ... ... \n", + "1500000US720210302002 582.0 228.0 \n", + "1500000US720210314012 519.0 444.0 \n", + "1500000US720210312021 460.0 692.0 \n", + "1500000US720531504003 440.0 451.0 \n", + "1500000US721153304003 426.0 666.0 \n", + "\n", + " ... p_owner_occupied_units p_married \\\n", + "GEOID ... \n", + "010179548002 ... 
NaN NaN \n", + "010179548004 ... NaN NaN \n", + "010179548003 ... NaN NaN \n", + "010150011031 ... NaN NaN \n", + "010150024003 ... NaN NaN \n", + "... ... ... ... \n", + "1500000US720210302002 ... 68.674699 9.854015 \n", + "1500000US720210314012 ... 74.123539 19.549642 \n", + "1500000US720210312021 ... 84.287454 10.917816 \n", + "1500000US720531504003 ... 79.964539 15.148189 \n", + "1500000US721153304003 ... 84.625159 12.533912 \n", + "\n", + " p_female_headed_families p_nonhisp_white_persons \\\n", + "GEOID \n", + "010179548002 NaN NaN \n", + "010179548004 NaN NaN \n", + "010179548003 NaN NaN \n", + "010150011031 NaN NaN \n", + "010150024003 NaN NaN \n", + "... ... ... \n", + "1500000US720210302002 15.000000 0.000000 \n", + "1500000US720210314012 0.000000 0.000000 \n", + "1500000US720210312021 9.859155 0.000000 \n", + "1500000US720531504003 14.801444 0.000000 \n", + "1500000US721153304003 12.801932 1.321353 \n", + "\n", + " p_nonhisp_black_persons p_hispanic_persons \\\n", + "GEOID \n", + "010179548002 NaN NaN \n", + "010179548004 NaN NaN \n", + "010179548003 NaN NaN \n", + "010150011031 NaN NaN \n", + "010150024003 NaN NaN \n", + "... ... ... \n", + "1500000US720210302002 0.0 100.000000 \n", + "1500000US720210314012 0.0 100.000000 \n", + "1500000US720210312021 0.0 98.530212 \n", + "1500000US720531504003 0.0 100.000000 \n", + "1500000US721153304003 0.0 98.678647 \n", + "\n", + " p_native_persons p_hawaiian_persons p_veterans \\\n", + "GEOID \n", + "010179548002 NaN NaN NaN \n", + "010179548004 NaN NaN NaN \n", + "010179548003 NaN NaN NaN \n", + "010150011031 NaN NaN NaN \n", + "010150024003 NaN NaN NaN \n", + "... ... ... ... 
\n", + "1500000US720210302002 0.0 0.0 9.882747 \n", + "1500000US720210314012 0.0 0.0 5.424770 \n", + "1500000US720210312021 0.0 0.0 3.647251 \n", + "1500000US720531504003 0.0 0.0 0.807175 \n", + "1500000US721153304003 0.0 0.0 1.162791 \n", + "\n", + " geometry \n", + "GEOID \n", + "010179548002 b'\\x01\\x06\\x00\\x00\\x00\\x01\\x00\\x00\\x00\\x01\\x03... \n", + "010179548004 b'\\x01\\x06\\x00\\x00\\x00\\x01\\x00\\x00\\x00\\x01\\x03... \n", + "010179548003 b'\\x01\\x06\\x00\\x00\\x00\\x01\\x00\\x00\\x00\\x01\\x03... \n", + "010150011031 b'\\x01\\x06\\x00\\x00\\x00\\x01\\x00\\x00\\x00\\x01\\x03... \n", + "010150024003 b'\\x01\\x06\\x00\\x00\\x00\\x01\\x00\\x00\\x00\\x01\\x03... \n", + "... ... \n", + "1500000US720210302002 None \n", + "1500000US720210314012 None \n", + "1500000US720210312021 None \n", + "1500000US720531504003 None \n", + "1500000US721153304003 None \n", + "\n", + "[484672 rows x 37 columns]" ] }, "metadata": {}, @@ -6122,22 +3568,13 @@ } ], "source": [ - "acs_only_2022[\"prefix\"] = acs_only_2022[\"canonical\"].str.extract(r\"^([A-Z0-9]+)_\")\n", - "\n", - "prefix_counts = (\n", - " acs_only_2022[\"prefix\"]\n", - " .value_counts()\n", - " .rename_axis(\"table_prefix\")\n", - " .reset_index(name=\"n_new_vars\")\n", - ")\n", - "\n", - "display(prefix_counts)" + "display(processed_acs)" ] }, { "cell_type": "code", "execution_count": null, - "id": "cfc8b9b7-dc30-443e-8905-fffde0057daa", + "id": "3c862c3c-49e8-4fa3-9b6c-1d376e91dac4", "metadata": {}, "outputs": [], "source": [] diff --git a/geosnap/io/util.py b/geosnap/io/util.py index 433e31bd..d4c3f568 100644 --- a/geosnap/io/util.py +++ b/geosnap/io/util.py @@ -213,7 +213,7 @@ def convert_census_gdb( if layer != meta_str and not layer.endswith("_METADATA") ] - tables = list() + tables = list() existing_files = os.listdir(output_dir) for i in tqdm(layers): print(i) @@ -244,14 +244,14 @@ def convert_census_gdb( df = raw.set_index(geoid_col) - if "ACS_" not in i: # only the geoms have the ACS prefix - 
candidate_cols = df.columns[ - df.columns.str.contains("_E", regex=False) - | df.columns.str.contains("_M", regex=False) - | df.columns.str.contains("e", regex=False) - ] - df = df[candidate_cols] - df.columns = pd.Index([normalize_acs_vars(col) for col in df.columns]) + if "ACS_" not in i: # only the geoms have the ACS prefix + candidate_cols = df.columns[ + df.columns.str.match(r"^[A-Za-z0-9]+e\d+$", na=False) # old style: B02001e1 + | df.columns.str.match(r"^[A-Za-z0-9]+_[EM]\d{3}$", na=False) # new style: B02001_E001 / B02001_M001 + | df.columns.str.match(r"^[A-Za-z0-9]+_\d{3}[EM]$", na=False) # canonical: B02001_001E / B02001_001M + ] + df = df[candidate_cols] + df.columns = pd.Index([normalize_acs_vars(col) for col in df.columns]) df = df.dropna(axis=1, how="all") df.index = df.index.astype(str) diff --git a/geosnap/io/variables.csv b/geosnap/io/variables.csv index 65b5b114..da1f16e1 100644 --- a/geosnap/io/variables.csv +++ b/geosnap/io/variables.csv @@ -1,195 +1,195 @@ -variable,label,formula,ltdb,ncdb,census_1990_form,census_1990_table_column,census_2000_form,census_2000_table_column,acs,category,notes -geoid,FIPS code,,geoid,GEO2010,,,,,,, -n_mexican_pop,persons of Mexican parentage or ancestry,,mex,MEXIC,SF1,P0090001,SF1,PCT011004,B03001_004E,Ethnicity & Immigration, -n_cuban_pop,persons of Cuban parentage or ancestry,,cuban,CUBAN,SF1,P0090004,SF1,PCT011006,B03001_006E,Ethnicity & Immigration, -n_puerto_rican_pop,persons of Puerto Rican parentage or ancestry,,pr,PRICAN,SF1,P0090003,SF1,PCT011005,B03001_005E,Ethnicity & Immigration, -n_russian_pop,persons of Russian/USSR parentage or ancestry,,ruanc,,SF3,P0330022,SF3,PCT016064+PCT016053+PCT016052+PCT016037,B04004_064E,Ethnicity & Immigration,ruancXX (page 17 of LTDB codebook) suggests that USSR is only selected for 1970. I gather you're aggregating soviet countries individually? 
1990 doesn't seem to have USSR or several of its constituents -n_italian_pop,persons of Italian parentage or ancestry,,itanc,,SF3,P0330016,SF3,PCT016051,B04004_051E,Ethnicity & Immigration, -n_german_pop,persons of German parentage or ancestry,,geanc,,SF3,P0330012,SF3,PCT016042,B04004_042E,Ethnicity & Immigration, -n_irish_pop,persons of Irish parentage or ancestry,,iranc,,SF3,P0330015,SF3,PCT016049,B04004_049E,Ethnicity & Immigration, -n_scandaniavian_pop,persons of Scandinavian parentage/ancestry,,scanc,,,,SF3,PCT016059+PCT016039+PCT016033+PCT016090,B04004_065E,Ethnicity & Immigration,"scanXX (page 18 of LTDB codebook) suggests dedicated nationalities are used in 1990 and 2000. This is despite there being a scandinavian category in 2000 SF3 (PCT016065); [ek]: similarly, the ACS lists both the scandanavian category *and* the individual country nationalities?" -n_total_pop_sample,total population from sample-based data,,dfb,,,,,,,Ethnicity & Immigration,LTDB suggests 1980 only -n_foreign_born_pop,foreign-born,,fb,FORBORN,SF3,P0360001:10,SF3,P021013,B05002_013E,Ethnicity & Immigration, -n_recent_immigrant_pop,recent immigrants (within the past 10 years),,n10imm,,SF3,P0360001:04,SF3,P023002,B05005_007E,Ethnicity & Immigration, -n_naturalized_pop,naturalized foreign-born,,nat,FORBCZN,SF3,P0370005,SF3,P021014,B05002_014E,Ethnicity & Immigration, -n_age_5_older,persons 5 years and over,,ag5up,,SF3,P0130004:31,SF3,P019001,B16001_001E,Ethnicity & Immigration, -n_other_language,persons who speak language other than English at home,,olang,,SF3,P0310002:26,SF3,P019001 - (P019025+P019003+P019047),B16001_001E - B16001_002E,Ethnicity & Immigration,Construct census 2000 count by subtraction from P019001 -n_limited_english,persons who speak English not 
well,,lep,,SF3,P0280004+P0280007+P0280010+P0280014+P0280017+P0280020+P0280024+P0280027+P0280030,SF3,P019022+P019023+P019029+P019013+P019012+P019017+P019018+P019007+P019008+P019061+P019062+P019067+P019066+P019052+P019051+P019057+P019056+P019040+P019045+P019044+P019030+P019039+P019035+P019034,DP02_0113E,Ethnicity & Immigration,"[ljw] cant tell if this includes ""speak other Languages"" as a catchall or if that is a pre-crosstab" -n_russian_born_pop,persons who were born in Russia/ USSR,,rufb,,,,SF3,PCT019026,B05006_040E,Ethnicity & Immigration,"[ek] this is STF4 in 1990, so not available from the API" -n_italian_born_pop,persons who were born in Italy,,itfb,,,,SF3,PCT019016,B05006_023E,Ethnicity & Immigration,"[ek] this is STF4 in 1990, so not available from the API" -n_german_born_pop,persons who were born in Germany,,gefb,,,,SF3,PCT019011,B05006_017E,Ethnicity & Immigration,"[ek] this is STF4 in 1990, so not available from the API" -n_irish_born_pop,persons who were born in Ireland,,irfb,,,,SF3,PCT019005,B05006_008E,Ethnicity & Immigration,"[ek] this is STF4 in 1990, so not available from the API" -n_scandaniavian_born_pop,persons who were born in Scandinavian Countries,,scfb,,,,SF3,PCT019006+PCT019007,B05006_009E+B05006_010E+B05006_011E+B05006_012E,Ethnicity & Immigration,"[ek] this is STF4 in 1990, so not available from the API" -p_mexican_pop,percentage of persons of Mexican parentage or ancestry,p_mexican_pop=n_mexican_pop / n_total_pop*100,pmex,,,,,,,Ethnicity & Immigration, -p_cuban_pop,percentage of persons of Cuban parentage or ancestry,p_cuban_pop=n_cuban_pop / n_total_pop*100,pcuban,,,,,,,Ethnicity & Immigration, -p_puerto_rican_pop,percentage of persons of Puerto Rican parentage or ancestry,p_puerto_rican_pop=n_puerto_rican_pop / n_total_pop*100,ppr,,,,,,,Ethnicity & Immigration, -p_russian_pop,percentage of persons of Russian/USSR parentage or ancestry,p_russian_pop=n_russian_pop / n_total_pop*100,pruanc,,,,,,,Ethnicity & Immigration, 
-p_italian_pop,percentage of persons of Italian parentage or ancestry,p_italian_pop=n_italian_pop / n_total_pop*100,pitanc,,,,,,,Ethnicity & Immigration, -p_german_pop,percentage of persons of German parentage or ancestry,p_german_pop=n_german_pop / n_total_pop*100,pgeanc,,,,,,,Ethnicity & Immigration, -p_irish_pop,percentage of persons of Irish parentage or ancestry,p_irish_pop=n_irish_pop / n_total_pop*100,piranc,,,,,,,Ethnicity & Immigration, -p_scandanavian_pop,percentage of persons of Scandinavian parentage/ancestry,p_scandanavian_pop=n_scandaniavian_pop / n_total_pop*100,pscanc,,,,,,,Ethnicity & Immigration, -p_foreign_born_pop,percentage of foreign-born,p_foreign_born_pop=n_foreign_born_pop / n_total_pop*100,pfb,SHRFOR,,,,,,Ethnicity & Immigration, -p_recent_immigrant_pop,percentage of recent immigrants (within the past 10 years),p_recent_immigrant_pop=n_recent_immigrant_pop / n_total_pop*100,p10imm,,,,,,,Ethnicity & Immigration, -p_naturalized_pop,percentage of naturalized foreign-born,p_naturalized_pop=n_naturalized_pop / n_total_pop*100,pnat,,,,,,,Ethnicity & Immigration, -p_other_language,percentage of persons who speak language other than English at home,p_other_language=n_other_language / n_total_pop*100,polang,,,,,,,Ethnicity & Immigration, -p_limited_english,percentage of persons who speak English not well,p_limited_english=n_limited_english / n_total_pop*100,plep,,,,,,,Ethnicity & Immigration, -p_russian_born_pop,percentage of persons who were born in Russia/ USSR,p_russian_born_pop=n_russian_born_pop / n_total_pop*100,prufb,,,,,,,Ethnicity & Immigration, -p_italian_born_pop,percentage of persons who were born in Italy,p_italian_born_pop=n_italian_born_pop / n_total_pop*100,pitfb,,,,,,,Ethnicity & Immigration, -p_german_born_pop,percentage of persons who were born in Germany,p_german_born_pop=n_german_born_pop / n_total_pop*100,pgefb,,,,,,,Ethnicity & Immigration, -p_irish_born_pop,percentage of persons who were born in 
Ireland,p_irish_born_pop=n_irish_born_pop / n_total_pop*100,pirfb,,,,,,,Ethnicity & Immigration, -p_scandanavian_born_pop,percentage of persons who were born in Scandinavian Countries,p_scandanavian_born_pop=n_scandaniavian_born_pop / n_total_pop*100,pscfb,,,,,,,Ethnicity & Immigration, -n_total_housing_units,housing units,,hu,TOTHSUN,SF1,H0010001,SF1,H001001,B25002_001E,"Housing, Age, & Marital Status", -n_vacant_housing_units,vacant housing units,,vac,VACHU,SF1,H0020002,SF1,H003003,B25002_003E,"Housing, Age, & Marital Status",divide by B25002_001E for vacancy rate -n_occupied_housing_units,occupied housing units,,ohu,OCCHU,SF1,H0020001,SF1,H003002,B25002_002E,"Housing, Age, & Marital Status", -n_owner_occupied_housing_units,owner-occupied housing units,,own,OWNOCC,SF1,H0030001,SF1,H004002,B25003_002E,"Housing, Age, & Marital Status", -n_renter_occupied_housing_units,renter-occupied housing units,,rent,RNTOCC,SF1,H0030002,SF1,H004003,B25003_003E,"Housing, Age, & Marital Status", -n_housing_units_multiunit_structures_denom,housing units denom,n_housing_units_multiunit_structures_denom=n_total_housing_units,dmulti,,,,,,B25024_001E,"Housing, Age, & Marital Status", -n_housing_units_multiunit_structures,housing units in multi-unit structures,,multi,,,,SF3,H030004+H030005+H030006+H030007+H030008+H030009,B25024_004E+B25024_005E+B25024_006E+B25024_007E+B25024_008E+B25024_009E,"Housing, Age, & Marital Status",[ljw] LTDB is unclear as to the relevant computed column from SF3-H030*. 
Recorded columns here are all stationary housing units (not mobile home (H030010) or RV/Van/Boat (H030011) -n_total_housing_units_sample,housing units in sample-based data,n_total_housing_units_sample=n_total_housing_units,husp,,,,,,B25024_001E,"Housing, Age, & Marital Status", -median_home_value,Median home value,,mhmval,MDVALHS,SF3,H061A001,SF3,H085001,B25077_001E,"Housing, Age, & Marital Status", -median_contract_rent,Median monthly contract rent,,mrent,MDGRENT,SF3,H043A001,SF3,H056001,B25058_001E,"Housing, Age, & Marital Status", -n_structures_30_old,structures built more than 30 years ago,,h30old,,SF3,H0250005+H0250006+H0250007+H0250008,SF3,H034010+H034009+H034008+H034007,,"Housing, Age, & Marital Status", -n_occupied_housing_units_sample,occupied housing units in sample-based data,,ohusp,,SF3,H0040001,SF3,H006001,B25003_001E,"Housing, Age, & Marital Status", -n_household_recent_move,household heads moved into unit less than 10 years ago,,h10yrs,,SF3,H0250006+H0250007+H0250008,SF3,H038003+H038004+H038005+H038010+H038011+H038012,,"Housing, Age, & Marital Status", -n_persons_under_18,persons age 17 years and under,,a18und,NCHILD,SF3,P0130012+P0130011+P0130010+P0130009+P0130008+P0130007+P0130006+P0130005+P0130004+P0130003+P0130002+P0130001,SF1,P012003+P012004+P012005+P012006+P012027+P012028+P012029+P012030,B01001_003E+B01001_004E+B01001_005E+B01001_006E+B01001_027E+B01001_028E+B01001_029E+B01001_030E,"Housing, Age, & Marital Status", -n_persons_over_60,persons age 60 years and over,,a60up,,SF3,P0130025+P0130026+P0130027+P0130028+P0130029+P0130030+P0130031,SF1,P012018:025+ P012042:049,B01001_018E+B01001_019E+B01001_020E+B01001_021E+B01001_022E+B01001_023E+B01001_024E+B01001_025E+B01001_042E+B01001_043E+B01001_044E+B01001_045E+B01001_046E+B01001_047E+B01001_048E+B01001_049E,"Housing, Age, & Marital Status", -n_persons_over_75,persons age 75 years and over,,a75up,,SF3,P0130029+P0130030+P0130031,SF1,P012023:25+ 
P012047:49,B01001_047E+B01001_048E+B01001_049E+B01001_023E+B01001_024E+B01001_025E,"Housing, Age, & Marital Status", -n_persons_over_15,population 15 years and over,,ag15up,PERS15P,SF3,P0130010+P0130011+P0130012+P0130013+P0130014+P0130015+P0130016+P0130017+P0130018+P0130019+P0130020+P0130021+P0130022+P0130023+P0130024+P0130025+P0130026+P0130027+P0130028+P0130029+P0130030+P0130031,SF3,P018001,B12001_001E,"Housing, Age, & Marital Status", -n_persons_over_25,population 25 years and over,,ag25up,,SF3,P0130018+P0130019+P0130020+P0130021+P0130022+P0130023+P0130024+P0130025+P0130026+P0130027+P0130028+P0130029+P0130030+P0130031,SF3,P037001,B15002_001E,"Housing, Age, & Marital Status",denominator for educational attainment -n_married,currently married (excluding separated),,mar,MMARSPP,SF3,P0250001+P0250002,SF3,(P018004+P018013) - (P018007+P018016),B12001_005E,"Housing, Age, & Marital Status", -n_widowed_divorced,"widowed, divorced, and separated",,wds,,SF3,P0270005+P0270011+P0270006+P0270012,SF3,P018007+P018009+P018010+P018016+P018018+P018019,B12001_007E+B12001_009E+B12001_010E+B12001_016E+B12001_018E+B12001_019E,"Housing, Age, & Marital Status", -n_total_families,total families,,family,FAMSUB,SF3,P0040001,SF1,P031001,B17010_001E,"Housing, Age, & Marital Status",denominator for calculating % female-headed families w/ children -n_female_headed_families,female-headed families with children,,fhh,NFFH,SF3,P0190005,SF1,P035016,B17010_017E,"Housing, Age, & Marital Status",numerator for calculating % female-headed families w/ children -p_vacant_housing_units,percentage of vacant housing units,p_vacant_housing_units=n_vacant_housing_units / n_total_housing_units*100,pvac,,,,,,,"Housing, Age, & Marital Status", -p_owner_occupied_units,percentage of owner-occupied housing units,p_owner_occupied_units=n_occupied_housing_units / n_total_housing_units*100,pown,,,,,,,"Housing, Age, & Marital Status", -p_housing_units_multiunit_structures,percentage of housing units in multi-unit 
structures,p_housing_units_multiunit_structures=n_housing_units_multiunit_structures / n_housing_units_multiunit_structures_denom*100,pmulti,,,,,,,"Housing, Age, & Marital Status", -p_structures_30_old,percentage of structures built more than 30 years ago,p_structures_30_old=n_structures_30_old / n_housing_units_multiunit_structures_denom*100,p30old,,,,,,,"Housing, Age, & Marital Status", -p_household_recent_move,percentage of household heads moved into unit less than 10 years ago,p_household_recent_move=n_household_recent_move / n_total_households*100,p10yrs,,,,,,,"Housing, Age, & Marital Status", -p_persons_under_18,percentage of persons age 17 years and under,p_persons_under_18=n_persons_under_18 / n_total_pop*100,p18und,,,,,,,"Housing, Age, & Marital Status", -p_persons_over_60,percentage of persons age 60 years and over,p_persons_over_60=n_persons_over_60 / n_total_pop*100,p60up,,,,,,,"Housing, Age, & Marital Status", -p_persons_over_75,percentage of persons age 75 years and over,p_persons_over_75=n_persons_over_75 / n_total_pop*100,p75up,,,,,,,"Housing, Age, & Marital Status", -p_married,percent currently married (excluding separated),p_married=n_married / n_persons_over_15*100,pmar,,,,,,,"Housing, Age, & Marital Status", -p_widowed_divorced,"percent widowed, divorced, and separated",p_widowed_divorced=n_widowed_divorced / n_persons_over_15*100,pwds,,,,,,,"Housing, Age, & Marital Status",should denom be families or individuals? -p_female_headed_families,percentage of female-headed families with children,p_female_headed_families=n_female_headed_families / n_total_families*100,pfhh,,,,,,,"Housing, Age, & Marital Status",should denom be families or individuals? -n_white_persons,persons of white race,,white,NSHRWHT,SF1,P0100001+P0100006,SF1,P003003,,Race & Age,"[ljw] inferring here, documentation suggests this is only available for 1970, but white alone (regardless of hispanic) gives this count? 
[ek] I think the question about hispanic ethnicity was added in 1980, so this is the best they can do to measdure the 'white alone' construct in 1970 " -n_nonhisp_white_persons,"persons of white race, not Hispanic origin",,nhwht,NSHRNHW,SF1,P0100001,SF1,P004005,B03002_003E,Race & Age,"[ljw] I read this as P004005 (not hispanic white alone), not P003003 (white alone)" -n_black_persons,persons of black race,,black,NSHRBLK,SF1,P0100007+P0100002,SF1,P003004,,Race & Age, -n_nonhisp_black_persons,"persons of black race, not Hispanic origin",,nhblk,NSHRNHB,SF1,P0100002,SF1,P004006,B03002_004E,Race & Age, -n_hispanic_persons,persons of Hispanic origin,,hisp,NSHRHSP,SF1,P0080001,SF1,P004002,B03002_012E,Race & Age, -n_native_persons,"persons of Native American race, not Hispanic origin",,ntv,NSHRAMI,SF1,P0070003,SF1,P004007,B03002_005E,Race & Age,is this nonhispanic? -n_hawaiian_persons,"persons of Hawaiian race, not Hispanic origin",,haw,NSHRHIP,SF1,P0070017,SF1,P004009,B02001_006E,Race & Age,is this nonhispanic? 
-n_asian_indian_persons,persons of Asian Indian race,,india,,SF1,P0070009,SF1,PCT007002,B03002_006E+B03002_007E,Race & Age, -n_chinese_persons,persons of Chinese race,,china,,SF1,P0070006,SF1,PCT007005+PCT007015,B02015_007E+B02015_020E,Race & Age, -n_filipino_persons,persons of Filipino race,,filip,,SF1,P0070007,SF1,PCT007006,B02015_008E,Race & Age, -n_japanese_persons,persons of Japanese race,,japan,,SF1,P0070008,SF1,PCT007009,B02015_011E,Race & Age, -n_korean_persons,persons of Korean race,,korea,,SF1,P0070010,SF1,PCT007010,B02015_012E,Race & Age, -n_asian_persons,persons of Asian race,,asian,,SF1,P0060004,SF1,P004008,B03002_006E+B03002_007E,Race & Age,for 1990 this is Asian and PI -n_vietnamese_persons,persons of Vietnamese race,,viet,,SF1,P0070011,SF1,PCT007017,B02018_022E,Race & Age, -n_white_age_distribution,white population with known age distribution,,agewht,,SF1,P0120001:62,SF1,PCT012I001,B01001H_001E,Race & Age, -n_white_under_15,0-15 years old of white race,,a15wht,,SF1,P0120001:09+P0120032:40,SF1,PCT012I003:018+PCT012I108:122,B01001H_003E+B01001H_004E+B01001H_005E+B01001H_018E+B01001H_019E+B01001H_020E,Race & Age, -n_white_over_60,60 years and older of white race,,a60wht,,SF1,P0120025:31+P0120056:62,SF1,PCT012I063:105+PCT012I167:209,,Race & Age,is this nonhispanic? 
-n_white_over_65,65 years and older of non-Hispanic whites,,a65wht,,SF1,P0120027:31+P0120058:62,SF1,PCT012I068:105+PCT012I172:209,B01001H_014E+B01001H_015E+B01001H_016E+B01001H_029E+B01001H_030E+B01001H_031E,Race & Age, -n_black_age_distribution,black population with known age distribution,,ageblk,,SF1,P0120063:0124,SF1,PCT012J001,B01001B_001E,Race & Age, -n_black_under_15,0-15 years old of black race,,a15blk,,SF1,P0120063:71+P0120094:102,SF1,PCT012J003:018+PCT012J108:122,B01001B_003E+B01001B_004E+B01001B_005E+B01001B_018E+B01001B_019E+B01001B_020E,Race & Age, -n_black_over_60,60 years and older of black race,,a60blk,,SF1,P0120087:93+P0120118:124,SF1,PCT012J063:105+PCT012J167:209,,Race & Age, -n_black_over_65,65 years and older of black race,,a65blk,,SF1,P0120089:93+P0120120:124,SF1,PCT012J068:105+PCT012J172:209,B01001B_014E+B01001B_015E+B01001B_016E+B01001B_029E+B01001B_030E+B01001B_031E,Race & Age, -n_hispanic_age_distribution,Hispanic population with known age distribution,,agehsp,,SF1,P0130001:62,SF1,PCT012H001,B01001I_001E,Race & Age, -n_hispanic_under_15,"0-15 years old, persons of Hispanic origins",,a15hsp,,SF1,P0130001:09+P0130032:40,SF1,PCT012H003:018+PCT012H108:122,B01001I_003E+B01001I_004E+B01001I_005E+B01001I_018E+B01001I_019E+B01001I_020E,Race & Age, -n_hispanic_over_60,"60 years and older, persons of Hispanic origins",,a60hsp,,SF1,P0130025:31+P0130056:62,SF1,PCT012H063:105+PCT012H167:209,,Race & Age, -n_hispanic_over_65,"65 years and older, persons of Hispanic origins",,a65hsp,,SF1,P0130027:31+P0130058:62,SF1,PCT012H068:105+PCT012H172:209,B01001I_014E+B01001I_015E+B01001I_016E+B01001I_029E+B01001I_030E+B01001I_031E,Race & Age, -n_native_age_distribution,Native American population with known age distribution,,agentv,,SF1,P0120125:186,SF1,PCT012K001,B01001C_001E,Race & Age, -n_native_under_15,0-15 years old of Native American race,,a15ntv,,SF1,P0120125:133 
+P0120156:164,SF1,PCT012K003:018+PCT012K108:122,B01001C_003E+B01001C_004E+B01001C_005E+B01001C_018E+B01001C_019E+B01001C_020E,Race & Age, -n_native_over_60,60 years and older of Native American race,,a60ntv,,SF1,P0120149:155+P0120180:186,SF1,PCT012K063:105+PCT012K167:209,,Race & Age, -n_native_over_65,65 years and older of Native American race,,a65ntv,,SF1,P0120151:155+P0120182:186,SF1,PCT012K068:105+PCT012K172:209,B01001C_014E+B01001C_015E+B01001C_016E+B01001C_029E+B01001C_030E+B01001C_031E,Race & Age, -n_asian_age_distribution,Asian and Pacific Islander population with known age distribution,,ageasn,,SF1,P0120187:248,SF1,PCT012L001+PCT012M001,B01001D_001E+B01001E_001E,Race & Age, -n_asian_under_15,0-15 years old of Asians and Pacific Islanders,,a15asn,,SF1,P0120187:195+P0120218:226,SF1,PCT012M003:018+PCT012M108:122+PCT012L003:018+PCT012L108:122,B01001D_003E+B01001D_004E+B01001D_005E+B01001D_018E+B01001D_019E+B01001D_020E+B01001E_003E+B01001E_004E+B01001E_005E+B01001E_018E+B01001E_019E+B01001E_020E,Race & Age, -n_asian_over_60,60 years and older of Asians and Pacific Islanders,,a60asn,,,,SF1,PCT012M063:105+PCT012M167:209,,Race & Age, -n_asian_over_65,65 years and older of Asians and Pacific Islanders,,a65asn,,,,SF1,PCT012M068:105+PCT012M172:209+PCT012L068:105+PCT012L172:209,B01001D_014E+B01001D_015E+B01001D_016E+B01001E_014E+B01001E_015E+B01001E_016E+B01001E_029E+B01001E_030E+B01001E_031E+B01001D_029E+B01001D_030E+B01001D_031E,Race & Age, -p_white_persons,percentage of persons of white race,,pwhite,,,,,,,Race & Age, -p_black_persons,percentage of persons of black race,,pblack,,,,,,,Race & Age, -p_nonhisp_white_persons,"percentage of persons of white race, not Hispanic origin",p_nonhisp_white_persons=n_nonhisp_white_persons / n_total_pop*100,pnhwht,SHRNHW,,,,,,Race & Age, -p_nonhisp_black_persons,"percentage of persons of black race, not Hispanic origin",p_nonhisp_black_persons=n_nonhisp_black_persons / n_total_pop*100,pnhblk,SHRNHB,,,,,,Race & Age, 
-p_hispanic_persons,percentage of persons of Hispanic origin,p_hispanic_persons=n_hispanic_persons / n_total_pop*100,phisp,SHRHSP,,,,,,Race & Age, -p_native_persons,percentage of persons of Native American race,p_native_persons=n_native_persons / n_total_pop*100,pntv,SHRNHI,,,,,,Race & Age, -p_asian_persons,percentage of persons of Asian race (and Pacific Islander),p_asian_persons=n_asian_persons / n_total_pop*100,pasian,SHRNHR,,,,,,Race & Age, -p_hawaiian_persons,percentage of persons of Hawaiian race,p_hawaiian_persons=n_hawaiian_persons / n_total_pop*100,phaw,SHRNHH,,,,,,Race & Age, -p_asian_indian_persons,percentage of persons of Asian Indian race,p_asian_indian_persons=n_asian_indian_persons / n_total_pop*100,pindia,,,,,,,Race & Age, -p_chinese_persons,percentage of persons of Chinese race,p_chinese_persons=n_chinese_persons / n_total_pop*100,pchina,,,,,,,Race & Age, -p_filipino_persons,percentage of persons of Filipino race,p_filipino_persons=n_filipino_persons / n_total_pop*100,pfilip,,,,,,,Race & Age, -p_japanese_persons,percentage of persons of Japanese race,p_japanese_persons=n_japanese_persons / n_total_pop*100,pjapan,,,,,,,Race & Age, -p_korean_persons,percentage of persons of Korean race,p_korean_persons=n_korean_persons / n_total_pop*100,pkorea,,,,,,,Race & Age, -p_vietnamese_persons,percentage of persons of Vietnamese race,p_vietnamese_persons=n_vietnamese_persons / n_total_pop*100,pviet,,,,,,,Race & Age, -p_white_under_15,percentage of 0-15 years old of white race,p_white_under_15=n_white_under_15 / n_total_pop*100,p15wht,,,,,,,Race & Age, -p_white_over_60,percentage of 60 years and older of white race,p_white_over_60=n_white_over_60 / n_total_pop*100,p60wht,,,,,,,Race & Age, -p_white_over_65,percentage of 65 years and older of non-Hispanic whites,p_white_over_65=n_white_over_65 / n_total_pop*100,p65wht,,,,,,,Race & Age, -p_black_under_15,percentage of 0-15 years old of black race,p_black_under_15=n_black_under_15 / n_total_pop*100,p15blk,,,,,,,Race 
& Age, -p_black_over_60,percentage of 60 years and older of black race,p_black_over_60=n_black_over_60 / n_total_pop*100,p60blk,,,,,,,Race & Age, -p_black_over_65,percentage of 65 years and older of black race,p_black_over_65=n_black_over_65 / n_total_pop*100,p65blk,,,,,,,Race & Age, -p_hispanic_under_15,"percentage of 0-15 years old, persons of Hispanic origins",p_hispanic_under_15=n_hispanic_under_15 / n_total_pop*100,p15hsp,,,,,,,Race & Age, -p_hispanic_over_60,"percentage of 60 years and older, persons of Hispanic origins",p_hispanic_over_60=n_hispanic_over_60 / n_total_pop*100,p60hsp,,,,,,,Race & Age, -p_hispanic_over_65,"percentage of 65 years and older, persons of Hispanic origins",p_hispanic_over_65=n_hispanic_over_65 / n_total_pop*100,p65hsp,,,,,,,Race & Age, -p_native_under_15,percentage of 0-15 years old of Native American race,p_native_under_15=n_native_under_15 / n_total_pop*100,p15ntv,,,,,,,Race & Age, -p_native_over_60,percentage of 60 years and older of Native American race,p_native_over_60=n_native_over_60 / n_total_pop*100,p60ntv,,,,,,,Race & Age, -p_native_over_65,percentage of 65 years and older of Native American race,p_native_over_65=n_native_over_65 / n_total_pop*100,p65ntv,,,,,,,Race & Age, -p_asian_under_15,percentage of 0-15 years old of Asians and Pacific Islanders,p_asian_under_15=n_asian_under_15 / n_total_pop*100,p15asn,,,,,,,Race & Age, -p_asian_over_60,percentage of 60 years and older of Asians and Pacific Islanders,p_asian_over_60=n_asian_over_60 / n_total_pop*100,p60asn,,,,,,,Race & Age, -p_asian_over_65,percentage of 65 years and older of Asians and Pacific Islanders,p_asian_over_65=n_asian_over_65 / n_total_pop*100,p65asn,,,,,,,Race & Age, -n_female_over_16,"females 16 years and over, except in armed forces",,dflabf,DCFEPR,SF3,P0700006+P0700007+P0700008,SF3,P043012,,Socioeconomic Status, -n_female_labor_force,females in labor force,,flabf,FEPR,SF3,P0700006+P0700007,SF3,P043010,,Socioeconomic Status, -n_labor_force,civilian labor 
force,,clf,,SF3,P0700002+P0700003+P0700006+P0700007,SF3,P043005+P043012,B27011_002E,Socioeconomic Status, -n_unemployed_persons,unemployed persons,,unemp,,SF3,P0700003+P0700007,SF3,P043007+P043014,B23001_008E+B23001_015E+B23001_022E+B23001_029E+B23001_036E+B23001_044E+B23001_050E+B23001_057E+B23001_064E+B23001_071E+B23001_094E+B23001_101E+B23001_108E+B23001_115E+B23001_122E+B23001_129E+B23001_136E+B23001_143E+B23001_150E+B23001_157E,Socioeconomic Status, -n_employed_over_16,employed persons 16 years and over,,empclf,EMPMT,SF3,P0700002+P0700006,SF3,P049001,B23001_007E+B23001_014E+B23001_021E+B23001_028E+B23001_035E+B23001_042E+B23001_049E+B23001_049E+B23001_056E+B23001_063E+B23001_070E+B23001_093E+B23001_100E+B23001_107E+B23001_114E+B23001_121E+B23001_128E+B23001_135E+B23001_142E+B23001_149E+B23001_156E,Socioeconomic Status, -n_employed_professional,professional employees (by occupations),,prof,DLFRAT,SF3,P0780001+P0780002,SF3,P049017+P049044,,Socioeconomic Status, -n_employed_manufacturing,manufacturing employees (by industries),,manuf,PRFEMP,SF3,P0770004+P0770005,SF3,P049007+P049034,,Socioeconomic Status, -n_employed_self_employed,self-employed,,semp,,SF3,P0790006,SF3,P051012+P051023+P051033+P051044+P051055+P051065,,Socioeconomic Status, -n_civilians_over_16,civilian population 16 years and over,,ag16cv,,SF3,P0640002+P0640003+P0640005+P0640006+P0640008+P0640009 +P0640011+P0640012,SF3,P043005+P043012,C24010_001E,Socioeconomic Status, -n_civilians_over_18,civilian population 18 years and over,,ag18cv,,,,SF3,P039005+P039010+P039016+P039021,,Socioeconomic Status, -n_veterans,veterans,,vet,,SF3,P0640002+P0640005+P0640008+P0640011,SF3,P039006+P039011+P039017+P039022,B21001_002E,Socioeconomic Status, -n_civilians_16_64,civilian non-institutionalized persons 16-64 years old,,cni16u,,SF3,P0640002+P0640003+P0640008+P0640009,SF3,P042001,,Socioeconomic Status, 
-n_disabled,disabled,,dis,,SF3,P0680001+P0680002+P0680005+P0680006+P0680009+P0680010+P0680013+P0680014,SF3,P042004+P042007+P042014+P042021+P042024+P042028+P042031+P042038+P042045+P042048,,Socioeconomic Status, -median_household_income,Median household income,,hinc,MDHHY,SF3,P080A001,SF3,P053001,B19013_001E,Socioeconomic Status,"in 2015 dollars, will need inflation adjustment for timeseries" -n_total_households,total households in sample-based data,,hh,NUMHHS,SF3,P0050001,SF3,P010001,B19001_001E,Socioeconomic Status, -median_income_whitehh,Median household income for whites,,hincw,,,,SF3,P152A001,B19013H_001E,Socioeconomic Status,"[ek] the 1990 table noted in the LTDB docs only has ranges, not median (e.g. P0820001)" -n_white_households,total white households in sample-based data,,hhw,,SF3,P0080001,SF3,P146A001,B19001H_001E,Socioeconomic Status, -median_income_blackhh,Median household income for blacks,,hincb,,,,SF3,P152B001,B19013B_001E,Socioeconomic Status,"[ek] the 1990 table noted in the LTDB docs only has ranges, not median (e.g. P0820001)" -n_black_households,total black households in sample-based data,,hhb,,SF3,P0080002,SF3,P146B001,B19001B_001E,Socioeconomic Status, -median_income_hispanichh,Median household income for Hispanics,,hinch,,,,SF3,P152H001,B19013I_001E,Socioeconomic Status,"[ek] the 1990 table noted in the LTDB docs only has ranges, not median (e.g. P0820001)" -n_hispanic_households,total Hispanic households in sample-based data,,hhh,,SF3,P0210001:07,SF3,P146H001,B19001I_001E,Socioeconomic Status,"[ek] the 1990 value is calculated differently than the LTDB codebook, because the their reference (P0830001) doesnt include hispanic origin" -median_income_asianhh,Median household income for Asians and Pacific Islanders,,hinca,,,,SF3,P152D001,,Socioeconomic Status,"[ek] the 1990 and 2010 tables noted in the LTDB docs only have ranges, not median (e.g. 
P0820001 for 1990 and B19001F_012E for 2010)" -n_asian_households,total Asian/Pacific Islander households in sample-based data,,hha,,SF3,P0080004,SF3,P152D001+P152E001,B19001D_001E+B19001E_001E,Socioeconomic Status,"unclear how to calculate, since this is only provided as asian or as PI for 2000. Column recorded is asian+pacific islander" -per_capita_income,Per capita income,,incpc,,SF3,P114A001,SF3,P082001,B19301_001E,Socioeconomic Status, -n_poverty_determined_persons,persons for whom poverty status is determined,,dpov,DPOVRAT,SF3,P1170001:24,SF3,P087001,B17001_001E,Socioeconomic Status,denominator for calculating poverty rate -n_poverty_persons,persons in poverty,,npov,NPOVRAT,SF3,P1170013:24,SF3,P087002,B17001_002E,Socioeconomic Status,numerator for calculating poverty rate -n_poverty_over_65,persons 65 years and older in poverty,,n65pov,NELDPOO,SF3,P1170023+P1170024,SF3,P087008+P087009,B17001_015E+B17001_016E+B17001_029E+B17001_030E,Socioeconomic Status, -n_poverty_determined_families,families for whom poverty status is determined,,dfmpov,,SF3,P1230001:24,SF3,P090001,B17001_001E,Socioeconomic Status, -n_poverty_families_children,families with children in poverty,,nfmpov,,,P1230013:15+P1230017:19+P1230021:23,SF3,P090002,B17010_004E+B17010_011E+B17010_017E,Socioeconomic Status, -n_poverty_determined_white,white persons for whom poverty status is determined,,dwpov,DWHTPR,SF3,P1190001:07+P1190036:42,SF3,P159A001,B17001A_001E,Socioeconomic Status,is this nonhispanic? Recorded white (regardless). 
White (not hispanic) is P159I -n_poverty_white,whites in poverty,,nwpov,NWHTPR,SF3,P1190036:42,SF3,P159A002,B17001A_002E,Socioeconomic Status, -n_poverty_determined_black,black persons for whom poverty status is determined,,dbpov,DBLKPR,SF3,P1190008:14+P1190043:49,SF3,P159B001,B17001B_001E,Socioeconomic Status, -n_poverty_black,blacks in poverty,,nbpov,NBLKPR,SF3,P1190043:49,SF3,P159B002,B17001B_002E,Socioeconomic Status, -n_poverty_determined_hispanic,Hispanics for whom poverty status is determined,,dhpov,DHISPR,,,SF3,P159H001,B17020I_001E,Socioeconomic Status,[ek] it's not clear to me how LTDB computed values from this variable https://api.census.gov/data/1990/sf3/variables/P1200001.json -n_poverty_hispanic,Hispanics in poverty,,nhpov,NHISPR,,,SF3,P159H002,B17020I_002E,Socioeconomic Status, -n_poverty_determined_native,Native American for whom poverty status is determined,,dnapov,DINDPR,SF3,P1190015:21+P1190050:56,SF3,P159C001,B17020C_001E,Socioeconomic Status, -n_poverty_native,Native Americans in poverty,,nnapov,INDPR,SF3,P1190050:56,SF3,P159C002,B17020C_002E,Socioeconomic Status, -n_poverty_determined_asian,Asians and Pacific Islanders for whom poverty status is determined,,dapov,DASNPR,SF3,P1190022:28+P1190058:63,SF3,P159D001+P159E001,B17020E_001E,Socioeconomic Status,"asian alone is D, hawaiian and pac islander is E" -n_poverty_asian,Asians and Pacific Islanders in poverty,,napov,NASNPR,SF3,P1190058:63,SF3,P159D002+P159E002,B17020E_002E,Socioeconomic Status, -n_edu_college_greater,persons with at least a four-year college degree,,col,EDUC16,SF3,P0570006+P0570007,SF3,P037015:18+P037032:35,B15002_015E+B15002_016E+B15002_017E+B15002_018E+B15002_032E+B15002_033E+B15002_034E+B15002_035E,Socioeconomic Status, -n_edu_hs_less,persons with high school degree or 
less,,hs,EDUC12,SF3,P0570001+P0570002+P0570003,SF3,P037003:011+P037020:028,B15002_003E+B15002_004E+B15002_005E+B15002_006E+B15002_007E+B15002_008E+B15002_009E+B15002_010E+B15002_020E+B15002_021E+B15002_022E+B15002_023E+B15002_024E+B15002_025E+B15002_026E+B15002_027E,Socioeconomic Status, -p_edu_hs_less,percentage of persons with high school degree or less,p_edu_hs_less=n_edu_hs_less / n_persons_over_25*100,phs,,,,,,,Socioeconomic Status, -p_edu_college_greater,percentage of persons with at least a four-year college degree,p_edu_college_greater=n_edu_college_greater / n_persons_over_25*100,pcol,,,,,,,Socioeconomic Status, -p_unemployment_rate,percent unemployed,p_unemployment_rate=n_unemployed_persons / n_labor_force*100,punemp,UNEMPRT,,,,,,Socioeconomic Status, -p_female_labor_force,percentage of females in labor force,,pflabf,,,,,,,Socioeconomic Status, -p_employed_professional,percentage of professional employees (by occupations),p_employed_professional=n_employed_professional / n_employed_over_16*100,pprof,,,,,,,Socioeconomic Status, -p_employed_manufacturing,percentage of manufacturing employees (by industries),p_employed_manufacturing=n_employed_manufacturing / n_employed_over_16*100,pmanuf,,,,,,,Socioeconomic Status, -p_employed_self_employed,percentage of self-employed,p_employed_self_employed=n_employed_self_employed / n_employed_over_16*100,psemp,,,,,,,Socioeconomic Status, -p_veterans,percentage of veterans,p_veterans=n_veterans / n_total_pop*100,pvet,,,,,,,Socioeconomic Status, -p_disabled,percent with disability,p_disabled=n_disabled / n_total_pop*100,pdis,,,,,,,Socioeconomic Status, -p_poverty_rate,percent poor,p_poverty_rate=n_poverty_persons / n_poverty_determined_persons*100,ppov,POVRAT,,,,,,Socioeconomic Status, -p_poverty_rate_over_65,percentage of 65 years and older in poverty,p_poverty_rate_over_65=n_poverty_over_65 / n_poverty_determined_persons*100,p65pov,ELDPOO,,,,,,Socioeconomic Status, -p_poverty_rate_children,percentage of families with 
children in poverty,p_poverty_rate_children=n_poverty_families_children / n_poverty_determined_families*100,pfmpov,,,,,,,Socioeconomic Status, -p_poverty_rate_white,percentage of whites in poverty,p_poverty_rate_white=n_poverty_white / n_poverty_determined_persons*100,pwpov,WHTPR,,,,,,Socioeconomic Status, -p_poverty_rate_black,percentage of blacks in poverty,p_poverty_rate_black=n_poverty_black / n_poverty_determined_persons*100,pbpov,BLKPR,,,,,,Socioeconomic Status, -p_poverty_rate_hispanic,percentage of Hispanics in poverty,p_poverty_rate_hispanic=n_poverty_hispanic / n_poverty_determined_persons*100,phpov,,,,,,,Socioeconomic Status, -p_poverty_rate_native,percentage of Native Americans in poverty,p_poverty_rate_native=n_poverty_native / n_poverty_determined_persons*100,pnapov,,,,,,,Socioeconomic Status, -p_poverty_rate_asian,percentage of Asian and Pacific Islanders in poverty,p_poverty_rate_asian=n_poverty_asian / n_poverty_determined_persons*100,papov,RASPR,,,,,,Socioeconomic Status, -n_total_pop,total population,,pop,TRCTPOP,SF1,P0010001,SF1,P001001,B01003_001E,total population, \ No newline at end of file +,Unnamed: 0.1,Unnamed: 0,variable,label,formula,ltdb,ncdb,census_1990_form,census_1990_table_column,census_2000_form,census_2000_table_column,acs,census_2020_table,census_2020_notes,category,notes +0,0,0,geoid,FIPS code,,geoid,GEO2010,,,,,,,,, +1,1,1,n_mexican_pop,persons of Mexican parentage or ancestry,,mex,MEXIC,SF1,P0090001,SF1,PCT011004,B03001_004E,,,Ethnicity & Immigration, +2,2,2,n_cuban_pop,persons of Cuban parentage or ancestry,,cuban,CUBAN,SF1,P0090004,SF1,PCT011006,B03001_006E,,,Ethnicity & Immigration, +3,3,3,n_puerto_rican_pop,persons of Puerto Rican parentage or ancestry,,pr,PRICAN,SF1,P0090003,SF1,PCT011005,B03001_005E,,,Ethnicity & Immigration, +4,4,4,n_russian_pop,persons of Russian/USSR parentage or ancestry,,ruanc,,SF3,P0330022,SF3,PCT016064+PCT016053+PCT016052+PCT016037,B04004_064E,,,Ethnicity & Immigration,ruancXX (page 17 of LTDB 
codebook) suggests that USSR is only selected for 1970. I gather you're aggregating soviet countries individually? 1990 doesn't seem to have USSR or several of its constituents +5,5,5,n_italian_pop,persons of Italian parentage or ancestry,,itanc,,SF3,P0330016,SF3,PCT016051,B04004_051E,,,Ethnicity & Immigration, +6,6,6,n_german_pop,persons of German parentage or ancestry,,geanc,,SF3,P0330012,SF3,PCT016042,B04004_042E,,,Ethnicity & Immigration, +7,7,7,n_irish_pop,persons of Irish parentage or ancestry,,iranc,,SF3,P0330015,SF3,PCT016049,B04004_049E,,,Ethnicity & Immigration, +8,8,8,n_scandaniavian_pop,persons of Scandinavian parentage/ancestry,,scanc,,,,SF3,PCT016059+PCT016039+PCT016033+PCT016090,B04004_065E,,,Ethnicity & Immigration,"scanXX (page 18 of LTDB codebook) suggests dedicated nationalities are used in 1990 and 2000. This is despite there being a scandinavian category in 2000 SF3 (PCT016065); [ek]: similarly, the ACS lists both the scandanavian category *and* the individual country nationalities?" 
+9,9,9,n_total_pop_sample,total population from sample-based data,,dfb,,,,,,,,,Ethnicity & Immigration,LTDB suggests 1980 only +10,10,10,n_foreign_born_pop,foreign-born,,fb,FORBORN,SF3,P0360001:10,SF3,P021013,B05002_013E,,,Ethnicity & Immigration, +11,11,11,n_recent_immigrant_pop,recent immigrants (within the past 10 years),,n10imm,,SF3,P0360001:04,SF3,P023002,B05005_007E,,,Ethnicity & Immigration, +12,12,12,n_naturalized_pop,naturalized foreign-born,,nat,FORBCZN,SF3,P0370005,SF3,P021014,B05002_014E,,,Ethnicity & Immigration, +13,13,13,n_age_5_older,persons 5 years and over,,ag5up,,SF3,P0130004:31,SF3,P019001,B16001_001E,,,Ethnicity & Immigration, +14,14,14,n_other_language,persons who speak language other than English at home,,olang,,SF3,P0310002:26,SF3,P019001 - (P019025+P019003+P019047),B16001_001E - B16001_002E,,,Ethnicity & Immigration,Construct census 2000 count by subtraction from P019001 +15,15,15,n_limited_english,persons who speak English not well,,lep,,SF3,P0280004+P0280007+P0280010+P0280014+P0280017+P0280020+P0280024+P0280027+P0280030,SF3,P019022+P019023+P019029+P019013+P019012+P019017+P019018+P019007+P019008+P019061+P019062+P019067+P019066+P019052+P019051+P019057+P019056+P019040+P019045+P019044+P019030+P019039+P019035+P019034,DP02_0113E,,,Ethnicity & Immigration,"[ljw] cant tell if this includes ""speak other Languages"" as a catchall or if that is a pre-crosstab" +16,16,16,n_russian_born_pop,persons who were born in Russia/ USSR,,rufb,,,,SF3,PCT019026,B05006_040E,,,Ethnicity & Immigration,"[ek] this is STF4 in 1990, so not available from the API" +17,17,17,n_italian_born_pop,persons who were born in Italy,,itfb,,,,SF3,PCT019016,B05006_023E,,,Ethnicity & Immigration,"[ek] this is STF4 in 1990, so not available from the API" +18,18,18,n_german_born_pop,persons who were born in Germany,,gefb,,,,SF3,PCT019011,B05006_017E,,,Ethnicity & Immigration,"[ek] this is STF4 in 1990, so not available from the API" +19,19,19,n_irish_born_pop,persons who were born in 
Ireland,,irfb,,,,SF3,PCT019005,B05006_008E,,,Ethnicity & Immigration,"[ek] this is STF4 in 1990, so not available from the API" +20,20,20,n_scandaniavian_born_pop,persons who were born in Scandinavian Countries,,scfb,,,,SF3,PCT019006+PCT019007,B05006_009E+B05006_010E+B05006_011E+B05006_012E,,,Ethnicity & Immigration,"[ek] this is STF4 in 1990, so not available from the API" +21,21,21,p_mexican_pop,percentage of persons of Mexican parentage or ancestry,p_mexican_pop=n_mexican_pop / n_total_pop*100,pmex,,,,,,,,,Ethnicity & Immigration, +22,22,22,p_cuban_pop,percentage of persons of Cuban parentage or ancestry,p_cuban_pop=n_cuban_pop / n_total_pop*100,pcuban,,,,,,,,,Ethnicity & Immigration, +23,23,23,p_puerto_rican_pop,percentage of persons of Puerto Rican parentage or ancestry,p_puerto_rican_pop=n_puerto_rican_pop / n_total_pop*100,ppr,,,,,,,,,Ethnicity & Immigration, +24,24,24,p_russian_pop,percentage of persons of Russian/USSR parentage or ancestry,p_russian_pop=n_russian_pop / n_total_pop*100,pruanc,,,,,,,,,Ethnicity & Immigration, +25,25,25,p_italian_pop,percentage of persons of Italian parentage or ancestry,p_italian_pop=n_italian_pop / n_total_pop*100,pitanc,,,,,,,,,Ethnicity & Immigration, +26,26,26,p_german_pop,percentage of persons of German parentage or ancestry,p_german_pop=n_german_pop / n_total_pop*100,pgeanc,,,,,,,,,Ethnicity & Immigration, +27,27,27,p_irish_pop,percentage of persons of Irish parentage or ancestry,p_irish_pop=n_irish_pop / n_total_pop*100,piranc,,,,,,,,,Ethnicity & Immigration, +28,28,28,p_scandanavian_pop,percentage of persons of Scandinavian parentage/ancestry,p_scandanavian_pop=n_scandaniavian_pop / n_total_pop*100,pscanc,,,,,,,,,Ethnicity & Immigration, +29,29,29,p_foreign_born_pop,percentage of foreign-born,p_foreign_born_pop=n_foreign_born_pop / n_total_pop*100,pfb,SHRFOR,,,,,,,,Ethnicity & Immigration, +30,30,30,p_recent_immigrant_pop,percentage of recent immigrants (within the past 10 
years),p_recent_immigrant_pop=n_recent_immigrant_pop / n_total_pop*100,p10imm,,,,,,,,,Ethnicity & Immigration, +31,31,31,p_naturalized_pop,percentage of naturalized foreign-born,p_naturalized_pop=n_naturalized_pop / n_total_pop*100,pnat,,,,,,,,,Ethnicity & Immigration, +32,32,32,p_other_language,percentage of persons who speak language other than English at home,p_other_language=n_other_language / n_total_pop*100,polang,,,,,,,,,Ethnicity & Immigration, +33,33,33,p_limited_english,percentage of persons who speak English not well,p_limited_english=n_limited_english / n_total_pop*100,plep,,,,,,,,,Ethnicity & Immigration, +34,34,34,p_russian_born_pop,percentage of persons who were born in Russia/ USSR,p_russian_born_pop=n_russian_born_pop / n_total_pop*100,prufb,,,,,,,,,Ethnicity & Immigration, +35,35,35,p_italian_born_pop,percentage of persons who were born in Italy,p_italian_born_pop=n_italian_born_pop / n_total_pop*100,pitfb,,,,,,,,,Ethnicity & Immigration, +36,36,36,p_german_born_pop,percentage of persons who were born in Germany,p_german_born_pop=n_german_born_pop / n_total_pop*100,pgefb,,,,,,,,,Ethnicity & Immigration, +37,37,37,p_irish_born_pop,percentage of persons who were born in Ireland,p_irish_born_pop=n_irish_born_pop / n_total_pop*100,pirfb,,,,,,,,,Ethnicity & Immigration, +38,38,38,p_scandanavian_born_pop,percentage of persons who were born in Scandinavian Countries,p_scandanavian_born_pop=n_scandaniavian_born_pop / n_total_pop*100,pscfb,,,,,,,,,Ethnicity & Immigration, +39,39,39,n_total_housing_units,housing units,,hu,TOTHSUN,SF1,H0010001,SF1,H001001,B25002_001E,,,"Housing, Age, & Marital Status", +40,40,40,n_vacant_housing_units,vacant housing units,,vac,VACHU,SF1,H0020002,SF1,H003003,B25002_003E,,,"Housing, Age, & Marital Status",divide by B25002_001E for vacancy rate +41,41,41,n_occupied_housing_units,occupied housing units,,ohu,OCCHU,SF1,H0020001,SF1,H003002,B25002_002E,,,"Housing, Age, & Marital Status", 
+42,42,42,n_owner_occupied_housing_units,owner-occupied housing units,,own,OWNOCC,SF1,H0030001,SF1,H004002,B25003_002E,,,"Housing, Age, & Marital Status", +43,43,43,n_renter_occupied_housing_units,renter-occupied housing units,,rent,RNTOCC,SF1,H0030002,SF1,H004003,B25003_003E,,,"Housing, Age, & Marital Status", +44,44,44,n_housing_units_multiunit_structures_denom,housing units denom,n_housing_units_multiunit_structures_denom=n_total_housing_units,dmulti,,,,,,B25024_001E,,,"Housing, Age, & Marital Status", +45,45,45,n_housing_units_multiunit_structures,housing units in multi-unit structures,,multi,,,,SF3,H030004+H030005+H030006+H030007+H030008+H030009,B25024_004E+B25024_005E+B25024_006E+B25024_007E+B25024_008E+B25024_009E,,,"Housing, Age, & Marital Status",[ljw] LTDB is unclear as to the relevant computed column from SF3-H030*. Recorded columns here are all stationary housing units (not mobile home (H030010) or RV/Van/Boat (H030011) +46,46,46,n_total_housing_units_sample,housing units in sample-based data,n_total_housing_units_sample=n_total_housing_units,husp,,,,,,B25024_001E,,,"Housing, Age, & Marital Status", +47,47,47,median_home_value,Median home value,,mhmval,MDVALHS,SF3,H061A001,SF3,H085001,B25077_001E,,,"Housing, Age, & Marital Status", +48,48,48,median_contract_rent,Median monthly contract rent,,mrent,MDGRENT,SF3,H043A001,SF3,H056001,B25058_001E,,,"Housing, Age, & Marital Status", +49,49,49,n_structures_30_old,structures built more than 30 years ago,,h30old,,SF3,H0250005+H0250006+H0250007+H0250008,SF3,H034010+H034009+H034008+H034007,,,,"Housing, Age, & Marital Status", +50,50,50,n_occupied_housing_units_sample,occupied housing units in sample-based data,,ohusp,,SF3,H0040001,SF3,H006001,B25003_001E,,,"Housing, Age, & Marital Status", +51,51,51,n_household_recent_move,household heads moved into unit less than 10 years ago,,h10yrs,,SF3,H0250006+H0250007+H0250008,SF3,H038003+H038004+H038005+H038010+H038011+H038012,,,,"Housing, Age, & Marital Status", 
+52,52,52,n_persons_under_18,persons age 17 years and under,,a18und,NCHILD,SF3,P0130012+P0130011+P0130010+P0130009+P0130008+P0130007+P0130006+P0130005+P0130004+P0130003+P0130002+P0130001,SF1,P012003+P012004+P012005+P012006+P012027+P012028+P012029+P012030,B01001_003E+B01001_004E+B01001_005E+B01001_006E+B01001_027E+B01001_028E+B01001_029E+B01001_030E,,,"Housing, Age, & Marital Status", +53,53,53,n_persons_over_60,persons age 60 years and over,,a60up,,SF3,P0130025+P0130026+P0130027+P0130028+P0130029+P0130030+P0130031,SF1,P012018:025+ P012042:049,B01001_018E+B01001_019E+B01001_020E+B01001_021E+B01001_022E+B01001_023E+B01001_024E+B01001_025E+B01001_042E+B01001_043E+B01001_044E+B01001_045E+B01001_046E+B01001_047E+B01001_048E+B01001_049E,,,"Housing, Age, & Marital Status", +54,54,54,n_persons_over_75,persons age 75 years and over,,a75up,,SF3,P0130029+P0130030+P0130031,SF1,P012023:25+ P012047:49,B01001_047E+B01001_048E+B01001_049E+B01001_023E+B01001_024E+B01001_025E,,,"Housing, Age, & Marital Status", +55,55,55,n_persons_over_15,population 15 years and over,,ag15up,PERS15P,SF3,P0130010+P0130011+P0130012+P0130013+P0130014+P0130015+P0130016+P0130017+P0130018+P0130019+P0130020+P0130021+P0130022+P0130023+P0130024+P0130025+P0130026+P0130027+P0130028+P0130029+P0130030+P0130031,SF3,P018001,B12001_001E,,,"Housing, Age, & Marital Status", +56,56,56,n_persons_over_25,population 25 years and over,,ag25up,,SF3,P0130018+P0130019+P0130020+P0130021+P0130022+P0130023+P0130024+P0130025+P0130026+P0130027+P0130028+P0130029+P0130030+P0130031,SF3,P037001,B15002_001E,,,"Housing, Age, & Marital Status",denominator for educational attainment +57,57,57,n_married,currently married (excluding separated),,mar,MMARSPP,SF3,P0250001+P0250002,SF3,(P018004+P018013) - (P018007+P018016),B12001_005E,,,"Housing, Age, & Marital Status", +58,58,58,n_widowed_divorced,"widowed, divorced, and 
separated",,wds,,SF3,P0270005+P0270011+P0270006+P0270012,SF3,P018007+P018009+P018010+P018016+P018018+P018019,B12001_007E+B12001_009E+B12001_010E+B12001_016E+B12001_018E+B12001_019E,,,"Housing, Age, & Marital Status", +59,59,59,n_total_families,total families,,family,FAMSUB,SF3,P0040001,SF1,P031001,B17010_001E,,,"Housing, Age, & Marital Status",denominator for calculating % female-headed families w/ children +60,60,60,n_female_headed_families,female-headed families with children,,fhh,NFFH,SF3,P0190005,SF1,P035016,B17010_017E,,,"Housing, Age, & Marital Status",numerator for calculating % female-headed families w/ children +61,61,61,p_vacant_housing_units,percentage of vacant housing units,p_vacant_housing_units=n_vacant_housing_units / n_total_housing_units*100,pvac,,,,,,,,,"Housing, Age, & Marital Status", +62,62,62,p_owner_occupied_units,percentage of owner-occupied housing units,p_owner_occupied_units=n_occupied_housing_units / n_total_housing_units*100,pown,,,,,,,,,"Housing, Age, & Marital Status", +63,63,63,p_housing_units_multiunit_structures,percentage of housing units in multi-unit structures,p_housing_units_multiunit_structures=n_housing_units_multiunit_structures / n_housing_units_multiunit_structures_denom*100,pmulti,,,,,,,,,"Housing, Age, & Marital Status", +64,64,64,p_structures_30_old,percentage of structures built more than 30 years ago,p_structures_30_old=n_structures_30_old / n_housing_units_multiunit_structures_denom*100,p30old,,,,,,,,,"Housing, Age, & Marital Status", +65,65,65,p_household_recent_move,percentage of household heads moved into unit less than 10 years ago,p_household_recent_move=n_household_recent_move / n_total_households*100,p10yrs,,,,,,,,,"Housing, Age, & Marital Status", +66,66,66,p_persons_under_18,percentage of persons age 17 years and under,p_persons_under_18=n_persons_under_18 / n_total_pop*100,p18und,,,,,,,,,"Housing, Age, & Marital Status", +67,67,67,p_persons_over_60,percentage of persons age 60 years and 
over,p_persons_over_60=n_persons_over_60 / n_total_pop*100,p60up,,,,,,,,,"Housing, Age, & Marital Status", +68,68,68,p_persons_over_75,percentage of persons age 75 years and over,p_persons_over_75=n_persons_over_75 / n_total_pop*100,p75up,,,,,,,,,"Housing, Age, & Marital Status", +69,69,69,p_married,percent currently married (excluding separated),p_married=n_married / n_persons_over_15*100,pmar,,,,,,,,,"Housing, Age, & Marital Status", +70,70,70,p_widowed_divorced,"percent widowed, divorced, and separated",p_widowed_divorced=n_widowed_divorced / n_persons_over_15*100,pwds,,,,,,,,,"Housing, Age, & Marital Status",should denom be families or individuals? +71,71,71,p_female_headed_families,percentage of female-headed families with children,p_female_headed_families=n_female_headed_families / n_total_families*100,pfhh,,,,,,,,,"Housing, Age, & Marital Status",should denom be families or individuals? +72,72,72,n_white_persons,persons of white race,,white,NSHRWHT,SF1,P0100001+P0100006,SF1,P003003,,,,Race & Age,"[ljw] inferring here, documentation suggests this is only available for 1970, but white alone (regardless of hispanic) gives this count? 
[ek] I think the question about hispanic ethnicity was added in 1980, so this is the best they can do to measure the 'white alone' construct in 1970 " +73,73,73,n_nonhisp_white_persons,"persons of white race, not Hispanic origin",,nhwht,NSHRNHW,SF1,P0100001,SF1,P004005,B03002_003E,,,Race & Age,"[ljw] I read this as P004005 (not hispanic white alone), not P003003 (white alone)" +74,74,74,n_black_persons,persons of black race,,black,NSHRBLK,SF1,P0100007+P0100002,SF1,P003004,,,,Race & Age, +75,75,75,n_nonhisp_black_persons,"persons of black race, not Hispanic origin",,nhblk,NSHRNHB,SF1,P0100002,SF1,P004006,B03002_004E,,,Race & Age, +76,76,76,n_hispanic_persons,persons of Hispanic origin,,hisp,NSHRHSP,SF1,P0080001,SF1,P004002,B03002_012E,,,Race & Age, +77,77,77,n_native_persons,"persons of Native American race, not Hispanic origin",,ntv,NSHRAMI,SF1,P0070003,SF1,P004007,B03002_005E,,,Race & Age,is this nonhispanic? +78,78,78,n_hawaiian_persons,"persons of Hawaiian race, not Hispanic origin",,haw,NSHRHIP,SF1,P0070017,SF1,P004009,B02001_006E,,,Race & Age,is this nonhispanic? 
+79,79,79,n_asian_indian_persons,persons of Asian Indian race,,india,,SF1,P0070009,SF1,PCT007002,B03002_006E+B03002_007E,,,Race & Age, +80,80,80,n_chinese_persons,persons of Chinese race,,china,,SF1,P0070006,SF1,PCT007005+PCT007015,B02015_007E+B02015_020E,,,Race & Age, +81,81,81,n_filipino_persons,persons of Filipino race,,filip,,SF1,P0070007,SF1,PCT007006,B02015_008E,,,Race & Age, +82,82,82,n_japanese_persons,persons of Japanese race,,japan,,SF1,P0070008,SF1,PCT007009,B02015_011E,,,Race & Age, +83,83,83,n_korean_persons,persons of Korean race,,korea,,SF1,P0070010,SF1,PCT007010,B02015_012E,,,Race & Age, +84,84,84,n_asian_persons,persons of Asian race,,asian,,SF1,P0060004,SF1,P004008,B03002_006E+B03002_007E,,,Race & Age,for 1990 this is Asian and PI +85,85,85,n_vietnamese_persons,persons of Vietnamese race,,viet,,SF1,P0070011,SF1,PCT007017,B02018_022E,,,Race & Age, +86,86,86,n_white_age_distribution,white population with known age distribution,,agewht,,SF1,P0120001:62,SF1,PCT012I001,B01001H_001E,,,Race & Age, +87,87,87,n_white_under_15,0-15 years old of white race,,a15wht,,SF1,P0120001:09+P0120032:40,SF1,PCT012I003:018+PCT012I108:122,B01001H_003E+B01001H_004E+B01001H_005E+B01001H_018E+B01001H_019E+B01001H_020E,,,Race & Age, +88,88,88,n_white_over_60,60 years and older of white race,,a60wht,,SF1,P0120025:31+P0120056:62,SF1,PCT012I063:105+PCT012I167:209,,,,Race & Age,is this nonhispanic? 
+89,89,89,n_white_over_65,65 years and older of non-Hispanic whites,,a65wht,,SF1,P0120027:31+P0120058:62,SF1,PCT012I068:105+PCT012I172:209,B01001H_014E+B01001H_015E+B01001H_016E+B01001H_029E+B01001H_030E+B01001H_031E,,,Race & Age, +90,90,90,n_black_age_distribution,black population with known age distribution,,ageblk,,SF1,P0120063:0124,SF1,PCT012J001,B01001B_001E,,,Race & Age, +91,91,91,n_black_under_15,0-15 years old of black race,,a15blk,,SF1,P0120063:71+P0120094:102,SF1,PCT012J003:018+PCT012J108:122,B01001B_003E+B01001B_004E+B01001B_005E+B01001B_018E+B01001B_019E+B01001B_020E,,,Race & Age, +92,92,92,n_black_over_60,60 years and older of black race,,a60blk,,SF1,P0120087:93+P0120118:124,SF1,PCT012J063:105+PCT012J167:209,,,,Race & Age, +93,93,93,n_black_over_65,65 years and older of black race,,a65blk,,SF1,P0120089:93+P0120120:124,SF1,PCT012J068:105+PCT012J172:209,B01001B_014E+B01001B_015E+B01001B_016E+B01001B_029E+B01001B_030E+B01001B_031E,,,Race & Age, +94,94,94,n_hispanic_age_distribution,Hispanic population with known age distribution,,agehsp,,SF1,P0130001:62,SF1,PCT012H001,B01001I_001E,,,Race & Age, +95,95,95,n_hispanic_under_15,"0-15 years old, persons of Hispanic origins",,a15hsp,,SF1,P0130001:09+P0130032:40,SF1,PCT012H003:018+PCT012H108:122,B01001I_003E+B01001I_004E+B01001I_005E+B01001I_018E+B01001I_019E+B01001I_020E,,,Race & Age, +96,96,96,n_hispanic_over_60,"60 years and older, persons of Hispanic origins",,a60hsp,,SF1,P0130025:31+P0130056:62,SF1,PCT012H063:105+PCT012H167:209,,,,Race & Age, +97,97,97,n_hispanic_over_65,"65 years and older, persons of Hispanic origins",,a65hsp,,SF1,P0130027:31+P0130058:62,SF1,PCT012H068:105+PCT012H172:209,B01001I_014E+B01001I_015E+B01001I_016E+B01001I_029E+B01001I_030E+B01001I_031E,,,Race & Age, +98,98,98,n_native_age_distribution,Native American population with known age distribution,,agentv,,SF1,P0120125:186,SF1,PCT012K001,B01001C_001E,,,Race & Age, +99,99,99,n_native_under_15,0-15 years old of Native American 
race,,a15ntv,,SF1,P0120125:133 +P0120156:164,SF1,PCT012K003:018+PCT012K108:122,B01001C_003E+B01001C_004E+B01001C_005E+B01001C_018E+B01001C_019E+B01001C_020E,,,Race & Age, +100,100,100,n_native_over_60,60 years and older of Native American race,,a60ntv,,SF1,P0120149:155+P0120180:186,SF1,PCT012K063:105+PCT012K167:209,,,,Race & Age, +101,101,101,n_native_over_65,65 years and older of Native American race,,a65ntv,,SF1,P0120151:155+P0120182:186,SF1,PCT012K068:105+PCT012K172:209,B01001C_014E+B01001C_015E+B01001C_016E+B01001C_029E+B01001C_030E+B01001C_031E,,,Race & Age, +102,102,102,n_asian_age_distribution,Asian and Pacific Islander population with known age distribution,,ageasn,,SF1,P0120187:248,SF1,PCT012L001+PCT012M001,B01001D_001E+B01001E_001E,,,Race & Age, +103,103,103,n_asian_under_15,0-15 years old of Asians and Pacific Islanders,,a15asn,,SF1,P0120187:195+P0120218:226,SF1,PCT012M003:018+PCT012M108:122+PCT012L003:018+PCT012L108:122,B01001D_003E+B01001D_004E+B01001D_005E+B01001D_018E+B01001D_019E+B01001D_020E+B01001E_003E+B01001E_004E+B01001E_005E+B01001E_018E+B01001E_019E+B01001E_020E,,,Race & Age, +104,104,104,n_asian_over_60,60 years and older of Asians and Pacific Islanders,,a60asn,,,,SF1,PCT012M063:105+PCT012M167:209,,,,Race & Age, +105,105,105,n_asian_over_65,65 years and older of Asians and Pacific Islanders,,a65asn,,,,SF1,PCT012M068:105+PCT012M172:209+PCT012L068:105+PCT012L172:209,B01001D_014E+B01001D_015E+B01001D_016E+B01001E_014E+B01001E_015E+B01001E_016E+B01001E_029E+B01001E_030E+B01001E_031E+B01001D_029E+B01001D_030E+B01001D_031E,,,Race & Age, +106,106,106,p_white_persons,percentage of persons of white race,,pwhite,,,,,,,,,Race & Age, +107,107,107,p_black_persons,percentage of persons of black race,,pblack,,,,,,,,,Race & Age, +108,108,108,p_nonhisp_white_persons,"percentage of persons of white race, not Hispanic origin",p_nonhisp_white_persons=n_nonhisp_white_persons / n_total_pop*100,pnhwht,SHRNHW,,,,,,,,Race & Age, 
+109,109,109,p_nonhisp_black_persons,"percentage of persons of black race, not Hispanic origin",p_nonhisp_black_persons=n_nonhisp_black_persons / n_total_pop*100,pnhblk,SHRNHB,,,,,,,,Race & Age, +110,110,110,p_hispanic_persons,percentage of persons of Hispanic origin,p_hispanic_persons=n_hispanic_persons / n_total_pop*100,phisp,SHRHSP,,,,,,,,Race & Age, +111,111,111,p_native_persons,percentage of persons of Native American race,p_native_persons=n_native_persons / n_total_pop*100,pntv,SHRNHI,,,,,,,,Race & Age, +112,112,112,p_asian_persons,percentage of persons of Asian race (and Pacific Islander),p_asian_persons=n_asian_persons / n_total_pop*100,pasian,SHRNHR,,,,,,,,Race & Age, +113,113,113,p_hawaiian_persons,percentage of persons of Hawaiian race,p_hawaiian_persons=n_hawaiian_persons / n_total_pop*100,phaw,SHRNHH,,,,,,,,Race & Age, +114,114,114,p_asian_indian_persons,percentage of persons of Asian Indian race,p_asian_indian_persons=n_asian_indian_persons / n_total_pop*100,pindia,,,,,,,,,Race & Age, +115,115,115,p_chinese_persons,percentage of persons of Chinese race,p_chinese_persons=n_chinese_persons / n_total_pop*100,pchina,,,,,,,,,Race & Age, +116,116,116,p_filipino_persons,percentage of persons of Filipino race,p_filipino_persons=n_filipino_persons / n_total_pop*100,pfilip,,,,,,,,,Race & Age, +117,117,117,p_japanese_persons,percentage of persons of Japanese race,p_japanese_persons=n_japanese_persons / n_total_pop*100,pjapan,,,,,,,,,Race & Age, +118,118,118,p_korean_persons,percentage of persons of Korean race,p_korean_persons=n_korean_persons / n_total_pop*100,pkorea,,,,,,,,,Race & Age, +119,119,119,p_vietnamese_persons,percentage of persons of Vietnamese race,p_vietnamese_persons=n_vietnamese_persons / n_total_pop*100,pviet,,,,,,,,,Race & Age, +120,120,120,p_white_under_15,percentage of 0-15 years old of white race,p_white_under_15=n_white_under_15 / n_total_pop*100,p15wht,,,,,,,,,Race & Age, +121,121,121,p_white_over_60,percentage of 60 years and older of 
white race,p_white_over_60=n_white_over_60 / n_total_pop*100,p60wht,,,,,,,,,Race & Age, +122,122,122,p_white_over_65,percentage of 65 years and older of non-Hispanic whites,p_white_over_65=n_white_over_65 / n_total_pop*100,p65wht,,,,,,,,,Race & Age, +123,123,123,p_black_under_15,percentage of 0-15 years old of black race,p_black_under_15=n_black_under_15 / n_total_pop*100,p15blk,,,,,,,,,Race & Age, +124,124,124,p_black_over_60,percentage of 60 years and older of black race,p_black_over_60=n_black_over_60 / n_total_pop*100,p60blk,,,,,,,,,Race & Age, +125,125,125,p_black_over_65,percentage of 65 years and older of black race,p_black_over_65=n_black_over_65 / n_total_pop*100,p65blk,,,,,,,,,Race & Age, +126,126,126,p_hispanic_under_15,"percentage of 0-15 years old, persons of Hispanic origins",p_hispanic_under_15=n_hispanic_under_15 / n_total_pop*100,p15hsp,,,,,,,,,Race & Age, +127,127,127,p_hispanic_over_60,"percentage of 60 years and older, persons of Hispanic origins",p_hispanic_over_60=n_hispanic_over_60 / n_total_pop*100,p60hsp,,,,,,,,,Race & Age, +128,128,128,p_hispanic_over_65,"percentage of 65 years and older, persons of Hispanic origins",p_hispanic_over_65=n_hispanic_over_65 / n_total_pop*100,p65hsp,,,,,,,,,Race & Age, +129,129,129,p_native_under_15,percentage of 0-15 years old of Native American race,p_native_under_15=n_native_under_15 / n_total_pop*100,p15ntv,,,,,,,,,Race & Age, +130,130,130,p_native_over_60,percentage of 60 years and older of Native American race,p_native_over_60=n_native_over_60 / n_total_pop*100,p60ntv,,,,,,,,,Race & Age, +131,131,131,p_native_over_65,percentage of 65 years and older of Native American race,p_native_over_65=n_native_over_65 / n_total_pop*100,p65ntv,,,,,,,,,Race & Age, +132,132,132,p_asian_under_15,percentage of 0-15 years old of Asians and Pacific Islanders,p_asian_under_15=n_asian_under_15 / n_total_pop*100,p15asn,,,,,,,,,Race & Age, +133,133,133,p_asian_over_60,percentage of 60 years and older of Asians and Pacific 
Islanders,p_asian_over_60=n_asian_over_60 / n_total_pop*100,p60asn,,,,,,,,,Race & Age, +134,134,134,p_asian_over_65,percentage of 65 years and older of Asians and Pacific Islanders,p_asian_over_65=n_asian_over_65 / n_total_pop*100,p65asn,,,,,,,,,Race & Age, +135,135,135,n_female_over_16,"females 16 years and over, except in armed forces",,dflabf,DCFEPR,SF3,P0700006+P0700007+P0700008,SF3,P043012,,,,Socioeconomic Status, +136,136,136,n_female_labor_force,females in labor force,,flabf,FEPR,SF3,P0700006+P0700007,SF3,P043010,,,,Socioeconomic Status, +137,137,137,n_labor_force,civilian labor force,,clf,,SF3,P0700002+P0700003+P0700006+P0700007,SF3,P043005+P043012,B27011_002E,,,Socioeconomic Status, +138,138,138,n_unemployed_persons,unemployed persons,,unemp,,SF3,P0700003+P0700007,SF3,P043007+P043014,B23001_008E+B23001_015E+B23001_022E+B23001_029E+B23001_036E+B23001_044E+B23001_050E+B23001_057E+B23001_064E+B23001_071E+B23001_094E+B23001_101E+B23001_108E+B23001_115E+B23001_122E+B23001_129E+B23001_136E+B23001_143E+B23001_150E+B23001_157E,,,Socioeconomic Status, +139,139,139,n_employed_over_16,employed persons 16 years and over,,empclf,EMPMT,SF3,P0700002+P0700006,SF3,P049001,B23001_007E+B23001_014E+B23001_021E+B23001_028E+B23001_035E+B23001_042E+B23001_049E+B23001_049E+B23001_056E+B23001_063E+B23001_070E+B23001_093E+B23001_100E+B23001_107E+B23001_114E+B23001_121E+B23001_128E+B23001_135E+B23001_142E+B23001_149E+B23001_156E,,,Socioeconomic Status, +140,140,140,n_employed_professional,professional employees (by occupations),,prof,DLFRAT,SF3,P0780001+P0780002,SF3,P049017+P049044,,,,Socioeconomic Status, +141,141,141,n_employed_manufacturing,manufacturing employees (by industries),,manuf,PRFEMP,SF3,P0770004+P0770005,SF3,P049007+P049034,,,,Socioeconomic Status, +142,142,142,n_employed_self_employed,self-employed,,semp,,SF3,P0790006,SF3,P051012+P051023+P051033+P051044+P051055+P051065,,,,Socioeconomic Status, +143,143,143,n_civilians_over_16,civilian population 16 years and 
over,,ag16cv,,SF3,P0640002+P0640003+P0640005+P0640006+P0640008+P0640009 +P0640011+P0640012,SF3,P043005+P043012,C24010_001E,,,Socioeconomic Status, +144,144,144,n_civilians_over_18,civilian population 18 years and over,,ag18cv,,,,SF3,P039005+P039010+P039016+P039021,,,,Socioeconomic Status, +145,145,145,n_veterans,veterans,,vet,,SF3,P0640002+P0640005+P0640008+P0640011,SF3,P039006+P039011+P039017+P039022,B21001_002E,,,Socioeconomic Status, +146,146,146,n_civilians_16_64,civilian non-institutionalized persons 16-64 years old,,cni16u,,SF3,P0640002+P0640003+P0640008+P0640009,SF3,P042001,,,,Socioeconomic Status, +147,147,147,n_disabled,disabled,,dis,,SF3,P0680001+P0680002+P0680005+P0680006+P0680009+P0680010+P0680013+P0680014,SF3,P042004+P042007+P042014+P042021+P042024+P042028+P042031+P042038+P042045+P042048,,,,Socioeconomic Status, +148,148,148,median_household_income,Median household income,,hinc,MDHHY,SF3,P080A001,SF3,P053001,B19013_001E,,,Socioeconomic Status,"in 2015 dollars, will need inflation adjustment for timeseries" +149,149,149,n_total_households,total households in sample-based data,,hh,NUMHHS,SF3,P0050001,SF3,P010001,B19001_001E,,,Socioeconomic Status, +150,150,150,median_income_whitehh,Median household income for whites,,hincw,,,,SF3,P152A001,B19013H_001E,,,Socioeconomic Status,"[ek] the 1990 table noted in the LTDB docs only has ranges, not median (e.g. P0820001)" +151,151,151,n_white_households,total white households in sample-based data,,hhw,,SF3,P0080001,SF3,P146A001,B19001H_001E,,,Socioeconomic Status, +152,152,152,median_income_blackhh,Median household income for blacks,,hincb,,,,SF3,P152B001,B19013B_001E,,,Socioeconomic Status,"[ek] the 1990 table noted in the LTDB docs only has ranges, not median (e.g. 
P0820001)" +153,153,153,n_black_households,total black households in sample-based data,,hhb,,SF3,P0080002,SF3,P146B001,B19001B_001E,,,Socioeconomic Status, +154,154,154,median_income_hispanichh,Median household income for Hispanics,,hinch,,,,SF3,P152H001,B19013I_001E,,,Socioeconomic Status,"[ek] the 1990 table noted in the LTDB docs only has ranges, not median (e.g. P0820001)" +155,155,155,n_hispanic_households,total Hispanic households in sample-based data,,hhh,,SF3,P0210001:07,SF3,P146H001,B19001I_001E,,,Socioeconomic Status,"[ek] the 1990 value is calculated differently than the LTDB codebook, because their reference (P0830001) doesn't include hispanic origin" +156,156,156,median_income_asianhh,Median household income for Asians and Pacific Islanders,,hinca,,,,SF3,P152D001,,,,Socioeconomic Status,"[ek] the 1990 and 2010 tables noted in the LTDB docs only have ranges, not median (e.g. P0820001 for 1990 and B19001F_012E for 2010)" +157,157,157,n_asian_households,total Asian/Pacific Islander households in sample-based data,,hha,,SF3,P0080004,SF3,P152D001+P152E001,B19001D_001E+B19001E_001E,,,Socioeconomic Status,"unclear how to calculate, since this is only provided as asian or as PI for 2000. 
Column recorded is asian+pacific islander" +158,158,158,per_capita_income,Per capita income,,incpc,,SF3,P114A001,SF3,P082001,B19301_001E,,,Socioeconomic Status, +159,159,159,n_poverty_determined_persons,persons for whom poverty status is determined,,dpov,DPOVRAT,SF3,P1170001:24,SF3,P087001,B17001_001E,,,Socioeconomic Status,denominator for calculating poverty rate +160,160,160,n_poverty_persons,persons in poverty,,npov,NPOVRAT,SF3,P1170013:24,SF3,P087002,B17001_002E,,,Socioeconomic Status,numerator for calculating poverty rate +161,161,161,n_poverty_over_65,persons 65 years and older in poverty,,n65pov,NELDPOO,SF3,P1170023+P1170024,SF3,P087008+P087009,B17001_015E+B17001_016E+B17001_029E+B17001_030E,,,Socioeconomic Status, +162,162,162,n_poverty_determined_families,families for whom poverty status is determined,,dfmpov,,SF3,P1230001:24,SF3,P090001,B17001_001E,,,Socioeconomic Status, +163,163,163,n_poverty_families_children,families with children in poverty,,nfmpov,,,P1230013:15+P1230017:19+P1230021:23,SF3,P090002,B17010_004E+B17010_011E+B17010_017E,,,Socioeconomic Status, +164,164,164,n_poverty_determined_white,white persons for whom poverty status is determined,,dwpov,DWHTPR,SF3,P1190001:07+P1190036:42,SF3,P159A001,B17001A_001E,,,Socioeconomic Status,is this nonhispanic? Recorded white (regardless). 
White (not hispanic) is P159I +165,165,165,n_poverty_white,whites in poverty,,nwpov,NWHTPR,SF3,P1190036:42,SF3,P159A002,B17001A_002E,,,Socioeconomic Status, +166,166,166,n_poverty_determined_black,black persons for whom poverty status is determined,,dbpov,DBLKPR,SF3,P1190008:14+P1190043:49,SF3,P159B001,B17001B_001E,,,Socioeconomic Status, +167,167,167,n_poverty_black,blacks in poverty,,nbpov,NBLKPR,SF3,P1190043:49,SF3,P159B002,B17001B_002E,,,Socioeconomic Status, +168,168,168,n_poverty_determined_hispanic,Hispanics for whom poverty status is determined,,dhpov,DHISPR,,,SF3,P159H001,B17020I_001E,,,Socioeconomic Status,[ek] it's not clear to me how LTDB computed values from this variable https://api.census.gov/data/1990/sf3/variables/P1200001.json +169,169,169,n_poverty_hispanic,Hispanics in poverty,,nhpov,NHISPR,,,SF3,P159H002,B17020I_002E,,,Socioeconomic Status, +170,170,170,n_poverty_determined_native,Native American for whom poverty status is determined,,dnapov,DINDPR,SF3,P1190015:21+P1190050:56,SF3,P159C001,B17020C_001E,,,Socioeconomic Status, +171,171,171,n_poverty_native,Native Americans in poverty,,nnapov,INDPR,SF3,P1190050:56,SF3,P159C002,B17020C_002E,,,Socioeconomic Status, +172,172,172,n_poverty_determined_asian,Asians and Pacific Islanders for whom poverty status is determined,,dapov,DASNPR,SF3,P1190022:28+P1190058:63,SF3,P159D001+P159E001,B17020E_001E,,,Socioeconomic Status,"asian alone is D, hawaiian and pac islander is E" +173,173,173,n_poverty_asian,Asians and Pacific Islanders in poverty,,napov,NASNPR,SF3,P1190058:63,SF3,P159D002+P159E002,B17020E_002E,,,Socioeconomic Status, +174,174,174,n_edu_college_greater,persons with at least a four-year college degree,,col,EDUC16,SF3,P0570006+P0570007,SF3,P037015:18+P037032:35,B15002_015E+B15002_016E+B15002_017E+B15002_018E+B15002_032E+B15002_033E+B15002_034E+B15002_035E,,,Socioeconomic Status, +175,175,175,n_edu_hs_less,persons with high school degree or 
less,,hs,EDUC12,SF3,P0570001+P0570002+P0570003,SF3,P037003:011+P037020:028,B15002_003E+B15002_004E+B15002_005E+B15002_006E+B15002_007E+B15002_008E+B15002_009E+B15002_010E+B15002_020E+B15002_021E+B15002_022E+B15002_023E+B15002_024E+B15002_025E+B15002_026E+B15002_027E,,,Socioeconomic Status, +176,176,176,p_edu_hs_less,percentage of persons with high school degree or less,p_edu_hs_less=n_edu_hs_less / n_persons_over_25*100,phs,,,,,,,,,Socioeconomic Status, +177,177,177,p_edu_college_greater,percentage of persons with at least a four-year college degree,p_edu_college_greater=n_edu_college_greater / n_persons_over_25*100,pcol,,,,,,,,,Socioeconomic Status, +178,178,178,p_unemployment_rate,percent unemployed,p_unemployment_rate=n_unemployed_persons / n_labor_force*100,punemp,UNEMPRT,,,,,,,,Socioeconomic Status, +179,179,179,p_female_labor_force,percentage of females in labor force,,pflabf,,,,,,,,,Socioeconomic Status, +180,180,180,p_employed_professional,percentage of professional employees (by occupations),p_employed_professional=n_employed_professional / n_employed_over_16*100,pprof,,,,,,,,,Socioeconomic Status, +181,181,181,p_employed_manufacturing,percentage of manufacturing employees (by industries),p_employed_manufacturing=n_employed_manufacturing / n_employed_over_16*100,pmanuf,,,,,,,,,Socioeconomic Status, +182,182,182,p_employed_self_employed,percentage of self-employed,p_employed_self_employed=n_employed_self_employed / n_employed_over_16*100,psemp,,,,,,,,,Socioeconomic Status, +183,183,183,p_veterans,percentage of veterans,p_veterans=n_veterans / n_total_pop*100,pvet,,,,,,,,,Socioeconomic Status, +184,184,184,p_disabled,percent with disability,p_disabled=n_disabled / n_total_pop*100,pdis,,,,,,,,,Socioeconomic Status, +185,185,185,p_poverty_rate,percent poor,p_poverty_rate=n_poverty_persons / n_poverty_determined_persons*100,ppov,POVRAT,,,,,,,,Socioeconomic Status, +186,186,186,p_poverty_rate_over_65,percentage of 65 years and older in 
poverty,p_poverty_rate_over_65=n_poverty_over_65 / n_poverty_determined_persons*100,p65pov,ELDPOO,,,,,,,,Socioeconomic Status, +187,187,187,p_poverty_rate_children,percentage of families with children in poverty,p_poverty_rate_children=n_poverty_families_children / n_poverty_determined_families*100,pfmpov,,,,,,,,,Socioeconomic Status, +188,188,188,p_poverty_rate_white,percentage of whites in poverty,p_poverty_rate_white=n_poverty_white / n_poverty_determined_persons*100,pwpov,WHTPR,,,,,,,,Socioeconomic Status, +189,189,189,p_poverty_rate_black,percentage of blacks in poverty,p_poverty_rate_black=n_poverty_black / n_poverty_determined_persons*100,pbpov,BLKPR,,,,,,,,Socioeconomic Status, +190,190,190,p_poverty_rate_hispanic,percentage of Hispanics in poverty,p_poverty_rate_hispanic=n_poverty_hispanic / n_poverty_determined_persons*100,phpov,,,,,,,,,Socioeconomic Status, +191,191,191,p_poverty_rate_native,percentage of Native Americans in poverty,p_poverty_rate_native=n_poverty_native / n_poverty_determined_persons*100,pnapov,,,,,,,,,Socioeconomic Status, +192,192,192,p_poverty_rate_asian,percentage of Asian and Pacific Islanders in poverty,p_poverty_rate_asian=n_poverty_asian / n_poverty_determined_persons*100,papov,RASPR,,,,,,,,Socioeconomic Status, +193,193,193,n_total_pop,total population,,pop,TRCTPOP,SF1,P0010001,SF1,P001001,B01003_001E,,,total population,