diff --git a/.github/workflows/check_acs_release.yml b/.github/workflows/check_acs_release.yml
new file mode 100644
index 00000000..de48941f
--- /dev/null
+++ b/.github/workflows/check_acs_release.yml
@@ -0,0 +1,35 @@
+name: Check for new ACS TIGER_DP release
+
+on:
+ schedule:
+ - cron: "0 15 1 * *"
+ workflow_dispatch:
+
+jobs:
+ check-release:
+ runs-on: ubuntu-latest
+ permissions:
+ contents: read
+ issues: write
+
+ steps:
+ - uses: actions/checkout@v4
+
+ - uses: conda-incubator/setup-miniconda@v3
+ with:
+ activate-environment: geosnap
+ environment-file: environment.yml
+ python-version: "3.11"
+ auto-activate-base: false
+
+ - name: Install package and extra dependencies
+ shell: bash -l {0}
+ run: |
+ pip install -e .
+ pip install PyGithub
+
+ - name: Check Census release and process if available
+ shell: bash -l {0}
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ run: python tools/check_acs_release.py
\ No newline at end of file
diff --git a/build/examine_output.ipynb b/build/examine_output.ipynb
new file mode 100644
index 00000000..378766a0
--- /dev/null
+++ b/build/examine_output.ipynb
@@ -0,0 +1,3604 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "bf7ad3ec-6101-454a-9092-fcbe217ba030",
+ "metadata": {},
+ "source": [
+ "# Examine output\n",
+ "\n",
+ "Use this notebook to see how effective the processing is for the 2022 ACS vintage."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "42ace1ac-64d4-4918-899f-ccf1e285b215",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import requests\n",
+ "\n",
+ "vars2021 = requests.get(\"https://api.census.gov/data/2021/acs/acs5/variables.json\").json()[\"variables\"]\n",
+ "vars2022 = requests.get(\"https://api.census.gov/data/2022/acs/acs5/variables.json\").json()[\"variables\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e1730a77-37b0-431d-88d0-9bf9d092ce9c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "missing = newly_missing_in_2022\n",
+ "\n",
+ "still_exist = []\n",
+ "gone = []\n",
+ "\n",
+ "for var in missing:\n",
+ " if var in vars2022:\n",
+ " still_exist.append(var)\n",
+ " else:\n",
+ " gone.append(var)\n",
+ "\n",
+ "print(\"Still exist in 2022:\", len(still_exist))\n",
+ "print(\"Gone in 2022:\", len(gone))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "22e852ab-8064-4e81-ad82-5c5aca45db3b",
+ "metadata": {},
+ "source": [
+ "The variables still exist according to the metadata (the variables JSON provided by the ACS), but they are not present where I expect them (the TIGER product). First, let's identify all the variables by their Census identifiers:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7d7c5d72-e33f-446e-86ff-f0bf309c683e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for var in still_exist:\n",
+ " print(var, \"->\", vars2022[var][\"label\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a9b932cb-6f06-4c1e-8b81-bc47e95e9237",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "from collections import defaultdict"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c6422685-f1be-4cdd-bb90-2dab660a20d8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def variable_to_table_group(var: str) -> str | None:\n",
+ " \"\"\"\n",
+ " Convert an ACS variable like B01003_001E to its table/group name B01003.\n",
+ " \"\"\"\n",
+ " m = re.match(r\"^([A-Z0-9]+)_\\d+[A-Z]$\", var)\n",
+ " if m:\n",
+ " return m.group(1)\n",
+ " return None\n",
+ "\n",
+ "\n",
+ "def group_variables_by_table(vars_list: list[str]) -> dict[str, list[str]]:\n",
+ " groups = defaultdict(list)\n",
+ " unparsed = []\n",
+ "\n",
+ " for var in sorted(set(vars_list)):\n",
+ " group = variable_to_table_group(var)\n",
+ " if group is None:\n",
+ " unparsed.append(var)\n",
+ " else:\n",
+ " groups[group].append(var)\n",
+ "\n",
+ " if unparsed:\n",
+ " print(\"Could not parse these variables:\")\n",
+ " for var in unparsed:\n",
+ " print(\" \", var)\n",
+ "\n",
+ " return dict(sorted(groups.items()))\n",
+ "\n",
+ "\n",
+ "groups = group_variables_by_table(newly_missing_in_2022)\n",
+ "\n",
+ "print(f\"Unique table groups: {len(groups)}\")\n",
+ "for group, vars_ in groups.items():\n",
+ " print(f\"{group}: {len(vars_)} vars\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a41071b9-a655-4061-be9d-643f939aead7",
+ "metadata": {
+ "jp-MarkdownHeadingCollapsed": true
+ },
+ "source": [
+ "## Another way to inspect the tables/groups"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5b1cdaaa-fe11-423a-ab8d-3b65ef31c1b5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def describe_group(group: str, vars_meta: dict) -> pd.DataFrame:\n",
+ " rows = []\n",
+ " for var, meta in vars_meta.items():\n",
+ " if var.startswith(f\"{group}_\"):\n",
+ " rows.append(\n",
+ " {\n",
+ " \"variable\": var,\n",
+ " \"label\": meta.get(\"label\"),\n",
+ " \"concept\": meta.get(\"concept\"),\n",
+ " \"predicateType\": meta.get(\"predicateType\"),\n",
+ " \"group\": meta.get(\"group\"),\n",
+ " }\n",
+ " )\n",
+ " return pd.DataFrame(rows).sort_values(\"variable\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "500ab2ae-734e-4c33-80b2-840dd3278ac7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Example:\n",
+ "describe_group(\"B01001\", vars2022).head(20)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "531953dc-7766-414a-b428-49a24df68742",
+ "metadata": {},
+ "source": [
+ "## Reconcile the new naming format\n",
+ "\n",
+ "Follow Eli's comment on the PR:\n",
+ "\n",
+ "`ok, now that i've looked at ont of the 2022 tables in the geodatabase, the reason you're getting no results is the naming convention has changed. Your PR includes an update for the geoid column, but there are other systematic changes. In the new tables, the variables are named (as an example): B02001_E001. We need to have processing that anticipates this format, then converts it to the canonical form (like the json tables, B02001_001E (where E/M is the final character of the variable rather than the leading character)`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "86339013-67d1-4f85-a213-a82608665c59",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import geopandas as gpd\n",
+ "\n",
+ "import re\n",
+ "from pathlib import Path\n",
+ "import pyarrow.parquet as pq\n",
+ "\n",
+ "from IPython.display import display"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "fe5f95eb-4a88-418b-82ee-9f7b21555570",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2021 dir: /home/dylan/projects/geosnap/build/2021_bg\n",
+ "2022 dir: /home/dylan/projects/geosnap/build/2022_bg\n",
+ "Report dir: /home/dylan/projects/geosnap/build/reports\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Adjust this path if needed\n",
+ "BUILD_ROOT = Path(\"../build\")\n",
+ "\n",
+ "DIR_2021 = BUILD_ROOT / \"2021_bg\"\n",
+ "DIR_2022 = BUILD_ROOT / \"2022_bg\"\n",
+ "REPORT_DIR = BUILD_ROOT / \"reports\"\n",
+ "REPORT_DIR.mkdir(parents=True, exist_ok=True)\n",
+ "\n",
+ "print(\"2021 dir:\", DIR_2021.resolve())\n",
+ "print(\"2022 dir:\", DIR_2022.resolve())\n",
+ "print(\"Report dir:\", REPORT_DIR.resolve())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2b02ca80-1327-4041-94de-56bb39bdf512",
+ "metadata": {},
+ "source": [
+ "## Inspect the new naming on one file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "7c4c5ad9-bade-4dfb-918e-ee3b49afbb50",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test1 = pd.read_parquet(f'{DIR_2022}/acs_2022_X14_SCHOOL_ENROLLMENT_bg.parquet')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "6adaade2-f526-49b4-b798-5660fe8773ac",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " B14002_E001 | \n",
+ " B14002_E002 | \n",
+ " B14002_E003 | \n",
+ " B14002_E004 | \n",
+ " B14002_E005 | \n",
+ " B14002_E006 | \n",
+ " B14002_E007 | \n",
+ " B14002_E008 | \n",
+ " B14002_E009 | \n",
+ " B14002_E010 | \n",
+ " ... | \n",
+ " B14007I_E010 | \n",
+ " B14007I_E011 | \n",
+ " B14007I_E012 | \n",
+ " B14007I_E013 | \n",
+ " B14007I_E014 | \n",
+ " B14007I_E015 | \n",
+ " B14007I_E016 | \n",
+ " B14007I_E017 | \n",
+ " B14007I_E018 | \n",
+ " B14007I_E019 | \n",
+ "
\n",
+ " \n",
+ " | GEOIDFQ | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 1500000US010179548002 | \n",
+ " 1375.0 | \n",
+ " 529.0 | \n",
+ " 45.0 | \n",
+ " 24.0 | \n",
+ " 24.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 14.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 21.0 | \n",
+ "
\n",
+ " \n",
+ " | 1500000US010179548004 | \n",
+ " 773.0 | \n",
+ " 409.0 | \n",
+ " 38.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 27.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 30.0 | \n",
+ "
\n",
+ " \n",
+ " | 1500000US010179548003 | \n",
+ " 281.0 | \n",
+ " 85.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 1500000US010150011031 | \n",
+ " 539.0 | \n",
+ " 321.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 1500000US010150024003 | \n",
+ " 970.0 | \n",
+ " 421.0 | \n",
+ " 93.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 33.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 268 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " B14002_E001 B14002_E002 B14002_E003 B14002_E004 \\\n",
+ "GEOIDFQ \n",
+ "1500000US010179548002 1375.0 529.0 45.0 24.0 \n",
+ "1500000US010179548004 773.0 409.0 38.0 0.0 \n",
+ "1500000US010179548003 281.0 85.0 0.0 0.0 \n",
+ "1500000US010150011031 539.0 321.0 0.0 0.0 \n",
+ "1500000US010150024003 970.0 421.0 93.0 0.0 \n",
+ "\n",
+ " B14002_E005 B14002_E006 B14002_E007 B14002_E008 \\\n",
+ "GEOIDFQ \n",
+ "1500000US010179548002 24.0 0.0 0.0 0.0 \n",
+ "1500000US010179548004 0.0 0.0 0.0 0.0 \n",
+ "1500000US010179548003 0.0 0.0 0.0 0.0 \n",
+ "1500000US010150011031 0.0 0.0 0.0 0.0 \n",
+ "1500000US010150024003 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " B14002_E009 B14002_E010 ... B14007I_E010 \\\n",
+ "GEOIDFQ ... \n",
+ "1500000US010179548002 0.0 0.0 ... 0.0 \n",
+ "1500000US010179548004 0.0 27.0 ... 0.0 \n",
+ "1500000US010179548003 0.0 0.0 ... 0.0 \n",
+ "1500000US010150011031 0.0 0.0 ... 0.0 \n",
+ "1500000US010150024003 0.0 33.0 ... 0.0 \n",
+ "\n",
+ " B14007I_E011 B14007I_E012 B14007I_E013 B14007I_E014 \\\n",
+ "GEOIDFQ \n",
+ "1500000US010179548002 0.0 0.0 14.0 0.0 \n",
+ "1500000US010179548004 0.0 0.0 0.0 0.0 \n",
+ "1500000US010179548003 0.0 0.0 0.0 0.0 \n",
+ "1500000US010150011031 0.0 0.0 0.0 0.0 \n",
+ "1500000US010150024003 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " B14007I_E015 B14007I_E016 B14007I_E017 B14007I_E018 \\\n",
+ "GEOIDFQ \n",
+ "1500000US010179548002 0.0 0.0 0.0 0.0 \n",
+ "1500000US010179548004 0.0 0.0 0.0 0.0 \n",
+ "1500000US010179548003 0.0 0.0 0.0 0.0 \n",
+ "1500000US010150011031 0.0 0.0 0.0 0.0 \n",
+ "1500000US010150024003 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " B14007I_E019 \n",
+ "GEOIDFQ \n",
+ "1500000US010179548002 21.0 \n",
+ "1500000US010179548004 30.0 \n",
+ "1500000US010179548003 5.0 \n",
+ "1500000US010150011031 0.0 \n",
+ "1500000US010150024003 0.0 \n",
+ "\n",
+ "[5 rows x 268 columns]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test1.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "d21c2ddf-80fc-4e5f-92ac-7cccc49c880d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test1_2021 = pd.read_parquet(f'{DIR_2021}/acs_2021_X14_SCHOOL_ENROLLMENT_bg.parquet')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "3d7aae74-69e3-4b3f-aba1-6a222190e1f7",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " B14002_001E | \n",
+ " B14002_002E | \n",
+ " B14002_003E | \n",
+ " B14002_004E | \n",
+ " B14002_005E | \n",
+ " B14002_006E | \n",
+ " B14002_007E | \n",
+ " B14002_008E | \n",
+ " B14002_009E | \n",
+ " B14002_010E | \n",
+ " ... | \n",
+ " B14007I_010E | \n",
+ " B14007I_011E | \n",
+ " B14007I_012E | \n",
+ " B14007I_013E | \n",
+ " B14007I_014E | \n",
+ " B14007I_015E | \n",
+ " B14007I_016E | \n",
+ " B14007I_017E | \n",
+ " B14007I_018E | \n",
+ " B14007I_019E | \n",
+ "
\n",
+ " \n",
+ " | GEOID | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 010010201001 | \n",
+ " 691.0 | \n",
+ " 296.0 | \n",
+ " 46.0 | \n",
+ " 4.0 | \n",
+ " 0.0 | \n",
+ " 4.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 31.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 13.0 | \n",
+ "
\n",
+ " \n",
+ " | 010010201002 | \n",
+ " 1038.0 | \n",
+ " 558.0 | \n",
+ " 145.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 87.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 9.0 | \n",
+ "
\n",
+ " \n",
+ " | 010010202001 | \n",
+ " 782.0 | \n",
+ " 324.0 | \n",
+ " 77.0 | \n",
+ " 7.0 | \n",
+ " 0.0 | \n",
+ " 7.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 29.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 010010202002 | \n",
+ " 1146.0 | \n",
+ " 703.0 | \n",
+ " 67.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 28.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 010010203001 | \n",
+ " 2667.0 | \n",
+ " 1256.0 | \n",
+ " 329.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 25.0 | \n",
+ " 25.0 | \n",
+ " 0.0 | \n",
+ " 117.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 268 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " B14002_001E B14002_002E B14002_003E B14002_004E B14002_005E \\\n",
+ "GEOID \n",
+ "010010201001 691.0 296.0 46.0 4.0 0.0 \n",
+ "010010201002 1038.0 558.0 145.0 0.0 0.0 \n",
+ "010010202001 782.0 324.0 77.0 7.0 0.0 \n",
+ "010010202002 1146.0 703.0 67.0 0.0 0.0 \n",
+ "010010203001 2667.0 1256.0 329.0 0.0 0.0 \n",
+ "\n",
+ " B14002_006E B14002_007E B14002_008E B14002_009E B14002_010E \\\n",
+ "GEOID \n",
+ "010010201001 4.0 0.0 0.0 0.0 31.0 \n",
+ "010010201002 0.0 0.0 0.0 0.0 87.0 \n",
+ "010010202001 7.0 0.0 0.0 0.0 29.0 \n",
+ "010010202002 0.0 0.0 0.0 0.0 28.0 \n",
+ "010010203001 0.0 25.0 25.0 0.0 117.0 \n",
+ "\n",
+ " ... B14007I_010E B14007I_011E B14007I_012E B14007I_013E \\\n",
+ "GEOID ... \n",
+ "010010201001 ... 0.0 0.0 0.0 0.0 \n",
+ "010010201002 ... 0.0 0.0 0.0 0.0 \n",
+ "010010202001 ... 0.0 0.0 0.0 0.0 \n",
+ "010010202002 ... 0.0 0.0 0.0 0.0 \n",
+ "010010203001 ... 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " B14007I_014E B14007I_015E B14007I_016E B14007I_017E \\\n",
+ "GEOID \n",
+ "010010201001 0.0 0.0 0.0 0.0 \n",
+ "010010201002 0.0 0.0 0.0 0.0 \n",
+ "010010202001 0.0 0.0 0.0 0.0 \n",
+ "010010202002 0.0 0.0 0.0 0.0 \n",
+ "010010203001 0.0 0.0 0.0 2.0 \n",
+ "\n",
+ " B14007I_018E B14007I_019E \n",
+ "GEOID \n",
+ "010010201001 0.0 13.0 \n",
+ "010010201002 0.0 9.0 \n",
+ "010010202001 0.0 0.0 \n",
+ "010010202002 0.0 0.0 \n",
+ "010010203001 0.0 5.0 \n",
+ "\n",
+ "[5 rows x 268 columns]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test1_2021.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "e28b0089-3da0-4794-8bb6-fa606d5f5e8b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test2 = pd.read_parquet(f'{DIR_2022}/acs_2022_X03_HISPANIC_OR_LATINO_ORIGIN_bg.parquet')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "d4700f62-dc2e-4097-bea6-78b9b6c4e482",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " B03002_E001 | \n",
+ " B03002_E002 | \n",
+ " B03002_E003 | \n",
+ " B03002_E004 | \n",
+ " B03002_E005 | \n",
+ " B03002_E006 | \n",
+ " B03002_E007 | \n",
+ " B03002_E008 | \n",
+ " B03002_E009 | \n",
+ " B03002_E010 | \n",
+ " ... | \n",
+ " B03002_E015 | \n",
+ " B03002_E016 | \n",
+ " B03002_E017 | \n",
+ " B03002_E018 | \n",
+ " B03002_E019 | \n",
+ " B03002_E020 | \n",
+ " B03002_E021 | \n",
+ " B03003_E001 | \n",
+ " B03003_E002 | \n",
+ " B03003_E003 | \n",
+ "
\n",
+ " \n",
+ " | GEOIDFQ | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 1500000US010179548002 | \n",
+ " 1375.0 | \n",
+ " 1340.0 | \n",
+ " 149.0 | \n",
+ " 1191.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1375.0 | \n",
+ " 1340.0 | \n",
+ " 35.0 | \n",
+ "
\n",
+ " \n",
+ " | 1500000US010179548004 | \n",
+ " 797.0 | \n",
+ " 767.0 | \n",
+ " 450.0 | \n",
+ " 314.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 30.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 797.0 | \n",
+ " 767.0 | \n",
+ " 30.0 | \n",
+ "
\n",
+ " \n",
+ " | 1500000US010179548003 | \n",
+ " 281.0 | \n",
+ " 276.0 | \n",
+ " 138.0 | \n",
+ " 138.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 281.0 | \n",
+ " 276.0 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 1500000US010150011031 | \n",
+ " 560.0 | \n",
+ " 560.0 | \n",
+ " 560.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 560.0 | \n",
+ " 560.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 1500000US010150024003 | \n",
+ " 1003.0 | \n",
+ " 1003.0 | \n",
+ " 871.0 | \n",
+ " 45.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 87.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1003.0 | \n",
+ " 1003.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 24 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " B03002_E001 B03002_E002 B03002_E003 B03002_E004 \\\n",
+ "GEOIDFQ \n",
+ "1500000US010179548002 1375.0 1340.0 149.0 1191.0 \n",
+ "1500000US010179548004 797.0 767.0 450.0 314.0 \n",
+ "1500000US010179548003 281.0 276.0 138.0 138.0 \n",
+ "1500000US010150011031 560.0 560.0 560.0 0.0 \n",
+ "1500000US010150024003 1003.0 1003.0 871.0 45.0 \n",
+ "\n",
+ " B03002_E005 B03002_E006 B03002_E007 B03002_E008 \\\n",
+ "GEOIDFQ \n",
+ "1500000US010179548002 0.0 0.0 0.0 0.0 \n",
+ "1500000US010179548004 0.0 0.0 3.0 0.0 \n",
+ "1500000US010179548003 0.0 0.0 0.0 0.0 \n",
+ "1500000US010150011031 0.0 0.0 0.0 0.0 \n",
+ "1500000US010150024003 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " B03002_E009 B03002_E010 ... B03002_E015 \\\n",
+ "GEOIDFQ ... \n",
+ "1500000US010179548002 0.0 0.0 ... 0.0 \n",
+ "1500000US010179548004 0.0 0.0 ... 0.0 \n",
+ "1500000US010179548003 0.0 0.0 ... 0.0 \n",
+ "1500000US010150011031 0.0 0.0 ... 0.0 \n",
+ "1500000US010150024003 87.0 0.0 ... 0.0 \n",
+ "\n",
+ " B03002_E016 B03002_E017 B03002_E018 B03002_E019 \\\n",
+ "GEOIDFQ \n",
+ "1500000US010179548002 0.0 0.0 0.0 0.0 \n",
+ "1500000US010179548004 0.0 0.0 30.0 0.0 \n",
+ "1500000US010179548003 0.0 0.0 5.0 0.0 \n",
+ "1500000US010150011031 0.0 0.0 0.0 0.0 \n",
+ "1500000US010150024003 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " B03002_E020 B03002_E021 B03003_E001 B03003_E002 \\\n",
+ "GEOIDFQ \n",
+ "1500000US010179548002 0.0 0.0 1375.0 1340.0 \n",
+ "1500000US010179548004 0.0 0.0 797.0 767.0 \n",
+ "1500000US010179548003 0.0 0.0 281.0 276.0 \n",
+ "1500000US010150011031 0.0 0.0 560.0 560.0 \n",
+ "1500000US010150024003 0.0 0.0 1003.0 1003.0 \n",
+ "\n",
+ " B03003_E003 \n",
+ "GEOIDFQ \n",
+ "1500000US010179548002 35.0 \n",
+ "1500000US010179548004 30.0 \n",
+ "1500000US010179548003 5.0 \n",
+ "1500000US010150011031 0.0 \n",
+ "1500000US010150024003 0.0 \n",
+ "\n",
+ "[5 rows x 24 columns]"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test2.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "a2f23636-b9d8-47fb-90a2-3b5c94b12cbe",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test2_2021 = pd.read_parquet(f'{DIR_2021}/acs_2021_X03_HISPANIC_OR_LATINO_ORIGIN_bg.parquet')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "b39f8d07-ad85-40b2-94ba-425625aa3c75",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " B03002_001E | \n",
+ " B03002_002E | \n",
+ " B03002_003E | \n",
+ " B03002_004E | \n",
+ " B03002_005E | \n",
+ " B03002_006E | \n",
+ " B03002_007E | \n",
+ " B03002_008E | \n",
+ " B03002_009E | \n",
+ " B03002_010E | \n",
+ " ... | \n",
+ " B03002_015E | \n",
+ " B03002_016E | \n",
+ " B03002_017E | \n",
+ " B03002_018E | \n",
+ " B03002_019E | \n",
+ " B03002_020E | \n",
+ " B03002_021E | \n",
+ " B03003_001E | \n",
+ " B03003_002E | \n",
+ " B03003_003E | \n",
+ "
\n",
+ " \n",
+ " | GEOID | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 010010201001 | \n",
+ " 693.0 | \n",
+ " 674.0 | \n",
+ " 587.0 | \n",
+ " 16.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 71.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 693.0 | \n",
+ " 674.0 | \n",
+ " 19.0 | \n",
+ "
\n",
+ " \n",
+ " | 010010201002 | \n",
+ " 1098.0 | \n",
+ " 1089.0 | \n",
+ " 887.0 | \n",
+ " 155.0 | \n",
+ " 0.0 | \n",
+ " 38.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 9.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1098.0 | \n",
+ " 1089.0 | \n",
+ " 9.0 | \n",
+ "
\n",
+ " \n",
+ " | 010010202001 | \n",
+ " 844.0 | \n",
+ " 834.0 | \n",
+ " 336.0 | \n",
+ " 421.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 77.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 844.0 | \n",
+ " 834.0 | \n",
+ " 10.0 | \n",
+ "
\n",
+ " \n",
+ " | 010010202002 | \n",
+ " 1166.0 | \n",
+ " 1166.0 | \n",
+ " 439.0 | \n",
+ " 667.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 8.0 | \n",
+ " 52.0 | \n",
+ " 27.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1166.0 | \n",
+ " 1166.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 010010203001 | \n",
+ " 2685.0 | \n",
+ " 2672.0 | \n",
+ " 2011.0 | \n",
+ " 531.0 | \n",
+ " 0.0 | \n",
+ " 26.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 104.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 7.0 | \n",
+ " 6.0 | \n",
+ " 6.0 | \n",
+ " 0.0 | \n",
+ " 2685.0 | \n",
+ " 2672.0 | \n",
+ " 13.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 24 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " B03002_001E B03002_002E B03002_003E B03002_004E B03002_005E \\\n",
+ "GEOID \n",
+ "010010201001 693.0 674.0 587.0 16.0 0.0 \n",
+ "010010201002 1098.0 1089.0 887.0 155.0 0.0 \n",
+ "010010202001 844.0 834.0 336.0 421.0 0.0 \n",
+ "010010202002 1166.0 1166.0 439.0 667.0 0.0 \n",
+ "010010203001 2685.0 2672.0 2011.0 531.0 0.0 \n",
+ "\n",
+ " B03002_006E B03002_007E B03002_008E B03002_009E B03002_010E \\\n",
+ "GEOID \n",
+ "010010201001 0.0 0.0 0.0 71.0 0.0 \n",
+ "010010201002 38.0 0.0 0.0 9.0 0.0 \n",
+ "010010202001 0.0 0.0 0.0 77.0 0.0 \n",
+ "010010202002 0.0 0.0 8.0 52.0 27.0 \n",
+ "010010203001 26.0 0.0 0.0 104.0 0.0 \n",
+ "\n",
+ " ... B03002_015E B03002_016E B03002_017E B03002_018E \\\n",
+ "GEOID ... \n",
+ "010010201001 ... 0.0 0.0 0.0 0.0 \n",
+ "010010201002 ... 0.0 0.0 0.0 0.0 \n",
+ "010010202001 ... 0.0 0.0 0.0 0.0 \n",
+ "010010202002 ... 0.0 0.0 0.0 0.0 \n",
+ "010010203001 ... 0.0 0.0 0.0 7.0 \n",
+ "\n",
+ " B03002_019E B03002_020E B03002_021E B03003_001E B03003_002E \\\n",
+ "GEOID \n",
+ "010010201001 0.0 0.0 0.0 693.0 674.0 \n",
+ "010010201002 0.0 0.0 0.0 1098.0 1089.0 \n",
+ "010010202001 0.0 0.0 0.0 844.0 834.0 \n",
+ "010010202002 0.0 0.0 0.0 1166.0 1166.0 \n",
+ "010010203001 6.0 6.0 0.0 2685.0 2672.0 \n",
+ "\n",
+ " B03003_003E \n",
+ "GEOID \n",
+ "010010201001 19.0 \n",
+ "010010201002 9.0 \n",
+ "010010202001 10.0 \n",
+ "010010202002 0.0 \n",
+ "010010203001 13.0 \n",
+ "\n",
+ "[5 rows x 24 columns]"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test2_2021.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0d85ad48-1e93-417e-a8d1-2d97f728fce8",
+ "metadata": {},
+ "source": [
+ "It would be really nice if moving the 'E' were the only naming-convention change in the new vintage."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "0346257d-4d09-4e46-8692-856895d490dc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# New 2022-style ACS naming:\n",
+ "# B02001_E001\n",
+ "# B02001_M001\n",
+ "NEW_STYLE_ACS_RE = re.compile(r\"^([A-Z0-9]+)_([EM])(\\d{3})$\", re.IGNORECASE)\n",
+ "\n",
+ "# Canonical ACS naming:\n",
+ "# B02001_001E\n",
+ "# B02001_001M\n",
+ "CANONICAL_ACS_RE = re.compile(r\"^([A-Z0-9]+)_(\\d{3})([EM])$\", re.IGNORECASE)\n",
+ "\n",
+ "# Flexible GEOID-like matcher\n",
+ "GEOID_RE = re.compile(r\"^GEOID([A-Z_].*)?$\", re.IGNORECASE)\n",
+ "\n",
+ "\n",
+ "def read_parquet_columns(path: Path) -> list[str]:\n",
+ " \"\"\"Read parquet schema only, not the data.\"\"\"\n",
+ " schema = pq.ParquetFile(path).schema_arrow\n",
+ " return schema.names\n",
+ "\n",
+ "\n",
+ "def canonicalize_column(col: str) -> str:\n",
+ " \"\"\"\n",
+ " Normalize ACS variable names to canonical form.\n",
+ "\n",
+ " Examples:\n",
+ " B02001_E001 -> B02001_001E\n",
+ " B02001_M001 -> B02001_001M\n",
+ " B02001_001E -> B02001_001E\n",
+ " \"\"\"\n",
+ " c = col.strip()\n",
+ "\n",
+ " m = NEW_STYLE_ACS_RE.match(c)\n",
+ " if m:\n",
+ " stem, suffix, digits = m.groups()\n",
+ " return f\"{stem.upper()}_{digits}{suffix.upper()}\"\n",
+ "\n",
+ " m = CANONICAL_ACS_RE.match(c)\n",
+ " if m:\n",
+ " stem, digits, suffix = m.groups()\n",
+ " return f\"{stem.upper()}_{digits}{suffix.upper()}\"\n",
+ "\n",
+ " return c\n",
+ "\n",
+ "\n",
+ "def classify_column(col: str) -> str:\n",
+ " c = col.strip()\n",
+ "\n",
+ " if GEOID_RE.match(c) or c in {\"GEOIDFQ\", \"GEOID_Data\"}:\n",
+ " return \"geoid_like\"\n",
+ "\n",
+ " if NEW_STYLE_ACS_RE.match(c):\n",
+ " return \"acs_new_style\"\n",
+ "\n",
+ " if CANONICAL_ACS_RE.match(c):\n",
+ " return \"acs_canonical\"\n",
+ "\n",
+ " return \"other\"\n",
+ "\n",
+ "\n",
+ "def layer_key(path: Path) -> str:\n",
+ " \"\"\"\n",
+ " Convert a filename into a year-agnostic layer key.\n",
+ " \"\"\"\n",
+ " name = path.name\n",
+ "\n",
+ " if re.fullmatch(r\"acs_\\d{4}_bg\\.parquet\", name):\n",
+ " return \"ALL_BG\"\n",
+ "\n",
+ " if re.fullmatch(r\"acs_demographic_profile_\\d{4}_bg\\.parquet\", name):\n",
+ " return \"DEMOGRAPHIC_PROFILE\"\n",
+ "\n",
+ " m = re.fullmatch(r\"acs_\\d{4}_(.+?)_bg\\.parquet\", name)\n",
+ " if m:\n",
+ " return m.group(1)\n",
+ "\n",
+ " return name"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dc6866a6-b51a-4056-b083-46382112e98a",
+ "metadata": {},
+ "source": [
+ "## Sanity check: do the helper functions work?\n",
+ "\n",
+ "Expected: \n",
+ "- B02001_E001 should become B02001_001E\n",
+ "- B02001_M001 should become B02001_001M\n",
+ "- canonical names should stay unchanged\n",
+ "- GEOID-like columns should stay unchanged"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "c7647e0c-b8e9-4271-b446-3ea81609bbe0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " original | \n",
+ " classification | \n",
+ " canonical | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " B02001_E001 | \n",
+ " acs_new_style | \n",
+ " B02001_001E | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " B02001_M001 | \n",
+ " acs_new_style | \n",
+ " B02001_001M | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " B02001_001E | \n",
+ " acs_canonical | \n",
+ " B02001_001E | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " B19013_001E | \n",
+ " acs_canonical | \n",
+ " B19013_001E | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " GEOID | \n",
+ " geoid_like | \n",
+ " GEOID | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " GEOIDFQ | \n",
+ " geoid_like | \n",
+ " GEOIDFQ | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " GEOID_Data | \n",
+ " geoid_like | \n",
+ " GEOID_Data | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " NAME | \n",
+ " other | \n",
+ " NAME | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " original classification canonical\n",
+ "0 B02001_E001 acs_new_style B02001_001E\n",
+ "1 B02001_M001 acs_new_style B02001_001M\n",
+ "2 B02001_001E acs_canonical B02001_001E\n",
+ "3 B19013_001E acs_canonical B19013_001E\n",
+ "4 GEOID geoid_like GEOID\n",
+ "5 GEOIDFQ geoid_like GEOIDFQ\n",
+ "6 GEOID_Data geoid_like GEOID_Data\n",
+ "7 NAME other NAME"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "test_cols = [\n",
+ " \"B02001_E001\",\n",
+ " \"B02001_M001\",\n",
+ " \"B02001_001E\",\n",
+ " \"B19013_001E\",\n",
+ " \"GEOID\",\n",
+ " \"GEOIDFQ\",\n",
+ " \"GEOID_Data\",\n",
+ " \"NAME\",\n",
+ "]\n",
+ "\n",
+ "test_df = pd.DataFrame({\n",
+ " \"original\": test_cols,\n",
+ " \"classification\": [classify_column(c) for c in test_cols],\n",
+ " \"canonical\": [canonicalize_column(c) for c in test_cols],\n",
+ "})\n",
+ "\n",
+ "display(test_df)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c7fc46df-fb64-4190-9de2-249ea92c0309",
+ "metadata": {},
+ "source": [
+ "## Compare the files between vintages\n",
+ "\n",
+ "Verify which output tables are comparable between the 2021 and 2022 builds, and identify which tables are new in 2022."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "07bd7b42-cdb1-475f-9cd8-9d1adfbdadc5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def parse_parquet_filename(path: Path) -> dict:\n",
+ " \"\"\"\n",
+ " Parse known ACS parquet filenames into structured parts and normalize\n",
+ " year-specific pieces so 2021 and 2022 comparable files align.\n",
+ " \"\"\"\n",
+ " name = path.name\n",
+ "\n",
+ " # acs_demographic_profile_2022_bg.parquet\n",
+ " m = re.match(\n",
+ " r\"^acs_demographic_profile_(\\d{4})_(\\w+)\\.parquet$\",\n",
+ " name,\n",
+ " flags=re.IGNORECASE,\n",
+ " )\n",
+ " if m:\n",
+ " year, geography = m.groups()\n",
+ " return {\n",
+ " \"file\": name,\n",
+ " \"year\": int(year),\n",
+ " \"kind\": \"demographic_profile\",\n",
+ " \"x_code\": None,\n",
+ " \"table_name\": \"demographic_profile\",\n",
+ " \"geography\": geography,\n",
+ " \"group_key\": f\"demographic_profile::{geography}\",\n",
+ " \"sort_key\": (9998, \"demographic_profile\", geography),\n",
+ " }\n",
+ "\n",
+ " # acs_2022_X29_VOTING_AGE_POPULATION_bg.parquet\n",
+ " m = re.match(\n",
+ " r\"^acs_(\\d{4})_(X\\d{2})_(.+)_(\\w+)\\.parquet$\",\n",
+ " name,\n",
+ " flags=re.IGNORECASE,\n",
+ " )\n",
+ " if m:\n",
+ " year, x_code, table_name, geography = m.groups()\n",
+ " x_code = x_code.upper()\n",
+ " return {\n",
+ " \"file\": name,\n",
+ " \"year\": int(year),\n",
+ " \"kind\": \"x_table\",\n",
+ " \"x_code\": x_code,\n",
+ " \"table_name\": table_name,\n",
+ " \"geography\": geography,\n",
+ " \"group_key\": f\"{x_code}::{table_name}::{geography}\",\n",
+ " \"sort_key\": (int(x_code[1:]), table_name, geography),\n",
+ " }\n",
+ "\n",
+ " # acs_2022_ACS_2022_5YR_BG_bg.parquet\n",
+ " # normalize ACS_2021_5YR_BG and ACS_2022_5YR_BG to ACS_5YR_BG\n",
+ " m = re.match(\n",
+ " r\"^acs_(\\d{4})_(ACS_\\d{4}_5YR_[A-Z]+)_(\\w+)\\.parquet$\",\n",
+ " name,\n",
+ " flags=re.IGNORECASE,\n",
+ " )\n",
+ " if m:\n",
+ " year, source_name, geography = m.groups()\n",
+ " source_name_norm = re.sub(r\"ACS_\\d{4}_5YR_\", \"ACS_5YR_\", source_name, flags=re.IGNORECASE)\n",
+ " return {\n",
+ " \"file\": name,\n",
+ " \"year\": int(year),\n",
+ " \"kind\": \"whole_gdb\",\n",
+ " \"x_code\": None,\n",
+ " \"table_name\": source_name_norm,\n",
+ " \"geography\": geography,\n",
+ " \"group_key\": f\"whole_gdb::{source_name_norm}::{geography}\",\n",
+ " \"sort_key\": (9996, source_name_norm, geography),\n",
+ " }\n",
+ "\n",
+ " # acs_2022_bg.parquet\n",
+ " m = re.match(\n",
+ " r\"^acs_(\\d{4})_(\\w+)\\.parquet$\",\n",
+ " name,\n",
+ " flags=re.IGNORECASE,\n",
+ " )\n",
+ " if m:\n",
+ " year, geography = m.groups()\n",
+ " return {\n",
+ " \"file\": name,\n",
+ " \"year\": int(year),\n",
+ " \"kind\": \"combined\",\n",
+ " \"x_code\": None,\n",
+ " \"table_name\": \"combined\",\n",
+ " \"geography\": geography,\n",
+ " \"group_key\": f\"combined::{geography}\",\n",
+ " \"sort_key\": (9997, \"combined\", geography),\n",
+ " }\n",
+ "\n",
+ " return {\n",
+ " \"file\": name,\n",
+ " \"year\": None,\n",
+ " \"kind\": \"unknown\",\n",
+ " \"x_code\": None,\n",
+ " \"table_name\": name,\n",
+ " \"geography\": None,\n",
+ " \"group_key\": f\"unknown::{name}\",\n",
+ " \"sort_key\": (9999, name, \"\"),\n",
+ " }"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "ffbb0145-e40e-42b6-8793-77409c74dab4",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2021 parquet files: 26\n",
+ "2022 parquet files: 34\n"
+ ]
+ }
+ ],
+ "source": [
+ "files_2021 = sorted(DIR_2021.glob(\"*.parquet\"))\n",
+ "files_2022 = sorted(DIR_2022.glob(\"*.parquet\"))\n",
+ "\n",
+ "parsed_2021 = pd.DataFrame([parse_parquet_filename(p) for p in files_2021])\n",
+ "parsed_2022 = pd.DataFrame([parse_parquet_filename(p) for p in files_2022])\n",
+ "\n",
+ "print(f\"2021 parquet files: {len(parsed_2021)}\")\n",
+ "print(f\"2022 parquet files: {len(parsed_2022)}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "450715be-8255-45b7-9c3c-b828d78ed639",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " file_2021 | \n",
+ " file_2022 | \n",
+ " exists_2021 | \n",
+ " exists_2022 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " acs_2021_X01_AGE_AND_SEX_bg.parquet | \n",
+ " acs_2022_X01_AGE_AND_SEX_bg.parquet | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " acs_2021_X02_RACE_bg.parquet | \n",
+ " acs_2022_X02_RACE_bg.parquet | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " acs_2021_X03_HISPANIC_OR_LATINO_ORIGIN_bg.parquet | \n",
+ " acs_2022_X03_HISPANIC_OR_LATINO_ORIGIN_bg.parquet | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " NaN | \n",
+ " acs_2022_X04_ANCESTRY_bg.parquet | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " NaN | \n",
+ " acs_2022_X05_FOREIGN_BORN_CITIZENSHIP_bg.parquet | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " NaN | \n",
+ " acs_2022_X06_PLACE_OF_BIRTH_bg.parquet | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " acs_2021_X07_MIGRATION_bg.parquet | \n",
+ " acs_2022_X07_MIGRATION_bg.parquet | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " acs_2021_X08_COMMUTING_bg.parquet | \n",
+ " acs_2022_X08_COMMUTING_bg.parquet | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " acs_2021_X09_CHILDREN_HOUSEHOLD_RELATIONSHIP_b... | \n",
+ " acs_2022_X09_CHILDREN_HOUSEHOLD_RELATIONSHIP_b... | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " NaN | \n",
+ " acs_2022_X10_GRANDPARENTS_GRANDCHILDREN_bg.par... | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " acs_2021_X11_HOUSEHOLD_FAMILY_SUBFAMILIES_bg.p... | \n",
+ " acs_2022_X11_HOUSEHOLD_FAMILY_SUBFAMILIES_bg.p... | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " acs_2021_X12_MARITAL_STATUS_AND_HISTORY_bg.par... | \n",
+ " acs_2022_X12_MARITAL_STATUS_AND_HISTORY_bg.par... | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " NaN | \n",
+ " acs_2022_X13_FERTILITY_bg.parquet | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " acs_2021_X14_SCHOOL_ENROLLMENT_bg.parquet | \n",
+ " acs_2022_X14_SCHOOL_ENROLLMENT_bg.parquet | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " acs_2021_X15_EDUCATIONAL_ATTAINMENT_bg.parquet | \n",
+ " acs_2022_X15_EDUCATIONAL_ATTAINMENT_bg.parquet | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " acs_2021_X16_LANGUAGE_SPOKEN_AT_HOME_bg.parquet | \n",
+ " acs_2022_X16_LANGUAGE_SPOKEN_AT_HOME_bg.parquet | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " acs_2021_X17_POVERTY_bg.parquet | \n",
+ " acs_2022_X17_POVERTY_bg.parquet | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " NaN | \n",
+ " acs_2022_X18_DISABILITY_bg.parquet | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " acs_2021_X19_INCOME_bg.parquet | \n",
+ " acs_2022_X19_INCOME_bg.parquet | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " acs_2021_X20_EARNINGS_bg.parquet | \n",
+ " acs_2022_X20_EARNINGS_bg.parquet | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " acs_2021_X21_VETERAN_STATUS_bg.parquet | \n",
+ " acs_2022_X21_VETERAN_STATUS_bg.parquet | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " acs_2021_X22_FOOD_STAMPS_bg.parquet | \n",
+ " acs_2022_X22_FOOD_STAMPS_bg.parquet | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " acs_2021_X23_EMPLOYMENT_STATUS_bg.parquet | \n",
+ " acs_2022_X23_EMPLOYMENT_STATUS_bg.parquet | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " acs_2021_X24_INDUSTRY_OCCUPATION_bg.parquet | \n",
+ " acs_2022_X24_INDUSTRY_OCCUPATION_bg.parquet | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " acs_2021_X25_HOUSING_CHARACTERISTICS_bg.parquet | \n",
+ " acs_2022_X25_HOUSING_CHARACTERISTICS_bg.parquet | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 25 | \n",
+ " NaN | \n",
+ " acs_2022_X26_GROUP_QUARTERS_bg.parquet | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 26 | \n",
+ " acs_2021_X27_HEALTH_INSURANCE_bg.parquet | \n",
+ " acs_2022_X27_HEALTH_INSURANCE_bg.parquet | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 27 | \n",
+ " acs_2021_X28_COMPUTER_AND_INTERNET_USE_bg.parquet | \n",
+ " acs_2022_X28_COMPUTER_AND_INTERNET_USE_bg.parquet | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 28 | \n",
+ " acs_2021_X29_VOTING_AGE_POPULATION_bg.parquet | \n",
+ " acs_2022_X29_VOTING_AGE_POPULATION_bg.parquet | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 29 | \n",
+ " NaN | \n",
+ " acs_2022_X98_UNWEIGHTED_HOUSING_UNIT_SAMPLE_bg... | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 30 | \n",
+ " acs_2021_X99_IMPUTATION_bg.parquet | \n",
+ " acs_2022_X99_IMPUTATION_bg.parquet | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 31 | \n",
+ " acs_2021_ACS_2021_5YR_BG_bg.parquet | \n",
+ " acs_2022_ACS_2022_5YR_BG_bg.parquet | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 32 | \n",
+ " acs_2021_bg.parquet | \n",
+ " acs_2022_bg.parquet | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 33 | \n",
+ " acs_demographic_profile_2021_bg.parquet | \n",
+ " acs_demographic_profile_2022_bg.parquet | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " file_2021 \\\n",
+ "0 acs_2021_X01_AGE_AND_SEX_bg.parquet \n",
+ "1 acs_2021_X02_RACE_bg.parquet \n",
+ "2 acs_2021_X03_HISPANIC_OR_LATINO_ORIGIN_bg.parquet \n",
+ "3 NaN \n",
+ "4 NaN \n",
+ "5 NaN \n",
+ "6 acs_2021_X07_MIGRATION_bg.parquet \n",
+ "7 acs_2021_X08_COMMUTING_bg.parquet \n",
+ "8 acs_2021_X09_CHILDREN_HOUSEHOLD_RELATIONSHIP_b... \n",
+ "9 NaN \n",
+ "10 acs_2021_X11_HOUSEHOLD_FAMILY_SUBFAMILIES_bg.p... \n",
+ "11 acs_2021_X12_MARITAL_STATUS_AND_HISTORY_bg.par... \n",
+ "12 NaN \n",
+ "13 acs_2021_X14_SCHOOL_ENROLLMENT_bg.parquet \n",
+ "14 acs_2021_X15_EDUCATIONAL_ATTAINMENT_bg.parquet \n",
+ "15 acs_2021_X16_LANGUAGE_SPOKEN_AT_HOME_bg.parquet \n",
+ "16 acs_2021_X17_POVERTY_bg.parquet \n",
+ "17 NaN \n",
+ "18 acs_2021_X19_INCOME_bg.parquet \n",
+ "19 acs_2021_X20_EARNINGS_bg.parquet \n",
+ "20 acs_2021_X21_VETERAN_STATUS_bg.parquet \n",
+ "21 acs_2021_X22_FOOD_STAMPS_bg.parquet \n",
+ "22 acs_2021_X23_EMPLOYMENT_STATUS_bg.parquet \n",
+ "23 acs_2021_X24_INDUSTRY_OCCUPATION_bg.parquet \n",
+ "24 acs_2021_X25_HOUSING_CHARACTERISTICS_bg.parquet \n",
+ "25 NaN \n",
+ "26 acs_2021_X27_HEALTH_INSURANCE_bg.parquet \n",
+ "27 acs_2021_X28_COMPUTER_AND_INTERNET_USE_bg.parquet \n",
+ "28 acs_2021_X29_VOTING_AGE_POPULATION_bg.parquet \n",
+ "29 NaN \n",
+ "30 acs_2021_X99_IMPUTATION_bg.parquet \n",
+ "31 acs_2021_ACS_2021_5YR_BG_bg.parquet \n",
+ "32 acs_2021_bg.parquet \n",
+ "33 acs_demographic_profile_2021_bg.parquet \n",
+ "\n",
+ " file_2022 exists_2021 \\\n",
+ "0 acs_2022_X01_AGE_AND_SEX_bg.parquet True \n",
+ "1 acs_2022_X02_RACE_bg.parquet True \n",
+ "2 acs_2022_X03_HISPANIC_OR_LATINO_ORIGIN_bg.parquet True \n",
+ "3 acs_2022_X04_ANCESTRY_bg.parquet False \n",
+ "4 acs_2022_X05_FOREIGN_BORN_CITIZENSHIP_bg.parquet False \n",
+ "5 acs_2022_X06_PLACE_OF_BIRTH_bg.parquet False \n",
+ "6 acs_2022_X07_MIGRATION_bg.parquet True \n",
+ "7 acs_2022_X08_COMMUTING_bg.parquet True \n",
+ "8 acs_2022_X09_CHILDREN_HOUSEHOLD_RELATIONSHIP_b... True \n",
+ "9 acs_2022_X10_GRANDPARENTS_GRANDCHILDREN_bg.par... False \n",
+ "10 acs_2022_X11_HOUSEHOLD_FAMILY_SUBFAMILIES_bg.p... True \n",
+ "11 acs_2022_X12_MARITAL_STATUS_AND_HISTORY_bg.par... True \n",
+ "12 acs_2022_X13_FERTILITY_bg.parquet False \n",
+ "13 acs_2022_X14_SCHOOL_ENROLLMENT_bg.parquet True \n",
+ "14 acs_2022_X15_EDUCATIONAL_ATTAINMENT_bg.parquet True \n",
+ "15 acs_2022_X16_LANGUAGE_SPOKEN_AT_HOME_bg.parquet True \n",
+ "16 acs_2022_X17_POVERTY_bg.parquet True \n",
+ "17 acs_2022_X18_DISABILITY_bg.parquet False \n",
+ "18 acs_2022_X19_INCOME_bg.parquet True \n",
+ "19 acs_2022_X20_EARNINGS_bg.parquet True \n",
+ "20 acs_2022_X21_VETERAN_STATUS_bg.parquet True \n",
+ "21 acs_2022_X22_FOOD_STAMPS_bg.parquet True \n",
+ "22 acs_2022_X23_EMPLOYMENT_STATUS_bg.parquet True \n",
+ "23 acs_2022_X24_INDUSTRY_OCCUPATION_bg.parquet True \n",
+ "24 acs_2022_X25_HOUSING_CHARACTERISTICS_bg.parquet True \n",
+ "25 acs_2022_X26_GROUP_QUARTERS_bg.parquet False \n",
+ "26 acs_2022_X27_HEALTH_INSURANCE_bg.parquet True \n",
+ "27 acs_2022_X28_COMPUTER_AND_INTERNET_USE_bg.parquet True \n",
+ "28 acs_2022_X29_VOTING_AGE_POPULATION_bg.parquet True \n",
+ "29 acs_2022_X98_UNWEIGHTED_HOUSING_UNIT_SAMPLE_bg... False \n",
+ "30 acs_2022_X99_IMPUTATION_bg.parquet True \n",
+ "31 acs_2022_ACS_2022_5YR_BG_bg.parquet True \n",
+ "32 acs_2022_bg.parquet True \n",
+ "33 acs_demographic_profile_2022_bg.parquet True \n",
+ "\n",
+ " exists_2022 \n",
+ "0 True \n",
+ "1 True \n",
+ "2 True \n",
+ "3 True \n",
+ "4 True \n",
+ "5 True \n",
+ "6 True \n",
+ "7 True \n",
+ "8 True \n",
+ "9 True \n",
+ "10 True \n",
+ "11 True \n",
+ "12 True \n",
+ "13 True \n",
+ "14 True \n",
+ "15 True \n",
+ "16 True \n",
+ "17 True \n",
+ "18 True \n",
+ "19 True \n",
+ "20 True \n",
+ "21 True \n",
+ "22 True \n",
+ "23 True \n",
+ "24 True \n",
+ "25 True \n",
+ "26 True \n",
+ "27 True \n",
+ "28 True \n",
+ "29 True \n",
+ "30 True \n",
+ "31 True \n",
+ "32 True \n",
+ "33 True "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "compare_files_df = (\n",
+ " parsed_2021.rename(columns={\"file\": \"file_2021\", \"year\": \"year_2021\"})\n",
+ " .merge(\n",
+ " parsed_2022.rename(columns={\"file\": \"file_2022\", \"year\": \"year_2022\"}),\n",
+ " on=[\"group_key\", \"kind\", \"x_code\", \"table_name\", \"geography\", \"sort_key\"],\n",
+ " how=\"outer\",\n",
+ " )\n",
+ " .sort_values([\"sort_key\", \"kind\", \"table_name\", \"group_key\"])\n",
+ " .reset_index(drop=True)\n",
+ ")\n",
+ "\n",
+ "compare_files_df[\"exists_2021\"] = compare_files_df[\"file_2021\"].notna()\n",
+ "compare_files_df[\"exists_2022\"] = compare_files_df[\"file_2022\"].notna()\n",
+ "\n",
+ "display(\n",
+ " compare_files_df[\n",
+ " [\n",
+ " \"file_2021\",\n",
+ " \"file_2022\",\n",
+ " \"exists_2021\",\n",
+ " \"exists_2022\",\n",
+ " ]\n",
+ " ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0a7eada9-7cc9-4ec9-a8c7-a96007df98dc",
+ "metadata": {},
+ "source": [
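+ "The cell above aligns the 2021 and 2022 parquet files on their normalized group key and flags which vintage each file exists in. Rows with `NaN` in `file_2021` are tables that exist only in the 2022 build (X04, X05, X06, X10, X13, X18, X26 and X98), which accounts for the 26 vs. 34 file counts."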
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "630c9973-5243-4bef-b807-fbfab1eb1623",
+ "metadata": {},
+ "source": [
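+ "After canonicalization we expect no column names to change for 2021 (they should already be in the canonical `B02001_001E` style), but many to change for 2022 (which uses the new `B02001_E001` style)."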
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "858ac216-1a48-45ff-a944-2bc83ef2add3",
+ "metadata": {},
+ "source": [
+ "# Housing characteristics??\n",
+ "\n",
+ "Why is this table so different between vintages? The 2022 file has 100 more columns than the 2021 file (971 vs. 871). Are these really 100 new variables?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "0442576a-e1dd-4633-8af8-141234f0fff6",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2021 exists: True ../build/2021_bg/acs_2021_X25_HOUSING_CHARACTERISTICS_bg.parquet\n",
+ "2022 exists: True ../build/2022_bg/acs_2022_X25_HOUSING_CHARACTERISTICS_bg.parquet\n"
+ ]
+ }
+ ],
+ "source": [
+ "BUILD_ROOT = Path(\"../build\")\n",
+ "\n",
+ "file_2021 = BUILD_ROOT / \"2021_bg\" / \"acs_2021_X25_HOUSING_CHARACTERISTICS_bg.parquet\"\n",
+ "file_2022 = BUILD_ROOT / \"2022_bg\" / \"acs_2022_X25_HOUSING_CHARACTERISTICS_bg.parquet\"\n",
+ "\n",
+ "print(\"2021 exists:\", file_2021.exists(), file_2021)\n",
+ "print(\"2022 exists:\", file_2022.exists(), file_2022)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "e704d78f-58d4-4627-bdff-2e5ef7598442",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "n_cols_2021_raw: 871\n",
+ "n_cols_2022_raw: 971\n"
+ ]
+ }
+ ],
+ "source": [
+ "def read_parquet_columns(path: Path) -> list[str]:\n",
+ " return pq.ParquetFile(path).schema_arrow.names\n",
+ "\n",
+ "cols_2021 = read_parquet_columns(file_2021)\n",
+ "cols_2022 = read_parquet_columns(file_2022)\n",
+ "\n",
+ "print(\"n_cols_2021_raw:\", len(cols_2021))\n",
+ "print(\"n_cols_2022_raw:\", len(cols_2022))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "601a490f-c552-4b3b-86bc-124b7ddeb9b8",
+ "metadata": {},
+ "source": [
+ "Canonicalize the columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "dc4577d0-2679-490e-a143-1a9bdee2b0f9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df21 = pd.DataFrame({\"raw_2021\": cols_2021})\n",
+ "df21[\"canonical\"] = df21[\"raw_2021\"].map(canonicalize_column)\n",
+ "\n",
+ "df22 = pd.DataFrame({\"raw_2022\": cols_2022})\n",
+ "df22[\"canonical\"] = df22[\"raw_2022\"].map(canonicalize_column)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "81387311-9998-4fa3-bd88-aaa8b7b21693",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " raw_2021 | \n",
+ " canonical | \n",
+ " raw_2022 | \n",
+ " present_2021 | \n",
+ " present_2022 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " B25001_001E | \n",
+ " B25001_001E | \n",
+ " B25001_E001 | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " B25002_001E | \n",
+ " B25002_001E | \n",
+ " B25002_E001 | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " B25002_002E | \n",
+ " B25002_002E | \n",
+ " B25002_E002 | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " B25002_003E | \n",
+ " B25002_003E | \n",
+ " B25002_E003 | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " B25003A_001E | \n",
+ " B25003A_001E | \n",
+ " B25003A_E001 | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " raw_2021 canonical raw_2022 present_2021 present_2022\n",
+ "0 B25001_001E B25001_001E B25001_E001 True True\n",
+ "1 B25002_001E B25002_001E B25002_E001 True True\n",
+ "2 B25002_002E B25002_002E B25002_E002 True True\n",
+ "3 B25002_003E B25002_003E B25002_E003 True True\n",
+ "4 B25003A_001E B25003A_001E B25003A_E001 True True"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "aligned = (\n",
+ " df21.merge(df22, on=\"canonical\", how=\"outer\")\n",
+ " .sort_values(\"canonical\")\n",
+ " .reset_index(drop=True)\n",
+ ")\n",
+ "\n",
+ "aligned[\"present_2021\"] = aligned[\"raw_2021\"].notna()\n",
+ "aligned[\"present_2022\"] = aligned[\"raw_2022\"].notna()\n",
+ "\n",
+ "display(aligned.head(5))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cfc8b9b7-dc30-443e-8905-fffde0057daa",
+ "metadata": {},
+ "source": [
+ "# Test the fix\n",
+ "\n",
+ "Pushed a change on 2026-04-14; the cells below test it."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e5d0658b-e91a-496d-bbae-1a7a0d1c3759",
+ "metadata": {},
+ "source": [
+ "## Synthetic Tests"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "e31113d7-dd10-4027-8c4b-83035577effb",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'/home/dylan/projects/geosnap/build'"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pwd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "1c88c2e2-444b-4711-87b5-8f8745121d2d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Obtaining file:///home/dylan/projects/geosnap\n",
+ " Installing build dependencies ... \u001b[?25ldone\n",
+ "\u001b[?25h Checking if build backend supports build_editable ... \u001b[?25ldone\n",
+ "\u001b[?25h Getting requirements to build editable ... \u001b[?25ldone\n",
+ "\u001b[?25h Preparing editable metadata (pyproject.toml) ... \u001b[?25ldone\n",
+ "\u001b[?25hRequirement already satisfied: numpy in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.3.5)\n",
+ "Requirement already satisfied: pandas in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.3.3)\n",
+ "Requirement already satisfied: geopandas>=0.9 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.1.1)\n",
+ "Requirement already satisfied: matplotlib in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (3.10.8)\n",
+ "Requirement already satisfied: scikit-learn in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.8.0)\n",
+ "Requirement already satisfied: seaborn in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.13.2)\n",
+ "Requirement already satisfied: libpysal in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (4.13.0)\n",
+ "Requirement already satisfied: mapclassify in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.10.0)\n",
+ "Requirement already satisfied: giddy>=2.2.1 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.3.8)\n",
+ "Requirement already satisfied: xlrd in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.0.2)\n",
+ "Requirement already satisfied: platformdirs in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (4.5.1)\n",
+ "Requirement already satisfied: tqdm in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (4.67.1)\n",
+ "Requirement already satisfied: quilt3>=3.6 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (7.0.0)\n",
+ "Requirement already satisfied: pyarrow>=0.14.1 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (22.0.0)\n",
+ "Requirement already satisfied: contextily in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.7.0)\n",
+ "Requirement already satisfied: tobler>=0.8.2 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.12.1)\n",
+ "Requirement already satisfied: spopt>=0.3.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.7.0)\n",
+ "Requirement already satisfied: segregation>=2.1 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.5.3)\n",
+ "Requirement already satisfied: pyproj>=3 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (3.7.2)\n",
+ "Requirement already satisfied: pandarm in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.0.3)\n",
+ "Requirement already satisfied: pooch in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.8.2)\n",
+ "Requirement already satisfied: ibis-framework in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (12.0.0)\n",
+ "Requirement already satisfied: packaging in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geopandas>=0.9->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (25.0)\n",
+ "Requirement already satisfied: shapely>=2.0.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geopandas>=0.9->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.1.2)\n",
+ "Requirement already satisfied: esda<2.9,>=2.7 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from giddy>=2.2.1->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.8.0)\n",
+ "Requirement already satisfied: quantecon>=0.8 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from giddy>=2.2.1->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.10.1)\n",
+ "Requirement already satisfied: scipy>=1.12 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from giddy>=2.2.1->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.16.3)\n",
+ "Requirement already satisfied: beautifulsoup4>=4.10 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from libpysal->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (4.14.3)\n",
+ "Requirement already satisfied: requests>=2.27 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from libpysal->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.32.5)\n",
+ "Requirement already satisfied: soupsieve>=1.6.1 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from beautifulsoup4>=4.10->libpysal->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.8)\n",
+ "Requirement already satisfied: typing-extensions>=4.0.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from beautifulsoup4>=4.10->libpysal->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (4.15.0)\n",
+ "Requirement already satisfied: networkx>=3.2 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from mapclassify->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (3.6.1)\n",
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from pandas->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.9.0.post0)\n",
+ "Requirement already satisfied: pytz>=2020.1 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from pandas->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2025.2)\n",
+ "Requirement already satisfied: tzdata>=2022.7 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from pandas->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2025.2)\n",
+ "Requirement already satisfied: certifi in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from pyproj>=3->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2026.1.4)\n",
+ "Requirement already satisfied: six>=1.5 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from python-dateutil>=2.8.2->pandas->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.17.0)\n",
+ "Requirement already satisfied: numba>=0.49.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from quantecon>=0.8->giddy>=2.2.1->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.63.1)\n",
+ "Requirement already satisfied: sympy in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from quantecon>=0.8->giddy>=2.2.1->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.14.0)\n",
+ "Requirement already satisfied: llvmlite<0.47,>=0.46.0dev0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from numba>=0.49.0->quantecon>=0.8->giddy>=2.2.1->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.46.0)\n",
+ "Requirement already satisfied: boto3>=1.21.7 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.40.70)\n",
+ "Requirement already satisfied: jsonlines==1.2.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.2.0)\n",
+ "Requirement already satisfied: PyYAML>=5.1 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (6.0.3)\n",
+ "Requirement already satisfied: tenacity!=8.4.0,>=5.1.1 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (9.1.2)\n",
+ "Requirement already satisfied: requests_futures==1.0.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.0.0)\n",
+ "Requirement already satisfied: jsonschema<5,>=3 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (4.25.1)\n",
+ "Requirement already satisfied: pydantic<3.0.0,>=2.0.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.12.5)\n",
+ "Requirement already satisfied: attrs>=22.2.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from jsonschema<5,>=3->quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (25.4.0)\n",
+ "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from jsonschema<5,>=3->quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2025.9.1)\n",
+ "Requirement already satisfied: referencing>=0.28.4 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from jsonschema<5,>=3->quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.37.0)\n",
+ "Requirement already satisfied: rpds-py>=0.7.1 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from jsonschema<5,>=3->quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.30.0)\n",
+ "Requirement already satisfied: annotated-types>=0.6.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from pydantic<3.0.0,>=2.0.0->quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.7.0)\n",
+ "Requirement already satisfied: pydantic-core==2.41.5 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from pydantic<3.0.0,>=2.0.0->quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.41.5)\n",
+ "Requirement already satisfied: typing-inspection>=0.4.2 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from pydantic<3.0.0,>=2.0.0->quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.4.2)\n",
+ "Requirement already satisfied: botocore<1.41.0,>=1.40.70 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from boto3>=1.21.7->quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.40.70)\n",
+ "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from boto3>=1.21.7->quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.0.1)\n",
+ "Requirement already satisfied: s3transfer<0.15.0,>=0.14.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from boto3>=1.21.7->quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.14.0)\n",
+ "Requirement already satisfied: urllib3!=2.2.0,<3,>=1.25.4 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from botocore<1.41.0,>=1.40.70->boto3>=1.21.7->quilt3>=3.6->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.6.1)\n",
+ "Requirement already satisfied: charset_normalizer<4,>=2 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from requests>=2.27->libpysal->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (3.4.4)\n",
+ "Requirement already satisfied: idna<4,>=2.5 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from requests>=2.27->libpysal->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (3.11)\n",
+ "Requirement already satisfied: joblib>=1.3.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from scikit-learn->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.5.2)\n",
+ "Requirement already satisfied: threadpoolctl>=3.2.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from scikit-learn->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (3.6.0)\n",
+ "Requirement already satisfied: deprecation in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from segregation>=2.1->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.1.0)\n",
+ "Requirement already satisfied: pointpats>=2.4.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from spopt>=0.3.0->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.5.2)\n",
+ "Requirement already satisfied: pulp>=2.8 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from spopt>=0.3.0->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.8.0)\n",
+ "Requirement already satisfied: spaghetti>=1.7.4 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from spopt>=0.3.0->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.7.6)\n",
+ "Requirement already satisfied: contourpy>=1.0.1 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from matplotlib->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.3.3)\n",
+ "Requirement already satisfied: cycler>=0.10 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from matplotlib->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.12.1)\n",
+ "Requirement already satisfied: fonttools>=4.22.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from matplotlib->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (4.61.0)\n",
+ "Requirement already satisfied: kiwisolver>=1.3.1 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from matplotlib->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.4.9)\n",
+ "Requirement already satisfied: pillow>=8 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from matplotlib->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (12.0.0)\n",
+ "Requirement already satisfied: pyparsing>=3 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from matplotlib->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (3.2.5)\n",
+ "Requirement already satisfied: rtree>=1.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from spaghetti>=1.7.4->spopt>=0.3.0->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.4.1)\n",
+ "Requirement already satisfied: rasterio in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from tobler>=0.8.2->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.4.3)\n",
+ "Requirement already satisfied: statsmodels in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from tobler>=0.8.2->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.14.6)\n",
+ "Requirement already satisfied: rasterstats in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from tobler>=0.8.2->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.20.0)\n",
+ "Requirement already satisfied: geopy in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from contextily->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.4.1)\n",
+ "Requirement already satisfied: mercantile in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from contextily->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.2.1)\n",
+ "Requirement already satisfied: xyzservices in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from contextily->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2025.11.0)\n",
+ "Requirement already satisfied: geographiclib<3,>=1.52 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from geopy->contextily->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.1)\n",
+ "Requirement already satisfied: atpublic>=2.3 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from ibis-framework->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (7.0.0)\n",
+ "Requirement already satisfied: parsy>=2 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from ibis-framework->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.2)\n",
+ "Requirement already satisfied: sqlglot!=26.32.0,>=23.4 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from ibis-framework->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (30.4.3)\n",
+ "Requirement already satisfied: toolz>=0.11 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from ibis-framework->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.1.0)\n",
+ "Requirement already satisfied: click>=3.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from mercantile->contextily->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (8.3.1)\n",
+ "Requirement already satisfied: tables>=3.1 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from pandarm->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (3.10.2)\n",
+ "Requirement already satisfied: numexpr>=2.6.2 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from tables>=3.1->pandarm->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.14.1)\n",
+ "Requirement already satisfied: py-cpuinfo in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from tables>=3.1->pandarm->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (9.0.0)\n",
+ "Requirement already satisfied: affine in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from rasterio->tobler>=0.8.2->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (2.4.0)\n",
+ "Requirement already satisfied: cligj>=0.5 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from rasterio->tobler>=0.8.2->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (0.7.2)\n",
+ "Requirement already satisfied: click-plugins in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from rasterio->tobler>=0.8.2->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.1.1.2)\n",
+ "Requirement already satisfied: fiona in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from rasterstats->tobler>=0.8.2->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.10.1)\n",
+ "Requirement already satisfied: simplejson in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from rasterstats->tobler>=0.8.2->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (3.20.2)\n",
+ "Requirement already satisfied: patsy>=0.5.6 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from statsmodels->tobler>=0.8.2->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.0.2)\n",
+ "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages (from sympy->quantecon>=0.8->giddy>=2.2.1->geosnap==0.16.1.dev34+g2f9cb9274.d20260414) (1.3.0)\n",
+ "Building wheels for collected packages: geosnap\n",
+ " Building editable for geosnap (pyproject.toml) ... \u001b[?25ldone\n",
+ "\u001b[?25h Created wheel for geosnap: filename=geosnap-0.16.1.dev34+g2f9cb9274.d20260414-0.editable-py3-none-any.whl size=8639 sha256=d324b655c7466dd269e80cd78beb59a593c7e69b5f00ccc85ec83ddc2dd95bd5\n",
+ " Stored in directory: /tmp/pip-ephem-wheel-cache-aoabk68n/wheels/50/05/f1/5afabb92124d2b3b9c0f2213aed7af8d580c81096a101a27a2\n",
+ "Successfully built geosnap\n",
+ "Installing collected packages: geosnap\n",
+ " Attempting uninstall: geosnap\n",
+ " Found existing installation: geosnap 0.16.1.dev34+g2f9cb9274.d20260414\n",
+ " Uninstalling geosnap-0.16.1.dev34+g2f9cb9274.d20260414:\n",
+ " Successfully uninstalled geosnap-0.16.1.dev34+g2f9cb9274.d20260414\n",
+ "Successfully installed geosnap-0.16.1.dev34+g2f9cb9274.d20260414\n"
+ ]
+ }
+ ],
+ "source": [
+ "!pip install -e /home/dylan/projects/geosnap"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "1afee93e-6d67-426d-a950-c5d6cda5abf2",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/dylan/mambaforge/envs/diss-data/lib/python3.13/site-packages/numba/np/ufunc/parallel.py:373: NumbaWarning: \u001b[1mThe TBB threading layer requires TBB version 2021 update 6 or later i.e., TBB_INTERFACE_VERSION >= 12060. Found TBB_INTERFACE_VERSION = 12050. The TBB threading layer is disabled.\u001b[0m\n",
+ " warnings.warn(problem)\n"
+ ]
+ }
+ ],
+ "source": [
+ "from geosnap.io.util import normalize_acs_vars"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "0a728115-10a0-484e-807b-79191367ad16",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "B02001e1 -> B02001_001E\n",
+ "B02001e12 -> B02001_012E\n",
+ "B02001_E001 -> B02001_001E\n",
+ "B02001_M001 -> B02001_001M\n",
+ "B02001_001E -> B02001_001E\n",
+ "B02001_001M -> B02001_001M\n",
+ "GEOID -> GEOID\n",
+ "GEOIDFQ -> GEOIDFQ\n",
+ "GEOID_Data -> GEOID_Data\n",
+ "geometry -> geometry\n",
+ "NAME -> NAME\n"
+ ]
+ }
+ ],
+ "source": [
+ "tests = [\n",
+ " \"B02001e1\",\n",
+ " \"B02001e12\",\n",
+ " \"B02001_E001\",\n",
+ " \"B02001_M001\",\n",
+ " \"B02001_001E\",\n",
+ " \"B02001_001M\",\n",
+ " \"GEOID\",\n",
+ " \"GEOIDFQ\",\n",
+ " \"GEOID_Data\",\n",
+ " \"geometry\",\n",
+ " \"NAME\",\n",
+ "]\n",
+ "\n",
+ "for t in tests:\n",
+ " print(f\"{t:15} -> {normalize_acs_vars(t)}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "ca1f46fb-c65c-43a7-bb7a-e80b2eb0ac32",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from geosnap.io.util import find_geoid_column"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "8fa4058c-e98f-4e3e-b75c-029fac66dd3a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['GEOID', 'B01001_E001'] -> GEOID\n",
+ "['GEOIDFQ', 'B01001_E001'] -> GEOIDFQ\n",
+ "['GEOID_Data', 'B01001_E001'] -> GEOID_Data\n",
+ "['GEOID20', 'B01001_E001'] -> GEOID20\n",
+ "['NAME', 'B01001_E001'] -> None\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/dylan/projects/geosnap/geosnap/io/util.py:144: UserWarning: No GEOID-like column found. Columns are: ['NAME', 'B01001_E001']\n",
+ " warn(f\"No GEOID-like column found. Columns are: {list(columns)}\")\n"
+ ]
+ }
+ ],
+ "source": [
+ "cases = [\n",
+ " [\"GEOID\", \"B01001_E001\"],\n",
+ " [\"GEOIDFQ\", \"B01001_E001\"],\n",
+ " [\"GEOID_Data\", \"B01001_E001\"],\n",
+ " [\"GEOID20\", \"B01001_E001\"],\n",
+ " [\"NAME\", \"B01001_E001\"],\n",
+ "]\n",
+ "\n",
+ "for cols in cases:\n",
+ " print(cols, \"->\", find_geoid_column(cols))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "75f32b9d-c11a-49a1-ac0e-6959b1e8f77b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "All columns:\n",
+ "['B02001e1', 'B02001e12', 'B02001_E001', 'B02001_M001', 'B02001_001E', 'B02001_001M', 'GEOID', 'GEOIDFQ', 'NAME', 'geometry', 'random_column', 'B02001X001']\n"
+ ]
+ }
+ ],
+ "source": [
+ "cols = pd.Index([\n",
+ " # old style\n",
+ " \"B02001e1\", \"B02001e12\",\n",
+ "\n",
+ " # 2022 style\n",
+ " \"B02001_E001\", \"B02001_M001\",\n",
+ "\n",
+ " # canonical style\n",
+ " \"B02001_001E\", \"B02001_001M\",\n",
+ "\n",
+ " # noise / non-ACS\n",
+ " \"GEOID\", \"GEOIDFQ\", \"NAME\", \"geometry\",\n",
+ " \"random_column\", \"B02001X001\",\n",
+ "])\n",
+ "\n",
+ "df = pd.DataFrame(columns=cols)\n",
+ "\n",
+ "print(\"All columns:\")\n",
+ "print(list(df.columns))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "13c8b8ca-0650-4ed0-94a2-10232e61251b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Selected columns:\n",
+ "['B02001e1', 'B02001e12', 'B02001_E001', 'B02001_M001', 'B02001_001E', 'B02001_001M']\n"
+ ]
+ }
+ ],
+ "source": [
+ "candidate_cols = df.columns[\n",
+ " df.columns.str.match(r\"^[A-Za-z0-9]+e\\d+$\", na=False) # old style\n",
+ " | df.columns.str.match(r\"^[A-Za-z0-9]+_[EM]\\d{3}$\", na=False) # 2022 style\n",
+ " | df.columns.str.match(r\"^[A-Za-z0-9]+_\\d{3}[EM]$\", na=False) # canonical style\n",
+ "]\n",
+ "\n",
+ "print(\"Selected columns:\")\n",
+ "print(list(candidate_cols))\n",
+ "# should not contain any GEOID or 'geometry' or 'random_column'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c07cbfc6-fd89-47e4-95b1-36f3efeffb85",
+ "metadata": {},
+ "source": [
+ "# Examine output from `process_acs`\n",
+ "After turning the new conversion function loose, I applied the `process_acs` function to the resulting combined demographic profile. Let's look at both the combined profile and the processed output here."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "1f72c657-b3a7-4967-9957-51293ac16145",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "demographic_profile = pd.read_parquet('2022_bg/acs_demographic_profile_2022_bg.parquet')\n",
+ "processed_acs = pd.read_parquet('2022_bg/acs_2022_bg_processed.parquet')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "id": "b0fb731b-039f-47da-b5b0-cee0a574a0ba",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " STATEFP | \n",
+ " COUNTYFP | \n",
+ " TRACTCE | \n",
+ " BLKGRPCE | \n",
+ " NAMELSAD | \n",
+ " MTFCC | \n",
+ " FUNCSTAT | \n",
+ " ALAND | \n",
+ " AWATER | \n",
+ " INTPTLAT | \n",
+ " ... | \n",
+ " B01002H_003E | \n",
+ " B01002H_003M | \n",
+ " B01002I_001E | \n",
+ " B01002I_001M | \n",
+ " B01002I_002E | \n",
+ " B01002I_002M | \n",
+ " B01002I_003E | \n",
+ " B01002I_003M | \n",
+ " B01003_001E | \n",
+ " B01003_001M | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 010179548002 | \n",
+ " 01 | \n",
+ " 017 | \n",
+ " 954800 | \n",
+ " 2 | \n",
+ " Block Group 2 | \n",
+ " G5030 | \n",
+ " S | \n",
+ " 1094218.0 | \n",
+ " 0.0 | \n",
+ " +32.8662046 | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 010179548004 | \n",
+ " 01 | \n",
+ " 017 | \n",
+ " 954800 | \n",
+ " 4 | \n",
+ " Block Group 4 | \n",
+ " G5030 | \n",
+ " S | \n",
+ " 2392140.0 | \n",
+ " 0.0 | \n",
+ " +32.8482537 | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 010179548003 | \n",
+ " 01 | \n",
+ " 017 | \n",
+ " 954800 | \n",
+ " 3 | \n",
+ " Block Group 3 | \n",
+ " G5030 | \n",
+ " S | \n",
+ " 902949.0 | \n",
+ " 0.0 | \n",
+ " +32.8577594 | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 010150011031 | \n",
+ " 01 | \n",
+ " 015 | \n",
+ " 001103 | \n",
+ " 1 | \n",
+ " Block Group 1 | \n",
+ " G5030 | \n",
+ " S | \n",
+ " 2346322.0 | \n",
+ " 94061.0 | \n",
+ " +33.5892886 | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 010150024003 | \n",
+ " 01 | \n",
+ " 015 | \n",
+ " 002400 | \n",
+ " 3 | \n",
+ " Block Group 3 | \n",
+ " G5030 | \n",
+ " S | \n",
+ " 38223047.0 | \n",
+ " 173264.0 | \n",
+ " +33.9079142 | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 1500000US720210302002 | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " None | \n",
+ " ... | \n",
+ " -666666666.0 | \n",
+ " -222222222.0 | \n",
+ " 57.6 | \n",
+ " 13.3 | \n",
+ " 53.7 | \n",
+ " 42.9 | \n",
+ " 58.5 | \n",
+ " 17.3 | \n",
+ " 597.0 | \n",
+ " 239.0 | \n",
+ "
\n",
+ " \n",
+ " | 1500000US720210314012 | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " None | \n",
+ " ... | \n",
+ " -666666666.0 | \n",
+ " -222222222.0 | \n",
+ " 58.4 | \n",
+ " 12.9 | \n",
+ " 56.5 | \n",
+ " 13.9 | \n",
+ " 63.3 | \n",
+ " 15.0 | \n",
+ " 977.0 | \n",
+ " 285.0 | \n",
+ "
\n",
+ " \n",
+ " | 1500000US720210312021 | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " None | \n",
+ " ... | \n",
+ " -666666666.0 | \n",
+ " -222222222.0 | \n",
+ " 44.5 | \n",
+ " 6.9 | \n",
+ " 49.2 | \n",
+ " 14.6 | \n",
+ " 43.7 | \n",
+ " 5.3 | \n",
+ " 1837.0 | \n",
+ " 372.0 | \n",
+ "
\n",
+ " \n",
+ " | 1500000US720531504003 | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " None | \n",
+ " ... | \n",
+ " -666666666.0 | \n",
+ " -222222222.0 | \n",
+ " 38.7 | \n",
+ " 13.0 | \n",
+ " 33.8 | \n",
+ " 11.7 | \n",
+ " 47.1 | \n",
+ " 15.1 | \n",
+ " 1115.0 | \n",
+ " 365.0 | \n",
+ "
\n",
+ " \n",
+ " | 1500000US721153304003 | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " None | \n",
+ " ... | \n",
+ " -666666666.0 | \n",
+ " -222222222.0 | \n",
+ " 40.2 | \n",
+ " 10.5 | \n",
+ " 40.2 | \n",
+ " 10.7 | \n",
+ " 40.3 | \n",
+ " 16.3 | \n",
+ " 1892.0 | \n",
+ " 618.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
484672 rows × 8237 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " STATEFP COUNTYFP TRACTCE BLKGRPCE NAMELSAD MTFCC \\\n",
+ "010179548002 01 017 954800 2 Block Group 2 G5030 \n",
+ "010179548004 01 017 954800 4 Block Group 4 G5030 \n",
+ "010179548003 01 017 954800 3 Block Group 3 G5030 \n",
+ "010150011031 01 015 001103 1 Block Group 1 G5030 \n",
+ "010150024003 01 015 002400 3 Block Group 3 G5030 \n",
+ "... ... ... ... ... ... ... \n",
+ "1500000US720210302002 None None None None None None \n",
+ "1500000US720210314012 None None None None None None \n",
+ "1500000US720210312021 None None None None None None \n",
+ "1500000US720531504003 None None None None None None \n",
+ "1500000US721153304003 None None None None None None \n",
+ "\n",
+ " FUNCSTAT ALAND AWATER INTPTLAT ... \\\n",
+ "010179548002 S 1094218.0 0.0 +32.8662046 ... \n",
+ "010179548004 S 2392140.0 0.0 +32.8482537 ... \n",
+ "010179548003 S 902949.0 0.0 +32.8577594 ... \n",
+ "010150011031 S 2346322.0 94061.0 +33.5892886 ... \n",
+ "010150024003 S 38223047.0 173264.0 +33.9079142 ... \n",
+ "... ... ... ... ... ... \n",
+ "1500000US720210302002 None NaN NaN None ... \n",
+ "1500000US720210314012 None NaN NaN None ... \n",
+ "1500000US720210312021 None NaN NaN None ... \n",
+ "1500000US720531504003 None NaN NaN None ... \n",
+ "1500000US721153304003 None NaN NaN None ... \n",
+ "\n",
+ " B01002H_003E B01002H_003M B01002I_001E B01002I_001M \\\n",
+ "010179548002 NaN NaN NaN NaN \n",
+ "010179548004 NaN NaN NaN NaN \n",
+ "010179548003 NaN NaN NaN NaN \n",
+ "010150011031 NaN NaN NaN NaN \n",
+ "010150024003 NaN NaN NaN NaN \n",
+ "... ... ... ... ... \n",
+ "1500000US720210302002 -666666666.0 -222222222.0 57.6 13.3 \n",
+ "1500000US720210314012 -666666666.0 -222222222.0 58.4 12.9 \n",
+ "1500000US720210312021 -666666666.0 -222222222.0 44.5 6.9 \n",
+ "1500000US720531504003 -666666666.0 -222222222.0 38.7 13.0 \n",
+ "1500000US721153304003 -666666666.0 -222222222.0 40.2 10.5 \n",
+ "\n",
+ " B01002I_002E B01002I_002M B01002I_003E B01002I_003M \\\n",
+ "010179548002 NaN NaN NaN NaN \n",
+ "010179548004 NaN NaN NaN NaN \n",
+ "010179548003 NaN NaN NaN NaN \n",
+ "010150011031 NaN NaN NaN NaN \n",
+ "010150024003 NaN NaN NaN NaN \n",
+ "... ... ... ... ... \n",
+ "1500000US720210302002 53.7 42.9 58.5 17.3 \n",
+ "1500000US720210314012 56.5 13.9 63.3 15.0 \n",
+ "1500000US720210312021 49.2 14.6 43.7 5.3 \n",
+ "1500000US720531504003 33.8 11.7 47.1 15.1 \n",
+ "1500000US721153304003 40.2 10.7 40.3 16.3 \n",
+ "\n",
+ " B01003_001E B01003_001M \n",
+ "010179548002 NaN NaN \n",
+ "010179548004 NaN NaN \n",
+ "010179548003 NaN NaN \n",
+ "010150011031 NaN NaN \n",
+ "010150024003 NaN NaN \n",
+ "... ... ... \n",
+ "1500000US720210302002 597.0 239.0 \n",
+ "1500000US720210314012 977.0 285.0 \n",
+ "1500000US720210312021 1837.0 372.0 \n",
+ "1500000US720531504003 1115.0 365.0 \n",
+ "1500000US721153304003 1892.0 618.0 \n",
+ "\n",
+ "[484672 rows x 8237 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "display(demographic_profile)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "27e38351-3998-423c-83bb-67de3b008f82",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " n_total_housing_units | \n",
+ " n_vacant_housing_units | \n",
+ " n_occupied_housing_units | \n",
+ " n_owner_occupied_housing_units | \n",
+ " n_renter_occupied_housing_units | \n",
+ " n_housing_units_multiunit_structures_denom | \n",
+ " n_total_housing_units_sample | \n",
+ " median_home_value | \n",
+ " median_contract_rent | \n",
+ " n_occupied_housing_units_sample | \n",
+ " ... | \n",
+ " p_owner_occupied_units | \n",
+ " p_married | \n",
+ " p_female_headed_families | \n",
+ " p_nonhisp_white_persons | \n",
+ " p_nonhisp_black_persons | \n",
+ " p_hispanic_persons | \n",
+ " p_native_persons | \n",
+ " p_hawaiian_persons | \n",
+ " p_veterans | \n",
+ " geometry | \n",
+ "
\n",
+ " \n",
+ " | GEOID | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 010179548002 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " b'\\x01\\x06\\x00\\x00\\x00\\x01\\x00\\x00\\x00\\x01\\x03... | \n",
+ "
\n",
+ " \n",
+ " | 010179548004 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " b'\\x01\\x06\\x00\\x00\\x00\\x01\\x00\\x00\\x00\\x01\\x03... | \n",
+ "
\n",
+ " \n",
+ " | 010179548003 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " b'\\x01\\x06\\x00\\x00\\x00\\x01\\x00\\x00\\x00\\x01\\x03... | \n",
+ "
\n",
+ " \n",
+ " | 010150011031 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " b'\\x01\\x06\\x00\\x00\\x00\\x01\\x00\\x00\\x00\\x01\\x03... | \n",
+ "
\n",
+ " \n",
+ " | 010150024003 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " b'\\x01\\x06\\x00\\x00\\x00\\x01\\x00\\x00\\x00\\x01\\x03... | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 1500000US720210302002 | \n",
+ " 332.0 | \n",
+ " 104.0 | \n",
+ " 228.0 | \n",
+ " 159.0 | \n",
+ " 69.0 | \n",
+ " 332.0 | \n",
+ " 332.0 | \n",
+ " 155500.0 | \n",
+ " 582.0 | \n",
+ " 228.0 | \n",
+ " ... | \n",
+ " 68.674699 | \n",
+ " 9.854015 | \n",
+ " 15.000000 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 100.000000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 9.882747 | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 1500000US720210314012 | \n",
+ " 599.0 | \n",
+ " 155.0 | \n",
+ " 444.0 | \n",
+ " 336.0 | \n",
+ " 108.0 | \n",
+ " 599.0 | \n",
+ " 599.0 | \n",
+ " 132400.0 | \n",
+ " 519.0 | \n",
+ " 444.0 | \n",
+ " ... | \n",
+ " 74.123539 | \n",
+ " 19.549642 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 100.000000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 5.424770 | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 1500000US720210312021 | \n",
+ " 821.0 | \n",
+ " 129.0 | \n",
+ " 692.0 | \n",
+ " 554.0 | \n",
+ " 138.0 | \n",
+ " 821.0 | \n",
+ " 821.0 | \n",
+ " 103000.0 | \n",
+ " 460.0 | \n",
+ " 692.0 | \n",
+ " ... | \n",
+ " 84.287454 | \n",
+ " 10.917816 | \n",
+ " 9.859155 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 98.530212 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 3.647251 | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 1500000US720531504003 | \n",
+ " 564.0 | \n",
+ " 113.0 | \n",
+ " 451.0 | \n",
+ " 303.0 | \n",
+ " 148.0 | \n",
+ " 564.0 | \n",
+ " 564.0 | \n",
+ " 93700.0 | \n",
+ " 440.0 | \n",
+ " 451.0 | \n",
+ " ... | \n",
+ " 79.964539 | \n",
+ " 15.148189 | \n",
+ " 14.801444 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 100.000000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.807175 | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 1500000US721153304003 | \n",
+ " 787.0 | \n",
+ " 121.0 | \n",
+ " 666.0 | \n",
+ " 472.0 | \n",
+ " 194.0 | \n",
+ " 787.0 | \n",
+ " 787.0 | \n",
+ " 93900.0 | \n",
+ " 426.0 | \n",
+ " 666.0 | \n",
+ " ... | \n",
+ " 84.625159 | \n",
+ " 12.533912 | \n",
+ " 12.801932 | \n",
+ " 1.321353 | \n",
+ " 0.0 | \n",
+ " 98.678647 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.162791 | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
484672 rows × 37 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " n_total_housing_units n_vacant_housing_units \\\n",
+ "GEOID \n",
+ "010179548002 NaN NaN \n",
+ "010179548004 NaN NaN \n",
+ "010179548003 NaN NaN \n",
+ "010150011031 NaN NaN \n",
+ "010150024003 NaN NaN \n",
+ "... ... ... \n",
+ "1500000US720210302002 332.0 104.0 \n",
+ "1500000US720210314012 599.0 155.0 \n",
+ "1500000US720210312021 821.0 129.0 \n",
+ "1500000US720531504003 564.0 113.0 \n",
+ "1500000US721153304003 787.0 121.0 \n",
+ "\n",
+ " n_occupied_housing_units \\\n",
+ "GEOID \n",
+ "010179548002 NaN \n",
+ "010179548004 NaN \n",
+ "010179548003 NaN \n",
+ "010150011031 NaN \n",
+ "010150024003 NaN \n",
+ "... ... \n",
+ "1500000US720210302002 228.0 \n",
+ "1500000US720210314012 444.0 \n",
+ "1500000US720210312021 692.0 \n",
+ "1500000US720531504003 451.0 \n",
+ "1500000US721153304003 666.0 \n",
+ "\n",
+ " n_owner_occupied_housing_units \\\n",
+ "GEOID \n",
+ "010179548002 NaN \n",
+ "010179548004 NaN \n",
+ "010179548003 NaN \n",
+ "010150011031 NaN \n",
+ "010150024003 NaN \n",
+ "... ... \n",
+ "1500000US720210302002 159.0 \n",
+ "1500000US720210314012 336.0 \n",
+ "1500000US720210312021 554.0 \n",
+ "1500000US720531504003 303.0 \n",
+ "1500000US721153304003 472.0 \n",
+ "\n",
+ " n_renter_occupied_housing_units \\\n",
+ "GEOID \n",
+ "010179548002 NaN \n",
+ "010179548004 NaN \n",
+ "010179548003 NaN \n",
+ "010150011031 NaN \n",
+ "010150024003 NaN \n",
+ "... ... \n",
+ "1500000US720210302002 69.0 \n",
+ "1500000US720210314012 108.0 \n",
+ "1500000US720210312021 138.0 \n",
+ "1500000US720531504003 148.0 \n",
+ "1500000US721153304003 194.0 \n",
+ "\n",
+ " n_housing_units_multiunit_structures_denom \\\n",
+ "GEOID \n",
+ "010179548002 NaN \n",
+ "010179548004 NaN \n",
+ "010179548003 NaN \n",
+ "010150011031 NaN \n",
+ "010150024003 NaN \n",
+ "... ... \n",
+ "1500000US720210302002 332.0 \n",
+ "1500000US720210314012 599.0 \n",
+ "1500000US720210312021 821.0 \n",
+ "1500000US720531504003 564.0 \n",
+ "1500000US721153304003 787.0 \n",
+ "\n",
+ " n_total_housing_units_sample median_home_value \\\n",
+ "GEOID \n",
+ "010179548002 NaN NaN \n",
+ "010179548004 NaN NaN \n",
+ "010179548003 NaN NaN \n",
+ "010150011031 NaN NaN \n",
+ "010150024003 NaN NaN \n",
+ "... ... ... \n",
+ "1500000US720210302002 332.0 155500.0 \n",
+ "1500000US720210314012 599.0 132400.0 \n",
+ "1500000US720210312021 821.0 103000.0 \n",
+ "1500000US720531504003 564.0 93700.0 \n",
+ "1500000US721153304003 787.0 93900.0 \n",
+ "\n",
+ " median_contract_rent n_occupied_housing_units_sample \\\n",
+ "GEOID \n",
+ "010179548002 NaN NaN \n",
+ "010179548004 NaN NaN \n",
+ "010179548003 NaN NaN \n",
+ "010150011031 NaN NaN \n",
+ "010150024003 NaN NaN \n",
+ "... ... ... \n",
+ "1500000US720210302002 582.0 228.0 \n",
+ "1500000US720210314012 519.0 444.0 \n",
+ "1500000US720210312021 460.0 692.0 \n",
+ "1500000US720531504003 440.0 451.0 \n",
+ "1500000US721153304003 426.0 666.0 \n",
+ "\n",
+ " ... p_owner_occupied_units p_married \\\n",
+ "GEOID ... \n",
+ "010179548002 ... NaN NaN \n",
+ "010179548004 ... NaN NaN \n",
+ "010179548003 ... NaN NaN \n",
+ "010150011031 ... NaN NaN \n",
+ "010150024003 ... NaN NaN \n",
+ "... ... ... ... \n",
+ "1500000US720210302002 ... 68.674699 9.854015 \n",
+ "1500000US720210314012 ... 74.123539 19.549642 \n",
+ "1500000US720210312021 ... 84.287454 10.917816 \n",
+ "1500000US720531504003 ... 79.964539 15.148189 \n",
+ "1500000US721153304003 ... 84.625159 12.533912 \n",
+ "\n",
+ " p_female_headed_families p_nonhisp_white_persons \\\n",
+ "GEOID \n",
+ "010179548002 NaN NaN \n",
+ "010179548004 NaN NaN \n",
+ "010179548003 NaN NaN \n",
+ "010150011031 NaN NaN \n",
+ "010150024003 NaN NaN \n",
+ "... ... ... \n",
+ "1500000US720210302002 15.000000 0.000000 \n",
+ "1500000US720210314012 0.000000 0.000000 \n",
+ "1500000US720210312021 9.859155 0.000000 \n",
+ "1500000US720531504003 14.801444 0.000000 \n",
+ "1500000US721153304003 12.801932 1.321353 \n",
+ "\n",
+ " p_nonhisp_black_persons p_hispanic_persons \\\n",
+ "GEOID \n",
+ "010179548002 NaN NaN \n",
+ "010179548004 NaN NaN \n",
+ "010179548003 NaN NaN \n",
+ "010150011031 NaN NaN \n",
+ "010150024003 NaN NaN \n",
+ "... ... ... \n",
+ "1500000US720210302002 0.0 100.000000 \n",
+ "1500000US720210314012 0.0 100.000000 \n",
+ "1500000US720210312021 0.0 98.530212 \n",
+ "1500000US720531504003 0.0 100.000000 \n",
+ "1500000US721153304003 0.0 98.678647 \n",
+ "\n",
+ " p_native_persons p_hawaiian_persons p_veterans \\\n",
+ "GEOID \n",
+ "010179548002 NaN NaN NaN \n",
+ "010179548004 NaN NaN NaN \n",
+ "010179548003 NaN NaN NaN \n",
+ "010150011031 NaN NaN NaN \n",
+ "010150024003 NaN NaN NaN \n",
+ "... ... ... ... \n",
+ "1500000US720210302002 0.0 0.0 9.882747 \n",
+ "1500000US720210314012 0.0 0.0 5.424770 \n",
+ "1500000US720210312021 0.0 0.0 3.647251 \n",
+ "1500000US720531504003 0.0 0.0 0.807175 \n",
+ "1500000US721153304003 0.0 0.0 1.162791 \n",
+ "\n",
+ " geometry \n",
+ "GEOID \n",
+ "010179548002 b'\\x01\\x06\\x00\\x00\\x00\\x01\\x00\\x00\\x00\\x01\\x03... \n",
+ "010179548004 b'\\x01\\x06\\x00\\x00\\x00\\x01\\x00\\x00\\x00\\x01\\x03... \n",
+ "010179548003 b'\\x01\\x06\\x00\\x00\\x00\\x01\\x00\\x00\\x00\\x01\\x03... \n",
+ "010150011031 b'\\x01\\x06\\x00\\x00\\x00\\x01\\x00\\x00\\x00\\x01\\x03... \n",
+ "010150024003 b'\\x01\\x06\\x00\\x00\\x00\\x01\\x00\\x00\\x00\\x01\\x03... \n",
+ "... ... \n",
+ "1500000US720210302002 None \n",
+ "1500000US720210314012 None \n",
+ "1500000US720210312021 None \n",
+ "1500000US720531504003 None \n",
+ "1500000US721153304003 None \n",
+ "\n",
+ "[484672 rows x 37 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "display(processed_acs)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3c862c3c-49e8-4fa3-9b6c-1d376e91dac4",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/environment.yml b/environment.yml
index af76fa00..c1775513 100644
--- a/environment.yml
+++ b/environment.yml
@@ -26,3 +26,5 @@ dependencies:
- pandarm
- ibis-framework
- ultraplot
+ - pyogrio
+ - dask-geopandas
diff --git a/geosnap/io/util.py b/geosnap/io/util.py
index 30d9fd22..d4c3f568 100644
--- a/geosnap/io/util.py
+++ b/geosnap/io/util.py
@@ -7,6 +7,7 @@
import pandas as pd
import pooch
from tqdm.auto import tqdm
+import re
def get_census_gdb(years=None, geom_level="blockgroup", output_dir=".", protocol="ftp"):
@@ -54,25 +55,94 @@ def get_census_gdb(years=None, geom_level="blockgroup", output_dir=".", protocol
pooch.retrieve(urls[protocol], None, progressbar=True, fname=fn, path=pth)
-def reformat_acs_vars(col):
- """Convert variable names to the same format used by the Census Detailed Tables API.
-
+def normalize_acs_vars(col):
+ """Normalize ACS variable names to the canonical Census API format.
+
See for variable descriptions
+    Supported conversions:
+
+ Old-style TIGER_DP names:
+ B02001e1 -> B02001_001E
+ B19013e1 -> B19013_001E
+
+ Newer TIGER_DP names:
+ B02001_E001 -> B02001_001E
+ B02001_M001 -> B02001_001M
+
+ Already-canonical names:
+ B02001_001E -> B02001_001E
+ B02001_001M -> B02001_001M
+
+    GEOID, GEOIDFQ, GEOID_Data, geometry, and names matching no pattern above are returned as-is.
Parameters
----------
col : str
- column name to adjust
+ Column name to adjust.
Returns
-------
str
- reformatted column name
+ Normalized ACS-style column name.
"""
- pieces = col.split("e")
- formatted = pieces[0] + "_" + pieces[1].rjust(3, "0") + "E"
- return formatted
+ col = str(col).strip()
+ if col in {"GEOID", "GEOIDFQ", "GEOID_Data", "geometry"}:
+ return col
+
+ # Older style: B02001e1 -> B02001_001E
+ old_style = re.match(r"^([A-Za-z0-9]+)e(\d+)$", col)
+ if old_style:
+ stem, num = old_style.groups()
+ return f"{stem.upper()}_{num.rjust(3, '0')}E"
+
+ # 2022 style: B02001_E001 -> B02001_001E
+ new_style = re.match(r"^([A-Za-z0-9]+)_([EM])(\d{3})$", col, flags=re.IGNORECASE)
+ if new_style:
+ stem, suffix, num = new_style.groups()
+ return f"{stem.upper()}_{num}{suffix.upper()}"
+
+ canonical = re.match(r"^([A-Za-z0-9]+)_(\d{3})([EM])$", col, flags=re.IGNORECASE)
+ if canonical:
+ stem, num, suffix = canonical.groups()
+ return f"{stem.upper()}_{num}{suffix.upper()}"
+
+ return col
+
+
+def find_geoid_column(columns):
+ """Identify the GEOID-like column in a set of column names.
+
+ Supports naming conventions used across Census vintages, e.g.:
+ GEOID
+ GEOIDFQ
+ GEOID_Data
+ GEOID20, GEOID10, etc.
+
+ Parameters
+ ----------
+ columns : iterable
+ Collection of column names (DataFrame.columns)
+
+ Returns
+ -------
+ str or None
+    Name of the detected GEOID-like column, or None (after emitting a warning) if not found
+ """
+ # Preferred explicit matches first (most stable)
+ priority = ["GEOID", "GEOIDFQ", "GEOID_Data"]
+ for candidate in priority:
+ if candidate in columns:
+ return candidate
+
+    # Fallback: first column whose name starts with "GEOID" (case-insensitive)
+ for col in columns:
+ if re.match(r"^GEOID", str(col), flags=re.IGNORECASE):
+ return col
+
+ # If no GEOID column found, warn
+ warn(f"No GEOID-like column found. Columns are: {list(columns)}")
+ return None
def convert_census_gdb(
@@ -134,13 +204,15 @@ def convert_census_gdb(
if gdb_path is None:
warn("No `gdb_path` given. Data will be pulled from the Census server")
gdb_path = f"https://www2.census.gov/geo/tiger/TIGER_DP/{year}ACS/ACS_{year}_5YR_{level.upper()}.gdb.zip"
- if layers is None: # grab them all except the metadata
+ if layers is None: # grab them all except metadata layers
year_suffix = year[-2:]
meta_str = f"{level.upper()}_METADATA_20{year_suffix}"
layers = [layer[0] for layer in ogr.list_layers(gdb_path)]
- if meta_str in layers:
- layers.remove(meta_str)
-
+ layers = [
+ layer for layer in layers
+ if layer != meta_str and not layer.endswith("_METADATA")
+ ]
+
tables = list()
existing_files = os.listdir(output_dir)
for i in tqdm(layers):
@@ -163,17 +235,29 @@ def convert_census_gdb(
) # remove prefix for bgs
tables.append(df)
else:
- df = (
- dgpd.read_file(gdb_path, layer=i, npartitions=npartitions)
- .compute()
- .set_index("GEOID")
- )
+ raw = dgpd.read_file(gdb_path, layer=i, npartitions=npartitions).compute()
+
+ geoid_col = find_geoid_column(raw.columns)
+ if geoid_col is None:
+ warn(f"Skipping layer {i} because no GEOID column was found")
+ continue
+
+ df = raw.set_index(geoid_col)
+
if "ACS_" not in i: # only the geoms have the ACS prefix
- df = df[df.columns[df.columns.str.contains("e")]]
- df.columns = pd.Series(df.columns).apply(reformat_acs_vars)
+ candidate_cols = df.columns[
+ df.columns.str.match(r"^[A-Za-z0-9]+e\d+$", na=False) # old style: B02001e1
+ | df.columns.str.match(r"^[A-Za-z0-9]+_[EM]\d{3}$", na=False) # new style: B02001_E001 / B02001_M001
+ | df.columns.str.match(r"^[A-Za-z0-9]+_\d{3}[EM]$", na=False) # canonical: B02001_001E / B02001_001M
+ ]
+ df = df[candidate_cols]
+ df.columns = pd.Index([normalize_acs_vars(col) for col in df.columns])
+
df = df.dropna(axis=1, how="all")
- df.index = df.index.str.replace("14000US", "") # remove prefix for tracts
- df.index = df.index.str.replace("15000US", "") # remove prefix for bgs
+ df.index = df.index.astype(str)
+ df.index = df.index.str.replace("14000US", "", regex=False)
+ df.index = df.index.str.replace("15000US", "", regex=False)
+
if combine:
tables.append(df)
if save_intermediate:
diff --git a/geosnap/io/variables.csv b/geosnap/io/variables.csv
index 65b5b114..da1f16e1 100644
--- a/geosnap/io/variables.csv
+++ b/geosnap/io/variables.csv
@@ -1,195 +1,195 @@
-variable,label,formula,ltdb,ncdb,census_1990_form,census_1990_table_column,census_2000_form,census_2000_table_column,acs,category,notes
-geoid,FIPS code,,geoid,GEO2010,,,,,,,
-n_mexican_pop,persons of Mexican parentage or ancestry,,mex,MEXIC,SF1,P0090001,SF1,PCT011004,B03001_004E,Ethnicity & Immigration,
-n_cuban_pop,persons of Cuban parentage or ancestry,,cuban,CUBAN,SF1,P0090004,SF1,PCT011006,B03001_006E,Ethnicity & Immigration,
-n_puerto_rican_pop,persons of Puerto Rican parentage or ancestry,,pr,PRICAN,SF1,P0090003,SF1,PCT011005,B03001_005E,Ethnicity & Immigration,
-n_russian_pop,persons of Russian/USSR parentage or ancestry,,ruanc,,SF3,P0330022,SF3,PCT016064+PCT016053+PCT016052+PCT016037,B04004_064E,Ethnicity & Immigration,ruancXX (page 17 of LTDB codebook) suggests that USSR is only selected for 1970. I gather you're aggregating soviet countries individually? 1990 doesn't seem to have USSR or several of its constituents
-n_italian_pop,persons of Italian parentage or ancestry,,itanc,,SF3,P0330016,SF3,PCT016051,B04004_051E,Ethnicity & Immigration,
-n_german_pop,persons of German parentage or ancestry,,geanc,,SF3,P0330012,SF3,PCT016042,B04004_042E,Ethnicity & Immigration,
-n_irish_pop,persons of Irish parentage or ancestry,,iranc,,SF3,P0330015,SF3,PCT016049,B04004_049E,Ethnicity & Immigration,
-n_scandaniavian_pop,persons of Scandinavian parentage/ancestry,,scanc,,,,SF3,PCT016059+PCT016039+PCT016033+PCT016090,B04004_065E,Ethnicity & Immigration,"scanXX (page 18 of LTDB codebook) suggests dedicated nationalities are used in 1990 and 2000. This is despite there being a scandinavian category in 2000 SF3 (PCT016065); [ek]: similarly, the ACS lists both the scandanavian category *and* the individual country nationalities?"
-n_total_pop_sample,total population from sample-based data,,dfb,,,,,,,Ethnicity & Immigration,LTDB suggests 1980 only
-n_foreign_born_pop,foreign-born,,fb,FORBORN,SF3,P0360001:10,SF3,P021013,B05002_013E,Ethnicity & Immigration,
-n_recent_immigrant_pop,recent immigrants (within the past 10 years),,n10imm,,SF3,P0360001:04,SF3,P023002,B05005_007E,Ethnicity & Immigration,
-n_naturalized_pop,naturalized foreign-born,,nat,FORBCZN,SF3,P0370005,SF3,P021014,B05002_014E,Ethnicity & Immigration,
-n_age_5_older,persons 5 years and over,,ag5up,,SF3,P0130004:31,SF3,P019001,B16001_001E,Ethnicity & Immigration,
-n_other_language,persons who speak language other than English at home,,olang,,SF3,P0310002:26,SF3,P019001 - (P019025+P019003+P019047),B16001_001E - B16001_002E,Ethnicity & Immigration,Construct census 2000 count by subtraction from P019001
-n_limited_english,persons who speak English not well,,lep,,SF3,P0280004+P0280007+P0280010+P0280014+P0280017+P0280020+P0280024+P0280027+P0280030,SF3,P019022+P019023+P019029+P019013+P019012+P019017+P019018+P019007+P019008+P019061+P019062+P019067+P019066+P019052+P019051+P019057+P019056+P019040+P019045+P019044+P019030+P019039+P019035+P019034,DP02_0113E,Ethnicity & Immigration,"[ljw] cant tell if this includes ""speak other Languages"" as a catchall or if that is a pre-crosstab"
-n_russian_born_pop,persons who were born in Russia/ USSR,,rufb,,,,SF3,PCT019026,B05006_040E,Ethnicity & Immigration,"[ek] this is STF4 in 1990, so not available from the API"
-n_italian_born_pop,persons who were born in Italy,,itfb,,,,SF3,PCT019016,B05006_023E,Ethnicity & Immigration,"[ek] this is STF4 in 1990, so not available from the API"
-n_german_born_pop,persons who were born in Germany,,gefb,,,,SF3,PCT019011,B05006_017E,Ethnicity & Immigration,"[ek] this is STF4 in 1990, so not available from the API"
-n_irish_born_pop,persons who were born in Ireland,,irfb,,,,SF3,PCT019005,B05006_008E,Ethnicity & Immigration,"[ek] this is STF4 in 1990, so not available from the API"
-n_scandaniavian_born_pop,persons who were born in Scandinavian Countries,,scfb,,,,SF3,PCT019006+PCT019007,B05006_009E+B05006_010E+B05006_011E+B05006_012E,Ethnicity & Immigration,"[ek] this is STF4 in 1990, so not available from the API"
-p_mexican_pop,percentage of persons of Mexican parentage or ancestry,p_mexican_pop=n_mexican_pop / n_total_pop*100,pmex,,,,,,,Ethnicity & Immigration,
-p_cuban_pop,percentage of persons of Cuban parentage or ancestry,p_cuban_pop=n_cuban_pop / n_total_pop*100,pcuban,,,,,,,Ethnicity & Immigration,
-p_puerto_rican_pop,percentage of persons of Puerto Rican parentage or ancestry,p_puerto_rican_pop=n_puerto_rican_pop / n_total_pop*100,ppr,,,,,,,Ethnicity & Immigration,
-p_russian_pop,percentage of persons of Russian/USSR parentage or ancestry,p_russian_pop=n_russian_pop / n_total_pop*100,pruanc,,,,,,,Ethnicity & Immigration,
-p_italian_pop,percentage of persons of Italian parentage or ancestry,p_italian_pop=n_italian_pop / n_total_pop*100,pitanc,,,,,,,Ethnicity & Immigration,
-p_german_pop,percentage of persons of German parentage or ancestry,p_german_pop=n_german_pop / n_total_pop*100,pgeanc,,,,,,,Ethnicity & Immigration,
-p_irish_pop,percentage of persons of Irish parentage or ancestry,p_irish_pop=n_irish_pop / n_total_pop*100,piranc,,,,,,,Ethnicity & Immigration,
-p_scandanavian_pop,percentage of persons of Scandinavian parentage/ancestry,p_scandanavian_pop=n_scandaniavian_pop / n_total_pop*100,pscanc,,,,,,,Ethnicity & Immigration,
-p_foreign_born_pop,percentage of foreign-born,p_foreign_born_pop=n_foreign_born_pop / n_total_pop*100,pfb,SHRFOR,,,,,,Ethnicity & Immigration,
-p_recent_immigrant_pop,percentage of recent immigrants (within the past 10 years),p_recent_immigrant_pop=n_recent_immigrant_pop / n_total_pop*100,p10imm,,,,,,,Ethnicity & Immigration,
-p_naturalized_pop,percentage of naturalized foreign-born,p_naturalized_pop=n_naturalized_pop / n_total_pop*100,pnat,,,,,,,Ethnicity & Immigration,
-p_other_language,percentage of persons who speak language other than English at home,p_other_language=n_other_language / n_total_pop*100,polang,,,,,,,Ethnicity & Immigration,
-p_limited_english,percentage of persons who speak English not well,p_limited_english=n_limited_english / n_total_pop*100,plep,,,,,,,Ethnicity & Immigration,
-p_russian_born_pop,percentage of persons who were born in Russia/ USSR,p_russian_born_pop=n_russian_born_pop / n_total_pop*100,prufb,,,,,,,Ethnicity & Immigration,
-p_italian_born_pop,percentage of persons who were born in Italy,p_italian_born_pop=n_italian_born_pop / n_total_pop*100,pitfb,,,,,,,Ethnicity & Immigration,
-p_german_born_pop,percentage of persons who were born in Germany,p_german_born_pop=n_german_born_pop / n_total_pop*100,pgefb,,,,,,,Ethnicity & Immigration,
-p_irish_born_pop,percentage of persons who were born in Ireland,p_irish_born_pop=n_irish_born_pop / n_total_pop*100,pirfb,,,,,,,Ethnicity & Immigration,
-p_scandanavian_born_pop,percentage of persons who were born in Scandinavian Countries,p_scandanavian_born_pop=n_scandaniavian_born_pop / n_total_pop*100,pscfb,,,,,,,Ethnicity & Immigration,
-n_total_housing_units,housing units,,hu,TOTHSUN,SF1,H0010001,SF1,H001001,B25002_001E,"Housing, Age, & Marital Status",
-n_vacant_housing_units,vacant housing units,,vac,VACHU,SF1,H0020002,SF1,H003003,B25002_003E,"Housing, Age, & Marital Status",divide by B25002_001E for vacancy rate
-n_occupied_housing_units,occupied housing units,,ohu,OCCHU,SF1,H0020001,SF1,H003002,B25002_002E,"Housing, Age, & Marital Status",
-n_owner_occupied_housing_units,owner-occupied housing units,,own,OWNOCC,SF1,H0030001,SF1,H004002,B25003_002E,"Housing, Age, & Marital Status",
-n_renter_occupied_housing_units,renter-occupied housing units,,rent,RNTOCC,SF1,H0030002,SF1,H004003,B25003_003E,"Housing, Age, & Marital Status",
-n_housing_units_multiunit_structures_denom,housing units denom,n_housing_units_multiunit_structures_denom=n_total_housing_units,dmulti,,,,,,B25024_001E,"Housing, Age, & Marital Status",
-n_housing_units_multiunit_structures,housing units in multi-unit structures,,multi,,,,SF3,H030004+H030005+H030006+H030007+H030008+H030009,B25024_004E+B25024_005E+B25024_006E+B25024_007E+B25024_008E+B25024_009E,"Housing, Age, & Marital Status",[ljw] LTDB is unclear as to the relevant computed column from SF3-H030*. Recorded columns here are all stationary housing units (not mobile home (H030010) or RV/Van/Boat (H030011)
-n_total_housing_units_sample,housing units in sample-based data,n_total_housing_units_sample=n_total_housing_units,husp,,,,,,B25024_001E,"Housing, Age, & Marital Status",
-median_home_value,Median home value,,mhmval,MDVALHS,SF3,H061A001,SF3,H085001,B25077_001E,"Housing, Age, & Marital Status",
-median_contract_rent,Median monthly contract rent,,mrent,MDGRENT,SF3,H043A001,SF3,H056001,B25058_001E,"Housing, Age, & Marital Status",
-n_structures_30_old,structures built more than 30 years ago,,h30old,,SF3,H0250005+H0250006+H0250007+H0250008,SF3,H034010+H034009+H034008+H034007,,"Housing, Age, & Marital Status",
-n_occupied_housing_units_sample,occupied housing units in sample-based data,,ohusp,,SF3,H0040001,SF3,H006001,B25003_001E,"Housing, Age, & Marital Status",
-n_household_recent_move,household heads moved into unit less than 10 years ago,,h10yrs,,SF3,H0250006+H0250007+H0250008,SF3,H038003+H038004+H038005+H038010+H038011+H038012,,"Housing, Age, & Marital Status",
-n_persons_under_18,persons age 17 years and under,,a18und,NCHILD,SF3,P0130012+P0130011+P0130010+P0130009+P0130008+P0130007+P0130006+P0130005+P0130004+P0130003+P0130002+P0130001,SF1,P012003+P012004+P012005+P012006+P012027+P012028+P012029+P012030,B01001_003E+B01001_004E+B01001_005E+B01001_006E+B01001_027E+B01001_028E+B01001_029E+B01001_030E,"Housing, Age, & Marital Status",
-n_persons_over_60,persons age 60 years and over,,a60up,,SF3,P0130025+P0130026+P0130027+P0130028+P0130029+P0130030+P0130031,SF1,P012018:025+ P012042:049,B01001_018E+B01001_019E+B01001_020E+B01001_021E+B01001_022E+B01001_023E+B01001_024E+B01001_025E+B01001_042E+B01001_043E+B01001_044E+B01001_045E+B01001_046E+B01001_047E+B01001_048E+B01001_049E,"Housing, Age, & Marital Status",
-n_persons_over_75,persons age 75 years and over,,a75up,,SF3,P0130029+P0130030+P0130031,SF1,P012023:25+ P012047:49,B01001_047E+B01001_048E+B01001_049E+B01001_023E+B01001_024E+B01001_025E,"Housing, Age, & Marital Status",
-n_persons_over_15,population 15 years and over,,ag15up,PERS15P,SF3,P0130010+P0130011+P0130012+P0130013+P0130014+P0130015+P0130016+P0130017+P0130018+P0130019+P0130020+P0130021+P0130022+P0130023+P0130024+P0130025+P0130026+P0130027+P0130028+P0130029+P0130030+P0130031,SF3,P018001,B12001_001E,"Housing, Age, & Marital Status",
-n_persons_over_25,population 25 years and over,,ag25up,,SF3,P0130018+P0130019+P0130020+P0130021+P0130022+P0130023+P0130024+P0130025+P0130026+P0130027+P0130028+P0130029+P0130030+P0130031,SF3,P037001,B15002_001E,"Housing, Age, & Marital Status",denominator for educational attainment
-n_married,currently married (excluding separated),,mar,MMARSPP,SF3,P0250001+P0250002,SF3,(P018004+P018013) - (P018007+P018016),B12001_005E,"Housing, Age, & Marital Status",
-n_widowed_divorced,"widowed, divorced, and separated",,wds,,SF3,P0270005+P0270011+P0270006+P0270012,SF3,P018007+P018009+P018010+P018016+P018018+P018019,B12001_007E+B12001_009E+B12001_010E+B12001_016E+B12001_018E+B12001_019E,"Housing, Age, & Marital Status",
-n_total_families,total families,,family,FAMSUB,SF3,P0040001,SF1,P031001,B17010_001E,"Housing, Age, & Marital Status",denominator for calculating % female-headed families w/ children
-n_female_headed_families,female-headed families with children,,fhh,NFFH,SF3,P0190005,SF1,P035016,B17010_017E,"Housing, Age, & Marital Status",numerator for calculating % female-headed families w/ children
-p_vacant_housing_units,percentage of vacant housing units,p_vacant_housing_units=n_vacant_housing_units / n_total_housing_units*100,pvac,,,,,,,"Housing, Age, & Marital Status",
-p_owner_occupied_units,percentage of owner-occupied housing units,p_owner_occupied_units=n_occupied_housing_units / n_total_housing_units*100,pown,,,,,,,"Housing, Age, & Marital Status",
-p_housing_units_multiunit_structures,percentage of housing units in multi-unit structures,p_housing_units_multiunit_structures=n_housing_units_multiunit_structures / n_housing_units_multiunit_structures_denom*100,pmulti,,,,,,,"Housing, Age, & Marital Status",
-p_structures_30_old,percentage of structures built more than 30 years ago,p_structures_30_old=n_structures_30_old / n_housing_units_multiunit_structures_denom*100,p30old,,,,,,,"Housing, Age, & Marital Status",
-p_household_recent_move,percentage of household heads moved into unit less than 10 years ago,p_household_recent_move=n_household_recent_move / n_total_households*100,p10yrs,,,,,,,"Housing, Age, & Marital Status",
-p_persons_under_18,percentage of persons age 17 years and under,p_persons_under_18=n_persons_under_18 / n_total_pop*100,p18und,,,,,,,"Housing, Age, & Marital Status",
-p_persons_over_60,percentage of persons age 60 years and over,p_persons_over_60=n_persons_over_60 / n_total_pop*100,p60up,,,,,,,"Housing, Age, & Marital Status",
-p_persons_over_75,percentage of persons age 75 years and over,p_persons_over_75=n_persons_over_75 / n_total_pop*100,p75up,,,,,,,"Housing, Age, & Marital Status",
-p_married,percent currently married (excluding separated),p_married=n_married / n_persons_over_15*100,pmar,,,,,,,"Housing, Age, & Marital Status",
-p_widowed_divorced,"percent widowed, divorced, and separated",p_widowed_divorced=n_widowed_divorced / n_persons_over_15*100,pwds,,,,,,,"Housing, Age, & Marital Status",should denom be families or individuals?
-p_female_headed_families,percentage of female-headed families with children,p_female_headed_families=n_female_headed_families / n_total_families*100,pfhh,,,,,,,"Housing, Age, & Marital Status",should denom be families or individuals?
-n_white_persons,persons of white race,,white,NSHRWHT,SF1,P0100001+P0100006,SF1,P003003,,Race & Age,"[ljw] inferring here, documentation suggests this is only available for 1970, but white alone (regardless of hispanic) gives this count? [ek] I think the question about hispanic ethnicity was added in 1980, so this is the best they can do to measdure the 'white alone' construct in 1970 "
-n_nonhisp_white_persons,"persons of white race, not Hispanic origin",,nhwht,NSHRNHW,SF1,P0100001,SF1,P004005,B03002_003E,Race & Age,"[ljw] I read this as P004005 (not hispanic white alone), not P003003 (white alone)"
-n_black_persons,persons of black race,,black,NSHRBLK,SF1,P0100007+P0100002,SF1,P003004,,Race & Age,
-n_nonhisp_black_persons,"persons of black race, not Hispanic origin",,nhblk,NSHRNHB,SF1,P0100002,SF1,P004006,B03002_004E,Race & Age,
-n_hispanic_persons,persons of Hispanic origin,,hisp,NSHRHSP,SF1,P0080001,SF1,P004002,B03002_012E,Race & Age,
-n_native_persons,"persons of Native American race, not Hispanic origin",,ntv,NSHRAMI,SF1,P0070003,SF1,P004007,B03002_005E,Race & Age,is this nonhispanic?
-n_hawaiian_persons,"persons of Hawaiian race, not Hispanic origin",,haw,NSHRHIP,SF1,P0070017,SF1,P004009,B02001_006E,Race & Age,is this nonhispanic?
-n_asian_indian_persons,persons of Asian Indian race,,india,,SF1,P0070009,SF1,PCT007002,B03002_006E+B03002_007E,Race & Age,
-n_chinese_persons,persons of Chinese race,,china,,SF1,P0070006,SF1,PCT007005+PCT007015,B02015_007E+B02015_020E,Race & Age,
-n_filipino_persons,persons of Filipino race,,filip,,SF1,P0070007,SF1,PCT007006,B02015_008E,Race & Age,
-n_japanese_persons,persons of Japanese race,,japan,,SF1,P0070008,SF1,PCT007009,B02015_011E,Race & Age,
-n_korean_persons,persons of Korean race,,korea,,SF1,P0070010,SF1,PCT007010,B02015_012E,Race & Age,
-n_asian_persons,persons of Asian race,,asian,,SF1,P0060004,SF1,P004008,B03002_006E+B03002_007E,Race & Age,for 1990 this is Asian and PI
-n_vietnamese_persons,persons of Vietnamese race,,viet,,SF1,P0070011,SF1,PCT007017,B02018_022E,Race & Age,
-n_white_age_distribution,white population with known age distribution,,agewht,,SF1,P0120001:62,SF1,PCT012I001,B01001H_001E,Race & Age,
-n_white_under_15,0-15 years old of white race,,a15wht,,SF1,P0120001:09+P0120032:40,SF1,PCT012I003:018+PCT012I108:122,B01001H_003E+B01001H_004E+B01001H_005E+B01001H_018E+B01001H_019E+B01001H_020E,Race & Age,
-n_white_over_60,60 years and older of white race,,a60wht,,SF1,P0120025:31+P0120056:62,SF1,PCT012I063:105+PCT012I167:209,,Race & Age,is this nonhispanic?
-n_white_over_65,65 years and older of non-Hispanic whites,,a65wht,,SF1,P0120027:31+P0120058:62,SF1,PCT012I068:105+PCT012I172:209,B01001H_014E+B01001H_015E+B01001H_016E+B01001H_029E+B01001H_030E+B01001H_031E,Race & Age,
-n_black_age_distribution,black population with known age distribution,,ageblk,,SF1,P0120063:0124,SF1,PCT012J001,B01001B_001E,Race & Age,
-n_black_under_15,0-15 years old of black race,,a15blk,,SF1,P0120063:71+P0120094:102,SF1,PCT012J003:018+PCT012J108:122,B01001B_003E+B01001B_004E+B01001B_005E+B01001B_018E+B01001B_019E+B01001B_020E,Race & Age,
-n_black_over_60,60 years and older of black race,,a60blk,,SF1,P0120087:93+P0120118:124,SF1,PCT012J063:105+PCT012J167:209,,Race & Age,
-n_black_over_65,65 years and older of black race,,a65blk,,SF1,P0120089:93+P0120120:124,SF1,PCT012J068:105+PCT012J172:209,B01001B_014E+B01001B_015E+B01001B_016E+B01001B_029E+B01001B_030E+B01001B_031E,Race & Age,
-n_hispanic_age_distribution,Hispanic population with known age distribution,,agehsp,,SF1,P0130001:62,SF1,PCT012H001,B01001I_001E,Race & Age,
-n_hispanic_under_15,"0-15 years old, persons of Hispanic origins",,a15hsp,,SF1,P0130001:09+P0130032:40,SF1,PCT012H003:018+PCT012H108:122,B01001I_003E+B01001I_004E+B01001I_005E+B01001I_018E+B01001I_019E+B01001I_020E,Race & Age,
-n_hispanic_over_60,"60 years and older, persons of Hispanic origins",,a60hsp,,SF1,P0130025:31+P0130056:62,SF1,PCT012H063:105+PCT012H167:209,,Race & Age,
-n_hispanic_over_65,"65 years and older, persons of Hispanic origins",,a65hsp,,SF1,P0130027:31+P0130058:62,SF1,PCT012H068:105+PCT012H172:209,B01001I_014E+B01001I_015E+B01001I_016E+B01001I_029E+B01001I_030E+B01001I_031E,Race & Age,
-n_native_age_distribution,Native American population with known age distribution,,agentv,,SF1,P0120125:186,SF1,PCT012K001,B01001C_001E,Race & Age,
-n_native_under_15,0-15 years old of Native American race,,a15ntv,,SF1,P0120125:133 +P0120156:164,SF1,PCT012K003:018+PCT012K108:122,B01001C_003E+B01001C_004E+B01001C_005E+B01001C_018E+B01001C_019E+B01001C_020E,Race & Age,
-n_native_over_60,60 years and older of Native American race,,a60ntv,,SF1,P0120149:155+P0120180:186,SF1,PCT012K063:105+PCT012K167:209,,Race & Age,
-n_native_over_65,65 years and older of Native American race,,a65ntv,,SF1,P0120151:155+P0120182:186,SF1,PCT012K068:105+PCT012K172:209,B01001C_014E+B01001C_015E+B01001C_016E+B01001C_029E+B01001C_030E+B01001C_031E,Race & Age,
-n_asian_age_distribution,Asian and Pacific Islander population with known age distribution,,ageasn,,SF1,P0120187:248,SF1,PCT012L001+PCT012M001,B01001D_001E+B01001E_001E,Race & Age,
-n_asian_under_15,0-15 years old of Asians and Pacific Islanders,,a15asn,,SF1,P0120187:195+P0120218:226,SF1,PCT012M003:018+PCT012M108:122+PCT012L003:018+PCT012L108:122,B01001D_003E+B01001D_004E+B01001D_005E+B01001D_018E+B01001D_019E+B01001D_020E+B01001E_003E+B01001E_004E+B01001E_005E+B01001E_018E+B01001E_019E+B01001E_020E,Race & Age,
-n_asian_over_60,60 years and older of Asians and Pacific Islanders,,a60asn,,,,SF1,PCT012M063:105+PCT012M167:209,,Race & Age,
-n_asian_over_65,65 years and older of Asians and Pacific Islanders,,a65asn,,,,SF1,PCT012M068:105+PCT012M172:209+PCT012L068:105+PCT012L172:209,B01001D_014E+B01001D_015E+B01001D_016E+B01001E_014E+B01001E_015E+B01001E_016E+B01001E_029E+B01001E_030E+B01001E_031E+B01001D_029E+B01001D_030E+B01001D_031E,Race & Age,
-p_white_persons,percentage of persons of white race,,pwhite,,,,,,,Race & Age,
-p_black_persons,percentage of persons of black race,,pblack,,,,,,,Race & Age,
-p_nonhisp_white_persons,"percentage of persons of white race, not Hispanic origin",p_nonhisp_white_persons=n_nonhisp_white_persons / n_total_pop*100,pnhwht,SHRNHW,,,,,,Race & Age,
-p_nonhisp_black_persons,"percentage of persons of black race, not Hispanic origin",p_nonhisp_black_persons=n_nonhisp_black_persons / n_total_pop*100,pnhblk,SHRNHB,,,,,,Race & Age,
-p_hispanic_persons,percentage of persons of Hispanic origin,p_hispanic_persons=n_hispanic_persons / n_total_pop*100,phisp,SHRHSP,,,,,,Race & Age,
-p_native_persons,percentage of persons of Native American race,p_native_persons=n_native_persons / n_total_pop*100,pntv,SHRNHI,,,,,,Race & Age,
-p_asian_persons,percentage of persons of Asian race (and Pacific Islander),p_asian_persons=n_asian_persons / n_total_pop*100,pasian,SHRNHR,,,,,,Race & Age,
-p_hawaiian_persons,percentage of persons of Hawaiian race,p_hawaiian_persons=n_hawaiian_persons / n_total_pop*100,phaw,SHRNHH,,,,,,Race & Age,
-p_asian_indian_persons,percentage of persons of Asian Indian race,p_asian_indian_persons=n_asian_indian_persons / n_total_pop*100,pindia,,,,,,,Race & Age,
-p_chinese_persons,percentage of persons of Chinese race,p_chinese_persons=n_chinese_persons / n_total_pop*100,pchina,,,,,,,Race & Age,
-p_filipino_persons,percentage of persons of Filipino race,p_filipino_persons=n_filipino_persons / n_total_pop*100,pfilip,,,,,,,Race & Age,
-p_japanese_persons,percentage of persons of Japanese race,p_japanese_persons=n_japanese_persons / n_total_pop*100,pjapan,,,,,,,Race & Age,
-p_korean_persons,percentage of persons of Korean race,p_korean_persons=n_korean_persons / n_total_pop*100,pkorea,,,,,,,Race & Age,
-p_vietnamese_persons,percentage of persons of Vietnamese race,p_vietnamese_persons=n_vietnamese_persons / n_total_pop*100,pviet,,,,,,,Race & Age,
-p_white_under_15,percentage of 0-15 years old of white race,p_white_under_15=n_white_under_15 / n_total_pop*100,p15wht,,,,,,,Race & Age,
-p_white_over_60,percentage of 60 years and older of white race,p_white_over_60=n_white_over_60 / n_total_pop*100,p60wht,,,,,,,Race & Age,
-p_white_over_65,percentage of 65 years and older of non-Hispanic whites,p_white_over_65=n_white_over_65 / n_total_pop*100,p65wht,,,,,,,Race & Age,
-p_black_under_15,percentage of 0-15 years old of black race,p_black_under_15=n_black_under_15 / n_total_pop*100,p15blk,,,,,,,Race & Age,
-p_black_over_60,percentage of 60 years and older of black race,p_black_over_60=n_black_over_60 / n_total_pop*100,p60blk,,,,,,,Race & Age,
-p_black_over_65,percentage of 65 years and older of black race,p_black_over_65=n_black_over_65 / n_total_pop*100,p65blk,,,,,,,Race & Age,
-p_hispanic_under_15,"percentage of 0-15 years old, persons of Hispanic origins",p_hispanic_under_15=n_hispanic_under_15 / n_total_pop*100,p15hsp,,,,,,,Race & Age,
-p_hispanic_over_60,"percentage of 60 years and older, persons of Hispanic origins",p_hispanic_over_60=n_hispanic_over_60 / n_total_pop*100,p60hsp,,,,,,,Race & Age,
-p_hispanic_over_65,"percentage of 65 years and older, persons of Hispanic origins",p_hispanic_over_65=n_hispanic_over_65 / n_total_pop*100,p65hsp,,,,,,,Race & Age,
-p_native_under_15,percentage of 0-15 years old of Native American race,p_native_under_15=n_native_under_15 / n_total_pop*100,p15ntv,,,,,,,Race & Age,
-p_native_over_60,percentage of 60 years and older of Native American race,p_native_over_60=n_native_over_60 / n_total_pop*100,p60ntv,,,,,,,Race & Age,
-p_native_over_65,percentage of 65 years and older of Native American race,p_native_over_65=n_native_over_65 / n_total_pop*100,p65ntv,,,,,,,Race & Age,
-p_asian_under_15,percentage of 0-15 years old of Asians and Pacific Islanders,p_asian_under_15=n_asian_under_15 / n_total_pop*100,p15asn,,,,,,,Race & Age,
-p_asian_over_60,percentage of 60 years and older of Asians and Pacific Islanders,p_asian_over_60=n_asian_over_60 / n_total_pop*100,p60asn,,,,,,,Race & Age,
-p_asian_over_65,percentage of 65 years and older of Asians and Pacific Islanders,p_asian_over_65=n_asian_over_65 / n_total_pop*100,p65asn,,,,,,,Race & Age,
-n_female_over_16,"females 16 years and over, except in armed forces",,dflabf,DCFEPR,SF3,P0700006+P0700007+P0700008,SF3,P043012,,Socioeconomic Status,
-n_female_labor_force,females in labor force,,flabf,FEPR,SF3,P0700006+P0700007,SF3,P043010,,Socioeconomic Status,
-n_labor_force,civilian labor force,,clf,,SF3,P0700002+P0700003+P0700006+P0700007,SF3,P043005+P043012,B27011_002E,Socioeconomic Status,
-n_unemployed_persons,unemployed persons,,unemp,,SF3,P0700003+P0700007,SF3,P043007+P043014,B23001_008E+B23001_015E+B23001_022E+B23001_029E+B23001_036E+B23001_044E+B23001_050E+B23001_057E+B23001_064E+B23001_071E+B23001_094E+B23001_101E+B23001_108E+B23001_115E+B23001_122E+B23001_129E+B23001_136E+B23001_143E+B23001_150E+B23001_157E,Socioeconomic Status,
-n_employed_over_16,employed persons 16 years and over,,empclf,EMPMT,SF3,P0700002+P0700006,SF3,P049001,B23001_007E+B23001_014E+B23001_021E+B23001_028E+B23001_035E+B23001_042E+B23001_049E+B23001_049E+B23001_056E+B23001_063E+B23001_070E+B23001_093E+B23001_100E+B23001_107E+B23001_114E+B23001_121E+B23001_128E+B23001_135E+B23001_142E+B23001_149E+B23001_156E,Socioeconomic Status,
-n_employed_professional,professional employees (by occupations),,prof,DLFRAT,SF3,P0780001+P0780002,SF3,P049017+P049044,,Socioeconomic Status,
-n_employed_manufacturing,manufacturing employees (by industries),,manuf,PRFEMP,SF3,P0770004+P0770005,SF3,P049007+P049034,,Socioeconomic Status,
-n_employed_self_employed,self-employed,,semp,,SF3,P0790006,SF3,P051012+P051023+P051033+P051044+P051055+P051065,,Socioeconomic Status,
-n_civilians_over_16,civilian population 16 years and over,,ag16cv,,SF3,P0640002+P0640003+P0640005+P0640006+P0640008+P0640009 +P0640011+P0640012,SF3,P043005+P043012,C24010_001E,Socioeconomic Status,
-n_civilians_over_18,civilian population 18 years and over,,ag18cv,,,,SF3,P039005+P039010+P039016+P039021,,Socioeconomic Status,
-n_veterans,veterans,,vet,,SF3,P0640002+P0640005+P0640008+P0640011,SF3,P039006+P039011+P039017+P039022,B21001_002E,Socioeconomic Status,
-n_civilians_16_64,civilian non-institutionalized persons 16-64 years old,,cni16u,,SF3,P0640002+P0640003+P0640008+P0640009,SF3,P042001,,Socioeconomic Status,
-n_disabled,disabled,,dis,,SF3,P0680001+P0680002+P0680005+P0680006+P0680009+P0680010+P0680013+P0680014,SF3,P042004+P042007+P042014+P042021+P042024+P042028+P042031+P042038+P042045+P042048,,Socioeconomic Status,
-median_household_income,Median household income,,hinc,MDHHY,SF3,P080A001,SF3,P053001,B19013_001E,Socioeconomic Status,"in 2015 dollars, will need inflation adjustment for timeseries"
-n_total_households,total households in sample-based data,,hh,NUMHHS,SF3,P0050001,SF3,P010001,B19001_001E,Socioeconomic Status,
-median_income_whitehh,Median household income for whites,,hincw,,,,SF3,P152A001,B19013H_001E,Socioeconomic Status,"[ek] the 1990 table noted in the LTDB docs only has ranges, not median (e.g. P0820001)"
-n_white_households,total white households in sample-based data,,hhw,,SF3,P0080001,SF3,P146A001,B19001H_001E,Socioeconomic Status,
-median_income_blackhh,Median household income for blacks,,hincb,,,,SF3,P152B001,B19013B_001E,Socioeconomic Status,"[ek] the 1990 table noted in the LTDB docs only has ranges, not median (e.g. P0820001)"
-n_black_households,total black households in sample-based data,,hhb,,SF3,P0080002,SF3,P146B001,B19001B_001E,Socioeconomic Status,
-median_income_hispanichh,Median household income for Hispanics,,hinch,,,,SF3,P152H001,B19013I_001E,Socioeconomic Status,"[ek] the 1990 table noted in the LTDB docs only has ranges, not median (e.g. P0820001)"
-n_hispanic_households,total Hispanic households in sample-based data,,hhh,,SF3,P0210001:07,SF3,P146H001,B19001I_001E,Socioeconomic Status,"[ek] the 1990 value is calculated differently than the LTDB codebook, because the their reference (P0830001) doesnt include hispanic origin"
-median_income_asianhh,Median household income for Asians and Pacific Islanders,,hinca,,,,SF3,P152D001,,Socioeconomic Status,"[ek] the 1990 and 2010 tables noted in the LTDB docs only have ranges, not median (e.g. P0820001 for 1990 and B19001F_012E for 2010)"
-n_asian_households,total Asian/Pacific Islander households in sample-based data,,hha,,SF3,P0080004,SF3,P152D001+P152E001,B19001D_001E+B19001E_001E,Socioeconomic Status,"unclear how to calculate, since this is only provided as asian or as PI for 2000. Column recorded is asian+pacific islander"
-per_capita_income,Per capita income,,incpc,,SF3,P114A001,SF3,P082001,B19301_001E,Socioeconomic Status,
-n_poverty_determined_persons,persons for whom poverty status is determined,,dpov,DPOVRAT,SF3,P1170001:24,SF3,P087001,B17001_001E,Socioeconomic Status,denominator for calculating poverty rate
-n_poverty_persons,persons in poverty,,npov,NPOVRAT,SF3,P1170013:24,SF3,P087002,B17001_002E,Socioeconomic Status,numerator for calculating poverty rate
-n_poverty_over_65,persons 65 years and older in poverty,,n65pov,NELDPOO,SF3,P1170023+P1170024,SF3,P087008+P087009,B17001_015E+B17001_016E+B17001_029E+B17001_030E,Socioeconomic Status,
-n_poverty_determined_families,families for whom poverty status is determined,,dfmpov,,SF3,P1230001:24,SF3,P090001,B17001_001E,Socioeconomic Status,
-n_poverty_families_children,families with children in poverty,,nfmpov,,,P1230013:15+P1230017:19+P1230021:23,SF3,P090002,B17010_004E+B17010_011E+B17010_017E,Socioeconomic Status,
-n_poverty_determined_white,white persons for whom poverty status is determined,,dwpov,DWHTPR,SF3,P1190001:07+P1190036:42,SF3,P159A001,B17001A_001E,Socioeconomic Status,is this nonhispanic? Recorded white (regardless). White (not hispanic) is P159I
-n_poverty_white,whites in poverty,,nwpov,NWHTPR,SF3,P1190036:42,SF3,P159A002,B17001A_002E,Socioeconomic Status,
-n_poverty_determined_black,black persons for whom poverty status is determined,,dbpov,DBLKPR,SF3,P1190008:14+P1190043:49,SF3,P159B001,B17001B_001E,Socioeconomic Status,
-n_poverty_black,blacks in poverty,,nbpov,NBLKPR,SF3,P1190043:49,SF3,P159B002,B17001B_002E,Socioeconomic Status,
-n_poverty_determined_hispanic,Hispanics for whom poverty status is determined,,dhpov,DHISPR,,,SF3,P159H001,B17020I_001E,Socioeconomic Status,[ek] it's not clear to me how LTDB computed values from this variable https://api.census.gov/data/1990/sf3/variables/P1200001.json
-n_poverty_hispanic,Hispanics in poverty,,nhpov,NHISPR,,,SF3,P159H002,B17020I_002E,Socioeconomic Status,
-n_poverty_determined_native,Native American for whom poverty status is determined,,dnapov,DINDPR,SF3,P1190015:21+P1190050:56,SF3,P159C001,B17020C_001E,Socioeconomic Status,
-n_poverty_native,Native Americans in poverty,,nnapov,INDPR,SF3,P1190050:56,SF3,P159C002,B17020C_002E,Socioeconomic Status,
-n_poverty_determined_asian,Asians and Pacific Islanders for whom poverty status is determined,,dapov,DASNPR,SF3,P1190022:28+P1190058:63,SF3,P159D001+P159E001,B17020E_001E,Socioeconomic Status,"asian alone is D, hawaiian and pac islander is E"
-n_poverty_asian,Asians and Pacific Islanders in poverty,,napov,NASNPR,SF3,P1190058:63,SF3,P159D002+P159E002,B17020E_002E,Socioeconomic Status,
-n_edu_college_greater,persons with at least a four-year college degree,,col,EDUC16,SF3,P0570006+P0570007,SF3,P037015:18+P037032:35,B15002_015E+B15002_016E+B15002_017E+B15002_018E+B15002_032E+B15002_033E+B15002_034E+B15002_035E,Socioeconomic Status,
-n_edu_hs_less,persons with high school degree or less,,hs,EDUC12,SF3,P0570001+P0570002+P0570003,SF3,P037003:011+P037020:028,B15002_003E+B15002_004E+B15002_005E+B15002_006E+B15002_007E+B15002_008E+B15002_009E+B15002_010E+B15002_020E+B15002_021E+B15002_022E+B15002_023E+B15002_024E+B15002_025E+B15002_026E+B15002_027E,Socioeconomic Status,
-p_edu_hs_less,percentage of persons with high school degree or less,p_edu_hs_less=n_edu_hs_less / n_persons_over_25*100,phs,,,,,,,Socioeconomic Status,
-p_edu_college_greater,percentage of persons with at least a four-year college degree,p_edu_college_greater=n_edu_college_greater / n_persons_over_25*100,pcol,,,,,,,Socioeconomic Status,
-p_unemployment_rate,percent unemployed,p_unemployment_rate=n_unemployed_persons / n_labor_force*100,punemp,UNEMPRT,,,,,,Socioeconomic Status,
-p_female_labor_force,percentage of females in labor force,,pflabf,,,,,,,Socioeconomic Status,
-p_employed_professional,percentage of professional employees (by occupations),p_employed_professional=n_employed_professional / n_employed_over_16*100,pprof,,,,,,,Socioeconomic Status,
-p_employed_manufacturing,percentage of manufacturing employees (by industries),p_employed_manufacturing=n_employed_manufacturing / n_employed_over_16*100,pmanuf,,,,,,,Socioeconomic Status,
-p_employed_self_employed,percentage of self-employed,p_employed_self_employed=n_employed_self_employed / n_employed_over_16*100,psemp,,,,,,,Socioeconomic Status,
-p_veterans,percentage of veterans,p_veterans=n_veterans / n_total_pop*100,pvet,,,,,,,Socioeconomic Status,
-p_disabled,percent with disability,p_disabled=n_disabled / n_total_pop*100,pdis,,,,,,,Socioeconomic Status,
-p_poverty_rate,percent poor,p_poverty_rate=n_poverty_persons / n_poverty_determined_persons*100,ppov,POVRAT,,,,,,Socioeconomic Status,
-p_poverty_rate_over_65,percentage of 65 years and older in poverty,p_poverty_rate_over_65=n_poverty_over_65 / n_poverty_determined_persons*100,p65pov,ELDPOO,,,,,,Socioeconomic Status,
-p_poverty_rate_children,percentage of families with children in poverty,p_poverty_rate_children=n_poverty_families_children / n_poverty_determined_families*100,pfmpov,,,,,,,Socioeconomic Status,
-p_poverty_rate_white,percentage of whites in poverty,p_poverty_rate_white=n_poverty_white / n_poverty_determined_persons*100,pwpov,WHTPR,,,,,,Socioeconomic Status,
-p_poverty_rate_black,percentage of blacks in poverty,p_poverty_rate_black=n_poverty_black / n_poverty_determined_persons*100,pbpov,BLKPR,,,,,,Socioeconomic Status,
-p_poverty_rate_hispanic,percentage of Hispanics in poverty,p_poverty_rate_hispanic=n_poverty_hispanic / n_poverty_determined_persons*100,phpov,,,,,,,Socioeconomic Status,
-p_poverty_rate_native,percentage of Native Americans in poverty,p_poverty_rate_native=n_poverty_native / n_poverty_determined_persons*100,pnapov,,,,,,,Socioeconomic Status,
-p_poverty_rate_asian,percentage of Asian and Pacific Islanders in poverty,p_poverty_rate_asian=n_poverty_asian / n_poverty_determined_persons*100,papov,RASPR,,,,,,Socioeconomic Status,
-n_total_pop,total population,,pop,TRCTPOP,SF1,P0010001,SF1,P001001,B01003_001E,total population,
\ No newline at end of file
+,Unnamed: 0.1,Unnamed: 0,variable,label,formula,ltdb,ncdb,census_1990_form,census_1990_table_column,census_2000_form,census_2000_table_column,acs,census_2020_table,census_2020_notes,category,notes
+0,0,0,geoid,FIPS code,,geoid,GEO2010,,,,,,,,,
+1,1,1,n_mexican_pop,persons of Mexican parentage or ancestry,,mex,MEXIC,SF1,P0090001,SF1,PCT011004,B03001_004E,,,Ethnicity & Immigration,
+2,2,2,n_cuban_pop,persons of Cuban parentage or ancestry,,cuban,CUBAN,SF1,P0090004,SF1,PCT011006,B03001_006E,,,Ethnicity & Immigration,
+3,3,3,n_puerto_rican_pop,persons of Puerto Rican parentage or ancestry,,pr,PRICAN,SF1,P0090003,SF1,PCT011005,B03001_005E,,,Ethnicity & Immigration,
+4,4,4,n_russian_pop,persons of Russian/USSR parentage or ancestry,,ruanc,,SF3,P0330022,SF3,PCT016064+PCT016053+PCT016052+PCT016037,B04004_064E,,,Ethnicity & Immigration,ruancXX (page 17 of LTDB codebook) suggests that USSR is only selected for 1970. I gather you're aggregating soviet countries individually? 1990 doesn't seem to have USSR or several of its constituents
+5,5,5,n_italian_pop,persons of Italian parentage or ancestry,,itanc,,SF3,P0330016,SF3,PCT016051,B04004_051E,,,Ethnicity & Immigration,
+6,6,6,n_german_pop,persons of German parentage or ancestry,,geanc,,SF3,P0330012,SF3,PCT016042,B04004_042E,,,Ethnicity & Immigration,
+7,7,7,n_irish_pop,persons of Irish parentage or ancestry,,iranc,,SF3,P0330015,SF3,PCT016049,B04004_049E,,,Ethnicity & Immigration,
+8,8,8,n_scandaniavian_pop,persons of Scandinavian parentage/ancestry,,scanc,,,,SF3,PCT016059+PCT016039+PCT016033+PCT016090,B04004_065E,,,Ethnicity & Immigration,"scanXX (page 18 of LTDB codebook) suggests dedicated nationalities are used in 1990 and 2000. This is despite there being a Scandinavian category in 2000 SF3 (PCT016065); [ek]: similarly, the ACS lists both the Scandinavian category *and* the individual country nationalities?"
+9,9,9,n_total_pop_sample,total population from sample-based data,,dfb,,,,,,,,,Ethnicity & Immigration,LTDB suggests 1980 only
+10,10,10,n_foreign_born_pop,foreign-born,,fb,FORBORN,SF3,P0360001:10,SF3,P021013,B05002_013E,,,Ethnicity & Immigration,
+11,11,11,n_recent_immigrant_pop,recent immigrants (within the past 10 years),,n10imm,,SF3,P0360001:04,SF3,P023002,B05005_007E,,,Ethnicity & Immigration,
+12,12,12,n_naturalized_pop,naturalized foreign-born,,nat,FORBCZN,SF3,P0370005,SF3,P021014,B05002_014E,,,Ethnicity & Immigration,
+13,13,13,n_age_5_older,persons 5 years and over,,ag5up,,SF3,P0130004:31,SF3,P019001,B16001_001E,,,Ethnicity & Immigration,
+14,14,14,n_other_language,persons who speak language other than English at home,,olang,,SF3,P0310002:26,SF3,P019001 - (P019025+P019003+P019047),B16001_001E - B16001_002E,,,Ethnicity & Immigration,Construct census 2000 count by subtraction from P019001
+15,15,15,n_limited_english,persons who speak English not well,,lep,,SF3,P0280004+P0280007+P0280010+P0280014+P0280017+P0280020+P0280024+P0280027+P0280030,SF3,P019022+P019023+P019029+P019013+P019012+P019017+P019018+P019007+P019008+P019061+P019062+P019067+P019066+P019052+P019051+P019057+P019056+P019040+P019045+P019044+P019030+P019039+P019035+P019034,DP02_0113E,,,Ethnicity & Immigration,"[ljw] can't tell if this includes ""speak other Languages"" as a catchall or if that is a pre-crosstab"
+16,16,16,n_russian_born_pop,persons who were born in Russia/ USSR,,rufb,,,,SF3,PCT019026,B05006_040E,,,Ethnicity & Immigration,"[ek] this is STF4 in 1990, so not available from the API"
+17,17,17,n_italian_born_pop,persons who were born in Italy,,itfb,,,,SF3,PCT019016,B05006_023E,,,Ethnicity & Immigration,"[ek] this is STF4 in 1990, so not available from the API"
+18,18,18,n_german_born_pop,persons who were born in Germany,,gefb,,,,SF3,PCT019011,B05006_017E,,,Ethnicity & Immigration,"[ek] this is STF4 in 1990, so not available from the API"
+19,19,19,n_irish_born_pop,persons who were born in Ireland,,irfb,,,,SF3,PCT019005,B05006_008E,,,Ethnicity & Immigration,"[ek] this is STF4 in 1990, so not available from the API"
+20,20,20,n_scandaniavian_born_pop,persons who were born in Scandinavian Countries,,scfb,,,,SF3,PCT019006+PCT019007,B05006_009E+B05006_010E+B05006_011E+B05006_012E,,,Ethnicity & Immigration,"[ek] this is STF4 in 1990, so not available from the API"
+21,21,21,p_mexican_pop,percentage of persons of Mexican parentage or ancestry,p_mexican_pop=n_mexican_pop / n_total_pop*100,pmex,,,,,,,,,Ethnicity & Immigration,
+22,22,22,p_cuban_pop,percentage of persons of Cuban parentage or ancestry,p_cuban_pop=n_cuban_pop / n_total_pop*100,pcuban,,,,,,,,,Ethnicity & Immigration,
+23,23,23,p_puerto_rican_pop,percentage of persons of Puerto Rican parentage or ancestry,p_puerto_rican_pop=n_puerto_rican_pop / n_total_pop*100,ppr,,,,,,,,,Ethnicity & Immigration,
+24,24,24,p_russian_pop,percentage of persons of Russian/USSR parentage or ancestry,p_russian_pop=n_russian_pop / n_total_pop*100,pruanc,,,,,,,,,Ethnicity & Immigration,
+25,25,25,p_italian_pop,percentage of persons of Italian parentage or ancestry,p_italian_pop=n_italian_pop / n_total_pop*100,pitanc,,,,,,,,,Ethnicity & Immigration,
+26,26,26,p_german_pop,percentage of persons of German parentage or ancestry,p_german_pop=n_german_pop / n_total_pop*100,pgeanc,,,,,,,,,Ethnicity & Immigration,
+27,27,27,p_irish_pop,percentage of persons of Irish parentage or ancestry,p_irish_pop=n_irish_pop / n_total_pop*100,piranc,,,,,,,,,Ethnicity & Immigration,
+28,28,28,p_scandanavian_pop,percentage of persons of Scandinavian parentage/ancestry,p_scandanavian_pop=n_scandaniavian_pop / n_total_pop*100,pscanc,,,,,,,,,Ethnicity & Immigration,
+29,29,29,p_foreign_born_pop,percentage of foreign-born,p_foreign_born_pop=n_foreign_born_pop / n_total_pop*100,pfb,SHRFOR,,,,,,,,Ethnicity & Immigration,
+30,30,30,p_recent_immigrant_pop,percentage of recent immigrants (within the past 10 years),p_recent_immigrant_pop=n_recent_immigrant_pop / n_total_pop*100,p10imm,,,,,,,,,Ethnicity & Immigration,
+31,31,31,p_naturalized_pop,percentage of naturalized foreign-born,p_naturalized_pop=n_naturalized_pop / n_total_pop*100,pnat,,,,,,,,,Ethnicity & Immigration,
+32,32,32,p_other_language,percentage of persons who speak language other than English at home,p_other_language=n_other_language / n_total_pop*100,polang,,,,,,,,,Ethnicity & Immigration,
+33,33,33,p_limited_english,percentage of persons who speak English not well,p_limited_english=n_limited_english / n_total_pop*100,plep,,,,,,,,,Ethnicity & Immigration,
+34,34,34,p_russian_born_pop,percentage of persons who were born in Russia/ USSR,p_russian_born_pop=n_russian_born_pop / n_total_pop*100,prufb,,,,,,,,,Ethnicity & Immigration,
+35,35,35,p_italian_born_pop,percentage of persons who were born in Italy,p_italian_born_pop=n_italian_born_pop / n_total_pop*100,pitfb,,,,,,,,,Ethnicity & Immigration,
+36,36,36,p_german_born_pop,percentage of persons who were born in Germany,p_german_born_pop=n_german_born_pop / n_total_pop*100,pgefb,,,,,,,,,Ethnicity & Immigration,
+37,37,37,p_irish_born_pop,percentage of persons who were born in Ireland,p_irish_born_pop=n_irish_born_pop / n_total_pop*100,pirfb,,,,,,,,,Ethnicity & Immigration,
+38,38,38,p_scandanavian_born_pop,percentage of persons who were born in Scandinavian Countries,p_scandanavian_born_pop=n_scandaniavian_born_pop / n_total_pop*100,pscfb,,,,,,,,,Ethnicity & Immigration,
+39,39,39,n_total_housing_units,housing units,,hu,TOTHSUN,SF1,H0010001,SF1,H001001,B25002_001E,,,"Housing, Age, & Marital Status",
+40,40,40,n_vacant_housing_units,vacant housing units,,vac,VACHU,SF1,H0020002,SF1,H003003,B25002_003E,,,"Housing, Age, & Marital Status",divide by B25002_001E for vacancy rate
+41,41,41,n_occupied_housing_units,occupied housing units,,ohu,OCCHU,SF1,H0020001,SF1,H003002,B25002_002E,,,"Housing, Age, & Marital Status",
+42,42,42,n_owner_occupied_housing_units,owner-occupied housing units,,own,OWNOCC,SF1,H0030001,SF1,H004002,B25003_002E,,,"Housing, Age, & Marital Status",
+43,43,43,n_renter_occupied_housing_units,renter-occupied housing units,,rent,RNTOCC,SF1,H0030002,SF1,H004003,B25003_003E,,,"Housing, Age, & Marital Status",
+44,44,44,n_housing_units_multiunit_structures_denom,housing units denom,n_housing_units_multiunit_structures_denom=n_total_housing_units,dmulti,,,,,,B25024_001E,,,"Housing, Age, & Marital Status",
+45,45,45,n_housing_units_multiunit_structures,housing units in multi-unit structures,,multi,,,,SF3,H030004+H030005+H030006+H030007+H030008+H030009,B25024_004E+B25024_005E+B25024_006E+B25024_007E+B25024_008E+B25024_009E,,,"Housing, Age, & Marital Status",[ljw] LTDB is unclear as to the relevant computed column from SF3-H030*. Recorded columns here are all stationary housing units (not mobile home (H030010) or RV/Van/Boat (H030011)
+46,46,46,n_total_housing_units_sample,housing units in sample-based data,n_total_housing_units_sample=n_total_housing_units,husp,,,,,,B25024_001E,,,"Housing, Age, & Marital Status",
+47,47,47,median_home_value,Median home value,,mhmval,MDVALHS,SF3,H061A001,SF3,H085001,B25077_001E,,,"Housing, Age, & Marital Status",
+48,48,48,median_contract_rent,Median monthly contract rent,,mrent,MDGRENT,SF3,H043A001,SF3,H056001,B25058_001E,,,"Housing, Age, & Marital Status",
+49,49,49,n_structures_30_old,structures built more than 30 years ago,,h30old,,SF3,H0250005+H0250006+H0250007+H0250008,SF3,H034010+H034009+H034008+H034007,,,,"Housing, Age, & Marital Status",
+50,50,50,n_occupied_housing_units_sample,occupied housing units in sample-based data,,ohusp,,SF3,H0040001,SF3,H006001,B25003_001E,,,"Housing, Age, & Marital Status",
+51,51,51,n_household_recent_move,household heads moved into unit less than 10 years ago,,h10yrs,,SF3,H0250006+H0250007+H0250008,SF3,H038003+H038004+H038005+H038010+H038011+H038012,,,,"Housing, Age, & Marital Status",
+52,52,52,n_persons_under_18,persons age 17 years and under,,a18und,NCHILD,SF3,P0130012+P0130011+P0130010+P0130009+P0130008+P0130007+P0130006+P0130005+P0130004+P0130003+P0130002+P0130001,SF1,P012003+P012004+P012005+P012006+P012027+P012028+P012029+P012030,B01001_003E+B01001_004E+B01001_005E+B01001_006E+B01001_027E+B01001_028E+B01001_029E+B01001_030E,,,"Housing, Age, & Marital Status",
+53,53,53,n_persons_over_60,persons age 60 years and over,,a60up,,SF3,P0130025+P0130026+P0130027+P0130028+P0130029+P0130030+P0130031,SF1,P012018:025+ P012042:049,B01001_018E+B01001_019E+B01001_020E+B01001_021E+B01001_022E+B01001_023E+B01001_024E+B01001_025E+B01001_042E+B01001_043E+B01001_044E+B01001_045E+B01001_046E+B01001_047E+B01001_048E+B01001_049E,,,"Housing, Age, & Marital Status",
+54,54,54,n_persons_over_75,persons age 75 years and over,,a75up,,SF3,P0130029+P0130030+P0130031,SF1,P012023:25+ P012047:49,B01001_047E+B01001_048E+B01001_049E+B01001_023E+B01001_024E+B01001_025E,,,"Housing, Age, & Marital Status",
+55,55,55,n_persons_over_15,population 15 years and over,,ag15up,PERS15P,SF3,P0130010+P0130011+P0130012+P0130013+P0130014+P0130015+P0130016+P0130017+P0130018+P0130019+P0130020+P0130021+P0130022+P0130023+P0130024+P0130025+P0130026+P0130027+P0130028+P0130029+P0130030+P0130031,SF3,P018001,B12001_001E,,,"Housing, Age, & Marital Status",
+56,56,56,n_persons_over_25,population 25 years and over,,ag25up,,SF3,P0130018+P0130019+P0130020+P0130021+P0130022+P0130023+P0130024+P0130025+P0130026+P0130027+P0130028+P0130029+P0130030+P0130031,SF3,P037001,B15002_001E,,,"Housing, Age, & Marital Status",denominator for educational attainment
+57,57,57,n_married,currently married (excluding separated),,mar,MMARSPP,SF3,P0250001+P0250002,SF3,(P018004+P018013) - (P018007+P018016),B12001_005E,,,"Housing, Age, & Marital Status",
+58,58,58,n_widowed_divorced,"widowed, divorced, and separated",,wds,,SF3,P0270005+P0270011+P0270006+P0270012,SF3,P018007+P018009+P018010+P018016+P018018+P018019,B12001_007E+B12001_009E+B12001_010E+B12001_016E+B12001_018E+B12001_019E,,,"Housing, Age, & Marital Status",
+59,59,59,n_total_families,total families,,family,FAMSUB,SF3,P0040001,SF1,P031001,B17010_001E,,,"Housing, Age, & Marital Status",denominator for calculating % female-headed families w/ children
+60,60,60,n_female_headed_families,female-headed families with children,,fhh,NFFH,SF3,P0190005,SF1,P035016,B17010_017E,,,"Housing, Age, & Marital Status",numerator for calculating % female-headed families w/ children
+61,61,61,p_vacant_housing_units,percentage of vacant housing units,p_vacant_housing_units=n_vacant_housing_units / n_total_housing_units*100,pvac,,,,,,,,,"Housing, Age, & Marital Status",
+62,62,62,p_owner_occupied_units,percentage of owner-occupied housing units,p_owner_occupied_units=n_occupied_housing_units / n_total_housing_units*100,pown,,,,,,,,,"Housing, Age, & Marital Status",
+63,63,63,p_housing_units_multiunit_structures,percentage of housing units in multi-unit structures,p_housing_units_multiunit_structures=n_housing_units_multiunit_structures / n_housing_units_multiunit_structures_denom*100,pmulti,,,,,,,,,"Housing, Age, & Marital Status",
+64,64,64,p_structures_30_old,percentage of structures built more than 30 years ago,p_structures_30_old=n_structures_30_old / n_housing_units_multiunit_structures_denom*100,p30old,,,,,,,,,"Housing, Age, & Marital Status",
+65,65,65,p_household_recent_move,percentage of household heads moved into unit less than 10 years ago,p_household_recent_move=n_household_recent_move / n_total_households*100,p10yrs,,,,,,,,,"Housing, Age, & Marital Status",
+66,66,66,p_persons_under_18,percentage of persons age 17 years and under,p_persons_under_18=n_persons_under_18 / n_total_pop*100,p18und,,,,,,,,,"Housing, Age, & Marital Status",
+67,67,67,p_persons_over_60,percentage of persons age 60 years and over,p_persons_over_60=n_persons_over_60 / n_total_pop*100,p60up,,,,,,,,,"Housing, Age, & Marital Status",
+68,68,68,p_persons_over_75,percentage of persons age 75 years and over,p_persons_over_75=n_persons_over_75 / n_total_pop*100,p75up,,,,,,,,,"Housing, Age, & Marital Status",
+69,69,69,p_married,percent currently married (excluding separated),p_married=n_married / n_persons_over_15*100,pmar,,,,,,,,,"Housing, Age, & Marital Status",
+70,70,70,p_widowed_divorced,"percent widowed, divorced, and separated",p_widowed_divorced=n_widowed_divorced / n_persons_over_15*100,pwds,,,,,,,,,"Housing, Age, & Marital Status",should denom be families or individuals?
+71,71,71,p_female_headed_families,percentage of female-headed families with children,p_female_headed_families=n_female_headed_families / n_total_families*100,pfhh,,,,,,,,,"Housing, Age, & Marital Status",should denom be families or individuals?
+72,72,72,n_white_persons,persons of white race,,white,NSHRWHT,SF1,P0100001+P0100006,SF1,P003003,,,,Race & Age,"[ljw] inferring here, documentation suggests this is only available for 1970, but white alone (regardless of hispanic) gives this count? [ek] I think the question about hispanic ethnicity was added in 1980, so this is the best they can do to measure the 'white alone' construct in 1970 "
+73,73,73,n_nonhisp_white_persons,"persons of white race, not Hispanic origin",,nhwht,NSHRNHW,SF1,P0100001,SF1,P004005,B03002_003E,,,Race & Age,"[ljw] I read this as P004005 (not hispanic white alone), not P003003 (white alone)"
+74,74,74,n_black_persons,persons of black race,,black,NSHRBLK,SF1,P0100007+P0100002,SF1,P003004,,,,Race & Age,
+75,75,75,n_nonhisp_black_persons,"persons of black race, not Hispanic origin",,nhblk,NSHRNHB,SF1,P0100002,SF1,P004006,B03002_004E,,,Race & Age,
+76,76,76,n_hispanic_persons,persons of Hispanic origin,,hisp,NSHRHSP,SF1,P0080001,SF1,P004002,B03002_012E,,,Race & Age,
+77,77,77,n_native_persons,"persons of Native American race, not Hispanic origin",,ntv,NSHRAMI,SF1,P0070003,SF1,P004007,B03002_005E,,,Race & Age,is this nonhispanic?
+78,78,78,n_hawaiian_persons,"persons of Hawaiian race, not Hispanic origin",,haw,NSHRHIP,SF1,P0070017,SF1,P004009,B02001_006E,,,Race & Age,is this nonhispanic?
+79,79,79,n_asian_indian_persons,persons of Asian Indian race,,india,,SF1,P0070009,SF1,PCT007002,B03002_006E+B03002_007E,,,Race & Age,
+80,80,80,n_chinese_persons,persons of Chinese race,,china,,SF1,P0070006,SF1,PCT007005+PCT007015,B02015_007E+B02015_020E,,,Race & Age,
+81,81,81,n_filipino_persons,persons of Filipino race,,filip,,SF1,P0070007,SF1,PCT007006,B02015_008E,,,Race & Age,
+82,82,82,n_japanese_persons,persons of Japanese race,,japan,,SF1,P0070008,SF1,PCT007009,B02015_011E,,,Race & Age,
+83,83,83,n_korean_persons,persons of Korean race,,korea,,SF1,P0070010,SF1,PCT007010,B02015_012E,,,Race & Age,
+84,84,84,n_asian_persons,persons of Asian race,,asian,,SF1,P0060004,SF1,P004008,B03002_006E+B03002_007E,,,Race & Age,for 1990 this is Asian and PI
+85,85,85,n_vietnamese_persons,persons of Vietnamese race,,viet,,SF1,P0070011,SF1,PCT007017,B02018_022E,,,Race & Age,
+86,86,86,n_white_age_distribution,white population with known age distribution,,agewht,,SF1,P0120001:62,SF1,PCT012I001,B01001H_001E,,,Race & Age,
+87,87,87,n_white_under_15,0-15 years old of white race,,a15wht,,SF1,P0120001:09+P0120032:40,SF1,PCT012I003:018+PCT012I108:122,B01001H_003E+B01001H_004E+B01001H_005E+B01001H_018E+B01001H_019E+B01001H_020E,,,Race & Age,
+88,88,88,n_white_over_60,60 years and older of white race,,a60wht,,SF1,P0120025:31+P0120056:62,SF1,PCT012I063:105+PCT012I167:209,,,,Race & Age,is this nonhispanic?
+89,89,89,n_white_over_65,65 years and older of non-Hispanic whites,,a65wht,,SF1,P0120027:31+P0120058:62,SF1,PCT012I068:105+PCT012I172:209,B01001H_014E+B01001H_015E+B01001H_016E+B01001H_029E+B01001H_030E+B01001H_031E,,,Race & Age,
+90,90,90,n_black_age_distribution,black population with known age distribution,,ageblk,,SF1,P0120063:0124,SF1,PCT012J001,B01001B_001E,,,Race & Age,
+91,91,91,n_black_under_15,0-15 years old of black race,,a15blk,,SF1,P0120063:71+P0120094:102,SF1,PCT012J003:018+PCT012J108:122,B01001B_003E+B01001B_004E+B01001B_005E+B01001B_018E+B01001B_019E+B01001B_020E,,,Race & Age,
+92,92,92,n_black_over_60,60 years and older of black race,,a60blk,,SF1,P0120087:93+P0120118:124,SF1,PCT012J063:105+PCT012J167:209,,,,Race & Age,
+93,93,93,n_black_over_65,65 years and older of black race,,a65blk,,SF1,P0120089:93+P0120120:124,SF1,PCT012J068:105+PCT012J172:209,B01001B_014E+B01001B_015E+B01001B_016E+B01001B_029E+B01001B_030E+B01001B_031E,,,Race & Age,
+94,94,94,n_hispanic_age_distribution,Hispanic population with known age distribution,,agehsp,,SF1,P0130001:62,SF1,PCT012H001,B01001I_001E,,,Race & Age,
+95,95,95,n_hispanic_under_15,"0-15 years old, persons of Hispanic origins",,a15hsp,,SF1,P0130001:09+P0130032:40,SF1,PCT012H003:018+PCT012H108:122,B01001I_003E+B01001I_004E+B01001I_005E+B01001I_018E+B01001I_019E+B01001I_020E,,,Race & Age,
+96,96,96,n_hispanic_over_60,"60 years and older, persons of Hispanic origins",,a60hsp,,SF1,P0130025:31+P0130056:62,SF1,PCT012H063:105+PCT012H167:209,,,,Race & Age,
+97,97,97,n_hispanic_over_65,"65 years and older, persons of Hispanic origins",,a65hsp,,SF1,P0130027:31+P0130058:62,SF1,PCT012H068:105+PCT012H172:209,B01001I_014E+B01001I_015E+B01001I_016E+B01001I_029E+B01001I_030E+B01001I_031E,,,Race & Age,
+98,98,98,n_native_age_distribution,Native American population with known age distribution,,agentv,,SF1,P0120125:186,SF1,PCT012K001,B01001C_001E,,,Race & Age,
+99,99,99,n_native_under_15,0-15 years old of Native American race,,a15ntv,,SF1,P0120125:133 +P0120156:164,SF1,PCT012K003:018+PCT012K108:122,B01001C_003E+B01001C_004E+B01001C_005E+B01001C_018E+B01001C_019E+B01001C_020E,,,Race & Age,
+100,100,100,n_native_over_60,60 years and older of Native American race,,a60ntv,,SF1,P0120149:155+P0120180:186,SF1,PCT012K063:105+PCT012K167:209,,,,Race & Age,
+101,101,101,n_native_over_65,65 years and older of Native American race,,a65ntv,,SF1,P0120151:155+P0120182:186,SF1,PCT012K068:105+PCT012K172:209,B01001C_014E+B01001C_015E+B01001C_016E+B01001C_029E+B01001C_030E+B01001C_031E,,,Race & Age,
+102,102,102,n_asian_age_distribution,Asian and Pacific Islander population with known age distribution,,ageasn,,SF1,P0120187:248,SF1,PCT012L001+PCT012M001,B01001D_001E+B01001E_001E,,,Race & Age,
+103,103,103,n_asian_under_15,0-15 years old of Asians and Pacific Islanders,,a15asn,,SF1,P0120187:195+P0120218:226,SF1,PCT012M003:018+PCT012M108:122+PCT012L003:018+PCT012L108:122,B01001D_003E+B01001D_004E+B01001D_005E+B01001D_018E+B01001D_019E+B01001D_020E+B01001E_003E+B01001E_004E+B01001E_005E+B01001E_018E+B01001E_019E+B01001E_020E,,,Race & Age,
+104,104,104,n_asian_over_60,60 years and older of Asians and Pacific Islanders,,a60asn,,,,SF1,PCT012M063:105+PCT012M167:209,,,,Race & Age,
+105,105,105,n_asian_over_65,65 years and older of Asians and Pacific Islanders,,a65asn,,,,SF1,PCT012M068:105+PCT012M172:209+PCT012L068:105+PCT012L172:209,B01001D_014E+B01001D_015E+B01001D_016E+B01001E_014E+B01001E_015E+B01001E_016E+B01001E_029E+B01001E_030E+B01001E_031E+B01001D_029E+B01001D_030E+B01001D_031E,,,Race & Age,
+106,106,106,p_white_persons,percentage of persons of white race,,pwhite,,,,,,,,,Race & Age,
+107,107,107,p_black_persons,percentage of persons of black race,,pblack,,,,,,,,,Race & Age,
+108,108,108,p_nonhisp_white_persons,"percentage of persons of white race, not Hispanic origin",p_nonhisp_white_persons=n_nonhisp_white_persons / n_total_pop*100,pnhwht,SHRNHW,,,,,,,,Race & Age,
+109,109,109,p_nonhisp_black_persons,"percentage of persons of black race, not Hispanic origin",p_nonhisp_black_persons=n_nonhisp_black_persons / n_total_pop*100,pnhblk,SHRNHB,,,,,,,,Race & Age,
+110,110,110,p_hispanic_persons,percentage of persons of Hispanic origin,p_hispanic_persons=n_hispanic_persons / n_total_pop*100,phisp,SHRHSP,,,,,,,,Race & Age,
+111,111,111,p_native_persons,percentage of persons of Native American race,p_native_persons=n_native_persons / n_total_pop*100,pntv,SHRNHI,,,,,,,,Race & Age,
+112,112,112,p_asian_persons,percentage of persons of Asian race (and Pacific Islander),p_asian_persons=n_asian_persons / n_total_pop*100,pasian,SHRNHR,,,,,,,,Race & Age,
+113,113,113,p_hawaiian_persons,percentage of persons of Hawaiian race,p_hawaiian_persons=n_hawaiian_persons / n_total_pop*100,phaw,SHRNHH,,,,,,,,Race & Age,
+114,114,114,p_asian_indian_persons,percentage of persons of Asian Indian race,p_asian_indian_persons=n_asian_indian_persons / n_total_pop*100,pindia,,,,,,,,,Race & Age,
+115,115,115,p_chinese_persons,percentage of persons of Chinese race,p_chinese_persons=n_chinese_persons / n_total_pop*100,pchina,,,,,,,,,Race & Age,
+116,116,116,p_filipino_persons,percentage of persons of Filipino race,p_filipino_persons=n_filipino_persons / n_total_pop*100,pfilip,,,,,,,,,Race & Age,
+117,117,117,p_japanese_persons,percentage of persons of Japanese race,p_japanese_persons=n_japanese_persons / n_total_pop*100,pjapan,,,,,,,,,Race & Age,
+118,118,118,p_korean_persons,percentage of persons of Korean race,p_korean_persons=n_korean_persons / n_total_pop*100,pkorea,,,,,,,,,Race & Age,
+119,119,119,p_vietnamese_persons,percentage of persons of Vietnamese race,p_vietnamese_persons=n_vietnamese_persons / n_total_pop*100,pviet,,,,,,,,,Race & Age,
+120,120,120,p_white_under_15,percentage of 0-15 years old of white race,p_white_under_15=n_white_under_15 / n_total_pop*100,p15wht,,,,,,,,,Race & Age,
+121,121,121,p_white_over_60,percentage of 60 years and older of white race,p_white_over_60=n_white_over_60 / n_total_pop*100,p60wht,,,,,,,,,Race & Age,
+122,122,122,p_white_over_65,percentage of 65 years and older of non-Hispanic whites,p_white_over_65=n_white_over_65 / n_total_pop*100,p65wht,,,,,,,,,Race & Age,
+123,123,123,p_black_under_15,percentage of 0-15 years old of black race,p_black_under_15=n_black_under_15 / n_total_pop*100,p15blk,,,,,,,,,Race & Age,
+124,124,124,p_black_over_60,percentage of 60 years and older of black race,p_black_over_60=n_black_over_60 / n_total_pop*100,p60blk,,,,,,,,,Race & Age,
+125,125,125,p_black_over_65,percentage of 65 years and older of black race,p_black_over_65=n_black_over_65 / n_total_pop*100,p65blk,,,,,,,,,Race & Age,
+126,126,126,p_hispanic_under_15,"percentage of 0-15 years old, persons of Hispanic origins",p_hispanic_under_15=n_hispanic_under_15 / n_total_pop*100,p15hsp,,,,,,,,,Race & Age,
+127,127,127,p_hispanic_over_60,"percentage of 60 years and older, persons of Hispanic origins",p_hispanic_over_60=n_hispanic_over_60 / n_total_pop*100,p60hsp,,,,,,,,,Race & Age,
+128,128,128,p_hispanic_over_65,"percentage of 65 years and older, persons of Hispanic origins",p_hispanic_over_65=n_hispanic_over_65 / n_total_pop*100,p65hsp,,,,,,,,,Race & Age,
+129,129,129,p_native_under_15,percentage of 0-15 years old of Native American race,p_native_under_15=n_native_under_15 / n_total_pop*100,p15ntv,,,,,,,,,Race & Age,
+130,130,130,p_native_over_60,percentage of 60 years and older of Native American race,p_native_over_60=n_native_over_60 / n_total_pop*100,p60ntv,,,,,,,,,Race & Age,
+131,131,131,p_native_over_65,percentage of 65 years and older of Native American race,p_native_over_65=n_native_over_65 / n_total_pop*100,p65ntv,,,,,,,,,Race & Age,
+132,132,132,p_asian_under_15,percentage of 0-15 years old of Asians and Pacific Islanders,p_asian_under_15=n_asian_under_15 / n_total_pop*100,p15asn,,,,,,,,,Race & Age,
+133,133,133,p_asian_over_60,percentage of 60 years and older of Asians and Pacific Islanders,p_asian_over_60=n_asian_over_60 / n_total_pop*100,p60asn,,,,,,,,,Race & Age,
+134,134,134,p_asian_over_65,percentage of 65 years and older of Asians and Pacific Islanders,p_asian_over_65=n_asian_over_65 / n_total_pop*100,p65asn,,,,,,,,,Race & Age,
+135,135,135,n_female_over_16,"females 16 years and over, except in armed forces",,dflabf,DCFEPR,SF3,P0700006+P0700007+P0700008,SF3,P043012,,,,Socioeconomic Status,
+136,136,136,n_female_labor_force,females in labor force,,flabf,FEPR,SF3,P0700006+P0700007,SF3,P043010,,,,Socioeconomic Status,
+137,137,137,n_labor_force,civilian labor force,,clf,,SF3,P0700002+P0700003+P0700006+P0700007,SF3,P043005+P043012,B27011_002E,,,Socioeconomic Status,
+138,138,138,n_unemployed_persons,unemployed persons,,unemp,,SF3,P0700003+P0700007,SF3,P043007+P043014,B23001_008E+B23001_015E+B23001_022E+B23001_029E+B23001_036E+B23001_044E+B23001_050E+B23001_057E+B23001_064E+B23001_071E+B23001_094E+B23001_101E+B23001_108E+B23001_115E+B23001_122E+B23001_129E+B23001_136E+B23001_143E+B23001_150E+B23001_157E,,,Socioeconomic Status,
+139,139,139,n_employed_over_16,employed persons 16 years and over,,empclf,EMPMT,SF3,P0700002+P0700006,SF3,P049001,B23001_007E+B23001_014E+B23001_021E+B23001_028E+B23001_035E+B23001_042E+B23001_049E+B23001_049E+B23001_056E+B23001_063E+B23001_070E+B23001_093E+B23001_100E+B23001_107E+B23001_114E+B23001_121E+B23001_128E+B23001_135E+B23001_142E+B23001_149E+B23001_156E,,,Socioeconomic Status,
+140,140,140,n_employed_professional,professional employees (by occupations),,prof,DLFRAT,SF3,P0780001+P0780002,SF3,P049017+P049044,,,,Socioeconomic Status,
+141,141,141,n_employed_manufacturing,manufacturing employees (by industries),,manuf,PRFEMP,SF3,P0770004+P0770005,SF3,P049007+P049034,,,,Socioeconomic Status,
+142,142,142,n_employed_self_employed,self-employed,,semp,,SF3,P0790006,SF3,P051012+P051023+P051033+P051044+P051055+P051065,,,,Socioeconomic Status,
+143,143,143,n_civilians_over_16,civilian population 16 years and over,,ag16cv,,SF3,P0640002+P0640003+P0640005+P0640006+P0640008+P0640009 +P0640011+P0640012,SF3,P043005+P043012,C24010_001E,,,Socioeconomic Status,
+144,144,144,n_civilians_over_18,civilian population 18 years and over,,ag18cv,,,,SF3,P039005+P039010+P039016+P039021,,,,Socioeconomic Status,
+145,145,145,n_veterans,veterans,,vet,,SF3,P0640002+P0640005+P0640008+P0640011,SF3,P039006+P039011+P039017+P039022,B21001_002E,,,Socioeconomic Status,
+146,146,146,n_civilians_16_64,civilian non-institutionalized persons 16-64 years old,,cni16u,,SF3,P0640002+P0640003+P0640008+P0640009,SF3,P042001,,,,Socioeconomic Status,
+147,147,147,n_disabled,disabled,,dis,,SF3,P0680001+P0680002+P0680005+P0680006+P0680009+P0680010+P0680013+P0680014,SF3,P042004+P042007+P042014+P042021+P042024+P042028+P042031+P042038+P042045+P042048,,,,Socioeconomic Status,
+148,148,148,median_household_income,Median household income,,hinc,MDHHY,SF3,P080A001,SF3,P053001,B19013_001E,,,Socioeconomic Status,"in 2015 dollars, will need inflation adjustment for timeseries"
+149,149,149,n_total_households,total households in sample-based data,,hh,NUMHHS,SF3,P0050001,SF3,P010001,B19001_001E,,,Socioeconomic Status,
+150,150,150,median_income_whitehh,Median household income for whites,,hincw,,,,SF3,P152A001,B19013H_001E,,,Socioeconomic Status,"[ek] the 1990 table noted in the LTDB docs only has ranges, not median (e.g. P0820001)"
+151,151,151,n_white_households,total white households in sample-based data,,hhw,,SF3,P0080001,SF3,P146A001,B19001H_001E,,,Socioeconomic Status,
+152,152,152,median_income_blackhh,Median household income for blacks,,hincb,,,,SF3,P152B001,B19013B_001E,,,Socioeconomic Status,"[ek] the 1990 table noted in the LTDB docs only has ranges, not median (e.g. P0820001)"
+153,153,153,n_black_households,total black households in sample-based data,,hhb,,SF3,P0080002,SF3,P146B001,B19001B_001E,,,Socioeconomic Status,
+154,154,154,median_income_hispanichh,Median household income for Hispanics,,hinch,,,,SF3,P152H001,B19013I_001E,,,Socioeconomic Status,"[ek] the 1990 table noted in the LTDB docs only has ranges, not median (e.g. P0820001)"
+155,155,155,n_hispanic_households,total Hispanic households in sample-based data,,hhh,,SF3,P0210001:07,SF3,P146H001,B19001I_001E,,,Socioeconomic Status,"[ek] the 1990 value is calculated differently than the LTDB codebook, because their reference (P0830001) doesn't include Hispanic origin"
+156,156,156,median_income_asianhh,Median household income for Asians and Pacific Islanders,,hinca,,,,SF3,P152D001,,,,Socioeconomic Status,"[ek] the 1990 and 2010 tables noted in the LTDB docs only have ranges, not median (e.g. P0820001 for 1990 and B19001F_012E for 2010)"
+157,157,157,n_asian_households,total Asian/Pacific Islander households in sample-based data,,hha,,SF3,P0080004,SF3,P152D001+P152E001,B19001D_001E+B19001E_001E,,,Socioeconomic Status,"unclear how to calculate, since this is only provided as asian or as PI for 2000. Column recorded is asian+pacific islander"
+158,158,158,per_capita_income,Per capita income,,incpc,,SF3,P114A001,SF3,P082001,B19301_001E,,,Socioeconomic Status,
+159,159,159,n_poverty_determined_persons,persons for whom poverty status is determined,,dpov,DPOVRAT,SF3,P1170001:24,SF3,P087001,B17001_001E,,,Socioeconomic Status,denominator for calculating poverty rate
+160,160,160,n_poverty_persons,persons in poverty,,npov,NPOVRAT,SF3,P1170013:24,SF3,P087002,B17001_002E,,,Socioeconomic Status,numerator for calculating poverty rate
+161,161,161,n_poverty_over_65,persons 65 years and older in poverty,,n65pov,NELDPOO,SF3,P1170023+P1170024,SF3,P087008+P087009,B17001_015E+B17001_016E+B17001_029E+B17001_030E,,,Socioeconomic Status,
+162,162,162,n_poverty_determined_families,families for whom poverty status is determined,,dfmpov,,SF3,P1230001:24,SF3,P090001,B17001_001E,,,Socioeconomic Status,
+163,163,163,n_poverty_families_children,families with children in poverty,,nfmpov,,,P1230013:15+P1230017:19+P1230021:23,SF3,P090002,B17010_004E+B17010_011E+B17010_017E,,,Socioeconomic Status,
+164,164,164,n_poverty_determined_white,white persons for whom poverty status is determined,,dwpov,DWHTPR,SF3,P1190001:07+P1190036:42,SF3,P159A001,B17001A_001E,,,Socioeconomic Status,is this nonhispanic? Recorded white (regardless). White (not hispanic) is P159I
+165,165,165,n_poverty_white,whites in poverty,,nwpov,NWHTPR,SF3,P1190036:42,SF3,P159A002,B17001A_002E,,,Socioeconomic Status,
+166,166,166,n_poverty_determined_black,black persons for whom poverty status is determined,,dbpov,DBLKPR,SF3,P1190008:14+P1190043:49,SF3,P159B001,B17001B_001E,,,Socioeconomic Status,
+167,167,167,n_poverty_black,blacks in poverty,,nbpov,NBLKPR,SF3,P1190043:49,SF3,P159B002,B17001B_002E,,,Socioeconomic Status,
+168,168,168,n_poverty_determined_hispanic,Hispanics for whom poverty status is determined,,dhpov,DHISPR,,,SF3,P159H001,B17020I_001E,,,Socioeconomic Status,[ek] it's not clear to me how LTDB computed values from this variable https://api.census.gov/data/1990/sf3/variables/P1200001.json
+169,169,169,n_poverty_hispanic,Hispanics in poverty,,nhpov,NHISPR,,,SF3,P159H002,B17020I_002E,,,Socioeconomic Status,
+170,170,170,n_poverty_determined_native,Native American for whom poverty status is determined,,dnapov,DINDPR,SF3,P1190015:21+P1190050:56,SF3,P159C001,B17020C_001E,,,Socioeconomic Status,
+171,171,171,n_poverty_native,Native Americans in poverty,,nnapov,INDPR,SF3,P1190050:56,SF3,P159C002,B17020C_002E,,,Socioeconomic Status,
+172,172,172,n_poverty_determined_asian,Asians and Pacific Islanders for whom poverty status is determined,,dapov,DASNPR,SF3,P1190022:28+P1190058:63,SF3,P159D001+P159E001,B17020E_001E,,,Socioeconomic Status,"asian alone is D, hawaiian and pac islander is E"
+173,173,173,n_poverty_asian,Asians and Pacific Islanders in poverty,,napov,NASNPR,SF3,P1190058:63,SF3,P159D002+P159E002,B17020E_002E,,,Socioeconomic Status,
+174,174,174,n_edu_college_greater,persons with at least a four-year college degree,,col,EDUC16,SF3,P0570006+P0570007,SF3,P037015:18+P037032:35,B15002_015E+B15002_016E+B15002_017E+B15002_018E+B15002_032E+B15002_033E+B15002_034E+B15002_035E,,,Socioeconomic Status,
+175,175,175,n_edu_hs_less,persons with high school degree or less,,hs,EDUC12,SF3,P0570001+P0570002+P0570003,SF3,P037003:011+P037020:028,B15002_003E+B15002_004E+B15002_005E+B15002_006E+B15002_007E+B15002_008E+B15002_009E+B15002_010E+B15002_020E+B15002_021E+B15002_022E+B15002_023E+B15002_024E+B15002_025E+B15002_026E+B15002_027E,,,Socioeconomic Status,
+176,176,176,p_edu_hs_less,percentage of persons with high school degree or less,p_edu_hs_less=n_edu_hs_less / n_persons_over_25*100,phs,,,,,,,,,Socioeconomic Status,
+177,177,177,p_edu_college_greater,percentage of persons with at least a four-year college degree,p_edu_college_greater=n_edu_college_greater / n_persons_over_25*100,pcol,,,,,,,,,Socioeconomic Status,
+178,178,178,p_unemployment_rate,percent unemployed,p_unemployment_rate=n_unemployed_persons / n_labor_force*100,punemp,UNEMPRT,,,,,,,,Socioeconomic Status,
+179,179,179,p_female_labor_force,percentage of females in labor force,,pflabf,,,,,,,,,Socioeconomic Status,
+180,180,180,p_employed_professional,percentage of professional employees (by occupations),p_employed_professional=n_employed_professional / n_employed_over_16*100,pprof,,,,,,,,,Socioeconomic Status,
+181,181,181,p_employed_manufacturing,percentage of manufacturing employees (by industries),p_employed_manufacturing=n_employed_manufacturing / n_employed_over_16*100,pmanuf,,,,,,,,,Socioeconomic Status,
+182,182,182,p_employed_self_employed,percentage of self-employed,p_employed_self_employed=n_employed_self_employed / n_employed_over_16*100,psemp,,,,,,,,,Socioeconomic Status,
+183,183,183,p_veterans,percentage of veterans,p_veterans=n_veterans / n_total_pop*100,pvet,,,,,,,,,Socioeconomic Status,
+184,184,184,p_disabled,percent with disability,p_disabled=n_disabled / n_total_pop*100,pdis,,,,,,,,,Socioeconomic Status,
+185,185,185,p_poverty_rate,percent poor,p_poverty_rate=n_poverty_persons / n_poverty_determined_persons*100,ppov,POVRAT,,,,,,,,Socioeconomic Status,
+186,186,186,p_poverty_rate_over_65,percentage of 65 years and older in poverty,p_poverty_rate_over_65=n_poverty_over_65 / n_poverty_determined_persons*100,p65pov,ELDPOO,,,,,,,,Socioeconomic Status,
+187,187,187,p_poverty_rate_children,percentage of families with children in poverty,p_poverty_rate_children=n_poverty_families_children / n_poverty_determined_families*100,pfmpov,,,,,,,,,Socioeconomic Status,
+188,188,188,p_poverty_rate_white,percentage of whites in poverty,p_poverty_rate_white=n_poverty_white / n_poverty_determined_persons*100,pwpov,WHTPR,,,,,,,,Socioeconomic Status,
+189,189,189,p_poverty_rate_black,percentage of blacks in poverty,p_poverty_rate_black=n_poverty_black / n_poverty_determined_persons*100,pbpov,BLKPR,,,,,,,,Socioeconomic Status,
+190,190,190,p_poverty_rate_hispanic,percentage of Hispanics in poverty,p_poverty_rate_hispanic=n_poverty_hispanic / n_poverty_determined_persons*100,phpov,,,,,,,,,Socioeconomic Status,
+191,191,191,p_poverty_rate_native,percentage of Native Americans in poverty,p_poverty_rate_native=n_poverty_native / n_poverty_determined_persons*100,pnapov,,,,,,,,,Socioeconomic Status,
+192,192,192,p_poverty_rate_asian,percentage of Asian and Pacific Islanders in poverty,p_poverty_rate_asian=n_poverty_asian / n_poverty_determined_persons*100,papov,RASPR,,,,,,,,Socioeconomic Status,
+193,193,193,n_total_pop,total population,,pop,TRCTPOP,SF1,P0010001,SF1,P001001,B01003_001E,,,total population,
diff --git a/tools/check_acs_release.py b/tools/check_acs_release.py
new file mode 100644
index 00000000..f01f321c
--- /dev/null
+++ b/tools/check_acs_release.py
@@ -0,0 +1,235 @@
+from __future__ import annotations
+import os
+import sys
+from pathlib import Path
+import geopandas as gpd
+import requests
+from github import Github
+
+from geosnap.io.util import get_census_gdb, convert_census_gdb, process_acs
+# GitHub repository ("owner/name") in which failure issues are searched for and opened.
+REPO = "oturns/geosnap"
+# Issue titles are built as f"{ISSUE_PREFIX} {year}"; the same text is used to find duplicates.
+ISSUE_PREFIX = "New ACS release detected:"
+# Root of the Census TIGER_DP download tree; each release directory is "<year>ACS/" below it.
+CENSUS_ROOT = "https://www2.census.gov/geo/tiger/TIGER_DP"
+# Timeout (seconds) passed to every `requests` call in this script.
+TIMEOUT = 30
+
+# The script probes for the release of LATEST_SUPPORTED_YEAR + 1 (see main() and WORKDIR).
+# TODO: make this update dynamically
+LATEST_SUPPORTED_YEAR = 2021
+
+# Start with one geography to keep memory lower and behavior predictable.
+GEOM_LEVEL = "blockgroup" # "blockgroup" or "tract"
+# Any GEOM_LEVEL other than "blockgroup" falls through to the tract code / suffix.
+LEVEL_CODE = "bg" if GEOM_LEVEL == "blockgroup" else "tract"
+FILE_SUFFIX = "BG" if GEOM_LEVEL == "blockgroup" else "TRACT"
+
+# Minimum remote size (bytes) the release file must report before it is treated
+# as ready; ensures the file actually has content before any download starts.
+MIN_EXPECTED_SIZE_BYTES = 1_250_000_000
+
+# Working/output directory for this run, e.g. build/2022_bg with the values above.
+WORKDIR = Path("build") / f"{LATEST_SUPPORTED_YEAR + 1}_{LEVEL_CODE}"
+
+
+def census_year_url(year: int) -> str:
+ return f"{CENSUS_ROOT}/{year}ACS/"
+
+
+def expected_file(year: int) -> str:
+ return f"ACS_{year}_5YR_{FILE_SUFFIX}.gdb.zip"
+
+
+def expected_file_url(year: int) -> str:
+ return f"{census_year_url(year)}{expected_file(year)}"
+
+
+def fetch_directory_listing(year: int) -> str | None:
+ url = census_year_url(year)
+ resp = requests.get(url, timeout=TIMEOUT)
+ if resp.status_code == 404:
+ return None
+ resp.raise_for_status()
+ return resp.text
+
+
+def remote_file_size_bytes(url: str) -> int | None:
+ """
+ Try to get the remote file size from HTTP headers.
+
+ Returns:
+ int: size in bytes if available
+ None: if the server does not provide Content-Length
+ """
+ resp = requests.head(url, allow_redirects=True, timeout=TIMEOUT)
+
+ if resp.status_code == 404:
+ return None
+
+ # Some servers do not return Content-Length on HEAD. Fall back to GET stream.
+ if resp.ok:
+ content_length = resp.headers.get("Content-Length")
+ if content_length is not None:
+ return int(content_length)
+
+ resp = requests.get(url, stream=True, allow_redirects=True, timeout=TIMEOUT)
+
+ if resp.status_code == 404:
+ return None
+
+ resp.raise_for_status()
+ content_length = resp.headers.get("Content-Length")
+ if content_length is None:
+ return None
+ return int(content_length)
+
+
+def census_release_status(year: int) -> tuple[bool, str]:
+ """
+ Check whether the release is ready for processing.
+
+ A release is considered ready only if the year directory exists,
+ the expected file is listed in the directory, and the remote
+ file size is at least MIN_EXPECTED_SIZE_BYTES
+ """
+ html = fetch_directory_listing(year)
+ if html is None:
+ return False, f"{census_year_url(year)} not found"
+
+ filename = expected_file(year)
+ if filename not in html:
+ return False, f"{filename} not listed in {census_year_url(year)}"
+
+ file_url = expected_file_url(year)
+ size_bytes = remote_file_size_bytes(file_url)
+
+ if size_bytes is None:
+ return False, f"Could not determine remote file size for {file_url}"
+
+ if size_bytes < MIN_EXPECTED_SIZE_BYTES:
+ return (
+ False,
+ f"{filename} is present but too small "
+ f"({size_bytes:,} bytes < {MIN_EXPECTED_SIZE_BYTES:,} bytes)",
+ )
+
+ return (
+ True,
+ f"{filename} is present and large enough "
+ f"({size_bytes:,} bytes >= {MIN_EXPECTED_SIZE_BYTES:,} bytes)",
+ )
+
+
+def issue_exists(year: int) -> bool:
+ query = f'repo:{REPO} is:issue is:open "{ISSUE_PREFIX} {year}"'
+ gh = Github(os.environ["GITHUB_TOKEN"])
+ return gh.search_issues(query).totalCount > 0
+
+
+def open_issue(year: int, body: str) -> None:
+ gh = Github(os.environ["GITHUB_TOKEN"])
+ repo = gh.get_repo(REPO)
+ repo.create_issue(
+ title=f"{ISSUE_PREFIX} {year}",
+ body=body,
+ )
+
+
+def ensure_workdir() -> None:
+ WORKDIR.mkdir(parents=True, exist_ok=True)
+
+
+def download_raw_gdb(year: int) -> Path:
+ ensure_workdir()
+ filename = expected_file(year)
+ get_census_gdb(
+ years=[year],
+ geom_level=GEOM_LEVEL,
+ output_dir=str(WORKDIR),
+ protocol="https",
+ )
+ matches = [p.resolve() for p in WORKDIR.rglob(filename) if p.is_file()]
+ if not matches:
+ raise FileNotFoundError(
+ f"Could not find downloaded file {filename} under {WORKDIR.resolve()}"
+ )
+
+ if len(matches) > 1:
+ print("Multiple file matches found:", file=sys.stderr)
+ for match in matches:
+ print(f" {match}", file=sys.stderr)
+ chosen = matches[0]
+ print(f"Using downloaded zip file: {chosen}", file=sys.stderr)
+ return chosen
+
+
+def convert_raw_gdb(year: int, gdb_path: Path) -> Path:
+ gdb_path = gdb_path.resolve()
+
+ convert_census_gdb(
+ year=str(year),
+ level=LEVEL_CODE,
+ gdb_path=str(gdb_path),
+ layers=None,
+ save_intermediate=True,
+ overwrite=False,
+ combine=True,
+ output_dir=str(WORKDIR),
+ )
+ return WORKDIR / f"acs_demographic_profile_{year}_{LEVEL_CODE}.parquet"
+
+
+def build_processed_acs(year: int, combined_path: Path) -> Path:
+ df = gpd.read_parquet(combined_path)
+
+ if "GEOID" not in df.columns:
+ df = df.reset_index()
+
+ processed = process_acs(df)
+
+ out_path = WORKDIR / f"acs_{year}_{LEVEL_CODE}.parquet"
+ processed.to_parquet(out_path)
+ return out_path
+
+
+def main() -> int:
+ token = os.environ.get("GITHUB_TOKEN")
+ if not token:
+ print("Missing GITHUB_TOKEN", file=sys.stderr)
+ return 1
+
+ year = LATEST_SUPPORTED_YEAR + 1
+
+ ready, status_message = census_release_status(year)
+ print(status_message)
+
+ if not ready:
+ print(f"{year} release not ready for processing.")
+ return 0
+
+ try:
+ gdb_path = download_raw_gdb(year)
+ print(f"Downloaded: {gdb_path}")
+
+ combined_path = convert_raw_gdb(year, gdb_path)
+ print(f"Combined parquet: {combined_path}")
+
+ final_path = build_processed_acs(year, combined_path)
+ print(f"Processed ACS parquet: {final_path}")
+
+ return 0
+
+ except Exception as exc:
+ msg = (
+ f"Detected Census ACS release for {year}, but automated processing failed.\n\n"
+ f"Checked directory: {census_year_url(year)}\n"
+ f"Checked file: {expected_file_url(year)}\n\n"
+ f"Preflight check: {status_message}\n\n"
+ f"Error:\n```\n{exc}\n```"
+ )
+ print(msg, file=sys.stderr)
+
+ if os.environ.get("DISABLE_GITHUB_ISSUES", "").lower() not in {"1", "true", "yes"}:
+ if not issue_exists(year):
+ open_issue(year, msg)
+ else:
+ print("Skipping issue creation because DISABLE_GITHUB_ISSUES is set.", file=sys.stderr)
+
+ return 1
+
+if __name__ == "__main__":
+ raise SystemExit(main())