preview-tpl

Running

App Files Files Community

cassiebuhler commited on Jun 5

Commit

5ffe768

1 Parent(s): 352224e

cleaning up landvote

Browse files

Files changed (1) hide show

preprocessing/hexes.ipynb +456 -282

preprocessing/hexes.ipynb CHANGED Viewed

@@ -18,12 +18,14 @@
     "from cng.utils import *\n",
     "from cng.h3 import *\n",
     "from ibis import _\n",
     "import os\n",
     "from osgeo import gdal\n",
     "from minio import Minio\n",
     "import streamlit \n",
     "from datetime import timedelta\n",
     "import geopandas as gpd\n",
     "\n",
     "# Get signed URLs to access license-controlled layers\n",
     "key = st.secrets[\"MINIO_KEY\"]\n",
@@ -37,6 +39,22 @@
     "set_secrets(con)"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "0b086a1a-af23-487b-923d-fca595a19111",
@@ -52,7 +70,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def h3_from_geom(con, name, cols, zoom = 8):\n",
     "    \"\"\"\n",
     "    Computes hexes directly from geometry.\n",
     "    \"\"\"\n",
@@ -70,7 +88,7 @@
     "        SELECT {cols}, UNNEST(h{zoom}) AS h{zoom},\n",
     "        ST_GeomFromText(h3_cell_to_boundary_wkt(UNNEST(h{zoom}))) AS geom\n",
     "        FROM t2\n",
-    "    ''').to_parquet(f\"{name}_h3_z{zoom}.parquet\")\n",
     "    return "
    ]
   },
@@ -98,100 +116,93 @@
     "    expires=timedelta(hours=2),\n",
     ")\n",
     "\n",
-    "cols = ['fid', 'TPL_ID', 'State', 'County', 'Municipality',\n",
-    "       'Site_Name', 'Reported_Acres', 'Close_Year', 'Close_Date', 'Owner_Name',\n",
-    "       'Owner_Type', 'Manager_Name', 'Manager_Type', 'Purchase_Type',\n",
-    "       'EasementHolder_Name', 'EasementHolder_Type', 'Public_Access_Type',\n",
-    "       'Purpose_Type', 'Duration_Type', 'Data_Provider', 'Data_Source',\n",
-    "       'Source_Date', 'Data_Aggregator', 'Comments', 'Amount', 'Program_ID',\n",
-    "       'Program_Name', 'Sponsor_ID', 'Sponsor_Name', 'Sponsor_Type']\n",
     "\n",
     "\n",
     "tpl_table = (con.read_parquet(tpl)\n",
-    "             .mutate(geom = _.geom.convert(\"ESRI:102039\", \"EPSG:4326\"))\n",
-    "            )\n",
-    "\n",
     "con.create_table('tpl', tpl_table, overwrite=True)\n",
-    "h3_from_geom(con, 'tpl', cols)\n",
-    "\n",
-    "client.fput_object(bucket_name = \"shared-tpl\",\n",
-    "           object_name = \"tpl_h3_z8.parquet\",\n",
-    "           file_path = \"tpl_h3_z8.parquet\") "
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "3f00cfe9-520c-4839-aeed-46a83b11ecce",
    "metadata": {},
    "source": [
-    "# Census\n",
-    "\n",
-    "Getting polygons and FIPS codes from Census state, county, place, and subdivision data. \n",
-    "\n"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "aec60670-4abc-4fc4-86ca-88a77ba69d39",
    "metadata": {},
    "outputs": [],
    "source": [
-    "state_url = \"s3://public-census/2024/state/2024_us_state.parquet\"\n",
-    "county_url = \"s3://public-census/2024/county/2024_us_county.parquet\"\n",
-    "\n",
-    "state_file = '2024_us_state_h3_z8.parquet'\n",
-    "county_temp_file = '2024_us_county_h3_z8_temp.parquet'\n",
-    "county_file = '2024_us_county_h3_z8.parquet'\n",
-    "city_file = '2024_us_places_subdivisions_h3_z8.parquet'"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "7cd589ad-5b03-41de-8936-c20091a937e1",
    "metadata": {},
    "source": [
-    "#### State"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "86d2ed94-740f-49cd-a041-50401b7c7984",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "# convert shape file to parquet \n",
-    "gdf = gpd.read_file('tl_2024_us_state.shp').to_crs('epsg:4326').rename_geometry('geom').rename(columns={\"GEOID\": \"FIPS\", \"STUSPS\":\"state\", \"NAME\":\"name\"})\n",
-    "con.create_table('state_wkt', gdf, overwrite=True)\n",
-    "\n",
-    "# get geom (duckdb turns geodataframes into wkt)\n",
-    "con.sql(\"\"\"\n",
-    "SELECT * EXCLUDE geom,\n",
-    "  ST_GeomFromWKB(geom) AS geom\n",
-    "FROM state_wkt\n",
-    "\"\"\").to_parquet(state_url)\n",
-    "\n",
-    "# convert to h3\n",
-    "con.read_parquet(state_url, table_name = 'state')\n",
-    "cols = ['STATE','name','FIPS']\n",
-    "h3_from_geom(con, 'state', cols)\n",
-    "\n",
-    "# save file \n",
-    "client.fput_object(bucket_name = \"public-census\",\n",
-    "           file_path = \"state_h3_z8.parquet\",\n",
-    "           object_name = f\"2024/state/{state_file}\") "
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "a39905b6-b0b6-46d0-a035-cd1ac0f43d53",
-   "metadata": {},
    "outputs": [],
    "source": [
-    "# grabbing state abbeviations for later\n",
-    "state_ids = con.read_parquet(state_url).select('name','state','FIPS').rename(state_name = 'name')"
    ]
   },
   {
@@ -205,58 +216,39 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "96f3d43f-9ab8-4f3e-a981-51c5eca6f848",
    "metadata": {},
    "outputs": [],
    "source": [
     "%%time\n",
-    "# convert shape to parquet \n",
-    "gdf = gpd.read_file('tl_2024_us_county.shp').to_crs('epsg:4326').rename_geometry('geom').drop('NAME',axis =1).rename(columns={\"GEOID\": \"FIPS\", \"NAMELSAD\":\"name\"})[['geom','name','FIPS','STATEFP']]\n",
-    "con.create_table('county_wkt', gdf, overwrite=True)\n",
-    "\n",
-    "# convert to geom (duckdb turns geodataframes into wkt)\n",
-    "con.sql(\"\"\"\n",
-    "SELECT * EXCLUDE geom,\n",
-    "  ST_GeomFromWKB(geom) AS geom\n",
-    "FROM county_wkt\n",
-    "\"\"\").to_parquet(county_url)\n",
-    "\n",
-    "# convert to h3\n",
-    "con.read_parquet(county_url, table_name = 'county')\n",
-    "cols = ['name','FIPS','STATEFP']\n",
-    "h3_from_geom(con, 'county', cols)\n",
-    "\n",
-    "# save file \n",
-    "client.fput_object(bucket_name = \"public-census\",\n",
-    "           file_path = \"county_h3_z8.parquet\",\n",
-    "           object_name = f\"2024/county/{county_temp_file}\") \n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3b3a5023-45d7-4039-a078-4901ebdd3e10",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# get a non hex version of counties to use as bounds in tpl app\n",
-    "temp = con.read_parquet(county_url)\n",
-    "(temp.left_join(state_ids, [temp.STATEFP == state_ids.FIPS]).drop('FIPS_right','STATEFP')\n",
-    " .rename(county = 'name').select('FIPS','state','state_name','county','geom')\n",
-    ").to_parquet(county_url)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a7829daf-8333-4e35-83f6-0a2bbcd174fa",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# get state abbeviations for counties\n",
-    "county_geo = con.read_parquet(f\"s3://public-census/2024/county/{county_temp_file}\")\n",
-    "county_geo.left_join(state_ids, [county_geo.STATEFP == state_ids.FIPS]).drop('FIPS_right','STATEFP').to_parquet(f\"s3://public-census/2024/county/{county_file}\")\n"
    ]
   },
   {
@@ -277,26 +269,30 @@
    "outputs": [],
    "source": [
     "match_pattern = r\"(?i)\\s*(city|town|village|charter|municipality|Borough)\\b\"\n",
-    "city_cols = [\"state\",\"county\",\"FIPS\",\"name\",'city']\n",
     "\n",
     "places_url = \"https://www2.census.gov/geo/docs/reference/codes2020/national_place_by_county2020.txt\"\n",
     "places_fips = (con.read_csv(places_url)\n",
-    "               .rename(state = \"STATE\", county = \"COUNTYNAME\", city = \"PLACENAME\")\n",
     "               .mutate(name=_.city.re_replace(match_pattern, \"\").strip())\n",
     "               .mutate(FIPS = _.STATEFP + _.COUNTYFP)\n",
     "               .select(city_cols))\n",
     "\n",
     "subdivisions_url = \"https://www2.census.gov/geo/docs/reference/codes2020/national_cousub2020.txt\"\n",
     "subdivisions_fips = (con.read_csv(subdivisions_url)\n",
-    "                     .rename(state = \"STATE\", county = \"COUNTYNAME\", city = \"COUSUBNAME\")\n",
     "                     .mutate(name=_.city.re_replace(match_pattern, \"\").strip())\n",
     "                     .mutate(FIPS = _.STATEFP + _.COUNTYFP)\n",
     "                     .select(city_cols))\n",
     "\n",
-    "city_fips = places_fips.union(subdivisions_fips).distinct() #get unique -> some cities are listed in both places and subdivisions\n",
-    "city_geo = city_fips.left_join(county_geo, 'FIPS').drop('FIPS_right','name_right','city') #get h3 from counties \n",
-    "city_joined = city_geo.left_join(state_ids, [city_geo.STATEFP == state_ids.FIPS]).drop('FIPS_right','STATEFP','state_right')# get state ids \n",
-    "city_joined.to_parquet(f\"s3://public-census/2024/places_subdivisions/{city_file}\")\n"
    ]
   },
   {
@@ -312,8 +308,9 @@
     "- First, need to split up landvote into its 3 jurisdictions: state, county, and municipals\n",
     "- Join states with Census \"states\" to get state FIPS/hex\n",
     "- Join counties with Census \"counties\" to get county FIPS/hex\n",
     "- Join municipals with Census \"places\" and \"subdivisions\" to get county FIPS/hex\n",
-    "- Then join all municipal, county, and state data back together!\n",
     "\n"
    ]
   },
@@ -327,44 +324,43 @@
     "landvote_csv = client.get_presigned_url(\n",
     "    \"GET\",\n",
     "    \"shared-tpl\",\n",
-    "    \"landvote.csv\",\n",
     "    expires=timedelta(hours=2),\n",
     ")\n",
-    "\n",
-    "match_pattern = r\"(?i)\\s*(city|town|village|charter|municipality|Borough)\\b\"\n",
-    "landvote = (con.read_csv(landvote_csv, ignore_errors=True)\n",
-    "            .rename(jurisdiction = \"Jurisdiction Type\", state = \"State\")\n",
-    "            .mutate(state = _.state.substitute({'Ore':'OR'}))\n",
     "            .mutate(name=_['Jurisdiction Name'].re_replace(match_pattern, \"\").strip())\n",
-    "            .mutate(landvote_id=ibis.row_number().over())\n",
     "            .mutate(_['Conservation Funds Approved'].replace('$', '')\n",
-    "                    .replace(',', '').cast('float').name('Conservation Funds Approved')))\n",
-    "\n",
-    "\n",
-    "final_columns = ['landvote_id',\n",
-    "    'FIPS',\n",
-    "    'state',\n",
-    "    'state_name',\n",
-    "    'county',\n",
-    "    'city',\n",
-    "    'jurisdiction',\n",
-    "    'Date',\n",
-    "    'Description',\n",
-    "    'Finance Mechanism',\n",
-    "    '\"Other\" Comment',\n",
-    "    'Purpose',\n",
-    "    'Total Funds at Stake',\n",
-    "    'Conservation Funds at Stake',\n",
-    "    'Total Funds Approved',\n",
-    "    'Conservation Funds Approved',\n",
-    "    'Pass?',\n",
-    "    'Status',\n",
-    "    '% Yes',\n",
-    "    '% No',\n",
-    "    'Notes',\n",
-    "    'Voted Acq. Measure',\n",
-    "    'geom',\n",
-    "    'h8']"
    ]
   },
   {
@@ -382,17 +378,27 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "state_geo = con.read_parquet(f\"s3://public-census/2024/state/{state_file}\")\n",
     "states = (landvote.filter(_.jurisdiction == \"State\")\n",
-    "            .rename(state_name = \"Jurisdiction Name\")\n",
     "            .mutate(county = ibis.literal('None'))\n",
     "            .mutate(county_fips = ibis.literal('None'))\n",
     "            .mutate(city = ibis.literal('None')))\n",
     "\n",
-    "landvote_state = (states.left_join(state_geo, [states.name.upper() == state_geo.name.upper()])\n",
-    "                   .select(final_columns))\n",
-    "\n",
-    "#adding state ID and state name from the county/city\n"
    ]
   },
   {
@@ -410,212 +416,380 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "county_geo = con.read_parquet(f\"s3://public-census/2024/county/{county_file}\")\n",
-    "\n",
-    "county_match_pattern = r\"(?i)\\s*(County)\\b\"\n",
     "\n",
     "counties = (landvote.filter(_.jurisdiction == \"County\")\n",
     "            .rename(county = \"Jurisdiction Name\")\n",
     "            .mutate(city = ibis.literal('None'))\n",
-    "            .mutate(name=_.name.re_replace(county_match_pattern, \"\").strip()))\n",
     "\n",
-    "landvote_county = (counties.left_join(county_geo, [counties.name.upper() == county_geo.name.upper(), \n",
-    "                                                    counties.state == county_geo.state])\n",
-    "                   .select(final_columns))"
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "cca12e06-8d7f-4a50-906c-7dc02e370072",
    "metadata": {},
    "source": [
-    "#### Municipal level\n",
     "\n",
-    "Because there isn't a 1 to 1 match from municipals to Census data, we need to use both \"Places\" and \"Subdivisons\". "
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "aa81e457-00ba-4b01-86d7-cf46bec04edd",
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
    "source": [
-    "city_geo = con.read_parquet(f\"s3://public-census/2024/places_subdivisions/{city_file}\")\n",
-    "\n",
-    "municipals = landvote.filter(_.jurisdiction == \"Municipal\").rename(city = \"Jurisdiction Name\")\n",
-    "\n",
-    "landvote_city = (municipals.left_join(city_geo, [municipals.name.upper() == city_geo.name.upper(), \n",
-    "                                                  municipals.state == city_geo.state])\n",
-    "                 .inner_join(state_ids, [municipals.state == state_ids.state])\n",
-    "                 )"
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "6d52d97b-ad14-4d04-89a8-79a813f80353",
    "metadata": {},
    "source": [
-    "#### Joining all the landvote data with census\n",
-    "Note: `landvote_joined` has more rows than `landvote` because some cities span multiple counties. Each additional county creates a new row."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "6d481736-82f3-4be2-b5af-6280be5e9d75",
    "metadata": {},
    "outputs": [],
    "source": [
-    "landvote_joined = landvote_city.union(landvote_county).union(landvote_state)\n",
-    "landvote_joined.to_parquet(\"s3://shared-tpl/landvote_h3_z8.parquet\")"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "4fbdacae-28cf-4335-b895-f1335749a6e3",
    "metadata": {},
    "source": [
-    "#### And get a non-hex version of landvote"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "67bdc9d2-f195-427b-b1c6-08f1f5f90ac7",
    "metadata": {},
    "outputs": [],
    "source": [
-    "import ibis.selectors as s\n",
     "\n",
-    "state_geo = con.read_parquet(state_url)\n",
-    "landvote_state = (states.left_join(state_geo, [states.name.upper() == state_geo.name.upper()])).select(final_columns[:-1])\n",
     "\n",
-    "county_geo = con.read_parquet(county_url)\n",
-    "landvote_county = (counties.left_join(county_geo, [counties.name.upper() == county_geo.county.upper(), \n",
-    "                                                    counties.state == county_geo.state])).select(final_columns[:-1])\n",
     "\n",
-    "city_fips = places_fips.union(subdivisions_fips).distinct() #get unique -> some cities are listed in both places and subdivisions\n",
-    "city_geo = city_fips.left_join(county_geo, 'FIPS').select(~s.endswith('_right'))\n",
     "\n",
-    "landvote_city = (municipals.left_join(city_geo, [municipals.name.upper() == city_geo.name.upper(), \n",
-    "                                                  municipals.state == city_geo.state])\n",
-    "                 ).select(final_columns[:-1])\n",
     "\n",
-    "landvote_joined = landvote_city.union(landvote_county).union(landvote_state)\n",
-    "landvote_joined.to_parquet(\"s3://shared-tpl/landvote_geom.parquet\")"
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "f41a207c-d560-4dab-8ec6-3fee24cb880b",
    "metadata": {},
    "source": [
-    "# Join TPL Almanac with Landvote"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "ac8e6648-d3ad-4e63-81a0-6f78f3b7584c",
    "metadata": {},
    "source": [
-    "- Joining the data\n",
-    "- Generate pmtiles -> converting h8 back to original polygons"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "d2359bf0-1f3b-4af4-a96f-301dde56d81d",
-   "metadata": {},
    "outputs": [],
    "source": [
-    "# joining data\n",
-    "landvote_parquet = client.get_presigned_url(\n",
-    "    \"GET\",\n",
-    "    \"shared-tpl\",\n",
-    "    \"landvote_h3_z8.parquet\",\n",
-    "    expires=timedelta(hours=2),\n",
-    ")\n",
     "\n",
-    "landvote = (con.read_parquet(landvote_parquet)\n",
-    "            .rename(FIPS_county = \"FIPS\",\n",
-    "                    measure_status = \"Status\", measure_purpose = \"Purpose\",measure_amount = 'Conservation Funds Approved')\n",
-    "            .mutate(measure_year = _.Date.year()).drop('Date','geom'))\n",
     "\n",
-    "tpl_parquet = client.get_presigned_url(\n",
-    "    \"GET\",\n",
-    "    \"shared-tpl\",\n",
-    "    \"tpl_h3_z8.parquet\",\n",
-    "    expires=timedelta(hours=2),\n",
-    ")\n",
-    "\n",
-    "tpl_drop_cols = ['Reported_Acres','Close_Date','EasementHolder_Name',\n",
-    "        'Data_Provider','Data_Source','Data_Aggregator',\n",
-    "        'Program_ID','Sponsor_ID']\n",
-    "tpl = con.read_parquet(tpl_parquet).mutate(h8 = _.h8.lower()).drop(tpl_drop_cols)\n",
-    "        \n",
-    "\n",
-    "select_cols = ['fid','TPL_ID','landvote_id',\n",
-    "'state','state_name','county',\n",
-    " 'FIPS_county','city','jurisdiction',\n",
-    " 'Close_Year', 'Site_Name',\n",
-    " 'Owner_Name','Owner_Type',\n",
-    " 'Manager_Name','Manager_Type',\n",
-    " 'Purchase_Type','EasementHolder_Type',\n",
-    " 'Public_Access_Type','Purpose_Type',\n",
-    " 'Duration_Type','Amount',\n",
-    " 'Program_Name','Sponsor_Name',\n",
-    " 'Sponsor_Type','measure_year',\n",
-    " 'measure_status','measure_purpose',\n",
-    " 'measure_amount']\n",
-    "\n",
-    "# joining all data\n",
-    "database = (\n",
-    "  tpl.drop('State','County')\n",
-    "  .left_join(landvote, \"h8\").drop('h8_right')\n",
-    ").select(select_cols).distinct()\n"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8c3d5165-1b9f-4faf-9291-855d51698adc",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "# getting original polygons back \n",
-    "tpl_geom_url = client.get_presigned_url(\n",
-    "    \"GET\",\n",
-    "    \"shared-tpl\",\n",
-    "    \"tpl.parquet\",\n",
-    "    expires=timedelta(hours=2),\n",
-    ")\n",
-    "\n",
-    "tpl_geom = con.read_parquet(tpl_geom_url).select('geom','TPL_ID','fid').mutate(geom = _.geom.convert(\"ESRI:102039\", \"EPSG:4326\"))\n",
-    "\n",
-    "database = (database.inner_join(tpl_geom, [database.TPL_ID == tpl_geom.TPL_ID, database.fid == tpl_geom.fid])\n",
-    "            # .mutate(id=ibis.row_number().over())\n",
-    "            # .drop('TPL_ID','fid','landvote_id')\n",
-    "           )\n",
-    "           "
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "d9bb698f-3925-4aa6-8b89-56a6ee839312",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# save to parquet/pmtiles \n",
-    "database.to_parquet(\"s3://shared-tpl/tpl_almanac_landvote_geom.parquet\")\n",
-    "database.execute().set_crs('epsg:4326').to_file('tpl_almanac_landvote_geom.geojson')\n",
-    "\n",
-    "to_pmtiles('tpl_almanac_landvote_geom.geojson', 'tpl_almanac_landvote_geom.pmtiles', options = ['--extend-zooms-if-still-dropping'])\n",
-    "s3_cp('tpl_almanac_landvote_geom.pmtiles', \"s3://shared-tpl/tpl_almanac_landvote_geom.pmtiles\", \"minio\")"
    ]
   }
  ],

     "from cng.utils import *\n",
     "from cng.h3 import *\n",
     "from ibis import _\n",
+    "import ibis.selectors as s\n",
     "import os\n",
     "from osgeo import gdal\n",
     "from minio import Minio\n",
     "import streamlit \n",
     "from datetime import timedelta\n",
     "import geopandas as gpd\n",
+    "import re\n",
     "\n",
     "# Get signed URLs to access license-controlled layers\n",
     "key = st.secrets[\"MINIO_KEY\"]\n",
     "set_secrets(con)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "deb47703-31fd-4039-84ce-df6a24cdf702",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Root of the 2024 Census layers; every path below is built from it.\n",
+    "census_path = \"s3://public-census/2024/\"\n",
+    "\n",
+    "# Polygon (non-hex) layers\n",
+    "state_file = f\"{census_path}state/2024_us_state.parquet\"\n",
+    "county_file = f\"{census_path}county/2024_us_county.parquet\"\n",
+    "\n",
+    "# H3 layers (file names carry the z8 suffix)\n",
+    "state_h3_file = f\"{census_path}state/2024_us_state_h3_z8.parquet\"\n",
+    "county_h3_file = f\"{census_path}county/2024_us_county_h3_z8.parquet\"\n",
+    "city_h3_file = f\"{census_path}places_subdivisions/2024_us_places_subdivisions_h3_z8.parquet\""
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "0b086a1a-af23-487b-923d-fca595a19111",
    "metadata": {},
    "outputs": [],
    "source": [
+    "def h3_from_geom(con, name, cols, save_path, zoom = 8):\n",
     "    \"\"\"\n",
     "    Computes hexes directly from geometry.\n",
     "    \"\"\"\n",
     "        SELECT {cols}, UNNEST(h{zoom}) AS h{zoom},\n",
     "        ST_GeomFromText(h3_cell_to_boundary_wkt(UNNEST(h{zoom}))) AS geom\n",
     "        FROM t2\n",
+    "    ''').to_parquet(save_path)\n",
     "    return "
    ]
   },
     "    expires=timedelta(hours=2),\n",
     ")\n",
     "\n",
+    "cols = ['fid', 'tpl_id', 'state_id', 'state', 'county', 'municipality',\n",
+    "        'site', 'acres', 'year', 'date', 'owner','owner_type','manager',\n",
+    "        'manager_type','purchase_type','easement','easement_type',\n",
+    "        'access_type','purpose_type','duration_type','data_provider',\n",
+    "        'data_source','source_date','data_aggregator','comments','amount',\n",
+    "        'program_id','program','sponsor_id','sponsor','sponsor_type','FIPS']\n",
     "\n",
+    "state_ids = con.read_parquet(state_file).drop('geom')\n",
     "\n",
     "tpl_table = (con.read_parquet(tpl)\n",
+    "    .rename(tpl_id = \"TPL_ID\", state = \"State\", county = \"County\", municipality = \"Municipality\", site = \"Site_Name\",\n",
+    "            acres = \"Reported_Acres\", area = \"Shape_Area\", year = \"Close_Year\", date = \"Close_Date\", owner = \"Owner_Name\",\n",
+    "            owner_type = \"Owner_Type\", manager = \"Manager_Name\", manager_type = \"Manager_Type\",\n",
+    "            purchase_type = \"Purchase_Type\", easement = \"EasementHolder_Name\", easement_type = \"EasementHolder_Type\",\n",
+    "            access_type = \"Public_Access_Type\", purpose_type = \"Purpose_Type\", duration_type = \"Duration_Type\",\n",
+    "            data_provider = \"Data_Provider\", data_source = \"Data_Source\", source_date = \"Source_Date\",\n",
+    "            data_aggregator = \"Data_Aggregator\", comments = \"Comments\", amount = \"Amount\", program_id = 'Program_ID',\n",
+    "            program = 'Program_Name', sponsor_id = \"Sponsor_ID\", sponsor = \"Sponsor_Name\", sponsor_type = \"Sponsor_Type\")\n",
+    "    .mutate(geom = _.geom.convert(\"ESRI:102039\", \"EPSG:4326\"))\n",
+    "    .inner_join(state_ids, 'state'))\n",
+    "    \n",
     "con.create_table('tpl', tpl_table, overwrite=True)\n",
+    "# h3_from_geom(con, 'tpl', cols, save_path = 's3://shared-tpl/conservation_almanac/z8/tpl_h3_z8.parquet')"
    ]
   },
   {
    "cell_type": "markdown",
+   "id": "9612c804-0474-4bfe-924d-89dae0105663",
    "metadata": {},
    "source": [
+    "#### Generate PMTiles"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "932476f8-53a9-4aa9-9520-cf3fa42b8150",
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Local scratch files; to_geojson works from the local parquet copy.\n",
+    "local_parquet = 'tpl_epsg4326.parquet'\n",
+    "local_geojson = \"tpl.geojson\"\n",
+    "local_pmtiles = 'tpl.pmtiles'\n",
+    "\n",
+    "# Write the table to the shared bucket first, then to the local scratch copy.\n",
+    "for destination in ('s3://shared-tpl/conservation_almanac/tpl.parquet', local_parquet):\n",
+    "    tpl_table.to_parquet(destination)\n",
+    "\n",
+    "# parquet -> GeoJSON -> PMTiles, then upload the tiles.\n",
+    "to_geojson(local_parquet, local_geojson)\n",
+    "pmtiles = to_pmtiles(local_geojson, local_pmtiles)\n",
+    "s3_cp(local_pmtiles, \"s3://shared-tpl/conservation_almanac/tpl.pmtiles\", \"minio\")"
    ]
   },
   {
    "cell_type": "markdown",
+   "id": "3f00cfe9-520c-4839-aeed-46a83b11ecce",
    "metadata": {},
    "source": [
+    "# Census\n",
+    "\n",
+    "Getting polygons and FIPS codes from Census state, county, place, and subdivision data. \n",
+    "\n"
    ]
   },
   {
+   "cell_type": "markdown",
+   "id": "7cd589ad-5b03-41de-8936-c20091a937e1",
    "metadata": {},
    "source": [
+    "#### State"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "08a861e6-fdcd-480a-ad0a-423c38cc1bc1",
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
+    "# 2024 TIGER/Line state shapefile, read directly from the zipped download.\n",
+    "url = \"/vsizip//vsicurl/https://www2.census.gov/geo/tiger/TIGER2024/STATE/tl_2024_us_state.zip\"\n",
+    "state = (con.read_geo(url)\n",
+    "            .mutate(geom = _.geom.convert('EPSG:4269','EPSG:4326'))\n",
+    "            # NAME must be exposed as `state`: `cols` below and the later joins\n",
+    "            # (`state_z8.state`, `state_geo.state`, the county select) look it up under that name.\n",
+    "            .rename(FIPS = \"GEOID\", state_id = \"STUSPS\", state = \"NAME\")\n",
+    "            .select('FIPS','state_id','state','geom')\n",
+    "             )\n",
+    "state.to_parquet(state_file)\n",
+    "\n",
+    "#get h3\n",
+    "con.read_parquet(state_file, table_name = 'state')\n",
+    "cols = ['state','state_id','FIPS']\n",
+    "h3_from_geom(con, 'state', cols, save_path = state_h3_file)"
    ]
   },
   {
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "1a6ea98d-878a-4ea5-8eda-ee10f34444e3",
    "metadata": {},
    "outputs": [],
    "source": [
     "%%time\n",
+    "## CT counties changed to \"planning regions\" in 2022, so I'm grabbing older data to get the county boundaries \n",
+    "url = \"/vsizip//vsicurl/https://www2.census.gov/geo/tiger/TIGER2020/COUNTY/tl_2020_us_county.zip\"\n",
+    "CT_counties = (con.read_geo(url)\n",
+    "            .mutate(geom = _.geom.convert('EPSG:4269','EPSG:4326'))\n",
+    "            .rename(FIPS = \"GEOID\", county = \"NAMELSAD\")\n",
+    "            .select('FIPS','STATEFP','county','geom')\n",
+    "            # keep only the STATEFP '09' rows from the 2020 file\n",
+    "            .filter(_.STATEFP == '09')\n",
+    "             )\n",
+    "\n",
+    "# US counties (2024), with the 2020 CT rows appended\n",
+    "url = \"/vsizip//vsicurl/https://www2.census.gov/geo/tiger/TIGER2024/COUNTY/tl_2024_us_county.zip\"\n",
+    "county = (con.read_geo(url)\n",
+    "            .mutate(geom = _.geom.convert('EPSG:4269','EPSG:4326'))\n",
+    "            .rename(FIPS = \"GEOID\", county = \"NAMELSAD\")\n",
+    "            .select('FIPS','STATEFP','county','geom')\n",
+    "            .union(CT_counties)\n",
+    "         ) \n",
+    "\n",
+    "#adding states to counties\n",
+    "state_ids = con.read_parquet(state_file).drop('geom')\n",
+    "county.inner_join(state_ids, [state_ids.FIPS == county.STATEFP]).select('FIPS','state_id','state','county','geom').to_parquet(county_file)\n",
+    "\n",
+    "#get h3\n",
+    "con.read_parquet(county_file, table_name = 'county')\n",
+    "cols = ['state_id','state','county','FIPS']\n",
+    "# NOTE(review): the H3 step is disabled, but a later cell reads `county_h3_file`;\n",
+    "# confirm that file already exists before running this notebook from scratch.\n",
+    "# h3_from_geom(con, 'county', cols, save_path = county_h3_file)\n"
    ]
   },
   {
    "outputs": [],
    "source": [
     "match_pattern = r\"(?i)\\s*(city|town|village|charter|municipality|Borough)\\b\"\n",
+    "# match_pattern = r\"(?i)(?<![a-z])(?:city|town|charter|municipality|[Bb]orough)(?![a-z])\"\n",
+    "\n",
+    "city_cols = [\"state_id\",\"county\",\"FIPS\",\"name\",'city']\n",
+    "\n",
+    "def _city_fips(url, city_col):\n",
+    "    \"\"\"Read a Census code file and return `city_cols`, with `city_col` renamed to `city`.\"\"\"\n",
+    "    codes = con.read_csv(url).rename(state_id = \"STATE\", county = \"COUNTYNAME\", city = city_col)\n",
+    "    # `name` is `city` with `match_pattern` stripped out\n",
+    "    codes = codes.mutate(name=_.city.re_replace(match_pattern, \"\").strip())\n",
+    "    # FIPS is built as STATEFP + COUNTYFP\n",
+    "    return codes.mutate(FIPS = _.STATEFP + _.COUNTYFP).select(city_cols)\n",
+    "\n",
+    "places_url = \"https://www2.census.gov/geo/docs/reference/codes2020/national_place_by_county2020.txt\"\n",
+    "places_fips = _city_fips(places_url, \"PLACENAME\")\n",
+    "\n",
+    "subdivisions_url = \"https://www2.census.gov/geo/docs/reference/codes2020/national_cousub2020.txt\"\n",
+    "subdivisions_fips = _city_fips(subdivisions_url, \"COUSUBNAME\")\n",
+    "\n",
+    "#get unique -> some cities are listed in both places and subdivisions\n",
+    "city_fips = places_fips.union(subdivisions_fips).distinct() \n",
+    "\n",
+    "#get h3 from counties \n",
+    "county_h3 = con.read_parquet(county_h3_file)\n",
+    "city_h3_cols = ['FIPS','state_id','state','county','city','name','geom','h8']\n",
+    "city_fips.inner_join(county_h3, 'FIPS').select(city_h3_cols).to_parquet(city_h3_file)\n"
    ]
   },
   {
     "- First, need to split up landvote into its 4 jurisdictions: state, county, special district, and municipals\n",
     "- Join states with Census \"states\" to get state FIPS/hex\n",
     "- Join counties with Census \"counties\" to get county FIPS/hex\n",
+    "- Join special districts with Census \"places\" and \"subdivisions\" to get county FIPS/hex\n",
     "- Join municipals with Census \"places\" and \"subdivisions\" to get county FIPS/hex\n",
+    "- Then join all municipal, county, special district, and state data back together!\n",
     "\n"
    ]
   },
     "landvote_csv = client.get_presigned_url(\n",
     "    \"GET\",\n",
     "    \"shared-tpl\",\n",
+    "    \"landvote/landvote_utf8.csv\",\n",
     "    expires=timedelta(hours=2),\n",
     ")\n",
+    "collapse_spaces = r\"\\s+\"\n",
+    "match_pattern = r\"(?i)\\b(city|town|charter|municipality|Borough)\\b\"\n",
+    "landvote_ = (con.read_csv(landvote_csv) #it skips the row with a unicode error \n",
+    "            .rename(jurisdiction = \"Jurisdiction Type\", state_id = \"State\")\n",
+    "            .mutate(state_id = _.state_id.substitute({'Ore':'OR'}))\n",
     "            .mutate(name=_['Jurisdiction Name'].re_replace(match_pattern, \"\").strip())\n",
+    "            .mutate(name=_.name.re_replace(collapse_spaces, \" \").strip())\n",
+    "            .mutate(landvote_id=ibis.row_number().over(order_by=[_.state_id, _.jurisdiction, _.Date]))\n",
     "            .mutate(_['Conservation Funds Approved'].replace('$', '')\n",
+    "                    .replace(',', '').cast('float').name('Conservation Funds Approved'))\n",
+    "            .mutate(year = _.Date.year())\n",
+    "            .rename(date = \"Date\", description = \"Description\", finance_mechanism = \"Finance Mechanism\",\n",
+    "                    other_comments = '\"Other\" Comment', purpose = \"Purpose\", total_funds_at_stake = \"Total Funds at Stake\",\n",
+    "                    conservation_funds_at_stake = \"Conservation Funds at Stake\", total_funds_approved = \"Total Funds Approved\",\n",
+    "                    conservation_funds_approved = \"Conservation Funds Approved\", passed = \"Pass?\", status = \"Status\", \n",
+    "                    percent_yes = '% Yes', percent_no = '% No', notes = 'Notes', voted_acq_measure = \"Voted Acq. Measure\")\n",
+    "            )\n",
+    "\n",
+    "# landvote_id comes from a row_number() window, so the table is materialized once here:\n",
+    "# the expression is executed locally and the result is registered as the `landvote` table,\n",
+    "# so every later cell reads the same already-computed IDs.\n",
+    "landvote_with_ids = landvote_.execute()  \n",
+    "landvote = con.create_table(\"landvote\", landvote_with_ids, overwrite = True)\n",
+    "\n",
+    "# Column order shared by every landvote output. 'h8' must stay last: the non-hex\n",
+    "# (geometry-only) outputs select final_columns[:-1] to leave it out.\n",
+    "final_columns = ['landvote_id','FIPS',\n",
+    "    'state_id','state','county',\n",
+    "    'city','jurisdiction','year','date',\n",
+    "    'description','finance_mechanism',\n",
+    "    'other_comments','purpose',\n",
+    "    'total_funds_at_stake',\n",
+    "    'conservation_funds_at_stake',\n",
+    "    'total_funds_approved',\n",
+    "    'conservation_funds_approved',\n",
+    "    'passed','status','percent_yes','percent_no',\n",
+    "    'notes','voted_acq_measure',\n",
+    "    'geom','h8']"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "# State boundaries from the hex parquet (state_h3_file).\n",
+    "state_z8 = con.read_parquet(state_h3_file)\n",
+    "\n",
+    "# State-level measures: the jurisdiction name becomes `state`, and the\n",
+    "# county / county_fips / city fields are filled with the literal string 'None'.\n",
+    "states = (\n",
+    "    landvote\n",
+    "    .filter(_.jurisdiction == \"State\")\n",
+    "    .rename(state=\"Jurisdiction Name\")\n",
+    "    .mutate(\n",
+    "        county=ibis.literal('None'),\n",
+    "        county_fips=ibis.literal('None'),\n",
+    "        city=ibis.literal('None'),\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "# Attach the hexes by case-insensitive state name.\n",
+    "landvote_state_z8 = (\n",
+    "    states\n",
+    "    .inner_join(state_z8, [states.state.upper() == state_z8.state.upper()])\n",
+    "    .select(final_columns)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "85f50f50-f56a-4b15-a1ad-5b87fe80dd54",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Non-hex version: same case-insensitive state-name join, but against state_file;\n",
+    "# final_columns[:-1] leaves out the trailing 'h8' column.\n",
+    "state_geo = con.read_parquet(state_file)\n",
+    "landvote_state_geo = (\n",
+    "    states\n",
+    "    .inner_join(state_geo, [states.state.upper() == state_geo.state.upper()])\n",
+    "    .select(final_columns[:-1])\n",
+    ")"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Case-insensitive pattern. Capture group 1 is the run of words directly before 'County';\n",
+    "# the 'of <name>' alternative captures into group 2, which the re_extract(..., 1) calls\n",
+    "# below never read (those matches yield an empty string for group 1).\n",
+    "county_match_pattern = r\"(?i)(?:(\\b[\\w-]+(?:\\s[\\w-]+)*)\\sCounty\\b|of\\s+([\\w-]+(?:\\s[\\w-]+)*))\"\n",
+    "# Manual overrides applied after the regex extraction: raw jurisdiction/county string -> join name.\n",
+    "# Some keys differ only by leading/trailing spaces or wording; the lookup below is done on the\n",
+    "# raw `county` value, so presumably each spelling seen in the data is listed - confirm.\n",
+    "county_vals = {'Columbus and Franklin County Metro Parks':'Franklin',\n",
+    "                ' Columbus and Franklin County Metro Parks':'Franklin',\n",
+    "                'Athens-Clarke County': 'Clarke',\n",
+    "                'City and County of San Francisco':'San Francisco',\n",
+    "                'Cleveland Metropolitan Park District':'Cuyahoga',\n",
+    "                'Denver City and County':'Denver',\n",
+    "                'East Baton Rouge Parish':'East Baton Rouge Parish',\n",
+    "                'Five Rivers MetroParks':'Montgomery',\n",
+    "                'Forest Preserve District of DuPage County':'DuPage',\n",
+    "                'Forest Preserve District of Kane County':'Kane',\n",
+    "                'Forest Preserves of Cook County':'Cook',\n",
+    "                'Great Parks of Hamilton County':'Hamilton',\n",
+    "                'Jacksonville':'Duval',\n",
+    "                'James City County': 'James City',\n",
+    "                'Johnny Appleseed Park District':'Allen',\n",
+    "                'Licking Park District':'Licking',\n",
+    "                'Matanuska-Susitna Borough':'Matanuska-Susitna Borough',\n",
+    "                'MetroParks of Butler County':'Butler',\n",
+    "                ' Metropolitan Park District of Toledo Area':'Lucas',\n",
+    "                'Metropolitan Park District of the Toledo Area':'Lucas',\n",
+    "                'Metropolitan Park District of Toledo Area':'Lucas',\n",
+    "                'Metropolitan Park District of Toledo Area ':'Lucas',\n",
+    "                'Park District of Ottawa County':'Ottawa',\n",
+    "                'Portage Park District':'Portage',\n",
+    "                'Preservation Park District of Delaware County':'Delaware',\n",
+    "                'Preservation Parks of Delaware County':'Delaware',\n",
+    "                'Santa Clara Valley Water District': 'Santa Clara',\n",
+    "                'St. Tammany Parish':'St. Tammany Parish',\n",
+    "                'Summit Metro Parks':'Summit'}\n",
+    "\n",
+    "# Hex county table with a join key `name`: regex extraction from `county` first,\n",
+    "# then the manual override wherever the raw `county` value is a key of county_vals.\n",
+    "county_z8 = (con.read_parquet(county_h3_file)\n",
+    "            .mutate(name=_.county.re_extract(county_match_pattern, 1).strip())\n",
+    "            .mutate(name = _.county.substitute(value = county_vals,else_= _.name))\n",
+    "             )\n",
     "\n",
+    "# County-level measures: the raw jurisdiction name becomes `county`, `city` is the literal\n",
+    "# string 'None', and `name` gets the same extraction + override treatment as county_z8\n",
+    "# (here the extraction runs on the existing `name` column rather than on `county`).\n",
     "counties = (landvote.filter(_.jurisdiction == \"County\")\n",
     "            .rename(county = \"Jurisdiction Name\")\n",
     "            .mutate(city = ibis.literal('None'))\n",
+    "            .mutate(name=_.name.re_extract(county_match_pattern, 1).strip())\n",
+    "            .mutate(name = _.county.substitute(value = county_vals,else_= _.name))\n",
+    "           )\n",
     "\n",
+    "# Join on the case-insensitive `name` key plus state_id, then keep the shared output columns.\n",
+    "landvote_county_z8 = (counties\n",
+    "    .inner_join(county_z8, [counties.name.upper() == county_z8.name.upper(), counties.state_id == county_z8.state_id])\n",
+    "    .select(final_columns)\n",
+    "                     )"
    ]
   },
   {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "96846068-4efa-4908-a48b-3208e08001ad",
    "metadata": {},
+   "outputs": [],
    "source": [
+    "# Non-hex version: read county_file and build the same `name` join key\n",
+    "# (regex extraction from `county`, then the county_vals overrides).\n",
+    "county_geo = (con.read_parquet(county_file)\n",
+    "                .mutate(name=_.county.re_extract(county_match_pattern, 1).strip())\n",
+    "                .mutate(name = _.county.substitute(value = county_vals,else_= _.name))\n",
+    "            )\n",
     "\n",
+    "# Join on case-insensitive `name` plus state_id; final_columns[:-1] leaves out the trailing 'h8'.\n",
+    "landvote_county_geo = (counties.inner_join(county_geo, [counties.name.upper() == county_geo.name.upper(), \n",
+    "                                                    counties.state_id == county_geo.state_id])\n",
+    "                   .select(final_columns[:-1])\n",
+    "                  )"
    ]
   },
   {
+   "cell_type": "markdown",
+   "id": "99be20da-2a62-4ece-97e5-69118f400c62",
+   "metadata": {},
    "source": [
+    "#### Special District Level\n"
    ]
   },
   {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4d5a1b9e-49b9-482b-8814-7cbcabb6daae",
    "metadata": {},
+   "outputs": [],
    "source": [
+    "# Case-insensitive list of whole words (plus the standalone digits 1 and 2) that are\n",
+    "# stripped from place / district names before matching.\n",
+    "sd_match_pattern = r\"(?i)\\b(city|town|CDP|CCD|village|charter|municipality|Borough|Park District|Authority|Basin|Mountains|2|1|District|Services|Special|Preservation|Assessment|Initiative|Open Space|Metro|Parks|Community|Recreation District)\\b\"\n",
+    "# Hex table read from city_h3_file with a cleaned `name` key:\n",
+    "# pattern words removed, whitespace runs collapsed to one space, ends trimmed.\n",
+    "sd_z8 = (\n",
+    "    con.read_parquet(city_h3_file)\n",
+    "    .mutate(name=_.city.re_replace(sd_match_pattern, \"\"))\n",
+    "    .mutate(name=_.name.re_replace(collapse_spaces, \" \").strip())\n",
+    ")\n",
+    "\n",
+    "# Manual overrides keyed on the raw jurisdiction name; the value replaces the cleaned `name`.\n",
+    "sd_vals = {'Tri-Lakes Park and Recreation District':'Monument',\n",
+    "           'Urban Drainage and Flood Control District':'Denver',\n",
+    "           'Blue Heron Recreation District':'Phoenix',\n",
+    "           'Mountains Recreation and Conservation Authority':'Santa Monica',\n",
+    "           'St. Helena Parish Recreation and Parks District':'Greensburg',\n",
+    "           'West Geauga Park and Recreation District':'Chardon',\n",
+    "           'Marin County Open Space District':'San Rafael',\n",
+    "          }\n",
+    "\n",
+    "# Special-district measures only. The raw jurisdiction name becomes `city`; `name` is cleaned\n",
+    "# the same way as sd_z8.name and then replaced by the sd_vals override when `city` is a key.\n",
+    "sd = (landvote.filter(_.jurisdiction == \"Special District\")\n",
+    "            .rename(city = \"Jurisdiction Name\")\n",
+    "            .mutate(name=_.name.re_replace(sd_match_pattern, \"\"))\n",
+    "            .mutate(name=_.name.re_replace(collapse_spaces, \" \").strip())\n",
+    "            .mutate(name=_.city.substitute(value=sd_vals, else_=_.name))\n",
+    "     )\n",
+    "\n",
+    "# Records whose notes appear to list several counties: the note contains the word\n",
+    "# 'counties' / 'Counties', or the substring 'County' occurs more than once\n",
+    "# (splitting on 'County' gives occurrences + 1 pieces).\n",
+    "multiple_counties_ = (\n",
+    "    sd\n",
+    "    .filter(~_.notes.isnull())\n",
+    "    .filter( \n",
+    "        (_.notes.contains(\"counties\")) |\n",
+    "        (_.notes.contains(\"Counties\")) |\n",
+    "        (_.notes.split(\"County\").length()-1>1) \n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "# Scalar Python UDF: pull every county named in a free-text note.\n",
+    "@ibis.udf.scalar.python\n",
+    "def extract_counties_udf(note: str) -> list[str]:\n",
+    "    \"\"\"Return the counties mentioned in `note` as '<Name> County' strings.\n",
+    "\n",
+    "    A match is a run of capitalized names (optionally separated by commas and/or 'and')\n",
+    "    that is immediately followed by the word County/Counties. Each run is split on the\n",
+    "    separators and every non-empty piece is suffixed with ' County'.\n",
+    "    \"\"\"\n",
+    "    county_run = re.compile(r\"((?:[A-Z][a-zA-Z.\\'-]*(?:\\s+[A-Z][a-zA-Z.\\'-]*)*)(?:,\\s*)?(?:\\s+and\\s+)?)+(?=\\s+(?:[Cc]ounty|[Cc]ounties))\")\n",
+    "    separators = re.compile(r',\\s*|\\s+and\\s+')\n",
+    "    found = []\n",
+    "    for hit in county_run.finditer(note):\n",
+    "        # group(0) is the whole run, e.g. several names joined by commas / 'and'\n",
+    "        for piece in separators.split(hit.group(0)):\n",
+    "            cleaned = piece.strip()\n",
+    "            if cleaned:\n",
+    "                found.append(f\"{cleaned} County\")\n",
+    "    return found\n",
+    "\n",
+    "# One output row per extracted county: the UDF's list is unnested and stored in `county`.\n",
+    "multiple_counties = (multiple_counties_\n",
+    "    .mutate(county_list=extract_counties_udf(_.notes))\n",
+    "    .unnest([\"county_list\"])\n",
+    "    .mutate(county=_.county_list)\n",
+    "    .drop(\"county_list\")\n",
+    ")\n",
+    "\n",
+    "# IDs are pulled into a Python list (this executes the query) so later filters can exclude them.\n",
+    "multiple_counties_ids = multiple_counties.select('landvote_id').distinct().execute()['landvote_id'].to_list()\n",
+    "\n",
+    "# Records with notes that were not handled above: take the '<Name> County' match from the notes;\n",
+    "# if that is empty, try the jurisdiction name (`city`) instead; drop rows where neither gives a county.\n",
+    "single_county_pattern = r'([A-Z][a-zA-Z]+(?:\\s[A-Z][a-zA-Z]*)*\\sCounty)\\.?'\n",
+    "single_county = (sd\n",
+    "    .filter(~_.notes.isnull())\n",
+    "    .filter(_.landvote_id.notin(multiple_counties_ids))\n",
+    "    .mutate(county=_.notes.re_extract(single_county_pattern, 1).strip())\n",
+    "    .mutate(county=_.county.cases(\n",
+    "        ('',_.city.re_extract(single_county_pattern, 1).strip()),\n",
+    "        else_ = _.county))\n",
+    "    .filter(_.county != '')\n",
+    ")\n",
+    "single_county_ids= single_county.select('landvote_id').distinct().execute()['landvote_id'].to_list()\n",
+    "\n",
+    "# Everything not resolved by the two steps above (the filter is on IDs only, so this covers\n",
+    "# records without notes as well as notes that yielded no county): join to the census-derived\n",
+    "# sd_z8 table on cleaned name + state_id to pick up the county and hexes.\n",
+    "manually_fill = (sd\n",
+    "    .filter(_.landvote_id.notin(multiple_counties_ids))\n",
+    "    .filter(_.landvote_id.notin(single_county_ids))\n",
+    "    .inner_join(sd_z8,[_.name.upper() == sd_z8.name.upper(),\n",
+    "                        _.state_id == sd_z8.state_id]) \n",
+    "    .select(final_columns)\n",
+    "    .distinct()\n",
+    ")\n",
+    "\n",
+    "# Manual fix-ups for extracted county strings that are not actual county names.\n",
+    "sd_county_vals = {'Western Summit County':'Summit County'}\n",
+    "sd_with_counties = single_county.union(multiple_counties).mutate(county=_.county.substitute(value=sd_county_vals, else_=_.county))\n",
+    "\n",
+    "# Records with a county from the notes are joined to the county hexes (case-insensitive county\n",
+    "# + state_id); county_z8 is de-duplicated first. The rows resolved through sd_z8 are appended.\n",
+    "landvote_sd_z8 = (sd_with_counties\n",
+    "                .inner_join(county_z8.distinct(), [sd_with_counties.county.upper() == county_z8.county.upper(), \n",
+    "                                                        sd_with_counties.state_id == county_z8.state_id])\n",
+    "                .select(final_columns)\n",
+    "                .union(manually_fill)\n",
+    "              )"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "63d8df3a-ddc8-49b3-9f3a-1762883472cb",
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Non-hex version: collapse the hex rows back to one row per record/county by dropping\n",
+    "# h8 and geom and de-duplicating, then join to county_geo on county + state_id.\n",
+    "# NOTE(review): this rebinds `sd`, which the special-district cell uses for the filtered\n",
+    "# landvote table - re-run that cell before reusing `sd` with its earlier meaning.\n",
+    "sd = landvote_sd_z8.drop('h8','geom').distinct()\n",
+    "landvote_sd_geo = (sd.inner_join(county_geo,[sd.county.upper() == county_geo.county.upper(), sd.state_id == county_geo.state_id])\n",
+    "    .select(final_columns[:-1]))\n"
    ]
   },
   {
    "cell_type": "markdown",
+   "id": "cca12e06-8d7f-4a50-906c-7dc02e370072",
    "metadata": {},
    "source": [
+    "#### Municipal level\n",
+    "\n",
+    "Because there isn't a one-to-one match between municipalities and Census data, we need to use both \"Places\" and \"Subdivisions\"."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "aa81e457-00ba-4b01-86d7-cf46bec04edd",
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Manual overrides: raw municipal jurisdiction name -> the `name` used to join with city_z8.\n",
+    "municipal_vals = {\n",
+    "    \"Addison\": \"Addison village\",\n",
+    "    \"Anderson Township Park District\": \"Anderson township\",\n",
+    "    \"Bainbridge Island Metropolitan Park & Recreation District\": \"Bainbridge Island\",\n",
+    "    \"Bainbridge Island Metropolitan Park and Recreation District \": \"Bainbridge Island\",\n",
+    "    \"Bel-Ridge\": \"Bel-Ridge village\",\n",
+    "    \"Bend Park and Recreation District\": \"Bend\",\n",
+    "    \"Boardman Township Park District\": \"Boardman township\",\n",
+    "    \"Carney's Point Township\": \"Carneys Point township\",\n",
+    "    \"Castro Valley\": \"Castro Valley CDP\",\n",
+    "    \"Charter Township of Meridian\": \"Meridian township\",\n",
+    "    \"Charter Township of Oakland\": \"Oakland township\",\n",
+    "    \"Corrales\": \"Corrales village\",\n",
+    "    \"Dobbs Ferry\": \"Dobbs Ferry village\",\n",
+    "    \"Downers Grove Park District\": \"Downers Grove village\",\n",
+    "    \"Gates Mills\": \"Gates Mills village\",\n",
+    "    \"Glen Ellyn Park District\": \"Glen Ellyn village\",\n",
+    "    \"Hillsborough\": \"Hillsborough township\",\n",
+    "    \"Irvington\": \"Irvington village\",\n",
+    "    \"Lake Zurich\": \"Lake Zurich village\",\n",
+    "    \"Lake in the Hills\": \"Lake in the Hills village\",\n",
+    "    \"Libertyville\": \"Libertyville township\",\n",
+    "    \"Loch Arbor Village\": \"Loch Arbour Village\",\n",
+    "    \"Lockport Township Park District\": \"Lockport township\",\n",
+    "    \"Moapa\": \"Moapa CDP\",\n",
+    "    \"Nunda\": \"Nunda township\",\n",
+    "    \"Orland Park\": \"Orland Park village\",\n",
+    "    \"Park Ridge Recreation and Park District\": \"Park Ridge\",\n",
+    "    \"Peapack-Gladstone Borough\": \"Peapack and Gladstone\",\n",
+    "    \"Princeton Township\": \"Princeton\",\n",
+    "    \"Romeoville\": \"Romeoville village\",\n",
+    "    \"San Diego Open Space Park Facilities District No. 1\": \"San Diego\",\n",
+    "    \"Seattle Park District\": \"Seattle\",\n",
+    "    \"Stookey\": \"Stookey township\",\n",
+    "    \"Tarrytown\": \"Tarrytown village\",\n",
+    "    \"Tofte\": \"Tofte township\",\n",
+    "    \"Village of Corrales\": \"Corrales village\",\n",
+    "    \"Village of Lake Barrington\": \"Lake Barrington village\",\n",
+    "    \"Village of Los Ranchos de Albuquerque\": \"Los Ranchos de Albuquerque village\",\n",
+    "    \"West Paterson Borough\": \"Woodland Park\",\n",
+    "    \"Westampton\": \"Westampton township\",\n",
+    "    \"Willamalane Park and Recreation District\": \"Springfield\",\n",
+    "    \"Wilmette Park District\": \"Wilmette village\",  \n",
+    "}\n",
+    "# Pattern for runs of whitespace, collapsed to a single space below.\n",
+    "# NOTE(review): collapse_spaces is also referenced by cells above this one - confirm it is\n",
+    "# defined before them too, otherwise a fresh Restart & Run All fails there.\n",
+    "collapse_spaces = r\"\\s+\"\n",
+    "# Hex city table with a cleaned `name` key (match_pattern words removed, spaces collapsed).\n",
+    "# NOTE(review): match_pattern is not assigned in this cell and is reassigned by a later cell,\n",
+    "# so the result here depends on which cells ran before - confirm the intended value.\n",
+    "city_z8 = (\n",
+    "    con.read_parquet(city_h3_file)\n",
+    "    .mutate(name=_.city.re_replace(match_pattern, \"\"))\n",
+    "    .mutate(name=_.name.re_replace(collapse_spaces, \" \").strip())\n",
+    ")\n",
+    "\n",
+    "# filter to only cities; the raw jurisdiction name becomes `city`, and `name` is\n",
+    "# space-normalized and then replaced by the municipal_vals override when `city` is a key\n",
+    "municipals = (landvote.filter(_.jurisdiction == \"Municipal\")\n",
+    "            .rename(city = \"Jurisdiction Name\")\n",
+    "            .mutate(name=_.name.re_replace(collapse_spaces, \" \").strip())\n",
+    "            .mutate(name = _.city.substitute(value = municipal_vals, else_= _.name))\n",
+    "             )\n",
+    "\n",
+    "# join with census data on case-insensitive name + state_id\n",
+    "city_joined = (municipals.inner_join(city_z8, [municipals.name.upper() == city_z8.name.upper(), \n",
+    "                                                municipals.state_id == city_z8.state_id]).select(final_columns))\n",
     "\n",
+    "# handling cities with multiple counties: ignoring h8/geom, an ID with more than one\n",
+    "# distinct joined row matched more than one census record\n",
+    "dupes = city_joined.drop('h8','geom').distinct().group_by(\"landvote_id\").agg(county_count = _.count()).filter(_.county_count > 1)\n",
+    "duplicate_ids = dupes.execute()['landvote_id'].to_list()\n",
     "\n",
+    "# For those ambiguous records, read the county from the notes (105 already filled in, per the\n",
+    "# original author). The pattern is anchored, so the whole note must be '<Name> County' or\n",
+    "# '<Name> Co' (optional trailing period); ' County' is appended to the captured name and rows\n",
+    "# where nothing was captured are dropped.\n",
+    "pattern = r'^\\s*([A-Z][a-z]+(?:\\s[A-Z][a-z]+)*)\\s(?:County|Co)\\.?\\s*$'\n",
+    "counties_filled = (municipals.filter(_.landvote_id.isin(duplicate_ids))\n",
+    "                    .filter(~_.notes.isnull())\n",
+    "                    .mutate(county=_.notes.re_extract(pattern, 1).strip()+ ibis.literal(' County'))\n",
+    "                    .filter(_.county !=' County')\n",
+    "                  )\n",
     "\n",
+    "# since we added the county, join it with the rest of the census data,\n",
+    "# this time matching on county as well as name and state_id\n",
+    "counties_filled_join = (counties_filled\n",
+    "    .inner_join(city_z8,[counties_filled.name.upper() == city_z8.name.upper(),\n",
+    "                          counties_filled.county.upper() == city_z8.county.upper(), \n",
+    "                          counties_filled.state_id == city_z8.state_id])\n",
+    "    .select(final_columns))\n",
     "\n",
+    "counties_filled_ids = counties_filled_join.select('landvote_id').distinct().execute()['landvote_id'].to_list()\n",
     "\n",
+    "# final municipal table: the name-only matches minus the IDs resolved through the notes,\n",
+    "# plus the county-disambiguated rows, de-duplicated\n",
+    "landvote_city_z8 = city_joined.filter(~_.landvote_id.isin(counties_filled_ids)).union(counties_filled_join).distinct()"
    ]
   },
   {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c751dabd-b39a-4c54-b9ed-17d2b0cb32ea",
    "metadata": {},
+   "outputs": [],
    "source": [
+    "# Words stripped from city names before matching (case-insensitive).\n",
+    "# NOTE(review): this reassigns match_pattern, which earlier cells also read - re-running\n",
+    "# those cells after this one uses this value instead of the earlier one.\n",
+    "match_pattern = r\"(?i)\\b(city|town|charter|municipality|[Bb]orough)\\b\"\n",
+    "\n",
+    "# Non-hex city table: city_fips joined to county_geo on FIPS, with the '_right' duplicate\n",
+    "# columns and the old `name` dropped, then a cleaned `name` key rebuilt from `city`.\n",
+    "# (city_fips and the selector module `s` are defined outside this cell.)\n",
+    "city_geo = (city_fips.inner_join(county_geo, 'FIPS').select(~s.endswith('_right')).drop('name')\n",
+    "    .mutate(name=_.city.re_replace(match_pattern, \"\"))\n",
+    "    .mutate(name=_.name.re_replace(collapse_spaces, \" \").strip()))\n",
+    "\n",
+    "# Records whose county was read from the notes: rebuild `name` with this cell's pattern,\n",
+    "# apply the municipal_vals overrides, and match on name + county + state_id.\n",
+    "municipals_counties = (counties_filled\n",
+    "    .mutate(name=_.city.re_replace(match_pattern, \"\"))\n",
+    "    .mutate(name=_.name.re_replace(collapse_spaces, \" \").strip())\n",
+    "    .mutate(name = _.city.substitute(value = municipal_vals, else_= _.name))\n",
+    "    .inner_join(city_geo,[_.name.upper() == city_geo.name.upper(),\n",
+    "                          _.county.upper() == city_geo.county.upper(), \n",
+    "                          _.state_id == city_geo.state_id])\n",
+    "    .select(final_columns[:-1])\n",
+    "                      )\n",
+    "\n",
+    "# All remaining municipal records: same name cleaning, matched on name + state_id only.\n",
+    "other_municipals = (municipals.filter(~_.landvote_id.isin(counties_filled_ids))\n",
+    "    .mutate(name=_.city.re_replace(match_pattern, \"\"))\n",
+    "    .mutate(name=_.name.re_replace(collapse_spaces, \" \").strip())\n",
+    "    .mutate(name = _.city.substitute(value = municipal_vals, else_= _.name))\n",
+    "    .inner_join(city_geo,[_.name.upper() == city_geo.name.upper(),_.state_id == city_geo.state_id])\n",
+    "    .select(final_columns[:-1]))\n",
+    "\n",
+    "# Combine both groups and de-duplicate.\n",
+    "landvote_city_geo = municipals_counties.union(other_municipals).distinct() "
    ]
   },
   {
    "cell_type": "markdown",
+   "id": "6d52d97b-ad14-4d04-89a8-79a813f80353",
    "metadata": {},
    "source": [
+    "#### Joining all the landvote data with census\n",
+    "Note: `landvote_joined` has more unique rows than `landvote` because some cities/special districts span multiple counties. Each additional county creates a new row."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "6d481736-82f3-4be2-b5af-6280be5e9d75",
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
+    "# Stack the four jurisdiction levels and publish the hex version.\n",
+    "landvote_joined_z8 = landvote_city_z8.union(\n",
+    "    landvote_county_z8, landvote_sd_z8, landvote_state_z8\n",
+    ")\n",
+    "landvote_joined_z8.to_parquet(\"s3://shared-tpl/landvote/z8/landvote_h3_z8.parquet\")\n",
+    "\n",
+    "# Same stacking for the non-hex (geometry-only) tables.\n",
+    "landvote_joined_geo = landvote_city_geo.union(\n",
+    "    landvote_county_geo, landvote_sd_geo, landvote_state_geo\n",
+    ")\n",
+    "landvote_joined_geo.to_parquet(\"s3://shared-tpl/landvote/landvote_geom.parquet\")"
    ]
   },
   {
+   "cell_type": "markdown",
+   "id": "066e4fdd-f069-4d5d-b2be-f10124cfe19c",
    "metadata": {},
    "source": [
+    "#### Generate PMTiles"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "100f0fd8-9588-4f65-a657-52bd5b942089",
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Presigned GET URL (valid for 2 hours) for the non-hex landvote parquet in the shared-tpl bucket.\n",
+    "parquet = client.get_presigned_url(\n",
+    "    \"GET\",\n",
+    "    \"shared-tpl\",\n",
+    "    \"landvote/landvote_geom.parquet\",\n",
+    "    expires=timedelta(hours=2),\n",
+    ")\n",
+    "# parquet -> local GeoJSON -> local PMTiles -> copy to the bucket.\n",
+    "# (to_geojson / to_pmtiles / s3_cp are helpers defined outside this notebook section;\n",
+    "# the return value kept in `pmtiles` is not used afterwards in this cell.)\n",
+    "to_geojson(parquet, \"landvote_geom.geojson\")\n",
+    "pmtiles = to_pmtiles(\"landvote_geom.geojson\", \"landvote_geom.pmtiles\")\n",
+    "s3_cp('landvote_geom.pmtiles', \"s3://shared-tpl/landvote/landvote_geom.pmtiles\", \"minio\")"
    ]
   }
  ],