cassiebuhler commited on
Commit
5ffe768
·
1 Parent(s): 352224e

cleaning up landvote

Browse files
Files changed (1) hide show
  1. preprocessing/hexes.ipynb +456 -282
preprocessing/hexes.ipynb CHANGED
@@ -18,12 +18,14 @@
18
  "from cng.utils import *\n",
19
  "from cng.h3 import *\n",
20
  "from ibis import _\n",
 
21
  "import os\n",
22
  "from osgeo import gdal\n",
23
  "from minio import Minio\n",
24
  "import streamlit \n",
25
  "from datetime import timedelta\n",
26
  "import geopandas as gpd\n",
 
27
  "\n",
28
  "# Get signed URLs to access license-controlled layers\n",
29
  "key = st.secrets[\"MINIO_KEY\"]\n",
@@ -37,6 +39,22 @@
37
  "set_secrets(con)"
38
  ]
39
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  {
41
  "cell_type": "markdown",
42
  "id": "0b086a1a-af23-487b-923d-fca595a19111",
@@ -52,7 +70,7 @@
52
  "metadata": {},
53
  "outputs": [],
54
  "source": [
55
- "def h3_from_geom(con, name, cols, zoom = 8):\n",
56
  " \"\"\"\n",
57
  " Computes hexes directly from geometry.\n",
58
  " \"\"\"\n",
@@ -70,7 +88,7 @@
70
  " SELECT {cols}, UNNEST(h{zoom}) AS h{zoom},\n",
71
  " ST_GeomFromText(h3_cell_to_boundary_wkt(UNNEST(h{zoom}))) AS geom\n",
72
  " FROM t2\n",
73
- " ''').to_parquet(f\"{name}_h3_z{zoom}.parquet\")\n",
74
  " return "
75
  ]
76
  },
@@ -98,100 +116,93 @@
98
  " expires=timedelta(hours=2),\n",
99
  ")\n",
100
  "\n",
101
- "cols = ['fid', 'TPL_ID', 'State', 'County', 'Municipality',\n",
102
- " 'Site_Name', 'Reported_Acres', 'Close_Year', 'Close_Date', 'Owner_Name',\n",
103
- " 'Owner_Type', 'Manager_Name', 'Manager_Type', 'Purchase_Type',\n",
104
- " 'EasementHolder_Name', 'EasementHolder_Type', 'Public_Access_Type',\n",
105
- " 'Purpose_Type', 'Duration_Type', 'Data_Provider', 'Data_Source',\n",
106
- " 'Source_Date', 'Data_Aggregator', 'Comments', 'Amount', 'Program_ID',\n",
107
- " 'Program_Name', 'Sponsor_ID', 'Sponsor_Name', 'Sponsor_Type']\n",
108
  "\n",
 
109
  "\n",
110
  "tpl_table = (con.read_parquet(tpl)\n",
111
- " .mutate(geom = _.geom.convert(\"ESRI:102039\", \"EPSG:4326\"))\n",
112
- " )\n",
113
- "\n",
 
 
 
 
 
 
 
 
114
  "con.create_table('tpl', tpl_table, overwrite=True)\n",
115
- "h3_from_geom(con, 'tpl', cols)\n",
116
- "\n",
117
- "client.fput_object(bucket_name = \"shared-tpl\",\n",
118
- " object_name = \"tpl_h3_z8.parquet\",\n",
119
- " file_path = \"tpl_h3_z8.parquet\") "
120
  ]
121
  },
122
  {
123
  "cell_type": "markdown",
124
- "id": "3f00cfe9-520c-4839-aeed-46a83b11ecce",
125
  "metadata": {},
126
  "source": [
127
- "# Census\n",
128
- "\n",
129
- "Getting polygons and FIPS codes from Census state, county, place, and subdivision data. \n",
130
- "\n"
131
  ]
132
  },
133
  {
134
  "cell_type": "code",
135
  "execution_count": null,
136
- "id": "aec60670-4abc-4fc4-86ca-88a77ba69d39",
137
  "metadata": {},
138
  "outputs": [],
139
  "source": [
140
- "state_url = \"s3://public-census/2024/state/2024_us_state.parquet\"\n",
141
- "county_url = \"s3://public-census/2024/county/2024_us_county.parquet\"\n",
142
- "\n",
143
- "state_file = '2024_us_state_h3_z8.parquet'\n",
144
- "county_temp_file = '2024_us_county_h3_z8_temp.parquet'\n",
145
- "county_file = '2024_us_county_h3_z8.parquet'\n",
146
- "city_file = '2024_us_places_subdivisions_h3_z8.parquet'"
147
  ]
148
  },
149
  {
150
  "cell_type": "markdown",
151
- "id": "7cd589ad-5b03-41de-8936-c20091a937e1",
152
  "metadata": {},
153
  "source": [
154
- "#### State"
 
 
 
155
  ]
156
  },
157
  {
158
- "cell_type": "code",
159
- "execution_count": null,
160
- "id": "86d2ed94-740f-49cd-a041-50401b7c7984",
161
  "metadata": {},
162
- "outputs": [],
163
  "source": [
164
- "# convert shape file to parquet \n",
165
- "gdf = gpd.read_file('tl_2024_us_state.shp').to_crs('epsg:4326').rename_geometry('geom').rename(columns={\"GEOID\": \"FIPS\", \"STUSPS\":\"state\", \"NAME\":\"name\"})\n",
166
- "con.create_table('state_wkt', gdf, overwrite=True)\n",
167
- "\n",
168
- "# get geom (duckdb turns geodataframes into wkt)\n",
169
- "con.sql(\"\"\"\n",
170
- "SELECT * EXCLUDE geom,\n",
171
- " ST_GeomFromWKB(geom) AS geom\n",
172
- "FROM state_wkt\n",
173
- "\"\"\").to_parquet(state_url)\n",
174
- "\n",
175
- "# convert to h3\n",
176
- "con.read_parquet(state_url, table_name = 'state')\n",
177
- "cols = ['STATE','name','FIPS']\n",
178
- "h3_from_geom(con, 'state', cols)\n",
179
- "\n",
180
- "# save file \n",
181
- "client.fput_object(bucket_name = \"public-census\",\n",
182
- " file_path = \"state_h3_z8.parquet\",\n",
183
- " object_name = f\"2024/state/{state_file}\") "
184
  ]
185
  },
186
  {
187
  "cell_type": "code",
188
  "execution_count": null,
189
- "id": "a39905b6-b0b6-46d0-a035-cd1ac0f43d53",
190
- "metadata": {},
 
 
191
  "outputs": [],
192
  "source": [
193
- "# grabbing state abbreviations for later\n",
194
- "state_ids = con.read_parquet(state_url).select('name','state','FIPS').rename(state_name = 'name')"
 
 
 
 
 
 
 
 
 
 
195
  ]
196
  },
197
  {
@@ -205,58 +216,39 @@
205
  {
206
  "cell_type": "code",
207
  "execution_count": null,
208
- "id": "96f3d43f-9ab8-4f3e-a981-51c5eca6f848",
209
  "metadata": {},
210
  "outputs": [],
211
  "source": [
212
  "%%time\n",
213
- "# convert shape to parquet \n",
214
- "gdf = gpd.read_file('tl_2024_us_county.shp').to_crs('epsg:4326').rename_geometry('geom').drop('NAME',axis =1).rename(columns={\"GEOID\": \"FIPS\", \"NAMELSAD\":\"name\"})[['geom','name','FIPS','STATEFP']]\n",
215
- "con.create_table('county_wkt', gdf, overwrite=True)\n",
216
- "\n",
217
- "# convert to geom (duckdb turns geodataframes into wkt)\n",
218
- "con.sql(\"\"\"\n",
219
- "SELECT * EXCLUDE geom,\n",
220
- " ST_GeomFromWKB(geom) AS geom\n",
221
- "FROM county_wkt\n",
222
- "\"\"\").to_parquet(county_url)\n",
223
- "\n",
224
- "# convert to h3\n",
225
- "con.read_parquet(county_url, table_name = 'county')\n",
226
- "cols = ['name','FIPS','STATEFP']\n",
227
- "h3_from_geom(con, 'county', cols)\n",
228
- "\n",
229
- "# save file \n",
230
- "client.fput_object(bucket_name = \"public-census\",\n",
231
- " file_path = \"county_h3_z8.parquet\",\n",
232
- " object_name = f\"2024/county/{county_temp_file}\") \n",
233
- "\n"
234
- ]
235
- },
236
- {
237
- "cell_type": "code",
238
- "execution_count": null,
239
- "id": "3b3a5023-45d7-4039-a078-4901ebdd3e10",
240
- "metadata": {},
241
- "outputs": [],
242
- "source": [
243
- "# get a non hex version of counties to use as bounds in tpl app\n",
244
- "temp = con.read_parquet(county_url)\n",
245
- "(temp.left_join(state_ids, [temp.STATEFP == state_ids.FIPS]).drop('FIPS_right','STATEFP')\n",
246
- " .rename(county = 'name').select('FIPS','state','state_name','county','geom')\n",
247
- ").to_parquet(county_url)"
248
- ]
249
- },
250
- {
251
- "cell_type": "code",
252
- "execution_count": null,
253
- "id": "a7829daf-8333-4e35-83f6-0a2bbcd174fa",
254
- "metadata": {},
255
- "outputs": [],
256
- "source": [
257
- "# get state abbreviations for counties\n",
258
- "county_geo = con.read_parquet(f\"s3://public-census/2024/county/{county_temp_file}\")\n",
259
- "county_geo.left_join(state_ids, [county_geo.STATEFP == state_ids.FIPS]).drop('FIPS_right','STATEFP').to_parquet(f\"s3://public-census/2024/county/{county_file}\")\n"
260
  ]
261
  },
262
  {
@@ -277,26 +269,30 @@
277
  "outputs": [],
278
  "source": [
279
  "match_pattern = r\"(?i)\\s*(city|town|village|charter|municipality|Borough)\\b\"\n",
280
- "city_cols = [\"state\",\"county\",\"FIPS\",\"name\",'city']\n",
 
 
281
  "\n",
282
  "places_url = \"https://www2.census.gov/geo/docs/reference/codes2020/national_place_by_county2020.txt\"\n",
283
  "places_fips = (con.read_csv(places_url)\n",
284
- " .rename(state = \"STATE\", county = \"COUNTYNAME\", city = \"PLACENAME\")\n",
285
  " .mutate(name=_.city.re_replace(match_pattern, \"\").strip())\n",
286
  " .mutate(FIPS = _.STATEFP + _.COUNTYFP)\n",
287
  " .select(city_cols))\n",
288
  "\n",
289
  "subdivisions_url = \"https://www2.census.gov/geo/docs/reference/codes2020/national_cousub2020.txt\"\n",
290
  "subdivisions_fips = (con.read_csv(subdivisions_url)\n",
291
- " .rename(state = \"STATE\", county = \"COUNTYNAME\", city = \"COUSUBNAME\")\n",
292
  " .mutate(name=_.city.re_replace(match_pattern, \"\").strip())\n",
293
  " .mutate(FIPS = _.STATEFP + _.COUNTYFP)\n",
294
  " .select(city_cols))\n",
295
  "\n",
296
- "city_fips = places_fips.union(subdivisions_fips).distinct() #get unique -> some cities are listed in both places and subdivisions\n",
297
- "city_geo = city_fips.left_join(county_geo, 'FIPS').drop('FIPS_right','name_right','city') #get h3 from counties \n",
298
- "city_joined = city_geo.left_join(state_ids, [city_geo.STATEFP == state_ids.FIPS]).drop('FIPS_right','STATEFP','state_right')# get state ids \n",
299
- "city_joined.to_parquet(f\"s3://public-census/2024/places_subdivisions/{city_file}\")\n"
 
 
300
  ]
301
  },
302
  {
@@ -312,8 +308,9 @@
312
  "- First, need to split up landvote into its 3 jurisdictions: state, county, and municipals\n",
313
  "- Join states with Census \"states\" to get state FIPS/hex\n",
314
  "- Join counties with Census \"counties\" to get county FIPS/hex\n",
 
315
  "- Join municipals with Census \"places\" and \"subdivisions\" to get county FIPS/hex\n",
316
- "- Then join all municipal, county, and state data back together!\n",
317
  "\n"
318
  ]
319
  },
@@ -327,44 +324,43 @@
327
  "landvote_csv = client.get_presigned_url(\n",
328
  " \"GET\",\n",
329
  " \"shared-tpl\",\n",
330
- " \"landvote.csv\",\n",
331
  " expires=timedelta(hours=2),\n",
332
  ")\n",
333
- "\n",
334
- "match_pattern = r\"(?i)\\s*(city|town|village|charter|municipality|Borough)\\b\"\n",
335
- "landvote = (con.read_csv(landvote_csv, ignore_errors=True)\n",
336
- " .rename(jurisdiction = \"Jurisdiction Type\", state = \"State\")\n",
337
- " .mutate(state = _.state.substitute({'Ore':'OR'}))\n",
338
  " .mutate(name=_['Jurisdiction Name'].re_replace(match_pattern, \"\").strip())\n",
339
- " .mutate(landvote_id=ibis.row_number().over())\n",
 
340
  " .mutate(_['Conservation Funds Approved'].replace('$', '')\n",
341
- " .replace(',', '').cast('float').name('Conservation Funds Approved')))\n",
342
- "\n",
343
- "\n",
344
- "final_columns = ['landvote_id',\n",
345
- " 'FIPS',\n",
346
- " 'state',\n",
347
- " 'state_name',\n",
348
- " 'county',\n",
349
- " 'city',\n",
350
- " 'jurisdiction',\n",
351
- " 'Date',\n",
352
- " 'Description',\n",
353
- " 'Finance Mechanism',\n",
354
- " '\"Other\" Comment',\n",
355
- " 'Purpose',\n",
356
- " 'Total Funds at Stake',\n",
357
- " 'Conservation Funds at Stake',\n",
358
- " 'Total Funds Approved',\n",
359
- " 'Conservation Funds Approved',\n",
360
- " 'Pass?',\n",
361
- " 'Status',\n",
362
- " '% Yes',\n",
363
- " '% No',\n",
364
- " 'Notes',\n",
365
- " 'Voted Acq. Measure',\n",
366
- " 'geom',\n",
367
- " 'h8']"
368
  ]
369
  },
370
  {
@@ -382,17 +378,27 @@
382
  "metadata": {},
383
  "outputs": [],
384
  "source": [
385
- "state_geo = con.read_parquet(f\"s3://public-census/2024/state/{state_file}\")\n",
386
  "states = (landvote.filter(_.jurisdiction == \"State\")\n",
387
- " .rename(state_name = \"Jurisdiction Name\")\n",
388
  " .mutate(county = ibis.literal('None'))\n",
389
  " .mutate(county_fips = ibis.literal('None'))\n",
390
  " .mutate(city = ibis.literal('None')))\n",
391
  "\n",
392
- "landvote_state = (states.left_join(state_geo, [states.name.upper() == state_geo.name.upper()])\n",
393
- " .select(final_columns))\n",
394
- "\n",
395
- "#adding state ID and state name from the county/city\n"
 
 
 
 
 
 
 
 
 
 
396
  ]
397
  },
398
  {
@@ -410,212 +416,380 @@
410
  "metadata": {},
411
  "outputs": [],
412
  "source": [
413
- "county_geo = con.read_parquet(f\"s3://public-census/2024/county/{county_file}\")\n",
414
- "\n",
415
- "county_match_pattern = r\"(?i)\\s*(County)\\b\"\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
416
  "\n",
417
  "counties = (landvote.filter(_.jurisdiction == \"County\")\n",
418
  " .rename(county = \"Jurisdiction Name\")\n",
419
  " .mutate(city = ibis.literal('None'))\n",
420
- " .mutate(name=_.name.re_replace(county_match_pattern, \"\").strip()))\n",
 
 
421
  "\n",
422
- "landvote_county = (counties.left_join(county_geo, [counties.name.upper() == county_geo.name.upper(), \n",
423
- " counties.state == county_geo.state])\n",
424
- " .select(final_columns))"
 
425
  ]
426
  },
427
  {
428
- "cell_type": "markdown",
429
- "id": "cca12e06-8d7f-4a50-906c-7dc02e370072",
 
430
  "metadata": {},
 
431
  "source": [
432
- "#### Municipal level\n",
 
 
 
 
433
  "\n",
434
- "Because there isn't a 1 to 1 match from municipals to Census data, we need to use both \"Places\" and \"Subdivisions\". "
 
 
 
435
  ]
436
  },
437
  {
438
- "cell_type": "code",
439
- "execution_count": null,
440
- "id": "aa81e457-00ba-4b01-86d7-cf46bec04edd",
441
- "metadata": {
442
- "scrolled": true
443
- },
444
- "outputs": [],
445
  "source": [
446
- "city_geo = con.read_parquet(f\"s3://public-census/2024/places_subdivisions/{city_file}\")\n",
447
- "\n",
448
- "municipals = landvote.filter(_.jurisdiction == \"Municipal\").rename(city = \"Jurisdiction Name\")\n",
449
- "\n",
450
- "landvote_city = (municipals.left_join(city_geo, [municipals.name.upper() == city_geo.name.upper(), \n",
451
- " municipals.state == city_geo.state])\n",
452
- " .inner_join(state_ids, [municipals.state == state_ids.state])\n",
453
- " )"
454
  ]
455
  },
456
  {
457
- "cell_type": "markdown",
458
- "id": "6d52d97b-ad14-4d04-89a8-79a813f80353",
 
459
  "metadata": {},
 
460
  "source": [
461
- "#### Joining all the landvote data with census\n",
462
- "Note: `landvote_joined` has more rows than `landvote` because some cities span multiple counties. Each additional county creates a new row."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
463
  ]
464
  },
465
  {
466
  "cell_type": "code",
467
  "execution_count": null,
468
- "id": "6d481736-82f3-4be2-b5af-6280be5e9d75",
469
  "metadata": {},
470
  "outputs": [],
471
  "source": [
472
- "landvote_joined = landvote_city.union(landvote_county).union(landvote_state)\n",
473
- "landvote_joined.to_parquet(\"s3://shared-tpl/landvote_h3_z8.parquet\")"
 
474
  ]
475
  },
476
  {
477
  "cell_type": "markdown",
478
- "id": "4fbdacae-28cf-4335-b895-f1335749a6e3",
479
  "metadata": {},
480
  "source": [
481
- "#### And get a non-hex version of landvote"
 
 
482
  ]
483
  },
484
  {
485
  "cell_type": "code",
486
  "execution_count": null,
487
- "id": "67bdc9d2-f195-427b-b1c6-08f1f5f90ac7",
488
  "metadata": {},
489
  "outputs": [],
490
  "source": [
491
- "import ibis.selectors as s\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
492
  "\n",
493
- "state_geo = con.read_parquet(state_url)\n",
494
- "landvote_state = (states.left_join(state_geo, [states.name.upper() == state_geo.name.upper()])).select(final_columns[:-1])\n",
 
495
  "\n",
496
- "county_geo = con.read_parquet(county_url)\n",
497
- "landvote_county = (counties.left_join(county_geo, [counties.name.upper() == county_geo.county.upper(), \n",
498
- " counties.state == county_geo.state])).select(final_columns[:-1])\n",
 
 
 
 
499
  "\n",
500
- "city_fips = places_fips.union(subdivisions_fips).distinct() #get unique -> some cities are listed in both places and subdivisions\n",
501
- "city_geo = city_fips.left_join(county_geo, 'FIPS').select(~s.endswith('_right'))\n",
 
 
 
 
502
  "\n",
503
- "landvote_city = (municipals.left_join(city_geo, [municipals.name.upper() == city_geo.name.upper(), \n",
504
- " municipals.state == city_geo.state])\n",
505
- " ).select(final_columns[:-1])\n",
506
  "\n",
507
- "landvote_joined = landvote_city.union(landvote_county).union(landvote_state)\n",
508
- "landvote_joined.to_parquet(\"s3://shared-tpl/landvote_geom.parquet\")"
509
  ]
510
  },
511
  {
512
- "cell_type": "markdown",
513
- "id": "f41a207c-d560-4dab-8ec6-3fee24cb880b",
 
514
  "metadata": {},
 
515
  "source": [
516
- "# Join TPL Almanac with Landvote"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
517
  ]
518
  },
519
  {
520
  "cell_type": "markdown",
521
- "id": "ac8e6648-d3ad-4e63-81a0-6f78f3b7584c",
522
  "metadata": {},
523
  "source": [
524
- "- Joining the data\n",
525
- "- Generate pmtiles -> converting h8 back to original polygons"
526
  ]
527
  },
528
  {
529
  "cell_type": "code",
530
  "execution_count": null,
531
- "id": "d2359bf0-1f3b-4af4-a96f-301dde56d81d",
532
- "metadata": {},
 
 
533
  "outputs": [],
534
  "source": [
535
- "# joining data\n",
536
- "landvote_parquet = client.get_presigned_url(\n",
537
- " \"GET\",\n",
538
- " \"shared-tpl\",\n",
539
- " \"landvote_h3_z8.parquet\",\n",
540
- " expires=timedelta(hours=2),\n",
541
- ")\n",
542
  "\n",
543
- "landvote = (con.read_parquet(landvote_parquet)\n",
544
- " .rename(FIPS_county = \"FIPS\",\n",
545
- " measure_status = \"Status\", measure_purpose = \"Purpose\",measure_amount = 'Conservation Funds Approved')\n",
546
- " .mutate(measure_year = _.Date.year()).drop('Date','geom'))\n",
547
  "\n",
548
- "tpl_parquet = client.get_presigned_url(\n",
549
- " \"GET\",\n",
550
- " \"shared-tpl\",\n",
551
- " \"tpl_h3_z8.parquet\",\n",
552
- " expires=timedelta(hours=2),\n",
553
- ")\n",
554
- "\n",
555
- "tpl_drop_cols = ['Reported_Acres','Close_Date','EasementHolder_Name',\n",
556
- " 'Data_Provider','Data_Source','Data_Aggregator',\n",
557
- " 'Program_ID','Sponsor_ID']\n",
558
- "tpl = con.read_parquet(tpl_parquet).mutate(h8 = _.h8.lower()).drop(tpl_drop_cols)\n",
559
- " \n",
560
- "\n",
561
- "select_cols = ['fid','TPL_ID','landvote_id',\n",
562
- "'state','state_name','county',\n",
563
- " 'FIPS_county','city','jurisdiction',\n",
564
- " 'Close_Year', 'Site_Name',\n",
565
- " 'Owner_Name','Owner_Type',\n",
566
- " 'Manager_Name','Manager_Type',\n",
567
- " 'Purchase_Type','EasementHolder_Type',\n",
568
- " 'Public_Access_Type','Purpose_Type',\n",
569
- " 'Duration_Type','Amount',\n",
570
- " 'Program_Name','Sponsor_Name',\n",
571
- " 'Sponsor_Type','measure_year',\n",
572
- " 'measure_status','measure_purpose',\n",
573
- " 'measure_amount']\n",
574
- "\n",
575
- "# joining all data\n",
576
- "database = (\n",
577
- " tpl.drop('State','County')\n",
578
- " .left_join(landvote, \"h8\").drop('h8_right')\n",
579
- ").select(select_cols).distinct()\n"
580
  ]
581
  },
582
  {
583
- "cell_type": "code",
584
- "execution_count": null,
585
- "id": "8c3d5165-1b9f-4faf-9291-855d51698adc",
586
  "metadata": {},
587
- "outputs": [],
588
  "source": [
589
- "# getting original polygons back \n",
590
- "tpl_geom_url = client.get_presigned_url(\n",
591
- " \"GET\",\n",
592
- " \"shared-tpl\",\n",
593
- " \"tpl.parquet\",\n",
594
- " expires=timedelta(hours=2),\n",
595
- ")\n",
596
- "\n",
597
- "tpl_geom = con.read_parquet(tpl_geom_url).select('geom','TPL_ID','fid').mutate(geom = _.geom.convert(\"ESRI:102039\", \"EPSG:4326\"))\n",
598
- "\n",
599
- "database = (database.inner_join(tpl_geom, [database.TPL_ID == tpl_geom.TPL_ID, database.fid == tpl_geom.fid])\n",
600
- " # .mutate(id=ibis.row_number().over())\n",
601
- " # .drop('TPL_ID','fid','landvote_id')\n",
602
- " )\n",
603
- " "
604
  ]
605
  },
606
  {
607
  "cell_type": "code",
608
  "execution_count": null,
609
- "id": "d9bb698f-3925-4aa6-8b89-56a6ee839312",
610
  "metadata": {},
611
  "outputs": [],
612
  "source": [
613
- "# save to parquet/pmtiles \n",
614
- "database.to_parquet(\"s3://shared-tpl/tpl_almanac_landvote_geom.parquet\")\n",
615
- "database.execute().set_crs('epsg:4326').to_file('tpl_almanac_landvote_geom.geojson')\n",
616
- "\n",
617
- "to_pmtiles('tpl_almanac_landvote_geom.geojson', 'tpl_almanac_landvote_geom.pmtiles', options = ['--extend-zooms-if-still-dropping'])\n",
618
- "s3_cp('tpl_almanac_landvote_geom.pmtiles', \"s3://shared-tpl/tpl_almanac_landvote_geom.pmtiles\", \"minio\")"
 
 
 
619
  ]
620
  }
621
  ],
 
18
  "from cng.utils import *\n",
19
  "from cng.h3 import *\n",
20
  "from ibis import _\n",
21
+ "import ibis.selectors as s\n",
22
  "import os\n",
23
  "from osgeo import gdal\n",
24
  "from minio import Minio\n",
25
  "import streamlit \n",
26
  "from datetime import timedelta\n",
27
  "import geopandas as gpd\n",
28
+ "import re\n",
29
  "\n",
30
  "# Get signed URLs to access license-controlled layers\n",
31
  "key = st.secrets[\"MINIO_KEY\"]\n",
 
39
  "set_secrets(con)"
40
  ]
41
  },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": null,
45
+ "id": "deb47703-31fd-4039-84ce-df6a24cdf702",
46
+ "metadata": {},
47
+ "outputs": [],
48
+ "source": [
49
+ "census_path = \"s3://public-census/2024/\"\n",
50
+ "state_file = census_path + 'state/2024_us_state.parquet'\n",
51
+ "county_file = census_path + 'county/2024_us_county.parquet'\n",
52
+ "\n",
53
+ "state_h3_file = census_path + 'state/2024_us_state_h3_z8.parquet'\n",
54
+ "county_h3_file = census_path + 'county/2024_us_county_h3_z8.parquet'\n",
55
+ "city_h3_file = census_path + 'places_subdivisions/2024_us_places_subdivisions_h3_z8.parquet'"
56
+ ]
57
+ },
58
  {
59
  "cell_type": "markdown",
60
  "id": "0b086a1a-af23-487b-923d-fca595a19111",
 
70
  "metadata": {},
71
  "outputs": [],
72
  "source": [
73
+ "def h3_from_geom(con, name, cols, save_path, zoom = 8):\n",
74
  " \"\"\"\n",
75
  " Computes hexes directly from geometry.\n",
76
  " \"\"\"\n",
 
88
  " SELECT {cols}, UNNEST(h{zoom}) AS h{zoom},\n",
89
  " ST_GeomFromText(h3_cell_to_boundary_wkt(UNNEST(h{zoom}))) AS geom\n",
90
  " FROM t2\n",
91
+ " ''').to_parquet(save_path)\n",
92
  " return "
93
  ]
94
  },
 
116
  " expires=timedelta(hours=2),\n",
117
  ")\n",
118
  "\n",
119
+ "cols = ['fid', 'tpl_id', 'state_id', 'state', 'county', 'municipality',\n",
120
+ " 'site', 'acres', 'year', 'date', 'owner','owner_type','manager',\n",
121
+ " 'manager_type','purchase_type','easement','easement_type',\n",
122
+ " 'access_type','purpose_type','duration_type','data_provider',\n",
123
+ " 'data_source','source_date','data_aggregator','comments','amount',\n",
124
+ " 'program_id','program','sponsor_id','sponsor','sponsor_type','FIPS']\n",
 
125
  "\n",
126
+ "state_ids = con.read_parquet(state_file).drop('geom')\n",
127
  "\n",
128
  "tpl_table = (con.read_parquet(tpl)\n",
129
+ " .rename(tpl_id = \"TPL_ID\", state = \"State\", county = \"County\", municipality = \"Municipality\", site = \"Site_Name\",\n",
130
+ " acres = \"Reported_Acres\", area = \"Shape_Area\", year = \"Close_Year\", date = \"Close_Date\", owner = \"Owner_Name\",\n",
131
+ " owner_type = \"Owner_Type\", manager = \"Manager_Name\", manager_type = \"Manager_Type\",\n",
132
+ " purchase_type = \"Purchase_Type\", easement = \"EasementHolder_Name\", easement_type = \"EasementHolder_Type\",\n",
133
+ " access_type = \"Public_Access_Type\", purpose_type = \"Purpose_Type\", duration_type = \"Duration_Type\",\n",
134
+ " data_provider = \"Data_Provider\", data_source = \"Data_Source\", source_date = \"Source_Date\",\n",
135
+ " data_aggregator = \"Data_Aggregator\", comments = \"Comments\", amount = \"Amount\", program_id = 'Program_ID',\n",
136
+ " program = 'Program_Name', sponsor_id = \"Sponsor_ID\", sponsor = \"Sponsor_Name\", sponsor_type = \"Sponsor_Type\")\n",
137
+ " .mutate(geom = _.geom.convert(\"ESRI:102039\", \"EPSG:4326\"))\n",
138
+ " .inner_join(state_ids, 'state'))\n",
139
+ " \n",
140
  "con.create_table('tpl', tpl_table, overwrite=True)\n",
141
+ "# h3_from_geom(con, 'tpl', cols, save_path = 's3://shared-tpl/conservation_almanac/z8/tpl_h3_z8.parquet')"
 
 
 
 
142
  ]
143
  },
144
  {
145
  "cell_type": "markdown",
146
+ "id": "9612c804-0474-4bfe-924d-89dae0105663",
147
  "metadata": {},
148
  "source": [
149
+ "#### Generate PMTiles"
 
 
 
150
  ]
151
  },
152
  {
153
  "cell_type": "code",
154
  "execution_count": null,
155
+ "id": "932476f8-53a9-4aa9-9520-cf3fa42b8150",
156
  "metadata": {},
157
  "outputs": [],
158
  "source": [
159
+ "tpl_table.to_parquet('s3://shared-tpl/conservation_almanac/tpl.parquet')\n",
160
+ "tpl_table.to_parquet('tpl_epsg4326.parquet') #local copy to use to_geojson\n",
161
+ "to_geojson('tpl_epsg4326.parquet', \"tpl.geojson\")\n",
162
+ "pmtiles = to_pmtiles(\"tpl.geojson\", \"tpl.pmtiles\")\n",
163
+ "s3_cp('tpl.pmtiles', \"s3://shared-tpl/conservation_almanac/tpl.pmtiles\", \"minio\")"
 
 
164
  ]
165
  },
166
  {
167
  "cell_type": "markdown",
168
+ "id": "3f00cfe9-520c-4839-aeed-46a83b11ecce",
169
  "metadata": {},
170
  "source": [
171
+ "# Census\n",
172
+ "\n",
173
+ "Getting polygons and FIPS codes from Census state, county, place, and subdivision data. \n",
174
+ "\n"
175
  ]
176
  },
177
  {
178
+ "cell_type": "markdown",
179
+ "id": "7cd589ad-5b03-41de-8936-c20091a937e1",
 
180
  "metadata": {},
 
181
  "source": [
182
+ "#### State"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  ]
184
  },
185
  {
186
  "cell_type": "code",
187
  "execution_count": null,
188
+ "id": "08a861e6-fdcd-480a-ad0a-423c38cc1bc1",
189
+ "metadata": {
190
+ "scrolled": true
191
+ },
192
  "outputs": [],
193
  "source": [
194
+ "url = \"/vsizip//vsicurl/https://www2.census.gov/geo/tiger/TIGER2024/STATE/tl_2024_us_state.zip\"\n",
195
+ "state = (con.read_geo(url)\n",
196
+ " .mutate(geom = _.geom.convert('EPSG:4269','EPSG:4326'))\n",
197
+ " .rename(FIPS = \"GEOID\", state_id = \"STUSPS\", state = \"NAME\") # full state name must be called `state`: later cells join/select on it\n",
198
+ " .select('FIPS','state_id','state','geom')\n",
199
+ " )\n",
200
+ "state.to_parquet(state_file)\n",
201
+ "\n",
202
+ "# get h3: re-read the saved parquet as table 'state', then write hexes for the listed columns to state_h3_file\n",
203
+ "con.read_parquet(state_file, table_name = 'state')\n",
204
+ "cols = ['state','state_id','FIPS']\n",
205
+ "h3_from_geom(con, 'state', cols, save_path = state_h3_file)"
206
  ]
207
  },
208
  {
 
216
  {
217
  "cell_type": "code",
218
  "execution_count": null,
219
+ "id": "1a6ea98d-878a-4ea5-8eda-ee10f34444e3",
220
  "metadata": {},
221
  "outputs": [],
222
  "source": [
223
  "%%time\n",
224
+ "## CT counties changed to \"planning regions\" in 2022, so I'm grabbing older data to get the county boundaries \n",
225
+ "url = \"/vsizip//vsicurl/https://www2.census.gov/geo/tiger/TIGER2020/COUNTY/tl_2020_us_county.zip\"\n",
226
+ "con.read_geo(url)\n",
227
+ "CT_counties = (con.read_geo(url)\n",
228
+ " .mutate(geom = _.geom.convert('EPSG:4269','EPSG:4326'))\n",
229
+ " .rename(FIPS = \"GEOID\", county = \"NAMELSAD\")\n",
230
+ " .select('FIPS','STATEFP','county','geom')\n",
231
+ " .filter(_.STATEFP == '09')\n",
232
+ " )\n",
233
+ "\n",
234
+ "# US counties \n",
235
+ "url = \"/vsizip//vsicurl/https://www2.census.gov/geo/tiger/TIGER2024/COUNTY/tl_2024_us_county.zip\"\n",
236
+ "con.read_geo(url)\n",
237
+ "county = (con.read_geo(url)\n",
238
+ " .mutate(geom = _.geom.convert('EPSG:4269','EPSG:4326'))\n",
239
+ " .rename(FIPS = \"GEOID\", county = \"NAMELSAD\")\n",
240
+ " .select('FIPS','STATEFP','county','geom')\n",
241
+ " .union(CT_counties)\n",
242
+ " ) \n",
243
+ "\n",
244
+ "#adding states to counties\n",
245
+ "state_ids = con.read_parquet(state_file).drop('geom')\n",
246
+ "county.inner_join(state_ids, [state_ids.FIPS == county.STATEFP]).select('FIPS','state_id','state','county','geom').to_parquet(county_file)\n",
247
+ "\n",
248
+ "#get h3\n",
249
+ "con.read_parquet(county_file, table_name = 'county')\n",
250
+ "cols = ['state_id','state','county','FIPS']\n",
251
+ "# h3_from_geom(con, 'county', cols, save_path = county_h3_file)\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  ]
253
  },
254
  {
 
269
  "outputs": [],
270
  "source": [
271
  "match_pattern = r\"(?i)\\s*(city|town|village|charter|municipality|Borough)\\b\"\n",
272
+ "# match_pattern = r\"(?i)(?<![a-z])(?:city|town|charter|municipality|[Bb]orough)(?![a-z])\"\n",
273
+ "\n",
274
+ "city_cols = [\"state_id\",\"county\",\"FIPS\",\"name\",'city']\n",
275
  "\n",
276
  "places_url = \"https://www2.census.gov/geo/docs/reference/codes2020/national_place_by_county2020.txt\"\n",
277
  "places_fips = (con.read_csv(places_url)\n",
278
+ " .rename(state_id = \"STATE\", county = \"COUNTYNAME\", city = \"PLACENAME\")\n",
279
  " .mutate(name=_.city.re_replace(match_pattern, \"\").strip())\n",
280
  " .mutate(FIPS = _.STATEFP + _.COUNTYFP)\n",
281
  " .select(city_cols))\n",
282
  "\n",
283
  "subdivisions_url = \"https://www2.census.gov/geo/docs/reference/codes2020/national_cousub2020.txt\"\n",
284
  "subdivisions_fips = (con.read_csv(subdivisions_url)\n",
285
+ " .rename(state_id = \"STATE\", county = \"COUNTYNAME\", city = \"COUSUBNAME\")\n",
286
  " .mutate(name=_.city.re_replace(match_pattern, \"\").strip())\n",
287
  " .mutate(FIPS = _.STATEFP + _.COUNTYFP)\n",
288
  " .select(city_cols))\n",
289
  "\n",
290
+ "#get unique -> some cities are listed in both places and subdivisions\n",
291
+ "city_fips = places_fips.union(subdivisions_fips).distinct() \n",
292
+ "\n",
293
+ "#get h3 from counties \n",
294
+ "county_h3 = con.read_parquet(county_h3_file)\n",
295
+ "city_fips.inner_join(county_h3, 'FIPS').select('FIPS','state_id','state','county','city','name','geom','h8').to_parquet(city_h3_file)\n"
296
  ]
297
  },
298
  {
 
308
  "- First, need to split up landvote into its 3 jurisdictions: state, county, and municipals\n",
309
  "- Join states with Census \"states\" to get state FIPS/hex\n",
310
  "- Join counties with Census \"counties\" to get county FIPS/hex\n",
311
+ "- Join special districts with Census \"places\" and \"subdivisions\" to get county FIPS/hex\n",
312
  "- Join municipals with Census \"places\" and \"subdivisions\" to get county FIPS/hex\n",
313
+ "- Then join all municipal, county, special district, and state data back together!\n",
314
  "\n"
315
  ]
316
  },
 
324
  "landvote_csv = client.get_presigned_url(\n",
325
  " \"GET\",\n",
326
  " \"shared-tpl\",\n",
327
+ " \"landvote/landvote_utf8.csv\",\n",
328
  " expires=timedelta(hours=2),\n",
329
  ")\n",
330
+ "collapse_spaces = r\"\\s+\"\n",
331
+ "match_pattern = r\"(?i)\\b(city|town|charter|municipality|Borough)\\b\"\n",
332
+ "landvote_ = (con.read_csv(landvote_csv) #it skips the row with a unicode error \n",
333
+ " .rename(jurisdiction = \"Jurisdiction Type\", state_id = \"State\")\n",
334
+ " .mutate(state_id = _.state_id.substitute({'Ore':'OR'}))\n",
335
  " .mutate(name=_['Jurisdiction Name'].re_replace(match_pattern, \"\").strip())\n",
336
+ " .mutate(name=_.name.re_replace(collapse_spaces, \" \").strip())\n",
337
+ " .mutate(landvote_id=ibis.row_number().over(order_by=[_.state_id, _.jurisdiction, _.Date]))\n",
338
  " .mutate(_['Conservation Funds Approved'].replace('$', '')\n",
339
+ " .replace(',', '').cast('float').name('Conservation Funds Approved'))\n",
340
+ " .mutate(year = _.Date.year())\n",
341
+ " .rename(date = \"Date\", description = \"Description\", finance_mechanism = \"Finance Mechanism\",\n",
342
+ " other_comments = '\"Other\" Comment', purpose = \"Purpose\", total_funds_at_stake = \"Total Funds at Stake\",\n",
343
+ " conservation_funds_at_stake = \"Conservation Funds at Stake\", total_funds_approved = \"Total Funds Approved\",\n",
344
+ " conservation_funds_approved = \"Conservation Funds Approved\", passed = \"Pass?\", status = \"Status\", \n",
345
+ " percent_yes = '% Yes', percent_no = '% No', notes = 'Notes', voted_acq_measure = \"Voted Acq. Measure\")\n",
346
+ " )\n",
347
+ "\n",
348
+ "#landvote_id is made with a window function, which can be a bit buggy, so it helps to materialize it after generating \n",
349
+ "landvote_with_ids = landvote_.execute() \n",
350
+ "landvote = con.create_table(\"landvote\", landvote_with_ids, overwrite = True)\n",
351
+ "\n",
352
+ "final_columns = ['landvote_id','FIPS',\n",
353
+ " 'state_id','state','county',\n",
354
+ " 'city','jurisdiction','year','date',\n",
355
+ " 'description','finance_mechanism',\n",
356
+ " 'other_comments','purpose',\n",
357
+ " 'total_funds_at_stake',\n",
358
+ " 'conservation_funds_at_stake',\n",
359
+ " 'total_funds_approved',\n",
360
+ " 'conservation_funds_approved',\n",
361
+ " 'passed','status','percent_yes','percent_no',\n",
362
+ " 'notes','voted_acq_measure',\n",
363
+ " 'geom','h8']"
 
 
364
  ]
365
  },
366
  {
 
378
  "metadata": {},
379
  "outputs": [],
380
  "source": [
381
+ "state_z8 = con.read_parquet(state_h3_file)\n",
382
  "states = (landvote.filter(_.jurisdiction == \"State\")\n",
383
+ " .rename(state = \"Jurisdiction Name\")\n",
384
  " .mutate(county = ibis.literal('None'))\n",
385
  " .mutate(county_fips = ibis.literal('None'))\n",
386
  " .mutate(city = ibis.literal('None')))\n",
387
  "\n",
388
+ "landvote_state_z8 = (states.inner_join(state_z8, [states.state.upper() == state_z8.state.upper()])\n",
389
+ " .select(final_columns))"
390
+ ]
391
+ },
392
+ {
393
+ "cell_type": "code",
394
+ "execution_count": null,
395
+ "id": "85f50f50-f56a-4b15-a1ad-5b87fe80dd54",
396
+ "metadata": {},
397
+ "outputs": [],
398
+ "source": [
399
+ "# getting non hex version \n",
400
+ "state_geo = con.read_parquet(state_file)\n",
401
+ "landvote_state_geo = (states.inner_join(state_geo, [states.state.upper() == state_geo.state.upper()])).select(final_columns[:-1])"
402
  ]
403
  },
404
  {
 
416
  "metadata": {},
417
  "outputs": [],
418
  "source": [
419
+ "county_match_pattern = r\"(?i)(?:(\\b[\\w-]+(?:\\s[\\w-]+)*)\\sCounty\\b|of\\s+([\\w-]+(?:\\s[\\w-]+)*))\"\n",
420
+ "county_vals = {'Columbus and Franklin County Metro Parks':'Franklin',\n",
421
+ " ' Columbus and Franklin County Metro Parks':'Franklin',\n",
422
+ " 'Athens-Clarke County': 'Clarke',\n",
423
+ " 'City and County of San Francisco':'San Francisco',\n",
424
+ " 'Cleveland Metropolitan Park District':'Cuyahoga',\n",
425
+ " 'Denver City and County':'Denver',\n",
426
+ " 'East Baton Rouge Parish':'East Baton Rouge Parish',\n",
427
+ " 'Five Rivers MetroParks':'Montgomery',\n",
428
+ " 'Forest Preserve District of DuPage County':'DuPage',\n",
429
+ " 'Forest Preserve District of Kane County':'Kane',\n",
430
+ " 'Forest Preserves of Cook County':'Cook',\n",
431
+ " 'Great Parks of Hamilton County':'Hamilton',\n",
432
+ " 'Jacksonville':'Duval',\n",
433
+ " 'James City County': 'James City',\n",
434
+ " 'Johnny Appleseed Park District':'Allen',\n",
435
+ " 'Licking Park District':'Licking',\n",
436
+ " 'Matanuska-Susitna Borough':'Matanuska-Susitna Borough',\n",
437
+ " 'MetroParks of Butler County':'Butler',\n",
438
+ " ' Metropolitan Park District of Toledo Area':'Lucas',\n",
439
+ " 'Metropolitan Park District of the Toledo Area':'Lucas',\n",
440
+ " 'Metropolitan Park District of Toledo Area':'Lucas',\n",
441
+ " 'Metropolitan Park District of Toledo Area ':'Lucas',\n",
442
+ " 'Park District of Ottawa County':'Ottawa',\n",
443
+ " 'Portage Park District':'Portage',\n",
444
+ " 'Preservation Park District of Delaware County':'Delaware',\n",
445
+ " 'Preservation Parks of Delaware County':'Delaware',\n",
446
+ " 'Santa Clara Valley Water District': 'Santa Clara',\n",
447
+ " 'St. Tammany Parish':'St. Tammany Parish',\n",
448
+ " 'Summit Metro Parks':'Summit'}\n",
449
+ "\n",
450
+ "county_z8 = (con.read_parquet(county_h3_file)\n",
451
+ " .mutate(name=_.county.re_extract(county_match_pattern, 1).strip())\n",
452
+ " .mutate(name = _.county.substitute(value = county_vals,else_= _.name))\n",
453
+ " )\n",
454
  "\n",
455
  "counties = (landvote.filter(_.jurisdiction == \"County\")\n",
456
  " .rename(county = \"Jurisdiction Name\")\n",
457
  " .mutate(city = ibis.literal('None'))\n",
458
+ " .mutate(name=_.name.re_extract(county_match_pattern, 1).strip())\n",
459
+ " .mutate(name = _.county.substitute(value = county_vals,else_= _.name))\n",
460
+ " )\n",
461
  "\n",
462
+ "landvote_county_z8 = (counties\n",
463
+ " .inner_join(county_z8, [counties.name.upper() == county_z8.name.upper(), counties.state_id == county_z8.state_id])\n",
464
+ " .select(final_columns)\n",
465
+ " )"
466
  ]
467
  },
468
  {
469
+ "cell_type": "code",
470
+ "execution_count": null,
471
+ "id": "96846068-4efa-4908-a48b-3208e08001ad",
472
  "metadata": {},
473
+ "outputs": [],
474
  "source": [
475
+ "# getting non hex version \n",
476
+ "county_geo = (con.read_parquet(county_file)\n",
477
+ " .mutate(name=_.county.re_extract(county_match_pattern, 1).strip())\n",
478
+ " .mutate(name = _.county.substitute(value = county_vals,else_= _.name))\n",
479
+ " )\n",
480
  "\n",
481
+ "landvote_county_geo = (counties.inner_join(county_geo, [counties.name.upper() == county_geo.name.upper(), \n",
482
+ " counties.state_id == county_geo.state_id])\n",
483
+ " .select(final_columns[:-1])\n",
484
+ " )"
485
  ]
486
  },
487
  {
488
+ "cell_type": "markdown",
489
+ "id": "99be20da-2a62-4ece-97e5-69118f400c62",
490
+ "metadata": {},
 
 
 
 
491
  "source": [
492
+ "#### Special District Level\n"
 
 
 
 
 
 
 
493
  ]
494
  },
495
  {
496
+ "cell_type": "code",
497
+ "execution_count": null,
498
+ "id": "4d5a1b9e-49b9-482b-8814-7cbcabb6daae",
499
  "metadata": {},
500
+ "outputs": [],
501
  "source": [
502
+ "sd_match_pattern = r\"(?i)\\b(city|town|CDP|CCD|village|charter|municipality|Borough|Park District|Authority|Basin|Mountains|2|1|District|Services|Special|Preservation|Assessment|Initiative|Open Space|Metro|Parks|Community|Recreation District)\\b\"\n",
503
+ "sd_z8 = (\n",
504
+ " con.read_parquet(city_h3_file)\n",
505
+ " .mutate(name=_.city.re_replace(sd_match_pattern, \"\"))\n",
506
+ " .mutate(name=_.name.re_replace(collapse_spaces, \" \").strip())\n",
507
+ ")\n",
508
+ "\n",
509
+ "sd_vals = {'Tri-Lakes Park and Recreation District':'Monument',\n",
510
+ " 'Urban Drainage and Flood Control District':'Denver',\n",
511
+ " 'Blue Heron Recreation District':'Phoenix',\n",
512
+ " 'Mountains Recreation and Conservation Authority':'Santa Monica',\n",
513
+ " 'St. Helena Parish Recreation and Parks District':'Greensburg',\n",
514
+ " 'West Geauga Park and Recreation District':'Chardon',\n",
515
+ " 'Marin County Open Space District':'San Rafael',\n",
516
+ " }\n",
517
+ "\n",
518
+ "# filtering landvote to just special districts \n",
519
+ "sd = (landvote.filter(_.jurisdiction == \"Special District\")\n",
520
+ " .rename(city = \"Jurisdiction Name\")\n",
521
+ " .mutate(name=_.name.re_replace(sd_match_pattern, \"\"))\n",
522
+ " .mutate(name=_.name.re_replace(collapse_spaces, \" \").strip())\n",
523
+ " .mutate(name=_.city.substitute(value=sd_vals, else_=_.name))\n",
524
+ " )\n",
525
+ "\n",
526
+ "# detecting if a record has multiple counties listed in the notes field \n",
527
+ "multiple_counties_ = (\n",
528
+ " sd\n",
529
+ " .filter(~_.notes.isnull())\n",
530
+ " .filter( \n",
531
+ " (_.notes.contains(\"counties\")) |\n",
532
+ " (_.notes.contains(\"Counties\")) |\n",
533
+ " (_.notes.split(\"County\").length()-1>1) \n",
534
+ " )\n",
535
+ ")\n",
536
+ "\n",
537
+ "#extracting multiple counties from notes column \n",
538
+ "@ibis.udf.scalar.python\n",
539
+ "def extract_counties_udf(note: str) -> list[str]:\n",
540
+ " pattern = r\"((?:[A-Z][a-zA-Z.\\'-]*(?:\\s+[A-Z][a-zA-Z.\\'-]*)*)(?:,\\s*)?(?:\\s+and\\s+)?)+(?=\\s+(?:[Cc]ounty|[Cc]ounties))\"\n",
541
+ " p = re.compile(pattern)\n",
542
+ " matches = [m.group(0) for m in p.finditer(note)] # <-- Use finditer with group(0)\n",
543
+ " counties = []\n",
544
+ " for match in matches:\n",
545
+ " parts = re.split(r',\\s*|\\s+and\\s+', match)\n",
546
+ " counties.extend(f\"{part.strip()} County\" for part in parts if part.strip())\n",
547
+ " return counties\n",
548
+ "\n",
549
+ "multiple_counties = (multiple_counties_\n",
550
+ " .mutate(county_list=extract_counties_udf(_.notes))\n",
551
+ " .unnest([\"county_list\"])\n",
552
+ " .mutate(county=_.county_list)\n",
553
+ " .drop(\"county_list\")\n",
554
+ ")\n",
555
+ "\n",
556
+ "multiple_counties_ids = multiple_counties.select('landvote_id').distinct().execute()['landvote_id'].to_list()\n",
557
+ "\n",
558
+ "# Only has 1 county in the notes field\n",
559
+ "single_county_pattern = r'([A-Z][a-zA-Z]+(?:\\s[A-Z][a-zA-Z]*)*\\sCounty)\\.?'\n",
560
+ "single_county = (sd\n",
561
+ " .filter(~_.notes.isnull())\n",
562
+ " .filter(_.landvote_id.notin(multiple_counties_ids))\n",
563
+ " .mutate(county=_.notes.re_extract(single_county_pattern, 1).strip())\n",
564
+ " .mutate(county=_.county.cases(\n",
565
+ " ('',_.city.re_extract(single_county_pattern, 1).strip()),\n",
566
+ " else_ = _.county))\n",
567
+ " .filter(_.county != '')\n",
568
+ ")\n",
569
+ "single_county_ids= single_county.select('landvote_id').distinct().execute()['landvote_id'].to_list()\n",
570
+ "\n",
571
+ "# Nothing in notes, need to join with census data to get county\n",
572
+ "manually_fill = (sd\n",
573
+ " .filter(_.landvote_id.notin(multiple_counties_ids))\n",
574
+ " .filter(_.landvote_id.notin(single_county_ids))\n",
575
+ " .inner_join(sd_z8,[_.name.upper() == sd_z8.name.upper(),\n",
576
+ " _.state_id == sd_z8.state_id]) \n",
577
+ " .select(final_columns)\n",
578
+ " .distinct()\n",
579
+ ")\n",
580
+ "\n",
581
+ "sd_county_vals = {'Western Summit County':'Summit County'}\n",
582
+ "sd_with_counties = single_county.union(multiple_counties).mutate(county=_.county.substitute(value=sd_county_vals, else_=_.county))\n",
583
+ "\n",
584
+ "#since we are joining on counties, there may be duplicate hexes because of the cities \n",
585
+ "landvote_sd_z8 = (sd_with_counties\n",
586
+ " .inner_join(county_z8.distinct(), [sd_with_counties.county.upper() == county_z8.county.upper(), \n",
587
+ " sd_with_counties.state_id == county_z8.state_id])\n",
588
+ " .select(final_columns)\n",
589
+ " .union(manually_fill)\n",
590
+ " )"
591
  ]
592
  },
593
  {
594
  "cell_type": "code",
595
  "execution_count": null,
596
+ "id": "63d8df3a-ddc8-49b3-9f3a-1762883472cb",
597
  "metadata": {},
598
  "outputs": [],
599
  "source": [
600
+ "sd = landvote_sd_z8.drop('h8','geom').distinct()\n",
601
+ "landvote_sd_geo = (sd.inner_join(county_geo,[sd.county.upper() == county_geo.county.upper(), sd.state_id == county_geo.state_id])\n",
602
+ " .select(final_columns[:-1]))\n"
603
  ]
604
  },
605
  {
606
  "cell_type": "markdown",
607
+ "id": "cca12e06-8d7f-4a50-906c-7dc02e370072",
608
  "metadata": {},
609
  "source": [
610
+ "#### Municipal level\n",
611
+ "\n",
612
+ "Because there isn't a one-to-one match between municipalities and Census data, we need to use both \"Places\" and \"Subdivisions\". "
613
  ]
614
  },
615
  {
616
  "cell_type": "code",
617
  "execution_count": null,
618
+ "id": "aa81e457-00ba-4b01-86d7-cf46bec04edd",
619
  "metadata": {},
620
  "outputs": [],
621
  "source": [
622
+ "municipal_vals = {\n",
623
+ " \"Addison\": \"Addison village\",\n",
624
+ " \"Anderson Township Park District\": \"Anderson township\",\n",
625
+ " \"Bainbridge Island Metropolitan Park & Recreation District\": \"Bainbridge Island\",\n",
626
+ " \"Bainbridge Island Metropolitan Park and Recreation District \": \"Bainbridge Island\",\n",
627
+ " \"Bel-Ridge\": \"Bel-Ridge village\",\n",
628
+ " \"Bend Park and Recreation District\": \"Bend\",\n",
629
+ " \"Boardman Township Park District\": \"Boardman township\",\n",
630
+ " \"Carney's Point Township\": \"Carneys Point township\",\n",
631
+ " \"Castro Valley\": \"Castro Valley CDP\",\n",
632
+ " \"Charter Township of Meridian\": \"Meridian township\",\n",
633
+ " \"Charter Township of Oakland\": \"Oakland township\",\n",
634
+ " \"Corrales\": \"Corrales village\",\n",
635
+ " \"Dobbs Ferry\": \"Dobbs Ferry village\",\n",
636
+ " \"Downers Grove Park District\": \"Downers Grove village\",\n",
637
+ " \"Gates Mills\": \"Gates Mills village\",\n",
638
+ " \"Glen Ellyn Park District\": \"Glen Ellyn village\",\n",
639
+ " \"Hillsborough\": \"Hillsborough township\",\n",
640
+ " \"Irvington\": \"Irvington village\",\n",
641
+ " \"Lake Zurich\": \"Lake Zurich village\",\n",
642
+ " \"Lake in the Hills\": \"Lake in the Hills village\",\n",
643
+ " \"Libertyville\": \"Libertyville township\",\n",
644
+ " \"Loch Arbor Village\": \"Loch Arbour Village\",\n",
645
+ " \"Lockport Township Park District\": \"Lockport township\",\n",
646
+ " \"Moapa\": \"Moapa CDP\",\n",
647
+ " \"Nunda\": \"Nunda township\",\n",
648
+ " \"Orland Park\": \"Orland Park village\",\n",
649
+ " \"Park Ridge Recreation and Park District\": \"Park Ridge\",\n",
650
+ " \"Peapack-Gladstone Borough\": \"Peapack and Gladstone\",\n",
651
+ " \"Princeton Township\": \"Princeton\",\n",
652
+ " \"Romeoville\": \"Romeoville village\",\n",
653
+ " \"San Diego Open Space Park Facilities District No. 1\": \"San Diego\",\n",
654
+ " \"Seattle Park District\": \"Seattle\",\n",
655
+ " \"Stookey\": \"Stookey township\",\n",
656
+ " \"Tarrytown\": \"Tarrytown village\",\n",
657
+ " \"Tofte\": \"Tofte township\",\n",
658
+ " \"Village of Corrales\": \"Corrales village\",\n",
659
+ " \"Village of Lake Barrington\": \"Lake Barrington village\",\n",
660
+ " \"Village of Los Ranchos de Albuquerque\": \"Los Ranchos de Albuquerque village\",\n",
661
+ " \"West Paterson Borough\": \"Woodland Park\",\n",
662
+ " \"Westampton\": \"Westampton township\",\n",
663
+ " \"Willamalane Park and Recreation District\": \"Springfield\",\n",
664
+ " \"Wilmette Park District\": \"Wilmette village\", \n",
665
+ "}\n",
666
+ "collapse_spaces = r\"\\s+\"\n",
667
+ "city_z8 = (\n",
668
+ " con.read_parquet(city_h3_file)\n",
669
+ " .mutate(name=_.city.re_replace(match_pattern, \"\"))\n",
670
+ " .mutate(name=_.name.re_replace(collapse_spaces, \" \").strip())\n",
671
+ ")\n",
672
+ "\n",
673
+ "# filter to only cities\n",
674
+ "municipals = (landvote.filter(_.jurisdiction == \"Municipal\")\n",
675
+ " .rename(city = \"Jurisdiction Name\")\n",
676
+ " .mutate(name=_.name.re_replace(collapse_spaces, \" \").strip())\n",
677
+ " .mutate(name = _.city.substitute(value = municipal_vals, else_= _.name))\n",
678
+ " )\n",
679
+ "\n",
680
+ "# join with census data \n",
681
+ "city_joined = (municipals.inner_join(city_z8, [municipals.name.upper() == city_z8.name.upper(), \n",
682
+ " municipals.state_id == city_z8.state_id]).select(final_columns))\n",
683
  "\n",
684
+ "# handling cities with multiple counties\n",
685
+ "dupes = city_joined.drop('h8','geom').distinct().group_by(\"landvote_id\").agg(county_count = _.count()).filter(_.county_count > 1)\n",
686
+ "duplicate_ids = dupes.execute()['landvote_id'].to_list()\n",
687
  "\n",
688
+ "# 105 of these already have their county filled in (in the notes field); extract the county name from the notes\n",
689
+ "pattern = r'^\\s*([A-Z][a-z]+(?:\\s[A-Z][a-z]+)*)\\s(?:County|Co)\\.?\\s*$'\n",
690
+ "counties_filled = (municipals.filter(_.landvote_id.isin(duplicate_ids))\n",
691
+ " .filter(~_.notes.isnull())\n",
692
+ " .mutate(county=_.notes.re_extract(pattern, 1).strip()+ ibis.literal(' County'))\n",
693
+ " .filter(_.county !=' County')\n",
694
+ " )\n",
695
  "\n",
696
+ "# since we added the county, join it with the rest of the census data \n",
697
+ "counties_filled_join = (counties_filled\n",
698
+ " .inner_join(city_z8,[counties_filled.name.upper() == city_z8.name.upper(),\n",
699
+ " counties_filled.county.upper() == city_z8.county.upper(), \n",
700
+ " counties_filled.state_id == city_z8.state_id])\n",
701
+ " .select(final_columns))\n",
702
  "\n",
703
+ "counties_filled_ids = counties_filled_join.select('landvote_id').distinct().execute()['landvote_id'].to_list()\n",
 
 
704
  "\n",
705
+ "# join with the rest of the municipal data\n",
706
+ "landvote_city_z8 = city_joined.filter(~_.landvote_id.isin(counties_filled_ids)).union(counties_filled_join).distinct()"
707
  ]
708
  },
709
  {
710
+ "cell_type": "code",
711
+ "execution_count": null,
712
+ "id": "c751dabd-b39a-4c54-b9ed-17d2b0cb32ea",
713
  "metadata": {},
714
+ "outputs": [],
715
  "source": [
716
+ "match_pattern = r\"(?i)\\b(city|town|charter|municipality|[Bb]orough)\\b\"\n",
717
+ "\n",
718
+ "city_geo = (city_fips.inner_join(county_geo, 'FIPS').select(~s.endswith('_right')).drop('name')\n",
719
+ " .mutate(name=_.city.re_replace(match_pattern, \"\"))\n",
720
+ " .mutate(name=_.name.re_replace(collapse_spaces, \" \").strip()))\n",
721
+ "\n",
722
+ "municipals_counties = (counties_filled\n",
723
+ " .mutate(name=_.city.re_replace(match_pattern, \"\"))\n",
724
+ " .mutate(name=_.name.re_replace(collapse_spaces, \" \").strip())\n",
725
+ " .mutate(name = _.city.substitute(value = municipal_vals, else_= _.name))\n",
726
+ " .inner_join(city_geo,[_.name.upper() == city_geo.name.upper(),\n",
727
+ " _.county.upper() == city_geo.county.upper(), \n",
728
+ " _.state_id == city_geo.state_id])\n",
729
+ " .select(final_columns[:-1])\n",
730
+ " )\n",
731
+ "\n",
732
+ "other_municipals = (municipals.filter(~_.landvote_id.isin(counties_filled_ids))\n",
733
+ " .mutate(name=_.city.re_replace(match_pattern, \"\"))\n",
734
+ " .mutate(name=_.name.re_replace(collapse_spaces, \" \").strip())\n",
735
+ " .mutate(name = _.city.substitute(value = municipal_vals, else_= _.name))\n",
736
+ " .inner_join(city_geo,[_.name.upper() == city_geo.name.upper(),_.state_id == city_geo.state_id])\n",
737
+ " .select(final_columns[:-1]))\n",
738
+ "\n",
739
+ "landvote_city_geo = municipals_counties.union(other_municipals).distinct() "
740
  ]
741
  },
742
  {
743
  "cell_type": "markdown",
744
+ "id": "6d52d97b-ad14-4d04-89a8-79a813f80353",
745
  "metadata": {},
746
  "source": [
747
+ "#### Joining all the landvote data with census\n",
748
+ "Note: `landvote_joined` has more unique rows than `landvote` because some cities/special districts span multiple counties. Each additional county creates a new row."
749
  ]
750
  },
751
  {
752
  "cell_type": "code",
753
  "execution_count": null,
754
+ "id": "6d481736-82f3-4be2-b5af-6280be5e9d75",
755
+ "metadata": {
756
+ "scrolled": true
757
+ },
758
  "outputs": [],
759
  "source": [
760
+ "landvote_joined_z8 = landvote_city_z8.union(landvote_county_z8).union(landvote_sd_z8).union(landvote_state_z8)\n",
761
+ "landvote_joined_z8.to_parquet(\"s3://shared-tpl/landvote/z8/landvote_h3_z8.parquet\")\n",
 
 
 
 
 
762
  "\n",
 
 
 
 
763
  "\n",
764
+ "# and non-hex version \n",
765
+ "landvote_joined_geo = landvote_city_geo.union(landvote_county_geo).union(landvote_sd_geo).union(landvote_state_geo)\n",
766
+ "landvote_joined_geo.to_parquet(\"s3://shared-tpl/landvote/landvote_geom.parquet\")"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
767
  ]
768
  },
769
  {
770
+ "cell_type": "markdown",
771
+ "id": "066e4fdd-f069-4d5d-b2be-f10124cfe19c",
 
772
  "metadata": {},
 
773
  "source": [
774
+ "#### Generate PMTiles"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
775
  ]
776
  },
777
  {
778
  "cell_type": "code",
779
  "execution_count": null,
780
+ "id": "100f0fd8-9588-4f65-a657-52bd5b942089",
781
  "metadata": {},
782
  "outputs": [],
783
  "source": [
784
+ "parquet = client.get_presigned_url(\n",
785
+ " \"GET\",\n",
786
+ " \"shared-tpl\",\n",
787
+ " \"landvote/landvote_geom.parquet\",\n",
788
+ " expires=timedelta(hours=2),\n",
789
+ ")\n",
790
+ "to_geojson(parquet, \"landvote_geom.geojson\")\n",
791
+ "pmtiles = to_pmtiles(\"landvote_geom.geojson\", \"landvote_geom.pmtiles\")\n",
792
+ "s3_cp('landvote_geom.pmtiles', \"s3://shared-tpl/landvote/landvote_geom.pmtiles\", \"minio\")"
793
  ]
794
  }
795
  ],