cboettig committed
Commit 5a7aca2 · Parent: f942fde

better strategy with h3

Files changed (3)
  1. app.R +70 -55
  2. test-data.R +77 -11
  3. utils.R +56 -1
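
In outline, the "better strategy" is: load the drawn polygon into DuckDB, cover it with H3 cells once, use the coarse level-0 cells to pick which hive-partitioned GBIF parquet files to open, and later join agent results back to the fine cells. A minimal sketch of that flow, assuming the helpers added in utils.R below and using the Arizona polygon from test-data.R in place of a user-drawn shape:

# Sketch only: mirrors the new app.R/test-data.R flow; the Arizona polygon
# stands in for whatever the user draws on the map.
library(duckdbfs)
library(dplyr)
source("utils.R")

duckdbfs::load_h3()
duckdbfs::load_spatial()

aoi    <- as_dataset.sf(spData::us_states |> dplyr::filter(NAME == "Arizona"))
h3_aoi <- get_h3_aoi(aoi)                     # registers the "h3_aoi" view of (h0, h3id) cells
parts  <- h3_aoi |> distinct(h0) |> pull(h0)  # coarse level-0 cells covering the polygon

# open only the hive partitions whose level-0 cell touches the polygon
urls <- paste0("https://minio.carlboettiger.info/public-gbif/hex/h0=", parts, "/part0.parquet")
gbif <- open_dataset(urls, tblname = "gbif")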
app.R CHANGED
@@ -3,6 +3,7 @@ library(bslib)
 library(htmltools)
 library(fontawesome)
 library(bsicons)
+library(bench)
 library(glue)
 library(sf)
 library(duckdb.agent)
@@ -14,11 +15,8 @@ library(digest)
 library(stringr)
 library(shinybusy)
 
-# initialize a disk-backed database for the session
-duckdbfs::close_connection()
-duckdbfs::cached_connection(tempfile())
+source("utils.R")
 
-duckdbfs::load_h3()
 duckdbfs::load_spatial()
 
 css <-
@@ -117,30 +115,50 @@ duckdb_secrets(Sys.getenv("MINIO_KEY"),
                "minio.carlboettiger.info")
 
 
-get_h3index <- function(shape, zoom = 0L, precision = 6L) {
-
-  tmp <- tempfile(fileext = ".fgb")
-  shape |> st_transform(4326) |> write_sf(tmp, append = FALSE)
-  zoom <- as.integer(zoom)
-
-  # consider auto-retry at higher precision if subset is empty.
-  precision <- as.integer(precision)
-  subset <- open_dataset(tmp) |>
-    mutate(poly = array_extract(unnest(st_dump(geom)), "geom"),
-           hexid = h3_polygon_wkt_to_cells(poly, {precision}),
-           hexid = unnest(hexid)
-    ) |>
-    mutate(h0 = h3_h3_to_string(h3_cell_to_parent(hexid, {zoom})),
-           hexid = h3_h3_to_string(hexid)) |>
-    select(h0) |>
-    distinct() |>
-    pull(h0)
-
-  toupper(subset)
-}
+
+
+# system prompt generation is slow, do only once??
+
+system_prompt = create_prompt(additional_instructions =
+  "Note that the columns h1, h2, h3, through h11 contains a geohash representing a H3 hexagon index.
+  Higher numbers indicate higher zoom resolution (smaller hexes)
+  Always aggregate results to count the number of rows matching
+  the query to the desired hexagon. Always name the count column 'count'.
+  Remember to group by hexagon level to aggregate!
+
+  Always rename the chosen hexagon column as 'h3id' in your final answer.
+  Only select the h3id and count in your final answer.
+
+  Examples:
+  user: 'show all bird occurrences at zoom level 6'
+
+  your reply:
+
+  {
+  'query': 'CREATE OR REPLACE VIEW bird_occurrences_h6 AS SELECT gbif.h6 AS h3id, COUNT(*) AS count FROM gbif WHERE gbif.class = 'Aves' GROUP BY gbif.h6',
+  'table_name': 'bird_occurrences_h6',
+  'explanation': 'This query creates a view that shows the count of bird occurrences at zoom level 6. It selects the h6 column as the hexagon id, counts the number of rows for each hexagon, and groups the results by the h6 column.'
+  }
+
+  Refer to the full table by its table name as given above.
+  Be sure to list column names
+  Be sure to generate fully valid SQL. Check your SQL for possible errors.
+
+
+  Do not use the 'scientificname' column! Instead, filter specific species using the
+  binomial name as the 'species' column.
+
+  IMPORTANT: return raw JSON only, do not decorate your reply with markdown code syntax.
+  ")
+
+
+
+
 
 # Define the server
 server <- function(input, output, session) {
+
+  # first we draw the map with geosearch and draw controls.
   output$map <- renderMaplibre({
     m <- maplibre(center = c(-110, 38), zoom = 2, pitch = 0, maxZoom = 12) |>
       add_draw_control() |>
@@ -148,52 +166,45 @@ server <- function(input, output, session) {
 
     m
   })
-  observeEvent(input$get_features, {
+
+  # React to user's polygon
+  observeEvent(input$get_features, {
     bounds <- ""
     aoi_info <- NULL
 
    drawn_features <- get_drawn_features(mapboxgl_proxy("map"))
    if(nrow(drawn_features) > 0) {
 
-      bounds <- st_bbox(drawn_features)
-      output$feature_output <- renderPrint(print(bounds))
-
-      attach(as.list(bounds))
-
-      subset <- get_h3index(drawn_features)
+      aoi <- as_dataset.sf(drawn_features)
+      h3_aoi <- get_h3_aoi(aoi)
+      subset <- h3_aoi |> distinct(h0) |> pull(h0)
+
+
+      print(h3_aoi)
+
      urls <- paste0("https://minio.carlboettiger.info/public-gbif/hex/h0=", subset, "/part0.parquet")
      gbif <- open_dataset(urls, tblname = "gbif")
+      # would be better to spatial join
+      bounds <- st_bbox(drawn_features)
 
-      gbif |>
-        dplyr::filter(between(decimallatitude, ymin, ymax),
-                      between(decimallongitude, xmin, xmax)) |>
-        as_view("gbif_aoi")
-
-    }
-
-  observeEvent(input$user_msg, {
-
-
+      # timer <- bench::bench_time({
+      #   xmin <- bounds[1]; ymin <- bounds[2]; xmax <- bounds[3]; ymax <- bounds[4]
+      #   open_dataset(urls, tblname = "gbif") |>
+      #     #filter(between(decimallongitude, xmin, xmax), between(decimallatitude, ymin, ymax)) |>
+      #     mutate(geom = st_geomfromwkb(geom)) |> spatial_join(aoi) |>
+      #     as_view("gbif_aoi")
+      # })
+      # print(timer)
 
-    system_prompt = create_prompt(additional_instructions =
-      "Note that the columns h1, h2, h3, through h11 contains a geohash representing a H3 hexagon index.
-      Higher numbers indicate higher zoom resolution (smaller hexes)
-      Always aggregate results to count the number of rows matching
-      the query to the desired hexagon. Always name the count column 'count'.
-      Remember to group by hexagon level to aggregate!
-      Always rename the chosen hexagon column as 'h3id' in your final answer.
-      Always use table notation like 'gbif.order' to specify column names.
-      Be sure to generate fully valid SQL. Check your SQL for possible errors.
+      output$feature_output <- renderPrint(print(bounds))
+    }
 
-      Do not use the 'scientificname' column! Instead, filter specific species using the
-      binomial name as the 'species' column.
 
-      Always use the table 'gbif_aoi' rather than 'gbif' table if both are present.
 
-      IMPORTANT: return raw JSON only, do not decorate your reply with markdown code syntax.
-      ")
+  observeEvent(input$user_msg, {
 
     model <- reactive(input$select)()
+
     if (grepl("cirrus", model)) {
       agent <- ellmer::chat_vllm(
         base_url = "https://llm.cirrus.carlboettiger.info/v1/",
@@ -203,7 +214,7 @@ observeEvent(input$get_features, {
         api_args = list(temperature = 0)
       )
     } else {
-      agent <- ellmer::chat_vllm(
+      agent <- ellmer::chat_vllm( # NRP models have too small a context window for useful interaction
        base_url = "https://llm.nrp-nautilus.io/",
        model = model,
        api_key = Sys.getenv("NRP_API_KEY"),
@@ -212,6 +223,7 @@ observeEvent(input$get_features, {
      )
    }
 
+
    print("Agent thinking...")
    stream <- agent$chat(input$chat)
 
@@ -229,20 +241,23 @@ observeEvent(input$get_features, {
      output$agent <- renderText(response$agent)
    }
 
-
    # cache the query
    query_id <- digest::digest(paste(response$query, bounds, collapse=""))
    data_url <- glue::glue("https://minio.carlboettiger.info/public-data/cache/{query_id}.h3j")
 
-    # use tempfile. we could use database tempdir
+    # use tempfile as cache. we could use database tempdir
    cache_parquet <- tempfile(glue("{query_id}"), fileext = ".parquet")
 
-    # compute if not yet in chache
+
+
+
+    # compute if not yet in cache
    status <- httr::status_code(httr::HEAD(data_url))
    if(status == 404) {
      print("Computing...")
      time <- bench::bench_time({
        agent_query(stream) |>
+          hex_join(h3_aoi) |>
          mutate(log_count = log(count)) |>
          write_dataset(cache_parquet)
      })
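
The caching step in the last hunk reduces to: hash the agent's SQL plus the drawn bounds, probe the public cache bucket with a HEAD request, and recompute only on a 404. A condensed sketch of that pattern; here sql_text, bounds, and result_tbl are stand-ins for values produced earlier in the server function (the agent_query(stream) |> hex_join(h3_aoi) pipeline), and the upload of the finished file back to the cache URL happens outside the hunk shown:

# Hedged sketch of the cache check; sql_text, bounds, and result_tbl are
# placeholders for objects created elsewhere in the server function.
library(digest)
library(httr)
library(glue)
library(duckdbfs)
library(dplyr)

query_id <- digest::digest(paste(sql_text, bounds, collapse = ""))
data_url <- glue::glue("https://minio.carlboettiger.info/public-data/cache/{query_id}.h3j")

if (httr::status_code(httr::HEAD(data_url)) == 404) {
  cache_parquet <- tempfile(query_id, fileext = ".parquet")
  result_tbl |>
    mutate(log_count = log(count)) |>
    write_dataset(cache_parquet)
}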
test-data.R CHANGED
@@ -1,23 +1,89 @@
-
 library(duckdbfs)
 library(dplyr)
-
 library(sf)
 library(spData)
-bounds <- spData::us_states |> dplyr::filter(NAME == "Arizona") |> sf::st_bbox()
-attach(as.list(bounds))
+
+duckdbfs::load_h3()
+duckdbfs::load_spatial()
+
+
+#fs::file_delete(tmp)
+ex1 <- spData::us_states |> dplyr::filter(NAME == "Arizona")
+ex2 <- world |> filter(iso_a2 == "US")
+
+
+as_dataset.sf <- function(sf, ...) {
+  # cludgy way to get polygon into duckdb as spatial data
+  tmp <- tempfile(fileext = ".fgb")
+  sf |> st_transform(4326) |> write_sf(tmp, append = FALSE)
+  aoi <- open_dataset(tmp, ...)
+
+  aoi
+}
+
+get_h3index <- function(aoi, zoom = 0L, precision = 6L) {
+
+  zoom <- as.integer(zoom)
+
+  # consider auto-retry at higher precision if subset is empty.
+  precision <- as.integer(precision)
+
+  res <- paste0("h", precision)
+  # multipolygon dump may not be needed for draw tools.
+  h3_aoi <- aoi |>
+    mutate(poly = array_extract(unnest(st_dump(geom)), "geom"),
+           hexid = h3_polygon_wkt_to_cells(poly, {precision}),
+           hexid = unnest(hexid)
+    ) |>
+    mutate(h0 = h3_h3_to_string(h3_cell_to_parent(hexid, {zoom})),
+           hexid = h3_h3_to_string(hexid)) |>
+    mutate(h0 = toupper(h0), hexid = toupper(hexid))
+
+  # create a view as well
+  h3_aoi |> select(h0, hexid) |>
+    #rename(!!res := hexid) |>
+    as_view("h3_aoi")
+
+  subset <- h3_aoi |>
+    select(h0) |>
+    distinct() |>
+    pull(h0)
+
+  subset
+}
+
+aoi <- as_dataset.sf(ex1)
+subset <- get_h3index(aoi)
+urls <- paste0("https://minio.carlboettiger.info/public-gbif/hex/h0=", subset, "/part0.parquet")
+gbif <- open_dataset(urls, tblname = "gbif")
+
+
+x <- gbif |> rename(hexid = h8) |> count(hexid, name = "count")
+
+con <- cached_connection()
+y <- tbl(con, "h3_aoi")
 
 
-duckdb_secrets(Sys.getenv("MINIO_KEY"),
-               Sys.getenv("MINIO_SECRET"),
-               "minio.carlboettiger.info")
+hex_join <- function(x, y) {
+  res_x <- x |> head(1) |> mutate(res = h3_get_resolution(hexid)) |> pull(res)
+  res_y <- y |> head(1) |> mutate(res = h3_get_resolution(hexid)) |> pull(res)
 
-gbif <- open_dataset("s3://public-gbif/2024-10-01", tblname = "gbif")
-gbif_aoi <-
-  gbif |> dplyr::filter(decimallatitude > 0) |> show_query()
 
+  if (res_x > res_y) {
+    y <- y |>
+      mutate(hexid = unnest(
+               h3_cell_to_children(hexid, {res_x})),
+             hexid = toupper(hexid)
+      )
 
+  }
 
+  if (res_x < res_y) {
+    y <- y |>
+      mutate(hexid = h3_cell_to_parent(hexid, {res_x}))
+  }
 
+  inner_join(x, y)
 
-gbif_aoi |> show_query()
+}
+hex_join(x, y)
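
The hex_join() sketch above works because any H3 cell can be walked up to a coarser parent or expanded into its finer children. A rough way to poke at the underlying DuckDB functions directly in SQL, assuming the h3 extension loads as load_h3() does above (the toy polygon is arbitrary):

# Inspect the H3 helpers hex_join() relies on, against the same cached
# DuckDB connection; the polygon here is just a small test square.
library(duckdbfs)
library(DBI)
duckdbfs::load_h3()
con <- cached_connection()

dbGetQuery(con, "
  SELECT h3_h3_to_string(cell)                       AS h3id,
         h3_get_resolution(cell)                     AS res,
         h3_h3_to_string(h3_cell_to_parent(cell, 0)) AS h0
  FROM (
    SELECT unnest(h3_polygon_wkt_to_cells(
             'POLYGON ((-110 38, -110 38.2, -109.8 38.2, -109.8 38, -110 38))', 6)) AS cell
  )
  LIMIT 3
")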
utils.R CHANGED
@@ -1,4 +1,59 @@
 
-library(tidyverse)
+library(dplyr)
 library(duckdbfs)
+library(sf)
+duckdbfs::load_h3()
 
+as_dataset.sf <- function(sf, ...) {
+  # cludgy way to get polygon into duckdb as spatial data
+  tmp <- tempfile(fileext = ".fgb")
+  sf |> sf::st_transform(4326) |> sf::write_sf(tmp, append = FALSE)
+  aoi <- duckdbfs::open_dataset(tmp, ...)
+
+  aoi
+}
+
+get_h3_aoi <- function(aoi, zoom = 0L, precision = 6L) {
+
+  zoom <- as.integer(zoom)
+
+  # consider auto-retry at higher precision if subset is empty.
+  precision <- as.integer(precision)
+
+  res <- paste0("h", precision)
+  # multipolygon dump may not be needed for draw tools.
+  h3_aoi <- aoi |>
+    mutate(poly = array_extract(unnest(st_dump(geom)), "geom"),
+           h3id = h3_polygon_wkt_to_cells(poly, {precision}),
+           h3id = unnest(h3id)
+    ) |>
+    mutate(h0 = h3_h3_to_string(h3_cell_to_parent(h3id, {zoom})),
+           h3id = h3_h3_to_string(h3id)) |>
+    mutate(h0 = toupper(h0), h3id = toupper(h3id)) |>
+    select(h0, h3id) |>
+    as_view("h3_aoi")
+}
+
+hex_res <- function(x) {
+  x |>
+    utils::head(1) |>
+    dplyr::mutate(res = h3_get_resolution(h3id)) |>
+    dplyr::pull(res)
+}
+
+hex_join <- function(x, y) {
+  res_x <- hex_res(x)
+  res_y <- hex_res(y)
+
+  if (res_x > res_y) {
+    y <- y |>
+      dplyr::mutate(h3id = unnest(h3_cell_to_children(h3id, {res_x})),
+                    h3id = toupper(h3id))
+  }
+  if (res_x < res_y) {
+    y <- y |>
+      dplyr::mutate(h3id = h3_cell_to_parent(h3id, {res_x}))
+  }
+
+  dplyr::inner_join(x, y)
+}
+ }