Spaces:
Sleeping
Sleeping
:construction: :construction:
Browse files- app.R +163 -201
- footer.md +0 -16
- system-prompt.md +0 -34
- test-app.R +44 -63
- test-data.R +13 -29
app.R
CHANGED
@@ -3,16 +3,18 @@ library(bslib)
|
|
3 |
library(htmltools)
|
4 |
library(fontawesome)
|
5 |
library(bsicons)
|
6 |
-
library(gt)
|
7 |
-
library(colourpicker)
|
8 |
library(glue)
|
9 |
-
|
10 |
-
library(
|
11 |
-
library(
|
12 |
library(dplyr)
|
|
|
13 |
library(mapgl)
|
14 |
-
library(
|
15 |
-
|
|
|
|
|
|
|
16 |
|
17 |
css <-
|
18 |
HTML(paste0("<link rel='stylesheet' type='text/css' ",
|
@@ -26,63 +28,51 @@ ui <- page_sidebar(
|
|
26 |
fillable = FALSE, # do not squeeze to vertical screen space
|
27 |
tags$head(css),
|
28 |
titlePanel("Demo App"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
-
"
|
31 |
-
This is a proof-of-principle for a simple chat-driven interface
|
32 |
-
to dynamically explore geospatial data.
|
33 |
-
",
|
34 |
|
35 |
card(
|
36 |
layout_columns(
|
37 |
textInput("chat",
|
38 |
label = NULL,
|
39 |
-
"
|
40 |
width = "100%"),
|
|
|
41 |
div(
|
42 |
actionButton("user_msg", "", icon = icon("paper-plane"),
|
43 |
class = "btn-primary btn-sm align-bottom"),
|
44 |
class = "align-text-bottom"),
|
45 |
-
col_widths = c(11, 1)
|
46 |
fill = FALSE
|
|
|
47 |
),
|
48 |
|
49 |
textOutput("agent"),
|
50 |
|
51 |
-
|
52 |
-
layout_columns(
|
53 |
-
card(maplibreOutput("map")),
|
54 |
-
card(includeMarkdown("## Plot"),
|
55 |
-
plotOutput("chart1"),
|
56 |
-
plotOutput("chart2"),
|
57 |
-
),
|
58 |
-
col_widths = c(8, 4),
|
59 |
-
row_heights = c("500px"),
|
60 |
-
max_height = "600px"
|
61 |
-
),
|
62 |
-
|
63 |
-
gt_output("table"),
|
64 |
-
|
65 |
-
card(fill = TRUE,
|
66 |
-
card_header(fa("robot"), textOutput("model", inline = TRUE)),
|
67 |
-
accordion(
|
68 |
-
open = FALSE,
|
69 |
-
accordion_panel(
|
70 |
-
title = "show sql",
|
71 |
-
icon = fa("terminal"),
|
72 |
-
verbatimTextOutput("sql_code"),
|
73 |
-
),
|
74 |
-
accordion_panel(
|
75 |
-
title = "explain",
|
76 |
-
icon = fa("user", prefer_type="solid"),
|
77 |
-
textOutput("explanation"),
|
78 |
-
)
|
79 |
-
),
|
80 |
-
),
|
81 |
-
card(
|
82 |
-
card_header("Errata"),
|
83 |
-
shiny::markdown(readr::read_file("footer.md")),
|
84 |
-
),
|
85 |
sidebar = sidebar(
|
|
|
|
|
|
|
|
|
|
|
86 |
selectInput(
|
87 |
"select",
|
88 |
"Select an LLM:",
|
@@ -91,192 +81,164 @@ ui <- page_sidebar(
|
|
91 |
"Gorilla (UC Berkeley)" = "gorilla"
|
92 |
)
|
93 |
),
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
card(
|
106 |
card_header(bs_icon("github"), "Source code:"),
|
107 |
-
a(href = "https://github.com/boettiger-lab/
|
108 |
-
"https://github.com/boettiger-lab/
|
109 |
),
|
110 |
|
111 |
theme = bs_theme(version = "5")
|
112 |
)
|
113 |
|
114 |
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
con <- duckdbfs::cached_connection()
|
119 |
-
svi <- open_dataset(parquet, tblname = "svi") |> filter(RPL_THEMES > 0)
|
120 |
-
|
121 |
-
safe_parse <- function(txt) {
|
122 |
-
gsub("[\r\n]", " ", txt) |> gsub("\\s+", " ", x = _)
|
123 |
-
}
|
124 |
-
|
125 |
-
|
126 |
-
# helper utilities
|
127 |
-
# faster/more scalable to pass maplibre the ids to refilter pmtiles,
|
128 |
-
# than to pass it the full geospatial/sf object
|
129 |
-
filter_column <- function(full_data, filtered_data, id_col = "FIPS") {
|
130 |
-
if (nrow(filtered_data) < 1) return(NULL)
|
131 |
-
values <- full_data |>
|
132 |
-
inner_join(filtered_data, copy = TRUE) |>
|
133 |
-
pull(id_col)
|
134 |
-
# maplibre syntax for the filter of PMTiles
|
135 |
-
list("in", list("get", id_col), list("literal", values))
|
136 |
-
}
|
137 |
-
|
138 |
|
|
|
|
|
139 |
|
140 |
# Define the server
|
141 |
server <- function(input, output, session) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
collect()
|
147 |
-
|
148 |
-
chart1 <- chart1_data |>
|
149 |
-
ggplot(aes(mean_svi)) + geom_density(fill="darkred") +
|
150 |
-
ggtitle("County-level vulnerability nation-wide")
|
151 |
-
|
152 |
-
data <- reactiveValues(df = tibble())
|
153 |
-
output$chart1 <- renderPlot(chart1)
|
154 |
-
|
155 |
-
model <- reactive(input$select)
|
156 |
-
output$model <- renderText(input$select)
|
157 |
-
observe({
|
158 |
-
schema <- read_file("schema.yml")
|
159 |
-
system_prompt <- glue::glue(readr::read_file("system-prompt.md"),
|
160 |
-
.open = "<", .close = ">")
|
161 |
-
chat <- ellmer::chat_vllm(
|
162 |
-
base_url = "https://llm.nrp-nautilus.io/",
|
163 |
-
model = model(),
|
164 |
-
api_key = Sys.getenv("NRP_API_KEY"),
|
165 |
-
system_prompt = system_prompt,
|
166 |
-
api_args = list(temperature = 0)
|
167 |
-
)
|
168 |
-
|
169 |
-
observeEvent(input$user_msg, {
|
170 |
-
stream <- chat$chat(input$chat)
|
171 |
|
172 |
-
|
173 |
-
|
174 |
-
|
|
|
175 |
|
176 |
-
if ("query" %in% names(response)) {
|
177 |
-
output$sql_code <- renderText(stringr::str_wrap(response$query, width = 60))
|
178 |
-
output$explanation <- renderText(response$explanation)
|
179 |
|
180 |
-
|
181 |
-
|
|
|
|
|
|
|
182 |
|
183 |
-
|
184 |
-
df <- df |> select(-any_of("Shape"))
|
185 |
-
output$table <- render_gt(df, height = 300)
|
186 |
|
|
|
187 |
|
188 |
-
y_axis <- colnames(df)[!colnames(df) %in% colnames(svi)]
|
189 |
-
chart2 <- df |>
|
190 |
-
rename(social_vulnerability = y_axis) |>
|
191 |
-
ggplot(aes(social_vulnerability)) +
|
192 |
-
geom_density(fill = "darkred") +
|
193 |
-
xlim(c(0, 1)) +
|
194 |
-
ggtitle("Vulnerability of selected areas")
|
195 |
|
196 |
-
output$chart2 <- renderPlot(chart2)
|
197 |
|
198 |
-
|
199 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
200 |
|
201 |
-
|
202 |
-
# this can confuse the agent and mess up behavior, so we reset:
|
203 |
-
chat$set_turns(NULL)
|
204 |
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
}
|
209 |
|
210 |
-
|
211 |
-
|
|
|
|
|
|
|
|
|
|
|
212 |
|
213 |
|
214 |
-
|
|
|
215 |
|
216 |
-
|
217 |
-
|
218 |
-
m <- m |>
|
219 |
-
add_fill_layer(
|
220 |
-
id = "redlines",
|
221 |
-
source = list(type = "vector",
|
222 |
-
url = paste0("pmtiles://", "https://data.source.coop/cboettig/us-boundaries/mappinginequality.pmtiles")),
|
223 |
-
source_layer = "mappinginequality",
|
224 |
-
fill_color = list("get", "fill")
|
225 |
-
)
|
226 |
-
}
|
227 |
-
if (input$richness) {
|
228 |
-
m <- m |>
|
229 |
-
add_raster_source(id = "richness",
|
230 |
-
tiles = "https://data.source.coop/cboettig/mobi/tiles/red/species-richness-all/{z}/{x}/{y}.png",
|
231 |
-
maxzoom = 11
|
232 |
-
) |>
|
233 |
-
add_raster_layer(id = "richness-layer",
|
234 |
-
source = "richness")
|
235 |
|
236 |
-
|
|
|
|
|
237 |
|
238 |
-
|
239 |
-
|
240 |
-
add_raster_source(id = "rsr",
|
241 |
-
tiles = "https://data.source.coop/cboettig/mobi/tiles/green/range-size-rarity-all/{z}/{x}/{y}.png",
|
242 |
-
maxzoom = 11
|
243 |
-
) |>
|
244 |
-
add_raster_layer(id = "richness-layer",
|
245 |
-
source = "rsr")
|
246 |
|
|
|
|
|
247 |
}
|
248 |
-
if (input$svi) {
|
249 |
-
m <- m |>
|
250 |
-
add_fill_layer(
|
251 |
-
id = "svi_layer",
|
252 |
-
source = list(type = "vector",
|
253 |
-
url = paste0("pmtiles://", pmtiles)),
|
254 |
-
source_layer = "svi",
|
255 |
-
filter = filter_column(svi, data$df, "FIPS"),
|
256 |
-
fill_opacity = 0.5,
|
257 |
-
fill_color = interpolate(column = "RPL_THEMES",
|
258 |
-
values = c(0, 1),
|
259 |
-
stops = c("lightpink", "darkred"),
|
260 |
-
na_color = "lightgrey")
|
261 |
-
)
|
262 |
-
}
|
263 |
-
m |>
|
264 |
-
add_draw_control() |>
|
265 |
-
add_geocoder_control()
|
266 |
-
|
267 |
-
})
|
268 |
-
|
269 |
-
observeEvent(input$color, {
|
270 |
-
maplibre_proxy("map") |>
|
271 |
-
set_paint_property("svi_layer", "fill-color", input$color)
|
272 |
-
})
|
273 |
-
|
274 |
-
observeEvent(input$slider, {
|
275 |
-
maplibre_proxy("map") |>
|
276 |
-
set_filter("svi_layer",
|
277 |
-
list(">=", get_column("BIR74"), input$slider))
|
278 |
-
})
|
279 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
280 |
}
|
281 |
|
282 |
# Run the app
|
|
|
3 |
library(htmltools)
|
4 |
library(fontawesome)
|
5 |
library(bsicons)
|
|
|
|
|
6 |
library(glue)
|
7 |
+
library(sf)
|
8 |
+
library(duckdb.agent)
|
9 |
+
library(duckdbfs)
|
10 |
library(dplyr)
|
11 |
+
library(ellmer)
|
12 |
library(mapgl)
|
13 |
+
library(digest)
|
14 |
+
library(stringr)
|
15 |
+
library(shinybusy)
|
16 |
+
|
17 |
+
duckdbfs::close_connection()
|
18 |
|
19 |
css <-
|
20 |
HTML(paste0("<link rel='stylesheet' type='text/css' ",
|
|
|
28 |
fillable = FALSE, # do not squeeze to vertical screen space
|
29 |
tags$head(css),
|
30 |
titlePanel("Demo App"),
|
31 |
+
shinybusy::add_busy_spinner(),
|
32 |
+
|
33 |
+
p("
|
34 |
+
Select a desired area with the draw tools on the map, then hit 'Set Area of Interest' to select.
|
35 |
+
Then, enter your query in the text box below the map to count occurrences of your specified taxonomic group.
|
36 |
+
Use the airplane button to send your query. The computation may take a few minutes depending on the size and scale of
|
37 |
+
the search.
|
38 |
+
"),
|
39 |
+
|
40 |
+
p("
|
41 |
+
Scroll to zoom, ctrl+click to pitch and rotate. Hit the area button with no selection to include the entire map.
|
42 |
+
"),
|
43 |
+
|
44 |
+
layout_columns(
|
45 |
+
card(maplibreOutput("map")),
|
46 |
+
div(actionButton("get_features", "Set Area Of Interest", icon = icon("object-group"),
|
47 |
+
class = "btn-primary align-bottom")),
|
48 |
+
col_widths = c(11,1)
|
49 |
+
),
|
50 |
|
|
|
|
|
|
|
|
|
51 |
|
52 |
card(
|
53 |
layout_columns(
|
54 |
textInput("chat",
|
55 |
label = NULL,
|
56 |
+
"show all bird occurrences at zoom level 6",
|
57 |
width = "100%"),
|
58 |
+
|
59 |
div(
|
60 |
actionButton("user_msg", "", icon = icon("paper-plane"),
|
61 |
class = "btn-primary btn-sm align-bottom"),
|
62 |
class = "align-text-bottom"),
|
63 |
+
col_widths = c(11, 1),
|
64 |
fill = FALSE
|
65 |
+
),
|
66 |
),
|
67 |
|
68 |
textOutput("agent"),
|
69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
sidebar = sidebar(
|
71 |
+
card(fill = TRUE,
|
72 |
+
card_header("Selected area:"),
|
73 |
+
verbatimTextOutput("feature_output")
|
74 |
+
),
|
75 |
+
|
76 |
selectInput(
|
77 |
"select",
|
78 |
"Select an LLM:",
|
|
|
81 |
"Gorilla (UC Berkeley)" = "gorilla"
|
82 |
)
|
83 |
),
|
84 |
+
card(fill = TRUE,
|
85 |
+
card_header(fa("robot"), textOutput("model", inline = TRUE)),
|
86 |
+
accordion(
|
87 |
+
open = TRUE,
|
88 |
+
accordion_panel(
|
89 |
+
HTML("<span, class='text-info'>Show SQL query</span>"),
|
90 |
+
icon = fa("terminal"),
|
91 |
+
verbatimTextOutput("sql_code")
|
92 |
+
),
|
93 |
+
accordion_panel(
|
94 |
+
title = "Explain query",
|
95 |
+
icon = fa("user", prefer_type = "solid"),
|
96 |
+
textOutput("explanation")
|
97 |
+
)
|
98 |
+
)
|
99 |
+
),
|
100 |
+
|
101 |
+
|
102 |
card(
|
103 |
card_header(bs_icon("github"), "Source code:"),
|
104 |
+
a(href = "https://github.com/boettiger-lab/biodiversity-justice",
|
105 |
+
"https://github.com/boettiger-lab/biodiversity-justice"))
|
106 |
),
|
107 |
|
108 |
theme = bs_theme(version = "5")
|
109 |
)
|
110 |
|
111 |
|
112 |
+
duckdb_secrets(Sys.getenv("MINIO_KEY"),
|
113 |
+
Sys.getenv("MINIO_SECRET"),
|
114 |
+
"minio.carlboettiger.info")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
115 |
|
116 |
+
gbif <- open_dataset("s3://public-gbif/2024-10-01", tblname = "gbif")
|
117 |
+
bounds <- ""
|
118 |
|
119 |
# Define the server
|
120 |
server <- function(input, output, session) {
|
121 |
+
output$map <- renderMaplibre({
|
122 |
+
m <- maplibre(center=c(-110, 38), zoom = 3, pitch = 0) |>
|
123 |
+
add_draw_control() |>
|
124 |
+
add_geocoder_control() #|>
|
125 |
+
# set_projection("globe")
|
126 |
+
|
127 |
+
m
|
128 |
+
})
|
129 |
+
observeEvent(input$get_features, {
|
130 |
|
131 |
+
drawn_features <- get_drawn_features(mapboxgl_proxy("map"))
|
132 |
+
if(nrow(drawn_features) > 0) {
|
133 |
+
bounds <- st_bbox(drawn_features)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
|
135 |
+
# print(bounds)
|
136 |
+
output$feature_output <- renderPrint({
|
137 |
+
print(bounds)
|
138 |
+
})
|
139 |
|
|
|
|
|
|
|
140 |
|
141 |
+
attach(as.list(bounds))
|
142 |
+
gbif_aoi <- gbif |>
|
143 |
+
dplyr::filter(between(decimallatitude, ymin, ymax),
|
144 |
+
between(decimallongitude, xmin, xmax)) |>
|
145 |
+
as_view("gbif_aoi")
|
146 |
|
147 |
+
}
|
|
|
|
|
148 |
|
149 |
+
observeEvent(input$user_msg, {
|
150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
151 |
|
|
|
152 |
|
153 |
+
system_prompt = create_prompt(additional_instructions =
|
154 |
+
"Note that the columns h1, h2, h3, through h11 contains a geohash representing a H3 hexagon index.
|
155 |
+
Higher numbers indicate higher zoom resolution (smaller hexes)
|
156 |
+
Always aggregate results to count the number of rows matching
|
157 |
+
the query to the desired hexagon. Always name the count column 'count'.
|
158 |
+
Remember to group by hexagon level to aggregate!
|
159 |
+
Always rename the chosen hexagon column as 'h3id' in your final answer.
|
160 |
+
Always use table notation like 'gbif.order' to specify column names.
|
161 |
+
Be sure to generate fully valid SQL. Check your SQL for possible errors.
|
162 |
|
163 |
+
Always use the table 'gbif_aoi' rather than 'gbif' table if both are present.
|
|
|
|
|
164 |
|
165 |
+
IMPORTANT: return raw JSON only, do not decorate your reply with markdown code syntax.
|
166 |
+
")
|
|
|
|
|
167 |
|
168 |
+
agent <- ellmer::chat_vllm(
|
169 |
+
base_url = "https://llm.cirrus.carlboettiger.info/v1/",
|
170 |
+
model = "kosbu/Llama-3.3-70B-Instruct-AWQ",
|
171 |
+
api_key = Sys.getenv("CIRRUS_LLM_KEY"),
|
172 |
+
system_prompt = system_prompt,
|
173 |
+
api_args = list(temperature = 0)
|
174 |
+
)
|
175 |
|
176 |
|
177 |
+
print("Agent thinking...")
|
178 |
+
stream <- agent$chat(input$chat)
|
179 |
|
180 |
+
# Parse response
|
181 |
+
response <- jsonlite::fromJSON(stream)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
|
183 |
+
if ("query" %in% names(response)) {
|
184 |
+
output$sql_code <- renderText(str_wrap(response$query, width = 60))
|
185 |
+
output$explanation <- renderText(response$explanation)
|
186 |
|
187 |
+
# clear agent memory
|
188 |
+
agent$set_turns(NULL)
|
|
|
|
|
|
|
|
|
|
|
|
|
189 |
|
190 |
+
} else {
|
191 |
+
output$agent <- renderText(response$agent)
|
192 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
193 |
|
194 |
+
|
195 |
+
# cache the query
|
196 |
+
query_id <- digest::digest(paste(response$query, bounds, collapse=""))
|
197 |
+
data_url <- glue::glue("https://minio.carlboettiger.info/public-data/cache/{query_id}.h3j")
|
198 |
+
output$url <- renderText(data_url)
|
199 |
+
|
200 |
+
cache_parquet <- glue("{query_id}.parquet")
|
201 |
+
|
202 |
+
# compute if not yet in cache
|
203 |
+
status <- httr::status_code(httr::HEAD(data_url))
|
204 |
+
if(status == 404) {
|
205 |
+
print("Computing...")
|
206 |
+
time <- bench::bench_time({
|
207 |
+
agent_query(stream) |> write_dataset(cache_parquet)
|
208 |
+
})
|
209 |
+
print(time)
|
210 |
+
}
|
211 |
+
|
212 |
+
# draw on map
|
213 |
+
h3j <- glue("s3://public-data/cache/{query_id}.h3j")
|
214 |
+
open_dataset(cache_parquet) |> to_h3j(h3j)
|
215 |
+
|
216 |
+
# override previous map with drawn map
|
217 |
+
output$map <- renderMaplibre({
|
218 |
+
m <- maplibre(center=c(-110, 38), zoom = 3, pitch = 0, maxZoom = 9) |>
|
219 |
+
add_h3j_source("h3j_source",
|
220 |
+
url = data_url) |>
|
221 |
+
add_fill_extrusion_layer(
|
222 |
+
id = "h3j_layer",
|
223 |
+
source = "h3j_source",
|
224 |
+
tooltip = "count",
|
225 |
+
fill_extrusion_color = interpolate(
|
226 |
+
column = "count",
|
227 |
+
values = c(0, 1000),
|
228 |
+
stops = c("#430254", "#f83c70")
|
229 |
+
),
|
230 |
+
fill_extrusion_height = list(
|
231 |
+
"interpolate",
|
232 |
+
list("linear"),
|
233 |
+
list("zoom"),
|
234 |
+
0, 0, 1000,
|
235 |
+
list("*", 10, list("get", "count"))
|
236 |
+
),
|
237 |
+
fill_extrusion_opacity = 0.7
|
238 |
+
)
|
239 |
+
}) # close renderMaplibre
|
240 |
+
}) # close observeEvent->get_features
|
241 |
+
}) # close observeEvent->user_msg
|
242 |
}
|
243 |
|
244 |
# Run the app
|
footer.md
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
#### Credits
|
2 |
-
|
3 |
-
Developed by Carl Boettiger, UC Berkeley, 2025. BSD License.
|
4 |
-
|
5 |
-
Data from the US Census and CDC's [Social Vulnerability Index](https://www.atsdr.cdc.gov/place-health/php/svi/index.html)
|
6 |
-
|
7 |
-
#### Technical details
|
8 |
-
|
9 |
-
The app is written entirely in R using shiny. The app will translate natural language queries into SQL code using
|
10 |
-
a small open-weights language model. The SQL code is executed using the duckdb backend against a cloud-native
|
11 |
-
geoparquet snapshot of the Social Vulnerability Index hosted on Source Cooperative. Summary chart data are also
|
12 |
-
computed in duckdb by streaming, providing responsive updates while needing minimal RAM or disk storage despite
|
13 |
-
the large size of the data sources.
|
14 |
-
|
15 |
-
The map is rendered and updated using MapLibre with PMTiles, which provides responsive rendering for large feature sets.
|
16 |
-
The PMTiles layer is also hosted on Source cooperative where it can be streamed efficiently.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
system-prompt.md
DELETED
@@ -1,34 +0,0 @@
|
|
1 |
-
|
2 |
-
You are a helpful agent who always replies strictly in JSON-formatted text.
|
3 |
-
Your task is to translate the user's questions about the data into a SQL query
|
4 |
-
that will be run against the "biodiversity_occurrences" table in a duckdb database.
|
5 |
-
The duckdb database has a spatial extension which understands PostGIS operations as well.
|
6 |
-
|
7 |
-
If your answer involves the construction of a SQL query, you must format your answer as follows:
|
8 |
-
|
9 |
-
{
|
10 |
-
"query": "your raw SQL response goes here",
|
11 |
-
"explanation": "your explanation of the query"
|
12 |
-
}
|
13 |
-
|
14 |
-
If your answer does not involve a SQL query, please reply with the following format instead:
|
15 |
-
|
16 |
-
{
|
17 |
-
"user": "user question goes here",
|
18 |
-
"agent": "your response goes here"
|
19 |
-
}
|
20 |
-
|
21 |
-
If you are asked to describe the data or for information about the data schema, give only a human-readable response with SQL.
|
22 |
-
|
23 |
-
In the data, each row represents an individual occurrence of a species. The occurrences
|
24 |
-
are geocoded to US Census counties, with the STATE, COUNTY, and FIPS columns indicating
|
25 |
-
the corresponding state name, county name, and FIPS identifier for the specific County.
|
26 |
-
The FIPS column is a 5-digit number that uniquely identifies a county in a state.
|
27 |
-
Taxonomic classification of the species is given in the corresponding columns, kingdom,
|
28 |
-
phylum, class, order, family, genus, and species.
|
29 |
-
|
30 |
-
The data also includes information about various measures of social vulnerability (RPL_THEMES).
|
31 |
-
Pay attention to the DESCRIPTION of each of the columns (VARIABLE_NAME) from the metadata table:
|
32 |
-
<schema>
|
33 |
-
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test-app.R
CHANGED
@@ -1,26 +1,24 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
library(tidyverse)
|
4 |
library(duckdbfs)
|
5 |
-
library(
|
6 |
library(ellmer)
|
7 |
-
library(glue)
|
8 |
|
9 |
-
repo <- "https://data.source.coop/cboettig/social-vulnerability"
|
10 |
-
pmtiles <- glue("{repo}/svi2020_us_tract.pmtiles")
|
11 |
-
duckdb_s3_config(s3_endpoint = "minio.carlboettiger.info")
|
12 |
-
svi <-
|
13 |
-
open_dataset("s3://public-gbif/svi", tblname = "biodiversity_occurrences") |>
|
14 |
-
filter(RPL_THEMES > 0)
|
15 |
-
schema <- read_file("schema.yml")
|
16 |
-
system_prompt <- glue::glue(readr::read_file("system-prompt.md"),
|
17 |
-
.open = "<", .close = ">")
|
18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
-
|
23 |
-
chat <- ellmer::chat_vllm(
|
24 |
base_url = "https://llm.cirrus.carlboettiger.info/v1/",
|
25 |
model = "kosbu/Llama-3.3-70B-Instruct-AWQ",
|
26 |
api_key = Sys.getenv("CIRRUS_LLM_KEY"),
|
@@ -28,55 +26,38 @@ chat <- ellmer::chat_vllm(
|
|
28 |
api_args = list(temperature = 0)
|
29 |
)
|
30 |
|
31 |
-
|
32 |
-
|
33 |
-
base_url = "https://llm.nrp-nautilus.io/",
|
34 |
-
model = "olmo",
|
35 |
-
api_key = Sys.getenv("NRP_API_KEY"),
|
36 |
-
system_prompt = system_prompt,
|
37 |
-
api_args = list(temperature = 0)
|
38 |
-
)
|
39 |
-
|
40 |
-
cols <- colnames(svi)
|
41 |
-
rpls <- grep("RPL_THEME.+", cols)
|
42 |
-
keep <- cols[c(1:9, rpls, 161:226)]
|
43 |
-
biodiversity <- svi |> select(all_of(keep))
|
44 |
-
|
45 |
-
# Test a chat-based response
|
46 |
-
chat$chat("Which columns describes racial components of social vulnerability?")
|
47 |
-
chat$set_turns(NULL)
|
48 |
-
## A query-based response
|
49 |
-
stream <- chat$chat("Which counties have the most bird observations?")
|
50 |
-
stream <- chat$chat("Give me the number bird observations per county vs county social vulnerability")
|
51 |
-
response <- jsonlite::fromJSON(stream)
|
52 |
-
|
53 |
|
54 |
-
|
|
|
|
|
55 |
|
56 |
-
response <- jsonlite::fromJSON(stream2)
|
57 |
|
58 |
-
con <- duckdbfs::cached_connection()
|
59 |
-
filtered_data <- DBI::dbGetQuery(con, response$query)
|
60 |
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
|
|
|
|
|
|
|
|
82 |
|
|
|
1 |
+
library(duckdb.agent)
|
|
|
|
|
2 |
library(duckdbfs)
|
3 |
+
library(dplyr)
|
4 |
library(ellmer)
|
|
|
5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
+
duckdb_secrets(Sys.getenv("MINIO_KEY"),
|
8 |
+
Sys.getenv("MINIO_SECRET"),
|
9 |
+
"minio.carlboettiger.info")
|
10 |
+
gbif <- open_dataset("s3://public-gbif/2024-10-01", tblname = "gbif")
|
11 |
+
tracts_url <- "https://minio.carlboettiger.info/public-social-vulnerability/2022-tracts-h3-z8.parquet"
|
12 |
+
tracts_h3 <- open_dataset(tracts_url, tblname = "censustracts")
|
13 |
|
14 |
|
15 |
+
system_prompt = create_prompt(additional_instructions =
|
16 |
+
"Note that the column h8 contains a geohash representing a H3 hexagon index.
|
17 |
+
If asked for data that requires both tables, you should always seek to join on
|
18 |
+
the h8 column. Always aggregate results to count the number of rows matching
|
19 |
+
the query in each h8 hexagon")
|
20 |
|
21 |
+
agent <- ellmer::chat_vllm(
|
|
|
22 |
base_url = "https://llm.cirrus.carlboettiger.info/v1/",
|
23 |
model = "kosbu/Llama-3.3-70B-Instruct-AWQ",
|
24 |
api_key = Sys.getenv("CIRRUS_LLM_KEY"),
|
|
|
26 |
api_args = list(temperature = 0)
|
27 |
)
|
28 |
|
29 |
+
resp <- agent$chat("Birds in Yolo County")
|
30 |
+
out <- agent_query(resp)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
+
bench::bench_time({
|
33 |
+
out |> rename(h3id = h6) |> to_h3j("s3://public-data/test5.h3j")
|
34 |
+
})
|
35 |
|
|
|
36 |
|
|
|
|
|
37 |
|
38 |
+
library(mapgl)
|
39 |
+
url = "https://minio.carlboettiger.info/public-data/test5.h3j"
|
40 |
+
maplibre(center=c(-110, 38), zoom = 3, pitch = 30) |>
|
41 |
+
add_h3j_source("h3j_source",
|
42 |
+
url = url
|
43 |
+
) |>
|
44 |
+
add_fill_extrusion_layer(
|
45 |
+
id = "h3j_layer",
|
46 |
+
source = "h3j_source",
|
47 |
+
fill_extrusion_color = interpolate(
|
48 |
+
column = "count",
|
49 |
+
values = c(0, 1000),
|
50 |
+
stops = c("#430254", "#f83c70")
|
51 |
+
),
|
52 |
+
fill_extrusion_height = list(
|
53 |
+
"interpolate",
|
54 |
+
list("linear"),
|
55 |
+
list("zoom"),
|
56 |
+
0,
|
57 |
+
0,
|
58 |
+
100,
|
59 |
+
list("*", 2, list("get", "count"))
|
60 |
+
),
|
61 |
+
fill_extrusion_opacity = 0.7
|
62 |
+
)
|
63 |
|
test-data.R
CHANGED
@@ -1,39 +1,23 @@
|
|
1 |
-
library(dplyr)
|
2 |
-
library(duckdbfs)
|
3 |
-
library(mapgl)
|
4 |
|
5 |
-
|
6 |
-
|
7 |
-
pmtiles = "https://minio.carlboettiger.info/public-biodiversity/pad-us-4/pad-us-4.pmtiles"
|
8 |
|
9 |
-
|
|
|
|
|
|
|
10 |
|
11 |
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
|
|
|
|
|
|
|
16 |
|
17 |
|
18 |
|
19 |
-
filter_column <- function(full_data, filtered_data, id_col) {
|
20 |
-
#if (nrow(filtered_data) < 1) return(NULL)
|
21 |
-
values <- full_data |>
|
22 |
-
inner_join(filtered_data, copy = TRUE) |>
|
23 |
-
pull(id_col)
|
24 |
-
# maplibre syntax for the filter of PMTiles
|
25 |
-
list("in", list("get", id_col), list("literal", values))
|
26 |
-
}
|
27 |
|
28 |
-
maplibre(center = c(-102.9, 41.3), zoom = 3) |>
|
29 |
-
add_fill_layer(
|
30 |
-
id = "pad",
|
31 |
-
source = list(type = "vector", url = paste0("pmtiles://", pmtiles)),
|
32 |
-
source_layer = "padus4",
|
33 |
-
tooltip = c("Unit_Nm"),
|
34 |
-
# filter = list("in", list("get", "GAP_Sts"), list("literal", values)),
|
35 |
-
filter = filter_column(full_data, filtered_data, "row_n"),
|
36 |
-
fill_opacity = 0.5,
|
37 |
-
fill_color = "darkgreen"
|
38 |
-
)
|
39 |
|
|
|
|
|
|
|
|
|
|
1 |
|
2 |
+
library(duckdbfs)
|
3 |
+
library(dplyr)
|
|
|
4 |
|
5 |
+
library(sf)
|
6 |
+
library(spData)
|
7 |
+
bounds <- spData::us_states |> dplyr::filter(NAME == "Arizona") |> sf::st_bbox()
|
8 |
+
attach(as.list(bounds))
|
9 |
|
10 |
|
11 |
+
duckdb_secrets(Sys.getenv("MINIO_KEY"),
|
12 |
+
Sys.getenv("MINIO_SECRET"),
|
13 |
+
"minio.carlboettiger.info")
|
14 |
|
15 |
+
gbif <- open_dataset("s3://public-gbif/2024-10-01", tblname = "gbif")
|
16 |
+
gbif_aoi <-
|
17 |
+
gbif |> dplyr::filter(decimallatitude > 0) |> show_query()
|
18 |
|
19 |
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
+
gbif_aoi |> show_query()
|