cboettig commited on
Commit
639cb71
·
unverified ·
0 Parent(s):

Initial commit

Browse files
Files changed (13) hide show
  1. .github/workflows/deploy.yml +20 -0
  2. .gitignore +49 -0
  3. Dockerfile +21 -0
  4. LICENSE +24 -0
  5. README.md +43 -0
  6. app.R +272 -0
  7. footer.md +16 -0
  8. geo-llm-r.Rproj +17 -0
  9. preprocess.md +27 -0
  10. schema.yml +26 -0
  11. system-prompt.md +35 -0
  12. test.R +69 -0
  13. utils.R +4 -0
.github/workflows/deploy.yml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face hub
2
+ on:
3
+ push:
4
+ branches: [main]
5
+
6
+ # to run this workflow manually from the Actions tab
7
+ workflow_dispatch:
8
+
9
+ jobs:
10
+ sync-to-hub:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v3
14
+ with:
15
+ fetch-depth: 0
16
+ lfs: true
17
+ - name: Push to hub
18
+ env:
19
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
20
+ run: git push -f https://cboettig:[email protected]/spaces/boettiger-lab/geo-llm-r main
.gitignore ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # History files
2
+ .Rhistory
3
+ .Rapp.history
4
+
5
+ # Session Data files
6
+ .RData
7
+ .RDataTmp
8
+
9
+ # User-specific files
10
+ .Ruserdata
11
+
12
+ # Example code in package build process
13
+ *-Ex.R
14
+
15
+ # Output files from R CMD build
16
+ /*.tar.gz
17
+
18
+ # Output files from R CMD check
19
+ /*.Rcheck/
20
+
21
+ # RStudio files
22
+ .Rproj.user/
23
+
24
+ # produced vignettes
25
+ vignettes/*.html
26
+ vignettes/*.pdf
27
+
28
+ # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
29
+ .httr-oauth
30
+
31
+ # knitr and R markdown default cache directories
32
+ *_cache/
33
+ /cache/
34
+
35
+ # Temporary files created by R markdown
36
+ *.utf8.md
37
+ *.knit.md
38
+
39
+ # R Environment Variables
40
+ .Renviron
41
+
42
+ # pkgdown site
43
+ docs/
44
+
45
+ # translation temp files
46
+ po/*~
47
+
48
+ # RStudio Connect folder
49
+ rsconnect/
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM rocker/geospatial:latest
2
+
3
+ WORKDIR /code
4
+
5
+ RUN install2.r --error \
6
+ bsicons \
7
+ bslib \
8
+ duckdbfs \
9
+ fontawesome \
10
+ gt \
11
+ markdown \
12
+ shiny \
13
+ shinychat \
14
+ tidyverse \
15
+ colourpicker
16
+
17
+ RUN installGithub.r cboettig/mapgl tidyverse/ellmer
18
+
19
+ COPY . .
20
+
21
+ CMD ["R", "--quiet", "-e", "shiny::runApp(host='0.0.0.0', port=7860)"]
LICENSE ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ BSD 2-Clause License
2
+
3
+ Copyright (c) 2024, Boettiger Lab, UC Berkeley
4
+
5
+ Redistribution and use in source and binary forms, with or without
6
+ modification, are permitted provided that the following conditions are met:
7
+
8
+ 1. Redistributions of source code must retain the above copyright notice, this
9
+ list of conditions and the following disclaimer.
10
+
11
+ 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ this list of conditions and the following disclaimer in the documentation
13
+ and/or other materials provided with the distribution.
14
+
15
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
README.md ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Geo Llm R
3
+ emoji: 📚
4
+ colorFrom: blue
5
+ colorTo: yellow
6
+ sdk: docker
7
+ pinned: false
8
+ license: bsd-2-clause
9
+ ---
10
+
11
+ # Demo Shiny App with Maplibre + open LLM interface
12
+
13
+ :hugs: Shiny App on Huggingface: <https://huggingface.co/spaces/boettiger-lab/geo-llm-r>
14
+
15
+ Work in progress. This is a proof-of-principle for an LLM-driven interface to dynamic mapping. Key technologies include duckdb, geoparquet, pmtiles, maplibre, open LLMs (via VLLM + LiteLLM). R interface through ellmer (LLMs), mapgl (maplibre), shiny, and duckdb.
16
+
17
+ # Setup
18
+
19
+ ## GitHub with HuggingFace Deploy
20
+
21
+ All edits should be pushed to GitHub. Edits to `main` branch are automatically deployed to HuggingFace via GitHub Actions.
22
+ When using this scaffold, you will first have to set up your auto-deploy system:
23
+
24
+ - [Create a new HuggingFace Space](https://huggingface.co/new-space) (any template is fine, will be overwritten).
25
+ - [Create a HuggingFace Token](https://huggingface.co/settings/tokens/new?tokenType=write) with write permissions if you do not have one.
26
+ - In the GitHub Settings of your repository, add the token as a "New Repository Secret" under the `Secrets and Variables` -> `Actions` section of settings (`https://github.com/{USER}/{REPO}/settings/secrets/actions`).
27
+ - Edit the `.github/workflows/deploy.yml` file to specify your HuggingFace user name and HF repo to publish to.
28
+
29
+ ## Language Model setup
30
+
31
+ This example is designed to be able to leverage open source or open weights models. You will need to adjust the API URL and API key accordingly. This could be a local model with `vllm` or `ollama`, and of course commercial models should work too. The demo app currently runs on a VLLM+LiteLLM-backed model, currently a Llama3 variant, hosted on the National Research Platform.
32
+
33
+ The LLM plays only a simple role in generating SQL queries from background information on the data including the table schema, see the system prompt for details. Most open models I have experimented with do not support the [tool use](https://ellmer.tidyverse.org/articles/tool-calling.html) or [structured data](https://ellmer.tidyverse.org/articles/structured-data.html) interfaces very well compared to commercial models. An important trick in working with open models used here is merely requesting the reply be structured as JSON. Open models are quite decent at this, and at SQL construction, given necessary context about the data. The map and chart elements merely react to the resulting data frames, and the entire analysis is thus transparent and reproducible as it would be if the user had composed their request in SQL instead of plain English.
34
+
35
+ ## Software Dependencies
36
+
37
+ The Dockerfile includes all dependencies required for the HuggingFace deployment, and can be used as a template or directly to serve RStudio server.
38
+
39
+ ## Data pre-processing
40
+
41
+ Pre-processing the data into cloud-native formats and hosting data on a high bandwidth, highly available server is essential for efficient and scalable rendering. Pre-computing expensive operations such as zonal statistics across all features is also necessary. These steps are described in [preprocess.md](preprocess.md) and corresponding scripts.
42
+
43
+
app.R ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ library(shiny)
2
+ library(bslib)
3
+ library(htmltools)
4
+ library(fontawesome)
5
+ library(bsicons)
6
+ library(gt)
7
+ library(colourpicker)
8
+ library(glue)
9
+
10
+ library(ggplot2)
11
+ library(readr)
12
+ library(dplyr)
13
+ library(mapgl)
14
+ library(duckdbfs)
15
+ duckdbfs::load_spatial()
16
+
17
# Stylesheet tag injected into the page <head> (see tags$head(css) in the UI):
# loads Creative Tim's Material Dashboard theme from their CDN, with the
# version pinned via the ?v= query string so upstream changes don't surprise us.
css <-
  HTML(paste0("<link rel='stylesheet' type='text/css' ",
              "href='https://demos.creative-tim.com/",
              "material-dashboard/assets/css/",
              "material-dashboard.min.css?v=3.2.0'>"))
22
+
23
+
24
# Define the UI
#
# Layout (top to bottom): intro text, chat input row, agent text response,
# map + charts, result table, model/SQL details card, credits footer.
# The sidebar holds the LLM picker and the map-layer toggle switches.
ui <- page_sidebar(
  fillable = FALSE, # do not squeeze to vertical screen space
  tags$head(css),
  titlePanel("Demo App"),

  "
  This is a proof-of-principle for a simple chat-driven interface
  to dynamically explore geospatial data.
  ",

  # Chat input: free-text question plus a send button (input$user_msg).
  card(
    layout_columns(
      textInput("chat",
                label = NULL,
                "Which four counties in California have the highest average social vulnerability?",
                width = "100%"),
      div(
        actionButton("user_msg", "", icon = icon("paper-plane"),
                     class = "btn-primary btn-sm align-bottom"),
        class = "align-text-bottom"),
      col_widths = c(11, 1)),
    fill = FALSE
  ),

  # Plain-text reply shown when the LLM answers without a SQL query.
  textOutput("agent"),


  # Main visuals: maplibre map (left) and the two density charts (right).
  layout_columns(
    card(maplibreOutput("map")),
    card(includeMarkdown("## Plot"),
         plotOutput("chart1"),
         plotOutput("chart2"),
    ),
    col_widths = c(8, 4),
    row_heights = c("500px"),
    max_height = "600px"
  ),

  # Table of rows returned by the generated SQL query.
  gt_output("table"),

  # Collapsible details: which model answered, the raw SQL, and its explanation.
  card(fill = TRUE,
       card_header(fa("robot"), textOutput("model", inline = TRUE)),
       accordion(
         open = FALSE,
         accordion_panel(
           title = "show sql",
           icon = fa("terminal"),
           verbatimTextOutput("sql_code"),
         ),
         accordion_panel(
           title = "explain",
           icon = fa("user", prefer_type="solid"),
           textOutput("explanation"),
         )
       ),
  ),
  card(
    card_header("Errata"),
    shiny::markdown(readr::read_file("footer.md")),
  ),
  sidebar = sidebar(
    # Model choice; values are the model names passed to ellmer::chat_vllm().
    selectInput(
      "select",
      "Select an LLM:",
      list("LLama3" = "llama3",
           #"OLMO2 (AllenAI)" = "olmo",
           "Gorilla (UC Berkeley)" = "gorilla"
      )
    ),

    # Map layer toggles, read by renderMaplibre() in the server.
    input_switch("redlines", "Redlined Areas", value = FALSE),
    input_switch("svi", "Social Vulnerability", value = TRUE),
    input_switch("richness", "Biodiversity Richness", value = FALSE),
    input_switch("rsr", "Biodiversity Range Size Rarity", value = FALSE),


    card(
      card_header(bs_icon("github"), "Source code:"),
      a(href = "https://github.com/boettiger-lab/geo-llm-r",
        "https://github.com/boettiger-lab/geo-llm-r"))
  ),

  theme = bs_theme(version = "5")
)
109
+
110
+
111
# Cloud-native data sources on Source Cooperative: the same SVI tract data
# serialized both as PMTiles (for maplibre rendering) and GeoParquet (for
# duckdb SQL queries against the "svi" table).
repo <- "https://data.source.coop/cboettig/social-vulnerability"
pmtiles <- glue("{repo}/2022/SVI2022_US_tract.pmtiles")
parquet <- glue("{repo}/2022/SVI2022_US_tract.parquet")
con <- duckdbfs::cached_connection()
# Lazy duckdb table; RPL_THEMES < 0 encodes missing data upstream, so drop it.
svi <- open_dataset(parquet, tblname = "svi") |> filter(RPL_THEMES > 0)
116
+
117
# Collapse all whitespace in `txt` (including CR/LF) down to single spaces so
# that a multi-line LLM reply can be handed to jsonlite::fromJSON() safely.
safe_parse <- function(txt) {
  no_newlines <- gsub("[\r\n]", " ", txt)
  gsub("\\s+", " ", no_newlines)
}
120
+
121
+
122
# helper utilities
# faster/more scalable to pass maplibre the ids to refilter pmtiles,
# than to pass it the full geospatial/sf object
#
# Build a maplibre filter expression selecting the features of `full_data`
# that also appear in `filtered_data` (natural join on shared columns).
# Returns NULL — i.e. no filter — when `filtered_data` has no rows.
# `id_col` names the feature-id property present in the PMTiles source.
filter_column <- function(full_data, filtered_data, id_col = "FIPS") {
  if (nrow(filtered_data) < 1) {
    return(NULL)
  }
  # copy = TRUE lets dplyr join the local result frame against the duckdb table
  matched <- inner_join(full_data, filtered_data, copy = TRUE)
  ids <- pull(matched, id_col)
  # maplibre syntax for the filter of PMTiles
  list("in", list("get", id_col), list("literal", ids))
}
133
+
134
+
135
+
136
# Define the server
#
# Wires the chat input to the LLM (which translates questions into SQL),
# runs the generated SQL in duckdb, and pushes the results to the table,
# charts, and the maplibre map (via PMTiles id filtering).
server <- function(input, output, session) {

  # Nation-wide county-level summary, computed once per session in duckdb.
  chart1_data <- svi |>
    group_by(COUNTY) |>
    summarise(mean_svi = mean(RPL_THEMES)) |>
    collect()

  chart1 <- chart1_data |>
    ggplot(aes(mean_svi)) + geom_density(fill="darkred") +
    ggtitle("County-level vulnerability nation-wide")

  # Rows returned by the most recent LLM-generated query; the map layer
  # reacts to changes in this value.
  data <- reactiveValues(df = tibble())
  output$chart1 <- renderPlot(chart1)

  model <- reactive(input$select)
  output$model <- renderText(input$select)
  observe({
    # `schema` is interpolated into the system prompt via the <schema>
    # placeholder (glue with < > delimiters to avoid clashing with JSON braces).
    schema <- read_file("schema.yml")
    system_prompt <- glue::glue(readr::read_file("system-prompt.md"),
                                .open = "<", .close = ">")
    # Re-create the chat client whenever the selected model changes.
    chat <- ellmer::chat_vllm(
      base_url = "https://llm.nrp-nautilus.io/",
      model = model(),
      api_key = Sys.getenv("NRP_API_KEY"),
      system_prompt = system_prompt,
      api_args = list(temperature = 0)
    )

    # NOTE(review): registering an observeEvent inside observe() means each
    # change of input$select adds another handler for user_msg — confirm and
    # consider restructuring so only one handler is ever active.
    observeEvent(input$user_msg, {
      stream <- chat$chat(input$chat)

      # Parse response (whitespace-collapsed so fromJSON sees one line)
      response <- jsonlite::fromJSON(safe_parse(stream))

      if ("query" %in% names(response)) {
        output$sql_code <- renderText(stringr::str_wrap(response$query, width = 60))
        output$explanation <- renderText(response$explanation)

        # Actually execute the SQL query generated:
        df <- DBI::dbGetQuery(con, response$query)

        # don't display shape column in render
        df <- df |> select(-any_of("Shape"))
        output$table <- render_gt(df, height = 300)

        # Any column not present in the source table is assumed to be the
        # computed statistic; use it as the density-plot variable.
        y_axis <- colnames(df)[!colnames(df) %in% colnames(svi)]
        chart2 <- df |>
          rename(social_vulnerability = y_axis) |>
          ggplot(aes(social_vulnerability)) +
          geom_density(fill = "darkred") +
          xlim(c(0, 1)) +
          ggtitle("Vulnerability of selected areas")

        output$chart2 <- renderPlot(chart2)

        # We need to somehow trigger this df to update the map.
        data$df <- df

        # Note: ellmer will preserve full chat history automatically.
        # this can confuse the agent and mess up behavior, so we reset:
        chat$set_turns(NULL)

      } else {
        # No SQL was generated: show the plain-text agent reply instead.
        output$agent <- renderText(response$agent)

      }

    })
  })


  output$map <- renderMaplibre({

    m <- maplibre(center = c(-104.9, 40.3), zoom = 3, height = "400")
    if (input$redlines) {
      m <- m |>
        add_fill_layer(
          id = "redlines",
          source = list(type = "vector",
                        url = paste0("pmtiles://", "https://data.source.coop/cboettig/us-boundaries/mappinginequality.pmtiles")),
          source_layer = "mappinginequality",
          fill_color = list("get", "fill")
        )
    }
    if (input$richness) {
      m <- m |>
        add_raster_source(id = "richness",
                          tiles = "https://data.source.coop/cboettig/mobi/tiles/red/species-richness-all/{z}/{x}/{y}.png",
                          maxzoom = 11
        ) |>
        add_raster_layer(id = "richness-layer",
                         source = "richness")

    }

    if (input$rsr) {
      m <- m |>
        add_raster_source(id = "rsr",
                          tiles = "https://data.source.coop/cboettig/mobi/tiles/green/range-size-rarity-all/{z}/{x}/{y}.png",
                          maxzoom = 11
        ) |>
        # BUGFIX: this layer previously reused id "richness-layer", which
        # collides with the richness layer when both switches are on.
        add_raster_layer(id = "rsr-layer",
                         source = "rsr")

    }
    if (input$svi) {
      m <- m |>
        add_fill_layer(
          id = "svi_layer",
          source = list(type = "vector",
                        url = paste0("pmtiles://", pmtiles)),
          source_layer = "svi",
          tooltip = "RPL_THEMES",
          # Restrict rendering to the features matching the latest query result
          filter = filter_column(svi, data$df, "FIPS"),
          fill_opacity = 0.5,
          fill_color = interpolate(column = "RPL_THEMES",
                                   values = c(0, 1),
                                   stops = c("lightpink", "darkred"),
                                   na_color = "lightgrey")
        )
    }
    m |>
      add_draw_control() |>
      add_geocoder_control()

  })

}

# Run the app
shinyApp(ui = ui, server = server)
footer.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #### Credits
2
+
3
+ Developed by Carl Boettiger, UC Berkeley, 2025. BSD License.
4
+
5
+ Data from the US Census and CDC's [Social Vulnerability Index](https://www.atsdr.cdc.gov/place-health/php/svi/index.html)
6
+
7
+ #### Technical details
8
+
9
+ The app is written entirely in R using shiny. The app will translate natural language queries in SQL code using
10
+ a small open-weights language model. The SQL code is executed using the duckdb backend against cloud-native
11
+ geoparquet snapshot of the Social Vulnerability Index hosted on Source Cooperative. Summary chart data are also
12
+ computed in duckdb by streaming, providing responsive updates while needing minimal RAM or disk storage despite
13
+ the large size of the data sources.
14
+
15
+ The map is rendered and updated using MapLibre with PMTiles, which provides responsive rendering for large feature sets.
16
+ The PMTiles layer is also hosted on Source cooperative where it can be streamed efficiently.
geo-llm-r.Rproj ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Version: 1.0
2
+ ProjectId: 337131c5-7aa5-4963-bc4e-d8a156e206a0
3
+
4
+ RestoreWorkspace: Default
5
+ SaveWorkspace: Default
6
+ AlwaysSaveHistory: Default
7
+
8
+ EnableCodeIndexing: Yes
9
+ UseSpacesForTab: Yes
10
+ NumSpacesForTab: 2
11
+ Encoding: UTF-8
12
+
13
+ RnwWeave: Sweave
14
+ LaTeX: pdfLaTeX
15
+
16
+ AutoAppendNewline: Yes
17
+ StripTrailingWhitespace: Yes
preprocess.md ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ ---
3
+
4
+
5
+ # Vector Layers
6
+
7
+ The heart of this application design is a vector dataset serialized as both (Geo)Parquet and PMTiles.
8
+ The parquet version allows for real-time calculations through rapid SQL queries via duckdb,
9
+ and the PMTiles version allows the data to be quickly visualized at any zoom through maplibre.
10
+ maplibre can also efficiently filter the PMTiles data given the feature ids returned by duckdb.
11
+
12
+ `gdal_translate` can generate both PMTiles and geoparquet, though `tippecanoe` provides more
13
+ options for PMTiles generation and can produce nicer tile sets.
14
+
15
+ The demo uses the CDC Social Vulnerability data because it is built on the hierarchical partitioning
16
+ used by the Census (Country->State->County->Tract) hierarchy.
17
+
18
+ # Raster Layers
19
+
20
+ ## Generating static tiles
21
+
22
+ ## Zonal statistics calculations
23
+
24
+ The application is essentially driven by the vector layer data using SQL.
25
+ I find it helpful to pre-process 'zonal' calculations, e.g. the mean value of each raster layer
26
+ within each feature in the 'focal' vector data set(s).
27
+
schema.yml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - VARIABLE_NAME: ST
2
+ DESCRIPTION: INTEGER State-level FIPS code (two-digit integer)
3
+ - VARIABLE_NAME: STATE
4
+ DESCRIPTION: State name
5
+ - VARIABLE_NAME: ST_ABBR
6
+ DESCRIPTION: State abbreviation, two-letter string
7
+ - VARIABLE_NAME: STCNTY
8
+ DESCRIPTION: INTEGER County-level FIPS code (5 digit integer)
9
+ - VARIABLE_NAME: COUNTY
10
+ DESCRIPTION: County name
11
+ - VARIABLE_NAME: FIPS
12
+ DESCRIPTION: INTEGER, Tract-level geographic identification (full Census Bureau FIPS code)
13
+ - VARIABLE_NAME: LOCATION
14
+ DESCRIPTION: Text description of tract county state
15
+ - VARIABLE_NAME: AREA_SQMI
16
+ DESCRIPTION: Tract area in square miles
17
+ - VARIABLE_NAME: RPL_THEMES
18
+ DESCRIPTION: Overall social vulnerability. Should always be used unless explicit sub-theme is called for.
19
+ - VARIABLE_NAME: RPL_THEME1
20
+ DESCRIPTION: Subtheme for socio-economic status social vulnerability score
21
+ - VARIABLE_NAME: RPL_THEME2
22
+ DESCRIPTION: Subtheme for Household characteristics vulnerability score
23
+ - VARIABLE_NAME: RPL_THEME3
24
+ DESCRIPTION: Subtheme for Racial and Ethnic Minority status based vulnerability score
25
+ - VARIABLE_NAME: RPL_THEME4
26
+ DESCRIPTION: Subtheme for Housing and transportation-based vulnerability score
system-prompt.md ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ You are a helpful agent who always replies strictly in JSON-formatted text.
3
+ Your task is to translate the user's questions about the data into a SQL query
4
+ that will be run against the "svi" table in a duckdb database.
5
+ The duckdb database has a spatial extension which understands PostGIS operations as well.
6
+ Include semantically meaningful columns like COUNTY and STATE name.
7
+
8
+ If your answer involves the construction of a SQL query, you must format your answer as follows:
9
+
10
+ {
11
+ "query": "your raw SQL response goes here.",
12
+ "explanation": "your explanation of the query"
13
+ }
14
+
15
+ Think carefully about your SQL query, keep it concise and ensure it is entirely valid SQL syntax.
16
+
17
+ If your answer does not involve a SQL query, please reply with the following format instead:
18
+
19
+ {
20
+ "user": "user question goes here",
21
+ "agent": "your response goes here"
22
+ }
23
+
24
+ If you are asked to describe the data or for information about the data schema, give only a human-readable response without SQL.
25
+
26
+ In the data, each row represents an individual census tract. If asked for
27
+ county or state level statistics, be sure to aggregate across all the tracts
28
+ in that county or state.
29
+
30
+ Refer to these descriptions of each of the columns (VARIABLE_NAME) from the metadata table:
31
+ <schema>
32
+
33
+ Whenever you SELECT the COUNTY, you must include the STCNTY column as well because county names are not unique across states!
34
+
35
+
test.R ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
## Illustrate/test core app functionality without shiny

library(tidyverse)
library(duckdbfs)
library(mapgl)
library(ellmer)
library(glue)

# Cloud-hosted SVI data: parquet for duckdb SQL, pmtiles for maplibre.
repo <- "https://data.source.coop/cboettig/social-vulnerability"
pmtiles <- glue("{repo}/svi2020_us_tract.pmtiles")
parquet <- glue("{repo}/svi2020_us_tract.parquet")
svi <- open_dataset(parquet, tblname = "svi") |> filter(RPL_THEMES > 0)

# `schema` is interpolated into the system prompt via its <schema> placeholder.
schema <- read_file("schema.yml")
system_prompt <- glue::glue(readr::read_file("system-prompt.md"),
                            .open = "<", .close = ">")



# Or optionally test with cirrus
chat <- ellmer::chat_vllm(
  base_url = "https://llm.cirrus.carlboettiger.info/v1/",
  model = "kosbu/Llama-3.3-70B-Instruct-AWQ",
  api_key = Sys.getenv("CIRRUS_LLM_KEY"),
  system_prompt = system_prompt,
  api_args = list(temperature = 0)
)

# or use the NRP model (overwrites the cirrus client above)
chat <- ellmer::chat_vllm(
  base_url = "https://llm.nrp-nautilus.io/",
  model = "llama3",
  api_key = Sys.getenv("NRP_API_KEY"),
  system_prompt = system_prompt,
  api_args = list(temperature = 0)
)


# Test a chat-based response
chat$chat("Which columns describes racial components of social vulnerability?")
## A query-based response
stream <- chat$chat("Which counties in California have the highest average social vulnerability?")
response <- jsonlite::fromJSON(stream)

con <- duckdbfs::cached_connection()
filtered_data <- DBI::dbGetQuery(con, response$query)

# Build a maplibre filter expression selecting the ids of `full_data` rows
# that match `filtered_data`; NULL (no filter) when the result is empty.
filter_column <- function(full_data, filtered_data, id_col) {
  if (nrow(filtered_data) < 1) return(NULL)
  values <- full_data |>
    inner_join(filtered_data, copy = TRUE) |>
    pull(id_col)
  # maplibre syntax for the filter of PMTiles
  list("in", list("get", id_col), list("literal", values))
}

maplibre(center = c(-102.9, 41.3), zoom = 3) |>
  add_fill_layer(
    id = "svi_layer",
    source = list(type = "vector", url = paste0("pmtiles://", pmtiles)),
    # NOTE(review): layer name "SVI2000_US_tract" does not match the
    # svi2020 file name above — confirm against the tileset metadata.
    source_layer = "SVI2000_US_tract",
    # BUGFIX: previously passed undefined `full_data`; the dataset is `svi`.
    filter = filter_column(svi, filtered_data, "FIPS"),
    fill_opacity = 0.5,
    fill_color = interpolate(column = "RPL_THEMES",
                             values = c(0, 1),
                             stops = c("#e19292c0", "darkblue"),
                             na_color = "lightgrey")
  )
utils.R ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+
2
+ library(tidyverse)
3
+ library(duckdbfs)
4
+