Spaces:
Running
Running
Commit
·
639cb71
unverified
·
0
Parent(s):
Initial commit
Browse files- .github/workflows/deploy.yml +20 -0
- .gitignore +49 -0
- Dockerfile +21 -0
- LICENSE +24 -0
- README.md +43 -0
- app.R +272 -0
- footer.md +16 -0
- geo-llm-r.Rproj +17 -0
- preprocess.md +27 -0
- schema.yml +26 -0
- system-prompt.md +35 -0
- test.R +69 -0
- utils.R +4 -0
.github/workflows/deploy.yml
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Sync to Hugging Face hub
|
2 |
+
on:
|
3 |
+
push:
|
4 |
+
branches: [main]
|
5 |
+
|
6 |
+
# to run this workflow manually from the Actions tab
|
7 |
+
workflow_dispatch:
|
8 |
+
|
9 |
+
jobs:
|
10 |
+
sync-to-hub:
|
11 |
+
runs-on: ubuntu-latest
|
12 |
+
steps:
|
13 |
+
- uses: actions/checkout@v3
|
14 |
+
with:
|
15 |
+
fetch-depth: 0
|
16 |
+
lfs: true
|
17 |
+
- name: Push to hub
|
18 |
+
env:
|
19 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
20 |
+
run: git push -f https://cboettig:$HF_TOKEN@huggingface.co/spaces/boettiger-lab/geo-llm-r main
|
.gitignore
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# History files
|
2 |
+
.Rhistory
|
3 |
+
.Rapp.history
|
4 |
+
|
5 |
+
# Session Data files
|
6 |
+
.RData
|
7 |
+
.RDataTmp
|
8 |
+
|
9 |
+
# User-specific files
|
10 |
+
.Ruserdata
|
11 |
+
|
12 |
+
# Example code in package build process
|
13 |
+
*-Ex.R
|
14 |
+
|
15 |
+
# Output files from R CMD build
|
16 |
+
/*.tar.gz
|
17 |
+
|
18 |
+
# Output files from R CMD check
|
19 |
+
/*.Rcheck/
|
20 |
+
|
21 |
+
# RStudio files
|
22 |
+
.Rproj.user/
|
23 |
+
|
24 |
+
# produced vignettes
|
25 |
+
vignettes/*.html
|
26 |
+
vignettes/*.pdf
|
27 |
+
|
28 |
+
# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
|
29 |
+
.httr-oauth
|
30 |
+
|
31 |
+
# knitr and R markdown default cache directories
|
32 |
+
*_cache/
|
33 |
+
/cache/
|
34 |
+
|
35 |
+
# Temporary files created by R markdown
|
36 |
+
*.utf8.md
|
37 |
+
*.knit.md
|
38 |
+
|
39 |
+
# R Environment Variables
|
40 |
+
.Renviron
|
41 |
+
|
42 |
+
# pkgdown site
|
43 |
+
docs/
|
44 |
+
|
45 |
+
# translation temp files
|
46 |
+
po/*~
|
47 |
+
|
48 |
+
# RStudio Connect folder
|
49 |
+
rsconnect/
|
Dockerfile
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM rocker/geospatial:latest
|
2 |
+
|
3 |
+
WORKDIR /code
|
4 |
+
|
5 |
+
RUN install2.r --error \
|
6 |
+
bsicons \
|
7 |
+
bslib \
|
8 |
+
duckdbfs \
|
9 |
+
fontawesome \
|
10 |
+
gt \
|
11 |
+
markdown \
|
12 |
+
shiny \
|
13 |
+
shinychat \
|
14 |
+
tidyverse \
|
15 |
+
colourpicker
|
16 |
+
|
17 |
+
RUN installGithub.r cboettig/mapgl tidyverse/ellmer
|
18 |
+
|
19 |
+
COPY . .
|
20 |
+
|
21 |
+
CMD ["R", "--quiet", "-e", "shiny::runApp(host='0.0.0.0', port=7860)"]
|
LICENSE
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
BSD 2-Clause License
|
2 |
+
|
3 |
+
Copyright (c) 2024, Boettiger Lab, UC Berkeley
|
4 |
+
|
5 |
+
Redistribution and use in source and binary forms, with or without
|
6 |
+
modification, are permitted provided that the following conditions are met:
|
7 |
+
|
8 |
+
1. Redistributions of source code must retain the above copyright notice, this
|
9 |
+
list of conditions and the following disclaimer.
|
10 |
+
|
11 |
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
12 |
+
this list of conditions and the following disclaimer in the documentation
|
13 |
+
and/or other materials provided with the distribution.
|
14 |
+
|
15 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16 |
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
18 |
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
19 |
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
20 |
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
21 |
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
22 |
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
23 |
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
24 |
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
README.md
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Geo Llm R
|
3 |
+
emoji: 📚
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: yellow
|
6 |
+
sdk: docker
|
7 |
+
pinned: false
|
8 |
+
license: bsd-2-clause
|
9 |
+
---
|
10 |
+
|
11 |
+
# Demo Shiny App with Maplibre + open LLM interface
|
12 |
+
|
13 |
+
:hugs: Shiny App on Huggingface: <https://huggingface.co/spaces/boettiger-lab/geo-llm-r>
|
14 |
+
|
15 |
+
Work in progress. This is a proof-of-principle for an LLM-driven interface to dynamic mapping. Key technologies include duckdb, geoparquet, pmtiles, maplibre, open LLMs (via VLLM + LiteLLM). R interface through ellmer (LLMs), mapgl (maplibre), shiny, and duckdb.
|
16 |
+
|
17 |
+
# Setup
|
18 |
+
|
19 |
+
## GitHub with HuggingFace Deploy
|
20 |
+
|
21 |
+
All edits should be pushed to GitHub. Edits to `main` branch are automatically deployed to HuggingFace via GitHub Actions.
|
22 |
+
When using this scaffold, you will first have to set up your auto-deploy system:
|
23 |
+
|
24 |
+
- [Create a new HuggingFace Space](https://huggingface.co/new-space) (any template is fine, will be overwritten).
|
25 |
+
- [Create a HuggingFace Token](https://huggingface.co/settings/tokens/new?tokenType=write) with write permissions if you do not have one.
|
26 |
+
- In the GitHub Settings of your repository, add the token as a "New Repository Secret" under the `Secrets and Variables` -> `Actions` section of settings (`https://github.com/{USER}/{REPO}/settings/secrets/actions`).
|
27 |
+
- Edit the `.github/workflows/deploy.yml` file to specify your HuggingFace user name and HF repo to publish to.
|
28 |
+
|
29 |
+
## Language Model setup
|
30 |
+
|
31 |
+
This example is designed to be able to leverage open source or open weights models. You will need to adjust the API URL and API key accordingly. This could be a local model with `vllm` or `ollama`, and of course commercial models should work too. The demo app currently runs on a VLLM+LiteLLM-backed model (at present a Llama3 variant) hosted on the National Research Platform.
|
32 |
+
|
33 |
+
The LLM plays only a simple role in generating SQL queries from background information on the data including the table schema, see the system prompt for details. Most open models I have experimented with do not support the [tool use](https://ellmer.tidyverse.org/articles/tool-calling.html) or [structured data](https://ellmer.tidyverse.org/articles/structured-data.html) interfaces very well compared to commercial models. An important trick in working with open models used here is merely requesting the reply be structured as JSON. Open models are quite decent at this, and at SQL construction, given necessary context about the data. The map and chart elements merely react to the resulting data frames, and the entire analysis is thus transparent and reproducible as it would be if the user had composed their request in SQL instead of plain English.
|
34 |
+
|
35 |
+
## Software Dependencies
|
36 |
+
|
37 |
+
The Dockerfile includes all dependencies required for the HuggingFace deployment, and can be used as a template or directly to serve RStudio server.
|
38 |
+
|
39 |
+
## Data pre-processing
|
40 |
+
|
41 |
+
Pre-processing the data into cloud-native formats and hosting data on a high bandwidth, highly available server is essential for efficient and scalable rendering. Pre-computing expensive operations such as zonal statistics across all features is also necessary. These steps are described in [preprocess.md](preprocess.md) and corresponding scripts.
|
42 |
+
|
43 |
+
|
app.R
ADDED
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
library(shiny)
|
2 |
+
library(bslib)
|
3 |
+
library(htmltools)
|
4 |
+
library(fontawesome)
|
5 |
+
library(bsicons)
|
6 |
+
library(gt)
|
7 |
+
library(colourpicker)
|
8 |
+
library(glue)
|
9 |
+
|
10 |
+
library(ggplot2)
|
11 |
+
library(readr)
|
12 |
+
library(dplyr)
|
13 |
+
library(mapgl)
|
14 |
+
library(duckdbfs)
|
15 |
+
duckdbfs::load_spatial()
|
16 |
+
|
17 |
+
# <link> tag that loads the Material Dashboard stylesheet (v3.2.0) from the
# creative-tim demo site; it is injected into the page <head> by the UI.
css <- HTML(sprintf(
  "<link rel='stylesheet' type='text/css' href='%s'>",
  paste0(
    "https://demos.creative-tim.com/",
    "material-dashboard/assets/css/",
    "material-dashboard.min.css?v=3.2.0"
  )
))
|
22 |
+
|
23 |
+
|
24 |
+
# Define the UI
|
25 |
+
# Page layout, top to bottom: chat prompt, agent reply, map + charts,
# result table, LLM details (SQL / explanation), credits. The sidebar holds
# the model selector and the map-layer switches.
ui <- page_sidebar(
  fillable = FALSE, # do not squeeze to vertical screen space
  tags$head(css),
  titlePanel("Demo App"),

  "
This is a proof-of-principle for a simple chat-driven interface
to dynamically explore geospatial data.
",

  # Chat prompt: free-text question plus a send button.
  card(
    layout_columns(
      textInput(
        "chat",
        label = NULL,
        value = "Which four counties in California have the highest average social vulnerability?",
        width = "100%"
      ),
      div(
        actionButton(
          "user_msg", "",
          icon = icon("paper-plane"),
          class = "btn-primary btn-sm align-bottom"
        ),
        class = "align-text-bottom"
      ),
      col_widths = c(11, 1)
    ),
    fill = FALSE
  ),

  # Plain-text reply shown when the model answers without a SQL query.
  textOutput("agent"),

  layout_columns(
    card(maplibreOutput("map")),
    card(
      # Inline Markdown text goes through shiny::markdown(), as the footer
      # card below does; includeMarkdown() is documented to take a file path.
      shiny::markdown("## Plot"),
      plotOutput("chart1"),
      plotOutput("chart2")
    ),
    col_widths = c(8, 4),
    row_heights = c("500px"),
    max_height = "600px"
  ),

  gt_output("table"),

  # Which model answered, plus the generated SQL and its explanation.
  card(
    fill = TRUE,
    card_header(fa("robot"), textOutput("model", inline = TRUE)),
    accordion(
      open = FALSE,
      accordion_panel(
        title = "show sql",
        icon = fa("terminal"),
        verbatimTextOutput("sql_code")
      ),
      accordion_panel(
        title = "explain",
        icon = fa("user", prefer_type = "solid"),
        textOutput("explanation")
      )
    )
  ),

  card(
    card_header("Errata"),
    shiny::markdown(readr::read_file("footer.md"))
  ),

  sidebar = sidebar(
    selectInput(
      "select",
      "Select an LLM:",
      list(
        "LLama3" = "llama3",
        #"OLMO2 (AllenAI)" = "olmo",
        "Gorilla (UC Berkeley)" = "gorilla"
      )
    ),

    # One switch per optional map layer; only the SVI layer starts on.
    input_switch("redlines", "Redlined Areas", value = FALSE),
    input_switch("svi", "Social Vulnerability", value = TRUE),
    input_switch("richness", "Biodiversity Richness", value = FALSE),
    input_switch("rsr", "Biodiversity Range Size Rarity", value = FALSE),

    card(
      card_header(bs_icon("github"), "Source code:"),
      a(
        href = "https://github.com/boettiger-lab/geo-llm-r",
        "https://github.com/boettiger-lab/geo-llm-r"
      )
    )
  ),

  theme = bs_theme(version = "5")
)
|
109 |
+
|
110 |
+
|
111 |
+
# Locations of the 2022 tract-level SVI data: the same file stem is published
# as PMTiles (used by the map layer) and as Parquet (queried through duckdb).
repo <- "https://data.source.coop/cboettig/social-vulnerability"
svi_stem <- glue("{repo}/2022/SVI2022_US_tract")
pmtiles <- glue("{svi_stem}.pmtiles")
parquet <- glue("{svi_stem}.parquet")

# Connection later handed to DBI::dbGetQuery() for the generated SQL.
con <- duckdbfs::cached_connection()

# Lazy table registered under the name "svi" (the table name the system
# prompt tells the model to query), keeping rows with RPL_THEMES > 0.
svi <- filter(open_dataset(parquet, tblname = "svi"), RPL_THEMES > 0)
|
116 |
+
|
117 |
+
# Flatten text onto a single line: each carriage return / newline becomes a
# space, then every run of whitespace is squeezed down to one space.
# Vectorised over `txt`; returns a character vector of the same length.
safe_parse <- function(txt) {
  single_line <- gsub("[\r\n]", " ", txt)
  gsub("\\s+", " ", single_line)
}
|
120 |
+
|
121 |
+
|
122 |
+
# helper utilities
|
123 |
+
# faster/more scalable to pass maplibre the ids to refilter pmtiles,
|
124 |
+
# than to pass it the full geospatial/sf object
|
125 |
+
#' Build a MapLibre filter expression selecting the features in a query result
#'
#' @param full_data Table holding every feature, including the `id_col` column.
#' @param filtered_data Data frame of query results; it is matched to
#'   `full_data` on whatever columns the two share (natural join).
#' @param id_col Name of the feature-id column, as a string.
#' @return `NULL` when `filtered_data` is `NULL` or has no rows (no filter is
#'   applied), otherwise a nested list encoding
#'   `["in", ["get", id_col], ["literal", [ids...]]]`.
filter_column <- function(full_data, filtered_data, id_col = "FIPS") {
  # Nothing selected yet (e.g. the initial empty tibble): no filter at all.
  if (is.null(filtered_data) || nrow(filtered_data) < 1) return(NULL)
  values <- full_data |>
    inner_join(filtered_data, copy = TRUE) |>
    pull(id_col) |>
    unique()
  # maplibre syntax for the filter of PMTiles.
  # as.list() keeps the ids an array even when only one feature matches;
  # a bare length-1 vector is presumably unboxed to a scalar when the widget
  # payload is serialised to JSON (TODO confirm against htmlwidgets defaults).
  list("in", list("get", id_col), list("literal", as.list(values)))
}
|
133 |
+
|
134 |
+
|
135 |
+
|
136 |
+
# Define the server
|
137 |
+
# Shiny server logic.
#
# Outputs written: chart1, chart2, model, sql_code, explanation, table,
# agent and map. Relies on the file-level objects `svi`, `con`, `pmtiles`,
# `safe_parse()` and `filter_column()`.
server <- function(input, output, session) {

  # Static nation-wide reference chart. Counties are grouped on their FIPS
  # code (STCNTY) rather than on COUNTY, because county names repeat across
  # states and would otherwise be averaged together.
  chart1_data <- svi |>
    group_by(STCNTY) |>
    summarise(mean_svi = mean(RPL_THEMES, na.rm = TRUE)) |>
    collect()

  chart1 <- ggplot(chart1_data, aes(mean_svi)) +
    geom_density(fill = "darkred") +
    ggtitle("County-level vulnerability nation-wide")
  output$chart1 <- renderPlot(chart1)

  # Latest query result; the map re-renders whenever this changes.
  data <- reactiveValues(df = tibble())

  model <- reactive(input$select)
  output$model <- renderText(model())

  # The system prompt is a template in which <schema> is replaced by the
  # contents of schema.yml (glue looks `schema` up in this environment).
  schema <- read_file("schema.yml")
  system_prompt <- glue::glue(readr::read_file("system-prompt.md"),
                              .open = "<", .close = ">")

  # One chat client per selected model. Keeping this a reactive (instead of
  # creating the click handler inside an observer) means exactly one handler
  # exists no matter how often the model selection changes.
  chat <- reactive({
    ellmer::chat_vllm(
      base_url = "https://llm.nrp-nautilus.io/",
      model = model(),
      api_key = Sys.getenv("NRP_API_KEY"),
      system_prompt = system_prompt,
      api_args = list(temperature = 0)
    )
  })

  observeEvent(input$user_msg, {
    agent <- chat()

    # Ask the model and parse its JSON reply; a failed request or malformed
    # JSON becomes a plain agent message instead of an uncaught error.
    response <- tryCatch(
      jsonlite::fromJSON(safe_parse(agent$chat(input$chat))),
      error = function(e) {
        list(agent = paste("Unable to interpret the model reply:",
                           conditionMessage(e)))
      }
    )

    if (!"query" %in% names(response)) {
      output$agent <- renderText(response$agent)
    } else {
      output$sql_code <- renderText(
        stringr::str_wrap(response$query, width = 60)
      )
      output$explanation <- renderText(response$explanation)

      # Actually execute the SQL query generated:
      result <- tryCatch(
        DBI::dbGetQuery(con, response$query),
        error = function(e) e
      )

      if (inherits(result, "error")) {
        output$agent <- renderText(
          paste("The generated SQL failed to run:", conditionMessage(result))
        )
      } else {
        # don't display shape column in render
        df <- result |> select(-any_of("Shape"))
        output$table <- render_gt(df, height = 300)

        # Columns the query created (absent from the svi table) are taken
        # to be the computed statistic; plot the first one, if any.
        new_cols <- setdiff(colnames(df), colnames(svi))
        if (length(new_cols) > 0) {
          chart2 <- df |>
            rename(social_vulnerability = all_of(new_cols[[1]])) |>
            ggplot(aes(social_vulnerability)) +
            geom_density(fill = "darkred") +
            xlim(c(0, 1)) +
            ggtitle("Vulnerability of selected areas")
          output$chart2 <- renderPlot(chart2)
        } else {
          output$chart2 <- renderPlot(NULL)
        }

        # Publishing the result triggers the map to refilter.
        data$df <- df
      }

      # Note: ellmer will preserve full chat history automatically.
      # this can confuse the agent and mess up behavior, so we reset:
      agent$set_turns(NULL)
    }
  })

  output$map <- renderMaplibre({

    m <- maplibre(center = c(-104.9, 40.3), zoom = 3, height = "400")

    if (input$redlines) {
      m <- m |>
        add_fill_layer(
          id = "redlines",
          source = list(
            type = "vector",
            url = paste0("pmtiles://", "https://data.source.coop/cboettig/us-boundaries/mappinginequality.pmtiles")
          ),
          source_layer = "mappinginequality",
          fill_color = list("get", "fill")
        )
    }

    if (input$richness) {
      m <- m |>
        add_raster_source(
          id = "richness",
          tiles = "https://data.source.coop/cboettig/mobi/tiles/red/species-richness-all/{z}/{x}/{y}.png",
          maxzoom = 11
        ) |>
        add_raster_layer(id = "richness-layer", source = "richness")
    }

    if (input$rsr) {
      m <- m |>
        add_raster_source(
          id = "rsr",
          tiles = "https://data.source.coop/cboettig/mobi/tiles/green/range-size-rarity-all/{z}/{x}/{y}.png",
          maxzoom = 11
        ) |>
        # Own layer id: reusing "richness-layer" here would collide with the
        # richness layer when both switches are on.
        add_raster_layer(id = "rsr-layer", source = "rsr")
    }

    if (input$svi) {
      m <- m |>
        add_fill_layer(
          id = "svi_layer",
          source = list(type = "vector",
                        url = paste0("pmtiles://", pmtiles)),
          source_layer = "svi",
          tooltip = "RPL_THEMES",
          # NULL (no filter) until a query has returned rows.
          filter = filter_column(svi, data$df, "FIPS"),
          fill_opacity = 0.5,
          fill_color = interpolate(column = "RPL_THEMES",
                                   values = c(0, 1),
                                   stops = c("lightpink", "darkred"),
                                   na_color = "lightgrey")
        )
    }

    m |>
      add_draw_control() |>
      add_geocoder_control()
  })
}
|
270 |
+
|
271 |
+
# Run the app
|
272 |
+
shinyApp(ui = ui, server = server)
|
footer.md
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#### Credits
|
2 |
+
|
3 |
+
Developed by Carl Boettiger, UC Berkeley, 2025. BSD License.
|
4 |
+
|
5 |
+
Data from the US Census and CDC's [Social Vulnerability Index](https://www.atsdr.cdc.gov/place-health/php/svi/index.html)
|
6 |
+
|
7 |
+
#### Technical details
|
8 |
+
|
9 |
+
The app is written entirely in R using shiny. The app will translate natural language queries into SQL code using
|
10 |
+
a small open-weights language model. The SQL code is executed using the duckdb backend against a cloud-native
|
11 |
+
geoparquet snapshot of the Social Vulnerability Index hosted on Source Cooperative. Summary chart data are also
|
12 |
+
computed in duckdb by streaming, providing responsive updates while needing minimal RAM or disk storage despite
|
13 |
+
the large size of the data sources.
|
14 |
+
|
15 |
+
The map is rendered and updated using MapLibre with PMTiles, which provides responsive rendering for large feature sets.
|
16 |
+
The PMTiles layer is also hosted on Source cooperative where it can be streamed efficiently.
|
geo-llm-r.Rproj
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Version: 1.0
|
2 |
+
ProjectId: 337131c5-7aa5-4963-bc4e-d8a156e206a0
|
3 |
+
|
4 |
+
RestoreWorkspace: Default
|
5 |
+
SaveWorkspace: Default
|
6 |
+
AlwaysSaveHistory: Default
|
7 |
+
|
8 |
+
EnableCodeIndexing: Yes
|
9 |
+
UseSpacesForTab: Yes
|
10 |
+
NumSpacesForTab: 2
|
11 |
+
Encoding: UTF-8
|
12 |
+
|
13 |
+
RnwWeave: Sweave
|
14 |
+
LaTeX: pdfLaTeX
|
15 |
+
|
16 |
+
AutoAppendNewline: Yes
|
17 |
+
StripTrailingWhitespace: Yes
|
preprocess.md
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
---
|
3 |
+
|
4 |
+
|
5 |
+
# Vector Layers
|
6 |
+
|
7 |
+
The heart of this application design is a vector dataset serialized as both (Geo)Parquet and PMTiles.
|
8 |
+
The parquet version allows for real-time calculations through rapid SQL queries via duckdb,
|
9 |
+
and the PMTiles version allows the data to be quickly visualized at any zoom through maplibre.
|
10 |
+
maplibre can also efficiently filter the PMTiles data given the feature ids returned by duckdb.
|
11 |
+
|
12 |
+
`gdal_translates` can generate both PMTiles and geoparquet, though `tippecanoe` provides more
|
13 |
+
options for PMTiles generation and can produce nicer tile sets.
|
14 |
+
|
15 |
+
The demo uses the CDC Social Vulnerability data because it is built on the hierarchical partitioning
|
16 |
+
used by the Census (Country->State->County->Tract) hierarchy.
|
17 |
+
|
18 |
+
# Raster Layers
|
19 |
+
|
20 |
+
## Generating static tiles
|
21 |
+
|
22 |
+
## Zonal statistics calculations
|
23 |
+
|
24 |
+
The application is essentially driven by the vector layer data using SQL.
|
25 |
+
I find it helpful to pre-process 'zonal' calculations, e.g. the mean value of each raster layer
|
26 |
+
within each feature in the 'focal' vector data set(s).
|
27 |
+
|
schema.yml
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
- VARIABLE_NAME: ST
|
2 |
+
DESCRIPTION: INTEGER State-level FIPS code (two-digit integer)
|
3 |
+
- VARIABLE_NAME: STATE
|
4 |
+
DESCRIPTION: State name
|
5 |
+
- VARIABLE_NAME: ST_ABBR
|
6 |
+
DESCRIPTION: State abbreviation, two-letter string
|
7 |
+
- VARIABLE_NAME: STCNTY
|
8 |
+
DESCRIPTION: INTEGER County-level FIPS code (5 digit integer)
|
9 |
+
- VARIABLE_NAME: COUNTY
|
10 |
+
DESCRIPTION: County name
|
11 |
+
- VARIABLE_NAME: FIPS
|
12 |
+
DESCRIPTION: INTEGER, Tract-level geographic identification (full Census Bureau FIPS code)
|
13 |
+
- VARIABLE_NAME: LOCATION
|
14 |
+
DESCRIPTION: Text description of tract county state
|
15 |
+
- VARIABLE_NAME: AREA_SQMI
|
16 |
+
DESCRIPTION: Tract area in square miles
|
17 |
+
- VARIABLE_NAME: RPL_THEMES
|
18 |
+
DESCRIPTION: Overall social vulnerability. Should always be used unless explicit sub-theme is called for.
|
19 |
+
- VARIABLE_NAME: RPL_THEME1
|
20 |
+
DESCRIPTION: Subtheme for socio-economic status social vulnerability score
|
21 |
+
- VARIABLE_NAME: RPL_THEME2
|
22 |
+
DESCRIPTION: Subtheme for Household characteristics vulnerability score
|
23 |
+
- VARIABLE_NAME: RPL_THEME3
|
24 |
+
DESCRIPTION: Subtheme for Racial and Ethnic Minority status based vulnerability score
|
25 |
+
- VARIABLE_NAME: RPL_THEME4
|
26 |
+
DESCRIPTION: Subtheme for Housing and transportation-based vulnerability score
|
system-prompt.md
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
You are a helpful agent who always replies strictly in JSON-formatted text.
|
3 |
+
Your task is to translate the user's questions about the data into a SQL query
|
4 |
+
that will be run against the "svi" table in a duckdb database.
|
5 |
+
The duckdb database has a spatial extension which understands PostGIS operations as well.
|
6 |
+
Include semantically meaningful columns like COUNTY and STATE name.
|
7 |
+
|
8 |
+
If your answer involves the construction of a SQL query, you must format your answer as follows:
|
9 |
+
|
10 |
+
{
|
11 |
+
"query": "your raw SQL response goes here.",
|
12 |
+
"explanation": "your explanation of the query"
|
13 |
+
}
|
14 |
+
|
15 |
+
Think carefully about your SQL query, keep it concise and ensure it is entirely valid SQL syntax.
|
16 |
+
|
17 |
+
If your answer does not involve a SQL query, please reply with the following format instead:
|
18 |
+
|
19 |
+
{
|
20 |
+
"user": "user question goes here",
|
21 |
+
"agent": "your response goes here"
|
22 |
+
}
|
23 |
+
|
24 |
+
If you are asked to describe the data or for information about the data schema, give only a human-readable response with SQL.
|
25 |
+
|
26 |
+
In the data, each row represents an individual census tract. If asked for
|
27 |
+
county or state level statistics, be sure to aggregate across all the tracts
|
28 |
+
in that county or state.
|
29 |
+
|
30 |
+
Refer to these descriptions of each of the columns (VARIABLE_NAME) from the metadata table:
|
31 |
+
<schema>
|
32 |
+
|
33 |
+
Whenever you SELECT the COUNTY, you must include the STCNTY column as well because county names are not unique across states!
|
34 |
+
|
35 |
+
|
test.R
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Illustrate/test core app functionality without shiny
|
2 |
+
|
3 |
+
library(tidyverse)
|
4 |
+
library(duckdbfs)
|
5 |
+
library(mapgl)
|
6 |
+
library(ellmer)
|
7 |
+
library(glue)
|
8 |
+
|
9 |
+
# Data sources: PMTiles for the map layer, Parquet for the SQL queries.
repo <- "https://data.source.coop/cboettig/social-vulnerability"
pmtiles <- glue("{repo}/svi2020_us_tract.pmtiles")
parquet <- glue("{repo}/svi2020_us_tract.parquet")
svi <- open_dataset(parquet, tblname = "svi") |> filter(RPL_THEMES > 0)

# The system prompt is a template in which <schema> is replaced by the
# contents of schema.yml (glue looks `schema` up in this environment).
schema <- read_file("schema.yml")
system_prompt <- glue::glue(readr::read_file("system-prompt.md"),
                            .open = "<", .close = ">")

# Two alternative back-ends; the second assignment overwrites the first,
# so run only the one you want when stepping through interactively.

# Or optionally test with cirrus
chat <- ellmer::chat_vllm(
  base_url = "https://llm.cirrus.carlboettiger.info/v1/",
  model = "kosbu/Llama-3.3-70B-Instruct-AWQ",
  api_key = Sys.getenv("CIRRUS_LLM_KEY"),
  system_prompt = system_prompt,
  api_args = list(temperature = 0)
)

# or use the NRP model
chat <- ellmer::chat_vllm(
  base_url = "https://llm.nrp-nautilus.io/",
  model = "llama3",
  api_key = Sys.getenv("NRP_API_KEY"),
  system_prompt = system_prompt,
  api_args = list(temperature = 0)
)

# Test a chat-based response
chat$chat("Which columns describes racial components of social vulnerability?")
## A query-based response
stream <- chat$chat("Which counties in California have the highest average social vulnerability?")
response <- jsonlite::fromJSON(stream)

# Run the generated SQL against the registered "svi" table.
con <- duckdbfs::cached_connection()
filtered_data <- DBI::dbGetQuery(con, response$query)

# Build the MapLibre filter expression for the features in a query result.
# Returns NULL (no filter) when the result is NULL or has no rows.
filter_column <- function(full_data, filtered_data, id_col) {
  if (is.null(filtered_data) || nrow(filtered_data) < 1) return(NULL)
  values <- full_data |>
    inner_join(filtered_data, copy = TRUE) |>
    pull(id_col) |>
    unique()
  # maplibre syntax for the filter of PMTiles.
  # as.list() keeps the ids an array even when only one feature matches;
  # a bare length-1 vector is presumably unboxed to a scalar when the widget
  # payload is serialised to JSON (TODO confirm against htmlwidgets defaults).
  list("in", list("get", id_col), list("literal", as.list(values)))
}

# NOTE(review): source_layer must match the layer name inside the PMTiles
# file; "SVI2000_US_tract" looks inconsistent with the svi2020 file name --
# confirm against the tile metadata.
maplibre(center = c(-102.9, 41.3), zoom = 3) |>
  add_fill_layer(
    id = "svi_layer",
    source = list(type = "vector", url = paste0("pmtiles://", pmtiles)),
    source_layer = "SVI2000_US_tract",
    # The full table is `svi`; `full_data` is only the helper's parameter
    # name and is not defined at top level.
    filter = filter_column(svi, filtered_data, "FIPS"),
    fill_opacity = 0.5,
    fill_color = interpolate(column = "RPL_THEMES",
                             values = c(0, 1),
                             stops = c("#e19292c0", "darkblue"),
                             na_color = "lightgrey")
  )
|
69 |
+
|
utils.R
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
library(tidyverse)
|
3 |
+
library(duckdbfs)
|
4 |
+
|