update
app.py
CHANGED
```diff
@@ -23,6 +23,7 @@ app = marimo.App(width="medium")
 @app.cell
 def __():
     import marimo as mo
+
     return (mo,)


@@ -115,24 +116,23 @@ def __(mo):

         Text embeddings typically have hundreds of dimensions (512 in our case), making them impossible to visualize directly. We'll use two techniques to make them interpretable:

-        1. **Dimensionality Reduction**:
-        2. **Clustering**:
+        1. **Dimensionality Reduction**: Convert our 512D vectors into 2D points while preserving relationships between texts
+        2. **Clustering**: Group similar texts together into clusters
         """
     )
     return


 @app.cell(hide_code=True)
-def __(cluster_points, mo, umap_reduce):
+def __(cluster_points, mo, reduce_dimensions):
     def md_help(cls):
         import inspect

         return f"def {cls.__name__} {inspect.signature(cls)}:\n {cls.__doc__}"

-
     mo.accordion(
         {
-            "`umap_reduce`": md_help(umap_reduce),
+            "`reduce_dimensions`": md_help(reduce_dimensions),
             "`cluster_points`": md_help(cluster_points),
         }
     )
```
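The `md_help` helper in this cell is what feeds the accordion: it renders a function's signature and docstring as plain text. A standalone illustration of what it produces, using a stub function in place of the real helper defined further down in the file:

```python
import inspect


def md_help(cls):
    # Same helper as in the cell above: format a function's signature + docstring.
    return f"def {cls.__name__} {inspect.signature(cls)}:\n {cls.__doc__}"


def reduce_dimensions(np_array, metric="cosine"):
    """Reduce the dimensions of embeddings to a 2D space."""


print(md_help(reduce_dimensions))
# def reduce_dimensions (np_array, metric='cosine'):
#  Reduce the dimensions of embeddings to a 2D space.
```

In the app, one such string per helper is placed inside `mo.accordion`, so readers can expand the documentation without scrolling to the code.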
```diff
@@ -141,11 +141,12 @@ def __(cluster_points, mo, umap_reduce):

 @app.cell
 def __(np):
-    def umap_reduce(np_array, metric="cosine"):
+    def reduce_dimensions(np_array, metric="cosine"):
         """
-        Reduce the
-
-
+        Reduce the dimensions of embeddings to a 2D space.
+
+        Here we use the UMAP algorithm. UMAP preserves both local and
+        global structure of the high-dimensional data.
         """
         import umap

@@ -157,12 +158,13 @@ def __(np):
         )
         return reducer.fit_transform(np_array)

-
     def cluster_points(np_array, min_cluster_size=4, max_cluster_size=50):
         """
-        Cluster the embeddings
-
-
+        Cluster the embeddings.
+
+
+        Here we use the HDBSCAN algorithm. We first reduce dimensionality to 50D with
+        PCA to speed up clustering, while still preserving most of the important information.
         """
         import hdbscan
         from sklearn.decomposition import PCA
@@ -179,7 +181,8 @@ def __(np):
         return np.where(
             hdb.labels_ == -1, "outlier", "cluster_" + hdb.labels_.astype(str)
         )
-    return cluster_points, umap_reduce
+
+    return cluster_points, reduce_dimensions


 @app.cell
```
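The hunks above only touch the renamed signature, the docstrings, and the cell's return, so the middle of each helper never appears in the diff. Pieced together from the visible lines, the whole cell plausibly reads like the sketch below (as it would sit in `app.py`, with `app` and `np` coming from the surrounding notebook); the UMAP constructor arguments, the `PCA(n_components=50)` step, and forwarding the min/max cluster sizes to `hdbscan.HDBSCAN` are assumptions inferred from the docstrings and signatures, not lines shown in the diff:

```python
@app.cell
def __(np):
    def reduce_dimensions(np_array, metric="cosine"):
        """
        Reduce the dimensions of embeddings to a 2D space.

        Here we use the UMAP algorithm. UMAP preserves both local and
        global structure of the high-dimensional data.
        """
        import umap

        # Not shown in the diff: UMAP arguments are assumed, only `metric` is certain.
        reducer = umap.UMAP(n_components=2, metric=metric)
        return reducer.fit_transform(np_array)

    def cluster_points(np_array, min_cluster_size=4, max_cluster_size=50):
        """
        Cluster the embeddings.

        Here we use the HDBSCAN algorithm. We first reduce dimensionality to 50D with
        PCA to speed up clustering, while still preserving most of the important information.
        """
        import hdbscan
        from sklearn.decomposition import PCA

        # Not shown in the diff: PCA to 50 components (per the docstring), then HDBSCAN.
        reduced = PCA(n_components=50).fit_transform(np_array)
        hdb = hdbscan.HDBSCAN(
            min_cluster_size=min_cluster_size, max_cluster_size=max_cluster_size
        ).fit(reduced)
        # This return is visible in the diff context: noise points (-1) become "outlier".
        return np.where(
            hdb.labels_ == -1, "outlier", "cluster_" + hdb.labels_.astype(str)
        )

    return cluster_points, reduce_dimensions
```

Clustering on a PCA-reduced 50D representation rather than the raw 512D vectors is the speed/quality trade-off the docstring describes.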
```diff
@@ -207,7 +210,7 @@ def __(mo):
         r"""
         ## Processing the Data

-        Now we'll transform our high-dimensional embeddings into something we can visualize, using `umap_reduce` and `cluster_points`.
+        Now we'll transform our high-dimensional embeddings into something we can visualize, using `reduce_dimensions` and `cluster_points`. More details on this step [in the blog](https://motherduck.com/blog/MotherDuck-Visualize-Embeddings-Marimo/).
         """
     )
     return
@@ -220,7 +223,7 @@ def __(
     embeddings,
     metric_dropdown,
     mo,
-    umap_reduce,
+    reduce_dimensions,
 ):
     with mo.status.spinner("Clustering points...") as _s:
         import numba
@@ -232,7 +235,9 @@ def __(
             max_cluster_size=cluster_size_slider.value[1],
         )
         _s.update("Reducing dimensionality...")
-        embeddings_2d = umap_reduce(embeddings_array, metric=metric_dropdown.value)
+        embeddings_2d = reduce_dimensions(
+            embeddings_array, metric=metric_dropdown.value
+        )
     mo.show_code()
     return embeddings_2d, embeddings_array, hdb_labels, numba

```
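Stripped of marimo's spinner and UI widgets, the processing cell above boils down to two calls. A minimal usage sketch with made-up data, assuming the `reduce_dimensions` and `cluster_points` helpers from the previous sketch are in scope, and using the defaults in place of the app's slider and dropdown values:

```python
import numpy as np

# Stand-in for the app's real 512-dimensional text embeddings.
embeddings_array = np.random.rand(300, 512).astype(np.float32)

# Same order as the cell above: cluster the full-dimensional vectors first,
# then project them down to 2D coordinates for the scatter plot.
hdb_labels = cluster_points(embeddings_array, min_cluster_size=4, max_cluster_size=50)
embeddings_2d = reduce_dimensions(embeddings_array, metric="cosine")

print(embeddings_2d.shape)    # (300, 2): x/y positions for each text
print(np.unique(hdb_labels))  # e.g. ["cluster_0", "cluster_1", ..., "outlier"]
```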
```diff
@@ -326,6 +331,7 @@ def __():

     # ML tools for dimensionality reduction and clustering
     import numpy as np
+
     return alt, duckdb, np, pl, pyarrow

```