mylessss committed
Commit 5be1c1d · 1 Parent(s): 1505be8
Files changed (1)
  1. app.py +23 -17
app.py CHANGED
@@ -23,6 +23,7 @@ app = marimo.App(width="medium")
 @app.cell
 def __():
     import marimo as mo
+
     return (mo,)
 
 
@@ -115,24 +116,23 @@ def __(mo):
 
         Text embeddings typically have hundreds of dimensions (512 in our case), making them impossible to visualize directly. We'll use two techniques to make them interpretable:
 
-        1. **Dimensionality Reduction**: UMAP will convert our 512D vectors into 2D points while preserving relationships between texts
-        2. **Clustering**: HDBSCAN will group similar texts together
+        1. **Dimensionality Reduction**: Convert our 512D vectors into 2D points while preserving relationships between texts
+        2. **Clustering**: Group similar texts together into clusters
         """
     )
     return
 
 
 @app.cell(hide_code=True)
-def __(cluster_points, mo, umap_reduce):
+def __(cluster_points, mo, reduce_dimensions):
     def md_help(cls):
         import inspect
 
         return f"def {cls.__name__} {inspect.signature(cls)}:\n {cls.__doc__}"
 
-
     mo.accordion(
         {
-            "`umap_reduce`": md_help(umap_reduce),
+            "`reduce_dimensions`": md_help(reduce_dimensions),
             "`cluster_points`": md_help(cluster_points),
         }
     )
@@ -141,11 +141,12 @@ def __(cluster_points, mo, umap_reduce):
 
 @app.cell
 def __(np):
-    def umap_reduce(np_array, metric="cosine"):
+    def reduce_dimensions(np_array, metric="cosine"):
         """
-        Reduce the dimensionality of the embeddings to 2D using
-        UMAP algorithm. UMAP preserves both local and global structure
-        of the high-dimensional data.
+        Reduce the dimensions of embeddings to a 2D space.
+
+        Here we use the UMAP algorithm. UMAP preserves both local and
+        global structure of the high-dimensional data.
         """
         import umap
 
@@ -157,12 +158,13 @@ def __(np):
         )
         return reducer.fit_transform(np_array)
 
-
     def cluster_points(np_array, min_cluster_size=4, max_cluster_size=50):
         """
-        Cluster the embeddings using HDBSCAN algorithm.
-        We first reduce dimensionality to 50D with PCA to speed up clustering,
-        while still preserving most of the important information.
+        Cluster the embeddings.
+
+
+        Here we use the HDBSCAN algorithm. We first reduce dimensionality to 50D with
+        PCA to speed up clustering, while still preserving most of the important information.
         """
         import hdbscan
         from sklearn.decomposition import PCA
@@ -179,7 +181,8 @@ def __(np):
         return np.where(
             hdb.labels_ == -1, "outlier", "cluster_" + hdb.labels_.astype(str)
         )
-    return cluster_points, umap_reduce
+
+    return cluster_points, reduce_dimensions
 
 
 @app.cell
@@ -207,7 +210,7 @@ def __(mo):
         r"""
         ## Processing the Data
 
-        Now we'll transform our high-dimensional embeddings into something we can visualize, using `umap_reduce` and `cluster_points`. More details on this step [in the blog](https://motherduck.com/blog/MotherDuck-Visualize-Embeddings-Marimo/).
+        Now we'll transform our high-dimensional embeddings into something we can visualize, using `reduce_dimensions` and `cluster_points`. More details on this step [in the blog](https://motherduck.com/blog/MotherDuck-Visualize-Embeddings-Marimo/).
         """
     )
     return
@@ -220,7 +223,7 @@ def __(
     embeddings,
     metric_dropdown,
     mo,
-    umap_reduce,
+    reduce_dimensions,
 ):
     with mo.status.spinner("Clustering points...") as _s:
         import numba
@@ -232,7 +235,9 @@ def __(
             max_cluster_size=cluster_size_slider.value[1],
         )
         _s.update("Reducing dimensionality...")
-        embeddings_2d = umap_reduce(embeddings_array, metric=metric_dropdown.value)
+        embeddings_2d = reduce_dimensions(
+            embeddings_array, metric=metric_dropdown.value
+        )
     mo.show_code()
     return embeddings_2d, embeddings_array, hdb_labels, numba
 
@@ -326,6 +331,7 @@ def __():
 
     # ML tools for dimensionality reduction and clustering
     import numpy as np
+
     return alt, duckdb, np, pl, pyarrow
 
 
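For context on the renamed helper: below is a minimal, runnable sketch of `reduce_dimensions` as the diff describes it. Only the signature, the `import umap`, and the `reducer.fit_transform(np_array)` call appear in the hunks; the `umap.UMAP(...)` constructor arguments shown here are assumptions, since the diff elides that part of the function.

```python
import numpy as np


def reduce_dimensions(np_array, metric="cosine"):
    """
    Reduce the dimensions of embeddings to a 2D space.

    Here we use the UMAP algorithm. UMAP preserves both local and
    global structure of the high-dimensional data.
    """
    import umap

    # Assumed arguments: the hunk skips over the reducer's construction.
    reducer = umap.UMAP(
        n_components=2,  # project down to 2D for plotting
        metric=metric,  # "cosine" is the function's default, per the diff
    )
    return reducer.fit_transform(np_array)


# Stand-in data shaped like the app's embeddings: 512D vectors.
embeddings_array = np.random.rand(300, 512).astype(np.float32)
embeddings_2d = reduce_dimensions(embeddings_array)
print(embeddings_2d.shape)  # (300, 2)
```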
 
23
  @app.cell
24
  def __():
25
  import marimo as mo
26
+
27
  return (mo,)
28
 
29
 
 
116
 
117
  Text embeddings typically have hundreds of dimensions (512 in our case), making them impossible to visualize directly. We'll use two techniques to make them interpretable:
118
 
119
+ 1. **Dimensionality Reduction**: Convert our 512D vectors into 2D points while preserving relationships between texts
120
+ 2. **Clustering**: Group similar texts together into clusters
121
  """
122
  )
123
  return
124
 
125
 
126
  @app.cell(hide_code=True)
127
+ def __(cluster_points, mo, reduce_dimensions):
128
  def md_help(cls):
129
  import inspect
130
 
131
  return f"def {cls.__name__} {inspect.signature(cls)}:\n {cls.__doc__}"
132
 
 
133
  mo.accordion(
134
  {
135
+ "`reduce_dimensions`": md_help(reduce_dimensions),
136
  "`cluster_points`": md_help(cluster_points),
137
  }
138
  )
 
141
 
142
  @app.cell
143
  def __(np):
144
+ def reduce_dimensions(np_array, metric="cosine"):
145
  """
146
+ Reduce the dimensions of embeddings to a 2D space.
147
+
148
+ Here we use the UMAP algorithm. UMAP preserves both local and
149
+ global structure of the high-dimensional data.
150
  """
151
  import umap
152
 
 
158
  )
159
  return reducer.fit_transform(np_array)
160
 
 
161
  def cluster_points(np_array, min_cluster_size=4, max_cluster_size=50):
162
  """
163
+ Cluster the embeddings.
164
+
165
+
166
+ Here we use the HDBSCAN algorithm. We first reduce dimensionality to 50D with
167
+ PCA to speed up clustering, while still preserving most of the important information.
168
  """
169
  import hdbscan
170
  from sklearn.decomposition import PCA
 
181
  return np.where(
182
  hdb.labels_ == -1, "outlier", "cluster_" + hdb.labels_.astype(str)
183
  )
184
+
185
+ return cluster_points, reduce_dimensions
186
 
187
 
188
  @app.cell
 
210
  r"""
211
  ## Processing the Data
212
 
213
+ Now we'll transform our high-dimensional embeddings into something we can visualize, using `reduce_dimensions` and `cluster_points`. More details on this step [in the blog](https://motherduck.com/blog/MotherDuck-Visualize-Embeddings-Marimo/).
214
  """
215
  )
216
  return
 
223
  embeddings,
224
  metric_dropdown,
225
  mo,
226
+ reduce_dimensions,
227
  ):
228
  with mo.status.spinner("Clustering points...") as _s:
229
  import numba
 
235
  max_cluster_size=cluster_size_slider.value[1],
236
  )
237
  _s.update("Reducing dimensionality...")
238
+ embeddings_2d = reduce_dimensions(
239
+ embeddings_array, metric=metric_dropdown.value
240
+ )
241
  mo.show_code()
242
  return embeddings_2d, embeddings_array, hdb_labels, numba
243
 
 
331
 
332
  # ML tools for dimensionality reduction and clustering
333
  import numpy as np
334
+
335
  return alt, duckdb, np, pl, pyarrow
336
 
337
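Similarly, a hedged sketch of `cluster_points` as its new docstring describes it (PCA down to 50D, then HDBSCAN). The imports, signature, and the final `np.where(...)` are taken verbatim from the hunks; the PCA/HDBSCAN wiring in between is an assumption.

```python
import numpy as np


def cluster_points(np_array, min_cluster_size=4, max_cluster_size=50):
    """
    Cluster the embeddings.

    Here we use the HDBSCAN algorithm. We first reduce dimensionality to 50D
    with PCA to speed up clustering, while still preserving most of the
    important information.
    """
    import hdbscan
    from sklearn.decomposition import PCA

    # Assumed wiring: a plain 50-component PCA feeding HDBSCAN.
    projected = PCA(n_components=50).fit_transform(np_array)
    hdb = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        max_cluster_size=max_cluster_size,
    ).fit(projected)
    # HDBSCAN labels noise points -1; map them to "outlier" (from the diff).
    return np.where(
        hdb.labels_ == -1, "outlier", "cluster_" + hdb.labels_.astype(str)
    )


labels = cluster_points(np.random.rand(300, 512).astype(np.float32))
print(np.unique(labels))  # e.g. ['cluster_0' 'cluster_1' ... 'outlier']
```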