przadka committed on
Commit
b235340
·
verified ·
1 Parent(s): 72c6197

Add BERTopic model

Browse files
Files changed (4) hide show
  1. README.md +73 -0
  2. config.json +16 -0
  3. topic_embeddings.safetensors +3 -0
  4. topics.json +414 -0
README.md ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ---
3
+ tags:
4
+ - bertopic
5
+ library_name: bertopic
6
+ pipeline_tag: text-classification
7
+ ---
8
+
9
+ # rag-topic-model
10
+
11
+ This is a [BERTopic](https://github.com/MaartenGr/BERTopic) model.
12
+ BERTopic is a flexible and modular topic modeling framework that allows for the generation of easily interpretable topics from large datasets.
13
+
14
+ ## Usage
15
+
16
+ To use this model, please install BERTopic:
17
+
18
+ ```
19
+ pip install -U bertopic
20
+ ```
21
+
22
+ You can use the model as follows:
23
+
24
+ ```python
25
+ from bertopic import BERTopic
26
+ topic_model = BERTopic.load("przadka/rag-topic-model")
27
+
28
+ topic_model.get_topic_info()
29
+ ```
30
+
31
+ ## Topic overview
32
+
33
+ * Number of topics: 4
34
+ * Number of training documents: 203
35
+
36
+ <details>
37
+ <summary>Click here for an overview of all topics.</summary>
38
+
39
+ | Topic ID | Topic Keywords | Topic Frequency | Label |
40
+ |----------|----------------|-----------------|-------|
41
+ | -1 | on - card - my - charge - account | 16 | -1_on_card_my_charge |
42
+ | 0 | refund - my - nike - for - store | 77 | 0_refund_my_nike_for |
43
+ | 1 | to - my - klarna - email - and | 56 | 1_to_my_klarna_email |
44
+ | 2 | my - the - payment - klarna - for | 54 | 2_my_the_payment_klarna |
45
+
46
+ </details>
47
+
48
+ ## Training hyperparameters
49
+
50
+ * calculate_probabilities: False
51
+ * language: None
52
+ * low_memory: False
53
+ * min_topic_size: 10
54
+ * n_gram_range: (1, 1)
55
+ * nr_topics: None
56
+ * seed_topic_list: None
57
+ * top_n_words: 10
58
+ * verbose: False
59
+ * zeroshot_min_similarity: 0.7
60
+ * zeroshot_topic_list: None
61
+
62
+ ## Framework versions
63
+
64
+ * Numpy: 2.1.3
65
+ * HDBSCAN: 0.8.40
66
+ * UMAP: 0.5.7
67
+ * Pandas: 2.2.3
68
+ * Scikit-Learn: 1.6.1
69
+ * Sentence-transformers: 3.1.1
70
+ * Transformers: 4.45.2
71
+ * Numba: 0.61.0
72
+ * Plotly: 6.0.0
73
+ * Python: 3.10.12
config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "calculate_probabilities": false,
3
+ "language": null,
4
+ "low_memory": false,
5
+ "min_topic_size": 10,
6
+ "n_gram_range": [
7
+ 1,
8
+ 1
9
+ ],
10
+ "nr_topics": null,
11
+ "seed_topic_list": null,
12
+ "top_n_words": 10,
13
+ "verbose": false,
14
+ "zeroshot_min_similarity": 0.7,
15
+ "zeroshot_topic_list": null
16
+ }
topic_embeddings.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf46216928afa925d87b82e1f156bffc22e3440931a9e2373ea5a1abfdfdad99
3
+ size 6232
topics.json ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "topic_representations": {
3
+ "-1": [
4
+ [
5
+ "on",
6
+ 0.09163203304952545
7
+ ],
8
+ [
9
+ "card",
10
+ 0.08064970389745767
11
+ ],
12
+ [
13
+ "my",
14
+ 0.0767006922807348
15
+ ],
16
+ [
17
+ "charge",
18
+ 0.06628677938756693
19
+ ],
20
+ [
21
+ "account",
22
+ 0.06147155751770172
23
+ ],
24
+ [
25
+ "onetime",
26
+ 0.06048261659107499
27
+ ],
28
+ [
29
+ "textbooks",
30
+ 0.06048261659107499
31
+ ],
32
+ [
33
+ "for",
34
+ 0.056879211846449326
35
+ ],
36
+ [
37
+ "it",
38
+ 0.055497950502502974
39
+ ],
40
+ [
41
+ "college",
42
+ 0.05263266076836004
43
+ ]
44
+ ],
45
+ "0": [
46
+ [
47
+ "refund",
48
+ 0.10357181740806075
49
+ ],
50
+ [
51
+ "my",
52
+ 0.07945375060858025
53
+ ],
54
+ [
55
+ "nike",
56
+ 0.07939247427741154
57
+ ],
58
+ [
59
+ "for",
60
+ 0.078402687188764
61
+ ],
62
+ [
63
+ "store",
64
+ 0.06732216256657804
65
+ ],
66
+ [
67
+ "to",
68
+ 0.06611804030049008
69
+ ],
70
+ [
71
+ "returned",
72
+ 0.06609171219628653
73
+ ],
74
+ [
75
+ "credit",
76
+ 0.05631544560512463
77
+ ],
78
+ [
79
+ "but",
80
+ 0.053297631611516065
81
+ ],
82
+ [
83
+ "week",
84
+ 0.05302617127651293
85
+ ]
86
+ ],
87
+ "1": [
88
+ [
89
+ "to",
90
+ 0.09520388888087453
91
+ ],
92
+ [
93
+ "my",
94
+ 0.0928884822862621
95
+ ],
96
+ [
97
+ "klarna",
98
+ 0.08419112810016777
99
+ ],
100
+ [
101
+ "email",
102
+ 0.07667389811880415
103
+ ],
104
+ [
105
+ "and",
106
+ 0.07019917226789504
107
+ ],
108
+ [
109
+ "im",
110
+ 0.05913002210894736
111
+ ],
112
+ [
113
+ "account",
114
+ 0.057074666373688056
115
+ ],
116
+ [
117
+ "it",
118
+ 0.051528335009921794
119
+ ],
120
+ [
121
+ "the",
122
+ 0.04709145261380219
123
+ ],
124
+ [
125
+ "in",
126
+ 0.042812985494347804
127
+ ]
128
+ ],
129
+ "2": [
130
+ [
131
+ "my",
132
+ 0.09343260244859648
133
+ ],
134
+ [
135
+ "the",
136
+ 0.07662357986486545
137
+ ],
138
+ [
139
+ "payment",
140
+ 0.07063227480862926
141
+ ],
142
+ [
143
+ "klarna",
144
+ 0.06974001266021526
145
+ ],
146
+ [
147
+ "for",
148
+ 0.06974001266021526
149
+ ],
150
+ [
151
+ "to",
152
+ 0.06301359684981774
153
+ ],
154
+ [
155
+ "pay",
156
+ 0.05560537889134318
157
+ ],
158
+ [
159
+ "can",
160
+ 0.0545200266290082
161
+ ],
162
+ [
163
+ "it",
164
+ 0.05249296656920062
165
+ ],
166
+ [
167
+ "app",
168
+ 0.04903594746152501
169
+ ]
170
+ ]
171
+ },
172
+ "topics": [
173
+ 1,
174
+ 1,
175
+ 1,
176
+ 2,
177
+ 1,
178
+ 2,
179
+ 0,
180
+ 0,
181
+ -1,
182
+ 2,
183
+ 2,
184
+ 1,
185
+ 1,
186
+ 0,
187
+ 2,
188
+ -1,
189
+ 2,
190
+ -1,
191
+ 1,
192
+ 0,
193
+ 1,
194
+ 2,
195
+ 0,
196
+ 2,
197
+ 0,
198
+ 2,
199
+ 1,
200
+ 2,
201
+ 0,
202
+ 0,
203
+ 0,
204
+ 1,
205
+ 2,
206
+ 2,
207
+ 2,
208
+ 0,
209
+ 0,
210
+ 0,
211
+ 2,
212
+ 1,
213
+ 1,
214
+ 1,
215
+ 0,
216
+ 0,
217
+ 0,
218
+ 2,
219
+ 1,
220
+ 2,
221
+ 1,
222
+ 0,
223
+ 1,
224
+ 2,
225
+ 1,
226
+ 1,
227
+ 1,
228
+ 0,
229
+ 0,
230
+ 0,
231
+ 0,
232
+ 0,
233
+ -1,
234
+ 0,
235
+ 0,
236
+ 0,
237
+ 0,
238
+ 0,
239
+ 2,
240
+ 0,
241
+ 0,
242
+ 0,
243
+ 0,
244
+ 0,
245
+ 0,
246
+ 0,
247
+ 0,
248
+ 0,
249
+ 0,
250
+ 0,
251
+ 0,
252
+ 0,
253
+ 0,
254
+ 0,
255
+ 0,
256
+ 0,
257
+ 0,
258
+ 0,
259
+ 0,
260
+ 0,
261
+ 0,
262
+ 0,
263
+ 0,
264
+ 0,
265
+ 0,
266
+ 0,
267
+ 0,
268
+ 0,
269
+ 0,
270
+ 0,
271
+ 0,
272
+ 1,
273
+ 2,
274
+ -1,
275
+ 1,
276
+ -1,
277
+ 2,
278
+ 0,
279
+ -1,
280
+ 0,
281
+ 2,
282
+ 1,
283
+ 1,
284
+ 2,
285
+ 2,
286
+ 2,
287
+ 1,
288
+ 2,
289
+ 2,
290
+ 1,
291
+ 0,
292
+ 2,
293
+ 2,
294
+ 2,
295
+ 2,
296
+ 2,
297
+ 1,
298
+ 0,
299
+ -1,
300
+ 0,
301
+ 1,
302
+ 0,
303
+ 1,
304
+ 1,
305
+ 0,
306
+ 2,
307
+ 1,
308
+ 0,
309
+ 0,
310
+ 1,
311
+ 0,
312
+ -1,
313
+ 0,
314
+ 2,
315
+ -1,
316
+ 2,
317
+ -1,
318
+ 2,
319
+ 0,
320
+ 1,
321
+ 2,
322
+ 2,
323
+ 1,
324
+ 2,
325
+ 1,
326
+ 0,
327
+ 2,
328
+ 1,
329
+ 1,
330
+ 1,
331
+ 2,
332
+ 0,
333
+ -1,
334
+ 2,
335
+ -1,
336
+ 0,
337
+ 1,
338
+ -1,
339
+ 0,
340
+ 0,
341
+ 1,
342
+ 2,
343
+ 2,
344
+ 2,
345
+ 2,
346
+ 2,
347
+ 2,
348
+ 2,
349
+ 2,
350
+ 1,
351
+ 1,
352
+ 2,
353
+ 2,
354
+ 2,
355
+ 2,
356
+ 2,
357
+ 1,
358
+ 1,
359
+ 1,
360
+ 1,
361
+ 1,
362
+ 1,
363
+ 1,
364
+ 1,
365
+ 1,
366
+ 0,
367
+ 1,
368
+ -1,
369
+ 1,
370
+ 1,
371
+ 1,
372
+ 1,
373
+ -1,
374
+ 1,
375
+ 0
376
+ ],
377
+ "topic_sizes": {
378
+ "1": 56,
379
+ "2": 54,
380
+ "0": 77,
381
+ "-1": 16
382
+ },
383
+ "topic_mapper": [
384
+ [
385
+ -1,
386
+ -1,
387
+ -1
388
+ ],
389
+ [
390
+ 0,
391
+ 0,
392
+ 0
393
+ ],
394
+ [
395
+ 1,
396
+ 1,
397
+ 1
398
+ ],
399
+ [
400
+ 2,
401
+ 2,
402
+ 2
403
+ ]
404
+ ],
405
+ "topic_labels": {
406
+ "-1": "-1_on_card_my_charge",
407
+ "0": "0_refund_my_nike_for",
408
+ "1": "1_to_my_klarna_email",
409
+ "2": "2_my_the_payment_klarna"
410
+ },
411
+ "custom_labels": null,
412
+ "_outliers": 1,
413
+ "topic_aspects": {}
414
+ }