Commit 1a80e63 · verified · 0 Parent(s)
Duplicate from nomic-ai/nomic-embed-text-v2-moe
Co-authored-by: Zach Nussbaum <[email protected]>
- .gitattributes +36 -0
- 1_Pooling/config.json +10 -0
- README.md +273 -0
- config.json +74 -0
- config_sentence_transformers.json +20 -0
- model.safetensors +3 -0
- modules.json +20 -0
- sentence_bert_config.json +4 -0
- sentencepiece.bpe.model +3 -0
- special_tokens_map.json +51 -0
- tokenizer.json +3 -0
- tokenizer_config.json +54 -0
.gitattributes
ADDED
@@ -0,0 +1,36 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
tokenizer.json filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json
ADDED
@@ -0,0 +1,10 @@
{
    "word_embedding_dimension": 768,
    "pooling_mode_cls_token": false,
    "pooling_mode_mean_tokens": true,
    "pooling_mode_max_tokens": false,
    "pooling_mode_mean_sqrt_len_tokens": false,
    "pooling_mode_weightedmean_tokens": false,
    "pooling_mode_lasttoken": false,
    "include_prompt": true
}
README.md
ADDED
@@ -0,0 +1,273 @@
---
base_model:
- nomic-ai/nomic-embed-text-v2-moe-unsupervised
library_name: sentence-transformers
pipeline_tag: sentence-similarity
tags:
- sentence-transformers
- sentence-similarity
- feature-extraction
license: apache-2.0
language:
- en
- es
- fr
- de
- it
- pt
- pl
- nl
- tr
- ja
- vi
- ru
- id
- ar
- cs
- ro
- sv
- el
- uk
- zh
- hu
- da
- 'no'
- hi
- fi
- bg
- ko
- sk
- th
- he
- ca
- lt
- fa
- ms
- sl
- lv
- mr
- bn
- sq
- cy
- be
- ml
- kn
- mk
- ur
- fy
- te
- eu
- sw
- so
- sd
- uz
- co
- hr
- gu
- ce
- eo
- jv
- la
- zu
- mn
- si
- ga
- ky
- tg
- my
- km
- mg
- pa
- sn
- ha
- ht
- su
- gd
- ny
- ps
- ku
- am
- ig
- lo
- mi
- nn
- sm
- yi
- st
- tl
- xh
- yo
- af
- ta
- tn
- ug
- az
- ba
- bs
- dv
- et
- gl
- gn
- gv
- hy
---

# nomic-embed-text-v2-moe: Multilingual Mixture of Experts Text Embeddings

## Model Overview
`nomic-embed-text-v2-moe` is a SoTA multilingual MoE text embedding model that excels at multilingual retrieval:

- **High Performance**: SoTA multilingual performance compared to ~300M-parameter models, competitive with models 2x its size
- **Multilinguality**: Supports ~100 languages and trained on over 1.6B pairs
- **Flexible Embedding Dimension**: Trained with [Matryoshka Embeddings](https://arxiv.org/abs/2205.13147) for a 3x reduction in storage cost with minimal performance degradation
- **Fully Open-Source**: Model weights, [code](https://github.com/nomic-ai/contrastors), and training data (see code repo) released

| Model | Params (M) | Emb Dim | BEIR | MIRACL | Pretrain Data | Finetune Data | Code |
|-------|------------|---------|------|--------|---------------|---------------|------|
| **Nomic Embed v2** | 305 | 768 | 52.86 | **65.80** | ✅ | ✅ | ✅ |
| mE5 Base | 278 | 768 | 48.88 | 62.30 | ❌ | ❌ | ❌ |
| mGTE Base | 305 | 768 | 51.10 | 63.40 | ❌ | ❌ | ❌ |
| Arctic Embed v2 Base | 305 | 768 | **55.40** | 59.90 | ❌ | ❌ | ❌ |
| BGE M3 | 568 | 1024 | 48.80 | **69.20** | ❌ | ✅ | ❌ |
| Arctic Embed v2 Large | 568 | 1024 | **55.65** | 66.00 | ❌ | ❌ | ❌ |
| mE5 Large | 560 | 1024 | 51.40 | 66.50 | ❌ | ❌ | ❌ |

## Model Architecture
- **Total Parameters**: 475M
- **Active Parameters During Inference**: 305M
- **Architecture Type**: Mixture of Experts (MoE)
- **MoE Configuration**: 8 experts with top-2 routing (sketched below)
- **Embedding Dimensions**: Flexible dimensions from 768 down to 256 through Matryoshka representation learning
- **Maximum Sequence Length**: 512 tokens
- **Languages**: Supports ~100 languages (see the Performance section)
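
To make the routing concrete, here is a minimal, illustrative sketch of a top-2 MoE feed-forward layer over token representations. This is not the model's actual megablocks implementation; the function name, shapes, and the `experts` list are assumptions for illustration only.

```python
import torch
import torch.nn.functional as F

def top2_moe_layer(x, router_weight, experts):
    """Illustrative top-2 MoE feed-forward.
    x: (num_tokens, d_model); router_weight: (d_model, num_experts);
    experts: list of callables mapping (n, d_model) -> (n, d_model)."""
    probs = F.softmax(x @ router_weight, dim=-1)       # (num_tokens, num_experts)
    weights, indices = torch.topk(probs, k=2, dim=-1)  # each token picks its 2 best experts
    out = torch.zeros_like(x)
    for slot in range(2):                              # first and second choice
        for e, expert in enumerate(experts):
            mask = indices[:, slot] == e               # tokens routed to expert e in this slot
            if mask.any():
                out[mask] += weights[mask, slot].unsqueeze(-1) * expert(x[mask])
    return out
```

Only the two selected experts run per token, which is why inference touches 305M of the 475M total parameters.
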
## Usage Guide

### Installation

The model can be used through SentenceTransformers and Transformers.

For best performance on GPU, please install:

```bash
pip install torch transformers einops git+https://github.com/nomic-ai/megablocks.git
```

> [!IMPORTANT]
> The text prompt *must* include a *task instruction prefix*, instructing the model which task is being performed.

Please use `search_query: ` before your queries/questions, and `search_document: ` before your documents.

### Transformers

If using Transformers, **make sure to prepend the task instruction prefix**.

```python
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("nomic-ai/nomic-embed-text-v2-moe")
model = AutoModel.from_pretrained("nomic-ai/nomic-embed-text-v2-moe", trust_remote_code=True)

sentences = ['search_document: Hello!', 'search_document: ¡Hola!']

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
model.eval()
with torch.no_grad():
    model_output = model(**encoded_input)
embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
embeddings = F.normalize(embeddings, p=2, dim=1)
print(embeddings.shape)
# torch.Size([2, 768])

similarity = F.cosine_similarity(embeddings[0], embeddings[1], dim=0)
print(similarity)
# tensor(0.9118)
```

### SentenceTransformers

With SentenceTransformers, you can specify the `prompt_name` as either `"query"` or `"passage"`, and the task instruction will be included automatically.

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("nomic-ai/nomic-embed-text-v2-moe", trust_remote_code=True)
sentences = ["Hello!", "¡Hola!"]
embeddings = model.encode(sentences, prompt_name="passage")
print(embeddings.shape)
# (2, 768)

similarity = model.similarity(embeddings[0], embeddings[1])
print(similarity)
# tensor([[0.9118]])
```
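
For retrieval, queries and documents take different prefixes. Below is a minimal sketch of query-to-document search built on the snippet above; the query and documents are invented for illustration.

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("nomic-ai/nomic-embed-text-v2-moe", trust_remote_code=True)

# "query" adds "search_query: ", "passage" adds "search_document: "
query_embedding = model.encode(["What is a mixture of experts model?"], prompt_name="query")
doc_embeddings = model.encode(
    [
        "Mixture of experts models route each input to a few specialized subnetworks.",
        "Paris is the capital of France.",
    ],
    prompt_name="passage",
)

scores = model.similarity(query_embedding, doc_embeddings)  # cosine scores, shape (1, 2)
best_doc = scores.argmax().item()  # index of the highest-scoring document
```
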

## Performance

nomic-embed-text-v2-moe performance on BEIR and MIRACL compared to other open-weights embedding models:

![Performance on BEIR and MIRACL vs. other open-weights embedding models](moe_perf.png)

nomic-embed-text-v2-moe performance on BEIR at 768 dimensions and truncated to 256 dimensions:

![BEIR performance at 768 vs. 256 dimensions](moe_matryoshka.png)

## Best Practices
- Add appropriate prefixes to your text:
  - For queries: "search_query: "
  - For documents: "search_document: "
- Maximum input length is 512 tokens
- For optimal efficiency, consider using the 256-dimension embeddings if storage or compute is a concern (see the truncation sketch below)
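
Because the model was trained with Matryoshka representation learning, shorter embeddings can be obtained by truncating the 768-dimensional output and re-normalizing. A minimal sketch of that standard slice-then-normalize recipe (the model card itself does not show this; recent sentence-transformers releases also accept a `truncate_dim` argument as an alternative):

```python
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("nomic-ai/nomic-embed-text-v2-moe", trust_remote_code=True)
embeddings = model.encode(["Hello!", "¡Hola!"], prompt_name="passage", convert_to_tensor=True)

# Keep the first 256 Matryoshka dimensions, then re-normalize for cosine similarity
embeddings_256 = F.normalize(embeddings[:, :256], p=2, dim=1)
print(embeddings_256.shape)
# torch.Size([2, 256])
```
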

## Limitations
- Performance may vary across languages
- Resource requirements may be higher than for traditional dense models due to the MoE architecture
- Must use `trust_remote_code=True` when loading the model to use our custom architecture implementation

## Training Details

![Training pipeline overview](moe_training.png)

- Trained on 1.6 billion high-quality pairs across multiple languages
- Uses consistency filtering to ensure high-quality training data
- Incorporates Matryoshka representation learning for dimension flexibility (a generic loss sketch follows this list)
- Training includes both weakly-supervised contrastive pretraining and supervised finetuning
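
For intuition, here is a generic sketch of a Matryoshka-style contrastive objective: the same in-batch InfoNCE loss is applied at several prefix lengths of the embedding and averaged. The dimension set, temperature, and function name are illustrative assumptions; the actual training recipe lives in the contrastors repository.

```python
import torch
import torch.nn.functional as F

def matryoshka_info_nce(query_emb, doc_emb, dims=(768, 512, 256), temperature=0.05):
    """Illustrative Matryoshka objective. query_emb and doc_emb are aligned
    (batch, 768) pairs; row i of doc_emb is the positive for row i of query_emb."""
    labels = torch.arange(query_emb.size(0), device=query_emb.device)
    total = 0.0
    for k in dims:
        q = F.normalize(query_emb[:, :k], dim=-1)   # truncate to the first k dims
        d = F.normalize(doc_emb[:, :k], dim=-1)
        logits = q @ d.T / temperature              # other rows act as in-batch negatives
        total = total + F.cross_entropy(logits, labels)
    return total / len(dims)
```
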

For more details, please check out the [blog post](https://www.nomic.ai/blog/posts/nomic-embed-text-v2) and [technical report](https://www.arxiv.org/abs/2502.07972).

## Join the Nomic Community

- Nomic: [https://nomic.ai](https://nomic.ai)
- Discord: [https://discord.gg/myY5YDR8z8](https://discord.gg/myY5YDR8z8)
- Twitter: [https://twitter.com/nomic_ai](https://twitter.com/nomic_ai)

# Citation

If you find the model, dataset, or training code useful, please cite our work:

```bibtex
@misc{nussbaum2025trainingsparsemixtureexperts,
  title={Training Sparse Mixture Of Experts Text Embedding Models},
  author={Zach Nussbaum and Brandon Duderstadt},
  year={2025},
  eprint={2502.07972},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2502.07972},
}
```
config.json
ADDED
@@ -0,0 +1,74 @@
{
  "_name_or_path": "nomic-ai/nomic-xlm-2048",
  "activation_function": "gelu",
  "add_pooling_layer": false,
  "architectures": [
    "NomicBertModel"
  ],
  "attn_pdrop": 0.0,
  "auto_map": {
    "AutoConfig": "nomic-ai/nomic-bert-2048--configuration_hf_nomic_bert.NomicBertConfig",
    "AutoModel": "nomic-ai/nomic-bert-2048--modeling_hf_nomic_bert.NomicBertModel",
    "AutoModelForMaskedLM": "nomic-ai/nomic-bert-2048--modeling_hf_nomic_bert.NomicBertForPreTraining",
    "AutoModelForMultipleChoice": "nomic-ai/nomic-bert-2048--modeling_hf_nomic_bert.NomicBertForMultipleChoice",
    "AutoModelForQuestionAnswering": "nomic-ai/nomic-bert-2048--modeling_hf_nomic_bert.NomicBertForQuestionAnswering",
    "AutoModelForSequenceClassification": "nomic-ai/nomic-bert-2048--modeling_hf_nomic_bert.NomicBertForSequenceClassification",
    "AutoModelForTokenClassification": "nomic-ai/nomic-bert-2048--modeling_hf_nomic_bert.NomicBertForTokenClassification"
  },
  "bos_token_id": null,
  "causal": false,
  "dense_seq_output": true,
  "embd_pdrop": 0.1,
  "eos_token_id": null,
  "expert_choice_router": false,
  "ffn_div": 1,
  "fused_bias_fc": true,
  "fused_dropout_add_ln": true,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "max_trained_positions": 2048,
  "mlp_fc1_bias": true,
  "mlp_fc2_bias": true,
  "model_type": "nomic_bert",
  "moe_every_n_layers": 2,
  "moe_impl": "megablocks",
  "moe_normalize_expert_weights": false,
  "moe_resid_pdrop": 0.0,
  "moe_top_k": 2,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": 3072,
  "n_layer": 12,
  "n_positions": 2048,
  "num_experts": 8,
  "num_shared_experts": 0,
  "pad_token_id": 1,
  "pad_vocab_size_multiple": 64,
  "parallel_block": false,
  "parallel_block_tied_norm": false,
  "prenorm": false,
  "qkv_proj_bias": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.0,
  "rotary_emb_base": 10000,
  "rotary_emb_fraction": 1.0,
  "rotary_emb_interleaved": false,
  "rotary_emb_scale_base": null,
  "rotary_scaling_factor": null,
  "router_aux_loss_coef": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.44.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "use_flash_attn": true,
  "use_rms_norm": null,
  "use_xentropy": true,
  "vocab_size": 250048
}
config_sentence_transformers.json
ADDED
@@ -0,0 +1,20 @@
{
  "__version__": {
    "sentence_transformers": "3.3.0",
    "transformers": "4.44.2",
    "pytorch": "2.4.1+cu121"
  },
  "prompts": {
    "query": "search_query: ",
    "passage": "search_document: ",
    "Classification": "classification: ",
    "MultilabelClassification": "classification: ",
    "Clustering": "clustering: ",
    "PairClassification": "classification: ",
    "STS": "classification: ",
    "Summarization": "classification: ",
    "Speed": "search_document: "
  },
  "default_prompt_name": null,
  "similarity_fn_name": "cosine"
}
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:097012b27af76d80af74fed4bc2ccc9091245286f776adf03ad1758a24ade9a0
size 1901187232
modules.json
ADDED
@@ -0,0 +1,20 @@
[
  {
    "idx": 0,
    "name": "0",
    "path": "",
    "type": "sentence_transformers.models.Transformer"
  },
  {
    "idx": 1,
    "name": "1",
    "path": "1_Pooling",
    "type": "sentence_transformers.models.Pooling"
  },
  {
    "idx": 2,
    "name": "2",
    "path": "2_Normalize",
    "type": "sentence_transformers.models.Normalize"
  }
]
sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
{
  "max_seq_length": 512,
  "do_lower_case": false
}
sentencepiece.bpe.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
size 5069051
special_tokens_map.json
ADDED
@@ -0,0 +1,51 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "cls_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "mask_token": {
    "content": "<mask>",
    "lstrip": true,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3a56def25aa40facc030ea8b0b87f3688e4b3c39eb8b45d5702b3a1300fe2a20
size 17082734
tokenizer_config.json
ADDED
@@ -0,0 +1,54 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "250001": {
      "content": "<mask>",
      "lstrip": true,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": true,
  "cls_token": "<s>",
  "eos_token": "</s>",
  "mask_token": "<mask>",
  "model_max_length": 512,
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "tokenizer_class": "XLMRobertaTokenizer",
  "unk_token": "<unk>"
}