chore-add-mmteb
#62
by
bwang0911
- opened
README.md
CHANGED
@@ -17456,33 +17456,6 @@ model-index:
|
|
17456 |
value: 60.887608967403914
|
17457 |
task:
|
17458 |
type: STS
|
17459 |
-
- dataset:
|
17460 |
-
config: default
|
17461 |
-
name: MTEB QBQTC (default)
|
17462 |
-
revision: 790b0510dc52b1553e8c49f3d2afb48c0e5c48b7
|
17463 |
-
split: test
|
17464 |
-
type: C-MTEB/QBQTC
|
17465 |
-
metrics:
|
17466 |
-
- type: cosine_pearson
|
17467 |
-
value: 34.20049144526891
|
17468 |
-
- type: cosine_spearman
|
17469 |
-
value: 36.41802814113771
|
17470 |
-
- type: euclidean_pearson
|
17471 |
-
value: 34.569942139590626
|
17472 |
-
- type: euclidean_spearman
|
17473 |
-
value: 36.06141660786936
|
17474 |
-
- type: main_score
|
17475 |
-
value: 36.41802814113771
|
17476 |
-
- type: manhattan_pearson
|
17477 |
-
value: 34.537041543916003
|
17478 |
-
- type: manhattan_spearman
|
17479 |
-
value: 36.033418927773825
|
17480 |
-
- type: pearson
|
17481 |
-
value: 34.20049144526891
|
17482 |
-
- type: spearman
|
17483 |
-
value: 36.41802814113771
|
17484 |
-
task:
|
17485 |
-
type: STS
|
17486 |
- dataset:
|
17487 |
config: default
|
17488 |
name: MTEB STSB (default)
|
@@ -25042,7 +25015,7 @@ model-index:
|
|
25042 |
<br><br>
|
25043 |
|
25044 |
<p align="center">
|
25045 |
-
<img src="https://huggingface.co/
|
25046 |
</p>
|
25047 |
|
25048 |
|
@@ -25056,7 +25029,7 @@ model-index:
|
|
25056 |
|
25057 |
## Quick Start
|
25058 |
|
25059 |
-
[Blog](https://jina.ai/news/jina-embeddings-v3-a-frontier-multilingual-embedding-model/#parameter-dimensions) | [Azure](https://azuremarketplace.microsoft.com/en-us/marketplace/apps/jinaai.jina-embeddings-v3
|
25060 |
|
25061 |
|
25062 |
## Intended Usage & Model Info
|
@@ -25083,13 +25056,6 @@ While the foundation model supports 100 languages, we've focused our tuning effo
|
|
25083 |
Hindi, Indonesian, Italian, Japanese, Korean, Latvian, Norwegian, Polish, Portuguese, Romanian,
|
25084 |
Russian, Slovak, Spanish, Swedish, Thai, Turkish, Ukrainian, Urdu,** and **Vietnamese.**
|
25085 |
|
25086 |
-
|
25087 |
-
> **⚠️ Important Notice:**
|
25088 |
-
> We fixed a bug in the `encode` function [#60](https://huggingface.co/jinaai/jina-embeddings-v3/discussions/60) where **Matryoshka embedding truncation** occurred *after normalization*, leading to non-normalized truncated embeddings. This issue has been resolved in the latest code revision.
|
25089 |
-
>
|
25090 |
-
> If you have encoded data using the previous version and wish to maintain consistency, please use the specific code revision when loading the model: `AutoModel.from_pretrained('jinaai/jina-embeddings-v3', code_revision='da863dd04a4e5dce6814c6625adfba87b83838aa', ...)`
|
25091 |
-
|
25092 |
-
|
25093 |
## Usage
|
25094 |
|
25095 |
**<details><summary>Apply mean pooling when integrating the model.</summary>**
|
@@ -25240,15 +25206,6 @@ import onnxruntime
|
|
25240 |
import numpy as np
|
25241 |
from transformers import AutoTokenizer, PretrainedConfig
|
25242 |
|
25243 |
-
# Mean pool function
|
25244 |
-
def mean_pooling(model_output: np.ndarray, attention_mask: np.ndarray):
|
25245 |
-
token_embeddings = model_output
|
25246 |
-
input_mask_expanded = np.expand_dims(attention_mask, axis=-1)
|
25247 |
-
input_mask_expanded = np.broadcast_to(input_mask_expanded, token_embeddings.shape)
|
25248 |
-
sum_embeddings = np.sum(token_embeddings * input_mask_expanded, axis=1)
|
25249 |
-
sum_mask = np.clip(np.sum(input_mask_expanded, axis=1), a_min=1e-9, a_max=None)
|
25250 |
-
return sum_embeddings / sum_mask
|
25251 |
-
|
25252 |
# Load tokenizer and model config
|
25253 |
tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v3')
|
25254 |
config = PretrainedConfig.from_pretrained('jinaai/jina-embeddings-v3')
|
@@ -25270,11 +25227,7 @@ inputs = {
|
|
25270 |
}
|
25271 |
|
25272 |
# Run model
|
25273 |
-
outputs = session.run(None, inputs)
|
25274 |
-
|
25275 |
-
# Apply mean pooling and normalization to the model outputs
|
25276 |
-
embeddings = mean_pooling(outputs, input_text["attention_mask"])
|
25277 |
-
embeddings = embeddings / np.linalg.norm(embeddings, ord=2, axis=1, keepdims=True)
|
25278 |
```
|
25279 |
|
25280 |
</p>
|
|
|
17456 |
value: 60.887608967403914
|
17457 |
task:
|
17458 |
type: STS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17459 |
- dataset:
|
17460 |
config: default
|
17461 |
name: MTEB STSB (default)
|
|
|
25015 |
<br><br>
|
25016 |
|
25017 |
<p align="center">
|
25018 |
+
<img src="https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/603763514de52ff951d89793/AFoybzd5lpBQXEBrQHuTt.png?w=200&h=200&f=face" alt="Finetuner logo: Finetuner helps you create experiments to improve embeddings on search tasks. It helps you deliver the last mile of performance tuning for neural search applications." width="150px">
|
25019 |
</p>
|
25020 |
|
25021 |
|
|
|
25029 |
|
25030 |
## Quick Start
|
25031 |
|
25032 |
+
[Blog](https://jina.ai/news/jina-embeddings-v3-a-frontier-multilingual-embedding-model/#parameter-dimensions) | [Azure](https://azuremarketplace.microsoft.com/en-us/marketplace/apps/jinaai.jina-embeddings-v3) | [AWS SageMaker](https://aws.amazon.com/marketplace/pp/prodview-kdi3xkt62lo32) | [API](https://jina.ai/embeddings)
|
25033 |
|
25034 |
|
25035 |
## Intended Usage & Model Info
|
|
|
25056 |
Hindi, Indonesian, Italian, Japanese, Korean, Latvian, Norwegian, Polish, Portuguese, Romanian,
|
25057 |
Russian, Slovak, Spanish, Swedish, Thai, Turkish, Ukrainian, Urdu,** and **Vietnamese.**
|
25058 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25059 |
## Usage
|
25060 |
|
25061 |
**<details><summary>Apply mean pooling when integrating the model.</summary>**
|
|
|
25206 |
import numpy as np
|
25207 |
from transformers import AutoTokenizer, PretrainedConfig
|
25208 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25209 |
# Load tokenizer and model config
|
25210 |
tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v3')
|
25211 |
config = PretrainedConfig.from_pretrained('jinaai/jina-embeddings-v3')
|
|
|
25227 |
}
|
25228 |
|
25229 |
# Run model
|
25230 |
+
outputs = session.run(None, inputs)
|
|
|
|
|
|
|
|
|
25231 |
```
|
25232 |
|
25233 |
</p>
|