Reproduce SFR-Embedding-Code model performance on CoIR Leaderboard
#9 · opened by yliu279
To reproduce our model's performance on the CoIR leaderboard, please use the following code:
```python
from transformers import AutoModel
import logging
import coir
from coir.evaluation import COIR
from typing import List, Dict
import torch
from tqdm import tqdm
import numpy as np

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery: {query}'
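# For example (with an illustrative query):
#   get_detailed_instruct("Given Code or Text, retrieve relevant content.", "binary search in C")
# returns: 'Instruct: Given Code or Text, retrieve relevant content.\nQuery: binary search in C'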
# Load the remote model with trust_remote_code=True
base_model = AutoModel.from_pretrained(
    "Salesforce/SFR-Embedding-Code-2B_R", trust_remote_code=True
)
# Define a subclass to override the encoding methods
class CustomCodeXEmbedModel2B(base_model.__class__):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
    def encode_text(self, texts: List[str], batch_size: int = 12, max_length: int = 1024) -> np.ndarray:
        logging.info(f"Encoding {len(texts)} texts...")
        embeddings = []
        for i in tqdm(range(0, len(texts), batch_size), desc="Encoding batches", unit="batch"):
            batch_texts = texts[i:i + batch_size]
            encoded_input = self.tokenizer(
                batch_texts,
                padding=True,
                add_special_tokens=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt"
            )
            # self.tokenizer, self.model, and self.last_token_pool are provided by
            # the remote model class loaded via trust_remote_code=True
            device = next(self.model.parameters()).device
            encoded_input = {key: val.to(device) for key, val in encoded_input.items()}
            with torch.no_grad():
                model_output = self.model(**encoded_input)
            batch_embeddings = self.last_token_pool(model_output, encoded_input['attention_mask'])
            embeddings.append(batch_embeddings.cpu())
        embeddings = torch.cat(embeddings, dim=0)
        logging.info(f"Encoded {embeddings.shape[0]} embeddings.")
        return embeddings.to(dtype=torch.float32).numpy()
    def encode_queries(self, queries: List[str], batch_size: int = 12, max_length: int = 1024, **kwargs) -> np.ndarray:
        task_description = "Given Code or Text, retrieve relevant content."
        all_queries = [get_detailed_instruct(task_description, query) for query in queries]
        return self.encode_text(all_queries, batch_size, max_length)
    def encode_corpus(self, corpus: List[Dict[str, str]], batch_size: int = 12, max_length: int = 1024, **kwargs) -> np.ndarray:
        # Concatenate title and text; the title may be an empty string in some tasks
        all_texts = [(doc.get("title", "") + " " + doc["text"]).strip() for doc in corpus]
        return self.encode_text(all_texts, batch_size, max_length)
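# Each corpus entry follows the BEIR-style convention used by CoIR, e.g.
# {"title": "quicksort", "text": "def quicksort(arr): ..."} (illustrative document)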
# Instantiate the custom model
custom_model = CustomCodeXEmbedModel2B.from_pretrained(
    "Salesforce/SFR-Embedding-Code-2B_R", trust_remote_code=True
)
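# Optionally move the model to GPU for faster encoding:
# custom_model = custom_model.to("cuda")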
# Define datasets
datasets = [
    "codetrans-dl", "stackoverflow-qa", "apps", "codefeedback-mt",
    "codefeedback-st", "cosqa", "synthetic-text2sql",
    "codesearchnet", "codesearchnet-ccr"
]
for dataset in datasets:
    tasks = coir.get_tasks(tasks=[dataset])
    evaluation = COIR(tasks=tasks, batch_size=16)
    results = evaluation.run(custom_model, output_folder='results')
    logging.info(f"Results for {dataset}: {results}")
```
Why aren't the results appearing in MTEB::CoIR?
https://huggingface.co/spaces/mteb/leaderboard