Spaces:
Sleeping
Sleeping
File size: 1,313 Bytes
1c18375 7f092ad 1c18375 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
""" colbert_utils.py
Utilities for building (and using) a ColBERT (retrieval) model.
:author: Didier Guillevic
:email: [email protected]
:creation: 2024-12-21
"""
import logging
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Configure the root logger to emit INFO and above. This runs at import time,
# before ragatouille is imported on the following line.
# NOTE(review): basicConfig() does nothing when the root logger already has
# handlers, so its position relative to the import may matter — confirm the
# ordering is intentional before moving it.
logging.basicConfig(level=logging.INFO)
from ragatouille import RAGPretrainedModel
def build_colbert_model(
    documents: list[str],
    metadatas: list[dict[str, str]],
    pretrained_model: str = 'antoinelouis/colbert-xm',
    index_name: str = 'colbert_index',
    *,
    max_document_length: int = 180,
    split_documents: bool = True,
    use_faiss: bool = True,
) -> RAGPretrainedModel:
    """Build a ColBERT model for retrieval.

    Loads the pretrained checkpoint, then indexes the given documents
    (together with their metadata) under ``index_name``.

    Args:
        documents: list of documents to index; must not be empty
        metadatas: list of metadata for each document, one entry per
            document and in the same order as ``documents``. ``None`` is
            forwarded unchanged (no metadata attached).
        pretrained_model: name of the pretrained model to use
        index_name: name of the index built with the given documents
        max_document_length: value forwarded to the indexer's
            ``max_document_length`` argument (default 180)
        split_documents: value forwarded to the indexer's
            ``split_documents`` argument (default True)
        use_faiss: value forwarded to the indexer's ``use_faiss`` argument;
            set to True only if faiss is working properly in the current
            environment (default True)

    Returns:
        the ColBERT retrieval model built with the given documents.

    Raises:
        ValueError: if ``documents`` is empty, or if ``metadatas`` is given
            and its length differs from that of ``documents``.
    """
    # Validate the inputs first so that bad arguments fail immediately,
    # before the pretrained checkpoint is loaded.
    if not documents:
        raise ValueError("documents must contain at least one document")
    if metadatas is not None and len(metadatas) != len(documents):
        raise ValueError(
            f"metadatas has {len(metadatas)} entries but documents has "
            f"{len(documents)}; exactly one metadata entry per document "
            "is required"
        )

    model = RAGPretrainedModel.from_pretrained(pretrained_model)

    # document_ids is deliberately not passed: the documents have no unique
    # IDs at the moment.
    model.index(
        collection=documents,
        document_metadatas=metadatas,
        index_name=index_name,
        max_document_length=max_document_length,
        split_documents=split_documents,
        use_faiss=use_faiss,
    )
    return model
|