File size: 1,313 Bytes
1c18375
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7f092ad
1c18375
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
""" colbert_utils.py

Utilities for building (and using) a ColBERT (retrieval) model.

:author: Didier Guillevic
:email: [email protected]
:creation: 2024-12-21
"""

import logging
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# NOTE: this configures the root logger at import time (level INFO), so
# merely importing this module affects the logging setup of the application.
logging.basicConfig(level=logging.INFO)

from ragatouille import RAGPretrainedModel


def build_colbert_model(
        documents: list[str],
        metadatas: list[dict[str, str]],
        pretrained_model: str='antoinelouis/colbert-xm',
        index_name: str='colbert_index',
        *,
        max_document_length: int=180,
        use_faiss: bool=True
    ) -> RAGPretrainedModel:
    """Build a ColBERT model for retrieval.

    Loads the pretrained checkpoint, then indexes the given documents
    (with their metadata) under the given index name.

    Args:
        documents: list of documents to index
        metadatas: list of metadata, one entry per document and in the same
            order as documents (None is passed through unchanged)
        pretrained_model: name of the pretrained model to use
        index_name: name of the index built with given documents
        max_document_length: value forwarded to the indexer as
            max_document_length (keyword-only, default 180)
        use_faiss: value forwarded to the indexer as use_faiss; set to False
            if faiss is not working properly in the current env
            (keyword-only, default True)

    Returns:
        the ColBERT retrieval model built with the given documents.

    Raises:
        ValueError: if metadatas does not contain exactly one entry per
            document.
    """
    # Validate the inputs before loading the model: loading a pretrained
    # checkpoint is expensive, so a mismatch should be reported up front.
    if metadatas is not None and len(metadatas) != len(documents):
        raise ValueError(
            "documents and metadatas must have the same length "
            f"(got {len(documents)} documents and {len(metadatas)} metadatas)"
        )

    model = RAGPretrainedModel.from_pretrained(pretrained_model)
    # No unique document IDs are supplied at the moment, so document_ids is
    # left to the indexer's default.
    model.index(
        collection=documents,
        document_metadatas=metadatas,
        index_name=index_name,
        max_document_length=max_document_length,
        split_documents=True,
        use_faiss=use_faiss,
    )
    return model