# Spaces: Runtime error
# (Hosting-page status text captured together with the source; not part of the program.)
import json | |
import uuid | |
from huggingface_hub import login | |
from fastembed import SparseTextEmbedding,LateInteractionTextEmbedding | |
from qdrant_client import QdrantClient, models | |
from sentence_transformers import SentenceTransformer | |
from tqdm import tqdm | |
import numpy as np | |
from huggingface_hub import login | |
import os | |
login(os.getenv('HUGGING_FACE_API_KEY')) | |
# --- Configuration, models and input data -----------------------------------
# Directory scanned (non-recursively) for the *.json files to ingest.
folder_path = 'data'

# Embedding models; their identifiers are read from the environment.
dense_model = SentenceTransformer(os.getenv('DENSE_MODEL'))
sparse_model = SparseTextEmbedding(os.getenv('SPARSE_MODEL'))
# late_interaction_embedding_model = LateInteractionTextEmbedding("answerdotai/answerai-colbert-small-v1")

# Collect the records of *every* JSON file. Rebinding `data` to the result of
# each json.load() would silently keep only the last file that was read.
data = []
for filename in sorted(os.listdir(folder_path)):  # sorted: reproducible order
    if not filename.endswith('.json'):
        continue
    file_path = os.path.join(folder_path, filename)
    with open(file_path, encoding='utf-8') as f:
        loaded = json.load(f)
    if isinstance(loaded, list):
        data.extend(loaded)
    else:
        # A file holding a single top-level object contributes one record.
        data.append(loaded)

client = QdrantClient(os.getenv('QDRANT_URL'), api_key=os.getenv('QDRANT_API_KEY'))

# Split the records into at most 1000 chunks that are embedded and uploaded
# one at a time. np.array_split pads with empty chunks when there are fewer
# records than chunks; those are dropped because an empty chunk yields no
# embeddings to size the collection from.
data_array = np.array(data)
split_data = [chunk for chunk in np.array_split(data_array, 1000) if chunk.size]

collection_name = "mne-law"
# --- Embed each chunk and upload it to Qdrant --------------------------------
# For every chunk: compute sparse and dense embeddings of the "tekst" field,
# make sure the target collection exists (checked once, on the first non-empty
# chunk, because the dense vector size is only known after encoding), then
# upload one point per record with the full record as payload.
#
# The named vectors are keyed by the model identifiers from the environment.
# The same names must be used when the collection is created and when points
# are uploaded, otherwise the upload refers to a vector the collection does
# not have.
dense_vector_name = os.getenv('DENSE_MODEL')
sparse_vector_name = os.getenv('SPARSE_MODEL')

# Set to True once the collection is known to exist, so the collection listing
# is requested a single time instead of once per chunk.
collection_ready = False

for local_data in split_data:
    # An empty chunk has no embeddings; indexing dense_embeddings[0] below
    # would fail, so skip it.
    if len(local_data) == 0:
        continue

    payload = list(local_data)
    documents = [obj["tekst"] for obj in payload]

    sparse_embeddings = list(
        tqdm(
            sparse_model.passage_embed(documents),
            total=len(documents),
            desc="π¨ Encoding Sparse Embeddings"
        )
    )
    # late_interaction_embeddings = list(
    #     tqdm(
    #         late_interaction_embedding_model.passage_embed(doc for doc in documents),
    #         total=len(documents),
    #         desc="π¨ Encoding Late Interaction Embeddings"
    #     )
    # )

    # No explicit device: the model encodes on the device it was loaded on,
    # so the script also runs on hosts without a GPU instead of failing on a
    # hard-coded "cuda".
    dense_embeddings = dense_model.encode(documents, show_progress_bar=True)

    if not collection_ready:
        existing_collections = client.get_collections().collections
        if collection_name not in {col.name for col in existing_collections}:
            client.create_collection(
                collection_name=collection_name,
                vectors_config={
                    dense_vector_name: models.VectorParams(
                        size=len(dense_embeddings[0]),
                        distance=models.Distance.COSINE,
                        on_disk=True
                    ),
                    # "answerdotai/answerai-colbert-small-v1": models.VectorParams(
                    #     size=len(late_interaction_embeddings[0][0]),
                    #     distance=models.Distance.COSINE,
                    #     multivector_config=models.MultiVectorConfig(
                    #         comparator=models.MultiVectorComparator.MAX_SIM,
                    #     ),
                    #     hnsw_config=models.HnswConfigDiff(
                    #         m=0,  # Disable HNSW graph creation
                    #     ),
                    #     on_disk=True
                    # ),
                },
                sparse_vectors_config={
                    # NOTE(review): the IDF modifier is kept from the original,
                    # which named this vector "Qdrant/bm25"; presumably
                    # SPARSE_MODEL is a BM25-style model - confirm.
                    sparse_vector_name: models.SparseVectorParams(
                        modifier=models.Modifier.IDF,
                    ),
                },
                quantization_config=models.ScalarQuantization(
                    scalar=models.ScalarQuantizationConfig(
                        type=models.ScalarType.INT8,
                        always_ram=True
                    )
                ),
                optimizers_config=models.OptimizersConfigDiff(
                    indexing_threshold=10000,
                ),
                shard_number=4,
                hnsw_config=models.HnswConfigDiff(on_disk=True),
            )
        collection_ready = True

    print("π Uploading to qdrant collection: " + collection_name)
    # NOTE(review): parallel > 1 presumably makes the client start worker
    # processes; under the spawn/forkserver start methods Python re-imports
    # the main module, so an unguarded script like this one may be re-run in
    # each worker - confirm, and consider an `if __name__ == "__main__":` guard.
    client.upload_points(
        collection_name=collection_name,
        batch_size=32,
        parallel=16,
        points=[
            models.PointStruct(
                id=uuid.uuid4().hex,
                vector={
                    dense_vector_name: dense_embedding,
                    sparse_vector_name: sparse_embedding.as_object(),
                    # "answerdotai/answerai-colbert-small-v1":late_interaction_embedding
                },
                payload=doc,
            )
            for doc, dense_embedding, sparse_embedding in zip(
                payload, dense_embeddings, sparse_embeddings
            )
        ],
    )
# --- Post-upload collection tuning -------------------------------------------
# Create an integer payload index on the "dbid" field of the uploaded records.
client.create_payload_index(
    collection_name=collection_name,
    field_name="dbid",
    field_schema=models.PayloadSchemaType.INTEGER
)

# Set the optimizer's indexing threshold to 20000 now that the upload is done.
# The keyword is `optimizers_config` (plural), the same name that is passed to
# create_collection earlier in this script.
client.update_collection(
    collection_name=collection_name,
    optimizers_config=models.OptimizersConfigDiff(indexing_threshold=20000),
)