vikramvasudevan's picture
Upload folder using huggingface_hub
e51e296 verified
raw
history blame
1.95 kB
# modules/indexer.py
from typing import Dict, List
from openai import OpenAI
def index_videos(videos: List[Dict], collection, channel_url: str, batch_size: int = 50):
    """Embed video titles/descriptions and bulk-insert them into a vector collection.

    Args:
        videos: Video dicts; each should carry a ``"video_id"`` plus optional
            ``"title"``, ``"description"``, ``"channel_id"``, ``"channel_title"``.
            Entries without a truthy ``"video_id"`` are skipped with a warning,
            since the vector store requires a string id per document.
        collection: Vector-store collection exposing a Chroma-style
            ``add(documents=, embeddings=, metadatas=, ids=)`` method.
        channel_url: Source channel URL, stored on every metadata record.
        batch_size: Number of videos embedded per API call (must be > 0).

    Returns:
        The total number of videos received (even if some entries were
        skipped), preserving the original return contract.
    """
    total = len(videos)
    if total == 0:
        # Nothing to do — avoid creating an API client for an empty channel.
        print(f"[INDEX] No videos to index (channel={channel_url})")
        return total

    client = OpenAI()
    print(f"[INDEX] Starting indexing for {total} videos (channel={channel_url})")

    # Split into batches so each embeddings call stays within API limits.
    for start in range(0, total, batch_size):
        raw_batch = videos[start:start + batch_size]
        end = start + len(raw_batch)
        percent = round((end / total) * 100, 1)
        print(f"[INDEX] Processing batch {start+1} β†’ {end} of {total} β€” {percent}%")

        # A missing/empty "video_id" would put None into `ids` and make
        # collection.add() reject the whole batch — filter those out up front
        # so documents/embeddings/metadatas/ids stay aligned.
        batch = [vid for vid in raw_batch if vid.get("video_id")]
        skipped = len(raw_batch) - len(batch)
        if skipped:
            print(f"[INDEX] Skipping {skipped} video(s) without a video_id")
        if not batch:
            continue

        # Prepare text inputs: one "title - description" document per video.
        texts = [f"{vid.get('title', '')} - {vid.get('description', '')}" for vid in batch]

        # Call embeddings API once per batch.
        response = client.embeddings.create(
            input=texts,
            model="text-embedding-3-small"
        )
        embeddings = [item.embedding for item in response.data]

        # Build metadata + ids in lockstep with `texts`.
        metadatas, ids = [], []
        for vid in batch:
            metadata = {
                "video_id": vid["video_id"],
                "video_title": vid.get("title", ""),
                "description": vid.get("description", ""),
                "channel_url": channel_url,
            }
            # Optional channel fields are stored only when present, keeping
            # the metadata schema free of None values.
            if "channel_id" in vid:
                metadata["channel_id"] = vid["channel_id"]
            if "channel_title" in vid:
                metadata["channel_title"] = vid["channel_title"]
            metadatas.append(metadata)
            ids.append(vid["video_id"])

        # Insert in bulk: one round-trip to the vector store per batch.
        collection.add(
            documents=texts,
            embeddings=embeddings,
            metadatas=metadatas,
            ids=ids,
        )
        print(f"[INDEX] βœ… Indexed {len(batch)} videos (total so far: {end}/{total} β€” {percent}%)")

    print(f"[INDEX] πŸŽ‰ Finished indexing {total} videos for channel={channel_url}")
    return total