|
|
|
from typing import Dict, List |
|
from openai import OpenAI |
|
|
|
def _video_text(vid: Dict) -> str:
    """Return the "<title> - <description>" string embedded for one video."""
    # `or ""` also covers keys that are present but hold None, which would
    # otherwise be rendered as the literal text "None".
    return f"{vid.get('title') or ''} - {vid.get('description') or ''}"


def _video_metadata(vid: Dict, channel_url: str) -> Dict:
    """Build the metadata record stored alongside one video's embedding."""
    metadata = {
        "video_id": vid.get("video_id"),
        "video_title": vid.get("title") or "",
        "description": vid.get("description") or "",
        "channel_url": channel_url,
    }
    # Channel fields are optional: copy them only when the record has them.
    for key in ("channel_id", "channel_title"):
        if key in vid:
            metadata[key] = vid[key]
    return metadata


def index_videos(videos: List[Dict], collection, channel_url: str, batch_size: int = 50) -> int:
    """Embed video titles/descriptions in batches and add them to a collection.

    For every batch of ``batch_size`` videos, one embeddings request is sent
    (model ``text-embedding-3-small``) and the resulting vectors are passed to
    ``collection.add`` together with the source texts, per-video metadata and
    the video ids. Progress is reported with ``print``.

    Args:
        videos: Video records. The keys read here are ``video_id``, ``title``,
            ``description`` and, optionally, ``channel_id`` / ``channel_title``.
        collection: Object exposing ``add(documents=, embeddings=, metadatas=,
            ids=)`` (presumably a ChromaDB collection -- confirm with caller).
        channel_url: Channel URL stored in every metadata record and echoed in
            the progress output.
        batch_size: Number of videos per embeddings request; must be >= 1.

    Returns:
        The number of videos in ``videos``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() empty and the function would report
    # success without indexing anything; zero would fail inside range() with
    # a less helpful message.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    total = len(videos)
    print(f"[INDEX] Starting indexing for {total} videos (channel={channel_url})")

    # Only build the API client when there is something to embed, so an empty
    # input never depends on client configuration.
    client = OpenAI() if total else None

    for start in range(0, total, batch_size):
        batch = videos[start:start + batch_size]
        end = start + len(batch)
        percent = round((end / total) * 100, 1)

        print(f"[INDEX] Processing batch {start + 1} - {end} of {total} - {percent}%")

        texts = [_video_text(vid) for vid in batch]

        response = client.embeddings.create(
            input=texts,
            model="text-embedding-3-small",
        )
        embeddings = [item.embedding for item in response.data]

        # NOTE(review): a record without "video_id" yields a None id here;
        # whether the collection accepts that is not visible from this code.
        collection.add(
            documents=texts,
            embeddings=embeddings,
            metadatas=[_video_metadata(vid, channel_url) for vid in batch],
            ids=[vid.get("video_id") for vid in batch],
        )

        print(f"[INDEX] Indexed {len(batch)} videos (total so far: {end}/{total} - {percent}%)")

    print(f"[INDEX] Finished indexing {total} videos for channel={channel_url}")
    return total
|
|
|
|