Spaces:

vikramvasudevan
/

youtube-channel-surfer-ai

Running

Upload folder using huggingface_hub

e51e296 verified about 1 month ago

1.95 kB

	# modules/indexer.py
	from typing import Dict, List
	from openai import OpenAI

	def index_videos(videos: List[Dict], collection, channel_url: str, batch_size: int = 50) -> int:
	    """Embed video titles/descriptions in batches and add them to ``collection``.

	    Each video becomes one document of the form ``"<title> - <description>"``.
	    The documents of a batch are embedded with a single call to the OpenAI
	    embeddings endpoint (model ``text-embedding-3-small``) and then handed to
	    ``collection.add`` together with per-video metadata, using each video's
	    ``video_id`` as the record id.

	    Args:
	        videos: Video dicts. The keys read here are ``video_id``, ``title``,
	            ``description`` and, optionally, ``channel_id`` / ``channel_title``.
	        collection: Object exposing ``add(documents=, embeddings=, metadatas=,
	            ids=)`` -- presumably a Chroma collection; confirm with the caller.
	        channel_url: Stored in every metadata record and echoed in the logs.
	        batch_size: Maximum number of videos per embeddings request and per
	            ``collection.add`` call. Must be at least 1.

	    Returns:
	        The number of videos in ``videos``.

	    Raises:
	        ValueError: If ``batch_size`` is smaller than 1.
	    """
	    if batch_size < 1:
	        # A negative step makes range() yield nothing, so the function would
	        # report success without indexing a single video; fail loudly instead.
	        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

	    total = len(videos)
	    print(f"[INDEX] Starting indexing for {total} videos (channel={channel_url})")

	    # Build the client only when there is something to embed, so calling this
	    # with an empty list does not depend on a working client setup.
	    client = OpenAI() if total else None

	    # Walk the input in consecutive slices of at most `batch_size` videos.
	    for start in range(0, total, batch_size):
	        batch = videos[start:start + batch_size]
	        end = start + len(batch)
	        percent = round((end / total) * 100, 1)

	        print(f"[INDEX] Processing batch {start+1} → {end} of {total} — {percent}%")

	        texts, metadatas, ids = [], [], []
	        for vid in batch:
	            # `or ""` also covers keys that are present but set to None, which
	            # would otherwise end up as the literal text "None" in the document.
	            title = vid.get("title") or ""
	            description = vid.get("description") or ""
	            texts.append(f"{title} - {description}")

	            metadata = {
	                "video_id": vid.get("video_id"),
	                "video_title": title,
	                "description": description,
	                "channel_url": channel_url,
	            }
	            # Channel details are copied only when the video dict carries them.
	            if "channel_id" in vid:
	                metadata["channel_id"] = vid["channel_id"]
	            if "channel_title" in vid:
	                metadata["channel_title"] = vid["channel_title"]

	            metadatas.append(metadata)
	            # NOTE(review): a video without "video_id" yields a None id here;
	            # confirm upstream always provides it.
	            ids.append(vid.get("video_id"))

	        # One embeddings request for the whole batch.
	        response = client.embeddings.create(
	            input=texts,
	            model="text-embedding-3-small",
	        )
	        embeddings = [item.embedding for item in response.data]

	        # Insert the batch in bulk.
	        collection.add(
	            documents=texts,
	            embeddings=embeddings,
	            metadatas=metadatas,
	            ids=ids,
	        )

	        print(f"[INDEX] ✅ Indexed {len(batch)} videos (total so far: {end}/{total} — {percent}%)")

	    print(f"[INDEX] 🎉 Finished indexing {total} videos for channel={channel_url}")
	    return total