vikramvasudevan committed on
Commit
b9afd09
·
verified ·
1 Parent(s): 68fd3ab

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. modules/indexer.py +44 -24
modules/indexer.py CHANGED
@@ -2,33 +2,53 @@
2
  from typing import Dict, List
3
  from openai import OpenAI
4
 
5
- def index_videos(videos: List[Dict], collection,channel_url : str):
6
  client = OpenAI()
7
 
8
- for vid in videos:
9
- text = f"{vid.get('title', '')} - {vid.get('description', '')}"
10
- embedding = client.embeddings.create(
11
- input=text,
 
 
 
 
 
 
 
 
 
 
12
  model="text-embedding-3-small"
13
- ).data[0].embedding
14
-
15
- # build metadata safely
16
- metadata = {
17
- "video_id": vid.get("video_id"),
18
- "video_title": vid.get("title", ""),
19
- "description" : vid.get('description', ''),
20
- "channel_url" : channel_url,
21
- }
22
-
23
- # add channel info if available
24
- if "channel_id" in vid:
25
- metadata["channel_id"] = vid["channel_id"]
26
- if "channel_title" in vid:
27
- metadata["channel_title"] = vid["channel_title"]
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  collection.add(
30
- documents=[text],
31
- embeddings=[embedding],
32
- metadatas=[metadata],
33
- ids=[vid.get("video_id")]
34
  )
 
 
 
 
 
2
  from typing import Dict, List
3
  from openai import OpenAI
4
 
5
def index_videos(videos: List[Dict], collection, channel_url: str, batch_size: int = 20, *, client=None):
    """Embed videos in batches and add them to ``collection``.

    For every video the text ``"<title> - <description>"`` is embedded with the
    ``text-embedding-3-small`` model, and the text, embedding, metadata and id
    are then passed to ``collection.add`` one batch at a time.

    Args:
        videos: Video records (dicts). Keys read here: ``video_id``, ``title``,
            ``description`` and, optionally, ``channel_id`` / ``channel_title``.
        collection: Target store; must expose
            ``add(documents=..., embeddings=..., metadatas=..., ids=...)``.
            NOTE(review): presumably a Chroma collection -- confirm with caller.
        channel_url: Stored verbatim in every record's metadata and echoed in
            the progress output.
        batch_size: Maximum number of videos per embeddings request and per
            ``collection.add`` call. Must be at least 1.
        client: Optional object exposing ``embeddings.create(input=, model=)``.
            A new ``OpenAI()`` client is created when omitted.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
        RuntimeError: If the embeddings response does not hold exactly one
            embedding per input text.
    """
    # A negative step makes range() empty (nothing indexed, yet "finished" is
    # reported) and a zero step fails with an opaque range() error, so reject
    # both up front with a clear message.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    if client is None:
        client = OpenAI()

    total = len(videos)
    indexed = 0
    print(f"[INDEX] Starting indexing for {total} videos (channel={channel_url})")

    # Split into batches
    for start in range(0, total, batch_size):
        batch = videos[start:start + batch_size]
        print(f"[INDEX] Processing batch {start+1} → {start+len(batch)} of {total}")

        # A record without an id cannot be keyed in the collection; drop it
        # here so one bad record does not fail the whole batch after the
        # embeddings have already been requested.
        indexable = [vid for vid in batch if vid.get("video_id")]
        skipped = len(batch) - len(indexable)
        if skipped:
            print(f"[INDEX] Skipping {skipped} videos without a video_id")
        if not indexable:
            continue

        # Prepare text inputs ("or ''" keeps an explicit None out of the text,
        # which would otherwise be embedded as the literal string "None").
        texts = [
            f"{vid.get('title') or ''} - {vid.get('description') or ''}"
            for vid in indexable
        ]

        # Call embeddings API in batch
        response = client.embeddings.create(
            input=texts,
            model="text-embedding-3-small"
        )
        embeddings = [item.embedding for item in response.data]
        if len(embeddings) != len(texts):
            raise RuntimeError(
                f"Expected {len(texts)} embeddings, got {len(embeddings)}"
            )

        # Build metadata + ids
        metadatas = [_video_metadata(vid, channel_url) for vid in indexable]
        ids = [vid["video_id"] for vid in indexable]

        # Insert in bulk
        collection.add(
            documents=texts,
            embeddings=embeddings,
            metadatas=metadatas,
            ids=ids,
        )

        indexed += len(indexable)
        print(f"[INDEX] ✅ Indexed {len(indexable)} videos (total so far: {indexed}/{total})")

    print(f"[INDEX] 🎉 Finished indexing {indexed} videos for channel={channel_url}")


def _video_metadata(vid: Dict, channel_url: str) -> Dict:
    """Build the metadata dict stored alongside one video's embedding."""
    metadata = {
        "video_id": vid.get("video_id"),
        "video_title": vid.get("title") or "",
        "description": vid.get("description") or "",
        "channel_url": channel_url,
    }
    # Channel info is optional; include it only when the record carries it.
    for key in ("channel_id", "channel_title"):
        if key in vid:
            metadata[key] = vid[key]
    return metadata