vikramvasudevan committed on
Commit
82f7076
·
verified ·
1 Parent(s): 5184cba

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. app.py +66 -25
  2. main.py +2 -2
  3. modules/collector.py +47 -51
  4. modules/db.py +3 -3
app.py CHANGED
@@ -3,7 +3,7 @@ import re
3
  import gradio as gr
4
  from gradio_modal import Modal
5
  import chromadb
6
- from modules.collector import fetch_channel_videos_from_url
7
  from modules.db import (
8
  delete_channel_from_collection,
9
  get_collection,
@@ -62,11 +62,23 @@ def enable_if_not_none(question):
62
  # Fetch & index channels
63
  # -------------------------------
64
  def refresh_channel(api_key, channel_url: str):
65
- videos = fetch_channel_videos_from_url(api_key, channel_url)
66
- for v in videos:
67
- v["channel_url"] = channel_url
68
- index_videos(videos, get_collection(), channel_url=channel_url)
69
- return len(videos)
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
 
72
  def index_channels(channel_urls: str):
@@ -111,31 +123,59 @@ def list_channels_radio():
111
  channel_id = key
112
  if channel_id:
113
  choices.append((channel_display_name, channel_id))
114
- print("choices= ", choices)
115
  return choices
116
 
117
 
118
  # -------------------------------
119
  # Fetch channel videos as HTML table
120
  # -------------------------------
121
- def fetch_channel_html(channel_url: str):
122
- api_key = os.environ["YOUTUBE_API_KEY"]
123
- videos = fetch_channel_videos_from_url(api_key, channel_url, max_results=50)
124
- if not videos:
125
- return "<p>No videos found.</p>"
126
- html = "<table border='1' style='border-collapse: collapse; width:100%'>"
127
- html += "<tr><th>#<th>Title</th><th>Video URL</th><th>Description</th></tr>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  for idx, v in enumerate(videos):
129
- html += "<tr>"
130
- html += f"<td>{idx+1}</td>"
131
- html += f"<td>{v['title']}</td>"
132
- html += f"<td><a href='https://youtube.com/watch?v={v['video_id']}' target='_blank'>Watch Video</a></td>"
133
- html += f"<td>{v.get('description','')}</td>"
134
- html += "</tr>"
135
- html += "</table>"
 
 
 
 
136
  return html
137
 
138
- get_collection # -------------------------------
139
 
140
 
141
  # Delete a channel
@@ -203,7 +243,7 @@ with gr.Blocks() as demo:
203
  with gr.Row():
204
  gr.Column()
205
  save_add_channels_btn = gr.Button(
206
- "Add Channels", scale=0, variant="primary"
207
  )
208
  gr.Column()
209
  index_status = gr.Markdown(label="Index Status", container=False)
@@ -358,7 +398,7 @@ with gr.Blocks() as demo:
358
 
359
  # Show videos modal when button clicked
360
  def show_selected_channel_videos(selected_channel_id):
361
- print("selected_channel_id = ", selected_channel_id)
362
  return fetch_channel_html(selected_channel_id)
363
 
364
  channel_radio.change(
@@ -395,7 +435,8 @@ with gr.Blocks() as demo:
395
  def init():
396
  channels = "https://www.youtube.com/@onedayonepasuram6126,https://www.youtube.com/@srisookthi,https://www.youtube.com/@learn-aksharam"
397
  for resp in index_channels(channels):
398
- print(resp)
 
399
 
400
 
401
  if __name__ == "__main__":
 
3
  import gradio as gr
4
  from gradio_modal import Modal
5
  import chromadb
6
+ from modules.collector import fetch_all_channel_videos
7
  from modules.db import (
8
  delete_channel_from_collection,
9
  get_collection,
 
62
  # Fetch & index channels
63
  # -------------------------------
64
  def refresh_channel(api_key, channel_url: str):
65
+ all_videos = []
66
+ for status, videos_batch in fetch_all_channel_videos(api_key, channel_url):
67
+ # enrich each video in the batch with channel_url
68
+ for v in videos_batch:
69
+ v["channel_url"] = channel_url
70
+ all_videos.extend(videos_batch)
71
+
72
+ # (optional) log progress
73
+ print(status, "- total so far:", len(all_videos))
74
+
75
+ # index all at once
76
+ if all_videos:
77
+ print(f"Adding {len(all_videos)} videos to database")
78
+ index_videos(all_videos, get_collection(), channel_url=channel_url)
79
+ print(f"Loaded {len(all_videos)} videos")
80
+
81
+ return len(all_videos)
82
 
83
 
84
  def index_channels(channel_urls: str):
 
123
  channel_id = key
124
  if channel_id:
125
  choices.append((channel_display_name, channel_id))
126
+ # print("choices= ", choices)
127
  return choices
128
 
129
 
130
  # -------------------------------
131
  # Fetch channel videos as HTML table
132
  # -------------------------------
133
def fetch_channel_html(channel_id: str):
    """Render the indexed videos of a channel as an HTML table.

    The data comes from the collection returned by get_collection(), filtered
    on the "channel_id" metadata field; the YouTube API is not called.

    Args:
        channel_id: Value matched against the "channel_id" metadata field.

    Returns:
        An HTML string: a table with one row per stored video, or a centred
        placeholder message when no metadata is found for the channel.
    """
    # Imported under their own names at function scope so nothing here
    # collides with a variable called `html`.
    from html import escape
    from urllib.parse import quote

    # Only metadata is rendered, so the stored documents are not requested.
    results = get_collection().get(
        where={"channel_id": channel_id},
        include=["metadatas"],
    )

    videos = results.get("metadatas") if results else None
    if not videos:
        return """
        <div style="display:flex;justify-content:center;align-items:center;
        height:200px;flex-direction:column;color:#666;">
        ⚠️ No videos found for this channel.
        </div>
        """

    table_head = """
    <table border="1" style="border-collapse:collapse;width:100%;font-family:sans-serif;">
    <thead style="background:#f0f0f0;">
    <tr>
    <th>#</th>
    <th>Title</th>
    <th>Video URL</th>
    <th>Description</th>
    </tr>
    </thead>
    <tbody>
    """

    rows = []
    for idx, v in enumerate(videos, start=1):
        v = v or {}  # tolerate entries stored without metadata
        # Titles/descriptions originate outside this app; escape them so they
        # are shown as text instead of being interpreted as markup.
        title = escape(str(v.get("video_title", "")))
        description = escape(str(v.get("description", "")))
        video_id = v.get("video_id")
        if video_id:
            href = escape(
                f"https://youtube.com/watch?v={quote(str(video_id), safe='')}"
            )
            link = f'<a href="{href}" target="_blank">Watch Video</a>'
        else:
            # No id stored: leave the cell empty rather than link to "None".
            link = ""
        rows.append(
            f"""
    <tr>
    <td>{idx}</td>
    <td>{title}</td>
    <td>{link}</td>
    <td>{description}</td>
    </tr>
    """
        )

    return table_head + "".join(rows) + "</tbody></table>"
178
 
 
179
 
180
 
181
  # Delete a channel
 
243
  with gr.Row():
244
  gr.Column()
245
  save_add_channels_btn = gr.Button(
246
+ "Add Channel(s)", scale=0, variant="primary"
247
  )
248
  gr.Column()
249
  index_status = gr.Markdown(label="Index Status", container=False)
 
398
 
399
  # Show videos modal when button clicked
400
  def show_selected_channel_videos(selected_channel_id):
401
+ # print("selected_channel_id = ", selected_channel_id)
402
  return fetch_channel_html(selected_channel_id)
403
 
404
  channel_radio.change(
 
435
  def init():
436
  channels = "https://www.youtube.com/@onedayonepasuram6126,https://www.youtube.com/@srisookthi,https://www.youtube.com/@learn-aksharam"
437
  for resp in index_channels(channels):
438
+ # print(resp)
439
+ pass
440
 
441
 
442
  if __name__ == "__main__":
main.py CHANGED
@@ -10,7 +10,7 @@ import chromadb
10
  from dotenv import load_dotenv
11
 
12
  from modules.answerer import answer_query
13
- from modules.collector import fetch_channel_videos
14
  from modules.db import get_collection
15
  from modules.indexer import index_videos
16
 
@@ -26,7 +26,7 @@ def main():
26
 
27
  # Collect + Index
28
  for ch in CHANNELS:
29
- videos = fetch_channel_videos(YT_API_KEY, ch)
30
  index_videos(videos, collection)
31
 
32
  # Ask a question
 
10
  from dotenv import load_dotenv
11
 
12
  from modules.answerer import answer_query
13
+ from modules.collector import fetch_channel_videos_by_id
14
  from modules.db import get_collection
15
  from modules.indexer import index_videos
16
 
 
26
 
27
  # Collect + Index
28
  for ch in CHANNELS:
29
+ videos = fetch_channel_videos_by_id(YT_API_KEY, ch)
30
  index_videos(videos, collection)
31
 
32
  # Ask a question
modules/collector.py CHANGED
@@ -1,69 +1,65 @@
1
  # -------------------------------
2
  # 1. Collector
3
  # -------------------------------
4
- from typing import List,Dict
5
  from googleapiclient.discovery import build
6
 
7
  from modules.youtube_utils import get_channel_id
8
 
 
 
9
 
10
- def fetch_channel_videos_from_url(api_key: str, channel_url: str, max_results=20):
11
  youtube = build("youtube", "v3", developerKey=api_key)
12
  channel_id = get_channel_id(youtube, channel_url)
13
 
14
- # Get channel details to fetch its title
15
- channel_response = youtube.channels().list(
16
- part="snippet",
17
- id=channel_id
18
- ).execute()
19
- channel_title = channel_response["items"][0]["snippet"]["title"]
 
 
20
 
21
- request = youtube.search().list(
22
- part="snippet",
23
- channelId=channel_id,
24
- maxResults=max_results,
25
- order="date"
26
- )
27
- response = request.execute()
28
-
29
- videos = []
30
- for item in response.get("items", []):
31
- if item["id"]["kind"] == "youtube#video":
32
- videos.append({
33
- "video_id": item["id"]["videoId"],
34
- "title": item["snippet"]["title"],
35
- "description": item["snippet"].get("description", ""),
36
- "channel_id": channel_id,
37
- "channel_title": channel_title,
38
- })
39
- return videos
40
-
41
- def fetch_channel_videos(api_key: str, channel_id: str, max_results=20):
42
  youtube = build("youtube", "v3", developerKey=api_key)
43
 
44
- # Fetch channel title
45
  channel_response = youtube.channels().list(
46
- part="snippet",
47
- id=channel_id
48
  ).execute()
 
49
  channel_title = channel_response["items"][0]["snippet"]["title"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
- request = youtube.search().list(
52
- part="snippet",
53
- channelId=channel_id,
54
- maxResults=max_results,
55
- order="date"
56
- )
57
- response = request.execute()
58
-
59
- videos = []
60
- for item in response.get("items", []):
61
- if item["id"]["kind"] == "youtube#video":
62
- videos.append({
63
- "video_id": item["id"]["videoId"],
64
- "title": item["snippet"]["title"],
65
- "description": item["snippet"].get("description", ""),
66
- "channel_id": channel_id,
67
- "channel_title": channel_title,
68
- })
69
- return videos
 
1
  # -------------------------------
2
  # 1. Collector
3
  # -------------------------------
4
+ from typing import List, Dict
5
  from googleapiclient.discovery import build
6
 
7
  from modules.youtube_utils import get_channel_id
8
 
9
+ from googleapiclient.discovery import build
10
+
11
 
12
def fetch_all_channel_videos(api_key: str, channel_url: str, max_results_per_call=50):
    """Yield a channel's videos one page at a time with a progress message.

    Args:
        api_key: YouTube Data API key.
        channel_url: Channel URL; passed to get_channel_id together with the
            API client to obtain the channel id.
        max_results_per_call: Page size forwarded to
            fetch_channel_videos_by_id.

    Yields:
        ``(status, videos)`` tuples. ``status`` is ``"Fetched <n>"`` where
        ``n`` is the running total so far; ``videos`` is the list of video
        dicts from the newly fetched page only, so a consumer that
        accumulates the lists sees each video exactly once.
    """
    youtube = build("youtube", "v3", developerKey=api_key)
    channel_id = get_channel_id(youtube, channel_url)

    total_fetched = 0
    for videos in fetch_channel_videos_by_id(api_key, channel_id, max_results_per_call):
        total_fetched += len(videos)
        print("Fetched", total_fetched)
        # Hand out just this page; the running total lives in the status
        # text. No extra yield follows the loop, so nothing is repeated.
        yield (f"Fetched {total_fetched}", videos)
23
+
24
 
25
def fetch_channel_videos_by_id(api_key: str, channel_id: str, max_results=50):
    """Generate the videos of a channel's uploads playlist, one page per yield.

    Args:
        api_key: YouTube Data API key used to build the client.
        channel_id: Id passed to ``channels().list`` and copied into every
            video dict.
        max_results: ``maxResults`` value for each ``playlistItems().list``
            request.

    Yields:
        A list of dicts per response page, each with the keys ``video_id``,
        ``title``, ``description``, ``channel_id`` and ``channel_title``.
        A page without items yields an empty list.
    """
    client = build("youtube", "v3", developerKey=api_key)

    # One channels().list call provides both the title and the uploads playlist.
    channel_lookup = (
        client.channels().list(part="contentDetails,snippet", id=channel_id).execute()
    )
    channel_title = channel_lookup["items"][0]["snippet"]["title"]
    uploads_playlist_id = channel_lookup["items"][0]["contentDetails"][
        "relatedPlaylists"
    ]["uploads"]

    page_token = None
    has_more_pages = True
    while has_more_pages:
        page = (
            client.playlistItems()
            .list(
                part="snippet",
                playlistId=uploads_playlist_id,
                maxResults=max_results,
                pageToken=page_token,
            )
            .execute()
        )

        yield [
            {
                "video_id": snippet["resourceId"]["videoId"],
                "title": snippet["title"],
                "description": snippet.get("description", ""),
                "channel_id": channel_id,
                "channel_title": channel_title,
            }
            for snippet in (entry["snippet"] for entry in page.get("items", []))
        ]

        # Keep going only while the response carries a token for another page.
        page_token = page.get("nextPageToken")
        has_more_pages = bool(page_token)
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
modules/db.py CHANGED
@@ -38,7 +38,7 @@ def get_indexed_channels(collection):
38
 
39
  if cid: # only include if we have a channel_id
40
  channels[cid] = cname
41
- print("channels= ",channels)
42
  return channels
43
 
44
 
@@ -48,8 +48,8 @@ def get_indexed_channels(collection):
48
  def delete_channel_from_collection(channel_id: str):
49
  """Remove a channel from the index and refresh the radio choices."""
50
  # Delete all videos for this channel
51
- print("Deleting channel", channel_id)
52
  data = get_collection().get(where={"channel_id": channel_id})
53
- print("data = ", data)
54
  get_collection().delete(where={"channel_id": channel_id})
55
  collection = get_collection()
 
38
 
39
  if cid: # only include if we have a channel_id
40
  channels[cid] = cname
41
+ # print("channels= ",channels)
42
  return channels
43
 
44
 
 
48
  def delete_channel_from_collection(channel_id: str):
49
  """Remove a channel from the index and refresh the radio choices."""
50
  # Delete all videos for this channel
51
+ # print("Deleting channel", channel_id)
52
  data = get_collection().get(where={"channel_id": channel_id})
53
+ # print("data = ", data)
54
  get_collection().delete(where={"channel_id": channel_id})
55
  collection = get_collection()