vikramvasudevan committed on
Commit
e63f83d
·
verified ·
1 Parent(s): 4332a60

Upload folder using huggingface_hub

Browse files
Files changed (6) hide show
  1. app.py +6 -0
  2. modules/db.py +1 -1
  3. pyproject.toml +1 -0
  4. requirements.txt +4 -0
  5. uv.lock +20 -0
  6. youtube_poller.py +74 -0
app.py CHANGED
@@ -1,5 +1,7 @@
 
1
  import os
2
  import re
 
3
  import gradio as gr
4
  from gradio_modal import Modal
5
  import chromadb
@@ -13,6 +15,7 @@ from modules.indexer import index_videos
13
  from modules.answerer import answer_query, LLMAnswer, VideoItem, build_video_html
14
  from dotenv import load_dotenv
15
 
 
16
  from youtube_sync import sync_channels_from_youtube
17
 
18
  load_dotenv()
@@ -545,4 +548,7 @@ with gr.Blocks() as demo:
545
  if __name__ == "__main__":
546
  for msg in init():
547
  print(msg)
 
 
 
548
  demo.launch()
 
1
+ import asyncio
2
  import os
3
  import re
4
+ import threading
5
  import gradio as gr
6
  from gradio_modal import Modal
7
  import chromadb
 
15
  from modules.answerer import answer_query, LLMAnswer, VideoItem, build_video_html
16
  from dotenv import load_dotenv
17
 
18
+ from youtube_poller import start_poll
19
  from youtube_sync import sync_channels_from_youtube
20
 
21
  load_dotenv()
 
548
  if __name__ == "__main__":
549
  for msg in init():
550
  print(msg)
551
+ # Start polling in a background thread
552
+ poll_thread = threading.Thread(target=start_poll, daemon=True)
553
+ poll_thread.start()
554
  demo.launch()
modules/db.py CHANGED
@@ -28,7 +28,7 @@ def get_collection():
28
 
29
 
30
  # modules/db.py
31
- def get_indexed_channels(collection):
32
  results = collection.get(include=["metadatas"])
33
  channels = {}
34
 
 
28
 
29
 
30
  # modules/db.py
31
+ def get_indexed_channels(collection = get_collection()):
32
  results = collection.get(include=["metadatas"])
33
  channels = {}
34
 
pyproject.toml CHANGED
@@ -7,6 +7,7 @@ requires-python = ">=3.13"
7
  dependencies = [
8
  "chromadb>=1.0.20",
9
  "dotenv>=0.9.9",
 
10
  "google-api-python-client>=2.179.0",
11
  "gradio>=5.44.0",
12
  "gradio-modal>=0.0.4",
 
7
  dependencies = [
8
  "chromadb>=1.0.20",
9
  "dotenv>=0.9.9",
10
+ "feedparser>=6.0.11",
11
  "google-api-python-client>=2.179.0",
12
  "gradio>=5.44.0",
13
  "gradio-modal>=0.0.4",
requirements.txt CHANGED
@@ -59,6 +59,8 @@ durationpy==0.10
59
  # via kubernetes
60
  fastapi==0.116.1
61
  # via gradio
 
 
62
  ffmpy==0.6.1
63
  # via gradio
64
  filelock==3.19.1
@@ -295,6 +297,8 @@ safehttpx==0.1.6
295
  # via gradio
296
  semantic-version==2.10.0
297
  # via gradio
 
 
298
  shellingham==1.5.4
299
  # via typer
300
  six==1.17.0
 
59
  # via kubernetes
60
  fastapi==0.116.1
61
  # via gradio
62
+ feedparser==6.0.11
63
+ # via youtube-surfer-ai-agent (pyproject.toml)
64
  ffmpy==0.6.1
65
  # via gradio
66
  filelock==3.19.1
 
297
  # via gradio
298
  semantic-version==2.10.0
299
  # via gradio
300
+ sgmllib3k==1.0.0
301
+ # via feedparser
302
  shellingham==1.5.4
303
  # via typer
304
  six==1.17.0
uv.lock CHANGED
@@ -358,6 +358,18 @@ wheels = [
358
  { url = "https://files.pythonhosted.org/packages/e5/47/d63c60f59a59467fda0f93f46335c9d18526d7071f025cb5b89d5353ea42/fastapi-0.116.1-py3-none-any.whl", hash = "sha256:c46ac7c312df840f0c9e220f7964bada936781bc4e2e6eb71f1c4d7553786565", size = 95631, upload-time = "2025-07-11T16:22:30.485Z" },
359
  ]
360
 
 
 
 
 
 
 
 
 
 
 
 
 
361
  [[package]]
362
  name = "ffmpy"
363
  version = "0.6.1"
@@ -1722,6 +1734,12 @@ wheels = [
1722
  { url = "https://files.pythonhosted.org/packages/6a/23/8146aad7d88f4fcb3a6218f41a60f6c2d4e3a72de72da1825dc7c8f7877c/semantic_version-2.10.0-py2.py3-none-any.whl", hash = "sha256:de78a3b8e0feda74cabc54aab2da702113e33ac9d9eb9d2389bcf1f58b7d9177", size = 15552, upload-time = "2022-05-26T13:35:21.206Z" },
1723
  ]
1724
 
 
 
 
 
 
 
1725
  [[package]]
1726
  name = "shellingham"
1727
  version = "1.5.4"
@@ -2019,6 +2037,7 @@ source = { virtual = "." }
2019
  dependencies = [
2020
  { name = "chromadb" },
2021
  { name = "dotenv" },
 
2022
  { name = "google-api-python-client" },
2023
  { name = "gradio" },
2024
  { name = "gradio-modal" },
@@ -2029,6 +2048,7 @@ dependencies = [
2029
  requires-dist = [
2030
  { name = "chromadb", specifier = ">=1.0.20" },
2031
  { name = "dotenv", specifier = ">=0.9.9" },
 
2032
  { name = "google-api-python-client", specifier = ">=2.179.0" },
2033
  { name = "gradio", specifier = ">=5.44.0" },
2034
  { name = "gradio-modal", specifier = ">=0.0.4" },
 
358
  { url = "https://files.pythonhosted.org/packages/e5/47/d63c60f59a59467fda0f93f46335c9d18526d7071f025cb5b89d5353ea42/fastapi-0.116.1-py3-none-any.whl", hash = "sha256:c46ac7c312df840f0c9e220f7964bada936781bc4e2e6eb71f1c4d7553786565", size = 95631, upload-time = "2025-07-11T16:22:30.485Z" },
359
  ]
360
 
361
+ [[package]]
362
+ name = "feedparser"
363
+ version = "6.0.11"
364
+ source = { registry = "https://pypi.org/simple" }
365
+ dependencies = [
366
+ { name = "sgmllib3k" },
367
+ ]
368
+ sdist = { url = "https://files.pythonhosted.org/packages/ff/aa/7af346ebeb42a76bf108027fe7f3328bb4e57a3a96e53e21fd9ef9dd6dd0/feedparser-6.0.11.tar.gz", hash = "sha256:c9d0407b64c6f2a065d0ebb292c2b35c01050cc0dc33757461aaabdc4c4184d5", size = 286197, upload-time = "2023-12-10T16:03:20.854Z" }
369
+ wheels = [
370
+ { url = "https://files.pythonhosted.org/packages/7c/d4/8c31aad9cc18f451c49f7f9cfb5799dadffc88177f7917bc90a66459b1d7/feedparser-6.0.11-py3-none-any.whl", hash = "sha256:0be7ee7b395572b19ebeb1d6aafb0028dee11169f1c934e0ed67d54992f4ad45", size = 81343, upload-time = "2023-12-10T16:03:19.484Z" },
371
+ ]
372
+
373
  [[package]]
374
  name = "ffmpy"
375
  version = "0.6.1"
 
1734
  { url = "https://files.pythonhosted.org/packages/6a/23/8146aad7d88f4fcb3a6218f41a60f6c2d4e3a72de72da1825dc7c8f7877c/semantic_version-2.10.0-py2.py3-none-any.whl", hash = "sha256:de78a3b8e0feda74cabc54aab2da702113e33ac9d9eb9d2389bcf1f58b7d9177", size = 15552, upload-time = "2022-05-26T13:35:21.206Z" },
1735
  ]
1736
 
1737
+ [[package]]
1738
+ name = "sgmllib3k"
1739
+ version = "1.0.0"
1740
+ source = { registry = "https://pypi.org/simple" }
1741
+ sdist = { url = "https://files.pythonhosted.org/packages/9e/bd/3704a8c3e0942d711c1299ebf7b9091930adae6675d7c8f476a7ce48653c/sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9", size = 5750, upload-time = "2010-08-24T14:33:52.445Z" }
1742
+
1743
  [[package]]
1744
  name = "shellingham"
1745
  version = "1.5.4"
 
2037
  dependencies = [
2038
  { name = "chromadb" },
2039
  { name = "dotenv" },
2040
+ { name = "feedparser" },
2041
  { name = "google-api-python-client" },
2042
  { name = "gradio" },
2043
  { name = "gradio-modal" },
 
2048
  requires-dist = [
2049
  { name = "chromadb", specifier = ">=1.0.20" },
2050
  { name = "dotenv", specifier = ">=0.9.9" },
2051
+ { name = "feedparser", specifier = ">=6.0.11" },
2052
  { name = "google-api-python-client", specifier = ">=2.179.0" },
2053
  { name = "gradio", specifier = ">=5.44.0" },
2054
  { name = "gradio-modal", specifier = ">=0.0.4" },
youtube_poller.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import feedparser
2
+ from modules.db import get_collection, get_indexed_channels
3
+
4
+
5
def fetch_channel_videos_rss(channel_id, max_results=50):
    """Fetch the latest uploads for a channel from its public YouTube RSS feed.

    Args:
        channel_id: YouTube channel id (the `UC...` form used by the feed URL).
        max_results: Cap on how many feed entries to return (the feed itself
            only exposes the most recent uploads).

    Returns:
        A list of dicts with keys: video_id, title, published, link, channel_id.
    """
    feed_url = f"https://www.youtube.com/feeds/videos.xml?channel_id={channel_id}"
    parsed = feedparser.parse(feed_url)
    return [
        {
            "video_id": entry.yt_videoid,
            "title": entry.title,
            "published": entry.published,
            "link": entry.link,
            "channel_id": channel_id,
        }
        for entry in parsed.entries[:max_results]
    ]
20
+
21
+
22
def get_existing_video_ids(collection, channel_id):
    """Return the set of video_ids already stored for *channel_id*.

    Queries the collection's metadata (no document payloads needed) and
    skips entries whose metadata is missing or lacks a video_id.
    """
    results = collection.get(where={"channel_id": channel_id})
    return {
        meta["video_id"]
        for meta in results.get("metadatas", [])
        if meta and "video_id" in meta
    }
31
+
32
+
33
def filter_new_videos(videos, existing_ids):
    """Return only the videos whose video_id is not already in *existing_ids*."""
    unseen = []
    for video in videos:
        if video["video_id"] not in existing_ids:
            unseen.append(video)
    return unseen
35
+
36
+
37
def add_to_chroma(collection, new_videos):
    """Insert *new_videos* into the Chroma collection.

    Titles become the documents, video_id doubles as the record id, and a
    slim metadata dict (video_id, channel_id, link) is stored per video.
    A no-op for an empty list.
    """
    if not new_videos:
        return
    titles = []
    metadatas = []
    ids = []
    for video in new_videos:
        titles.append(video["title"])
        metadatas.append(
            {
                "video_id": video["video_id"],
                "channel_id": video["channel_id"],
                "link": video["link"],
            }
        )
        ids.append(video["video_id"])
    collection.add(documents=titles, metadatas=metadatas, ids=ids)
52
+
53
+
54
def incremental_update(collection, channel_id):
    """Fetch the channel's RSS feed and index any videos not yet in *collection*."""
    latest = fetch_channel_videos_rss(channel_id)
    known = get_existing_video_ids(collection, channel_id)
    fresh = filter_new_videos(latest, known)

    if not fresh:
        print(f"No new videos for {channel_id}")
        return
    add_to_chroma(collection, fresh)
    print(f"Added {len(fresh)} new videos from {channel_id}")
64
+
65
+
66
def start_poll(interval_seconds=600):
    """Poll indexed channels' RSS feeds forever, indexing new videos.

    Intended to run in a daemon thread (see app.py). Improvements over a
    naive loop:
      * The channel list is re-read on every cycle, so channels indexed
        after startup are picked up without a restart (the original read
        it once before the loop and never again).
      * Each channel update is wrapped in try/except so one bad feed or a
        transient DB error does not silently kill the polling thread.
      * The collection is passed to get_indexed_channels explicitly rather
        than relying on its import-time-evaluated default argument.

    Args:
        interval_seconds: Seconds to sleep between polling cycles
            (default 600 = 10 minutes, matching the original behavior).
    """
    import time  # local import, as in the original block

    while True:
        collection = get_collection()
        # Re-read the channel list each cycle so newly indexed channels
        # start being polled without restarting the app.
        for channel_id in get_indexed_channels(collection).keys():
            try:
                incremental_update(collection, channel_id)
            except Exception as exc:  # keep the poller alive on per-channel failures
                print(f"Polling failed for {channel_id}: {exc}")
        time.sleep(interval_seconds)