Upload folder using huggingface_hub
Browse files- app.py +6 -0
- modules/db.py +1 -1
- pyproject.toml +1 -0
- requirements.txt +4 -0
- uv.lock +20 -0
- youtube_poller.py +74 -0
app.py
CHANGED
@@ -1,5 +1,7 @@
|
|
|
|
1 |
import os
|
2 |
import re
|
|
|
3 |
import gradio as gr
|
4 |
from gradio_modal import Modal
|
5 |
import chromadb
|
@@ -13,6 +15,7 @@ from modules.indexer import index_videos
|
|
13 |
from modules.answerer import answer_query, LLMAnswer, VideoItem, build_video_html
|
14 |
from dotenv import load_dotenv
|
15 |
|
|
|
16 |
from youtube_sync import sync_channels_from_youtube
|
17 |
|
18 |
load_dotenv()
|
@@ -545,4 +548,7 @@ with gr.Blocks() as demo:
|
|
545 |
if __name__ == "__main__":
|
546 |
for msg in init():
|
547 |
print(msg)
|
|
|
|
|
|
|
548 |
demo.launch()
|
|
|
1 |
+
import asyncio
|
2 |
import os
|
3 |
import re
|
4 |
+
import threading
|
5 |
import gradio as gr
|
6 |
from gradio_modal import Modal
|
7 |
import chromadb
|
|
|
15 |
from modules.answerer import answer_query, LLMAnswer, VideoItem, build_video_html
|
16 |
from dotenv import load_dotenv
|
17 |
|
18 |
+
from youtube_poller import start_poll
|
19 |
from youtube_sync import sync_channels_from_youtube
|
20 |
|
21 |
load_dotenv()
|
|
|
548 |
if __name__ == "__main__":
|
549 |
for msg in init():
|
550 |
print(msg)
|
551 |
+
# Start polling in a background thread
|
552 |
+
poll_thread = threading.Thread(target=start_poll, daemon=True)
|
553 |
+
poll_thread.start()
|
554 |
demo.launch()
|
modules/db.py
CHANGED
@@ -28,7 +28,7 @@ def get_collection():
|
|
28 |
|
29 |
|
30 |
# modules/db.py
|
31 |
-
def get_indexed_channels(collection):
|
32 |
results = collection.get(include=["metadatas"])
|
33 |
channels = {}
|
34 |
|
|
|
28 |
|
29 |
|
30 |
# modules/db.py
|
31 |
+
def get_indexed_channels(collection = get_collection()):
|
32 |
results = collection.get(include=["metadatas"])
|
33 |
channels = {}
|
34 |
|
pyproject.toml
CHANGED
@@ -7,6 +7,7 @@ requires-python = ">=3.13"
|
|
7 |
dependencies = [
|
8 |
"chromadb>=1.0.20",
|
9 |
"dotenv>=0.9.9",
|
|
|
10 |
"google-api-python-client>=2.179.0",
|
11 |
"gradio>=5.44.0",
|
12 |
"gradio-modal>=0.0.4",
|
|
|
7 |
dependencies = [
|
8 |
"chromadb>=1.0.20",
|
9 |
"dotenv>=0.9.9",
|
10 |
+
"feedparser>=6.0.11",
|
11 |
"google-api-python-client>=2.179.0",
|
12 |
"gradio>=5.44.0",
|
13 |
"gradio-modal>=0.0.4",
|
requirements.txt
CHANGED
@@ -59,6 +59,8 @@ durationpy==0.10
|
|
59 |
# via kubernetes
|
60 |
fastapi==0.116.1
|
61 |
# via gradio
|
|
|
|
|
62 |
ffmpy==0.6.1
|
63 |
# via gradio
|
64 |
filelock==3.19.1
|
@@ -295,6 +297,8 @@ safehttpx==0.1.6
|
|
295 |
# via gradio
|
296 |
semantic-version==2.10.0
|
297 |
# via gradio
|
|
|
|
|
298 |
shellingham==1.5.4
|
299 |
# via typer
|
300 |
six==1.17.0
|
|
|
59 |
# via kubernetes
|
60 |
fastapi==0.116.1
|
61 |
# via gradio
|
62 |
+
feedparser==6.0.11
|
63 |
+
# via youtube-surfer-ai-agent (pyproject.toml)
|
64 |
ffmpy==0.6.1
|
65 |
# via gradio
|
66 |
filelock==3.19.1
|
|
|
297 |
# via gradio
|
298 |
semantic-version==2.10.0
|
299 |
# via gradio
|
300 |
+
sgmllib3k==1.0.0
|
301 |
+
# via feedparser
|
302 |
shellingham==1.5.4
|
303 |
# via typer
|
304 |
six==1.17.0
|
uv.lock
CHANGED
@@ -358,6 +358,18 @@ wheels = [
|
|
358 |
{ url = "https://files.pythonhosted.org/packages/e5/47/d63c60f59a59467fda0f93f46335c9d18526d7071f025cb5b89d5353ea42/fastapi-0.116.1-py3-none-any.whl", hash = "sha256:c46ac7c312df840f0c9e220f7964bada936781bc4e2e6eb71f1c4d7553786565", size = 95631, upload-time = "2025-07-11T16:22:30.485Z" },
|
359 |
]
|
360 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
361 |
[[package]]
|
362 |
name = "ffmpy"
|
363 |
version = "0.6.1"
|
@@ -1722,6 +1734,12 @@ wheels = [
|
|
1722 |
{ url = "https://files.pythonhosted.org/packages/6a/23/8146aad7d88f4fcb3a6218f41a60f6c2d4e3a72de72da1825dc7c8f7877c/semantic_version-2.10.0-py2.py3-none-any.whl", hash = "sha256:de78a3b8e0feda74cabc54aab2da702113e33ac9d9eb9d2389bcf1f58b7d9177", size = 15552, upload-time = "2022-05-26T13:35:21.206Z" },
|
1723 |
]
|
1724 |
|
|
|
|
|
|
|
|
|
|
|
|
|
1725 |
[[package]]
|
1726 |
name = "shellingham"
|
1727 |
version = "1.5.4"
|
@@ -2019,6 +2037,7 @@ source = { virtual = "." }
|
|
2019 |
dependencies = [
|
2020 |
{ name = "chromadb" },
|
2021 |
{ name = "dotenv" },
|
|
|
2022 |
{ name = "google-api-python-client" },
|
2023 |
{ name = "gradio" },
|
2024 |
{ name = "gradio-modal" },
|
@@ -2029,6 +2048,7 @@ dependencies = [
|
|
2029 |
requires-dist = [
|
2030 |
{ name = "chromadb", specifier = ">=1.0.20" },
|
2031 |
{ name = "dotenv", specifier = ">=0.9.9" },
|
|
|
2032 |
{ name = "google-api-python-client", specifier = ">=2.179.0" },
|
2033 |
{ name = "gradio", specifier = ">=5.44.0" },
|
2034 |
{ name = "gradio-modal", specifier = ">=0.0.4" },
|
|
|
358 |
{ url = "https://files.pythonhosted.org/packages/e5/47/d63c60f59a59467fda0f93f46335c9d18526d7071f025cb5b89d5353ea42/fastapi-0.116.1-py3-none-any.whl", hash = "sha256:c46ac7c312df840f0c9e220f7964bada936781bc4e2e6eb71f1c4d7553786565", size = 95631, upload-time = "2025-07-11T16:22:30.485Z" },
|
359 |
]
|
360 |
|
361 |
+
[[package]]
|
362 |
+
name = "feedparser"
|
363 |
+
version = "6.0.11"
|
364 |
+
source = { registry = "https://pypi.org/simple" }
|
365 |
+
dependencies = [
|
366 |
+
{ name = "sgmllib3k" },
|
367 |
+
]
|
368 |
+
sdist = { url = "https://files.pythonhosted.org/packages/ff/aa/7af346ebeb42a76bf108027fe7f3328bb4e57a3a96e53e21fd9ef9dd6dd0/feedparser-6.0.11.tar.gz", hash = "sha256:c9d0407b64c6f2a065d0ebb292c2b35c01050cc0dc33757461aaabdc4c4184d5", size = 286197, upload-time = "2023-12-10T16:03:20.854Z" }
|
369 |
+
wheels = [
|
370 |
+
{ url = "https://files.pythonhosted.org/packages/7c/d4/8c31aad9cc18f451c49f7f9cfb5799dadffc88177f7917bc90a66459b1d7/feedparser-6.0.11-py3-none-any.whl", hash = "sha256:0be7ee7b395572b19ebeb1d6aafb0028dee11169f1c934e0ed67d54992f4ad45", size = 81343, upload-time = "2023-12-10T16:03:19.484Z" },
|
371 |
+
]
|
372 |
+
|
373 |
[[package]]
|
374 |
name = "ffmpy"
|
375 |
version = "0.6.1"
|
|
|
1734 |
{ url = "https://files.pythonhosted.org/packages/6a/23/8146aad7d88f4fcb3a6218f41a60f6c2d4e3a72de72da1825dc7c8f7877c/semantic_version-2.10.0-py2.py3-none-any.whl", hash = "sha256:de78a3b8e0feda74cabc54aab2da702113e33ac9d9eb9d2389bcf1f58b7d9177", size = 15552, upload-time = "2022-05-26T13:35:21.206Z" },
|
1735 |
]
|
1736 |
|
1737 |
+
[[package]]
|
1738 |
+
name = "sgmllib3k"
|
1739 |
+
version = "1.0.0"
|
1740 |
+
source = { registry = "https://pypi.org/simple" }
|
1741 |
+
sdist = { url = "https://files.pythonhosted.org/packages/9e/bd/3704a8c3e0942d711c1299ebf7b9091930adae6675d7c8f476a7ce48653c/sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9", size = 5750, upload-time = "2010-08-24T14:33:52.445Z" }
|
1742 |
+
|
1743 |
[[package]]
|
1744 |
name = "shellingham"
|
1745 |
version = "1.5.4"
|
|
|
2037 |
dependencies = [
|
2038 |
{ name = "chromadb" },
|
2039 |
{ name = "dotenv" },
|
2040 |
+
{ name = "feedparser" },
|
2041 |
{ name = "google-api-python-client" },
|
2042 |
{ name = "gradio" },
|
2043 |
{ name = "gradio-modal" },
|
|
|
2048 |
requires-dist = [
|
2049 |
{ name = "chromadb", specifier = ">=1.0.20" },
|
2050 |
{ name = "dotenv", specifier = ">=0.9.9" },
|
2051 |
+
{ name = "feedparser", specifier = ">=6.0.11" },
|
2052 |
{ name = "google-api-python-client", specifier = ">=2.179.0" },
|
2053 |
{ name = "gradio", specifier = ">=5.44.0" },
|
2054 |
{ name = "gradio-modal", specifier = ">=0.0.4" },
|
youtube_poller.py
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import feedparser
|
2 |
+
from modules.db import get_collection, get_indexed_channels
|
3 |
+
|
4 |
+
|
5 |
+
def fetch_channel_videos_rss(channel_id, max_results=50):
    """Fetch the most recent uploads listed in a YouTube channel's video feed.

    Args:
        channel_id: YouTube channel ID interpolated into the feed URL.
        max_results: Upper bound on the number of feed entries returned.

    Returns:
        A list of dicts, in feed order, each with the keys ``video_id``,
        ``title``, ``published``, ``link`` and ``channel_id``.
    """
    parsed = feedparser.parse(
        f"https://www.youtube.com/feeds/videos.xml?channel_id={channel_id}"
    )
    return [
        {
            "video_id": item.yt_videoid,
            "title": item.title,
            "published": item.published,
            "link": item.link,
            # The feed is per-channel, so tag every entry with the requested ID.
            "channel_id": channel_id,
        }
        for item in parsed.entries[:max_results]
    ]
|
20 |
+
|
21 |
+
|
22 |
+
def get_existing_video_ids(collection, channel_id):
    """Return the set of video IDs already stored for a channel.

    Args:
        collection: Chroma collection (any object exposing a compatible
            ``get(where=..., include=...)`` method works).
        channel_id: Value matched against the ``channel_id`` metadata field.

    Returns:
        A set with the ``video_id`` metadata value of every matching record.
        Records with no metadata, or whose metadata lacks ``video_id``, are
        ignored.
    """
    # Only metadata is inspected here, so do not load the documents as well.
    results = collection.get(
        where={"channel_id": channel_id},
        include=["metadatas"],
    )
    # "metadatas" may be absent or None; both mean "nothing stored yet".
    metadatas = results.get("metadatas") or []
    return {
        metadata["video_id"]
        for metadata in metadatas
        if metadata and "video_id" in metadata
    }
|
31 |
+
|
32 |
+
|
33 |
+
def filter_new_videos(videos, existing_ids):
    """Return the videos whose ``video_id`` is not in ``existing_ids``.

    Args:
        videos: Iterable of dicts that each carry a ``video_id`` key.
        existing_ids: Container of already known video IDs.

    Returns:
        A new list with the unseen videos, in their original order.
    """
    unseen = []
    for video in videos:
        if video["video_id"] in existing_ids:
            continue
        unseen.append(video)
    return unseen
|
35 |
+
|
36 |
+
|
37 |
+
def add_to_chroma(collection, new_videos):
    """Add video records to the collection, using each title as the document.

    Args:
        collection: Object exposing ``add(documents=..., metadatas=..., ids=...)``.
        new_videos: Sequence of dicts with the keys ``video_id``, ``title``,
            ``channel_id`` and ``link``.

    Returns:
        None. Nothing is sent to the collection when ``new_videos`` is empty.
    """
    if not new_videos:
        return

    titles = []
    metadata_rows = []
    record_ids = []
    for video in new_videos:
        titles.append(video["title"])
        metadata_rows.append(
            {
                "video_id": video["video_id"],
                "channel_id": video["channel_id"],
                "link": video["link"],
            }
        )
        # The video ID doubles as the record ID in the collection.
        record_ids.append(video["video_id"])

    collection.add(documents=titles, metadatas=metadata_rows, ids=record_ids)
|
52 |
+
|
53 |
+
|
54 |
+
def incremental_update(collection, channel_id):
    """Index any feed videos of a channel that the collection does not hold yet.

    Reads the stored video IDs for ``channel_id``, fetches the channel's
    current feed, and adds the videos that are not stored. A one-line summary
    is printed in either case.

    Args:
        collection: Collection that is queried and, if needed, extended.
        channel_id: YouTube channel ID to refresh.
    """
    known_ids = get_existing_video_ids(collection, channel_id)
    fresh = filter_new_videos(fetch_channel_videos_rss(channel_id), known_ids)

    if not fresh:
        print(f"No new videos for {channel_id}")
        return

    add_to_chroma(collection, fresh)
    print(f"Added {len(fresh)} new videos from {channel_id}")
|
64 |
+
|
65 |
+
|
66 |
+
def start_poll(interval_seconds=600):
    """Poll every indexed channel for new videos, forever.

    Intended to run as the target of a background thread; it never returns.

    Each cycle re-reads the indexed channels, so channels indexed after the
    poller started are picked up on the next pass. A failure while loading the
    channels or while updating one channel is reported and skipped, so a
    single bad feed or transient error does not end polling.

    Args:
        interval_seconds: Pause between polling cycles. Defaults to 600
            (10 minutes).
    """
    import time

    while True:
        try:
            collection = get_collection()
            # Pass the collection explicitly rather than relying on the
            # default argument of get_indexed_channels, and snapshot the IDs
            # so the loop below iterates a stable list.
            # NOTE(review): assumes get_indexed_channels returns a mapping
            # keyed by channel ID (the original called .keys()) -- confirm.
            channel_ids = list(get_indexed_channels(collection))
        except Exception as exc:
            # Thread boundary: report and retry on the next cycle instead of
            # letting the poller thread die silently.
            print(f"Could not load channels to poll: {exc}")
            channel_ids = []

        for channel_id in channel_ids:
            try:
                incremental_update(collection, channel_id)
            except Exception as exc:
                # Keep polling the remaining channels if one of them fails.
                print(f"Polling failed for {channel_id}: {exc}")

        time.sleep(interval_seconds)
|