vikramvasudevan committed on
Commit
f315fdc
·
verified ·
1 Parent(s): 180b122

Upload folder using huggingface_hub

Browse files
.github/workflows/update_space.yml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Deploys this repository to its Hugging Face Space on every push to main.
name: Run Python script

on:
  push:
    branches:
      - main

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.9'

      - name: Install Gradio
        run: python -m pip install gradio

      - name: Log in to Hugging Face
        # The secret is handed over through the environment rather than being
        # interpolated into the script text, so its value can never be parsed
        # as shell or Python source.
        env:
          SPACE_DEPLOY_TOKEN: ${{ secrets.hf_token }}
        run: python -c 'import os, huggingface_hub; huggingface_hub.login(token=os.environ["SPACE_DEPLOY_TOKEN"])'

      - name: Deploy to Spaces
        run: gradio deploy
README.md CHANGED
@@ -1,102 +1,102 @@
1
- ---
2
- title: youtube-channel-surfer-ai
3
- license: mit
4
- emoji: "📺"
5
- app_file: "app.py"
6
- sdk: "gradio"
7
- pinned: false
8
- python_version: 3.13
9
- ---
10
-
11
- # 📺 YouTube Metadata Q&A Agent
12
-
13
- This application allows you to index YouTube channels and ask natural language questions about the videos. It leverages **OpenAI embeddings** and **GPT-4o-mini** to provide insightful answers based on video metadata (titles + descriptions), and it displays top relevant videos in a clean, interactive table.
14
-
15
- ---
16
-
17
- ## Features
18
-
19
- - **Index YouTube Channels**: Provide one or more YouTube channel URLs to index video metadata.
20
- - **Search & Answer Questions**: Ask questions about channel content and get answers generated by an LLM.
21
- - **Top Video Results**: View top relevant videos in a structured HTML table with clickable links.
22
- - **Embedded Video Player**: Watch videos directly in the app using YouTube embeds.
23
- - **Refresh Channels**: Update previously indexed channels to include the latest videos.
24
- - **Lightweight Storage**: Uses a local **ChromaDB** persistent database to store video embeddings for fast retrieval.
25
- - **Structured LLM Output**: LLM returns structured `LLMAnswer` objects with textual answer + top videos for clean rendering.
26
-
27
- ---
28
-
29
- ## How it Works
30
-
31
- 1. **Channel Indexing**:
32
- - The app fetches the latest videos from provided YouTube channels using the YouTube Data API.
33
- - Video metadata (title, description, channel, video ID) is embedded with OpenAI embeddings and stored in ChromaDB.
34
-
35
- 2. **Query & Retrieval**:
36
- - User queries are embedded and compared with stored video embeddings.
37
- - Top matching videos are retrieved.
38
-
39
- 3. **Answer Generation**:
40
- - The LLM generates an answer based on the top video metadata.
41
- - The answer and top videos are returned as structured data (`LLMAnswer`).
42
-
43
- 4. **Rendering**:
44
- - Answer text is displayed in Markdown.
45
- - Top videos are displayed in a structured HTML table with clickable links and embedded YouTube players.
46
-
47
- ---
48
-
49
- ## Installation
50
-
51
- ## Steps to Run
52
-
53
- 1. **Clone the repository:**
54
-
55
- git clone <repo_url>
56
- cd youtube_surfer_ai_agent
57
-
58
- 2. **Create and activate a virtual environment:**
59
-
60
- - Linux/macOS:
61
-
62
- python -m venv .venv
63
- source .venv/bin/activate
64
-
65
- - Windows:
66
-
67
- python -m venv .venv
68
- .venv\Scripts\activate
69
-
70
- 3. **Install dependencies:**
71
-
72
- pip install -r requirements.txt
73
-
74
- 4. **Create a `.env` file** in the project root with your API keys:
75
-
76
- YOUTUBE_API_KEY=your_youtube_api_key
77
- OPENAI_API_KEY=your_openai_api_key
78
-
79
- 5. **Run the application:**
80
-
81
- python app.py
82
-
83
- 6. **Open the Gradio interface** in your browser (default: http://127.0.0.1:7860).
84
-
85
- ---
86
-
87
- ## How to Use
88
-
89
- - **Index Channels:** Paste one or more YouTube channel URLs (comma or newline separated) and click "Index Channels".
90
- - **Refresh Channels:** Use the sidebar "Refresh All Channels" button to update existing channels.
91
- - **Ask Questions:** Type a query in the text box and click "Get Answer" to receive a structured response with embedded videos.
92
- - **View Indexed Channels:** The sidebar lists all channels that have been indexed with clickable links.
93
-
94
- ---
95
-
96
- ## Notes
97
-
98
- - The LLM uses structured outputs (`LLMAnswer` + `VideoItem`) internally to produce consistent results.
99
- - Top videos are embedded as iframes in the Gradio interface.
100
- - You can adjust the number of top videos returned by modifying the `top_k` parameter in `answer_query`.
101
-
102
- ---
 
1
+ ---
2
+ title: youtube-channel-surfer-ai
3
+ license: mit
4
+ emoji: "📺"
5
+ app_file: "app.py"
6
+ sdk: "gradio"
7
+ pinned: false
8
+ python_version: 3.13
9
+ ---
10
+
11
+ # 📺 YouTube Metadata Q&A Agent
12
+
13
+ This application allows you to index YouTube channels and ask natural language questions about the videos. It leverages **OpenAI embeddings** and **GPT-4o-mini** to provide insightful answers based on video metadata (titles + descriptions), and it displays top relevant videos in a clean, interactive table.
14
+
15
+ ---
16
+
17
+ ## Features
18
+
19
+ - **Index YouTube Channels**: Provide one or more YouTube channel URLs to index video metadata.
20
+ - **Search & Answer Questions**: Ask questions about channel content and get answers generated by an LLM.
21
+ - **Top Video Results**: View top relevant videos in a structured HTML table with clickable links.
22
+ - **Embedded Video Player**: Watch videos directly in the app using YouTube embeds.
23
+ - **Refresh Channels**: Update previously indexed channels to include the latest videos.
24
+ - **Lightweight Storage**: Uses a local **ChromaDB** persistent database to store video embeddings for fast retrieval.
25
+ - **Structured LLM Output**: LLM returns structured `LLMAnswer` objects with textual answer + top videos for clean rendering.
26
+
27
+ ---
28
+
29
+ ## How it Works
30
+
31
+ 1. **Channel Indexing**:
32
+ - The app fetches the latest videos from provided YouTube channels using the YouTube Data API.
33
+ - Video metadata (title, description, channel, video ID) is embedded with OpenAI embeddings and stored in ChromaDB.
34
+
35
+ 2. **Query & Retrieval**:
36
+ - User queries are embedded and compared with stored video embeddings.
37
+ - Top matching videos are retrieved.
38
+
39
+ 3. **Answer Generation**:
40
+ - The LLM generates an answer based on the top video metadata.
41
+ - The answer and top videos are returned as structured data (`LLMAnswer`).
42
+
43
+ 4. **Rendering**:
44
+ - Answer text is displayed in Markdown.
45
+ - Top videos are displayed in a structured HTML table with clickable links and embedded YouTube players.
46
+
47
+ ---
48
+
49
+ ## Installation
50
+
51
+ ## Steps to Run
52
+
53
+ 1. **Clone the repository:**
54
+
55
+ git clone <repo_url>
56
+ cd youtube_surfer_ai_agent
57
+
58
+ 2. **Create and activate a virtual environment:**
59
+
60
+ - Linux/macOS:
61
+
62
+ python -m venv .venv
63
+ source .venv/bin/activate
64
+
65
+ - Windows:
66
+
67
+ python -m venv .venv
68
+ .venv\Scripts\activate
69
+
70
+ 3. **Install dependencies:**
71
+
72
+ pip install -r requirements.txt
73
+
74
+ 4. **Create a `.env` file** in the project root with your API keys:
75
+
76
+ YOUTUBE_API_KEY=your_youtube_api_key
77
+ OPENAI_API_KEY=your_openai_api_key
78
+
79
+ 5. **Run the application:**
80
+
81
+ python app.py
82
+
83
+ 6. **Open the Gradio interface** in your browser (default: http://127.0.0.1:7860).
84
+
85
+ ---
86
+
87
+ ## How to Use
88
+
89
+ - **Index Channels:** Paste one or more YouTube channel URLs (comma or newline separated) and click "Index Channels".
90
+ - **Refresh Channels:** Use the sidebar "Refresh All Channels" button to update existing channels.
91
+ - **Ask Questions:** Type a query in the text box and click "Get Answer" to receive a structured response with embedded videos.
92
+ - **View Indexed Channels:** The sidebar lists all channels that have been indexed with clickable links.
93
+
94
+ ---
95
+
96
+ ## Notes
97
+
98
+ - The LLM uses structured outputs (`LLMAnswer` + `VideoItem`) internally to produce consistent results.
99
+ - Top videos are embedded as iframes in the Gradio interface.
100
+ - You can adjust the number of top videos returned by modifying the `top_k` parameter in `answer_query`.
101
+
102
+ ---
app.py CHANGED
@@ -1,149 +1,187 @@
1
- import os
2
- import re
3
- import gradio as gr
4
- import chromadb
5
- from modules.collector import fetch_channel_videos_from_url
6
- from modules.db import get_indexed_channels
7
- from modules.indexer import index_videos
8
- from modules.answerer import answer_query, LLMAnswer, VideoItem, build_video_html
9
- from dotenv import load_dotenv
10
-
11
- load_dotenv()
12
-
13
- # -------------------------------
14
- # Setup Chroma
15
- # -------------------------------
16
- client = chromadb.PersistentClient(path="./youtube_db")
17
- collection = client.get_or_create_collection("yt_metadata", embedding_function=None)
18
-
19
-
20
- # -------------------------------
21
- # Utils
22
- # -------------------------------
23
- def refresh_channel(api_key, channel_url: str):
24
- """Fetch + re-index a single channel."""
25
- videos = fetch_channel_videos_from_url(api_key, channel_url)
26
- for v in videos:
27
- v["channel_url"] = channel_url
28
- index_videos(videos, collection, channel_url=channel_url)
29
- return len(videos)
30
-
31
-
32
- def index_channels(channel_urls: str):
33
- yt_api_key = os.environ["YOUTUBE_API_KEY"]
34
- urls = [u.strip() for u in re.split(r"[\n,]+", channel_urls) if u.strip()]
35
- total_videos = sum(refresh_channel(yt_api_key, url) for url in urls)
36
- return (
37
- f"✅ Indexed {total_videos} videos from {len(urls)} channels.",
38
- list_channels(),
39
- )
40
-
41
-
42
- def list_channels():
43
- channels = get_indexed_channels(collection)
44
- if not channels:
45
- return "No channels indexed yet."
46
- md = []
47
- for key, val in channels.items():
48
- if isinstance(val, dict):
49
- cname = val.get("channel_title", "Unknown")
50
- curl = val.get("channel_url", None)
51
- else:
52
- cname = val
53
- curl = key
54
- if curl:
55
- md.append(f"- **{cname}** ([link]({curl}))")
56
- else:
57
- md.append(f"- **{cname}**")
58
- return "\n".join(md)
59
-
60
-
61
- def refresh_all_channels():
62
- yt_api_key = os.environ["YOUTUBE_API_KEY"]
63
- channels = get_indexed_channels(collection)
64
- if not channels:
65
- return "⚠️ No channels available to refresh.", list_channels()
66
- total_videos = 0
67
- for key, val in channels.items():
68
- url = val.get("channel_url") if isinstance(val, dict) else key
69
- if url:
70
- total_videos += refresh_channel(yt_api_key, url)
71
- return (
72
- f"🔄 Refreshed {len(channels)} channels, re-indexed {total_videos} videos.",
73
- list_channels(),
74
- )
75
-
76
-
77
- def handle_query(query: str):
78
- (answer_text, video_html) = answer_query(query, collection) # returns LLMAnswer
79
- return answer_text, video_html
80
-
81
-
82
- # -------------------------------
83
- # Gradio UI
84
- # -------------------------------
85
- def show_component():
86
- return gr.update(visible=True)
87
- def hide_component():
88
- return gr.update(visible=False)
89
- def close_component():
90
- return gr.update(open=False)
91
- def open_component():
92
- return gr.update(open=True)
93
-
94
-
95
- with gr.Blocks() as demo:
96
- gr.Markdown("## 📺 YouTube Metadata Q&A Agent")
97
- from gradio_modal import Modal
98
-
99
- with Modal(visible=False) as add_channel_modal:
100
- channel_input = gr.Textbox(
101
- label="Channel URLs",
102
- placeholder="Paste one or more YouTube channel URLs (comma or newline separated)",
103
- )
104
- save_add_channels_btn = gr.Button("Add Channels")
105
- index_status = gr.Markdown(label="Index Status", container=False)
106
-
107
- with gr.Row():
108
- with gr.Sidebar() as my_sidebar:
109
- gr.Markdown("### 📺 Channels")
110
- channel_list = gr.Markdown(list_channels())
111
- with gr.Row():
112
- refresh_all_btn = gr.Button(
113
- "🔄 Refresh", size="sm", scale=0
114
- )
115
- add_channels_btn = gr.Button("+ Add", size="sm", scale=0)
116
- refresh_status = gr.Markdown(label="Refresh Status", container=False)
117
- refresh_all_btn.click(
118
- fn=refresh_all_channels,
119
- inputs=None,
120
- outputs=[refresh_status, channel_list],
121
- )
122
- add_channels_btn.click(close_component, outputs=[my_sidebar]).then(show_component, outputs=[add_channel_modal])
123
- save_add_channels_btn.click(
124
- index_channels,
125
- inputs=[channel_input],
126
- outputs=[index_status, channel_list],
127
- ).then(hide_component, outputs=[add_channel_modal]).then(open_component, outputs=[my_sidebar])
128
-
129
- with gr.Column(scale=3):
130
- question = gr.Textbox(
131
- label="Ask a Question",
132
- placeholder="e.g., What topics did they cover on AI ethics?",
133
- )
134
- gr.Examples(
135
- [
136
- "Show me some videos that mention Ranganatha.",
137
- "Slokas that mention gajendra moksham",
138
- ],
139
- inputs=question,
140
- )
141
-
142
- answer = gr.Markdown()
143
- video_embed = gr.HTML() # iframe embeds will render here
144
-
145
- ask_btn = gr.Button("Get Answer")
146
- ask_btn.click(handle_query, inputs=question, outputs=[answer, video_embed])
147
-
148
- if __name__ == "__main__":
149
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import gradio as gr
4
+ from gradio_modal import Modal
5
+ import chromadb
6
+ from modules.collector import fetch_channel_videos_from_url
7
+ from modules.db import get_indexed_channels
8
+ from modules.indexer import index_videos
9
+ from modules.answerer import answer_query, LLMAnswer, VideoItem, build_video_html
10
+ from dotenv import load_dotenv
11
+
12
# Pull settings such as API keys from a local .env file into the environment.
load_dotenv()

# -------------------------------
# Setup Chroma
# -------------------------------
# On-disk store under ./youtube_db. No embedding function is attached because
# the indexer passes pre-computed embeddings explicitly.
client = chromadb.PersistentClient(path="./youtube_db")
collection = client.get_or_create_collection(
    "yt_metadata",
    embedding_function=None,
)
19
+
20
+
21
+ # -------------------------------
22
+ # Utils
23
+ # -------------------------------
24
def refresh_channel(api_key, channel_url: str):
    """Fetch one channel's videos and (re-)index them.

    Every fetched video dict is tagged with ``channel_url`` before the batch
    is handed to ``index_videos`` together with the module-level collection.

    Returns the number of videos that were fetched.
    """
    fetched = fetch_channel_videos_from_url(api_key, channel_url)
    for video in fetched:
        video["channel_url"] = channel_url
    index_videos(fetched, collection, channel_url=channel_url)
    return len(fetched)
31
+
32
+
33
def index_channels(channel_urls: str):
    """Index every channel listed in ``channel_urls`` (generator handler).

    ``channel_urls`` is split on commas and newlines; blank entries are
    dropped. Two ``(status, channel_list)`` pairs are yielded: a progress
    message that leaves the channel list untouched, then the final summary
    with the refreshed channel list.
    """
    # Immediate feedback; gr.update() with no arguments changes nothing.
    yield "saving ...", gr.update()

    api_key = os.environ["YOUTUBE_API_KEY"]
    pieces = (piece.strip() for piece in re.split(r"[\n,]+", channel_urls))
    urls = [piece for piece in pieces if piece]

    total_videos = 0
    for url in urls:
        total_videos += refresh_channel(api_key, url)

    summary = f"✅ Indexed {total_videos} videos from {len(urls)} channels."
    yield summary, list_channels()
43
+
44
+
45
def list_channels():
    """Render the indexed channels as a Markdown bullet list.

    Returns a fixed placeholder sentence when nothing has been indexed yet.
    """
    channels = get_indexed_channels(collection)
    if not channels:
        return "No channels indexed yet."

    def _bullet(key, val):
        # Values may be a metadata dict or a plain title string; in the
        # latter case the mapping key doubles as the link target.
        if isinstance(val, dict):
            name = val.get("channel_title", "Unknown")
            link = val.get("channel_url", None)
        else:
            name, link = val, key
        return f"- **{name}** ([link]({link}))" if link else f"- **{name}**"

    return "\n".join(_bullet(key, val) for key, val in channels.items())
62
+
63
+
64
def refresh_all_channels():
    """Re-index every channel that is already present in the collection.

    Returns a ``(status_message, channel_list_markdown)`` pair.
    """
    api_key = os.environ["YOUTUBE_API_KEY"]
    channels = get_indexed_channels(collection)
    if not channels:
        return "⚠️ No channels available to refresh.", list_channels()

    # NOTE(review): get_indexed_channels maps channel_id -> title, so for the
    # non-dict case the value passed on is a channel id, not a URL. Confirm
    # fetch_channel_videos_from_url accepts that.
    targets = (
        val.get("channel_url") if isinstance(val, dict) else key
        for key, val in channels.items()
    )
    total_videos = sum(refresh_channel(api_key, url) for url in targets if url)

    # NOTE(review): the count below is len(channels), which also counts any
    # entry skipped above for lacking a URL.
    message = (
        f"🔄 Refreshed {len(channels)} channels, re-indexed {total_videos} videos."
    )
    return message, list_channels()
78
+
79
+
80
def handle_query(query: str):
    """Answer ``query`` against the indexed videos.

    Returns ``(answer_text, video_html)`` for the Markdown and HTML outputs.

    ``answer_query`` normally returns that pair directly, but its
    no-results path returns a bare ``LLMAnswer``. Unpacking a pydantic model
    into two names would silently produce ``(field_name, value)`` pairs, so
    that case is converted explicitly here.
    """
    result = answer_query(query, collection)
    if isinstance(result, LLMAnswer):
        return result.answer_text, build_video_html(result.top_videos)
    answer_text, video_html = result
    return answer_text, video_html
83
+
84
+
85
+ # -------------------------------
86
+ # Gradio UI
87
+ # -------------------------------
88
def show_component():
    """Return an update that makes the target component visible."""
    return gr.update(visible=True)


def hide_component():
    """Return an update that hides the target component."""
    return gr.update(visible=False)


def close_component():
    """Return an update that sets ``open=False`` on the target component."""
    return gr.update(open=False)


def open_component():
    """Return an update that sets ``open=True`` on the target component."""
    return gr.update(open=True)


def disable_component():
    """Return an update that makes the target component non-interactive."""
    return gr.update(interactive=False)


def enable_component():
    """Return an update that makes the target component interactive again."""
    return gr.update(interactive=True)


def clear_component():
    """Return an update that resets the target component's value to ""."""
    return gr.update(value="")


def show_loading():
    """Return an update that sets the target component's value to "loading"."""
    return gr.update(value="loading")
118
+
119
+
120
# NOTE(review): the nesting of the layout containers below was reconstructed
# from a rendering that had lost its indentation — confirm against the repo.
with gr.Blocks() as demo:
    gr.Markdown("## 📺 YouTube Metadata Q&A Agent")

    # Hidden dialog used to add new channels.
    with Modal(visible=False) as add_channel_modal:
        channel_input = gr.Textbox(
            label="Channel URLs",
            placeholder="Paste one or more YouTube channel URLs (comma or newline separated)",
        )
        save_add_channels_btn = gr.Button("Add Channels")
        index_status = gr.Markdown(label="Index Status", container=False)

    with gr.Row():
        with gr.Sidebar() as my_sidebar:
            gr.Markdown("### 📺 Channels")
            channel_list = gr.Markdown(list_channels())
            with gr.Row():
                refresh_all_btn = gr.Button("🔄 Refresh", size="sm", scale=0)
                add_channels_btn = gr.Button("+ Add", size="sm", scale=0)
            refresh_status = gr.Markdown(label="Refresh Status", container=False)

            refresh_all_btn.click(
                fn=refresh_all_channels,
                inputs=None,
                outputs=[refresh_status, channel_list],
            )

            # "+ Add": collapse the sidebar, then reveal the dialog.
            add_channels_btn.click(
                close_component, outputs=[my_sidebar]
            ).then(
                show_component, outputs=[add_channel_modal]
            )

            # "Add Channels": lock the button, index, close the dialog,
            # reopen the sidebar, unlock the button.
            save_add_channels_btn.click(
                disable_component, outputs=[save_add_channels_btn]
            ).then(
                index_channels,
                inputs=[channel_input],
                outputs=[index_status, channel_list],
            ).then(
                hide_component, outputs=[add_channel_modal]
            ).then(
                open_component, outputs=[my_sidebar]
            ).then(
                enable_component, outputs=[save_add_channels_btn]
            )

        with gr.Column(scale=3):
            question = gr.Textbox(
                label="Ask a Question",
                placeholder="e.g., What topics did they cover on AI ethics?",
            )
            gr.Examples(
                [
                    "Show me some videos that mention Ranganatha.",
                    "Slokas that mention gajendra moksham",
                ],
                inputs=question,
            )

            answer = gr.Markdown()
            video_embed = gr.HTML()  # iframe embeds will render here

            ask_btn = gr.Button("Get Answer")
            ask_status = gr.Markdown()

            # "Get Answer": show "loading", lock the button, run the query,
            # unlock the button, clear the status line.
            ask_btn.click(
                show_loading, outputs=[ask_status]
            ).then(
                disable_component, outputs=[ask_btn]
            ).then(
                handle_query, inputs=question, outputs=[answer, video_embed]
            ).then(
                enable_component, outputs=[ask_btn]
            ).then(
                clear_component, outputs=[ask_status]
            )

if __name__ == "__main__":
    demo.launch()
modules/answerer.py CHANGED
@@ -1,109 +1,109 @@
1
- # -------------------------------
2
- # 4. Answerer
3
- # -------------------------------
4
- from typing import List
5
- from pydantic import BaseModel
6
- from openai import OpenAI
7
- from modules.retriever import retrieve_videos
8
-
9
- # -------------------------------
10
- # Structured Output Classes
11
- # -------------------------------
12
- class VideoItem(BaseModel):
13
- video_id: str
14
- title: str
15
- channel: str
16
- description: str
17
-
18
- class LLMAnswer(BaseModel):
19
- answer_text: str
20
- top_videos: List[VideoItem]
21
-
22
- # -------------------------------
23
- # Main Function
24
- # -------------------------------
25
- def answer_query(query: str, collection, top_k: int = 5) -> LLMAnswer:
26
- """
27
- Answer a user query using YouTube video metadata.
28
- Returns an LLMAnswer object with textual answer + list of videos.
29
- """
30
- results = retrieve_videos(query, collection, top_k=top_k)
31
-
32
- if not results:
33
- return LLMAnswer(answer_text="No relevant videos found.", top_videos=[])
34
-
35
- # Build context lines for the LLM
36
- context_lines = []
37
- top_videos_list = []
38
- for r in results:
39
- # Ensure each result is a dict
40
- if not isinstance(r, dict):
41
- continue
42
- vid_id = r.get("video_id", "")
43
- title = r.get("video_title") or r.get("title", "")
44
- channel = r.get("channel") or r.get("channel_title", "")
45
- description = r.get("description", "")
46
- context_lines.append(f"- {title} ({channel}) (https://youtube.com/watch?v={vid_id})\n description: {description}")
47
-
48
- top_videos_list.append(
49
- VideoItem(
50
- video_id=vid_id,
51
- title=title,
52
- channel=channel,
53
- description=description
54
- )
55
- )
56
-
57
- context_text = "\n".join(context_lines)
58
-
59
- # Call LLM with structured output
60
- client = OpenAI()
61
- response = client.chat.completions.parse(
62
- model="gpt-4o-mini",
63
- messages=[
64
- {
65
- "role": "system",
66
- "content": (
67
- "You are a helpful assistant that answers questions using YouTube video metadata. "
68
- "Return your response strictly as the LLMAnswer class, including 'answer_text' and a list of 'top_videos'."
69
- )
70
- },
71
- {
72
- "role": "user",
73
- "content": f"Question: {query}\n\nRelevant videos:\n{context_text}\n\nAnswer based only on this."
74
- }
75
- ],
76
- response_format=LLMAnswer
77
- )
78
-
79
- llm_answer = response.choices[0].message.parsed # already LLMAnswer object
80
- answer_text = llm_answer.answer_text
81
- video_html = build_video_html(llm_answer.top_videos)
82
- return answer_text, video_html
83
-
84
-
85
- def build_video_html(videos: list[VideoItem]) -> str:
86
- """Build a clean HTML table from top_videos."""
87
- if not videos:
88
- return "<p>No relevant videos found.</p>"
89
-
90
- html = """
91
- <table border="1" style="border-collapse: collapse; width: 100%;">
92
- <tr>
93
- <th>Title</th>
94
- <th>Channel</th>
95
- <th>Description</th>
96
- <th>Watch</th>
97
- </tr>
98
- """
99
- for v in videos:
100
- html += f"""
101
- <tr>
102
- <td>{v.title}</td>
103
- <td>{v.channel}</td>
104
- <td>{v.description}</td>
105
- <td><a href="https://youtube.com/watch?v={v.video_id}" target="_blank">▶️ Watch</a></td>
106
- </tr>
107
- """
108
- html += "</table>"
109
- return html
 
1
+ # -------------------------------
2
+ # 4. Answerer
3
+ # -------------------------------
4
+ from typing import List
5
+ from pydantic import BaseModel
6
+ from openai import OpenAI
7
+ from modules.retriever import retrieve_videos
8
+
9
+ # -------------------------------
10
+ # Structured Output Classes
11
+ # -------------------------------
12
# One video as it appears in a structured answer. Documented with comments
# rather than a docstring so the model's generated JSON schema is unchanged.
class VideoItem(BaseModel):
    video_id: str  # id used to build the https://youtube.com/watch?v=... link
    title: str  # video title
    channel: str  # channel title
    description: str  # video description text
17
+
18
# Structured response requested from the chat model via ``response_format``.
class LLMAnswer(BaseModel):
    answer_text: str  # natural-language answer to the user's question
    top_videos: List[VideoItem]  # videos to render in the results table
21
+
22
+ # -------------------------------
23
+ # Main Function
24
+ # -------------------------------
25
def answer_query(query: str, collection, top_k: int = 5) -> tuple[str, str]:
    """
    Answer a user query using YouTube video metadata.

    Retrieves up to ``top_k`` videos for ``query`` from ``collection``, asks
    the chat model for a structured ``LLMAnswer`` based on them, and returns
    ``(answer_text, video_html)``, where ``video_html`` is the table produced
    by ``build_video_html``.

    The same pair is returned on every path, including when nothing relevant
    is retrieved, so callers can always unpack two values.
    """
    results = retrieve_videos(query, collection, top_k=top_k)

    # Only dict-shaped hits carry the metadata fields read below.
    hits = [r for r in (results or []) if isinstance(r, dict)]
    if not hits:
        return "No relevant videos found.", build_video_html([])

    # Normalise the hits once; this list feeds the prompt context and is
    # also the fallback table when the model returns no parsed answer.
    retrieved_videos = [
        VideoItem(
            video_id=hit.get("video_id") or "",
            title=hit.get("video_title") or hit.get("title") or "",
            channel=hit.get("channel") or hit.get("channel_title") or "",
            description=hit.get("description") or "",
        )
        for hit in hits
    ]
    context_text = "\n".join(
        f"- {v.title} ({v.channel}) (https://youtube.com/watch?v={v.video_id})\n description: {v.description}"
        for v in retrieved_videos
    )

    # Call LLM with structured output
    client = OpenAI()
    response = client.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a helpful assistant that answers questions using YouTube video metadata. "
                    "Return your response strictly as the LLMAnswer class, including 'answer_text' and a list of 'top_videos'."
                ),
            },
            {
                "role": "user",
                "content": f"Question: {query}\n\nRelevant videos:\n{context_text}\n\nAnswer based only on this.",
            },
        ],
        response_format=LLMAnswer,
    )

    message = response.choices[0].message
    llm_answer = message.parsed
    if llm_answer is None:
        # No structured payload (for example a refusal): show whatever text
        # the model gave, plus the videos that were actually retrieved.
        fallback_text = (
            getattr(message, "refusal", None)
            or "The model did not return a structured answer."
        )
        return fallback_text, build_video_html(retrieved_videos)

    return llm_answer.answer_text, build_video_html(llm_answer.top_videos)
83
+
84
+
85
def build_video_html(videos: "list[VideoItem]") -> str:
    """Build a clean HTML table from top_videos.

    Each item must expose ``title``, ``channel``, ``description`` and
    ``video_id``. All four come from YouTube metadata or model output, so
    they are escaped before being placed into the markup: text cells are
    HTML-escaped and the id is percent-encoded inside the watch URL.

    Returns a short "no results" paragraph when ``videos`` is empty.
    """
    # Local imports keep the block self-contained. The accumulator is called
    # ``parts`` so it cannot shadow the stdlib ``html`` module.
    from html import escape
    from urllib.parse import quote

    if not videos:
        return "<p>No relevant videos found.</p>"

    parts = [
        """
    <table border="1" style="border-collapse: collapse; width: 100%;">
        <tr>
            <th>Title</th>
            <th>Channel</th>
            <th>Description</th>
            <th>Watch</th>
        </tr>
    """
    ]
    for v in videos:
        watch_url = "https://youtube.com/watch?v=" + quote(str(v.video_id), safe="")
        parts.append(
            f"""
        <tr>
            <td>{escape(str(v.title))}</td>
            <td>{escape(str(v.channel))}</td>
            <td>{escape(str(v.description))}</td>
            <td><a href="{escape(watch_url)}" target="_blank">▶️ Watch</a></td>
        </tr>
        """
        )
    parts.append("</table>")
    return "".join(parts)
modules/collector.py CHANGED
@@ -1,69 +1,69 @@
1
- # -------------------------------
2
- # 1. Collector
3
- # -------------------------------
4
- from typing import List,Dict
5
- from googleapiclient.discovery import build
6
-
7
- from modules.youtube_utils import get_channel_id
8
-
9
-
10
- def fetch_channel_videos_from_url(api_key: str, channel_url: str, max_results=20):
11
- youtube = build("youtube", "v3", developerKey=api_key)
12
- channel_id = get_channel_id(youtube, channel_url)
13
-
14
- # Get channel details to fetch its title
15
- channel_response = youtube.channels().list(
16
- part="snippet",
17
- id=channel_id
18
- ).execute()
19
- channel_title = channel_response["items"][0]["snippet"]["title"]
20
-
21
- request = youtube.search().list(
22
- part="snippet",
23
- channelId=channel_id,
24
- maxResults=max_results,
25
- order="date"
26
- )
27
- response = request.execute()
28
-
29
- videos = []
30
- for item in response.get("items", []):
31
- if item["id"]["kind"] == "youtube#video":
32
- videos.append({
33
- "video_id": item["id"]["videoId"],
34
- "title": item["snippet"]["title"],
35
- "description": item["snippet"].get("description", ""),
36
- "channel_id": channel_id,
37
- "channel_title": channel_title,
38
- })
39
- return videos
40
-
41
- def fetch_channel_videos(api_key: str, channel_id: str, max_results=20):
42
- youtube = build("youtube", "v3", developerKey=api_key)
43
-
44
- # Fetch channel title
45
- channel_response = youtube.channels().list(
46
- part="snippet",
47
- id=channel_id
48
- ).execute()
49
- channel_title = channel_response["items"][0]["snippet"]["title"]
50
-
51
- request = youtube.search().list(
52
- part="snippet",
53
- channelId=channel_id,
54
- maxResults=max_results,
55
- order="date"
56
- )
57
- response = request.execute()
58
-
59
- videos = []
60
- for item in response.get("items", []):
61
- if item["id"]["kind"] == "youtube#video":
62
- videos.append({
63
- "video_id": item["id"]["videoId"],
64
- "title": item["snippet"]["title"],
65
- "description": item["snippet"].get("description", ""),
66
- "channel_id": channel_id,
67
- "channel_title": channel_title,
68
- })
69
- return videos
 
1
+ # -------------------------------
2
+ # 1. Collector
3
+ # -------------------------------
4
+ from typing import List,Dict
5
+ from googleapiclient.discovery import build
6
+
7
+ from modules.youtube_utils import get_channel_id
8
+
9
+
10
def fetch_channel_videos_from_url(api_key: str, channel_url: str, max_results=20):
    """Return video metadata for the channel behind ``channel_url``.

    The URL is resolved to a channel id with ``get_channel_id``; the listing
    itself is delegated to ``fetch_channel_videos`` so the two entry points
    cannot drift apart. The result is a list of dicts with ``video_id``,
    ``title``, ``description``, ``channel_id`` and ``channel_title``.

    Raises:
        LookupError: if the URL does not resolve to a channel id.
    """
    youtube = build("youtube", "v3", developerKey=api_key)
    channel_id = get_channel_id(youtube, channel_url)
    if not channel_id:
        raise LookupError(f"Could not resolve a YouTube channel from {channel_url!r}")
    return fetch_channel_videos(api_key, channel_id, max_results=max_results)
40
+
41
def fetch_channel_videos(api_key: str, channel_id: str, max_results=20):
    """Return video metadata for the channel with id ``channel_id``.

    Looks up the channel title, then runs a date-ordered search restricted
    to that channel and keeps only results whose kind is ``youtube#video``.
    Each returned dict has ``video_id``, ``title``, ``description``,
    ``channel_id`` and ``channel_title``.

    Raises:
        LookupError: if the API returns no channel for ``channel_id``
            (previously an opaque KeyError/IndexError from indexing).
    """
    youtube = build("youtube", "v3", developerKey=api_key)

    # Fetch channel title
    channel_response = youtube.channels().list(
        part="snippet",
        id=channel_id,
    ).execute()
    channel_items = channel_response.get("items") or []
    if not channel_items:
        raise LookupError(f"YouTube channel not found: {channel_id!r}")
    channel_title = channel_items[0]["snippet"]["title"]

    response = youtube.search().list(
        part="snippet",
        channelId=channel_id,
        maxResults=max_results,
        order="date",
    ).execute()

    # Search results can include non-video kinds; keep videos only.
    return [
        {
            "video_id": item["id"]["videoId"],
            "title": item["snippet"]["title"],
            "description": item["snippet"].get("description", ""),
            "channel_id": channel_id,
            "channel_title": channel_title,
        }
        for item in response.get("items", [])
        if item["id"]["kind"] == "youtube#video"
    ]
modules/db.py CHANGED
@@ -1,36 +1,36 @@
1
- import chromadb
2
-
3
- def get_collection():
4
- client = chromadb.PersistentClient(path="./youtube_db")
5
-
6
- # Ensure fresh collection with correct dimension
7
- try:
8
- collection = client.get_collection("yt_metadata")
9
- except Exception:
10
- collection = client.create_collection("yt_metadata")
11
-
12
- # Check dimension mismatch
13
- try:
14
- # quick test query
15
- collection.query(query_embeddings=[[0.0] * 1536], n_results=1)
16
- except Exception:
17
- # Delete and recreate with fresh schema
18
- client.delete_collection("yt_metadata")
19
- collection = client.create_collection("yt_metadata")
20
-
21
- return collection
22
-
23
-
24
- # modules/db.py
25
- def get_indexed_channels(collection):
26
- results = collection.get(include=["metadatas"])
27
- channels = {}
28
-
29
- for meta in results["metadatas"]:
30
- cid = meta.get("channel_id") # ✅ safe
31
- cname = meta.get("channel_title", "Unknown Channel")
32
-
33
- if cid: # only include if we have a channel_id
34
- channels[cid] = cname
35
-
36
- return channels
 
1
+ import chromadb
2
+
3
def get_collection():
    """Open the persistent ``yt_metadata`` collection, recreating it if unusable.

    The collection is fetched (or created when fetching fails) and then
    probed with a 1536-dimensional zero vector. If the probe raises, the
    collection is deleted and created afresh.
    """
    store = chromadb.PersistentClient(path="./youtube_db")
    name = "yt_metadata"

    try:
        coll = store.get_collection(name)
    except Exception:
        coll = store.create_collection(name)

    # 1536 presumably matches the embedding size written by the indexer —
    # TODO confirm.
    probe = [[0.0] * 1536]
    try:
        coll.query(query_embeddings=probe, n_results=1)
    except Exception:
        # NOTE(review): any failure of the probe, not only a dimension
        # mismatch, drops all stored data here.
        store.delete_collection(name)
        coll = store.create_collection(name)

    return coll
22
+
23
+
24
+ # modules/db.py
25
def get_indexed_channels(collection):
    """Return ``{channel_id: channel_title}`` for every channel in ``collection``.

    Reads only the stored metadata. Records without metadata or without a
    ``channel_id`` are skipped, and a missing title becomes
    ``"Unknown Channel"``. When several records share a channel id, the
    title of the last one read wins.
    """
    results = collection.get(include=["metadatas"])
    channels = {}

    # The metadata list, or individual entries in it, may be None.
    for meta in results["metadatas"] or []:
        if not meta:
            continue
        cid = meta.get("channel_id")
        if cid:  # only include if we have a channel_id
            channels[cid] = meta.get("channel_title", "Unknown Channel")

    return channels
modules/indexer.py CHANGED
@@ -1,34 +1,34 @@
1
- # modules/indexer.py
2
- from typing import Dict, List
3
- from openai import OpenAI
4
-
5
- def index_videos(videos: List[Dict], collection,channel_url : str):
6
- client = OpenAI()
7
-
8
- for vid in videos:
9
- text = f"{vid.get('title', '')} - {vid.get('description', '')}"
10
- embedding = client.embeddings.create(
11
- input=text,
12
- model="text-embedding-3-small"
13
- ).data[0].embedding
14
-
15
- # build metadata safely
16
- metadata = {
17
- "video_id": vid.get("video_id"),
18
- "video_title": vid.get("title", ""),
19
- "description" : vid.get('description', ''),
20
- "channel_url" : channel_url,
21
- }
22
-
23
- # add channel info if available
24
- if "channel_id" in vid:
25
- metadata["channel_id"] = vid["channel_id"]
26
- if "channel_title" in vid:
27
- metadata["channel_title"] = vid["channel_title"]
28
-
29
- collection.add(
30
- documents=[text],
31
- embeddings=[embedding],
32
- metadatas=[metadata],
33
- ids=[vid.get("video_id")]
34
- )
 
1
+ # modules/indexer.py
2
+ from typing import Dict, List
3
+ from openai import OpenAI
4
+
5
def index_videos(videos: List[Dict], collection, channel_url: str):
    """Embed each video's title and description and add it to ``collection``.

    For every usable entry, one embedding is requested from the
    ``text-embedding-3-small`` model and one record is added, keyed by the
    video ID.

    Args:
        videos: Video dicts. Keys read: ``video_id``, ``title``,
            ``description`` and, optionally, ``channel_id`` /
            ``channel_title``.
        collection: Target collection; only its ``add`` method is called.
        channel_url: Stored on every record under ``channel_url``.
    """
    client = OpenAI()

    for vid in videos:
        video_id = vid.get("video_id")
        if not video_id:
            # No usable record ID: skip this entry instead of letting one
            # malformed video abort the whole indexing run.
            continue

        # Coerce None to "" so neither the embedded text nor the metadata
        # carries a None value.
        # NOTE(review): Chroma is expected to reject None metadata values --
        # confirm against the pinned version.
        title = vid.get("title") or ""
        description = vid.get("description") or ""
        text = f"{title} - {description}"

        embedding = client.embeddings.create(
            input=text,
            model="text-embedding-3-small",
        ).data[0].embedding

        metadata = {
            "video_id": video_id,
            "video_title": title,
            "description": description,
            "channel_url": channel_url,
        }

        # Channel info is optional; copy it only when a value is present.
        for key in ("channel_id", "channel_title"):
            value = vid.get(key)
            if value is not None:
                metadata[key] = value

        collection.add(
            documents=[text],
            embeddings=[embedding],
            metadatas=[metadata],
            ids=[video_id],
        )
modules/retriever.py CHANGED
@@ -1,36 +1,36 @@
1
- # modules/retriever.py
2
- from typing import List, Dict
3
- from openai import OpenAI
4
-
5
- def retrieve_videos(query: str, collection, top_k: int = 3) -> List[Dict]:
6
- client = OpenAI()
7
-
8
- # Create embedding for query
9
- embedding = client.embeddings.create(
10
- input=query,
11
- model="text-embedding-3-small"
12
- ).data[0].embedding
13
-
14
- # Query Chroma
15
- results = collection.query(
16
- query_embeddings=[embedding],
17
- n_results=top_k,
18
- include=["metadatas", "documents", "distances"]
19
- )
20
-
21
- # Build list of standardized dicts
22
- videos = []
23
- metadatas_list = results.get("metadatas", [[]])[0] # list of metadata dicts
24
- documents_list = results.get("documents", [[]])[0] # list of text
25
- distances_list = results.get("distances", [[]])[0] # optional
26
-
27
- for idx, meta in enumerate(metadatas_list):
28
- videos.append({
29
- "video_id": meta.get("video_id", ""),
30
- "video_title": meta.get("video_title", meta.get("title", documents_list[idx])),
31
- "channel": meta.get("channel", meta.get("channel_title", "")),
32
- "description": documents_list[idx] if idx < len(documents_list) else "",
33
- "score": distances_list[idx] if idx < len(distances_list) else None
34
- })
35
-
36
- return videos
 
1
+ # modules/retriever.py
2
+ from typing import List, Dict
3
+ from openai import OpenAI
4
+
5
def retrieve_videos(query: str, collection, top_k: int = 3) -> List[Dict]:
    """Embed ``query`` and return the closest stored videos.

    Args:
        query: Free-text question or search phrase.
        collection: Collection to search; only its ``query`` method is called.
        top_k: Number of results requested.

    Returns:
        One dict per hit with the keys ``video_id``, ``video_title``,
        ``channel``, ``description`` (the stored document text) and ``score``
        (the distance reported by the collection, or ``None`` if absent).
    """
    client = OpenAI()

    # Create embedding for the query
    embedding = client.embeddings.create(
        input=query,
        model="text-embedding-3-small",
    ).data[0].embedding

    # Query the collection
    results = collection.query(
        query_embeddings=[embedding],
        n_results=top_k,
        include=["metadatas", "documents", "distances"],
    )

    def first_row(key):
        # Results are nested per query; only one query is sent, so take
        # row 0. Tolerates a missing key, a None value or an empty outer list.
        rows = results.get(key) or [[]]
        return rows[0] or []

    metadatas_list = first_row("metadatas")
    documents_list = first_row("documents")
    distances_list = first_row("distances")

    videos = []
    for idx, meta in enumerate(metadatas_list):
        meta = meta or {}  # a hit stored without metadata
        # Bounds-checked once and reused for both the title fallback and the
        # description, so a short documents list can no longer raise.
        document = documents_list[idx] if idx < len(documents_list) else ""
        videos.append({
            "video_id": meta.get("video_id", ""),
            "video_title": meta.get("video_title", meta.get("title", document)),
            "channel": meta.get("channel", meta.get("channel_title", "")),
            "description": document,
            "score": distances_list[idx] if idx < len(distances_list) else None,
        })

    return videos
modules/youtube_utils.py CHANGED
@@ -1,26 +1,26 @@
1
- def get_channel_id(youtube, channel_url: str) -> str:
2
- """
3
- Extract channel ID from a YouTube URL or handle.
4
- Supports:
5
- - https://www.youtube.com/channel/UCxxxx
6
- - https://www.youtube.com/@handle
7
- - @handle
8
- """
9
- # If already a UC... ID
10
- if "channel/" in channel_url:
11
- return channel_url.split("channel/")[-1].split("/")[0]
12
-
13
- # If it's a handle (@xyz or full URL)
14
- if "@" in channel_url:
15
- handle = channel_url.split("@")[-1]
16
- request = youtube.channels().list(
17
- part="id",
18
- forHandle=handle
19
- )
20
- response = request.execute()
21
- return response["items"][0]["id"]
22
-
23
- if channel_url.startswith("UC"):
24
- return channel_url
25
-
26
- raise ValueError(f"Unsupported channel URL format {channel_url}")
 
1
def _leading_segment(text: str) -> str:
    """Return ``text`` cut at the first ``/``, ``?`` or ``#``."""
    for sep in "/?#":
        text = text.partition(sep)[0]
    return text


def get_channel_id(youtube, channel_url: str) -> str:
    """
    Extract channel ID from a YouTube URL or handle.

    Supports:
    - https://www.youtube.com/channel/UCxxxx
    - https://www.youtube.com/@handle
    - @handle
    - a bare UCxxxx ID

    Trailing path segments, query strings and fragments (for example
    ``/videos`` or ``?si=...``) are ignored.

    Args:
        youtube: API client; only used for handles, via
            ``youtube.channels().list(part="id", forHandle=...).execute()``.
        channel_url: URL, handle or channel ID to resolve.

    Returns:
        The channel ID.

    Raises:
        ValueError: If the format is not recognised, or the handle lookup
            returns no channel.
    """
    channel_url = channel_url.strip()

    # URL that already contains the UC... ID
    if "channel/" in channel_url:
        return _leading_segment(channel_url.split("channel/")[-1])

    # Handle (@xyz or full URL): resolve through the API
    if "@" in channel_url:
        handle = _leading_segment(channel_url.split("@")[-1])
        request = youtube.channels().list(
            part="id",
            forHandle=handle,
        )
        response = request.execute()
        # Treat a missing or empty "items" as "handle not found" and raise
        # ValueError instead of a bare KeyError/IndexError.
        items = response.get("items") or []
        if not items:
            raise ValueError(f"No channel found for handle @{handle}")
        return items[0]["id"]

    if channel_url.startswith("UC"):
        return channel_url

    raise ValueError(f"Unsupported channel URL format {channel_url}")
tests/search.py CHANGED
@@ -1,14 +1,14 @@
1
- from chromadb import PersistentClient
2
-
3
- from modules.db import get_collection
4
- from modules.retriever import retrieve_videos
5
- from dotenv import load_dotenv
6
- load_dotenv()
7
-
8
- collection = get_collection()
9
-
10
- all_metas = collection.get(include=["metadatas"])["metadatas"]
11
- print("Sample metadatas:", all_metas[:5])
12
-
13
- print("-------")
14
  retrieve_videos("Show me some videos that mention Ranganatha.", collection)
 
1
+ from chromadb import PersistentClient
2
+
3
+ from modules.db import get_collection
4
+ from modules.retriever import retrieve_videos
5
+ from dotenv import load_dotenv
6
+ load_dotenv()
7
+
8
# Manual smoke check: dump a few stored metadata records, then run one
# retrieval query against the same collection and show what came back.
collection = get_collection()

all_metas = collection.get(include=["metadatas"])["metadatas"]
print("Sample metadatas:", all_metas[:5])

print("-------")
# Print the hits; the return value used to be discarded, so the search
# produced no visible output after the separator.
for video in retrieve_videos("Show me some videos that mention Ranganatha.", collection):
    print(video)