RaagulQB committed
Commit c379cd9 · 1 Parent(s): ccb9a52

Add application file

Files changed (2):
  1. app.py +333 -0
  2. requirements.txt +13 -0
app.py ADDED
@@ -0,0 +1,333 @@
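+ # app.py: a Gradio app that transcribes "style" YouTube videos with MonsterAPI's
+ # Whisper endpoint, indexes the transcripts in Chroma, builds a running summary of a
+ # context URL, and generates a new video script in the same style.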
+ import os
+ import mimetypes
+ import requests
+ import time
+ from typing import List
+ from pprint import pprint
+
+ import gradio as gr
+ from yt_dlp import YoutubeDL
+ from reportlab.lib.pagesizes import letter
+ from reportlab.lib.styles import getSampleStyleSheet
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
+
+ from openai import DefaultHttpxClient
+ from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+ from langchain_chroma import Chroma
+ from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_core.runnables import RunnableLambda
+ from langchain_core.runnables.passthrough import RunnableAssign
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain.output_parsers import PydanticOutputParser
+ from langchain_core.pydantic_v1 import BaseModel, Field
+
+
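+ # Download a YouTube video's audio track and convert it to MP3 via yt-dlp
+ # (requires ffmpeg on PATH for the FFmpegExtractAudio post-processor).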
+ def download_youtube_video(youtube_url, download_path):
+     try:
+         ydl_opts = {
+             'format': 'bestaudio/best',
+             'outtmpl': os.path.join(download_path, '%(title)s.%(ext)s'),
+             'postprocessors': [{
+                 'key': 'FFmpegExtractAudio',
+                 'preferredcodec': 'mp3',
+                 'preferredquality': '192',
+             }],
+         }
+         with YoutubeDL(ydl_opts) as ydl:
+             info_dict = ydl.extract_info(youtube_url, download=True)
+             title = info_dict.get('title', None)
+             # The post-processor always emits .mp3, whatever the source container was,
+             # so swap the extension rather than string-replacing specific formats.
+             filename = os.path.splitext(ydl.prepare_filename(info_dict))[0] + '.mp3'
+         return filename, title
+     except Exception as e:
+         print(f"Failed to download video from {youtube_url}: {e}")
+         return None, None
+
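+ # Ask MonsterAPI for a pre-signed upload URL, PUT the file there, and return the
+ # matching download URL that later jobs can read from.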
+ def upload_file(filepath, api_key):
+     url = "https://api.monsterapi.ai/v1/upload"
+     headers = {
+         "accept": "application/json",
+         "authorization": f"Bearer {api_key}"
+     }
+
+     file_name = os.path.basename(filepath)
+     get_file_urls = requests.get(f"{url}?filename={file_name}", headers=headers)
+
+     if get_file_urls.status_code != 200:
+         print(f"Failed to get upload URL: {get_file_urls.status_code}")
+         return None
+
+     response_json = get_file_urls.json()
+     upload_url = response_json['upload_url']
+     download_url = response_json['download_url']
+
+     with open(filepath, 'rb') as f:
+         data = f.read()
+     file_headers = {
+         # guess_type can return None; fall back to a generic binary type.
+         "Content-Type": mimetypes.guess_type(filepath)[0] or "application/octet-stream",
+     }
+
+     file_uploaded = requests.put(upload_url, data=data, headers=file_headers)
+
+     if file_uploaded.status_code == 200:
+         print(f"File successfully uploaded. Usable link is {download_url}")
+         return download_url
+     else:
+         print(f"Failed to upload file: {file_uploaded.status_code}")
+         return None
+
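+ # Submit the uploaded audio to MonsterAPI's Whisper endpoint; returns the job's process ID.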
+ def generate_process_id(download_url, api_key):
+     whisper_url = "https://api.monsterapi.ai/v1/generate/whisper"
+     payload = {
+         "file": download_url,
+         "language": "en"
+     }
+     headers = {
+         "accept": "application/json",
+         "content-type": "application/json",
+         "authorization": f"Bearer {api_key}"
+     }
+
+     response = requests.post(whisper_url, json=payload, headers=headers)
+
+     if response.status_code != 200:
+         print(f"Failed to generate process ID: {response.status_code}")
+         return None
+     process_id = response.json().get("process_id")
+     print(f"Process ID is: {process_id}")
+     return process_id
+
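+ # Poll the job status every 5 seconds until it completes or fails; return the transcript text.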
+ def query_job_status(job_id, api_key):
+     transcript = ""
+     url = f"https://api.monsterapi.ai/v1/status/{job_id}"
+     headers = {
+         "accept": "application/json",
+         "authorization": f"Bearer {api_key}"
+     }
+
+     while True:
+         response = requests.get(url, headers=headers)
+
+         if response.status_code != 200:
+             print(f"Failed to get status: {response.status_code}")
+             return transcript
+
+         status = response.json().get("status")
+
+         if status in ["COMPLETED", "FAILED"]:
+             print(f"Job status: {status}")
+             if status == "COMPLETED":
+                 transcript = response.json().get("result")["text"]
+             return transcript
+
+         print(f"Job status: {status}, checking again in 5 seconds...")
+         time.sleep(5)
+
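+ # Render all (title, transcript) pairs into a single PDF with ReportLab.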
+ def create_pdf(transcripts, file_path):
+     doc = SimpleDocTemplate(file_path, pagesize=letter)
+     styles = getSampleStyleSheet()
+     story = []
+
+     for i, (title, transcript) in enumerate(transcripts, start=1):
+         story.append(Paragraph(f'YouTube Video {i} Title: {title}', styles['Title']))
+         story.append(Spacer(1, 12))
+         story.append(Paragraph(f'YouTube Video {i} Transcript:', styles['Heading2']))
+         story.append(Spacer(1, 12))
+         story.append(Paragraph(transcript.replace('\n', '<br/>'), styles['BodyText']))
+         story.append(Spacer(1, 24))
+
+     doc.build(story)
+
+
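+ # Both API keys are assumed to be supplied via environment variables (for example,
+ # Hugging Face Space secrets); the variable names below are the deployer's choice,
+ # not fixed by any of the libraries used here.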
+ # OPENAI_API_KEY must already be set in the environment for OpenAIEmbeddings.
+ MONSTER_API_KEY = os.getenv("MONSTER_API_KEY", "")  # MonsterAPI bearer token
+
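+ # Slot-filling schema for the running summary; the LLM updates it chunk by chunk.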
+ class DocumentSummaryBase(BaseModel):
+     running_summary: str = Field("", description="Running description of the document. Do not override; only update!")
+     main_ideas: List[str] = Field([], description="Most important information from the document (max 3)")
+     loose_ends: List[str] = Field([], description="Open questions that would be good to incorporate into the summary, but that are as yet unknown (max 3)")
+
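+ # Full pipeline for the "style" videos: download audio, upload it, transcribe it,
+ # and save every transcript into transcripts.pdf.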
+ def transcribe_and_save(youtube_urls):
+     download_path = os.getcwd()
+     api_key = MONSTER_API_KEY
+     pdf_output_path = os.path.join(os.getcwd(), "transcripts.pdf")
+     transcripts = []
+     for youtube_url in youtube_urls:
+         filepath, title = download_youtube_video(youtube_url, download_path)
+
+         if filepath and title:
+             download_url = upload_file(filepath, api_key)
+             if download_url:
+                 process_id = generate_process_id(download_url, api_key)
+                 if process_id:
+                     transcript = query_job_status(process_id, api_key)
+                     transcripts.append((title, transcript))
+     # Save all transcripts into a single PDF file
+     create_pdf(transcripts, pdf_output_path)
+
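+ # "Runnable Extraction": merge the parser's format instructions into the prompt,
+ # call the LLM, repair common JSON slips, then parse into the Pydantic model.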
+ def RExtract(pydantic_class, llm, prompt):
+     '''
+     Runnable Extraction module.
+     Returns a knowledge dictionary populated by slot-filling extraction.
+     '''
+     parser = PydanticOutputParser(pydantic_object=pydantic_class)
+     instruct_merge = RunnableAssign({'format_instructions': lambda x: parser.get_format_instructions()})
+     def preparse(string):
+         # Repair the most common JSON formatting slips before parsing.
+         if '{' not in string: string = '{' + string
+         if '}' not in string: string = string + '}'
+         string = (string
+             .replace("\\_", "_")
+             .replace("\n", " ")
+             .replace("\\]", "]")
+             .replace("\\[", "[")
+         )
+         # print(string)  ## Good for diagnostics
+         return string
+     return instruct_merge | prompt | llm | preparse | parser
+
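+ # Fold every document chunk into the knowledge base, one LLM call per chunk,
+ # keeping each intermediate state so a usable summary survives a late failure.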
+ def RSummarizer(knowledge, llm, prompt, verbose=False):
+     '''
+     Builds a chain that iteratively summarizes a list of documents,
+     threading the evolving knowledge base through each step.
+     '''
+     def summarize_docs(docs):
+         parse_chain = RunnableAssign({"info_base": RExtract(knowledge.__class__, llm, prompt)})
+         state = {"info_base": knowledge}
+         all_summaries = []  # List to store all intermediate summaries
+
+         for i, doc in enumerate(docs):
+             state['input'] = doc.page_content
+             state = parse_chain.invoke(state)
+
+             # Store the current info_base in the list
+             all_summaries.append(state['info_base'].dict())
+
+             if verbose:
+                 print(f"Considered {i+1} documents")
+                 pprint(state['info_base'].dict())
+         return all_summaries
+     return RunnableLambda(summarize_docs)
+
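+ # Walk the intermediate summaries from newest to oldest and return the first non-empty one.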
+ def find_first_non_empty_summary(summaries):
+     for summary in reversed(summaries):
+         if summary['loose_ends'] or summary['main_ideas'] or summary['running_summary']:
+             return summary
+     return None
+
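+ # Load a web page, split it into chunks, and build a running summary of its content.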
+ def create_running_summary(url):
+     loader = WebBaseLoader(url)
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=100, separators=["\n\n", "\n", ".", ";", ",", " ", ""])
+     documents = loader.load()
+     docs_split = text_splitter.split_documents(documents)
+     summary_prompt = ChatPromptTemplate.from_template("""You are generating a running summary of the document. Make it readable by a technical user.
+ After this, the old knowledge base will be replaced by the new one. Make sure a reader can still understand everything.
+ Keep it short, but as dense and useful as possible! The information should flow from chunk to (loose ends or main ideas) to running_summary.
+ Strictly output JSON and nothing else; do not output any strings or explanations, just the JSON.
+ The updated knowledge base must keep all of the information from running_summary here: {info_base}.
+ {format_instructions}. Follow the format precisely, including quotations and commas.\n\n
+ {info_base}\nWithout losing any of the info, update the knowledge base with the following: {input}""")
+     instruct_model = llm_1 | StrOutputParser()
+     summarizer = RSummarizer(DocumentSummaryBase(), instruct_model, summary_prompt, verbose=True)
+     summaries = summarizer.invoke(docs_split)
+     summary = find_first_non_empty_summary(summaries)
+     return summary
+
+
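+ # Index transcripts.pdf into a persistent Chroma collection and return a retriever over it.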
+ def setup_vectorstore():
+     embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
+     vector_store = Chroma(collection_name="collection-1", embedding_function=embeddings, persist_directory="./vectorstore")
+     loader = PyPDFLoader(os.path.join(os.getcwd(), "transcripts.pdf"))
+     documents = loader.load()
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=0, separators=["\n\n"])
+     text = text_splitter.split_documents(documents)
+     retriever = vector_store.as_retriever()
+     retriever.add_documents(text)
+     return retriever
+
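+ # Ask llm_2 to write a new script for the summarized content, imitating the example transcripts.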
+ def generate(content, examples):
+     chat_template = ChatPromptTemplate.from_template("""You are provided with a few sample YouTube video scripts below.
+ Your task is to create a similar script for the content provided to you below.
+ Follow the style used in the examples and create a similar script for the content given to you.
+ Create me a script for a YouTube video explaining the following content: {content}.
+ Here are a few example scripts of my previous videos that you have to adapt: {examples}.""")
+     gen_chain = chat_template | llm_2 | StrOutputParser()
+     return gen_chain.invoke({"content": content, "examples": examples})
+
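+ # Flatten retrieved documents into a single quoted string for prompt injection.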
+ def docs2str(docs, title="Document"):
+     out_str = ""
+     for doc in docs:
+         doc_name = getattr(doc, 'metadata', {}).get('Title', title)
+         if doc_name:
+             out_str += f"[Quote from {doc_name}] "
+         out_str += getattr(doc, 'page_content', str(doc)) + "\n"
+     return out_str
+
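+ # Two chat models served through MonsterAPI's OpenAI-compatible endpoint:
+ # llm_1 (Gemma 2 9B) drives summarization, llm_2 (Llama 3.1 8B) writes the scripts.
+ # Note that verify=False disables TLS certificate verification for this endpoint.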
+ llm_1 = ChatOpenAI(
+     model="google/gemma-2-9b-it",
+     temperature=0,
+     max_tokens=None,
+     timeout=None,
+     max_retries=2,
+     api_key=MONSTER_API_KEY,
+     base_url="https://llm.monsterapi.ai/v1/",
+     http_client=DefaultHttpxClient(verify=False)
+ )
+
+ llm_2 = ChatOpenAI(
+     model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+     temperature=0,
+     max_tokens=None,
+     timeout=None,
+     max_retries=2,
+     api_key=MONSTER_API_KEY,
+     base_url="https://llm.monsterapi.ai/v1/",
+     http_client=DefaultHttpxClient(verify=False)
+ )
+
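+ # Gradio callback: transcribe the comma-separated style links, index them, summarize
+ # the context link, then retrieve matching examples and generate the new script.
+ # e.g. style_links = "https://youtu.be/abc, https://youtu.be/def" (hypothetical URLs).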
+ def process_links(style_links, context_link):
+     # Split the comma-separated style links and strip surrounding whitespace.
+     style_links = [link.strip() for link in style_links.split(",")]
+     transcribe_and_save(style_links)
+     retriever = setup_vectorstore()
+     summary = create_running_summary(context_link)
+     if summary is None:
+         return "Could not build a summary for the context link."
+     summary = summary['running_summary']
+     print("Summarized the url successfully:", summary)
+     examples = retriever.invoke(summary)
+     return generate(summary, examples)
+
+ # Define the Gradio interface
+ with gr.Blocks() as demo:
+     gr.Markdown("## Link Processor")
+
+     style_links = gr.Textbox(lines=5, placeholder="Enter style links separated by commas", label="Style Links")
+     context_link = gr.Textbox(lines=1, placeholder="Enter context link", label="Context Link")
+
+     output = gr.Textbox(lines=2, label="Output")
+
+     process_button = gr.Button("Process")
+
+     process_button.click(process_links, inputs=[style_links, context_link], outputs=output)
+
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ sentence_transformers
+ pypdf
+ chromadb
+ langchain
+ langchain-openai
+ langchain_community
+ langchain_chroma
+ arxiv
+ pymupdf
+ openai
+ yt_dlp
+ reportlab
+ gradio