sofzcc committed on
Commit
6f31873
·
verified ·
1 Parent(s): 5382133

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +966 -967
app.py CHANGED
@@ -1,967 +1,966 @@
1
- import os
2
- import time
3
- import streamlit as st
4
- from youtube_transcript_api import YouTubeTranscriptApi
5
- from youtube_search import YoutubeSearch
6
- from fpdf import FPDF
7
- from langchain_openai import ChatOpenAI
8
- from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
9
- from sentence_transformers import SentenceTransformer
10
- from langchain.chains import RetrievalQA
11
- from langchain.prompts import PromptTemplate
12
- from langchain.memory import ConversationBufferWindowMemory
13
- from langchain_community.vectorstores import Chroma
14
- import chromadb
15
- from langchain_core.documents import Document
16
- from pypdf import PdfReader
17
- from langchain_community.document_loaders import PyPDFLoader
18
- from langchain.agents import initialize_agent, Tool
19
- from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
20
- from langchain.agents import Tool, AgentExecutor, create_react_agent, tool
21
- from flask import Flask, request, jsonify
22
- import sqlite3
23
- import re
24
- import textwrap
25
- from langchain.chains.summarize import load_summarize_chain
26
- from langchain_community.document_loaders import WebBaseLoader
27
- from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain, StuffDocumentsChain
28
- from langchain.chains.llm import LLMChain
29
- import torch
30
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
31
- import nltk
32
- from nltk.tokenize import word_tokenize
33
- import pytube
34
- from moviepy.editor import *
35
-
36
- # Download necessary resources
37
- nltk.download('punkt')
38
-
39
-
40
-
41
- # Initialize environment variables
42
- from dotenv import load_dotenv
43
- import traceback
44
- import logging
45
-
46
- load_dotenv()
47
-
48
- OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
49
- HUGGINGFACEHUB_API_TOKEN = os.getenv('HF_TOKEN')
50
- YT_API_KEY = os.getenv('YT_API_KEY')
51
-
52
- LANGCHAIN_TRACING_V2='true'
53
- LANGCHAIN_ENDPOINT="https://api.smith.langchain.com"
54
- LANGCHAIN_API_KEY = os.getenv('LANGCHAIN_API_KEY')
55
- LANGCHAIN_PROJECT="default"
56
-
57
- # Download and initialize all required models
58
- model = SentenceTransformerEmbeddings(model_name='paraphrase-MiniLM-L6-v2')
59
- summarization_model_name = "suriya7/bart-finetuned-text-summarization"
60
- summarization_model = AutoModelForSeq2SeqLM.from_pretrained(summarization_model_name)
61
- summarization_tokenizer = AutoTokenizer.from_pretrained(summarization_model_name)
62
-
63
-
64
- # Function to load the vector database
65
def load_vectordb():
    """
    Open the persistent Chroma store and wrap it for LangChain.

    Returns:
        Chroma: LangChain wrapper around the "knowledge_base" collection,
        backed by a chromadb PersistentClient created with "chromadb" and
        using the module-level embedding `model`.
    """
    return Chroma(
        client=chromadb.PersistentClient("chromadb"),
        collection_name="knowledge_base",
        embedding_function=model,
    )
81
-
82
- vector_db = load_vectordb()
83
-
84
- # Set up logging
85
- logging.basicConfig(level=logging.INFO)
86
- logger = logging.getLogger(__name__)
87
-
88
def safe_execute(func, *args, **kwargs):
    """
    Execute a function safely, catching any exceptions and logging errors.

    Args:
        func (callable): The function to execute.
        *args: Variable length argument list for the function.
        **kwargs: Arbitrary keyword arguments for the function.

    Returns:
        The result of the function execution, or the string
        "An error occurred: <message>" if an exception occurs.
    """
    try:
        return func(*args, **kwargs)
    except Exception as e:
        # Not every callable has __name__ (e.g. functools.partial objects);
        # fall back to repr so the handler itself can never raise.
        name = getattr(func, "__name__", repr(func))
        # logging.getLogger(__name__) returns the same logger object the
        # module configures; .exception() logs at ERROR with the traceback.
        logging.getLogger(__name__).exception("Error in %s: %s", name, e)
        return f"An error occurred: {str(e)}"
106
-
107
-
108
- # Initialize LLM
109
- llm = ChatOpenAI(temperature=0.6, model_name="gpt-3.5-turbo-16k")
110
-
111
-
112
def count_tokens(text):
    """
    Return how many tokens NLTK's `word_tokenize` produces for a text.

    Args:
        text (str): The input text.

    Returns:
        int: The number of tokens in the text.
    """
    return len(word_tokenize(text))
124
-
125
def text_summarize(text):
    """
    Summarize the input text using a MapReduce approach.

    The text is split into chunks, each chunk is condensed by the LLM
    (map step), and the partial results are merged into one summary
    (reduce step).

    Args:
        text (str): The input text to summarize.

    Returns:
        str: The summary of the input text, or "" when the input is empty
        or contains only whitespace.
    """
    # Nothing to summarize: avoid spending LLM calls on an empty prompt.
    if not text or not text.strip():
        return ""

    # The recursive splitter falls back through several separators, so text
    # without blank lines (typical for transcripts) is still cut into chunks
    # instead of being passed on as one oversized document.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=200)
    docs = [Document(page_content=chunk) for chunk in text_splitter.split_text(text)]

    # Map step
    map_prompt = PromptTemplate.from_template(
        "The following is a document:\n"
        "{docs}\n"
        "Based on this document, please identify the main themes and key points.\n"
        "Helpful Answer:"
    )
    map_chain = LLMChain(llm=llm, prompt=map_prompt)

    # Reduce step
    reduce_prompt = PromptTemplate.from_template(
        "The following is a set of summaries:\n"
        "{docs}\n"
        "Take these and distill them into a final, consolidated summary of the main themes and key points.\n"
        "Helpful Answer:"
    )
    reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

    # Combine
    combine_documents_chain = StuffDocumentsChain(
        llm_chain=reduce_chain,
        document_variable_name="docs"
    )

    # Create the MapReduceDocumentsChain
    map_reduce_chain = MapReduceDocumentsChain(
        llm_chain=map_chain,
        reduce_documents_chain=combine_documents_chain,
        document_variable_name="docs"
    )

    return map_reduce_chain.run(docs)
170
-
171
-
172
- # Function to add documents to the database
173
def add_documents_to_db(pdf_file):
    """
    Extract, clean and chunk the text of a PDF, then store the chunks in
    the vector database in batches.

    Args:
        pdf_file (str): The path to the PDF file to process.

    Raises:
        Exception: Any error raised while processing is logged and re-raised.
    """
    batch_size = 10000
    try:
        documents = get_text_chunks(clean_text(extract_text_from_pdf(pdf_file)))

        if not documents:
            logger.warning(f"No documents found in {pdf_file}.")
            return

        total = len(documents)
        for start in range(0, total, batch_size):
            batch = documents[start:start + batch_size]
            vector_db.add_documents(batch)
            print(f"Processed {start + len(batch)} out of {total} documents.")

        print("All documents added to the collection.")
    except Exception as e:
        logger.error(f"Error adding documents to database from {pdf_file}: {str(e)}")
        raise  # Re-raise the exception for visibility
207
-
208
-
209
def generate_valid_filename(query):
    """
    Build a filename-safe string from a query.

    Every character other than ASCII letters, digits, '-' and '_' is
    replaced by an underscore.

    Args:
        query (str): The input string to generate the filename from.

    Returns:
        str: The generated valid filename.
    """
    return re.sub(r'[^-_a-zA-Z0-9]', '_', query)
222
-
223
- #################################################
224
- ## NEW FUNCTIONS ##
225
- #################################################
226
- import whisper
227
- import time
228
- from pytube import YouTube
229
-
230
-
231
def download_video(url):
    """
    Download a YouTube video as an MP4 file.

    Args:
        url (str): The URL of the YouTube video.

    Returns:
        str: The path of the downloaded file.

    Raises:
        ValueError: If the video offers no MP4 stream containing both
            video and audio.
    """
    video = YouTube(url)
    # filter() returns a query over streams, not a stream: pick one with
    # first(). Progressive streams carry video and audio in a single file,
    # which the audio extraction step relies on.
    stream = video.streams.filter(file_extension='mp4', progressive=True).first()
    if stream is None:
        raise ValueError(f"No downloadable MP4 stream found for {url}")
    # download() returns the path the file was written to.
    return stream.download()
236
-
237
-
238
def video_to_text(filename):
    """
    Transcribe the audio track of a video file with Whisper.

    The audio is first written next to the video as an ".mp3" file, which
    is then transcribed with the Whisper "base" model.

    Args:
        filename (str): The path to the video file.

    Returns:
        str: The transcribed text.

    Raises:
        ValueError: If the video has no audio track.
    """
    # splitext copes with extensions of any length, not only ".mp4".
    audio_filename = os.path.splitext(filename)[0] + ".mp3"

    clip = VideoFileClip(filename)
    try:
        if clip.audio is None:
            raise ValueError(f"No audio track found in {filename}")
        clip.audio.write_audiofile(audio_filename)
    finally:
        # Release the file handle even when extraction fails.
        clip.close()

    asr_model = whisper.load_model("base")
    result = asr_model.transcribe(audio_filename)
    return result["text"]
251
-
252
-
253
- #################################################
254
- # Function to search and transcribe YouTube videos
255
def search_and_transcribe_videos(query, max_results=20, min_valid_videos=4):
    """
    Search for YouTube videos and collect their English transcripts.

    Videos without a retrievable transcript are skipped. The collected
    transcripts are appended to a text file named after the query.

    Args:
        query (str): The search query for YouTube videos.
        max_results (int): The maximum number of results to fetch per
            search pass. Default is 20.
        min_valid_videos (int): The number of transcripts to collect before
            stopping. Default is 4.

    Returns:
        str: The path to the transcript file.
    """
    transcripts = []
    seen_ids = set()
    step = max(max_results, 1)  # guarantees the loop always advances
    current_max_results = max_results
    # Searches are widened up to 20 results, or up to max_results when the
    # caller asks for more than that in a single pass.
    search_cap = max(max_results, 20)

    while len(transcripts) < min_valid_videos and current_max_results <= search_cap:
        results = YoutubeSearch(query, max_results=current_max_results).to_dict()
        for video in results:
            if video.get('liveBroadcastContent') == 'live':
                continue
            video_id = video['id']
            # A wider search returns the earlier hits again; skip them so no
            # transcript is stored twice.
            if video_id in seen_ids:
                continue
            seen_ids.add(video_id)
            try:
                segments = YouTubeTranscriptApi.get_transcript(video_id, languages=['en', 'en-US'])
            except Exception as exc:
                # Best effort: a video without a usable transcript is skipped,
                # but the reason is recorded instead of silently dropped.
                logger.info("Skipping video %s: %s", video_id, exc)
                continue
            transcripts.append(" ".join(segment['text'] for segment in segments))

            if len(transcripts) >= min_valid_videos:
                break

        current_max_results += step

    transcript_file = generate_valid_filename(query) + '.txt'
    with open(transcript_file, 'a', encoding='utf-8') as f:
        for text in transcripts[:min_valid_videos]:
            f.write(f"Text:{text}\n\n")

    return transcript_file
295
-
296
- # Function to create a PDF from a transcript
297
def create_pdf(input_file):
    """
    Create a PDF file from a transcript file.

    The PDF is written next to the input file, with a trailing ".txt"
    replaced by ".pdf". Characters outside Latin-1 are replaced, because
    the text is encoded as Latin-1 before being written to the page.

    Args:
        input_file (str): The path to the transcript file.

    Returns:
        str: The path to the created PDF file.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        text = f.read()

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font('Arial', size=12)
    pdf.multi_cell(0, 10, text.encode('latin-1', 'replace').decode('latin-1'))

    # Strip only a trailing ".txt"; a ".txt" occurring earlier in the path
    # (e.g. in a directory name) must not truncate the output name.
    base_name = input_file[:-len('.txt')] if input_file.endswith('.txt') else input_file
    output_filename = f"{base_name}.pdf"
    pdf.output(output_filename)
    return output_filename
317
-
318
- # Function to extract text from a PDF
319
def extract_text_from_pdf(pdf_path):
    """
    Read a PDF and return the text of all its pages concatenated.

    Pages for which no text is extracted are skipped.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        str: The extracted text.
    """
    page_texts = (page.extract_text() for page in PdfReader(pdf_path).pages)
    return "".join(page_text for page_text in page_texts if page_text)
336
-
337
- # Function to clean extracted text
338
def clean_text(text):
    """
    Normalize extracted text to plain ASCII.

    Non-breaking spaces become regular spaces; each remaining run of
    non-ASCII characters (plus one directly following '!') becomes a
    single space.

    Args:
        text (str): The extracted text.

    Returns:
        str: The cleaned text.
    """
    without_nbsp = text.replace('\xa0', ' ')
    return re.sub(r'[^\x00-\x7F]+!?', ' ', without_nbsp)
352
-
353
- # Function to split text into chunks
354
def get_text_chunks(text, chunk_size=1000, chunk_overlap=200):
    """
    Split the cleaned text into manageable chunks for further processing.

    Args:
        text (str): The cleaned text.
        chunk_size (int): The maximum size of each text chunk, measured
            with `len`. Default is 1000.
        chunk_overlap (int): The overlap between consecutive chunks.
            Default is 200.

    Returns:
        list of Document: List of Document objects containing text chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    return [Document(page_content=chunk) for chunk in text_splitter.split_text(text)]
373
-
374
-
375
-
376
- # Function to process YouTube videos
377
def load_video(url):
    """
    Retrieve the transcript of a YouTube video, save it to a text file,
    convert the text file to a PDF, and return the PDF filename.

    Args:
        url (str): The URL of the YouTube video. Both "watch?v=<id>" and
            "youtu.be/<id>" forms are understood; extra query parameters
            such as "&t=30s" are ignored.

    Returns:
        str: The filename of the generated PDF.
    """
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(url)
    if parsed.hostname and parsed.hostname.endswith("youtu.be"):
        video_id = parsed.path.lstrip("/")
    else:
        video_id = parse_qs(parsed.query).get("v", [""])[0]
    if not video_id:
        # Fallback for anything that is not a recognisable URL, e.g. a
        # bare video id passed directly.
        video_id = url.split('v=')[-1]

    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    transcript_text = ' '.join(t['text'] for t in transcript)

    filename = f"{video_id}.txt"
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(transcript_text)

    return create_pdf(filename)
396
-
397
- #Initialize the collection
398
- def initialize_collection():
399
- """
400
- Initialize the knowledge base by searching and transcribing YouTube videos
401
- for a predefined set of queries, converting them to PDF, and adding them
402
- to the vector database.
403
-
404
- Returns:
405
- bool: True if the initialization is successful.
406
- """
407
- # Update queries if you want the assistant to have a different knowledge base and uncomment initialize_collection() after this function
408
-
409
- queries = [
410
- "Transfer Learning in Machine Learning",
411
- "Object Detection and Recognition in Computer Vision",
412
- "Sentiment Analysis in Natural Language Processing",
413
- "Generative Adversarial Networks (GANs) in Deep Learning",
414
- "Automatic Speech Recognition (ASR) Systems",
415
- "Reinforcement Learning Applications",
416
- "Image Segmentation Techniques in Computer Vision",
417
- "Text Summarization Methods in NLP",
418
- "Convolutional Neural Networks (CNNs) for Image Classification",
419
- "Speech Synthesis and Text-to-Speech (TTS) Systems",
420
- "Anomaly Detection in Machine Learning",
421
- "Facial Recognition Technology and Ethics",
422
- "Machine Translation and Language Models",
423
- "Recurrent Neural Networks (RNNs) for Sequence Data",
424
- "Speaker Diarization and Identification in Speech Processing",
425
- "Applications of Natural Language Understanding (NLU)",
426
- "Deep Reinforcement Learning for Game AI",
427
- "Semantic Segmentation in Computer Vision",
428
- "Dialogue Systems and Conversational AI",
429
- "Ethical Implications of AI in Healthcare",
430
- "Neural Machine Translation (NMT)",
431
- "Time Series Forecasting with Machine Learning",
432
- "Multi-modal Learning and Fusion",
433
- "Named Entity Recognition (NER) in NLP",
434
- "Human Pose Estimation in Computer Vision",
435
- "Language Generation Models",
436
- "Cognitive Robotics and AI Integration",
437
- "Visual Question Answering (VQA) Systems",
438
- "Privacy and Security in AI Applications",
439
- "Graph Neural Networks (GNNs) for Structured Data",
440
- "Introduction to Python programming",
441
- "Python data types and variables",
442
- "Control flow and loops in Python",
443
- "Functions and modules in Python",
444
- "File handling in Python",
445
- "Object-oriented programming (OOP) in Python",
446
- "Error handling and exceptions in Python",
447
- "Python libraries for data analysis (e.g., Pandas, NumPy)",
448
- "Web scraping with Python (e.g., using BeautifulSoup)",
449
- "Creating GUI applications in Python (e.g., using Tkinter)",
450
- "History of Formula 1 racing",
451
- "Formula 1 car specifications and regulations",
452
- "Famous Formula 1 drivers and their achievements",
453
- "Formula 1 circuits around the world",
454
- "How Formula 1 teams operate and strategize",
455
- "Technological innovations in Formula 1",
456
- "Role of aerodynamics in Formula 1 cars",
457
- "Formula 1 race formats (qualifying, practice sessions, race day)",
458
- "Evolution of safety measures in Formula 1",
459
- "Economic impact of Formula 1 on host countries",
460
- "Formula 1 engine specifications and development",
461
- "Famous rivalries in Formula 1 history",
462
- "Formula 1 team dynamics and hierarchy",
463
- "How Formula 1 impacts automotive technology",
464
- "The role of tire management in Formula 1 races",
465
- "Key differences between Formula 1 and other racing series",
466
- "The influence of sponsors in Formula 1",
467
- "Formula 1 rules and regulations changes over the years",
468
- "Notable controversies in Formula 1",
469
- "The future of Formula 1 racing"
470
- ]
471
- print(len(queries))
472
- for query in queries:
473
- print(query)
474
- transcript_file = search_and_transcribe_videos(query)
475
- print(transcript_file)
476
- time.sleep(5)
477
-
478
- pdf_filename = create_pdf(transcript_file)
479
- time.sleep(10)
480
-
481
- add_documents_to_db(pdf_filename)
482
-
483
- return True
484
-
485
- import tiktoken
486
-
487
def update_conversation_summary(summarized_conversation, new_interaction):
    """
    Append one question/answer pair to a running conversation summary.

    Args:
        summarized_conversation (str): The current summarized conversation.
        new_interaction (dict): A dictionary containing 'question' and 'answer' keys.

    Returns:
        str: The updated summary of the conversation.
    """
    question = new_interaction['question']
    answer = new_interaction['answer']
    return "{}\n- Q: {}\n A: {}".format(summarized_conversation, question, answer)
502
-
503
-
504
def is_long_task(task, max_tokens=1000):
    """
    Determine if a given task exceeds the specified token limit.

    Tokens are counted with the tiktoken encoding matching the LLM's
    model name; when no encoding is registered for that name, the
    "cl100k_base" encoding is used instead.

    Args:
        task (str): The task to check.
        max_tokens (int): The maximum number of tokens allowed.

    Returns:
        bool: True if the task exceeds the token limit, False otherwise.
    """
    # tiktoken looks encodings up by model *name*, not by the LLM object.
    model_name = getattr(llm, "model_name", None)

    encoding = None
    if isinstance(model_name, str):
        try:
            encoding = tiktoken.encoding_for_model(model_name)
        except KeyError:
            # Model name unknown to tiktoken.
            encoding = None
    if encoding is None:
        encoding = tiktoken.get_encoding("cl100k_base")

    return len(encoding.encode(task)) > max_tokens
519
-
520
def split_task(task):
    """
    Split a long task into smaller subtasks for easier processing.

    The LLM is asked to answer with a Python list of strings. The reply is
    parsed as a literal only; it is never executed as code. If no usable
    list can be read from the reply, the original task is returned as the
    single subtask.

    Args:
        task (str): The task to split.

    Returns:
        list of str: A list of subtasks.
    """
    import ast

    prompt = (
        "\nThe following task needs to be split into smaller subtasks:\n\n"
        f"{task}\n\n"
        "Please divide this task into 2-4 subtasks. Each subtask should be a complete, standalone task.\n"
        "Format your response as a Python list of strings, with each string being a subtask.\n"
    )

    response = llm.invoke(prompt)
    # Chat models return a message object; the reply text is in `.content`.
    raw = getattr(response, "content", response)
    reply = raw if isinstance(raw, str) else str(raw)

    # Models often wrap the list in prose or code fences: keep only the
    # outermost bracketed part.
    start, end = reply.find("["), reply.rfind("]")
    if start != -1 and end > start:
        try:
            parsed = ast.literal_eval(reply[start:end + 1])
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            subtasks = [str(item).strip() for item in parsed if str(item).strip()]
            if subtasks:
                return subtasks

    return [task]
543
-
544
def combine_results(results):
    """
    Combine the results from multiple subtasks into a single summary.

    Args:
        results (list of str): The results from subtasks.

    Returns:
        str: A concise summary of the combined results, as text.
    """
    sections = [
        f"Subtask {i} result:\n{result}\n\n"
        for i, result in enumerate(results, 1)
    ]
    combined = "Combined results from subtasks:\n\n" + "".join(sections)

    summary_prompt = (
        "\nPlease provide a concise summary of the following combined results:\n\n"
        f"{combined}\n\n"
        "Summarize the key points and overall conclusion.\n"
    )

    response = llm.invoke(summary_prompt)
    # Chat models return a message object; hand back its text so the
    # function returns a str as documented.
    content = getattr(response, "content", response)
    return content if isinstance(content, str) else str(content)
569
-
570
-
571
-
572
def process_user_input(user_input):
    """
    Route user input through the agent, splitting it first when it is long.

    Short inputs are passed to `run_agent` directly. Long inputs (as
    judged by `is_long_task`) are split with `split_task`, each subtask is
    run separately, and the results are merged with `combine_results`.

    Args:
        user_input (str): The user's input to process.

    Returns:
        str: The result after processing the user input.
    """
    # NOTE(review): run_agent is not defined in the visible part of this
    # file - confirm where it is provided.
    if not is_long_task(user_input):
        return run_agent(user_input)

    results = [run_agent(subtask) for subtask in split_task(user_input)]
    return combine_results(results)
593
-
594
- # Uncomment the line below if you want to re-initialize the collection or initialize it with different topics
595
- #initialize_collection()
596
-
597
def create_qa_chain():
    """
    Build a "stuff"-type RetrievalQA chain over the vector database.

    Returns:
        RetrievalQA: The question-answering chain instance.
    """
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_db.as_retriever(),
    )
608
-
609
def combine_summaries(summaries):
    """
    Join several summaries into one string, separated by single spaces.

    Args:
        summaries (list of str): The list of summaries to combine.

    Returns:
        str: The combined summary.
    """
    separator = " "
    return separator.join(summaries)
622
-
623
def split_text(text, max_length=1500):
    """
    Split a long text into smaller chunks, ensuring chunks do not exceed the specified length.

    Each chunk is cut after the last ". " found within the first
    `max_length` characters; when there is none, the chunk is cut at
    exactly `max_length` characters.

    Args:
        text (str): The text to split.
        max_length (int): The maximum length of each chunk. Must be at least 1.

    Returns:
        list of str: A list of text chunks.

    Raises:
        ValueError: If `max_length` is smaller than 1.
    """
    # A non-positive limit would produce empty chunks and never consume
    # the text, so reject it up front.
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    chunks = []
    while len(text) > max_length:
        chunk = text[:max_length]
        # Find the last complete sentence within the chunk
        last_period = chunk.rfind('. ')
        if last_period != -1:
            chunk = chunk[:last_period + 1]
        chunks.append(chunk)
        text = text[len(chunk):].lstrip()
    if text:
        chunks.append(text)
    return chunks
647
-
648
def process_large_text(transcript_text):
    """
    Summarize a large text in two passes.

    The text is cut into chunks of at most 1500 characters, every chunk is
    summarized on its own, and the joined chunk summaries are summarized
    once more to produce the final result.

    Args:
        transcript_text (str): The large text to process.

    Returns:
        str: The final summary of the large text.
    """
    partial_summaries = []
    for piece in split_text(transcript_text, max_length=1500):
        partial_summaries.append(text_summarize(piece))

    return text_summarize(combine_summaries(partial_summaries))
673
-
674
- # Initialize memory with k=5, so the memory object will store the most recent 5 messages or interactions in the conversation
675
- memory = ConversationBufferWindowMemory(k=5)
676
-
677
- # Define agent tools
678
@tool
def search_kb(query):
    """
    Search the knowledge base for relevant documents based on a query and return a response.

    Args:
        query (str): The search query.

    Returns:
        str: The result from the QA chain based on the retrieved documents.
    """
    # The QA chain performs the retrieval itself, so no separate lookup
    # against the vector database is needed here.
    qa_chain = create_qa_chain()
    llm_response = qa_chain.invoke({"query": query})
    return llm_response["result"]
696
-
697
@tool
def process_video(url):
    """
    Processes a YouTube video by downloading it, transcribing its audio,
    adding the transcript to the knowledge base, and summarizing it.

    Args:
        url (str): The URL of the YouTube video to process.

    Returns:
        str: The summary of the video.
    """
    video = download_video(url)
    transcript_text = video_to_text(video)

    # Clean the transcript text
    cleaned_text = clean_text(transcript_text)

    # Store the transcript so later knowledge-base searches can use it.
    documents = get_text_chunks(cleaned_text)
    if documents:
        vector_db.add_documents(documents)

    # Long transcripts are summarized chunk by chunk; short ones directly.
    if len(cleaned_text) > 15000:
        summary = process_large_text(cleaned_text)
    else:
        summary = text_summarize(cleaned_text)

    print(f"Added {len(documents)} chunks from YouTube video {url} to the collection.")
    return summary
726
-
727
-
728
@tool
def new_search(query):
    """
    Perform a new search on YouTube, transcribe videos, create a PDF from the transcript, and add the documents to the database.

    Args:
        query (str): The search query.

    Returns:
        str: The path to the created PDF file.
    """
    # Each step finishes before it returns, so the steps run back to back.
    transcript_file = search_and_transcribe_videos(query)
    pdf_file = create_pdf(transcript_file)
    add_documents_to_db(pdf_file)
    return pdf_file
747
-
748
@tool
def process_pdf(pdf):
    """
    Processes a PDF File by summarizing it,
    and adding it to the knowledge base.

    Args:
        pdf (str): The path to the PDF file to process.

    Returns:
        str: The summary of the PDF, or a short notice when the loader
        produced no documents.
    """
    loader = PyPDFLoader(pdf)
    docs = loader.load_and_split()
    if not docs:
        return "No readable text was found in the PDF."

    # Store the pages so later knowledge-base searches can use them.
    vector_db.add_documents(docs)

    chain = load_summarize_chain(llm, chain_type="map_reduce")
    result = chain.invoke({"input_documents": docs})
    return result["output_text"]
767
-
768
-
769
-
770
- # Define the agent tools
771
- tools = [
772
- Tool(
773
- name="Search KB",
774
- func=search_kb,
775
- description="useful for when you need to answer questions about Machine Learning, Computer Vision and Natural Language Processing. The input to this tool should be a complete english sentence.",
776
- ),
777
- Tool(
778
- name="Search YouTube",
779
- func=new_search,
780
- description="useful for when the user asks you a question outside of Machine Learning, Computer Vision and Natural Language Processing. You use it to find new information about a topic not in the knowledge base. The input to this tool should be a complete english sentence.",
781
- ),
782
- Tool(
783
- name="Process Video",
784
- func=process_video,
785
- description="Useful for when the user wants to summarize or ask questions about a specific YouTube video. The input to this tool should be a YouTube URL.",
786
- ),
787
- Tool(
788
- name="Process PDF",
789
- func=process_pdf,
790
- description="Useful for when the user wants to summarize or ask questions about a specific PDF file. The input to this tool should be a PDF file path.",
791
- )
792
- ]
793
-
794
-
795
-
796
- # Define the agent prompt
797
# ReAct prompt for the agent. {tools}, {tool_names}, {input} and
# {agent_scratchpad} are filled in at run time. Every "Action:" shown in the
# examples must be the exact name of a registered tool ("Search KB",
# "Search YouTube", "Process Video", "Process PDF"); otherwise the examples
# teach the model to request tools that do not exist.
prompt_template_string = """
You are an AI trained on Artificial Intelligence topics and Formula 1.


Answer the following questions as best you can, taking into account the context of the conversation.
You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action you should take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question


Example 1:
Question: What are dinosaurs?
Thought: I need to check the knowledge base for information on dinosaurs.
Action: Search KB
Action Input: What are dinosaurs?
Observation: I don't have information on dinosaurs based on the provided context about machine learning and artificial intelligence.
Thought: I need to find new information about dinosaurs.
Action: Search YouTube
Action Input: Dinosaurs
Observation: Found relevant information and updated the knowledge base.
Thought: Now I can find information in the updated knowledge base.
Action: Search KB
Action Input: What are dinosaurs?
Observation: [detailed information about dinosaurs]
Thought: I now know the final answer.
Final Answer: [final detailed answer about dinosaurs]

Example 2:
Question: Can you summarize this video? https://www.youtube.com/watch?v=dQw4w9WgXcQ
Thought: The question contains a YouTube link, so I need to process that video to get the summary.
Action: Process Video
Action Input: https://www.youtube.com/watch?v=dQw4w9WgXcQ
Observation: [summary of the video]
Thought: Now I can provide the summary of the video.
Final Answer: [summary of the video]

Example 3:
Question: Explain the content of this video https://www.youtube.com/watch?v=dQw4w9WgXcQ and how it relates to machine learning.
Thought: The question contains a YouTube link, so I need to process the video to get the summary.
Action: Process Video
Action Input: https://www.youtube.com/watch?v=dQw4w9WgXcQ
Observation: [summary of the video]
Thought: Now I can relate the content to machine learning.
Final Answer: [explanation of how the video content relates to machine learning]

Example 4:
Question: Who are you?
Thought: I should explain that I'm a chatbot and how I can help.
Final Answer: I am a chatbot that can answer questions about machine learning and other related topics.

Example 5:
Question: What is your name?
Thought: I don't know.
Final Answer: I don't know the answer for that.

Question: {input}
{agent_scratchpad}"""
869
-
870
- # Define the agent
871
- prompt = PromptTemplate.from_template(prompt_template_string)
872
-
873
-
874
- agent = create_react_agent(llm, tools, prompt)
875
- agent_executor = AgentExecutor(agent=agent, tools=tools,handle_parsing_errors=True)
876
-
877
-
878
-
879
- # Streamlit App Interface Design
880
def main():
    """
    Render the Streamlit interface.

    The sidebar offers two one-shot actions (summarize a YouTube video,
    summarize an uploaded PDF); the main pane shows the chat history and
    sends new questions to the agent. The conversation is kept in
    `st.session_state.messages`, including both user and assistant turns.
    """
    # Newer Streamlit releases provide st.rerun; older ones only have
    # st.experimental_rerun. Use whichever is available.
    rerun = getattr(st, "rerun", None) or st.experimental_rerun

    # Initialize session state
    if 'messages' not in st.session_state:
        st.session_state.messages = []
    if 'chat_history' not in st.session_state:
        st.session_state.chat_history = []
    if 'conversation_summary' not in st.session_state:
        st.session_state.conversation_summary = ""

    # Function to clear chat history
    def clear_chat():
        st.session_state.messages = []

    st.title("AI Knowledge Base & Chat")

    # Fixed description at the top
    st.markdown("""
**Welcome to the AI Knowledge Base & Chat App!** 🤖💬

This interactive application leverages a sophisticated AI model to provide in-depth information and insights across a diverse range of topics. Here’s what you can explore:

- **Artificial Intelligence and Machine Learning** 🌐
- **Computer Vision** 👁️
- **Python Programming** 🐍
- **Formula 1 Racing** 🏎️

With its extensive training on these topics, the AI is well-equipped to provide accurate, detailed, and relevant answers to your questions. Enjoy exploring a world of knowledge and get instant responses to your queries! 🎓✨
In addition to answering your questions, you can:

Upload a PDF File 📄: Submit a PDF document to have it automatically summarized, giving you a concise overview of its contents without having to read through the entire file.

Provide a YouTube URL 🎥: Enter a link to a YouTube video to receive a summary of its key points, allowing you to grasp the main ideas quickly.
""")

    # Layout for additional inputs and chat
    with st.sidebar:
        st.header("Additional Inputs")

        youtube_url = st.text_input("Enter YouTube URL:")
        if st.button("Process YouTube Video"):
            if not youtube_url.strip():
                st.warning("Please enter a YouTube URL first.")
            else:
                with st.spinner("Processing YouTube video..."):
                    summary = process_video(youtube_url)
                st.write(summary)
                st.session_state.messages.append({"role": "assistant", "content": f"I've processed the YouTube video. Here's a summary:\n\n{summary}"})
                rerun()

        uploaded_pdf = st.file_uploader("Upload a PDF file", type="pdf")
        if st.button("Process PDF"):
            if uploaded_pdf is None:
                st.warning("Please upload a PDF file first.")
            else:
                with st.spinner("Processing PDF..."):
                    texts = extract_text_from_pdf(uploaded_pdf)
                    pdf_summary = text_summarize(texts)
                st.write(pdf_summary)
                st.session_state.messages.append({"role": "assistant", "content": f"PDF processed and added to knowledge base. Here's a summary:\n\n{pdf_summary}"})
                rerun()

    st.header("Chat")

    # Display chat history
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    user_input = st.chat_input("Ask a question")

    # Button to clear chat
    if st.button('Clear Chat'):
        clear_chat()
        # The history above was already drawn during this run; rerun so
        # the cleared state is what the user sees.
        rerun()

    if user_input:
        # Store the user's turn so it is still shown after the next rerun.
        st.session_state.messages.append({"role": "user", "content": user_input})

        # Display user message
        with st.chat_message("user"):
            st.write(user_input)

        # Get AI response
        with st.chat_message("assistant"):
            response = agent_executor.invoke({"input": user_input})
            st.write(response['output'])
        st.session_state.messages.append({"role": "assistant", "content": response['output']})
966
- if __name__ == "__main__":
967
- main()
 
1
+ import os
2
+ import time
3
+ import streamlit as st
4
+ from youtube_transcript_api import YouTubeTranscriptApi
5
+ from youtube_search import YoutubeSearch
6
+ from fpdf import FPDF
7
+ from langchain_openai import ChatOpenAI
8
+ from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
9
+ from sentence_transformers import SentenceTransformer
10
+ from langchain.chains import RetrievalQA
11
+ from langchain.prompts import PromptTemplate
12
+ from langchain.memory import ConversationBufferWindowMemory
13
+ from langchain_community.vectorstores import Chroma
14
+ from langchain_core.documents import Document
15
+ from pypdf import PdfReader
16
+ from langchain_community.document_loaders import PyPDFLoader
17
+ from langchain.agents import initialize_agent, Tool
18
+ from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
19
+ from langchain.agents import Tool, AgentExecutor, create_react_agent, tool
20
+ from flask import Flask, request, jsonify
21
+ import sqlite3
22
+ import re
23
+ import textwrap
24
+ from langchain.chains.summarize import load_summarize_chain
25
+ from langchain_community.document_loaders import WebBaseLoader
26
+ from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain, StuffDocumentsChain
27
+ from langchain.chains.llm import LLMChain
28
+ import torch
29
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
30
+ import nltk
31
+ from nltk.tokenize import word_tokenize
32
+ import pytube
33
+ from moviepy.editor import *
34
+
35
+ # Download necessary resources
36
+ nltk.download('punkt')
37
+
38
+
39
+
40
+ # Initialize environment variables
41
+ from dotenv import load_dotenv
42
+ import traceback
43
+ import logging
44
+
45
+ load_dotenv()
46
+
47
# API credentials are read from the process environment.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
HUGGINGFACEHUB_API_TOKEN = os.getenv('HF_TOKEN')
YT_API_KEY = os.getenv('YT_API_KEY')

# NOTE(review): these are plain module variables, not os.environ entries.
# If they are meant to configure LangSmith tracing they presumably need to be
# exported to the environment instead - confirm.
LANGCHAIN_TRACING_V2='true'
LANGCHAIN_ENDPOINT="https://api.smith.langchain.com"
LANGCHAIN_API_KEY = os.getenv('LANGCHAIN_API_KEY')
LANGCHAIN_PROJECT="default"

# Download and initialize all required models
# `model` is the embedding function handed to the Chroma vector store.
model = SentenceTransformerEmbeddings(model_name='paraphrase-MiniLM-L6-v2')
# Seq2seq summarization checkpoint and its tokenizer, loaded at import time.
summarization_model_name = "suriya7/bart-finetuned-text-summarization"
summarization_model = AutoModelForSeq2SeqLM.from_pretrained(summarization_model_name)
summarization_tokenizer = AutoTokenizer.from_pretrained(summarization_model_name)
61
+
62
+
63
+ # Function to load the vector database
64
def load_vectordb():
    """
    Load the vector database from Chroma.

    Opens the on-disk Chroma store in the local "chromadb" directory and wraps
    its "knowledge_base" collection with the module-level embedding model.

    Returns:
        langchain_chroma (Chroma): The Chroma vector database.
    """
    # PersistentClient is provided by the chromadb package, not by LangChain's
    # Chroma wrapper. Imported locally because this module's import block does
    # not import chromadb.
    import chromadb

    persistent_client = chromadb.PersistentClient(path="chromadb")

    langchain_chroma = Chroma(
        client=persistent_client,
        collection_name="knowledge_base",
        embedding_function=model,
    )

    return langchain_chroma
80
+
81
+ vector_db = load_vectordb()
82
+
83
+ # Set up logging
84
+ logging.basicConfig(level=logging.INFO)
85
+ logger = logging.getLogger(__name__)
86
+
87
def safe_execute(func, *args, **kwargs):
    """
    Call ``func`` with the given arguments and never let an exception escape.

    Args:
        func (callable): The function to execute.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns, or an "An error occurred: ..." string when
        ``func`` raises.
    """
    try:
        outcome = func(*args, **kwargs)
    except Exception as e:
        message = str(e)
        # Log both the short message and the full traceback.
        logger.error(f"Error in {func.__name__}: {message}")
        logger.error(traceback.format_exc())
        return f"An error occurred: {message}"
    return outcome
105
+
106
+
107
+ # Initialize LLM
108
+ llm = ChatOpenAI(temperature=0.6, model_name="gpt-3.5-turbo-16k")
109
+
110
+
111
def count_tokens(text):
    """
    Return how many tokens NLTK's word tokenizer produces for ``text``.

    Args:
        text (str): The input text.

    Returns:
        int: The number of tokens in the text.
    """
    return len(word_tokenize(text))
123
+
124
def text_summarize(text):
    """
    Summarize ``text`` with a map-reduce chain.

    The text is split into chunks, each chunk is condensed by the LLM (map),
    and the per-chunk results are merged into one summary (reduce).

    Args:
        text (str): The input text to summarize.

    Returns:
        str: The summary of the input text.
    """
    map_template = """The following is a document:
{docs}
Based on this document, please identify the main themes and key points.
Helpful Answer:"""
    reduce_template = """The following is a set of summaries:
{docs}
Take these and distill them into a final, consolidated summary of the main themes and key points.
Helpful Answer:"""

    # Map stage: one LLM call per chunk
    per_chunk_chain = LLMChain(
        llm=llm,
        prompt=PromptTemplate.from_template(map_template),
    )

    # Reduce stage: stuff all partial summaries into a single prompt
    merge_chain = StuffDocumentsChain(
        llm_chain=LLMChain(
            llm=llm,
            prompt=PromptTemplate.from_template(reduce_template),
        ),
        document_variable_name="docs"
    )

    summarizer = MapReduceDocumentsChain(
        llm_chain=per_chunk_chain,
        reduce_documents_chain=merge_chain,
        document_variable_name="docs"
    )

    # Split the text into chunks and wrap each one as a Document
    splitter = CharacterTextSplitter(chunk_size=10000, chunk_overlap=200)
    documents = [Document(page_content=piece) for piece in splitter.split_text(text)]

    return summarizer.run(documents)
169
+
170
+
171
+ # Function to add documents to the database
172
def add_documents_to_db(pdf_file):
    """
    Extract, clean and chunk the text of a PDF, then store the chunks in the
    vector database in batches.

    Args:
        pdf_file (str): The path to the PDF file to process.

    Raises:
        Exception: Any error raised while processing is logged and re-raised.
    """
    try:
        documents = get_text_chunks(clean_text(extract_text_from_pdf(pdf_file)))

        if not documents:
            logger.warning(f"No documents found in {pdf_file}.")
            return

        batch_size = 10000
        total_documents = len(documents)

        # Insert in fixed-size batches, reporting progress after each one.
        for start in range(0, total_documents, batch_size):
            batch = documents[start:start + batch_size]
            vector_db.add_documents(batch)
            print(f"Processed {start + len(batch)} out of {total_documents} documents.")

        print("All documents added to the collection.")
    except Exception as e:
        logger.error(f"Error adding documents to database from {pdf_file}: {str(e)}")
        raise  # Re-raise the exception for visibility
206
+
207
+
208
def generate_valid_filename(query):
    """
    Turn an arbitrary string into a filename-safe one.

    Every character other than an ASCII letter, digit, hyphen or underscore
    is replaced by an underscore.

    Args:
        query (str): The input string to generate the filename from.

    Returns:
        str: The generated valid filename.
    """
    allowed = frozenset('-_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
    sanitized = []
    for ch in query:
        sanitized.append(ch if ch in allowed else '_')
    return ''.join(sanitized)
221
+
222
+ #################################################
223
+ ## NEW FUNCTIONS ##
224
+ #################################################
225
+ import whisper
226
+ import time
227
+ from pytube import YouTube
228
+
229
+
230
def download_video(url):
    """
    Download a YouTube video as an MP4 file.

    Args:
        url (str): The URL of the YouTube video.

    Returns:
        str: The default filename of the downloaded stream.

    Raises:
        ValueError: If no progressive MP4 stream is available for the video.
    """
    video = YouTube(url)
    # filter() yields a query over many streams; one concrete stream has to be
    # picked before download()/default_filename can be used. Progressive
    # streams are requested because video_to_text() needs the audio track.
    stream = video.streams.filter(progressive=True, file_extension='mp4').first()
    if stream is None:
        raise ValueError(f"No downloadable MP4 stream found for {url}")
    stream.download()
    return stream.default_filename
235
+
236
+
237
def video_to_text(filename):
    """
    Transcribe the audio track of a video file with Whisper.

    The audio is first written next to the video as an ".mp3" file, which is
    then passed to the Whisper "base" model.

    Args:
        filename (str): Path to the video file.

    Returns:
        str: The transcribed text.

    Raises:
        ValueError: If the video has no audio track.
    """
    # splitext copes with any extension length, unlike slicing off 4 chars.
    audio_filename = os.path.splitext(filename)[0] + ".mp3"

    clip = VideoFileClip(filename)
    try:
        if clip.audio is None:
            raise ValueError(f"{filename} has no audio track to transcribe")
        clip.audio.write_audiofile(audio_filename)
    finally:
        # Always release the file handle, even when extraction fails.
        clip.close()
    # NOTE(review): kept from the original; presumably lets the audio file
    # settle on disk before it is read - confirm whether it is still needed.
    time.sleep(5)

    # Named distinctly so the module-level embedding `model` is not shadowed.
    whisper_model = whisper.load_model("base")
    result = whisper_model.transcribe(audio_filename)

    return result["text"]
250
+
251
+
252
+ #################################################
253
+ # Function to search and transcribe YouTube videos
254
def search_and_transcribe_videos(query, max_results=20, min_valid_videos=4):
    """
    Search for YouTube videos and transcribe them.

    Transcripts are fetched until ``min_valid_videos`` have been collected or
    the search window is exhausted, then appended to a text file named after
    the query.

    Args:
        query (str): The search query for YouTube videos.
        max_results (int): The maximum number of results to fetch. Default is 20.
        min_valid_videos (int): The minimum number of valid videos to transcribe. Default is 4.

    Returns:
        str: The path to the transcript file.
    """
    transcripts = []
    seen_ids = set()
    # The window grows by max_results per round. The cap of 20 is raised to
    # max_results so that a larger request still performs one search.
    search_cap = max(20, max_results)
    current_max_results = max_results

    while len(transcripts) < min_valid_videos and current_max_results <= search_cap:
        results = YoutubeSearch(query, max_results=current_max_results).to_dict()
        for video in results:
            if video.get('liveBroadcastContent') == 'live':
                continue
            video_id = video['id']
            # A wider search returns the earlier hits again; skip them so the
            # same transcript is not stored twice.
            if video_id in seen_ids:
                continue
            seen_ids.add(video_id)
            try:
                transcription = YouTubeTranscriptApi.get_transcript(video_id, languages=['en', 'en-US'])
                transcript_text = " ".join([line['text'] for line in transcription])
            except Exception as e:
                # Videos without a usable English transcript are skipped.
                logger.info(f"Skipping video {video_id}: {e}")
                continue
            transcripts.append(transcript_text)

            if len(transcripts) >= min_valid_videos:
                break

        # max(1, ...) guarantees progress even for a non-positive max_results.
        current_max_results += max(1, max_results)

    transcript_file = generate_valid_filename(query) + '.txt'
    with open(transcript_file, 'a', encoding='utf-8') as f:
        for text in transcripts[:min_valid_videos]:
            f.write(f"Text:{text}\n\n")

    return transcript_file
294
+
295
+ # Function to create a PDF from a transcript
296
def create_pdf(input_file):
    """
    Create a PDF file from a transcript file.

    Args:
        input_file (str): The path to the transcript file.

    Returns:
        str: The path to the created PDF file (same name, ".pdf" extension).
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        text = f.read()

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font('Arial', size=12)
    # Characters outside latin-1 are replaced with "?" before rendering.
    pdf.multi_cell(0, 10, text.encode('latin-1', 'replace').decode('latin-1'))

    # Strip only a trailing ".txt"; split('.txt')[0] would also cut the name
    # at a ".txt" that appears earlier in the path.
    if input_file.endswith('.txt'):
        filename = input_file[:-len('.txt')]
    else:
        filename = input_file
    output_filename = f"{filename}.pdf"
    pdf.output(output_filename)
    return output_filename
316
+
317
+ # Function to extract text from a PDF
318
def extract_text_from_pdf(pdf_path):
    """
    Read a PDF and return the text of all its pages concatenated.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        str: The extracted text; pages yielding no text are skipped.
    """
    page_texts = [page.extract_text() for page in PdfReader(pdf_path).pages]
    return "".join(fragment for fragment in page_texts if fragment)
335
+
336
+ # Function to clean extracted text
337
def clean_text(text):
    """
    Normalize extracted text to plain ASCII.

    Non-breaking spaces become regular spaces, and each run of non-ASCII
    characters (with an optional trailing "!") collapses to one space.

    Args:
        text (str): The extracted text.

    Returns:
        str: The cleaned text.
    """
    without_nbsp = text.replace('\xa0', ' ')
    return re.sub(r'[^\x00-\x7F]+!?', ' ', without_nbsp)
351
+
352
+ # Function to split text into chunks
353
def get_text_chunks(text, chunk_size=1000, chunk_overlap=200):
    """
    Split the cleaned text into manageable chunks for further processing.

    Args:
        text (str): The cleaned text.
        chunk_size (int): The maximum size of each text chunk, in characters.
            Default is 1000.
        chunk_overlap (int): The number of characters shared by consecutive
            chunks. Default is 200.

    Returns:
        list of Document: List of Document objects containing text chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    chunks = text_splitter.split_text(text)
    return [Document(page_content=chunk) for chunk in chunks]
372
+
373
+
374
+
375
+ # Function to process YouTube videos
376
def load_video(url):
    """
    Retrieve the transcript of a YouTube video, save it to a text file,
    convert the text file to a PDF, and return the PDF filename.

    Args:
        url (str): The URL of the YouTube video.

    Returns:
        str: The filename of the generated PDF.
    """
    from urllib.parse import parse_qs, urlparse

    # Parse the URL so extra parameters ("&t=42s", "&list=...") do not end up
    # in the video id, which is also used as the output filename.
    parsed = urlparse(url)
    query_ids = parse_qs(parsed.query).get('v')
    if query_ids:
        video_id = query_ids[0]
    elif parsed.netloc.endswith('youtu.be'):
        # Short links carry the id in the path: https://youtu.be/<id>
        video_id = parsed.path.lstrip('/')
    else:
        # Fall back to the original behavior (e.g. a bare video id).
        video_id = url.split('v=')[-1]

    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    transcript_text = ' '.join([t['text'] for t in transcript])
    filename = f"{video_id}.txt"
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(transcript_text)
    pdf_filename = create_pdf(filename)
    return pdf_filename
395
+
396
+ #Initialize the collection
397
def initialize_collection():
    """
    Build the knowledge base from scratch.

    For each predefined topic: search YouTube and save the transcripts,
    convert the transcript file to PDF, and add the PDF's text to the vector
    database.

    Returns:
        bool: True if the initialization is successful.
    """
    # Update queries if you want the assistant to have a different knowledge base and uncomment initialize_collection() after this function

    ai_topics = [
        "Transfer Learning in Machine Learning",
        "Object Detection and Recognition in Computer Vision",
        "Sentiment Analysis in Natural Language Processing",
        "Generative Adversarial Networks (GANs) in Deep Learning",
        "Automatic Speech Recognition (ASR) Systems",
        "Reinforcement Learning Applications",
        "Image Segmentation Techniques in Computer Vision",
        "Text Summarization Methods in NLP",
        "Convolutional Neural Networks (CNNs) for Image Classification",
        "Speech Synthesis and Text-to-Speech (TTS) Systems",
        "Anomaly Detection in Machine Learning",
        "Facial Recognition Technology and Ethics",
        "Machine Translation and Language Models",
        "Recurrent Neural Networks (RNNs) for Sequence Data",
        "Speaker Diarization and Identification in Speech Processing",
        "Applications of Natural Language Understanding (NLU)",
        "Deep Reinforcement Learning for Game AI",
        "Semantic Segmentation in Computer Vision",
        "Dialogue Systems and Conversational AI",
        "Ethical Implications of AI in Healthcare",
        "Neural Machine Translation (NMT)",
        "Time Series Forecasting with Machine Learning",
        "Multi-modal Learning and Fusion",
        "Named Entity Recognition (NER) in NLP",
        "Human Pose Estimation in Computer Vision",
        "Language Generation Models",
        "Cognitive Robotics and AI Integration",
        "Visual Question Answering (VQA) Systems",
        "Privacy and Security in AI Applications",
        "Graph Neural Networks (GNNs) for Structured Data",
    ]
    python_topics = [
        "Introduction to Python programming",
        "Python data types and variables",
        "Control flow and loops in Python",
        "Functions and modules in Python",
        "File handling in Python",
        "Object-oriented programming (OOP) in Python",
        "Error handling and exceptions in Python",
        "Python libraries for data analysis (e.g., Pandas, NumPy)",
        "Web scraping with Python (e.g., using BeautifulSoup)",
        "Creating GUI applications in Python (e.g., using Tkinter)",
    ]
    formula1_topics = [
        "History of Formula 1 racing",
        "Formula 1 car specifications and regulations",
        "Famous Formula 1 drivers and their achievements",
        "Formula 1 circuits around the world",
        "How Formula 1 teams operate and strategize",
        "Technological innovations in Formula 1",
        "Role of aerodynamics in Formula 1 cars",
        "Formula 1 race formats (qualifying, practice sessions, race day)",
        "Evolution of safety measures in Formula 1",
        "Economic impact of Formula 1 on host countries",
        "Formula 1 engine specifications and development",
        "Famous rivalries in Formula 1 history",
        "Formula 1 team dynamics and hierarchy",
        "How Formula 1 impacts automotive technology",
        "The role of tire management in Formula 1 races",
        "Key differences between Formula 1 and other racing series",
        "The influence of sponsors in Formula 1",
        "Formula 1 rules and regulations changes over the years",
        "Notable controversies in Formula 1",
        "The future of Formula 1 racing",
    ]
    queries = ai_topics + python_topics + formula1_topics

    print(len(queries))
    for topic in queries:
        print(topic)
        transcript_path = search_and_transcribe_videos(topic)
        print(transcript_path)
        time.sleep(5)

        pdf_path = create_pdf(transcript_path)
        time.sleep(10)

        add_documents_to_db(pdf_path)

    return True
483
+
484
+ import tiktoken
485
+
486
def update_conversation_summary(summarized_conversation, new_interaction):
    """
    Append one question/answer pair to a running conversation summary.

    Args:
        summarized_conversation (str): The current summarized conversation.
        new_interaction (dict): A dictionary containing 'question' and 'answer' keys.

    Returns:
        str: The updated summary of the conversation.
    """
    question = new_interaction['question']
    answer = new_interaction['answer']
    return f"{summarized_conversation}\n- Q: {question}\n A: {answer}"
501
+
502
+
503
def is_long_task(task, max_tokens=1000):
    """
    Determine if a given task exceeds the specified token limit.

    Args:
        task (str): The task to check.
        max_tokens (int): The maximum number of tokens allowed.

    Returns:
        bool: True if the task exceeds the token limit, False otherwise.
    """
    # encoding_for_model() expects the model *name*, not the ChatOpenAI object.
    try:
        encoding = tiktoken.encoding_for_model(llm.model_name)
    except KeyError:
        # Model name unknown to tiktoken: fall back to a general-purpose encoding.
        encoding = tiktoken.get_encoding("cl100k_base")
    num_tokens = len(encoding.encode(task))
    return num_tokens > max_tokens
518
+
519
def split_task(task):
    """
    Split a long task into smaller subtasks for easier processing.

    The LLM is asked to answer with a Python list literal, which is parsed
    safely. If the answer cannot be parsed as a list of strings, the original
    task is returned as the only subtask.

    Args:
        task (str): The task to split.

    Returns:
        list of str: A list of subtasks.
    """
    import ast

    prompt = f"""
The following task needs to be split into smaller subtasks:

{task}

Please divide this task into 2-4 subtasks. Each subtask should be a complete, standalone task.
Format your response as a Python list of strings, with each string being a subtask.
"""

    response = llm.invoke(prompt)
    # Chat models return a message object; the text lives in `.content`.
    raw = getattr(response, "content", response)

    # literal_eval only accepts Python literals, so model output is never
    # executed as code (unlike eval).
    try:
        subtasks = ast.literal_eval(str(raw).strip())
    except (ValueError, SyntaxError):
        logger.warning("Could not parse subtasks from the LLM response; using the task unsplit.")
        return [task]

    if not isinstance(subtasks, (list, tuple)) or not all(isinstance(s, str) for s in subtasks):
        logger.warning("LLM response was not a list of strings; using the task unsplit.")
        return [task]
    return list(subtasks)
542
+
543
def combine_results(results):
    """
    Combine the results from multiple subtasks into a single summary.

    Args:
        results (list of str): The results from subtasks.

    Returns:
        str: A concise summary of the combined results.
    """
    combined = "Combined results from subtasks:\n\n" + "".join(
        f"Subtask {i} result:\n{result}\n\n" for i, result in enumerate(results, 1)
    )

    summary_prompt = f"""
Please provide a concise summary of the following combined results:

{combined}

Summarize the key points and overall conclusion.
"""

    response = llm.invoke(summary_prompt)
    # Chat models return a message object; hand back its text so the result
    # matches the documented str return type.
    return getattr(response, "content", response)
568
+
569
+
570
+
571
def process_user_input(user_input):
    """
    Answer a user request, splitting it into subtasks first when it is long.

    Short inputs go straight to the agent. Long inputs are split, each
    subtask is run separately, and the partial results are merged.

    Args:
        user_input (str): The user's input to process.

    Returns:
        str: The result after processing the user input.
    """
    # NOTE(review): run_agent is not defined in the part of the file visible
    # here - confirm it exists before relying on this function.
    if not is_long_task(user_input):
        return run_agent(user_input)

    partial_results = [run_agent(subtask) for subtask in split_task(user_input)]
    return combine_results(partial_results)
592
+
593
+ # Uncomment the line below if you want to re-initialize the collection or initialize it with different topics
594
+ #initialize_collection()
595
+
596
def create_qa_chain():
    """
    Build a "stuff" RetrievalQA chain over the vector database.

    Returns:
        RetrievalQA: The question-answering chain instance.
    """
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_db.as_retriever(),
    )
607
+
608
def combine_summaries(summaries):
    """
    Join several summaries into one string, separated by single spaces.

    Args:
        summaries (list of str): The list of summaries to combine.

    Returns:
        str: The combined summary.
    """
    separator = " "
    return separator.join(summaries)
621
+
622
def split_text(text, max_length=1500):
    """
    Cut a long text into pieces of at most ``max_length`` characters.

    Each piece ends at the last ". " inside its window when there is one, so
    pieces tend to end on a sentence boundary.

    Args:
        text (str): The text to split.
        max_length (int): The maximum length of each chunk.

    Returns:
        list of str: A list of text chunks.
    """
    pieces = []
    remaining = text
    while len(remaining) > max_length:
        window = remaining[:max_length]
        # Prefer to end on the last complete sentence within the window.
        cut = window.rfind('. ')
        piece = window if cut == -1 else window[:cut + 1]
        pieces.append(piece)
        remaining = remaining[len(piece):].lstrip()
    if remaining:
        pieces.append(remaining)
    return pieces
646
+
647
def process_large_text(transcript_text):
    """
    Summarize a large text in two passes.

    The text is cut into chunks, every chunk is summarized on its own, and
    the joined chunk summaries are summarized once more.

    Args:
        transcript_text (str): The large text to process.

    Returns:
        str: The final summary of the large text.
    """
    # First pass: one summary per chunk
    partial_summaries = []
    for piece in split_text(transcript_text, max_length=1500):
        partial_summaries.append(text_summarize(piece))

    # Second pass: summarize the joined partial summaries
    return text_summarize(combine_summaries(partial_summaries))
672
+
673
+ # Initialize memory with k=5, so the memory object will store the most recent 5 messages or interactions in the conversation
674
+ memory = ConversationBufferWindowMemory(k=5)
675
+
676
+ # Define agent tools
677
@tool
def search_kb(query):
    """
    Search the knowledge base for relevant documents based on a query and return a response.

    Args:
        query (str): The search query.

    Returns:
        str: The result from the QA chain based on the retrieved documents.
    """
    # The QA chain is built with its own retriever, so the separate retrieval
    # whose results were never used has been removed.
    qa_chain = create_qa_chain()
    llm_response = qa_chain({"query": query})
    return llm_response["result"]
695
+
696
@tool
def process_video(url):
    """
    Processes a YouTube video by downloading it, transcribing its audio,
    and summarizing the transcript.

    Args:
        url (str): The URL of the YouTube video to process.

    Returns:
        str: The summary of the video.
    """
    video = download_video(url)
    transcript_text = video_to_text(video)

    # Clean the transcript text
    cleaned_text = clean_text(transcript_text)

    # Generate a summary for the user. Long transcripts use the chunked
    # two-pass summarizer; its result was previously computed and thrown away.
    if len(cleaned_text) > 15000:
        summary = process_large_text(cleaned_text)
    else:
        summary = text_summarize(cleaned_text)

    # Nothing is written to the vector store here, so report only what was done.
    print(f"Generated a {len(summary)}-character summary for YouTube video {url}.")
    return summary
725
+
726
+
727
@tool
def new_search(query):
    """
    Perform a new search on YouTube, transcribe videos, create a PDF from the transcript, add documents to the database, and search the knowledge base.

    Args:
        query (str): The search query.

    Returns:
        str: The path to the created PDF file.
    """
    transcript_path = search_and_transcribe_videos(query)
    time.sleep(10)

    pdf_path = create_pdf(transcript_path)
    time.sleep(10)

    add_documents_to_db(pdf_path)
    time.sleep(5)

    # NOTE(review): the knowledge-base answer is discarded and only the PDF
    # path is returned - confirm that this is the intended observation.
    search_kb(query)
    return pdf_path
746
+
747
@tool
def process_pdf(pdf):
    """
    Processes a PDF File by summarizing it,
    and adding it to the knowledge base.

    Args:
        pdf (str): The path to the PDF file to process.

    Returns:
        str: The summary of the PDF.
    """
    # NOTE(review): only a summary is produced here; nothing in this function
    # writes to the knowledge base - confirm against the description above.
    pages = PyPDFLoader(pdf).load_and_split()
    summarizer = load_summarize_chain(llm, chain_type="map_reduce")
    return summarizer.run(pages)
766
+
767
+
768
+
769
+ # Define the agent tools
770
# Tools offered to the agent. Each `name` is the exact string the agent has to
# write after "Action:" in the prompt format; each `description` tells the LLM
# when to pick that tool and what input it expects.
# NOTE(review): every `func` below is already decorated with @tool, so it is
# wrapped a second time by Tool(...) - confirm this is intended.
tools = [
    # Answers from the existing vector store.
    Tool(
        name="Search KB",
        func=search_kb,
        description="useful for when you need to answer questions about Machine Learning, Computer Vision and Natural Language Processing. The input to this tool should be a complete english sentence.",
    ),
    # Fetches new YouTube transcripts and adds them to the vector store.
    Tool(
        name="Search YouTube",
        func=new_search,
        description="useful for when the user asks you a question outside of Machine Learning, Computer Vision and Natural Language Processing. You use it to find new information about a topic not in the knowledge base. The input to this tool should be a complete english sentence.",
    ),
    # Summarizes a single YouTube video given its URL.
    Tool(
        name="Process Video",
        func=process_video,
        description="Useful for when the user wants to summarize or ask questions about a specific YouTube video. The input to this tool should be a YouTube URL.",
    ),
    # Summarizes a single PDF given its path.
    Tool(
        name="Process PDF",
        func=process_pdf,
        description="Useful for when the user wants to summarize or ask questions about a specific PDF file. The input to this tool should be a PDF file path.",
    )
]
792
+
793
+
794
+
795
+ # Define the agent prompt
796
# ReAct prompt for the agent. Every "Action:" in the examples uses one of the
# registered tool names (Search KB, Search YouTube, Process Video, Process PDF),
# because the model copies the examples and an unregistered name is rejected
# as an invalid tool at run time.
prompt_template_string = """
You are an AI trained on Artificial Intelligence topics and Formula 1.


Answer the following questions as best you can, taking into account the context of the conversation.
You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action you should take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question


Example 1:
Question: What are dinosaurs?
Thought: I need to check the knowledge base for information on dinosaurs.
Action: Search KB
Action Input: What are dinosaurs?
Observation: I don't have information on dinosaurs based on the provided context about machine learning and artificial intelligence.
Thought: I need to find new information about dinosaurs.
Action: Search YouTube
Action Input: Dinosaurs
Observation: Found relevant information and updated the knowledge base.
Thought: Now I can find information in the updated knowledge base.
Action: Search KB
Action Input: What are dinosaurs?
Observation: [detailed information about dinosaurs]
Thought: I now know the final answer.
Final Answer: [final detailed answer about dinosaurs]

Example 2:
Question: Can you summarize this video? https://www.youtube.com/watch?v=dQw4w9WgXcQ
Thought: The question contains a YouTube link, so I need to process the video to get the summary.
Action: Process Video
Action Input: https://www.youtube.com/watch?v=dQw4w9WgXcQ
Observation: [summary of the video]
Thought: Now I can provide the summary of the video.
Final Answer: [summary of the video]

Example 3:
Question: Explain the content of this video https://www.youtube.com/watch?v=dQw4w9WgXcQ and how it relates to machine learning.
Thought: The question contains a YouTube link, so I need to process the video to get the summary.
Action: Process Video
Action Input: https://www.youtube.com/watch?v=dQw4w9WgXcQ
Observation: [summary of the video]
Thought: Now I can relate the content to machine learning.
Final Answer: [explanation of how the video content relates to machine learning]

Example 4:
Question: Who are you?
Thought: I should explain that I'm a chatbot and how I can help.
Final Answer: I am a chatbot that can answer questions about machine learning and other related topics.

Example 5:
Question: What is your name?
Thought: I don't know.
Final Answer: I don't know the answer for that.

Question: {input}
{agent_scratchpad}"""
868
+
869
+ # Define the agent
870
+ prompt = PromptTemplate.from_template(prompt_template_string)
871
+
872
+
873
+ agent = create_react_agent(llm, tools, prompt)
874
+ agent_executor = AgentExecutor(agent=agent, tools=tools,handle_parsing_errors=True)
875
+
876
+
877
+
878
+ # Streamlit App Interface Design
879
def main() -> None:
    """Render the Streamlit page: intro text, sidebar processors and the chat.

    The function is re-executed by Streamlit on every interaction, so all
    state that must survive a rerun lives in ``st.session_state``:

    * ``messages`` - list of ``{"role": ..., "content": ...}`` dicts that is
      replayed to draw the chat history.
    * ``chat_history`` / ``conversation_summary`` - initialised here; they are
      not read anywhere inside this function.

    The sidebar offers two one-shot actions (summarise a YouTube video,
    summarise an uploaded PDF) whose results are appended to the chat as
    assistant messages. Free-form questions typed into the chat box are sent
    to the module-level ``agent_executor``.
    """
    # Seed session state once; later reruns keep whatever is already stored.
    state_defaults = (
        ("messages", []),
        ("chat_history", []),
        ("conversation_summary", ""),
    )
    for key, default in state_defaults:
        if key not in st.session_state:
            st.session_state[key] = default

    def rerun():
        """Restart the script run so the page is redrawn from session state."""
        # st.experimental_rerun is deprecated and absent from recent Streamlit
        # releases; prefer st.rerun and fall back only on older versions.
        trigger = getattr(st, "rerun", None) or st.experimental_rerun
        trigger()

    def clear_chat():
        """Drop every message from the visible chat history."""
        st.session_state.messages = []

    st.title("AI Knowledge Base & Chat")

    # Fixed description at the top of the page.
    intro_lines = (
        "**Welcome to the AI Knowledge Base & Chat App!** 🤖💬",
        "",
        "This interactive application leverages a sophisticated AI model to "
        "provide in-depth information and insights across a diverse range of "
        "topics. Here’s what you can explore:",
        "",
        "- **Artificial Intelligence and Machine Learning** 🌐",
        "- **Computer Vision** 👁️",
        "- **Python Programming** 🐍",
        "- **Formula 1 Racing** 🏎️",
        "",
        "With its extensive training on these topics, the AI is well-equipped "
        "to provide accurate, detailed, and relevant answers to your "
        "questions. Enjoy exploring a world of knowledge and get instant "
        "responses to your queries! 🎓✨",
        "In addition to answering your questions, you can:",
        "",
        "Upload a PDF File 📄: Submit a PDF document to have it automatically "
        "summarized, giving you a concise overview of its contents without "
        "having to read through the entire file.",
        "",
        "Provide a YouTube URL 🎥: Enter a link to a YouTube video to receive "
        "a summary of its key points, allowing you to grasp the main ideas "
        "quickly.",
    )
    st.markdown("\n".join(intro_lines))

    # Sidebar: one-shot summarisation of a video or a PDF.
    with st.sidebar:
        st.header("Additional Inputs")

        youtube_url = st.text_input("Enter YouTube URL:")
        if st.button("Process YouTube Video"):
            if not youtube_url.strip():
                # Nothing to process; avoid calling the pipeline on "".
                st.warning("Please enter a YouTube URL first.")
            else:
                with st.spinner("Processing YouTube video..."):
                    summary = process_video(youtube_url)
                    st.session_state.messages.append({
                        "role": "assistant",
                        "content": f"I've processed the YouTube video. Here's a summary:\n\n{summary}",
                    })
                    # Redraw so the new message shows up in the chat area.
                    rerun()

        uploaded_pdf = st.file_uploader("Upload a PDF file", type="pdf")
        if st.button("Process PDF"):
            if uploaded_pdf is None:
                # file_uploader returns None until a file has been chosen.
                st.warning("Please upload a PDF file first.")
            else:
                with st.spinner("Processing PDF..."):
                    texts = extract_text_from_pdf(uploaded_pdf)
                    pdf_summary = text_summarize(texts)
                    st.session_state.messages.append({
                        "role": "assistant",
                        "content": f"PDF processed and added to knowledge base. Here's a summary:\n\n{pdf_summary}",
                    })
                    rerun()

    st.header("Chat")

    # Replay the stored conversation; the role picks the chat bubble style.
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    user_input = st.chat_input("Ask a question")

    # The history above was already drawn during this run, so a rerun is
    # needed for the cleared state to become visible immediately.
    if st.button('Clear Chat'):
        clear_chat()
        rerun()

    if user_input:
        # Persist the question so it is still shown after the next rerun.
        st.session_state.messages.append({"role": "user", "content": user_input})
        with st.chat_message("user"):
            st.write(user_input)

        # Ask the agent and store its answer alongside the question.
        with st.chat_message("assistant"):
            response = agent_executor.invoke({"input": user_input})
            answer = response['output']
            st.write(answer)
        st.session_state.messages.append({"role": "assistant", "content": answer})
964
+
965
# Script entry point: build the UI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()