Spaces:

abdullahzunorain
/

Simple-RAG-App-Test

Runtime error

App Files Files Community

abdullahzunorain committed on Nov 2, 2024

Commit

c75f073

verified ·

1 Parent(s): 1982883

Update app.py

Browse files

Files changed (1) hide show

app.py +81 -170

app.py CHANGED Viewed

@@ -1,92 +1,3 @@
-# # Set your Groq API key here or use environment variable
-# GROQ_API_TOKEN = os.getenv("groq_api")
-# client = Groq(api_key=GROQ_API_TOKEN)
-import os
-import ffmpeg
-import whisper
-import streamlit as st
-from groq import Groq
-# Set the title and description of the app
-st.title("Audio/Video Transcription and Summarization")
-st.write("Upload your audio or video file, and this app will transcribe the audio and provide a summary of the transcription.")
-# Retrieve the API key from environment variables or Streamlit secrets
-GROQ_API_KEY = os.getenv("GROQ_API_KEY") or st.secrets["GROQ_API_KEY"]
-os.environ["GROQ_API_KEY"] = GROQ_API_KEY
-# Create a temporary directory if it does not exist
-temp_dir = "temp"
-os.makedirs(temp_dir, exist_ok=True)
-# Upload the audio or video file
-uploaded_file = st.file_uploader("Choose an audio or video file...", type=["mp4", "mov", "avi", "mkv", "wav", "mp3"])
-# Function to extract audio from video
-def extract_audio(video_path, audio_path="temp/temp_audio.wav"):
-    """Extracts audio from video."""
-    try:
-        # Run ffmpeg command with stderr capture for better error handling
-        ffmpeg.input(video_path).output(audio_path).run(overwrite_output=True, capture_stdout=True, capture_stderr=True)
-    except ffmpeg.Error as e:
-        st.error("FFmpeg error encountered: " + e.stderr.decode())
-    return audio_path
-# Function to transcribe audio to text using Whisper model
-def transcribe_audio(audio_path):
-    """Transcribes audio to text using Whisper model."""
-    model = whisper.load_model("base")  # Load the Whisper model
-    result = model.transcribe(audio_path)
-    return result["text"]
-# Function to summarize text using Groq API
-def summarize_text(text):
-    """Summarizes text using Groq API."""
-    client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
-    response = client.chat.completions.create(
-        messages=[{"role": "user", "content": f"Summarize the following text: {text}"}],
-        model="llama3-8b-8192"
-    )
-    summary = response.choices[0].message.content
-    return summary
-# Complete function to process audio or video
-def process_media(media_file):
-    """Processes audio or video: extracts audio, transcribes it, and summarizes the transcription."""
-    # Save the uploaded file to a temporary path
-    temp_file_path = os.path.join(temp_dir, media_file.name)
-    with open(temp_file_path, "wb") as f:
-        f.write(media_file.getbuffer())
-    # Determine if the file is a video or audio based on the file extension
-    if media_file.name.endswith(('.mp4', '.mov', '.avi', '.mkv')):
-        # Step 1: Extract audio from video
-        audio_path = extract_audio(temp_file_path)
-    else:
-        audio_path = temp_file_path  # If it's already audio, use it as is
-    # Step 2: Transcribe audio to text
-    transcription = transcribe_audio(audio_path)
-    st.write("### Transcription:")
-    st.write(transcription)
-    # Step 3: Summarize transcription
-    summary = summarize_text(transcription)
-    st.write("### Summary:")
-    st.write(summary)
-    # Clean up temporary files if needed
-    os.remove(temp_file_path)
-    if media_file.name.endswith(('.mp4', '.mov', '.avi', '.mkv')):
-        os.remove(audio_path)
-# Run the app
-if uploaded_file is not None:
-    process_media(uploaded_file)
-else:
-    st.warning("Please upload a file.")
@@ -96,85 +7,85 @@ else:
-# import os
-# import streamlit as st
-# from sentence_transformers import SentenceTransformer, util
-# from groq import Groq
-# from PyPDF2 import PdfReader
-# # Initialize the retriever and Groq client
-# retriever = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-# # client = Groq(api_key=groq_api)  # Replace with your actual Groq API key
-# key = os.getenv("groq_api")
-# client = Groq(api_key = key)
-# # Knowledge base (documents) and embeddings
-# documents = [
-#     "Retrieval-Augmented Generation (RAG) is an AI framework that combines the strengths of retrieval-based and generative models.",
-#     "The main components of a RAG system are the retriever and the generator.",
-#     "A key benefit of Retrieval-Augmented Generation is that it can produce more accurate responses compared to standalone generative models.",
-#     "The retrieval process in a RAG system often relies on embedding-based models, like Sentence-BERT or DPR.",
-#     "Common use cases of RAG include chatbots, customer support systems, and knowledge retrieval for business intelligence."
-# ]
-# document_embeddings = retriever.encode(documents, convert_to_tensor=True)
-# # Function to retrieve top relevant document and truncate context if too long
-# def retrieve(query, top_k=1, max_tokens=100):
-#     query_embedding = retriever.encode(query, convert_to_tensor=True)
-#     hits = util.semantic_search(query_embedding, document_embeddings, top_k=top_k)
-#     top_docs = [documents[hit['corpus_id']] for hit in hits[0]]
-#     # Truncate context to max_tokens if necessary
-#     context = top_docs[0] if hits[0] else ""
-#     context = ' '.join(context.split()[:max_tokens])  # Limit to max_tokens words
-#     return context
-# # Function to generate response using Groq
-# def generate_response(query, context):
-#     response = client.chat.completions.create(
-#         messages=[
-#             {
-#                 "role": "user",
-#                 "content": f"Context: {context} Question: {query} Answer:"
-#             }
-#         ],
-#         model="gemma2-9b-it"
-#     )
-#     return response.choices[0].message.content
-# # Function to handle PDF upload and text extraction
-# def extract_text_from_pdf(file):
-#     pdf_reader = PdfReader(file)
-#     text = ""
-#     for page in pdf_reader.pages:
-#         text += page.extract_text()
-#     return text
-# # Function to update knowledge base with new content from PDF
-# def update_knowledge_base(pdf_text):
-#     global documents, document_embeddings
-#     documents.append(pdf_text)
-#     document_embeddings = retriever.encode(documents, convert_to_tensor=True)
-# # Streamlit app layout
-# st.title("RAG-based Question Answering App")
-# st.write("Upload a PDF, ask questions based on its content, and get answers!")
-# # Upload PDF file
-# uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
-# if uploaded_file:
-#     pdf_text = extract_text_from_pdf(uploaded_file)
-#     update_knowledge_base(pdf_text)
-#     st.write("PDF content successfully added to the knowledge base.")
-# # Question input
-# question = st.text_input("Enter your question:")
-# if question:
-#     retrieved_context = retrieve(question)
-#     if retrieved_context:
-#         answer = generate_response(question, retrieved_context)
-#     else:
-#         answer = "I have no knowledge about this topic."
-#     st.write("Answer:", answer)

+import os
+import streamlit as st
+from sentence_transformers import SentenceTransformer, util
+from groq import Groq
+from PyPDF2 import PdfReader
+# Resolve the Groq API key first so a missing secret fails fast, before the
+# embedding model is loaded. The key is read from "groq_api"; the conventional
+# "GROQ_API_KEY" name is accepted as a fallback.
+key = os.getenv("groq_api") or os.getenv("GROQ_API_KEY")
+if not key:
+    # Show a readable message in the app instead of an opaque traceback from
+    # the client constructor when no key has been configured.
+    st.error("Groq API key not found. Set the 'groq_api' (or 'GROQ_API_KEY') secret.")
+    st.stop()
+client = Groq(api_key=key)
+# Sentence-embedding model used to embed both the documents and the queries
+retriever = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+# Knowledge base (documents) and embeddings
+documents = [
+    "Retrieval-Augmented Generation (RAG) is an AI framework that combines the strengths of retrieval-based and generative models.",
+    "The main components of a RAG system are the retriever and the generator.",
+    "A key benefit of Retrieval-Augmented Generation is that it can produce more accurate responses compared to standalone generative models.",
+    "The retrieval process in a RAG system often relies on embedding-based models, like Sentence-BERT or DPR.",
+    "Common use cases of RAG include chatbots, customer support systems, and knowledge retrieval for business intelligence."
+]
+# One embedding per entry of `documents`, kept in the same order
+document_embeddings = retriever.encode(documents, convert_to_tensor=True)
+# Function to retrieve the most relevant document(s) and truncate the context if too long
+def retrieve(query, top_k=1, max_tokens=100):
+    """Return knowledge-base text relevant to *query*, capped in length.
+
+    Args:
+        query: The user's question as a string.
+        top_k: Number of best-matching documents to include in the context.
+        max_tokens: Maximum number of whitespace-separated words to return
+            (a word count, not model tokens).
+
+    Returns:
+        The retrieved documents joined by spaces and truncated to
+        ``max_tokens`` words, or "" when nothing was retrieved.
+    """
+    query_embedding = retriever.encode(query, convert_to_tensor=True)
+    hits = util.semantic_search(query_embedding, document_embeddings, top_k=top_k)
+    # hits holds one result list per query; only a single query is passed here
+    top_docs = [documents[hit['corpus_id']] for hit in hits[0]]
+    # Use every retrieved document. Previously only the first one was used, so
+    # top_k > 1 had no effect. For top_k=1 the result is unchanged.
+    words = ' '.join(top_docs).split()
+    return ' '.join(words[:max_tokens])  # Limit to max_tokens words
+# Function to generate an answer with the Groq chat-completions API
+def generate_response(query, context):
+    """Send the context and question to the chat model and return its reply text.
+
+    Args:
+        query: The user's question.
+        context: Retrieved text placed ahead of the question in the prompt.
+
+    Returns:
+        The message content of the first completion choice.
+    """
+    prompt = f"Context: {context} Question: {query} Answer:"
+    completion = client.chat.completions.create(
+        model="gemma2-9b-it",
+        messages=[{"role": "user", "content": prompt}],
+    )
+    first_choice = completion.choices[0]
+    return first_choice.message.content
+# Function to handle PDF upload and text extraction
+def extract_text_from_pdf(file):
+    """Extract the text of every page of a PDF and return it as one string.
+
+    Args:
+        file: A path or binary file-like object accepted by ``PdfReader``
+            (the app passes the Streamlit upload object).
+
+    Returns:
+        The page texts joined with newlines. A page that yields no text
+        contributes an empty string.
+    """
+    pdf_reader = PdfReader(file)
+    # `or ""` guards against a page yielding None instead of a string.
+    page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
+    # Separate pages with a newline so the last word of one page does not fuse
+    # with the first word of the next (the text is later split on whitespace).
+    return "\n".join(page_texts)
+# Function to update knowledge base with new content from PDF
+def update_knowledge_base(pdf_text, chunk_size=100):
+    """Add PDF text to the knowledge base and refresh the embeddings.
+
+    The text is split into chunks of ``chunk_size`` words, each stored as its
+    own document. Storing the whole PDF as a single document meant only its
+    first words could ever be used, because ``retrieve`` truncates the context
+    it returns to 100 words by default.
+
+    Args:
+        pdf_text: Text extracted from the uploaded PDF.
+        chunk_size: Words per stored document. ``None`` or a non-positive
+            value stores the text as a single document (the old behavior).
+
+    Returns:
+        None. Updates the module-level ``documents`` and
+        ``document_embeddings``.
+    """
+    global documents, document_embeddings
+    words = pdf_text.split() if pdf_text else []
+    if not words:
+        # Nothing extractable (e.g. a scanned PDF): leave the knowledge base as is.
+        return
+    if chunk_size is None or chunk_size <= 0:
+        documents.append(pdf_text)
+    else:
+        documents.extend(
+            ' '.join(words[start:start + chunk_size])
+            for start in range(0, len(words), chunk_size)
+        )
+    # Re-encode everything so embeddings stay aligned with `documents` by index
+    document_embeddings = retriever.encode(documents, convert_to_tensor=True)
+# Page header
+st.title("RAG-based Question Answering App")
+st.write("Upload a PDF, ask questions based on its content, and get answers!")
+# Optional PDF upload: its text is added to the knowledge base before any question is answered
+uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
+if uploaded_file:
+    update_knowledge_base(extract_text_from_pdf(uploaded_file))
+    st.write("PDF content successfully added to the knowledge base.")
+# Question box: retrieve context, then either ask the model or report that nothing was found
+question = st.text_input("Enter your question:")
+if question:
+    retrieved_context = retrieve(question)
+    answer = (
+        generate_response(question, retrieved_context)
+        if retrieved_context
+        else "I have no knowledge about this topic."
+    )
+    st.write("Answer:", answer)