Spaces:

PebinAPJ
/

ChatPdfs

Sleeping

App Files Files Community

PebinAPJ committed on Jan 26

Commit

13132c7

verified ·

1 Parent(s): c8e541d

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -37

app.py CHANGED Viewed

@@ -1,17 +1,19 @@
 import streamlit as st
 from dotenv import load_dotenv
 from PyPDF2 import PdfReader
-from langchain.text_splitter import CharacterTextSplitter  # Correct (from langchain)
-from langchain_huggingface import HuggingFaceEmbeddings  # Correct (from langchain_community)
-from langchain_community.vectorstores import FAISS  # Correct (from langchain_community)
-from langchain_community.chat_models import ChatOpenAI  # Correct (from langchain_community)
-from langchain.memory import ConversationBufferMemory  # Correct (from langchain)
-from langchain.chains import ConversationalRetrievalChain  # Correct (from langchain)
-from htmlTemplates import css, bot_template, user_template
-from langchain_community.llms import HuggingFaceHub  # Correct (from langchain_community)
-from langchain_huggingface import HuggingFaceEndpoint
 def get_pdf_text(pdf_docs):
     text = ""
     for pdf in pdf_docs:
@@ -21,6 +23,7 @@ def get_pdf_text(pdf_docs):
     return text
 def get_text_chunks(text):
     text_splitter = CharacterTextSplitter(
         separator="\n",
@@ -32,80 +35,79 @@ def get_text_chunks(text):
     return chunks
 def get_vectorstore(text_chunks):
-    embeddings = HuggingFaceEmbeddings(model_name="hkunlp/instructor-base")  # Using the lightweight instructor model
     vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
     return vectorstore
 def get_conversation_chain(vectorstore):
-    # Using a lightweight LLM, e.g., distilgpt2 or distilbart
-    llm = HuggingFaceEndpoint(
-        repo_id="distilgpt2",  # Low-end model (DistilGPT2)
-        temperature=0.7,       # Adjusted for balanced creativity
-        max_new_tokens=100,    # Limiting response length
-        top_p=0.9,             # Nucleus sampling
-        top_k=50,              # Limit token options during generation
-        repetition_penalty=1.0 # Default value to prevent repetition
-    )
-    memory = ConversationBufferMemory(
-        memory_key="chat_history", return_messages=True
-    )
     conversation_chain = ConversationalRetrievalChain.from_llm(
-        llm=llm,
         retriever=vectorstore.as_retriever(),
         memory=memory,
     )
     return conversation_chain
 def handle_userinput(user_question):
     response = st.session_state.conversation({'question': user_question})
     st.session_state.chat_history = response['chat_history']
     for i, message in enumerate(st.session_state.chat_history):
         if i % 2 == 0:
-            st.write(user_template.replace(
-                "{{MSG}}", message.content), unsafe_allow_html=True)
         else:
-            st.write(bot_template.replace(
-                "{{MSG}}", message.content), unsafe_allow_html=True)
 def main():
     load_dotenv()
     st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
-    st.write(css, unsafe_allow_html=True)
     if "conversation" not in st.session_state:
         st.session_state.conversation = None
     if "chat_history" not in st.session_state:
         st.session_state.chat_history = None
     st.header("Chat with multiple PDFs :books:")
     user_question = st.text_input("Ask a question about your documents:")
     if user_question:
         handle_userinput(user_question)
     with st.sidebar:
         st.subheader("Your documents")
         pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
         if st.button("Process"):
-            with st.spinner("Processing"):
-                # get pdf text
                 raw_text = get_pdf_text(pdf_docs)
-                # get the text chunks
                 text_chunks = get_text_chunks(raw_text)
-                # create vector store
                 vectorstore = get_vectorstore(text_chunks)
-                # create conversation chain
                 st.session_state.conversation = get_conversation_chain(vectorstore)

 import streamlit as st
 from dotenv import load_dotenv
 from PyPDF2 import PdfReader
+from langchain.text_splitter import CharacterTextSplitter
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_community.vectorstores import FAISS
+from langchain_community.chat_models import HuggingFacePipeline
+from langchain.memory import ConversationBufferMemory
+from langchain.chains import ConversationalRetrievalChain
+from transformers import pipeline  # Hugging Face pipeline for using T5 model
+import os
+# Access Hugging Face API token from Streamlit secrets
+# (read from the "huggingface" section, key "HF_TOKEN").
+# NOTE(review): hf_token is not referenced anywhere else in the visible code -- confirm it is still needed.
+hf_token = st.secrets["huggingface"]["HF_TOKEN"]
+# Function to get text from the PDF documents
 def get_pdf_text(pdf_docs):
     text = ""
     for pdf in pdf_docs:
     return text
+# Function to split the text into manageable chunks
 def get_text_chunks(text):
     text_splitter = CharacterTextSplitter(
         separator="\n",
     return chunks
+# Function to create vectorstore from the text chunks
 def get_vectorstore(text_chunks):
     """Embed the text chunks and index them in a FAISS vector store.

     Args:
         text_chunks: The pieces of text to embed and index.

     Returns:
         The FAISS vector store built from ``text_chunks``.
     """
     # Lightweight instructor model used to embed every chunk.
     embedder = HuggingFaceEmbeddings(model_name="hkunlp/instructor-base")
     return FAISS.from_texts(texts=text_chunks, embedding=embedder)
+# Function to create the conversation chain using a T5 model loaded through the Hugging Face transformers pipeline
 def get_conversation_chain(vectorstore):
     """Build a conversational retrieval chain backed by a T5 pipeline.

     Args:
         vectorstore: Vector store whose ``as_retriever()`` supplies the
             documents used to answer each question.

     Returns:
         A ``ConversationalRetrievalChain`` that stores the dialogue in a
         ``ConversationBufferMemory`` under the ``chat_history`` key.
     """
     # Imported locally so this function does not rely on the module-level
     # import list; the LangChain wrapper class lives in langchain_huggingface.
     from langchain_huggingface import HuggingFacePipeline

     # device=-1 runs the model on CPU; device=0 would select the first GPU,
     # which is not available on CPU-only hosts.
     t5_pipeline = pipeline(
         "text2text-generation",
         model="t5-small",
         tokenizer="t5-small",
         device=-1,
     )
     # from_llm() expects a LangChain language model, not a raw transformers
     # pipeline, so wrap the pipeline before handing it to the chain.
     llm = HuggingFacePipeline(pipeline=t5_pipeline)
     # Keep the running conversation so follow-up questions have context.
     memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
     return ConversationalRetrievalChain.from_llm(
         llm=llm,
         retriever=vectorstore.as_retriever(),
         memory=memory,
     )
+# Function to handle the user input
 def handle_userinput(user_question):
     """Send the question to the conversation chain and render the chat.

     If no conversation chain has been created yet, a warning is shown and
     nothing else happens.

     Args:
         user_question: The question typed by the user.
     """
     chain = st.session_state.conversation
     if chain is None:
         # No documents have been processed yet, so there is no chain to call.
         st.warning("Please upload and process your PDFs before asking a question.")
         return
     response = chain({'question': user_question})
     st.session_state.chat_history = response['chat_history']
     # Display the conversation (alternating user and bot messages,
     # starting with the user).
     for i, message in enumerate(st.session_state.chat_history):
         speaker = "You" if i % 2 == 0 else "Bot"
         # Rendered as plain Markdown: the text can originate from uploaded
         # PDFs or the model, so raw HTML rendering is deliberately not enabled.
         st.write(f"**{speaker}:** {message.content}")
 def main():
     """Run the Streamlit app: answer questions and process uploaded PDFs.

     The main area holds the question box; the sidebar holds the PDF
     uploader and the "Process" button that builds the conversation chain
     and stores it in ``st.session_state.conversation``.
     """
     load_dotenv()
     st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
     # Initialize session state for conversation
     if "conversation" not in st.session_state:
         st.session_state.conversation = None
     if "chat_history" not in st.session_state:
         st.session_state.chat_history = None
     # Title of the app
     st.header("Chat with multiple PDFs :books:")
     # User input for querying the documents
     user_question = st.text_input("Ask a question about your documents:")
     if user_question:
         handle_userinput(user_question)
     with st.sidebar:
         st.subheader("Your documents")
         # File uploader to upload PDFs
         pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
         if st.button("Process"):
             if not pdf_docs:
                 # Nothing uploaded: skip the pipeline instead of indexing nothing.
                 st.warning("Please upload at least one PDF before clicking 'Process'.")
             else:
                 with st.spinner("Processing..."):
                     # Extract text from PDFs
                     raw_text = get_pdf_text(pdf_docs)
                     # Split the text into chunks
                     text_chunks = get_text_chunks(raw_text)
                     if not text_chunks:
                         # No chunks means there is nothing to embed or index.
                         st.error("No extractable text was found in the uploaded PDFs.")
                     else:
                         # Create a vector store using the text chunks
                         vectorstore = get_vectorstore(text_chunks)
                         # Create the conversation chain using the T5 model
                         st.session_state.conversation = get_conversation_chain(vectorstore)