PebinAPJ committed on
Commit
13132c7
·
verified ·
1 Parent(s): c8e541d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -37
app.py CHANGED
@@ -1,17 +1,19 @@
1
  import streamlit as st
2
  from dotenv import load_dotenv
3
  from PyPDF2 import PdfReader
4
- from langchain.text_splitter import CharacterTextSplitter # Correct (from langchain)
5
- from langchain_huggingface import HuggingFaceEmbeddings # Correct (from langchain_community)
6
- from langchain_community.vectorstores import FAISS # Correct (from langchain_community)
7
- from langchain_community.chat_models import ChatOpenAI # Correct (from langchain_community)
8
- from langchain.memory import ConversationBufferMemory # Correct (from langchain)
9
- from langchain.chains import ConversationalRetrievalChain # Correct (from langchain)
10
- from htmlTemplates import css, bot_template, user_template
11
- from langchain_community.llms import HuggingFaceHub # Correct (from langchain_community)
12
- from langchain_huggingface import HuggingFaceEndpoint
13
-
14
-
 
 
15
  def get_pdf_text(pdf_docs):
16
  text = ""
17
  for pdf in pdf_docs:
@@ -21,6 +23,7 @@ def get_pdf_text(pdf_docs):
21
  return text
22
 
23
 
 
24
  def get_text_chunks(text):
25
  text_splitter = CharacterTextSplitter(
26
  separator="\n",
@@ -32,80 +35,79 @@ def get_text_chunks(text):
32
  return chunks
33
 
34
 
 
35
  def get_vectorstore(text_chunks):
36
- embeddings = HuggingFaceEmbeddings(model_name="hkunlp/instructor-base") # Using the lightweight instructor model
37
  vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
38
  return vectorstore
39
 
40
 
 
41
  def get_conversation_chain(vectorstore):
42
- # Using a lightweight LLM, e.g., distilgpt2 or distilbart
43
- llm = HuggingFaceEndpoint(
44
- repo_id="distilgpt2", # Low-end model (DistilGPT2)
45
- temperature=0.7, # Adjusted for balanced creativity
46
- max_new_tokens=100, # Limiting response length
47
- top_p=0.9, # Nucleus sampling
48
- top_k=50, # Limit token options during generation
49
- repetition_penalty=1.0 # Default value to prevent repetition
50
- )
51
-
52
 
53
-
54
- memory = ConversationBufferMemory(
55
- memory_key="chat_history", return_messages=True
56
- )
57
 
 
58
  conversation_chain = ConversationalRetrievalChain.from_llm(
59
- llm=llm,
60
  retriever=vectorstore.as_retriever(),
61
  memory=memory,
62
  )
63
  return conversation_chain
64
 
65
 
 
66
  def handle_userinput(user_question):
67
  response = st.session_state.conversation({'question': user_question})
68
  st.session_state.chat_history = response['chat_history']
69
 
 
70
  for i, message in enumerate(st.session_state.chat_history):
71
  if i % 2 == 0:
72
- st.write(user_template.replace(
73
- "{{MSG}}", message.content), unsafe_allow_html=True)
74
  else:
75
- st.write(bot_template.replace(
76
- "{{MSG}}", message.content), unsafe_allow_html=True)
77
 
78
 
79
  def main():
80
  load_dotenv()
81
  st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
82
- st.write(css, unsafe_allow_html=True)
83
 
 
84
  if "conversation" not in st.session_state:
85
  st.session_state.conversation = None
86
  if "chat_history" not in st.session_state:
87
  st.session_state.chat_history = None
88
 
 
89
  st.header("Chat with multiple PDFs :books:")
 
 
90
  user_question = st.text_input("Ask a question about your documents:")
91
  if user_question:
92
  handle_userinput(user_question)
93
 
94
  with st.sidebar:
95
  st.subheader("Your documents")
 
 
96
  pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
 
97
  if st.button("Process"):
98
- with st.spinner("Processing"):
99
- # get pdf text
100
  raw_text = get_pdf_text(pdf_docs)
101
 
102
- # get the text chunks
103
  text_chunks = get_text_chunks(raw_text)
104
 
105
- # create vector store
106
  vectorstore = get_vectorstore(text_chunks)
107
 
108
- # create conversation chain
109
  st.session_state.conversation = get_conversation_chain(vectorstore)
110
 
111
 
 
import os

import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# FIX: HuggingFacePipeline lives in langchain_community.llms, not
# langchain_community.chat_models — the original import raises ImportError.
from langchain_community.llms import HuggingFacePipeline
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from transformers import pipeline  # Hugging Face pipeline for using T5 model

# Access Hugging Face API token from Streamlit secrets
# (raises KeyError if [huggingface] HF_TOKEN is absent from secrets.toml).
hf_token = st.secrets["huggingface"]["HF_TOKEN"]
+ # Function to get text from the PDF documents
17
  def get_pdf_text(pdf_docs):
18
  text = ""
19
  for pdf in pdf_docs:
 
23
  return text
24
 
25
 
26
+ # Function to split the text into manageable chunks
27
  def get_text_chunks(text):
28
  text_splitter = CharacterTextSplitter(
29
  separator="\n",
 
35
  return chunks
36
 
37
 
38
+ # Function to create vectorstore from the text chunks
39
def get_vectorstore(text_chunks):
    """Build a FAISS vector store from the given text chunks.

    Each chunk is embedded with the lightweight hkunlp/instructor-base
    model and indexed for similarity search.
    """
    embedder = HuggingFaceEmbeddings(model_name="hkunlp/instructor-base")
    return FAISS.from_texts(texts=text_chunks, embedding=embedder)
43
 
44
 
45
+ # Function to create the conversation chain using T5 from Hugging Face API
46
def get_conversation_chain(vectorstore):
    """Create a ConversationalRetrievalChain backed by a local T5 model.

    Args:
        vectorstore: A FAISS vector store whose retriever supplies
            relevant document chunks to the chain.

    Returns:
        A ConversationalRetrievalChain wired to a t5-small LLM and a
        conversation-buffer memory keyed on "chat_history".
    """
    # Load T5 via the transformers pipeline. device=-1 runs on CPU; the
    # original passed device=0, which requests GPU 0 and fails on
    # CPU-only hosts even though its comment claimed CPU execution.
    t5_pipeline = pipeline(
        "text2text-generation",
        model="t5-small",
        tokenizer="t5-small",
        device=-1,
    )

    # FIX: ConversationalRetrievalChain.from_llm expects a LangChain LLM,
    # not a raw transformers pipeline — wrap it in HuggingFacePipeline
    # (imported at the top of the file).
    llm = HuggingFacePipeline(pipeline=t5_pipeline)

    # Track the running dialogue so follow-up questions have context.
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )
    return conversation_chain
60
 
61
 
62
+ # Function to handle the user input
63
def handle_userinput(user_question):
    """Run the user's question through the conversation chain and render the chat."""
    result = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = result['chat_history']

    # Messages alternate: even indices are the user, odd indices are the bot.
    for idx, msg in enumerate(st.session_state.chat_history):
        speaker = "You" if idx % 2 == 0 else "Bot"
        st.write(f"**{speaker}:** {msg.content}", unsafe_allow_html=True)
 
73
 
74
 
75
def main():
    """Streamlit entry point: page setup, question box, and PDF-processing sidebar."""
    load_dotenv()
    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")

    # Make sure both session keys exist before any handler reads them.
    for key in ("conversation", "chat_history"):
        if key not in st.session_state:
            st.session_state[key] = None

    st.header("Chat with multiple PDFs :books:")

    # Free-text question against the already-processed documents.
    question = st.text_input("Ask a question about your documents:")
    if question:
        handle_userinput(question)

    with st.sidebar:
        st.subheader("Your documents")

        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'",
            accept_multiple_files=True,
        )

        if st.button("Process"):
            with st.spinner("Processing..."):
                raw_text = get_pdf_text(pdf_docs)            # extract text from PDFs
                text_chunks = get_text_chunks(raw_text)      # split into chunks
                vectorstore = get_vectorstore(text_chunks)   # embed + index
                st.session_state.conversation = get_conversation_chain(vectorstore)
112
 
113