PebinAPJ committed on
Commit 464c6df · verified · 1 Parent(s): 419b9d5

Update app.py

Files changed (1)
  1. app.py +17 -25
app.py CHANGED
@@ -2,7 +2,7 @@ import streamlit as st
 from dotenv import load_dotenv
 from PyPDF2 import PdfReader
 from langchain.text_splitter import CharacterTextSplitter  # Correct (from langchain)
-from langchain_huggingface import HuggingFaceEmbeddings# Correct (from langchain_community)
+from langchain_huggingface import HuggingFaceEmbeddings  # Correct (from langchain_huggingface)
 from langchain_community.vectorstores import FAISS  # Correct (from langchain_community)
 from langchain_community.chat_models import ChatOpenAI  # Correct (from langchain_community)
 from langchain.memory import ConversationBufferMemory  # Correct (from langchain)
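
Note: HuggingFaceEmbeddings lives in the langchain_huggingface partner package, so the "(from langchain_community)" comment the old line carried was misleading. A minimal sanity check that the new import path and model load correctly, assuming the langchain-huggingface and sentence-transformers packages are installed (loaded this way, an INSTRUCTOR model behaves like a plain sentence-transformers model; instruction prefixes are not applied):

    # Sanity check for the updated import path (illustrative).
    from langchain_huggingface import HuggingFaceEmbeddings

    embeddings = HuggingFaceEmbeddings(model_name="hkunlp/instructor-base")
    vector = embeddings.embed_query("hello world")
    print(len(vector))  # dimensionality of a single query embedding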
@@ -12,14 +12,6 @@ from langchain_community.llms import HuggingFaceHub # Correct (from langchain_community)
 from langchain_huggingface import HuggingFaceEndpoint
 
 
-
-
-
-
-
-
-
-
 def get_pdf_text(pdf_docs):
     text = ""
     for pdf in pdf_docs:
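
This hunk only shows the first lines of get_pdf_text. For reference, a typical PyPDF2-based body consistent with them looks like the sketch below; the actual remainder of the function sits outside the hunk and may differ.

    # Sketch of a typical get_pdf_text body (the rest of the function is not shown in this diff).
    from PyPDF2 import PdfReader

    def get_pdf_text(pdf_docs):
        text = ""
        for pdf in pdf_docs:
            reader = PdfReader(pdf)
            for page in reader.pages:
                text += page.extract_text() or ""  # extract_text() can return None
        return text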
@@ -41,25 +33,27 @@ def get_text_chunks(text):
 
 
 def get_vectorstore(text_chunks):
-    #embeddings = OpenAIEmbeddings()
-    embeddings = HuggingFaceEmbeddings(model_name="hkunlp/instructor-xl")
+    embeddings = HuggingFaceEmbeddings(model_name="hkunlp/instructor-base")  # Using the lightweight instructor model
     vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
     return vectorstore
 
+
 def get_conversation_chain(vectorstore):
-    # Use HuggingFaceEndpoint with explicitly passed parameters
+    # Using a lightweight LLM, e.g., distilgpt2 or distilbart
     llm = HuggingFaceEndpoint(
-        repo_id="sshleifer/distilbart-cnn-12-6",
-        temperature=0.5,
-        max_new_tokens=128,  # Explicit parameter
-    )
+        repo_id="distilgpt2",  # Low-end model (DistilGPT2)
+        temperature=0.5,  # Randomness control
+        max_new_tokens=128,  # Maximum tokens for generation
+        top_p=0.95,  # Nucleus sampling
+        top_k=50,  # Limit token options during generation
+        repetition_penalty=1.2  # Prevent repetitive answers
+    )
+
 
-    # Initialize memory
     memory = ConversationBufferMemory(
         memory_key="chat_history", return_messages=True
     )
 
-    # Create Conversational Retrieval Chain
     conversation_chain = ConversationalRetrievalChain.from_llm(
         llm=llm,
         retriever=vectorstore.as_retriever(),
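
Two caveats on this hunk. First, distilgpt2 is a small base completion model with no instruction tuning, so answer quality in a retrieval QA chain will be limited; top_p, top_k and repetition_penalty can only soften that. Second, HuggingFaceEndpoint calls the hosted Hugging Face Inference API and needs a valid token. A quick smoke test of the new configuration before it is wired into the chain (illustrative; assumes HUGGINGFACEHUB_API_TOKEN is provided via the .env file that load_dotenv reads, or via the environment):

    # Smoke test for the endpoint configuration (illustrative prompt).
    import os
    from dotenv import load_dotenv
    from langchain_huggingface import HuggingFaceEndpoint

    load_dotenv()  # expects HUGGINGFACEHUB_API_TOKEN in .env or the environment
    assert os.getenv("HUGGINGFACEHUB_API_TOKEN"), "set HUGGINGFACEHUB_API_TOKEN first"

    llm = HuggingFaceEndpoint(
        repo_id="distilgpt2",
        temperature=0.5,
        max_new_tokens=128,
        top_p=0.95,
        top_k=50,
        repetition_penalty=1.2,
    )
    print(llm.invoke("The Eiffel Tower is located in"))  # plain completion, no chat format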
@@ -67,6 +61,7 @@ def get_conversation_chain(vectorstore):
     )
     return conversation_chain
 
+
 def handle_userinput(user_question):
     response = st.session_state.conversation({'question': user_question})
     st.session_state.chat_history = response['chat_history']
@@ -82,8 +77,7 @@ def handle_userinput(user_question):
 
 def main():
     load_dotenv()
-    st.set_page_config(page_title="Chat with multiple PDFs",
-                       page_icon=":books:")
+    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
     st.write(css, unsafe_allow_html=True)
 
     if "conversation" not in st.session_state:
@@ -98,8 +92,7 @@ def main():
 
     with st.sidebar:
         st.subheader("Your documents")
-        pdf_docs = st.file_uploader(
-            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
+        pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
         if st.button("Process"):
             with st.spinner("Processing"):
                 # get pdf text
@@ -112,9 +105,8 @@ def main():
                 vectorstore = get_vectorstore(text_chunks)
 
                 # create conversation chain
-                st.session_state.conversation = get_conversation_chain(
-                    vectorstore)
+                st.session_state.conversation = get_conversation_chain(vectorstore)
 
 
 if __name__ == '__main__':
     main()
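
Taken together, the updated pipeline can be exercised end to end outside Streamlit roughly as follows. This is a sketch under the same assumptions as above (packages installed, HUGGINGFACEHUB_API_TOKEN set); the sample chunks and question are illustrative, and the ConversationalRetrievalChain import path is the standard one, since this diff does not show that import line.

    # End-to-end sketch: embed sample chunks, build the chain, ask one question.
    from dotenv import load_dotenv
    from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
    from langchain_community.vectorstores import FAISS
    from langchain.memory import ConversationBufferMemory
    from langchain.chains import ConversationalRetrievalChain

    load_dotenv()

    chunks = [
        "FAISS stores dense vectors and supports fast similarity search.",
        "The app splits PDF text into chunks before embedding them.",
    ]

    embeddings = HuggingFaceEmbeddings(model_name="hkunlp/instructor-base")
    vectorstore = FAISS.from_texts(texts=chunks, embedding=embeddings)

    llm = HuggingFaceEndpoint(repo_id="distilgpt2", temperature=0.5, max_new_tokens=128)
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm, retriever=vectorstore.as_retriever(), memory=memory
    )

    response = chain({"question": "What does FAISS do?"})
    print(response["answer"])        # the model's reply
    print(response["chat_history"])  # accumulated Human/AI messages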
 