PebinAPJ committed on
Commit
38a268f
·
verified ·
1 Parent(s): 1be1eb7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -105
app.py CHANGED
@@ -1,106 +1,111 @@
1
- import streamlit as st
2
- from dotenv import load_dotenv
3
- from PyPDF2 import PdfReader
4
- from langchain.text_splitter import CharacterTextSplitter
5
- from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
6
- from langchain.vectorstores import FAISS
7
- from langchain.chat_models import ChatOpenAI
8
- from langchain.memory import ConversationBufferMemory
9
- from langchain.chains import ConversationalRetrievalChain
10
- from htmlTemplates import css, bot_template, user_template
11
- from langchain.llms import HuggingFaceHub
12
-
13
-
14
-
15
- def get_pdf_text(pdf_docs):
16
- text = ""
17
- for pdf in pdf_docs:
18
- pdf_reader = PdfReader(pdf)
19
- for page in pdf_reader.pages:
20
- text += page.extract_text()
21
- return text
22
-
23
-
24
- def get_text_chunks(text):
25
- text_splitter = CharacterTextSplitter(
26
- separator="\n",
27
- chunk_size=1000,
28
- chunk_overlap=200,
29
- length_function=len
30
- )
31
- chunks = text_splitter.split_text(text)
32
- return chunks
33
-
34
-
35
- def get_vectorstore(text_chunks):
36
- #embeddings = OpenAIEmbeddings()
37
- embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl",api_token="")
38
- vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
39
- return vectorstore
40
-
41
-
42
- def get_conversation_chain(vectorstore):
43
- # llm = ChatOpenAI()
44
- llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512},huggingfacehub_api_token="")
45
-
46
- memory = ConversationBufferMemory(
47
- memory_key='chat_history', return_messages=True)
48
- conversation_chain = ConversationalRetrievalChain.from_llm(
49
- llm=llm,
50
- retriever=vectorstore.as_retriever(),
51
- memory=memory
52
- )
53
- return conversation_chain
54
-
55
-
56
- def handle_userinput(user_question):
57
- response = st.session_state.conversation({'question': user_question})
58
- st.session_state.chat_history = response['chat_history']
59
-
60
- for i, message in enumerate(st.session_state.chat_history):
61
- if i % 2 == 0:
62
- st.write(user_template.replace(
63
- "{{MSG}}", message.content), unsafe_allow_html=True)
64
- else:
65
- st.write(bot_template.replace(
66
- "{{MSG}}", message.content), unsafe_allow_html=True)
67
-
68
-
69
- def main():
70
- load_dotenv()
71
- st.set_page_config(page_title="Chat with multiple PDFs",
72
- page_icon=":books:")
73
- st.write(css, unsafe_allow_html=True)
74
-
75
- if "conversation" not in st.session_state:
76
- st.session_state.conversation = None
77
- if "chat_history" not in st.session_state:
78
- st.session_state.chat_history = None
79
-
80
- st.header("Chat with multiple PDFs :books:")
81
- user_question = st.text_input("Ask a question about your documents:")
82
- if user_question:
83
- handle_userinput(user_question)
84
-
85
- with st.sidebar:
86
- st.subheader("Your documents")
87
- pdf_docs = st.file_uploader(
88
- "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
89
- if st.button("Process"):
90
- with st.spinner("Processing"):
91
- # get pdf text
92
- raw_text = get_pdf_text(pdf_docs)
93
-
94
- # get the text chunks
95
- text_chunks = get_text_chunks(raw_text)
96
-
97
- # create vector store
98
- vectorstore = get_vectorstore(text_chunks)
99
-
100
- # create conversation chain
101
- st.session_state.conversation = get_conversation_chain(
102
- vectorstore)
103
-
104
-
105
- if __name__ == '__main__':
 
 
 
 
 
106
  main()
 
1
+ import streamlit as st
2
+ from dotenv import load_dotenv
3
+ from PyPDF2 import PdfReader
4
+ from langchain.text_splitter import CharacterTextSplitter
5
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
6
+ from langchain.vectorstores import FAISS
7
+ from langchain.chat_models import ChatOpenAI
8
+ from langchain.memory import ConversationBufferMemory
9
+ from langchain.chains import ConversationalRetrievalChain
10
+ from htmlTemplates import css, bot_template, user_template
11
+ from langchain.llms import HuggingFaceHub
12
+ import os
13
+
14
# Load .env before reading the token.  main() also calls load_dotenv(),
# but this module-level lookup runs at import time -- before main()
# executes -- so without this call the token would be None whenever it is
# supplied via a .env file instead of the process environment.
load_dotenv()

# Hugging Face Hub API token; None if HUGGINGFACEHUB_API_TOKEN is defined
# neither in the environment nor in .env.
huggingface_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
16
+
17
+
18
+
19
+
20
def get_pdf_text(pdf_docs):
    """Extract and concatenate the text of every page of every uploaded PDF.

    Args:
        pdf_docs: iterable of file-like objects (Streamlit uploads) readable
            by PyPDF2's ``PdfReader``.

    Returns:
        A single string with the text of all pages, in upload order.
    """
    parts = []
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() may return None for pages with no extractable
            # text (e.g. scanned images); treat those as empty instead of
            # crashing on string concatenation.
            parts.append(page.extract_text() or "")
    # join instead of repeated += -- linear, not quadratic, for many pages
    return "".join(parts)
27
+
28
+
29
def get_text_chunks(text):
    """Split raw document text into overlapping chunks for embedding.

    Chunks are newline-separated, at most 1000 characters long, with a
    200-character overlap so context is preserved across chunk boundaries.

    Args:
        text: the full concatenated document text.

    Returns:
        A list of chunk strings.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)
38
+
39
+
40
def get_vectorstore(text_chunks):
    """Embed the text chunks and index them in an in-memory FAISS store.

    Args:
        text_chunks: list of text chunk strings to embed.

    Returns:
        A FAISS vector store built from ``text_chunks``.
    """
    # embeddings = OpenAIEmbeddings()  # hosted alternative (needs OPENAI_API_KEY)
    # NOTE(review): HuggingFaceInstructEmbeddings runs the model locally and
    # its constructor accepts no `api_token` argument (the pydantic model
    # rejects unknown fields), so the previously passed token is dropped.
    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    return vectorstore
45
+
46
+
47
def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain over the given vector store.

    Pairs the Hub-hosted flan-t5-xxl model with a buffer memory so that
    follow-up questions carry the chat history as context.

    Args:
        vectorstore: a vector store exposing ``as_retriever()``.

    Returns:
        A ``ConversationalRetrievalChain`` ready to answer questions.
    """
    # llm = ChatOpenAI()  # hosted alternative backend
    llm = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs={"temperature": 0.5, "max_length": 512},
        huggingfacehub_api_token=huggingface_token,
    )
    chat_memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=chat_memory,
    )
59
+
60
+
61
def handle_userinput(user_question):
    """Run the user's question through the conversation chain and render
    the full chat history.

    Messages alternate user/bot: even indices are user turns (rendered with
    ``user_template``), odd indices are bot turns (``bot_template``).

    Args:
        user_question: the question typed into the text input.
    """
    # Guard: the chain only exists after documents have been processed;
    # without this, asking a question first raises TypeError on None().
    if st.session_state.conversation is None:
        st.warning("Please upload and process your documents first.")
        return

    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    for i, message in enumerate(st.session_state.chat_history):
        template = user_template if i % 2 == 0 else bot_template
        st.write(template.replace("{{MSG}}", message.content),
                 unsafe_allow_html=True)
72
+
73
+
74
def main():
    """Streamlit entry point: page setup, chat input, and sidebar upload flow."""
    load_dotenv()
    st.set_page_config(page_title="Chat with multiple PDFs",
                       page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    # Make sure the session keys exist before any widget reads them.
    for key in ("conversation", "chat_history"):
        if key not in st.session_state:
            st.session_state[key] = None

    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
        if st.button("Process"):
            with st.spinner("Processing"):
                # Pipeline: PDF bytes -> raw text -> chunks -> vectors -> chain
                raw_text = get_pdf_text(pdf_docs)
                text_chunks = get_text_chunks(raw_text)
                vectorstore = get_vectorstore(text_chunks)
                st.session_state.conversation = get_conversation_chain(vectorstore)


if __name__ == '__main__':
    main()