junkmind arnold-anand commited on
Commit
4032a9c
·
0 Parent(s):

Duplicate from arnold-anand/chat-with-pdf

Browse files

Co-authored-by: Arnold Anand P <[email protected]>

Files changed (6) hide show
  1. .env +1 -0
  2. .gitattributes +35 -0
  3. .gitignore +1 -0
  4. README.md +14 -0
  5. app.py +77 -0
  6. requirements.txt +8 -0
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ OPENAI_API_KEY=<REDACTED -- a live secret key was committed here; revoke/rotate it immediately and keep the real key only in an untracked .env>
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Chat With Pdf
3
+ emoji: 🚀
4
+ colorFrom: indigo
5
+ colorTo: purple
6
+ sdk: streamlit
7
+ sdk_version: 1.24.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: gpl-3.0
11
+ duplicated_from: arnold-anand/chat-with-pdf
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Dependencies: Streamlit UI, PDF parsing, and the LangChain RAG pipeline.
import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
# from langchain.llms import HuggingFaceHub  # optional: swap in a HF-hosted LLM
from streamlit_chat import message
13
def get_pdf_text(pdfs):
    """Concatenate the extracted text of every page in every uploaded PDF.

    Args:
        pdfs: iterable of file-like objects (e.g. Streamlit UploadedFile).

    Returns:
        str: all page text joined together; "" when *pdfs* is empty.
    """
    text = ""
    for pdf in pdfs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() returns None for pages with no extractable text
            # (e.g. scanned images); guard so `+=` never raises TypeError.
            text += page.extract_text() or ""
    return text
20
+
21
def get_text_chunks(text):
    """Split *text* into overlapping chunks suitable for embedding.

    Chunks are newline-separated, at most 1000 characters long, with a
    200-character overlap between consecutive chunks for context carry-over.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)
26
+
27
def get_vectorstore(text_chunks):
    """Embed *text_chunks* with OpenAI embeddings and index them in FAISS."""
    embeddings = OpenAIEmbeddings()
    # Local alternative (no API cost, much slower without a GPU):
    # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
32
+
33
def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain over *vectorstore*.

    Pairs an OpenAI chat model with a buffer memory so follow-up questions
    can reference earlier turns of the conversation.
    """
    # llm = HuggingFaceHub(repo_id="google/flan-t5-xxl")  # HF-hosted alternative
    chat_model = ChatOpenAI()
    history = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        retriever=vectorstore.as_retriever(),
        memory=history,
    )
44
def user_input(user_question):
    """Send *user_question* to the conversation chain and render the chat.

    Stores the updated history in st.session_state and draws every turn:
    even indices are user messages, odd indices are assistant replies
    (ConversationBufferMemory alternates human/AI messages).
    """
    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']
    for i, msg in enumerate(st.session_state.chat_history):
        # Explicit, unique keys prevent Streamlit DuplicateWidgetID errors
        # when two messages happen to have identical content.
        if i % 2 == 0:
            message(msg.content, is_user=True, key=f"user_{i}")
        else:
            message(msg.content, key=f"bot_{i}")
52
def main():
    """Streamlit entry point: page setup, question box, and sidebar uploader."""
    load_dotenv()  # pull OPENAI_API_KEY from .env into the environment
    st.set_page_config(page_title="Chat with PDF")

    # Initialise session state on first run so later reads never KeyError.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with PDF")
    user_question = st.text_input("Ask a question about your documents...")
    if user_question:
        if st.session_state.conversation is None:
            # Asking before any document has been processed would call None
            # and crash -- tell the user what to do instead.
            st.warning("Please upload and process a document first.")
        else:
            user_input(user_question)

    with st.sidebar:
        st.subheader("Your Documents")
        pdfs = st.file_uploader("Upload here", accept_multiple_files=True)
        if st.button("Process"):
            if not pdfs:
                # FAISS.from_texts raises on an empty chunk list; guard the
                # no-files case up front.
                st.warning("Please upload at least one PDF before processing.")
            else:
                with st.spinner("Processing"):
                    raw_text = get_pdf_text(pdfs)
                    chunks = get_text_chunks(raw_text)
                    vectorstore = get_vectorstore(chunks)
                    st.session_state.conversation = get_conversation_chain(vectorstore)
                    st.success("Processing Complete !")


if __name__ == '__main__':
    main()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ langchain==0.0.222
2
+ PyPDF2==3.0.1
3
+ python-dotenv==1.0.0
4
+ streamlit==1.24.0
5
+ streamlit_chat==0.1.1
6
+ openai==0.27.8
7
+ tiktoken==0.4.0
8
+ faiss-cpu