Sarveshj committed on
Commit
4dbedd6
·
1 Parent(s): 1373704

INITIAL_COMMIT

Browse files
Files changed (4) hide show
  1. ._requirements.txt +0 -0
  2. ._talk2doc_app.py +0 -0
  3. requirements.txt +22 -0
  4. talk2doc_app.py +136 -0
._requirements.txt ADDED
Binary file (4.1 kB). View file
 
._talk2doc_app.py ADDED
Binary file (4.1 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ langchain_openai
3
+ langchain_core
4
+ python-dotenv
5
+ streamlit
6
+ langchain_community
7
+ langserve
8
+ sse_starlette
9
+ bs4
10
+ pypdf
11
+ chromadb
12
+ faiss-cpu
13
+ groq
14
+ cassio
15
+ langchain-groq
16
+ langchainhub
17
+ sentence_transformers
18
+ PyPDF2
19
+ langchain-objectbox
20
+ pytesseract
21
+ pdf2image
22
+ pillow
talk2doc_app.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pytesseract
3
+ import streamlit as st
4
+ import time
5
+
6
+ from dotenv import load_dotenv
7
+ from langchain.chains import create_retrieval_chain
8
+ from langchain.chains.combine_documents import create_stuff_documents_chain
9
+ from langchain.embeddings.huggingface import HuggingFaceEmbeddings
10
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
11
+ from langchain.schema import Document
12
+ from langchain_community.vectorstores import FAISS
13
+ from langchain_core.prompts import ChatPromptTemplate
14
+ from langchain_groq import ChatGroq
15
+ from pdf2image import convert_from_path
16
+ from PyPDF2 import PdfReader
17
+ from io import BytesIO
18
+
19
# Load variables from a local .env file into the process environment.
load_dotenv()

# The Groq key is mandatory: a missing "groq" variable raises KeyError here.
groq_api_key = os.environ["groq"]

# LangSmith tracing is optional. os.environ only accepts str values, so
# assigning the result of os.getenv() directly raises TypeError when the key
# is unset; enable tracing only when a key is actually available.
langchain_api_key = os.getenv("LANGCHAIN_API_KEY")
if langchain_api_key:
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    os.environ["LANGCHAIN_API_KEY"] = langchain_api_key
24
+
25
# Page header plus the sidebar widgets that pick the Groq model and its
# sampling parameters.
st.title("Talk to your Documents!!!")
st.sidebar.title("Model fine-tuning")

model = st.sidebar.selectbox(
    "Select the model that you want to use",
    (
        'llama-3.1-8b-instant',
        'llama-3.1-70b-versatile',
        'llama3-70b-8192',
        'llama3-8b-8192',
        'mixtral-8x7b-32768',
        'gemma2-9b-it',
        'gemma-7b-it',
    ),
)
temprature = st.sidebar.slider(
    "Temperature", min_value=0., max_value=1., value=0.7
)
tokens = st.sidebar.slider(
    "Tokens", min_value=256, max_value=4096, value=1024
)

# Chat model configured from the sidebar selections above.
llm = ChatGroq(
    groq_api_key=groq_api_key,
    model_name=model,
    temperature=temprature,
    max_tokens=tokens,
)
34
# Prompt template for the question-answering chain. It has two placeholders:
# {context} and {input}. The context section is delimited by a matching
# <context> ... </context> pair so its end is unambiguous to the model.
prompt = ChatPromptTemplate.from_template(
    """
    Answer the questions based on the provided context only.
    Please provide the most accurate response based on the question.
    Be respectful and friendly, and you can use emojis too.
    You do not know anything out of context, and if the question
    is out of context simply say that you do not know.
    Do not provide output based on your general knowledge.
    The response provided must be more than 256 tokens.
    <context>
    {context}
    </context>
    Questions:{input}
    """
)
49
+
50
def ocr_pdf_page(file_path, page_number):
    """Return the OCR text of one page of the PDF at ``file_path``.

    Args:
        file_path: Path of the PDF handed to ``convert_from_path``.
        page_number: Page to render; used as both ``first_page`` and
            ``last_page`` so only that page is converted.

    Returns:
        The string produced by ``pytesseract.image_to_string`` for the
        rendered page, or ``""`` when no image was produced.
    """
    rendered = convert_from_path(
        file_path, first_page=page_number, last_page=page_number
    )
    if not rendered:
        return ""
    return pytesseract.image_to_string(rendered[0])
55
+
56
+
57
def _ocr_pdf_bytes(pdf_bytes, page_number):
    """OCR one page of an in-memory PDF; return "" if nothing was rendered."""
    # Imported lazily: only pages without extractable text need rasterising.
    from pdf2image import convert_from_bytes

    images = convert_from_bytes(
        pdf_bytes, first_page=page_number, last_page=page_number
    )
    return pytesseract.image_to_string(images[0]) if images else ""


def process_uploaded_pdfs(uploaded_files):
    """Turn uploaded PDF files into one ``Document`` per page.

    Text is taken from the PDF's text layer; pages whose extracted text is
    empty or whitespace-only fall back to OCR.

    Args:
        uploaded_files: Iterable of file-like objects exposing ``read()``
            and a ``name`` attribute.

    Returns:
        A list of ``Document`` objects whose metadata holds the upload's
        name (``"source"``) and the 1-based page number (``"page"``).
        An empty iterable yields an empty list.
    """
    documents = []
    for uploaded_file in uploaded_files:
        # Keep the raw bytes around: the upload lives only in memory, so the
        # OCR fallback must work from these bytes rather than from a path
        # built out of the upload's display name.
        pdf_bytes = uploaded_file.read()
        with BytesIO(pdf_bytes) as pdf_stream:
            pdf_reader = PdfReader(pdf_stream)
            for page_number, page in enumerate(pdf_reader.pages, start=1):
                text = page.extract_text() or ""
                if not text.strip():
                    text = _ocr_pdf_bytes(pdf_bytes, page_number)
                documents.append(
                    Document(
                        page_content=text,
                        metadata={
                            "source": uploaded_file.name,
                            "page": page_number,
                        },
                    )
                )
    return documents
68
+
69
+ # def process_uploaded_pdfs(uploaded_files):
70
+ # documents = []
71
+ # for uploaded_file in uploaded_files:
72
+ # with BytesIO(uploaded_file.read()) as pdf_stream:
73
+ # pdf_reader = PdfReader(pdf_stream)
74
+ # for i, page in enumerate(pdf_reader.pages):
75
+ # text = page.extract_text() or ""
76
+ # if not text.strip():
77
+ # text = ocr_pdf_page(uploaded_file.name, i + 1)
78
+ # documents.append({"page_content": text, "metadata": {"source": uploaded_file.name, "page": i + 1}})
79
+ # return documents
80
+
81
def vector_embedding_pdfs(uploaded_files):
    """Build a FAISS index from the uploaded PDFs and save it to disk.

    Intermediate artefacts (parsed pages, splitter, chunks, index) are kept
    in ``st.session_state``. The finished index is stored under both
    ``vectors`` and ``vector_db`` so it can be queried immediately, and is
    also written to ``./faissDsRagGroq`` under the index name ``index_hf``.

    Args:
        uploaded_files: Uploaded PDF files, passed to
            ``process_uploaded_pdfs``.

    Raises:
        ValueError: If the PDFs yield no text chunks to index.
    """
    state = st.session_state

    # Only the embedding model is cached; the index itself is rebuilt on
    # every call so newly uploaded files are always picked up.
    if "embeddings" not in state:
        state.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
        )

    state.pdf_docs = process_uploaded_pdfs(uploaded_files)
    state.text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=200
    )
    state.final_documents = state.text_splitter.split_documents(state.pdf_docs)

    # An empty chunk list cannot be indexed; fail with a clear message
    # instead of an opaque error from inside the vector store.
    if not state.final_documents:
        raise ValueError("No text could be extracted from the uploaded PDFs.")

    state.vectors = FAISS.from_documents(state.final_documents, state.embeddings)
    # The question-answering flow reads ``vector_db``; publish the fresh
    # index there as well so no reload from disk is needed.
    state.vector_db = state.vectors
    state.vectors.save_local("./faissDsRagGroq", "index_hf")
89
+
90
uploaded_files = st.file_uploader("Upload PDF documents", type="pdf", accept_multiple_files=True)

# Build and persist the index from the uploaded files. perf_counter() is
# used because the figure shown to the user is elapsed time, not CPU time.
if uploaded_files and st.button("Documents Embedding"):
    start = time.perf_counter()
    vector_embedding_pdfs(uploaded_files)
    end = time.perf_counter()
    st.write("Embedding completed!!!")
    st.write(f"Time taken for generating embeddings: {(end - start):.2f} seconds...")

# Reload a previously saved index from disk.
if st.button("Load vector db"):
    # save_local("./faissDsRagGroq", "index_hf") writes index_hf.faiss; check
    # for it up front so a missing index gives a readable message.
    if not os.path.exists(os.path.join("./faissDsRagGroq", "index_hf.faiss")):
        st.error('No saved vector index found. Run "Documents Embedding" first.')
    else:
        st.session_state.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1")
        start = time.perf_counter()
        # NOTE(security): allow_dangerous_deserialization=True unpickles the
        # stored docstore. Only load index folders this app created itself;
        # never point this at files from an untrusted source.
        st.session_state.vector_db = FAISS.load_local(
            "./faissDsRagGroq",
            embeddings=st.session_state.embeddings,
            index_name="index_hf",
            allow_dangerous_deserialization=True,
        )
        end = time.perf_counter()
        st.write("Embeddings Loaded!!!")
        st.write(f"Time taken for loading embeddings: {(end - start):.2f} seconds")
107
+
108
# Conversation log kept for the lifetime of the browser session.
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []

input_prompt = st.text_input("Enter Your Question From Documents")

if input_prompt:
    if "vector_db" not in st.session_state:
        # Without an index there is nothing to retrieve from; tell the user
        # instead of failing on a missing session-state attribute.
        st.warning("No vector index is loaded yet. Create or load the embeddings first.")
    else:
        document_chain = create_stuff_documents_chain(llm, prompt)
        retriever = st.session_state.vector_db.as_retriever()
        retrieval_chain = create_retrieval_chain(retriever, document_chain)

        # perf_counter() measures elapsed time, including the wait on the
        # remote model call; process_time() would count CPU time only.
        start = time.perf_counter()
        response = retrieval_chain.invoke({"input": input_prompt})
        response_time = time.perf_counter() - start

        st.session_state.chat_history.append({"question": input_prompt, "response": response['answer']})

        st.write(f"Response time: {response_time:.2f} seconds")
        st.write(response['answer'])

        # Show the chunks that were retrieved as context for this answer.
        with st.expander("Document Similarity Search"):
            for doc in response["context"]:
                st.write(doc.page_content)
                st.write("--------------------------------")

# Render the accumulated question/answer pairs, if any.
if st.session_state.chat_history:
    with st.expander("Chat History"):
        for chat in st.session_state.chat_history:
            st.write(f"**Question:** {chat['question']}")
            st.write(f"**Response:** {chat['response']}")
            st.write("---")