ashok2216 committed on
Commit
48010b4
·
verified ·
1 Parent(s): b3e0053

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -119
app.py CHANGED
@@ -1,27 +1,12 @@
1
  import chromadb
2
  from chromadb.utils import embedding_functions
 
3
  from sentence_transformers import SentenceTransformer
4
  from transformers import pipeline
5
  import streamlit as st
6
  import fitz # PyMuPDF for PDF parsing
7
 
8
- # # Step 1: Setup ChromaDB
9
- # def setup_chromadb():
10
- # # Initialize ChromaDB in-memory instance
11
- # client = chromadb.Client()
12
- # try:
13
- # client.delete_collection("pdf_data")
14
- # print("Existing collection 'pdf_data' deleted.")
15
- # except:
16
- # print("Collection 'pdf_data' not found, creating a new one.")
17
- # # Create a new collection with the embedding function
18
- # ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="sentence-transformers/all-MiniLM-L6-v2")
19
- # collection = client.create_collection("pdf_data", embedding_function=ef)
20
- # return client, collection
21
-
22
 
23
- # import chromadb
24
- from chromadb.config import Settings
25
 
26
  # Configure ChromaDB with persistent SQLite database
27
  config = Settings(
@@ -29,9 +14,6 @@ config = Settings(
29
  chroma_db_impl="sqlite",
30
  )
31
 
32
-
33
- import chromadb
34
-
35
  # Initialize persistent client with SQLite
36
  def setup_chromadb():
37
  client = chromadb.PersistentClient(path="./chromadb_data")
@@ -43,36 +25,6 @@ def setup_chromadb():
43
  )
44
  return client, collection
45
 
46
-
47
- # Initialize ChromaDB client
48
- # def setup_chromadb():
49
- # try:
50
- # client = chromadb.Client(config)
51
- # collections = client.list_collections()
52
- # print(f"Existing collections: {collections}")
53
- # if "pdf_data" in [c.name for c in collections]:
54
- # client.delete_collection("pdf_data")
55
- # print("Existing collection 'pdf_data' deleted.")
56
- # collection = client.create_collection(
57
- # "pdf_data",
58
- # embedding_function=chromadb.utils.embedding_functions.SentenceTransformerEmbeddingFunction(
59
- # model_name="sentence-transformers/all-MiniLM-L6-v2"
60
- # ),
61
- # )
62
- # return client, collection
63
- # except Exception as e:
64
- # print("Error setting up ChromaDB:", e)
65
- # raise e
66
-
67
-
68
- # Step 2: Extract Text from PDF
69
- # def extract_text_from_pdf(pdf_path):
70
- # pdf_text = ""
71
- # with fitz.open(pdf_path) as doc:
72
- # for page in doc:
73
- # pdf_text += page.get_text()
74
- # return pdf_text
75
-
76
  def extract_text_from_pdf(uploaded_file):
77
  with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
78
  text = ""
@@ -80,7 +32,6 @@ def extract_text_from_pdf(uploaded_file):
80
  text += page.get_text()
81
  return text
82
 
83
- # Step 3: Add Extracted Text to Vector Database
84
  def add_pdf_text_to_db(collection, pdf_text):
85
  sentences = pdf_text.split("\n") # Split text into lines for granularity
86
  for idx, sentence in enumerate(sentences):
@@ -91,7 +42,6 @@ def add_pdf_text_to_db(collection, pdf_text):
91
  metadatas={"line_number": idx, "text": sentence}
92
  )
93
 
94
- # Step 4: Query Function
95
  def query_pdf_data(collection, query, retriever_model):
96
  results = collection.query(
97
  query_texts=[query],
@@ -134,73 +84,5 @@ def main():
134
  st.error(f"Error extracting text: {e}")
135
 
136
 
137
-
138
- # if uploaded_file:
139
- # st.write("Extracting text and populating the database...")
140
- # pdf_text = extract_text_from_pdf(uploaded_file)
141
- # add_pdf_text_to_db(collection, pdf_text)
142
- # st.success("PDF text has been added to the database. You can now query it!")
143
-
144
- # # Query Input
145
- # query = st.text_input("Enter your query about the PDF:")
146
- # if query:
147
- # try:
148
- # answer, metadata = query_pdf_data(collection, query, retriever_model)
149
- # st.subheader("Answer:")
150
- # st.write(answer[0]['generated_text'])
151
- # st.subheader("Retrieved Context:")
152
- # for meta in metadata[0]:
153
- # st.write(meta)
154
- # except Exception as e:
155
- # st.error(f"An error occurred: {str(e)}")
156
-
157
  if __name__ == "__main__":
158
  main()
159
-
160
-
161
- # import tempfile
162
- # import PyPDF2
163
- # import streamlit as st
164
- # from transformers import GPT2LMHeadModel, GPT2Tokenizer
165
-
166
- # # Load pre-trained GPT-2 model and tokenizer
167
- # tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
168
- # model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")
169
-
170
-
171
- # def extract_text_from_pdf(file_path):
172
- # text = ""
173
- # with open(file_path, "rb") as f:
174
- # reader = PyPDF2.PdfFileReader(f)
175
- # for page_num in range(reader.numPages):
176
- # text += reader.getPage(page_num).extractText()
177
- # return text
178
-
179
- # def generate_response(user_input):
180
- # input_ids = tokenizer.encode(user_input, return_tensors="pt")
181
- # output = model.generate(input_ids, max_length=100, num_return_sequences=1, temperature=0.7)
182
- # response = tokenizer.decode(output[0], skip_special_tokens=True)
183
- # return response
184
-
185
- # def main():
186
- # st.title("PDF Chatbot")
187
-
188
- # pdf_file = st.file_uploader("Upload an pdf file", type=["pdf"], accept_multiple_files=False)
189
-
190
- # if pdf_file is not None:
191
- # with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
192
- # tmp_file.write(pdf_file.read())
193
- # st.success("PDF file successfully uploaded and stored temporally.")
194
- # file_path = tmp_file.name
195
- # pdf_text = extract_text_from_pdf(file_path)
196
- # st.text_area("PDF Content", pdf_text)
197
- # else:
198
- # st.markdown('File not found!')
199
-
200
- # user_input = st.text_input("You:", "")
201
- # if st.button("Send"):
202
- # response = generate_response(user_input)
203
- # st.text_area("Chatbot:", response)
204
-
205
- # if __name__ == "__main__":
206
- # main()
 
1
  import chromadb
2
  from chromadb.utils import embedding_functions
3
+ from chromadb.config import Settings
4
  from sentence_transformers import SentenceTransformer
5
  from transformers import pipeline
6
  import streamlit as st
7
  import fitz # PyMuPDF for PDF parsing
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
 
 
10
 
11
  # Configure ChromaDB with persistent SQLite database
12
  config = Settings(
 
14
  chroma_db_impl="sqlite",
15
  )
16
 
 
 
 
17
  # Initialize persistent client with SQLite
18
  def setup_chromadb():
19
  client = chromadb.PersistentClient(path="./chromadb_data")
 
25
  )
26
  return client, collection
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  def extract_text_from_pdf(uploaded_file):
29
  with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
30
  text = ""
 
32
  text += page.get_text()
33
  return text
34
 
 
35
  def add_pdf_text_to_db(collection, pdf_text):
36
  sentences = pdf_text.split("\n") # Split text into lines for granularity
37
  for idx, sentence in enumerate(sentences):
 
42
  metadatas={"line_number": idx, "text": sentence}
43
  )
44
 
 
45
  def query_pdf_data(collection, query, retriever_model):
46
  results = collection.query(
47
  query_texts=[query],
 
84
  st.error(f"Error extracting text: {e}")
85
 
86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  if __name__ == "__main__":
88
  main()