Codequestt committed · verified
Commit a42aebf · 1 Parent(s): 260d06a

Update app.py

Files changed (1)
  1. app.py +1 -420
app.py CHANGED
@@ -1,422 +1,3 @@
- # import gradio as gr
- # import pandas as pd
- # import os
- # import io
- # import zipfile
- # import shutil
- # from bs4 import BeautifulSoup
- # from typing import List, TypedDict
- # from langchain_huggingface import HuggingFaceEmbeddings
- # from langchain_community.vectorstores import Chroma
- # from langchain_core.documents import Document
- # from langchain_core.prompts import PromptTemplate
- # from langchain_core.output_parsers import StrOutputParser
- # from langchain_core.runnables import RunnablePassthrough
- # from langchain_nvidia_ai_endpoints import ChatNVIDIA
- # from langchain_community.tools.tavily_search import TavilySearchResults
- # from langgraph.graph import END, StateGraph, START
- # import chromadb
-
- # # ... (Keep all necessary imports from section 1 here)
-
- # def process_documents(folder_path):
- # """Process documents from the uploaded folder."""
- # d = {"chunk": [], "url": []}
-
- # for path in os.listdir(folder_path):
- # if not path.endswith(".html"): # Skip non-HTML files
- # continue
-
- # url = "https://" + path.replace("=", "/")
- # file_path = os.path.join(folder_path, path)
-
- # with open(file_path, 'rb') as stream:
- # content = stream.read().decode("utf-8")
- # soup = BeautifulSoup(content, "html.parser")
-
- # title = soup.find("title")
- # title_text = title.string.replace(" | Dataiku", "") if title else "No Title"
-
- # main_content = soup.find("main")
- # text_content = main_content.get_text(strip=True) if main_content else soup.get_text(strip=True)
-
- # full_content = f"{title_text}\n\n{text_content}"
-
- # d["chunk"].append(full_content)
- # d["url"].append(url)
-
- # return pd.DataFrame(d)
-
- # def setup_rag_system(folder_path):
- # """Initialize the RAG system with the provided documents."""
- # # ... (Keep your existing setup_rag_system implementation here)
- # return vector_store
-
- # def create_workflow(vector_store):
- # """Create the RAG workflow."""
- # # ... (Keep your existing workflow creation code here)
- # return workflow.compile()
-
- # def handle_upload(folder_files, csv_file):
- # try:
- # # Create temporary directory
- # temp_dir = "temp_upload"
- # os.makedirs(temp_dir, exist_ok=True)
-
- # # Process document files
- # doc_dir = os.path.join(temp_dir, "docs")
- # os.makedirs(doc_dir, exist_ok=True)
-
- # # Handle zip file or individual files
- # for file in folder_files:
- # if file.name.endswith('.zip'):
- # with zipfile.ZipFile(io.BytesIO(file.read())) as zip_ref:
- # zip_ref.extractall(doc_dir)
- # else:
- # with open(os.path.join(doc_dir, file.name), "wb") as f:
- # f.write(file.read())
-
- # # Process CSV requirements
- # csv_content = csv_file.read()
- # requirements_df = pd.read_csv(io.BytesIO(csv_content), encoding='latin-1')
- # requirements = requirements_df.iloc[:, 0].tolist() # Get first column
-
- # # Setup RAG system
- # vector_store = setup_rag_system(doc_dir)
- # app = create_workflow(vector_store)
-
- # # Process requirements
- # results = []
- # for question in requirements:
- # inputs = {"question": question}
- # output = app.invoke(inputs)
- # results.append({
- # "Requirement": question,
- # "Response": output.get("generation", "No response generated")
- # })
-
- # # Cleanup
- # shutil.rmtree(temp_dir)
-
- # return pd.DataFrame(results)
-
- # except Exception as e:
- # return pd.DataFrame({"Error": [str(e)]})
-
- # def create_gradio_interface():
- # iface = gr.Interface(
- # fn=handle_upload,
- # inputs=[
- # gr.File(file_count="multiple", label="Upload Documents (ZIP or HTML files)"),
- # gr.File(label="Upload Requirements CSV", type="binary")
- # ],
- # outputs=gr.Dataframe(),
- # title="RAG System for RFP Analysis",
- # description="Upload documents (ZIP or HTML files) and a CSV file with requirements."
- # )
- # return iface
-
- # if __name__ == "__main__":
- # iface = create_gradio_interface()
- # iface.launch()
-
- # import gradio as gr
- # import pandas as pd
- # import os
- # import torch
- # import zipfile
- # import tempfile
- # import shutil
- # from bs4 import BeautifulSoup
- # from typing import List, TypedDict
- # from langchain_huggingface import HuggingFaceEmbeddings
- # from langchain_community.vectorstores import Chroma
- # from langchain_core.documents import Document
- # from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
- # from langchain_core.output_parsers import StrOutputParser
- # from langchain_core.runnables import RunnablePassthrough
- # from langchain_nvidia_ai_endpoints import ChatNVIDIA
- # from langchain_core.pydantic_v1 import BaseModel, Field
- # from langchain_community.tools.tavily_search import TavilySearchResults
- # from langgraph.graph import END, StateGraph, START
- # import chromadb
- # import io
-
- # # Environment variables setup
- # os.environ["TAVILY_API_KEY"] = "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"
- # os.environ["NVIDIA_API_KEY"] = "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"
- # os.environ["LANGCHAIN_PROJECT"] = "RAG project"
-
- # class GradeDocuments(BaseModel):
- # """Binary score for relevance check on retrieved documents."""
- # binary_score: str = Field(description="Documents are relevant to the question, 'yes' or 'no'")
-
- # class GraphState(TypedDict):
- # """Represents the state of our graph."""
- # question: str
- # generation: str
- # decision: str
- # documents: List[str]
-
- # import os
- # from bs4 import BeautifulSoup
- # import pandas as pd
-
- # def process_documents(temp_dir):
- # """Process documents from the extracted zip folder with enhanced error handling."""
- # d = {"chunk": [], "url": []}
-
- # # Debug information
- # print(f"Scanning directory: {temp_dir}")
-
- # file_count = 0
- # processed_count = 0
- # error_count = 0
-
- # # Recursively traverse the directory
- # for root, dirs, files in os.walk(temp_dir):
- # for file_name in files:
- # file_count += 1
- # file_path = os.path.join(root, file_name)
- # print(f"Processing file: {file_path}")
-
- # try:
- # # Try different encodings
- # encodings = ['utf-8', 'latin-1', 'cp1252']
- # content = None
-
- # for encoding in encodings:
- # try:
- # with open(file_path, 'r', encoding=encoding) as stream:
- # content = stream.read()
- # break
- # except UnicodeDecodeError:
- # continue
-
- # if content is None:
- # print(f"Failed to read file {file_path} with any encoding")
- # error_count += 1
- # continue
-
- # soup = BeautifulSoup(content, "html.parser")
-
- # title = soup.find("title")
- # title_text = title.string.replace(" | Dataiku", "") if title else "No Title"
-
- # main_content = soup.find("main")
- # text_content = main_content.get_text(strip=True) if main_content else soup.get_text(strip=True)
-
- # if not text_content.strip():
- # print(f"No content extracted from {file_path}")
- # error_count += 1
- # continue
-
- # full_content = f"{title_text}\n\n{text_content}"
-
- # d["chunk"].append(full_content)
- # d["url"].append("https://" + file_name.replace("=", "/"))
- # processed_count += 1
- # print(f"Successfully processed {file_path}")
-
- # except Exception as e:
- # print(f"Error processing file {file_path}: {str(e)}")
- # error_count += 1
- # continue
-
- # print(f"\nProcessing Summary:")
- # print(f"Total files found: {file_count}")
- # print(f"Successfully processed: {processed_count}")
- # print(f"Errors encountered: {error_count}")
-
- # if not d["chunk"]:
- # raise ValueError(f"No valid documents were processed. Processed {file_count} files with {error_count} errors.")
-
- # return pd.DataFrame(d)
-
-
-
- # # The rest of the code remains the same...
-
- # def setup_rag_system(temp_dir):
- # """Initialize the RAG system with the provided documents."""
- # # Initialize embedding model
- # model_name = "dunzhang/stella_en_1.5B_v5"
- # model_kwargs = {'trust_remote_code': 'True'}
- # embedding_model = HuggingFaceEmbeddings(
- # model_name=model_name,
- # show_progress=True,
- # model_kwargs=model_kwargs
- # )
-
- # # Process documents
- # df = process_documents(temp_dir)
- # if df.empty:
- # raise ValueError("No valid documents were processed")
-
- # df["chunk_id"] = range(len(df))
-
- # # Create documents list
- # list_of_documents = [
- # Document(
- # page_content=record['chunk'],
- # metadata={"source_url": record['url']}
- # )
- # for record in df[['chunk', 'url']].to_dict(orient='records')
- # ]
-
- # # Setup vector store
- # ids = [str(i) for i in df['chunk_id'].to_list()]
- # client = chromadb.PersistentClient(path=tempfile.mkdtemp())
- # vector_store = Chroma(
- # client=client,
- # collection_name="rag-chroma",
- # embedding_function=embedding_model,
- # )
-
- # # Add documents in batches
- # batch_size = 100
- # for i in range(0, len(list_of_documents), batch_size):
- # end_idx = min(i + batch_size, len(list_of_documents))
- # vector_store.add_documents(
- # documents=list_of_documents[i:end_idx],
- # ids=ids[i:end_idx]
- # )
-
- # return vector_store
-
- # def create_workflow(vector_store):
- # """Create the RAG workflow."""
- # retriever = vector_store.as_retriever(search_kwargs={"k": 7})
- # llm = ChatNVIDIA(model="meta/llama-3.3-70b-instruct", temperature=0)
-
- # rag_prompt = PromptTemplate.from_template(
- # """You are an assistant for responding to Request For Proposal documents for a
- # bidder in the field of Data Science and Engineering. Use the following pieces
- # of retrieved context to respond to the requests. If you don't know the answer,
- # just say that you don't know. Provide detailed responses with specific examples
- # and capabilities where possible.
-
- # Question: {question}
- # Context: {context}
- # Answer:"""
- # )
-
- # def format_docs(result):
- # return "\n\n".join(doc.page_content for doc in result)
-
- # rag_chain = (
- # {"context": retriever | format_docs, "question": RunnablePassthrough()}
- # | rag_prompt
- # | llm
- # | StrOutputParser()
- # )
-
- # return rag_chain
-
- # def preprocess_csv(csv_file):
- # """Preprocess the CSV file to ensure proper format."""
- # try:
- # # First try reading as is
- # df = pd.read_csv(csv_file.name, encoding='latin-1')
-
- # # If there's only one column and no header
- # if len(df.columns) == 1 and df.columns[0] != 'requirement':
- # # Read again with no header and assign column name
- # df = pd.read_csv(csv_file.name, encoding='latin-1', header=None, names=['requirement'])
-
- # # If there's no 'requirement' column, assume first column is requirements
- # if 'requirement' not in df.columns:
- # df = df.rename(columns={df.columns[0]: 'requirement'})
-
- # return df
- # except Exception as e:
- # # If standard CSV reading fails, try reading as plain text
- # try:
- # with open(csv_file.name, 'r', encoding='latin-1') as f:
- # requirements = f.read().strip().split('\n')
- # return pd.DataFrame({'requirement': requirements})
- # except Exception as e2:
- # raise ValueError(f"Could not process CSV file: {str(e2)}")
-
- # def handle_upload(zip_file, csv_file):
- # """Handle file uploads and process requirements with enhanced error handling."""
- # try:
- # # Create temporary directory
- # temp_dir = tempfile.mkdtemp()
- # print(f"Created temporary directory: {temp_dir}")
-
- # try:
- # # Extract zip file
- # print(f"Extracting ZIP file: {zip_file.name}")
- # with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
- # zip_ref.extractall(temp_dir)
- # print(f"ZIP contents: {zip_ref.namelist()}")
-
- # # Process documents
- # print("Processing documents...")
- # df = process_documents(temp_dir)
- # print(f"Processed {len(df)} documents")
-
- # # Preprocess and read requirements CSV
- # print("Processing CSV file...")
- # requirements_df = preprocess_csv(csv_file)
- # print(f"Found {len(requirements_df)} requirements")
-
- # # Setup RAG system
- # print("Setting up RAG system...")
- # vector_store = setup_rag_system(temp_dir)
- # rag_chain = create_workflow(vector_store)
-
- # # Process requirements
- # results = []
- # for idx, req in enumerate(requirements_df['requirement'], 1):
- # print(f"Processing requirement {idx}/{len(requirements_df)}")
- # try:
- # response = rag_chain.invoke(req)
- # results.append({
- # 'requirement': req,
- # 'response': response
- # })
- # except Exception as e:
- # error_msg = f"Error processing requirement: {str(e)}"
- # print(error_msg)
- # results.append({
- # 'requirement': req,
- # 'response': error_msg
- # })
-
- # return pd.DataFrame(results)
-
- # finally:
- # # Cleanup
- # print(f"Cleaning up temporary directory: {temp_dir}")
- # shutil.rmtree(temp_dir)
-
- # except Exception as e:
- # error_msg = f"Processing error: {str(e)}"
- # print(error_msg)
- # return pd.DataFrame([{'error': error_msg}])
- # def main():
- # """Main function to run the Gradio interface."""
- # iface = gr.Interface(
- # fn=handle_upload,
- # inputs=[
- # gr.File(label="Upload ZIP folder containing URLs", file_types=[".zip"]),
- # gr.File(label="Upload Requirements CSV", file_types=[".csv", ".txt"])
- # ],
- # outputs=gr.Dataframe(),
- # title="RAG System for RFP Analysis",
- # description="""Upload a ZIP folder containing URL documents and a CSV file with requirements to analyze.
- # The CSV file should contain requirements either as a single column or with a 'requirement' column header.""",
- # examples=[],
- # cache_examples=False
- # )
-
- # iface.launch(share=True)
-
- # if __name__ == "__main__":
- # main()
-
  import gradio as gr
  import pandas as pd
  import os
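
The hunk above removes a fully commented-out prototype which, among other things, hardcoded its Tavily and NVIDIA keys through os.environ at import time. The retained app (second hunk below) instead asks for the NVIDIA key in a password Textbox. A minimal sketch of that pattern follows, assuming the key is still exported via os.environ exactly as the prototype did and that the model name carries over; build_llm is a hypothetical helper name, not a function in app.py, and the retained implementation may differ.

    import os
    from langchain_nvidia_ai_endpoints import ChatNVIDIA

    def build_llm(nvidia_api_key: str) -> ChatNVIDIA:
        """Build the chat model from a user-supplied key instead of a hardcoded one."""
        # Same mechanism the removed prototype used, but the key now comes from the UI.
        os.environ["NVIDIA_API_KEY"] = nvidia_api_key
        return ChatNVIDIA(model="meta/llama-3.3-70b-instruct", temperature=0)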
@@ -698,7 +279,7 @@ def main():
  gr.Textbox(label="Enter your NVIDIA API Key", type="password")
  ],
  outputs=gr.Dataframe(),
- title="RAG System for RFP Analysis",
+ title="RAG System for RFP Analysis (This agent helps you verify if a specific tool matches your project requirements by uploading your tool documentation and your CSV containing your requirements.)",
  description="""Upload a ZIP folder containing URL documents and a CSV file with requirements to analyze.
  The CSV file should contain requirements either as a single column or with a 'requirement' column header.
  Enter your NVIDIA API key to use the service.""",
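
Read together, the second hunk leaves main() building its Gradio interface roughly as sketched below. This is a reconstruction from the hunk's context lines and the added title, not the literal retained file; the handler stub and the two File inputs are assumptions carried over from the removed prototype. The requirements CSV it mentions is a single column, e.g. a 'requirement' header row followed by one requirement per line.

    import gradio as gr
    import pandas as pd

    def handle_upload(zip_file, csv_file, nvidia_api_key):
        # Stub standing in for the real handler defined elsewhere in app.py.
        return pd.DataFrame()

    iface = gr.Interface(
        fn=handle_upload,
        inputs=[
            gr.File(label="Upload ZIP folder containing URLs", file_types=[".zip"]),  # assumed from the prototype
            gr.File(label="Upload Requirements CSV", file_types=[".csv", ".txt"]),    # assumed from the prototype
            gr.Textbox(label="Enter your NVIDIA API Key", type="password"),           # context line in the hunk
        ],
        outputs=gr.Dataframe(),
        title="RAG System for RFP Analysis (This agent helps you verify if a specific tool matches your project requirements by uploading your tool documentation and your CSV containing your requirements.)",
        description="""Upload a ZIP folder containing URL documents and a CSV file with requirements to analyze.
        The CSV file should contain requirements either as a single column or with a 'requirement' column header.
        Enter your NVIDIA API key to use the service.""",
    )

    if __name__ == "__main__":
        iface.launch(share=True)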