Prat0 committed (verified)
Commit a132eda · Parent(s): 11173d1

Update app.py

Files changed (1):
  1. app.py +36 -29
app.py CHANGED
@@ -13,30 +13,29 @@ def chunk_text(text, chunk_size=1000):
     print(f"Chunking text into {chunk_size}-character chunks...")
     return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
 
-# Load and index the legal document data
-async def load_data(uploaded_files):
+# Asynchronously load and convert text to documents
+async def load_documents(uploaded_files):
     documents = []
+    tasks = []
     for uploaded_file in uploaded_files:
-        document_text = ""
-        if uploaded_file.type == "application/pdf":
-            pdf_reader = PyPDF2.PdfReader(uploaded_file)
-            for page in pdf_reader.pages:
-                document_text += page.extract_text()
-        else:
-            document_text = uploaded_file.getvalue().decode("utf-8")
-
-        # Chunk the document text
-        chunks = chunk_text(document_text)
-        for chunk in chunks:
-            documents.append(Document(text=chunk))
-
-    print("Setting up Gemini embedding and LLM...")
-    Settings.embed_model = GeminiEmbedding(api_key=os.getenv("GOOGLE_API_KEY"), model_name="models/embedding-001")
-    Settings.llm = Gemini(api_key=os.getenv("GOOGLE_API_KEY"), temperature=0.8, model_name="models/gemini-pro")
+        tasks.append(convert_to_documents(uploaded_file, documents))
+    await asyncio.gather(*tasks)
+    return documents
+
+# Convert uploaded file to documents
+async def convert_to_documents(uploaded_file, documents):
+    document_text = ""
+    if uploaded_file.type == "application/pdf":
+        pdf_reader = PyPDF2.PdfReader(uploaded_file)
+        for page in pdf_reader.pages:
+            document_text += page.extract_text()
+    else:
+        document_text = uploaded_file.getvalue().decode("utf-8")
 
-    print("Creating index from documents...")
-    index = VectorStoreIndex.from_documents(documents)
-    return index
+    # Chunk the document text
+    chunks = chunk_text(document_text)
+    for chunk in chunks:
+        documents.append(Document(text=chunk))
 
 # Asynchronously generate legal document summary
 async def generate_summary(index, document_text):
@@ -70,14 +69,22 @@ async def main():
     if uploaded_files:
         st.write("Analyzing legal documents...")
 
-        # Load data and generate summaries
-        print("Loading data and creating index...")
-        index = await load_data(uploaded_files)
-        summaries = []
+        # Load data and convert to documents asynchronously
+        print("Loading data and converting to documents...")
+        documents = await load_documents(uploaded_files)
+
+        # Set up Gemini embedding and LLM
+        print("Setting up Gemini embedding and LLM...")
+        Settings.embed_model = GeminiEmbedding(api_key=os.getenv("GOOGLE_API_KEY"), model_name="models/embedding-001")
+        Settings.llm = Gemini(api_key=os.getenv("GOOGLE_API_KEY"), temperature=0.8, model_name="models/gemini-pro")
+
+        # Create index from documents
+        print("Creating index from documents...")
+        index = VectorStoreIndex.from_documents(documents)
 
-        # Collect tasks for asynchronous execution
+        # Generate summaries asynchronously
+        print("Generating summaries...")
         tasks = []
-
         for uploaded_file in uploaded_files:
             document_text = ""
             if uploaded_file.type == "application/pdf":
@@ -86,7 +93,7 @@ async def main():
                     document_text += page.extract_text()
             else:
                 document_text = uploaded_file.getvalue().decode("utf-8")
-
+
             # Chunk the document text for summarization
             chunks = chunk_text(document_text)
             for chunk in chunks:
@@ -104,4 +111,4 @@ async def main():
 if __name__ == "__main__":
     print("Starting application...")
     asyncio.run(main())
-    print("Application finished.")
+    print("Application finished.")