Prat0 committed (verified)
Commit a132eda · Parent(s): 11173d1

Update app.py

Files changed (1):
  1. app.py +36 -29
app.py CHANGED
@@ -13,30 +13,29 @@ def chunk_text(text, chunk_size=1000):
     print(f"Chunking text into {chunk_size}-character chunks...")
     return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
 
-# Load and index the legal document data
-async def load_data(uploaded_files):
+# Asynchronously load and convert text to documents
+async def load_documents(uploaded_files):
     documents = []
+    tasks = []
     for uploaded_file in uploaded_files:
-        document_text = ""
-        if uploaded_file.type == "application/pdf":
-            pdf_reader = PyPDF2.PdfReader(uploaded_file)
-            for page in pdf_reader.pages:
-                document_text += page.extract_text()
-        else:
-            document_text = uploaded_file.getvalue().decode("utf-8")
-
-        # Chunk the document text
-        chunks = chunk_text(document_text)
-        for chunk in chunks:
-            documents.append(Document(text=chunk))
-
-    print("Setting up Gemini embedding and LLM...")
-    Settings.embed_model = GeminiEmbedding(api_key=os.getenv("GOOGLE_API_KEY"), model_name="models/embedding-001")
-    Settings.llm = Gemini(api_key=os.getenv("GOOGLE_API_KEY"), temperature=0.8, model_name="models/gemini-pro")
+        tasks.append(convert_to_documents(uploaded_file, documents))
+    await asyncio.gather(*tasks)
+    return documents
+
+# Convert uploaded file to documents
+async def convert_to_documents(uploaded_file, documents):
+    document_text = ""
+    if uploaded_file.type == "application/pdf":
+        pdf_reader = PyPDF2.PdfReader(uploaded_file)
+        for page in pdf_reader.pages:
+            document_text += page.extract_text()
+    else:
+        document_text = uploaded_file.getvalue().decode("utf-8")
 
-    print("Creating index from documents...")
-    index = VectorStoreIndex.from_documents(documents)
-    return index
+    # Chunk the document text
+    chunks = chunk_text(document_text)
+    for chunk in chunks:
+        documents.append(Document(text=chunk))
 
 # Asynchronously generate legal document summary
 async def generate_summary(index, document_text):
@@ -70,14 +69,22 @@ async def main():
     if uploaded_files:
         st.write("Analyzing legal documents...")
 
-        # Load data and generate summaries
-        print("Loading data and creating index...")
-        index = await load_data(uploaded_files)
-        summaries = []
+        # Load data and convert to documents asynchronously
+        print("Loading data and converting to documents...")
+        documents = await load_documents(uploaded_files)
+
+        # Set up Gemini embedding and LLM
+        print("Setting up Gemini embedding and LLM...")
+        Settings.embed_model = GeminiEmbedding(api_key=os.getenv("GOOGLE_API_KEY"), model_name="models/embedding-001")
+        Settings.llm = Gemini(api_key=os.getenv("GOOGLE_API_KEY"), temperature=0.8, model_name="models/gemini-pro")
+
+        # Create index from documents
+        print("Creating index from documents...")
+        index = VectorStoreIndex.from_documents(documents)
 
-        # Collect tasks for asynchronous execution
+        # Generate summaries asynchronously
+        print("Generating summaries...")
         tasks = []
-
         for uploaded_file in uploaded_files:
             document_text = ""
             if uploaded_file.type == "application/pdf":
@@ -86,7 +93,7 @@ async def main():
                     document_text += page.extract_text()
             else:
                 document_text = uploaded_file.getvalue().decode("utf-8")
-
+
             # Chunk the document text for summarization
             chunks = chunk_text(document_text)
             for chunk in chunks:
@@ -104,4 +111,4 @@ async def main():
 if __name__ == "__main__":
     print("Starting application...")
     asyncio.run(main())
-    print("Application finished.")
+    print("Application finished.")