Prat0 committed on
Commit
bd4e82f
·
verified ·
1 Parent(s): af78cdc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -74
app.py CHANGED
@@ -1,47 +1,38 @@
1
  import streamlit as st
2
  from llama_index.core import Settings
3
- from llama_index.core import VectorStoreIndex, Document
4
  from llama_index.embeddings.gemini import GeminiEmbedding
5
  from llama_index.llms.gemini import Gemini
 
 
6
  import os
7
  import PyPDF2
8
- import asyncio
9
 
10
- # Function to chunk text into smaller pieces
11
- def chunk_text(text, chunk_size=9000):
12
- """Split the text into chunks of specified size."""
13
- print(f"Chunking text into {chunk_size}-character chunks...")
14
- return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
15
 
16
- # Asynchronously load and convert text to documents
17
- async def load_documents(uploaded_files):
18
- documents = []
19
- tasks = []
20
- for uploaded_file in uploaded_files:
21
- tasks.append(convert_to_documents(uploaded_file, documents))
22
- await asyncio.gather(*tasks)
 
 
 
 
23
  return documents
24
 
25
- # Convert uploaded file to documents
26
- async def convert_to_documents(uploaded_file, documents):
27
- document_text = ""
28
- if uploaded_file.type == "application/pdf":
29
- pdf_reader = PyPDF2.PdfReader(uploaded_file)
30
- for page in pdf_reader.pages:
31
- document_text += page.extract_text()
32
- else:
33
- document_text = uploaded_file.getvalue().decode("utf-8")
34
-
35
- # Chunk the document text
36
- chunks = chunk_text(document_text)
37
- for chunk in chunks:
38
- documents.append(Document(text=chunk))
39
 
40
- # Asynchronously generate legal document summary
41
- async def generate_summary(index, document_text):
42
- print("Generating summary...")
43
  query_engine = index.as_query_engine()
44
- response = await query_engine.query(f"""
45
  You are a skilled legal analyst. Your task is to provide a comprehensive summary of the given legal document.
46
  Analyze the following legal document and summarize it:
47
  {document_text}
@@ -59,56 +50,37 @@ async def generate_summary(index, document_text):
59
  return response.response
60
 
61
  # Streamlit app
62
- async def main():
63
  st.title("Legal Document Summarizer")
64
- st.write("Upload legal documents, and let our AI summarize them!")
65
 
66
  # File uploader
67
- uploaded_files = st.file_uploader("Choose legal document files", type=["txt", "pdf"], accept_multiple_files=True)
68
 
69
- if uploaded_files:
70
- st.write("Analyzing legal documents...")
71
-
72
- # Load data and convert to documents asynchronously
73
- print("Loading data and converting to documents...")
74
- documents = await load_documents(uploaded_files)
 
 
 
75
 
76
- # Set up Gemini embedding and LLM
77
- print("Setting up Gemini embedding and LLM...")
78
- Settings.embed_model = GeminiEmbedding(api_key=os.getenv("GOOGLE_API_KEY"), model_name="models/embedding-001")
79
- Settings.llm = Gemini(api_key=os.getenv("GOOGLE_API_KEY"), temperature=0.8, model_name="models/gemini-pro")
80
 
81
- # Create index from documents
82
- print("Creating index from documents...")
83
- index = VectorStoreIndex.from_documents(documents)
84
 
85
- # Generate summaries asynchronously
86
- print("Generating summaries...")
87
- tasks = []
88
- for uploaded_file in uploaded_files:
89
- document_text = ""
90
- if uploaded_file.type == "application/pdf":
91
- pdf_reader = PyPDF2.PdfReader(uploaded_file)
92
- for page in pdf_reader.pages:
93
- document_text += page.extract_text()
94
- else:
95
- document_text = uploaded_file.getvalue().decode("utf-8")
96
-
97
- # Chunk the document text for summarization
98
- chunks = chunk_text(document_text)
99
- for chunk in chunks:
100
- tasks.append(generate_summary(index, chunk))
101
 
102
- # Await all summaries
103
- print("Awaiting summaries...")
104
- summaries = await asyncio.gather(*tasks)
105
 
106
- st.write("## Legal Document Summaries")
107
- for i, summary in enumerate(summaries):
108
- st.write(f"### Summary of Document {i + 1}")
109
- st.write(summary)
110
 
111
  if __name__ == "__main__":
112
- print("Starting application...")
113
- asyncio.run(main())
114
- print("Application finished.")
 
1
  import streamlit as st
2
  from llama_index.core import Settings
3
+ from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext
4
  from llama_index.embeddings.gemini import GeminiEmbedding
5
  from llama_index.llms.gemini import Gemini
6
+ from llama_index.core import Document
7
+ import google.generativeai as genai
8
  import os
9
  import PyPDF2
10
+ from io import BytesIO
11
 
12
+ # Set up Google API key
13
+ os.environ["GOOGLE_API_KEY"] = "your_api_key_here" # Replace with your actual API key
 
 
 
14
 
15
+ # Configure Google Gemini
16
+ Settings.embed_model = GeminiEmbedding(api_key=os.getenv("GOOGLE_API_KEY"), model_name="models/embedding-001")
17
+ Settings.llm = Gemini(api_key=os.getenv("GOOGLE_API_KEY"), temperature=0.8, model_name="models/gemini-pro")
18
+
19
+ def write_to_file(content, filename="test.pdf"):
20
+ with open(filename, "wb") as f:
21
+ f.write(content)
22
+
23
+ def ingest_documents():
24
+ reader = SimpleDirectoryReader("./")
25
+ documents = reader.load_data()
26
  return documents
27
 
28
+ def load_data(documents):
29
+ index = VectorStoreIndex.from_documents(documents)
30
+ return index
 
 
 
 
 
 
 
 
 
 
 
31
 
32
+ # Generate legal document summary
33
+ def generate_summary(index, document_text):
 
34
  query_engine = index.as_query_engine()
35
+ response = query_engine.query(f"""
36
  You are a skilled legal analyst. Your task is to provide a comprehensive summary of the given legal document.
37
  Analyze the following legal document and summarize it:
38
  {document_text}
 
50
  return response.response
51
 
52
  # Streamlit app
53
+ def main():
54
  st.title("Legal Document Summarizer")
55
+ st.write("Upload a legal document, and let our AI summarize it!")
56
 
57
  # File uploader
58
+ uploaded_file = st.file_uploader("Choose a legal document file", type=["txt", "pdf"])
59
 
60
+ if uploaded_file is not None:
61
+ # Read file contents
62
+ if uploaded_file.type == "application/pdf":
63
+ pdf_reader = PyPDF2.PdfReader(BytesIO(uploaded_file.getvalue()))
64
+ document_text = ""
65
+ for page in pdf_reader.pages:
66
+ document_text += page.extract_text()
67
+ else:
68
+ document_text = uploaded_file.getvalue().decode("utf-8")
69
 
70
+ # Write content to file
71
+ write_to_file(uploaded_file.getvalue())
 
 
72
 
73
+ st.write("Analyzing legal document...")
 
 
74
 
75
+ # Ingest documents using SimpleDirectoryReader
76
+ documents = ingest_documents()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
+ # Load data and generate summary
79
+ index = load_data(documents)
80
+ summary = generate_summary(index, document_text)
81
 
82
+ st.write("## Legal Document Summary")
83
+ st.write(summary)
 
 
84
 
85
  if __name__ == "__main__":
86
+ main()