TourLover committed on
Commit
b78546e
·
verified ·
1 Parent(s): 4f15244

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -0
app.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ import pytesseract
3
+ from PIL import Image
4
+ import io
5
+ from fpdf import FPDF
6
+ import gradio as gr
7
+ import os
8
+ from pathlib import Path
9
+
10
from groq import Groq  # needed for the client below; the original never imported it

# Credentials must come from the environment (for example a Space secret),
# never from source control. The key that used to be committed on this line
# is exposed in the repository history and should be revoked.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# When api_key is None the Groq client performs its own GROQ_API_KEY lookup
# and raises a descriptive error if no key is configured.
client = Groq(api_key=GROQ_API_KEY)
12
+
13
+ # Function to summarize text using Groq API
14
def summarize_text(text, model="llama-3.1-70b-versatile"):
    """Summarize one page of text with the Groq chat-completions API.

    Args:
        text: Raw text extracted from a single page.
        model: Groq model identifier used for the completion.

    Returns:
        The model's reply, "Nothing to summarize." when ``text`` is empty or
        whitespace-only, or "Error in summarizing text." if the request fails.
    """
    # A blank page needs no network round-trip; answer the way the prompt
    # itself tells the model to answer for empty input.
    if not text or not text.strip():
        return "Nothing to summarize."

    instructions = (
        "Summarize this page in 15-20 lines under the heading of summary. "
        "You have to summarize, even if there are different, unlike topics on that page. "
        "(Kindly provide the response in proper paragraphing). "
        "However, if there is no text, then print Nothing to summarize. "
        "Additionally, after summarizing the text, enlist difficult terms up to 15, "
        "along with their single line meaning."
    )

    try:
        chat_completion = client.chat.completions.create(
            # Keep a blank line between the instructions and the page so the
            # page's first word is not glued onto the last instruction.
            messages=[{"role": "user", "content": f"{instructions}\n\n{text}"}],
            model=model,
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        # Deliberately best-effort: one failed page must not abort the
        # summary of the whole document.
        print(f"[ERROR] Error in summarizing text: {e}")
        return "Error in summarizing text."
24
+
25
+ # Function to extract text from PDF and generate a summarized PDF
26
def _extract_page_text(doc, page_num):
    """Return a page's embedded text followed by the OCR text of its images."""
    page = doc.load_page(page_num)
    parts = [page.get_text()]
    print(f"[DEBUG] Text extracted from Page {page_num + 1}: {parts[0][:100]}...")  # Print first 100 characters

    for img in page.get_images(full=True):
        try:
            xref = img[0]
            image_bytes = doc.extract_image(xref)["image"]
            with Image.open(io.BytesIO(image_bytes)) as image:
                parts.append(pytesseract.image_to_string(image))
        except Exception as e:
            # Best-effort OCR: an unreadable image must not discard the page.
            print(f"[ERROR] Error processing image on Page {page_num + 1}: {e}")

    return "".join(parts)


def extract_text_and_summarize(filepath):
    """Summarize the first 50 pages of a PDF into a new ``*_summary.pdf``.

    Each page's embedded text plus the OCR text of its images is sent to
    ``summarize_text``; every summary becomes one page of the output PDF,
    which is written next to the input file.

    Args:
        filepath: Path of the PDF to summarize.

    Returns:
        The path of the generated summary PDF as a string, or one of the
        messages "File does not exist." / "Error processing the PDF." when
        the summary could not be produced.
    """
    try:
        if not os.path.exists(filepath):
            print(f"[ERROR] File does not exist: {filepath}")
            return "File does not exist."

        # Initialize PDF to store summaries
        pdf_summary = FPDF()
        pdf_summary.set_auto_page_break(auto=True, margin=15)

        # The context manager closes the document even if a page fails.
        with fitz.open(filepath) as doc:
            print("[INFO] PDF opened successfully.")

            # Process up to 50 pages
            for page_num in range(min(doc.page_count, 50)):
                print(f"[INFO] Extracting text from Page {page_num + 1}.")
                summary = summarize_text(_extract_page_text(doc, page_num))

                # The built-in Arial font only covers latin-1. Substitute "?"
                # for anything outside it instead of re-decoding UTF-8 bytes,
                # which would turn every non-ASCII character into mojibake.
                printable = summary.encode("latin-1", "replace").decode("latin-1")

                pdf_summary.add_page()
                pdf_summary.set_font("Arial", size=12)
                pdf_summary.multi_cell(0, 7, f"Summary of Page {page_num + 1}\n\n" + printable)

        # Derive the name from the stem so that ".PDF" or extension-less
        # uploads never resolve to the input path and overwrite the source.
        source = Path(filepath)
        output_pdf_path = str(source.with_name(f"{source.stem}_summary.pdf"))
        pdf_summary.output(output_pdf_path)
        print("[INFO] Output PDF saved successfully.")

        return output_pdf_path

    except Exception as e:
        print(f"[ERROR] Error processing the PDF: {e}")
        return "Error processing the PDF."
85
+
86
+ # Function to handle file upload and initiate processing
87
def upload_file(filepath):
    """Summarize the uploaded PDF and report the outcome to the UI.

    Args:
        filepath: Path of the uploaded file as supplied by the upload button.

    Returns:
        A ``(status_text, download_button_update)`` pair. On success the
        download button is shown and pointed at the summary PDF; on failure
        it stays hidden and the status carries the error message.
    """
    result = extract_text_and_summarize(filepath)

    # extract_text_and_summarize returns a plain error message instead of a
    # path when it fails; only a real file may be offered for download.
    if os.path.isfile(result):
        status_message = "Processing complete. Your summary is ready to download."
        return status_message, gr.update(visible=True, value=result)

    return result, gr.update(visible=False)
93
+
94
def reset_interface():
    """Return updates that blank the status text and hide the download button."""
    cleared_status = ""
    hidden_button = gr.update(visible=False)
    return cleared_status, hidden_button
96
+
97
+ # Create Gradio Interface
98
# Page layout: a header, then one row holding the upload button, the
# initially hidden download button and a read-only status box.
with gr.Blocks(css=".block {max-width: 800px; margin: auto; padding: 20px;}") as demo:
    gr.Markdown("""
    <h1 style='text-align: center; color: #4CAF50;'>PDF Summarizer</h1>
    <p style='text-align: center; font-size: 1.2em; color: #666;'>Upload a PDF document and get a summarized version with page-by-page analysis and difficult term definitions for first 50 pages. Perfect for quick reviews and study aids!</p>
    """)

    with gr.Row():
        # Only PDFs can be processed, so restrict the file picker to them.
        u = gr.UploadButton("📁 Upload your PDF", file_count="single", file_types=[".pdf"])
        d = gr.DownloadButton("⬇️ Download the summarized PDF", visible=False)
        status = gr.Textbox(label="Status", placeholder="Status will be displayed here...", visible=True, interactive=False)

    # Uploading starts the summarization; clicking download resets the UI.
    u.upload(upload_file, u, [status, d])
    d.click(reset_interface, None, [status, d])

if __name__ == "__main__":
    demo.launch()