hariharan220 committed
Commit 6adb8e8 · verified · 1 Parent(s): 796afe3

Create main.py

Files changed (1)
  1. main.py +93 -0
main.py ADDED
@@ -0,0 +1,93 @@
+ import pdfplumber
+ import re
+ import nltk
+ import torch
+ import uvicorn
+ import os
+ import threading
+ import time
+ from nltk.tokenize import sent_tokenize
+ from transformers import pipeline
+ from fastapi import FastAPI, File, UploadFile, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+
+ # ✅ Ensure NLTK dependencies
+ try:
+     nltk.data.find('tokenizers/punkt')
+ except LookupError:
+     nltk.download('punkt')
+
+ # ✅ Initialize FastAPI App
+ app = FastAPI()
+
+ # ✅ Enable CORS for API Accessibility
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # ✅ Automatically Detect Device (Use GPU if Available)
+ device = 0 if torch.cuda.is_available() else -1
+ print(f"Using Device: {'GPU' if device == 0 else 'CPU'}")
+
+ # ✅ Load Summarization Model
+ summarizer = pipeline("summarization", model="google/pegasus-xsum", device=device)
+
+ # --- **Generalized Cleaning** ---
+ def clean_text(text):
+     text = re.sub(r"\[\d+\]|\(\d+\)|\(\d{4}\)", "", text)
+     text = re.sub(r"(References:.*$)", "", text, flags=re.IGNORECASE)
+     text = re.sub(r"https?://\S+|www\.\S+", "", text)
+     text = re.sub(r"\s+", " ", text).strip()
+     return text
+
+ # --- **PDF Text Extraction** ---
+ def extract_text_from_pdf(pdf_path):
+     with pdfplumber.open(pdf_path) as pdf:
+         extracted_text = [page.extract_text() for page in pdf.pages if page.extract_text()]
+     return "\n".join(extracted_text)
+
+ # --- **Chunking for Summarization** ---
+ def split_text(text, chunk_size=2048):
+     sentences = sent_tokenize(text)
+     chunks, current_chunk = [], ""
+     for sentence in sentences:
+         if len(current_chunk) + len(sentence) + 1 <= chunk_size:
+             current_chunk += sentence + " "
+         else:
+             chunks.append(current_chunk.strip())
+             current_chunk = sentence + " "
+     if current_chunk:
+         chunks.append(current_chunk.strip())
+     return chunks
+
+ # --- **Summarization Endpoint** ---
+ @app.post("/summarize-pdf/")
+ async def summarize_pdf(file: UploadFile = File(...)):
+     try:
+         start_time = time.time()
+         pdf_content = await file.read()
+         pdf_path = "temp.pdf"
+         with open(pdf_path, "wb") as f:
+             f.write(pdf_content)
+
+         full_text = extract_text_from_pdf(pdf_path)
+         if not full_text.strip():
+             return {"error": "No text extracted from the PDF."}
+
+         cleaned_text = clean_text(full_text)
+         text_chunks = split_text(cleaned_text, chunk_size=2048)
+         summaries = [summarizer(chunk, max_new_tokens=250, num_beams=5, truncation=True)[0]['summary_text'] for chunk in text_chunks]
+
+         final_summary = " ".join(summaries)
+         return {"summary": final_summary}
+
+     except Exception as e:
+         return {"error": str(e)}
+
+ # ✅ Run FastAPI Server (Only for Local Debugging)
+ if __name__ == "__main__":
+     uvicorn.run(app, host="0.0.0.0", port=7860)
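
Once the server is running, the /summarize-pdf/ endpoint can be smoke-tested with a short client script. A minimal sketch using the requests library, assuming the server is reachable at http://localhost:7860 and a local file named sample.pdf exists (both the host URL and the file name are illustrative, not part of the commit):

import requests

# Hypothetical smoke test for the /summarize-pdf/ endpoint; adjust host, port,
# and file path to your setup. The multipart field name must be "file" to match
# the UploadFile parameter in summarize_pdf.
with open("sample.pdf", "rb") as f:
    response = requests.post(
        "http://localhost:7860/summarize-pdf/",
        files={"file": ("sample.pdf", f, "application/pdf")},
    )

print(response.json())  # expected: {"summary": "..."} or {"error": "..."}

Note that the endpoint writes every upload to a fixed temp.pdf in the working directory, so concurrent requests would overwrite each other's files; something like tempfile.NamedTemporaryFile would avoid that in a multi-user deployment.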