hariharan220 committed on
Commit
3b4877b
·
verified ·
1 Parent(s): aaa2dd0

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +28 -19
main.py CHANGED
@@ -1,4 +1,13 @@
1
  import os
 
 
 
 
 
 
 
 
 
2
  import pdfplumber
3
  import re
4
  import nltk
@@ -7,29 +16,30 @@ import uvicorn
7
  import time
8
  from nltk.tokenize import sent_tokenize
9
  from transformers import pipeline
10
- from fastapi import FastAPI, File, UploadFile
11
  from fastapi.middleware.cors import CORSMiddleware
12
 
13
- # ✅ Set cache directories to writable locations for Hugging Face
14
- os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface_cache"
15
- os.environ["HF_HOME"] = "/tmp/hf_home"
16
- os.environ["HUGGINGFACE_HUB_CACHE"] = "/tmp/huggingface_cache"
17
-
18
- os.makedirs("/tmp/huggingface_cache", exist_ok=True)
19
- os.makedirs("/tmp/hf_home", exist_ok=True)
20
-
21
- # ✅ Ensure NLTK Dependencies are Stored in a Writable Directory
22
  NLTK_DATA_DIR = "/tmp/nltk_data"
23
  os.makedirs(NLTK_DATA_DIR, exist_ok=True)
24
  nltk.data.path.append(NLTK_DATA_DIR)
25
 
26
- # ✅ Fix: Download only 'punkt' (NOT 'punkt_tab')
27
- nltk.download("punkt", download_dir=NLTK_DATA_DIR)
 
 
 
 
 
 
 
 
 
28
 
29
- # ✅ Initialize FastAPI App
30
  app = FastAPI()
31
 
32
- # ✅ Enable CORS for API Accessibility
33
  app.add_middleware(
34
  CORSMiddleware,
35
  allow_origins=["*"],
@@ -38,11 +48,11 @@ app.add_middleware(
38
  allow_headers=["*"],
39
  )
40
 
41
- # ✅ Automatically Detect Device (Use GPU if Available)
42
  device = 0 if torch.cuda.is_available() else -1
43
  print(f"Using Device: {'GPU' if device == 0 else 'CPU'}")
44
 
45
- # ✅ Load Summarization Model
46
  summarizer = pipeline("summarization", model="google/pegasus-xsum", device=device)
47
 
48
  # --- **Generalized Cleaning** ---
@@ -73,13 +83,13 @@ def split_text(text, chunk_size=2048):
73
  chunks.append(current_chunk.strip())
74
  return chunks
75
 
76
- # ✅ **Summarization API**
77
  @app.post("/summarize-pdf/")
78
  async def summarize_pdf(file: UploadFile = File(...)):
79
  try:
80
  start_time = time.time()
81
  pdf_content = await file.read()
82
- pdf_path = "/tmp/temp.pdf" # ✅ Store in /tmp/
83
  with open(pdf_path, "wb") as f:
84
  f.write(pdf_content)
85
 
@@ -97,6 +107,5 @@ async def summarize_pdf(file: UploadFile = File(...)):
97
  except Exception as e:
98
  return {"error": str(e)}
99
 
100
- # ✅ Start Uvicorn for Hugging Face Spaces
101
  if __name__ == "__main__":
102
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
  import os
2
+
3
+ # Set cache directories to writable locations
4
+ os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface_cache"
5
+ os.environ["HF_HOME"] = "/tmp/hf_home"
6
+ os.environ["HUGGINGFACE_HUB_CACHE"] = "/tmp/huggingface_cache"
7
+
8
+ os.makedirs("/tmp/huggingface_cache", exist_ok=True)
9
+ os.makedirs("/tmp/hf_home", exist_ok=True)
10
+
11
  import pdfplumber
12
  import re
13
  import nltk
 
16
  import time
17
  from nltk.tokenize import sent_tokenize
18
  from transformers import pipeline
19
+ from fastapi import FastAPI, File, UploadFile, HTTPException
20
  from fastapi.middleware.cors import CORSMiddleware
21
 
22
+ # Set NLTK data directory to a writable location
 
 
 
 
 
 
 
 
23
  NLTK_DATA_DIR = "/tmp/nltk_data"
24
  os.makedirs(NLTK_DATA_DIR, exist_ok=True)
25
  nltk.data.path.append(NLTK_DATA_DIR)
26
 
27
+ # Download required NLTK resources
28
+ try:
29
+ nltk.data.find("tokenizers/punkt")
30
+ except LookupError:
31
+ nltk.download("punkt", download_dir=NLTK_DATA_DIR)
32
+
33
+ # Download punkt_tab as well (to fix the error)
34
+ try:
35
+ nltk.data.find("tokenizers/punkt_tab")
36
+ except LookupError:
37
+ nltk.download("punkt_tab", download_dir=NLTK_DATA_DIR)
38
 
39
+ # Initialize FastAPI App
40
  app = FastAPI()
41
 
42
+ # Enable CORS for API Accessibility
43
  app.add_middleware(
44
  CORSMiddleware,
45
  allow_origins=["*"],
 
48
  allow_headers=["*"],
49
  )
50
 
51
+ # Automatically Detect Device (Use GPU if Available)
52
  device = 0 if torch.cuda.is_available() else -1
53
  print(f"Using Device: {'GPU' if device == 0 else 'CPU'}")
54
 
55
+ # Load Summarization Model
56
  summarizer = pipeline("summarization", model="google/pegasus-xsum", device=device)
57
 
58
  # --- **Generalized Cleaning** ---
 
83
  chunks.append(current_chunk.strip())
84
  return chunks
85
 
86
+ # --- **Summarization Endpoint** ---
87
  @app.post("/summarize-pdf/")
88
  async def summarize_pdf(file: UploadFile = File(...)):
89
  try:
90
  start_time = time.time()
91
  pdf_content = await file.read()
92
+ pdf_path = "/tmp/temp.pdf" # Store in /tmp/
93
  with open(pdf_path, "wb") as f:
94
  f.write(pdf_content)
95
 
 
107
  except Exception as e:
108
  return {"error": str(e)}
109
 
 
110
  if __name__ == "__main__":
111
  uvicorn.run(app, host="0.0.0.0", port=7860)