hariharan220 committed on
Commit
9e8f213
·
verified ·
1 Parent(s): a8e50f2

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +30 -28
main.py CHANGED
@@ -1,13 +1,4 @@
1
  import os
2
-
3
- # Set cache directories to writable locations
4
- os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface_cache"
5
- os.environ["HF_HOME"] = "/tmp/hf_home"
6
- os.environ["HUGGINGFACE_HUB_CACHE"] = "/tmp/huggingface_cache"
7
-
8
- os.makedirs("/tmp/huggingface_cache", exist_ok=True)
9
- os.makedirs("/tmp/hf_home", exist_ok=True)
10
-
11
  import pdfplumber
12
  import re
13
  import nltk
@@ -16,30 +7,32 @@ import uvicorn
16
  import time
17
  from nltk.tokenize import sent_tokenize
18
  from transformers import pipeline
19
- from fastapi import FastAPI, File, UploadFile, HTTPException
20
  from fastapi.middleware.cors import CORSMiddleware
21
 
22
- # Set NLTK data directory to a writable location
23
- NLTK_DATA_DIR = "/tmp/nltk_data"
 
 
 
 
 
 
 
 
24
  os.makedirs(NLTK_DATA_DIR, exist_ok=True)
25
  nltk.data.path.append(NLTK_DATA_DIR)
26
 
27
- # Download required NLTK resources
28
  try:
29
  nltk.data.find("tokenizers/punkt")
30
  except LookupError:
31
  nltk.download("punkt", download_dir=NLTK_DATA_DIR)
32
 
33
- # Download punkt_tab as well (to fix the error)
34
- try:
35
- nltk.data.find("tokenizers/punkt_tab")
36
- except LookupError:
37
- nltk.download("punkt_tab", download_dir=NLTK_DATA_DIR)
38
-
39
- # Initialize FastAPI App
40
  app = FastAPI()
41
 
42
- # Enable CORS for API Accessibility
43
  app.add_middleware(
44
  CORSMiddleware,
45
  allow_origins=["*"],
@@ -48,12 +41,20 @@ app.add_middleware(
48
  allow_headers=["*"],
49
  )
50
 
51
- # Automatically Detect Device (Use GPU if Available)
52
  device = 0 if torch.cuda.is_available() else -1
53
- print(f"Using Device: {'GPU' if device == 0 else 'CPU'}")
54
-
55
- # Load Summarization Model
56
- summarizer = pipeline("summarization", model="google/pegasus-xsum", device=device)
 
 
 
 
 
 
 
 
57
 
58
  # --- **Generalized Cleaning** ---
59
  def clean_text(text):
@@ -83,13 +84,13 @@ def split_text(text, chunk_size=2048):
83
  chunks.append(current_chunk.strip())
84
  return chunks
85
 
86
- # --- **Summarization Endpoint** ---
87
  @app.post("/summarize-pdf/")
88
  async def summarize_pdf(file: UploadFile = File(...)):
89
  try:
90
  start_time = time.time()
91
  pdf_content = await file.read()
92
- pdf_path = "/tmp/temp.pdf" # Store in /tmp/
93
  with open(pdf_path, "wb") as f:
94
  f.write(pdf_content)
95
 
@@ -107,5 +108,6 @@ async def summarize_pdf(file: UploadFile = File(...)):
107
  except Exception as e:
108
  return {"error": str(e)}
109
 
 
110
  if __name__ == "__main__":
111
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
import os

# ✅ Hugging Face cache locations MUST be exported BEFORE `transformers` is
# imported: the library resolves TRANSFORMERS_CACHE / HF_HOME at import time,
# so setting them afterwards (as the previous revision did) has no effect and
# the read-only default cache path still gets used on Spaces containers.
TMP_DIR = "/tmp/huggingface_cache"
os.environ["TRANSFORMERS_CACHE"] = TMP_DIR
os.environ["HF_HOME"] = TMP_DIR
os.environ["HUGGINGFACE_HUB_CACHE"] = TMP_DIR
os.makedirs(TMP_DIR, exist_ok=True)

import pdfplumber
import re
import nltk
import torch
import uvicorn
import time
from nltk.tokenize import sent_tokenize
from transformers import pipeline
from fastapi import FastAPI, File, UploadFile
from fastapi.middleware.cors import CORSMiddleware

# ✅ Keep NLTK data in a writable directory (the container HOME is read-only).
NLTK_DATA_DIR = "/tmp/nltk_data"
os.makedirs(NLTK_DATA_DIR, exist_ok=True)
nltk.data.path.append(NLTK_DATA_DIR)

# ✅ Fix: download only 'punkt' (the model sent_tokenize needs), and only
# when it is not already present — avoids a network hit on every restart.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt", download_dir=NLTK_DATA_DIR)

# ✅ Initialize FastAPI App
app = FastAPI()
34
 
35
+ # ✅ Enable CORS for API Accessibility
36
  app.add_middleware(
37
  CORSMiddleware,
38
  allow_origins=["*"],
 
41
  allow_headers=["*"],
42
  )
43
 
44
# ✅ Force GPU usage if available; transformers pipelines take a device
# index, where 0 is the first CUDA device and -1 means CPU.
device = 0 if torch.cuda.is_available() else -1
if device == 0:
    print("🚀 Running on GPU!")
else:
    print("⚠️ GPU Not Available! Running on CPU.")

# ✅ Load the summarization model, forcing its weights into /tmp/.
# NOTE: `cache_dir` is not a parameter of pipeline() itself; it must be
# forwarded to the underlying from_pretrained() call via `model_kwargs`.
summarizer = pipeline(
    "summarization",
    model="google/pegasus-xsum",
    device=device,
    model_kwargs={"cache_dir": TMP_DIR},
)
58
 
59
  # --- **Generalized Cleaning** ---
60
  def clean_text(text):
 
84
  chunks.append(current_chunk.strip())
85
  return chunks
86
 
87
+ # ✅ **Summarization API**
88
  @app.post("/summarize-pdf/")
89
  async def summarize_pdf(file: UploadFile = File(...)):
90
  try:
91
  start_time = time.time()
92
  pdf_content = await file.read()
93
+ pdf_path = "/tmp/temp.pdf"  # ✅ Store in /tmp/
94
  with open(pdf_path, "wb") as f:
95
  f.write(pdf_content)
96
 
 
108
  except Exception as e:
109
  return {"error": str(e)}
110
 
111
+ # ✅ Start Uvicorn for Hugging Face Spaces
112
  if __name__ == "__main__":
113
  uvicorn.run(app, host="0.0.0.0", port=7860)