talhashoaib committed
Commit f1a5550 · verified · 1 Parent(s): 6c75a17

Update app.py

Files changed (1)
  1. app.py +79 -36
app.py CHANGED
@@ -1,50 +1,93 @@
-import gradio as gr
 import pdfplumber
 from pdf2image import convert_from_path
 import pytesseract
-import shutil

-def extract_text_debug(file_path):
-    logs = []
     text = ""

-    # Debug paths
-    logs.append(f"pdftoppm path: {shutil.which('pdftoppm')}")
-    logs.append(f"pdftocairo path: {shutil.which('pdftocairo')}")

-    # Try pdfplumber
-    try:
-        with pdfplumber.open(file_path) as pdf:
-            for page in pdf.pages:
-                t = page.extract_text() or ""
-                text += t
-        if text.strip():
-            logs.append("✅ Extracted text using pdfplumber")
-            return text[:800] + "\n\n---\n" + "\n".join(logs)
-        else:
-            logs.append("⚠️ pdfplumber gave empty text, trying OCR…")
-    except Exception as e:
-        logs.append(f"❌ pdfplumber failed: {e}")

-    # OCR fallback
-    try:
-        images = convert_from_path(file_path, dpi=200, poppler_path="/usr/bin")
-        ocr_text = [pytesseract.image_to_string(img) for img in images[:2]]
-        text = "\n".join(ocr_text)
-        if text.strip():
-            logs.append("✅ OCR worked via pdf2image + Tesseract")
         else:
-            logs.append("⚠️ OCR returned empty text")
     except Exception as e:
-        logs.append(f"❌ OCR fallback failed: {e}")

-    return (text[:800] if text.strip() else "❌ No text extracted") + "\n\n---\n" + "\n".join(logs)

 with gr.Blocks() as demo:
-    gr.Markdown("# 📄 PDF Extractor Debug")
-    inp = gr.File(file_types=[".pdf"], type="filepath")
-    out = gr.Textbox(lines=20, label="Text + Debug Logs")
-    inp.change(extract_text_debug, inputs=inp, outputs=out)

-if __name__ == "__main__":
-    demo.launch()
+import os, re, shutil
 import pdfplumber
+import docx
+import torch
+import gradio as gr
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from pdf2image import convert_from_path
 import pytesseract
+
+# --- Debug check for system binaries ---
+print("pdftoppm path:", shutil.which("pdftoppm"))
+print("pdftocairo path:", shutil.which("pdftocairo"))
+print("tesseract path:", shutil.which("tesseract"))
+
+# ------------------- Text extraction -------------------
+def extract_text(path: str, max_chars: int = 4000) -> str:
+    ext = os.path.splitext(path.lower())[1]
     text = ""
+    try:
+        if ext == ".pdf":
+            # First try pdfplumber
+            with pdfplumber.open(path) as pdf:
+                chunks = []
+                for page in pdf.pages:
+                    t = page.extract_text() or ""
+                    if t.strip():
+                        chunks.append(t)
+                    if sum(len(c) for c in chunks) >= max_chars:
+                        break
+                text = "\n".join(chunks)
+
+            # If still empty → fallback to OCR
+            if not text.strip():
+                images = convert_from_path(path, dpi=200)
+                ocr_text = []
+                for img in images[:3]:  # limit 3 pages for speed
+                    ocr_text.append(pytesseract.image_to_string(img))
+                text = "\n".join(ocr_text)
+
+        elif ext == ".docx":
+            d = docx.Document(path)
+            text = "\n".join(p.text for p in d.paragraphs)
+
+        elif ext == ".txt":
+            with open(path, "r", encoding="utf-8", errors="ignore") as f:
+                text = f.read()
         else:
+            return ""
     except Exception as e:
+        return f"[Error: {e}]"
+
+    text = re.sub(r"\s+", " ", text).strip()
+    return text[:max_chars]
+
+# ------------------- Load Detector -------------------
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model_name = "Hello-SimpleAI/chatgpt-detector-roberta"  # ✅ open-source
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
+
+# ------------------- Detection -------------------
+def detect_ai(files):
+    results = []
+    for path in files:
+        text = extract_text(path)
+        if not text or text.startswith("[Error"):
+            results.append([os.path.basename(path), "❌ Could not extract text"])
+            continue
+
+        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
+        with torch.no_grad():
+            logits = model(**inputs).logits
+        probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
+        human_score, ai_score = probs[0], probs[1]
+
+        # Convert AI probability → Rating 1–10
+        rating = round(ai_score * 10)
+        rating = max(1, min(10, rating))
+
+        results.append([os.path.basename(path), rating, f"AI: {ai_score:.2f}", f"Human: {human_score:.2f}"])
+    return results
+
+# ------------------- Gradio UI -------------------
 with gr.Blocks() as demo:
+    gr.Markdown("# 🔍 AI / Human Text Detector (Open Source)")
+    gr.Markdown("Upload **PDF/DOCX/TXT** files. Output is a **1–10 AI-likelihood rating**.")
+
+    file_input = gr.File(file_types=[".pdf", ".docx", ".txt"], type="filepath", file_count="multiple")
+    output = gr.Dataframe(headers=["File", "AI Rating (1=Human, 10=AI)", "AI Score", "Human Score"], label="Results")
+
+    file_input.change(detect_ai, inputs=file_input, outputs=output)
+
+demo.launch()
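
For readers who want to try the scoring step from this commit in isolation, the sketch below runs the same Hello-SimpleAI/chatgpt-detector-roberta checkpoint on a short string and applies the identical probability-to-rating mapping used in detect_ai(). It assumes the label order the app relies on (index 0 = human, index 1 = AI) and that torch and transformers are installed; sample_text is only an illustrative placeholder, not part of the app.

# Minimal standalone sketch of the detection step added in this commit.
# Assumes torch + transformers are installed and the checkpoint can be downloaded.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "Hello-SimpleAI/chatgpt-detector-roberta"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

sample_text = "Example passage to score."  # illustrative placeholder

inputs = tokenizer(sample_text, return_tensors="pt", truncation=True, max_length=512)
with torch.no_grad():
    logits = model(**inputs).logits
probs = torch.softmax(logits, dim=-1)[0]

# Assumed label order, matching the app above: index 0 = human, index 1 = AI.
human_score, ai_score = probs[0].item(), probs[1].item()

# Same probability-to-rating mapping as detect_ai(): clamp round(ai * 10) into 1..10.
rating = max(1, min(10, round(ai_score * 10)))
print(f"AI: {ai_score:.2f}  Human: {human_score:.2f}  Rating: {rating}/10")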