talhashoaib committed
Commit f1a5550 · verified · 1 Parent(s): 6c75a17

Update app.py

Files changed (1)
  1. app.py +79 -36
app.py CHANGED
@@ -1,50 +1,93 @@
-import gradio as gr
 import pdfplumber
 from pdf2image import convert_from_path
 import pytesseract
-import shutil

-def extract_text_debug(file_path):
-    logs = []
     text = ""

-    # Debug paths
-    logs.append(f"pdftoppm path: {shutil.which('pdftoppm')}")
-    logs.append(f"pdftocairo path: {shutil.which('pdftocairo')}")

-    # Try pdfplumber
-    try:
-        with pdfplumber.open(file_path) as pdf:
-            for page in pdf.pages:
-                t = page.extract_text() or ""
-                text += t
-        if text.strip():
-            logs.append("✅ Extracted text using pdfplumber")
-            return text[:800] + "\n\n---\n" + "\n".join(logs)
-        else:
-            logs.append("⚠️ pdfplumber gave empty text, trying OCR…")
-    except Exception as e:
-        logs.append(f"❌ pdfplumber failed: {e}")

-    # OCR fallback
-    try:
-        images = convert_from_path(file_path, dpi=200, poppler_path="/usr/bin")
-        ocr_text = [pytesseract.image_to_string(img) for img in images[:2]]
-        text = "\n".join(ocr_text)
-        if text.strip():
-            logs.append("✅ OCR worked via pdf2image + Tesseract")
         else:
-            logs.append("⚠️ OCR returned empty text")
     except Exception as e:
-        logs.append(f"❌ OCR fallback failed: {e}")

-    return (text[:800] if text.strip() else "❌ No text extracted") + "\n\n---\n" + "\n".join(logs)

 with gr.Blocks() as demo:
-    gr.Markdown("# 📄 PDF Extractor Debug")
-    inp = gr.File(file_types=[".pdf"], type="filepath")
-    out = gr.Textbox(lines=20, label="Text + Debug Logs")
-    inp.change(extract_text_debug, inputs=inp, outputs=out)

-if __name__ == "__main__":
-    demo.launch()
+import os, re, shutil
 import pdfplumber
+import docx
+import torch
+import gradio as gr
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from pdf2image import convert_from_path
 import pytesseract
+
+# --- Debug check for system binaries ---
+print("pdftoppm path:", shutil.which("pdftoppm"))
+print("pdftocairo path:", shutil.which("pdftocairo"))
+print("tesseract path:", shutil.which("tesseract"))
+
+# ------------------- Text extraction -------------------
+def extract_text(path: str, max_chars: int = 4000) -> str:
+    ext = os.path.splitext(path.lower())[1]
     text = ""
+    try:
+        if ext == ".pdf":
+            # First try pdfplumber
+            with pdfplumber.open(path) as pdf:
+                chunks = []
+                for page in pdf.pages:
+                    t = page.extract_text() or ""
+                    if t.strip():
+                        chunks.append(t)
+                    if sum(len(c) for c in chunks) >= max_chars:
+                        break
+                text = "\n".join(chunks)
+
+            # If still empty → fallback to OCR
+            if not text.strip():
+                images = convert_from_path(path, dpi=200)
+                ocr_text = []
+                for img in images[:3]:  # limit 3 pages for speed
+                    ocr_text.append(pytesseract.image_to_string(img))
+                text = "\n".join(ocr_text)
+
+        elif ext == ".docx":
+            d = docx.Document(path)
+            text = "\n".join(p.text for p in d.paragraphs)
+
+        elif ext == ".txt":
+            with open(path, "r", encoding="utf-8", errors="ignore") as f:
+                text = f.read()
         else:
+            return ""
     except Exception as e:
+        return f"[Error: {e}]"
+
+    text = re.sub(r"\s+", " ", text).strip()
+    return text[:max_chars]
+
+# ------------------- Load Detector -------------------
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model_name = "Hello-SimpleAI/chatgpt-detector-roberta"  # ✅ open-source
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
+
+# ------------------- Detection -------------------
+def detect_ai(files):
+    results = []
+    for path in files:
+        text = extract_text(path)
+        if not text or text.startswith("[Error"):
+            results.append([os.path.basename(path), "❌ Could not extract text"])
+            continue
+
+        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
+        with torch.no_grad():
+            logits = model(**inputs).logits
+        probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
+        human_score, ai_score = probs[0], probs[1]
+
+        # Convert AI probability → Rating 1–10
+        rating = round(ai_score * 10)
+        rating = max(1, min(10, rating))
+
+        results.append([os.path.basename(path), rating, f"AI: {ai_score:.2f}", f"Human: {human_score:.2f}"])
+    return results
+
+# ------------------- Gradio UI -------------------
 with gr.Blocks() as demo:
+    gr.Markdown("# 🔍 AI / Human Text Detector (Open Source)")
+    gr.Markdown("Upload **PDF/DOCX/TXT** files. Output is a **1–10 AI-likelihood rating**.")
+
+    file_input = gr.File(file_types=[".pdf", ".docx", ".txt"], type="filepath", file_count="multiple")
+    output = gr.Dataframe(headers=["File", "AI Rating (1=Human, 10=AI)", "AI Score", "Human Score"], label="Results")
+
+    file_input.change(detect_ai, inputs=file_input, outputs=output)
+
+demo.launch()
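
For readers who want to try the scoring step from this commit in isolation, the sketch below runs the same Hello-SimpleAI/chatgpt-detector-roberta checkpoint on a short string and applies the identical probability-to-rating mapping used in detect_ai(). It assumes the label order the app relies on (index 0 = human, index 1 = AI) and that torch and transformers are installed; sample_text is only an illustrative placeholder, not part of the app.

# Minimal standalone sketch of the detection step added in this commit.
# Assumes torch + transformers are installed and the checkpoint can be downloaded.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "Hello-SimpleAI/chatgpt-detector-roberta"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

sample_text = "Example passage to score."  # illustrative placeholder

inputs = tokenizer(sample_text, return_tensors="pt", truncation=True, max_length=512)
with torch.no_grad():
    logits = model(**inputs).logits
probs = torch.softmax(logits, dim=-1)[0]

# Assumed label order, matching the app above: index 0 = human, index 1 = AI.
human_score, ai_score = probs[0].item(), probs[1].item()

# Same probability-to-rating mapping as detect_ai(): clamp round(ai * 10) into 1..10.
rating = max(1, min(10, round(ai_score * 10)))
print(f"AI: {ai_score:.2f}  Human: {human_score:.2f}  Rating: {rating}/10")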