import os
import re
import shutil

import pdfplumber
import docx
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from pdf2image import convert_from_path
import pytesseract

# --- Debug check for system binaries ---
print("pdftoppm path:", shutil.which("pdftoppm"))
print("pdftocairo path:", shutil.which("pdftocairo"))
print("tesseract path:", shutil.which("tesseract"))


# ------------------- Text extraction -------------------
def extract_text(path: str, max_chars: int = 4000) -> str:
    ext = os.path.splitext(path)[1].lower()
    text = ""
    try:
        if ext == ".pdf":
            # First try pdfplumber (fast for PDFs with an embedded text layer)
            with pdfplumber.open(path) as pdf:
                chunks = []
                for page in pdf.pages:
                    t = page.extract_text() or ""
                    if t.strip():
                        chunks.append(t)
                    if sum(len(c) for c in chunks) >= max_chars:
                        break
                text = "\n".join(chunks)

            # If still empty (e.g. a scanned PDF) → fallback to OCR
            if not text.strip():
                images = convert_from_path(path, dpi=200)
                ocr_text = []
                for img in images[:3]:  # limit 3 pages for speed
                    ocr_text.append(pytesseract.image_to_string(img))
                text = "\n".join(ocr_text)

        elif ext == ".docx":
            d = docx.Document(path)
            text = "\n".join(p.text for p in d.paragraphs)

        elif ext == ".txt":
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()

        else:
            return ""
    except Exception as e:
        return f"[Error: {e}]"

    # Collapse whitespace and cap the length fed to the detector
    text = re.sub(r"\s+", " ", text).strip()
    return text[:max_chars]


# ------------------- Load Detector -------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "Hello-SimpleAI/chatgpt-detector-roberta"  # ✅ open-source

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)


# ------------------- Detection -------------------
def detect_ai(files):
    results = []
    if not files:  # Gradio passes None when the file input is cleared
        return results
    for path in files:
        text = extract_text(path)
        if not text or text.startswith("[Error"):
            # Pad the row so it matches the 4-column Dataframe below
            results.append([os.path.basename(path), "n/a", "❌ Could not extract text", "n/a"])
            continue

        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
        with torch.no_grad():
            logits = model(**inputs).logits
        probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
        # Assumed label order: index 0 = Human, index 1 = AI/ChatGPT;
        # verify against model.config.id2label if in doubt.
        human_score, ai_score = probs[0], probs[1]

        # Convert AI probability → Rating 1–10
        rating = round(ai_score * 10)
        rating = max(1, min(10, rating))

        results.append([os.path.basename(path), rating,
                        f"AI: {ai_score:.2f}", f"Human: {human_score:.2f}"])
    return results


# ------------------- Gradio UI -------------------
with gr.Blocks() as demo:
    gr.Markdown("# 🔍 AI / Human Text Detector (Open Source)")
    gr.Markdown("Upload **PDF/DOCX/TXT** files. Output is a **1–10 AI-likelihood rating**.")

    file_input = gr.File(file_types=[".pdf", ".docx", ".txt"], type="filepath", file_count="multiple")
    output = gr.Dataframe(headers=["File", "AI Rating (1=Human, 10=AI)", "AI Score", "Human Score"],
                          label="Results")

    file_input.change(detect_ai, inputs=file_input, outputs=output)

demo.launch()
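
# ------------------- Setup notes -------------------
# A sketch of the assumed environment (exact package names may vary):
#   Python deps:  pip install pdfplumber python-docx torch gradio \
#                 transformers pdf2image pytesseract
#   System deps:  pdf2image needs the poppler tools (pdftoppm/pdftocairo)
#                 and pytesseract needs the tesseract binary, e.g. on
#                 Debian/Ubuntu: apt-get install poppler-utils tesseract-ocr
# The shutil.which() checks at the top of this file print where those
# binaries resolve, so a missing system package shows up in the logs
# before the first PDF is processed.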