import os
import re
import shutil

import pdfplumber
import docx
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from pdf2image import convert_from_path
import pytesseract

# --- Debug check for system binaries ---
print("pdftoppm path:", shutil.which("pdftoppm"))
print("pdftocairo path:", shutil.which("pdftocairo"))
print("tesseract path:", shutil.which("tesseract"))


# ------------------- Text extraction -------------------
def extract_text(path: str, max_chars: int = 4000) -> str:
    ext = os.path.splitext(path)[1].lower()
    text = ""
    try:
        if ext == ".pdf":
            # First try pdfplumber (fast for PDFs with an embedded text layer)
            with pdfplumber.open(path) as pdf:
                chunks = []
                for page in pdf.pages:
                    t = page.extract_text() or ""
                    if t.strip():
                        chunks.append(t)
                    if sum(len(c) for c in chunks) >= max_chars:
                        break
                text = "\n".join(chunks)

            # If still empty (e.g. a scanned PDF) → fallback to OCR
            if not text.strip():
                images = convert_from_path(path, dpi=200)
                ocr_text = []
                for img in images[:3]:  # limit 3 pages for speed
                    ocr_text.append(pytesseract.image_to_string(img))
                text = "\n".join(ocr_text)

        elif ext == ".docx":
            d = docx.Document(path)
            text = "\n".join(p.text for p in d.paragraphs)

        elif ext == ".txt":
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()

        else:
            return ""
    except Exception as e:
        return f"[Error: {e}]"

    # Collapse whitespace and cap the length fed to the detector
    text = re.sub(r"\s+", " ", text).strip()
    return text[:max_chars]


# ------------------- Load Detector -------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "Hello-SimpleAI/chatgpt-detector-roberta"  # ✅ open-source

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)


# ------------------- Detection -------------------
def detect_ai(files):
    results = []
    if not files:  # Gradio passes None when the file input is cleared
        return results
    for path in files:
        text = extract_text(path)
        if not text or text.startswith("[Error"):
            # Pad the row so it matches the 4-column Dataframe below
            results.append([os.path.basename(path), "n/a", "❌ Could not extract text", "n/a"])
            continue

        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
        with torch.no_grad():
            logits = model(**inputs).logits
        probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
        # Assumed label order: index 0 = Human, index 1 = AI/ChatGPT;
        # verify against model.config.id2label if in doubt.
        human_score, ai_score = probs[0], probs[1]

        # Convert AI probability → Rating 1–10
        rating = round(ai_score * 10)
        rating = max(1, min(10, rating))

        results.append([os.path.basename(path), rating,
                        f"AI: {ai_score:.2f}", f"Human: {human_score:.2f}"])
    return results


# ------------------- Gradio UI -------------------
with gr.Blocks() as demo:
    gr.Markdown("# 🔍 AI / Human Text Detector (Open Source)")
    gr.Markdown("Upload **PDF/DOCX/TXT** files. Output is a **1–10 AI-likelihood rating**.")

    file_input = gr.File(file_types=[".pdf", ".docx", ".txt"], type="filepath", file_count="multiple")
    output = gr.Dataframe(headers=["File", "AI Rating (1=Human, 10=AI)", "AI Score", "Human Score"],
                          label="Results")

    file_input.change(detect_ai, inputs=file_input, outputs=output)

demo.launch()
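
# ------------------- Setup notes -------------------
# A sketch of the assumed environment (exact package names may vary):
#   Python deps:  pip install pdfplumber python-docx torch gradio \
#                 transformers pdf2image pytesseract
#   System deps:  pdf2image needs the poppler tools (pdftoppm/pdftocairo)
#                 and pytesseract needs the tesseract binary, e.g. on
#                 Debian/Ubuntu: apt-get install poppler-utils tesseract-ocr
# The shutil.which() checks at the top of this file print where those
# binaries resolve, so a missing system package shows up in the logs
# before the first PDF is processed.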