import gradio as gr
import pdfplumber
import docx
import pytesseract
from pdf2image import convert_from_path
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import os

# Load the tokenizer and the sequence-classification model once, at import
# time, so every detection request reuses the same instances.
MODEL_NAME = "roberta-base-openai-detector"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# ---- File Processing ----
def extract_text_from_file(file_path):
    """Extract plain text from a PDF, DOCX, or TXT file.

    The file type is chosen from the file extension, compared
    case-insensitively so names such as ``REPORT.PDF`` are handled too.

    Args:
        file_path: Path to the file to read. ``None`` or an empty value
            (no file selected) yields an empty string.

    Returns:
        The extracted text with surrounding whitespace stripped; an empty
        string when the extension is unsupported or nothing could be read;
        or a message starting with "❌" when extraction raised an error.
    """
    if not file_path:
        return ""

    extension = os.path.splitext(file_path)[1].lower()
    text = ""

    if extension == ".pdf":
        try:
            # First try the embedded text layer via pdfplumber. The `or ""`
            # guards against pages for which no text is returned.
            with pdfplumber.open(file_path) as pdf:
                text = "\n".join(page.extract_text() or "" for page in pdf.pages)

            # If nothing was extracted, fall back to OCR on rendered pages.
            if not text.strip():
                images = convert_from_path(file_path)
                text = "\n".join(pytesseract.image_to_string(img) for img in images)
        except Exception as e:
            return f"❌ PDF extraction error: {str(e)}"

    elif extension == ".docx":
        try:
            doc = docx.Document(file_path)
            text = " ".join(para.text for para in doc.paragraphs)
        except Exception as e:
            return f"❌ DOCX extraction error: {str(e)}"

    elif extension == ".txt":
        try:
            # errors="ignore" drops undecodable bytes instead of failing.
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
        except Exception as e:
            return f"❌ TXT extraction error: {str(e)}"

    return text.strip()

# ---- AI Detection ----
def detect_ai(file_path):
    """Score how likely the text of an uploaded file is AI-generated.

    Args:
        file_path: Path of the uploaded file, or ``None`` when no file is
            selected (for example after the upload has been cleared).

    Returns:
        A user-facing message: the likelihood on a 0–10 scale, an error
        message when no text could be extracted, or an empty string when
        no file is selected.
    """
    # Nothing to analyse; return an empty result instead of crashing.
    if not file_path:
        return ""

    text = extract_text_from_file(file_path)
    if not text or text.startswith("❌"):
        return "❌ Could not extract text"

    # Pre-trim by characters; truncation=True additionally lets the tokenizer
    # cut the input down to the model's limit.
    inputs = tokenizer(text[:1000], return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)

    # A single text is passed, so the logits form one row per batch entry;
    # select row 0 before indexing a class (indexing the batch axis with 1
    # raises IndexError).
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]

    # Resolve the "AI-generated" class from the model's own label names when
    # they are descriptive; otherwise keep the original choice of index 1.
    # NOTE(review): confirm the label order of the checkpoint in use.
    id2label = getattr(model.config, "id2label", None) or {}
    ai_index = next(
        (
            int(idx)
            for idx, label in id2label.items()
            if str(label).strip().lower() in ("fake", "ai", "generated", "machine")
        ),
        1,
    )
    ai_prob = probs[ai_index].item()

    score = round(ai_prob * 10, 2)  # scale 0–10
    return f"🤖 AI-Generated Likelihood: {score}/10"

# ---- Gradio UI ----
# Single-page layout: a heading, a file upload restricted to the supported
# extensions, and a textbox that displays the detection result.
with gr.Blocks() as demo:
    gr.Markdown("## 📄 AI Text Detector\nUpload a PDF, DOCX, or TXT file to check if it's AI-generated.")

    # type="filepath" so that detect_ai receives a filesystem path.
    file_input = gr.File(type="filepath", file_types=[".pdf", ".docx", ".txt"], label="Upload File")
    output = gr.Textbox(label="Detection Result")

    # Re-run detection whenever the uploaded file changes.
    file_input.change(fn=detect_ai, inputs=file_input, outputs=output)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()