import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

# Model name and generation parameters
model_name = "xiddiqui/News_Summarizer"
max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None           # None for auto detection
load_in_4bit = False   # False since we aren't using 4-bit quantization

# Run everything on the CPU explicitly
device = "cpu"

# Load model and tokenizer
# 1. Load the base model (unsloth/Meta-Llama-3.1-8B)
base_model_name = "unsloth/Meta-Llama-3.1-8B"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Load in full precision on CPU; skip 4-bit loading to avoid a bitsandbytes dependency
model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float32)

# 2. Load the fine-tuned LoRA adapter on top of the base model
adapter_model_name = "xiddiqui/News_Summarizer"  # Adapter path on Hugging Face
model = PeftModel.from_pretrained(model, adapter_model_name)

# Move the model to CPU (no GPU needed)
model.to(device)


# Define the summarization function
def generate_summary(input_text):
    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Summarize the following:

### Input:
{}

### Response:
"""

    # Tokenize and prepare the input text
    inputs = tokenizer(
        [alpaca_prompt.format(input_text)],
        return_tensors="pt",
        truncation=True,
        max_length=max_seq_length,
    ).to(device)  # Ensure computations are done on the CPU

    # Generate the summary
    summary_ids = model.generate(
        **inputs,
        max_new_tokens=128,      # Limit the length of the generated summary (not the prompt)
        num_beams=4,             # Beam search for higher-quality output
        no_repeat_ngram_size=2,  # Avoid repeating n-grams in the summary
        early_stopping=True,
    )

    # Decode only the newly generated tokens, dropping the prompt from the output
    summary = tokenizer.decode(
        summary_ids[0][inputs["input_ids"].shape[1]:],
        skip_special_tokens=True,
    )
    return summary


# Set up the Gradio interface
iface = gr.Interface(
    fn=generate_summary,  # Function called on user input
    inputs="text",        # Single text box for user input
    outputs="text",       # Output as text
    live=True,            # Optional: updates the summary as the user types
    title="News Summarizer",  # Title of the app
    description="Enter a news article, and get a concise summary of the content.",
)

# Launch the Gradio app
iface.launch()