import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import gradio as gr
import os
import json
import gc
from huggingface_hub import hf_hub_download
import shutil
import tempfile

# Free up memory
gc.collect()

print("Setting up model loading...")

# Create a temporary directory for model modifications
temp_dir = tempfile.mkdtemp()
print(f"Created temporary directory: {temp_dir}")

# Your model name
model_name = "unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit"

try:
    # Download the config.json file
    print("Downloading configuration file...")
    config_path = hf_hub_download(repo_id=model_name, filename="config.json")

    # Load and modify the config to remove quantization
    print("Modifying configuration...")
    with open(config_path, 'r') as file:
        config_dict = json.load(file)

    # Remove any quantization configs
    if "quantization_config" in config_dict:
        del config_dict["quantization_config"]

    # Save the modified config to the temp directory
    modified_config_path = os.path.join(temp_dir, "config.json")
    with open(modified_config_path, 'w') as file:
        json.dump(config_dict, file)
    print("Modified configuration saved")

    # Now try to load with the modified config
    print("Loading model with modified configuration...")
    config = AutoConfig.from_pretrained(temp_dir)
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        config=config,
        device_map="auto",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        trust_remote_code=True
    )
    print("Model loaded successfully")
except Exception as e:
    print(f"Error during custom loading: {e}")
    # If the first approach fails, try a direct approach with explicit params
    print("Attempting alternative loading method...")
    try:
        base_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            load_in_4bit=False,
            load_in_8bit=False,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )
        print("Model loaded with alternative method")
    except Exception as e2:
        print(f"Error with alternative loading: {e2}")
        raise RuntimeError("Failed to load model in any format")
finally:
    # Clean up temp directory
    shutil.rmtree(temp_dir)
    print("Cleaned up temporary directory")

# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Function to generate response
def generate_response(message, history):
    # Format history for the model
    prompt = ""
    if history:
        for user_msg, assistant_msg in history:
            prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
    prompt += f"User: {message}\nAssistant: "

    print("Tokenizing input...")
    inputs = tokenizer(prompt, return_tensors="pt").to(base_model.device)

    # Free up memory before generation
    gc.collect()

    print("Generating response...")
    with torch.no_grad():  # Disable gradient calculation to save memory
        outputs = base_model.generate(
            **inputs,
            max_new_tokens=256,  # Reduced from 300 to conserve memory
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95
        )

    print("Decoding response...")
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract only the assistant's response
    if "Assistant: " in response:
        response = response.split("Assistant: ")[-1]

    return response

# Launch Gradio UI with memory-efficient settings
print("Setting up Gradio interface...")
with gr.Blocks() as demo:
    gr.Markdown("### 🦙 Chat with Your Fine-tuned LLaMA 3.2 3B")
    chatbot = gr.ChatInterface(
        generate_response,
        chatbot=gr.Chatbot(height=400),
        textbox=gr.Textbox(placeholder="Type your message here...", container=False, scale=7),
        submit_btn="Send",
        retry_btn="Retry",
        clear_btn="Clear",
    )

print("Launching interface...")
demo.launch(share=False, show_api=False)  # Disable sharing and API to save resources