import gradio as gr
import torch
import os
import shutil
import gc

from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from peft import PeftModel
from huggingface_hub import HfApi

# Configuration - UPDATE THESE
LORA_REPO = "your-username/qwen25vl-lora-adapter"      # Your LoRA repo
OUTPUT_REPO = "your-username/qwen25vl-invoice-merged"  # Output repo
BASE_MODEL = "unsloth/Qwen2.5-VL-7B-Instruct"
HF_TOKEN = os.environ.get("HF_TOKEN")  # Set in Space secrets


def merge_model():
    """Merge the LoRA adapter with the base model and upload the result to the Hub.

    Implemented as a generator so Gradio can stream status updates to the UI.
    """
    try:
        if not HF_TOKEN:
            yield "❌ Error: HF_TOKEN is not set. Add it as a secret in the Space settings."
            return

        # Use the Space's disk efficiently
        work_dir = "/tmp/merge"
        if os.path.exists(work_dir):
            shutil.rmtree(work_dir)
        os.makedirs(work_dir)

        # Update status
        yield "Loading base model..."

        # Load the base model; device_map="auto" lets accelerate offload to CPU
        # to save GPU memory. Qwen2.5-VL checkpoints require the
        # Qwen2_5_VLForConditionalGeneration class (transformers >= 4.49).
        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            BASE_MODEL,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
            cache_dir=work_dir,
            low_cpu_mem_usage=True,
        )

        yield "Loading processor..."
        processor = AutoProcessor.from_pretrained(
            BASE_MODEL,
            trust_remote_code=True,
            cache_dir=work_dir,
        )

        yield "Loading LoRA adapter from Hub..."
        model = PeftModel.from_pretrained(model, LORA_REPO)

        yield "Merging weights... This may take a few minutes..."
        model = model.merge_and_unload()

        # Clear GPU cache
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()

        yield "Saving merged model..."
        output_dir = os.path.join(work_dir, "merged")
        os.makedirs(output_dir, exist_ok=True)

        # Save with smaller shards
        model.save_pretrained(
            output_dir,
            max_shard_size="2GB",
            safe_serialization=True,
        )
        processor.save_pretrained(output_dir)

        yield "Uploading to HuggingFace Hub..."
        api = HfApi(token=HF_TOKEN)

        # Create output repo
        api.create_repo(OUTPUT_REPO, exist_ok=True, private=True)

        # Upload the merged model
        api.upload_folder(
            folder_path=output_dir,
            repo_id=OUTPUT_REPO,
            repo_type="model",
            commit_message="Merged LoRA adapter with base model",
        )

        # Cleanup
        shutil.rmtree(work_dir)
        yield f"✅ Success! Model merged and uploaded to: {OUTPUT_REPO}"

    except Exception as e:
        yield f"❌ Error: {str(e)}"
        # Cleanup on error
        if os.path.exists("/tmp/merge"):
            shutil.rmtree("/tmp/merge")


# Create Gradio interface
def create_interface():
    with gr.Blocks(title="Qwen2.5-VL LoRA Merger") as demo:
        gr.Markdown(
            f"""
            # Qwen2.5-VL LoRA Merger

            This Space merges your LoRA adapter with the base model and uploads the result to HuggingFace.

            **Configuration:**
            - Base Model: `{BASE_MODEL}`
            - LoRA Adapter: `{LORA_REPO}`
            - Output Repo: `{OUTPUT_REPO}`
            """
        )

        status = gr.Textbox(label="Status", lines=10)
        merge_btn = gr.Button("Start Merge", variant="primary")

        merge_btn.click(
            fn=merge_model,
            inputs=[],
            outputs=[status],
        )

    return demo


if __name__ == "__main__":
    demo = create_interface()
    demo.launch()