from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from huggingface_hub import snapshot_download
from safetensors.torch import load_file


class ModelInput(BaseModel):
    prompt: str
    max_new_tokens: int = 2048


app = FastAPI()

# Define model paths
base_model_path = "HuggingFaceTB/SmolLM2-135M-Instruct"
adapter_path = "khurrameycon/SmolLM-135M-Instruct-qa_pairs_converted.json-25epochs"


# Load the model and tokenizer
def load_model_and_tokenizer():
    try:
        print("Loading base model...")
        model = AutoModelForCausalLM.from_pretrained(
            base_model_path,
            torch_dtype=torch.float16,
            trust_remote_code=True,
            device_map="auto",
        )

        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(base_model_path)

        print("Downloading adapter weights...")
        adapter_path_local = snapshot_download(repo_id=adapter_path)

        print("Loading adapter weights...")
        adapter_file = f"{adapter_path_local}/adapter_model.safetensors"
        state_dict = load_file(adapter_file)

        print("Applying adapter weights...")
        # NOTE: load_state_dict(strict=False) silently skips any tensors whose
        # names don't match the base model's parameters. This works for merged
        # or full fine-tuned checkpoints; a PEFT-style LoRA adapter (with
        # lora_A/lora_B keys) would instead need peft.PeftModel.from_pretrained.
        model.load_state_dict(state_dict, strict=False)

        print("Model and adapter loaded successfully!")
        return model, tokenizer
    except Exception as e:
        print(f"Error during model loading: {e}")
        raise


model, tokenizer = load_model_and_tokenizer()


def generate_response(model, tokenizer, instruction, max_new_tokens=2048):
    """Generate a response from the model based on an instruction."""
    try:
        # Encode input with truncation
        inputs = tokenizer.encode(
            instruction,
            return_tensors="pt",
            truncation=True,
            max_length=tokenizer.model_max_length,
        ).to(model.device)

        # Generate response; pad_token_id is set explicitly to avoid the
        # "Setting pad_token_id to eos_token_id" warning at runtime
        outputs = model.generate(
            inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

        # Decode only the newly generated tokens. Slicing by token count is
        # more reliable than stripping the prompt string from the decoded
        # output, since decoding can alter whitespace or special characters.
        generated_text = tokenizer.decode(
            outputs[0][inputs.shape[-1]:], skip_special_tokens=True
        ).strip()

        print(f"Instruction: {instruction}")  # Debugging line
        print(f"Generated Response: {generated_text}")  # Debugging line
        return generated_text
    except Exception as e:
        print(f"Error generating response: {e}")
        raise ValueError(f"Error generating response: {e}")


@app.post("/generate")
async def generate_text(input: ModelInput):
    try:
        response = generate_response(
            model=model,
            tokenizer=tokenizer,
            instruction=input.prompt,
            max_new_tokens=input.max_new_tokens,
        )
        return {"generated_text": response}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/")
async def root():
    return {"message": "Welcome to the Model API!"}
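
# Usage sketch (the filename app.py below is an assumption, not stated in the
# source). Serve the app with uvicorn, the standard ASGI server for FastAPI:
#
#   uvicorn app:app --host 0.0.0.0 --port 8000
#
# Then exercise the /generate endpoint with a JSON body matching ModelInput:
#
#   curl -X POST http://localhost:8000/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "What is FastAPI?", "max_new_tokens": 128}'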