import gradio as gr
import requests
import os

# Retrieve the Hugging Face token from the HUGGINGFACE_TOKEN environment
# variable. The check runs at import time so a missing secret is reported
# immediately rather than on the first API request.
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
if not HF_TOKEN:
    # The message names the variable that is actually read above, so that
    # following the instruction really resolves the error.
    raise ValueError("Please set your Hugging Face API token as HUGGINGFACE_TOKEN in the Secrets settings.")

# Model details
# Llama 3.2 has no 70B text checkpoint; point at the 3B Instruct model that
# the UI title advertises. Change this to the exact repo ID of the model you
# want to serve.
MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"
API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"

# Headers for API requests: bearer-token auth plus a JSON request body.
HEADERS = {
    "Authorization": f"Bearer {HF_TOKEN}",
    "Content-Type": "application/json",
}

def chat_with_llama(
    prompt: str,
    temperature: float = 0.7,
    max_tokens: int = 256,
    *,
    timeout: float = 60.0,
) -> str:
    """Send a prompt to the Hugging Face Inference API and return the reply.

    Failures are reported as human-readable strings rather than raised,
    because the return value is displayed directly in the UI output box.

    Args:
        prompt: Text to send as the model input.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Upper bound on newly generated tokens. Coerced to ``int``
            because UI sliders may deliver the value as a float.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``generated_text`` of the first result on success, otherwise a
        message describing what went wrong.
    """
    # Skip the network round trip when there is nothing to send.
    if not prompt or not prompt.strip():
        return "Please enter a prompt."

    payload = {
        "inputs": prompt,
        "parameters": {
            "temperature": float(temperature),
            "max_new_tokens": int(max_tokens),
            "top_p": 0.95,
        },
    }

    try:
        # Without a timeout a stalled connection would block the handler
        # indefinitely.
        response = requests.post(
            API_URL, headers=HEADERS, json=payload, timeout=timeout
        )
    except requests.RequestException as exc:
        return f"Error: request to the inference API failed: {exc}"

    if response.status_code != 200:
        return f"Error {response.status_code}: {response.text}"

    try:
        data = response.json()
    except ValueError:
        # requests raises a ValueError subclass when the body is not JSON.
        return f"Error: could not decode API response: {response.text}"

    try:
        return data[0]["generated_text"]
    except (IndexError, KeyError, TypeError):
        # A 200 response whose body is not a non-empty list of results.
        return f"Error: unexpected API response format: {data}"

# Gradio UI: prompt and generation settings on the left, model reply on the right.
with gr.Blocks() as demo:
    gr.Markdown("<h2 align='center'>🚀 Llama 3.2 3B Instruct Chatbot</h2>")

    with gr.Row():
        # Left column: the prompt plus the two sampling controls.
        with gr.Column():
            prompt = gr.Textbox(
                lines=3,
                label="Enter your prompt:",
                placeholder="Ask me anything...",
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.5,
                value=0.7,
                label="Temperature",
            )
            max_tokens = gr.Slider(
                minimum=50,
                maximum=1024,
                value=256,
                label="Max Tokens",
            )
            submit = gr.Button(value="Generate Response")

        # Right column: read-only box that receives the generated text.
        with gr.Column():
            output = gr.Textbox(
                lines=10,
                interactive=False,
                label="AI Response",
            )

    # Clicking the button passes the three input values to chat_with_llama
    # and writes its return value into the output box.
    submit.click(
        fn=chat_with_llama,
        inputs=[prompt, temperature, max_tokens],
        outputs=output,
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()