import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Repository names for the fine-tuned model and its tokenizer.
MODEL_NAME = "wedo2910/research_ai"
TOKENIZER_NAME = "wedo2910/research_ai_tok"


@st.cache_resource
def load_model_and_tokenizer():
    """Load the model and tokenizer once and cache them across Streamlit reruns."""
    if torch.cuda.is_available():
        device = "cuda"
        # On GPU, let the model auto-map itself onto the available device(s).
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,
            device_map="auto",
        )
    else:
        device = "cpu"
        # Force CPU loading; this bypasses GPU-specific integrations such as bitsandbytes.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,
            device_map="cpu",
        )

    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, trust_remote_code=True)

    # Put the model into evaluation mode for inference.
    model.eval()
    return model, tokenizer, device


model, tokenizer, device = load_model_and_tokenizer()


def single_inference(question: str, max_new_tokens: int, temperature: float) -> str:
    """
    Generate an answer for the given question.

    The prompt is built from an Arabic system instruction
    ("Answer the following in Arabic only.") followed by the user's question.
    """
    messages = [
        {"role": "system", "content": "اجب علي الاتي بالعربي فقط."},
        {"role": "user", "content": question},
    ]

    # Use the tokenizer's chat template if it defines one; otherwise build the prompt manually.
    if getattr(tokenizer, "chat_template", None):
        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to(device)
    else:
        system_prompt = "اجب علي الاتي بالعربي فقط.\n"
        user_prompt = f"السؤال: {question}\n"
        full_prompt = system_prompt + user_prompt
        input_ids = tokenizer(full_prompt, return_tensors="pt").input_ids.to(device)

    # Sampling with temperature 0 is rejected by recent transformers versions,
    # so fall back to greedy decoding in that case.
    if temperature > 0:
        gen_kwargs = {"do_sample": True, "temperature": temperature}
    else:
        gen_kwargs = {"do_sample": False}

    # Generate the output; add more generation parameters here if needed.
    with torch.inference_mode():
        outputs = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            **gen_kwargs,
        )

    # Strip the prompt tokens, keeping only the newly generated ones, then decode.
    generated_ids = outputs[0][input_ids.shape[-1]:]
    output_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
    return output_text


# Streamlit UI
st.title("Arabic AI Research QA")
st.subheader("Ask a question and get an answer from the research AI model.")

# Input field for the question.
question = st.text_input("Question", placeholder="Enter your question here...")

# Generation settings.
st.subheader("Settings")
max_new_tokens = st.number_input("Max New Tokens", min_value=1, max_value=1000, value=256)
temperature = st.slider("Temperature", min_value=0.0, max_value=1.0, value=0.4, step=0.1)

# Generate the answer when the button is pressed.
if st.button("Get Answer"):
    if not question:
        st.error("Please enter a question.")
    else:
        with st.spinner("Generating answer..."):
            try:
                answer = single_inference(question, max_new_tokens, temperature)
                st.subheader("Result")
                st.markdown(f"**Question:** {question}")
                st.markdown(f"**Answer:** {answer}")
            except Exception as e:
                st.error(f"Error: {e}")