import gradio as gr
import subprocess
import requests
import time
import logging

from langchain_community.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Cache for loaded models
loaded_models = {}


def check_ollama_running():
    """Wait until Ollama is fully ready."""
    url = "http://127.0.0.1:11434/api/tags"
    for _ in range(10):  # Retry for up to ~20 seconds (10 attempts, 2 s apart)
        try:
            response = requests.get(url, timeout=2)
            if response.status_code == 200:
                logger.info("Ollama is running.")
                return True
        except requests.exceptions.RequestException:
            logger.warning("Waiting for Ollama to start...")
        time.sleep(2)
    raise RuntimeError("Ollama is not running. Please check the server.")


def pull_model(model_name):
    """Ensure the model is available before use."""
    if model_name in loaded_models:
        logger.info(f"Model {model_name} is already loaded.")
        return
    try:
        subprocess.run(["ollama", "pull", model_name], check=True)
        logger.info(f"Model {model_name} pulled successfully.")
        loaded_models[model_name] = True
    except subprocess.CalledProcessError as e:
        logger.error(f"Failed to pull model {model_name}: {e}")
        raise


def get_llm(model_name):
    """Get an LLM instance with streaming enabled."""
    callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
    return Ollama(
        model=model_name,
        base_url="http://127.0.0.1:11434",
        callback_manager=callback_manager,
    )


def query_model(model_name, prompt):
    """Generate responses from the model with streaming."""
    check_ollama_running()     # Ensure Ollama is ready
    pull_model(model_name)     # Make sure the model is available
    llm = get_llm(model_name)  # Load the model
    response = ""
    for token in llm.stream(prompt):
        response += token
        yield response  # Stream the accumulated response in real time


# Define Gradio interface
iface = gr.Interface(
    fn=query_model,
    inputs=[
        gr.Dropdown(["deepseek-r1:1.5b", "mistral:7b"], label="Select Model"),
        gr.Textbox(label="Enter your prompt"),
    ],
    outputs="text",
    title="Ollama via LangChain & Gradio",
    description="Enter a prompt to interact with the Ollama-based model with streaming response.",
    flagging_dir="/app/flagged",
)

if __name__ == "__main__":
    iface.launch(server_name="0.0.0.0", server_port=7860)