import os
import time
import gc
import threading
from itertools import islice
from datetime import datetime
import re  # for parsing blocks

import gradio as gr
import torch
from transformers import pipeline, TextIteratorStreamer
from transformers import AutoTokenizer
from duckduckgo_search import DDGS

import spaces  # Import spaces early to enable ZeroGPU support

# Optional: Disable GPU visibility if you wish to force CPU usage
# os.environ["CUDA_VISIBLE_DEVICES"] = ""

# ------------------------------
# Global Cancellation Event
# ------------------------------
cancel_event = threading.Event()

# ------------------------------
# Torch-Compatible Model Definitions with Adjusted Descriptions
# ------------------------------
MODELS = {
    "Yee-R1-mini": {
        "repo_id": "sds-ai/Yee-R1-mini",
        "description": "小熠(Yee)AI 数据安全专家",
    },
    "secgpt-mini": {
        "repo_id": "clouditera/secgpt-mini",
        "description": "SecGPT 是由 云起无垠 于 2023 年正式推出的开源大模型,专为网络安全场景打造,旨在以人工智能技术全面提升安全防护效率与效果。",
    },
    "Qwen3-0.6B": {
        "repo_id": "Qwen/Qwen3-0.6B",
        "description": "Dense causal language model with 0.6 B total parameters (0.44 B non-embedding), 28 transformer layers, 16 query heads & 8 KV heads, native 32 768-token context window, dual-mode generation, full multilingual & agentic capabilities.",
    },
    "Qwen3-1.7B": {
        "repo_id": "Qwen/Qwen3-1.7B",
        "description": "Dense causal language model with 1.7 B total parameters (1.4 B non-embedding), 28 layers, 16 query heads & 8 KV heads, 32 768-token context, stronger reasoning vs. 0.6 B variant, dual-mode inference, instruction following across 100+ languages.",
    },
}

# Global cache for pipelines to avoid re-loading.
PIPELINES = {}


def load_pipeline(model_name):
    """
    Load and cache a transformers pipeline for text generation.
    Tries bfloat16, falls back to float16 or float32 if unsupported.
    """
    global PIPELINES
    if model_name in PIPELINES:
        return PIPELINES[model_name]
    repo = MODELS[model_name]["repo_id"]
    tokenizer = AutoTokenizer.from_pretrained(repo)
    for dtype in (torch.bfloat16, torch.float16, torch.float32):
        try:
            pipe = pipeline(
                task="text-generation",
                model=repo,
                tokenizer=tokenizer,
                trust_remote_code=True,
                torch_dtype=dtype,
                device_map="auto",
            )
            PIPELINES[model_name] = pipe
            return pipe
        except Exception:
            continue
    # Final fallback: let transformers pick the default dtype.
    pipe = pipeline(
        task="text-generation",
        model=repo,
        tokenizer=tokenizer,
        trust_remote_code=True,
        device_map="auto",
    )
    PIPELINES[model_name] = pipe
    return pipe


def retrieve_context(query, max_results=6, max_chars=1000):
    """
    Retrieve search snippets from DuckDuckGo (runs in background).
    Returns a list of result strings.
    """
    try:
        with DDGS() as ddgs:
            return [
                f"{i+1}. {r.get('title', 'No Title')} - {r.get('body', '')[:max_chars]}"
                for i, r in enumerate(
                    islice(ddgs.text(query, region="wt-wt", safesearch="off", timelimit="y"), max_results)
                )
            ]
    except Exception:
        return []
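# Illustrative usage of retrieve_context (a sketch only: real snippets depend
# on live DuckDuckGo results, so the titles and bodies below are hypothetical):
#
#   >>> retrieve_context("data breach mitigation", max_results=2, max_chars=60)
#   ['1. Some Title - first 60 characters of the result body...',
#    '2. Other Title - first 60 characters of the result body...']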
def format_conversation(history, system_prompt, tokenizer):
    if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
        messages = [{"role": "system", "content": system_prompt.strip()}] + history
        return tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True, enable_thinking=True
        )
    else:
        # Fallback for base LMs without a chat template
        prompt = system_prompt.strip() + "\n"
        for msg in history:
            if msg['role'] == 'user':
                prompt += "User: " + msg['content'].strip() + "\n"
            elif msg['role'] == 'assistant':
                prompt += "Assistant: " + msg['content'].strip() + "\n"
        if not prompt.strip().endswith("Assistant:"):
            prompt += "Assistant: "
        return prompt


@spaces.GPU(duration=60)
def chat_response(user_msg, chat_history, system_prompt,
                  enable_search, max_results, max_chars,
                  model_name, max_tokens, temperature,
                  top_k, top_p, repeat_penalty, search_timeout):
    """
    Generates streaming chat responses, optionally with background web search.
    """
    cancel_event.clear()
    history = list(chat_history or [])
    history.append({'role': 'user', 'content': user_msg})

    # Launch web search in the background if enabled
    debug = ''
    search_results = []
    if enable_search:
        debug = 'Search task started.'
        thread_search = threading.Thread(
            target=lambda: search_results.extend(
                retrieve_context(user_msg, int(max_results), int(max_chars))
            )
        )
        thread_search.daemon = True
        thread_search.start()
    else:
        debug = 'Web search disabled.'

    try:
        # Wait up to `search_timeout` seconds for snippets, then surface them
        # in the debug panel
        if enable_search:
            thread_search.join(timeout=float(search_timeout))
            if search_results:
                debug = "### Search results merged into prompt\n\n" + "\n".join(
                    f"- {r}" for r in search_results
                )
            else:
                debug = "*No web search results found.*"

        # Merge any fetched snippets into the system prompt
        if search_results:
            enriched = system_prompt.strip() + "\n\nRelevant context:\n" + "\n".join(search_results)
        else:
            enriched = system_prompt

        pipe = load_pipeline(model_name)
        prompt = format_conversation(history, enriched, pipe.tokenizer)
        prompt_debug = f"\n\n--- Prompt Preview ---\n```\n{prompt}\n```"

        streamer = TextIteratorStreamer(pipe.tokenizer, skip_prompt=True, skip_special_tokens=True)
        gen_thread = threading.Thread(
            target=pipe,
            args=(prompt,),
            kwargs={
                'max_new_tokens': max_tokens,
                'temperature': temperature,
                'top_k': top_k,
                'top_p': top_p,
                'repetition_penalty': repeat_penalty,
                'streamer': streamer,
                'return_full_text': False,
            }
        )
        gen_thread.start()

        # Buffers for thought vs. answer
        thought_buf = ''
        answer_buf = ''
        in_thought = False
        in_answer = False

        # Stream tokens
        for chunk in streamer:
            if cancel_event.is_set():
                break
            text = chunk

            # Detect start of thinking
            if not in_thought and '<think>' in text:
                in_thought = True
                # Insert thought placeholder
                history.append({
                    'role': 'assistant',
                    'content': '',
                    'metadata': {'title': '💭 Thought'}
                })
                # Capture everything after the opening tag
                after = text.split('<think>', 1)[1]
                thought_buf += after
                # If the closing tag arrives in the same chunk
                if '</think>' in thought_buf:
                    before, after2 = thought_buf.split('</think>', 1)
                    history[-1]['content'] = before.strip()
                    in_thought = False
                    # Start answer buffer
                    answer_buf = after2
                    history.append({'role': 'assistant', 'content': answer_buf})
                    in_answer = True
                else:
                    history[-1]['content'] = thought_buf
                yield history, debug
                continue

            # Continue streaming the thought
            if in_thought:
                thought_buf += text
                if '</think>' in thought_buf:
                    before, after2 = thought_buf.split('</think>', 1)
                    history[-1]['content'] = before.strip()
                    in_thought = False
                    # Start answer buffer
                    answer_buf = after2
                    history.append({'role': 'assistant', 'content': answer_buf})
                    in_answer = True
                else:
                    history[-1]['content'] = thought_buf
                yield history, debug
                continue

            # Stream the answer; create the answer message exactly once
            if not in_answer:
                history.append({'role': 'assistant', 'content': ''})
                in_answer = True
            answer_buf += text
            history[-1]['content'] = answer_buf
            yield history, debug

        gen_thread.join()
        yield history, debug + prompt_debug
    except Exception as e:
        history.append({'role': 'assistant', 'content': f"Error: {e}"})
        yield history, debug
    finally:
        gc.collect()
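# Illustrative helper (a sketch, not called by the app): the streaming loop
# above incrementally splits Qwen3-style "<think>...</think>" output into a
# collapsible thought message plus a visible answer. _split_think shows the
# same parse applied to a complete, non-streamed string.
def _split_think(text):
    """Return (thought, answer) for a fully generated string."""
    if '<think>' in text and '</think>' in text:
        before, rest = text.split('<think>', 1)
        thought, answer = rest.split('</think>', 1)
        return thought.strip(), (before + answer).strip()
    return '', text.strip()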
def cancel_generation():
    cancel_event.set()
    return 'Generation cancelled.'


def update_default_prompt(enable_search):
    # `enable_search` is currently unused: the default prompt is the same
    # whether or not web search is on.
    today = datetime.now().strftime('%Y-%m-%d')
    return f"You are a helpful assistant. Today is {today}."


def toggle_theme(current_theme):
    """Toggle between light and dark themes"""
    if current_theme == "light":
        return "dark", "☀️ Light Mode"
    else:
        return "light", "🌙 Dark Mode"


def toggle_language(current_lang):
    """Toggle between Chinese and English"""
    if current_lang == "zh":
        return "en"
    else:
        return "zh"


def get_ui_text(lang):
    """Get UI text based on language"""
    texts = {
        "zh": {
            "title": "## Yee-R1 Demo",
            "subtitle": "小熠(Yee)AI 数据安全专家",
            "dark_mode": "🌙 暗黑模式",
            "light_mode": "☀️ 明亮模式",
            "lang_btn": "🌐 English",
            "select_model": "选择模型",
            "enable_search": "启用网络搜索",
            "system_prompt": "系统提示词",
            "gen_params": "### 生成参数",
            "max_tokens": "最大令牌数",
            "temperature": "温度",
            "top_k": "Top-K",
            "top_p": "Top-P",
            "repeat_penalty": "重复惩罚",
            "search_settings": "### 网络搜索设置",
            "max_results": "最大结果数",
            "max_chars": "每个结果最大字符数",
            "search_timeout": "搜索超时时间 (秒)",
            "clear_chat": "清空对话",
            "cancel_gen": "取消生成",
            "placeholder": "输入您的消息并按回车..."
        },
        "en": {
            "title": "## Yee-R1 Demo",
            "subtitle": "Yee AI Data Security Expert",
            "dark_mode": "🌙 Dark Mode",
            "light_mode": "☀️ Light Mode",
            "lang_btn": "🌐 中文",
            "select_model": "Select Model",
            "enable_search": "Enable Web Search",
            "system_prompt": "System Prompt",
            "gen_params": "### Generation Parameters",
            "max_tokens": "Max Tokens",
            "temperature": "Temperature",
            "top_k": "Top-K",
            "top_p": "Top-P",
            "repeat_penalty": "Repetition Penalty",
            "search_settings": "### Web Search Settings",
            "max_results": "Max Results",
            "max_chars": "Max Chars/Result",
            "search_timeout": "Search Timeout (s)",
            "clear_chat": "Clear Chat",
            "cancel_gen": "Cancel Generation",
            "placeholder": "Type your message and press Enter..."
        }
    }
    return texts[lang]
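# Quick consistency check (an illustrative sketch; nothing in the app calls
# it): both locale dicts must expose exactly the same keys, otherwise the
# language toggle below would raise a KeyError mid-switch.
def _check_locales():
    zh, en = get_ui_text("zh"), get_ui_text("en")
    assert zh.keys() == en.keys(), "locale dicts out of sync"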
# ------------------------------
# Gradio UI
# ------------------------------
with gr.Blocks(title="Yee-R1-Demo", theme=gr.themes.Default()) as demo:
    # States
    theme_state = gr.State("light")
    lang_state = gr.State("zh")

    # Header with controls
    with gr.Row():
        title_md = gr.Markdown("## Yee-R1 Demo")
        with gr.Row(scale=0):
            lang_btn = gr.Button("🌐 English", size="sm")
            theme_btn = gr.Button("🌙 暗黑模式", size="sm")
    subtitle_md = gr.Markdown("小熠(Yee)AI 数据安全专家")

    with gr.Row():
        with gr.Column(scale=3):
            model_dd = gr.Dropdown(label="选择模型", choices=list(MODELS.keys()),
                                   value=list(MODELS.keys())[0])
            search_chk = gr.Checkbox(label="启用网络搜索", value=True)
            sys_prompt = gr.Textbox(label="系统提示词", lines=3,
                                    value=update_default_prompt(search_chk.value))
            gen_params_md = gr.Markdown("### 生成参数")
            max_tok = gr.Slider(64, 16384, value=4096, step=32, label="最大令牌数")
            temp = gr.Slider(0.1, 2.0, value=0.6, step=0.1, label="温度")
            k = gr.Slider(1, 100, value=40, step=1, label="Top-K")
            p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-P")
            rp = gr.Slider(1.0, 2.0, value=1.2, step=0.1, label="重复惩罚")
            search_settings_md = gr.Markdown("### 网络搜索设置")
            mr = gr.Number(value=6, precision=0, label="最大结果数")
            mc = gr.Number(value=600, precision=0, label="每个结果最大字符数")
            st = gr.Slider(minimum=0.0, maximum=30.0, step=0.5, value=5.0, label="搜索超时时间 (秒)")
            clr = gr.Button("清空对话")
            cnl = gr.Button("取消生成")
        with gr.Column(scale=7):
            chat = gr.Chatbot(type="messages", show_copy_all_button=True, height="50vh")
            txt = gr.Textbox(placeholder="输入您的消息并按回车...")
            dbg = gr.Markdown()

    # Event handlers
    search_chk.change(fn=update_default_prompt, inputs=search_chk, outputs=sys_prompt)
    clr.click(fn=lambda: ([], "", ""), outputs=[chat, txt, dbg])
    cnl.click(fn=cancel_generation, outputs=dbg)

    # Theme toggle functionality.
    # Note: Gradio applies the theme once at page load; reassigning demo._theme
    # on a running app does not restyle connected clients, so this toggle
    # currently only updates the stored state and the button label.
    def handle_theme_toggle(current_theme, current_lang):
        new_theme, _ = toggle_theme(current_theme)
        ui_text = get_ui_text(current_lang)
        new_btn_text = ui_text["light_mode"] if new_theme == "dark" else ui_text["dark_mode"]
        demo._theme = gr.themes.Monochrome() if new_theme == "dark" else gr.themes.Default()
        return new_theme, new_btn_text

    # Language toggle functionality. Labels and placeholders are changed via
    # gr.update(); returning bare strings here would overwrite the components'
    # *values* (and fail outright for sliders) instead of relabeling them.
    def handle_language_toggle(current_lang, current_theme):
        new_lang = toggle_language(current_lang)
        ui_text = get_ui_text(new_lang)
        return [
            new_lang,                                        # lang_state
            ui_text["lang_btn"],                             # lang_btn text
            ui_text["light_mode"] if current_theme == "dark" else ui_text["dark_mode"],  # theme_btn text
            ui_text["title"],                                # title_md
            ui_text["subtitle"],                             # subtitle_md
            gr.update(label=ui_text["select_model"]),        # model_dd
            gr.update(label=ui_text["enable_search"]),       # search_chk
            gr.update(label=ui_text["system_prompt"]),       # sys_prompt
            ui_text["gen_params"],                           # gen_params_md
            gr.update(label=ui_text["max_tokens"]),          # max_tok
            gr.update(label=ui_text["temperature"]),         # temp
            gr.update(label=ui_text["top_k"]),               # k
            gr.update(label=ui_text["top_p"]),               # p
            gr.update(label=ui_text["repeat_penalty"]),      # rp
            ui_text["search_settings"],                      # search_settings_md
            gr.update(label=ui_text["max_results"]),         # mr
            gr.update(label=ui_text["max_chars"]),           # mc
            gr.update(label=ui_text["search_timeout"]),      # st
            gr.update(value=ui_text["clear_chat"]),          # clr
            gr.update(value=ui_text["cancel_gen"]),          # cnl
            gr.update(placeholder=ui_text["placeholder"]),   # txt
        ]

    theme_btn.click(
        fn=handle_theme_toggle,
        inputs=[theme_state, lang_state],
        outputs=[theme_state, theme_btn]
    )
    lang_btn.click(
        fn=handle_language_toggle,
        inputs=[lang_state, theme_state],
        outputs=[
            lang_state, lang_btn, theme_btn, title_md, subtitle_md,
            model_dd, search_chk, sys_prompt, gen_params_md,
            max_tok, temp, k, p, rp,
            search_settings_md, mr, mc, st,
            clr, cnl, txt
        ]
    )
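    # The submit wiring below streams chat_response into the chatbot. One
    # optional refinement (a sketch, not wired in): chain a .then() call to
    # clear the input box after each message, e.g.
    #   txt.submit(...).then(lambda: "", outputs=txt)
    # Streaming generator handlers also rely on Gradio's queue; recent Gradio
    # releases enable it by default, but on older versions call demo.queue()
    # before demo.launch().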
    txt.submit(fn=chat_response,
               inputs=[txt, chat, sys_prompt, search_chk, mr, mc, model_dd,
                       max_tok, temp, k, p, rp, st],
               outputs=[chat, dbg])

demo.launch()