import os
import random
import uuid
import json
import time
import asyncio
import tempfile
from threading import Thread
import base64
import shutil
import re  # Added for the new tools

import gradio as gr
import spaces
import torch
import numpy as np
from PIL import Image
import edge_tts
import trimesh
import smolagents  # For the new tools

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
    Qwen2VLForConditionalGeneration,
    AutoProcessor,
)
from transformers.image_utils import load_image
from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
from diffusers.utils import export_to_ply

# -----------------------------------------------------------------------------
# Global constants and helper functions
# -----------------------------------------------------------------------------

MAX_SEED = np.iinfo(np.int32).max


def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    return seed


def glb_to_data_url(glb_path: str) -> str:
    """
    Read a GLB file from disk and return a data URL with a base64-encoded
    representation. (Currently unused.)
    """
    with open(glb_path, "rb") as f:
        data = f.read()
    b64_data = base64.b64encode(data).decode("utf-8")
    return f"data:model/gltf-binary;base64,{b64_data}"


# -----------------------------------------------------------------------------
# Model class for Text-to-3D Generation (ShapE)
# -----------------------------------------------------------------------------

class Model:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.pipe = ShapEPipeline.from_pretrained("openai/shap-e", torch_dtype=torch.float16)
        self.pipe.to(self.device)
        # Ensure the text encoder is in half precision to avoid dtype mismatches.
        if torch.cuda.is_available():
            try:
                self.pipe.text_encoder = self.pipe.text_encoder.half()
            except AttributeError:
                pass

        self.pipe_img = ShapEImg2ImgPipeline.from_pretrained("openai/shap-e-img2img", torch_dtype=torch.float16)
        self.pipe_img.to(self.device)
        # Use getattr with a default value to avoid AttributeError if text_encoder is missing.
        if torch.cuda.is_available():
            text_encoder_img = getattr(self.pipe_img, "text_encoder", None)
            if text_encoder_img is not None:
                self.pipe_img.text_encoder = text_encoder_img.half()

    def to_glb(self, ply_path: str) -> str:
        mesh = trimesh.load(ply_path)
        # Rotate the mesh for proper orientation.
        rot = trimesh.transformations.rotation_matrix(-np.pi / 2, [1, 0, 0])
        mesh.apply_transform(rot)
        rot = trimesh.transformations.rotation_matrix(np.pi, [0, 1, 0])
        mesh.apply_transform(rot)
        mesh_path = tempfile.NamedTemporaryFile(suffix=".glb", delete=False)
        mesh.export(mesh_path.name, file_type="glb")
        return mesh_path.name

    def run_text(self, prompt: str, seed: int = 0, guidance_scale: float = 15.0, num_steps: int = 64) -> str:
        generator = torch.Generator(device=self.device).manual_seed(seed)
        images = self.pipe(
            prompt,
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=num_steps,
            output_type="mesh",
        ).images
        ply_path = tempfile.NamedTemporaryFile(suffix=".ply", delete=False, mode="w+b")
        export_to_ply(images[0], ply_path.name)
        return self.to_glb(ply_path.name)

    def run_image(self, image: Image.Image, seed: int = 0, guidance_scale: float = 3.0, num_steps: int = 64) -> str:
        generator = torch.Generator(device=self.device).manual_seed(seed)
        images = self.pipe_img(
            image,
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=num_steps,
            output_type="mesh",
        ).images
        ply_path = tempfile.NamedTemporaryFile(suffix=".ply", delete=False, mode="w+b")
        export_to_ply(images[0], ply_path.name)
        return self.to_glb(ply_path.name)
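
# -----------------------------------------------------------------------------
# Text-to-3D generation helper
# -----------------------------------------------------------------------------
# NOTE: generate() below calls generate_3d_fn(), which is not defined anywhere
# else in this file. The following is a minimal sketch reconstructed from the
# call site and the Model class above; the GPU duration and the per-call
# Model() instantiation are assumptions, not the original implementation.
@spaces.GPU(duration=120, enable_queue=True)
def generate_3d_fn(prompt: str, seed: int = 1, guidance_scale: float = 15.0,
                   num_steps: int = 64, randomize_seed: bool = False):
    seed = int(randomize_seed_fn(seed, randomize_seed))
    # Instantiate the ShapE wrapper per call (assumed; a module-level singleton
    # would also work and would avoid reloading weights on every request).
    model3d = Model()
    glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
    return glb_path, seed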
# -----------------------------------------------------------------------------
# Gradio UI configuration
# -----------------------------------------------------------------------------

DESCRIPTION = """
# Agent Dino 🌠
"""

css = '''
h1 {
  text-align: center;
  display: block;
}

#duplicate-button {
  margin: auto;
  color: #fff;
  background: #1565c0;
  border-radius: 100vh;
}
'''

MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# -----------------------------------------------------------------------------
# Load Models and Pipelines for Chat, Image, and Multimodal Processing
# -----------------------------------------------------------------------------

# Load the text-only model and tokenizer (for pure text chat).
model_id = "prithivMLmods/FastThink-0.5B-Tiny"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
model.eval()

# Voices for text-to-speech.
TTS_VOICES = [
    "en-US-JennyNeural",  # @tts1
    "en-US-GuyNeural",    # @tts2
]
# Load multimodal processor and model (e.g. for OCR and image processing).
MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model_m = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to("cuda").eval()

# -----------------------------------------------------------------------------
# Asynchronous text-to-speech
# -----------------------------------------------------------------------------

async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
    """Convert text to speech using Edge TTS and save it as an MP3 file."""
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(output_file)
    return output_file

# -----------------------------------------------------------------------------
# Utility function to clean conversation history
# -----------------------------------------------------------------------------

def clean_chat_history(chat_history):
    """
    Filter out any chat entries whose "content" is not a string.
    This helps prevent errors when concatenating previous messages.
    """
    cleaned = []
    for msg in chat_history:
        if isinstance(msg, dict) and isinstance(msg.get("content"), str):
            cleaned.append(msg)
    return cleaned

# -----------------------------------------------------------------------------
# Stable Diffusion XL Pipeline for Image Generation
# -----------------------------------------------------------------------------

MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL model repository path via env variable (must be set)
MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))  # For batched image generation

sd_pipe = StableDiffusionXLPipeline.from_pretrained(
    MODEL_ID_SD,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    use_safetensors=True,
    add_watermarker=False,
).to(device)
sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)

if torch.cuda.is_available():
    sd_pipe.text_encoder = sd_pipe.text_encoder.half()

if USE_TORCH_COMPILE:
    # Compile the UNet for faster inference; DiffusionPipeline itself does not
    # expose a .compile() method.
    sd_pipe.unet = torch.compile(sd_pipe.unet, mode="reduce-overhead", fullgraph=True)

if ENABLE_CPU_OFFLOAD:
    sd_pipe.enable_model_cpu_offload()


def save_image(img: Image.Image) -> str:
    """Save a PIL image with a unique filename and return the path."""
    unique_name = str(uuid.uuid4()) + ".png"
    img.save(unique_name)
    return unique_name


@spaces.GPU(duration=60, enable_queue=True)
def generate_image_fn(
    prompt: str,
    negative_prompt: str = "",
    use_negative_prompt: bool = False,
    seed: int = 1,
    width: int = 1024,
    height: int = 1024,
    guidance_scale: float = 3,
    num_inference_steps: int = 25,
    randomize_seed: bool = False,
    use_resolution_binning: bool = True,
    num_images: int = 1,
    progress=gr.Progress(track_tqdm=True),
):
    """Generate images using the SDXL pipeline."""
    seed = int(randomize_seed_fn(seed, randomize_seed))
    generator = torch.Generator(device=device).manual_seed(seed)

    options = {
        "prompt": [prompt] * num_images,
        "negative_prompt": [negative_prompt] * num_images if use_negative_prompt else None,
        "width": width,
        "height": height,
        "guidance_scale": guidance_scale,
        "num_inference_steps": num_inference_steps,
        "generator": generator,
        "output_type": "pil",
    }
    if use_resolution_binning:
        options["use_resolution_binning"] = True

    images = []
    # Process in batches.
    for i in range(0, num_images, BATCH_SIZE):
        batch_options = options.copy()
        batch_options["prompt"] = options["prompt"][i:i + BATCH_SIZE]
        if "negative_prompt" in batch_options and batch_options["negative_prompt"] is not None:
            batch_options["negative_prompt"] = options["negative_prompt"][i:i + BATCH_SIZE]
        if device.type == "cuda":
            with torch.autocast("cuda", dtype=torch.float16):
                outputs = sd_pipe(**batch_options)
        else:
            outputs = sd_pipe(**batch_options)
        images.extend(outputs.images)

    image_paths = [save_image(img) for img in images]
    return image_paths, seed
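
# Example usage of generate_image_fn (hypothetical prompt and settings; requires
# the SDXL pipeline above to have loaded successfully on a GPU worker):
#
#   paths, used_seed = generate_image_fn(
#       prompt="a watercolor painting of a fox",
#       num_images=1,
#       randomize_seed=True,
#   )
#   print(paths[0], used_seed)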
# -----------------------------------------------------------------------------
# Tools for Web Search and Webpage Visiting using DuckDuckGo and smolagents
# -----------------------------------------------------------------------------

from typing import Any, Optional
from smolagents.tools import Tool
import requests
import markdownify
import duckduckgo_search


class VisitWebpageTool(Tool):
    name = "visit_webpage"
    description = "Visits a webpage at the given url and reads its content as a markdown string. Use this to browse webpages."
    inputs = {'url': {'type': 'string', 'description': 'The url of the webpage to visit.'}}
    output_type = "string"

    def forward(self, url: str) -> str:
        try:
            from markdownify import markdownify as md
            from requests.exceptions import RequestException
            from smolagents.utils import truncate_content
        except ImportError as e:
            raise ImportError(
                "You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests`."
            ) from e
        try:
            response = requests.get(url, timeout=20)
            response.raise_for_status()  # Raise an exception for bad status codes.
            # Convert the HTML content to Markdown. Call the imported function
            # directly; `markdownify.markdownify(...)` would fail here because
            # the local import shadows the module name.
            markdown_content = md(response.text).strip()
            # Collapse runs of three or more line breaks.
            markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
            return truncate_content(markdown_content, 10000)
        except requests.exceptions.Timeout:
            return "The request timed out. Please try again later or check the URL."
        except RequestException as e:
            return f"Error fetching the webpage: {str(e)}"
        except Exception as e:
            return f"An unexpected error occurred: {str(e)}"


class DuckDuckGoSearchTool(Tool):
    name = "web_search"
    description = "Performs a duckduckgo web search based on your query (think a Google search) then returns the top search results."
    inputs = {'query': {'type': 'string', 'description': 'The search query to perform.'}}
    output_type = "string"

    def __init__(self, max_results=10, **kwargs):
        super().__init__()
        self.max_results = max_results
        try:
            from duckduckgo_search import DDGS
        except ImportError as e:
            raise ImportError(
                "You must install package `duckduckgo_search` to run this tool: for instance run `pip install duckduckgo-search`."
            ) from e
        self.ddgs = DDGS(**kwargs)

    def forward(self, query: str) -> str:
        results = self.ddgs.text(query, max_results=self.max_results)
        if len(results) == 0:
            raise Exception("No results found! Try a less restrictive/shorter query.")
        postprocessed_results = [
            f"[{result['title']}]({result['href']})\n{result['body']}" for result in results
        ]
        return "## Search Results\n\n" + "\n\n".join(postprocessed_results)
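
# Quick manual check of both tools (hypothetical queries; these perform live
# network requests, so they are left commented out):
#
#   print(DuckDuckGoSearchTool(max_results=3).forward("gradio chat interface"))
#   print(VisitWebpageTool().forward("https://example.com")[:500])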
# -----------------------------------------------------------------------------
# Chat Generation Function with support for @tts, @image, @3d, and @web commands
# -----------------------------------------------------------------------------

@spaces.GPU
def generate(
    input_dict: dict,
    chat_history: list[dict],
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
):
    """
    Generate chatbot responses with support for multimodal input, TTS, image
    generation, 3D model generation, and web search/webpage visiting.

    Special commands:
    - "@tts1" or "@tts2": triggers text-to-speech.
    - "@image": triggers image generation using the SDXL pipeline.
    - "@3d": triggers 3D model generation using the ShapE pipeline.
    - "@web": triggers a web command. Use "visit" to visit a URL
      (e.g., "@web visit https://example.com") or "search" to perform a
      DuckDuckGo search (e.g., "@web search AI news").
    """
    text = input_dict["text"]
    files = input_dict.get("files", [])

    # --- 3D Generation branch ---
    if text.strip().lower().startswith("@3d"):
        prompt = text[len("@3d"):].strip()
        yield "Hold tight, generating a 3D mesh GLB file..."
        glb_path, used_seed = generate_3d_fn(
            prompt=prompt,
            seed=1,
            guidance_scale=15.0,
            num_steps=64,
            randomize_seed=True,
        )
        # Copy the GLB file to a static folder so it can be served.
        static_folder = os.path.join(os.getcwd(), "static")
        if not os.path.exists(static_folder):
            os.makedirs(static_folder)
        new_filename = f"mesh_{uuid.uuid4()}.glb"
        new_filepath = os.path.join(static_folder, new_filename)
        shutil.copy(glb_path, new_filepath)
        yield gr.File(new_filepath)
        return

    # --- Image Generation branch ---
    if text.strip().lower().startswith("@image"):
        prompt = text[len("@image"):].strip()
        yield "Generating image..."
        image_paths, used_seed = generate_image_fn(
            prompt=prompt,
            negative_prompt="",
            use_negative_prompt=False,
            seed=1,
            width=1024,
            height=1024,
            guidance_scale=3,
            num_inference_steps=25,
            randomize_seed=True,
            use_resolution_binning=True,
            num_images=1,
        )
        yield gr.Image(image_paths[0])
        return

    # --- Web Search/Visit branch ---
    if text.strip().lower().startswith("@web"):
        command_text = text[len("@web"):].strip()
        if command_text.lower().startswith("visit "):
            url = command_text[len("visit"):].strip()
            yield "Visiting webpage..."
            result = VisitWebpageTool().forward(url)
            yield result
        elif command_text.lower().startswith("search "):
            query = command_text[len("search"):].strip()
            yield "Performing web search..."
            result = DuckDuckGoSearchTool().forward(query)
            yield result
        else:
            # Default to web search if no subcommand is specified.
            yield "Performing web search..."
            result = DuckDuckGoSearchTool().forward(command_text)
            yield result
        return
    # --- Text and TTS branch ---
    tts_prefix = "@tts"
    is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
    voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)

    if is_tts and voice_index:
        voice = TTS_VOICES[voice_index - 1]
        text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
        conversation = [{"role": "user", "content": text}]
    else:
        voice = None
        text = text.replace(tts_prefix, "").strip()
        conversation = clean_chat_history(chat_history)
        conversation.append({"role": "user", "content": text})

    if files:
        if len(files) > 1:
            images = [load_image(image) for image in files]
        elif len(files) == 1:
            images = [load_image(files[0])]
        else:
            images = []
        messages = [{
            "role": "user",
            "content": [
                *[{"type": "image", "image": image} for image in images],
                {"type": "text", "text": text},
            ]
        }]
        prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = processor(text=[prompt], images=images, return_tensors="pt", padding=True).to("cuda")
        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
        thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
        thread.start()

        buffer = ""
        yield "Thinking..."
        for new_text in streamer:
            buffer += new_text
            buffer = buffer.replace("<|im_end|>", "")
            time.sleep(0.01)
            yield buffer
    else:
        input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
        if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
            input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
            gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
        input_ids = input_ids.to(model.device)
        streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
        generation_kwargs = {
            "input_ids": input_ids,
            "streamer": streamer,
            "max_new_tokens": max_new_tokens,
            "do_sample": True,
            "top_p": top_p,
            "top_k": top_k,
            "temperature": temperature,
            "num_beams": 1,
            "repetition_penalty": repetition_penalty,
        }
        t = Thread(target=model.generate, kwargs=generation_kwargs)
        t.start()

        outputs = []
        for new_text in streamer:
            outputs.append(new_text)
            yield "".join(outputs)

        final_response = "".join(outputs)
        yield final_response

        if is_tts and voice:
            output_file = asyncio.run(text_to_speech(final_response, voice))
            yield gr.Audio(output_file, autoplay=True)
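
# Example chat-box inputs that exercise each branch of generate() (the @image
# and @3d prompts here are illustrative, not fixed commands):
#
#   @image a cozy cabin in the woods at night
#   @3d a low-poly rubber duck
#   @web visit https://example.com
#   @web search latest AI news
#   @tts2 Summarize the plot of Dune in two sentences.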
# -----------------------------------------------------------------------------
# Gradio Chat Interface Setup and Launch
# -----------------------------------------------------------------------------

demo = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
        gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
        gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
        gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
        gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
        gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
    ],
    examples=[
        ["@tts1 Who was Nikola Tesla, and how did he die?"],
        ["@3d A birthday cupcake with cherry"],
        ["@web Does Grok-3 beat DeepSeek-R1 at reasoning?"],
        [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
        ["@image Chocolate dripping from a donut against a yellow background, in the style of brocore, hyper-realistic"],
        ["Explain Newton's third law"],
        ["@tts2 What causes rainbows to form?"],
    ],
    cache_examples=False,
    type="messages",
    description=DESCRIPTION,
    css=css,
    fill_height=True,
    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
    stop_btn="Stop Generation",
    multimodal=True,
)

# Ensure the static folder exists.
if not os.path.exists("static"):
    os.makedirs("static")

# Mount the static folder onto the FastAPI app so that GLB files are served.
from fastapi.staticfiles import StaticFiles
demo.app.mount("/static", StaticFiles(directory="static"), name="static")

if __name__ == "__main__":
    demo.queue(max_size=20).launch(share=True)