import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import spaces
import re
from markdownify import markdownify


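# Load both Reader-LM checkpoints and their tokenizers once at startup.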
models = {
    "jinaai/reader-lm-0.5b": AutoModelForCausalLM.from_pretrained("jinaai/reader-lm-0.5b", trust_remote_code=True).eval().to("cuda"),
    "jinaai/reader-lm-1.5b": AutoModelForCausalLM.from_pretrained("jinaai/reader-lm-1.5b", trust_remote_code=True).eval().to("cuda")
}

tokenizers = {
    "jinaai/reader-lm-0.5b": AutoTokenizer.from_pretrained("jinaai/reader-lm-0.5b", trust_remote_code=True),
    "jinaai/reader-lm-1.5b": AutoTokenizer.from_pretrained("jinaai/reader-lm-1.5b", trust_remote_code=True),
}


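# On Hugging Face ZeroGPU Spaces, @spaces.GPU requests GPU access for the duration of each call.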
@spaces.GPU
def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
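    """Convert the given HTML to Markdown with the selected Reader-LM model
    and, for comparison, with the rule-based markdownify converter."""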
    print("Start Model Processing")
    model = models[model_id]
    tokenizer = tokenizers[model_id]
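    # Reader-LM takes the raw HTML as a single user message in its chat template.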
    messages = [{"role": "user", "content": html_content}]
    # Apply the chat template and append the assistant prompt so the model generates the Markdown directly.
    input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")
    # Greedy decoding (temperature is ignored when do_sample=False, so it is omitted).
    outputs = model.generate(inputs, max_new_tokens=1024, do_sample=False, repetition_penalty=1.08)
    # The assistant turn is delimited by <|im_start|>assistant ... <|im_end|> in the decoded output.
    pattern = r"<\|im_start\|>assistant(.*?)<\|im_end\|>"
    decoded = tokenizer.decode(outputs[0])
    matches = re.findall(pattern, decoded, re.DOTALL)
    # Fall back to the full decoding if the delimiters are missing (e.g. the generation was truncated).
    assistant_response = matches[0].strip() if matches else decoded
    print("Start Markdownify Processing")
    markdownify_output = markdownify(html_content)
    return assistant_response, markdownify_output


css = """
  #output {
    height: 500px; 
    overflow: auto; 
    border: 1px solid #ccc; 
  }
"""

example_html = """<div id="myDIV" class="header">
  <h2>My To Do List</h2>
  <input type="text" id="myInput" placeholder="Title...">
  <span onclick="newElement()" class="addBtn">Add</span>
</div>

<ul id="myUL">
  <li>Hit the gym</li>
  <li class="checked">Pay bills</li>
  <li>Meet George</li>
  <li>Buy eggs</li>
  <li>Read a book</li>
  <li>Organize office</li>
</ul>"""

with gr.Blocks(css=css) as demo:
    gr.Markdown("""
    # HTML-to-Markdown
    Convert HTML to Markdown with the model-based [Reader LM](https://huggingface.co/jinaai/reader-lm-1.5b) and compare it with the rule-based [Markdownify](https://github.com/matthewwithanm/python-markdownify).
    """)
    with gr.Row():
        with gr.Column():
            model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="jinaai/reader-lm-1.5b")
            html_content = gr.Textbox(label="HTML")
            submit_btn = gr.Button(value="Submit")
        with gr.Column():
            model_output_text = gr.Textbox(label="Reader LM Output")
            markdownify_output = gr.Textbox(label="Markdownify Output")

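    # cache_examples=True runs the example through run_example once and serves the cached result.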
    gr.Examples(
        examples=[
            [example_html],
        ],
        inputs=[html_content],
        outputs=[model_output_text, markdownify_output],
        fn=run_example,
        cache_examples=True,
        label="Try examples"
    )

    submit_btn.click(run_example, [html_content, model_selector], [model_output_text, markdownify_output])

demo.launch(debug=True)