Spaces · Commit 0fa20f6 (parent: 256f531): upload emova hf demo

Files changed:
- .gitignore (+3, -0)
- app.py (+544, -4)
- conversation_public.py (+506, -0)
- requirements.txt (+29, -0)

.gitignore · ADDED
@@ -0,0 +1,3 @@
__pycache__/
speech/
examples/

app.py · CHANGED
@@ -1,7 +1,547 @@
The old 7-line stub is replaced: `import gradio as gr` (old line 1) is kept as context, while old lines 3-4 (content truncated in the diff view) and the stub's `demo = …` / `demo.…` lines (old lines 6-7) are removed. The new file:
import argparse
import datetime
import json
import os
import time
import hashlib
import uuid

import spaces
import gradio as gr
from conversation_public import default_conversation, conv_templates, SeparatorStyle

auth_token = os.environ.get("TOKEN_FROM_SECRET")

##########################################
# Audio part
##########################################
from huggingface_hub import snapshot_download
snapshot_download(repo_id="Emova-ollm/emova_speech_tokenizer", local_dir='./speech', token=auth_token)

from speech.speech_utils import s2u_extract_unit_demo, get_ckpt_config_path, load_model
from speech.speech_utils import load_condition_centroid, get_config_checkpoint_file, load_U2S_model, synthesis

####################
# S2U
####################
reduced = True
reduced_mark = 'reduced' if reduced else 'unreduced'
unit_type = '40ms_multilingual_8888'
language = 'English'
s2u_model_name = 'SPIRAL-FSQ-CTC'

ckpt_path, config_path = get_ckpt_config_path(unit_type, language)
s2u_model = load_model(ckpt_path, config_path, s2u_model_name)

####################
# U2S
####################
condition2style_centroid_file = "./speech/condition_style_centroid/condition2style_centroid.txt"
condition2style_centroid_file_dict, condition2style_centroid_embedding_dict = load_condition_centroid(condition2style_centroid_file)

unit_type = '40ms_multilingual_8888_xujing_cosyvoice_FT'
language = 'Chinese'
model_config_file, model_checkpoint_file = get_config_checkpoint_file(unit_type, language)
net_g, hps = load_U2S_model(model_config_file, model_checkpoint_file, unit_type)

####################
# task format
####################
asr_format = "Please recognize the text corresponding to the following speech.\n"
tts_format = "Please synthesize the speech corresponding to the following text.\n"
chat_format = r'Please recognize the texts, emotion and pitch from the user question speech units and provide the texts, emotion, pitch and speech units for the assistant response. \nEmotion should be chosen from ["neutral", "happy", "sad", "angry", "surprised", "disgusted", "fearful"]. \nPitch should be chosen from ["low", "normal", "high"].\nYour output should be in json format.\nAn output example is:\n{"user question text": "", "user question emotion": "", "user question pitch": "", "assistant response text": "", "assistant response emotion": "", "assistant response pitch": "","assistant response speech": ""}\n\nuser question speech:'

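For orientation, the two directions above compose into a speech round trip. A minimal sketch, not part of the file, reusing the helpers just loaded (the wav path is hypothetical, and `s2u_extract_unit_demo` is assumed to return unit tokens in the `<|speech_N|>` format that the rest of this file parses):

# S2U: discretize a recorded waveform into unit tokens (hypothetical input path).
units = s2u_extract_unit_demo(s2u_model, './examples/hello.wav', model_name=s2u_model_name, reduced=reduced)

# U2S: strip the token wrappers into space-separated indices and re-synthesize,
# conditioned on one of the precomputed gender/emotion/speed/pitch style centroids.
content_unit = units.replace('<|speech_', '').replace('|>', ' ').strip()
style_emb = condition2style_centroid_embedding_dict['gender-female_emotion-neutral_speed-normal_pitch-normal']
synthesis(content_unit, style_emb, hps, net_g, './demo_audio/round_trip.wav')
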
@spaces.GPU(duration=5)
def s2u_asr(text, audio_file):
    return asr_format + s2u_extract_unit_demo(s2u_model, audio_file, model_name=s2u_model_name, reduced=reduced)

@spaces.GPU(duration=5)
def s2u_chat(text, audio_file):
    return chat_format + s2u_extract_unit_demo(s2u_model, audio_file, model_name=s2u_model_name, reduced=reduced)

def u2s_tts(text, audio_file):
    return tts_format + text

mode2func = dict(
    asr=s2u_asr,
    chat=s2u_chat,
    tts=u2s_tts,
)

##########################################
# LLM part
##########################################
import torch
from transformers import AutoModel, AutoProcessor, TextIteratorStreamer
from threading import Thread

model_name = "Emova-ollm/emova_llama3_1-8b"
model = AutoModel.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    use_flash_attn=True,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    token=auth_token).eval().cuda()
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=15)

def stream_response(model, inputs, streamer, prompt, gen_kwargs):
    # Run generation on a worker thread; the streamer is consumed on this one.
    thread = Thread(target=model.generate, kwargs=dict(
        streamer=streamer,
        **inputs,
        **gen_kwargs
    ))
    thread.start()

    generated_text = prompt
    for new_text in streamer:
        generated_text += new_text
        yield generated_text

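The generator above pairs `model.generate` on a background thread with `TextIteratorStreamer`. A minimal sketch of consuming it, not part of the file, assuming a text-only prompt and a small token budget:

prompt = "Hello!"
inputs = processor(text=[prompt], images=None, return_tensors="pt")
inputs.to(model.device)
for partial in stream_response(model, inputs, streamer, prompt, dict(do_sample=False, max_new_tokens=32, use_cache=True)):
    print(partial)  # the prompt plus the text generated so far
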
##########################################
# Gradio part
##########################################
no_change_btn = gr.Button()
enable_btn = gr.Button(interactive=True)
disable_btn = gr.Button(interactive=False)
server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"

def load_demo_refresh_model_list():
    print("load_demo.")
    state = default_conversation.copy()
    return state

def regenerate(state, image_process_mode):
    print("regenerate.")
    state.messages[-1][-1] = None
    prev_human_msg = state.messages[-2]
    if type(prev_human_msg[1]) in (tuple, list):
        prev_human_msg[1] = (*prev_human_msg[1][:2], image_process_mode, *prev_human_msg[1][3:])
    state.skip_next = False
    return (state, state.to_gradio_chatbot_public(), "", None, None) + (disable_btn,) * 2

def clear_history():
    print("clear_history.")
    state = default_conversation.copy()
    return (state, state.to_gradio_chatbot_public(), "", None) + (disable_btn,) * 2 + (None,)

############
# Show prompt in the chatbot
# Input: [state, textbox, imagebox, image_process_mode, audio_input, audio_mode]
# Return: [state, chatbot, textbox, imagebox, audio_input] + btn_list
############
def add_text(state, text, image, image_process_mode, audio_input, audio_mode):
    ############
    # Input legality checking
    ############
    print(f"add_text. len: {len(text)}")
    if len(text) <= 0 and image is None and audio_input is None:
        state.skip_next = True
        return (state, state.to_gradio_chatbot_public(), "", None, None) + (no_change_btn,) * 2

    ############
    # Re-initialize if an audio conversation has already been conducted
    ############
    for i, (role, msg) in enumerate(state.messages[state.offset:]):
        if isinstance(msg, tuple) and msg[-1] is not None:
            state = default_conversation.copy()
            break

    ############
    # Deal with image inputs
    ############
    if image is not None:
        if '<image>' not in text:
            text = text + '\n<image>'
        text = (text, image, image_process_mode, None)
        state = default_conversation.copy()

    ############
    # Deal with audio inputs
    ############
    if audio_input is not None or audio_mode == 'tts':
        if isinstance(text, tuple):
            if audio_mode == 'chat':
                prompt = mode2func[audio_mode](text[0][:-len("\n<image>")], audio_input)
                text = (prompt + "\n<image>", text[1], text[2], audio_input)
            elif audio_mode == 'tts':
                prompt = mode2func[audio_mode](text[0][:-len("\n<image>")], audio_input)
                text = (prompt, None, None, None)
            else:
                prompt = mode2func[audio_mode](text, audio_input)
                text = (prompt, None, None, audio_input)
        else:
            prompt = mode2func[audio_mode](text, audio_input)
            text = (prompt, None, None, audio_input)
        state = default_conversation.copy()
    state.append_message(state.roles[0], text)
    state.append_message(state.roles[1], None)
    state.skip_next = False
    print(str(state.messages))
    return (state, state.to_gradio_chatbot_public(), "", None, None) + (disable_btn,) * 2

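After `add_text`, a user turn is stored either as a plain string or as a 4-tuple `(prompt, image, image_process_mode, audio_path)`. A hypothetical chat-mode turn carrying both an image and a recorded question, not part of the file:

from PIL import Image

example_turn = (
    chat_format + "<|speech_12|> <|speech_34|>" + "\n<image>",  # prompt with hypothetical unit tokens
    Image.new("RGB", (64, 64)),   # placeholder for the uploaded image
    "Default",                    # image_process_mode
    "/tmp/gradio/question.wav",   # hypothetical recorded-audio path
)
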
############
# Get response
# Input: [state, temperature, top_p, max_output_tokens, speaker]
# Return: [state, chatbot] + btn_list
############
@spaces.GPU
def http_bot(state, temperature, top_p, max_new_tokens, speaker):
    print("http_bot.")

    if state.skip_next:
        yield (state, state.to_gradio_chatbot_public()) + (no_change_btn,) * 2
        return

    if len(state.messages) == state.offset + 2:
        # First round of conversation: pick the template matching the model name
        if 'llama-2' in model_name.lower():
            template_name = "llava_llama_2"
        elif "mistral" in model_name.lower() or "mixtral" in model_name.lower():
            if 'orca' in model_name.lower():
                template_name = "mistral_orca"
            elif 'hermes' in model_name.lower():
                template_name = "chatml_direct"
            else:
                template_name = "mistral_instruct"
        elif 'llava-v1.6-34b' in model_name.lower():
            template_name = "chatml_direct"
        elif "v1" in model_name.lower():
            if 'mmtag' in model_name.lower():
                template_name = "v1_mmtag"
            elif 'plain' in model_name.lower() and 'finetune' not in model_name.lower():
                template_name = "v1_mmtag"
            else:
                template_name = "llava_v1"
        elif "mpt" in model_name.lower():
            template_name = "mpt"
        elif "llama3" in model_name.lower():
            template_name = 'llama3_demo'
        else:
            if 'mmtag' in model_name.lower():
                template_name = "v0_mmtag"
            elif 'plain' in model_name.lower() and 'finetune' not in model_name.lower():
                template_name = "v0_mmtag"
            else:
                template_name = "llava_v0"

        new_state = conv_templates[template_name].copy()
        new_state.append_message(new_state.roles[0], state.messages[-2][1])
        new_state.append_message(new_state.roles[1], None)
        state = new_state

    # Construct prompt
    prompt = state.get_prompt()
    all_images = state.get_images(return_pil=True)
    all_image_hash = [hashlib.md5(image.tobytes()).hexdigest() for image in all_images]

    # Make requests
    pload = {
        "model": model_name,
        "prompt": prompt,
        "temperature": float(temperature),
        "top_p": float(top_p),
        "max_new_tokens": int(max_new_tokens),
        "stop": state.sep if state.sep_style in [SeparatorStyle.SINGLE, SeparatorStyle.MPT] else state.sep2,
        "images": f'List of {len(state.get_images())} images: {all_image_hash}',
    }
    print(f"==== request ====\n{pload}")
    pload['images'] = all_images

    # Process inputs
    inputs = processor(text=[prompt], images=all_images if len(all_images) > 0 else None, return_tensors="pt")
    inputs.to(model.device)
    if len(all_images) > 0:
        inputs['pixel_values'] = inputs['pixel_values'].to(model.dtype)

    # Process hyperparameters
    temperature = float(pload.get("temperature", 1.0))
    top_p = float(pload.get("top_p", 1.0))
    stop_str = pload.get("stop", None)
    do_sample = True if temperature > 0.001 else False
    max_context_length = getattr(model.config, 'max_position_embeddings', 2048)
    max_new_tokens = int(pload.get("max_new_tokens", 256))
    max_new_tokens = min(max_new_tokens, max_context_length - inputs['input_ids'].shape[1])
    gen_kwargs = dict(
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
        use_cache=True,
    )

    if max_new_tokens < 1:
        state.messages[-1][-1] = "Exceeds max token length. Please start a new conversation, thanks."
        yield (state, state.to_gradio_chatbot_public()) + (disable_btn,) * 2
        return

    state.messages[-1][-1] = "▌"
    yield (state, state.to_gradio_chatbot_public()) + (disable_btn,) * 2

    # Stream output
    try:
        for generated_text in stream_response(model, inputs, streamer, prompt, gen_kwargs):
            output = generated_text[len(prompt):].strip()
            if tts_format not in prompt and chat_format not in prompt:
                state.messages[-1][-1] = output + "▌"
            else:
                state.messages[-1][-1] = "▌"
                # state.messages[-1][-1] = "[😁 GENERATING AUDIO {}%...]".format(round(output.count("<|speech_") / max_new_tokens * 100, 1)) + "\n" + output + "▌"
            yield (state, state.to_gradio_chatbot_public()) + (disable_btn,) * 2
    except Exception as e:
        state.messages[-1][-1] = server_error_msg
        yield (state, state.to_gradio_chatbot_public()) + (enable_btn,) * 2
        return

    ################
    # decode output to audio
    ################
    temp_file = None
    if tts_format in prompt or chat_format in prompt:
        try:
            try:
                # Best-effort repair of JSON truncated by the token limit
                if output.startswith("{"):
                    if output.endswith("|>"):
                        output += "\"}"
                    elif output.endswith("\""):
                        output += "}"
                info_dict = json.loads(output)
                content_unit = info_dict['assistant response speech'].replace('<|speech_', '').replace('|>', ' ').strip()
                emotion = info_dict.get('assistant response emotion', 'neutral')
                speed = info_dict.get('assistant response speed', 'normal')
                pitch = info_dict.get('assistant response pitch', 'normal')
                gender = speaker.lower() if speaker else 'female'
            except Exception:
                content_unit = output.replace('<|speech_', '').replace('|>', ' ').strip()
                emotion = 'neutral'
                speed = "normal"
                pitch = "normal"
                gender = speaker.lower() if speaker else 'female'

            condition = f'gender-{gender}_emotion-{emotion}_speed-{speed}_pitch-{pitch}'
            style_centroid_file = condition2style_centroid_file_dict[condition]
            style_centroid_embedding = condition2style_centroid_embedding_dict[condition]
            print(condition)

            id = str(uuid.uuid4())
            os.makedirs("./demo_audio", exist_ok=True)
            synthesis(content_unit, style_centroid_embedding, hps, net_g, f"./demo_audio/{id}_temp_audio.wav")
            temp_file = f"./demo_audio/{id}_temp_audio.wav"
        except Exception as e:
            print(e)

    state.messages[-1][-1] = state.messages[-1][-1][:-1]
    if tts_format in prompt or chat_format in prompt:
        if temp_file is not None:
            state.messages[-1][-1] = (output, temp_file)
            yield (state, state.to_gradio_chatbot_public()) + (enable_btn,) * 2
        else:
            state.messages[-1][-1] = server_error_msg
            yield (state, state.to_gradio_chatbot_public()) + (enable_btn,) * 2
    else:
        yield (state, state.to_gradio_chatbot_public()) + (enable_btn,) * 2

    if temp_file is not None:
        os.remove(temp_file)

    print(f"{output}")

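The nested try in the decode step above patches up JSON that the token limit cut off mid-way. A self-contained illustration, not part of the file, on a hypothetical truncated generation:

import json

output = '{"assistant response text": "Hello!", "assistant response speech": "<|speech_1|><|speech_2|>'
if output.startswith("{"):
    if output.endswith("|>"):
        output += "\"}"   # close the dangling string and the object
    elif output.endswith("\""):
        output += "}"
info_dict = json.loads(output)
units = info_dict["assistant response speech"].replace("<|speech_", "").replace("|>", " ").strip()
print(units)  # -> 1 2
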
############
# Layout Markdown
############
title_markdown = ("""
<div style="display: flex; align-items: center; padding: 20px; border-radius: 10px; background-color: #f0f0f0;">
  <div style="margin-right: 20px;">
    <img src="https://emova-ollm.github.io/static/images/icons/emova.png" alt="Icon" style="width: 100px; height: 100px; border-radius: 10px;">
  </div>
  <div>
    <h1 style="margin: 0;">EMOVA: Empowering Language Models to See, Hear and Speak with Vivid Emotion</h1>
    <p style="margin: 10px 0;">
      1. To use the webcam and microphone, open <a href="chrome://flags/#unsafely-treat-insecure-origin-as-secure">chrome://flags/#unsafely-treat-insecure-origin-as-secure</a> and add this page's address to the list.<br/>
      2. To chat with EMOVA, upload images, enter texts or record audios, and then do not forget to <mark>click the 💬 Chat button</mark> ^v^!<br/>
      3. Increase <code>Max output tokens</code> if necessary to talk longer with EMOVA.
    </p>
  </div>
</div>
""")

tos_markdown = ("""
### Terms of use
By using this service, users are required to agree to the following terms:
The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research.
For an optimal experience, please use desktop computers for this demo, as mobile devices may compromise its quality.
""")

learn_more_markdown = ("""
### License
The service is a research preview intended for non-commercial use only, subject to the model [License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA and [Privacy Practices](https://chrome.google.com/webstore/detail/sharegpt-share-your-chatg/daiacboceoaocpibfodeljbdfacokfjb) of ShareGPT. Please contact us if you find any potential violation.

### Acknowledgement
The service is built upon [LLaVA](https://github.com/haotian-liu/LLaVA/). We thank the authors for open-sourcing their wonderful code.
""")

block_css = """
#buttons button {
    min-width: min(120px, 100%);
}

.message-row img {
    margin: 0px !important;
}

.avatar-container img {
    padding: 0px !important;
}
"""

############
# Layout Demo
############
def build_demo(embed_mode, cur_dir=None):
    textbox = gr.Textbox(label="Text", show_label=False, placeholder="Enter text or record audio in the right and then click 💬 Chat to talk with me ^v^", container=False, scale=6)
    audio_input = gr.Audio(label="Audio", sources=["microphone", "upload"], type="filepath", max_length=10, show_download_button=True, waveform_options=dict(sample_rate=16000), scale=2)
    with gr.Blocks(title="EMOVA", theme=gr.themes.Default(), css=block_css) as demo:
        state = gr.State()
        if not embed_mode:
            gr.Markdown(title_markdown)

        ##############
        # Chatbot
        ##############
        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                imagebox = gr.Image(type="pil", label="Image")
                image_process_mode = gr.Radio(
                    ["Crop", "Resize", "Pad", "Default"],
                    value="Default",
                    label="Preprocess for non-square image", visible=False)

                ##############
                # Parameters
                ##############
                with gr.Accordion("Parameters", open=True) as parameter_row:
                    temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.2, step=0.1, interactive=True, label="Temperature")
                    top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.1, interactive=True, label="Top P")
                    max_output_tokens = gr.Slider(minimum=0, maximum=4096, value=2048, step=32, interactive=True, label="Max output tokens")
                    speaker = gr.Radio(["Female", "Male"], label="Speaker")

            with gr.Column(scale=8):
                chatbot = gr.Chatbot(
                    elem_id="chatbot",
                    label="EMOVA Chatbot",
                    layout="bubble",
                    avatar_images=["examples/user_avator.png", "examples/icon_256.png"]
                )
                with gr.Row(equal_height=True):
                    textbox.render()
                    audio_input.render()
                with gr.Row(elem_id="buttons") as button_row:
                    submit_btn = gr.Button(value="💬 Chat", variant="primary")
                    # stop_btn = gr.Button(value="⏹️ Stop Generation", interactive=False)
                    regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False)
                    clear_btn = gr.Button(value="🗑️ Clear", interactive=False)

        ##############
        # Examples
        ##############
        if cur_dir is None:
            cur_dir = os.path.dirname(os.path.abspath(__file__))

        with gr.Row():
            with gr.Column(scale=9):
                gr.Examples(examples=[
                    [f"{cur_dir}/examples/emo-speech/what_is_your_name.wav"],
                    [f"{cur_dir}/examples/emo-speech/parent.wav"],
                    [f"{cur_dir}/examples/emo-speech/I_am_so_sad.wav"],
                    [f"{cur_dir}/examples/emo-speech/wedding(CH).wav"],
                ], inputs=[audio_input], label='Audio Examples')

        with gr.Row(equal_height=True):
            gr.Examples(examples=[
                [f"{cur_dir}/examples/image-text/example_1.png", "Why is this image funny?"],
                [f"{cur_dir}/examples/image-text/example_2.png", "First please perform reasoning, and think step by step to provide best answer to the following question:\n\nWhat is the original price for pork belly before discount?"],
                [f"{cur_dir}/examples/image-text/example_3.png", "Convert this table to markdown format."],
            ], inputs=[imagebox, textbox], label='Image Examples')
            gr.Examples(examples=[
                [f"{cur_dir}/examples/emo-speech/write_a_poem.jfif", f"{cur_dir}/examples/emo-speech/write_a_poem.wav"],
                [f"{cur_dir}/examples/emo-speech/I_am_happy_get_my_offer.webp", f"{cur_dir}/examples/emo-speech/I_am_happy_get_my_offer.wav"],
                [f"{cur_dir}/examples/structure-speech/names_of_main_actors.jpg", f"{cur_dir}/examples/structure-speech/names_of_main_actors.wav"],
            ], inputs=[imagebox, audio_input], label='Omni Examples 1')
            gr.Examples(examples=[
                [f"{cur_dir}/examples/structure-speech/how_to_save_water.png", f"{cur_dir}/examples/structure-speech/how_to_save_water.wav"],
                [f"{cur_dir}/examples/structure-speech/internet_coverage.png", f"{cur_dir}/examples/structure-speech/internet_coverage.wav"],
                [f"{cur_dir}/examples/structure-speech/how_to_use_website.PNG", f"{cur_dir}/examples/structure-speech/how_to_use_website.wav"],
            ], inputs=[imagebox, audio_input], label='Omni Examples 2')

        if not embed_mode:
            gr.Markdown(tos_markdown)
            gr.Markdown(learn_more_markdown)

        # Register listeners
        btn_list = [regenerate_btn, clear_btn]
        regenerate_btn.click(
            regenerate,
            [state, image_process_mode],
            [state, chatbot, textbox, imagebox, audio_input] + btn_list
        ).then(
            http_bot,
            [state, temperature, top_p, max_output_tokens, speaker],
            [state, chatbot] + btn_list,
        )

        clear_btn.click(
            clear_history,
            None,
            [state, chatbot, textbox, imagebox] + btn_list + [audio_input],
            queue=False
        )

        # Triggered when the user presses Enter in the textbox
        textbox.submit(
            add_text,
            [state, textbox, imagebox, image_process_mode, audio_input, gr.Number(value='chat', visible=False)],
            [state, chatbot, textbox, imagebox, audio_input] + btn_list,
            queue=False
        ).then(
            http_bot,
            [state, temperature, top_p, max_output_tokens, speaker],
            [state, chatbot] + btn_list,
        )

        submit_btn.click(
            add_text,
            [state, textbox, imagebox, image_process_mode, audio_input, gr.Number(value='chat', visible=False)],
            [state, chatbot, textbox, imagebox, audio_input] + btn_list
        ).then(
            http_bot,
            [state, temperature, top_p, max_output_tokens, speaker],
            [state, chatbot] + btn_list,
        )

        ##############
        # Demo loading
        ##############
        demo.load(
            load_demo_refresh_model_list,
            None,
            [state],
            queue=False
        )
    return demo


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--share", action="store_true")
    parser.add_argument("--embed", action="store_true")
    args = parser.parse_args()

    demo = build_demo(args.embed)
    demo.queue(
        api_open=False
    ).launch(
        favicon_path="./examples/icon_256.png",
        allowed_paths=["/"],
        share=args.share
    )
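
On a Space, app.py is the entrypoint and runs automatically; locally, the argparse flags above allow, e.g.:

    python app.py --share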

conversation_public.py · ADDED
@@ -0,0 +1,506 @@
import dataclasses
from enum import auto, Enum
from typing import List, Tuple
import base64
from io import BytesIO
from PIL import Image

tts_format = "Please synthesize the speech corresponding to the following text.\n"

class SeparatorStyle(Enum):
    """Different separator styles."""
    SINGLE = auto()
    TWO = auto()
    MPT = auto()
    PLAIN = auto()
    LLAMA_2 = auto()
    GLM4 = auto()


@dataclasses.dataclass
class Conversation:
    """A class that keeps all conversation history."""
    system: str
    roles: List[str]
    messages: List[List[str]]
    offset: int
    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
    sep: str = "###"
    sep2: str = None
    version: str = "Unknown"

    skip_next: bool = False

    def get_prompt(self):
        messages = self.messages
        if len(messages) > 0 and type(messages[0][1]) is tuple and messages[0][1][1] is not None:
            messages = self.messages.copy()
            init_role, init_msg = messages[0].copy()
            init_msg = init_msg[0].replace("<image>", "").strip()
            if 'mmtag' in self.version:
                messages[0] = (init_role, init_msg)
                messages.insert(0, (self.roles[0], "<Image><image></Image>"))
                messages.insert(1, (self.roles[1], "Received."))
            else:
                messages[0] = (init_role, "<image>\n" + init_msg)

        if self.sep_style == SeparatorStyle.SINGLE:
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if type(message) is tuple:
                        message, _, _ = message[:3]
                    ret += role + ": " + message + self.sep
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.TWO:
            seps = [self.sep, self.sep2]
            ret = self.system + seps[0]
            for i, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message[:3]
                    ret += role + ": " + message + seps[i % 2]
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.MPT:
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if type(message) is tuple:
                        message, _, _ = message[:3]
                    ret += role + message + self.sep
                else:
                    ret += role
        elif self.sep_style == SeparatorStyle.LLAMA_2:
            wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n" if len(msg) > 0 else msg
            wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
            ret = ""

            for i, (role, message) in enumerate(messages):
                if i == 0:
                    assert message, "first message should not be none"
                    assert role == self.roles[0], "first message should come from user"
                if message:
                    if type(message) is tuple:
                        message, _, _ = message[:3]
                    if i == 0:
                        message = wrap_sys(self.system) + message
                    if i % 2 == 0:
                        message = wrap_inst(message)
                        ret += self.sep + message
                    else:
                        ret += " " + message + " " + self.sep2
                else:
                    ret += ""
            ret = ret.lstrip(self.sep)
        elif self.sep_style == SeparatorStyle.PLAIN:
            seps = [self.sep, self.sep2]
            ret = self.system
            for i, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message[:3]
                    ret += message + seps[i % 2]
                else:
                    ret += ""
        elif self.sep_style == SeparatorStyle.GLM4:
            chat_roles = ("<|user|>", "<|assistant|>")  # renamed to avoid shadowing by the loop variable
            ret = self.system + chat_roles[0]
            for i, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message[:3]
                    ret += self.sep + message + chat_roles[(i + 1) % 2]
                else:
                    ret += ""
        else:
            raise ValueError(f"Invalid style: {self.sep_style}")

        return ret

    def append_message(self, role, message):
        if isinstance(self.messages, tuple):
            self.messages += ([role, message],)
        else:
            self.messages.append([role, message])

    def process_image(self, image, image_process_mode, return_pil=False, image_format='PNG', max_len=1344, min_len=672):
        if image_process_mode == "Pad":
            def expand2square(pil_img, background_color=(122, 116, 104)):
                width, height = pil_img.size
                if width == height:
                    return pil_img
                elif width > height:
                    result = Image.new(pil_img.mode, (width, width), background_color)
                    result.paste(pil_img, (0, (width - height) // 2))
                    return result
                else:
                    result = Image.new(pil_img.mode, (height, height), background_color)
                    result.paste(pil_img, ((height - width) // 2, 0))
                    return result
            image = expand2square(image)
        elif image_process_mode in ["Default", "Crop"]:
            pass
        elif image_process_mode == "Resize":
            image = image.resize((336, 336))
        else:
            raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
        if max(image.size) > max_len:
            max_hw, min_hw = max(image.size), min(image.size)
            aspect_ratio = max_hw / min_hw
            shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
            longest_edge = int(shortest_edge * aspect_ratio)
            W, H = image.size
            if H > W:
                H, W = longest_edge, shortest_edge
            else:
                H, W = shortest_edge, longest_edge
            image = image.resize((W, H))
        if return_pil:
            return image
        else:
            buffered = BytesIO()
            image.save(buffered, format=image_format)
            img_b64_str = base64.b64encode(buffered.getvalue()).decode()
            return img_b64_str

    def get_images(self, return_pil=False):
        images = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if type(msg) is tuple and msg[1] is not None:
                    msg, image, image_process_mode = msg[:3]
                    image = self.process_image(image, image_process_mode, return_pil=return_pil)
                    images.append(image)
        return images

    def to_gradio_chatbot(self):
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    msg, image, image_process_mode = msg
                    img_b64_str = self.process_image(
                        image, "Default", return_pil=False,
                        image_format='JPEG')
                    img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
                    msg = img_str + msg.replace('<image>', '').strip()
                    ret.append([msg, None])
                else:
                    ret.append([msg, None])
            else:
                ret[-1][-1] = msg
        return ret

    def to_gradio_chatbot_public(self):
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    msg, image, image_process_mode, audio_input = msg
                    ret_msg = ""
                    if image is not None:
                        img_b64_str = self.process_image(
                            image, "Default", return_pil=False,
                            image_format='JPEG')
                        img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
                        ret_msg += img_str
                    if audio_input is not None:
                        audio_b64_str = base64.b64encode(open(audio_input, "rb").read()).decode("utf-8")
                        audio_str = f'<audio src="data:audio/wav;base64,{audio_b64_str}" controls ></audio>'
                        ret_msg += audio_str
                    else:
                        ret_msg += msg.replace('<image>', '').replace(tts_format, '').strip()
                    ret.append([ret_msg, None])
                else:
                    ret.append([msg, None])
            else:
                if type(msg) is tuple:
                    audio_b64_str = base64.b64encode(open(msg[1], "rb").read()).decode("utf-8")
                    msg = f'<audio src="data:audio/wav;base64,{audio_b64_str}" controls autoplay></audio>'
                ret[-1][-1] = msg
        return ret

    def copy(self):
        return Conversation(
            system=self.system,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            version=self.version)

    def dict(self):
        if len(self.get_images()) > 0:
            return {
                "system": self.system,
                "roles": self.roles,
                "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
                "offset": self.offset,
                "sep": self.sep,
                "sep2": self.sep2,
            }
        return {
            "system": self.system,
            "roles": self.roles,
            "messages": self.messages,
            "offset": self.offset,
            "sep": self.sep,
            "sep2": self.sep2,
        }

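To see how `get_prompt` renders the MPT-style llama3 templates, a minimal sketch, not part of the file, using `conv_llama3_demo` defined below:

conv = conv_llama3_demo.copy()
conv.append_message(conv.roles[0], "Hi!")
conv.append_message(conv.roles[1], None)
print(conv.get_prompt())
# -> system prompt + "<|eot_id|>" + "<|start_header_id|>user<|end_header_id|>\n\n" + "Hi!" + "<|eot_id|>"
#    + "<|start_header_id|>assistant<|end_header_id|>\n\n"
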

conv_vicuna_v0 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(
        ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
        ("Assistant",
         "Renewable energy sources are those that can be replenished naturally in a relatively "
         "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
         "Non-renewable energy sources, on the other hand, are finite and will eventually be "
         "depleted, such as coal, oil, and natural gas. Here are some key differences between "
         "renewable and non-renewable energy sources:\n"
         "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
         "energy sources are finite and will eventually run out.\n"
         "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
         "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
         "and other negative effects.\n"
         "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
         "have lower operational costs than non-renewable sources.\n"
         "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
         "locations than non-renewable sources.\n"
         "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
         "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
         "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
         "non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
    ),
    offset=2,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

conv_vicuna_v1 = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the user's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

conv_llama_2 = Conversation(
    system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="<s>",
    sep2="</s>",
)

conv_llava_llama_2 = Conversation(
    system="You are a helpful language and vision assistant. "
           "You are able to understand the visual content that the user provides, "
           "and assist the user with a variety of tasks using natural language.",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="<s>",
    sep2="</s>",
)

conv_mpt = Conversation(
    system="""<|im_start|>system
A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    version="mpt",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|im_end|>",
)

conv_llava_plain = Conversation(
    system="",
    roles=("", ""),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.PLAIN,
    sep="\n",
)

conv_llava_v0 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

conv_llava_v0_mmtag = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language. "
           "The visual content will be provided with the following format: <Image>visual content</Image>.",
    roles=("Human", "Assistant"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
    version="v0_mmtag",
)

conv_llava_v1 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

conv_llava_v1_mmtag = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language. "
           "The visual content will be provided with the following format: <Image>visual content</Image>.",
    roles=("USER", "ASSISTANT"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
    version="v1_mmtag",
)

conv_mistral_instruct = Conversation(
    system="",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="",
    sep2="</s>",
)

conv_chatml_direct = Conversation(
    system="""<|im_start|>system
Answer the questions.""",
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    version="mpt",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|im_end|>",
)

conv_llama3 = Conversation(
    system="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.""",
    roles=("<|start_header_id|>user<|end_header_id|>\n\n", "<|start_header_id|>assistant<|end_header_id|>\n\n"),
    version="llama3",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|eot_id|>",
)

conv_llama3_demo = Conversation(
    system="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. Your name is emova, and you are purely developed by the emova Team.""",
    roles=("<|start_header_id|>user<|end_header_id|>\n\n", "<|start_header_id|>assistant<|end_header_id|>\n\n"),
    version="llama3_demo",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|eot_id|>",
)

conv_llama3_without_system = Conversation(
    system="",
    roles=("<|start_header_id|>user<|end_header_id|>\n\n", "<|start_header_id|>assistant<|end_header_id|>\n\n"),
    version="llama3_without_system",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|eot_id|>",
)

conv_llama3_without_systemV2 = Conversation(
    system="",
    roles=("user:", "assistant:"),
    version="llama3_without_systemv2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="\n\n",
)

conv_qwen2 = Conversation(
    system='<|im_start|>system\nYou are a helpful assistant.',
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    version="qwen2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|im_end|>\n",
)

conv_glm4 = Conversation(
    system='[gMASK]<sop>',
    roles=("<|user|>", "<|assistant|>"),
    version="glm4",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.GLM4,
    sep="\n",
)


default_conversation = conv_vicuna_v1
conv_templates = {
    "default": conv_vicuna_v0,
    "v0": conv_vicuna_v0,
    "v1": conv_vicuna_v1,
    "vicuna_v1": conv_vicuna_v1,
    "llama_2": conv_llama_2,
    "mistral_instruct": conv_mistral_instruct,
    "chatml_direct": conv_chatml_direct,
    "mistral_direct": conv_chatml_direct,

    "plain": conv_llava_plain,
    "v0_plain": conv_llava_plain,
    "llava_v0": conv_llava_v0,
    "v0_mmtag": conv_llava_v0_mmtag,
    "llava_v1": conv_llava_v1,
    "v1_mmtag": conv_llava_v1_mmtag,
    "llava_llama_2": conv_llava_llama_2,
    "llama3": conv_llama3,
    "llama3_demo": conv_llama3_demo,
    "llama3_without_system": conv_llama3_without_system,
    "conv_llama3_without_systemV2": conv_llama3_without_systemV2,

    "mpt": conv_mpt,
    "qwen2": conv_qwen2,
    "glm4": conv_glm4,
}


if __name__ == "__main__":
    print(default_conversation.get_prompt())

requirements.txt · ADDED
@@ -0,0 +1,29 @@
omegaconf
torch==2.0.1
torchvision==0.15.2
transformers==4.44.0
sentencepiece==0.1.99
accelerate==0.33.0
einops==0.6.1
einops-exts==0.0.4
timm==0.6.13
scipy
gradio

monotonic_align
librosa==0.8.0
phonemizer
unidecode
hydra-core==1.3.2
pytorch_lightning==1.1.0
wget
wrapt
onnx
frozendict
inflect
braceexpand
webdataset
torch_stft
sox
editdistance
numpy==1.23.5