KaiChen1998 committed
Commit 4722d2a · 1 Parent(s): 7351eb1

reformulate demo interface

Files changed (4)
  1. README.md +1 -1
  2. app.py +45 -58
  3. conversation_public.py +11 -0
  4. requirements.txt +2 -2
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 📚
 colorFrom: yellow
 colorTo: gray
 sdk: gradio
-sdk_version: 5.1.0
+sdk_version: 5.4.0
 app_file: app.py
 pinned: false
 license: mit
app.py CHANGED
@@ -5,6 +5,9 @@ import os
 import time
 import hashlib
 import uuid
+import traceback
+import logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
 
 import spaces
 import gradio as gr
@@ -51,11 +54,11 @@ asr_format = "Please recognize the text corresponding to the follwing speech.\n"
 tts_format = "Please synthesize the speech corresponding to the follwing text.\n"
 chat_format = r'Please recognize the texts, emotion and pitch from the user question speech units and provide the texts, emotion, pitch and speech units for the assistant response. \nEmotion should be chosen from ["neutral", "happy", "sad", "angry", "surprised", "disgusted", "fearful"]. \nPitch should be chosen from ["low", "normal", "high"].\nYour output should be in json format.\nAn output example is:\n{"user question text": "", "user question emotion": "", "user question pitch": "", "assistant response text": "", "assistant response emotion": "", "assistant response pitch": "","assistant response speech": ""}\n\nuser question speech:'
 
-@spaces.GPU(duration=20)
+@spaces.GPU(duration=15)
 def s2u_asr(text, audio_file):
     return asr_format + s2u_extract_unit_demo(s2u_model, audio_file, model_name=s2u_model_name, reduced=reduced)
 
-@spaces.GPU(duration=20)
+@spaces.GPU(duration=15)
 def s2u_chat(text, audio_file):
     return chat_format + s2u_extract_unit_demo(s2u_model, audio_file, model_name=s2u_model_name, reduced=reduced)
 
@@ -108,12 +111,12 @@ disable_btn = gr.Button(interactive=False)
 server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
 
 def load_demo_refresh_model_list():
-    print(f"load_demo.")
+    logging.info(f"load_demo.")
     state = default_conversation.copy()
     return state
 
 def regenerate(state, image_process_mode):
-    print(f"regenerate.")
+    logging.info(f"regenerate.")
     state.messages[-1][-1] = None
     prev_human_msg = state.messages[-2]
     if type(prev_human_msg[1]) in (tuple, list):
@@ -122,7 +125,7 @@ def regenerate(state, image_process_mode):
     return (state, state.to_gradio_chatbot_public(), "", None, None) + (disable_btn,) * 2
 
 def clear_history():
-    print(f"clear_history.")
+    logging.info(f"clear_history.")
     state = default_conversation.copy()
     return (state, state.to_gradio_chatbot_public(), "", None) + (disable_btn,) * 2 + (None,)
 
@@ -135,7 +138,7 @@ def add_text(state, text, image, image_process_mode, audio_input, audio_mode):
     ############
     # Input legality checking
    ############
-    print(f"add_text. len: {len(text)}")
+    logging.info(f"add_text. len: {len(text)}")
     if len(text) <= 0 and image is None and audio_input is None:
         state.skip_next = True
         return (state, state.to_gradio_chatbot_public(), "", None, None) + (no_change_btn,) * 2
@@ -178,7 +181,7 @@ def add_text(state, text, image, image_process_mode, audio_input, audio_mode):
     state.append_message(state.roles[0], text)
     state.append_message(state.roles[1], None)
     state.skip_next = False
-    print(str(state.messages))
+    logging.info(str(state.messages))
     return (state, state.to_gradio_chatbot_public(), "", None, None) + (disable_btn,) * 2
 
 ############
@@ -186,9 +189,9 @@ def add_text(state, text, image, image_process_mode, audio_input, audio_mode):
 # Input: [state, temperature, top_p, max_output_tokens, speaker]
 # Return: [state, chatbot] + btn_list
 ############
-@spaces.GPU(duration=120)
+@spaces.GPU(duration=90)
 def http_bot(state, temperature, top_p, max_new_tokens, speaker):
-    print(f"http_bot.")
+    logging.info(f"http_bot.")
 
     if state.skip_next:
         yield (state, state.to_gradio_chatbot_public()) + (no_change_btn,) * 2
@@ -196,35 +199,12 @@ def http_bot(state, temperature, top_p, max_new_tokens, speaker):
 
     if len(state.messages) == state.offset + 2:
         # First round of conversation
-        if 'llama-2' in model_name.lower():
-            template_name = "llava_llama_2"
-        elif "mistral" in model_name.lower() or "mixtral" in model_name.lower():
-            if 'orca' in model_name.lower():
-                template_name = "mistral_orca"
-            elif 'hermes' in model_name.lower():
-                template_name = "chatml_direct"
-            else:
-                template_name = "mistral_instruct"
-        elif 'llava-v1.6-34b' in model_name.lower():
-            template_name = "chatml_direct"
-        elif "v1" in model_name.lower():
-            if 'mmtag' in model_name.lower():
-                template_name = "v1_mmtag"
-            elif 'plain' in model_name.lower() and 'finetune' not in model_name.lower():
-                template_name = "v1_mmtag"
-            else:
-                template_name = "llava_v1"
-        elif "mpt" in model_name.lower():
-            template_name = "mpt"
-        elif "llama3" in model_name.lower():
+        if "llama3" in model_name.lower():
             template_name = 'llama3_demo'
+        elif "qwen2" in model_name.lower():
+            template_name = 'qwen2_demo'
         else:
-            if 'mmtag' in model_name.lower():
-                template_name = "v0_mmtag"
-            elif 'plain' in model_name.lower() and 'finetune' not in model_name.lower():
-                template_name = "v0_mmtag"
-            else:
-                template_name = "llava_v0"
+            template_name = "default"
 
         new_state = conv_templates[template_name].copy()
         new_state.append_message(new_state.roles[0], state.messages[-2][1])
@@ -234,7 +214,6 @@ def http_bot(state, temperature, top_p, max_new_tokens, speaker):
     # Construct prompt
     prompt = state.get_prompt()
     all_images = state.get_images(return_pil=True)
-    all_image_hash = [hashlib.md5(image.tobytes()).hexdigest() for image in all_images]
 
     # Make requests
     pload = {
@@ -244,10 +223,9 @@ def http_bot(state, temperature, top_p, max_new_tokens, speaker):
         "top_p": float(top_p),
         "max_new_tokens": int(max_new_tokens),
         "stop": state.sep if state.sep_style in [SeparatorStyle.SINGLE, SeparatorStyle.MPT] else state.sep2,
-        "images": f'List of {len(state.get_images())} images: {all_image_hash}',
+        "images": f'List of {len(state.get_images())} images: {all_images}',
     }
-    print(f"==== request ====\n{pload}")
-    pload['images'] = all_images
+    logging.info(f"==== request ====\n{pload}")
 
     # Process inputs
     inputs = processor(text=[prompt], images=all_images if len(all_images) > 0 else None, return_tensors="pt")
@@ -290,7 +268,8 @@ def http_bot(state, temperature, top_p, max_new_tokens, speaker):
             # state.messages[-1][-1] = "[😁 GENERATING AUDIO {}%...]".format(round(output.count("<|speech_") / max_new_tokens * 100, 1)) + "\n" + output + "▌"
             yield (state, state.to_gradio_chatbot_public()) + (disable_btn,) * 2
     except Exception as e:
-        print(e)
+        os.system("nvidia-smi")
+        logging.info(traceback.print_exc())
         state.messages[-1][-1] = server_error_msg
         yield (state, state.to_gradio_chatbot_public()) + (enable_btn,) * 2
         return
@@ -323,14 +302,15 @@ def http_bot(state, temperature, top_p, max_new_tokens, speaker):
         condition = f'gender-{gender}_emotion-{emotion}_speed-{speed}_pitch-{pitch}'
         style_centroid_file = condition2style_centroid_file_dict[condition]
         style_centroid_embedding = condition2style_centroid_embedding_dict[condition].cuda()
-        print(condition)
+        logging.info(condition)
 
         id = str(uuid.uuid4())
        os.makedirs("./demo_audio", exist_ok=True)
         synthesis(content_unit, style_centroid_embedding, hps, net_g, f"./demo_audio/{id}_temp_audio.wav")
         temp_file = f"./demo_audio/{id}_temp_audio.wav"
     except Exception as e:
-        print(e)
+        os.system("nvidia-smi")
+        logging.info(traceback.print_exc())
 
     state.messages[-1][-1] = state.messages[-1][-1][:-1]
     if tts_format in prompt or chat_format in prompt:
@@ -346,39 +326,48 @@ def http_bot(state, temperature, top_p, max_new_tokens, speaker):
     if temp_file is not None:
         os.system("rm {}".format(temp_file))
 
-    print(f"{output}")
+    logging.info(f"{output}")
 
 ############
 # Layout Markdown
 ############
 title_markdown = ("""
 <div style="display: flex; align-items: center; padding: 20px; border-radius: 10px; background-color: #f0f0f0;">
-  <div style="margin-right: 20px;">
+  <div style="margin-left: 20px; margin-right: 40px;">
     <img src="https://emova-ollm.github.io/static/images/icons/emova.png" alt="Icon" style="width: 100px; height: 100px; border-radius: 10px;">
   </div>
   <div>
-    <h1 style="margin: 0;">EMOVA: Empowering Language Models to See, Hear and Speak with Vivid Emotion</h2>
-    <p style="margin: 10px 0;">
-      1. To chat with EMOVA, upload images, enter texts or record audios and then do not forget to <mark>Click 💬 Chat Button</mark> ^v^!<br/>
-      2. Heighten the <code>Max output tokens</code> if necessary to talk longer with EMOVA.
+    <h1 style="margin: 0;">EMOVA: Empowering Language Models to See, Hear and Speak with Vivid Emotions</h1>
+    <h2 style="margin: 10px 0;">📃 <a href="https://arxiv.org/abs/2409.18042" style="font-weight: 300;">Paper</a> | 💻 <a href="https://github.com/emova-ollm/EMOVA" style="font-weight: 300;">Code</a> | 🤗 <a href="https://huggingface.co/Emova-ollm" style="font-weight: 300;">HuggingFace</a> | 🌐 <a href="https://emova-ollm.github.io/" style="font-weight: 300;">Website</a></h2>
+    <p style="margin: 20px 0;">
+      <strong>1. To chat with EMOVA, upload images, enter texts or record audios and then do not forget to <mark>Click 💬 Chat Button</mark> ^v^!</strong><br/>
+      <strong>2. Heighten the <code>Max output tokens</code> if necessary to talk longer with EMOVA.</strong>
     </p>
   </div>
 </div>
 """)
 
 tos_markdown = ("""
-### Terms of use
+## Terms of use
 By using this service, users are required to agree to the following terms:
 The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research.
 For an optimal experience, please use desktop computers for this demo, as mobile devices may compromise its quality.
 """)
 
 learn_more_markdown = ("""
-### License
+## License
 The service is a research preview intended for non-commercial use only, subject to the model [License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA and [Privacy Practices](https://chrome.google.com/webstore/detail/sharegpt-share-your-chatg/daiacboceoaocpibfodeljbdfacokfjb) of ShareGPT. Please contact us if you find any potential violation.
 
-### Acknowledgement
+## Acknowledgement
 The service is built upon [LLaVA](https://github.com/haotian-liu/LLaVA/). We thanks the authors for open-sourcing the wonderful code.
+
+## Citation
+<pre><code>@article{chen2024emova,
+  title={Emova: Empowering language models to see, hear and speak with vivid emotions},
+  author={Chen, Kai and Gou, Yunhao and Huang, Runhui and Liu, Zhili and Tan, Daxin and Xu, Jing and Wang, Chunwei and Zhu, Yi and Zeng, Yihan and Yang, Kuo and others},
+  journal={arXiv preprint arXiv:2409.18042},
+  year={2024}
+}</code></pre>
 """)
 
 block_css = """
@@ -398,7 +387,7 @@ block_css = """
 ############
 # Layout Demo
 ############
-def build_demo(embed_mode, cur_dir=None):
+def build_demo(embed_mode):
     textbox = gr.Textbox(label="Text", show_label=False, placeholder="Enter text or record audio in the right and then click 💬 Chat to talk with me ^v^", container=False, scale=6)
     audio_input = gr.Audio(label="Audio", sources=["microphone", "upload"], type="filepath", max_length=10, show_download_button=True, waveform_options=dict(sample_rate=16000), scale=2)
     with gr.Blocks(title="EMOVA", theme=gr.themes.Default(), css=block_css) as demo:
@@ -424,7 +413,7 @@ def build_demo(embed_mode, cur_dir=None):
                 temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.2, step=0.1, interactive=True, label="Temperature")
                 top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.1, interactive=True, label="Top P")
                 max_output_tokens = gr.Slider(minimum=0, maximum=4096, value=2048, step=32, interactive=True, label="Max output tokens")
-                speaker = gr.Radio(["Female", "Male"], label="Speaker")
+                speaker = gr.Radio(["Female", "Male"], value="Female", label="Speaker")
 
             with gr.Column(scale=8):
                 chatbot = gr.Chatbot(
@@ -445,15 +434,12 @@ def build_demo(embed_mode, cur_dir=None):
         ##############
         # Examples
         ##############
-        if cur_dir is None:
-            cur_dir = os.path.dirname(os.path.abspath(__file__))
-
         with gr.Row():
            with gr.Column(scale=9):
                gr.Examples(examples=[
                    ["./examples/emo-speech/what_is_your_name.wav"],
-                   ["./examples/emo-speech/parent.wav"],
                    ["./examples/emo-speech/I_am_so_sad.wav"],
+                   ["./examples/emo-speech/parent.wav"],
                    ["./examples/emo-speech/wedding(CH).wav"],
                ], inputs=[audio_input], label='Audio Examples (Click to load the examples~)')
 
@@ -538,6 +524,7 @@ args = parser.parse_args()
 
 demo = build_demo(args.embed)
 demo.queue(
+    max_size=10,
     api_open=False
 ).launch(
     favicon_path="./examples/icon_256.png",
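
A minimal, self-contained sketch of the logging setup introduced above (illustrative only, not part of the commit). One note on the except blocks: traceback.format_exc() returns the traceback as a string that can be passed to the logger, whereas traceback.print_exc() writes to stderr and returns None, so logging its return value records only "None".

import logging
import traceback

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')

def risky_step():
    # Stand-in for the model inference / speech synthesis calls wrapped in try/except.
    raise RuntimeError("simulated failure")

try:
    risky_step()
except Exception:
    # format_exc() captures the current traceback as a string suitable for logging.
    logging.error(traceback.format_exc())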
conversation_public.py CHANGED
@@ -462,6 +462,16 @@ conv_qwen2 = Conversation(
     sep="<|im_end|>\n",
 )
 
+conv_qwen2_demo = Conversation(
+    system='<|im_start|>system\nYou are a helpful assistant. Your name is emova, and you are purely developed by the emova Team.',
+    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+    version="qwen2_demo",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.MPT,
+    sep="<|im_end|>\n",
+)
+
 conv_glm4 = Conversation(
     system='[gMASK]<sop>',
     roles=("<|user|>", "<|assistant|>"),
@@ -498,6 +508,7 @@ conv_templates = {
 
     "mpt": conv_mpt,
     "qwen2": conv_qwen2,
+    "qwen2_demo": conv_qwen2_demo,
     "glm4": conv_glm4,
 }
 
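For context, a minimal sketch of how the new qwen2_demo template would be selected and turned into a prompt, assuming the Conversation API already used in app.py (copy(), append_message(), get_prompt()); the model name below is hypothetical and the snippet is illustrative, not part of the commit.

from conversation_public import conv_templates

model_name = "emova-qwen2-7b"  # hypothetical model name, for illustration only

# Mirrors the template selection added to http_bot() in app.py.
if "llama3" in model_name.lower():
    template_name = "llama3_demo"
elif "qwen2" in model_name.lower():
    template_name = "qwen2_demo"
else:
    template_name = "default"

conv = conv_templates[template_name].copy()
conv.append_message(conv.roles[0], "What is your name?")
conv.append_message(conv.roles[1], None)
print(conv.get_prompt())  # ChatML-style prompt built with the <|im_start|>/<|im_end|> separators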
requirements.txt CHANGED
@@ -1,7 +1,7 @@
 omegaconf
 # torch
 # torchvision
-transformers==4.44.0
+transformers==4.44.2
 sentencepiece==0.1.99
 accelerate==0.33.0
 einops==0.6.1
@@ -10,7 +10,7 @@ timm==0.6.13
 # flash_attn
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
 scipy
-gradio
+# gradio
 
 monotonic_align
 librosa==0.8.0
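
A small runtime sanity check (illustrative, not part of the commit) for the updated pins, using the standard importlib.metadata module; gradio is commented out in requirements.txt presumably because the Space's sdk_version in README.md (5.4.0) now provides it.

from importlib.metadata import version, PackageNotFoundError

# transformers is pinned to 4.44.2 in requirements.txt; gradio is expected from the Space SDK (5.4.0).
for pkg, expected in [("transformers", "4.44.2"), ("gradio", "5.4.0")]:
    try:
        installed = version(pkg)
    except PackageNotFoundError:
        installed = "not installed"
    print(f"{pkg}: installed={installed}, expected={expected}")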