AiCoderv2 committed
Commit 455e799 · verified · 1 Parent(s): 98617f0

Update app.py

Files changed (1):
  1. app.py +29 -70
app.py CHANGED
@@ -1,77 +1,36 @@
 import gradio as gr
-import torch
-import soundfile as sf
-from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
-from qwen_omni_utils import process_mm_info
-
-MODEL_ID = "Qwen/Qwen2.5-Omni-7B"
-
-# Load model & processor
-model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
-    MODEL_ID, torch_dtype="auto", device_map="auto"
-)
-processor = Qwen2_5OmniProcessor.from_pretrained(MODEL_ID)
-
-SYSTEM_MESSAGE = {
-    "role": "system",
-    "content": [
-        {
-            "type": "text",
-            "text": "You are Qwen, a virtual human capable of understanding text, image, audio, and video, and responding with text and natural speech."
-        }
-    ],
-}
-
-def infer(conversation, use_audio=True):
-    # Apply template and extract modalities
-    text = processor.apply_chat_template(
-        conversation + [SYSTEM_MESSAGE],
-        add_generation_prompt=True,
-        tokenize=False,
-        return_dict=True,
-        use_audio_in_video=use_audio,
-    )
-    audios, images, videos = process_mm_info(conversation, use_audio_in_video=use_audio)
-    inputs = processor(text=text, audios=audios, images=images, videos=videos, return_tensors="pt", padding=True).to(model.device)
-
-    text_ids, audio = model.generate(**inputs, use_audio_in_video=use_audio)
-    reply_text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-
-    audio_path = None
-    if audio is not None and use_audio:
-        audio_np = audio.reshape(-1).detach().cpu().numpy()
-        audio_path = "output.wav"
-        sf.write(audio_path, audio_np, samplerate=24000)
-
-    return reply_text, audio_path
-
-def chat_interface(input_text, uploaded_image=None, uploaded_audio=None, uploaded_video=None):
-    conv = [SYSTEM_MESSAGE]
-    user_message = []
-    if input_text:
-        user_message.append({"type": "text", "text": input_text})
-    if uploaded_image:
-        user_message.append({"type": "image", "image": uploaded_image.name})
-    if uploaded_audio:
-        user_message.append({"type": "audio", "audio": uploaded_audio.name})
-    if uploaded_video:
-        user_message.append({"type": "video", "video": uploaded_video.name})
-    conv.append({"role": "user", "content": user_message})
-
-    reply, audio_file = infer(conv)
-    return reply, audio_file
-
-# Gradio interface
-iface = gr.Interface(
-    fn=chat_interface,
-    inputs=[
-        gr.Textbox(label="Enter text"),
-        gr.File(label="Upload image"),
-        gr.File(label="Upload audio"),
-        gr.File(label="Upload video"),
-    ],
-    outputs=[gr.Textbox(label="Response"), gr.Audio(label="Speech Output")],
-    title="Qwen2.5-Omni Multimodal Assistant",
-    description="Upload any image/audio/video + text prompt and get back text + speech",
-)
-iface.launch()
+from train_and_serve import generate_video_with_audio
+import threading
+
+MAX_CPU = 16  # assume availability
+
+chat_history = []
+
+def chatbot_response(user_msg):
+    # Simple canned reply with accumulated history
+    resp = f"I heard: '{user_msg}'. Ask me about video generation!"
+    chat_history.append((user_msg, resp))
+    # Return the full history: gr.Chatbot renders a list of (user, bot) pairs
+    return chat_history
+
+def handle_prompt(prompt, song_text):
+    vpath, apath = generate_video_with_audio(prompt, song_text=song_text)
+    return vpath, apath
+
+with gr.Blocks() as demo:
+    gr.Markdown("# 🎬 FineToon Video Chat & Generator")
+    with gr.Row():
+        with gr.Column(scale=2):
+            prompt = gr.Textbox(label="Video Prompt Text")
+            song = gr.Textbox(label="Optional Song Lyrics / Voice Text")
+            gen_btn = gr.Button("Generate Video")
+            video_out = gr.Video(label="Generated Video")
+            audio_out = gr.Audio(label="Generated Audio (Song / TTS)")
+        with gr.Column(scale=1):
+            chat_in = gr.Textbox(label="Chat with Assistant")
+            chat_out = gr.Chatbot(label="Conversation")
+
+    gen_btn.click(handle_prompt, inputs=[prompt, song], outputs=[video_out, audio_out])
+    chat_in.submit(chatbot_response, inputs=chat_in, outputs=chat_out)
+
+demo.queue(concurrency_count=1, max_size=4).launch()
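
The rewritten app.py calls generate_video_with_audio from a train_and_serve module that is not touched by this commit, so the Space will only run if that module exists alongside it. A minimal stub of the signature the call site assumes (the module name comes from the import above; the parameter types and body below are inferred placeholders, not the actual implementation):

    # train_and_serve.py -- hypothetical stub; only the signature is implied
    # by the call in app.py: generate_video_with_audio(prompt, song_text=...)
    def generate_video_with_audio(prompt: str, song_text: str = "") -> tuple[str, str]:
        """Return (video_path, audio_path) for a prompt and optional lyrics."""
        # Placeholder: the real generation pipeline lives in train_and_serve.
        raise NotImplementedError("provided by the real train_and_serve module")

One compatibility caveat: concurrency_count is a Gradio 3.x queue() argument that was removed in Gradio 4, so the final line assumes the Space pins gradio<4; on Gradio 4 the equivalent would be queue(max_size=4) with a per-event concurrency_limit.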