from huggingface_hub import InferenceClient
from pydub import AudioSegment
import gradio as gr
import datetime
import edge_tts
import asyncio
import subprocess

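# --- Video rendering ---
# Assumes ffmpeg is available on PATH, that the background clips
# missVN.mp4 / missCN.mp4 sit next to this script, and that the drawtext
# fontfile placeholder below is replaced with a real .ttf on your machine.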
def create_video(audio_file, turn, output_video="output.mp4"):
    """Overlay the turn's speech audio on a looping background clip."""
    background_video = "missVN.mp4" if turn == "Miss AI Vietnam" else "missCN.mp4"

    command = [
        "ffmpeg",
        "-stream_loop", "-1",  # loop the background video until the audio ends
        "-i", background_video,
        "-i", audio_file,
        "-vf", "drawtext=text='MISS AI':fontcolor=white:fontsize=100:fontfile=/path/to/font.ttf:x=10:y=10",
        "-c:v", "libx264",
        "-c:a", "aac",
        "-shortest",  # trim the output to the shorter stream (the audio)
        "-y",         # overwrite the output file without prompting
        output_video,
    ]

    subprocess.run(command, check=True)
    return output_video

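# Both contestants are served by the same hosted Mixtral model through the
# Hugging Face Inference API; depending on your account setup you may need
# to be logged in (e.g. via `huggingface-cli login`) for these calls to work.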
MissAIVietnam = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
MissAIChina = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")

# Module-level debate state, mutated by the Gradio callbacks below.
topic = None
position_1 = None
position_2 = None
turn = None
history = []
audio_files = []

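# --- Debate logic ---
# Each call below is stateless on the model side: only the opponent's most
# recent reply is sent as the user message, not the full debate history.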
def generate_response(llm, position, who, topic, message):
    """Stream a debate reply from `llm`, speaking as `who`."""
    if who == "Miss AI Vietnam":
        system_message = {
            "role": "system",
            "content": f"You are Miss AI Vietnam, tasked with defending the position '{position}' on the topic '{topic}'. "
                       f"Only write the spoken content, without any notes or explanations. Your answer should be concise, logical, and convincing, focusing on the topic while exploiting the opponent's weak points to give insightful counter-arguments. "
                       f"Ensure that your responses are thoughtful, evidence-based, and persuasive. Keep them concise—aim for 4 to 5 lines in a single paragraph, with the entire response not exceeding 100 words."
        }
    elif who == "Miss AI China":
        system_message = {
            "role": "system",
            "content": f"You are Miss AI China, tasked with defending the position '{position}' on the topic '{topic}'. "
                       f"Your responses must be concise, logical, and persuasive, with a focus on economic and technological perspectives. "
                       f"Ensure that your responses are thoughtful, evidence-based, and persuasive. Keep them concise—aim for 4 to 5 lines in a single paragraph, with the entire response not exceeding 100 words. "
                       f"Only write the spoken content, without any notes or explanations."
        }
    else:
        raise ValueError("Invalid participant name.")

    messages = [system_message, {"role": "user", "content": message}]

    response = f"{who}:\n"
    for message_chunk in llm.chat_completion(
            messages, max_tokens=256, stream=True, temperature=0.4, top_p=0.95):
        # The final streamed chunk can carry a None delta; guard against it.
        response += message_chunk.choices[0].delta.content or ""

    return response

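# --- Text-to-speech ---
# edge_tts talks to Microsoft's online Edge TTS service, so these calls need
# network access; the voices used are en-US-JennyNeural (Miss AI Vietnam)
# and en-GB-LibbyNeural (Miss AI China).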
async def text_to_speech(text, voice, output_file=None):
    """Synthesize `text` with the given voice and save it as an mp3."""
    if output_file is None:
        # Timestamped name so each turn gets its own file; a fixed name would
        # make every entry in audio_files point at the same (last) clip.
        output_file = f"output_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')}.mp3"
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(output_file)
    return output_file

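# pydub needs ffmpeg (or libav) installed to decode and re-encode mp3 files.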
def concatenate_audio_files(audio_files, output_file="final_debate.mp3"):
    """Stitch the per-turn clips into a single mp3 of the whole debate."""
    if not audio_files:
        return None

    final_audio = AudioSegment.empty()
    for audio_file in audio_files:
        final_audio += AudioSegment.from_file(audio_file)

    final_audio.export(output_file, format="mp3")
    return output_file

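# --- Gradio callbacks ---
# start_debate and next_turn both return (status, chat history, latest turn's
# video, latest turn's audio) to match the `outputs` wiring in the UI below.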
def start_debate(topic, position_1, position_2):
    """Reset the debate state and let Miss AI Vietnam give the opening statement."""
    global turn, history, audio_files
    if not topic or not position_1 or not position_2:
        return "Please provide the debate topic and positions for both participants.", [], None, None

    if position_1 == position_2:
        return "The two participants must take opposing positions. Please adjust them.", [], None, None

    turn = "Miss AI Vietnam"
    history = []
    audio_files = []
    initial_message = "Opening Statement"
    response = generate_response(MissAIVietnam, position_1, "Miss AI Vietnam", topic, initial_message)
    history.append((initial_message, response))

    output_audio = asyncio.run(text_to_speech(response, "en-US-JennyNeural"))
    audio_files.append(output_audio)

    output_video = create_video(output_audio, turn)

    return f"The debate has started! {turn} begins.", history, output_video, output_audio

def next_turn(topic, position_1, position_2, current_history):
    """Hand the floor to the other contestant, who rebuts the last reply."""
    global turn, history, audio_files
    if not current_history:
        return "No ongoing debate. Please start a debate first.", [], None, None

    if turn == "Miss AI Vietnam":
        turn = "Miss AI China"
        llm, position, who = MissAIChina, position_2, "Miss AI China"
        voice = "en-GB-LibbyNeural"
    else:
        turn = "Miss AI Vietnam"
        llm, position, who = MissAIVietnam, position_1, "Miss AI Vietnam"
        voice = "en-US-JennyNeural"

    # The opponent's last reply becomes the prompt for the rebuttal.
    last_response = current_history[-1][1]
    response = generate_response(llm, position, who, topic, last_response)
    history.append(("", response))

    output_audio = asyncio.run(text_to_speech(response, voice))
    audio_files.append(output_audio)

    output_video = create_video(output_audio, turn)

    return f"It's now {turn}'s turn.", history, output_video, output_audio

def end_debate():
    """Concatenate every turn's audio into the full debate recording."""
    global audio_files
    if not audio_files:
        return "No debate audio found.", None

    final_audio_file = concatenate_audio_files(audio_files)
    return "The debate has ended. Here is the full debate audio.", final_audio_file

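# --- UI ---
# Left column: debate setup and controls; right column: the transcript and
# the rendered video for the current turn.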
with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Roboto Mono")])) as demo:
    gr.Markdown("# Welcome to The Miss World AI 🗣️🤖")

    with gr.Row():
        with gr.Column(scale=1):
            topic_input = gr.Textbox(label="STEP 1: Debate Topic", placeholder="Enter the debate topic")
            position_1_input = gr.Radio(["For", "Against"], label="STEP 2: Miss AI Vietnam's Position")
            position_2_input = gr.Radio(["For", "Against"], label="STEP 3: Miss AI China's Position")
            start_button = gr.Button("STEP 4: Start", variant="primary")
            next_button = gr.Button("Next Turn")
            end_button = gr.Button("End Debate", variant="stop")
            status_output = gr.Textbox(label="Status", interactive=False)
        with gr.Column(scale=2):
            with gr.Row():
                with gr.Column(scale=7):
                    chatbot = gr.Chatbot(label="Debate Arena", height=500)
                with gr.Column(scale=3):
                    video_output = gr.Video(label="Debate Video", autoplay=True, height=500)

            # The video already carries the turn's audio, so don't autoplay
            # the standalone audio track as well.
            audio_output = gr.Audio(label="Debate Audio", autoplay=False)
            final_audio_output = gr.Audio(label="Full Debate Audio", visible=False)

    start_button.click(
        fn=start_debate,
        inputs=[topic_input, position_1_input, position_2_input],
        outputs=[status_output, chatbot, video_output, audio_output],
    )
    next_button.click(
        fn=next_turn,
        inputs=[topic_input, position_1_input, position_2_input, chatbot],
        outputs=[status_output, chatbot, video_output, audio_output],
    )
    end_button.click(
        fn=end_debate,
        inputs=[],
        outputs=[status_output, final_audio_output],
    ).then(
        # Reveal the full-debate player once the concatenated file is ready.
        lambda: gr.Audio(visible=True), None, final_audio_output
    )

if __name__ == "__main__":
    # share=True serves the app through a temporary public Gradio link.
    demo.launch(share=True)