Spaces:
Runtime error
Runtime error
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,754 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import os
|
| 3 |
+
import asyncio
|
| 4 |
+
import torch
|
| 5 |
+
import io
|
| 6 |
+
import json
|
| 7 |
+
import re
|
| 8 |
+
import httpx
|
| 9 |
+
import tempfile
|
| 10 |
+
import wave
|
| 11 |
+
import base64
|
| 12 |
+
import numpy as np
|
| 13 |
+
import soundfile as sf
|
| 14 |
+
import subprocess
|
| 15 |
+
import shutil
|
| 16 |
+
from dataclasses import dataclass
|
| 17 |
+
from typing import List, Tuple, Dict, Optional
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
from threading import Thread
|
| 20 |
+
from dotenv import load_dotenv
|
| 21 |
+
|
| 22 |
+
# Edge TTS imports
|
| 23 |
+
import edge_tts
|
| 24 |
+
from pydub import AudioSegment
|
| 25 |
+
|
| 26 |
+
# OpenAI imports
|
| 27 |
+
from openai import OpenAI
|
| 28 |
+
|
| 29 |
+
# Transformers imports (for local mode)
|
| 30 |
+
from transformers import (
|
| 31 |
+
AutoModelForCausalLM,
|
| 32 |
+
AutoTokenizer,
|
| 33 |
+
TextIteratorStreamer,
|
| 34 |
+
BitsAndBytesConfig,
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
# Spark TTS imports
|
| 38 |
+
try:
    from huggingface_hub import snapshot_download
    SPARK_AVAILABLE = True
except Exception:
    # Any import-time failure disables Spark-TTS, but KeyboardInterrupt and
    # SystemExit are no longer swallowed as they were by the bare ``except:``.
    SPARK_AVAILABLE = False

# MeloTTS imports (for local mode)
try:
    # Runs the unidic download command before importing melo; the exit
    # status of the command is ignored.
    os.system("python -m unidic download")
    from melo.api import TTS as MeloTTS
    MELO_AVAILABLE = True
except Exception:
    # Same reasoning as above: degrade gracefully on library errors only.
    MELO_AVAILABLE = False
|
| 51 |
+
|
| 52 |
+
load_dotenv()
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
@dataclass
class ConversationConfig:
    """Static settings used by the URL-to-conversation pipeline."""

    # Maximum number of whitespace-separated words of fetched text that
    # synthesize() keeps before building the prompt.
    max_words: int = 6000
    # Prefix prepended to the article URL by fetch_text() before the GET.
    prefix_url: str = "https://r.jina.ai/"
    # Model identifier passed to the chat-completions call in API mode.
    model_name: str = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
    # Hugging Face model id loaded by initialize_local_mode().
    local_model_name: str = "NousResearch/Hermes-2-Pro-Llama-3-8B"
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class UnifiedAudioConverter:
|
| 64 |
+
def __init__(self, config: ConversationConfig):
|
| 65 |
+
self.config = config
|
| 66 |
+
self.llm_client = None
|
| 67 |
+
self.local_model = None
|
| 68 |
+
self.tokenizer = None
|
| 69 |
+
self.melo_models = None
|
| 70 |
+
self.spark_model_dir = None
|
| 71 |
+
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 72 |
+
|
| 73 |
+
def initialize_api_mode(self, api_key: str):
    """Create the OpenAI-compatible client pointed at the Together endpoint.

    Args:
        api_key: Key handed to the client constructor.
    """
    together_endpoint = "https://api.together.xyz/v1"
    self.llm_client = OpenAI(base_url=together_endpoint, api_key=api_key)
|
| 76 |
+
|
| 77 |
+
def initialize_local_mode(self):
    """Load the local causal LM and its tokenizer if not already loaded.

    The model is loaded with a 4-bit BitsAndBytes quantization config using
    float16 compute. The tokenizer is pinned to a fixed revision.
    """
    if self.local_model is None:
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16
        )
        self.local_model = AutoModelForCausalLM.from_pretrained(
            self.config.local_model_name,
            quantization_config=quantization_config
        )

    # Guarded separately from the model: if a previous call loaded the model
    # but failed on the tokenizer, a retry must still load the tokenizer
    # instead of leaving it None forever.
    if self.tokenizer is None:
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.config.local_model_name,
            revision='8ab73a6800796d84448bc936db9bac5ad9f984ae'
        )
|
| 92 |
+
|
| 93 |
+
def initialize_spark_tts(self):
    """Make sure the Spark-TTS weights are on disk and remember their path.

    Raises:
        RuntimeError: If the Spark dependencies are unavailable or the
            download fails.
    """
    if not SPARK_AVAILABLE:
        raise RuntimeError("Spark TTS dependencies not available")

    weights_path = "pretrained_models/Spark-TTS-0.5B"
    already_present = os.path.exists(weights_path)

    # Fetch the snapshot only when the target directory is missing.
    if not already_present:
        print("Downloading Spark-TTS model...")
        try:
            os.makedirs("pretrained_models", exist_ok=True)
            snapshot_download(
                "SparkAudio/Spark-TTS-0.5B",
                local_dir=weights_path,
            )
            print("Spark-TTS model downloaded successfully")
        except Exception as e:
            raise RuntimeError(f"Failed to download Spark-TTS model: {e}")

    self.spark_model_dir = weights_path

    # The CLI script is looked up relative to the working directory; its
    # absence only produces a warning here.
    cli_present = os.path.exists("cli/inference.py")
    if not cli_present:
        print("Warning: Spark-TTS CLI not found. Please clone the Spark-TTS repository.")
|
| 118 |
+
|
| 119 |
+
def initialize_melo_tts(self):
    """Create the English MeloTTS model once, when MeloTTS is importable."""
    if not MELO_AVAILABLE:
        return
    if self.melo_models is not None:
        return

    english_model = MeloTTS(language="EN", device=self.device)
    self.melo_models = {"EN": english_model}
|
| 123 |
+
|
| 124 |
+
def fetch_text(self, url: str) -> str:
    """Download the text for ``url`` through the configured prefix service.

    Args:
        url: Article address; must be non-empty and use http or https.

    Returns:
        The response body as text.

    Raises:
        ValueError: If ``url`` is empty or has another scheme.
        RuntimeError: If the HTTP request fails.
    """
    if not url:
        raise ValueError("URL cannot be empty")

    if not url.startswith(("http://", "https://")):
        raise ValueError("URL must start with 'http://' or 'https://'")

    target = f"{self.config.prefix_url}{url}"
    try:
        reply = httpx.get(target, timeout=60.0)
        reply.raise_for_status()
        return reply.text
    except httpx.HTTPError as e:
        raise RuntimeError(f"Failed to fetch URL: {e}")
|
| 139 |
+
|
| 140 |
+
def _build_prompt(self, text: str, language: str = "English") -> str:
    """Build the user prompt that asks for a podcast conversation as JSON.

    Args:
        text: Source text placed at the start of the prompt.
        language: "Korean" selects the Korean instructions; any other value
            selects the English instructions.

    Returns:
        ``text`` followed by the instructions and a JSON template with a
        "conversation" list of {"speaker", "text"} objects.
    """
    if language == "Korean":
        # NOTE(review): the Korean literals in this branch appear mis-encoded
        # (mojibake) and split across lines in this copy of the file; restore
        # them from the original UTF-8 source before running.
        template = """
{
"conversation": [
{"speaker": "", "text": ""},
{"speaker": "", "text": ""}
]
}
"""
        return (
            f"{text}\n\n์ ๊ณต๋ ํ
์คํธ๋ฅผ ๋ ๋ช
์ ์ ๋ฌธ๊ฐ ๊ฐ์ ์งง๊ณ ์ ์ตํ๋ฉฐ ๋ช
ํํ "
            f"ํ์บ์คํธ ๋ํ๋ก ๋ณํํด๏ฟฝ๏ฟฝ์ธ์. ํค์ ์ ๋ฌธ์ ์ด๊ณ ๋งค๋ ฅ์ ์ด์ด์ผ ํฉ๋๋ค. "
            f"๋ค์ ํ์์ ์ค์ํ๊ณ JSON๋ง ๋ฐํํด์ฃผ์ธ์:\n{template}"
        )
    else:
        # Same JSON template as the Korean branch, with English instructions.
        template = """
{
"conversation": [
{"speaker": "", "text": ""},
{"speaker": "", "text": ""}
]
}
"""
        return (
            f"{text}\n\nConvert the provided text into a short, informative and crisp "
            f"podcast conversation between two experts. The tone should be "
            f"professional and engaging. Please adhere to the following "
            f"format and return ONLY the JSON:\n{template}"
        )
|
| 171 |
+
|
| 172 |
+
def extract_conversation_api(self, text: str, language: str = "English") -> Dict:
    """Ask the API-hosted model for a conversation and parse its JSON reply.

    Args:
        text: Source text to convert.
        language: "Korean" selects the Korean system message; anything else
            selects the English one.

    Returns:
        The first JSON object found in the model reply, decoded to a dict.

    Raises:
        RuntimeError: If API mode is not initialized, or if the request,
            the JSON search, or the decoding fails.
    """
    if not self.llm_client:
        raise RuntimeError("API mode not initialized")

    try:
        # Build the language-specific prompt.
        # NOTE(review): the Korean literal below appears mis-encoded
        # (mojibake) and split across lines in this copy; restore it from
        # the original UTF-8 source.
        if language == "Korean":
            system_message = "๋น์ ์ ํ๊ตญ์ด๋ก ํ์บ์คํธ ๋ํ๋ฅผ ์์ฑํ๋ ์ ๋ฌธ๊ฐ์
๋๋ค. ์์ฐ์ค๋ฝ๊ณ ์ ์ตํ ํ๊ตญ์ด ๋ํ๋ฅผ ๋ง๋ค์ด์ฃผ์ธ์."
        else:
            system_message = "You are an expert at creating podcast conversations in English. Create natural and informative English conversations."

        chat_completion = self.llm_client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": self._build_prompt(text, language)}
            ],
            model=self.config.model_name,
        )

        # Matches a brace-delimited object that may contain brace groups
        # nested one level deep; deeper nesting is not matched as a whole.
        pattern = r"\{(?:[^{}]|(?:\{[^{}]*\}))*\}"
        json_match = re.search(pattern, chat_completion.choices[0].message.content)

        if not json_match:
            raise ValueError("No valid JSON found in response")

        return json.loads(json_match.group())
    except Exception as e:
        # Every failure above, including the ValueError, surfaces as
        # RuntimeError.
        raise RuntimeError(f"Failed to extract conversation: {e}")
|
| 201 |
+
|
| 202 |
+
def extract_conversation_local(self, text: str, language: str = "English", progress=None) -> Dict:
    """Generate a conversation with the local model and parse its JSON output.

    Args:
        text: Source text to convert.
        language: "Korean" selects the Korean system message and fallback;
            anything else selects the English ones.
        progress: Not referenced in this method.

    Returns:
        The first JSON object found in the generated text, decoded to a
        dict, or a fixed two-turn placeholder conversation when no JSON
        object is found.

    Raises:
        RuntimeError: If the local model or tokenizer is not loaded.
    """
    if not self.local_model or not self.tokenizer:
        raise RuntimeError("Local mode not initialized")

    # Language-specific system message.
    # NOTE(review): the Korean literals in this method appear mis-encoded
    # (mojibake) and split across lines in this copy; restore them from the
    # original UTF-8 source.
    if language == "Korean":
        system_message = "๋น์ ์ ํ๊ตญ์ด๋ก ํ์บ์คํธ ๋ํ๋ฅผ ์์ฑํ๋ ์ ๋ฌธ๊ฐ์
๋๋ค. ์์ฐ์ค๋ฝ๊ณ ์ ์ตํ ํ๊ตญ์ด ๋ํ๋ฅผ ๋ง๋ค์ด์ฃผ์ธ์."
    else:
        system_message = "You are an expert at creating podcast conversations in English. Create natural and informative English conversations."

    chat = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": self._build_prompt(text, language)}
    ]

    # Generation stops at either the tokenizer's EOS id or the id of the
    # "<|eot_id|>" token.
    terminators = [
        self.tokenizer.eos_token_id,
        self.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    messages = self.tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True
    )
    model_inputs = self.tokenizer([messages], return_tensors="pt").to(self.device)

    streamer = TextIteratorStreamer(
        self.tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
    )

    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=4000,
        do_sample=True,
        temperature=0.9,
        eos_token_id=terminators,
    )

    # generate() runs in a background thread while this thread drains the
    # streamer; the thread is not joined here.
    t = Thread(target=self.local_model.generate, kwargs=generate_kwargs)
    t.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text

    # Same one-level-nesting JSON object pattern as the API path.
    pattern = r"\{(?:[^{}]|(?:\{[^{}]*\}))*\}"
    json_match = re.search(pattern, partial_text)

    if json_match:
        # json.loads is not guarded here, so malformed JSON propagates its
        # decoding error to the caller.
        return json.loads(json_match.group())
    else:
        # Return a default template based on language
        if language == "Korean":
            return {
                "conversation": [
                    {"speaker": "์งํ์", "text": "์๋
ํ์ธ์, ํ์บ์คํธ์ ์ค์ ๊ฒ์ ํ์ํฉ๋๋ค."},
                    {"speaker": "๊ฒ์คํธ", "text": "์๋
ํ์ธ์, ์ด๋ํด ์ฃผ์
์ ๊ฐ์ฌํฉ๋๋ค."}
                ]
            }
        else:
            return {
                "conversation": [
                    {"speaker": "Host", "text": "Welcome to our podcast."},
                    {"speaker": "Guest", "text": "Thank you for having me."}
                ]
            }
|
| 269 |
+
|
| 270 |
+
def parse_conversation_text(self, conversation_text: str) -> Dict:
    """Parse an edited "Speaker: text" transcript back into conversation JSON.

    Each line containing a colon starts a new turn, split at the first
    colon. A non-empty line without a colon is treated as a continuation of
    the previous turn, so text that wrapped onto several lines is kept
    instead of being dropped. Turns whose text ends up empty are omitted
    because the TTS back ends cannot synthesize them.

    Args:
        conversation_text: Transcript with one turn per line.

    Returns:
        ``{"conversation": [{"speaker": ..., "text": ...}, ...]}``; the list
        is empty when no usable turn is found.
    """
    turns = []

    for raw_line in conversation_text.strip().split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        speaker, separator, text = line.partition(':')
        if separator:
            turns.append({"speaker": speaker.strip(), "text": text.strip()})
        elif turns:
            # Continuation line: append to the most recent turn.
            previous = turns[-1]
            previous["text"] = f"{previous['text']} {line}".strip()
        # A colon-less line before the first turn has no speaker to attach
        # to and is ignored, as before.

    return {"conversation": [turn for turn in turns if turn["text"]]}
|
| 284 |
+
|
| 285 |
+
async def text_to_speech_edge(self, conversation_json: Dict, language: str = "English") -> Tuple[str, str]:
    """Synthesize every turn with Edge TTS and merge the clips into one file.

    Voices alternate by turn index between the two voices configured for
    the language.

    Args:
        conversation_json: Dict with a "conversation" list of turns, each
            holding "text" and optionally "speaker".
        language: "Korean" selects the ko-KR voices; anything else selects
            the en-US voices.

    Returns:
        ``(path_to_combined_wav, transcript)`` where the transcript has one
        "Speaker: text" line per turn.

    Raises:
        RuntimeError: If synthesis, file handling, or merging fails.
    """
    output_dir = Path(self._create_output_directory())
    filenames = []

    try:
        # Per-language voice pair: first entry female, second male.
        if language == "Korean":
            voices = [
                "ko-KR-SunHiNeural",
                "ko-KR-InJoonNeural",
            ]
        else:
            voices = [
                "en-US-AvaMultilingualNeural",
                "en-US-AndrewMultilingualNeural",
            ]

        for i, turn in enumerate(conversation_json["conversation"]):
            filename = output_dir / f"output_{i}.wav"
            voice = voices[i % len(voices)]

            tmp_path = await self._generate_audio_edge(turn["text"], voice)
            # shutil.move instead of os.rename: the clip is created in the
            # system temp directory, which may be on a different filesystem
            # from the output directory, where os.rename raises OSError.
            shutil.move(tmp_path, str(filename))
            filenames.append(str(filename))

        # Combine audio files
        final_output = os.path.join(output_dir, "combined_output.wav")
        self._combine_audio_files(filenames, final_output)

        # Generate conversation text
        conversation_text = "\n".join(
            f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
            for i, turn in enumerate(conversation_json["conversation"])
        )

        return final_output, conversation_text
    except Exception as e:
        raise RuntimeError(f"Failed to convert text to speech: {e}") from e
|
| 324 |
+
|
| 325 |
+
async def _generate_audio_edge(self, text: str, voice: str) -> str:
|
| 326 |
+
"""Generate audio using Edge TTS"""
|
| 327 |
+
if not text.strip():
|
| 328 |
+
raise ValueError("Text cannot be empty")
|
| 329 |
+
|
| 330 |
+
voice_short_name = voice.split(" - ")[0] if " - " in voice else voice
|
| 331 |
+
communicate = edge_tts.Communicate(text, voice_short_name)
|
| 332 |
+
|
| 333 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
|
| 334 |
+
tmp_path = tmp_file.name
|
| 335 |
+
await communicate.save(tmp_path)
|
| 336 |
+
|
| 337 |
+
return tmp_path
|
| 338 |
+
|
| 339 |
+
def text_to_speech_spark(self, conversation_json: Dict, language: str = "English", progress=None) -> Tuple[str, str]:
    """Synthesize every turn by invoking the Spark TTS CLI, then merge the clips.

    Each non-empty turn is rendered by a separate ``python -m cli.inference``
    subprocess. A turn whose subprocess fails, times out, or raises is
    replaced by one second of silence.

    Args:
        conversation_json: Dict with a "conversation" list of turns, each
            holding "text" and optionally "speaker".
        language: "Korean" selects the Korean prompt texts; anything else
            selects the English ones.
        progress: Not referenced in this method.

    Returns:
        ``(path_to_combined_wav, transcript)``.

    Raises:
        RuntimeError: If Spark TTS is unavailable or uninitialized, or if
            any step inside the main ``try`` fails.
    """
    if not SPARK_AVAILABLE or not self.spark_model_dir:
        raise RuntimeError("Spark TTS not available")

    try:
        output_dir = self._create_output_directory()
        audio_files = []

        # Create different voice characteristics for different speakers.
        # Only "prompt_text" is passed to the CLI below; "gender" is not read.
        # NOTE(review): the Korean literals below appear mis-encoded
        # (mojibake) and split across lines in this copy; restore them from
        # the original UTF-8 source.
        if language == "Korean":
            voice_configs = [
                {"prompt_text": "์๋
ํ์ธ์, ์ค๋ ํ์บ์คํธ ์งํ์ ๋งก์ ์งํ์์
๋๋ค.", "gender": "female"},
                {"prompt_text": "์๋
ํ์ธ์, ์ค๋ ๊ฒ์คํธ๋ก ์ฐธ์ฌํ๊ฒ ๋์ด ๊ธฐ์ฉ๋๋ค.", "gender": "male"}
            ]
        else:
            voice_configs = [
                {"prompt_text": "Hello, welcome to our podcast. I'm your host today.", "gender": "female"},
                {"prompt_text": "Thank you for having me. I'm excited to be here.", "gender": "male"}
            ]

        for i, turn in enumerate(conversation_json["conversation"]):
            text = turn["text"]
            # Blank turns produce no clip at all.
            if not text.strip():
                continue

            # Use different voice config for each speaker (alternates by
            # turn index).
            voice_config = voice_configs[i % len(voice_configs)]

            output_file = os.path.join(output_dir, f"spark_output_{i}.wav")

            # Run Spark TTS CLI inference
            cmd = [
                "python", "-m", "cli.inference",
                "--text", text,
                "--device", "0" if torch.cuda.is_available() else "cpu",
                "--save_dir", output_dir,
                "--model_dir", self.spark_model_dir,
                "--prompt_text", voice_config["prompt_text"],
                "--output_name", f"spark_output_{i}.wav"
            ]

            try:
                # Run the command
                result = subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    timeout=60,
                    cwd="."  # Make sure we're in the right directory
                )

                if result.returncode == 0:
                    # presumably the CLI wrote output_file — the file's
                    # existence is not checked here.
                    audio_files.append(output_file)
                else:
                    print(f"Spark TTS error for turn {i}: {result.stderr}")
                    # Create a short silence as fallback
                    silence = np.zeros(int(22050 * 1.0))  # 1 second of silence
                    sf.write(output_file, silence, 22050)
                    audio_files.append(output_file)

            except subprocess.TimeoutExpired:
                print(f"Spark TTS timeout for turn {i}")
                # Create silence as fallback
                silence = np.zeros(int(22050 * 1.0))
                sf.write(output_file, silence, 22050)
                audio_files.append(output_file)
            except Exception as e:
                print(f"Error running Spark TTS for turn {i}: {e}")
                # Create silence as fallback
                silence = np.zeros(int(22050 * 1.0))
                sf.write(output_file, silence, 22050)
                audio_files.append(output_file)

        # Combine all audio files
        if audio_files:
            final_output = os.path.join(output_dir, "spark_combined.wav")
            self._combine_audio_files(audio_files, final_output)
        else:
            raise RuntimeError("No audio files generated")

        # Generate conversation text
        conversation_text = "\n".join(
            f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
            for i, turn in enumerate(conversation_json["conversation"])
        )

        return final_output, conversation_text

    except Exception as e:
        raise RuntimeError(f"Failed to convert text to speech with Spark TTS: {e}")
|
| 430 |
+
|
| 431 |
+
def text_to_speech_melo(self, conversation_json: Dict, progress=None) -> Tuple[str, str]:
    """Synthesize every turn with the English MeloTTS model and export an MP3.

    Speakers alternate by turn index between "EN-Default" and "EN-US".

    Args:
        conversation_json: Dict with a "conversation" list of turns, each
            holding "text" and optionally "speaker".
        progress: Optional object exposing ``tqdm``; forwarded to MeloTTS as
            its progress bar when given.

    Returns:
        ``(path_to_mp3, transcript)``.

    Raises:
        RuntimeError: If MeloTTS is unavailable or uninitialized.
    """
    if not MELO_AVAILABLE or not self.melo_models:
        raise RuntimeError("MeloTTS not available")

    speakers = ["EN-Default", "EN-US"]
    model = self.melo_models["EN"]
    combined_audio = AudioSegment.empty()

    for i, turn in enumerate(conversation_json["conversation"]):
        bio = io.BytesIO()
        speaker_id = model.hps.data.spk2id[speakers[i % len(speakers)]]

        # Generate audio into the in-memory buffer.
        model.tts_to_file(
            turn["text"], speaker_id, bio, speed=1.0,
            pbar=progress.tqdm if progress else None,
            format="wav"
        )

        bio.seek(0)
        combined_audio += AudioSegment.from_file(bio, format="wav")

    # Save into a per-call directory, like the other engines do. A fixed
    # file name in the working directory would be overwritten when several
    # requests run concurrently.
    final_audio_path = os.path.join(self._create_output_directory(), "melo_podcast.mp3")
    combined_audio.export(final_audio_path, format="mp3")

    # Generate conversation text
    conversation_text = "\n".join(
        f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
        for i, turn in enumerate(conversation_json["conversation"])
    )

    return final_audio_path, conversation_text
|
| 467 |
+
|
| 468 |
+
def _create_output_directory(self) -> str:
|
| 469 |
+
"""Create a unique output directory"""
|
| 470 |
+
random_bytes = os.urandom(8)
|
| 471 |
+
folder_name = base64.urlsafe_b64encode(random_bytes).decode("utf-8")
|
| 472 |
+
os.makedirs(folder_name, exist_ok=True)
|
| 473 |
+
return folder_name
|
| 474 |
+
|
| 475 |
+
def _combine_audio_files(self, filenames: List[str], output_file: str) -> None:
|
| 476 |
+
"""Combine multiple audio files into one"""
|
| 477 |
+
if not filenames:
|
| 478 |
+
raise ValueError("No input files provided")
|
| 479 |
+
|
| 480 |
+
try:
|
| 481 |
+
audio_segments = []
|
| 482 |
+
for filename in filenames:
|
| 483 |
+
if os.path.exists(filename):
|
| 484 |
+
audio_segment = AudioSegment.from_file(filename)
|
| 485 |
+
audio_segments.append(audio_segment)
|
| 486 |
+
|
| 487 |
+
if audio_segments:
|
| 488 |
+
combined = sum(audio_segments)
|
| 489 |
+
combined.export(output_file, format="wav")
|
| 490 |
+
|
| 491 |
+
# Clean up temporary files
|
| 492 |
+
for filename in filenames:
|
| 493 |
+
if os.path.exists(filename):
|
| 494 |
+
os.remove(filename)
|
| 495 |
+
|
| 496 |
+
except Exception as e:
|
| 497 |
+
raise RuntimeError(f"Failed to combine audio files: {e}")
|
| 498 |
+
|
| 499 |
+
|
| 500 |
+
# Global converter instance, built with the default ConversationConfig and
# used by synthesize() and regenerate_audio(); its models and clients are
# attached later by the initialize_* methods.
converter = UnifiedAudioConverter(ConversationConfig())
|
| 502 |
+
|
| 503 |
+
|
| 504 |
+
async def synthesize(article_url: str, mode: str = "API", tts_engine: str = "Edge-TTS", language: str = "English"):
    """Fetch an article and turn it into an editable conversation transcript.

    Args:
        article_url: Address of the article to convert.
        mode: "API" uses the hosted model; any other value uses the local one.
        tts_engine: Accepted for interface symmetry; not used here.
        language: Forwarded to the conversation extractor.

    Returns:
        ``(text, None)`` where ``text`` is the transcript on success or a
        human-readable message on failure.
    """
    if not article_url:
        return "Please provide a valid URL.", None

    try:
        # Fetch the article and cap it at the configured word budget.
        text = converter.fetch_text(article_url)
        tokens = text.split()
        if len(tokens) > converter.config.max_words:
            text = " ".join(tokens[:converter.config.max_words])

        # Pick the extraction back end.
        if mode != "API":
            converter.initialize_local_mode()
            conversation_json = converter.extract_conversation_local(text, language)
        else:
            api_key = os.environ.get("TOGETHER_API_KEY")
            if not api_key:
                return "API key not found. Please set TOGETHER_API_KEY environment variable.", None
            converter.initialize_api_mode(api_key)
            conversation_json = converter.extract_conversation_api(text, language)

        # Render one "Speaker: text" line per turn.
        rendered = []
        for i, turn in enumerate(conversation_json["conversation"]):
            rendered.append(f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}")

        return "\n".join(rendered), None

    except Exception as e:
        return f"Error: {str(e)}", None
|
| 539 |
+
|
| 540 |
+
|
| 541 |
+
async def regenerate_audio(conversation_text: str, tts_engine: str = "Edge-TTS", language: str = "English"):
    """Regenerate audio from edited conversation text.

    Args:
        conversation_text: Transcript with "Speaker: text" lines.
        tts_engine: "Edge-TTS", "Spark-TTS", or any other value for MeloTTS.
        language: Output language; Korean is only accepted with Edge-TTS.

    Returns:
        ``(status_message, audio_path)``; ``audio_path`` is None whenever
        audio was not produced.
    """
    if not conversation_text.strip():
        return "Please provide conversation text.", None

    try:
        # Parse the conversation text back to JSON format
        conversation_json = converter.parse_conversation_text(conversation_text)

        if not conversation_json["conversation"]:
            return "No valid conversation found in the text.", None

        # For Korean, only Edge-TTS is used (other engines have limited
        # Korean support).
        # NOTE(review): the Korean literal below appears mis-encoded
        # (mojibake) in this copy; restore it from the original UTF-8 source.
        if language == "Korean" and tts_engine != "Edge-TTS":
            return "ํ๊ตญ์ด๋ Edge-TTS๋ง ์ง์๋ฉ๋๋ค. TTS ์์ง์ด ์๋์ผ๋ก Edge-TTS๋ก ๋ณ๊ฒฝ๋ฉ๋๋ค.", None

        # Generate audio based on TTS engine
        if tts_engine == "Edge-TTS":
            output_file, _ = await converter.text_to_speech_edge(conversation_json, language)
        elif tts_engine == "Spark-TTS":
            if not SPARK_AVAILABLE:
                return "Spark TTS not available. Please install required dependencies and clone the Spark-TTS repository.", None
            converter.initialize_spark_tts()
            output_file, _ = converter.text_to_speech_spark(conversation_json, language)
        else:  # MeloTTS
            if not MELO_AVAILABLE:
                return "MeloTTS not available. Please install required dependencies.", None
            # NOTE(review): with the Korean guard above, this branch is
            # never reached with language == "Korean", so the check below
            # cannot fire.
            if language == "Korean":
                return "MeloTTS does not support Korean. Please use Edge-TTS for Korean.", None
            converter.initialize_melo_tts()
            output_file, _ = converter.text_to_speech_melo(conversation_json)

        return "Audio generated successfully!", output_file

    except Exception as e:
        # Any failure is reported as a status string instead of raising.
        return f"Error generating audio: {str(e)}", None
|
| 577 |
+
|
| 578 |
+
|
| 579 |
+
def synthesize_sync(article_url: str, mode: str = "API", tts_engine: str = "Edge-TTS", language: str = "English"):
    """Run the async ``synthesize`` coroutine to completion and return its result."""
    pending = synthesize(article_url, mode, tts_engine, language)
    return asyncio.run(pending)
|
| 582 |
+
|
| 583 |
+
|
| 584 |
+
def regenerate_audio_sync(conversation_text: str, tts_engine: str = "Edge-TTS", language: str = "English"):
    """Run the async ``regenerate_audio`` coroutine to completion and return its result."""
    pending = regenerate_audio(conversation_text, tts_engine, language)
    return asyncio.run(pending)
|
| 587 |
+
|
| 588 |
+
|
| 589 |
+
def update_tts_engine_for_korean(language):
    """Update the TTS engine options when Korean is selected.

    Args:
        language: Currently selected language.

    Returns:
        A ``gr.Radio`` locked to Edge-TTS for Korean, or an interactive one
        offering all three engines otherwise; both default to Edge-TTS.
    """
    if language == "Korean":
        # NOTE(review): the ``info`` literal below appears mis-encoded
        # (mojibake) in this copy; restore it from the original UTF-8 source.
        return gr.Radio(
            choices=["Edge-TTS"],
            value="Edge-TTS",
            label="TTS Engine",
            info="ํ๊ตญ์ด๋ Edge-TTS๋ง ์ง์๋ฉ๋๋ค",
            interactive=False
        )
    else:
        return gr.Radio(
            choices=["Edge-TTS", "Spark-TTS", "MeloTTS"],
            value="Edge-TTS",
            label="TTS Engine",
            info="Edge-TTS: Cloud-based, natural voices | Spark-TTS: Local AI model | MeloTTS: Local, requires GPU",
            interactive=True
        )
|
| 607 |
+
|
| 608 |
+
|
| 609 |
+
# Gradio Interface
|
| 610 |
+
with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
|
| 611 |
+
gr.Markdown("# ๐๏ธ URL to Podcast Converter")
|
| 612 |
+
gr.Markdown("Convert any article, blog, or news into an engaging podcast conversation!")
|
| 613 |
+
|
| 614 |
+
with gr.Row():
|
| 615 |
+
with gr.Column(scale=3):
|
| 616 |
+
url_input = gr.Textbox(
|
| 617 |
+
label="Article URL",
|
| 618 |
+
placeholder="Enter the article URL here...",
|
| 619 |
+
value=""
|
| 620 |
+
)
|
| 621 |
+
with gr.Column(scale=1):
|
| 622 |
+
# Add language selection
|
| 623 |
+
language_selector = gr.Radio(
|
| 624 |
+
choices=["English", "Korean"],
|
| 625 |
+
value="English",
|
| 626 |
+
label="Language / ์ธ์ด",
|
| 627 |
+
info="Select output language / ์ถ๋ ฅ ์ธ์ด๋ฅผ ์ ํํ์ธ์"
|
| 628 |
+
)
|
| 629 |
+
|
| 630 |
+
mode_selector = gr.Radio(
|
| 631 |
+
choices=["API", "Local"],
|
| 632 |
+
value="API",
|
| 633 |
+
label="Processing Mode",
|
| 634 |
+
info="API: Faster, requires API key | Local: Slower, runs on device"
|
| 635 |
+
)
|
| 636 |
+
|
| 637 |
+
# TTS engine selection
|
| 638 |
+
with gr.Group():
|
| 639 |
+
gr.Markdown("### TTS Engine Selection")
|
| 640 |
+
tts_selector = gr.Radio(
|
| 641 |
+
choices=["Edge-TTS", "Spark-TTS", "MeloTTS"],
|
| 642 |
+
value="Edge-TTS",
|
| 643 |
+
label="TTS Engine",
|
| 644 |
+
info="Edge-TTS: Cloud-based, natural voices | Spark-TTS: Local AI model | MeloTTS: Local, requires GPU"
|
| 645 |
+
)
|
| 646 |
+
|
| 647 |
+
gr.Markdown("""
|
| 648 |
+
**Recommended:**
|
| 649 |
+
- ๐ **Edge-TTS**: Best quality, cloud-based, instant setup
|
| 650 |
+
- ๐ค **Spark-TTS**: Local AI model (0.5B), zero-shot voice cloning
|
| 651 |
+
|
| 652 |
+
**Additional Option:**
|
| 653 |
+
- โก **MeloTTS**: Local processing, GPU recommended
|
| 654 |
+
|
| 655 |
+
**ํ๊ตญ์ด ์ง์:**
|
| 656 |
+
- ๐ฐ๐ท ํ๊ตญ์ด ์ ํ ์ Edge-TTS๋ง ์ฌ์ฉ ๊ฐ๋ฅํฉ๋๋ค
|
| 657 |
+
""")
|
| 658 |
+
|
| 659 |
+
convert_btn = gr.Button("๐ฏ Generate Conversation / ๋ํ ์์ฑ", variant="primary", size="lg")
|
| 660 |
+
|
| 661 |
+
with gr.Row():
|
| 662 |
+
with gr.Column():
|
| 663 |
+
conversation_output = gr.Textbox(
|
| 664 |
+
label="Generated Conversation (Editable) / ์์ฑ๋ ๋ํ (ํธ์ง ๊ฐ๋ฅ)",
|
| 665 |
+
lines=15,
|
| 666 |
+
max_lines=30,
|
| 667 |
+
interactive=True,
|
| 668 |
+
placeholder="Generated conversation will appear here. You can edit it before generating audio.\n์์ฑ๋ ๋ํ๊ฐ ์ฌ๊ธฐ์ ํ์๋ฉ๋๋ค. ์ค๋์ค ์์ฑ ์ ์ ํธ์งํ ์ ์์ต๋๋ค.",
|
| 669 |
+
info="Edit the conversation as needed. Format: 'Speaker Name: Text' / ํ์์ ๋ฐ๋ผ ๋ํ๋ฅผ ํธ์งํ์ธ์. ํ์: 'ํ์ ์ด๋ฆ: ํ
์คํธ'"
|
| 670 |
+
)
|
| 671 |
+
|
| 672 |
+
# Add audio generation button
|
| 673 |
+
with gr.Row():
|
| 674 |
+
generate_audio_btn = gr.Button("๐๏ธ Generate Audio from Text / ํ
์คํธ์์ ์ค๋์ค ์์ฑ", variant="secondary", size="lg")
|
| 675 |
+
gr.Markdown("*Edit the conversation above, then click to generate audio / ์์ ๋ํ๋ฅผ ํธ์งํ ํ ํด๋ฆญํ์ฌ ์ค๋์ค๋ฅผ ์์ฑํ์ธ์*")
|
| 676 |
+
|
| 677 |
+
with gr.Column():
|
| 678 |
+
audio_output = gr.Audio(
|
| 679 |
+
label="Podcast Audio / ํ์บ์คํธ ์ค๋์ค",
|
| 680 |
+
type="filepath",
|
| 681 |
+
interactive=False
|
| 682 |
+
)
|
| 683 |
+
|
| 684 |
+
# Add status message
|
| 685 |
+
status_output = gr.Textbox(
|
| 686 |
+
label="Status / ์ํ",
|
| 687 |
+
interactive=False,
|
| 688 |
+
visible=True
|
| 689 |
+
)
|
| 690 |
+
|
| 691 |
+
# Add per-engine TTS descriptions and setup instructions
|
| 692 |
+
with gr.Row():
|
| 693 |
+
gr.Markdown("""
|
| 694 |
+
### TTS Engine Details / TTS ์์ง ์์ธ์ ๋ณด:
|
| 695 |
+
|
| 696 |
+
- **Edge-TTS**: Microsoft's cloud TTS service with high-quality natural voices. Requires internet connection.
|
| 697 |
+
- ๐ฐ๐ท **ํ๊ตญ์ด ์ง์**: ์์ฐ์ค๋ฌ์ด ํ๊ตญ์ด ์์ฑ (์ฌ์ฑ: SunHi, ๋จ์ฑ: InJoon)
|
| 698 |
+
- **Spark-TTS**: SparkAudio's local AI model (0.5B parameters) with zero-shot voice cloning capability.
|
| 699 |
+
- **Setup required**: Clone [Spark-TTS repository](https://github.com/SparkAudio/Spark-TTS) in current directory
|
| 700 |
+
- Features: Bilingual support (Chinese/English), controllable speech generation
|
| 701 |
+
- License: CC BY-NC-SA (Non-commercial use only)
|
| 702 |
+
- โ ๏ธ **ํ๊ตญ์ด ๋ฏธ์ง์**
|
| 703 |
+
- **MeloTTS**: Local TTS with multiple voice options. GPU recommended for better performance.
|
| 704 |
+
- โ ๏ธ **ํ๊ตญ์ด ๋ฏธ์ง์**
|
| 705 |
+
|
| 706 |
+
### Spark-TTS Setup Instructions:
|
| 707 |
+
```bash
|
| 708 |
+
git clone https://github.com/SparkAudio/Spark-TTS.git
|
| 709 |
+
cd Spark-TTS
|
| 710 |
+
pip install -r requirements.txt
|
| 711 |
+
```
|
| 712 |
+
""")
|
| 713 |
+
|
| 714 |
+
gr.Examples(
|
| 715 |
+
examples=[
|
| 716 |
+
["https://huggingface.co/blog/openfree/cycle-navigator", "API", "Edge-TTS", "English"],
|
| 717 |
+
["https://www.bbc.com/news/technology-67988517", "API", "Spark-TTS", "English"],
|
| 718 |
+
["https://arxiv.org/abs/2301.00810", "API", "Edge-TTS", "Korean"],
|
| 719 |
+
],
|
| 720 |
+
inputs=[url_input, mode_selector, tts_selector, language_selector],
|
| 721 |
+
outputs=[conversation_output, status_output],
|
| 722 |
+
fn=synthesize_sync,
|
| 723 |
+
cache_examples=False,
|
| 724 |
+
)
|
| 725 |
+
|
| 726 |
+
# Update the TTS engine options when the language changes
|
| 727 |
+
language_selector.change(
|
| 728 |
+
fn=update_tts_engine_for_korean,
|
| 729 |
+
inputs=[language_selector],
|
| 730 |
+
outputs=[tts_selector]
|
| 731 |
+
)
|
| 732 |
+
|
| 733 |
+
# Wire up events
|
| 734 |
+
convert_btn.click(
|
| 735 |
+
fn=synthesize_sync,
|
| 736 |
+
inputs=[url_input, mode_selector, tts_selector, language_selector],
|
| 737 |
+
outputs=[conversation_output, status_output]
|
| 738 |
+
)
|
| 739 |
+
|
| 740 |
+
generate_audio_btn.click(
|
| 741 |
+
fn=regenerate_audio_sync,
|
| 742 |
+
inputs=[conversation_output, tts_selector, language_selector],
|
| 743 |
+
outputs=[status_output, audio_output]
|
| 744 |
+
)
|
| 745 |
+
|
| 746 |
+
|
| 747 |
+
# Launch the app
if __name__ == "__main__":
    # Enable the request queue, then serve on all network interfaces at
    # port 7860 without creating a public share link.
    demo.queue(api_open=True, default_concurrency_limit=10).launch(
        show_api=True,
        share=False,
        server_name="0.0.0.0",
        server_port=7860
    )
|