Spaces:
Sleeping
Sleeping
Commit
·
5dfc9ca
1
Parent(s):
59378f1
Switching back to Parler TTS
Browse files
- app.py +43 -17
- requirements.txt +11 -0
- setup.py +0 -31
app.py
CHANGED
|
@@ -11,6 +11,11 @@ import nltk
|
|
| 11 |
import io
|
| 12 |
|
| 13 |
from transformers import BlipProcessor, BlipForConditionalGeneration
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
# Set environment variables
|
| 16 |
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
@@ -91,29 +96,52 @@ def generate_roast(caption, llm_components):
|
|
| 91 |
|
| 92 |
return response
|
| 93 |
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
-
def
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
caption = analyze_image(image, vision_components)
|
| 109 |
roast = generate_roast(caption, llm_components)
|
| 110 |
-
audio = text_to_speech(roast
|
| 111 |
return caption, roast, audio
|
| 112 |
|
| 113 |
def setup_processing_chain(video_feed, analysis_output, roast_output, audio_output):
|
| 114 |
vision_components = initialize_vision_model()
|
| 115 |
llm_components = initialize_llm()
|
| 116 |
-
tts_model, speaker_ids = initialize_tts_model()
|
| 117 |
last_process_time = time.time() - 10
|
| 118 |
processing_interval = 5
|
| 119 |
def process_webcam(image):
|
|
@@ -124,9 +152,7 @@ def setup_processing_chain(video_feed, analysis_output, roast_output, audio_outp
|
|
| 124 |
caption, roast, audio = process_frame(
|
| 125 |
image,
|
| 126 |
vision_components,
|
| 127 |
-
llm_components
|
| 128 |
-
tts_model,
|
| 129 |
-
'EN-US' # Default accent
|
| 130 |
)
|
| 131 |
return image, caption, roast, audio
|
| 132 |
return image, None, None, None
|
|
|
|
| 11 |
import io
|
| 12 |
|
| 13 |
from transformers import BlipProcessor, BlipForConditionalGeneration
|
| 14 |
+
from parler_tts import ParlerTTSForConditionalGeneration
|
| 15 |
+
from transformers import AutoFeatureExtractor, set_seed
|
| 16 |
+
from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
|
| 17 |
+
from string import punctuation
|
| 18 |
+
import re
|
| 19 |
|
| 20 |
# Set environment variables
|
| 21 |
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
|
|
| 96 |
|
| 97 |
return response
|
| 98 |
|
| 99 |
+
# ---- Parler-TTS setup (module level: model loads once at import time) ----
parler_device = "cuda:0" if torch.cuda.is_available() else "cpu"
parler_repo_id = "parler-tts/parler-tts-mini-expresso"

# Model, tokenizer and feature extractor all come from the same checkpoint.
parler_model = ParlerTTSForConditionalGeneration.from_pretrained(parler_repo_id).to(parler_device)
parler_tokenizer = AutoTokenizer.from_pretrained(parler_repo_id)
parler_feature_extractor = AutoFeatureExtractor.from_pretrained(parler_repo_id)

# Output sample rate is dictated by the checkpoint's feature extractor.
PARLER_SAMPLE_RATE = parler_feature_extractor.sampling_rate
# Fixed seed so the generated voice is reproducible across calls.
PARLER_SEED = 42
parler_number_normalizer = EnglishNumberNormalizer()
|
| 108 |
|
| 109 |
+
def parler_preprocess(text):
    """Normalize `text` for Parler-TTS.

    Spells out numbers, guarantees terminal punctuation, and spaces out
    ALL-CAPS abbreviations ("NASA" -> "N A S A") so the model reads them
    letter by letter.
    """
    text = parler_number_normalizer(text).strip()
    # Parler prosody is better when the prompt ends in punctuation.
    if text and text[-1] not in punctuation:
        text += "."
    # Replace every detected abbreviation with its spaced-out letters.
    # (str.replace on an absent substring is a no-op, so no guard needed.)
    for abbr in re.findall(r'\b[A-Z][A-Z\.]+\b', text):
        spelled_out = " ".join(abbr.replace(".", ""))
        text = text.replace(abbr, spelled_out)
    return text
|
| 122 |
|
| 123 |
+
def text_to_speech(text):
    """Synthesize `text` with Parler-TTS in a fixed nagging-mom voice.

    Returns a `(sample_rate, numpy_audio_array)` tuple, the shape Gradio's
    Audio component accepts directly.
    """
    # Voice/style description steering the generation (Asian mom nagging style).
    description = ("Elisabeth speaks in a mature, strict, nagging, and slightly disappointed tone, "
                   "with a hint of love and high expectations, at a moderate pace with high quality audio. "
                   "She sounds like a stereotypical Asian mother who compares you to your cousins, "
                   "questions your life choices, and threatens you with a slipper, but ultimately wants the best for you.")
    # The description and the (normalized) prompt are tokenized separately:
    # Parler conditions on the description via input_ids and speaks the prompt.
    description_ids = parler_tokenizer(description, return_tensors="pt").to(parler_device)
    prompt_ids = parler_tokenizer(parler_preprocess(text), return_tensors="pt").to(parler_device)
    set_seed(PARLER_SEED)  # deterministic voice across calls
    generation = parler_model.generate(
        input_ids=description_ids.input_ids,
        prompt_input_ids=prompt_ids.input_ids,
    )
    waveform = generation.cpu().numpy().squeeze()
    return (PARLER_SAMPLE_RATE, waveform)
|
| 135 |
+
|
| 136 |
+
def process_frame(image, vision_components, llm_components):
    """Run the full pipeline on one frame: caption -> roast -> speech.

    Returns a `(caption, roast, audio)` tuple, where `audio` is the
    `(sample_rate, array)` pair produced by `text_to_speech`.
    """
    description = analyze_image(image, vision_components)
    insult = generate_roast(description, llm_components)
    return description, insult, text_to_speech(insult)
|
| 141 |
|
| 142 |
def setup_processing_chain(video_feed, analysis_output, roast_output, audio_output):
|
| 143 |
vision_components = initialize_vision_model()
|
| 144 |
llm_components = initialize_llm()
|
|
|
|
| 145 |
last_process_time = time.time() - 10
|
| 146 |
processing_interval = 5
|
| 147 |
def process_webcam(image):
|
|
|
|
| 152 |
caption, roast, audio = process_frame(
|
| 153 |
image,
|
| 154 |
vision_components,
|
| 155 |
+
llm_components
|
|
|
|
|
|
|
| 156 |
)
|
| 157 |
return image, caption, roast, audio
|
| 158 |
return image, None, None, None
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
git+https://github.com/huggingface/parler-tts.git
|
| 2 |
+
accelerate
|
| 3 |
+
torch==2.1.0
|
| 4 |
+
transformers
|
| 5 |
+
gradio
|
| 6 |
+
pillow
|
| 7 |
+
numpy<2
|
| 8 |
+
opencv-python
|
| 9 |
+
nltk
|
| 10 |
+
unidic-lite
|
| 11 |
+
torchvision
|
setup.py
DELETED
|
@@ -1,31 +0,0 @@
|
|
| 1 |
-
"""setup.py for the `melo` package.

Fix: the PostInstallCommand / PostDevelopCommand hooks were defined but never
registered with setup(), so `python -m unidic download` never ran after
install. They are now wired in via `cmdclass`. The requirements file is also
read with a context manager instead of a leaked file handle.
"""
import os

from setuptools import find_packages, setup
from setuptools.command.develop import develop
from setuptools.command.install import install

# Read install requirements from the requirements.txt next to this file.
cwd = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(cwd, "requirements.txt"), "r") as f:
    requirements = f.readlines()


class PostInstallCommand(install):
    """Post-installation for installation mode."""

    def run(self):
        install.run(self)
        # Download the unidic dictionary required for Japanese tokenization.
        os.system('python -m unidic download')


class PostDevelopCommand(develop):
    """Post-installation for development mode."""

    def run(self):
        develop.run(self)
        os.system('python -m unidic download')


setup(
    name='melo',
    version='0.1.0',
    packages=find_packages(),
    include_package_data=True,
    install_requires=requirements,
    package_data={
        '': ['*.txt', 'cmudict_*'],
    },
    # Register the post-install hooks so the unidic download actually runs.
    cmdclass={
        'install': PostInstallCommand,
        'develop': PostDevelopCommand,
    },
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|