Spaces: Bradarr / Running on Zero

Bradarr committed · Commit 09bb564 · verified · 1 parent: ef55fce

Update app.py

Files changed (1):
  app.py (+159, -224)
app.py CHANGED
@@ -1,5 +1,4 @@
  import os
-
  import gradio as gr
  import numpy as np
  import spaces
@@ -8,251 +7,187 @@ import torchaudio
  from generator import Segment, load_csm_1b
  from huggingface_hub import hf_hub_download, login
  from watermarking import watermark

- api_key = os.getenv("HF_TOKEN")
- gpu_timeout = int(os.getenv("GPU_TIMEOUT", 60))
- CSM_1B_HF_WATERMARK = list(map(int, os.getenv("WATERMARK_KEY").split(" ")))
-
- login(token=api_key)
-
- SPACE_INTRO_TEXT = """\
- # Sesame CSM 1B
-
- Generate from CSM 1B (Conversational Speech Model).
- Code is available on GitHub: [SesameAILabs/csm](https://github.com/SesameAILabs/csm).
- Checkpoint is [hosted on HuggingFace](https://huggingface.co/sesame/csm-1b).

- Try out our interactive demo [sesame.com/voicedemo](https://www.sesame.com/voicedemo),
- this uses a fine-tuned variant of CSM.

- The model has some capacity for non-English languages due to data contamination in the training
- data, but it is likely not to perform well.

- ---

- """

- CONVO_INTRO_TEXT = """\
- ## Conversation content

- Each line is an utterance in the conversation to generate. Speakers alternate between A and B, starting with speaker A.
  """

- DEFAULT_CONVERSATION = """\
- Hey how are you doing.
- Pretty good, pretty good.
- I'm great, so happy to be speaking to you.
- Me too, this is some cool stuff huh?
- Yeah, I've been reading more about speech generation, and it really seems like context is important.
- Definitely.
- """

- SPEAKER_PROMPTS = {
-     "conversational_a": {
-         "text": (
-             "like revising for an exam I'd have to try and like keep up the momentum because I'd "
-             "start really early I'd be like okay I'm gonna start revising now and then like "
-             "you're revising for ages and then I just like start losing steam I didn't do that "
-             "for the exam we had recently to be fair that was a more of a last minute scenario "
-             "but like yeah I'm trying to like yeah I noticed this yesterday that like Mondays I "
-             "sort of start the day with this not like a panic but like a"
-         ),
-         "audio": "prompts/conversational_a.wav",
-     },
-     "conversational_b": {
-         "text": (
-             "like a super Mario level. Like it's very like high detail. And like, once you get "
-             "into the park, it just like, everything looks like a computer game and they have all "
-             "these, like, you know, if, if there's like a, you know, like in a Mario game, they "
-             "will have like a question block. And if you like, you know, punch it, a coin will "
-             "come out. So like everyone, when they come into the park, they get like this little "
-             "bracelet and then you can go punching question blocks around."
-         ),
-         "audio": "prompts/conversational_b.wav",
-     },
-     "read_speech_a": {
-         "text": (
-             "And Lake turned round upon me, a little abruptly, his odd yellowish eyes, a little "
-             "like those of the sea eagle, and the ghost of his smile that flickered on his "
-             "singularly pale face, with a stern and insidious look, confronted me."
-         ),
-         "audio": "prompts/read_speech_a.wav",
-     },
-     "read_speech_b": {
-         "text": (
-             "He was such a big boy that he wore high boots and carried a jack knife. He gazed and "
-             "gazed at the cap, and could not keep from fingering the blue tassel."
-         ),
-         "audio": "prompts/read_speech_b.wav",
-     },
-     "read_speech_c": {
-         "text": (
-             "All passed so quickly, there was so much going on around him, the Tree quite forgot "
-             "to look to himself."
-         ),
-         "audio": "prompts/read_speech_c.wav",
-     },
-     "read_speech_d": {
-         "text": (
-             "Suddenly I was back in the old days Before you felt we ought to drift apart. It was "
-             "some trick-the way your eyebrows raise."
-         ),
-         "audio": "prompts/read_speech_d.wav",
-     },
- }
-
- device = "cuda" if torch.cuda.is_available() else "cpu"
- model_path = hf_hub_download(repo_id="sesame/csm-1b", filename="ckpt.pt")
- generator = load_csm_1b(model_path, device)


  @spaces.GPU(duration=gpu_timeout)
- def infer(
-     text_prompt_speaker_a,
-     text_prompt_speaker_b,
-     audio_prompt_speaker_a,
-     audio_prompt_speaker_b,
-     gen_conversation_input,
- ) -> tuple[np.ndarray, int]:
-     # Estimate token limit, otherwise failure might happen after many utterances have been generated.
-     if len(gen_conversation_input.strip() + text_prompt_speaker_a.strip() + text_prompt_speaker_b.strip()) >= 2000:
-         raise gr.Error("Prompts and conversation too long.", duration=30)
-
      try:
-         return _infer(
-             text_prompt_speaker_a,
-             text_prompt_speaker_b,
-             audio_prompt_speaker_a,
-             audio_prompt_speaker_b,
-             gen_conversation_input,
-         )
-     except ValueError as e:
-         raise gr.Error(f"Error generating audio: {e}", duration=120)
-
-
- def _infer(
-     text_prompt_speaker_a,
-     text_prompt_speaker_b,
-     audio_prompt_speaker_a,
-     audio_prompt_speaker_b,
-     gen_conversation_input,
- ) -> tuple[np.ndarray, int]:
-     audio_prompt_a = prepare_prompt(text_prompt_speaker_a, 0, audio_prompt_speaker_a)
-     audio_prompt_b = prepare_prompt(text_prompt_speaker_b, 1, audio_prompt_speaker_b)
-
-     prompt_segments: list[Segment] = [audio_prompt_a, audio_prompt_b]
-     generated_segments: list[Segment] = []
-
-     conversation_lines = [line.strip() for line in gen_conversation_input.strip().split("\n") if line.strip()]
-     for i, line in enumerate(conversation_lines):
-         # Alternating speakers A and B, starting with A
-         speaker_id = i % 2
-
-         audio_tensor = generator.generate(
-             text=line,
-             speaker=speaker_id,
-             context=prompt_segments + generated_segments,
-             max_audio_length_ms=30_000,
-         )
-         generated_segments.append(Segment(text=line, speaker=speaker_id, audio=audio_tensor))

-     # Concatenate all generations and convert to 16-bit int format
-     audio_tensors = [segment.audio for segment in generated_segments]
-     audio_tensor = torch.cat(audio_tensors, dim=0)

-     # This applies an imperceptible watermark to identify audio as AI-generated.
-     # Watermarking ensures transparency, dissuades misuse, and enables traceability.
-     # Please be a responsible AI citizen and keep the watermarking in place.
-     # If using CSM 1B in another application, use your own private key and keep it secret.
-     audio_tensor, wm_sample_rate = watermark(
-         generator._watermarker, audio_tensor, generator.sample_rate, CSM_1B_HF_WATERMARK
-     )
-     audio_tensor = torchaudio.functional.resample(
-         audio_tensor, orig_freq=wm_sample_rate, new_freq=generator.sample_rate
-     )
-
-     audio_array = (audio_tensor * 32768).to(torch.int16).cpu().numpy()
-
-     return generator.sample_rate, audio_array
-
-
- def prepare_prompt(text: str, speaker: int, audio_path: str) -> Segment:
-     audio_tensor, _ = load_prompt_audio(audio_path)
-     return Segment(text=text, speaker=speaker, audio=audio_tensor)
-
-
- def load_prompt_audio(audio_path: str) -> torch.Tensor:
-     audio_tensor, sample_rate = torchaudio.load(audio_path)
-     audio_tensor = audio_tensor.squeeze(0)
-     if sample_rate != generator.sample_rate:
-         audio_tensor = torchaudio.functional.resample(
-             audio_tensor, orig_freq=sample_rate, new_freq=generator.sample_rate
          )
-     return audio_tensor, generator.sample_rate
-
-
- def create_speaker_prompt_ui(speaker_name: str):
-     speaker_dropdown = gr.Dropdown(
-         choices=list(SPEAKER_PROMPTS.keys()), label="Select a predefined speaker", value=speaker_name
-     )
-     with gr.Accordion("Or add your own voice prompt", open=False):
-         text_prompt_speaker = gr.Textbox(label="Speaker prompt", lines=4, value=SPEAKER_PROMPTS[speaker_name]["text"])
-         audio_prompt_speaker = gr.Audio(
-             label="Speaker prompt", type="filepath", value=SPEAKER_PROMPTS[speaker_name]["audio"]
          )

-     return speaker_dropdown, text_prompt_speaker, audio_prompt_speaker


  with gr.Blocks() as app:
      gr.Markdown(SPACE_INTRO_TEXT)
-     gr.Markdown("## Voices")
-     with gr.Row():
-         with gr.Column():
-             gr.Markdown("### Speaker A")
-             speaker_a_dropdown, text_prompt_speaker_a, audio_prompt_speaker_a = create_speaker_prompt_ui(
-                 "conversational_a"
-             )

-         with gr.Column():
-             gr.Markdown("### Speaker B")
-             speaker_b_dropdown, text_prompt_speaker_b, audio_prompt_speaker_b = create_speaker_prompt_ui(
-                 "conversational_b"
-             )

-     def update_audio(speaker):
-         if speaker in SPEAKER_PROMPTS:
-             return SPEAKER_PROMPTS[speaker]["audio"]
-         return None
-
-     def update_text(speaker):
-         if speaker in SPEAKER_PROMPTS:
-             return SPEAKER_PROMPTS[speaker]["text"]
-         return None
-
-     speaker_a_dropdown.change(fn=update_audio, inputs=[speaker_a_dropdown], outputs=[audio_prompt_speaker_a])
-     speaker_b_dropdown.change(fn=update_audio, inputs=[speaker_b_dropdown], outputs=[audio_prompt_speaker_b])
-
-     speaker_a_dropdown.change(fn=update_text, inputs=[speaker_a_dropdown], outputs=[text_prompt_speaker_a])
-     speaker_b_dropdown.change(fn=update_text, inputs=[speaker_b_dropdown], outputs=[text_prompt_speaker_b])
-
-     gr.Markdown(CONVO_INTRO_TEXT)
-
-     gen_conversation_input = gr.TextArea(label="conversation", lines=20, value=DEFAULT_CONVERSATION)
-     generate_btn = gr.Button("Generate conversation", variant="primary")
-     gr.Markdown("GPU time limited to 3 minutes, for longer usage duplicate the space.")
-     audio_output = gr.Audio(label="Synthesized audio")
-
-     generate_btn.click(
-         infer,
-         inputs=[
-             text_prompt_speaker_a,
-             text_prompt_speaker_b,
-             audio_prompt_speaker_a,
-             audio_prompt_speaker_b,
-             gen_conversation_input,
-         ],
-         outputs=[audio_output],
-     )
-
- app.launch(ssr_mode=True)
 
  import os
  import gradio as gr
  import numpy as np
  import spaces
  from generator import Segment, load_csm_1b
  from huggingface_hub import hf_hub_download, login
  from watermarking import watermark
+ import whisperx
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ import logging

+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
+ # Authentication and Configuration
+ try:
+     api_key = os.getenv("HF_TOKEN")
+     if not api_key:
+         raise ValueError("HF_TOKEN not found in environment variables.")
+     login(token=api_key)

+     watermark_key = os.getenv("WATERMARK_KEY")
+     if not watermark_key:
+         raise ValueError("WATERMARK_KEY not found or invalid in environment variables.")
+     CSM_1B_HF_WATERMARK = list(map(int, watermark_key.split(" ")))

+     gpu_timeout = int(os.getenv("GPU_TIMEOUT", 180))
+ except (ValueError, TypeError) as e:
+     logging.error(f"Configuration error: {e}")
+     raise  # Re-raise the exception to halt the application
 
+ SPACE_INTRO_TEXT = """\
+ # Sesame CSM 1B - Conversational Demo

+ This demo allows you to have a conversation with Sesame CSM 1B, leveraging WhisperX for speech-to-text and Gemma for generating responses. This is an experimental integration and may require significant resources.

+ *Disclaimer: This demo relies on several large models. Expect longer processing times and potential resource limitations.*
  """
 
+ # Model Loading
+ try:
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     model_path = hf_hub_download(repo_id="sesame/csm-1b", filename="ckpt.pt")
+     generator = load_csm_1b(model_path, device)
+     logging.info("Sesame CSM 1B loaded successfully.")
+
+     whisper_model = whisperx.load_model("large-v2", device)
+     # Alignment model is loaded once at startup; language_code="en" is an assumption, adjust for other languages.
+     model_a, align_metadata = whisperx.load_align_model(language_code="en", device=device)
+     logging.info("WhisperX model loaded successfully.")
+
+     # Load Gemma 3 1B (base) - adjust model name if needed
+     tokenizer_gemma = AutoTokenizer.from_pretrained("google/gemma-3-1b-pt")
+     model_gemma = AutoModelForCausalLM.from_pretrained("google/gemma-3-1b-pt").to(device)
+     logging.info("Gemma 3 1B model loaded successfully.")
+
+ except Exception as e:
+     logging.error(f"Model loading error: {e}")
+     raise  # Re-raise to prevent the app from launching with incomplete models
+
+ # Constants
+ SPEAKER_ID = 0  # Arbitrary speaker ID used for both sides of the conversation
+ MAX_CONTEXT_SEGMENTS = 5
+ MAX_GEMMA_LENGTH = 300  # Keep generations short for the 1B model
+
+ # Global conversation history (note: module-level, so it is shared across sessions of the Space)
+ conversation_history = []
+
+ # --- HELPER FUNCTIONS ---
+ def transcribe_audio(audio_path: str) -> str:
+     """Transcribes audio using WhisperX."""
+     try:
+         audio = whisperx.load_audio(audio_path)
+         result = whisper_model.transcribe(audio, batch_size=16)  # Added batch_size

+         # Align Whisper output
+         result_aligned = whisperx.align(result["segments"], model_a, align_metadata, audio, device, return_char_alignments=False)

+         return " ".join(segment["text"].strip() for segment in result_aligned["segments"])
+     except Exception as e:
+         logging.error(f"WhisperX transcription error: {e}")
+         return "Error: Could not transcribe audio."  # Return an error message
 
+ def generate_response(text: str) -> str:
+     """Generates a response using Gemma."""
+     try:
+         input_text = "Here is a response for the user. " + text
+         inputs = tokenizer_gemma(input_text, return_tensors="pt").to(device)
+         generated_output = model_gemma.generate(**inputs, max_length=MAX_GEMMA_LENGTH, early_stopping=True)  # Added early_stopping
+         return tokenizer_gemma.decode(generated_output[0], skip_special_tokens=True)
+     except Exception as e:
+         logging.error(f"Gemma response generation error: {e}")
+         return "I'm sorry, I encountered an error generating a response."  # Error fallback
+
+ def load_audio(audio_path: str) -> torch.Tensor:
+     """Loads audio from file and returns a torch tensor."""
+     try:
+         audio_tensor, sample_rate = torchaudio.load(audio_path)
+         audio_tensor = audio_tensor.mean(dim=0)  # Mono audio
+         if sample_rate != generator.sample_rate:
+             audio_tensor = torchaudio.functional.resample(
+                 audio_tensor, orig_freq=sample_rate, new_freq=generator.sample_rate
+             )
+         return audio_tensor
+     except Exception as e:
+         logging.error(f"Audio loading error: {e}")
+         raise gr.Error("Could not load or process the audio file.") from e  # Re-raise as Gradio error
+
+ def clear_history():
+     """Clears the conversation history"""
+     global conversation_history
+     conversation_history = []
+     logging.info("Conversation history cleared.")
+     return "Conversation history cleared."
+
+ # --- MAIN INFERENCE FUNCTION ---
  @spaces.GPU(duration=gpu_timeout)
+ def infer(user_audio) -> tuple[int, np.ndarray]:  # Return sample_rate as int
+     """Infers a response from the user audio."""
      try:
+         if not user_audio:
+             raise ValueError("No audio input received.")
+         return _infer(user_audio)
+     except Exception as e:
+         logging.exception(f"Inference error: {e}")  # Log the full exception
+         raise gr.Error(f"An error occurred during processing: {e}")
 
+ def _infer(user_audio) -> tuple[int, np.ndarray]:  # Return sample_rate as int
+     """Processes the user input, generates a response, and returns audio."""
+     global conversation_history  # Declare to modify the global list

+     try:
+         # 1. ASR: Transcribe user audio using WhisperX
+         user_text = transcribe_audio(user_audio)
+         logging.info(f"User: {user_text}")
+
+         # 2. LLM: Generate a response using Gemma
+         ai_text = generate_response(user_text)
+         logging.info(f"AI: {ai_text}")
+
+         # 3. Generate audio using the CSM model
+         try:
+             ai_audio = generator.generate(
+                 text=ai_text,
+                 speaker=SPEAKER_ID,
+                 context=conversation_history,
+                 max_audio_length_ms=30_000,
+             )
+             logging.info("Audio generated successfully.")
+         except Exception as e:
+             logging.error(f"CSM audio generation error: {e}")
+             raise gr.Error(f"CSM audio generation error: {e}")  # Error fallback
+
+         # Update conversation history with the user input and AI response.
+         user_segment = Segment(speaker=SPEAKER_ID, text=user_text, audio=load_audio(user_audio))
+         ai_segment = Segment(speaker=SPEAKER_ID, text=ai_text, audio=ai_audio)
+         conversation_history.append(user_segment)
+         conversation_history.append(ai_segment)
+
+         # Limit conversation history
+         while len(conversation_history) > MAX_CONTEXT_SEGMENTS:
+             conversation_history.pop(0)
+
+         # 4. Watermarking and Audio Conversion
+         audio_tensor, wm_sample_rate = watermark(
+             generator._watermarker, ai_audio, generator.sample_rate, CSM_1B_HF_WATERMARK
          )
+         audio_tensor = torchaudio.functional.resample(
+             audio_tensor, orig_freq=wm_sample_rate, new_freq=generator.sample_rate
          )

+         ai_audio_array = (audio_tensor * 32768).to(torch.int16).cpu().numpy()
+         return generator.sample_rate, ai_audio_array

+     except Exception as e:
+         logging.exception(f"Error in _infer: {e}")
+         # Log the full exception including stack trace for debugging.
+         # It's crucial to log the *exception*, not just the error message.
+         raise gr.Error(f"An error occurred during processing: {e}")
 
+ # --- GRADIO INTERFACE ---
  with gr.Blocks() as app:
      gr.Markdown(SPACE_INTRO_TEXT)
+     # "sources" (list) is the current Gradio keyword; older releases used source="microphone"
+     audio_input = gr.Audio(label="Your Input", sources=["microphone"], type="filepath")
+     audio_output = gr.Audio(label="AI Response")
+     clear_button = gr.Button("Clear Conversation History")
+     status_display = gr.Textbox(label="Status", visible=False)

+     btn = gr.Button("Generate Response")
+     btn.click(infer, inputs=[audio_input], outputs=[audio_output])
+     clear_button.click(clear_history, outputs=[status_display])  # No input needed

+ app.launch(ssr_mode=True)
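
As a quick sanity check outside the Gradio UI, the new ASR → Gemma → CSM loop could be driven from a plain script along the lines of the sketch below. This is not part of the commit: it assumes `app.py` is importable without launching the server (e.g. with the final `app.launch(...)` call moved under an `if __name__ == "__main__":` guard), that a local recording named `user_turn.wav` exists, and that `scipy` is installed; the file names are hypothetical.

```python
# Minimal driver sketch for the new conversational pipeline (not part of this commit).
# Assumptions: app.py is importable without starting the UI, the models fit on this
# machine, "user_turn.wav" exists locally, and scipy is installed.
from scipy.io import wavfile

import app  # importing app.py loads CSM 1B, WhisperX, and Gemma

# infer() runs the full loop: WhisperX transcription -> Gemma reply -> CSM speech -> watermarking.
sample_rate, ai_audio = app.infer("user_turn.wav")

# ai_audio is int16 PCM at the CSM sample rate; write it to disk for listening.
wavfile.write("ai_response.wav", sample_rate, ai_audio)
print(f"Wrote ai_response.wav ({sample_rate} Hz); history holds {len(app.conversation_history)} segments.")
```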