"""Røst speech-to-text demo."""

import logging
import os
import warnings

import gradio as gr
import numpy as np
import samplerate
import torch
from punctfix import PunctFixer
from transformers import pipeline
from dotenv import load_dotenv

# Configure the root logger once, up front: INFO verbosity with a timestamp and
# the logger name in front of every message.
logging.basicConfig(
    format="%(asctime)s ⋅ %(name)s ⋅ %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger(name="roest-asr-demo")

# Populate the process environment (e.g. from a local .env file) before any
# environment variable is read further down in this module.
load_dotenv()

# Keep the console output readable by hiding FutureWarning messages.
warnings.filterwarnings(action="ignore", category=FutureWarning)

# Heading shown at the top of the demo page.
TITLE = "Røst Speech-to-Text Demo"

# Inline SVG glyph embedded in the description text to depict the upload button.
icon = """
<svg xmlns="http://www.w3.org/2000/svg" width="14px" viewBox="0 0 24 24" fill="none"
     stroke="currentColor" stroke-width="2" stroke-linecap="round"
     stroke-linejoin="round" style="display: inline;">
  <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
  <polyline points="17 8 12 3 7 8"/>
  <line x1="12" y1="3" x2="12" y2="15"/>
</svg>
"""

# Markdown introduction rendered below the title; the `{icon}` placeholder is
# filled with the SVG markup defined above.
DESCRIPTION = """
This is a demo of the Danish speech recognition model
[Røst](https://huggingface.co/alexandrainst/roest-315m).

Press "Record" to record your
own voice. When you're done you can press "Stop" to stop recording and "Submit" to
send the audio to the model for transcription. You can also upload an audio file by
pressing the {icon} button.
""".format(icon=icon)

logger.info("Loading the ASR model...")

# Run on the GPU when CUDA is available, otherwise on the CPU. The same device
# object is shared by both models loaded below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The Hub token is read from the environment; when HUGGINGFACE_HUB_TOKEN is not
# set, `True` is passed instead of a token string.
transcriber = pipeline(
    model="alexandrainst/roest-315m",
    task="automatic-speech-recognition",
    token=os.getenv("HUGGINGFACE_HUB_TOKEN", True),
    device=device,
)

logger.info("Loading the punctuation fixer model...")

# Danish punctuation restorer applied to the raw transcriptions.
transcription_fixer = PunctFixer(device=device, language="da")

logger.info("Models loaded, ready to transcribe audio.")

def transcribe_audio(sampling_rate_and_audio: tuple[int, np.ndarray] | None) -> str:
    """Transcribe the audio.

    Args:
        sampling_rate_and_audio:
            A tuple with the sampling rate and the audio, or None if no audio was
            provided.

    Returns:
        The transcription.
    """
    if sampling_rate_and_audio is None:
        return (
            "No audio was provided. Please record or upload an audio clip, and try "
            "again."
        )

    sampling_rate, audio = sampling_rate_and_audio
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)
    audio = samplerate.resample(audio, 16_000 / sampling_rate, "sinc_best")

    logger.info(f"Transcribing audio clip of {len(audio) / 16_000:.2f} seconds...")
    transcription = transcriber(inputs=audio)
    if not isinstance(transcription, dict):
        return ""

    logger.info(f"Raw transcription is {transcription['text']!r}. Cleaning it up...")
    cleaned_transcription = transcription_fixer.punctuate(
        text=transcription["text"]
    )

    logger.info(f"Final transcription: {cleaned_transcription!r}")
    return cleaned_transcription

# Audio widget that accepts microphone recordings as well as uploaded files.
# `min_length` and `max_length` bound the accepted clip length (1 and 60).
_audio_input = gr.Audio(
    sources=["microphone", "upload"],
    show_label=False,
    min_length=1,
    max_length=60,
)

# Remote example clips offered in the interface; they are not cached.
_example_clips = [
    "https://filedn.com/lRBwPhPxgV74tO0rDoe8SpH/audio-examples/bornholmsk.wav",
    "https://filedn.com/lRBwPhPxgV74tO0rDoe8SpH/audio-examples/soenderjysk.wav",
    "https://filedn.com/lRBwPhPxgV74tO0rDoe8SpH/audio-examples/nordjysk.wav",
    "https://filedn.com/lRBwPhPxgV74tO0rDoe8SpH/audio-examples/accent.wav",
]

# Wire the transcription function to an audio input and a text output.
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=_audio_input,
    outputs="textbox",
    examples=_example_clips,
    cache_examples=False,
    title=TITLE,
    description=DESCRIPTION,
    theme=gr.themes.Soft(primary_hue="orange"),
    css="p { font-size: 1.0rem; }",
    allow_flagging="never",
)

# Start serving the demo as soon as the module is executed.
demo.launch()