Spaces:

MakiAi
/

KotobaTranscriber

Running on Zero

MakiAi committed on Apr 24, 2024

Commit

edf2658

1 Parent(s): 4cd874e

[feat] 音声認識アプリの実装

- Gradioを使用したWeb UIの実装
- Whisperモデルを使用した音声認識機能の実装
- Dockerfileとdocker-compose.ymlの作成
- GPUを使用する場合のDockerfile.gpuとdocker-compose.gpu.ymlの作成
- 必要なライブラリをrequirements.txtに記載

Files changed (6) hide show

Dockerfile +10 -0
app.py +47 -11
docker-compose.yml +12 -0
docker/Dockerfile.gpu +25 -0
docker/docker-compose.gpu.yml +10 -0
requirements.txt +5 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,10 @@

+FROM python:3.11
+WORKDIR /app
+# Install ffmpeg before the Python dependencies so this layer stays cached
+# when only requirements.txt changes; skip recommended extras to keep it small.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends ffmpeg && \
+    rm -rf /var/lib/apt/lists/*
+# Install the Python dependencies.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# No application code or CMD is baked in: docker-compose.yml bind-mounts the
+# project into /app and supplies the command.

app.py CHANGED Viewed

@@ -1,15 +1,51 @@
-import streamlit as st
-def load_markdown(file_path):
-    with open(file_path, encoding="utf8") as f:
-        return f.read()
-def display_front_page():
-    html_front = load_markdown('docs/page_front.md')
-    st.markdown(f"{html_front}", unsafe_allow_html=True)
-if __name__ == "__main__":
-    display_front_page()
-    x = st.slider('Select a value')
-    st.write(x, 'squared is', x * x)

+import gradio as gr
+import torch
+from transformers import pipeline
+import librosa
+# Model configuration: with CUDA available use float16, device "cuda:0" and
+# the "sdpa" attention implementation; otherwise float32 on the CPU with no
+# extra model kwargs.
+model_id = "kotoba-tech/kotoba-whisper-v1.0"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+model_kwargs = {"attn_implementation": "sdpa"} if torch.cuda.is_available() else {}
+# Generation options passed to the pipeline on every transcription call.
+generate_kwargs = {"language": "japanese", "task": "transcribe"}
+# Load the model once, at import time, as a speech-recognition pipeline.
+pipe = pipeline(
+    "automatic-speech-recognition",
+    model=model_id,
+    torch_dtype=torch_dtype,
+    device=device,
+    model_kwargs=model_kwargs
+)
+# 文字起こし関数
+def transcribe(audio_file):
+    # 音声の読み込み
+    audio, sr = librosa.load(audio_file, sr=None)
+    # 音声をリサンプリング
+    target_sr = 16000
+    audio_resampled = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
+    # 推論の実行
+    result = pipe(audio_resampled, generate_kwargs=generate_kwargs)
+    return result["text"]
+description = """
+The bot was trained to answer questions based on Rick and Morty dialogues. Ask Rick anything!
+<img src="https://huggingface.co/spaces/course-demos/Rick_and_Morty_QA/resolve/main/rick.png" width=200px>
+"""
+# Gradioインターフェースの定義
+iface = gr.Interface(
+    fn=transcribe,
+    inputs=gr.Audio(type="filepath", label="Upload Audio (MP3 or MP4)"),
+    outputs="text",
+    title="Speech-to-Text App",
+    description=description,
+    theme=gr.themes.Soft(),
+)
+# アプリの起動
+iface.launch(server_name="0.0.0.0", server_port=7860, share=True)

docker-compose.yml ADDED Viewed

	@@ -0,0 +1,12 @@

+# Compose definition that builds the image from the Dockerfile in this
+# directory and runs app.py inside it.
+version: '3'
+services:
+  app:
+    build: .
+    # Publish the port that app.py passes to iface.launch().
+    ports:
+      - "7860:7860"
+    volumes:
+      # Bind-mount the project into the image's WORKDIR so app.py is available.
+      - ./:/app
+      # Keep /root/.cache on the host between runs
+      # (presumably the model download cache; verify).
+      - .cache:/root/.cache
+    command: python app.py
+    tty: true

docker/Dockerfile.gpu ADDED Viewed

	@@ -0,0 +1,25 @@

+FROM nvidia/cuda:12.0.1-cudnn8-runtime-ubuntu22.04
+# Keep apt from prompting for input during the build.
+ENV DEBIAN_FRONTEND=noninteractive
+# Install system packages and upgrade pip in one layer, then remove the apt
+# package lists in that same layer, as the CPU Dockerfile does.
+RUN apt-get update \
+    && apt-get upgrade -y \
+    && apt-get install -y --no-install-recommends \
+        gcc \
+        curl \
+        wget \
+        sudo \
+        pciutils \
+        python3-all-dev \
+        python-is-python3 \
+        python3-pip \
+        ffmpeg \
+        libsdl2-dev \
+        pulseaudio \
+        alsa-utils \
+        portaudio19-dev \
+    && pip install pip -U \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+# Install the Python dependencies; requirements.txt must be in the build context.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt

docker/docker-compose.gpu.yml ADDED Viewed

	@@ -0,0 +1,10 @@

+# GPU variant. This file lives in docker/, and relative paths resolve against
+# the compose file's directory, so the build context and the bind mount both
+# point one level up at the project root.
+version: '3'
+services:
+  app:
+    build:
+      # The project root holds requirements.txt, which Dockerfile.gpu copies.
+      context: ..
+      dockerfile: docker/Dockerfile.gpu
+    ports:
+      - "7860:7860"
+    volumes:
+      # Mount the project root (where app.py lives) into the image's WORKDIR.
+      - ../:/app
+    command: python app.py
+    # Reserve the host's NVIDIA GPUs for the container.
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+gradio
+torch
+transformers
+datasets[audio]
+librosa