Spaces:

inwneon
/

project-voice-diarzation

Paused

App Files Files

sivakorn-su committed on Jun 19

Commit

78dde53

1 Parent(s): e6d32bd

feat: add voice diarization project

Browse files

Files changed (4) hide show

Dockerfile +15 -0
README.md +81 -4
app.py +345 -0
requirements.txt +18 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,15 @@

+# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+# you will also find guides on how best to write your Dockerfile
+FROM python:3.9
+WORKDIR /app
+# openai-whisper decodes audio by invoking the ffmpeg CLI, which the base
+# image does not ship.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+COPY ./requirements.txt requirements.txt
+# Install the CUDA 11.8 wheels first. Installing requirements.txt first would
+# already satisfy "torch", turning the CUDA install into a no-op, and
+# --upgrade afterwards would replace the CUDA wheels again.
+RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 \
+    && pip install --no-cache-dir -r requirements.txt
+COPY . /app
+# Secrets are provided at runtime (docker run --env-file .env.prod, or Space
+# secrets). .env.prod is therefore not copied explicitly: doing so baked the
+# secrets into the image and failed the build when the file was absent.
+# NOTE(review): add .env.prod to .dockerignore so "COPY . /app" skips it too.
+ENV ENV=production
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8300"]

README.md CHANGED Viewed

@@ -1,10 +1,87 @@
 ---
-title: Project Voice Diarzation
-emoji: 🏆
-colorFrom: red
-colorTo: pink
 sdk: docker
 pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: WhisperPyanoteLLM
+emoji: 📉
+colorFrom: indigo
+colorTo: green
 sdk: docker
 pinned: false
+license: apache-2.0
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# WhisperPyanoteLLM
+A FastAPI-based app for speaker diarization and transcription using Whisper and PyAnnote, with LLM-powered summarization.
+## Features
+- Speaker diarization with pyannote.audio
+- Transcription with OpenAI Whisper
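+- Summarization with a Together AI LLM (`summarize_texts` in `app.py`; the `/upload_video/` endpoint currently returns an empty `summaries` field)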
+- REST API for video/audio upload and processing
+## Quick Start (Development)
+1. **Clone the repository:**
+   ```sh
+   git clone <your-repo-url>
+   cd WhisperPyanoteLLM
+   ```
+2. **Create a `.env` file:**
+   ```env
+   HF_TOKEN=your_huggingface_token
+   TOGETHER_API_KEY=your_together_api_key
+   NGROK_AUTH_TOKEN=your_ngrok_token
+   ```
+3. **Install dependencies:**
+   ```sh
+   pip install -r requirements.txt
+   ```
+4. **Run the app:**
+   ```sh
+   uvicorn app:app --reload --port 8300
+   ```
+5. **Access the API:**
+   - Health check: [http://localhost:8300/health](http://localhost:8300/health)
+   - Upload endpoint: `/upload_video/`
+---
+## Production (Docker)
+1. **Create a `.env.prod` file:**
+   ```env
+   HF_TOKEN=your_huggingface_token
+   TOGETHER_API_KEY=your_together_api_key
+   NGROK_AUTH_TOKEN=your_ngrok_token
+   ```
+2. **Build the Docker image:**
+   ```sh
+   docker build -t whisperpyanote .
+   ```
+3. **Run the Docker container:**
+   ```sh
+   docker run --env-file .env.prod -p 8300:8300 whisperpyanote
+   ```
+4. **Access the API:**
+   - Health check: [http://localhost:8300/health](http://localhost:8300/health)
+   - Upload endpoint: `/upload_video/`
+---
+## Notes
+- Make sure your `.env` and `.env.prod` files are **not** committed to version control.
+- For best performance, run on a machine with a CUDA-enabled GPU.
+- For more details, see the code and comments in `app.py`.
+---
+## License
+Apache-2.0

app.py ADDED Viewed

	@@ -0,0 +1,345 @@

+import os
+import shutil
+import time
+from collections import Counter
+import torch
+import whisper
+from pyannote.audio import Pipeline
+from torch.serialization import add_safe_globals
+from omegaconf import ListConfig
+import nest_asyncio
+import uvicorn
+from fastapi import FastAPI, UploadFile, File
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
+from pyngrok import ngrok, conf
+from pydub import AudioSegment, effects
+import pandas as pd
+from moviepy.editor import VideoFileClip
+from together import Together
+# Hugging Face Spaces injects secrets as environment variables automatically
+token = os.environ.get('HF_TOKEN')
+together_api_key = os.environ.get('TOGETHER_API_KEY')
+ngrok_auth_token = os.environ.get('NGROK_AUTH_TOKEN')
+# Retained so existing references keep resolving; nothing visible in this
+# file reads these lists.
+pipelines, models, others = [], [], []
+# Filled in by on_startup(). They start as None so /health can report
+# "not loaded yet" instead of failing with NameError.
+pipeline = None
+model = None
+device = None
+# Registered before any checkpoint is loaded; the call used to run after the
+# models had already been loaded, where it could no longer affect loading.
+# NOTE(review): presumably needed for torch.load(weights_only=True) on the
+# pyannote checkpoint - confirm against the installed torch version.
+add_safe_globals([ListConfig])
+def setup_models():
+    """Load the diarization pipeline and the Whisper model.
+
+    Device placement follows the number of visible CUDA devices: none -> both
+    on CPU, one -> both on ``cuda:0``, two or more -> pyannote on ``cuda:0``
+    and Whisper on ``cuda:1``.
+
+    Returns:
+        tuple: ``(pipeline, model, device)``. ``device`` is a plain string
+        (``"cpu"``, ``"cuda:0"`` or ``"cuda:0,cuda:1"``) so that ``/health``
+        can return it as JSON.
+
+    Raises:
+        RuntimeError: if the pyannote pipeline could not be loaded.
+    """
+    gpu_count = torch.cuda.device_count()
+    if gpu_count == 0:
+        pyannote_device = whisper_device = "cpu"
+    elif gpu_count == 1:
+        pyannote_device = whisper_device = "cuda:0"
+    else:
+        pyannote_device, whisper_device = "cuda:0", "cuda:1"
+    diarization_pipeline = Pipeline.from_pretrained(
+        "pyannote/speaker-diarization-3.1",
+        use_auth_token=token,
+    )
+    if diarization_pipeline is None:
+        # NOTE(review): pyannote appears to return None (after printing a
+        # hint) when the gated model cannot be downloaded - confirm.
+        raise RuntimeError(
+            "Could not load pyannote/speaker-diarization-3.1; "
+            "check HF_TOKEN and that the model terms were accepted."
+        )
+    # Pipeline.to() is given a torch.device instance rather than a string.
+    diarization_pipeline.to(torch.device(pyannote_device))
+    whisper_model = whisper.load_model("large", device=whisper_device)
+    if pyannote_device == whisper_device:
+        device_label = pyannote_device
+    else:
+        device_label = f"{pyannote_device},{whisper_device}"
+    return diarization_pipeline, whisper_model, device_label
+try:
+    # Only relevant when an event loop is already running (notebooks).
+    nest_asyncio.apply()
+except ValueError as exc:
+    # nest_asyncio refuses to patch non-asyncio loops such as uvloop.
+    print(f"nest_asyncio not applied: {exc}")
+# Summarization is optional; do not build the client without a key.
+together = Together(api_key=together_api_key) if together_api_key else None
+conf.get_default().auth_token = ngrok_auth_token
+UPLOAD_FOLDER = "uploads"
+os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+app = FastAPI()
+origins = [
+    "http://127.0.0.1:8000",
+    "http://localhost:8000",
+    "https://project-diarzation-production.up.railway.app"
+]
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+@app.on_event("startup")
+def on_startup():
+    """Load the models once at server start and publish them as globals."""
+    global pipeline, model, device
+    pipeline, model, device = setup_models()
+@app.get("/health")
+def health_check():
+    """Report liveness plus whether the Whisper model and pipeline are set."""
+    whisper_ready = model is not None
+    diarization_ready = pipeline is not None
+    report = {"status": "ok"}
+    report["model_loaded"] = whisper_ready
+    report["diarization_pipeline_loaded"] = diarization_ready
+    report["device"] = device
+    return report
+@app.get("/")
+def check_api():
+    """Root endpoint: confirm that the API process is answering requests."""
+    payload = dict(message="API is up and running")
+    return payload
+@app.get("/key")
+def check_env():
+    """Report the runtime environment and which secrets are configured.
+
+    Only booleans are returned, never the secret values. The original
+    ``openai_key_exists`` field is kept for existing clients; flags for the
+    variables this app actually reads (HF_TOKEN, TOGETHER_API_KEY) are added.
+    """
+    def is_set(name):
+        return bool(os.environ.get(name))
+    return {
+        "env": os.environ.get("ENV", "dev"),
+        "openai_key_exists": is_set("OPENAI_API_KEY"),
+        "hf_token_exists": is_set("HF_TOKEN"),
+        "together_key_exists": is_set("TOGETHER_API_KEY"),
+    }
+def save_uploaded_file(file: UploadFile) -> str:
+    """Store an uploaded file in UPLOAD_FOLDER and return the saved path.
+
+    Args:
+        file: The upload received by the endpoint.
+
+    Returns:
+        Path of the written file inside UPLOAD_FOLDER.
+    """
+    os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+    # The client controls file.filename, so keep only its final component;
+    # otherwise "../../x" (or a Windows-style path) could escape the folder.
+    raw_name = (file.filename or "").replace("\\", "/")
+    safe_name = os.path.basename(raw_name)
+    if safe_name in ("", ".", ".."):
+        safe_name = "upload.bin"
+    filepath = os.path.join(UPLOAD_FOLDER, safe_name)
+    with open(filepath, "wb") as f:
+        shutil.copyfileobj(file.file, f)
+    return filepath
+def extract_and_normalize_audio(video_path: str) -> str:
+    """Extract the audio track of a video and write a normalized WAV copy.
+
+    The raw track is written to ``extracted_audio.wav`` and the normalized
+    version to ``cleaned.wav``, both inside UPLOAD_FOLDER.
+
+    Args:
+        video_path: Path of the uploaded media file.
+
+    Returns:
+        Path of the normalized WAV file.
+
+    Raises:
+        ValueError: if the file has no audio track.
+    """
+    audio_path = os.path.join(UPLOAD_FOLDER, "extracted_audio.wav")
+    clip = VideoFileClip(video_path)
+    try:
+        if clip.audio is None:
+            raise ValueError(f"No audio track found in {video_path!r}")
+        clip.audio.write_audiofile(audio_path)
+    finally:
+        # Release the reader even when extraction fails.
+        clip.close()
+    audio = AudioSegment.from_wav(audio_path)
+    normalized_audio = effects.normalize(audio)
+    cleaned_path = os.path.join(UPLOAD_FOLDER, "cleaned.wav")
+    normalized_audio.export(cleaned_path, format="wav")
+    return cleaned_path
+def diarize_audio(audio_path: str) -> pd.DataFrame:
+    """Run the diarization pipeline and tabulate the speaker turns.
+
+    Args:
+        audio_path: Path of the audio file to diarize.
+
+    Returns:
+        DataFrame with columns ``start``, ``end`` (rounded to 3 decimals) and
+        ``speaker``, one row per turn. The columns are present even when no
+        turn was found, so callers can index ``["speaker"]`` safely.
+    """
+    diarization = pipeline(audio_path)
+    rows = [
+        {"start": round(turn.start, 3), "end": round(turn.end, 3), "speaker": speaker}
+        for turn, _, speaker in diarization.itertracks(yield_label=True)
+    ]
+    return pd.DataFrame(rows, columns=["start", "end", "speaker"])
+def split_segments(audio_path: str, df: pd.DataFrame) -> str:
+    """Cut the audio into one WAV file per diarization row.
+
+    The ``segments`` folder under UPLOAD_FOLDER is recreated on every call.
+    Files are named ``segment_<index>_<speaker>.wav``.
+
+    Args:
+        audio_path: Path of the full audio file.
+        df: Rows with ``start``/``end`` (seconds) and ``speaker``.
+
+    Returns:
+        Path of the folder holding the segment files.
+    """
+    segment_folder = os.path.join(UPLOAD_FOLDER, "segments")
+    if os.path.exists(segment_folder):
+        shutil.rmtree(segment_folder)
+    os.makedirs(segment_folder, exist_ok=True)
+    audio = AudioSegment.from_file(audio_path)
+    # Pad the index wide enough for every row so that a plain name sort keeps
+    # the diarization order; three digits (the previous fixed width) stops
+    # sorting correctly once there are 1000 or more segments.
+    index_width = max(3, len(str(len(df))))
+    for i, row in df.iterrows():
+        start_ms = int(row['start'] * 1000)
+        end_ms = int(row['end'] * 1000)
+        segment = audio[start_ms:end_ms]
+        filename = f"segment_{int(i):0{index_width}d}_{row['speaker']}.wav"
+        segment.export(os.path.join(segment_folder, filename), format="wav")
+    return segment_folder
+def transcribe_segments(segment_folder: str) -> pd.DataFrame:
+    """Transcribe every file in the segment folder with Whisper (Thai).
+
+    Args:
+        segment_folder: Folder containing ``segment_<index>_<speaker>.wav``.
+
+    Returns:
+        DataFrame with columns ``filename`` and ``text``, ordered by the
+        numeric segment index. The columns exist even for an empty folder.
+    """
+    def segment_order(name):
+        # Sort by the numeric index; a plain string sort would place
+        # "segment_1000_..." before "segment_101_...".
+        parts = name.split("_", 2)
+        if len(parts) >= 2 and parts[1].isdigit():
+            return (0, int(parts[1]), name)
+        return (1, 0, name)
+    files = sorted(os.listdir(segment_folder), key=segment_order)
+    results = []
+    for filename in files:
+        segment_path = os.path.join(segment_folder, filename)
+        res = model.transcribe(segment_path, language="th")
+        results.append({
+            "filename": filename,
+            "text": res["text"].strip()
+        })
+    return pd.DataFrame(results, columns=["filename", "text"])
+def clean_summary(text):
+    """Strip labels, disclaimers and markdown from an LLM summary.
+
+    Line-scoped patterns (labels, disclaimers, bullets) are applied while the
+    text still has its line breaks, so a disclaimer removes only its own line.
+    Line breaks are then turned into single spaces.
+
+    Args:
+        text: Raw summary; any value is converted with ``str``.
+
+    Returns:
+        The cleaned one-line summary, or a fixed Thai placeholder when the
+        input is empty, shorter than 15 characters after cleaning, or
+        contains a known "nothing to summarize" phrase.
+    """
+    import re
+    if not text or not str(text).strip():
+        return "ไม่มีข้อมูลสำคัญที่จะสรุป"
+    cleaned_text = str(text)
+    # Patterns removed while line breaks are still present.
+    line_scoped_patterns = [
+        # Headers and labels
+        r'สรุป:\s*',
+        r'สรุปการประชุม:\s*',
+        r'บทสรุป:\s*',
+        r'ข้อสรุป:\s*',
+        r'\*\*Key Messages:\*\*|\*\*หัวข้อหลัก:\*\*',
+        r'\*\*Action Items:\*\*|\*\*ประเด็นสำคัญ:\*\*',
+        r'\*\*Summary:\*\*|\*\*สรุป:\*\*',
+        # Bullet points and markers
+        r'^[-•]\s*Key Messages?:?\s*',
+        r'^[-•]\s*Action Items?:?\s*',
+        r'^[-•]\s*หัวข้อหลัก:?\s*',
+        r'^[-•]\s*ประเด็นสำคัญ:?\s*',
+        r'^[-•]\s*ข้อมูลน่าสนใจ:?\s*',
+        r'^[-•]\s*บทสรุป:?\s*',
+        # Disclaimers and notes (each runs to the end of its line)
+        r'หมายเหตุ:.*?(?=\n|\r|$)',
+        r'เนื่องจาก.*?(?=\n|\r|$)',
+        r'ไม่มีข้อความ.*?(?=\n|\r|$)',
+        r'ไม่มีประเด็น.*?(?=\n|\r|$)',
+        r'ไม่มี Action Items.*?(?=\n|\r|$)',
+        r'ไม่มีรายการ.*?(?=\n|\r|$)',
+        r'ต้องการข้อมูลเพิ่มเติม.*?(?=\n|\r|$)',
+        r'ต้องขอความชัดเจนเพิ่มเติม.*?(?=\n|\r|$)',
+        # Meta comments
+        r'\(ตัดประโยคที่ไม่เกี่ยวข้องหรือซ้ำซ้อนออก.*?\)',
+        r'\(.*?เพื่อเน้นความชัดเจน.*?\)',
+        # AI-generated phrases
+        r'ตามที่ได้กล่าวไว้.*?(?=\n|\r|$)',
+        r'จากข้อความที่ให้มา.*?(?=\n|\r|$)',
+        r'Based on the provided text.*?(?=\n|\r|$)',
+        r'According to the text.*?(?=\n|\r|$)',
+    ]
+    pattern_flags = re.IGNORECASE | re.MULTILINE | re.DOTALL
+    for pattern in line_scoped_patterns:
+        cleaned_text = re.sub(pattern, '', cleaned_text, flags=pattern_flags)
+    # Remove markdown formatting but keep content
+    cleaned_text = re.sub(r'\*\*(.*?)\*\*', r'\1', cleaned_text)  # Bold
+    cleaned_text = re.sub(r'\*(.*?)\*', r'\1', cleaned_text)      # Italic
+    cleaned_text = re.sub(r'_{2,}(.*?)_{2,}', r'\1', cleaned_text) # Underline
+    # Remove excessive punctuation
+    cleaned_text = re.sub(r'[.]{3,}', '...', cleaned_text)
+    cleaned_text = re.sub(r'[!]{2,}', '!', cleaned_text)
+    cleaned_text = re.sub(r'[?]{2,}', '?', cleaned_text)
+    # Clean up bullet points and numbering on every line; the lookahead keeps
+    # a leading decimal such as "3.5" intact.
+    cleaned_text = re.sub(r'^[-•*]\s*', '', cleaned_text, flags=re.MULTILINE)
+    cleaned_text = re.sub(r'^\d+\.(?!\d)\s*', '', cleaned_text, flags=re.MULTILINE)
+    # Only now flatten line breaks and tabs, using a space so that adjacent
+    # lines are not glued together.
+    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
+    # Phrases that mark the whole summary as useless wherever they appear.
+    useless_phrases = [
+        'ไม่สามารถสรุปได้',
+        'ข้อความต้นฉบับไม่มีความหมาย',
+        'ไม่มีข้อมูลเพียงพอ',
+        'ไม่มีประเด็นสำคัญ',
+        'ไม่มี Action Items',
+        'ต้องขอความชัดเจนเพิ่มเติม',
+        'ไม่มีข้อมูลที่สำคัญ',
+        'ไม่สามารถระบุได้',
+        'ข้อมูลไม่ชัดเจน',
+        'ไม่มีเนื้อหาที่เกี่ยวข้อง',
+        'Not applicable',
+        'No content',
+        'No summary available'
+    ]
+    # Generic tokens count only when they are the entire answer: as
+    # substrings they reject ordinary sentences that merely contain them.
+    bare_tokens = {'ไม่มี', 'n/a'}
+    lowered = cleaned_text.lower()
+    if (len(cleaned_text) < 15 or
+        lowered.strip(' .!?:-') in bare_tokens or
+        any(phrase.lower() in lowered for phrase in useless_phrases)):
+        return "ไม่มีข้อมูลสำคัญที่จะสรุปมากพอ"
+    cleaned_text = re.sub(r'\s+([.!?])', r'\1', cleaned_text)
+    cleaned_text = re.sub(r'([.!?])\s*([A-Za-zก-๙])', r'\1 \2', cleaned_text)
+    return cleaned_text
+from together import Together
+import time
+def summarize_texts(texts, api_key, model="deepseek-ai/DeepSeek-V3", delay=1):
+    """Summarize each text in Thai with a Together chat model.
+
+    Args:
+        texts: Texts to summarize, one request per non-blank entry.
+        api_key: Together API key.
+        model: Together model identifier.
+        delay: Seconds to wait between consecutive requests.
+
+    Returns:
+        list[str]: One cleaned summary per input, in input order. A failed
+        request yields the placeholder "ไม่สามารถสรุปได้" instead of raising.
+    """
+    client = Together(api_key=api_key)
+    summaries = []
+    for idx, text in enumerate(texts):
+        if not text or not str(text).strip():
+            # Nothing to send; clean_summary maps blank input to its
+            # "nothing to summarize" placeholder.
+            summaries.append(clean_summary(text))
+            continue
+        prompt = f"""
+สรุปข้อความประชุมนี้เป็นภาษาไทยสั้น ๆ เน้นประเด็นสำคัญ (key messages) และ Action Items โดยตัดรายละเอียดที่ไม่สำคัญออก:
+ข้อความ:
+{text}
+สรุป:
+- Key Messages:
+- Action Items:
+"""
+        try:
+            response = client.chat.completions.create(
+                model=model,
+                messages=[
+                    {"role": "system", "content": "คุณเป็นผู้เชี่ยวชาญในการสรุปเนื้อหา ตอบเป็นภาษาไทยเสมอ เน้นหัวข้อหลักและข้อมูลสำคัญ"},
+                    {"role": "user", "content": prompt}
+                ],
+                max_tokens=1024,
+                temperature=0.7,
+            )
+            summary = response.choices[0].message.content.strip()
+            summary = clean_summary(summary)
+            summaries.append(summary)
+        except Exception as e:
+            # Best effort: one failed request must not discard the others.
+            print(f"Error at index {idx}: {e}")
+            summaries.append("ไม่สามารถสรุปได้")
+        if idx < len(texts) - 1:
+            time.sleep(delay)
+    return summaries
+@app.post("/upload_video/")
+async def upload_video(file: UploadFile = File(...)):
+    """Diarize and transcribe an uploaded media file.
+
+    Steps: save the upload, extract and normalize its audio, diarize, cut one
+    clip per speaker turn, transcribe each clip, and merge turns with texts
+    by position.
+
+    Returns:
+        JSONResponse with the saved paths, audio length in minutes, merged
+        rows (``data``), speaker list and per-speaker turn counts. The
+        ``summaries`` field is always an empty string; summarization is not
+        called from this endpoint.
+    """
+    video_path = save_uploaded_file(file)
+    audio_path = extract_and_normalize_audio(video_path)
+    df_diarization = diarize_audio(audio_path)
+    if df_diarization.empty:
+        # No speaker turns: skip the steps that index the "speaker" column
+        # and return an empty result instead of failing.
+        result, speaker_array, result_array = [], [], []
+    else:
+        segment_folder = split_segments(audio_path, df_diarization)
+        df_transcriptions = transcribe_segments(segment_folder)
+        min_len = min(len(df_diarization), len(df_transcriptions))
+        df_merged = pd.concat([
+            df_diarization.iloc[:min_len].reset_index(drop=True),
+            df_transcriptions.iloc[:min_len].reset_index(drop=True)
+        ], axis=1)
+        result = df_merged.to_dict(orient="records")
+        speaker_array = df_diarization["speaker"].unique().tolist()
+        counter = Counter(df_diarization["speaker"])
+        result_array = [{"speaker": spk, "count": cnt} for spk, cnt in counter.most_common()]
+    duration_minutes = len(AudioSegment.from_wav(audio_path)) / 1000 / 60
+    return JSONResponse(content={
+        "video_path": video_path,
+        "audio_path": audio_path,
+        "audio_length": duration_minutes,
+        "data": result,
+        "speaker_array": speaker_array,
+        "count_speaker": result_array,
+        "num_speakers": len(speaker_array),
+        "summaries": '',
+        "total_sentence": len(result),
+    })
+# The ngrok tunnel is optional. It is opened only when NGROK_AUTH_TOKEN is
+# set, and a tunnel failure is reported instead of aborting the import (and
+# with it the whole API).
+public_url = None
+if ngrok_auth_token:
+    try:
+        public_url = ngrok.connect(8300)
+        print(f"Public URL: {public_url}")
+    except Exception as exc:
+        print(f"ngrok tunnel not started: {exc}")
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8300)

requirements.txt ADDED Viewed

	@@ -0,0 +1,18 @@

+fastapi
+uvicorn[standard]
+openai-whisper
+pyannote.audio
+# app.py imports "moviepy.editor", which was removed in MoviePy 2.x.
+moviepy<2
+pydub
+pyngrok
+python-multipart
+together
+torch
+# For CUDA-enabled torch, install via Dockerfile:
+# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+torchvision
+torchaudio
+omegaconf
+pandas
+nest_asyncio
+python-dotenv