import os
import re
import shutil
import time
import asyncio
import logging
from collections import Counter

import torch
import whisper
import pandas as pd
import uvicorn
from fastapi import FastAPI, UploadFile, File
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from moviepy.editor import VideoFileClip
from omegaconf import ListConfig
from pyannote.audio import Pipeline
from pydub import AudioSegment, effects
from pyngrok import conf
from together import Together
from torch.serialization import add_safe_globals

# Writable cache locations for downloaded model weights.
HF_CACHE_DIR = "/tmp/hf_cache"
WHISPER_CACHE_DIR = "/tmp/whisper_cache"

os.makedirs(HF_CACHE_DIR, exist_ok=True)
os.makedirs(WHISPER_CACHE_DIR, exist_ok=True)

os.environ["HUGGINGFACE_HUB_CACHE"] = HF_CACHE_DIR
os.environ["TORCH_HOME"] = WHISPER_CACHE_DIR

# All credentials come from the environment; nothing is hard-coded.
token = os.environ.get("HF_TOKEN")
together_api_key = os.environ.get("TOGETHER_API_KEY")
ngrok_auth_token = os.environ.get("NGROK_AUTH_TOKEN")

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Models are loaded lazily on the first request; these lists hold the
# singletons, and the lock stops two requests from loading them at once.
pipelines, models = [], []
model_lock = asyncio.Lock()


def load_model_bundle():
    """Load the diarization pipeline and the Whisper model onto the best available device(s)."""
    n = torch.cuda.device_count()
    logger.info(f"🖥️ Found {n} CUDA device(s)")

    if n >= 2:
        # With two or more GPUs, keep diarization and transcription on separate devices.
        device_pyannote = torch.device("cuda:0")
        device_whisper = torch.device("cuda:1")
    else:
        device_pyannote = device_whisper = torch.device("cuda:0" if n == 1 else "cpu")

    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=token,
        cache_dir=HF_CACHE_DIR,
    ).to(device_pyannote)
    model = whisper.load_model("large", download_root=WHISPER_CACHE_DIR).to(device_whisper)

    pipelines.append(pipeline)
    models.append(model)
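
# NOTE: the "large" Whisper checkpoint alone needs roughly 10 GB of VRAM, on
# top of the diarization pipeline. On a single smaller GPU, a lighter
# checkpoint is a reasonable swap, e.g.:
#
#   model = whisper.load_model("medium", download_root=WHISPER_CACHE_DIR)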

together = Together(api_key=together_api_key)
conf.get_default().auth_token = ngrok_auth_token

# Allowlist omegaconf's ListConfig so torch.load(weights_only=True) can
# deserialize pyannote checkpoints that embed it.
add_safe_globals([ListConfig])

UPLOAD_FOLDER = "/tmp/uploads"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

app = FastAPI()

# The CORS allowlist comes from a comma-separated env var, falling back to "*".
origins = os.getenv("CORS_ORIGINS", "").split(",")
origins = [o.strip() for o in origins if o.strip()] or ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
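
# Example allowlist (hypothetical origins):
#   export CORS_ORIGINS="https://app.example.com,http://localhost:3000"
# Browsers reject a wildcard origin on credentialed requests, so set
# CORS_ORIGINS explicitly whenever allow_credentials matters.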
@app.get("/") |
|
|
async def check_api(): |
|
|
async with model_lock: |
|
|
if not pipelines or not models: |
|
|
logger.info("🔁 Lazy loading models now...") |
|
|
await load_model_bundle() |
|
|
|
|
|
return { |
|
|
"status": "running", |
|
|
"models_loaded": { |
|
|
"pipelines": len(pipelines), |
|
|
"whisper_models": len(models) |
|
|
}, |
|
|
"cuda_available": torch.cuda.is_available(), |
|
|
"cuda_devices": torch.cuda.device_count() if torch.cuda.is_available() else 0 |
|
|
} |
|
|
|
|
|
@app.get("/key") |
|
|
async def check_env(): |
|
|
return { |
|
|
"env": os.environ.get("ENV", "dev"), |
|
|
"openai_key_exists": bool(os.environ.get("OPENAI_API_KEY")), |
|
|
} |
|
|
|
|
|


def save_uploaded_file(file: UploadFile) -> str:
    os.makedirs(UPLOAD_FOLDER, exist_ok=True)
    # basename() guards against path traversal in the client-supplied filename.
    filepath = os.path.join(UPLOAD_FOLDER, os.path.basename(file.filename))
    with open(filepath, "wb") as f:
        shutil.copyfileobj(file.file, f)
    return filepath


def extract_and_normalize_audio(video_path: str) -> str:
    clip = VideoFileClip(video_path)
    audio_path = os.path.join(UPLOAD_FOLDER, "extracted_audio.wav")
    clip.audio.write_audiofile(audio_path)
    clip.close()

    # Loudness-normalize the track so quiet speakers transcribe better.
    audio = AudioSegment.from_wav(audio_path)
    normalized_audio = effects.normalize(audio)
    cleaned_path = os.path.join(UPLOAD_FOLDER, "cleaned.wav")
    normalized_audio.export(cleaned_path, format="wav")
    return cleaned_path


def diarize_audio(audio_path: str) -> pd.DataFrame:
    # Use the lazily loaded singleton rather than an undefined global.
    diarization = pipelines[0](audio_path)
    return pd.DataFrame([
        {"start": round(turn.start, 3), "end": round(turn.end, 3), "speaker": speaker}
        for turn, _, speaker in diarization.itertracks(yield_label=True)
    ])
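
# Illustrative output (values are made up):
#        start     end     speaker
#   0    0.497   3.812  SPEAKER_00
#   1    4.105   7.230  SPEAKER_01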


def split_segments(audio_path: str, df: pd.DataFrame) -> str:
    segment_folder = os.path.join(UPLOAD_FOLDER, "segments")
    # Start from a clean folder so stale segments never leak between runs.
    if os.path.exists(segment_folder):
        shutil.rmtree(segment_folder)
    os.makedirs(segment_folder, exist_ok=True)

    audio = AudioSegment.from_file(audio_path)
    for i, row in df.iterrows():
        start_ms = int(row['start'] * 1000)
        end_ms = int(row['end'] * 1000)
        segment = audio[start_ms:end_ms]
        filename = f"segment_{i:03d}_{row['speaker']}.wav"
        segment.export(os.path.join(segment_folder, filename), format="wav")

    return segment_folder


def transcribe_segments(segment_folder: str) -> pd.DataFrame:
    files = sorted(os.listdir(segment_folder))
    results = []
    for filename in files:
        segment_path = os.path.join(segment_folder, filename)
        # Use the lazily loaded Whisper singleton; language is pinned to Thai.
        res = models[0].transcribe(segment_path, language="th")
        results.append({
            "filename": filename,
            "text": res["text"].strip()
        })
    return pd.DataFrame(results)


def clean_summary(text):
    """Strip boilerplate labels, markdown, and filler phrases from an LLM summary."""
    if not text or len(str(text).strip()) == 0:
        return "ไม่มีข้อมูลสำคัญที่จะสรุป"

    text = str(text)

    patterns_to_remove = [
        # Summary header labels (Thai and English, plain and bold).
        r'สรุป:\s*',
        r'สรุปการประชุม:\s*',
        r'บทสรุป:\s*',
        r'ข้อสรุป:\s*',
        r'\*\*Key Messages:\*\*|\*\*หัวข้อหลัก:\*\*',
        r'\*\*Action Items:\*\*|\*\*ประเด็นสำคัญ:\*\*',
        r'\*\*Summary:\*\*|\*\*สรุป:\*\*',

        # Bulleted section headers.
        r'^[-•]\s*Key Messages?:?\s*',
        r'^[-•]\s*Action Items?:?\s*',
        r'^[-•]\s*หัวข้อหลัก:?\s*',
        r'^[-•]\s*ประเด็นสำคัญ:?\s*',
        r'^[-•]\s*ข้อมูลน่าสนใจ:?\s*',
        r'^[-•]\s*บทสรุป:?\s*',

        # Line breaks and tabs.
        r'\r\n|\r|\n',
        r'\t+',

        # "Nothing to summarize"-style disclaimers.
        r'หมายเหตุ:.*?(?=\n|\r|$)',
        r'เนื่องจาก.*?(?=\n|\r|$)',
        r'ไม่มีข้อความ.*?(?=\n|\r|$)',
        r'ไม่มีประเด็น.*?(?=\n|\r|$)',
        r'ไม่มี Action Items.*?(?=\n|\r|$)',
        r'ไม่มีรายการ.*?(?=\n|\r|$)',
        r'ต้องการข้อมูลเพิ่มเติม.*?(?=\n|\r|$)',
        r'ต้องขอความชัดเจนเพิ่มเติม.*?(?=\n|\r|$)',

        # Parenthetical editing notes the model sometimes appends.
        r'\(ตัดประโยคที่ไม่เกี่ยวข้องหรือซ้ำซ้อนออก.*?\)',
        r'\(.*?เพื่อเน้นความชัดเจน.*?\)',

        # Meta phrases that restate the prompt.
        r'ตามที่ได้กล่าวไว้.*?(?=\n|\r|$)',
        r'จากข้อความที่ให้มา.*?(?=\n|\r|$)',
        r'Based on the provided text.*?(?=\n|\r|$)',
        r'According to the text.*?(?=\n|\r|$)',

        # Whitespace runs (handled specially below).
        r'\s+'
    ]

    cleaned_text = text

    for pattern in patterns_to_remove:
        if pattern == r'\s+':
            # Whitespace runs are collapsed to a single space, not deleted.
            cleaned_text = re.sub(pattern, ' ', cleaned_text)
        else:
            cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)

    # Strip markdown emphasis.
    cleaned_text = re.sub(r'\*\*(.*?)\*\*', r'\1', cleaned_text)
    cleaned_text = re.sub(r'\*(.*?)\*', r'\1', cleaned_text)
    cleaned_text = re.sub(r'_{2,}(.*?)_{2,}', r'\1', cleaned_text)

    # Normalize repeated punctuation.
    cleaned_text = re.sub(r'[.]{3,}', '...', cleaned_text)
    cleaned_text = re.sub(r'[!]{2,}', '!', cleaned_text)
    cleaned_text = re.sub(r'[?]{2,}', '?', cleaned_text)

    # Drop leading bullet and numbering markers.
    cleaned_text = re.sub(r'^[-•*]\s*', '', cleaned_text, flags=re.MULTILINE)
    cleaned_text = re.sub(r'^\d+\.\s*', '', cleaned_text, flags=re.MULTILINE)

    # Phrases that mean the summary carries no real content.
    useless_phrases = [
        'ไม่มี',
        'ไม่สามารถสรุปได้',
        'ข้อความต้นฉบับไม่มีความหมาย',
        'ไม่มีข้อมูลเพียงพอ',
        'ไม่มีประเด็นสำคัญ',
        'ไม่มี Action Items',
        'ต้องขอความชัดเจนเพิ่มเติม',
        'ไม่มีข้อมูลที่สำคัญ',
        'ไม่สามารถระบุได้',
        'ข้อมูลไม่ชัดเจน',
        'ไม่มีเนื้อหาที่เกี่ยวข้อง',
        'N/A',
        'n/a',
        'Not applicable',
        'No content',
        'No summary available'
    ]

    cleaned_text = cleaned_text.strip()

    if (len(cleaned_text) < 15 or
        any(phrase.lower() in cleaned_text.lower() for phrase in useless_phrases) or
        cleaned_text.lower() in [phrase.lower() for phrase in useless_phrases]):
        return "ไม่มีข้อมูลสำคัญที่จะสรุปมากพอ"

    # Tidy spacing around sentence-ending punctuation.
    cleaned_text = re.sub(r'\s+([.!?])', r'\1', cleaned_text)
    cleaned_text = re.sub(r'([.!?])\s*([A-Za-zก-๙])', r'\1 \2', cleaned_text)

    return cleaned_text
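
# Illustrative behavior (hypothetical input/output):
#   clean_summary("สรุป: **ทีมตกลงเลื่อนส่งงานไปสัปดาห์หน้า!!**")
#   -> "ทีมตกลงเลื่อนส่งงานไปสัปดาห์หน้า!"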


def summarize_texts(texts, api_key, model="deepseek-ai/DeepSeek-V3", delay=1):
    """Summarize each transcript chunk in Thai via the Together chat API."""
    client = Together(api_key=api_key)
    summaries = []

    for idx, text in enumerate(texts):
        prompt = f"""
สรุปข้อความประชุมนี้เป็นภาษาไทยสั้น ๆ เน้นประเด็นสำคัญ (key messages) และ Action Items โดยตัดรายละเอียดที่ไม่สำคัญออก:

ข้อความ:
{text}

สรุป:
- Key Messages:
- Action Items:
"""
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "คุณเป็นผู้เชี่ยวชาญในการสรุปเนื้อหา ตอบเป็นภาษาไทยเสมอ เน้นหัวข้อหลักและข้อมูลสำคัญ"},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=1024,
                temperature=0.7,
            )
            summary = response.choices[0].message.content.strip()
            summary = clean_summary(summary)
            summaries.append(summary)
        except Exception as e:
            logger.error(f"Summarization failed at index {idx}: {e}")
            summaries.append("ไม่สามารถสรุปได้")

        # Simple rate limiting between consecutive API calls.
        if idx < len(texts) - 1:
            time.sleep(delay)

    return summaries
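
# Minimal usage sketch (hypothetical text; assumes TOGETHER_API_KEY is set):
#   summaries = summarize_texts(["วันนี้คุยเรื่องงบประมาณ ..."], os.environ["TOGETHER_API_KEY"], delay=2)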


@app.post('/video')
async def upload_video_only(file: UploadFile = File(...)):
    # Save the raw upload and return its server-side path; no processing.
    video_path = save_uploaded_file(file)
    return video_path
@app.post("/upload_video/") |
|
|
async def upload_video(file: UploadFile = File(...)): |
|
|
|
|
|
async with model_lock: |
|
|
if not pipelines or not models: |
|
|
logger.info("🔁 Lazy loading models now...") |
|
|
load_model_bundle() |
|
|
|
|
|
video_path = save_uploaded_file(file) |
|
|
audio_path = extract_and_normalize_audio(video_path) |
|
|
df_diarization = diarize_audio(audio_path) |
|
|
segment_folder = split_segments(audio_path, df_diarization) |
|
|
df_transcriptions = transcribe_segments(segment_folder) |
|
|
|
|
|
min_len = min(len(df_diarization), len(df_transcriptions)) |
|
|
df_merged = pd.concat([ |
|
|
df_diarization.iloc[:min_len].reset_index(drop=True), |
|
|
df_transcriptions.iloc[:min_len].reset_index(drop=True) |
|
|
], axis=1) |
|
|
|
|
|
result = df_merged.to_dict(orient="records") |
|
|
speaker_array = df_diarization["speaker"].unique().tolist() |
|
|
counter = Counter(df_diarization["speaker"]) |
|
|
result_array = [{"speaker": spk, "count": cnt} for spk, cnt in counter.most_common()] |
|
|
api_key = together |
|
|
summaries = summarize_texts(df_merged["text"].tolist(), api_key, delay=2) |
|
|
duration_minutes = len(AudioSegment.from_wav(audio_path)) / 1000 / 60 |
|
|
|
|
|
return JSONResponse(content={ |
|
|
"video_path": video_path, |
|
|
"audio_path": audio_path, |
|
|
"audio_length": duration_minutes, |
|
|
"data": result, |
|
|
"speaker_array": speaker_array, |
|
|
"count_speaker": result_array, |
|
|
"num_speakers": len(speaker_array), |
|
|
"summaries": summaries, |
|
|
"total_sentence": len(df_merged['text']), |
|
|
}) |
|
|
|
|
|
|
|
|
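
# Illustrative response shape (values are made up):
# {
#   "video_path": "/tmp/uploads/meeting.mp4",
#   "audio_length": 12.4,
#   "data": [{"start": 0.497, "end": 3.812, "speaker": "SPEAKER_00",
#             "filename": "segment_000_SPEAKER_00.wav", "text": "สวัสดีครับ ..."}],
#   "num_speakers": 2,
#   "summaries": ["..."],
#   ...
# }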


if __name__ == "__main__":
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
        reload=False
    )
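
# Smoke test (hypothetical file name):
#   curl -F "file=@meeting.mp4" http://localhost:7860/upload_video/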