import os
import re
import shutil
import time
import asyncio
import logging
from collections import Counter

import torch
import whisper
import pandas as pd
import uvicorn
from fastapi import FastAPI, UploadFile, File
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from moviepy.editor import VideoFileClip
from omegaconf import ListConfig
from pyannote.audio import Pipeline
from pydub import AudioSegment, effects
from pyngrok import conf
from together import Together
from torch.serialization import add_safe_globals

# Writable cache locations for downloaded model weights.
HF_CACHE_DIR = "/tmp/hf_cache"
WHISPER_CACHE_DIR = "/tmp/whisper_cache"

os.makedirs(HF_CACHE_DIR, exist_ok=True)
os.makedirs(WHISPER_CACHE_DIR, exist_ok=True)

os.environ["HUGGINGFACE_HUB_CACHE"] = HF_CACHE_DIR
os.environ["TORCH_HOME"] = WHISPER_CACHE_DIR

# All credentials come from the environment; nothing is hard-coded.
token = os.environ.get("HF_TOKEN")
together_api_key = os.environ.get("TOGETHER_API_KEY")
ngrok_auth_token = os.environ.get("NGROK_AUTH_TOKEN")

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Models are loaded lazily on the first request; these lists hold the
# singletons, and the lock stops two requests from loading them at once.
pipelines, models = [], []
model_lock = asyncio.Lock()


def load_model_bundle():
    """Load the diarization pipeline and the Whisper model onto the best available device(s)."""
    n = torch.cuda.device_count()
    logger.info(f"🖥️ Found {n} CUDA device(s)")

    if n >= 2:
        # With two or more GPUs, keep diarization and transcription on separate devices.
        device_pyannote = torch.device("cuda:0")
        device_whisper = torch.device("cuda:1")
    else:
        device_pyannote = device_whisper = torch.device("cuda:0" if n == 1 else "cpu")

    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=token,
        cache_dir=HF_CACHE_DIR,
    ).to(device_pyannote)
    model = whisper.load_model("large", download_root=WHISPER_CACHE_DIR).to(device_whisper)

    pipelines.append(pipeline)
    models.append(model)
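
# NOTE: the "large" Whisper checkpoint alone needs roughly 10 GB of VRAM, on
# top of the diarization pipeline. On a single smaller GPU, a lighter
# checkpoint is a reasonable swap, e.g.:
#
#   model = whisper.load_model("medium", download_root=WHISPER_CACHE_DIR)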

together = Together(api_key=together_api_key)
conf.get_default().auth_token = ngrok_auth_token

# Allowlist omegaconf's ListConfig so torch.load(weights_only=True) can
# deserialize pyannote checkpoints that embed it.
add_safe_globals([ListConfig])

UPLOAD_FOLDER = "/tmp/uploads"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

app = FastAPI()

# The CORS allowlist comes from a comma-separated env var, falling back to "*".
origins = os.getenv("CORS_ORIGINS", "").split(",")
origins = [o.strip() for o in origins if o.strip()] or ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
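
# Example allowlist (hypothetical origins):
#   export CORS_ORIGINS="https://app.example.com,http://localhost:3000"
# Browsers reject a wildcard origin on credentialed requests, so set
# CORS_ORIGINS explicitly whenever allow_credentials matters.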
@app.get("/") |
|
|
async def check_api(): |
|
|
async with model_lock: |
|
|
if not pipelines or not models: |
|
|
logger.info("🔁 Lazy loading models now...") |
|
|
await load_model_bundle() |
|
|
|
|
|
return { |
|
|
"status": "running", |
|
|
"models_loaded": { |
|
|
"pipelines": len(pipelines), |
|
|
"whisper_models": len(models) |
|
|
}, |
|
|
"cuda_available": torch.cuda.is_available(), |
|
|
"cuda_devices": torch.cuda.device_count() if torch.cuda.is_available() else 0 |
|
|
} |
|
|
|
|
|
@app.get("/key") |
|
|
async def check_env(): |
|
|
return { |
|
|
"env": os.environ.get("ENV", "dev"), |
|
|
"openai_key_exists": bool(os.environ.get("OPENAI_API_KEY")), |
|
|
} |
|
|
|
|
|


def save_uploaded_file(file: UploadFile) -> str:
    os.makedirs(UPLOAD_FOLDER, exist_ok=True)
    # basename() guards against path traversal in the client-supplied filename.
    filepath = os.path.join(UPLOAD_FOLDER, os.path.basename(file.filename))
    with open(filepath, "wb") as f:
        shutil.copyfileobj(file.file, f)
    return filepath


def extract_and_normalize_audio(video_path: str) -> str:
    clip = VideoFileClip(video_path)
    audio_path = os.path.join(UPLOAD_FOLDER, "extracted_audio.wav")
    clip.audio.write_audiofile(audio_path)
    clip.close()

    # Loudness-normalize the track so quiet speakers transcribe better.
    audio = AudioSegment.from_wav(audio_path)
    normalized_audio = effects.normalize(audio)
    cleaned_path = os.path.join(UPLOAD_FOLDER, "cleaned.wav")
    normalized_audio.export(cleaned_path, format="wav")
    return cleaned_path


def diarize_audio(audio_path: str) -> pd.DataFrame:
    # Use the lazily loaded singleton rather than an undefined global.
    diarization = pipelines[0](audio_path)
    return pd.DataFrame([
        {"start": round(turn.start, 3), "end": round(turn.end, 3), "speaker": speaker}
        for turn, _, speaker in diarization.itertracks(yield_label=True)
    ])
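
# Illustrative output (values are made up):
#        start     end     speaker
#   0    0.497   3.812  SPEAKER_00
#   1    4.105   7.230  SPEAKER_01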


def split_segments(audio_path: str, df: pd.DataFrame) -> str:
    segment_folder = os.path.join(UPLOAD_FOLDER, "segments")
    # Start from a clean folder so stale segments never leak between runs.
    if os.path.exists(segment_folder):
        shutil.rmtree(segment_folder)
    os.makedirs(segment_folder, exist_ok=True)

    audio = AudioSegment.from_file(audio_path)
    for i, row in df.iterrows():
        start_ms = int(row['start'] * 1000)
        end_ms = int(row['end'] * 1000)
        segment = audio[start_ms:end_ms]
        filename = f"segment_{i:03d}_{row['speaker']}.wav"
        segment.export(os.path.join(segment_folder, filename), format="wav")

    return segment_folder


def transcribe_segments(segment_folder: str) -> pd.DataFrame:
    files = sorted(os.listdir(segment_folder))
    results = []
    for filename in files:
        segment_path = os.path.join(segment_folder, filename)
        # Use the lazily loaded Whisper singleton; language is pinned to Thai.
        res = models[0].transcribe(segment_path, language="th")
        results.append({
            "filename": filename,
            "text": res["text"].strip()
        })
    return pd.DataFrame(results)


def clean_summary(text):
    """Strip boilerplate labels, markdown, and filler phrases from an LLM summary."""
    if not text or len(str(text).strip()) == 0:
        return "ไม่มีข้อมูลสำคัญที่จะสรุป"

    text = str(text)

    patterns_to_remove = [
        # Summary header labels (Thai and English, plain and bold).
        r'สรุป:\s*',
        r'สรุปการประชุม:\s*',
        r'บทสรุป:\s*',
        r'ข้อสรุป:\s*',
        r'\*\*Key Messages:\*\*|\*\*หัวข้อหลัก:\*\*',
        r'\*\*Action Items:\*\*|\*\*ประเด็นสำคัญ:\*\*',
        r'\*\*Summary:\*\*|\*\*สรุป:\*\*',

        # Bulleted section headers.
        r'^[-•]\s*Key Messages?:?\s*',
        r'^[-•]\s*Action Items?:?\s*',
        r'^[-•]\s*หัวข้อหลัก:?\s*',
        r'^[-•]\s*ประเด็นสำคัญ:?\s*',
        r'^[-•]\s*ข้อมูลน่าสนใจ:?\s*',
        r'^[-•]\s*บทสรุป:?\s*',

        # Line breaks and tabs.
        r'\r\n|\r|\n',
        r'\t+',

        # "Nothing to summarize"-style disclaimers.
        r'หมายเหตุ:.*?(?=\n|\r|$)',
        r'เนื่องจาก.*?(?=\n|\r|$)',
        r'ไม่มีข้อความ.*?(?=\n|\r|$)',
        r'ไม่มีประเด็น.*?(?=\n|\r|$)',
        r'ไม่มี Action Items.*?(?=\n|\r|$)',
        r'ไม่มีรายการ.*?(?=\n|\r|$)',
        r'ต้องการข้อมูลเพิ่มเติม.*?(?=\n|\r|$)',
        r'ต้องขอความชัดเจนเพิ่มเติม.*?(?=\n|\r|$)',

        # Parenthetical editing notes the model sometimes appends.
        r'\(ตัดประโยคที่ไม่เกี่ยวข้องหรือซ้ำซ้อนออก.*?\)',
        r'\(.*?เพื่อเน้นความชัดเจน.*?\)',

        # Meta phrases that restate the prompt.
        r'ตามที่ได้กล่าวไว้.*?(?=\n|\r|$)',
        r'จากข้อความที่ให้มา.*?(?=\n|\r|$)',
        r'Based on the provided text.*?(?=\n|\r|$)',
        r'According to the text.*?(?=\n|\r|$)',

        # Whitespace runs (handled specially below).
        r'\s+'
    ]

    cleaned_text = text

    for pattern in patterns_to_remove:
        if pattern == r'\s+':
            # Whitespace runs are collapsed to a single space, not deleted.
            cleaned_text = re.sub(pattern, ' ', cleaned_text)
        else:
            cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)

    # Strip markdown emphasis.
    cleaned_text = re.sub(r'\*\*(.*?)\*\*', r'\1', cleaned_text)
    cleaned_text = re.sub(r'\*(.*?)\*', r'\1', cleaned_text)
    cleaned_text = re.sub(r'_{2,}(.*?)_{2,}', r'\1', cleaned_text)

    # Normalize repeated punctuation.
    cleaned_text = re.sub(r'[.]{3,}', '...', cleaned_text)
    cleaned_text = re.sub(r'[!]{2,}', '!', cleaned_text)
    cleaned_text = re.sub(r'[?]{2,}', '?', cleaned_text)

    # Drop leading bullet and numbering markers.
    cleaned_text = re.sub(r'^[-•*]\s*', '', cleaned_text, flags=re.MULTILINE)
    cleaned_text = re.sub(r'^\d+\.\s*', '', cleaned_text, flags=re.MULTILINE)

    # Phrases that mean the summary carries no real content.
    useless_phrases = [
        'ไม่มี',
        'ไม่สามารถสรุปได้',
        'ข้อความต้นฉบับไม่มีความหมาย',
        'ไม่มีข้อมูลเพียงพอ',
        'ไม่มีประเด็นสำคัญ',
        'ไม่มี Action Items',
        'ต้องขอความชัดเจนเพิ่มเติม',
        'ไม่มีข้อมูลที่สำคัญ',
        'ไม่สามารถระบุได้',
        'ข้อมูลไม่ชัดเจน',
        'ไม่มีเนื้อหาที่เกี่ยวข้อง',
        'N/A',
        'n/a',
        'Not applicable',
        'No content',
        'No summary available'
    ]

    cleaned_text = cleaned_text.strip()

    if (len(cleaned_text) < 15 or
        any(phrase.lower() in cleaned_text.lower() for phrase in useless_phrases) or
        cleaned_text.lower() in [phrase.lower() for phrase in useless_phrases]):
        return "ไม่มีข้อมูลสำคัญที่จะสรุปมากพอ"

    # Tidy spacing around sentence-ending punctuation.
    cleaned_text = re.sub(r'\s+([.!?])', r'\1', cleaned_text)
    cleaned_text = re.sub(r'([.!?])\s*([A-Za-zก-๙])', r'\1 \2', cleaned_text)

    return cleaned_text
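
# Illustrative behavior (hypothetical input/output):
#   clean_summary("สรุป: **ทีมตกลงเลื่อนส่งงานไปสัปดาห์หน้า!!**")
#   -> "ทีมตกลงเลื่อนส่งงานไปสัปดาห์หน้า!"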


def summarize_texts(texts, api_key, model="deepseek-ai/DeepSeek-V3", delay=1):
    """Summarize each transcript chunk in Thai via the Together chat API."""
    client = Together(api_key=api_key)
    summaries = []

    for idx, text in enumerate(texts):
        prompt = f"""
สรุปข้อความประชุมนี้เป็นภาษาไทยสั้น ๆ เน้นประเด็นสำคัญ (key messages) และ Action Items โดยตัดรายละเอียดที่ไม่สำคัญออก:

ข้อความ:
{text}

สรุป:
- Key Messages:
- Action Items:
"""
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "คุณเป็นผู้เชี่ยวชาญในการสรุปเนื้อหา ตอบเป็นภาษาไทยเสมอ เน้นหัวข้อหลักและข้อมูลสำคัญ"},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=1024,
                temperature=0.7,
            )
            summary = response.choices[0].message.content.strip()
            summary = clean_summary(summary)
            summaries.append(summary)
        except Exception as e:
            logger.error(f"Summarization failed at index {idx}: {e}")
            summaries.append("ไม่สามารถสรุปได้")

        # Simple rate limiting between consecutive API calls.
        if idx < len(texts) - 1:
            time.sleep(delay)

    return summaries
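
# Minimal usage sketch (hypothetical text; assumes TOGETHER_API_KEY is set):
#   summaries = summarize_texts(["วันนี้คุยเรื่องงบประมาณ ..."], os.environ["TOGETHER_API_KEY"], delay=2)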


@app.post('/video')
async def upload_video_only(file: UploadFile = File(...)):
    # Save the raw upload and return its server-side path; no processing.
    video_path = save_uploaded_file(file)
    return video_path
@app.post("/upload_video/") |
|
|
async def upload_video(file: UploadFile = File(...)): |
|
|
|
|
|
async with model_lock: |
|
|
if not pipelines or not models: |
|
|
logger.info("🔁 Lazy loading models now...") |
|
|
load_model_bundle() |
|
|
|
|
|
video_path = save_uploaded_file(file) |
|
|
audio_path = extract_and_normalize_audio(video_path) |
|
|
df_diarization = diarize_audio(audio_path) |
|
|
segment_folder = split_segments(audio_path, df_diarization) |
|
|
df_transcriptions = transcribe_segments(segment_folder) |
|
|
|
|
|
min_len = min(len(df_diarization), len(df_transcriptions)) |
|
|
df_merged = pd.concat([ |
|
|
df_diarization.iloc[:min_len].reset_index(drop=True), |
|
|
df_transcriptions.iloc[:min_len].reset_index(drop=True) |
|
|
], axis=1) |
|
|
|
|
|
result = df_merged.to_dict(orient="records") |
|
|
speaker_array = df_diarization["speaker"].unique().tolist() |
|
|
counter = Counter(df_diarization["speaker"]) |
|
|
result_array = [{"speaker": spk, "count": cnt} for spk, cnt in counter.most_common()] |
|
|
api_key = together |
|
|
summaries = summarize_texts(df_merged["text"].tolist(), api_key, delay=2) |
|
|
duration_minutes = len(AudioSegment.from_wav(audio_path)) / 1000 / 60 |
|
|
|
|
|
return JSONResponse(content={ |
|
|
"video_path": video_path, |
|
|
"audio_path": audio_path, |
|
|
"audio_length": duration_minutes, |
|
|
"data": result, |
|
|
"speaker_array": speaker_array, |
|
|
"count_speaker": result_array, |
|
|
"num_speakers": len(speaker_array), |
|
|
"summaries": summaries, |
|
|
"total_sentence": len(df_merged['text']), |
|
|
}) |
|
|
|
|
|
|
|
|
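
# Illustrative response shape (values are made up):
# {
#   "video_path": "/tmp/uploads/meeting.mp4",
#   "audio_length": 12.4,
#   "data": [{"start": 0.497, "end": 3.812, "speaker": "SPEAKER_00",
#             "filename": "segment_000_SPEAKER_00.wav", "text": "สวัสดีครับ ..."}],
#   "num_speakers": 2,
#   "summaries": ["..."],
#   ...
# }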


if __name__ == "__main__":
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
        reload=False
    )
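
# Smoke test (hypothetical file name):
#   curl -F "file=@meeting.mp4" http://localhost:7860/upload_video/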