# Hugging Face Space: depression detection from speech.
# Pipeline: transcribe audio with a fine-tuned Whisper model, then classify
# the transcript sentence-by-sentence with a fine-tuned BERT classifier.
import re

import gradio as gr
import pandas as pd
import torch
from pydub import AudioSegment
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    WhisperForConditionalGeneration,
    WhisperProcessor,
)
# Load the fine-tuned Whisper speech-to-text model uploaded to the Hub.
# `model` and `processor` are module-level globals consumed by transcribe().
repo_name = "ireneminhee/speech-to-depression"
model = WhisperForConditionalGeneration.from_pretrained(repo_name)
processor = WhisperProcessor.from_pretrained(repo_name)
def transcribe(audio):
    """Convert a speech waveform to text with the fine-tuned Whisper model.

    Uses the module-level `processor` and `model` globals; the input is
    assumed to be 16 kHz audio samples (presumably a 1-D float array —
    TODO confirm against the Gradio caller).
    """
    features = processor(audio, return_tensors="pt", sampling_rate=16000)
    token_ids = model.generate(features.input_features)
    # Decode the first (and only) sequence in the batch.
    return processor.batch_decode(token_ids, skip_special_tokens=True)[0]
# Load the depression-classification model.
def load_model_from_safetensors(model_name, safetensors_path):
    """Build a sequence-classification model + tokenizer and restore
    fine-tuned weights from a .safetensors checkpoint.

    Args:
        model_name: Hub id or local path of the base model (e.g. "klue/bert-base").
        safetensors_path: path to the fine-tuned .safetensors state dict.

    Returns:
        (model, tokenizer) with the model switched to eval mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=model_name)
    # BUG FIX: torch.load() cannot parse the safetensors format — use the
    # safetensors loader instead (installed as a transformers dependency).
    from safetensors.torch import load_file
    state_dict = load_file(safetensors_path)
    model.load_state_dict(state_dict)
    model.eval()  # inference only: disable dropout etc.
    return model, tokenizer
# Per-sentence prediction.
def predict_depression(sentences, model, tokenizer):
    """Classify each sentence with the depression model.

    Args:
        sentences: an iterable of sentence strings. A bare string is also
            accepted and treated as a single sentence (previously a string
            argument was iterated character-by-character).
        model: a sequence-classification model returning `.logits`.
        tokenizer: the matching tokenizer.

    Returns:
        list of (sentence, predicted_label) tuples, where the label is the
        argmax class index of the model logits.
    """
    if isinstance(sentences, str):
        sentences = [sentences]
    results = []
    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():  # inference only — no autograd graph
            outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=-1).item()
        results.append((sentence, prediction))
    return results
# Run the whole pipeline: speech -> transcript -> per-sentence prediction.
def process_audio_and_predict(audio):
    """Transcribe `audio`, split it into sentences, and report the average
    depression prediction as a formatted string.
    """
    # 1. Speech to text with the Whisper model.
    #    BUG FIX: was `transcribe_audio(audio)` — that name is undefined;
    #    the function defined above is `transcribe`.
    text = transcribe(audio)

    # 2. Split the transcript into sentences on ./!/? boundaries.
    #    BUG FIX: previously the raw string was passed on and iterated
    #    character-by-character downstream.
    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
    if not sentences:  # no punctuation found — treat the whole text as one sentence
        sentences = [text]

    # 3. Load the classifier (paths/ids — adjust to your environment).
    safetensors_path = "./model/model.safetensors"  # fine-tuned weights
    model_name = "klue/bert-base"  # base model on the Hub
    clf_model, clf_tokenizer = load_model_from_safetensors(model_name, safetensors_path)

    # 4. Per-sentence depression prediction.
    results = predict_depression(sentences, clf_model, clf_tokenizer)

    # 5. Average the predicted labels and report.
    df_result = pd.DataFrame(results, columns=["Sentence", "Depression_Prediction"])
    average_probability = df_result["Depression_Prediction"].mean()
    return f"Average Depression Probability: {average_probability:.2f}"
# Function wired into the Gradio interface.
def gradio_process_audio(audio_data):
    """Persist the recorded audio to a temp file and run the pipeline.

    `gr.Audio(type="numpy")` delivers a (sample_rate, ndarray) tuple —
    BUG FIX: it was previously written with `f.write(audio_data)`, which
    only works for raw bytes. Both forms are handled now.
    """
    temp_audio_path = "temp_audio.wav"
    if isinstance(audio_data, tuple):
        sample_rate, samples = audio_data
        # Local import: only needed for the microphone path.
        from scipy.io import wavfile
        wavfile.write(temp_audio_path, sample_rate, samples)
    else:
        with open(temp_audio_path, "wb") as f:
            f.write(audio_data)
    # BUG FIX: was `process_audio_and_detect_depression(...)` with undefined
    # globals — the pipeline defined above is `process_audio_and_predict`.
    summary = process_audio_and_predict(temp_audio_path)
    # The pipeline returns only the summary string; Gradio accepts None for
    # the Dataframe output slot.
    return summary, None
# Define the Gradio interface.
interface = gr.Interface(
    fn=gradio_process_audio,  # function invoked on each submission
    inputs=gr.Audio(type="numpy"),  # microphone input -> (sample_rate, ndarray)
    outputs=[
        gr.Textbox(label="Depression Probability"),  # average probability summary
        gr.Dataframe(label="Sentence-wise Analysis")  # per-sentence details
    ],
    title="Depression Detection from Audio",
    description="Record your voice, and the model will analyze the text for depression likelihood."
)

# Launch the app with a public share link.
interface.launch(share=True)