# Source: vrajshroff's Hugging Face Space — app.py (revision cdc5c5a, verified)
import gradio as gr
import soundfile as sf
import wave
from pyannote.audio import Pipeline
import torch
import os
# Load the pretrained speaker-diarization pipeline; the Hugging Face
# access token is read from the HF_AUTH_TOKEN environment variable.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.0",
    use_auth_token=os.getenv("HF_AUTH_TOKEN"),
)

# Run on the GPU when one is available, otherwise fall back to the CPU.
_device = "cuda" if torch.cuda.is_available() else "cpu"
pipeline.to(torch.device(_device))
def process_audio(audio_file):
    """Diarize *audio_file* and return the first speaker's share of talk time.

    The diarization result is also written to ``audio.rttm`` as a side
    effect (one ``SPEAKER`` line per speech turn), then re-read to
    accumulate per-speaker durations.

    Parameters
    ----------
    audio_file : str
        Path to an audio file (WAV) to diarize.

    Returns
    -------
    float
        Percentage (0-100) of total spoken time belonging to the speaker
        of the first speech turn; 0 if no speech was detected.
    """
    diarization = pipeline(audio_file)

    # Persist the result in standard RTTM format (side effect kept for
    # parity with the original behavior; also handy for debugging).
    with open("audio.rttm", "w") as rttm:
        diarization.write_rttm(rttm)

    speaker_durations = {}
    first_speaker = None
    with open("audio.rttm", "r") as file:
        for line in file:
            parts = line.strip().split()
            # RTTM fields: type file chan onset duration <NA> <NA> speaker ...
            # Guard against blank or malformed lines, which would have
            # crashed the original parser on parts[7].
            if len(parts) < 8 or parts[0] != "SPEAKER":
                continue
            speaker = parts[7]
            duration = float(parts[4])
            # Turns appear in the order they were emitted; presumably this
            # is chronological, so the first SPEAKER line belongs to the
            # first person to speak — NOTE(review): confirm for pyannote.
            if first_speaker is None:
                first_speaker = speaker
            speaker_durations[speaker] = speaker_durations.get(speaker, 0.0) + duration

    total_duration = sum(speaker_durations.values())
    first_speaker_duration = speaker_durations.get(first_speaker, 0.0)
    return (first_speaker_duration / total_duration) * 100 if total_duration > 0 else 0
def record_and_process(audio):
    """Save the recorded audio to disk, diarize it, and report the first
    speaker's share of talk time as a human-readable string."""
    if audio is None:
        return "No audio was recorded. Please try again."

    # Gradio's numpy-typed audio component yields a (sample_rate, samples) pair.
    rate, samples = audio
    wav_path = "audio.wav"
    sf.write(wav_path, samples, rate)

    share = process_audio(wav_path)
    return f"Percentage of time spoken by the first speaker: {share:.2f}%"
# Build and launch the Gradio UI: a single audio input (recorded or
# uploaded as a numpy array) mapped to a text output.
_description = (
    "Make sure you are the first person to speak!<br>"
    "You can also use a sample audio file for testing: "
    "<a href='https://www.uclass.psychol.ucl.ac.uk/Release2/Conversation/AudioOnly/wav/M_0025_11y10m_1.wav' "
    "target='_blank'>sample audio</a>."
)

interface = gr.Interface(
    fn=record_and_process,
    inputs=gr.Audio(type="numpy"),
    outputs="text",
    title="See How Much You Talk in a Conversation",
    description=_description,
    allow_flagging="never",
)

interface.launch()