|
import random |
|
from pydub import AudioSegment |
|
|
|
|
|
|
|
|
|
def get_audio_volume_db(audio, silence_floor_db=-50.0):
    """Estimate the volume in dBFS (decibels relative to full scale) using PyDub.

    Args:
        audio: Object exposing a ``dBFS`` attribute (e.g. a pydub AudioSegment).
        silence_floor_db (float): Value returned in place of ``-inf``, which is
            what ``dBFS`` reports when it equals ``float('-inf')``. Defaults
            to -50.0.

    Returns:
        float: ``audio.dBFS``, or ``silence_floor_db`` when the level is ``-inf``.
    """
    # Read the attribute once: it may be a computed property, so reading it
    # twice would repeat that work for no benefit.
    level = audio.dBFS
    if level == float('-inf'):
        return silence_floor_db
    return level
|
|
|
|
|
def adjust_volume(audio, volume_change_db):
    """Return ``audio`` with ``volume_change_db`` added to it.

    Args:
        audio: The segment to adjust (an AudioSegment in this module's usage).
        volume_change_db: Amount to add; positive is louder, negative is quieter.

    Returns:
        The result of ``audio + volume_change_db``.
    """
    adjusted = audio + volume_change_db
    return adjusted
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def place_in_stereo(audio, pan_value):
    """Pan ``audio`` within the stereo field.

    Args:
        audio: Object providing a ``pan`` method (an AudioSegment here).
        pan_value: Pan position; -1.0 is full left and 1.0 is full right.

    Returns:
        Whatever ``audio.pan(pan_value)`` returns.
    """
    panned = audio.pan(pan_value)
    return panned
|
|
|
|
|
def overlay_audio(speech_audio, noise_audio):
    """Mix ``noise_audio`` on top of ``speech_audio`` using PyDub's overlay.

    Args:
        speech_audio: Base segment; its ``overlay`` method is invoked.
        noise_audio: Segment passed to ``overlay`` as the layer to mix in.

    Returns:
        Whatever ``speech_audio.overlay(noise_audio)`` returns.
    """
    mixed = speech_audio.overlay(noise_audio)
    return mixed
|
|
|
|
|
def _mono_segment(samples, frame_rate):
    """Wrap a sample array's raw bytes in a single-channel AudioSegment."""
    return AudioSegment(
        samples.tobytes(),
        frame_rate=frame_rate,
        # One sample occupies as many bytes as one array element.
        sample_width=samples.dtype.itemsize,
        channels=1,
    )


def _random_noise_window(noise_audio, window_ms):
    """Return a random ``window_ms``-long slice of noise, or all of it if too short."""
    surplus_ms = len(noise_audio) - window_ms
    if surplus_ms <= 0:
        # Noise is no longer than the speech: use it whole, without padding
        # or looping.
        return noise_audio
    start_ms = random.uniform(0, surplus_ms)
    return noise_audio[start_ms:start_ms + window_ms]


def _snr_gain_adjustments(current_snr, target_snr_db):
    """Return clamped ``(speech_gain, noise_gain)`` in dB that move SNR toward the target."""
    adjustment_needed = target_snr_db - current_snr
    if adjustment_needed > 0:
        # SNR too low: boost speech by at most 2 dB and cut noise by at most
        # 5 dB (half of the shortfall).
        return min(adjustment_needed, 2), -min(adjustment_needed / 2, 5)
    # SNR at or above target: lower speech by at most 5 dB and apply a fixed
    # 2.5 dB cut to the noise.
    return max(adjustment_needed, -5), -5 / 2


def process_audio(speech_data, noise_data, speech_sr, noise_sr, alpha, beta,
                  *, target_snr_db=10, noise_frame_rate=8000):
    """
    Process speech and noise audio data to create a mixed audio output.

    Both arrays are wrapped as mono AudioSegments. The noise is cut to a
    random window as long as the speech (or used whole when it is not longer
    than the speech) and resampled. Speech and noise gains are then nudged
    toward ``target_snr_db`` and the noise is overlaid on the speech. The gain
    nudges are clamped, so the target SNR is approached on a best-effort
    basis rather than guaranteed.

    Args:
        speech_data (numpy.ndarray): Speech audio data
        noise_data (numpy.ndarray): Noise audio data
        speech_sr (int): Speech sample rate
        noise_sr (int): Noise sample rate
        alpha (float): Speech volume adjustment, added to the computed
            speech gain
        beta (float): Noise volume adjustment, added to the computed
            noise gain
        target_snr_db (float): Speech-minus-noise level (dB) the automatic
            gains aim for. Defaults to 10.
        noise_frame_rate (int): Frame rate the trimmed noise is resampled to
            before measuring and mixing. Defaults to 8000.

    Returns:
        AudioSegment: Processed audio
    """
    speech_audio = _mono_segment(speech_data, speech_sr)
    noise_audio = _mono_segment(noise_data, noise_sr)

    # len() of a segment is its duration in milliseconds, so the window is
    # computed directly in ms.
    trimmed_noise = _random_noise_window(noise_audio, len(speech_audio))
    trimmed_noise = trimmed_noise.set_frame_rate(noise_frame_rate)

    current_snr = (
        get_audio_volume_db(speech_audio) - get_audio_volume_db(trimmed_noise)
    )
    speech_adjust, noise_adjust = _snr_gain_adjustments(current_snr, target_snr_db)

    adjusted_speech = adjust_volume(speech_audio, speech_adjust + alpha)
    adjusted_noise = adjust_volume(trimmed_noise, noise_adjust + beta)

    return overlay_audio(adjusted_speech, adjusted_noise)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|