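# Page-header residue from the hosting site (not part of the module):
# "VigneshDark's picture" / commit message "fix: typo" / commit 82f19d8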
import random
from pydub import AudioSegment # type: ignore
# from pydub.effects import normalize
# import numpy as np # type: ignore
def get_audio_volume_db(audio):
    """Estimate the volume of ``audio`` in dBFS (dB relative to full scale).

    Args:
        audio: Object exposing a ``dBFS`` attribute (e.g. a PyDub
            AudioSegment).

    Returns:
        float: ``audio.dBFS``, or -50.0 when the reported level is negative
        infinity (silence), so callers always get a finite number to do
        arithmetic with.
    """
    # Read dBFS exactly once: on a PyDub AudioSegment it is a computed
    # property, so every access repeats the RMS measurement of the clip.
    level_db = audio.dBFS
    if level_db == float('-inf'):
        return -50.0  # Default to -50 dB for silence
    return level_db
def adjust_volume(audio, volume_change_db):
    """Return ``audio`` with ``volume_change_db`` added to it.

    Args:
        audio: The segment to adjust; must support ``audio + number``.
        volume_change_db: Amount to add (a gain in dB when ``audio`` is a
            PyDub AudioSegment).

    Returns:
        The result of ``audio + volume_change_db``.
    """
    adjusted = audio + volume_change_db
    return adjusted
# def compress_audio(audio):
# """Apply compression to normalize speech volume."""
# return normalize(audio)
def place_in_stereo(audio, pan_value):
    """Pan ``audio`` within the stereo field.

    Args:
        audio: Object providing a ``pan`` method (e.g. a PyDub AudioSegment).
        pan_value: Stereo position; -1.0 = full left, 1.0 = full right.

    Returns:
        Whatever ``audio.pan(pan_value)`` returns.
    """
    panned = audio.pan(pan_value)
    return panned
def overlay_audio(speech_audio, noise_audio):
    """Mix ``noise_audio`` on top of ``speech_audio``.

    Args:
        speech_audio: Base segment; must provide an ``overlay`` method
            (e.g. a PyDub AudioSegment).
        noise_audio: Segment passed to ``speech_audio.overlay``.

    Returns:
        Whatever ``speech_audio.overlay(noise_audio)`` returns.
    """
    mixed = speech_audio.overlay(noise_audio)
    return mixed
def _array_to_mono_segment(samples, frame_rate):
    """Wrap a numpy sample array in a single-channel AudioSegment."""
    # Sample width is taken from the array's dtype (bytes per element).
    # NOTE(review): presumably the arrays hold integer PCM samples -- confirm.
    return AudioSegment(
        samples.tobytes(),
        frame_rate=frame_rate,
        sample_width=samples.dtype.itemsize,
        channels=1
    )


def _snr_gain_adjustments(speech_vol, noise_vol, target_snr):
    """Return (speech_gain_db, noise_gain_db) nudging the mix to target_snr."""
    adjustment_needed = target_snr - (speech_vol - noise_vol)
    if adjustment_needed > 0:
        # Speech too quiet: boost it by at most 2 dB and cut the noise by
        # half the shortfall, capped at 5 dB.
        return min(adjustment_needed, 2), -min(adjustment_needed / 2, 5)
    # Speech at or above the target: cut it by at most 5 dB and lower the
    # noise by a fixed 2.5 dB.
    return max(adjustment_needed, -5), -5 / 2


def process_audio(speech_data, noise_data, speech_sr, noise_sr, alpha, beta,
                  *, target_snr=10, noise_frame_rate=8000):
    """
    Process speech and noise audio data to create a mixed audio output.

    The noise is cut to a random window as long as the speech (or used whole
    when it is not longer than the speech), resampled, level-adjusted
    relative to the speech, and overlaid on the speech.

    Args:
        speech_data (numpy.ndarray): Speech audio data
        noise_data (numpy.ndarray): Noise audio data
        speech_sr (int): Speech sample rate
        noise_sr (int): Noise sample rate
        alpha (float): Speech volume adjustment, added to the automatic
            speech gain
        beta (float): Noise volume adjustment, added to the automatic
            noise gain
        target_snr (float): Speech-minus-noise level (dB) the automatic
            gains aim for. Defaults to 10, the previously hard-coded value.
        noise_frame_rate (int): Frame rate the noise segment is converted to
            before mixing. Defaults to 8000, the previously hard-coded value.

    Returns:
        AudioSegment: Processed audio
    """
    # Convert numpy arrays to AudioSegment
    speech_audio = _array_to_mono_segment(speech_data, speech_sr)
    noise_audio = _array_to_mono_segment(noise_data, noise_sr)

    # len() of an AudioSegment is in milliseconds; work in seconds here.
    speech_duration = len(speech_audio) / 1000.0
    noise_duration = len(noise_audio) / 1000.0

    # Cut noise segment
    if noise_duration <= speech_duration:
        # Noise is not longer than the speech: use all of it (no looping).
        trimmed_noise = noise_audio
    else:
        # Pick a random window of the speech's length; offsets are in ms.
        start_time = random.uniform(0, noise_duration - speech_duration) * 1000  # noqa
        trimmed_noise = noise_audio[start_time:start_time + (speech_duration * 1000)]  # noqa
    # NOTE(review): the original indentation was lost, so it is unclear
    # whether this resampling ran only in the trimming branch; it is applied
    # to both branches here -- confirm. The default rate ignores speech_sr.
    trimmed_noise = trimmed_noise.set_frame_rate(noise_frame_rate)

    # Calculate volumes and adjustments
    speech_vol = get_audio_volume_db(speech_audio)
    noise_vol = get_audio_volume_db(trimmed_noise)
    speech_adjust, noise_adjust = _snr_gain_adjustments(
        speech_vol, noise_vol, target_snr
    )

    # Apply adjustments
    adjusted_speech = adjust_volume(speech_audio, speech_adjust + alpha)
    adjusted_noise = adjust_volume(trimmed_noise, noise_adjust + beta)
    return overlay_audio(adjusted_speech, adjusted_noise)
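# Example usage (outdated: process_audio now takes numpy arrays, sample
# rates, alpha and beta rather than two file paths):
# final_audio = process_audio("anushka.wav", "traffic.wav")
# # Single write operation at the end
# final_audio.export("output-traffic.wav", format="wav")
# print("Processing complete. Check output-traffic.wav!")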
# Value pairs noted per noise type (presumably alpha, beta in dB -- confirm):
# -18, -20 for office
# -13, -20 for market
# -18, -20 for traffic