import random
from pydub import AudioSegment  # type: ignore
# from pydub.effects import normalize
# import numpy as np  # type: ignore


def get_audio_volume_db(audio):
    """Return the volume in dBFS (decibels relative to full scale).

    PyDub reports -inf dBFS for digital silence, so fall back to -50 dB
    to keep the SNR arithmetic below finite.
    """
    if audio.dBFS == float('-inf'):
        return -50.0
    return audio.dBFS
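# For scale: a full-amplitude sine wave measures about -3 dBFS, and
# conversational speech recordings often sit somewhere around -20 dBFS.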


def adjust_volume(audio, volume_change_db):
    """Adjusts the volume of an AudioSegment."""
    return audio + volume_change_db
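# pydub overloads "+" as a gain in dB, so e.g. audio + 6 roughly doubles
# the amplitude and audio - 6 roughly halves it.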


# def compress_audio(audio):
#     """Peak-normalize the speech volume. Note: pydub's normalize() is peak
#     normalization, not true compression; for dynamic-range compression see
#     pydub.effects.compress_dynamic_range."""
#     return normalize(audio)


def place_in_stereo(audio, pan_value):
    """Places audio in stereo field (-1.0 = full left, 1.0 = full right)."""
    return audio.pan(pan_value)
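# Unused in process_audio below, but handy for augmentation, e.g.
# place_in_stereo(voice, -0.5) shifts the voice halfway toward the left channel.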


def overlay_audio(speech_audio, noise_audio):
    """Overlays speech and noise using PyDub."""
    return speech_audio.overlay(noise_audio)
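# Note: overlay() keeps the duration of the first segment, so the mix is
# always exactly as long as the speech.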


def process_audio(speech_data, noise_data, speech_sr, noise_sr, alpha, beta):
    """
    Process speech and noise audio data to create a mixed audio output.

    Args:
        speech_data (numpy.ndarray): Speech samples (integer PCM, mono)
        noise_data (numpy.ndarray): Noise samples (integer PCM, mono)
        speech_sr (int): Speech sample rate in Hz
        noise_sr (int): Noise sample rate in Hz
        alpha (float): Extra speech gain in dB, applied on top of the SNR correction
        beta (float): Extra noise gain in dB, applied on top of the SNR correction

    Returns:
        AudioSegment: Speech mixed with noise at roughly the target SNR
    """
    # Convert numpy arrays to AudioSegment
    speech_audio = AudioSegment(
        speech_data.tobytes(),
        frame_rate=speech_sr,
        sample_width=speech_data.dtype.itemsize,
        channels=1
    )

    noise_audio = AudioSegment(
        noise_data.tobytes(),
        frame_rate=noise_sr,
        sample_width=noise_data.dtype.itemsize,
        channels=1
    )

    # Speech duration in milliseconds (len() of an AudioSegment is in ms)
    speech_ms = len(speech_audio)

    # Cut a random noise segment the same length as the speech
    if len(noise_audio) <= speech_ms:
        trimmed_noise = noise_audio
    else:
        start_ms = int(random.uniform(0, len(noise_audio) - speech_ms))
        trimmed_noise = noise_audio[start_ms:start_ms + speech_ms]

    # Band-limit the noise to telephone quality (8 kHz); pydub brings both
    # segments back to a common frame rate when they are overlaid.
    trimmed_noise = trimmed_noise.set_frame_rate(8000)

    # Calculate volumes and adjustments
    speech_vol = get_audio_volume_db(speech_audio)
    noise_vol = get_audio_volume_db(trimmed_noise)

    target_snr_db = 10  # desired speech-to-noise ratio in dB
    current_snr = speech_vol - noise_vol
    adjustment_needed = target_snr_db - current_snr

    if adjustment_needed > 0:  # SNR below target: speech too quiet relative to noise
        speech_adjust = min(adjustment_needed, 2)      # boost speech by at most 2 dB
        noise_adjust = -min(adjustment_needed / 2, 5)  # cut noise by at most 5 dB
    else:  # SNR above target: speech louder than needed
        speech_adjust = max(adjustment_needed, -5)     # cut speech by at most 5 dB
        noise_adjust = -2.5                            # modest fixed noise cut
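    # Worked example: speech at -20 dBFS over noise at -25 dBFS gives an SNR
    # of 5 dB, so adjustment_needed = 5; speech gets the full 2 dB boost and
    # noise is cut by min(5 / 2, 5) = 2.5 dB, landing at a 9.5 dB SNR.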

    # Apply adjustments
    adjusted_speech = adjust_volume(speech_audio, speech_adjust + alpha)
    adjusted_noise = adjust_volume(trimmed_noise, noise_adjust + beta)

    final_audio = overlay_audio(adjusted_speech, adjusted_noise)

    return final_audio


# Example usage (a sketch; scipy is assumed for WAV loading, and the
# alpha/beta values are the traffic settings noted below):
#
# from scipy.io import wavfile
# speech_sr, speech_data = wavfile.read("anushka.wav")
# noise_sr, noise_data = wavfile.read("traffic.wav")
# final_audio = process_audio(speech_data, noise_data, speech_sr, noise_sr,
#                             alpha=-18, beta=-20)
# # Single write operation at the end
# final_audio.export("output-traffic.wav", format="wav")
# print("Processing complete. Check output-traffic.wav!")


# Empirical (alpha, beta) settings per noise type:
#   office:  -18, -20
#   market:  -13, -20
#   traffic: -18, -20