# Source: Hugging Face Space "thepatch/the-slot-machine"
# (page status when captured: Sleeping; file size: 9,934 bytes)

import gradio as gr
from musiclang_predict import MusicLangPredictor
import random
import subprocess
import os
import torchaudio
import torch
import numpy as np
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write
from pydub import AudioSegment

# Utility Functions
def peak_normalize(y, target_peak=0.97):
    """Scale *y* so that its largest absolute sample equals *target_peak*.

    Args:
        y: Array-like of audio samples.
        target_peak: Desired maximum absolute value of the result.

    Returns:
        A NumPy array with the same shape as *y*. Empty input is returned
        as-is and all-zero (silent) input comes back as zeros, because
        neither has a peak to scale against.
    """
    y = np.asarray(y)
    if y.size == 0:
        # np.max raises on an empty array; there is nothing to normalize.
        return y

    peak = np.max(np.abs(y))
    if peak == 0:
        # Dividing by a zero peak would fill the output with NaNs.
        # Multiplying keeps the zeros and the usual result dtype.
        return target_peak * y

    return target_peak * (y / peak)

def rms_normalize(y, target_rms=0.05):
    """Scale *y* so that its root-mean-square level equals *target_rms*.

    Args:
        y: Array-like of audio samples.
        target_rms: Desired RMS level of the result.

    Returns:
        A NumPy array with the same shape as *y*. Empty input is returned
        as-is and all-zero (silent) input comes back as zeros, because
        neither has a level to scale against.
    """
    y = np.asarray(y)
    if y.size == 0:
        # The mean of an empty array is NaN; there is nothing to normalize.
        return y

    rms = np.sqrt(np.mean(y**2))
    if rms == 0:
        # Dividing by a zero RMS would fill the output with NaNs.
        # Multiplying keeps the zeros and the usual result dtype.
        return y * target_rms

    return y * (target_rms / rms)

def preprocess_audio(waveform):
    """Round-trip *waveform* through NumPy and return it on ``device``.

    Every size-1 dimension is squeezed away and a new leading dimension of
    size 1 is added, so the result always starts with a singleton axis.
    No normalization is applied to the samples.
    """
    # NumPy cannot read accelerator memory, so copy to the host first.
    host_samples = waveform.cpu().squeeze().numpy()
    rebuilt = torch.from_numpy(host_samples)
    return rebuilt.unsqueeze(0).to(device)

def _wrapped_slice(song, start, length):
    """Return *length* samples of *song* from *start*, wrapping to the beginning as often as needed."""
    pieces = [song[..., start:start + length]]
    collected = pieces[0].shape[-1]
    while collected < length:
        piece = song[..., :length - collected]
        if piece.shape[-1] == 0:
            # Empty song: nothing left to wrap around to.
            break
        pieces.append(piece)
        collected += piece.shape[-1]
    if len(pieces) == 1:
        return pieces[0]
    return torch.cat(pieces, dim=-1)


def create_slices(song, sr, slice_duration, bpm, num_slices=5):
    """Cut *song* into one leading slice plus randomly placed slices.

    The first slice always starts at sample 0 and is simply truncated if the
    song is shorter than one slice. Each further slice starts at a random
    offset chosen on a one-bar grid (4 beats per bar at *bpm*) that begins
    one slice length into the song. A slice running past the end of the song
    wraps around to the beginning, so it is always exactly
    ``int(slice_duration * sr)`` samples long.

    Args:
        song: Tensor whose last dimension is time (samples); any leading
            dimensions (e.g. channels) are preserved.
        sr: Sample rate of *song* in Hz.
        slice_duration: Slice length in seconds.
        bpm: Tempo used to derive the bar length for the start grid.
        num_slices: Total number of slices to return; values below 1 still
            return the leading slice.

    Returns:
        List of tensors: the leading slice followed by the random slices.

    Raises:
        ValueError: if random slices are requested from an empty song.
    """
    total_samples = song.shape[-1]
    slice_samples = int(slice_duration * sr)
    # A zero step would make range() raise, so never go below one sample.
    bar_samples = max(int(4 * 60 / bpm * sr), 1)

    # Ensure the first slice is from the beginning of the song
    slices = [song[..., :slice_samples]]
    if num_slices <= 1:
        return slices
    if total_samples == 0:
        raise ValueError("cannot take random slices from an empty song")

    # Measure along the time axis only: counting samples via
    # len(tensor.squeeze()) reports the channel count for stereo audio.
    candidate_starts = range(slice_samples, total_samples, bar_samples)
    if not candidate_starts:
        # Song no longer than one slice: fall back to bar-aligned starts
        # over the whole song instead of failing on an empty range.
        candidate_starts = range(0, total_samples, bar_samples)

    for _ in range(1, num_slices):
        start = random.choice(candidate_starts)
        slices.append(_wrapped_slice(song, start, slice_samples))

    return slices

def calculate_duration(bpm, min_duration=29, max_duration=30):
    """Return a whole number of bars, in seconds, aimed at [min_duration, max_duration].

    A bar is taken to be 4 beats at *bpm*. The bar count is first raised
    until the total reaches *min_duration*, then lowered while it exceeds
    *max_duration*; the upper bound wins when both cannot be met. At least
    one bar is always returned, even if that bar alone is longer than
    *max_duration*.
    """
    beats_per_bar = 4
    seconds_per_minute = 60
    bar_seconds = beats_per_bar * seconds_per_minute / bpm

    # Start from the largest bar count that does not exceed the minimum.
    bar_count = max(min_duration // bar_seconds, 1)

    # Round up to reach the minimum duration.
    while bar_seconds * bar_count < min_duration:
        bar_count += 1

    # Back off one bar at a time if that overshoots the maximum.
    while bar_count > 1 and bar_seconds * bar_count > max_duration:
        bar_count -= 1

    return bar_seconds * bar_count

def generate_music(seed, use_chords, chord_progression, prompt_duration, musicgen_model, num_iterations, bpm):
    """Generate a MusicLang MIDI sketch and let a MusicGen model continue it.

    Pipeline: MusicLang predicts a score, the score is written to MIDI and
    rendered to WAV by the ``fluidsynth`` command-line tool (using the
    relative soundfont path ``font.sf2``), the render is cut into slices,
    MusicGen continues a prompt taken from each slice in turn, and the
    continuations are concatenated and exported as one MP3.

    Args:
        seed: Seed for MusicLang. Anything ``int()`` rejects (e.g. an empty
            string) is replaced by a random seed between 1 and 10000.
        use_chords: When true and *chord_progression* is non-blank, the
            score is predicted from that chord progression.
        chord_progression: Chord progression text handed to MusicLang.
        prompt_duration: Seconds taken from the start of each slice as the
            MusicGen prompt.
        musicgen_model: Dropdown label; the text before the first space is
            used as the pretrained model name.
        num_iterations: Number of continuations to generate and join.
        bpm: Tempo for the MIDI export, slice grid and duration calculation.

    Returns:
        File name of the exported MP3 (``combined_audio_<seed>.mp3``).

    Raises:
        subprocess.CalledProcessError: if ``fluidsynth`` exits with a
            non-zero status.
    """
    try:
        seed = int(seed)
    except (TypeError, ValueError):
        seed = random.randint(1, 10000)

    # UI widgets may deliver numbers as floats; range() needs an int.
    num_iterations = int(num_iterations)

    ml = MusicLangPredictor('musiclang/musiclang-v2')

    nb_tokens = 2048
    temperature = 0.9
    top_p = 1.0

    if use_chords and chord_progression and chord_progression.strip():
        score = ml.predict_chords(
            chord_progression,
            time_signature=(4, 4),
            temperature=temperature,
            topp=top_p,
            rng_seed=seed
        )
    else:
        score = ml.predict(
            nb_tokens=nb_tokens,
            temperature=temperature,
            topp=top_p,
            rng_seed=seed
        )

    midi_filename = f"output_{seed}.mid"
    wav_filename = midi_filename.replace(".mid", ".wav")
    all_audio_files = []

    try:
        score.to_midi(midi_filename, tempo=bpm, time_signature=(4, 4))

        # check=True: fail here with the real cause instead of later, when
        # the missing WAV file cannot be loaded.
        subprocess.run(
            ["fluidsynth", "-ni", "font.sf2", midi_filename, "-F", wav_filename, "-r", "44100"],
            check=True,
        )

        # Load the generated audio
        song, sr = torchaudio.load(wav_filename)
        song = song.to(device)

        # Use the user-provided BPM value for duration calculation
        duration = calculate_duration(bpm)

        # Create slices from the song using the user-provided BPM value
        slices = create_slices(song, sr, 35, bpm, num_slices=5)

        # Load the model
        model_name = musicgen_model.split(" ")[0]
        model_continue = MusicGen.get_pretrained(model_name)

        # Setting generation parameters
        model_continue.set_generation_params(
            use_sampling=True,
            top_k=250,
            top_p=0.0,
            temperature=1.0,
            duration=duration,
            cfg_coef=3
        )

        for i in range(num_iterations):
            # Cycle through the slices when there are more iterations than slices.
            slice_idx = i % len(slices)

            print(f"Running iteration {i + 1} using slice {slice_idx}...")

            prompt_waveform = slices[slice_idx][..., :int(prompt_duration * sr)]
            prompt_waveform = preprocess_audio(prompt_waveform)

            output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
            output = output.cpu()  # Move the output tensor back to CPU

            # Make sure the output tensor has at most 2 dimensions
            if len(output.size()) > 2:
                output = output.squeeze()

            # audio_write appends the format suffix itself and returns the
            # path it wrote, so pass a bare stem and keep that path rather
            # than guessing the final file name.
            written_path = audio_write(
                f'continue_{i}',
                output,
                model_continue.sample_rate,
                strategy="loudness",
                loudness_compressor=True,
            )
            all_audio_files.append(str(written_path))

        # Combine all audio files
        combined_audio = AudioSegment.empty()
        for filename in all_audio_files:
            combined_audio += AudioSegment.from_wav(filename)

        combined_audio_filename = f"combined_audio_{seed}.mp3"
        combined_audio.export(combined_audio_filename, format="mp3")

        return combined_audio_filename
    finally:
        # Clean up temporary files, including when a step above failed.
        for temp_path in (midi_filename, wav_filename, *all_audio_files):
            if os.path.exists(temp_path):
                os.remove(temp_path)

# Compute device shared by the audio helpers: CUDA when available, otherwise CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Markdown blurbs shown inside the collapsible "More Info" accordion of the UI.
# Each one is passed to gr.Markdown as-is, so the text below is user-facing.

# About MusicLang, the model that produces the MIDI score.
musiclang_blurb = """

## MusicLang

MusicLang is a controllable AI MIDI model. It can generate MIDI sequences based on user-provided parameters, or unconditionally.

- [<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" style="vertical-align:middle"> musiclang github](https://github.com/MusicLang/musiclang_predict)

- [<img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" alt="Hugging Face" width="20" style="vertical-align:middle"> musiclang huggingface space](https://huggingface.co/spaces/musiclang/musiclang-predict)

"""

# About MusicGen, the audio model used for the continuations.
musicgen_blurb = """

## musicgen

musicgen is a transformer-based music model that generates audio. It can also do something called a continuation, which was initially meant to extend musicgen outputs beyond 30 seconds. it can be used with any input audio to produce surprising results.

- [<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" width="20" style="vertical-align:middle"> musicGen gitHub](https://github.com/facebookresearch/audiocraft)

"""

# Credits and links for the community fine-tunes listed in the model dropdown.
finetunes_blurb = """

## Fine-tuned Models

the fine-tunes hosted on the huggingface hub are provided collectively by the musicgen discord community. thanks to vanya, mj, hoenn, and of course, lyra.

- [<img src="https://cdn.iconscout.com/icon/free/png-256/discord-3691244-3073764.png" alt="Discord" width="20" style="vertical-align:middle"> musicgen discord](https://discord.gg/93kX8rGZ)

- [<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" style="vertical-align:middle"> fine-tuning colab notebook by lyra](https://colab.research.google.com/drive/13tbcC3A42KlaUZ21qvUXd25SFLu8WIvb)

"""

# Labels offered in the MusicGen model dropdown. generate_music uses the text
# before the first space as the pretrained model name; the parenthesised
# suffix is only a hint for the user.
musicgen_models = [
    "thepatch/vanya_ai_dnb_0.1 (small)",
    "thepatch/budots_remix (small)",
    "thepatch/PhonkV2 (small)",
    "thepatch/bleeps-medium (medium)",
    "thepatch/hoenn_lofi (large)",
]

# Build the Gradio UI: intro text, a collapsible info section, the input
# controls on the left and the audio player on the right.
with gr.Blocks() as iface:
    gr.Markdown("# the-slot machine")
    gr.Markdown("two ai's jamming. warning: outputs will be very strange, likely stupid, and possibly rad.")
    gr.Markdown("this is a musical slot machine. using musiclang, we get a midi output. then, we let a musicgen model continue, semi-randomly, from different sections of the midi track. the slot machine combines em all at the end into something very bizarre. pick a number for the seed between 1 and 10k, or leave it blank to unlock the full rnjesus powers. if you wanna be lame, you can control the chord progression, prompt duration, musicgen model, number of iterations, and BPM.")

    with gr.Accordion("More Info", open=False):
        for blurb in (musiclang_blurb, musicgen_blurb, finetunes_blurb):
            gr.Markdown(blurb)

    with gr.Row():
        # Left column: generation controls.
        with gr.Column():
            seed = gr.Textbox(label="Seed (leave blank for random)", value="")
            use_chords = gr.Checkbox(label="Control Chord Progression", value=False)
            chord_progression = gr.Textbox(label="Chord Progression (e.g., Am CM Dm E7 Am)", visible=True)
            prompt_duration = gr.Dropdown(
                label="Prompt Duration (seconds)",
                choices=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                value=7,
            )
            musicgen_model = gr.Dropdown(
                label="MusicGen Model",
                choices=musicgen_models,
                value=musicgen_models[0],
            )
            num_iterations = gr.Slider(label="Number of Iterations", minimum=1, maximum=10, step=1, value=3)
            bpm = gr.Slider(label="BPM", minimum=60, maximum=200, step=1, value=140)
            generate_button = gr.Button("Generate Music")

        # Right column: the generated result.
        with gr.Column():
            output_audio = gr.Audio(label="Generated Music")

    # The input order must match the parameter order of generate_music.
    generate_button.click(
        fn=generate_music,
        inputs=[
            seed,
            use_chords,
            chord_progression,
            prompt_duration,
            musicgen_model,
            num_iterations,
            bpm,
        ],
        outputs=output_audio,
    )

iface.launch()