grad-svc

Running

App Files Files Community

grad-svc / anyf0 /f0_extractor.py

R-Kentaren

Create anyf0/f0_extractor.py

83a5b06 verified 2 months ago

raw

history blame

7.44 kB

	import dataclasses
	import pathlib

	import audioflux
	import librosa
	import numpy as np
	import parselmouth
	import pyworld as pw
	import resampy
	import torch
	import torchcrepe
	import torchfcpe

	from anyf0.rmvpe import RMVPE


	def hz_to_cents(F, F_ref=55.0):
	"""
	Converts frequency in Hz to cents.

	Parameters
	----------
	F : float or ndarray
	Frequency value in Hz
	F_ref : float
	Reference frequency in Hz (Default value = 55.0)
	Returns
	-------
	F_cents : float or ndarray
	Frequency in cents
	"""

	# Avoid division by 0
	F_temp = np.array(F).astype(float)
	F_temp[F_temp == 0] = np.nan

	F_cents = 1200 * np.log2(F_temp / F_ref)

	return F_cents


	@dataclasses.dataclass
	class F0Extractor:
	wav_path: pathlib.Path
	sample_rate: int = 44100
	hop_length: int = 512
	f0_min: int = 50
	f0_max: int = 1600
	method: str = "praat_ac"
	x: np.ndarray = dataclasses.field(init=False)

	def __post_init__(self):
	self.x, self.sample_rate = librosa.load(self.wav_path, sr=self.sample_rate)

	@property
	def hop_size(self) -> float:
	return self.hop_length / self.sample_rate

	@property
	def wav16k(self) -> np.ndarray:
	return resampy.resample(self.x, self.sample_rate, 16000)

	def extract_f0(self) -> np.ndarray:
	f0 = None
	match self.method:
	case "dio":
	_f0, t = pw.dio(
	self.x.astype("double"),
	self.sample_rate,
	f0_floor=self.f0_min,
	f0_ceil=self.f0_max,
	channels_in_octave=2,
	frame_period=(1000 * self.hop_size),
	)
	f0 = pw.stonemask(self.x.astype("double"), _f0, t, self.sample_rate)
	f0 = f0.astype("float")
	case "harvest":
	f0, _ = pw.harvest(
	self.x.astype("double"),
	self.sample_rate,
	f0_floor=self.f0_min,
	f0_ceil=self.f0_max,
	frame_period=(1000 * self.hop_size),
	)
	f0 = f0.astype("float")
	case "pyin":
	f0, _, _ = librosa.pyin(
	y=self.wav16k,
	fmin=self.f0_min,
	fmax=self.f0_max,
	sr=16000,
	hop_length=80,
	)
	case "piptrack":
	pitches, magnitudes = librosa.piptrack(
	y=self.wav16k,
	fmin=self.f0_min,
	fmax=self.f0_max,
	sr=16000,
	hop_length=80,
	)
	max_indexes = np.argmax(magnitudes, axis=0)
	f0 = pitches[max_indexes, range(magnitudes.shape[1])]
	case "cep" \| "hps" \| "lhs" \| "ncf" \| "pef":
	f0 = {
	"cep": audioflux.PitchCEP,
	"hps": audioflux.PitchHPS,
	"lhs": audioflux.PitchLHS,
	"ncf": audioflux.PitchNCF,
	"pef": audioflux.PitchPEF,
	}[self.method](
	16000,
	low_fre=self.f0_min,
	high_fre=self.f0_max,
	slide_length=80,
	).pitch(np.pad(self.wav16k, (2048, 2048)))
	case "stft":
	f0, _ = audioflux.PitchSTFT(
	16000,
	low_fre=self.f0_min,
	high_fre=self.f0_max,
	slide_length=80,
	).pitch(np.pad(self.wav16k, (2048, 2048)))
	case "yin":
	f0, _, _ = audioflux.PitchYIN(
	16000,
	low_fre=self.f0_min,
	high_fre=self.f0_max,
	slide_length=80,
	).pitch(np.pad(self.wav16k, (2048, 2048)))
	case "torchcrepe":
	device = "cuda" if torch.cuda.is_available() else "cpu"

	wav16k_torch = torch.FloatTensor(self.wav16k).unsqueeze(0).to(device)
	f0 = torchcrepe.predict(
	wav16k_torch,
	sample_rate=16000,
	hop_length=80,
	batch_size=1024,
	fmin=self.f0_min,
	fmax=self.f0_max,
	device=device,
	)
	f0 = f0[0].cpu().numpy()
	case "torchfcpe":
	device = "cuda" if torch.cuda.is_available() else "cpu"
	audio = librosa.to_mono(self.x)
	audio_length = len(audio)
	f0_target_length = (audio_length // self.hop_length) + 1
	audio = torch.from_numpy(audio).float().unsqueeze(0).unsqueeze(-1).to(device)
	model = torchfcpe.spawn_bundled_infer_model(device=device)

	f0 = model.infer(
	audio,
	sr=self.sample_rate,
	decoder_mode='local_argmax',
	threshold=0.006,
	f0_min=self.f0_min,
	f0_max=self.f0_max,
	interp_uv=False,
	output_interp_target_length=f0_target_length,
	)
	f0 = f0.squeeze().cpu().numpy()
	case "rmvpe":
	device = "cuda" if torch.cuda.is_available() else "cpu"
	model_rmvpe = RMVPE(
	"rmvpe.pt",
	is_half=True,
	device=device,
	hop_length=80
	)
	f0 = model_rmvpe.infer_from_audio(self.wav16k, thred=0.03)
	case "praat_ac" \| "praat_cc":
	l_pad = int(np.ceil(1.5 / self.f0_min * self.sample_rate))
	r_pad = int(self.hop_size * ((len(self.x) - 1) // self.hop_size + 1) - len(self.x) + l_pad + 1)
	f0 = (
	getattr(
	parselmouth.Sound(np.pad(self.x, (l_pad, r_pad)), self.sample_rate),
	"to_pitch_" + self.method.rpartition('_')[-1]
	)(
	time_step=self.hop_size,
	voicing_threshold=0.6,
	pitch_floor=self.f0_min,
	pitch_ceiling=self.f0_max,
	)
	.selected_array["frequency"]
	)
	case "praat_shs":
	l_pad = int(np.ceil(1.5 / self.f0_min * self.sample_rate))
	r_pad = int(self.hop_size * ((len(self.x) - 1) // self.hop_size + 1) - len(self.x) + l_pad + 1)
	f0 = parselmouth.Sound(
	np.pad(self.x, (l_pad, r_pad)), self.sample_rate
	).to_pitch_shs(
	time_step=self.hop_size,
	minimum_pitch=self.f0_min,
	maximum_frequency_component=self.f0_max,
	).selected_array["frequency"]
	case _:
	raise ValueError(f"Unknown method: {self.method}")
	return hz_to_cents(f0, librosa.midi_to_hz(0))

	def plot_f0(self, f0):
	from matplotlib import pyplot as plt

	plt.figure(figsize=(10, 4))
	plt.plot(f0)
	plt.title(self.method)
	plt.xlabel("Time (frames)")
	plt.ylabel("F0 (cents)")
	plt.show()