Spaces:

wsntxxn
/

efficient_audio_captioning

Running

wsntxxn

Change to Hugging Face calling

dd3d338 6 months ago

3.38 kB

	from functools import partial
	import gradio as gr
	import torch
	from torchaudio.functional import resample
	from transformers import AutoModel, PreTrainedTokenizerFast


	def load_model(model_name,
	device):
	if model_name == "AudioCaps":
	model = AutoModel.from_pretrained(
	"wsntxxn/effb2-trm-audiocaps-captioning",
	trust_remote_code=True
	).to(device)
	tokenizer = PreTrainedTokenizerFast.from_pretrained(
	"wsntxxn/audiocaps-simple-tokenizer"
	)
	elif model_name == "Clotho":
	model = AutoModel.from_pretrained(
	"wsntxxn/effb2-trm-clotho-captioning",
	trust_remote_code=True
	).to(device)
	tokenizer = PreTrainedTokenizerFast.from_pretrained(
	"wsntxxn/clotho-simple-tokenizer"
	)
	return model, tokenizer


	def infer(file, runner):
	sr, wav = file
	wav = torch.as_tensor(wav)
	if wav.dtype == torch.short:
	wav = wav / 2 ** 15
	elif wav.dtype == torch.int:
	wav = wav / 2 ** 31
	if wav.ndim > 1:
	wav = wav.mean(1)
	wav = resample(wav, sr, runner.target_sr)
	wav_len = len(wav)
	wav = wav.float().unsqueeze(0)
	with torch.no_grad():
	word_idx = runner.model(
	audio=wav,
	audio_length=[wav_len]
	)[0]
	cap = runner.tokenizer.decode(word_idx, skip_special_tokens=True)
	return cap

	# def input_toggle(input_type):
	# if input_type == "file":
	# return gr.update(visible=True), gr.update(visible=False)
	# elif input_type == "mic":
	# return gr.update(visible=False), gr.update(visible=True)

	class InferRunner:

	def __init__(self, model_name):
	self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	self.model, self.tokenizer = load_model(model_name, self.device)
	self.target_sr = self.model.config.sample_rate

	def change_model(self, model_name):
	self.model, self.tokenizer = load_model(model_name, self.device)
	self.target_sr = self.model.config.sample_rate


	def change_model(radio):
	global infer_runner
	infer_runner.change_model(radio)


	with gr.Blocks() as demo:
	with gr.Row():
	gr.Markdown("# Lightweight EfficientNetB2-Transformer Audio Captioning")

	with gr.Row():
	gr.Markdown("""
	[![arXiv](https://img.shields.io/badge/arXiv-2407.14329-brightgreen.svg?style=flat-square)](https://arxiv.org/abs/2407.14329)

	[![github](https://img.shields.io/badge/GitHub-Code-blue?logo=Github&style=flat-square)](https://github.com/wsntxxn/AudioCaption?tab=readme-ov-file#lightweight-effb2-transformer-model)
	""")
	with gr.Row():
	with gr.Column():
	radio = gr.Radio(
	["AudioCaps", "Clotho"],
	value="AudioCaps",
	label="Select model"
	)
	infer_runner = InferRunner(radio.value)
	file = gr.Audio(label="Input", visible=True)
	radio.change(fn=change_model, inputs=[radio,],)
	btn = gr.Button("Run")
	with gr.Column():
	output = gr.Textbox(label="Output")
	btn.click(
	fn=partial(infer,
	runner=infer_runner),
	inputs=[file,],
	outputs=output
	)

	demo.launch()