# VisualAssistant / app.py
from transformers import pipeline

# Speech-to-text model (Whisper) for transcribing the spoken question.
asr = pipeline(task="automatic-speech-recognition", model="openai/whisper-base")

def get_text_from_audio(audio):
    # Transcribe in 30-second chunks; batching speeds up longer recordings.
    output = asr(audio, max_new_tokens=256, chunk_length_s=30, batch_size=8)
    return output["text"]
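# Hedged usage sketch ("sample_ru.wav" is a hypothetical local recording,
# not part of the app flow):
#   ru_text = get_text_from_audio("sample_ru.wav")
#   print(ru_text)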
from transformers import MarianMTModel, MarianTokenizer

# Model and tokenizer for Russian-to-English translation.
tr_ru_model_name = "Helsinki-NLP/opus-mt-ru-en"
tr_ru_tokenizer = MarianTokenizer.from_pretrained(tr_ru_model_name)
tr_ru_model = MarianMTModel.from_pretrained(tr_ru_model_name)

def translate_ru_to_en(text):
    # Tokenize the input text (the tokenizer is called directly;
    # prepare_seq2seq_batch is deprecated).
    tokenized_text = tr_ru_tokenizer([text], return_tensors="pt")
    # Generate the translation.
    translated = tr_ru_model.generate(**tokenized_text)
    # Decode the generated tokens back into a string.
    translated_text = tr_ru_tokenizer.decode(translated[0], skip_special_tokens=True)
    return translated_text
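# Hedged usage sketch (the sample sentence is illustrative only):
#   print(translate_ru_to_en("Что изображено на картинке?"))
#   # expected to be roughly "What is shown in the picture?"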
import requests
from PIL import Image

# Holds the most recently loaded image (at most one at a time).
current_images = []

def load_image(image_url):
    # Download the image and replace whatever was loaded before.
    image = Image.open(requests.get(image_url, stream=True).raw)
    if current_images:
        current_images.pop(0)
    current_images.append(image)
    return image
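# Hedged usage sketch (the URL is a placeholder, not a real resource):
#   img = load_image("https://example.com/cat.jpg")
#   img.show()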
from transformers import ViltProcessor, ViltForQuestionAnswering

# ViLT processor and model for visual question answering.
img_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
img_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

def ask_question_about_image(question):
    if not current_images:
        raise ValueError("No image loaded; call load_image first.")
    # Encode the image-question pair for the model.
    encoding = img_processor(current_images[0], text=question, return_tensors="pt")
    # Run the model and take the highest-scoring answer class.
    outputs = img_model(**encoding)
    logits = outputs.logits
    idx = logits.argmax(-1).item()
    # Map the class index back to its text label.
    answer = img_model.config.id2label[idx]
    return answer
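# Hedged usage sketch (assumes an image was loaded via load_image first;
# the question is illustrative):
#   print(ask_question_about_image("What color is the cat?"))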
# Model and tokenizer for English-to-Russian translation
# (MarianMTModel and MarianTokenizer are already imported above).
tr_en_model_name = "Helsinki-NLP/opus-mt-en-ru"
tr_en_tokenizer = MarianTokenizer.from_pretrained(tr_en_model_name)
tr_en_model = MarianMTModel.from_pretrained(tr_en_model_name)

def translate_en_to_ru(text):
    # Tokenize the input text.
    tokenized_text = tr_en_tokenizer([text], return_tensors="pt")
    # Generate the translation.
    translated = tr_en_model.generate(**tokenized_text)
    # Decode the generated tokens back into a string.
    translated_text = tr_en_tokenizer.decode(translated[0], skip_special_tokens=True)
    return translated_text
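# Hedged usage sketch (the expected output is approximate):
#   print(translate_en_to_ru("black"))  # roughly "черный"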
import soundfile as sf

# Text-to-speech model for Russian.
tts_pipe = pipeline("text-to-speech", model="facebook/mms-tts-rus")

def text_to_speech(text, output_file="output.wav"):
    # Synthesize speech and write the waveform to a WAV file.
    output = tts_pipe(text)
    sf.write(output_file, output["audio"][0], samplerate=output["sampling_rate"])
    return output_file
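# Hedged usage sketch (writes output.wav to the working directory):
#   wav_path = text_to_speech("Привет, мир")
#   print(wav_path)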
import gradio as gr

# Full pipeline: Russian speech -> Russian text -> English question ->
# VQA answer -> Russian answer -> synthesized Russian speech.
def transcribe_long_form(filepath):
    if filepath is None:
        gr.Warning("No audio found, please retry.")
        return
    ru_text = get_text_from_audio(filepath)
    eng_text = translate_ru_to_en(ru_text)
    answer = ask_question_about_image(eng_text)
    ru_text_ans = translate_en_to_ru(answer)
    speech_filename = text_to_speech(ru_text_ans)
    return speech_filename
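# Hedged end-to-end sketch: load an image, then ask a spoken Russian question
# about it (both the URL and the wav path are placeholders):
#   load_image("https://example.com/cat.jpg")
#   answer_wav = transcribe_long_form("question_ru.wav")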
demo = gr.Blocks()

mic_transcribe = gr.Interface(
    fn=transcribe_long_form,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="audio",
    allow_flagging="never",
)

file_load = gr.Interface(
    fn=load_image,
    inputs="text",
    outputs="image",
    allow_flagging="never",
)

with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_load],
        ["Transcribe Microphone", "Load Image by URL"],
    )

demo.launch(share=True)