# VisualAssistant / app.py
from transformers import pipeline

# Speech-to-text model (Whisper) for transcribing the spoken question.
asr = pipeline(task="automatic-speech-recognition", model="openai/whisper-base")

def get_text_from_audio(audio):
    # Transcribe in 30-second chunks; batching speeds up longer recordings.
    output = asr(audio, max_new_tokens=256, chunk_length_s=30, batch_size=8)
    return output["text"]
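# Hedged usage sketch ("sample_ru.wav" is a hypothetical local recording,
# not part of the app flow):
#   ru_text = get_text_from_audio("sample_ru.wav")
#   print(ru_text)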
from transformers import MarianMTModel, MarianTokenizer

# Model and tokenizer for Russian-to-English translation.
tr_ru_model_name = "Helsinki-NLP/opus-mt-ru-en"
tr_ru_tokenizer = MarianTokenizer.from_pretrained(tr_ru_model_name)
tr_ru_model = MarianMTModel.from_pretrained(tr_ru_model_name)

def translate_ru_to_en(text):
    # Tokenize the input text (the tokenizer is called directly;
    # prepare_seq2seq_batch is deprecated).
    tokenized_text = tr_ru_tokenizer([text], return_tensors="pt")
    # Generate the translation.
    translated = tr_ru_model.generate(**tokenized_text)
    # Decode the generated tokens back into a string.
    translated_text = tr_ru_tokenizer.decode(translated[0], skip_special_tokens=True)
    return translated_text
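# Hedged usage sketch (the sample sentence is illustrative only):
#   print(translate_ru_to_en("Что изображено на картинке?"))
#   # expected to be roughly "What is shown in the picture?"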
import requests
from PIL import Image

# Holds the most recently loaded image (at most one at a time).
current_images = []

def load_image(image_url):
    # Download the image and replace whatever was loaded before.
    image = Image.open(requests.get(image_url, stream=True).raw)
    if current_images:
        current_images.pop(0)
    current_images.append(image)
    return image
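# Hedged usage sketch (the URL is a placeholder, not a real resource):
#   img = load_image("https://example.com/cat.jpg")
#   img.show()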
from transformers import ViltProcessor, ViltForQuestionAnswering

# ViLT processor and model for visual question answering.
img_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
img_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

def ask_question_about_image(question):
    if not current_images:
        raise ValueError("No image loaded; call load_image first.")
    # Encode the image-question pair for the model.
    encoding = img_processor(current_images[0], text=question, return_tensors="pt")
    # Run the model and take the highest-scoring answer class.
    outputs = img_model(**encoding)
    logits = outputs.logits
    idx = logits.argmax(-1).item()
    # Map the class index back to its text label.
    answer = img_model.config.id2label[idx]
    return answer
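# Hedged usage sketch (assumes an image was loaded via load_image first;
# the question is illustrative):
#   print(ask_question_about_image("What color is the cat?"))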
# Model and tokenizer for English-to-Russian translation
# (MarianMTModel and MarianTokenizer are already imported above).
tr_en_model_name = "Helsinki-NLP/opus-mt-en-ru"
tr_en_tokenizer = MarianTokenizer.from_pretrained(tr_en_model_name)
tr_en_model = MarianMTModel.from_pretrained(tr_en_model_name)

def translate_en_to_ru(text):
    # Tokenize the input text.
    tokenized_text = tr_en_tokenizer([text], return_tensors="pt")
    # Generate the translation.
    translated = tr_en_model.generate(**tokenized_text)
    # Decode the generated tokens back into a string.
    translated_text = tr_en_tokenizer.decode(translated[0], skip_special_tokens=True)
    return translated_text
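# Hedged usage sketch (the expected output is approximate):
#   print(translate_en_to_ru("black"))  # roughly "черный"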
import soundfile as sf

# Text-to-speech model for Russian.
tts_pipe = pipeline("text-to-speech", model="facebook/mms-tts-rus")

def text_to_speech(text, output_file="output.wav"):
    # Synthesize speech and write the waveform to a WAV file.
    output = tts_pipe(text)
    sf.write(output_file, output["audio"][0], samplerate=output["sampling_rate"])
    return output_file
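# Hedged usage sketch (writes output.wav to the working directory):
#   wav_path = text_to_speech("Привет, мир")
#   print(wav_path)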
import gradio as gr

# Full pipeline: Russian speech -> Russian text -> English question ->
# VQA answer -> Russian answer -> synthesized Russian speech.
def transcribe_long_form(filepath):
    if filepath is None:
        gr.Warning("No audio found, please retry.")
        return
    ru_text = get_text_from_audio(filepath)
    eng_text = translate_ru_to_en(ru_text)
    answer = ask_question_about_image(eng_text)
    ru_text_ans = translate_en_to_ru(answer)
    speech_filename = text_to_speech(ru_text_ans)
    return speech_filename
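# Hedged end-to-end sketch: load an image, then ask a spoken Russian question
# about it (both the URL and the wav path are placeholders):
#   load_image("https://example.com/cat.jpg")
#   answer_wav = transcribe_long_form("question_ru.wav")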
demo = gr.Blocks()

mic_transcribe = gr.Interface(
    fn=transcribe_long_form,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="audio",
    allow_flagging="never",
)

file_load = gr.Interface(
    fn=load_image,
    inputs="text",
    outputs="image",
    allow_flagging="never",
)

with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_load],
        ["Transcribe Microphone", "Load Image by URL"],
    )

demo.launch(share=True)