from transformers import pipeline asr = pipeline(task="automatic-speech-recognition", model="openai/whisper-base") def get_text_from_audio(audio): output = asr(audio, max_new_tokens=256,chunk_length_s=30,batch_size=8) return output['text'] from transformers import MarianMTModel, MarianTokenizer # Загрузка модели и токенизатора для перевода с русского на английский tr_ru_model_name = "Helsinki-NLP/opus-mt-ru-en" tr_ru_tokenizer = MarianTokenizer.from_pretrained(tr_ru_model_name) tr_ru_model = MarianMTModel.from_pretrained(tr_ru_model_name) # Функция для перевода текста def translate_ru_to_en(text): # Токенизация входного текста tokenized_text = tr_ru_tokenizer.prepare_seq2seq_batch([text], return_tensors="pt") # Перевод текста translated = tr_ru_model.generate(**tokenized_text) # Декодирование переведенного текста translated_text = tr_ru_tokenizer.decode(translated[0], skip_special_tokens=True) return translated_text import requests from PIL import Image сurrent_images = [] def load_image(image_url): image = Image.open(requests.get(image_url, stream=True).raw) if сurrent_images: сurrent_images.pop(0) сurrent_images.append(image) return image from transformers import ViltProcessor, ViltForQuestionAnswering # Загрузка процессора и модели VQA img_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa") img_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa") # Функция для получения ответа на вопрос по изображению def ask_question_about_image(question): # Подготовка входных данных для модели encoding = img_processor(сurrent_images[0], text=question, return_tensors="pt") # Получение ответа от модели outputs = img_model(**encoding) logits = outputs.logits idx = logits.argmax(-1).item() # Декодирование ответа answer = img_model.config.id2label[idx] return answer from transformers import MarianMTModel, MarianTokenizer # Загрузка модели и токенизатора для перевода с русского на английский tr_en_model_name = "Helsinki-NLP/opus-mt-en-ru" tr_en_tokenizer = MarianTokenizer.from_pretrained(tr_en_model_name) tr_en_model = MarianMTModel.from_pretrained(tr_en_model_name) # Функция для перевода текста def translate_en_to_ru(text): # Токенизация входного текста tokenized_text = tr_en_tokenizer.prepare_seq2seq_batch([text], return_tensors="pt") # Перевод текста translated = tr_en_model.generate(**tokenized_text) # Декодирование переведенного текста translated_text = tr_en_tokenizer.decode(translated[0], skip_special_tokens=True) return translated_text from transformers import pipeline import torch import io import soundfile as sf import numpy as np # Загружаем TTS-модель для русского языка tts_pipe = pipeline("text-to-speech", model="facebook/mms-tts-rus") def text_to_speech(text, output_file="output.wav"): output = tts_pipe(text) print(output) sf.write(output_file, output['audio'][0], samplerate=output['sampling_rate']) return output_file def transcribe_long_form(filepath): if filepath is None: gr.Warning("No audio found, please retry.") return ru_text = get_text_from_audio(filepath) eng_text = translate_ru_to_en(ru_text) answer = ask_question_about_image(eng_text) ru_text_ans = translate_en_to_ru(answer) speech_filename = text_to_speech(ru_text_ans) return speech_filename import os import gradio as gr import gradio as gr demo = gr.Blocks() mic_transcribe = gr.Interface( fn=transcribe_long_form, inputs=gr.Audio(sources="microphone", type="filepath"), outputs="audio", allow_flagging="never") file_load = gr.Interface( fn=load_image, inputs="text", outputs="image", allow_flagging="never", ) with demo: gr.TabbedInterface( [mic_transcribe, file_load], ["Transcribe Microphone", "Transcribe Audio File"], ) demo.launch(share=True)