from transformers import pipeline

# Speech recognition: Whisper transcribes the spoken Russian question
asr = pipeline(task="automatic-speech-recognition", model="openai/whisper-base")

def get_text_from_audio(audio):
    # Long recordings are split into 30-second chunks and batched for speed
    output = asr(audio, max_new_tokens=256, chunk_length_s=30, batch_size=8)
    return output["text"]
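# Example usage (hypothetical file path): get_text_from_audio("question_ru.wav")
# returns the Russian transcript as a plain string.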
from transformers import MarianMTModel, MarianTokenizer

# Load the model and tokenizer for Russian-to-English translation
tr_ru_model_name = "Helsinki-NLP/opus-mt-ru-en"
tr_ru_tokenizer = MarianTokenizer.from_pretrained(tr_ru_model_name)
tr_ru_model = MarianMTModel.from_pretrained(tr_ru_model_name)

def translate_ru_to_en(text):
    # Tokenize the input text
    tokenized_text = tr_ru_tokenizer([text], return_tensors="pt")
    # Translate
    translated = tr_ru_model.generate(**tokenized_text)
    # Decode the translated text
    translated_text = tr_ru_tokenizer.decode(translated[0], skip_special_tokens=True)
    return translated_text
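# Example usage: translate_ru_to_en("Что изображено на картинке?") returns
# the English translation as a string.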
import requests
from PIL import Image

# Keep only the most recently loaded image for the VQA step
current_images = []

def load_image(image_url):
    image = Image.open(requests.get(image_url, stream=True).raw)
    if current_images:
        current_images.pop(0)
    current_images.append(image)
    return image
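# Example usage (hypothetical URL): load_image("https://example.com/cat.jpg")
# downloads the picture, remembers it as the current image, and returns it
# as a PIL.Image for Gradio to display.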
from transformers import ViltProcessor, ViltForQuestionAnswering

# Load the VQA processor and model
img_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
img_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

# Answer a question about the most recently loaded image
def ask_question_about_image(question):
    # Prepare the inputs for the model
    encoding = img_processor(current_images[0], text=question, return_tensors="pt")
    # Run the model and take the highest-scoring answer class
    outputs = img_model(**encoding)
    logits = outputs.logits
    idx = logits.argmax(-1).item()
    # Decode the answer label
    answer = img_model.config.id2label[idx]
    return answer
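# Example usage (assumes load_image(...) has been called first):
# ask_question_about_image("How many cats are there?") returns a short
# answer label such as "2".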
from transformers import MarianMTModel, MarianTokenizer

# Load the model and tokenizer for English-to-Russian translation
tr_en_model_name = "Helsinki-NLP/opus-mt-en-ru"
tr_en_tokenizer = MarianTokenizer.from_pretrained(tr_en_model_name)
tr_en_model = MarianMTModel.from_pretrained(tr_en_model_name)

def translate_en_to_ru(text):
    # Tokenize the input text
    tokenized_text = tr_en_tokenizer([text], return_tensors="pt")
    # Translate
    translated = tr_en_model.generate(**tokenized_text)
    # Decode the translated text
    translated_text = tr_en_tokenizer.decode(translated[0], skip_special_tokens=True)
    return translated_text
from transformers import pipeline
import soundfile as sf

# Load a Russian text-to-speech model
tts_pipe = pipeline("text-to-speech", model="facebook/mms-tts-rus")

def text_to_speech(text, output_file="output.wav"):
    output = tts_pipe(text)
    # output["audio"] has shape (1, num_samples); write the mono waveform
    sf.write(output_file, output["audio"][0], samplerate=output["sampling_rate"])
    return output_file
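# Example usage: text_to_speech("Привет!") synthesizes the phrase, writes it
# to output.wav, and returns the file name for Gradio's audio output.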
# Full pipeline: Russian speech -> Russian text -> English question ->
# VQA answer -> Russian answer -> synthesized Russian speech
def transcribe_long_form(filepath):
    if filepath is None:
        gr.Warning("No audio found, please retry.")
        return
    ru_text = get_text_from_audio(filepath)
    eng_text = translate_ru_to_en(ru_text)
    answer = ask_question_about_image(eng_text)
    ru_text_ans = translate_en_to_ru(answer)
    speech_filename = text_to_speech(ru_text_ans)
    return speech_filename
import gradio as gr
demo = gr.Blocks()

mic_transcribe = gr.Interface(
    fn=transcribe_long_form,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs="audio",
    allow_flagging="never",
)

file_load = gr.Interface(
    fn=load_image,
    inputs="text",
    outputs="image",
    allow_flagging="never",
)

with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_load],
        ["Ask About Image", "Load Image"],
    )

# share=True serves a temporary public link in addition to the local URL
demo.launch(share=True)