amaniM's picture
Update app.py
11c161c verified
import gradio as gr
from transformers import BlipForImageTextRetrieval, AutoProcessor, WhisperForConditionalGeneration, AutoTokenizer
from gtts import gTTS
import speech_recognition as sr
import torch
from PIL import Image
# تحميل النماذج والمعالجات
image_model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco")
image_processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco")
whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
whisper_tokenizer = AutoTokenizer.from_pretrained("openai/whisper-base")
# دالة مطابقة الصورة مع النص
def image_text_matching(img, text):
raw_image = img.convert('RGB')
inputs = image_processor(images=raw_image, text=text, return_tensors="pt")
outputs = image_model(**inputs)
result = outputs[0][0]
softmax_result = torch.softmax(result, dim=0)
max_index = torch.argmax(softmax_result).item()
return 'Match' if max_index == 1 else 'No Match'
# دالة تحويل النص إلى صوت
def text_to_audio(text):
tts = gTTS(text=text, lang='en') # يمكنك تعديل اللغة إلى 'ar' للنصوص العربية
audio_file = "output.mp3"
tts.save(audio_file)
return audio_file
# دالة تحويل الصوت إلى نص
def audio_to_text(audio):
recognizer = sr.Recognizer()
with sr.AudioFile(audio) as source:
audio_data = recognizer.record(source)
text = recognizer.recognize_google(audio_data, language='ar')
return text
# إعداد واجهة Gradio
iface = gr.Interface(
fn=lambda img, text, audio: (
image_text_matching(img, text),
text_to_audio(text),
audio_to_text(audio) if audio else "No audio uploaded"
),
inputs=[
gr.Image(type="pil", label="Upload Image"),
gr.Textbox(label="Enter Text"),
gr.Audio(label="Upload Audio", type="filepath") # تعديل هنا
],
outputs=["text", "audio", "text"],
title="AI Project: Image-Text Matching and Audio Tasks",
description="Upload an image and enter text to see if they match. Also, convert text to audio and audio to text."
)
# تشغيل الواجهة
iface.launch()