import gradio as gr
from transformers import BlipForImageTextRetrieval, AutoProcessor, WhisperForConditionalGeneration, AutoTokenizer
from gtts import gTTS
import speech_recognition as sr
import torch
from PIL import Image  # gr.Image(type="pil") hands the handlers a PIL Image

# Load the models and processors
image_model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco")
image_processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco")
# Loaded but not used by the pipeline below; see the local-transcription sketch at the end
whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
whisper_tokenizer = AutoTokenizer.from_pretrained("openai/whisper-base")

# Image-text matching: decide whether the text describes the image
def image_text_matching(img, text):
    raw_image = img.convert('RGB')
    inputs = image_processor(images=raw_image, text=text, return_tensors="pt")
    outputs = image_model(**inputs)
    # outputs[0] holds the ITM head's (no-match, match) logits per batch item
    result = outputs[0][0]
    softmax_result = torch.softmax(result, dim=0)
    max_index = torch.argmax(softmax_result).item()  # index 1 = "match"
    return 'Match' if max_index == 1 else 'No Match'

# Text-to-speech: synthesize the text with gTTS and return the file path
def text_to_audio(text):
    tts = gTTS(text=text, lang='en')  # change the language to 'ar' for Arabic text
    audio_file = "output.mp3"
    tts.save(audio_file)
    return audio_file

# Speech-to-text: transcribe the uploaded file via the Google Web Speech API
# (requires an internet connection; sr.AudioFile accepts WAV/AIFF/FLAC input)
def audio_to_text(audio):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio) as source:
        audio_data = recognizer.record(source)
    text = recognizer.recognize_google(audio_data, language='ar')
    return text

# Set up the Gradio interface: a single handler fans out to the three tasks
iface = gr.Interface(
    fn=lambda img, text, audio: (
        image_text_matching(img, text),
        text_to_audio(text),
        audio_to_text(audio) if audio else "No audio uploaded"
    ),
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(label="Enter Text"),
        gr.Audio(label="Upload Audio", type="filepath")  # passes a file path to audio_to_text
    ],
    outputs=["text", "audio", "text"],
    title="AI Project: Image-Text Matching and Audio Tasks",
    description="Upload an image and enter text to see if they match. Also, convert text to audio and audio to text."
)

# Launch the interface
iface.launch()
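
# --- Optional: local transcription with the Whisper model loaded above ---
# whisper_model and whisper_tokenizer are loaded but never used; audio_to_text
# goes through the online Google API instead. Below is a minimal sketch of an
# offline alternative, assuming librosa is installed and the audio is loadable.
# It uses WhisperProcessor (feature extractor + tokenizer), since the
# AutoTokenizer loaded above cannot produce input features on its own. The
# name whisper_audio_to_text is illustrative, not part of the original script;
# to wire it into the interface, define it above iface.launch() and swap it in
# for audio_to_text.
from transformers import WhisperProcessor
import librosa

whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-base")

def whisper_audio_to_text(audio_path):
    # Load and resample the file to the 16 kHz mono rate Whisper expects
    speech, _ = librosa.load(audio_path, sr=16000)
    inputs = whisper_processor(speech, sampling_rate=16000, return_tensors="pt")
    # Generate token ids, then decode them to plain text
    predicted_ids = whisper_model.generate(inputs.input_features)
    return whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]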