amaniM committed on
Commit
f932717
·
verified ·
1 Parent(s): a01b8b8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -0
app.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Required libraries: transformers, gradio, torch, gtts, pydub, SpeechRecognition.
# NOTE: `!pip install ...` is IPython/Jupyter shell syntax and is a SyntaxError in a
# plain Python script such as app.py. Install the packages before running instead,
# e.g. list them in requirements.txt or run in a shell:
#   pip install transformers gradio torch gtts pydub SpeechRecognition
3
+
4
+ import gradio as gr
5
+ from transformers import BlipForImageTextRetrieval, AutoProcessor, WhisperForConditionalGeneration, AutoTokenizer
6
+ from gtts import gTTS
7
+ import speech_recognition as sr
8
+ import torch
9
+ from PIL import Image
10
+
11
# Load the pretrained models and their processors once, at import time.
_BLIP_CHECKPOINT = "Salesforce/blip-itm-base-coco"
_WHISPER_CHECKPOINT = "openai/whisper-base"

# BLIP image-text matching model and its paired processor.
image_model = BlipForImageTextRetrieval.from_pretrained(_BLIP_CHECKPOINT)
image_processor = AutoProcessor.from_pretrained(_BLIP_CHECKPOINT)

# Whisper model and tokenizer.
# NOTE(review): neither object is referenced by the functions visible in this
# file (speech-to-text goes through speech_recognition) - confirm they are needed.
whisper_model = WhisperForConditionalGeneration.from_pretrained(_WHISPER_CHECKPOINT)
whisper_tokenizer = AutoTokenizer.from_pretrained(_WHISPER_CHECKPOINT)
16
+
17
# Image-text matching
def image_text_matching(img, text):
    """Report whether ``text`` matches ``img`` according to the BLIP ITM model.

    Args:
        img: Image object exposing ``convert('RGB')`` (the interface passes a
            PIL image), or ``None`` when no image was uploaded.
        text: Text to compare against the image.

    Returns:
        ``'Match'`` when the highest-scoring class index is 1, ``'No Match'``
        otherwise, or ``'No image uploaded'`` when ``img`` is ``None``.
    """
    # Without this guard a missing upload fails with AttributeError on
    # ``None.convert``.
    if img is None:
        return "No image uploaded"

    raw_image = img.convert('RGB')
    inputs = image_processor(images=raw_image, text=text, return_tensors="pt")

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = image_model(**inputs)

    # First output of the model, first item of the batch.
    scores = outputs[0][0]

    # Softmax is monotonic, so argmax over the raw scores picks the same index.
    max_index = torch.argmax(scores).item()
    return 'Match' if max_index == 1 else 'No Match'
26
+
27
# Text-to-speech
def text_to_audio(text, lang='en'):
    """Synthesize ``text`` to an MP3 file with gTTS and return the file path.

    Args:
        text: Text to speak.
        lang: gTTS language code; defaults to ``'en'`` (use ``'ar'`` for
            Arabic text).

    Returns:
        Path of the generated MP3 file, or ``None`` when ``text`` is empty or
        only whitespace (there is nothing to synthesize).
    """
    import tempfile

    # gTTS rejects empty input, so skip synthesis instead of failing the call.
    if not text or not text.strip():
        return None

    tts = gTTS(text=text, lang=lang)

    # A unique file per call: a fixed "output.mp3" would be overwritten by
    # overlapping or subsequent requests.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
        audio_file = tmp.name
    tts.save(audio_file)
    return audio_file
33
+
34
# Speech-to-text
def audio_to_text(audio, language='ar'):
    """Transcribe an audio file with the Google Web Speech recognizer.

    Args:
        audio: Path to the audio file, or a falsy value when nothing was
            uploaded.
        language: Language code passed to ``recognize_google``; defaults to
            ``'ar'``.

    Returns:
        The recognized text, or a human-readable message when no audio was
        given, the file could not be read, the speech was not understood, or
        the recognition service failed.
    """
    if not audio:
        return "No audio uploaded"

    recognizer = sr.Recognizer()

    try:
        with sr.AudioFile(audio) as source:
            audio_data = recognizer.record(source)
    except ValueError:
        # NOTE(review): per the SpeechRecognition docs AudioFile raises
        # ValueError for formats it cannot decode - confirm supported formats.
        return "Unsupported audio format; please upload a WAV, AIFF or FLAC file"

    try:
        return recognizer.recognize_google(audio_data, language=language)
    except sr.UnknownValueError:
        return "Could not understand the audio"
    except sr.RequestError as exc:
        return f"Speech recognition service error: {exc}"
41
+
42
# Build the Gradio interface
def _run_all_tasks(img, text, audio):
    """Run the three demo tasks for one submission.

    Returns a tuple of (match label, path of the synthesized audio,
    transcript or the "No audio uploaded" placeholder), in the order the
    interface outputs are declared.
    """
    match_label = image_text_matching(img, text)
    speech_path = text_to_audio(text)
    if audio:
        transcript = audio_to_text(audio)
    else:
        transcript = "No audio uploaded"
    return match_label, speech_path, transcript


_INPUT_COMPONENTS = [
    gr.Image(type="pil", label="Upload Image"),
    gr.Textbox(label="Enter Text"),
    # type="filepath": the handler receives the audio as a file path.
    gr.Audio(label="Upload Audio", type="filepath"),
]

iface = gr.Interface(
    fn=_run_all_tasks,
    inputs=_INPUT_COMPONENTS,
    outputs=["text", "audio", "text"],
    title="AI Project: Image-Text Matching and Audio Tasks",
    description="Upload an image and enter text to see if they match. Also, convert text to audio and audio to text.",
)

# Launch the interface
iface.launch()