Walid-Ahmed committed on
Commit 5eb3adc · verified · 1 Parent(s): 4040430

Upload 3 files

Files changed (3)
  1. app.py +42 -0
  2. packages.txt +1 -0
  3. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,42 @@
+ import torch
+ import numpy as np
+ import gradio as gr
+ from transformers import pipeline
+ from scipy.io import wavfile
+
+ # Pick the device (GPU if available, otherwise CPU)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # Load the image-to-text (captioning) pipeline
+ caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)
+ # Alternative captioning model:
+ # caption_image = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning", device=device)
+
+ # Load the text-to-speech pipeline
+ narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=device)
+
+ def process_image(image):
+     # Generate the caption for the input image
+     caption = caption_image(image)[0]["generated_text"]
+
+     # Synthesize speech from the caption
+     speech = narrator(caption)
+
+     # Scale the float waveform ([-1, 1]) to 16-bit PCM
+     audio_data = (np.array(speech["audio"][0]) * 32767).astype(np.int16)
+
+     # Save the audio to a WAV file
+     audio_path = "caption.wav"
+     wavfile.write(audio_path, rate=speech["sampling_rate"], data=audio_data)
+
+     return caption, audio_path
+
+ # Create the Gradio interface
+ iface = gr.Interface(
+     fn=process_image,
+     inputs=gr.Image(type="pil"),
+     outputs=[gr.Textbox(label="Generated Caption"), gr.Audio(label="Generated Audio", type="filepath")],
+ )
+
+ # Launch the interface
+ iface.launch()
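
For a quick sanity check of the same caption-to-speech flow outside the Gradio UI, here is a minimal sketch; the input file name cat.jpg is an illustrative assumption, not part of the commit:

import numpy as np
from PIL import Image
from scipy.io import wavfile
from transformers import pipeline

captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
tts = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")

image = Image.open("cat.jpg")                    # illustrative input file
caption = captioner(image)[0]["generated_text"]
speech = tts(caption)

# VITS emits a float waveform in [-1, 1]; scale it to 16-bit PCM for WAV output
pcm = (np.array(speech["audio"][0]) * 32767).astype(np.int16)
wavfile.write("caption.wav", rate=speech["sampling_rate"], data=pcm)
print(caption)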
packages.txt ADDED
@@ -0,0 +1 @@
+ espeak
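
The espeak system package supplies the phoneme backend that phonemizer, and therefore the VITS text-to-speech model, depends on at runtime. As a sketch, one way to confirm the backend is visible (EspeakBackend.is_available() is assumed from phonemizer's public API):

from phonemizer.backend import EspeakBackend

# Assumed phonemizer API: returns True when the espeak binary/library is found
if not EspeakBackend.is_available():
    raise RuntimeError("espeak backend not found; ensure espeak is listed in packages.txt")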
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ torch
+ transformers
+ gradio
+ timm
+ scipy
+ phonemizer