HakimHa committed
Commit a9fada1 · 1 Parent(s): 435902d

Update app.py

Files changed (1):
  app.py (+15 -25)
app.py CHANGED
@@ -1,9 +1,9 @@
 import gradio as gr
 from PIL import Image
-from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, Wav2Vec2Processor, Wav2Vec2ForCTC
 import soundfile as sf
 import torch
-from speech_recognition import AudioFile, Recognizer
+import numpy as np
 
 model_name_or_path = "microsoft/DialoGPT-large"
 
@@ -17,7 +17,9 @@ model = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True,
 )
 
-
+# Initialize the Wav2Vec2 model and processor
+wav2vec2_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
+wav2vec2_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
 
 # Function to handle text input
 def handle_text(text):
@@ -32,36 +34,24 @@ def handle_image(img):
     return "This image seems nice!"
 
 # Function to handle audio input
-def stt(audio: object, language: str) -> str:
-    """Converts speech to text.
-
-    Args:
-        audio: record of user speech
-
-    Returns:
-        text (str): recognized speech of user
-    """
-
-    # Create a Recognizer object
-    r = Recognizer()
-    # Open the audio file
-    with AudioFile(audio) as source:
-        # Listen for the data (load audio to memory)
-        audio_data = r.record(source)
-    # Transcribe the audio using Google's speech-to-text API
-    text = r.recognize_google(audio_data, language=language)
-    return text
+def handle_audio(audio):
+    # gradio Audio returns a tuple (sample_rate, audio_np_array)
+    # we only need the audio data, hence accessing the second element
+    audio = audio[1]
+    input_values = wav2vec2_processor(audio, sampling_rate=16_000, return_tensors="pt").input_values
+    logits = wav2vec2_model(input_values).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    transcriptions = wav2vec2_processor.decode(predicted_ids[0])
+    return handle_text(transcriptions)
 
 def chatbot(text, img, audio):
     text_output = handle_text(text) if text is not None else ''
     img_output = handle_image(img) if img is not None else ''
-    audio_output = handle_text(stt(audio,'english')) if audio is not None else ''
+    audio_output = handle_audio(audio) if audio is not None else ''
 
     outputs = [o for o in [text_output, img_output, audio_output] if o]
     return "\n".join(outputs)
 
-
-
 iface = gr.Interface(
     fn=chatbot,
     inputs=[
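Worth flagging when reviewing this change: handle_audio feeds the raw gradio array to the processor with sampling_rate=16_000, but gradio's Audio component typically returns int16 samples at whatever rate the browser recorded (often 44.1 or 48 kHz), while facebook/wav2vec2-base-960h expects mono float audio at 16 kHz. Below is a minimal sketch of a more defensive version, assuming torchaudio is available for resampling; the name transcribe and the normalization/resampling steps are illustrative additions, not part of this commit.

import numpy as np
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

wav2vec2_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
wav2vec2_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

def transcribe(audio):
    # gradio's Audio component yields (sample_rate, int16 numpy array)
    sample_rate, data = audio
    # Scale int16 PCM to float32 in [-1, 1] and mix stereo down to mono
    data = data.astype(np.float32) / 32768.0
    if data.ndim > 1:
        data = data.mean(axis=1)
    # wav2vec2-base-960h was trained on 16 kHz speech; resample if needed
    if sample_rate != 16_000:
        data = torchaudio.functional.resample(
            torch.from_numpy(data), sample_rate, 16_000
        ).numpy()
    inputs = wav2vec2_processor(data, sampling_rate=16_000, return_tensors="pt")
    with torch.no_grad():  # inference only, no gradients needed
        logits = wav2vec2_model(inputs.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return wav2vec2_processor.decode(predicted_ids[0])

Without the resampling step, a 48 kHz recording is effectively presented to the model at three times its training rate, which tends to produce garbage transcriptions rather than a visible error.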
 
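For orientation only, since the diff never shows the body of handle_text: the canonical way to produce a single DialoGPT reply with AutoModelForCausalLM, per the model card, is roughly the sketch below (reply is a hypothetical name; none of this code is taken from the repo).

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-large")

def reply(user_text):
    # DialoGPT uses the EOS token as a turn separator
    input_ids = tokenizer.encode(user_text + tokenizer.eos_token, return_tensors="pt")
    output_ids = model.generate(
        input_ids,
        max_length=200,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode only the tokens generated after the prompt
    return tokenizer.decode(output_ids[0, input_ids.shape[-1]:], skip_special_tokens=True)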