HakimHa committed
Commit 97f8dd4 · 1 Parent(s): 29ac35f

Update app.py

Files changed (1)
  1. app.py +22 -12
app.py CHANGED
@@ -1,25 +1,31 @@
 import gradio as gr
 from PIL import Image
-import speech_recognition as sr
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
 from transformers import GPT2LMHeadModel, GPT2Tokenizer
+import soundfile as sf
+import torch
 
 # Load pre-trained model and tokenizer
-tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-model = GPT2LMHeadModel.from_pretrained("gpt2")
+gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
+
+# Load pre-trained model and processor
+processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
+model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
 
 # Placeholder function to handle text input
 def handle_text(text):
     # encode the new user input, add the eos_token and return a tensor in Pytorch
-    new_user_input_ids = tokenizer.encode(text + tokenizer.eos_token, return_tensors='pt')
+    new_user_input_ids = gpt2_tokenizer.encode(text + gpt2_tokenizer.eos_token, return_tensors='pt')
 
     # append the new user input tokens to the chat history
     bot_input_ids = new_user_input_ids
 
     # generate a response
-    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)
+    chat_history_ids = gpt2_model.generate(bot_input_ids, max_length=1000, pad_token_id=gpt2_tokenizer.eos_token_id)
 
     # Print the generated chat
-    chat_output = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
+    chat_output = gpt2_tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
     return chat_output
 
 # Placeholder function to handle image input
@@ -29,12 +35,16 @@ def handle_image(img):
 
 # Placeholder function to handle audio input
 def handle_audio(audio):
-    # This is a placeholder function, replace with your own audio processing function
-    r = sr.Recognizer()
-    with sr.AudioFile(audio.name) as source:
-        audio_data = r.record(source)
-        text = r.recognize_google(audio_data)
-        return handle_text(text)
+    # load audio
+    speech, _ = sf.read(audio)
+    # transcribe speech to text
+    input_values = processor(speech, return_tensors="pt").input_values
+    # perform forward pass
+    logits = model(input_values).logits
+    # take argmax and decode
+    predicted_ids = torch.argmax(logits, dim=-1)
+    transcriptions = processor.decode(predicted_ids[0])
+    return handle_text(transcriptions)
 
 def chatbot(inputs):
     text, img, audio = inputs
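
For reference, the new Wav2Vec2 transcription path added in this commit can be exercised on its own along the lines below. This is a minimal sketch, not part of the commit: the file name `sample.wav` is a hypothetical test input, and the explicit `sampling_rate` argument and `torch.no_grad()` wrapper are additions here. Note that `facebook/wav2vec2-base-960h` expects 16 kHz mono audio, which the committed `handle_audio` assumes the incoming file already provides.

```python
import soundfile as sf
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Same checkpoint the commit loads at module level
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# "sample.wav" is a hypothetical test file (mono, 16 kHz, as the model expects)
speech, sample_rate = sf.read("sample.wav")

# Passing sampling_rate explicitly avoids the mismatch warning the
# committed code can emit when the rate is left implicit
input_values = processor(speech, sampling_rate=sample_rate,
                         return_tensors="pt").input_values

# Inference only, so no gradients are needed
with torch.no_grad():
    logits = model(input_values).logits

# Greedy CTC decoding: argmax over the vocabulary at each frame
predicted_ids = torch.argmax(logits, dim=-1)
print(processor.decode(predicted_ids[0]))
```

Compared with the removed `recognize_google` call, which ships the audio off to a remote web API, this path runs entirely locally once the checkpoint has been downloaded.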