Spaces:

HakimHa
/

wanderJoy

Runtime error

App Files Files Community

HakimHa committed on Jul 20, 2023

Commit

3442c7b

1 Parent(s): 4263327

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -24

app.py CHANGED Viewed

@@ -1,47 +1,42 @@
 import gradio as gr
 from PIL import Image
-from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
-from transformers import GPT2LMHeadModel, GPT2Tokenizer
 import soundfile as sf
 import torch
-# Load pre-trained model and tokenizer for GPT2
-gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
 # Load pre-trained model and processor for Wav2Vec2
 processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
-model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
 # Function to handle text input
 def handle_text(text):
-    # encode the new user input, add the eos_token and return a tensor in Pytorch
-    new_user_input_ids = gpt2_tokenizer.encode(text + gpt2_tokenizer.eos_token, return_tensors='pt')
-    # append the new user input tokens to the chat history
     bot_input_ids = new_user_input_ids
-    # generate a response
-    chat_history_ids = gpt2_model.generate(bot_input_ids, max_length=1000, pad_token_id=gpt2_tokenizer.eos_token_id)
-    # Print the generated chat
-    chat_output = gpt2_tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
     return chat_output
 # Function to handle image input
 def handle_image(img):
-    # This is a placeholder function, replace with your own image processing function
     return "This image seems nice!"
 # Function to handle audio input
 def handle_audio(audio):
-    # load audio
     speech, _ = sf.read(audio)
-    # transcribe speech to text
     input_values = processor(speech, return_tensors="pt").input_values
-    # perform forward pass
-    logits = model(input_values).logits
-    # take argmax and decode
     predicted_ids = torch.argmax(logits, dim=-1)
     transcriptions = processor.decode(predicted_ids[0])
     return handle_text(transcriptions)
@@ -54,7 +49,6 @@ def chatbot(text, img, audio):
     outputs = [o for o in [text_output, img_output, audio_output] if o]
     return "\n".join(outputs)
-# Define the Gradio interface
 iface = gr.Interface(
     fn=chatbot,
     inputs=[
@@ -67,5 +61,4 @@ iface = gr.Interface(
     description="This chatbot can handle text, image, and audio inputs. Try it out!",
 )
-# Launch the Gradio interface
 iface.launch()

 import gradio as gr
 from PIL import Image
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoModelForCausalLM, AutoTokenizer
 import soundfile as sf
 import torch
+model_name_or_path = "bofenghuang/vigogne-falcon-7b-chat"
+tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side="right", use_fast=False)
+tokenizer.pad_token = tokenizer.eos_token
+model = AutoModelForCausalLM.from_pretrained(
+    model_name_or_path,
+    torch_dtype=torch.float16,
+    device_map="auto",
+    trust_remote_code=True,
+)
 # Load pre-trained model and processor for Wav2Vec2
 processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
+wav2vec2_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
 # Function to handle text input
 def handle_text(text):
+    new_user_input_ids = tokenizer.encode(text + tokenizer.eos_token, return_tensors='pt')
     bot_input_ids = new_user_input_ids
+    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)
+    chat_output = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
     return chat_output
 # Function to handle image input
 def handle_image(img):
     return "This image seems nice!"
 # Function to handle audio input
 def handle_audio(audio):
     speech, _ = sf.read(audio)
     input_values = processor(speech, return_tensors="pt").input_values
+    logits = wav2vec2_model(input_values).logits
     predicted_ids = torch.argmax(logits, dim=-1)
     transcriptions = processor.decode(predicted_ids[0])
     return handle_text(transcriptions)
     outputs = [o for o in [text_output, img_output, audio_output] if o]
     return "\n".join(outputs)
 iface = gr.Interface(
     fn=chatbot,
     inputs=[
     description="This chatbot can handle text, image, and audio inputs. Try it out!",
 )
 iface.launch()