|
from openai import OpenAI |
|
import gradio as gr |
|
from pathlib import Path |
|
import base64 |
|
|
|
# Instruction sent ahead of the user's transcript and webcam frame.
_COACH_PROMPT = "I am in a corporate conference call, give me some useful feedback on how to be more corporate using the following text and image. Be concise and don't provide general advice. If I use profanities point it out to me as something I shouldn't do in a corporate world. Use an extremely corporate, HR-like tone."


def _guess_image_mime(image_b64, default="image/jpeg"):
    """Return the MIME type implied by the leading bytes of a base64 image.

    Recognises PNG, JPEG, GIF and WebP signatures. Returns ``default`` when
    the prefix cannot be decoded or matches none of them.
    """
    try:
        # 24 base64 characters decode to 18 bytes, which covers every
        # signature checked below (WebP needs bytes 8-11).
        header = base64.b64decode(image_b64[:24])
    except (TypeError, ValueError):
        # binascii.Error (bad padding) is a ValueError subclass.
        return default
    if header.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if header.startswith(b"\xff\xd8\xff"):
        return "image/jpeg"
    if header.startswith((b"GIF87a", b"GIF89a")):
        return "image/gif"
    if header[:4] == b"RIFF" and header[8:12] == b"WEBP":
        return "image/webp"
    return default


def call_gpt(client, text, image_b64, *, model="gpt-4-vision-preview", max_tokens=4000):
    """Ask a vision-capable chat model for corporate-style feedback.

    A single user message is sent containing, in order: the fixed coaching
    prompt, ``text``, and the image as an inline ``data:`` URL.

    Args:
        client: Object exposing ``chat.completions.create(...)`` (the file
            passes an ``OpenAI`` client).
        text: What the user said; forwarded verbatim as a text part.
        image_b64: Base64-encoded image bytes.
        model: Chat model name. Defaults to the value the file always used.
            NOTE(review): the vision-preview models appear to have been
            retired by OpenAI; confirm and pass a current model if so.
        max_tokens: Upper bound on the length of the reply.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    # The original always declared image/jpeg; declare the format the bytes
    # actually have, falling back to image/jpeg when it cannot be identified.
    mime_type = _guess_image_mime(image_b64)
    user_content = [
        {"type": "text", "text": _COACH_PROMPT},
        {"type": "text", "text": text},
        {
            "type": "image_url",
            "image_url": {"url": f"data:{mime_type};base64,{image_b64}"},
        },
    ]
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": user_content}],
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content
|
|
|
def process(api_key, audio_file, webcam_file, chatbot):
    """Turn one recording plus one webcam frame into a new chat exchange.

    The recording is sent to the ``whisper-1`` model through the audio
    translations endpoint, the frame is base64-encoded, and both are handed
    to ``call_gpt``. The resulting text and reply are appended to ``chatbot``.

    Args:
        api_key: OpenAI API key typed by the user.
        audio_file: Path of the recorded audio, or a falsy value if absent.
        webcam_file: Path of the captured frame, or a falsy value if absent.
        chatbot: Chat history list; it is extended in place.

    Returns:
        ``(chatbot, None)``. The UI wiring sends the second element to the
        audio component.

    Raises:
        ValueError: If both files are present but ``api_key`` is empty.
    """
    # Nothing to do until both a recording and a frame are available.
    inputs_ready = bool(audio_file) and bool(webcam_file)
    if not inputs_ready:
        return chatbot, None

    if not api_key:
        raise ValueError("API_KEY not set. Not gonna pay for you")

    client = OpenAI(api_key=api_key)

    transcript = client.audio.translations.create(
        model="whisper-1",
        file=Path(audio_file),
    )
    spoken_text = transcript.text

    frame_bytes = Path(webcam_file).read_bytes()
    frame_b64 = base64.b64encode(frame_bytes).decode("utf-8")

    feedback = call_gpt(client, spoken_text, frame_b64)

    # The blank row ahead of the real exchange is kept from the original.
    chatbot.extend([("", None), (spoken_text, feedback)])
    return chatbot, None
|
|
|
|
|
# Markdown shown at the top of the page. The two emoji had been corrupted
# into "π€" and "π₯" by a wrong-charset round trip; they are restored here.
WELCOME_TEXT = """
# Corporate Coach for your Conference Calls

A demo that uses GPT-4V with 🎤+🎥 to tell you how to be a better corporate employee

If you want more tips on how to be a good corporate employee visit www.stefanobaccianella.com
"""
|
|
|
# Extra stylesheet passed to gr.Blocks: hides the element that wraps an
# ".uploading" child inside ".upload-container".
css = """
.upload-container > div:has(> .uploading) {
    display: none !important;
}
"""


# NOTE(review): the nesting below was reconstructed from a paste that had lost
# its indentation; confirm the Row/Column layout against the running app.
with gr.Blocks(css=css) as demo:
    gr.Markdown(WELCOME_TEXT)

    with gr.Row():
        webcam = gr.Image(
            sources=["webcam"],
            streaming=True,
            type="filepath",
        )
        with gr.Column():
            api_key_textbox = gr.Textbox(
                label="OpenAI API KEY",
                type="password",
                value="",
            )
            chatbot = gr.Chatbot(height=500, bubble_full_width=False)
            audio = gr.Audio(sources=["microphone"], type="filepath")

    # Each change of the audio component runs `process`; its second output
    # is written back to the audio component.
    audio.change(
        fn=process,
        inputs=[api_key_textbox, audio, webcam, chatbot],
        outputs=[chatbot, audio],
    )


if __name__ == "__main__":
    demo.launch(show_error=True)