#%%
import os
import sys
import openai
import gradio as gr

sys.path.append('./')
from gpt_helper import GPT4VisionClass, response_to_json

# Global model handle and a human-readable status string for the UI
model = None
model_status = "Model is not initialized."


def initialize_model(api_key):
    """Create the GPT-4V client once; subsequent calls reuse the same instance."""
    global model, model_status
    if model is None:
        model = GPT4VisionClass(
            key=api_key,
            max_tokens=1024,
            temperature=0.9,
            gpt_model="gpt-4-vision-preview",
            role_msg=("You are a helpful agent with vision capabilities; "
                      "do not respond to objects not depicted in images."),
        )
        model_status = "Model initialized successfully with the provided API key."
    else:
        model_status = "Model has already been initialized."
    return model_status


def add_text(state, query_text, image_paths=None, images=None):
    """Send one user turn to the model and append (query, reply) to the history."""
    if model is None:
        return state, [("Error", "Model is not initialized. Please enter your OpenAI API key.")]
    # The Gradio wiring below only supplies text, so both image arguments are
    # normally None; they are forwarded for programmatic use.
    model.chat(query_text=query_text,
               image_paths=image_paths,
               images=images,
               PRINT_USER_MSG=False,
               PRINT_GPT_OUTPUT=False,
               RESET_CHAT=False,
               RETURN_RESPONSE=True,
               VISUALIZE=False,
               DETAIL='high')
    result = model._get_response_content()
    state.append((query_text, result))
    return state, state


def scenario_button_clicked(scenario_name):
    print(f"Scenario clicked: {scenario_name}")
    return f"Scenario clicked: {scenario_name}"
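
# ---------------------------------------------------------------------------
# The `gpt_helper` module ships with the repo and is not reproduced here.
# For offline testing of the Gradio wiring, a minimal stand-in with the same
# surface this script relies on could look like this (a sketch; the class
# name, defaults, and echo behavior below are assumptions, not the real helper):
class _MockGPT4Vision:
    """Hypothetical drop-in for GPT4VisionClass that simply echoes the query."""

    def __init__(self, key, max_tokens=1024, temperature=0.9,
                 gpt_model="gpt-4-vision-preview", role_msg=""):
        self.role_msg = role_msg
        self._last_response = ""

    def chat(self, query_text, image_paths=None, images=None, **options):
        # A real implementation would call the OpenAI vision endpoint here.
        n_imgs = len(image_paths or images or [])
        self._last_response = f"[mock] got {n_imgs} image(s) and query: {query_text!r}"
        return self._last_response

    def _get_response_content(self):
        return self._last_response
# To use it, assign `model = _MockGPT4Vision(key="dummy")` inside
# `initialize_model` instead of GPT4VisionClass; no API key is consumed.
# ---------------------------------------------------------------------------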

if __name__ == "__main__":
    # Image paths for each subcategory under the two main preference categories
    image_paths = {
        "Semantic Preference": {
            "Color Preference": "./images/semantic/color/4.png",
            "Shape Preference": "./images/semantic/shape/5.png",
            "Category Preference: Fruits and Beverages": "./images/semantic/category/1/5.png",
            "Category Preference: Beverages and Snacks": "./images/semantic/category/2/5.png",
        },
        "Spatial Pattern Preference": {
            "Vertical Line": "./images/spatial-pattern/vertical/5.png",
            "Horizontal Line": "./images/spatial-pattern/horizontal/5.png",
            "Diagonal Line": "./images/spatial-pattern/diagonal/4.png",
            "Quadrants": "./images/spatial-pattern/quadrant/5.png",
        },
    }

    with gr.Blocks() as demo:
        ######## Introduction for the demo
        with gr.Column():
            gr.Markdown("# [Running Examples] Chain-of-Visual-Residuals")
""") gr.Markdown(""" In this paper, we focus on the problem of inferring underlying human preferences from a sequence of raw visual observations in tabletop manipulation environments with a variety of object types, named **V**isual **P**reference **I**nference (**VPI**). To facilitate visual reasoning in the context of manipulation, we introduce the Chain-of-Visual-Residuals (CoVR) method. CoVR employs a prompting mechanism """) with gr.Row(): for category, scenarios in image_paths.items(): with gr.Column(): gr.Markdown(f"## {category}") with gr.Row(wrap=True): for scenario, img_path in scenarios.items(): with gr.Column(layout='horizontal', variant='panel'): gr.Image(value=img_path, tool=None).style(width='33%', margin='5px') gr.Button(scenario, onclick=lambda x=scenario: scenario_button_clicked(x)).style(width='33%', margin='5px') ######## Input OpenAI API Key and display initialization result with gr.Row(): # Use gr.Row for horizontal layout # API Key Input with gr.Column(): openai_gpt4_key = gr.Textbox(label="OpenAI GPT4 Key", type="password", placeholder="sk..", info="You have to provide your own GPT4 keys for this app to function properly") initialize_button = gr.Button("Initialize Model") # Initialization Button and Result Display with gr.Column(): model_status_text = gr.Text(label="Initialize API Result", info="The result of the model initialization will be displayed here.") initialize_button.click(initialize_model, inputs=[openai_gpt4_key], outputs=[model_status_text]) ######## Chatbot chatbot = gr.Chatbot(elem_id="chatbot") state = gr.State([]) with gr.Row(): query_text = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an image").style(container=False) query_text.submit(add_text, inputs=[state, query_text], outputs=[state, chatbot]) query_text.submit(lambda: "", inputs=None, outputs=query_text) demo.launch(share=True, inline=True)