Spaces:

Salesforce
/

BLIP2

Running

App Files Files Community

Dongxu Li committed on Feb 1, 2023

Commit

8f68280

1 Parent(s): 30474d6

add generation options.

Browse files

Files changed (5) hide show

.gitattributes +2 -0
app.py +140 -73
house.png +3 -0
sunset.png +3 -0
utils.py +24 -0

.gitattributes CHANGED Viewed

@@ -32,3 +32,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+house.png filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -1,12 +1,12 @@
-from PIL import Image
-import requests
-import json
 import gradio as gr
-from io import BytesIO
 def encode_image(image):
     buffered = BytesIO()
     image.save(buffered, format="JPEG")
@@ -15,16 +15,19 @@ def encode_image(image):
     return buffered
-def query_api(image, prompt, decoding_method):
-    # local host for testing
-    url = "http://34.132.142.70:5000/api/generate"
-    headers = {
-        'User-Agent': 'BLIP-2 HuggingFace Space'
     }
-    data = {"prompt": prompt, "use_nucleus_sampling": decoding_method == "Nucleus sampling"}
     image = encode_image(image)
     files = {"image": image}
@@ -36,80 +39,144 @@ def query_api(image, prompt, decoding_method):
         return "Error: " + response.text
-def prepend_question(text):
-    text = text.strip().lower()
-    return "question: " + text
-def prepend_answer(text):
-    text = text.strip().lower()
-    return "answer: " + text
-def get_prompt_from_history(history):
-    prompts = []
-    for i in range(len(history)):
-        if i % 2 == 0:
-            prompts.append(prepend_question(history[i]))
-        else:
-            prompts.append(prepend_answer(history[i]))
-    return "\n".join(prompts)
-def postp_answer(text):
-    if text.startswith("answer: "):
-        return text[8:]
-    elif text.startswith("a: "):
-        return text[2:]
-    else:
-        return text
-def prep_question(text):
-    if text.startswith("question: "):
-        text = text[10:]
-    elif text.startswith("q: "):
-        text = text[2:]
-    if not text.endswith("?"):
-        text += "?"
-    return text
-def inference(image, text_input, decoding_method, history=[]):
-    text_input = prep_question(text_input)
-    history.append(text_input)
-    # prompt = '\n'.join(history)
-    prompt = get_prompt_from_history(history)
-    # print("prompt: " + prompt)
-    output = query_api(image, prompt, decoding_method)
-    output = [postp_answer(output[0])]
-    history += output
-    chat = [(history[i], history[i+1]) for i in range(0, len(history)-1, 2)]  # convert to tuples of list
-    return chat, history
-inputs = [gr.inputs.Image(type='pil'),
-          gr.inputs.Textbox(lines=2, label="Text input"),
-          gr.inputs.Radio(choices=['Nucleus sampling','Beam search'], type="value", default="Nucleus sampling", label="Text Decoding Method"),
-          "state",
-         ]
-outputs = ["chatbot", "state"]
-title = "BLIP-2"
 description = """Gradio demo for BLIP-2, a multimodal chatbot from Salesforce Research. To use it, simply upload your image, or click one of the examples to load them. Please visit our <a href='https://github.com/salesforce/LAVIS/tree/main/projects/blip2' target='_blank'>project webpage</a>.</p>
 <p> <strong>Disclaimer</strong>: This is a research prototype and is not intended for production use. No data including but not restricted to text and images is collected. </p>"""
 article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2201.12086' target='_blank'>BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models</a>"
-iface = gr.Interface(inference, inputs, outputs, title=title, description=description, article=article)
-iface.launch(enable_queue=True)

+from io import BytesIO
+import string
 import gradio as gr
+import requests
+from PIL import Image
+from utils import Endpoint
 def encode_image(image):
     buffered = BytesIO()
     image.save(buffered, format="JPEG")
     return buffered
+def query_api(image, prompt, decoding_method, temperature, len_penalty, repetition_penalty):
+    url = endpoint.url
+    headers = {"User-Agent": "BLIP-2 HuggingFace Space"}
+    data = {
+        "prompt": prompt,
+        "use_nucleus_sampling": decoding_method == "Nucleus sampling",
+        "temperature": temperature,
+        "length_penalty": len_penalty,
+        "repetition_penalty": repetition_penalty,
     }
     image = encode_image(image)
     files = {"image": image}
         return "Error: " + response.text
+def postprocess_output(output):
+    """Make sure the first generated text ends with a punctuation mark.
+
+    Args:
+        output: Normally a list whose first item is the generated text. A
+            bare string (the ``"Error: ..."`` value ``query_api`` returns on
+            failure) is also accepted.
+
+    Returns:
+        A list of strings. A string input is wrapped in a one-item list so
+        the caller can extend the history with it as a single message. Empty
+        results are returned unchanged instead of raising ``IndexError``.
+    """
+    if isinstance(output, str):
+        # Error text: indexing a str would inspect a single character and
+        # the in-place "+=" on output[0] would raise TypeError.
+        return [output]
+    # If the last character is not punctuation, add a full stop.
+    if output and output[0] and output[0][-1] not in string.punctuation:
+        output[0] += "."
+    return output
+def inference(
+    image,
+    text_input,
+    decoding_method,
+    temperature,
+    length_penalty,
+    repetition_penalty,
+    history=None,
+):
+    """Run one chat turn against the remote model.
+
+    The new user text is appended to ``history``, the whole history is
+    joined with spaces to form the prompt, and the post-processed model
+    output is appended to ``history`` as well.
+
+    Args:
+        image: Image forwarded to ``query_api``.
+        text_input: The user's message for this turn.
+        decoding_method: Forwarded to ``query_api``.
+        temperature: Forwarded to ``query_api``.
+        length_penalty: Forwarded to ``query_api``.
+        repetition_penalty: Forwarded to ``query_api``.
+        history: Flat list of alternating user/model messages. It is
+            extended in place when given; a new list is used when omitted.
+
+    Returns:
+        ``(chat, history)`` where ``chat`` is a list of
+        ``(user_message, model_message)`` tuples built from ``history``.
+    """
+    # A fresh list per call: a mutable default would be shared by every call
+    # that omits ``history`` and leak one conversation into the next.
+    if history is None:
+        history = []
+    history.append(text_input)
+    prompt = " ".join(history)
+    output = query_api(image, prompt, decoding_method, temperature, length_penalty, repetition_penalty)
+    output = postprocess_output(output)
+    history += output
+    # Pair consecutive entries: even index = user turn, odd index = reply.
+    chat = [
+        (history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)
+    ]
+    return chat, history
+# image source: https://m.facebook.com/112483753737319/photos/112489593736735/
+endpoint = Endpoint()
+examples = [
+    ["house.png", "How could someone get out of the house?"],
+    [
+        "sunset.png",
+        "Write a romantic message that goes along this photo.",
+    ],
+]
+# outputs = ["chatbot", "state"]
+title = """<h1 align="center">BLIP-2</h1>"""
 description = """Gradio demo for BLIP-2, a multimodal chatbot from Salesforce Research. To use it, simply upload your image, or click one of the examples to load them. Please visit our <a href='https://github.com/salesforce/LAVIS/tree/main/projects/blip2' target='_blank'>project webpage</a>.</p>
 <p> <strong>Disclaimer</strong>: This is a research prototype and is not intended for production use. No data including but not restricted to text and images is collected. </p>"""
 article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2201.12086' target='_blank'>BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models</a>"
+# iface = gr.Interface(inference, inputs, outputs, title=title, description=description, article=article, examples=examples)
+def reset_all(text_input, image_input, chatbot, history):
+    """Return cleared values for the text box, image, chatbot and history."""
+    cleared_text, cleared_image, cleared_chat = "", None, None
+    fresh_history = []
+    return cleared_text, cleared_image, cleared_chat, fresh_history
+def reset_chatbot(chatbot, history):
+    """Return cleared values for the chatbot display and the history."""
+    cleared_chat = None
+    fresh_history = []
+    return cleared_chat, fresh_history
+# Build the demo UI: inputs and generation options on the left, the chat
+# window with its Clear/Submit buttons on the right.
+with gr.Blocks() as iface:
+    # Per-session conversation history handed to and returned by `inference`.
+    state = gr.State([])
+    gr.Markdown(title)
+    gr.Markdown(description)
+    gr.Markdown(article)
+    with gr.Row():
+        with gr.Column():
+            image_input = gr.Image(type="pil")
+            text_input = gr.Textbox(lines=2, label="Text input")
+            sampling = gr.Radio(
+                choices=["Beam search", "Nucleus sampling"],
+                value="Beam search",
+                label="Text Decoding Method",
+                interactive=True,
+            )
+            with gr.Row():
+                temperature = gr.Slider(
+                    minimum=0.5,
+                    maximum=1.0,
+                    value=0.8,
+                    interactive=True,
+                    label="Temperature",
+                )
+                len_penalty = gr.Slider(
+                    minimum=-2.0,
+                    maximum=2.0,
+                    value=1.0,
+                    step=0.5,
+                    interactive=True,
+                    label="Length Penalty",
+                )
+                rep_penalty = gr.Slider(
+                    minimum=1.0,
+                    maximum=10.0,
+                    value=1.0,
+                    step=0.5,
+                    interactive=True,
+                    label="Repetition Penalty",
+                )
+        with gr.Column():
+            chatbot = gr.Chatbot()
+            with gr.Row():
+                clear_button = gr.Button(value="Clear", interactive=True)
+                clear_button.click(
+                    reset_all,
+                    [text_input, image_input, chatbot, state],
+                    [text_input, image_input, chatbot, state],
+                )
+                submit_button = gr.Button(value="Submit", interactive=True, variant="primary")
+                # Inputs are matched to `inference` parameters by position,
+                # so every parameter needs its component here. Without
+                # `rep_penalty` the `state` value would be received as
+                # `repetition_penalty` and `history` would fall back to its
+                # default.
+                submit_button.click(
+                    inference,
+                    [
+                        image_input,
+                        text_input,
+                        sampling,
+                        temperature,
+                        len_penalty,
+                        rep_penalty,
+                        state,
+                    ],
+                    [chatbot, state],
+                )
+    # A new image starts a new conversation.
+    image_input.change(reset_chatbot, [chatbot, state], [chatbot, state])
+    examples = gr.Examples(
+        examples=examples,
+        inputs=[image_input, text_input],
+    )
+iface.queue(concurrency_count=1)
+iface.launch(enable_queue=True, debug=True)

house.png ADDED Viewed

Git LFS Details

SHA256: a7b8999524f8f178a43d3417b9f7dfa80d8aff7ccb7ea1b5ba0e5f96bc17bdc0
Pointer size: 132 Bytes
Size of remote file: 1.24 MB

sunset.png ADDED Viewed

Git LFS Details

SHA256: 9a3778b1890ee461c7b052a5f25ce566ffbd706d6c2beb7280f1393052808008
Pointer size: 130 Bytes
Size of remote file: 78 kB

utils.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import requests
+class Endpoint:
+    """Resolve the inference endpoint URL from a remote JSON config.
+
+    The URL is fetched on first access of ``url`` and cached on the
+    instance afterwards.
+    """
+
+    def __init__(self):
+        self.config_path = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/projects/blip2/config.json"
+        # Cached endpoint URL; None until the first successful lookup.
+        self._url = None
+
+    @property
+    def url(self):
+        """Return the endpoint URL, fetching it once and then reusing it."""
+        if self._url is None:
+            self._url = self.get_url()
+        return self._url
+
+    def get_url(self, timeout=10):
+        """Download the config and return its ``"endpoint"`` entry.
+
+        Args:
+            timeout: Seconds to wait for the config server before giving up;
+                without a timeout ``requests`` may block indefinitely.
+
+        Raises:
+            requests.RequestException: On connection errors, timeouts or a
+                non-success HTTP status.
+            KeyError: If the config has no ``"endpoint"`` key.
+        """
+        response = requests.get(self.config_path, timeout=timeout)
+        # Fail with the HTTP error rather than a confusing JSON decode error
+        # when the server answers with an error page.
+        response.raise_for_status()
+        config = response.json()
+        return config["endpoint"]