Commit: Update
app.py
CHANGED
```diff
@@ -12,8 +12,6 @@ from transformers import AutoProcessor, Blip2ForConditionalGeneration
 
 DESCRIPTION = "# [BLIP-2](https://github.com/salesforce/LAVIS/tree/main/projects/blip2)"
 
-if (SPACE_ID := os.getenv("SPACE_ID")) is not None:
-    DESCRIPTION += f'\n<p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings. <a href="https://huggingface.co/spaces/{SPACE_ID}?duplicate=true"><img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" /></a></p>'
 if not torch.cuda.is_available():
     DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
 
@@ -21,40 +19,23 @@ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 MODEL_ID_OPT_6_7B = "Salesforce/blip2-opt-6.7b"
 MODEL_ID_FLAN_T5_XXL = "Salesforce/blip2-flan-t5-xxl"
+MODEL_ID = os.getenv("MODEL_ID", MODEL_ID_FLAN_T5_XXL)
 
 if torch.cuda.is_available():
-    model_dict = {
-        # MODEL_ID_OPT_6_7B: {
-        #     'processor':
-        #     AutoProcessor.from_pretrained(MODEL_ID_OPT_6_7B),
-        #     'model':
-        #     Blip2ForConditionalGeneration.from_pretrained(MODEL_ID_OPT_6_7B,
-        #                                                   device_map='auto',
-        #                                                   load_in_8bit=True),
-        # },
-        MODEL_ID_FLAN_T5_XXL: {
-            "processor": AutoProcessor.from_pretrained(MODEL_ID_FLAN_T5_XXL),
-            "model": Blip2ForConditionalGeneration.from_pretrained(
-                MODEL_ID_FLAN_T5_XXL, device_map="auto", load_in_8bit=True
-            ),
-        }
-    }
+    processor = AutoProcessor.from_pretrained(MODEL_ID)
+    model = Blip2ForConditionalGeneration.from_pretrained(MODEL_ID, device_map="auto", load_in_8bit=True)
 else:
-    model_dict = {}
+    processor = None
+    model = None
 
 
 def generate_caption(
-    model_id: str,
     image: PIL.Image.Image,
     decoding_method: str,
     temperature: float,
     length_penalty: float,
     repetition_penalty: float,
 ) -> str:
-    model_info = model_dict[model_id]
-    processor = model_info["processor"]
-    model = model_info["model"]
-
     inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
     generated_ids = model.generate(
         pixel_values=inputs.pixel_values,
@@ -72,7 +53,6 @@ def generate_caption(
 
 
 def answer_question(
-    model_id: str,
     image: PIL.Image.Image,
     text: str,
     decoding_method: str,
@@ -80,10 +60,6 @@ def answer_question(
     length_penalty: float,
     repetition_penalty: float,
 ) -> str:
-    model_info = model_dict[model_id]
-    processor = model_info["processor"]
-    model = model_info["model"]
-
     inputs = processor(images=image, text=text, return_tensors="pt").to(device, torch.float16)
     generated_ids = model.generate(
         **inputs,
@@ -107,7 +83,6 @@ def postprocess_output(output: str) -> str:
 
 
 def chat(
-    model_id: str,
     image: PIL.Image.Image,
     text: str,
     decoding_method: str,
@@ -123,7 +98,6 @@ def chat(
     prompt = " ".join(history_qa)
 
     output = answer_question(
-        model_id,
         image,
         prompt,
         decoding_method,
@@ -164,24 +138,14 @@ examples = [
 
 with gr.Blocks(css="style.css") as demo:
     gr.Markdown(DESCRIPTION)
+    gr.DuplicateButton(
+        value="Duplicate Space for private use",
+        elem_id="duplicate-button",
+        visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
+    )
 
     image = gr.Image(type="pil")
     with gr.Accordion(label="Advanced settings", open=False):
-        with gr.Row():
-            model_id_caption = gr.Dropdown(
-                label="Model ID for image captioning",
-                choices=[MODEL_ID_OPT_6_7B, MODEL_ID_FLAN_T5_XXL],
-                value=MODEL_ID_FLAN_T5_XXL,
-                interactive=False,
-                visible=False,
-            )
-            model_id_chat = gr.Dropdown(
-                label="Model ID for VQA",
-                choices=[MODEL_ID_OPT_6_7B, MODEL_ID_FLAN_T5_XXL],
-                value=MODEL_ID_FLAN_T5_XXL,
-                interactive=False,
-                visible=False,
-            )
         sampling_method = gr.Radio(
             label="Text Decoding Method",
             choices=["Beam search", "Nucleus sampling"],
@@ -225,16 +189,12 @@ with gr.Blocks(css="style.css") as demo:
 
     gr.Examples(
         examples=examples,
-        inputs=[
-            image,
-            vqa_input,
-        ],
+        inputs=[image, vqa_input],
    )
 
     caption_button.click(
         fn=generate_caption,
         inputs=[
-            model_id_caption,
             image,
             sampling_method,
             temperature,
@@ -246,7 +206,6 @@ with gr.Blocks(css="style.css") as demo:
     )
 
     chat_inputs = [
-        model_id_chat,
         image,
         vqa_input,
         sampling_method,
@@ -296,4 +255,5 @@ with gr.Blocks(css="style.css") as demo:
         queue=False,
     )
 
-demo.queue(max_size=10).launch()
+if __name__ == "__main__":
+    demo.queue(max_size=10).launch()
```
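The refactor above replaces the per-request `model_dict` lookup (and its long-commented-out OPT-6.7B entry) with a single module-level `processor`/`model` pair, chosen once at startup through a `MODEL_ID` environment variable that defaults to `Salesforce/blip2-flan-t5-xxl`. Since the removed dropdowns were already `interactive=False` and `visible=False`, no user-facing behavior is lost, and the new `if __name__ == "__main__":` guard means importing the module no longer launches the queue. Below is a minimal sketch of the new loading path, assuming a CUDA machine with `bitsandbytes` and `accelerate` installed (needed for `load_in_8bit=True` and `device_map="auto"`); the example image URL and `max_new_tokens` value are illustrative, not taken from app.py:

```python
import os

# MODEL_ID is read at import time in app.py, so it must be set in the
# environment before the module-level loading code runs.
os.environ.setdefault("MODEL_ID", "Salesforce/blip2-opt-6.7b")  # the other checkpoint the file defines

import requests
import torch
from PIL import Image
from transformers import AutoProcessor, Blip2ForConditionalGeneration

model_id = os.getenv("MODEL_ID", "Salesforce/blip2-flan-t5-xxl")
processor = AutoProcessor.from_pretrained(model_id)
# 8-bit weights via bitsandbytes; accelerate places layers across devices.
model = Blip2ForConditionalGeneration.from_pretrained(
    model_id, device_map="auto", load_in_8bit=True
)

# Mirror generate_caption(): preprocess, generate, decode.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # illustrative image
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(images=image, return_tensors="pt").to("cuda:0", torch.float16)
generated_ids = model.generate(pixel_values=inputs.pixel_values, max_new_tokens=50)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip())
```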
style.css
CHANGED
```diff
@@ -1,3 +1,10 @@
 h1 {
   text-align: center;
 }
+
+#duplicate-button {
+  margin: auto;
+  color: #fff;
+  background: #1565c0;
+  border-radius: 100vh;
+}
```
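The new `#duplicate-button` rule styles the `gr.DuplicateButton` that app.py now adds via `elem_id="duplicate-button"`, replacing the hand-rolled "Duplicate Space" HTML link this commit deletes from `DESCRIPTION`. A self-contained sketch of the wiring, assuming a Gradio version recent enough to ship `gr.DuplicateButton`; the CSS is inlined here only so the snippet runs without style.css:

```python
import os

import gradio as gr

# The same rule style.css adds, inlined for self-containment.
CSS = """
#duplicate-button {
  margin: auto;
  color: #fff;
  background: #1565c0;
  border-radius: 100vh;
}
"""

with gr.Blocks(css=CSS) as demo:
    gr.Markdown("# Demo")
    # elem_id must match the CSS selector for the rule to take effect.
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_id="duplicate-button",
        visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
    )

if __name__ == "__main__":
    demo.launch()
```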