Multimodal-OCR

Runtime error

App Files Community

prithivMLmods committed on Mar 4

Commit

239e8eb

verified ·

1 Parent(s): b625ee3

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -38

app.py CHANGED Viewed

@@ -28,40 +28,6 @@ aya_model = AutoModelForImageTextToText.from_pretrained(
     AYA_MODEL_ID, device_map="auto", torch_dtype=torch.float16
 )
-def aya_vision_chat(image, text_prompt):
-    # If image is provided as a URL, load it via requests.
-    if isinstance(image, str):
-        response = requests.get(image)
-        image = Image.open(BytesIO(response.content))
-    messages = [{
-        "role": "user",
-        "content": [
-            {"type": "image", "image": image},
-            {"type": "text", "text": text_prompt},
-        ],
-    }]
-    inputs = aya_processor.apply_chat_template(
-        messages,
-        padding=True,
-        add_generation_prompt=True,
-        tokenize=True,
-        return_dict=True,
-        return_tensors="pt"
-    ).to(aya_model.device)
-    gen_tokens = aya_model.generate(
-        **inputs, max_new_tokens=300, do_sample=True, temperature=0.3
-    )
-    # Decode only the newly generated tokens.
-    response_text = aya_processor.tokenizer.decode(
-        gen_tokens[0][inputs.input_ids.shape[1]:],
-        skip_special_tokens=True
-    )
-    return response_text
 @spaces.GPU
 def model_inference(input_dict, history):
     text = input_dict["text"].strip()
@@ -77,9 +43,40 @@ def model_inference(input_dict, history):
             # For simplicity, use the first provided image.
             image = load_image(files[0])
             yield "Processing with Aya-Vision ███████▒▒▒ 69%"
-            response_text = aya_vision_chat(image, text_prompt)
-            yield response_text
             return
     # Load images if provided.
     if len(files) > 1:
         images = [load_image(image) for image in files]
@@ -146,9 +143,9 @@ examples = [
 demo = gr.ChatInterface(
     fn=model_inference,
-    description="# **Multimodal OCR** `@aya-vision 'prompt..'`",
     examples=examples,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
     stop_btn="Stop Generation",
     multimodal=True,
     cache_examples=False,

     AYA_MODEL_ID, device_map="auto", torch_dtype=torch.float16
 )
 @spaces.GPU
 def model_inference(input_dict, history):
     text = input_dict["text"].strip()
             # For simplicity, use the first provided image.
             image = load_image(files[0])
             yield "Processing with Aya-Vision ███████▒▒▒ 69%"
+            messages = [{
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": image},
+                    {"type": "text", "text": text_prompt},
+                ],
+            }]
+            inputs = aya_processor.apply_chat_template(
+                messages,
+                padding=True,
+                add_generation_prompt=True,
+                tokenize=True,
+                return_dict=True,
+                return_tensors="pt"
+            ).to(aya_model.device)
+            # Set up a streamer for Aya-Vision output
+            streamer = TextIteratorStreamer(aya_processor, skip_prompt=True, skip_special_tokens=True)
+            generation_kwargs = dict(
+                inputs,
+                streamer=streamer,
+                max_new_tokens=300,
+                do_sample=True,
+                temperature=0.3
+            )
+            thread = Thread(target=aya_model.generate, kwargs=generation_kwargs)
+            thread.start()
+            buffer = ""
+            for new_text in streamer:
+                buffer += new_text
+                buffer = buffer.replace("<|im_end|>", "")
+                time.sleep(0.01)
+                yield buffer
             return
     # Load images if provided.
     if len(files) > 1:
         images = [load_image(image) for image in files]
 demo = gr.ChatInterface(
     fn=model_inference,
+    description="# Multimodal OCR `@aya-vision 'prompt..'`",
     examples=examples,
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="By default, it runs Qwen2VL. Tag @aya-vision for Aya Vision 8B"),
     stop_btn="Stop Generation",
     multimodal=True,
     cache_examples=False,