RE-N-Y
/

logic2vision

@@ -33,51 +33,123 @@ The model has been finetuned using LoRA to generate python pseudocode outputs to
 ## Uses
-The inference method is identical to [LLaVA-1.5-13B](https://huggingface.co/llava-hf/llava-1.5-13b-hf).
 ```python
 import torch
-from transformers import AutoProcessor, LlavaForConditionalGeneration
 from PIL import Image
-image = Image.open("<path to image>")
-image = image.convert("RGB")
-question = "What material attribute do the stove, the oven behind the white and dirty wall and the tea_kettle have in common?"
-codes = """
-selected_wall = select(wall)
-filtered_wall = filter(selected_wall, ['white', 'dirty'])
-related_oven = relate(oven, behind, o, filtered_wall)
-selected_stove = select(stove)
-selected_tea_kettle = select(tea_kettle)
-materials = query_material(related_oven, selected_stove, selected_tea_kettle)
-material = common(materials)
-"""
-prompt = """
-USER: <image>
-Executes the code and logs the results step-by-step to provide an answer to the question.
-Question
-{question}
-Code
-{codes}
-ASSISTANT:
-Log
 """
-prompt = prompt.format(question=question, codes=codes)
-model = LlavaForConditionalGeneration.from_pretrained("RE-N-Y/logic2vision", torch_dtype=torch.bfloat16, low_cpu_mem_usage=True)
-processor = AutoProcessor.from_pretrained("RE-N-Y/logic2vision")
-processor.tokenizer.pad_token = processor.tokenizer.eos_token
-processor.tokenizer.padding_side = "left"
-prompts = processor(images=image, text=prompt, return_tensors="pt")
 generate_ids = model.generate(**inputs, max_new_tokens=256)
-processor.batch_decode(generate_ids, skip_special_tokens=True)
 ```
 ## Bias, Risks, and Limitations

 ## Uses
+The inference method is similar to [LLaVA-1.5-13B](https://huggingface.co/llava-hf/llava-1.5-13b-hf).
+### Example images
+[zebras.jpg](https://huggingface.co/RE-N-Y/logic2vision/resolve/main/zebras.jpg)
+[room.jpg](https://huggingface.co/RE-N-Y/logic2vision/resolve/main/room.jpg)
 ```python
 import torch
+from transformers import LlavaProcessor, LlavaForConditionalGeneration
+import requests
 from PIL import Image
+# String templates for the code-execution task; placeholders are filled
+# with str.format().
+class LLaVACodeTemplate:
+    # Model input: has `{question}` and `{codes}` placeholders.
+    prompt = """
+    USER: <image>
+    Executes the code and logs the results step-by-step to provide an answer to the question.
+    Question
+    {question}
+    Code
+    {codes}
+    ASSISTANT:
+    Log
+    """
+    # Layout with `{logs}` and `{answer}` placeholders, ending in "</s>".
+    # NOTE(review): not referenced by the inference example in this snippet;
+    # presumably the target format the model was trained on -- confirm.
+    answer = """
+    {logs}
+    Answer:
+    {answer}</s>
+    """
+template = LLaVACodeTemplate()
+# Weights are downloaded to the default Hugging Face cache. Pass
+# cache_dir="<your dir>" to from_pretrained() to store them elsewhere.
+model = LlavaForConditionalGeneration.from_pretrained("RE-N-Y/logic2vision", torch_dtype=torch.bfloat16, low_cpu_mem_usage=True)
+model.to("cuda")
+processor = LlavaProcessor.from_pretrained("RE-N-Y/logic2vision")
+# Reuse the EOS token for padding and pad on the left.
+processor.tokenizer.pad_token = processor.tokenizer.eos_token
+processor.tokenizer.padding_side = "left"
+# Example 1: fetch zebras.jpg over HTTP and open it with PIL.
+image = Image.open(requests.get("https://huggingface.co/RE-N-Y/logic2vision/resolve/main/zebras.jpg", stream=True).raw)
+question = "What else in the image is striped as the rope and the mane to the left of the white clouds?"
+codes = """selected_clouds = select(clouds)
+filtered_clouds = filter(selected_clouds, ['white'])
+related_mane = relate(mane, to the left of, o, filtered_clouds)
+selected_rope = select(rope)
+pattern = query_pattern(['selected_rope', 'related_mane'])
+result = select(objects, attr=pattern)
 """
+# Fill the prompt template, run the processor on image + text, and move the
+# resulting inputs to the GPU.
+prompt = template.prompt.format(question=question, codes=codes)
+inputs = processor(images=image, text=prompt, return_tensors="pt")
+inputs.to("cuda")
+# Generate at most 256 new tokens, decode them, and print the first result.
+generate_ids = model.generate(**inputs, max_new_tokens=256)
+output = processor.batch_decode(generate_ids, skip_special_tokens=True)
+print(output[0])
+# USER:
+# Executes the code and logs the results step-by-step to provide an answer to the question.
+# Question
+# What else in the image is striped as the rope and the mane to the left of the white clouds?
+# Code
+# selected_clouds = select(clouds)
+# filtered_clouds = filter(selected_clouds, ['white'])
+# related_mane = relate(mane, to the left of, o, filtered_clouds)
+# selected_rope = select(rope)
+# pattern = query_pattern(['selected_rope', 'related_mane'])
+# result = select(objects, attr=pattern)
+# ASSISTANT:
+# Log
+# ('clouds', ['white'])
+# ('clouds', ['white'])
+# ('mane', ['striped'])
+# ('rope', ['no object'])
+# ['the question itself is problematic']
+# ['the question itself is problematic']
+# Answer:
+# the question itself is problematic
+# Example 2: fetch room.jpg over HTTP and open it with PIL.
+image = Image.open(requests.get("https://huggingface.co/RE-N-Y/logic2vision/resolve/main/room.jpg", stream=True).raw)
+question = "What material do the chair and the table have in common?"
+codes = """selected_chair = select(chair)
+selected_table = select(table)
+materials = query_material([selected_chair, selected_table])
+common_material = common(materials)
+"""
+# Fill the prompt template, run the processor on image + text, and move the
+# resulting inputs to the GPU.
+prompt = template.prompt.format(question=question, codes=codes)
+inputs = processor(images=image, text=prompt, return_tensors="pt")
+inputs.to("cuda")
+# Generate at most 256 new tokens, decode them, and print the first result.
 generate_ids = model.generate(**inputs, max_new_tokens=256)
+output = processor.batch_decode(generate_ids, skip_special_tokens=True)
+print(output[0])
+# USER:
+# Executes the code and logs the results step-by-step to provide an answer to the question.
+# Question
+# What material do the chair and the table have in common?
+# Code
+# selected_chair = select(chair)
+# selected_table = select(table)
+# materials = query_material([selected_chair, selected_table])
+# common_material = common(materials)
+# ASSISTANT:
+# Log
+# ('chair', ['wood'])
+# ('table', ['wood'])
+# [['wood'], ['wood']]
+# ['wood']
+# Answer:
+# wood
 ```
 ## Bias, Risks, and Limitations