Upload full model folder with custom handler

Files changed:
- README.md (+0, -1)
- handler.py (+34, -25)
README.md CHANGED

```diff
@@ -2,7 +2,6 @@
 base_model:
 - meta-llama/Llama-3.1-8B-Instruct
 - google/siglip-so400m-patch14-384
-- fancyfeast/llama-joycaption-alpha-two-hf-llava
 tags:
 - captioning
 ---
```
handler.py CHANGED

```diff
@@ -16,14 +16,21 @@ class EndpointHandler():
         self.model.eval()
 
     def __call__(self, data):
-        …
-        prompt = …
-        …
-        …
-        …
+        inputs = data.get("inputs", {})
+        prompt = inputs.get("prompt", "Generate a caption for this image.")
+        images_b64 = inputs.get("images")
+
+        # Handle both single image and list of images
+        if isinstance(images_b64, str):
+            images_b64 = [images_b64]
+        if not images_b64:
+            return {"error": "No images provided in the payload."}
+
         try:
-            …
-            …
+            images = [
+                Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
+                for img_b64 in images_b64
+            ]
         except Exception as e:
             return {"error": f"Failed to decode image: {str(e)}"}
 
```

(The removed single-image parsing lines are truncated in the source; they are shown as `…` above.)
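With this change, `__call__` reads the payload from `data["inputs"]`: an optional `prompt` plus `images`, which may be a single base64 string or a list of them. A minimal sketch of a payload the new parsing accepts; the file name and prompt text here are illustrative, not from the commit:

```python
import base64

# Build a payload for the updated handler. The "inputs", "prompt", and
# "images" keys match the parsing code above; "photo.jpg" is a placeholder.
with open("photo.jpg", "rb") as f:
    img_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "inputs": {
        "prompt": "Write a descriptive caption for this image.",
        # A bare string works too; the handler wraps it into a one-item list.
        "images": [img_b64],
    }
}
```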
```diff
@@ -41,32 +48,34 @@ class EndpointHandler():
         if not isinstance(convo_string, str):
             return {"error": "Failed to create conversation string."}
 
-        # Prepare the inputs for the model
-        …
+        # Prepare the inputs for the model - process all images at once
+        model_inputs = self.processor(
             text=[convo_string],
-            images=…
+            images=images,
             return_tensors="pt"
         )
-        …
-        if "pixel_values" in …
-        …
+        model_inputs = {k: v.to(self.device) for k, v in model_inputs.items()}
+        if "pixel_values" in model_inputs:
+            model_inputs["pixel_values"] = model_inputs["pixel_values"].to(torch.bfloat16)
 
-        # Generate caption tokens
+        # Generate caption tokens for all images at once
         generate_ids = self.model.generate(
-            **…
+            **model_inputs,
             max_new_tokens=300,
             do_sample=True,
             temperature=0.6,
             top_p=0.9
-        )
-        …
-        # Optionally, trim off the prompt tokens
-        generate_ids = generate_ids[inputs["input_ids"].shape[1]:]
+        )
 
-        …
-        …
-        …
-        …
-        …
+        # Trim off the prompt tokens and decode all captions
+        generate_ids = generate_ids[:, model_inputs["input_ids"].shape[1]:]
+        captions = [
+            self.processor.tokenizer.decode(
+                ids,
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=False
+            ).strip()
+            for ids in generate_ids
+        ]
 
-        return {"…
+        return {"captions": captions if len(captions) > 1 else captions[0]}
```
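Beyond batching, the hunk above changes the prompt-trimming slice: as shown, the old `generate_ids[inputs["input_ids"].shape[1]:]` indexes the first (batch) dimension of a 2-D tensor, which drops whole rows, while the new `generate_ids[:, model_inputs["input_ids"].shape[1]:]` strips prompt tokens from every row. A toy illustration of the difference between the two slices on a `(batch, seq)` tensor:

```python
import torch

# Batch of 2 generated sequences, each 8 tokens, with a 3-token prompt.
generate_ids = torch.arange(16).reshape(2, 8)
prompt_len = 3

wrong = generate_ids[prompt_len:]     # old slice: drops batch rows
right = generate_ids[:, prompt_len:]  # new slice: drops prompt tokens per row

print(wrong.shape)  # torch.Size([0, 8]) -- nothing left to decode
print(right.shape)  # torch.Size([2, 5]) -- captions without the prompts
```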
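End to end, a sketch of calling an Inference Endpoint running this handler; the URL, token, and image path are placeholders, and the response handling mirrors the `captions` return value above (a bare string for one image, a list for several):

```python
import base64
import requests

ENDPOINT_URL = "https://YOUR-ENDPOINT.endpoints.huggingface.cloud"  # placeholder
HF_TOKEN = "hf_..."  # placeholder access token

with open("photo.jpg", "rb") as f:  # placeholder image path
    img_b64 = base64.b64encode(f.read()).decode("utf-8")

resp = requests.post(
    ENDPOINT_URL,
    headers={"Authorization": f"Bearer {HF_TOKEN}"},
    json={"inputs": {"prompt": "Generate a caption for this image.",
                     "images": img_b64}},
)
result = resp.json()

if "error" in result:
    raise RuntimeError(result["error"])

# One image yields a string, several yield a list; normalize to a list.
captions = result["captions"]
if isinstance(captions, str):
    captions = [captions]
for caption in captions:
    print(caption)
```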