Spaces:
Paused
Paused
Rishi Desai
committed on
Commit
·
ebbd273
1
Parent(s):
d8dc4ed
removed hardcoded trigger word
Browse files- caption.py +20 -20
caption.py
CHANGED
|
@@ -3,6 +3,7 @@ import io
|
|
| 3 |
import os
|
| 4 |
from together import Together
|
| 5 |
|
|
|
|
| 6 |
|
| 7 |
def get_system_prompt():
|
| 8 |
return """Automated Image Captioning (for LoRA Training)
|
|
@@ -10,9 +11,8 @@ def get_system_prompt():
|
|
| 10 |
Role: You are an expert AI captioning system generating precise, structured descriptions for character images optimized for LoRA model training in Stable Diffusion and Flux.1-dev.
|
| 11 |
|
| 12 |
IMPORTANT: You MUST follow these rules EXACTLY:
|
| 13 |
-
1. EVERY caption MUST start with the word "
|
| 14 |
-
2. You MUST use the exact format:
|
| 15 |
-
3. DO NOT include any additional text, explanations, or formatting
|
| 16 |
4. DO NOT use bullet points, lists, or any other formatting
|
| 17 |
5. DO NOT include any text before or after the caption
|
| 18 |
6. If you don't follow this format exactly, the caption will be rejected
|
|
@@ -34,7 +34,7 @@ Avoid Describing These Unless Variable Across Dataset or Uncertain from Concept:
|
|
| 34 |
- Known accessories that always appear (unless outfit-specific)
|
| 35 |
|
| 36 |
Caption Format (MUST FOLLOW EXACTLY):
|
| 37 |
-
|
| 38 |
|
| 39 |
Captioning Principles:
|
| 40 |
- Emphasize visual variation and context-specific details (outfit, pose, lighting, expression, camera angle).
|
|
@@ -44,15 +44,15 @@ Captioning Principles:
|
|
| 44 |
- Specify lighting conditions (soft lighting, harsh shadows, glowing backlight).
|
| 45 |
- Explicitly state camera angle (e.g., front view, right side profile, low-angle, high-angle, overhead).
|
| 46 |
- Avoid mentioning real or fictional identities.
|
| 47 |
-
- Always prefix with the trigger word "
|
| 48 |
|
| 49 |
Examples (MUST FOLLOW THIS EXACT FORMAT):
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
|
| 55 |
-
REMEMBER: Your response must be a single line starting with "
|
| 56 |
"""
|
| 57 |
|
| 58 |
|
|
@@ -82,10 +82,10 @@ def get_together_client():
|
|
| 82 |
|
| 83 |
def extract_caption(line):
|
| 84 |
"""Extract caption from a line of text."""
|
| 85 |
-
if
|
| 86 |
-
# If caption doesn't start with
|
| 87 |
-
if not line.startswith(
|
| 88 |
-
return line[line.index(
|
| 89 |
return line
|
| 90 |
return ""
|
| 91 |
|
|
@@ -117,7 +117,7 @@ def caption_single_image(client, img_str):
|
|
| 117 |
break
|
| 118 |
|
| 119 |
if not caption:
|
| 120 |
-
error_msg = "Failed to extract a valid caption (containing '
|
| 121 |
error_msg += f"\n\nActual response:\n{full_response}"
|
| 122 |
raise CaptioningError(error_msg)
|
| 123 |
|
|
@@ -155,8 +155,8 @@ def process_batch_response(response, image_strings):
|
|
| 155 |
image_count = len(image_strings)
|
| 156 |
captions = [""] * image_count
|
| 157 |
|
| 158 |
-
# Extract lines that start with or contain
|
| 159 |
-
caption_lines = [line for line in lines if
|
| 160 |
|
| 161 |
# Assign captions to images
|
| 162 |
for i in range(image_count):
|
|
@@ -171,7 +171,7 @@ def process_batch_response(response, image_strings):
|
|
| 171 |
def validate_batch_captions(captions, image_count, full_response):
|
| 172 |
"""Validate captions extracted from a batch response."""
|
| 173 |
# Check if all captions are empty or don't contain the trigger word
|
| 174 |
-
valid_captions = [c for c in captions if c and
|
| 175 |
if not valid_captions:
|
| 176 |
error_msg = "Failed to parse any valid captions from batch response."
|
| 177 |
error_msg += f"\n\nActual response:\n{full_response}"
|
|
@@ -180,7 +180,7 @@ def validate_batch_captions(captions, image_count, full_response):
|
|
| 180 |
# Check if some captions are missing
|
| 181 |
if len(valid_captions) < image_count:
|
| 182 |
missing_count = image_count - len(valid_captions)
|
| 183 |
-
invalid_captions = [(i, c) for i, c in enumerate(captions) if not c or
|
| 184 |
error_msg = f"Failed to parse captions for {missing_count} of {image_count} images in batch mode"
|
| 185 |
error_msg += "\n\nMalformed captions:"
|
| 186 |
for idx, caption in invalid_captions:
|
|
@@ -204,6 +204,6 @@ def extract_captions(file_path):
|
|
| 204 |
captions = []
|
| 205 |
with open(file_path, 'r') as file:
|
| 206 |
for line in file:
|
| 207 |
-
if line.startswith(
|
| 208 |
captions.append(line.strip())
|
| 209 |
return captions
|
|
|
|
| 3 |
import os
|
| 4 |
from together import Together
|
| 5 |
|
| 6 |
+
TRIGGER_WORD = "tr1gger"
|
| 7 |
|
| 8 |
def get_system_prompt():
|
| 9 |
return f"""Automated Image Captioning (for LoRA Training)
|
|
|
|
| 11 |
Role: You are an expert AI captioning system generating precise, structured descriptions for character images optimized for LoRA model training in Stable Diffusion and Flux.1-dev.
|
| 12 |
|
| 13 |
IMPORTANT: You MUST follow these rules EXACTLY:
|
| 14 |
+
1. EVERY caption MUST start with the word "{TRIGGER_WORD}" (exactly like this, no variations)
|
| 15 |
+
2. You MUST use the exact format: {TRIGGER_WORD} [Style], [Notable Visual Features], [Clothing], [Pose], [Expression/Mood], [Background/Setting], [Lighting], [Camera Angle]
|
|
|
|
| 16 |
4. DO NOT use bullet points, lists, or any other formatting
|
| 17 |
5. DO NOT include any text before or after the caption
|
| 18 |
6. If you don't follow this format exactly, the caption will be rejected
|
|
|
|
| 34 |
- Known accessories that always appear (unless outfit-specific)
|
| 35 |
|
| 36 |
Caption Format (MUST FOLLOW EXACTLY):
|
| 37 |
+
{TRIGGER_WORD} [Style], [Notable Visual Features], [Clothing], [Pose], [Expression/Mood], [Background/Setting], [Lighting], [Camera Angle]
|
| 38 |
|
| 39 |
Captioning Principles:
|
| 40 |
- Emphasize visual variation and context-specific details (outfit, pose, lighting, expression, camera angle).
|
|
|
|
| 44 |
- Specify lighting conditions (soft lighting, harsh shadows, glowing backlight).
|
| 45 |
- Explicitly state camera angle (e.g., front view, right side profile, low-angle, high-angle, overhead).
|
| 46 |
- Avoid mentioning real or fictional identities.
|
| 47 |
+
- Always prefix with the trigger word "{TRIGGER_WORD}."
|
| 48 |
|
| 49 |
Examples (MUST FOLLOW THIS EXACT FORMAT):
|
| 50 |
+
{TRIGGER_WORD} photorealistic, combat gear, tactical vest and gloves, standing in profile, neutral, empty room, overcast lighting, side profile
|
| 51 |
+
{TRIGGER_WORD} 3D-rendered, digital patterns, hooded cloak, seated cross-legged, calm, meditation chamber, low ambient lighting, front view
|
| 52 |
+
{TRIGGER_WORD} anime-style, school uniform with blue necktie, standing with arms behind back, gentle smile, classroom, soft daylight, three-quarter view
|
| 53 |
+
{TRIGGER_WORD} photorealistic, long trench coat and combat boots, walking, determined, rain-soaked street, dramatic shadows, low-angle view
|
| 54 |
|
| 55 |
+
REMEMBER: Your response must be a single line starting with "{TRIGGER_WORD}" and following the exact format above. No additional text, formatting, or explanations are allowed.
|
| 56 |
"""
|
| 57 |
|
| 58 |
|
|
|
|
| 82 |
|
| 83 |
def extract_caption(line):
|
| 84 |
"""Extract caption from a line of text."""
|
| 85 |
+
if TRIGGER_WORD in line:
|
| 86 |
+
# If the caption contains TRIGGER_WORD but doesn't start with it, drop everything before the trigger word
|
| 87 |
+
if not line.startswith(TRIGGER_WORD):
|
| 88 |
+
return line[line.index(TRIGGER_WORD):]
|
| 89 |
return line
|
| 90 |
return ""
|
| 91 |
|
|
|
|
| 117 |
break
|
| 118 |
|
| 119 |
if not caption:
|
| 120 |
+
error_msg = f"Failed to extract a valid caption (containing '{TRIGGER_WORD}') from the response"
|
| 121 |
error_msg += f"\n\nActual response:\n{full_response}"
|
| 122 |
raise CaptioningError(error_msg)
|
| 123 |
|
|
|
|
| 155 |
image_count = len(image_strings)
|
| 156 |
captions = [""] * image_count
|
| 157 |
|
| 158 |
+
# Keep only the lines that contain TRIGGER_WORD (anywhere in the line)
|
| 159 |
+
caption_lines = [line for line in lines if TRIGGER_WORD in line]
|
| 160 |
|
| 161 |
# Assign captions to images
|
| 162 |
for i in range(image_count):
|
|
|
|
| 171 |
def validate_batch_captions(captions, image_count, full_response):
|
| 172 |
"""Validate captions extracted from a batch response."""
|
| 173 |
# Check if all captions are empty or don't contain the trigger word
|
| 174 |
+
valid_captions = [c for c in captions if c and TRIGGER_WORD in c]
|
| 175 |
if not valid_captions:
|
| 176 |
error_msg = "Failed to parse any valid captions from batch response."
|
| 177 |
error_msg += f"\n\nActual response:\n{full_response}"
|
|
|
|
| 180 |
# Check if some captions are missing
|
| 181 |
if len(valid_captions) < image_count:
|
| 182 |
missing_count = image_count - len(valid_captions)
|
| 183 |
+
invalid_captions = [(i, c) for i, c in enumerate(captions) if not c or TRIGGER_WORD not in c]
|
| 184 |
error_msg = f"Failed to parse captions for {missing_count} of {image_count} images in batch mode"
|
| 185 |
error_msg += "\n\nMalformed captions:"
|
| 186 |
for idx, caption in invalid_captions:
|
|
|
|
| 204 |
captions = []
|
| 205 |
with open(file_path, 'r') as file:
|
| 206 |
for line in file:
|
| 207 |
+
if line.startswith(TRIGGER_WORD):
|
| 208 |
captions.append(line.strip())
|
| 209 |
return captions
|