Fraser committed
Commit ce1f183 · 1 Parent(s): 48e6066
src/lib/components/PicletGenerator/PicletGenerator.svelte CHANGED
@@ -289,22 +289,14 @@ Focus on: colors, body shape, eyes, limbs, mouth, and key visual features. Omit
  }
 
  try {
- // Use enhanced captioning for object identification
+ // Get detailed scene description from Joy Caption
  const captionResult = await EnhancedCaptionService.generateEnhancedCaption(
  joyCaptionClient,
  workflowState.userImage
  );
 
- // Store caption results
- workflowState.imageCaption = captionResult.fullCaption;
- workflowState.objectName = captionResult.canonicalName;
- workflowState.objectAttributes = captionResult.variation ? [captionResult.variation] : [];
- workflowState.visualDetails = captionResult.visualDetails;
-
- console.log('Object identified:', workflowState.objectName);
- console.log('Variation:', captionResult.variation);
- console.log('Tier:', captionResult.tier);
- console.log('Visual details:', workflowState.visualDetails);
+ workflowState.imageCaption = captionResult.caption;
+ console.log('Scene description:', captionResult.caption);
 
  // Skip server lookup for now - always create new piclet
  workflowState.discoveryStatus = 'new';
@@ -323,46 +315,82 @@ Focus on: colors, body shape, eyes, limbs, mouth, and key visual features. Omit
  return;
  }
 
- if (!gptOssClient || !workflowState.objectName) {
- throw new Error('Text generation service not available or no object identified');
+ if (!gptOssClient || !workflowState.imageCaption) {
+ throw new Error('Cannot generate concept without scene description');
  }
 
- // Create monster prompt using object and visual details
- const monsterPrompt = EnhancedCaptionService.createMonsterPrompt(
- workflowState.objectName,
- workflowState.visualDetails || '',
- workflowState.objectAttributes || []
- );
+ const conceptPrompt = `You are analyzing an image to create a Pokemon-style creature. Here's the image description:
+
+ "${workflowState.imageCaption}"
 
- const conceptPrompt = `${monsterPrompt}
+ Your task:
+ 1. Identify the PRIMARY PHYSICAL OBJECT with SPECIFICITY (e.g., "macbook" not "laptop", "eiffel tower" not "tower", "iphone" not "phone", "starbucks mug" not "mug")
+ 2. Determine if there's a meaningful VARIATION (e.g., "silver", "pro", "night", "gaming", "vintage")
+ 3. Assess rarity based on uniqueness
+ 4. Create a complete Pokemon-style monster concept
 
- Format your response exactly as follows:
+ Format your response EXACTLY as follows:
  \`\`\`md
+ # Canonical Object
+ {Specific object name: "macbook", "eiffel tower", "iphone", "tesla", "le creuset mug", "nintendo switch"}
+ {NOT generic terms like: "laptop", "tower", "phone", "car", "mug", "console"}
+ {Include brand/model/landmark name when identifiable}
+
+ # Variation
+ {OPTIONAL: one distinctive attribute like "silver", "pro", "night", "gaming", or NONE if not applicable}
+
  # Object Rarity
- {Assess rarity based on the ${workflowState.objectName}. Use: common, uncommon, rare, epic, or legendary}
+ {common, uncommon, rare, epic, or legendary based on object uniqueness}
 
  # Monster Name
- {Creative name related to ${workflowState.objectName}, 11 letters max}
+ {Creative 8-11 letter name based on the SPECIFIC object, e.g., "Macbyte" for MacBook, "Towerfell" for Eiffel Tower}
 
  # Primary Type
- {Choose the most fitting type based on ${workflowState.objectName}: beast, bug, aquatic, flora, mineral, space, machina, structure, culture, or cuisine}
+ {beast, bug, aquatic, flora, mineral, space, machina, structure, culture, or cuisine}
 
  # Physical Stats
  Height: {e.g., "1.2m" or "3'5\""}
  Weight: {e.g., "15kg" or "33 lbs"}
 
  # Personality
- {Brief 1-2 sentence personality description. e.g., "Playful and curious, loves exploring new places" or "Shy but loyal, protective of friends"}
+ {1-2 sentences describing personality traits}
 
  # Monster Description
- {Detailed physical description of the ${workflowState.objectName}-based creature. Include how the object's features become creature features. Focus on eyes, limbs, mouth, and distinctive elements. This is the bio text for the creature.}
+ {2-3 paragraphs describing how the SPECIFIC object's features translate into monster features. Reference the actual object by name. This is the creature's bio.}
 
  # Monster Image Prompt
- {Visual description of the ${workflowState.objectName} monster for image generation. Include body shape, colors, pose, and key features. Focus on anime-style visual design.}
- \`\`\``;
-
+ {Concise visual description for anime-style image generation focusing on colors, shapes, and key features inspired by the specific object}
+ \`\`\`
+
+ CRITICAL RULES:
+ - Canonical Object MUST be SPECIFIC: "macbook" not "laptop", "big ben" not "clock tower", "coca cola" not "soda"
+ - If you can identify a brand, model, or proper name from the description, USE IT
+ - Variation should be meaningful and distinctive (material, style, color, context, or model variant)
+ - Monster Description must describe the CREATURE with references to the specific object's features
+ - Primary Type must match the object category (machina for electronics, structure for buildings, etc.)`;
+
  try {
  const responseText = await generateText(conceptPrompt);
+
+ // Validate response has expected structure
+ if (!responseText.includes('# Canonical Object') ||
+ !responseText.includes('# Monster Name')) {
+ console.error('GPT-OSS returned invalid response:', responseText);
+ throw new Error('Failed to generate valid monster concept');
+ }
+
+ workflowState.picletConcept = responseText;
+
+ // Extract and store canonical name and variation immediately for use in other steps
+ const canonicalMatch = responseText.match(/# Canonical Object\s*\n([\s\S]*?)(?=^#)/m);
+ const variationMatch = responseText.match(/# Variation\s*\n([\s\S]*?)(?=^#)/m);
+
+ workflowState.objectName = canonicalMatch ? canonicalMatch[1].trim().toLowerCase() : 'unknown';
+ const variationText = variationMatch ? variationMatch[1].trim() : '';
+ workflowState.objectAttributes = variationText && variationText !== 'NONE' ? [variationText.toLowerCase()] : [];
+
+ console.log('Parsed specific object:', workflowState.objectName);
+ console.log('Parsed variation:', workflowState.objectAttributes);
 
  if (!responseText || responseText.trim() === '') {
  throw new Error('Failed to generate monster concept');
@@ -543,7 +571,11 @@ Create a concise visual description (1-3 sentences, max 100 words). Focus only o
 
  // Extract description
  const descriptionMatch = workflowState.picletConcept.match(/# Monster Description\s*\n([\s\S]*?)(?=^#|$)/m);
- let description = descriptionMatch ? descriptionMatch[1].trim() : workflowState.imageCaption || 'A mysterious creature';
+ if (!descriptionMatch) {
+ console.error('Monster description not found in concept:', workflowState.picletConcept);
+ throw new Error('Failed to extract monster description from AI response');
+ }
+ let description = descriptionMatch[1].trim();
 
  // Extract physical stats
  const physicalStatsMatch = workflowState.picletConcept.match(/# Physical Stats\s*\n([\s\S]*?)(?=^#|$)/m);
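
Note: the new concept step repeats the same section-parsing pattern for several headings of the GPT-OSS reply (# Canonical Object, # Variation, # Monster Description, # Physical Stats). Below is a minimal sketch of that pattern as a standalone helper; extractSection is hypothetical and not part of this commit, and it accepts either the next heading or end-of-text as the section boundary.

// Hypothetical helper (not in this commit) showing the section-parsing pattern
// the new flow relies on: capture everything under a "# Heading" line up to the
// next heading or the end of the response.
function extractSection(concept: string, heading: string): string | undefined {
  const pattern = new RegExp(`# ${heading}\\s*\\n([\\s\\S]*?)(?=^#|$)`, 'm');
  const match = concept.match(pattern);
  return match ? match[1].trim() : undefined;
}

// Example against a response shaped like the prompt's template:
const sample = '# Canonical Object\nmacbook\n\n# Variation\nsilver\n\n# Monster Name\nMacbyte\n';
console.log(extractSection(sample, 'Canonical Object')); // "macbook"
console.log(extractSection(sample, 'Variation'));        // "silver"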
src/lib/components/PicletGenerator/WorkflowProgress.svelte CHANGED
@@ -32,8 +32,8 @@
  },
  {
  id: 'statsGenerating',
- label: 'Battle Stats',
- description: 'Generating abilities'
+ label: 'Characteristics',
+ description: 'Generating traits'
  },
  {
  id: 'promptCrafting',
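
For context, each entry in the component's step list carries an id, a label, and a description, which is all this hunk touches. A small sketch of that shape; the interface name and any additional fields on the real type are assumptions.

// Assumed shape of a progress step; the actual type in WorkflowProgress.svelte
// may carry more fields than the three visible in this diff.
interface WorkflowStep {
  id: string;
  label: string;
  description: string;
}

const statsStep: WorkflowStep = {
  id: 'statsGenerating',
  label: 'Characteristics',        // was 'Battle Stats'
  description: 'Generating traits' // was 'Generating abilities'
};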
src/lib/services/enhancedCaption.ts CHANGED
@@ -1,233 +1,35 @@
  import type { GradioClient } from '$lib/types';
 
  export interface EnhancedCaptionResult {
- objectCaption: string; // Main object identification
- visualDetails: string; // Additional visual details for monster generation
- fullCaption: string; // Complete original caption
- extractedObject?: string; // Parsed primary object (normalized)
- extractedAttributes?: string[]; // Parsed attributes
- canonicalName: string; // Normalized canonical object name
- variation?: string; // Optional variation (e.g., "night", "toy", "vintage")
- tier: 'low' | 'medium' | 'high' | 'legendary'; // Rarity tier based on uniqueness
+ caption: string; // Detailed scene description with specific objects/brands
  }
 
  export class EnhancedCaptionService {
  /**
- * Generate multiple captions to extract object and visual details
+ * Generate a detailed scene description for GPT-OSS to parse
  */
  static async generateEnhancedCaption(
  client: GradioClient,
  image: Blob | File
  ): Promise<EnhancedCaptionResult> {
  try {
- // First caption: Focus on object identification
- const objectPrompt = "Identify the main object in this image. Start with 'This is a/an' followed by the object name and up to 2-3 key attributes (material, color, or style). Be concise and focus only on WHAT the object is, not where it is or what surrounds it.";
-
- const objectResult = await client.predict("/stream_chat", [
- image,
- "Descriptive", // caption type
- "short", // length - short for object identification
- [], // extra_options
- "", // name_input
- objectPrompt // custom_prompt for object focus
- ]);
-
- const objectCaption = objectResult.data[1] as string;
-
- // Second caption: Get visual details for monster generation
- const detailsPrompt = "Describe the unique visual characteristics, textures, patterns, and interesting details of this object that would make it distinctive as a creature. Focus on surface details, decorative elements, and any unusual features. Do not repeat the object name.";
-
- const detailsResult = await client.predict("/stream_chat", [
+ const result = await client.predict("/stream_chat", [
  image,
  "Descriptive",
- "medium-length", // More details for visual generation
+ "medium-length",
  [],
  "",
- detailsPrompt
+ "Describe this image in detail, identifying any recognizable objects, brands, logos, or specific models. Be specific about product names and types."
  ]);
 
- const visualDetails = detailsResult.data[1] as string;
-
- // Third caption: Full descriptive caption as backup
- const fullResult = await client.predict("/stream_chat", [
- image,
- "Descriptive",
- "long",
- [],
- "",
- "" // No custom prompt for natural full description
- ]);
-
- const fullCaption = fullResult.data[1] as string;
-
- // Extract structured data from object caption
- const extraction = this.parseObjectCaption(objectCaption);
-
- // Determine variation from attributes (limit to 1 meaningful variation)
- const variation = this.determineVariation(extraction.attributes);
-
- // Assess tier based on object uniqueness
- const tier = this.assessTier(extraction.object, fullCaption);
+ const caption = result.data[1] as string;
 
  return {
- objectCaption,
- visualDetails,
- fullCaption,
- extractedObject: extraction.object,
- extractedAttributes: extraction.attributes,
- canonicalName: extraction.object,
- variation,
- tier
+ caption
  };
  } catch (error) {
- console.error('Enhanced caption generation failed:', error);
+ console.error('Caption generation failed:', error);
  throw error;
  }
  }
-
- /**
- * Parse the object-focused caption to extract structured data
- */
- private static parseObjectCaption(caption: string): {
- object: string;
- attributes: string[];
- } {
- // Remove "This is a/an" prefix
- let cleaned = caption
- .replace(/^(This is|It'?s|That'?s)\s+(a|an|the)?\s*/i, '')
- .trim();
-
- // Common attribute words to extract
- const attributePatterns = [
- // Materials
- /\b(wooden|metal|plastic|glass|leather|velvet|silk|cotton|stone|marble|ceramic|porcelain)\b/gi,
- // Colors
- /\b(red|blue|green|yellow|purple|orange|black|white|gray|brown|pink|gold|silver)\b/gi,
- // Styles
- /\b(modern|vintage|antique|rustic|minimalist|ornate|gothic|retro|classic)\b/gi,
- // Patterns
- /\b(striped|checkered|floral|geometric|polka-dot|plaid)\b/gi
- ];
-
- const attributes: string[] = [];
- let objectText = cleaned;
-
- // Extract attributes from the caption
- for (const pattern of attributePatterns) {
- const matches = cleaned.match(pattern);
- if (matches) {
- attributes.push(...matches.map(m => m.toLowerCase()));
- // Remove matched attributes from object text
- objectText = objectText.replace(pattern, '').trim();
- }
- }
-
- // Clean up object text - get the core noun
- const words = objectText.split(/\s+/).filter(w => w.length > 0);
-
- // Remove common descriptive words that aren't the object
- const filterWords = ['with', 'that', 'which', 'having', 'featuring', 'very', 'quite', 'rather'];
- const objectWords = words.filter(w => !filterWords.includes(w.toLowerCase()));
-
- // The object is typically the first significant noun
- let object = objectWords[0] || 'object';
-
- // Handle compound objects (e.g., "coffee mug" -> "mug", "throw pillow" -> "pillow")
- const compoundMappings: Record<string, string> = {
- 'coffee': 'mug',
- 'throw': 'pillow',
- 'picture': 'frame',
- 'water': 'bottle',
- 'wine': 'glass',
- 'flower': 'vase',
- 'table': 'lamp',
- 'desk': 'lamp',
- 'floor': 'lamp'
- };
-
- if (compoundMappings[object.toLowerCase()] && objectWords.length > 1) {
- object = objectWords[1];
- }
-
- // Limit attributes to top 3 most relevant
- const uniqueAttributes = [...new Set(attributes)].slice(0, 3);
-
- return {
- object: object.toLowerCase(),
- attributes: uniqueAttributes
- };
- }
-
- /**
- * Determine variation from attributes (limit to 1 most meaningful)
- * Priority: style > material > color
- */
- private static determineVariation(attributes: string[]): string | undefined {
- if (attributes.length === 0) return undefined;
-
- // Style attributes take priority (most distinctive)
- const styleWords = ['modern', 'vintage', 'antique', 'rustic', 'minimalist', 'ornate', 'gothic', 'retro', 'classic'];
- const styleAttr = attributes.find(attr => styleWords.includes(attr.toLowerCase()));
- if (styleAttr) return styleAttr;
-
- // Material attributes second priority
- const materialWords = ['wooden', 'metal', 'plastic', 'glass', 'leather', 'velvet', 'silk', 'cotton', 'stone', 'marble', 'ceramic', 'porcelain'];
- const materialAttr = attributes.find(attr => materialWords.includes(attr.toLowerCase()));
- if (materialAttr) return materialAttr;
-
- // Only use color if no style or material
- const colorWords = ['red', 'blue', 'green', 'yellow', 'purple', 'orange', 'black', 'white', 'gray', 'brown', 'pink', 'gold', 'silver'];
- const colorAttr = attributes.find(attr => colorWords.includes(attr.toLowerCase()));
- if (colorAttr) return colorAttr;
-
- return undefined; // No meaningful variation
- }
-
- /**
- * Assess tier/rarity based on object uniqueness
- */
- private static assessTier(objectName: string, caption: string): 'low' | 'medium' | 'high' | 'legendary' {
- const lowerCaption = caption.toLowerCase();
- const lowerObject = objectName.toLowerCase();
-
- // Legendary: Famous landmarks, unique cultural items, specific branded items
- const legendaryKeywords = ['tower', 'monument', 'statue', 'pyramid', 'temple', 'castle', 'cathedral', 'famous', 'iconic', 'historic', 'ancient'];
- if (legendaryKeywords.some(kw => lowerObject.includes(kw) || lowerCaption.includes(kw))) {
- return 'legendary';
- }
-
- // High: Art, collectibles, specialized items
- const highKeywords = ['art', 'sculpture', 'antique', 'collectible', 'rare', 'ornate', 'decorative', 'handmade', 'crafted'];
- if (highKeywords.some(kw => lowerCaption.includes(kw))) {
- return 'high';
- }
-
- // Low: Common everyday items
- const commonObjects = ['cup', 'mug', 'plate', 'bowl', 'pen', 'pencil', 'paper', 'bottle', 'box', 'bag', 'chair', 'table'];
- if (commonObjects.some(obj => lowerObject.includes(obj))) {
- return 'low';
- }
-
- // Medium: Default for most items
- return 'medium';
- }
-
- /**
- * Generate a combined prompt for monster generation
- */
- static createMonsterPrompt(
- objectName: string,
- visualDetails: string,
- attributes: string[]
- ): string {
- const attributeText = attributes.length > 0
- ? ` with ${attributes.join(', ')} characteristics`
- : '';
-
- return `Create a Pokemon-style creature based on a ${objectName}${attributeText}.
-
- Visual inspiration: ${visualDetails}
-
- The creature should embody the essence of a ${objectName} while incorporating these visual elements into its design. Make it cute but distinctive, with clear ${objectName}-inspired features.`
- }
  }
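
After this change the service makes a single /stream_chat call and returns one free-text caption; identifying the canonical object, variation, and rarity now happens downstream in the GPT-OSS concept prompt. A rough usage sketch, assuming a Joy Caption client that has already been connected elsewhere (as in PicletGenerator.svelte); the describeImage wrapper below is illustrative, not part of the commit.

import type { GradioClient } from '$lib/types';
import { EnhancedCaptionService } from '$lib/services/enhancedCaption';

// Returns the raw scene description; object/rarity parsing is left to the
// downstream GPT-OSS concept prompt.
async function describeImage(joyCaptionClient: GradioClient, image: Blob | File): Promise<string> {
  const { caption } = await EnhancedCaptionService.generateEnhancedCaption(joyCaptionClient, image);
  return caption;
}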