Fraser committed
Commit ce1f183 · 1 Parent(s): 48e6066
src/lib/components/PicletGenerator/PicletGenerator.svelte CHANGED
@@ -289,22 +289,14 @@ Focus on: colors, body shape, eyes, limbs, mouth, and key visual features. Omit
  }
 
  try {
- // Use enhanced captioning for object identification
+ // Get detailed scene description from Joy Caption
  const captionResult = await EnhancedCaptionService.generateEnhancedCaption(
  joyCaptionClient,
  workflowState.userImage
  );
 
- // Store caption results
- workflowState.imageCaption = captionResult.fullCaption;
- workflowState.objectName = captionResult.canonicalName;
- workflowState.objectAttributes = captionResult.variation ? [captionResult.variation] : [];
- workflowState.visualDetails = captionResult.visualDetails;
-
- console.log('Object identified:', workflowState.objectName);
- console.log('Variation:', captionResult.variation);
- console.log('Tier:', captionResult.tier);
- console.log('Visual details:', workflowState.visualDetails);
+ workflowState.imageCaption = captionResult.caption;
+ console.log('Scene description:', captionResult.caption);
 
  // Skip server lookup for now - always create new piclet
  workflowState.discoveryStatus = 'new';
@@ -323,46 +315,82 @@ Focus on: colors, body shape, eyes, limbs, mouth, and key visual features. Omit
  return;
  }
 
- if (!gptOssClient || !workflowState.objectName) {
- throw new Error('Text generation service not available or no object identified');
+ if (!gptOssClient || !workflowState.imageCaption) {
+ throw new Error('Cannot generate concept without scene description');
  }
 
- // Create monster prompt using object and visual details
- const monsterPrompt = EnhancedCaptionService.createMonsterPrompt(
- workflowState.objectName,
- workflowState.visualDetails || '',
- workflowState.objectAttributes || []
- );
+ const conceptPrompt = `You are analyzing an image to create a Pokemon-style creature. Here's the image description:
+
+ "${workflowState.imageCaption}"
 
- const conceptPrompt = `${monsterPrompt}
+ Your task:
+ 1. Identify the PRIMARY PHYSICAL OBJECT with SPECIFICITY (e.g., "macbook" not "laptop", "eiffel tower" not "tower", "iphone" not "phone", "starbucks mug" not "mug")
+ 2. Determine if there's a meaningful VARIATION (e.g., "silver", "pro", "night", "gaming", "vintage")
+ 3. Assess rarity based on uniqueness
+ 4. Create a complete Pokemon-style monster concept
 
- Format your response exactly as follows:
+ Format your response EXACTLY as follows:
  \`\`\`md
+ # Canonical Object
+ {Specific object name: "macbook", "eiffel tower", "iphone", "tesla", "le creuset mug", "nintendo switch"}
+ {NOT generic terms like: "laptop", "tower", "phone", "car", "mug", "console"}
+ {Include brand/model/landmark name when identifiable}
+
+ # Variation
+ {OPTIONAL: one distinctive attribute like "silver", "pro", "night", "gaming", or NONE if not applicable}
+
  # Object Rarity
- {Assess rarity based on the ${workflowState.objectName}. Use: common, uncommon, rare, epic, or legendary}
+ {common, uncommon, rare, epic, or legendary based on object uniqueness}
 
  # Monster Name
- {Creative name related to ${workflowState.objectName}, 11 letters max}
+ {Creative 8-11 letter name based on the SPECIFIC object, e.g., "Macbyte" for MacBook, "Towerfell" for Eiffel Tower}
 
  # Primary Type
- {Choose the most fitting type based on ${workflowState.objectName}: beast, bug, aquatic, flora, mineral, space, machina, structure, culture, or cuisine}
+ {beast, bug, aquatic, flora, mineral, space, machina, structure, culture, or cuisine}
 
  # Physical Stats
  Height: {e.g., "1.2m" or "3'5\""}
  Weight: {e.g., "15kg" or "33 lbs"}
 
  # Personality
- {Brief 1-2 sentence personality description. e.g., "Playful and curious, loves exploring new places" or "Shy but loyal, protective of friends"}
+ {1-2 sentences describing personality traits}
 
  # Monster Description
- {Detailed physical description of the ${workflowState.objectName}-based creature. Include how the object's features become creature features. Focus on eyes, limbs, mouth, and distinctive elements. This is the bio text for the creature.}
+ {2-3 paragraphs describing how the SPECIFIC object's features translate into monster features. Reference the actual object by name. This is the creature's bio.}
 
  # Monster Image Prompt
- {Visual description of the ${workflowState.objectName} monster for image generation. Include body shape, colors, pose, and key features. Focus on anime-style visual design.}
- \`\`\``;
-
+ {Concise visual description for anime-style image generation focusing on colors, shapes, and key features inspired by the specific object}
+ \`\`\`
+
+ CRITICAL RULES:
+ - Canonical Object MUST be SPECIFIC: "macbook" not "laptop", "big ben" not "clock tower", "coca cola" not "soda"
+ - If you can identify a brand, model, or proper name from the description, USE IT
+ - Variation should be meaningful and distinctive (material, style, color, context, or model variant)
+ - Monster Description must describe the CREATURE with references to the specific object's features
+ - Primary Type must match the object category (machina for electronics, structure for buildings, etc.)`;
+
  try {
  const responseText = await generateText(conceptPrompt);
+
+ // Validate response has expected structure
+ if (!responseText.includes('# Canonical Object') ||
+ !responseText.includes('# Monster Name')) {
+ console.error('GPT-OSS returned invalid response:', responseText);
+ throw new Error('Failed to generate valid monster concept');
+ }
+
+ workflowState.picletConcept = responseText;
+
+ // Extract and store canonical name and variation immediately for use in other steps
+ const canonicalMatch = responseText.match(/# Canonical Object\s*\n([\s\S]*?)(?=^#)/m);
+ const variationMatch = responseText.match(/# Variation\s*\n([\s\S]*?)(?=^#)/m);
+
+ workflowState.objectName = canonicalMatch ? canonicalMatch[1].trim().toLowerCase() : 'unknown';
+ const variationText = variationMatch ? variationMatch[1].trim() : '';
+ workflowState.objectAttributes = variationText && variationText !== 'NONE' ? [variationText.toLowerCase()] : [];
+
+ console.log('Parsed specific object:', workflowState.objectName);
+ console.log('Parsed variation:', workflowState.objectAttributes);
 
  if (!responseText || responseText.trim() === '') {
  throw new Error('Failed to generate monster concept');
@@ -543,7 +571,11 @@ Create a concise visual description (1-3 sentences, max 100 words). Focus only o
 
  // Extract description
  const descriptionMatch = workflowState.picletConcept.match(/# Monster Description\s*\n([\s\S]*?)(?=^#|$)/m);
- let description = descriptionMatch ? descriptionMatch[1].trim() : workflowState.imageCaption || 'A mysterious creature';
+ if (!descriptionMatch) {
+ console.error('Monster description not found in concept:', workflowState.picletConcept);
+ throw new Error('Failed to extract monster description from AI response');
+ }
+ let description = descriptionMatch[1].trim();
 
  // Extract physical stats
  const physicalStatsMatch = workflowState.picletConcept.match(/# Physical Stats\s*\n([\s\S]*?)(?=^#|$)/m);
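
Note: the new concept step repeats the same section-parsing pattern for several headings of the GPT-OSS reply (# Canonical Object, # Variation, # Monster Description, # Physical Stats). Below is a minimal sketch of that pattern as a standalone helper; extractSection is hypothetical and not part of this commit, and it accepts either the next heading or end-of-text as the section boundary.

// Hypothetical helper (not in this commit) showing the section-parsing pattern
// the new flow relies on: capture everything under a "# Heading" line up to the
// next heading or the end of the response.
function extractSection(concept: string, heading: string): string | undefined {
  const pattern = new RegExp(`# ${heading}\\s*\\n([\\s\\S]*?)(?=^#|$)`, 'm');
  const match = concept.match(pattern);
  return match ? match[1].trim() : undefined;
}

// Example against a response shaped like the prompt's template:
const sample = '# Canonical Object\nmacbook\n\n# Variation\nsilver\n\n# Monster Name\nMacbyte\n';
console.log(extractSection(sample, 'Canonical Object')); // "macbook"
console.log(extractSection(sample, 'Variation'));        // "silver"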
src/lib/components/PicletGenerator/WorkflowProgress.svelte CHANGED
@@ -32,8 +32,8 @@
  },
  {
  id: 'statsGenerating',
- label: 'Battle Stats',
- description: 'Generating abilities'
+ label: 'Characteristics',
+ description: 'Generating traits'
  },
  {
  id: 'promptCrafting',
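
For context, each entry in the component's step list carries an id, a label, and a description, which is all this hunk touches. A small sketch of that shape; the interface name and any additional fields on the real type are assumptions.

// Assumed shape of a progress step; the actual type in WorkflowProgress.svelte
// may carry more fields than the three visible in this diff.
interface WorkflowStep {
  id: string;
  label: string;
  description: string;
}

const statsStep: WorkflowStep = {
  id: 'statsGenerating',
  label: 'Characteristics',        // was 'Battle Stats'
  description: 'Generating traits' // was 'Generating abilities'
};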
src/lib/services/enhancedCaption.ts CHANGED
@@ -1,233 +1,35 @@
  import type { GradioClient } from '$lib/types';
 
  export interface EnhancedCaptionResult {
- objectCaption: string; // Main object identification
- visualDetails: string; // Additional visual details for monster generation
- fullCaption: string; // Complete original caption
- extractedObject?: string; // Parsed primary object (normalized)
- extractedAttributes?: string[]; // Parsed attributes
- canonicalName: string; // Normalized canonical object name
- variation?: string; // Optional variation (e.g., "night", "toy", "vintage")
- tier: 'low' | 'medium' | 'high' | 'legendary'; // Rarity tier based on uniqueness
+ caption: string; // Detailed scene description with specific objects/brands
  }
 
  export class EnhancedCaptionService {
  /**
- * Generate multiple captions to extract object and visual details
+ * Generate a detailed scene description for GPT-OSS to parse
  */
  static async generateEnhancedCaption(
  client: GradioClient,
  image: Blob | File
  ): Promise<EnhancedCaptionResult> {
  try {
- // First caption: Focus on object identification
- const objectPrompt = "Identify the main object in this image. Start with 'This is a/an' followed by the object name and up to 2-3 key attributes (material, color, or style). Be concise and focus only on WHAT the object is, not where it is or what surrounds it.";
-
- const objectResult = await client.predict("/stream_chat", [
- image,
- "Descriptive", // caption type
- "short", // length - short for object identification
- [], // extra_options
- "", // name_input
- objectPrompt // custom_prompt for object focus
- ]);
-
- const objectCaption = objectResult.data[1] as string;
-
- // Second caption: Get visual details for monster generation
- const detailsPrompt = "Describe the unique visual characteristics, textures, patterns, and interesting details of this object that would make it distinctive as a creature. Focus on surface details, decorative elements, and any unusual features. Do not repeat the object name.";
-
- const detailsResult = await client.predict("/stream_chat", [
+ const result = await client.predict("/stream_chat", [
  image,
  "Descriptive",
- "medium-length", // More details for visual generation
+ "medium-length",
  [],
  "",
- detailsPrompt
+ "Describe this image in detail, identifying any recognizable objects, brands, logos, or specific models. Be specific about product names and types."
  ]);
 
- const visualDetails = detailsResult.data[1] as string;
-
- // Third caption: Full descriptive caption as backup
- const fullResult = await client.predict("/stream_chat", [
- image,
- "Descriptive",
- "long",
- [],
- "",
- "" // No custom prompt for natural full description
- ]);
-
- const fullCaption = fullResult.data[1] as string;
-
- // Extract structured data from object caption
- const extraction = this.parseObjectCaption(objectCaption);
-
- // Determine variation from attributes (limit to 1 meaningful variation)
- const variation = this.determineVariation(extraction.attributes);
-
- // Assess tier based on object uniqueness
- const tier = this.assessTier(extraction.object, fullCaption);
+ const caption = result.data[1] as string;
 
  return {
- objectCaption,
- visualDetails,
- fullCaption,
- extractedObject: extraction.object,
- extractedAttributes: extraction.attributes,
- canonicalName: extraction.object,
- variation,
- tier
+ caption
  };
  } catch (error) {
- console.error('Enhanced caption generation failed:', error);
+ console.error('Caption generation failed:', error);
  throw error;
  }
  }
-
- /**
- * Parse the object-focused caption to extract structured data
- */
- private static parseObjectCaption(caption: string): {
- object: string;
- attributes: string[];
- } {
- // Remove "This is a/an" prefix
- let cleaned = caption
- .replace(/^(This is|It'?s|That'?s)\s+(a|an|the)?\s*/i, '')
- .trim();
-
- // Common attribute words to extract
- const attributePatterns = [
- // Materials
- /\b(wooden|metal|plastic|glass|leather|velvet|silk|cotton|stone|marble|ceramic|porcelain)\b/gi,
- // Colors
- /\b(red|blue|green|yellow|purple|orange|black|white|gray|brown|pink|gold|silver)\b/gi,
- // Styles
- /\b(modern|vintage|antique|rustic|minimalist|ornate|gothic|retro|classic)\b/gi,
- // Patterns
- /\b(striped|checkered|floral|geometric|polka-dot|plaid)\b/gi
- ];
-
- const attributes: string[] = [];
- let objectText = cleaned;
-
- // Extract attributes from the caption
- for (const pattern of attributePatterns) {
- const matches = cleaned.match(pattern);
- if (matches) {
- attributes.push(...matches.map(m => m.toLowerCase()));
- // Remove matched attributes from object text
- objectText = objectText.replace(pattern, '').trim();
- }
- }
-
- // Clean up object text - get the core noun
- const words = objectText.split(/\s+/).filter(w => w.length > 0);
-
- // Remove common descriptive words that aren't the object
- const filterWords = ['with', 'that', 'which', 'having', 'featuring', 'very', 'quite', 'rather'];
- const objectWords = words.filter(w => !filterWords.includes(w.toLowerCase()));
-
- // The object is typically the first significant noun
- let object = objectWords[0] || 'object';
-
- // Handle compound objects (e.g., "coffee mug" -> "mug", "throw pillow" -> "pillow")
- const compoundMappings: Record<string, string> = {
- 'coffee': 'mug',
- 'throw': 'pillow',
- 'picture': 'frame',
- 'water': 'bottle',
- 'wine': 'glass',
- 'flower': 'vase',
- 'table': 'lamp',
- 'desk': 'lamp',
- 'floor': 'lamp'
- };
-
- if (compoundMappings[object.toLowerCase()] && objectWords.length > 1) {
- object = objectWords[1];
- }
-
- // Limit attributes to top 3 most relevant
- const uniqueAttributes = [...new Set(attributes)].slice(0, 3);
-
- return {
- object: object.toLowerCase(),
- attributes: uniqueAttributes
- };
- }
-
- /**
- * Determine variation from attributes (limit to 1 most meaningful)
- * Priority: style > material > color
- */
- private static determineVariation(attributes: string[]): string | undefined {
- if (attributes.length === 0) return undefined;
-
- // Style attributes take priority (most distinctive)
- const styleWords = ['modern', 'vintage', 'antique', 'rustic', 'minimalist', 'ornate', 'gothic', 'retro', 'classic'];
- const styleAttr = attributes.find(attr => styleWords.includes(attr.toLowerCase()));
- if (styleAttr) return styleAttr;
-
- // Material attributes second priority
- const materialWords = ['wooden', 'metal', 'plastic', 'glass', 'leather', 'velvet', 'silk', 'cotton', 'stone', 'marble', 'ceramic', 'porcelain'];
- const materialAttr = attributes.find(attr => materialWords.includes(attr.toLowerCase()));
- if (materialAttr) return materialAttr;
-
- // Only use color if no style or material
- const colorWords = ['red', 'blue', 'green', 'yellow', 'purple', 'orange', 'black', 'white', 'gray', 'brown', 'pink', 'gold', 'silver'];
- const colorAttr = attributes.find(attr => colorWords.includes(attr.toLowerCase()));
- if (colorAttr) return colorAttr;
-
- return undefined; // No meaningful variation
- }
-
- /**
- * Assess tier/rarity based on object uniqueness
- */
- private static assessTier(objectName: string, caption: string): 'low' | 'medium' | 'high' | 'legendary' {
- const lowerCaption = caption.toLowerCase();
- const lowerObject = objectName.toLowerCase();
-
- // Legendary: Famous landmarks, unique cultural items, specific branded items
- const legendaryKeywords = ['tower', 'monument', 'statue', 'pyramid', 'temple', 'castle', 'cathedral', 'famous', 'iconic', 'historic', 'ancient'];
- if (legendaryKeywords.some(kw => lowerObject.includes(kw) || lowerCaption.includes(kw))) {
- return 'legendary';
- }
-
- // High: Art, collectibles, specialized items
- const highKeywords = ['art', 'sculpture', 'antique', 'collectible', 'rare', 'ornate', 'decorative', 'handmade', 'crafted'];
- if (highKeywords.some(kw => lowerCaption.includes(kw))) {
- return 'high';
- }
-
- // Low: Common everyday items
- const commonObjects = ['cup', 'mug', 'plate', 'bowl', 'pen', 'pencil', 'paper', 'bottle', 'box', 'bag', 'chair', 'table'];
- if (commonObjects.some(obj => lowerObject.includes(obj))) {
- return 'low';
- }
-
- // Medium: Default for most items
- return 'medium';
- }
-
- /**
- * Generate a combined prompt for monster generation
- */
- static createMonsterPrompt(
- objectName: string,
- visualDetails: string,
- attributes: string[]
- ): string {
- const attributeText = attributes.length > 0
- ? ` with ${attributes.join(', ')} characteristics`
- : '';
-
- return `Create a Pokemon-style creature based on a ${objectName}${attributeText}.
-
- Visual inspiration: ${visualDetails}
-
- The creature should embody the essence of a ${objectName} while incorporating these visual elements into its design. Make it cute but distinctive, with clear ${objectName}-inspired features.`
- }
  }
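
After this change the service makes a single /stream_chat call and returns one free-text caption; identifying the canonical object, variation, and rarity now happens downstream in the GPT-OSS concept prompt. A rough usage sketch, assuming a Joy Caption client that has already been connected elsewhere (as in PicletGenerator.svelte); the describeImage wrapper below is illustrative, not part of the commit.

import type { GradioClient } from '$lib/types';
import { EnhancedCaptionService } from '$lib/services/enhancedCaption';

// Returns the raw scene description; object/rarity parsing is left to the
// downstream GPT-OSS concept prompt.
async function describeImage(joyCaptionClient: GradioClient, image: Blob | File): Promise<string> {
  const { caption } = await EnhancedCaptionService.generateEnhancedCaption(joyCaptionClient, image);
  return caption;
}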