Update app.py
app.py CHANGED
@@ -6,7 +6,7 @@ import gradio as gr
 import time
 import traceback
 import spaces
-
+import timm
 from torchvision.ops import nms, box_iou
 import torch.nn.functional as F
 from torchvision import transforms
@@ -72,56 +72,77 @@ dog_breeds = ["Afghan_Hound", "African_Hunting_Dog", "Airedale", "American_Staff
 class MultiHeadAttention(nn.Module):

     def __init__(self, in_dim, num_heads=8):
+        """
+        Initializes the MultiHeadAttention module.
+        Args:
+            in_dim (int): Dimension of the input features.
+            num_heads (int): Number of attention heads. Defaults to 8.
+        """
         super().__init__()
         self.num_heads = num_heads
-        self.head_dim = max(1, in_dim // num_heads)
-        self.scaled_dim = self.head_dim * num_heads
-        self.fc_in = nn.Linear(in_dim, self.scaled_dim)
-        self.query = nn.Linear(self.scaled_dim, self.scaled_dim)
-        self.key = nn.Linear(self.scaled_dim, self.scaled_dim)
-        self.value = nn.Linear(self.scaled_dim, self.scaled_dim)
-        self.fc_out = nn.Linear(self.scaled_dim, in_dim)
+        self.head_dim = max(1, in_dim // num_heads)  # Compute dimension per head
+        self.scaled_dim = self.head_dim * num_heads  # Scaled dimension after splitting into heads
+        self.fc_in = nn.Linear(in_dim, self.scaled_dim)  # Linear layer to project input to scaled_dim
+        self.query = nn.Linear(self.scaled_dim, self.scaled_dim)  # Query projection
+        self.key = nn.Linear(self.scaled_dim, self.scaled_dim)  # Key projection
+        self.value = nn.Linear(self.scaled_dim, self.scaled_dim)  # Value projection
+        self.fc_out = nn.Linear(self.scaled_dim, in_dim)  # Linear layer to project output back to in_dim

     def forward(self, x):
-
-
-
-
-
+        """
+        Forward pass for multi-head attention mechanism.
+        Args:
+            x (Tensor): Input tensor of shape (batch_size, input_dim).
+        Returns:
+            Tensor: Output tensor after applying attention mechanism.
+        """
+        N = x.shape[0]  # Batch size
+        x = self.fc_in(x)  # Project input to scaled_dim
+        q = self.query(x).view(N, self.num_heads, self.head_dim)  # Compute queries
+        k = self.key(x).view(N, self.num_heads, self.head_dim)  # Compute keys
+        v = self.value(x).view(N, self.num_heads, self.head_dim)  # Compute values

-
-
+        # Calculate attention scores
+        energy = torch.einsum("nqd,nkd->nqk", [q, k])  # Dot product between queries and keys
+        attention = F.softmax(energy / (self.head_dim ** 0.5), dim=2)  # Apply softmax with scaling

+        # Compute weighted sum of values based on attention scores
         out = torch.einsum("nqk,nvd->nqd", [attention, v])
-        out = out.reshape(N, self.scaled_dim)
-        out = self.fc_out(out)
+        out = out.reshape(N, self.scaled_dim)  # Concatenate all heads
+        out = self.fc_out(out)  # Project back to original input dimension
         return out

+
 class BaseModel(nn.Module):

     def __init__(self, num_classes, device='cuda' if torch.cuda.is_available() else 'cpu'):
         super().__init__()
         self.device = device

-        # 1.
-        self.backbone =
-
+        # 1. Initialize backbone, num_classes=0 to remove classifier layer
+        self.backbone = timm.create_model(
+            'convnextv2_base',
+            pretrained=True,
+            num_classes=0
+        )

-        # 2.
-        with torch.no_grad():
-            dummy_input = torch.randn(1, 3, 224, 224)
+        # 2. Use test data to determine actual feature dimensions
+        with torch.no_grad():  # No need to compute gradients
+            dummy_input = torch.randn(1, 3, 224, 224)  # Create example input
             features = self.backbone(dummy_input)
-            if len(features.shape) > 2:  # If the features are multi-dimensional
-                features = features.mean([-2, -1])  # Apply global average pooling
-            self.feature_dim = features.shape[1]  # Get the correct feature dimension
-
+
+            if len(features.shape) > 2:  # If features are multi-dimensional
+                features = features.mean([-2, -1])  # Apply global average pooling
+
+        self.feature_dim = features.shape[1]  # Get correct feature dimension
+
+        print(f"Feature Dimension from V2 backbone: {self.feature_dim}")

-        # 3.
+        # 3. Setup multi-head attention layer
         self.num_heads = max(1, min(8, self.feature_dim // 64))
         self.attention = MultiHeadAttention(self.feature_dim, num_heads=self.num_heads)

-        # 4.
+        # 4. Setup classifier
         self.classifier = nn.Sequential(
             nn.LayerNorm(self.feature_dim),
             nn.Dropout(0.3),
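Note: the dummy forward pass in this hunk is what lets __init__ discover the backbone's output width instead of hard-coding it. A standalone sketch of the same probing idea, assuming timm is installed; pretrained=False is used here only to avoid a weight download, and the printed width is expected to be 1024 for convnextv2_base, though that depends on the timm release.

import torch
import timm

# Probe a timm backbone for its feature width without hard-coding it.
# num_classes=0 strips the classification head, so the forward pass
# returns pooled features instead of logits.
backbone = timm.create_model('convnextv2_base', pretrained=False, num_classes=0)
backbone.eval()

with torch.no_grad():
    dummy = torch.randn(1, 3, 224, 224)   # one fake RGB image
    feats = backbone(dummy)               # pooled features, shape [1, C]
    if len(feats.shape) > 2:              # fall back to pooling raw spatial maps
        feats = feats.mean([-2, -1])

feature_dim = feats.shape[1]
print(feature_dim)                        # expected: 1024 for convnextv2_base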
@@ -130,27 +151,27 @@ class BaseModel(nn.Module):

     def forward(self, x):
         """
-
+        The forward propagation process combines V2's FCCA and the multi-head attention mechanism.
         Args:
-            x (Tensor):
+            x (Tensor): Input image tensor with shape [batch_size, channels, height, width]
         Returns:
-            Tuple[Tensor, Tensor]:
+            Tuple[Tensor, Tensor]: Classification logits and attention features.
         """
         x = x.to(self.device)

-        # 1.
+        # 1. Extract base features
         features = self.backbone(x)

-        # 2.
+        # 2. Process feature dimensions
         if len(features.shape) > 2:
-            #
-            #
-            features = features.mean([-2, -1])  #
+            # If feature dimensions are [batch_size, channels, height, width]
+            # Convert to [batch_size, channels]
+            features = features.mean([-2, -1])  # Use global average pooling

-        # 3.
+        # 3. Apply attention mechanism
         attended_features = self.attention(features)

-        # 4.
+        # 4. Final classification
         logits = self.classifier(attended_features)

         return logits, attended_features
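Note: the attention block wired into forward() above keeps the feature width unchanged, so it can sit between any pooled backbone output and the classifier head. A minimal shape check, assuming the MultiHeadAttention class exactly as defined in this diff:

import torch

attn = MultiHeadAttention(in_dim=1024, num_heads=8)   # head_dim = 128, scaled_dim = 1024
x = torch.randn(4, 1024)                               # batch of 4 pooled feature vectors
out = attn(x)
print(out.shape)                                       # torch.Size([4, 1024]), same width as the input

Because the input is a single pooled vector per image, the queries, keys and values are views over the heads of that vector, so attention here runs across heads rather than across spatial positions.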
@@ -211,7 +232,7 @@ class ModelManager:
         ).to(self.device)

         checkpoint = torch.load(
-            '
+            'ConvNextV2Base_best_model_dog.pth',
             map_location=self.device  # Make sure the checkpoint is loaded onto the correct device
         )
         self._breed_model.load_state_dict(checkpoint['base_model'], strict=False)
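Note: the new path points at a checkpoint that is expected to be a dict with a 'base_model' entry (see the load_state_dict call just below it). A hypothetical save/load round trip illustrating that convention; the stand-in model, the file name 'demo_checkpoint.pth', and everything except the 'base_model' key are illustrative only.

import torch
import torch.nn as nn

model = nn.Linear(8, 2)  # stand-in for the real BaseModel

# Saving side (illustrative): weights live under the 'base_model' key,
# which is why the loader indexes checkpoint['base_model'].
torch.save({'base_model': model.state_dict()}, 'demo_checkpoint.pth')

# Loading side, mirroring the hunk above: map_location keeps loading working
# on CPU-only machines, and strict=False tolerates missing or unexpected keys
# after architecture changes.
checkpoint = torch.load('demo_checkpoint.pth', map_location='cpu')
model.load_state_dict(checkpoint['base_model'], strict=False)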
@@ -271,7 +292,7 @@ def predict_single_dog(image):
     return probabilities[0], breeds[:3], relative_probs

 @spaces.GPU
-def detect_multiple_dogs(image, conf_threshold=0.3, iou_threshold=0.55):
+def detect_multiple_dogs(image, conf_threshold=0.3, iou_threshold=0.3):
     """
     Detect dogs in the image using the YOLO model.
     Keep only objects identified as dogs (class 16) and mark their status.
@@ -310,10 +331,10 @@ def detect_multiple_dogs(image, conf_threshold=0.3, iou_threshold=0.55):
         x1, y1, x2, y2 = box
         w, h = x2 - x1, y2 - y1
         # Expand the detection box to include the complete dog
-        x1 = max(0, x1 - w * 0.
-        y1 = max(0, y1 - h * 0.
-        x2 = min(image.width, x2 + w * 0.
-        y2 = min(image.height, y2 + h * 0.
+        x1 = max(0, x1 - w * 0.02)
+        y1 = max(0, y1 - h * 0.02)
+        x2 = min(image.width, x2 + w * 0.02)
+        y2 = min(image.height, y2 + h * 0.02)
         cropped_image = image.crop((x1, y1, x2, y2))
         detected_objects.append((cropped_image, confidence, [x1, y1, x2, y2], is_dog))

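Note: the four changed lines pad each detected box by 2% of its width and height before cropping, clamped to the image bounds. The same logic as a small standalone helper (the helper name and the explicit width/height arguments are for illustration only):

def expand_box(box, image_width, image_height, ratio=0.02):
    # Pad a box by `ratio` of its size on every side, clamped to the image.
    x1, y1, x2, y2 = box
    w, h = x2 - x1, y2 - y1
    x1 = max(0, x1 - w * ratio)
    y1 = max(0, y1 - h * ratio)
    x2 = min(image_width, x2 + w * ratio)
    y2 = min(image_height, y2 + h * ratio)
    return x1, y1, x2, y2

# A 100x200 box inside a 640x480 image grows by 2 px horizontally and 4 px vertically.
print(expand_box((10, 20, 110, 220), 640, 480))  # (8.0, 16.0, 112.0, 224.0)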
@@ -442,9 +463,9 @@ def predict(image):
         combined_confidence = detection_confidence * top1_prob

         # Decide the output format based on confidence
-        if combined_confidence < 0.
+        if combined_confidence < 0.15:
             dogs_info += format_unknown_breed_message(color, i+1)
-        elif top1_prob >= 0.
+        elif top1_prob >= 0.4:
             breed = topk_breeds[0]
             description = get_dog_description(breed)
             if description is None:
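Note: these two thresholds decide how each crop is reported: a very low combined confidence falls back to an unknown-breed message, while a strong top-1 probability yields a single-breed result. A toy sketch of the branching; the 0.15 and 0.4 values come from this hunk, while the middle branch label ("top-3 candidates") is an assumption about the unchanged code that follows these lines.

def describe_confidence(detection_confidence, top1_prob):
    combined = detection_confidence * top1_prob
    if combined < 0.15:        # too uncertain overall
        return "unknown breed"
    elif top1_prob >= 0.4:     # classifier is confident about one breed
        return "single breed"
    else:                      # middle ground (assumed: show several candidates)
        return "top-3 candidates"

print(describe_confidence(0.9, 0.10))  # unknown breed (combined = 0.09)
print(describe_confidence(0.9, 0.80))  # single breed
print(describe_confidence(0.9, 0.30))  # top-3 candidates (combined = 0.27)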
@@ -555,7 +576,7 @@ def main():
         'Border_Collie.jpg',
         'Golden_Retriever.jpeg',
         'Saint_Bernard.jpeg',
-        'Samoyed.
+        'Samoyed.jpeg',
         'French_Bulldog.jpeg'
     ]
     detection_components = create_detection_tab(predict, example_images)