Update app.py
app.py CHANGED
@@ -6,7 +6,7 @@ import gradio as gr
 import time
 import traceback
 import spaces
-
+import timm
 from torchvision.ops import nms, box_iou
 import torch.nn.functional as F
 from torchvision import transforms
@@ -72,56 +72,77 @@ dog_breeds = ["Afghan_Hound", "African_Hunting_Dog", "Airedale", "American_Staff
 class MultiHeadAttention(nn.Module):

     def __init__(self, in_dim, num_heads=8):
+        """
+        Initializes the MultiHeadAttention module.
+        Args:
+            in_dim (int): Dimension of the input features.
+            num_heads (int): Number of attention heads. Defaults to 8.
+        """
         super().__init__()
         self.num_heads = num_heads
-        self.head_dim = max(1, in_dim // num_heads)
-        self.scaled_dim = self.head_dim * num_heads
-        self.fc_in = nn.Linear(in_dim, self.scaled_dim)
-        self.query = nn.Linear(self.scaled_dim, self.scaled_dim)
-        self.key = nn.Linear(self.scaled_dim, self.scaled_dim)
-        self.value = nn.Linear(self.scaled_dim, self.scaled_dim)
-        self.fc_out = nn.Linear(self.scaled_dim, in_dim)
+        self.head_dim = max(1, in_dim // num_heads)  # Compute dimension per head
+        self.scaled_dim = self.head_dim * num_heads  # Scaled dimension after splitting into heads
+        self.fc_in = nn.Linear(in_dim, self.scaled_dim)  # Linear layer to project input to scaled_dim
+        self.query = nn.Linear(self.scaled_dim, self.scaled_dim)  # Query projection
+        self.key = nn.Linear(self.scaled_dim, self.scaled_dim)  # Key projection
+        self.value = nn.Linear(self.scaled_dim, self.scaled_dim)  # Value projection
+        self.fc_out = nn.Linear(self.scaled_dim, in_dim)  # Linear layer to project output back to in_dim

     def forward(self, x):
-
-
-
-
-
+        """
+        Forward pass for multi-head attention mechanism.
+        Args:
+            x (Tensor): Input tensor of shape (batch_size, input_dim).
+        Returns:
+            Tensor: Output tensor after applying attention mechanism.
+        """
+        N = x.shape[0]  # Batch size
+        x = self.fc_in(x)  # Project input to scaled_dim
+        q = self.query(x).view(N, self.num_heads, self.head_dim)  # Compute queries
+        k = self.key(x).view(N, self.num_heads, self.head_dim)  # Compute keys
+        v = self.value(x).view(N, self.num_heads, self.head_dim)  # Compute values

-
-
+        # Calculate attention scores
+        energy = torch.einsum("nqd,nkd->nqk", [q, k])  # Dot product between queries and keys
+        attention = F.softmax(energy / (self.head_dim ** 0.5), dim=2)  # Apply softmax with scaling

+        # Compute weighted sum of values based on attention scores
         out = torch.einsum("nqk,nvd->nqd", [attention, v])
-        out = out.reshape(N, self.scaled_dim)
-        out = self.fc_out(out)
+        out = out.reshape(N, self.scaled_dim)  # Concatenate all heads
+        out = self.fc_out(out)  # Project back to original input dimension
         return out

+
 class BaseModel(nn.Module):

     def __init__(self, num_classes, device='cuda' if torch.cuda.is_available() else 'cpu'):
         super().__init__()
         self.device = device

-        # 1.
-        self.backbone =
-
+        # 1. Initialize backbone, num_classes=0 to remove classifier layer
+        self.backbone = timm.create_model(
+            'convnextv2_base',
+            pretrained=True,
+            num_classes=0
+        )

-        # 2.
-        with torch.no_grad():
-            dummy_input = torch.randn(1, 3, 224, 224)
+        # 2. Use test data to determine actual feature dimensions
+        with torch.no_grad():  # No need to compute gradients
+            dummy_input = torch.randn(1, 3, 224, 224)  # Create example input
             features = self.backbone(dummy_input)
-            if len(features.shape) > 2:  # If the features are multi-dimensional
-                features = features.mean([-2, -1])  # Apply global average pooling
-            self.feature_dim = features.shape[1]  # Get the correct feature dimension
-
+
+            if len(features.shape) > 2:  # If features are multi-dimensional
+                features = features.mean([-2, -1])  # Apply global average pooling
+
+        self.feature_dim = features.shape[1]  # Get correct feature dimension
+
+        print(f"Feature Dimension from V2 backbone: {self.feature_dim}")

-        # 3.
+        # 3. Setup multi-head attention layer
         self.num_heads = max(1, min(8, self.feature_dim // 64))
         self.attention = MultiHeadAttention(self.feature_dim, num_heads=self.num_heads)

-        # 4.
+        # 4. Setup classifier
         self.classifier = nn.Sequential(
             nn.LayerNorm(self.feature_dim),
             nn.Dropout(0.3),
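Note: the dummy forward pass in this hunk is what lets __init__ discover the backbone's output width instead of hard-coding it. A standalone sketch of the same probing idea, assuming timm is installed; pretrained=False is used here only to avoid a weight download, and the printed width is expected to be 1024 for convnextv2_base, though that depends on the timm release.

import torch
import timm

# Probe a timm backbone for its feature width without hard-coding it.
# num_classes=0 strips the classification head, so the forward pass
# returns pooled features instead of logits.
backbone = timm.create_model('convnextv2_base', pretrained=False, num_classes=0)
backbone.eval()

with torch.no_grad():
    dummy = torch.randn(1, 3, 224, 224)   # one fake RGB image
    feats = backbone(dummy)               # pooled features, shape [1, C]
    if len(feats.shape) > 2:              # fall back to pooling raw spatial maps
        feats = feats.mean([-2, -1])

feature_dim = feats.shape[1]
print(feature_dim)                        # expected: 1024 for convnextv2_base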
@@ -130,27 +151,27 @@ class BaseModel(nn.Module):

     def forward(self, x):
         """
-
+        The forward propagation process combines V2's FCCA and the multi-head attention mechanism.
         Args:
-            x (Tensor):
+            x (Tensor): Input image tensor with shape [batch_size, channels, height, width]
         Returns:
-            Tuple[Tensor, Tensor]:
+            Tuple[Tensor, Tensor]: Classification logits and attention features.
         """
         x = x.to(self.device)

-        # 1.
+        # 1. Extract base features
         features = self.backbone(x)

-        # 2.
+        # 2. Process feature dimensions
         if len(features.shape) > 2:
-            #
-            #
-            features = features.mean([-2, -1])  #
+            # If feature dimensions are [batch_size, channels, height, width]
+            # Convert to [batch_size, channels]
+            features = features.mean([-2, -1])  # Use global average pooling

-        # 3.
+        # 3. Apply attention mechanism
         attended_features = self.attention(features)

-        # 4.
+        # 4. Final classification
         logits = self.classifier(attended_features)

         return logits, attended_features
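Note: the attention block wired into forward() above keeps the feature width unchanged, so it can sit between any pooled backbone output and the classifier head. A minimal shape check, assuming the MultiHeadAttention class exactly as defined in this diff:

import torch

attn = MultiHeadAttention(in_dim=1024, num_heads=8)   # head_dim = 128, scaled_dim = 1024
x = torch.randn(4, 1024)                               # batch of 4 pooled feature vectors
out = attn(x)
print(out.shape)                                       # torch.Size([4, 1024]), same width as the input

Because the input is a single pooled vector per image, the queries, keys and values are views over the heads of that vector, so attention here runs across heads rather than across spatial positions.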
@@ -211,7 +232,7 @@ class ModelManager:
         ).to(self.device)

         checkpoint = torch.load(
-            '
+            'ConvNextV2Base_best_model_dog.pth',
             map_location=self.device  # Make sure the checkpoint is loaded onto the correct device
         )
         self._breed_model.load_state_dict(checkpoint['base_model'], strict=False)
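Note: the new path points at a checkpoint that is expected to be a dict with a 'base_model' entry (see the load_state_dict call just below it). A hypothetical save/load round trip illustrating that convention; the stand-in model, the file name 'demo_checkpoint.pth', and everything except the 'base_model' key are illustrative only.

import torch
import torch.nn as nn

model = nn.Linear(8, 2)  # stand-in for the real BaseModel

# Saving side (illustrative): weights live under the 'base_model' key,
# which is why the loader indexes checkpoint['base_model'].
torch.save({'base_model': model.state_dict()}, 'demo_checkpoint.pth')

# Loading side, mirroring the hunk above: map_location keeps loading working
# on CPU-only machines, and strict=False tolerates missing or unexpected keys
# after architecture changes.
checkpoint = torch.load('demo_checkpoint.pth', map_location='cpu')
model.load_state_dict(checkpoint['base_model'], strict=False)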
@@ -271,7 +292,7 @@ def predict_single_dog(image):
     return probabilities[0], breeds[:3], relative_probs

 @spaces.GPU
-def detect_multiple_dogs(image, conf_threshold=0.3, iou_threshold=0.55):
+def detect_multiple_dogs(image, conf_threshold=0.3, iou_threshold=0.3):
     """
     Detect dogs in the image using the YOLO model.
     Keep only objects identified as dogs (class 16) and mark their status.
@@ -310,10 +331,10 @@ def detect_multiple_dogs(image, conf_threshold=0.3, iou_threshold=0.55):
         x1, y1, x2, y2 = box
         w, h = x2 - x1, y2 - y1
         # Expand the detection box to include the complete dog
-        x1 = max(0, x1 - w * 0.
-        y1 = max(0, y1 - h * 0.
-        x2 = min(image.width, x2 + w * 0.
-        y2 = min(image.height, y2 + h * 0.
+        x1 = max(0, x1 - w * 0.02)
+        y1 = max(0, y1 - h * 0.02)
+        x2 = min(image.width, x2 + w * 0.02)
+        y2 = min(image.height, y2 + h * 0.02)
         cropped_image = image.crop((x1, y1, x2, y2))
         detected_objects.append((cropped_image, confidence, [x1, y1, x2, y2], is_dog))

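Note: the four changed lines pad each detected box by 2% of its width and height before cropping, clamped to the image bounds. The same logic as a small standalone helper (the helper name and the explicit width/height arguments are for illustration only):

def expand_box(box, image_width, image_height, ratio=0.02):
    # Pad a box by `ratio` of its size on every side, clamped to the image.
    x1, y1, x2, y2 = box
    w, h = x2 - x1, y2 - y1
    x1 = max(0, x1 - w * ratio)
    y1 = max(0, y1 - h * ratio)
    x2 = min(image_width, x2 + w * ratio)
    y2 = min(image_height, y2 + h * ratio)
    return x1, y1, x2, y2

# A 100x200 box inside a 640x480 image grows by 2 px horizontally and 4 px vertically.
print(expand_box((10, 20, 110, 220), 640, 480))  # (8.0, 16.0, 112.0, 224.0)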
@@ -442,9 +463,9 @@ def predict(image):
         combined_confidence = detection_confidence * top1_prob

         # Decide the output format based on confidence
-        if combined_confidence < 0.
+        if combined_confidence < 0.15:
             dogs_info += format_unknown_breed_message(color, i+1)
-        elif top1_prob >= 0.
+        elif top1_prob >= 0.4:
             breed = topk_breeds[0]
             description = get_dog_description(breed)
             if description is None:
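Note: these two thresholds decide how each crop is reported: a very low combined confidence falls back to an unknown-breed message, while a strong top-1 probability yields a single-breed result. A toy sketch of the branching; the 0.15 and 0.4 values come from this hunk, while the middle branch label ("top-3 candidates") is an assumption about the unchanged code that follows these lines.

def describe_confidence(detection_confidence, top1_prob):
    combined = detection_confidence * top1_prob
    if combined < 0.15:        # too uncertain overall
        return "unknown breed"
    elif top1_prob >= 0.4:     # classifier is confident about one breed
        return "single breed"
    else:                      # middle ground (assumed: show several candidates)
        return "top-3 candidates"

print(describe_confidence(0.9, 0.10))  # unknown breed (combined = 0.09)
print(describe_confidence(0.9, 0.80))  # single breed
print(describe_confidence(0.9, 0.30))  # top-3 candidates (combined = 0.27)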
@@ -555,7 +576,7 @@ def main():
         'Border_Collie.jpg',
         'Golden_Retriever.jpeg',
         'Saint_Bernard.jpeg',
-        'Samoyed.
+        'Samoyed.jpeg',
         'French_Bulldog.jpeg'
     ]
     detection_components = create_detection_tab(predict, example_images)