Spaces:
Running
Running
עיצוב מחדש עם תמונות דוגמה נוספות
Browse files- app.py +47 -32
- sam2.1/configs/sam2.1/sam2.1_hiera_b+.yaml +0 -116
- sam2.1/configs/sam2.1/sam2.1_hiera_l.yaml +0 -120
- sam2.1/configs/sam2.1/sam2.1_hiera_s.yaml +0 -119
- sam2.1/configs/sam2.1/sam2.1_hiera_t.yaml +0 -121
- style.css +41 -0
app.py
CHANGED
@@ -37,42 +37,57 @@ def inference(image: Image.Image, gemini_api_key: str):
|
|
37 |
error_message += traceback.format_exc()
|
38 |
return None, gr.update(value=error_message, visible=True) # החזרת Textbox גלוי עם שגיאה
|
39 |
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
"""
|
47 |
|
48 |
-
#
|
49 |
-
|
50 |
|
51 |
-
|
52 |
-
fn=inference,
|
53 |
-
inputs=[
|
54 |
-
gr.Image(type="pil", label="בחר תמונה לניתוח או גרור אותה לכאן"),
|
55 |
-
gr.Textbox(
|
56 |
-
label="מפתח API של Gemini",
|
57 |
-
placeholder="הכנס את מפתח ה-API כאן",
|
58 |
-
type="password"
|
59 |
-
)
|
60 |
-
],
|
61 |
-
outputs=[
|
62 |
-
gr.Image(type="pil", label="תוצאה סופית"),
|
63 |
-
gr.Textbox(label="שגיאות", visible=False) # הוספת רכיב להצגת שגיאות
|
64 |
-
],
|
65 |
title=title_str,
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
flagging_mode="never",
|
71 |
-
theme=gr.themes.Default() # עיצוב קליל לממשק
|
72 |
-
)
|
73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
|
75 |
-
|
76 |
-
|
77 |
-
|
|
|
|
|
78 |
|
|
|
|
|
|
37 |
error_message += traceback.format_exc()
|
38 |
return None, gr.update(value=error_message, visible=True) # החזרת Textbox גלוי עם שגיאה
|
39 |
|
40 |
+
title_str = "🤖 זיהוי וטשטוש נשים בתמונה"
|
41 |
+
description_str = """
|
42 |
+
<div style='text-align: center; direction: rtl'>
|
43 |
+
<p>
|
44 |
+
ברוכים הבאים לכלי לזיהוי וטשטוש נשים בתמונה!
|
45 |
+
<br>
|
46 |
+
העלו תמונה, הזינו את מפתח ה־API של Gemini,
|
47 |
+
ולחצו על "הרץ" כדי לנתח את התמונה ולטשטש אוטומטית נשים.
|
48 |
+
</p>
|
49 |
+
<p>
|
50 |
+
שימו לב: נדרש מפתח API תקין של Gemini כדי להשתמש בכלי זה.
|
51 |
+
<br>
|
52 |
+
הכלי משתמש בטכנולוגיות מתקדמות כמו YOLO, SAM2 ו-Gemini.
|
53 |
+
</p>
|
54 |
+
</div>
|
55 |
"""
|
56 |
|
57 |
+
# נתיבים לתמונות דוגמה
|
58 |
+
EXAMPLE_IMAGES = ["example_images/example.jpg", "example_images/example2.jpg", "example_images/example3.jpg"]
|
59 |
|
60 |
+
with gr.Blocks(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
title=title_str,
|
62 |
+
css="style.css" # קישור לקובץ CSS
|
63 |
+
) as demo:
|
64 |
+
gr.Markdown(f"<h1 style='text-align: center;'>{title_str}</h1>")
|
65 |
+
gr.Markdown(description_str)
|
|
|
|
|
|
|
66 |
|
67 |
+
with gr.Row():
|
68 |
+
with gr.Column():
|
69 |
+
image_input = gr.Image(type="pil", label="🖼️ בחרו תמונה לניתוח")
|
70 |
+
api_key_input = gr.Textbox(
|
71 |
+
label="🔑 מפתח API של Gemini",
|
72 |
+
placeholder="הכניסו את מפתח ה-API כאן",
|
73 |
+
type="password"
|
74 |
+
)
|
75 |
+
submit_button = gr.Button("🚀 הרץ", variant="primary")
|
76 |
+
gr.Examples(
|
77 |
+
examples=EXAMPLE_IMAGES,
|
78 |
+
inputs=image_input,
|
79 |
+
label="👇 דוגמאות",
|
80 |
+
# cache_examples=True # caching examples speeds up start time, but uses more memory
|
81 |
+
)
|
82 |
+
with gr.Column():
|
83 |
+
image_output = gr.Image(type="pil", label="🖼️ תוצאה לאחר טשטוש")
|
84 |
+
error_output = gr.Textbox(label="📜 שגיאות", visible=False, lines=5)
|
85 |
|
86 |
+
submit_button.click(
|
87 |
+
fn=inference,
|
88 |
+
inputs=[image_input, api_key_input],
|
89 |
+
outputs=[image_output, error_output]
|
90 |
+
)
|
91 |
|
92 |
+
if __name__ == "__main__":
|
93 |
+
demo.launch()
|
sam2.1/configs/sam2.1/sam2.1_hiera_b+.yaml
DELETED
@@ -1,116 +0,0 @@
|
|
1 |
-
# @package _global_
|
2 |
-
|
3 |
-
# Model
|
4 |
-
model:
|
5 |
-
_target_: sam2.modeling.sam2_base.SAM2Base
|
6 |
-
image_encoder:
|
7 |
-
_target_: sam2.modeling.backbones.image_encoder.ImageEncoder
|
8 |
-
scalp: 1
|
9 |
-
trunk:
|
10 |
-
_target_: sam2.modeling.backbones.hieradet.Hiera
|
11 |
-
embed_dim: 112
|
12 |
-
num_heads: 2
|
13 |
-
neck:
|
14 |
-
_target_: sam2.modeling.backbones.image_encoder.FpnNeck
|
15 |
-
position_encoding:
|
16 |
-
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
17 |
-
num_pos_feats: 256
|
18 |
-
normalize: true
|
19 |
-
scale: null
|
20 |
-
temperature: 10000
|
21 |
-
d_model: 256
|
22 |
-
backbone_channel_list: [896, 448, 224, 112]
|
23 |
-
fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
|
24 |
-
fpn_interp_model: nearest
|
25 |
-
|
26 |
-
memory_attention:
|
27 |
-
_target_: sam2.modeling.memory_attention.MemoryAttention
|
28 |
-
d_model: 256
|
29 |
-
pos_enc_at_input: true
|
30 |
-
layer:
|
31 |
-
_target_: sam2.modeling.memory_attention.MemoryAttentionLayer
|
32 |
-
activation: relu
|
33 |
-
dim_feedforward: 2048
|
34 |
-
dropout: 0.1
|
35 |
-
pos_enc_at_attn: false
|
36 |
-
self_attention:
|
37 |
-
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
38 |
-
rope_theta: 10000.0
|
39 |
-
feat_sizes: [64, 64]
|
40 |
-
embedding_dim: 256
|
41 |
-
num_heads: 1
|
42 |
-
downsample_rate: 1
|
43 |
-
dropout: 0.1
|
44 |
-
d_model: 256
|
45 |
-
pos_enc_at_cross_attn_keys: true
|
46 |
-
pos_enc_at_cross_attn_queries: false
|
47 |
-
cross_attention:
|
48 |
-
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
49 |
-
rope_theta: 10000.0
|
50 |
-
feat_sizes: [64, 64]
|
51 |
-
rope_k_repeat: True
|
52 |
-
embedding_dim: 256
|
53 |
-
num_heads: 1
|
54 |
-
downsample_rate: 1
|
55 |
-
dropout: 0.1
|
56 |
-
kv_in_dim: 64
|
57 |
-
num_layers: 4
|
58 |
-
|
59 |
-
memory_encoder:
|
60 |
-
_target_: sam2.modeling.memory_encoder.MemoryEncoder
|
61 |
-
out_dim: 64
|
62 |
-
position_encoding:
|
63 |
-
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
64 |
-
num_pos_feats: 64
|
65 |
-
normalize: true
|
66 |
-
scale: null
|
67 |
-
temperature: 10000
|
68 |
-
mask_downsampler:
|
69 |
-
_target_: sam2.modeling.memory_encoder.MaskDownSampler
|
70 |
-
kernel_size: 3
|
71 |
-
stride: 2
|
72 |
-
padding: 1
|
73 |
-
fuser:
|
74 |
-
_target_: sam2.modeling.memory_encoder.Fuser
|
75 |
-
layer:
|
76 |
-
_target_: sam2.modeling.memory_encoder.CXBlock
|
77 |
-
dim: 256
|
78 |
-
kernel_size: 7
|
79 |
-
padding: 3
|
80 |
-
layer_scale_init_value: 1e-6
|
81 |
-
use_dwconv: True # depth-wise convs
|
82 |
-
num_layers: 2
|
83 |
-
|
84 |
-
num_maskmem: 7
|
85 |
-
image_size: 1024
|
86 |
-
# apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
|
87 |
-
sigmoid_scale_for_mem_enc: 20.0
|
88 |
-
sigmoid_bias_for_mem_enc: -10.0
|
89 |
-
use_mask_input_as_output_without_sam: true
|
90 |
-
# Memory
|
91 |
-
directly_add_no_mem_embed: true
|
92 |
-
no_obj_embed_spatial: true
|
93 |
-
# use high-resolution feature map in the SAM mask decoder
|
94 |
-
use_high_res_features_in_sam: true
|
95 |
-
# output 3 masks on the first click on initial conditioning frames
|
96 |
-
multimask_output_in_sam: true
|
97 |
-
# SAM heads
|
98 |
-
iou_prediction_use_sigmoid: True
|
99 |
-
# cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
|
100 |
-
use_obj_ptrs_in_encoder: true
|
101 |
-
add_tpos_enc_to_obj_ptrs: true
|
102 |
-
proj_tpos_enc_in_obj_ptrs: true
|
103 |
-
use_signed_tpos_enc_to_obj_ptrs: true
|
104 |
-
only_obj_ptrs_in_the_past_for_eval: true
|
105 |
-
# object occlusion prediction
|
106 |
-
pred_obj_scores: true
|
107 |
-
pred_obj_scores_mlp: true
|
108 |
-
fixed_no_obj_ptr: true
|
109 |
-
# multimask tracking settings
|
110 |
-
multimask_output_for_tracking: true
|
111 |
-
use_multimask_token_for_obj_ptr: true
|
112 |
-
multimask_min_pt_num: 0
|
113 |
-
multimask_max_pt_num: 1
|
114 |
-
use_mlp_for_obj_ptr_proj: true
|
115 |
-
# Compilation flag
|
116 |
-
compile_image_encoder: False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sam2.1/configs/sam2.1/sam2.1_hiera_l.yaml
DELETED
@@ -1,120 +0,0 @@
|
|
1 |
-
# @package _global_
|
2 |
-
|
3 |
-
# Model
|
4 |
-
model:
|
5 |
-
_target_: sam2.modeling.sam2_base.SAM2Base
|
6 |
-
image_encoder:
|
7 |
-
_target_: sam2.modeling.backbones.image_encoder.ImageEncoder
|
8 |
-
scalp: 1
|
9 |
-
trunk:
|
10 |
-
_target_: sam2.modeling.backbones.hieradet.Hiera
|
11 |
-
embed_dim: 144
|
12 |
-
num_heads: 2
|
13 |
-
stages: [2, 6, 36, 4]
|
14 |
-
global_att_blocks: [23, 33, 43]
|
15 |
-
window_pos_embed_bkg_spatial_size: [7, 7]
|
16 |
-
window_spec: [8, 4, 16, 8]
|
17 |
-
neck:
|
18 |
-
_target_: sam2.modeling.backbones.image_encoder.FpnNeck
|
19 |
-
position_encoding:
|
20 |
-
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
21 |
-
num_pos_feats: 256
|
22 |
-
normalize: true
|
23 |
-
scale: null
|
24 |
-
temperature: 10000
|
25 |
-
d_model: 256
|
26 |
-
backbone_channel_list: [1152, 576, 288, 144]
|
27 |
-
fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
|
28 |
-
fpn_interp_model: nearest
|
29 |
-
|
30 |
-
memory_attention:
|
31 |
-
_target_: sam2.modeling.memory_attention.MemoryAttention
|
32 |
-
d_model: 256
|
33 |
-
pos_enc_at_input: true
|
34 |
-
layer:
|
35 |
-
_target_: sam2.modeling.memory_attention.MemoryAttentionLayer
|
36 |
-
activation: relu
|
37 |
-
dim_feedforward: 2048
|
38 |
-
dropout: 0.1
|
39 |
-
pos_enc_at_attn: false
|
40 |
-
self_attention:
|
41 |
-
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
42 |
-
rope_theta: 10000.0
|
43 |
-
feat_sizes: [64, 64]
|
44 |
-
embedding_dim: 256
|
45 |
-
num_heads: 1
|
46 |
-
downsample_rate: 1
|
47 |
-
dropout: 0.1
|
48 |
-
d_model: 256
|
49 |
-
pos_enc_at_cross_attn_keys: true
|
50 |
-
pos_enc_at_cross_attn_queries: false
|
51 |
-
cross_attention:
|
52 |
-
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
53 |
-
rope_theta: 10000.0
|
54 |
-
feat_sizes: [64, 64]
|
55 |
-
rope_k_repeat: True
|
56 |
-
embedding_dim: 256
|
57 |
-
num_heads: 1
|
58 |
-
downsample_rate: 1
|
59 |
-
dropout: 0.1
|
60 |
-
kv_in_dim: 64
|
61 |
-
num_layers: 4
|
62 |
-
|
63 |
-
memory_encoder:
|
64 |
-
_target_: sam2.modeling.memory_encoder.MemoryEncoder
|
65 |
-
out_dim: 64
|
66 |
-
position_encoding:
|
67 |
-
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
68 |
-
num_pos_feats: 64
|
69 |
-
normalize: true
|
70 |
-
scale: null
|
71 |
-
temperature: 10000
|
72 |
-
mask_downsampler:
|
73 |
-
_target_: sam2.modeling.memory_encoder.MaskDownSampler
|
74 |
-
kernel_size: 3
|
75 |
-
stride: 2
|
76 |
-
padding: 1
|
77 |
-
fuser:
|
78 |
-
_target_: sam2.modeling.memory_encoder.Fuser
|
79 |
-
layer:
|
80 |
-
_target_: sam2.modeling.memory_encoder.CXBlock
|
81 |
-
dim: 256
|
82 |
-
kernel_size: 7
|
83 |
-
padding: 3
|
84 |
-
layer_scale_init_value: 1e-6
|
85 |
-
use_dwconv: True # depth-wise convs
|
86 |
-
num_layers: 2
|
87 |
-
|
88 |
-
num_maskmem: 7
|
89 |
-
image_size: 1024
|
90 |
-
# apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
|
91 |
-
sigmoid_scale_for_mem_enc: 20.0
|
92 |
-
sigmoid_bias_for_mem_enc: -10.0
|
93 |
-
use_mask_input_as_output_without_sam: true
|
94 |
-
# Memory
|
95 |
-
directly_add_no_mem_embed: true
|
96 |
-
no_obj_embed_spatial: true
|
97 |
-
# use high-resolution feature map in the SAM mask decoder
|
98 |
-
use_high_res_features_in_sam: true
|
99 |
-
# output 3 masks on the first click on initial conditioning frames
|
100 |
-
multimask_output_in_sam: true
|
101 |
-
# SAM heads
|
102 |
-
iou_prediction_use_sigmoid: True
|
103 |
-
# cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
|
104 |
-
use_obj_ptrs_in_encoder: true
|
105 |
-
add_tpos_enc_to_obj_ptrs: true
|
106 |
-
proj_tpos_enc_in_obj_ptrs: true
|
107 |
-
use_signed_tpos_enc_to_obj_ptrs: true
|
108 |
-
only_obj_ptrs_in_the_past_for_eval: true
|
109 |
-
# object occlusion prediction
|
110 |
-
pred_obj_scores: true
|
111 |
-
pred_obj_scores_mlp: true
|
112 |
-
fixed_no_obj_ptr: true
|
113 |
-
# multimask tracking settings
|
114 |
-
multimask_output_for_tracking: true
|
115 |
-
use_multimask_token_for_obj_ptr: true
|
116 |
-
multimask_min_pt_num: 0
|
117 |
-
multimask_max_pt_num: 1
|
118 |
-
use_mlp_for_obj_ptr_proj: true
|
119 |
-
# Compilation flag
|
120 |
-
compile_image_encoder: False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sam2.1/configs/sam2.1/sam2.1_hiera_s.yaml
DELETED
@@ -1,119 +0,0 @@
|
|
1 |
-
# @package _global_
|
2 |
-
|
3 |
-
# Model
|
4 |
-
model:
|
5 |
-
_target_: sam2.modeling.sam2_base.SAM2Base
|
6 |
-
image_encoder:
|
7 |
-
_target_: sam2.modeling.backbones.image_encoder.ImageEncoder
|
8 |
-
scalp: 1
|
9 |
-
trunk:
|
10 |
-
_target_: sam2.modeling.backbones.hieradet.Hiera
|
11 |
-
embed_dim: 96
|
12 |
-
num_heads: 1
|
13 |
-
stages: [1, 2, 11, 2]
|
14 |
-
global_att_blocks: [7, 10, 13]
|
15 |
-
window_pos_embed_bkg_spatial_size: [7, 7]
|
16 |
-
neck:
|
17 |
-
_target_: sam2.modeling.backbones.image_encoder.FpnNeck
|
18 |
-
position_encoding:
|
19 |
-
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
20 |
-
num_pos_feats: 256
|
21 |
-
normalize: true
|
22 |
-
scale: null
|
23 |
-
temperature: 10000
|
24 |
-
d_model: 256
|
25 |
-
backbone_channel_list: [768, 384, 192, 96]
|
26 |
-
fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
|
27 |
-
fpn_interp_model: nearest
|
28 |
-
|
29 |
-
memory_attention:
|
30 |
-
_target_: sam2.modeling.memory_attention.MemoryAttention
|
31 |
-
d_model: 256
|
32 |
-
pos_enc_at_input: true
|
33 |
-
layer:
|
34 |
-
_target_: sam2.modeling.memory_attention.MemoryAttentionLayer
|
35 |
-
activation: relu
|
36 |
-
dim_feedforward: 2048
|
37 |
-
dropout: 0.1
|
38 |
-
pos_enc_at_attn: false
|
39 |
-
self_attention:
|
40 |
-
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
41 |
-
rope_theta: 10000.0
|
42 |
-
feat_sizes: [64, 64]
|
43 |
-
embedding_dim: 256
|
44 |
-
num_heads: 1
|
45 |
-
downsample_rate: 1
|
46 |
-
dropout: 0.1
|
47 |
-
d_model: 256
|
48 |
-
pos_enc_at_cross_attn_keys: true
|
49 |
-
pos_enc_at_cross_attn_queries: false
|
50 |
-
cross_attention:
|
51 |
-
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
52 |
-
rope_theta: 10000.0
|
53 |
-
feat_sizes: [64, 64]
|
54 |
-
rope_k_repeat: True
|
55 |
-
embedding_dim: 256
|
56 |
-
num_heads: 1
|
57 |
-
downsample_rate: 1
|
58 |
-
dropout: 0.1
|
59 |
-
kv_in_dim: 64
|
60 |
-
num_layers: 4
|
61 |
-
|
62 |
-
memory_encoder:
|
63 |
-
_target_: sam2.modeling.memory_encoder.MemoryEncoder
|
64 |
-
out_dim: 64
|
65 |
-
position_encoding:
|
66 |
-
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
67 |
-
num_pos_feats: 64
|
68 |
-
normalize: true
|
69 |
-
scale: null
|
70 |
-
temperature: 10000
|
71 |
-
mask_downsampler:
|
72 |
-
_target_: sam2.modeling.memory_encoder.MaskDownSampler
|
73 |
-
kernel_size: 3
|
74 |
-
stride: 2
|
75 |
-
padding: 1
|
76 |
-
fuser:
|
77 |
-
_target_: sam2.modeling.memory_encoder.Fuser
|
78 |
-
layer:
|
79 |
-
_target_: sam2.modeling.memory_encoder.CXBlock
|
80 |
-
dim: 256
|
81 |
-
kernel_size: 7
|
82 |
-
padding: 3
|
83 |
-
layer_scale_init_value: 1e-6
|
84 |
-
use_dwconv: True # depth-wise convs
|
85 |
-
num_layers: 2
|
86 |
-
|
87 |
-
num_maskmem: 7
|
88 |
-
image_size: 1024
|
89 |
-
# apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
|
90 |
-
sigmoid_scale_for_mem_enc: 20.0
|
91 |
-
sigmoid_bias_for_mem_enc: -10.0
|
92 |
-
use_mask_input_as_output_without_sam: true
|
93 |
-
# Memory
|
94 |
-
directly_add_no_mem_embed: true
|
95 |
-
no_obj_embed_spatial: true
|
96 |
-
# use high-resolution feature map in the SAM mask decoder
|
97 |
-
use_high_res_features_in_sam: true
|
98 |
-
# output 3 masks on the first click on initial conditioning frames
|
99 |
-
multimask_output_in_sam: true
|
100 |
-
# SAM heads
|
101 |
-
iou_prediction_use_sigmoid: True
|
102 |
-
# cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
|
103 |
-
use_obj_ptrs_in_encoder: true
|
104 |
-
add_tpos_enc_to_obj_ptrs: true
|
105 |
-
proj_tpos_enc_in_obj_ptrs: true
|
106 |
-
use_signed_tpos_enc_to_obj_ptrs: true
|
107 |
-
only_obj_ptrs_in_the_past_for_eval: true
|
108 |
-
# object occlusion prediction
|
109 |
-
pred_obj_scores: true
|
110 |
-
pred_obj_scores_mlp: true
|
111 |
-
fixed_no_obj_ptr: true
|
112 |
-
# multimask tracking settings
|
113 |
-
multimask_output_for_tracking: true
|
114 |
-
use_multimask_token_for_obj_ptr: true
|
115 |
-
multimask_min_pt_num: 0
|
116 |
-
multimask_max_pt_num: 1
|
117 |
-
use_mlp_for_obj_ptr_proj: true
|
118 |
-
# Compilation flag
|
119 |
-
compile_image_encoder: False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sam2.1/configs/sam2.1/sam2.1_hiera_t.yaml
DELETED
@@ -1,121 +0,0 @@
|
|
1 |
-
# @package _global_
|
2 |
-
|
3 |
-
# Model
|
4 |
-
model:
|
5 |
-
_target_: sam2.modeling.sam2_base.SAM2Base
|
6 |
-
image_encoder:
|
7 |
-
_target_: sam2.modeling.backbones.image_encoder.ImageEncoder
|
8 |
-
scalp: 1
|
9 |
-
trunk:
|
10 |
-
_target_: sam2.modeling.backbones.hieradet.Hiera
|
11 |
-
embed_dim: 96
|
12 |
-
num_heads: 1
|
13 |
-
stages: [1, 2, 7, 2]
|
14 |
-
global_att_blocks: [5, 7, 9]
|
15 |
-
window_pos_embed_bkg_spatial_size: [7, 7]
|
16 |
-
neck:
|
17 |
-
_target_: sam2.modeling.backbones.image_encoder.FpnNeck
|
18 |
-
position_encoding:
|
19 |
-
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
20 |
-
num_pos_feats: 256
|
21 |
-
normalize: true
|
22 |
-
scale: null
|
23 |
-
temperature: 10000
|
24 |
-
d_model: 256
|
25 |
-
backbone_channel_list: [768, 384, 192, 96]
|
26 |
-
fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
|
27 |
-
fpn_interp_model: nearest
|
28 |
-
|
29 |
-
memory_attention:
|
30 |
-
_target_: sam2.modeling.memory_attention.MemoryAttention
|
31 |
-
d_model: 256
|
32 |
-
pos_enc_at_input: true
|
33 |
-
layer:
|
34 |
-
_target_: sam2.modeling.memory_attention.MemoryAttentionLayer
|
35 |
-
activation: relu
|
36 |
-
dim_feedforward: 2048
|
37 |
-
dropout: 0.1
|
38 |
-
pos_enc_at_attn: false
|
39 |
-
self_attention:
|
40 |
-
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
41 |
-
rope_theta: 10000.0
|
42 |
-
feat_sizes: [64, 64]
|
43 |
-
embedding_dim: 256
|
44 |
-
num_heads: 1
|
45 |
-
downsample_rate: 1
|
46 |
-
dropout: 0.1
|
47 |
-
d_model: 256
|
48 |
-
pos_enc_at_cross_attn_keys: true
|
49 |
-
pos_enc_at_cross_attn_queries: false
|
50 |
-
cross_attention:
|
51 |
-
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
52 |
-
rope_theta: 10000.0
|
53 |
-
feat_sizes: [64, 64]
|
54 |
-
rope_k_repeat: True
|
55 |
-
embedding_dim: 256
|
56 |
-
num_heads: 1
|
57 |
-
downsample_rate: 1
|
58 |
-
dropout: 0.1
|
59 |
-
kv_in_dim: 64
|
60 |
-
num_layers: 4
|
61 |
-
|
62 |
-
memory_encoder:
|
63 |
-
_target_: sam2.modeling.memory_encoder.MemoryEncoder
|
64 |
-
out_dim: 64
|
65 |
-
position_encoding:
|
66 |
-
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
67 |
-
num_pos_feats: 64
|
68 |
-
normalize: true
|
69 |
-
scale: null
|
70 |
-
temperature: 10000
|
71 |
-
mask_downsampler:
|
72 |
-
_target_: sam2.modeling.memory_encoder.MaskDownSampler
|
73 |
-
kernel_size: 3
|
74 |
-
stride: 2
|
75 |
-
padding: 1
|
76 |
-
fuser:
|
77 |
-
_target_: sam2.modeling.memory_encoder.Fuser
|
78 |
-
layer:
|
79 |
-
_target_: sam2.modeling.memory_encoder.CXBlock
|
80 |
-
dim: 256
|
81 |
-
kernel_size: 7
|
82 |
-
padding: 3
|
83 |
-
layer_scale_init_value: 1e-6
|
84 |
-
use_dwconv: True # depth-wise convs
|
85 |
-
num_layers: 2
|
86 |
-
|
87 |
-
num_maskmem: 7
|
88 |
-
image_size: 1024
|
89 |
-
# apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
|
90 |
-
# SAM decoder
|
91 |
-
sigmoid_scale_for_mem_enc: 20.0
|
92 |
-
sigmoid_bias_for_mem_enc: -10.0
|
93 |
-
use_mask_input_as_output_without_sam: true
|
94 |
-
# Memory
|
95 |
-
directly_add_no_mem_embed: true
|
96 |
-
no_obj_embed_spatial: true
|
97 |
-
# use high-resolution feature map in the SAM mask decoder
|
98 |
-
use_high_res_features_in_sam: true
|
99 |
-
# output 3 masks on the first click on initial conditioning frames
|
100 |
-
multimask_output_in_sam: true
|
101 |
-
# SAM heads
|
102 |
-
iou_prediction_use_sigmoid: True
|
103 |
-
# cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
|
104 |
-
use_obj_ptrs_in_encoder: true
|
105 |
-
add_tpos_enc_to_obj_ptrs: true
|
106 |
-
proj_tpos_enc_in_obj_ptrs: true
|
107 |
-
use_signed_tpos_enc_to_obj_ptrs: true
|
108 |
-
only_obj_ptrs_in_the_past_for_eval: true
|
109 |
-
# object occlusion prediction
|
110 |
-
pred_obj_scores: true
|
111 |
-
pred_obj_scores_mlp: true
|
112 |
-
fixed_no_obj_ptr: true
|
113 |
-
# multimask tracking settings
|
114 |
-
multimask_output_for_tracking: true
|
115 |
-
use_multimask_token_for_obj_ptr: true
|
116 |
-
multimask_min_pt_num: 0
|
117 |
-
multimask_max_pt_num: 1
|
118 |
-
use_mlp_for_obj_ptr_proj: true
|
119 |
-
# Compilation flag
|
120 |
-
# HieraT does not currently support compilation, should always be set to False
|
121 |
-
compile_image_encoder: False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
style.css
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/* שינוי צבע כפתור "הרץ" */
|
2 |
+
.primary {
|
3 |
+
background-color: #4CAF50 !important; /* גוון ירוק */
|
4 |
+
color: white !important;
|
5 |
+
border-color: #4CAF50 !important;
|
6 |
+
}
|
7 |
+
|
8 |
+
/* עיצוב תיבות קלט ופלט */
|
9 |
+
.gr-image, .gr-textbox {
|
10 |
+
border: 2px solid #ccc;
|
11 |
+
border-radius: 8px;
|
12 |
+
padding: 10px;
|
13 |
+
}
|
14 |
+
|
15 |
+
/* עיצוב כותרות */
|
16 |
+
.gr-image label, .gr-textbox label {
|
17 |
+
font-weight: bold;
|
18 |
+
color: #333;
|
19 |
+
}
|
20 |
+
|
21 |
+
/* עיצוב גלריית הדוגמאות */
|
22 |
+
.sample-container {
|
23 |
+
border: 1px solid #ccc;
|
24 |
+
border-radius: 8px;
|
25 |
+
margin-bottom: 10px; /* מרווח בין גלריית הדוגמאות לכפתור */
|
26 |
+
}
|
27 |
+
|
28 |
+
.sample-container > .prose > h4 {
|
29 |
+
margin-bottom: 0px; /* ביטול מרווח מתחת לכותרת של גלריית הדוגמאות */
|
30 |
+
}
|
31 |
+
|
32 |
+
.gr-sample img {
|
33 |
+
border: 1px solid #ccc;
|
34 |
+
border-radius: 8px;
|
35 |
+
object-fit: cover; /* חיתוך תמונה לגודל קבוע */
|
36 |
+
}
|
37 |
+
|
38 |
+
.gr-sample img:hover {
|
39 |
+
border-color: #4CAF50;
|
40 |
+
cursor: pointer;
|
41 |
+
}
|