Spaces:

NHLOCAL
/

Kav-Venaki

Running

App Files Community

NHLOCAL committed 30 days ago

Commit

0bce39f

1 Parent(s): f539f23

עדכון מבנה

Browse files

Files changed (12) hide show

__pycache__/backend.cpython-310.pyc +0 -0
app.py +3 -3
backend.py +18 -55
cache/models--facebook--sam2.1-hiera-tiny/.no_exist/36f406a75c9be63c7f429da63246273f028c6fd4/pytorch_model.bin +0 -0
cache/models--facebook--sam2.1-hiera-tiny/refs/main +1 -0
checkpoints/{sam2_hiera_small.pt → sam2.1_hiera_tiny.pt} +2 -2
configs/sam2.1_hiera_t.yaml +121 -0
requirements.txt +2 -1
sam2.1/configs/sam2.1/sam2.1_hiera_b+.yaml +116 -0
sam2.1/configs/sam2.1/sam2.1_hiera_l.yaml +120 -0
configs/sam2_hiera_s.yaml → sam2.1/configs/sam2.1/sam2.1_hiera_s.yaml +4 -1
sam2.1/configs/sam2.1/sam2.1_hiera_t.yaml +121 -0

__pycache__/backend.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/backend.cpython-310.pyc and b/__pycache__/backend.cpython-310.pyc differ

app.py CHANGED Viewed

@@ -52,12 +52,12 @@ demo = gr.Interface(
     examples=[
         [EXAMPLE_IMAGE]  # תמונה בלבד, ללא מפתח API
     ],
-    allow_flagging="never",
-    theme="compact"  # עיצוב קליל לממשק
 )
 if __name__ == "__main__":
     # ניתן להגדיר share=True אם רוצים לשתף מחוץ לרשת המקומית
-    demo.launch()

     examples=[
         [EXAMPLE_IMAGE]  # תמונה בלבד, ללא מפתח API
     ],
+    flagging_mode="never",
+    theme=gr.themes.Default()  # עיצוב קליל לממשק
 )
 if __name__ == "__main__":
     # ניתן להגדיר share=True אם רוצים לשתף מחוץ לרשת המקומית
+    demo.launch(debug=True)

backend.py CHANGED Viewed

@@ -7,6 +7,9 @@ import numpy as np
 import cv2
 from PIL import Image, ImageFilter
 from scipy.ndimage import binary_dilation
 # -----------------------------
 # 1) הגדרת המפתח API של Gemini כפרמטר
@@ -140,7 +143,7 @@ def send_and_receive(api_key: str) -> str:
 # 3) טעינת מודל YOLO
 # -----------------------------
 from ultralytics import YOLO
-YOLO_MODEL_PATH =  '../../models/yolo11m.pt'
 try:
     yolo_model = YOLO(YOLO_MODEL_PATH)
@@ -156,63 +159,22 @@ CONF_THRESHOLD = 0.2
 # -----------------------------
 # 4) הכנה ל-SAM2
 # -----------------------------
-from typing import Any
-import supervision as sv
-from sam2.build_sam import build_sam2
-from sam2.sam2_image_predictor import SAM2ImagePredictor
-SAM2_CHECKPOINT = "checkpoints/sam2_hiera_small.pt"
-SAM2_CONFIG = "configs/sam2_hiera_s.yaml"
-def load_sam_image_model(
-    device: torch.device,
-    config: str = SAM2_CONFIG,
-    checkpoint: str = SAM2_CHECKPOINT
-) -> SAM2ImagePredictor:
-    try:
-        # יצירת התיקיות אם הן לא קיימות
-        os.makedirs(os.path.dirname(checkpoint), exist_ok=True)
-        os.makedirs(os.path.dirname(config), exist_ok=True)
-        # הורדת קובץ ה-checkpoint אם אינו קיים
-        if not os.path.exists(checkpoint):
-            print("[SAM2] מודל SAM2 לא נמצא. מנסה להוריד את המודל...")
-            sam2_url = "https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2_hiera_small.pt"
-            response = requests.get(sam2_url)
-            if response.status_code == 200:
-                with open(checkpoint, 'wb') as f:
-                    f.write(response.content)
-                print("[SAM2] מודל SAM2 הורד בהצלחה.")
-            else:
-                raise FileNotFoundError(f"לא הצליח להוריד את מודל SAM2 מהכתובת: {sam2_url}")
-        # הורדת קובץ הקונפיג אם אינו קיים
-        if not os.path.exists(config):
-            print("[SAM2] קובץ הקונפיג SAM2 לא נמצא. מנסה להוריד את הקונפיג...")
-            sam2_config_url = "https://raw.githubusercontent.com/facebookresearch/sam2/refs/heads/main/sam2/configs/sam2/sam2_hiera_s.yaml"  # עדכן לכתובת הנכונה
-            response = requests.get(sam2_config_url)
-            if response.status_code == 200:
-                with open(config, 'wb') as f:
-                    f.write(response.content)
-                print("[SAM2] קובץ הקונפיג SAM2 הורד בהצלחה.")
-            else:
-                raise FileNotFoundError(f"לא הצליח להוריד את קובץ הקונפיג SAM2 מהכתובת: {sam2_config_url}")
-        # בניית המודל
-        print("[SAM2] מנסה לבנות את המודל SAM2...")
-        model = build_sam2(config, checkpoint, device=device)
-        print("[SAM2] המודל SAM2 נבנה בהצלחה.")
-        return SAM2ImagePredictor(sam_model=model)
-    except Exception as e:
-        print(f"[SAM2] שגיאה בטעינת המודל SAM2: {e}")
-        raise e  # העלאת השגיאה למעלה כדי שתוכל להיתפס ולהדפיס בהמשך
 try:
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    sam2_predictor = load_sam_image_model(device=device)
     sam2_predictor.model.to(device)
 except Exception as e:
-    print(f"[SAM2] לא מצליח לטעון את SAM2: {e}")
     sam2_predictor = None
 # -----------------------------
@@ -257,6 +219,7 @@ def blur_regions_with_mask(
     return Image.fromarray(combined)
 # -----------------------------
 # 6) הפונקציה המרכזית
 # -----------------------------

 import cv2
 from PIL import Image, ImageFilter
 from scipy.ndimage import binary_dilation
+import hydra
+from hydra import initialize, compose
+from omegaconf import OmegaConf
 # -----------------------------
 # 1) הגדרת המפתח API של Gemini כפרמטר
 # 3) טעינת מודל YOLO
 # -----------------------------
 from ultralytics import YOLO
+YOLO_MODEL_PATH = '../../models/yolo11m.pt'
 try:
     yolo_model = YOLO(YOLO_MODEL_PATH)
 # -----------------------------
 # 4) הכנה ל-SAM2
 # -----------------------------
 try:
+    from hydra import initialize
+    from sam2.sam2_image_predictor import SAM2ImagePredictor
+    SAM2_CONFIG_PATH = "sam2.1/"
+    SAM2_MODEL_NAME = "facebook/sam2.1-hiera-tiny"
+    sam2_predictor = None
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    with initialize(config_path=SAM2_CONFIG_PATH):
+        sam2_predictor = SAM2ImagePredictor.from_pretrained(SAM2_MODEL_NAME)
     sam2_predictor.model.to(device)
 except Exception as e:
+    print("[SAM2] לא מצליח לטעון את SAM2. ודא שהנתיב והקונפיג נכונים.")
     sam2_predictor = None
 # -----------------------------
     return Image.fromarray(combined)
 # -----------------------------
 # 6) הפונקציה המרכזית
 # -----------------------------

cache/models--facebook--sam2.1-hiera-tiny/.no_exist/36f406a75c9be63c7f429da63246273f028c6fd4/pytorch_model.bin ADDED Viewed

File without changes

cache/models--facebook--sam2.1-hiera-tiny/refs/main ADDED Viewed

	@@ -0,0 +1 @@


1	+ 36f406a75c9be63c7f429da63246273f028c6fd4

checkpoints/{sam2_hiera_small.pt → sam2.1_hiera_tiny.pt} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7ec09b256af142490dd9f363799c90f0bbb854e19142070fbe045eb1e7673ed6
-size 243

 version https://git-lfs.github.com/spec/v1
+oid sha256:7402e0d864fa82708a20fbd15bc84245c2f26dff0eb43a4b5b93452deb34be69
+size 156008466

configs/sam2.1_hiera_t.yaml ADDED Viewed

	@@ -0,0 +1,121 @@

+# @package _global_
+# Model
+model:
+  _target_: sam2.modeling.sam2_base.SAM2Base
+  image_encoder:
+    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+    scalp: 1
+    trunk:
+      _target_: sam2.modeling.backbones.hieradet.Hiera
+      embed_dim: 96
+      num_heads: 1
+      stages: [1, 2, 7, 2]
+      global_att_blocks: [5, 7, 9]
+      window_pos_embed_bkg_spatial_size: [7, 7]
+    neck:
+      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+      position_encoding:
+        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+        num_pos_feats: 256
+        normalize: true
+        scale: null
+        temperature: 10000
+      d_model: 256
+      backbone_channel_list: [768, 384, 192, 96]
+      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
+      fpn_interp_model: nearest
+  memory_attention:
+    _target_: sam2.modeling.memory_attention.MemoryAttention
+    d_model: 256
+    pos_enc_at_input: true
+    layer:
+      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+      activation: relu
+      dim_feedforward: 2048
+      dropout: 0.1
+      pos_enc_at_attn: false
+      self_attention:
+        _target_: sam2.modeling.sam.transformer.RoPEAttention
+        rope_theta: 10000.0
+        feat_sizes: [64, 64]
+        embedding_dim: 256
+        num_heads: 1
+        downsample_rate: 1
+        dropout: 0.1
+      d_model: 256
+      pos_enc_at_cross_attn_keys: true
+      pos_enc_at_cross_attn_queries: false
+      cross_attention:
+        _target_: sam2.modeling.sam.transformer.RoPEAttention
+        rope_theta: 10000.0
+        feat_sizes: [64, 64]
+        rope_k_repeat: True
+        embedding_dim: 256
+        num_heads: 1
+        downsample_rate: 1
+        dropout: 0.1
+        kv_in_dim: 64
+    num_layers: 4
+  memory_encoder:
+      _target_: sam2.modeling.memory_encoder.MemoryEncoder
+      out_dim: 64
+      position_encoding:
+        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+        num_pos_feats: 64
+        normalize: true
+        scale: null
+        temperature: 10000
+      mask_downsampler:
+        _target_: sam2.modeling.memory_encoder.MaskDownSampler
+        kernel_size: 3
+        stride: 2
+        padding: 1
+      fuser:
+        _target_: sam2.modeling.memory_encoder.Fuser
+        layer:
+          _target_: sam2.modeling.memory_encoder.CXBlock
+          dim: 256
+          kernel_size: 7
+          padding: 3
+          layer_scale_init_value: 1e-6
+          use_dwconv: True  # depth-wise convs
+        num_layers: 2
+  num_maskmem: 7
+  image_size: 1024
+  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+  # SAM decoder
+  sigmoid_scale_for_mem_enc: 20.0
+  sigmoid_bias_for_mem_enc: -10.0
+  use_mask_input_as_output_without_sam: true
+  # Memory
+  directly_add_no_mem_embed: true
+  no_obj_embed_spatial: true
+  # use high-resolution feature map in the SAM mask decoder
+  use_high_res_features_in_sam: true
+  # output 3 masks on the first click on initial conditioning frames
+  multimask_output_in_sam: true
+  # SAM heads
+  iou_prediction_use_sigmoid: True
+  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+  use_obj_ptrs_in_encoder: true
+  add_tpos_enc_to_obj_ptrs: true
+  proj_tpos_enc_in_obj_ptrs: true
+  use_signed_tpos_enc_to_obj_ptrs: true
+  only_obj_ptrs_in_the_past_for_eval: true
+  # object occlusion prediction
+  pred_obj_scores: true
+  pred_obj_scores_mlp: true
+  fixed_no_obj_ptr: true
+  # multimask tracking settings
+  multimask_output_for_tracking: true
+  use_multimask_token_for_obj_ptr: true
+  multimask_min_pt_num: 0
+  multimask_max_pt_num: 1
+  use_mlp_for_obj_ptr_proj: true
+  # Compilation flag
+  # HieraT does not currently support compilation, should always be set to False
+  compile_image_encoder: False

requirements.txt CHANGED Viewed

@@ -9,4 +9,5 @@ scipy
 hydra-core
 torchvision
 supervision
-git+https://github.com/facebookresearch/sam2.git

 hydra-core
 torchvision
 supervision
+git+https://github.com/facebookresearch/sam2.git
+omegaconf

sam2.1/configs/sam2.1/sam2.1_hiera_b+.yaml ADDED Viewed

	@@ -0,0 +1,116 @@

+# @package _global_
+# Model
+model:
+  _target_: sam2.modeling.sam2_base.SAM2Base
+  image_encoder:
+    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+    scalp: 1
+    trunk:
+      _target_: sam2.modeling.backbones.hieradet.Hiera
+      embed_dim: 112
+      num_heads: 2
+    neck:
+      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+      position_encoding:
+        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+        num_pos_feats: 256
+        normalize: true
+        scale: null
+        temperature: 10000
+      d_model: 256
+      backbone_channel_list: [896, 448, 224, 112]
+      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
+      fpn_interp_model: nearest
+  memory_attention:
+    _target_: sam2.modeling.memory_attention.MemoryAttention
+    d_model: 256
+    pos_enc_at_input: true
+    layer:
+      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+      activation: relu
+      dim_feedforward: 2048
+      dropout: 0.1
+      pos_enc_at_attn: false
+      self_attention:
+        _target_: sam2.modeling.sam.transformer.RoPEAttention
+        rope_theta: 10000.0
+        feat_sizes: [64, 64]
+        embedding_dim: 256
+        num_heads: 1
+        downsample_rate: 1
+        dropout: 0.1
+      d_model: 256
+      pos_enc_at_cross_attn_keys: true
+      pos_enc_at_cross_attn_queries: false
+      cross_attention:
+        _target_: sam2.modeling.sam.transformer.RoPEAttention
+        rope_theta: 10000.0
+        feat_sizes: [64, 64]
+        rope_k_repeat: True
+        embedding_dim: 256
+        num_heads: 1
+        downsample_rate: 1
+        dropout: 0.1
+        kv_in_dim: 64
+    num_layers: 4
+  memory_encoder:
+      _target_: sam2.modeling.memory_encoder.MemoryEncoder
+      out_dim: 64
+      position_encoding:
+        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+        num_pos_feats: 64
+        normalize: true
+        scale: null
+        temperature: 10000
+      mask_downsampler:
+        _target_: sam2.modeling.memory_encoder.MaskDownSampler
+        kernel_size: 3
+        stride: 2
+        padding: 1
+      fuser:
+        _target_: sam2.modeling.memory_encoder.Fuser
+        layer:
+          _target_: sam2.modeling.memory_encoder.CXBlock
+          dim: 256
+          kernel_size: 7
+          padding: 3
+          layer_scale_init_value: 1e-6
+          use_dwconv: True  # depth-wise convs
+        num_layers: 2
+  num_maskmem: 7
+  image_size: 1024
+  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+  sigmoid_scale_for_mem_enc: 20.0
+  sigmoid_bias_for_mem_enc: -10.0
+  use_mask_input_as_output_without_sam: true
+  # Memory
+  directly_add_no_mem_embed: true
+  no_obj_embed_spatial: true
+  # use high-resolution feature map in the SAM mask decoder
+  use_high_res_features_in_sam: true
+  # output 3 masks on the first click on initial conditioning frames
+  multimask_output_in_sam: true
+  # SAM heads
+  iou_prediction_use_sigmoid: True
+  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+  use_obj_ptrs_in_encoder: true
+  add_tpos_enc_to_obj_ptrs: true
+  proj_tpos_enc_in_obj_ptrs: true
+  use_signed_tpos_enc_to_obj_ptrs: true
+  only_obj_ptrs_in_the_past_for_eval: true
+  # object occlusion prediction
+  pred_obj_scores: true
+  pred_obj_scores_mlp: true
+  fixed_no_obj_ptr: true
+  # multimask tracking settings
+  multimask_output_for_tracking: true
+  use_multimask_token_for_obj_ptr: true
+  multimask_min_pt_num: 0
+  multimask_max_pt_num: 1
+  use_mlp_for_obj_ptr_proj: true
+  # Compilation flag
+  compile_image_encoder: False

sam2.1/configs/sam2.1/sam2.1_hiera_l.yaml ADDED Viewed

	@@ -0,0 +1,120 @@

+# @package _global_
+# Model
+model:
+  _target_: sam2.modeling.sam2_base.SAM2Base
+  image_encoder:
+    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+    scalp: 1
+    trunk:
+      _target_: sam2.modeling.backbones.hieradet.Hiera
+      embed_dim: 144
+      num_heads: 2
+      stages: [2, 6, 36, 4]
+      global_att_blocks: [23, 33, 43]
+      window_pos_embed_bkg_spatial_size: [7, 7]
+      window_spec: [8, 4, 16, 8]
+    neck:
+      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+      position_encoding:
+        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+        num_pos_feats: 256
+        normalize: true
+        scale: null
+        temperature: 10000
+      d_model: 256
+      backbone_channel_list: [1152, 576, 288, 144]
+      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
+      fpn_interp_model: nearest
+  memory_attention:
+    _target_: sam2.modeling.memory_attention.MemoryAttention
+    d_model: 256
+    pos_enc_at_input: true
+    layer:
+      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+      activation: relu
+      dim_feedforward: 2048
+      dropout: 0.1
+      pos_enc_at_attn: false
+      self_attention:
+        _target_: sam2.modeling.sam.transformer.RoPEAttention
+        rope_theta: 10000.0
+        feat_sizes: [64, 64]
+        embedding_dim: 256
+        num_heads: 1
+        downsample_rate: 1
+        dropout: 0.1
+      d_model: 256
+      pos_enc_at_cross_attn_keys: true
+      pos_enc_at_cross_attn_queries: false
+      cross_attention:
+        _target_: sam2.modeling.sam.transformer.RoPEAttention
+        rope_theta: 10000.0
+        feat_sizes: [64, 64]
+        rope_k_repeat: True
+        embedding_dim: 256
+        num_heads: 1
+        downsample_rate: 1
+        dropout: 0.1
+        kv_in_dim: 64
+    num_layers: 4
+  memory_encoder:
+      _target_: sam2.modeling.memory_encoder.MemoryEncoder
+      out_dim: 64
+      position_encoding:
+        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+        num_pos_feats: 64
+        normalize: true
+        scale: null
+        temperature: 10000
+      mask_downsampler:
+        _target_: sam2.modeling.memory_encoder.MaskDownSampler
+        kernel_size: 3
+        stride: 2
+        padding: 1
+      fuser:
+        _target_: sam2.modeling.memory_encoder.Fuser
+        layer:
+          _target_: sam2.modeling.memory_encoder.CXBlock
+          dim: 256
+          kernel_size: 7
+          padding: 3
+          layer_scale_init_value: 1e-6
+          use_dwconv: True  # depth-wise convs
+        num_layers: 2
+  num_maskmem: 7
+  image_size: 1024
+  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+  sigmoid_scale_for_mem_enc: 20.0
+  sigmoid_bias_for_mem_enc: -10.0
+  use_mask_input_as_output_without_sam: true
+  # Memory
+  directly_add_no_mem_embed: true
+  no_obj_embed_spatial: true
+  # use high-resolution feature map in the SAM mask decoder
+  use_high_res_features_in_sam: true
+  # output 3 masks on the first click on initial conditioning frames
+  multimask_output_in_sam: true
+  # SAM heads
+  iou_prediction_use_sigmoid: True
+  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+  use_obj_ptrs_in_encoder: true
+  add_tpos_enc_to_obj_ptrs: true
+  proj_tpos_enc_in_obj_ptrs: true
+  use_signed_tpos_enc_to_obj_ptrs: true
+  only_obj_ptrs_in_the_past_for_eval: true
+  # object occlusion prediction
+  pred_obj_scores: true
+  pred_obj_scores_mlp: true
+  fixed_no_obj_ptr: true
+  # multimask tracking settings
+  multimask_output_for_tracking: true
+  use_multimask_token_for_obj_ptr: true
+  multimask_min_pt_num: 0
+  multimask_max_pt_num: 1
+  use_mlp_for_obj_ptr_proj: true
+  # Compilation flag
+  compile_image_encoder: False

configs/sam2_hiera_s.yaml → sam2.1/configs/sam2.1/sam2.1_hiera_s.yaml RENAMED Viewed

@@ -92,6 +92,7 @@ model:
   use_mask_input_as_output_without_sam: true
   # Memory
   directly_add_no_mem_embed: true
   # use high-resolution feature map in the SAM mask decoder
   use_high_res_features_in_sam: true
   # output 3 masks on the first click on initial conditioning frames
@@ -100,7 +101,9 @@ model:
   iou_prediction_use_sigmoid: True
   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
   use_obj_ptrs_in_encoder: true
-  add_tpos_enc_to_obj_ptrs: false
   only_obj_ptrs_in_the_past_for_eval: true
   # object occlusion prediction
   pred_obj_scores: true

   use_mask_input_as_output_without_sam: true
   # Memory
   directly_add_no_mem_embed: true
+  no_obj_embed_spatial: true
   # use high-resolution feature map in the SAM mask decoder
   use_high_res_features_in_sam: true
   # output 3 masks on the first click on initial conditioning frames
   iou_prediction_use_sigmoid: True
   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
   use_obj_ptrs_in_encoder: true
+  add_tpos_enc_to_obj_ptrs: true
+  proj_tpos_enc_in_obj_ptrs: true
+  use_signed_tpos_enc_to_obj_ptrs: true
   only_obj_ptrs_in_the_past_for_eval: true
   # object occlusion prediction
   pred_obj_scores: true

sam2.1/configs/sam2.1/sam2.1_hiera_t.yaml ADDED Viewed

	@@ -0,0 +1,121 @@

+# @package _global_
+# Model
+model:
+  _target_: sam2.modeling.sam2_base.SAM2Base
+  image_encoder:
+    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+    scalp: 1
+    trunk:
+      _target_: sam2.modeling.backbones.hieradet.Hiera
+      embed_dim: 96
+      num_heads: 1
+      stages: [1, 2, 7, 2]
+      global_att_blocks: [5, 7, 9]
+      window_pos_embed_bkg_spatial_size: [7, 7]
+    neck:
+      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+      position_encoding:
+        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+        num_pos_feats: 256
+        normalize: true
+        scale: null
+        temperature: 10000
+      d_model: 256
+      backbone_channel_list: [768, 384, 192, 96]
+      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
+      fpn_interp_model: nearest
+  memory_attention:
+    _target_: sam2.modeling.memory_attention.MemoryAttention
+    d_model: 256
+    pos_enc_at_input: true
+    layer:
+      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+      activation: relu
+      dim_feedforward: 2048
+      dropout: 0.1
+      pos_enc_at_attn: false
+      self_attention:
+        _target_: sam2.modeling.sam.transformer.RoPEAttention
+        rope_theta: 10000.0
+        feat_sizes: [64, 64]
+        embedding_dim: 256
+        num_heads: 1
+        downsample_rate: 1
+        dropout: 0.1
+      d_model: 256
+      pos_enc_at_cross_attn_keys: true
+      pos_enc_at_cross_attn_queries: false
+      cross_attention:
+        _target_: sam2.modeling.sam.transformer.RoPEAttention
+        rope_theta: 10000.0
+        feat_sizes: [64, 64]
+        rope_k_repeat: True
+        embedding_dim: 256
+        num_heads: 1
+        downsample_rate: 1
+        dropout: 0.1
+        kv_in_dim: 64
+    num_layers: 4
+  memory_encoder:
+      _target_: sam2.modeling.memory_encoder.MemoryEncoder
+      out_dim: 64
+      position_encoding:
+        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+        num_pos_feats: 64
+        normalize: true
+        scale: null
+        temperature: 10000
+      mask_downsampler:
+        _target_: sam2.modeling.memory_encoder.MaskDownSampler
+        kernel_size: 3
+        stride: 2
+        padding: 1
+      fuser:
+        _target_: sam2.modeling.memory_encoder.Fuser
+        layer:
+          _target_: sam2.modeling.memory_encoder.CXBlock
+          dim: 256
+          kernel_size: 7
+          padding: 3
+          layer_scale_init_value: 1e-6
+          use_dwconv: True  # depth-wise convs
+        num_layers: 2
+  num_maskmem: 7
+  image_size: 1024
+  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+  # SAM decoder
+  sigmoid_scale_for_mem_enc: 20.0
+  sigmoid_bias_for_mem_enc: -10.0
+  use_mask_input_as_output_without_sam: true
+  # Memory
+  directly_add_no_mem_embed: true
+  no_obj_embed_spatial: true
+  # use high-resolution feature map in the SAM mask decoder
+  use_high_res_features_in_sam: true
+  # output 3 masks on the first click on initial conditioning frames
+  multimask_output_in_sam: true
+  # SAM heads
+  iou_prediction_use_sigmoid: True
+  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+  use_obj_ptrs_in_encoder: true
+  add_tpos_enc_to_obj_ptrs: true
+  proj_tpos_enc_in_obj_ptrs: true
+  use_signed_tpos_enc_to_obj_ptrs: true
+  only_obj_ptrs_in_the_past_for_eval: true
+  # object occlusion prediction
+  pred_obj_scores: true
+  pred_obj_scores_mlp: true
+  fixed_no_obj_ptr: true
+  # multimask tracking settings
+  multimask_output_for_tracking: true
+  use_multimask_token_for_obj_ptr: true
+  multimask_min_pt_num: 0
+  multimask_max_pt_num: 1
+  use_mlp_for_obj_ptr_proj: true
+  # Compilation flag
+  # HieraT does not currently support compilation, should always be set to False
+  compile_image_encoder: False