File size: 3,861 Bytes
092aa0d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# Test-split dataset presets, keyed by a short alias. Each entry names the
# annotation file, the image directory, the prompt-template file and the
# dataset `type` string (presumably resolved through a dataset registry --
# confirm against the loader).
# NOTE(review): the 'reg' entry points at train_OCR.jsonl, the same file used
# by the training preset -- confirm this is intentional.
DEFAULT_TEST_DATASET = {
    'flickr': {
        'filename': './reactiondata/real_test.jsonl',
        'image_folder': './reaction_image',
        'template_file': './config/_base_/dataset/template/reaction.json',
        'type': 'FlickrDataset',
    },
    'reg': {
        'filename': './reactiondata/train_OCR.jsonl',
        'image_folder': './reaction_image_OCR',
        'template_file': './config/_base_/dataset/template/OCR.json',
        'type': 'REGDataset',
    },
}
# Train-split dataset presets, keyed by a short alias. Each entry names the
# annotation file, the image directory, the prompt-template file and the
# dataset `type` string (presumably resolved through a dataset registry --
# confirm against the loader).
DEFAULT_TRAIN_DATASET = {
    'flickr': {
        'filename': './reactiondata/reaction_real_structed.jsonl',
        'image_folder': './reaction_image',
        'template_file': './config/_base_/dataset/template/reaction.json',
        'type': 'FlickrDataset',
    },
    'reg': {
        'filename': './reactiondata/train_OCR.jsonl',
        'image_folder': './reaction_image_OCR',
        'template_file': './config/_base_/dataset/template/OCR.json',
        'type': 'REGDataset',
    },
}
# Data pipeline settings: collator and generation kwargs plus the
# train / validation / test dataset definitions.
data_args = {
    'collator_kwargs': {'max_length': 1024, 'padding': True},
    'compute_metric': None,
    'gen_kwargs': {'max_new_tokens': 1024, 'num_beams': 1},
    # No test split is configured.
    'test': None,
    'train': {
        # Two source datasets; `probabilities` below has one entry per cfg,
        # in the same order.
        'cfgs': [
            {
                'filename': './reactiondata/train_OCR.jsonl',
                'image_folder': './reaction_image_OCR',
                'template_file': './config/_base_/dataset/template/OCR.json',
                'type': 'REGDataset',
            },
            {
                'filename': './reactiondata/reaction_real_structed.jsonl',
                'image_folder': './reaction_image',
                'template_file': './config/_base_/dataset/template/reaction.json',
                'type': 'FlickrDataset',
            },
        ],
        # The OCR cfg is paired with 0.0 and the reaction cfg with 1 --
        # presumably sampling weights, which would mean the OCR data is never
        # drawn; verify against the interleave implementation.
        'probabilities': [0.0, 1],
        'seed': None,
        'stopping_strategy': 'first_exhausted',
        # NOTE(review): 'InterleaveDateset' is spelled this way on purpose
        # here -- it is a runtime lookup string and must match whatever name
        # the project registers; do not "fix" it without checking.
        'type': 'InterleaveDateset',
    },
    'validation': {
        'cfgs': [
            {
                'filename': './reactiondata/real_test.jsonl',
                'image_folder': './reaction_image',
                'template_file': './config/_base_/dataset/template/reaction.json',
                'type': 'FlickrDataset',
            },
        ],
        'type': 'ConcatDatasetWithShuffle',
    },
}
# Model construction settings for the 'shikra' model type: checkpoint path,
# conversation template, vision tower, and the per-modality processors.
model_args = {
    'cache_dir': None,
    'conv_args': {
        'conv_template': 'vicuna_v1.1',
        'tokenize_kwargs': {'truncation_size': 2048},
    },
    # Neither the backbone nor the multimodal MLP adapter is frozen.
    'freeze_backbone': False,
    'freeze_mm_mlp_adapter': False,
    'gen_kwargs_set_bos_token_id': True,
    'gen_kwargs_set_eos_token_id': True,
    'gen_kwargs_set_pad_token_id': True,
    'image_token_len': 300,
    'mm_use_im_start_end': True,
    'mm_vision_select_layer': -2,
    # Same numeric value as conv_args.tokenize_kwargs.truncation_size above.
    'model_max_length': 2048,
    # Presumably the checkpoint this run starts from -- confirm; it differs
    # from the output_dir used by the training settings.
    'model_name_or_path': './exp/reaction_4.2.1',
    'pretrain_mm_mlp_adapter': None,
    # One processor spec per stage; each `type` is a lookup string.
    'process_func_args': {
        'conv': {'type': 'ShikraConvProcess'},
        'image': {'type': 'ShikraImageProcessor'},
        'target': {'type': 'BoxFormatProcess'},
        'text': {'type': 'ShikraTextProcess'},
    },
    'sep_image_conv_front': False,
    'target_processor': {'boxes': {'type': 'PlainBoxFormatter'}},
    'tune_mm_mlp_adapter': False,
    'type': 'shikra',
    'version': 'v1',
    'vision_tower': 'SenseTime/deformable-detr',
}
# Training-run settings. The field names look like Hugging Face
# (Seq2Seq)TrainingArguments keywords -- presumably forwarded there; confirm
# against the code that consumes this config.
training_args = {
    # Precision flags.
    'bf16': True,
    'dataloader_num_workers': 4,
    # Train only: evaluation and prediction are both switched off.
    'do_eval': False,
    'do_predict': False,
    'do_train': True,
    'evaluation_strategy': 'no',
    # Sharding settings.
    'fsdp': 'full_shard auto_wrap',
    'fsdp_transformer_layer_cls_to_wrap': 'LlamaDecoderLayer',
    'gradient_accumulation_steps': 1,
    'gradient_checkpointing': True,
    # Optimisation schedule.
    'learning_rate': 2e-05,
    'logging_steps': 10,
    'lr_scheduler_type': 'cosine',
    'num_train_epochs': 50,
    'output_dir': './exp/reaction_4.2.2-large',
    'overwrite_output_dir': False,
    'per_device_eval_batch_size': 4,
    'per_device_train_batch_size': 4,
    'predict_with_generate': True,
    'remove_unused_columns': False,
    'report_to': 'none',
    # Checkpointing: step-based, every 10000 steps, limit of 1.
    'save_steps': 10000,
    'save_strategy': 'steps',
    'save_total_limit': 1,
    'seed': 42,
    'tf32': True,
    'warmup_ratio': 0.03,
    'weight_decay': 0.05,
}