# mmpretrain config: MAE self-supervised pretraining with a LLaMA-style
# ViT-B/16 backbone on ImageNet-1k, AMP + cosine LR schedule, 1600 epochs.
# (Batch layout presumably 8 GPUs x 512 images/GPU per the work_dir name.)

# Reference total batch size used when the LR is auto-rescaled.
auto_scale_lr = {'base_batch_size': 4096}

# Per-batch on-device normalisation (ImageNet mean/std, BGR -> RGB).
data_preprocessor = {
    'mean': [123.675, 116.28, 103.53],
    'non_blocking': True,
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
    'type': 'SelfSupDataPreprocessor',
}

data_root = '/workdir/ILSVRC2012/'
dataset_type = 'ImageNet'

# Runtime hooks: checkpoint every epoch (keep last 3), log every 20 iters,
# visualization disabled during pretraining.
default_hooks = {
    'checkpoint': {'interval': 1, 'max_keep_ckpts': 3, 'type': 'CheckpointHook'},
    'logger': {'interval': 20, 'type': 'LoggerHook'},
    'param_scheduler': {'type': 'ParamSchedulerHook'},
    'sampler_seed': {'type': 'DistSamplerSeedHook'},
    'timer': {'type': 'IterTimerHook'},
    'visualization': {'enable': False, 'type': 'VisualizationHook'},
}

default_scope = 'mmpretrain'

# Distributed / multiprocessing environment settings.
env_cfg = {
    'cudnn_benchmark': True,
    'dist_cfg': {'backend': 'nccl'},
    'mp_cfg': {'mp_start_method': 'spawn', 'opencv_num_threads': 0},
}

launcher = 'pytorch'
load_from = None
log_level = 'INFO'

# MAE model: 75% of the 16x16 patches are masked; the decoder reconstructs
# (per-patch normalised) pixel values under an L2 reconstruction loss.
model = {
    'backbone': {'arch': 'b', 'mask_ratio': 0.75, 'patch_size': 16, 'type': 'MAELLaMA'},
    'head': {
        'loss': {'criterion': 'L2', 'type': 'PixelReconstructionLoss'},
        'norm_pix': True,
        'patch_size': 16,
        'type': 'MAEPretrainHead',
    },
    'init_cfg': [
        {'distribution': 'uniform', 'layer': 'Linear', 'type': 'Xavier'},
        {'bias': 0.0, 'layer': 'LayerNorm', 'type': 'Constant', 'val': 1.0},
    ],
    'neck': {
        'decoder_depth': 8,
        'decoder_embed_dim': 512,
        'decoder_num_heads': 16,
        'embed_dim': 768,
        'in_chans': 3,
        'mlp_ratio': 4.0,
        'patch_size': 16,
        'type': 'MAEPretrainDecoder',
    },
    'type': 'MAE',
}

# AdamW wrapped in AMP with dynamic loss scaling; bias/norm/token parameters
# are exempted from weight decay via paramwise_cfg.
optim_wrapper = {
    'loss_scale': 'dynamic',
    'optimizer': {
        'betas': (0.9, 0.95),
        'lr': 0.0024,
        'type': 'AdamW',
        'weight_decay': 0.05,
    },
    'paramwise_cfg': {
        'custom_keys': {
            'bias': {'decay_mult': 0.0},
            'cls_token': {'decay_mult': 0.0},
            'ln': {'decay_mult': 0.0},
            'mask_token': {'decay_mult': 0.0},
            'pos_embed': {'decay_mult': 0.0},
        },
    },
    'type': 'AmpOptimWrapper',
}

# LR schedule: 40-epoch linear warmup, then cosine annealing over the
# remaining 1560 epochs (both converted to per-iteration updates).
param_scheduler = [
    {
        'begin': 0,
        'by_epoch': True,
        'convert_to_iter_based': True,
        'end': 40,
        'start_factor': 0.0001,
        'type': 'LinearLR',
    },
    {
        'T_max': 1560,
        'begin': 40,
        'by_epoch': True,
        'convert_to_iter_based': True,
        'end': 1600,
        'type': 'CosineAnnealingLR',
    },
]

randomness = {'deterministic': False, 'diff_rank_seed': True, 'seed': 0}
resume = True

train_cfg = {'max_epochs': 1600, 'type': 'EpochBasedTrainLoop'}

# Training data: ImageNet train split; random resized crop (0.2-1.0 area,
# bicubic) to 224px, then horizontal flip with p=0.5.
train_dataloader = {
    'batch_size': 512,
    'collate_fn': {'type': 'default_collate'},
    'dataset': {
        'data_root': '/workdir/ILSVRC2012/',
        'pipeline': [
            {'type': 'LoadImageFromFile'},
            {
                'backend': 'pillow',
                'crop_ratio_range': (0.2, 1.0),
                'interpolation': 'bicubic',
                'scale': 224,
                'type': 'RandomResizedCrop',
            },
            {'prob': 0.5, 'type': 'RandomFlip'},
            {'type': 'PackInputs'},
        ],
        'split': 'train',
        'type': 'ImageNet',
    },
    'num_workers': 8,
    'persistent_workers': True,
    'pin_memory': True,
    'sampler': {'shuffle': True, 'type': 'DefaultSampler'},
}

# Stand-alone copy of the augmentation pipeline, kept as an independent
# object (not aliased to the dataloader's copy), as in the dumped config.
train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'backend': 'pillow',
        'crop_ratio_range': (0.2, 1.0),
        'interpolation': 'bicubic',
        'scale': 224,
        'type': 'RandomResizedCrop',
    },
    {'prob': 0.5, 'type': 'RandomFlip'},
    {'type': 'PackInputs'},
]

vis_backends = [{'type': 'LocalVisBackend'}]
visualizer = {
    'type': 'UniversalVisualizer',
    'vis_backends': [{'type': 'LocalVisBackend'}],
}

work_dir = './work_dirs/mae_lama-base-p16_8xb512-amp-coslr-1600e_in1k'