@@ -1435,11 +1435,11 @@ def _cfg(url='', **kwargs):
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1280),
 
     'vit_base_patch32_clip_224.datacompxl': _cfg(
-        hf_hub_id='laion/',
+        hf_hub_id='laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K',
         hf_hub_filename='open_clip_pytorch_model.bin',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_base_patch32_clip_256.datacompxl': _cfg(
-        hf_hub_id='laion/',
+        hf_hub_id='laion/CLIP-ViT-B-32-256x256-DataComp-s34B-b86K',
         hf_hub_filename='open_clip_pytorch_model.bin',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         crop_pct=1.0, input_size=(3, 256, 256), num_classes=512),
@@ -1994,6 +1994,17 @@ def vit_base_patch32_clip_224(pretrained=False, **kwargs) -> VisionTransformer:
     return model
 
 
+@register_model
+def vit_base_patch32_clip_256(pretrained=False, **kwargs) -> VisionTransformer:
+    """ ViT-B/32 CLIP image tower @ 256x256
+    """
+    model_args = dict(
+        patch_size=32, embed_dim=768, depth=12, num_heads=12, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_base_patch32_clip_256', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_base_patch32_clip_384(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-B/32 CLIP image tower @ 384x384
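
Usage sketch (not part of the diff): with this change applied, the new 256x256 DataComp image tower should be reachable through timm's standard factory. The pretrained tag below is assumed from the 'vit_base_patch32_clip_256.datacompxl' config key added above, and the 512-dim output follows from its num_classes=512 setting.

import timm
import torch

# Assumes a timm install that includes the config and register_model entry from this diff.
model = timm.create_model('vit_base_patch32_clip_256.datacompxl', pretrained=True)
model.eval()

# CLIP image tower at 256x256 input, projecting to 512-dim features.
x = torch.randn(1, 3, 256, 256)
with torch.no_grad():
    out = model(x)
print(out.shape)  # expected: torch.Size([1, 512])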