{ "architectures": [ "ArlowVLVisionModel" ], "deepstack_visual_indexes": [ 12, 24, 44 ], "deformable_attention_strength": 4.0, "deformable_attention_window": 0.25, "depth": 48, "dtype": "float16", "embed_dim": 1536, "hidden_act": "gelu_pytorch_tanh", "hidden_size": 3072, "in_channels": 3, "initializer_range": 0.02, "max_position_embeddings": 32768, "mlp_ratio": 4, "model_type": "arlow", "mrope_sections": [ 21, 21, 22 ], "num_attention_heads": 24, "num_heads": 24, "patch_size": 14, "spatial_merge_size": 2, "temporal_patch_size": 2, "token_pruning_ratio": 0.0, "transformers_version": "5.3.0.dev0", "use_deformable_attention": true, "use_progressive_patches": true }