| { | |
| "in_channels": 768, | |
| "audio_codec": { | |
| "encoder_dim": 64, | |
| "encoder_rates": [ | |
| 2, | |
| 8, | |
| 10, | |
| 12 | |
| ], | |
| "latent_dim": 1024, | |
| "decoder_dim": 1536, | |
| "decoder_rates": [ | |
| 12, | |
| 10, | |
| 8, | |
| 2 | |
| ], | |
| "n_codebooks": 16, | |
| "codebook_size": 1024, | |
| "codebook_dim": 128, | |
| "quantizer_dropout": false, | |
| "sample_rate": 48000, | |
| "mean": 0.0, | |
| "std": 1.0 | |
| }, | |
| "text_encoder": { | |
| "dim": 768, | |
| "name": "t5-base", | |
| "max_length": 512, | |
| "pad_mode": "longest" | |
| }, | |
| "vision_encoder": { | |
| "dim": 1024, | |
| "batch_size": 300, | |
| "name": "PE-Core-L14-336", | |
| "normalize_feature": true, | |
| "interpolation_mode": "BICUBIC", | |
| "image_size": 336 | |
| }, | |
| "transformer": { | |
| "dim": 1536, | |
| "n_heads": 12, | |
| "n_layers": 12, | |
| "dropout": 0.1, | |
| "norm_eps": 1e-05, | |
| "qk_norm": true, | |
| "fc_bias": false, | |
| "ffn_exp": 4, | |
| "ffn_dim_multiplier": 1, | |
| "multiple_of": 64, | |
| "non_linearity": "swiglu", | |
| "use_rope": true, | |
| "max_positions": 10000, | |
| "frequency_embedding_dim": 256, | |
| "timestep_non_linearity": "swiglu", | |
| "t_block_non_linearity": "silu", | |
| "t_block_bias": true, | |
| "context_dim": 1536, | |
| "context_non_linearity": "swiglu", | |
| "context_embedder_dropout": 0.0, | |
| "context_norm": false, | |
| "out_channels": 256, | |
| "in_channels": null | |
| }, | |
| "num_anchors": 3, | |
| "anchor_embedding_dim": 128, | |
| "visual_ranker": { | |
| "checkpoint": null, | |
| "kind": "imagebind" | |
| }, | |
| "text_ranker": { | |
| "rankers": { | |
| "clap": [ | |
| { | |
| "checkpoint": null, | |
| "kind": "clap" | |
| }, | |
| 5.0 | |
| ], | |
| "judge": [ | |
| { | |
| "checkpoint_or_model_id": "facebook/sam-audio-judge", | |
| "kind": "judge" | |
| }, | |
| 1.0 | |
| ] | |
| }, | |
| "kind": "ensemble" | |
| }, | |
| "span_predictor": "pe-a-frame-large" | |
| } | |