sam-audio-small / config.json
prince-canuma's picture
Upload folder using huggingface_hub
1be2e40 verified
{
"in_channels": 768,
"audio_codec": {
"encoder_dim": 64,
"encoder_rates": [
2,
8,
10,
12
],
"latent_dim": 1024,
"decoder_dim": 1536,
"decoder_rates": [
12,
10,
8,
2
],
"n_codebooks": 16,
"codebook_size": 1024,
"codebook_dim": 128,
"quantizer_dropout": false,
"sample_rate": 48000,
"mean": 0.0,
"std": 1.0
},
"text_encoder": {
"dim": 768,
"name": "t5-base",
"max_length": 512,
"pad_mode": "longest"
},
"vision_encoder": {
"dim": 1024,
"batch_size": 300,
"name": "PE-Core-L14-336",
"normalize_feature": true,
"interpolation_mode": "BICUBIC",
"image_size": 336
},
"transformer": {
"dim": 1536,
"n_heads": 12,
"n_layers": 12,
"dropout": 0.1,
"norm_eps": 1e-05,
"qk_norm": true,
"fc_bias": false,
"ffn_exp": 4,
"ffn_dim_multiplier": 1,
"multiple_of": 64,
"non_linearity": "swiglu",
"use_rope": true,
"max_positions": 10000,
"frequency_embedding_dim": 256,
"timestep_non_linearity": "swiglu",
"t_block_non_linearity": "silu",
"t_block_bias": true,
"context_dim": 1536,
"context_non_linearity": "swiglu",
"context_embedder_dropout": 0.0,
"context_norm": false,
"out_channels": 256,
"in_channels": null
},
"num_anchors": 3,
"anchor_embedding_dim": 128,
"visual_ranker": {
"checkpoint": null,
"kind": "imagebind"
},
"text_ranker": {
"rankers": {
"clap": [
{
"checkpoint": null,
"kind": "clap"
},
5.0
],
"judge": [
{
"checkpoint_or_model_id": "facebook/sam-audio-judge",
"kind": "judge"
},
1.0
]
},
"kind": "ensemble"
},
"span_predictor": "pe-a-frame-large"
}