mlx-community
/

sam-audio-small

speech generation

voice isolation

Model card Files Files and versions

sam-audio-small / config.json

prince-canuma's picture

Upload folder using huggingface_hub

1be2e40 verified 17 days ago

history blame contribute delete

2.25 kB

	{
	"in_channels": 768,
	"audio_codec": {
	"encoder_dim": 64,
	"encoder_rates": [
	2,
	8,
	10,
	12
	],
	"latent_dim": 1024,
	"decoder_dim": 1536,
	"decoder_rates": [
	12,
	10,
	8,
	2
	],
	"n_codebooks": 16,
	"codebook_size": 1024,
	"codebook_dim": 128,
	"quantizer_dropout": false,
	"sample_rate": 48000,
	"mean": 0.0,
	"std": 1.0
	},
	"text_encoder": {
	"dim": 768,
	"name": "t5-base",
	"max_length": 512,
	"pad_mode": "longest"
	},
	"vision_encoder": {
	"dim": 1024,
	"batch_size": 300,
	"name": "PE-Core-L14-336",
	"normalize_feature": true,
	"interpolation_mode": "BICUBIC",
	"image_size": 336
	},
	"transformer": {
	"dim": 1536,
	"n_heads": 12,
	"n_layers": 12,
	"dropout": 0.1,
	"norm_eps": 1e-05,
	"qk_norm": true,
	"fc_bias": false,
	"ffn_exp": 4,
	"ffn_dim_multiplier": 1,
	"multiple_of": 64,
	"non_linearity": "swiglu",
	"use_rope": true,
	"max_positions": 10000,
	"frequency_embedding_dim": 256,
	"timestep_non_linearity": "swiglu",
	"t_block_non_linearity": "silu",
	"t_block_bias": true,
	"context_dim": 1536,
	"context_non_linearity": "swiglu",
	"context_embedder_dropout": 0.0,
	"context_norm": false,
	"out_channels": 256,
	"in_channels": null
	},
	"num_anchors": 3,
	"anchor_embedding_dim": 128,
	"visual_ranker": {
	"checkpoint": null,
	"kind": "imagebind"
	},
	"text_ranker": {
	"rankers": {
	"clap": [
	{
	"checkpoint": null,
	"kind": "clap"
	},
	5.0
	],
	"judge": [
	{
	"checkpoint_or_model_id": "facebook/sam-audio-judge",
	"kind": "judge"
	},
	1.0
	]
	},
	"kind": "ensemble"
	},
	"span_predictor": "pe-a-frame-large"
	}