# Page header captured together with this file (not part of the configuration):
#   Spaces: Runtime error
#   File size: 3,157 Bytes
#   ea174b0 (presumably the revision hash of the file)
# Audio codec configuration: sample rates plus the keyword arguments for each
# stage of the generator (feature extractor, encoders, adapters, residual
# vector quantizer, decoder and vocoder).
#
# NOTE(review): the nesting below is reconstructed from the section comments
# and key names. Every *_kwargs mapping is assumed to belong to
# generator_params -- confirm against the code that loads this file.

# Sample rates, anchored so the sections below can alias them.
# Presumably in Hz (the vocoder section is annotated "-> 24khz").
input_sample_rate: &input_sample_rate 16000
output_sample_rate: &output_sample_rate 24000

generator_params:
  input_sample_rate: *input_sample_rate
  output_sample_rate: *output_sample_rate

  # Mel feature extraction settings.
  # Consistency check: n_samples = chunk_length * sampling_rate
  # (30 * 16000 = 480000) and nb_max_frames = n_samples / hop_length
  # (480000 / 160 = 3000).
  feature_extractor_kwargs:
    chunk_length: 30
    feature_size: 80
    hop_length: 160
    n_fft: 400
    n_samples: 480000
    nb_max_frames: 3000
    padding_side: right
    padding_value: 0.0
    return_attention_mask: false
    sampling_rate: *input_sample_rate

  # Codec args

  # Semantic channel
  semantic_encoder_kwargs:  # 100hz -> 50hz
    num_mel_bins: 80
    sampling_rate: *input_sample_rate
    hop_length: 160
    stride_size: 2
    kernel_size: 3
    d_model: 768
    scale_embedding: false
    max_audio_seconds: 30
    encoder_layers: 12
    encoder_attention_heads: 12
    encoder_ffn_dim: 3072
    activation_function: "gelu"

  semantic_encoder_adapter_kwargs:  # 50hz
    input_dim: 768
    output_dim: 768
    d_model: 768
    # 1500 = 30 * 50; presumably 30 s of frames at 50 Hz -- TODO confirm.
    max_source_positions: 1500
    encoder_layers: 4
    encoder_attention_heads: 12
    encoder_ffn_dim: 3072

  # Acoustic channel
  acoustic_encoder_kwargs:  # 100hz -> 50hz
    num_mel_bins: 80
    sampling_rate: *input_sample_rate
    hop_length: 160
    stride_size: 2
    kernel_size: 3
    d_model: 768
    scale_embedding: false
    max_audio_seconds: 30
    encoder_layers: 12
    encoder_attention_heads: 12
    encoder_ffn_dim: 3072
    activation_function: "gelu"

  # Semantic & acoustic shared parameters
  pre_rvq_adapter_kwargs:  # 50hz
    # 1536 = 768 + 768; presumably the two channels concatenated -- TODO confirm.
    input_dim: 1536
    output_dim: 768
    d_model: 768
    max_source_positions: 1500
    encoder_layers: 4
    encoder_attention_heads: 12
    encoder_ffn_dim: 3072

  downsample_kwargs:  # 50hz -> 12.5hz
    d_model: 768
    avg_pooler: 4

  quantizer_kwargs:  # 12.5hz
    # 3072 = 768 * 4; presumably d_model times the pooling factor -- TODO confirm.
    input_dim: 3072
    rvq_dim: 512
    output_dim: 3072
    num_quantizers: 8
    codebook_size: 1024
    codebook_dim: 512
    quantizer_dropout: 0.0
    commitment: 1

  post_rvq_adapter_kwargs:  # 12.5hz
    input_dim: 3072
    output_dim: 3072
    d_model: 768
    # 375 = 30 * 12.5; presumably 30 s of frames at 12.5 Hz -- TODO confirm.
    max_source_positions: 375
    encoder_layers: 4
    encoder_attention_heads: 12
    encoder_ffn_dim: 3072

  upsample_kwargs:  # 12.5hz -> 50hz
    d_model: 768
    stride: 4

  # Acoustic channel
  acoustic_decoder_kwargs:  # 50hz -> 100hz
    num_mel_bins: 80
    sampling_rate: *input_sample_rate
    hop_length: 160
    stride_size: 2
    kernel_size: 3
    d_model: 768
    scale_embedding: false
    max_audio_seconds: 30
    decoder_layers: 12
    decoder_attention_heads: 12
    decoder_ffn_dim: 3072
    activation_function: "gelu"

  vocos_kwargs:  # 100hz -> 24khz
    input_channels: 80
    dim: 512
    intermediate_dim: 4096
    num_layers: 30
    n_fft: 960
    # 24000 / 240 = 100, matching the 100hz frame rate noted above.
    hop_size: 240
    padding: "same"