File size: 3,157 Bytes
ea174b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# Audio sample rates, declared once as YAML anchors so the nested mappings can
# reuse them through aliases (*input_sample_rate, *output_sample_rate).
# The rate comments in this file (e.g. "24khz") indicate the unit is Hz.
input_sample_rate: &input_sample_rate 16000
output_sample_rate: &output_sample_rate 24000

# Top-level mapping that holds the sample rates and every *_kwargs mapping
# nested beneath it.
generator_params:
    # Aliases that resolve to the top-level anchored values (16000 and 24000).
    input_sample_rate: *input_sample_rate
    output_sample_rate: *output_sample_rate

    # Settings for the input feature extractor.
    # NOTE(review): the key names resemble a Hugging Face WhisperFeatureExtractor
    # config — confirm against the code that consumes this mapping.
    feature_extractor_kwargs:
        chunk_length: 30  # presumably seconds; see n_samples below
        feature_size: 80  # same value as num_mel_bins in the encoder mappings
        hop_length: 160  # 16000 / 160 = 100 frames per second of input audio
        n_fft: 400
        n_samples: 480000  # equals chunk_length * sampling_rate (30 * 16000)
        nb_max_frames: 3000  # equals n_samples / hop_length (480000 / 160)
        padding_side: right
        padding_value: 0.0
        return_attention_mask: false
        sampling_rate: *input_sample_rate  # alias of the top-level input rate

    ## Codec Args

    ## semantic channel
    # Hyperparameters for the semantic-channel encoder.
    semantic_encoder_kwargs:  # frame rate: 100hz -> 50hz
        num_mel_bins: 80
        sampling_rate: *input_sample_rate  # alias of the top-level input rate
        hop_length: 160  # 16000 / 160 = 100 input frames per second
        stride_size: 2  # presumably the 2x reduction behind "100hz -> 50hz" — confirm
        kernel_size: 3
        d_model: 768
        scale_embedding: false
        max_audio_seconds: 30
        encoder_layers: 12
        encoder_attention_heads: 12
        encoder_ffn_dim: 3072  # 4 * d_model
        activation_function: "gelu"

    # Hyperparameters for the adapter that follows the semantic encoder.
    # Input and output widths are both 768, so the feature size is unchanged.
    semantic_encoder_adapter_kwargs:  # frame rate: 50hz
        input_dim: 768
        output_dim: 768
        d_model: 768
        max_source_positions: 1500  # 30 s * 50 frames per second
        encoder_layers: 4
        encoder_attention_heads: 12
        encoder_ffn_dim: 3072  # 4 * d_model


    ## acoustic channel
    # Hyperparameters for the acoustic-channel encoder.
    # Every value here is identical to the corresponding key in
    # semantic_encoder_kwargs.
    acoustic_encoder_kwargs:  # frame rate: 100hz -> 50hz
        num_mel_bins: 80
        sampling_rate: *input_sample_rate  # alias of the top-level input rate
        hop_length: 160  # 16000 / 160 = 100 input frames per second
        stride_size: 2  # presumably the 2x reduction behind "100hz -> 50hz" — confirm
        kernel_size: 3
        d_model: 768
        scale_embedding: false
        max_audio_seconds: 30
        encoder_layers: 12
        encoder_attention_heads: 12
        encoder_ffn_dim: 3072  # 4 * d_model
        activation_function: "gelu"


    ## parameters shared by the semantic & acoustic channels
    # Hyperparameters for the adapter applied before the residual vector
    # quantizer (RVQ); it maps 1536-dim input to 768-dim output.
    pre_rvq_adapter_kwargs:  # frame rate: 50hz
        # 1536 = 2 * 768; presumably the semantic and acoustic 768-dim streams
        # concatenated — confirm against the model code.
        input_dim: 1536
        output_dim: 768
        d_model: 768
        max_source_positions: 1500  # 30 s * 50 frames per second
        encoder_layers: 4
        encoder_attention_heads: 12
        encoder_ffn_dim: 3072  # 4 * d_model
        
    # Temporal downsampling settings applied ahead of the quantizer.
    downsample_kwargs:  # frame rate: 50hz -> 12.5hz
        d_model: 768
        # 50 / 4 = 12.5. Note 4 * d_model = 3072, the input_dim used by
        # quantizer_kwargs — TODO confirm how the pooled frames are combined.
        avg_pooler: 4

    # Residual vector quantizer (RVQ) settings.
    quantizer_kwargs:  # frame rate: 12.5hz
        input_dim: 3072
        rvq_dim: 512  # same value as codebook_dim
        output_dim: 3072  # same width as input_dim
        num_quantizers: 8
        codebook_size: 1024  # 2^10 entries per codebook
        codebook_dim: 512
        quantizer_dropout: 0.0
        commitment: 1  # NOTE(review): presumably the commitment-loss weight — confirm
    
    # Hyperparameters for the adapter applied after the residual vector
    # quantizer (RVQ); input and output widths are both 3072.
    post_rvq_adapter_kwargs:  # frame rate: 12.5hz
        input_dim: 3072
        output_dim: 3072
        d_model: 768
        max_source_positions: 375  # 30 s * 12.5 frames per second
        encoder_layers: 4
        encoder_attention_heads: 12
        encoder_ffn_dim: 3072  # 4 * d_model

    # Temporal upsampling settings applied after the post-RVQ adapter.
    upsample_kwargs:  # frame rate: 12.5hz -> 50hz
        d_model: 768
        stride: 4  # 12.5 * 4 = 50; same factor as avg_pooler in downsample_kwargs
    
    ## acoustic channel (decoder side)
    # Hyperparameters for the acoustic-channel decoder.
    # Values match acoustic_encoder_kwargs, with decoder_* keys in place of
    # the encoder_* keys.
    acoustic_decoder_kwargs:  # frame rate: 50hz -> 100hz
        num_mel_bins: 80
        sampling_rate: *input_sample_rate  # alias of the top-level input rate
        hop_length: 160
        stride_size: 2  # presumably the 2x increase behind "50hz -> 100hz" — confirm
        kernel_size: 3
        d_model: 768
        scale_embedding: false
        max_audio_seconds: 30
        decoder_layers: 12
        decoder_attention_heads: 12
        decoder_ffn_dim: 3072  # 4 * d_model
        activation_function: "gelu"
    
    # Settings for the final waveform-generation stage.
    # NOTE(review): the key name suggests a Vocos-style vocoder — confirm
    # against the model code.
    vocos_kwargs:  # 100hz frames -> 24khz waveform
        input_channels: 80  # same value as num_mel_bins in the decoder mapping
        dim: 512
        intermediate_dim: 4096
        num_layers: 30
        n_fft: 960  # 4 * hop_size
        hop_size: 240  # 24000 / 240 = 100 frames per second of output audio
        padding: "same"