Spaces:
Running
Running
File size: 7,231 Bytes
a28f35e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 |
import os
import struct
import argparse
import torch
import numpy as np
from silero_vad import load_silero_vad, __version__ as silero_version
def _parse_version(version):
    """Return the first three numeric components of *version* as ints.

    Missing components are filled with 0 and anything after the leading
    digits of a component (for example the ``rc1`` in ``"2rc1"``) is ignored,
    so a version string that is not exactly ``major.minor.patch`` does not
    abort the conversion.
    """
    numbers = []
    for piece in version.split(".")[:3]:
        digits = ""
        for ch in piece:
            if ch not in "0123456789":
                break
            digits += ch
        numbers.append(int(digits) if digits else 0)
    numbers.extend([0] * (3 - len(numbers)))
    return tuple(numbers)


def _write_i32(fout, *values):
    """Write each of *values* to *fout* with the native ``struct`` "i" format."""
    fout.write(struct.pack(f"{len(values)}i", *values))


def _prepare_tensor(key, tensor, verbose):
    """Turn one model tensor into the array, shape and type code to write.

    Args:
        key: Cleaned state-dict key of the tensor.
        tensor: The tensor stored under *key*.
        verbose: When true, print the STFT tensor's original shape.

    Returns:
        A ``(data, ggml_shape, ftype)`` tuple: the NumPy array to dump, the
        dimension list in reversed order, and ``1`` for float16 data or ``0``
        for float32 data.
    """
    if key == "_model.stft.forward_basis_buffer":
        # The STFT basis keeps all of its axes (no squeeze) and is always
        # written with three dimensions.
        data = tensor.detach().cpu().numpy()
        if verbose:
            print(f"STFT tensor original shape: {data.shape}")
        ggml_shape = [data.shape[2], data.shape[1], data.shape[0]]
        is_conv_weight = True
    else:
        data = tensor.detach().cpu().squeeze().numpy()
        # Keep at most 4 dimensions, then reverse their order for GGML.
        ggml_shape = list(data.shape)[:4]
        ggml_shape.reverse()
        is_conv_weight = "weight" in key and (
            "encoder" in key or "_model.decoder.decoder.2" in key
        )

    if is_conv_weight:
        return data.astype(np.float16), ggml_shape, 1
    # Force float32 so the payload always matches the ftype=0 tag written
    # in the tensor header, whatever dtype the tensor was stored in.
    return data.astype(np.float32, copy=False), ggml_shape, 0


def convert_silero_vad(output_path, print_tensors=True):
    """Convert the Silero-VAD "silero-16k" model to a GGML binary file.

    The model is obtained from ``load_silero_vad()``. Keys containing ``_8k``
    are dropped and every remaining key is given a ``_model.`` prefix if it
    does not already have one. The output file name is derived from
    *output_path* by inserting ``-v<silero version>-ggml`` before the
    extension.

    File layout, all integers written with the ``struct`` "i" format:
    magic, model type (length + bytes), version (major, minor, patch),
    window size, context size, encoder layer count, per-layer
    (in channels, out channels, kernel size), LSTM (input, hidden) sizes,
    final conv (in, out) channels, then one record per tensor:
    n_dims, name length, ftype, the dimensions, the name bytes and the raw
    tensor data.

    Args:
        output_path: Path used as the template for the output file name.
        print_tensors: When true (the default), print per-tensor debugging
            details. Progress, warning and summary messages are printed
            regardless.
    """
    state_dict = load_silero_vad().state_dict()

    # Clean up state dict keys - filter out the 8k model.
    cleaned_dict = {}
    for key, value in state_dict.items():
        if "_8k" in key:
            continue
        clean_key = key if key.startswith("_model.") else "_model." + key
        cleaned_dict[clean_key] = value

    base, ext = os.path.splitext(output_path)
    output_file = f"{base}-v{silero_version}-ggml{ext}"
    print(f"Saving GGML Silero-VAD model to {output_file}")

    if print_tensors:
        print("\nTensor info for debugging:")
        for key, tensor in cleaned_dict.items():
            print(f" - {key}: {tensor.shape} ({tensor.dtype})")
        print()

    # Parsed before the file is opened so nothing is written if it fails.
    major, minor, patch = _parse_version(silero_version)

    with open(output_file, "wb") as fout:
        # Write magic and the length-prefixed model type.
        _write_i32(fout, 0x67676d6c)
        model_type = "silero-16k".encode("utf-8")
        _write_i32(fout, len(model_type))
        fout.write(model_type)

        print(f"Version: {major}.{minor}.{patch}")
        _write_i32(fout, major, minor, patch)

        # Write model architecture parameters.
        window_size = 512
        context_size = 64
        n_encoder_layers = 4
        _write_i32(fout, window_size, context_size, n_encoder_layers)

        # Write encoder dimensions, one (in, out, kernel) triple per layer.
        input_channels = 129
        encoder_in_channels = [input_channels, 128, 64, 64]
        encoder_out_channels = [128, 64, 64, 128]
        kernel_size = 3
        for in_ch, out_ch in zip(encoder_in_channels, encoder_out_channels):
            _write_i32(fout, in_ch, out_ch, kernel_size)

        # Write LSTM dimensions.
        lstm_input_size = 128
        lstm_hidden_size = 128
        _write_i32(fout, lstm_input_size, lstm_hidden_size)

        # Write final conv dimensions.
        final_conv_in = 128
        final_conv_out = 1
        _write_i32(fout, final_conv_in, final_conv_out)

        # Define tensor keys to write, in file order.
        tensor_keys = []

        # Encoder weights: a layer is listed only when both tensors exist.
        for i in range(n_encoder_layers):
            weight_key = f"_model.encoder.{i}.reparam_conv.weight"
            bias_key = f"_model.encoder.{i}.reparam_conv.bias"
            if weight_key in cleaned_dict and bias_key in cleaned_dict:
                tensor_keys.append(weight_key)
                tensor_keys.append(bias_key)

        # LSTM weights.
        lstm_keys = [
            "_model.decoder.rnn.weight_ih",
            "_model.decoder.rnn.weight_hh",
            "_model.decoder.rnn.bias_ih",
            "_model.decoder.rnn.bias_hh",
        ]
        tensor_keys.extend(k for k in lstm_keys if k in cleaned_dict)

        # Final conv weights.
        final_keys = [
            "_model.decoder.decoder.2.weight",
            "_model.decoder.decoder.2.bias",
        ]
        tensor_keys.extend(k for k in final_keys if k in cleaned_dict)

        # STFT basis - added last, and unconditionally, so that its absence
        # is reported below instead of passing unnoticed.
        tensor_keys.append("_model.stft.forward_basis_buffer")

        print(f"Writing {len(tensor_keys)} tensors:")
        for key in tensor_keys:
            if key in cleaned_dict:
                print(f" - {key}: {cleaned_dict[key].shape}")
            else:
                print(f" - {key}: MISSING")

        # Process each tensor.
        for key in tensor_keys:
            if key not in cleaned_dict:
                print(f"Warning: Missing tensor {key}, skipping")
                continue

            tensor = cleaned_dict[key]
            data, tensor_shape, ftype = _prepare_tensor(key, tensor, print_tensors)
            n_dims = len(tensor_shape)

            if print_tensors:
                print(f"\nWriting tensor: {key}")
                print(f" Original shape: {tensor.shape}")
                print(f" Processed shape: {data.shape}")
                print(f" GGML dimensions: {n_dims}")
                print(f" GGML shape: {tensor_shape}")
                print(f" Type: {'float16' if ftype == 1 else 'float32'}")

            # Tensor header, then dimensions, then name, then raw data.
            name_bytes = key.encode('utf-8')
            _write_i32(fout, n_dims, len(name_bytes), ftype)
            _write_i32(fout, *tensor_shape)
            if print_tensors:
                for i, size in enumerate(tensor_shape):
                    print(f" Writing dimension {i}: {size}")
            fout.write(name_bytes)
            data.tofile(fout)

            if print_tensors:
                print(f" Wrote {data.size * (2 if ftype==1 else 4)} bytes")

    print(f"\nDone! Model has been converted to GGML format: {output_file}")
    print(f"File size: {os.path.getsize(output_file)} bytes")
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and run the conversion.
    parser = argparse.ArgumentParser(description="Convert Silero-VAD PyTorch model to GGML format")
    parser.add_argument("--output", type=str, required=True, help="Path to output GGML model file")
    # "--print-tensors" is a store_true flag whose default is already True,
    # so by itself it can never yield False. The paired "--no-print-tensors"
    # flag stores False into the same destination.
    parser.add_argument("--print-tensors", dest="print_tensors", action="store_true", help="Print tensor values", default=True)
    parser.add_argument("--no-print-tensors", dest="print_tensors", action="store_false", help="Do not print tensor values")
    args = parser.parse_args()
    convert_silero_vad(args.output, args.print_tensors)
|