import sys

import gradio as gr
import numpy as np
import torch
import torchaudio


def convert_to_16_bit_wav(data):
    # Based on: https://docs.scipy.org/doc/scipy/reference/generated/scipy.io.wavfile.write.html
    if data.dtype == np.float32:
        print(
            "Audio data is not in 16-bit integer format.",
            "Trying to convert to 16-bit int format.",
            file=sys.stderr,
        )
        data = data / np.abs(data).max()
        data = data * 32767
        data = data.astype(np.int16)
    elif data.dtype == np.int32:
        print(
            "Audio data is not in 16-bit integer format.",
            "Trying to convert to 16-bit int format.",
            file=sys.stderr,
        )
        data = data / 65536  # scale the int32 range down to the int16 range
        data = data.astype(np.int16)
    elif data.dtype == np.int16:
        pass
    elif data.dtype == np.uint8:
        print(
            "Audio data is not in 16-bit integer format.",
            "Trying to convert to 16-bit int format.",
            file=sys.stderr,
        )
        data = data * 257 - 32768  # map [0, 255] onto [-32768, 32767]
        data = data.astype(np.int16)
    else:
        raise ValueError("Audio data cannot be converted to 16-bit int format.")
    return data


def pcm2float(sig, dtype='float32'):
    """Convert a PCM integer signal to floating point in [-1, 1).

    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182
    """
    sig = np.asarray(sig)
    if sig.dtype.kind not in 'iu':
        raise TypeError("'sig' must be an array of integers")
    dtype = np.dtype(dtype)
    if dtype.kind != 'f':
        raise TypeError("'dtype' must be a floating point type")

    i = np.iinfo(sig.dtype)
    abs_max = 2 ** (i.bits - 1)
    offset = i.min + abs_max
    return (sig.astype(dtype) - offset) / abs_max


def float2pcm(sig, dtype='int16'):
    """Convert a floating point signal in [-1, 1) to PCM integers.

    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182
    """
    sig = np.asarray(sig)
    if sig.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    dtype = np.dtype(dtype)
    if dtype.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")

    i = np.iinfo(dtype)
    abs_max = 2 ** (i.bits - 1)
    offset = i.min + abs_max
    return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)


@torch.no_grad()
def inference(audio, model_tag="hifigan_bn_tdnnf_wav2vec2_vq_48_v1"):
    sr, audio = audio
    audio = convert_to_16_bit_wav(audio)
    audio = pcm2float(audio)
    audio = torch.tensor(audio).unsqueeze(0)
    # The anonymization models expect 16 kHz input.
    audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(audio)

    print(model_tag, file=sys.stderr)
    model = torch.hub.load(
        "deep-privacy/SA-toolkit",
        "anonymization",
        tag_version=model_tag,
        trust_repo=True,
        force_reload=True,
    )
    model.eval()

    wav_conv = model.convert(audio, target="6081")  # hard-coded target speaker
    return 16000, float2pcm(wav_conv.squeeze().cpu().numpy())

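
# Hedged, illustrative sketch (not part of the original demo): how inference()
# could be called offline on an audio file instead of through the Gradio UI.
# `anonymize_file` and the default output path are hypothetical names.
def anonymize_file(path, model_tag="hifigan_bn_tdnnf_wav2vec2_vq_48_v1",
                   out_path="anonymized.wav"):
    wav, sr = torchaudio.load(path)          # float32 tensor in [-1, 1], shape (channels, samples)
    mono = wav.mean(dim=0).numpy()           # downmix to mono
    sr_out, pcm = inference((sr, float2pcm(mono)), model_tag)
    out = torch.tensor(pcm2float(pcm), dtype=torch.float32).unsqueeze(0)
    torchaudio.save(out_path, out, sr_out)
    return out_path
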
article = (
    "PhD thesis: Anonymizing Speech: Evaluating and Designing Speaker Anonymization Techniques"
    " | <a href='https://github.com/deep-privacy/SA-toolkit' target='_blank'>Github Repo</a>"
)
" with gr.Blocks() as interface: gr.Markdown( """ # SA-toolkit Demo: Speaker speech anonymization toolkit in python """ ) with gr.Row(): with gr.Column(): audio_input = gr.Audio(sources=["upload", "microphone"], type="numpy", label="File", interactive=True, elem_id="melody-input") model_tag = gr.Dropdown([ 'hifigan_bn_tdnnf_wav2vec2_vq_48_v1+f0-transformation=quant_16_awgn_2', 'hifigan_clean_bn_tdnnf_wav2vec2_train_600_vq_48_v1', 'hifigan_clean_bn_tdnnf_wav2vec2_train_600_vq_48_v1+f0-transformation=quant_16_awgn_2', 'hifigan_inception_bn_tdnnf_wav2vec2_train_600_vq_48_v1+f0-transformation=quant_16_awgn_2', 'hifigan_bn_tdnnf_wav2vec2_vq_48_v1', 'hifigan_bn_tdnnf_wav2vec2_100h_aug_v1', 'hifigan_bn_tdnnf_600h_aug_v1', 'hifigan_bn_tdnnf_600h_vq_48_v1', 'hifigan_bn_tdnnf_100h_vq_64_v1', 'hifigan_bn_tdnnf_100h_vq_256_v1', 'hifigan_bn_tdnnf_100h_aug_v1'], type='value', value='hifigan_bn_tdnnf_wav2vec2_vq_48_v1', label='Model') with gr.Row(): submit = gr.Button("Submit") with gr.Column(): audio_output = gr.Audio(label="Output") submit.click(inference, inputs=[audio_input, model_tag], outputs=[audio_output], batch=False) gr.Examples(fn=inference, examples=[['3853-163249-0000.flac']], inputs=[audio_input, "hifigan_bn_tdnnf_wav2vec2_vq_48_v1"], outputs=[audio_output], batch=False) gr.HTML(article) interface.launch()