Champion committed on
Commit
f5a24cf
·
unverified ·
1 Parent(s): c4a4cbb
Files changed (4) hide show
  1. 3853-163249-0000.flac +0 -0
  2. app.py +134 -0
  3. packages.txt +0 -0
  4. requirements.txt +3 -0
3853-163249-0000.flac ADDED
Binary file (214 kB). View file
 
app.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import torch
4
+ import torchaudio
5
+ import sys
6
+
7
+
8
def convert_to_16_bit_wav(data):
    """Convert an array of audio samples to 16-bit signed integer PCM.

    Based on:
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.io.wavfile.write.html

    Args:
        data: NumPy array of samples. Supported dtypes are any floating-point
            type, ``int32``, ``int16`` and ``uint8``.

    Returns:
        An ``int16`` array with the same shape as ``data``. ``int16`` input is
        returned unchanged (the same object, no copy). Floating-point input is
        peak-normalised so that its largest magnitude maps to 32767.

    Raises:
        ValueError: If ``data`` has any other dtype.
    """
    if data.dtype == np.int16:
        return data

    is_float = np.issubdtype(data.dtype, np.floating)
    if not is_float and data.dtype not in (np.int32, np.uint8):
        raise ValueError("Audio data cannot be converted to " "16-bit int format.")

    print(
        "Audio data is not in 16-bit integer format.",
        "Trying to convert to 16-bit int format.",
        file=sys.stderr
    )

    if is_float:
        if data.dtype == np.float16:
            # float16 is too coarse for the scaling below.
            data = data.astype(np.float32)
        # Silent (all-zero) or empty input has no usable peak; skip the
        # normalisation instead of dividing by zero and producing NaNs.
        peak = np.abs(data).max() if data.size else 0
        if peak > 0:
            data = data / peak
        data = data * 32767
    elif data.dtype == np.int32:
        # Keep the 16 most significant bits: full scale int32 / 2**16 is
        # exactly the int16 range.
        data = data / 65536
    else:
        # uint8: widen first so the arithmetic cannot overflow the 8-bit type
        # (0 -> -32768, 255 -> 32767).
        data = data.astype(np.int32) * 257 - 32768
    return data.astype(np.int16)
41
+
42
def pcm2float(sig, dtype='float32'):
    """Rescale integer PCM samples to floating point in ``[-1.0, 1.0)``.

    Adapted from
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    Args:
        sig: Array-like of signed or unsigned integer samples.
        dtype: Floating-point dtype (or dtype name) of the result.

    Returns:
        Array of ``dtype`` where the midpoint of the integer range maps to
        0.0 and the minimum integer value maps to -1.0.

    Raises:
        TypeError: If ``sig`` is not an integer array or ``dtype`` is not a
            floating-point type.
    """
    samples = np.asarray(sig)
    if samples.dtype.kind not in ('i', 'u'):
        raise TypeError("'sig' must be an array of integers")

    out_dtype = np.dtype(dtype)
    if out_dtype.kind != 'f':
        raise TypeError("'dtype' must be a floating point type")

    info = np.iinfo(samples.dtype)
    # Half of the integer range, e.g. 32768 for int16 and 128 for uint8.
    half_range = 1 << (info.bits - 1)
    # Integer value that represents silence (0 for signed, mid-range for
    # unsigned types).
    midpoint = info.min + half_range
    centered = samples.astype(out_dtype) - midpoint
    return centered / half_range
57
+
58
+
59
def float2pcm(sig, dtype='int16'):
    """Convert floating-point samples in ``[-1.0, 1.0)`` to integer PCM.

    Adapted from
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    Values outside the representable range are clipped to the limits of the
    target integer type; the fractional part is truncated by the final cast.

    Args:
        sig: Array-like of floating-point samples.
        dtype: Integer dtype (or dtype name) of the result.

    Returns:
        Array of ``dtype`` with the same shape as ``sig``.

    Raises:
        TypeError: If ``sig`` is not a float array or ``dtype`` is not an
            integer type.
    """
    sig = np.asarray(sig)
    if sig.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    dtype = np.dtype(dtype)
    if dtype.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")
    i = np.iinfo(dtype)
    abs_max = 2 ** (i.bits - 1)
    offset = i.min + abs_max
    # Scale and clip in at least double precision. In float32 the upper clip
    # bound of a 32-bit target (2**31 - 1) rounds up to 2**31, so full-scale
    # positive input overflowed in the integer cast. Results for 8/16-bit
    # targets are unchanged because scaling by a power of two is exact.
    # NOTE: 64-bit integer targets can still overflow at positive full scale,
    # since 2**63 - 1 is not representable in float64 either.
    wide = sig.astype(np.result_type(sig.dtype, np.float64), copy=False)
    return (wide * abs_max + offset).clip(i.min, i.max).astype(dtype)
73
+
74
+
75
@torch.no_grad()
def inference(audio, model_tag, target="6081"):
    """Anonymize one utterance with a pretrained SA-toolkit model.

    Args:
        audio: ``(sample_rate, samples)`` tuple as delivered by the
            ``gr.Audio(type="numpy")`` input, or ``None`` when nothing was
            uploaded or recorded.
        model_tag: Name of the pretrained model, forwarded to
            ``torch.hub.load`` as ``tag_version``.
        target: Target speaker identifier forwarded to ``model.convert``.
            Defaults to "6081", the value that used to be hard coded.

    Returns:
        ``(16000, samples)`` where ``samples`` is the converted waveform as
        an ``int16`` array.

    Raises:
        ValueError: If ``audio`` is ``None`` or its samples have an
            unsupported dtype.
    """
    if audio is None:
        # Submitting with an empty input would otherwise fail on the tuple
        # unpacking below with an unhelpful TypeError.
        raise ValueError("No input audio provided.")

    sr, audio = audio
    audio = convert_to_16_bit_wav(audio)
    audio = pcm2float(audio)
    if audio.ndim > 1:
        # Assumes multi-channel input is laid out as (samples, channels), as
        # described in the Gradio Audio docs — down-mix to mono so that the
        # resampler below operates along the time axis.
        audio = audio.mean(axis=1)
    # Add a leading dimension of size 1 in front of the sample axis.
    audio = torch.tensor(audio).unsqueeze(0)
    audio = torchaudio.transforms.Resample(orig_freq=sr,
                                           new_freq=16000)(audio)
    print(model_tag, file=sys.stderr)
    # NOTE(review): the model is fetched/instantiated on every request;
    # consider caching it per tag if memory allows.
    model = torch.hub.load("deep-privacy/SA-toolkit", "anonymization", tag_version=model_tag, trust_repo=True)
    model.eval()
    wav_conv = model.convert(audio, target=target)
    return 16000, float2pcm(wav_conv.squeeze().cpu().numpy())
88
+
89
+
90
# HTML footer rendered below the demo: links to the thesis and the code.
article = (
    "<p style='text-align: center'>"
    "<a href='https://arxiv.org/abs/2308.04455' target='_blank'>"
    "PhD thesis: Anonymizing Speech: Evaluating and Designing Speaker Anonymization Techniques"
    "</a> | "
    "<a href='https://github.com/deep-privacy/SA-toolkit' target='_blank'>Github Repo</a>"
    "</p>"
)
91
+
92
+
93
def toggle_audio_src(choice):
    """Build the update that switches the audio input between mic and file.

    Args:
        choice: Value of the source selector; "mic" selects the microphone,
            any other value selects file upload.

    Returns:
        The ``gr.update(...)`` result that sets the component's source and
        label and clears its current value.
    """
    use_mic = choice == "mic"
    source = "microphone" if use_mic else "upload"
    label = "Microphone (best with a headset)" if use_mic else "File"
    return gr.update(source=source, value=None, label=label)
98
+
99
# Build the demo page: input column (source selector, audio input, model
# choice, submit button) next to an output column with the converted audio.
with gr.Blocks() as interface:
    # Pretrained model tags offered in the dropdown; the first one is the
    # default selection and the one used for the bundled example.
    model_tags = ['hifigan_bn_tdnnf_wav2vec2_vq_48_v1',
                  'hifigan_bn_tdnnf_wav2vec2_100h_aug_v1',
                  'hifigan_bn_tdnnf_600h_aug_v1',
                  'hifigan_bn_tdnnf_100h_vq_64_v1',
                  'hifigan_bn_tdnnf_100h_vq_256_v1',
                  'hifigan_bn_tdnnf_100h_aug_v1']
    default_model_tag = model_tags[0]

    gr.Markdown(
        """
        # SA-toolkit
        Demo: Speaker speech anonymization toolkit in python
        """
    )
    with gr.Row():
        with gr.Column():
            radio = gr.Radio(["file", "mic"], value="file",
                             label="Input speech (File or Mic)")
            audio_input = gr.Audio(source="upload", type="numpy", label="File",
                                   interactive=True, elem_id="melody-input")
            model_tag = gr.Dropdown(model_tags, type='value',
                                    value=default_model_tag,
                                    label='Model')
            with gr.Row():
                submit = gr.Button("Submit")
        with gr.Column():
            audio_output = gr.Audio(label="Output")
    submit.click(inference, inputs=[audio_input, model_tag],
                 outputs=[audio_output], batch=False)
    # Swap the audio widget between upload and microphone without queueing.
    radio.change(toggle_audio_src, radio, [audio_input], queue=False, show_progress=False)
    # Example inputs must be components, with one example value per input:
    # previously the model tag was passed as a bare string in `inputs` and the
    # example row had no value for it, so `inference` could not be run on the
    # example with its two required arguments.
    gr.Examples(fn=inference,
                examples=[['3853-163249-0000.flac', default_model_tag]],
                inputs=[audio_input, model_tag],
                outputs=[audio_output], batch=False)

    gr.HTML(article)
interface.queue().launch()
packages.txt ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ torch
2
+ torchaudio
3
+ numpy