| | |
| |
|
| | |
| | |
| |
|
| | import soundfile |
| | import json |
| | import numpy as np |
| | import audb |
| | from pathlib import Path |
| |
|
| | LABELS = ['arousal', 'dominance', 'valence'] |
| |
|
| |
|
| |
|
| |
|
def load_speech(split=None, cache_root='/cache/audb/'):
    """Collect the index entries of the configured audb tables.

    Every configured database is loaded through ``audb.load`` as mono
    16 kHz WAV, the requested table is materialised with ``.get()`` and
    its index entries are appended to one flat list.  The caller uses the
    returned entries as audio file paths.

    Args:
        split: Unused; kept so existing callers keep working.
        cache_root: Directory handed to ``audb.load`` as ``cache_root``.
            Defaults to the previously hard-coded ``'/cache/audb/'``.

    Returns:
        list: Index entries of all tables whose ``has_timedeltas`` flag is
        false, in configuration order.
    """
    # One row per source: [database, version, table, has_timedeltas].
    databases = [
        ['emodb', '1.2.0', 'emotion.categories.train.gold_standard', False],
    ]

    output_list = []
    for database_name, ver, table, has_timedeltas in databases:
        db = audb.load(
            database_name,
            sampling_rate=16000,
            format='wav',
            mixdown=True,
            version=ver,
            cache_root=cache_root,
        )
        frame = db[table].get()
        if has_timedeltas:
            # Tables flagged this way are only reported; none of their
            # entries are added to the result.
            print(f'{has_timedeltas=}')
        else:
            output_list.extend(frame.index)
    return output_list
| |
|
| |
|
| |
|
| |
|
| |
|
| | |
| |
|
| |
|
| |
|
| |
|
| | |
# Reference recordings used for the 'human' style prompt.
natural_wav_paths = load_speech()

import msinference
import os
from random import shuffle
import audiofile

# harvard.json holds {'sentences': [{'sentences': [text, ...]}, ...]}.
with open('harvard.json', 'r') as f:
    harvard_individual_sentences = json.load(f)['sentences']

# Candidate style-reference files, one pool per prompt type.
synthetic_wav_paths = ['./enslow/' + i for i in
                       os.listdir('./enslow/')]
synthetic_wav_paths_4x = ['./style_vector_v2/' + i for i in
                          os.listdir('./style_vector_v2/')]
# The foreign pools skip every file whose name contains 'en_U'.
synthetic_wav_paths_foreign = [
    './mimic3_foreign/' + i
    for i in os.listdir('./mimic3_foreign/') if 'en_U' not in i]
synthetic_wav_paths_foreign_4x = [
    './mimic3_foreign_4x/' + i
    for i in os.listdir('./mimic3_foreign_4x/') if 'en_U' not in i]

# Keep only references longer than 2 seconds.  The filtered 4x list used to
# be bound to a misspelled name, which left that pool unfiltered.
synthetic_wav_paths_foreign = [
    i for i in synthetic_wav_paths_foreign if audiofile.duration(i) > 2]
synthetic_wav_paths_foreign_4x = [
    i for i in synthetic_wav_paths_foreign_4x if audiofile.duration(i) > 2]
synthetic_wav_paths = [
    i for i in synthetic_wav_paths if audiofile.duration(i) > 2]
synthetic_wav_paths_4x = [
    i for i in synthetic_wav_paths_4x if audiofile.duration(i) > 2]

shuffle(synthetic_wav_paths_foreign_4x)
shuffle(synthetic_wav_paths_foreign)
shuffle(synthetic_wav_paths)
shuffle(synthetic_wav_paths_4x)
print(len(synthetic_wav_paths_foreign_4x), len(synthetic_wav_paths_foreign),
      len(synthetic_wav_paths), len(synthetic_wav_paths_4x))

# Prompt name -> pool of reference files; insertion order is the
# processing order.
style_prompt_pools = {
    'english': synthetic_wav_paths,
    'english_4x': synthetic_wav_paths_4x,
    'human': natural_wav_paths,
    'foreign': synthetic_wav_paths_foreign,
    'foreign_4x': synthetic_wav_paths_foreign_4x,
}

for audio_prompt, prompt_paths in style_prompt_pools.items():
    OUT_FILE = f'{audio_prompt}_hfullh.wav'
    if os.path.isfile(OUT_FILE):
        # A previous run already produced this prompt type.
        print('\nALREADY EXISTS\n')
        continue

    total_audio = []
    total_style = []
    ix = 0
    for list_of_10 in harvard_individual_sentences[:1000]:
        for text in list_of_10['sentences']:
            # Cycle through the pool so each sentence gets the next
            # reference file.
            _p = prompt_paths[ix % len(prompt_paths)]
            style_vec = msinference.compute_style(_p)
            print(ix, text)
            ix += 1
            x = msinference.inference(text,
                                      style_vec,
                                      alpha=0.3,
                                      beta=0.7,
                                      diffusion_steps=7,
                                      embedding_scale=1)
            total_audio.append(x)
            # Store the reference too, cut to at most len(x) entries.
            _st, fsr = audiofile.read(_p)
            total_style.append(_st[:len(x)])

    print('_____________________')

    total_audio = np.concatenate(total_audio)
    # NOTE(review): 24000 is presumably the msinference output rate - confirm.
    soundfile.write(OUT_FILE, total_audio, 24000)
    total_style = np.concatenate(total_style)
    # NOTE(review): written with the rate of the last reference read; this
    # assumes every file in a pool shares one sampling rate - confirm.
    soundfile.write('_st_' + OUT_FILE, total_style, fsr)
| |
|