File size: 3,123 Bytes
fda2aa0
9b9c715
fda2aa0
9b9c715
fda2aa0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9b9c715
 
 
 
 
 
fda2aa0
9b9c715
 
 
fda2aa0
9b9c715
 
fda2aa0
 
9b9c715
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fda2aa0
9b9c715
 
 
 
fda2aa0
 
 
 
 
 
 
 
 
 
 
 
 
9b9c715
 
 
fda2aa0
9b9c715
 
 
fda2aa0
9b9c715
 
 
 
 
 
 
 
 
 
fda2aa0
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# Synthesize all Harvard Lists - 767 sentences as single .wav
#
# 1.
#
#     './prompt_mimic3_english/'
#
# 2.
#
#     './prompt_mimic3_english_4x/'
#
# 3.
#
#     './prompt_human/'
#
# 4.
#
#     './prompt_mimic3_foreign/'
#
# 5.
#   
#     './prompt_mimic3_foreign_4x/'
#
#
#  ----> THE .wavs will be used for visualisation

import soundfile
import json
import numpy as np
import audb
from pathlib import Path
import os
LABELS = ['arousal', 'dominance', 'valence']


def load_human_speech(split=None):
    """Return file paths of human emotional speech loaded via ``audb``.

    Parameters
    ----------
    split : unused; kept for backward compatibility with callers.

    Returns
    -------
    list[str]
        Audio file paths taken from the loaded table's index
        (currently only the EmoDB gold-standard training table).
    """
    # Each entry: [dataset, version, table, has_timedeltas_or_is_full_wavfile]
    #    ['crema-d', '1.1.1', 'emotion.voice.test', False],
    databases = [
        ['emodb', '1.2.0', 'emotion.categories.train.gold_standard', False],
    ]

    collected = []
    for db_name, db_version, table_name, has_timedeltas in databases:
        loaded = audb.load(db_name,
                           sampling_rate=16000,
                           format='wav',
                           mixdown=True,
                           version=db_version,
                           cache_root='/cache/audb/')
        table = loaded[table_name].get()
        if has_timedeltas:
            # Segment-level entries (start/end offsets) are not handled yet.
            print(f'{has_timedeltas=}')
            # table = table.reset_index()[['file', 'start', 'end']]
            # collected += [[*t] for t
            #         in zip(table.file.values, table.start.dt.total_seconds().values, table.end.dt.total_seconds().values)]
        else:
            collected += list(table.index)  # whole files, no time-deltas
    return collected

    



# SYNTHESIZE mimic mimicx4 crema-d
import msinference

# Harvard sentences: JSON holds a list of entries, each with 10 sentences.
with open('harvard.json', 'r') as f:
    harvard_individual_sentences = json.load(f)['sentences']

for audio_prompt in [#'mimic3_english',
                     #'mimic3_english_4x',
                     'human',
                     'mimic3_foreign',
                     'mimic3_foreign_4x']:

    # Gather style-prompt audio paths for this condition.
    if audio_prompt == 'human':
        prompt_paths = load_human_speech()  # better emodb ?
    else:
        prompt_dir = '/data/dkounadis/artificial-styletts2/' + audio_prompt + '/'
        prompt_paths = [prompt_dir + f for f in os.listdir(prompt_dir)]
    # Cycle through at most 10 prompts (round-robin below).
    prompt_paths = prompt_paths[:10]
    print(prompt_paths, '\n\n__________')

    total_audio = []
    ix = 0
    # NOTE(review): `[:1]` synthesizes only the FIRST Harvard list
    # (10 sentences), not all 767 — confirm whether the slice is a
    # leftover debug restriction before relying on the output.
    for list_of_10 in harvard_individual_sentences[:1]:
        for text in list_of_10['sentences']:
            # Round-robin over the available style prompts.
            style_vec = msinference.compute_style(prompt_paths[ix % len(prompt_paths)])
            print(ix, text)
            ix += 1
            x = msinference.inference(text,
                                      style_vec,
                                      alpha=0.3,
                                      beta=0.7,
                                      diffusion_steps=7,
                                      embedding_scale=1)
            total_audio.append(x)
    # Concatenate every synthesized sentence into one long waveform.
    total_audio = np.concatenate(total_audio)
    soundfile.write(f'{audio_prompt}_767_5.wav', total_audio, 24000)
    print(f'{audio_prompt}_767_5.wav')