dkounadis
/

artificial-styletts2

@@ -17,9 +17,9 @@ LABELS = ['arousal', 'dominance', 'valence']
 def load_speech(split=None):
     DB = [
         # [dataset, version, table, has_timedeltas_or_is_full_wavfile]
-        #    ['crema-d', '1.1.1', 'emotion.voice.test', False],
-        ['librispeech', '3.1.0', 'test-clean', False],
- #           ['emodb',  '1.2.0', 'emotion.categories.train.gold_standard', False],
   #          ['entertain-playtestcloud', '1.1.0', 'emotion.categories.train.gold_standard', True],
    #         ['erik', '2.2.0', 'emotion.categories.train.gold_standard', True],
     #        ['meld', '1.3.1', 'emotion.categories.train.gold_standard', False],
@@ -116,19 +116,19 @@ for audio_prompt in ['english',
                         # harvard.append(long_sentence.replace('.', ' '))
                         for text in list_of_10['sentences']:
                             if audio_prompt == 'english':
-                                _p = synthetic_wav_paths[ix % 134]
                                 style_vec = msinference.compute_style(_p)
                             elif audio_prompt == 'english_4x':
-                                _p = synthetic_wav_paths_4x[ix % 134]
                                 style_vec = msinference.compute_style(_p)
                             elif audio_prompt == 'human':
                                 _p = natural_wav_paths[ix % len(natural_wav_paths)]
                                 style_vec = msinference.compute_style(_p)
                             elif audio_prompt == 'foreign':
-                                _p = synthetic_wav_paths_foreign[ix % 204]
                                 style_vec = msinference.compute_style(_p)
                             elif audio_prompt == 'foreign_4x':
-                                _p = synthetic_wav_paths_foreign_4x[ix % 204]
                                 style_vec = msinference.compute_style(_p)
                             else:
                                 print('unknonw list of style vector')
@@ -154,4 +154,4 @@ for audio_prompt in ['english',
                     soundfile.write('_st_' + OUT_FILE, total_style, fsr)  # take this fs from the loading
     else:
-        print('\nALREADY EXISTS\n')

 def load_speech(split=None):
     DB = [
         # [dataset, version, table, has_timedeltas_or_is_full_wavfile]
+          #  ['crema-d', '1.1.1', 'emotion.voice.test', False],
+        #['librispeech', '3.1.0', 'test-clean', False],
+            ['emodb',  '1.2.0', 'emotion.categories.train.gold_standard', False],
   #          ['entertain-playtestcloud', '1.1.0', 'emotion.categories.train.gold_standard', True],
    #         ['erik', '2.2.0', 'emotion.categories.train.gold_standard', True],
     #        ['meld', '1.3.1', 'emotion.categories.train.gold_standard', False],
                         # harvard.append(long_sentence.replace('.', ' '))
                         for text in list_of_10['sentences']:
                             if audio_prompt == 'english':
+                                _p = synthetic_wav_paths[ix % len(synthetic_wav_paths)] #134]
                                 style_vec = msinference.compute_style(_p)
                             elif audio_prompt == 'english_4x':
+                                _p = synthetic_wav_paths_4x[ix % len(synthetic_wav_paths_4x)] # 134]
                                 style_vec = msinference.compute_style(_p)
                             elif audio_prompt == 'human':
                                 _p = natural_wav_paths[ix % len(natural_wav_paths)]
                                 style_vec = msinference.compute_style(_p)
                             elif audio_prompt == 'foreign':
+                                _p = synthetic_wav_paths_foreign[ix % len(synthetic_wav_paths_foreign)] #179]  # 204 some short styles are discarded
                                 style_vec = msinference.compute_style(_p)
                             elif audio_prompt == 'foreign_4x':
+                                _p = synthetic_wav_paths_foreign_4x[ix % len(synthetic_wav_paths_foreign_4x)] #179]  # 204
                                 style_vec = msinference.compute_style(_p)
                             else:
                                 print('unknonw list of style vector')
                     soundfile.write('_st_' + OUT_FILE, total_style, fsr)  # take this fs from the loading
     else:
+        print('\nALREADY EXISTS\n')