Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

create_dataset.py 3.5 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
  1. import argparse
  2. import os
  3. from pathlib import Path
  4. import librosa
  5. import numpy as np
  6. import tqdm
  7. import ruamel.yaml
  8. from preprocessing.text_processing import Phonemizer, TextCleaner
  9. from utils.audio import melspectrogram
  10. parser = argparse.ArgumentParser()
  11. parser.add_argument('--config', dest='CONFIG', type=str, required=True)
  12. parser.add_argument('--dont_cache_phonemes', dest='CACHE_PHON', action='store_false')
  13. parser.add_argument('--njobs', dest='NJOBS', type=int, default=16)
  14. parser.add_argument('--col_sep', dest='COLUMN_SEP', type=str, default='|')
  15. parser.add_argument('--recompute_phon', dest='RECOMPUTE_PHON', action='store_true')
  16. args = parser.parse_args()
  17. for arg in vars(args):
  18. print('{}: {}'.format(arg, getattr(args, arg)))
  19. yaml = ruamel.yaml.YAML()
  20. with open(str(Path(args.CONFIG) / 'data_config.yaml'), 'rb') as conf_yaml:
  21. config = yaml.load(conf_yaml)
  22. args.DATA_DIR = config['data_directory']
  23. args.META_FILE = os.path.join(args.DATA_DIR, config['metadata_filename'])
  24. args.WAV_DIR = os.path.join(args.DATA_DIR, config['wav_subdir_name'])
  25. args.TARGET_DIR = config['train_data_directory']
  26. if args.TARGET_DIR is None:
  27. args.TARGET_DIR = args.DATA_DIR
  28. mel_dir = os.path.join(args.TARGET_DIR, 'mels')
  29. if not os.path.exists(mel_dir):
  30. os.makedirs(mel_dir)
  31. phon_path = os.path.join(args.TARGET_DIR, 'phonemes.npy')
  32. if os.path.exists(phon_path) and not args.RECOMPUTE_PHON:
  33. print("using cached phonemes")
  34. audio_data = np.load(phon_path)
  35. else:
  36. print('\nLoading and cleaning text')
  37. text_cleaner = TextCleaner()
  38. audio_data = []
  39. with open(args.META_FILE, 'r', encoding='utf-8') as f:
  40. for l in f.readlines():
  41. l_split = l.split(args.COLUMN_SEP)
  42. filename, text = l_split[0], l_split[-1]
  43. if filename.endswith('.wav'):
  44. filename = filename.split('.')[-1]
  45. text = text_cleaner.clean(text)
  46. audio_data.append((filename, text))
  47. audio_data = np.array(audio_data)
  48. print('\nPhonemizing')
  49. phonemizer = Phonemizer(config['phoneme_language'])
  50. texts = audio_data[:, 1]
  51. batch_size = 250 # batch phonemization to avoid memory issues.
  52. phonemes = []
  53. for i in tqdm.tqdm(range(0, len(audio_data), batch_size)):
  54. batch = texts[i: i + batch_size]
  55. batch = phonemizer.encode(batch, njobs=args.NJOBS, clean=False)
  56. phonemes.extend(batch)
  57. audio_data = np.concatenate([np.array(audio_data), np.expand_dims(phonemes, axis=1)], axis=1)
  58. if args.CACHE_PHON:
  59. np.save(phon_path, audio_data, allow_pickle=True)
  60. print('\nBuilding dataset and writing files')
  61. np.random.seed(42)
  62. np.random.shuffle(audio_data)
  63. test_metafile = os.path.join(args.TARGET_DIR, 'test_metafile.txt')
  64. train_metafile = os.path.join(args.TARGET_DIR, 'train_metafile.txt')
  65. test_lines = [''.join([filename, '|', text, '|', phon, '\n']) for filename, text, phon in
  66. audio_data[:config['n_test']]]
  67. train_lines = [''.join([filename, '|', text, '|', phon, '\n']) for filename, text, phon in
  68. audio_data[config['n_test']:-1]]
  69. with open(test_metafile, 'w+', encoding='utf-8') as test_f:
  70. test_f.writelines(test_lines)
  71. with open(train_metafile, 'w+', encoding='utf-8') as train_f:
  72. train_f.writelines(train_lines)
  73. for i in tqdm.tqdm(range(len(audio_data))):
  74. filename, _, _ = audio_data[i]
  75. wav_path = os.path.join(args.WAV_DIR, filename + '.wav')
  76. y, sr = librosa.load(wav_path, sr=config['sampling_rate'])
  77. mel = melspectrogram(y, config)
  78. mel_path = os.path.join(mel_dir, filename)
  79. np.save(mel_path, mel.T)
  80. print('\nDone')
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...