Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

create_dataset.py 3.5 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
  1. import argparse
  2. import os
  3. from pathlib import Path
  4. import librosa
  5. import numpy as np
  6. import tqdm
  7. import ruamel.yaml
  8. from preprocessing.text_processing import Phonemizer, TextCleaner
  9. from utils.audio import melspectrogram
  10. parser = argparse.ArgumentParser()
  11. parser.add_argument('--config', dest='CONFIG', type=str, required=True)
  12. parser.add_argument('--dont_cache_phonemes', dest='CACHE_PHON', action='store_false')
  13. parser.add_argument('--njobs', dest='NJOBS', type=int, default=16)
  14. parser.add_argument('--col_sep', dest='COLUMN_SEP', type=str, default='|')
  15. parser.add_argument('--recompute_phon', dest='RECOMPUTE_PHON', action='store_true')
  16. args = parser.parse_args()
  17. for arg in vars(args):
  18. print('{}: {}'.format(arg, getattr(args, arg)))
  19. yaml = ruamel.yaml.YAML()
  20. config = yaml.load(open(str(Path(args.CONFIG) / 'data_config.yaml'), 'rb'))
  21. args.DATA_DIR = config['data_directory']
  22. args.META_FILE = os.path.join(args.DATA_DIR, config['metadata_filename'])
  23. args.WAV_DIR = os.path.join(args.DATA_DIR, config['wav_subdir_name'])
  24. args.TARGET_DIR = config['train_data_directory']
  25. if args.TARGET_DIR is None:
  26. args.TARGET_DIR = args.DATA_DIR
  27. mel_dir = os.path.join(args.TARGET_DIR, 'mels')
  28. if not os.path.exists(mel_dir):
  29. os.makedirs(mel_dir)
  30. phon_path = os.path.join(args.TARGET_DIR, 'phonemes.npy')
  31. if os.path.exists(phon_path) and not args.RECOMPUTE_PHON:
  32. print("using cached phonemes")
  33. audio_data = np.load(phon_path)
  34. else:
  35. print('\nLoading and cleaning text')
  36. text_cleaner = TextCleaner()
  37. audio_data = []
  38. with open(args.META_FILE, 'r', encoding='utf-8') as f:
  39. for l in f.readlines():
  40. l_split = l.split(args.COLUMN_SEP)
  41. filename, text = l_split[0], l_split[-1]
  42. if filename.endswith('.wav'):
  43. filename = filename.split('.')[-1]
  44. text = text_cleaner.clean(text)
  45. audio_data.append((filename, text))
  46. audio_data = np.array(audio_data)
  47. print('\nPhonemizing')
  48. phonemizer = Phonemizer(config['phoneme_language'])
  49. texts = audio_data[:, 1]
  50. batch_size = 250 # batch phonemization to avoid memory issues.
  51. phonemes = []
  52. for i in tqdm.tqdm(range(0, len(audio_data), batch_size)):
  53. batch = texts[i: i + batch_size]
  54. batch = phonemizer.encode(batch, njobs=args.NJOBS, clean=False)
  55. phonemes.extend(batch)
  56. audio_data = np.concatenate([np.array(audio_data), np.expand_dims(phonemes, axis=1)], axis=1)
  57. if args.CACHE_PHON:
  58. np.save(phon_path, audio_data, allow_pickle=True)
  59. print('\nBuilding dataset and writing files')
  60. np.random.seed(42)
  61. np.random.shuffle(audio_data)
  62. test_metafile = os.path.join(args.TARGET_DIR, 'test_metafile.txt')
  63. train_metafile = os.path.join(args.TARGET_DIR, 'train_metafile.txt')
  64. test_lines = [''.join([filename, '|', text, '|', phon, '\n']) for filename, text, phon in
  65. audio_data[:config['n_test']]]
  66. train_lines = [''.join([filename, '|', text, '|', phon, '\n']) for filename, text, phon in
  67. audio_data[config['n_test']:-1]]
  68. with open(test_metafile, 'w+', encoding='utf-8') as test_f:
  69. test_f.writelines(test_lines)
  70. with open(train_metafile, 'w+', encoding='utf-8') as train_f:
  71. train_f.writelines(train_lines)
  72. for i in tqdm.tqdm(range(len(audio_data))):
  73. filename, _, _ = audio_data[i]
  74. wav_path = os.path.join(args.WAV_DIR, filename + '.wav')
  75. y, sr = librosa.load(wav_path, sr=config['sampling_rate'])
  76. mel = melspectrogram(y, config)
  77. mel_path = os.path.join(mel_dir, filename)
  78. np.save(mel_path, mel.T)
  79. print('\nDone')
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...