Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

config.yml 3.7 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
  # Training configuration: paths, preprocessing, model, loss, optimizer and SLM adversarial settings.
  # NOTE(review): the original indentation was lost in this copy, so the nesting of the
  # section keys (data_params, preprocess_params, model_params, decoder, slm, diffusion, ...)
  # cannot be read off the text — restore it from the real file before loading this.
  # NOTE(review): the leading "N." markers appear to be file-viewer line numbers, not YAML — confirm.
  # NOTE(review): `sr` and `max_len` each appear twice below; they only remain distinct keys
  # if they sit under different parent mappings — confirm once nesting is restored.
  1. log_dir: "Models/LJSpeech"
  2. first_stage_path: "first_stage.pth"
  3. save_freq: 1
  4. log_interval: 10
  5. device: "cuda"
  6. epochs_1st: 200 # number of epochs for first stage training (pre-training)
  7. epochs_2nd: 100 # number of epochs for second stage training (joint training)
  8. batch_size: 16
  9. max_len: 800 # maximum number of frames
  10. pretrained_model: ""
  11. second_stage_load_pretrained: false # set to true if the pre-trained model is for 2nd stage
  12. load_only_params: false # set to true if you do not want to load epoch numbers and optimizer parameters
  13. F0_path: "Utils/JDC/bst.t7"
  14. ASR_config: "Utils/ASR/config.yml"
  15. ASR_path: "Utils/ASR/epoch_00080.pth"
  16. PLBERT_dir: 'Utils/PLBERT/'
  17. data_params:
  18. train_data: "Data/train_list.txt"
  19. val_data: "Data/val_list.txt"
  20. root_path: "/local/LJSpeech-1.1/wavs"
  21. OOD_data: "Data/OOD_texts.txt"
  22. min_length: 50 # sample until texts with this size are obtained for OOD texts
  23. logger: "wandb" # NOTE(review): unclear from this copy whether this is top-level or part of data_params — confirm
  24. preprocess_params:
  25. sr: 44100 # NOTE(review): a second `sr` (16000) appears in the slm entries below — presumably a different mapping; confirm
  26. spect_params:
  27. n_fft: 2048
  28. win_length: 1200
  29. hop_length: 300
  30. model_params:
  31. multispeaker: true
  32. dim_in: 64
  33. hidden_dim: 512
  34. max_conv_dim: 512
  35. n_layer: 3
  36. n_mels: 80
  37. n_token: 178 # number of phoneme tokens
  38. max_dur: 50 # maximum duration of a single phoneme
  39. style_dim: 128 # style vector size
  40. dropout: 0.2
  41. # config for decoder
  42. decoder:
  43. type: 'hifigan' # either hifigan or istftnet
  44. resblock_kernel_sizes: [3,7,11]
  45. upsample_rates : [10, 6] # NOTE(review): stray space before ':' (yamllint `colons` rule)
  46. upsample_initial_channel: 512
  47. resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
  48. upsample_kernel_sizes: [20, 12]
  49. gen_istft_n_fft: 20 # NOTE(review): presumably only relevant when type is istftnet — confirm
  50. gen_istft_hop_size: 5 # NOTE(review): presumably only relevant when type is istftnet — confirm
  51. # speech language model config
  52. slm:
  53. model: 'microsoft/wavlm-base-plus'
  54. sr: 16000 # sampling rate of SLM
  55. hidden: 768 # hidden size of SLM
  56. nlayers: 13 # number of layers of SLM
  57. initial_channel: 64 # initial channels of SLM discriminator head
  58. # style diffusion model config
  59. diffusion:
  60. embedding_mask_proba: 0.1
  61. # transformer config
  62. transformer:
  63. num_layers: 3
  64. num_heads: 8
  65. head_features: 64
  66. multiplier: 2
  67. # diffusion distribution config
  68. dist:
  69. sigma_data: 0.2 # placeholder used when estimate_sigma_data is set to false
  70. estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
  71. mean: -3.0
  72. std: 1.0
  73. loss_params:
  74. lambda_mel: 5. # mel reconstruction loss
  75. lambda_gen: 1. # generator loss
  76. lambda_slm: 1. # slm feature matching loss
  77. lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
  78. lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
  79. TMA_epoch: 50 # TMA starting epoch (1st stage)
  80. lambda_F0: 1. # F0 reconstruction loss (2nd stage)
  81. lambda_norm: 1. # norm reconstruction loss (2nd stage)
  82. lambda_dur: 1. # duration loss (2nd stage)
  83. lambda_ce: 20. # duration predictor probability output CE loss (2nd stage)
  84. lambda_sty: 1. # style reconstruction loss (2nd stage)
  85. lambda_diff: 1. # score matching loss (2nd stage)
  86. diff_epoch: 20 # style diffusion starting epoch (2nd stage)
  87. joint_epoch: 50 # joint training starting epoch (2nd stage)
  88. optimizer_params:
  89. lr: 0.0001 # general learning rate
  90. bert_lr: 0.00001 # learning rate for PLBERT
  91. ft_lr: 0.00001 # learning rate for acoustic modules
  92. slmadv_params:
  93. min_len: 400 # minimum length of samples
  94. max_len: 500 # maximum length of samples
  95. batch_percentage: 0.5 # to prevent out of memory, only use half of the original batch size
  96. iter: 10 # update the discriminator once every this many generator updates
  97. thresh: 5 # gradient norm above which the gradient is scaled
  98. scale: 0.01 # gradient scaling factor for predictors from SLM discriminators
  99. sig: 1.5 # sigma for differentiable duration modeling
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...