Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

config_ft.yml 3.4 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
  1. log_dir: "Models/LJSpeech"
  2. save_freq: 5 # NOTE(review): unit (epochs vs. steps) is not stated here — confirm in the training script
  3. log_interval: 10 # NOTE(review): unit (epochs vs. steps) is not stated here — confirm in the training script
  4. device: "cuda"
  5. epochs: 50 # number of fine-tuning epochs (1 hour of data)
  6. batch_size: 8
  7. max_len: 800 # maximum number of frames
  8. pretrained_model: "Models/LibriTTS/epochs_2nd_00020.pth"
  9. second_stage_load_pretrained: true # set to true if the pre-trained model is for the 2nd stage
  10. load_only_params: true # set to true if you do not want to load epoch numbers and optimizer parameters
  11. F0_path: "Utils/JDC/bst.t7"
  12. ASR_config: "Utils/ASR/config.yml"
  13. ASR_path: "Utils/ASR/epoch_00080.pth"
  14. PLBERT_dir: 'Utils/PLBERT/'
  15. data_params:
  16. train_data: "Data/train_list.txt"
  17. val_data: "Data/val_list.txt"
  18. root_path: "/local/LJSpeech-1.1/wavs" # NOTE(review): absolute, machine-specific path — confirm it exists where training runs
  19. OOD_data: "Data/OOD_texts.txt"
  20. min_length: 50 # for OOD texts: keep sampling until texts of this size are obtained
  21. logger: "wandb" # NOTE(review): indentation was lost in this listing — confirm whether this key is top-level or nested under data_params
  22. preprocess_params:
  23. sr: 44100 # sampling rate; NOTE(review): confirm the pre-trained checkpoint was trained at this rate
  24. spect_params:
  25. n_fft: 2048
  26. win_length: 1200
  27. hop_length: 300 # equals the product of decoder upsample_rates (10*5*3*2) — presumably these must stay in sync; TODO confirm
  28. model_params:
  29. multispeaker: true # NOTE(review): log_dir and root_path point at LJSpeech — confirm multi-speaker mode is intended for this dataset
  30. dim_in: 64
  31. hidden_dim: 512
  32. max_conv_dim: 512
  33. n_layer: 3
  34. n_mels: 80
  35. n_token: 178 # number of phoneme tokens
  36. max_dur: 50 # maximum duration of a single phoneme
  37. style_dim: 128 # size of the style vector
  38. dropout: 0.2
  39. # decoder config
  40. decoder:
  41. type: 'hifigan' # decoder type: either 'hifigan' or 'istftnet'
  42. resblock_kernel_sizes: [3,7,11]
  43. upsample_rates : [10,5,3,2] # product is 300, the same value as spect_params.hop_length — presumably must match; TODO confirm
  44. upsample_initial_channel: 512
  45. resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
  46. upsample_kernel_sizes: [20,10,6,4]
  47. # speech language model (SLM) config
  48. slm:
  49. model: 'microsoft/wavlm-base-plus'
  50. sr: 16000 # sampling rate of the SLM
  51. hidden: 768 # hidden size of the SLM
  52. nlayers: 13 # number of layers of the SLM
  53. initial_channel: 64 # initial channels of the SLM discriminator head
  54. # style diffusion model config
  55. diffusion:
  56. embedding_mask_proba: 0.1
  57. # transformer config
  58. transformer:
  59. num_layers: 3
  60. num_heads: 8
  61. head_features: 64
  62. multiplier: 2
  63. # diffusion distribution config
  64. dist:
  65. sigma_data: 0.2 # placeholder; applies when estimate_sigma_data is set to false
  66. estimate_sigma_data: true # if set to true, estimate sigma_data from the current batch
  67. mean: -3.0
  68. std: 1.0
  69. loss_params:
  70. lambda_mel: 5. # mel reconstruction loss
  71. lambda_gen: 1. # generator loss
  72. lambda_slm: 1. # SLM feature matching loss
  73. lambda_mono: 1. # monotonic alignment loss (TMA)
  74. lambda_s2s: 1. # sequence-to-sequence loss (TMA)
  75. lambda_F0: 1. # F0 reconstruction loss
  76. lambda_norm: 1. # norm reconstruction loss
  77. lambda_dur: 1. # duration loss
  78. lambda_ce: 20. # cross-entropy (CE) loss on the duration predictor's probability output
  79. lambda_sty: 1. # style reconstruction loss
  80. lambda_diff: 1. # score matching loss
  81. diff_epoch: 10 # epoch at which style diffusion starts
  82. joint_epoch: 30 # epoch at which joint training starts
  83. optimizer_params: # learning rates
  84. lr: 0.0001 # general learning rate
  85. bert_lr: 0.00001 # learning rate for PLBERT
  86. ft_lr: 0.0001 # learning rate for acoustic modules
  87. slmadv_params: # presumably settings for SLM adversarial training — confirm against the training script
  88. min_len: 400 # minimum length of samples
  89. max_len: 500 # maximum length of samples
  90. batch_percentage: 0.5 # fraction of the original batch size to use (0.5 = half), to prevent running out of memory
  91. iter: 10 # update the discriminator once every this many generator updates
  92. thresh: 5 # gradient norm above which the gradient is scaled
  93. scale: 0.01 # gradient scaling factor for predictors from SLM discriminators
  94. sig: 1.5 # sigma for differentiable duration modeling
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...