Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

losses.py 9.0 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
  1. import torch
  2. from torch import nn
  3. import torch.nn.functional as F
  4. import torchaudio
  5. from transformers import AutoModel
  6. class SpectralConvergengeLoss(torch.nn.Module):
  7. """Spectral convergence loss module."""
  8. def __init__(self):
  9. """Initilize spectral convergence loss module."""
  10. super(SpectralConvergengeLoss, self).__init__()
  11. def forward(self, x_mag, y_mag):
  12. """Calculate forward propagation.
  13. Args:
  14. x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
  15. y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
  16. Returns:
  17. Tensor: Spectral convergence loss value.
  18. """
  19. return torch.norm(y_mag - x_mag, p=1) / torch.norm(y_mag, p=1)
  20. class STFTLoss(torch.nn.Module):
  21. """STFT loss module."""
  22. def __init__(self, fft_size=1024, shift_size=120, win_length=600, window=torch.hann_window):
  23. """Initialize STFT loss module."""
  24. super(STFTLoss, self).__init__()
  25. self.fft_size = fft_size
  26. self.shift_size = shift_size
  27. self.win_length = win_length
  28. self.to_mel = torchaudio.transforms.MelSpectrogram(sample_rate=24000, n_fft=fft_size, win_length=win_length, hop_length=shift_size, window_fn=window)
  29. self.spectral_convergenge_loss = SpectralConvergengeLoss()
  30. def forward(self, x, y):
  31. """Calculate forward propagation.
  32. Args:
  33. x (Tensor): Predicted signal (B, T).
  34. y (Tensor): Groundtruth signal (B, T).
  35. Returns:
  36. Tensor: Spectral convergence loss value.
  37. Tensor: Log STFT magnitude loss value.
  38. """
  39. x_mag = self.to_mel(x)
  40. mean, std = -4, 4
  41. x_mag = (torch.log(1e-5 + x_mag) - mean) / std
  42. y_mag = self.to_mel(y)
  43. mean, std = -4, 4
  44. y_mag = (torch.log(1e-5 + y_mag) - mean) / std
  45. sc_loss = self.spectral_convergenge_loss(x_mag, y_mag)
  46. return sc_loss
  47. class MultiResolutionSTFTLoss(torch.nn.Module):
  48. """Multi resolution STFT loss module."""
  49. def __init__(self,
  50. fft_sizes=[1024, 2048, 512],
  51. hop_sizes=[120, 240, 50],
  52. win_lengths=[600, 1200, 240],
  53. window=torch.hann_window):
  54. """Initialize Multi resolution STFT loss module.
  55. Args:
  56. fft_sizes (list): List of FFT sizes.
  57. hop_sizes (list): List of hop sizes.
  58. win_lengths (list): List of window lengths.
  59. window (str): Window function type.
  60. """
  61. super(MultiResolutionSTFTLoss, self).__init__()
  62. assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
  63. self.stft_losses = torch.nn.ModuleList()
  64. for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths):
  65. self.stft_losses += [STFTLoss(fs, ss, wl, window)]
  66. def forward(self, x, y):
  67. """Calculate forward propagation.
  68. Args:
  69. x (Tensor): Predicted signal (B, T).
  70. y (Tensor): Groundtruth signal (B, T).
  71. Returns:
  72. Tensor: Multi resolution spectral convergence loss value.
  73. Tensor: Multi resolution log STFT magnitude loss value.
  74. """
  75. sc_loss = 0.0
  76. for f in self.stft_losses:
  77. sc_l = f(x, y)
  78. sc_loss += sc_l
  79. sc_loss /= len(self.stft_losses)
  80. return sc_loss
  81. def feature_loss(fmap_r, fmap_g):
  82. loss = 0
  83. for dr, dg in zip(fmap_r, fmap_g):
  84. for rl, gl in zip(dr, dg):
  85. loss += torch.mean(torch.abs(rl - gl))
  86. return loss*2
  87. def discriminator_loss(disc_real_outputs, disc_generated_outputs):
  88. loss = 0
  89. r_losses = []
  90. g_losses = []
  91. for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
  92. r_loss = torch.mean((1-dr)**2)
  93. g_loss = torch.mean(dg**2)
  94. loss += (r_loss + g_loss)
  95. r_losses.append(r_loss.item())
  96. g_losses.append(g_loss.item())
  97. return loss, r_losses, g_losses
  98. def generator_loss(disc_outputs):
  99. loss = 0
  100. gen_losses = []
  101. for dg in disc_outputs:
  102. l = torch.mean((1-dg)**2)
  103. gen_losses.append(l)
  104. loss += l
  105. return loss, gen_losses
  106. """ https://dl.acm.org/doi/abs/10.1145/3573834.3574506 """
  107. def discriminator_TPRLS_loss(disc_real_outputs, disc_generated_outputs):
  108. loss = 0
  109. for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
  110. tau = 0.04
  111. m_DG = torch.median((dr-dg))
  112. L_rel = torch.mean((((dr - dg) - m_DG)**2)[dr < dg + m_DG])
  113. loss += tau - F.relu(tau - L_rel)
  114. return loss
  115. def generator_TPRLS_loss(disc_real_outputs, disc_generated_outputs):
  116. loss = 0
  117. for dg, dr in zip(disc_real_outputs, disc_generated_outputs):
  118. tau = 0.04
  119. m_DG = torch.median((dr-dg))
  120. L_rel = torch.mean((((dr - dg) - m_DG)**2)[dr < dg + m_DG])
  121. loss += tau - F.relu(tau - L_rel)
  122. return loss
  123. class GeneratorLoss(torch.nn.Module):
  124. def __init__(self, mpd, msd):
  125. super(GeneratorLoss, self).__init__()
  126. self.mpd = mpd
  127. self.msd = msd
  128. def forward(self, y, y_hat):
  129. y_df_hat_r, y_df_hat_g, fmap_f_r, fmap_f_g = self.mpd(y, y_hat)
  130. y_ds_hat_r, y_ds_hat_g, fmap_s_r, fmap_s_g = self.msd(y, y_hat)
  131. loss_fm_f = feature_loss(fmap_f_r, fmap_f_g)
  132. loss_fm_s = feature_loss(fmap_s_r, fmap_s_g)
  133. loss_gen_f, losses_gen_f = generator_loss(y_df_hat_g)
  134. loss_gen_s, losses_gen_s = generator_loss(y_ds_hat_g)
  135. loss_rel = generator_TPRLS_loss(y_df_hat_r, y_df_hat_g) + generator_TPRLS_loss(y_ds_hat_r, y_ds_hat_g)
  136. loss_gen_all = loss_gen_s + loss_gen_f + loss_fm_s + loss_fm_f + loss_rel
  137. return loss_gen_all.mean()
  138. class DiscriminatorLoss(torch.nn.Module):
  139. def __init__(self, mpd, msd):
  140. super(DiscriminatorLoss, self).__init__()
  141. self.mpd = mpd
  142. self.msd = msd
  143. def forward(self, y, y_hat):
  144. # MPD
  145. y_df_hat_r, y_df_hat_g, _, _ = self.mpd(y, y_hat)
  146. loss_disc_f, losses_disc_f_r, losses_disc_f_g = discriminator_loss(y_df_hat_r, y_df_hat_g)
  147. # MSD
  148. y_ds_hat_r, y_ds_hat_g, _, _ = self.msd(y, y_hat)
  149. loss_disc_s, losses_disc_s_r, losses_disc_s_g = discriminator_loss(y_ds_hat_r, y_ds_hat_g)
  150. loss_rel = discriminator_TPRLS_loss(y_df_hat_r, y_df_hat_g) + discriminator_TPRLS_loss(y_ds_hat_r, y_ds_hat_g)
  151. d_loss = loss_disc_s + loss_disc_f + loss_rel
  152. return d_loss.mean()
  153. class WavLMLoss(torch.nn.Module):
  154. def __init__(self, model, wd, model_sr, slm_sr=16000):
  155. super(WavLMLoss, self).__init__()
  156. self.wavlm = AutoModel.from_pretrained(model)
  157. self.wd = wd
  158. self.resample = torchaudio.transforms.Resample(model_sr, slm_sr)
  159. def forward(self, wav, y_rec):
  160. with torch.no_grad():
  161. wav_16 = self.resample(wav)
  162. wav_embeddings = self.wavlm(input_values=wav_16, output_hidden_states=True).hidden_states
  163. y_rec_16 = self.resample(y_rec)
  164. y_rec_embeddings = self.wavlm(input_values=y_rec_16.squeeze(), output_hidden_states=True).hidden_states
  165. floss = 0
  166. for er, eg in zip(wav_embeddings, y_rec_embeddings):
  167. floss += torch.mean(torch.abs(er - eg))
  168. return floss.mean()
  169. def generator(self, y_rec):
  170. y_rec_16 = self.resample(y_rec)
  171. y_rec_embeddings = self.wavlm(input_values=y_rec_16, output_hidden_states=True).hidden_states
  172. y_rec_embeddings = torch.stack(y_rec_embeddings, dim=1).transpose(-1, -2).flatten(start_dim=1, end_dim=2)
  173. y_df_hat_g = self.wd(y_rec_embeddings)
  174. loss_gen = torch.mean((1-y_df_hat_g)**2)
  175. return loss_gen
  176. def discriminator(self, wav, y_rec):
  177. with torch.no_grad():
  178. wav_16 = self.resample(wav)
  179. wav_embeddings = self.wavlm(input_values=wav_16, output_hidden_states=True).hidden_states
  180. y_rec_16 = self.resample(y_rec)
  181. y_rec_embeddings = self.wavlm(input_values=y_rec_16, output_hidden_states=True).hidden_states
  182. y_embeddings = torch.stack(wav_embeddings, dim=1).transpose(-1, -2).flatten(start_dim=1, end_dim=2)
  183. y_rec_embeddings = torch.stack(y_rec_embeddings, dim=1).transpose(-1, -2).flatten(start_dim=1, end_dim=2)
  184. y_d_rs = self.wd(y_embeddings)
  185. y_d_gs = self.wd(y_rec_embeddings)
  186. y_df_hat_r, y_df_hat_g = y_d_rs, y_d_gs
  187. r_loss = torch.mean((1-y_df_hat_r)**2)
  188. g_loss = torch.mean((y_df_hat_g)**2)
  189. loss_disc_f = r_loss + g_loss
  190. return loss_disc_f.mean()
  191. def discriminator_forward(self, wav):
  192. with torch.no_grad():
  193. wav_16 = self.resample(wav)
  194. wav_embeddings = self.wavlm(input_values=wav_16, output_hidden_states=True).hidden_states
  195. y_embeddings = torch.stack(wav_embeddings, dim=1).transpose(-1, -2).flatten(start_dim=1, end_dim=2)
  196. y_d_rs = self.wd(y_embeddings)
  197. return y_d_rs
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...