acanivet committed
Commit bdac835
• 1 Parent(s): 55056c0
__pycache__/model.cpython-310.pyc ADDED
Binary file (1.96 kB)
 
app.py CHANGED
@@ -1,4 +1,32 @@
 import streamlit as st
-
-x = st.slider('Select a value')
-st.write(x, 'squared is', x * x)
+from model import generate
+import numpy as np
+
+if "result" not in st.session_state:
+    st.session_state["result"] = np.zeros(16000*4)
+
+st.title("Sound Exploration")
+
+col1, col2 = st.columns(2)
+
+with col1:
+    instrument = st.selectbox(
+        'Which instrument do you want?',
+        ('🎸 Bass', '🎺 Brass', '🪈 Flute', '🪕 Guitar', '🎹 Keyboard', '🔨 Mallet', 'Organ', 'Reed', '🎻 String', 'Synth lead', '🎙️ Vocal')
+    )
+
+with col2:
+    instrument_t = st.selectbox(
+        'Which type of instrument do you want?',
+        ('📯 Acoustic', '🎙️ Electronic', '🎛️ Synthetic')
+    )
+
+with st.expander("Magical parameters 🪄"):
+    p1 = st.slider('p1', 0., 1., step=0.001)
+
+if st.button("Generate ✨", type="primary"):
+    st.session_state["result"] = generate([instrument, instrument_t])
+
+if st.session_state["result"].any():
+    st.audio(st.session_state["result"], sample_rate=16000)
 
 
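Note on the app above: the generated waveform is kept in `st.session_state` so it survives Streamlit's top-to-bottom reruns, and the `p1` slider is read but not yet passed to `generate` (which does accept a `params` argument in model.py below). A minimal, self-contained sketch of the same persistence pattern, with a hypothetical `make_audio` standing in for the model call:

```python
import numpy as np
import streamlit as st

def make_audio() -> np.ndarray:
    # stand-in for model.generate(): 4 seconds of noise at 16 kHz
    return np.random.uniform(-1, 1, 16000 * 4).astype(np.float32)

if "result" not in st.session_state:
    st.session_state["result"] = np.zeros(16000 * 4, dtype=np.float32)

# st.button is only True on the rerun triggered by the click itself;
# storing the result in session_state lets every later rerun still play it.
if st.button("Generate"):
    st.session_state["result"] = make_audio()

if st.session_state["result"].any():
    st.audio(st.session_state["result"], sample_rate=16000)
```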
cvae/__init__.py ADDED
@@ -0,0 +1,7 @@
+from .models import (
+    Encoder, Decoder, VAE, CVAE
+)
+
+from .blocks import (
+    UpResConvBlock, DownResConvBlock
+)
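These re-exports flatten the package namespace; note that `ResConvBlock`, defined at the end of cvae/blocks.py below, is not re-exported. A quick import check, assuming the package and its dependencies (torch, lightning, auraloss) are installed:

```python
# the package root exposes the models and the two exported blocks
from cvae import CVAE, Encoder, Decoder, UpResConvBlock, DownResConvBlock

# ResConvBlock has to be imported from the submodule directly
from cvae.blocks import ResConvBlock
```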
cvae/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (316 Bytes)

cvae/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (430 Bytes)

cvae/__pycache__/blocks.cpython-310.pyc ADDED
Binary file (2.01 kB)

cvae/__pycache__/blocks.cpython-311.pyc ADDED
Binary file (4.32 kB)

cvae/__pycache__/models.cpython-310.pyc ADDED
Binary file (6.09 kB)

cvae/__pycache__/models.cpython-311.pyc ADDED
Binary file (12 kB)
 
cvae/blocks.py ADDED
@@ -0,0 +1,59 @@
+from torch import nn
+
+class UpResConvBlock(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size):
+        super(UpResConvBlock, self).__init__()
+
+        self.residual = nn.Sequential(  # upsampled 1x1 skip path so channels match
+            nn.Upsample(scale_factor=2),
+            nn.Conv1d(in_channels, out_channels, 1, 1, bias=False),
+        )
+
+        self.main = nn.Sequential(
+            nn.Upsample(scale_factor=2),
+            nn.Conv1d(in_channels, out_channels, kernel_size, 1),
+            nn.GroupNorm(1, out_channels),
+            nn.GELU(),
+            nn.Conv1d(out_channels, out_channels, kernel_size, 1),
+            nn.GroupNorm(1, out_channels),
+            nn.GELU()
+        )
+
+    def forward(self, x):
+        return self.main(x) + self.residual(x)
+
+class DownResConvBlock(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size):
+        super(DownResConvBlock, self).__init__()
+
+        self.residual = nn.Conv1d(in_channels, out_channels, 1, 2, bias=False)  # strided 1x1 skip path
+
+        self.main = nn.Sequential(
+            nn.Conv1d(in_channels, out_channels, kernel_size, 2),
+            nn.GroupNorm(1, out_channels),
+            nn.GELU(),
+            nn.Conv1d(out_channels, out_channels, kernel_size, 1),
+            nn.GroupNorm(1, out_channels),
+            nn.GELU()
+        )
+
+    def forward(self, x):
+        return self.main(x) + self.residual(x)
+
+class ResConvBlock(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size):
+        super(ResConvBlock, self).__init__()
+
+        self.residual = nn.Identity() if in_channels == out_channels else nn.Conv1d(in_channels, out_channels, 1, bias=False)
+
+        self.main = nn.Sequential(
+            nn.Conv1d(in_channels, out_channels, kernel_size),
+            nn.GroupNorm(1, out_channels),
+            nn.GELU(),
+            nn.Conv1d(out_channels, out_channels, kernel_size),
+            nn.GroupNorm(1, out_channels),
+            nn.GELU()
+        )
+
+    def forward(self, x):
+        return self.main(x) + self.residual(x)
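Because the convolutions above use no padding, the main and residual branches generally only stay length-aligned when `kernel_size=1`, which is exactly how models.py instantiates these blocks. A small shape sanity check (a sketch; assumes the `cvae` package above is importable):

```python
import torch
from cvae.blocks import DownResConvBlock, UpResConvBlock

x = torch.randn(2, 32, 64000)  # (batch, channels, samples): 4 s at 16 kHz

down = DownResConvBlock(32, 64, kernel_size=1)
up = UpResConvBlock(64, 32, kernel_size=1)

h = down(x)
print(h.shape)      # torch.Size([2, 64, 32000]): length halved by the stride-2 convs
print(up(h).shape)  # torch.Size([2, 32, 64000]): length restored by Upsample
```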
cvae/models.py ADDED
@@ -0,0 +1,167 @@
+import torch
+from torch import nn, Tensor
+from torch.optim import Optimizer
+from .blocks import UpResConvBlock, DownResConvBlock
+import lightning as L
+from auraloss.freq import MultiResolutionSTFTLoss
+
+class Encoder(nn.Module):
+    def __init__(self,
+                 in_channels: int,
+                 in_features: int,
+                 out_features: int,
+                 channels: list = None,
+                 ) -> None:
+        super(Encoder, self).__init__()
+
+        assert in_features % 2**len(channels) == 0, f"in_features ({in_features}) must be a multiple of the downscale factor ({2**len(channels)})"
+
+        modules = [
+            nn.Conv1d(in_channels, channels[0], 1),
+            nn.GELU()
+        ]
+
+        for in_channel, out_channel in zip(channels, channels[1:]+[channels[-1]]):
+            modules += [
+                DownResConvBlock(in_channel, out_channel, 1),
+            ]
+
+        n_features = int(in_features*.5**len(channels))
+
+        modules += [
+            nn.Flatten(),
+            nn.Linear(n_features*channels[-1], 2*out_features)  # two heads: mean and logvar
+        ]
+
+        self.net = nn.Sequential(*modules)
+
+    def forward(self, x):
+        mean, logvar = self.net(x).chunk(2, dim=1)
+        return mean, logvar
+
+class Decoder(nn.Module):
+    def __init__(self,
+                 out_channels: int,
+                 in_features: int,
+                 out_features: int,
+                 channels: list = None,
+                 ) -> None:
+        super(Decoder, self).__init__()
+
+        n_features = int(out_features/2**len(channels))
+
+        modules = [
+            nn.Linear(in_features, n_features*channels[0]),
+            nn.Unflatten(-1, (channels[0], n_features))
+        ]
+
+        for in_channel, out_channel in zip(channels, channels[1:]+[channels[-1]]):
+            modules += [
+                UpResConvBlock(in_channel, out_channel, 1),
+            ]
+
+        modules += [
+            nn.Conv1d(channels[-1], out_channels, 1),
+            nn.GELU()
+        ]
+
+        self.net = nn.Sequential(*modules)
+
+    def forward(self, x):
+        x = torch.tanh(self.net(x))
+        return x
+
+
+class VAE(L.LightningModule):
+    def __init__(self, io_channels: int, io_features: int, latent_features: int, channels: list, learning_rate: float):
+        super().__init__()
+        self.encoder = Encoder(io_channels, io_features, latent_features, channels)
+        channels.reverse()  # the decoder mirrors the encoder's channel progression
+        self.decoder = Decoder(io_channels, latent_features, io_features, channels)
+        self.latent_features = latent_features
+        self.audio_loss_func = MultiResolutionSTFTLoss()
+        self.learning_rate = learning_rate
+
+    @torch.no_grad()
+    def sample(self, eps=None):
+        if eps is None:
+            eps = torch.randn((1, self.latent_features))  # draw from the standard-normal prior
+        return self.decoder(eps)
+
+    def loss_function(self, x, x_hat, mean, logvar):
+        audio_loss = self.audio_loss_func(x, x_hat)
+        kld_loss = -0.5 * torch.sum(1 + logvar - mean.pow(2) - logvar.exp())
+        return audio_loss + kld_loss
+
+    def reparameterize(self, mean, logvar):
+        std = torch.exp(0.5 * logvar)
+        eps = torch.randn_like(std)
+        return eps * std + mean
+
+    def forward(self, x):
+        mean, logvar = self.encoder(x)
+        z = self.reparameterize(mean, logvar)
+        return self.decoder(z), mean, logvar
+
+    def training_step(self, batch: Tensor, batch_idx: int, log: bool = True) -> Tensor:
+        x_hat, mean, logvar = self.forward(batch)
+        loss = self.loss_function(batch, x_hat, mean, logvar)
+        if log: self.log("train_loss", loss, prog_bar=True)
+        return loss
+
+    def configure_optimizers(self) -> Optimizer:
+        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
+        return optimizer
+
+
+class CVAE(L.LightningModule):
+    def __init__(self, io_channels: int, io_features: int, latent_features: int, channels: list, num_classes: int, learning_rate: float):
+        super().__init__()
+        self.class_embedder = nn.Linear(num_classes, io_features)
+        self.data_embedder = nn.Conv1d(io_channels, io_channels, kernel_size=1)
+        self.encoder = Encoder(io_channels+1, io_features, latent_features, channels)  # +1 channel carries the class embedding
+        channels.reverse()
+        self.decoder = Decoder(io_channels, latent_features+num_classes, io_features, channels)
+        self.num_classes = num_classes
+        self.latent_features = latent_features
+        self.audio_loss_func = MultiResolutionSTFTLoss()
+        self.learning_rate = learning_rate
+
+    @torch.no_grad()
+    def sample(self, c, eps=None):
+        c = nn.functional.one_hot(c, num_classes=self.num_classes).float().unsqueeze(0)
+        if eps is None:
+            eps = torch.randn((1, self.latent_features))  # draw from the standard-normal prior
+        z = torch.cat([eps, c], dim=1)
+        return self.decoder(z)
+
+    def loss_function(self, x, x_hat, mean, logvar):
+        audio_loss = self.audio_loss_func(x, x_hat)
+        kld_loss = -0.5 * torch.sum(1 + logvar - mean.pow(2) - logvar.exp())
+        return audio_loss + kld_loss
+
+    def reparameterize(self, mean, logvar):
+        std = torch.exp(0.5 * logvar)
+        eps = torch.randn_like(std)
+        return eps * std + mean
+
+    def forward(self, x, c):
+        c = nn.functional.one_hot(c, num_classes=self.num_classes).float()
+        c_embedding = self.class_embedder(c).unsqueeze(1)
+        x_embedding = self.data_embedder(x)
+        x = torch.cat([x_embedding, c_embedding], dim=1)
+        mean, logvar = self.encoder(x)
+        z = self.reparameterize(mean, logvar)
+        z = torch.cat([z, c], dim=1)
+        return self.decoder(z), mean, logvar
+
+    def training_step(self, batch: Tensor, batch_idx: int, log: bool = True) -> Tensor:
+        x, c = batch
+        x_hat, mean, logvar = self.forward(x, c)
+        loss = self.loss_function(x, x_hat, mean, logvar)
+        if log: self.log("train_loss", loss, prog_bar=True)
+        return loss
+
+    def configure_optimizers(self) -> Optimizer:
+        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
+        return optimizer
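The `kld_loss` line in both classes is the closed form of KL(N(mean, exp(logvar)) || N(0, I)). A quick numerical sanity check against torch.distributions (a sketch, not part of the commit):

```python
import torch
from torch.distributions import Normal, kl_divergence

mean, logvar = torch.randn(4, 5), torch.randn(4, 5)

# closed form used in loss_function above
closed_form = -0.5 * torch.sum(1 + logvar - mean.pow(2) - logvar.exp())

# reference: KL between q(z|x) = N(mean, std^2) and the prior p(z) = N(0, I)
reference = kl_divergence(
    Normal(mean, torch.exp(0.5 * logvar)),
    Normal(torch.zeros_like(mean), torch.ones_like(logvar)),
).sum()

print(torch.allclose(closed_form, reference, atol=1e-5))  # True
```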
epoch=17-step=650718.ckpt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cbccc3cf4b4a124831ab6fc7f23b4270ed90fcb41e1e87277ec4155787362c8
+size 651547328
model.py ADDED
@@ -0,0 +1,28 @@
+from cvae import CVAE
+import torch
+from typing import Sequence
+import re
+
+instruments = ['bass_acoustic', 'brass_acoustic', 'flute_acoustic', 'guitar_acoustic', 'keyboard_acoustic', 'mallet_acoustic', 'organ_acoustic', 'reed_acoustic', 'string_acoustic', 'synth_lead_acoustic', 'vocal_acoustic', 'bass_synthetic', 'brass_synthetic', 'flute_synthetic', 'guitar_synthetic', 'keyboard_synthetic', 'mallet_synthetic', 'organ_synthetic', 'reed_synthetic', 'string_synthetic', 'synth_lead_synthetic', 'vocal_synthetic', 'bass_electronic', 'brass_electronic', 'flute_electronic', 'guitar_electronic', 'keyboard_electronic', 'mallet_electronic', 'organ_electronic', 'reed_electronic', 'string_electronic', 'synth_lead_electronic', 'vocal_electronic']
+
+# fall back to CPU so the app also runs on machines without a GPU
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+model = CVAE.load_from_checkpoint(
+    'epoch=17-step=650718.ckpt',
+    map_location=device,
+    io_channels=1,
+    io_features=16000*4,
+    latent_features=5,
+    channels=[32, 64, 128, 256, 512],
+    num_classes=len(instruments),
+    learning_rate=1e-5
+)
+
+def format(text):
+    # strip the emoji prefix, keep letters and spaces, then snake_case,
+    # so multi-word labels like 'Synth lead' become 'synth_lead'
+    text = re.sub(r"[^a-zA-Z ]", "", text).strip()
+    return text.replace(" ", "_").lower()
+
+def choice_to_tensor(choice: Sequence[str]) -> torch.Tensor:
+    choice = '_'.join([format(i) for i in choice])
+    return torch.tensor(instruments.index(choice))
+
+def generate(choice: Sequence[str], params: Sequence[float] = None):
+    noise = torch.tensor(params).unsqueeze(0).to(device) if params else torch.randn(1, 5).to(device)
+    return model.sample(eps=noise, c=choice_to_tensor(choice).to(device)).cpu().numpy()[0]
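A hedged usage sketch of the helper above (assumes the checkpoint file is present; the labels must match the app's selectbox options so `choice_to_tensor` can resolve them to an index in `instruments`):

```python
from model import generate

# one instrument label plus one type label, as app.py passes them
audio = generate(['🎸 Bass', '📯 Acoustic'])
print(audio.shape)  # expected: (1, 64000), i.e. one channel of 4 s audio at 16 kHz
```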